1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/mac/corefoundation/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
// Helper of cMB2WC(): scans the n bytes starting at p.
//
// Returns true as soon as a non-NUL byte is found, false if all n bytes are
// NUL (which is trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( ; n != 0; --n, ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten
= 0;
172 // the number of NULs terminating this string
173 size_t nulLen
= 0; // not really needed, but just to avoid warnings
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
181 if ( srcLen
!= wxNO_LEN
)
183 // we need to know how to find the end of this string
184 nulLen
= GetMBNulLen();
185 if ( nulLen
== wxCONV_FAILED
)
186 return wxCONV_FAILED
;
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
193 char * const p
= bufTmp
.data();
194 memcpy(p
, src
, srcLen
);
195 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
201 srcEnd
= src
+ srcLen
;
203 else // quit after the first loop iteration
210 // try to convert the current chunk
211 size_t lenChunk
= MB2WC(NULL
, src
, 0);
212 if ( lenChunk
== wxCONV_FAILED
)
213 return wxCONV_FAILED
;
215 lenChunk
++; // for the L'\0' at the end of this chunk
217 dstWritten
+= lenChunk
;
221 // nothing left in the input string, conversion succeeded
227 if ( dstWritten
> dstLen
)
228 return wxCONV_FAILED
;
230 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
231 return wxCONV_FAILED
;
238 // we convert just one chunk in this case as this is the entire
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src
, nulLen
) )
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
253 src
+= nulLen
; // skipping over its terminator as well
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by srcEnd
266 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
267 const wchar_t *src
, size_t srcLen
) const
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten
= 0;
272 // make a copy of the input string unless it is already properly
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp
;
278 if ( srcLen
== wxNO_LEN
)
280 srcLen
= wxWcslen(src
) + 1;
282 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp
= wxWCharBuffer(srcLen
);
286 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
290 const size_t lenNul
= GetMBNulLen();
291 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
293 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
295 // try to convert the current chunk
296 size_t lenChunk
= WC2MB(NULL
, src
, 0);
298 if ( lenChunk
== wxCONV_FAILED
)
299 return wxCONV_FAILED
;
302 dstWritten
+= lenChunk
;
306 if ( dstWritten
> dstLen
)
307 return wxCONV_FAILED
;
309 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
310 return wxCONV_FAILED
;
319 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
321 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
322 if ( rc
!= wxCONV_FAILED
)
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
332 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
334 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
335 if ( rc
!= wxCONV_FAILED
)
343 wxMBConv::~wxMBConv()
345 // nothing to do here (necessary for Darwin linking probably)
348 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
352 // calculate the length of the buffer needed first
353 const size_t nLen
= ToWChar(NULL
, 0, psz
);
354 if ( nLen
!= wxCONV_FAILED
)
356 // now do the actual conversion
357 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
359 // +1 for the trailing NULL
360 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
365 return wxWCharBuffer();
368 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
372 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
373 if ( nLen
!= wxCONV_FAILED
)
375 wxCharBuffer
buf(nLen
- 1);
376 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
381 return wxCharBuffer();
385 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
387 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
388 if ( dstLen
!= wxCONV_FAILED
)
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer
wbuf(dstLen
);
394 wbuf
.data()[dstLen
- 1] = L
'\0';
395 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
400 if ( wbuf
[dstLen
- 1] == L
'\0' )
411 return wxWCharBuffer();
415 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
417 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
418 if ( dstLen
!= wxCONV_FAILED
)
420 const size_t nulLen
= GetMBNulLen();
422 // as above, ensure that the buffer is always NUL-terminated, even if
424 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
425 memset(buf
.data() + dstLen
, 0, nulLen
);
426 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
432 if ( dstLen
>= nulLen
&&
433 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
448 return wxCharBuffer();
451 // ----------------------------------------------------------------------------
453 // ----------------------------------------------------------------------------
455 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
457 return wxMB2WC(buf
, psz
, n
);
460 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
462 return wxWC2MB(buf
, psz
, n
);
465 // ----------------------------------------------------------------------------
466 // wxConvBrokenFileNames
467 // ----------------------------------------------------------------------------
471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
473 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
474 wxStricmp(charset
, _T("UTF8")) == 0 )
475 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
477 m_conv
= new wxCSConv(charset
);
482 // ----------------------------------------------------------------------------
484 // ----------------------------------------------------------------------------
486 // Implementation (C) 2004 Fredrik Roubert
488 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
491 // BASE64 decoding table
493 static const unsigned char utf7unb64
[] =
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
501 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
502 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
504 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
505 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
506 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
508 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
509 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
510 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
529 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
530 const char *src
, size_t srcLen
) const
532 DecoderState stateOrig
,
534 if ( srcLen
== wxNO_LEN
)
536 // convert the entire string, up to and including the trailing NUL
537 srcLen
= strlen(src
) + 1;
539 // when working on the entire strings we don't update nor use the shift
540 // state from the previous call
541 statePtr
= &stateOrig
;
543 else // when working with partial strings we do use the shift state
545 statePtr
= wx_const_cast(DecoderState
*, &m_stateDecoder
);
547 // also save the old state to be able to rollback to it on error
548 stateOrig
= m_stateDecoder
;
551 // but to simplify the code below we use this variable in both cases
552 DecoderState
& state
= *statePtr
;
555 // number of characters [which would have been] written to dst [if it were
559 const char * const srcEnd
= src
+ srcLen
;
561 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
563 const unsigned char cc
= *src
++;
565 if ( state
.IsShifted() )
567 const unsigned char dc
= utf7unb64
[cc
];
570 // end of encoded part
573 // re-parse this character normally below unless it's '-' which
574 // is consumed by the decoder
578 else // valid encoded character
580 // mini base64 decoder: each character is 6 bits
585 if ( state
.bit
>= 8 )
587 // got the full byte, consume it
589 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
593 // we've got the full word, output it
595 *dst
++ = (state
.msb
<< 8) | b
;
601 // just store it while we wait for LSB
609 if ( state
.IsDirect() )
611 // start of an encoded segment?
615 return wxCONV_FAILED
; // can't have '+' at the end
619 // just the encoded plus sign, don't switch to shifted mode
632 // only printable 7 bit ASCII characters (with the exception of
633 // NUL, TAB, CR and LF) can be used directly
634 if ( cc
>= 0x7f || (cc
< ' ' &&
635 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
636 return wxCONV_FAILED
;
647 // as we didn't read any characters we should be called with the same
648 // data (followed by some more new data) again later so don't save our
652 return wxCONV_FAILED
;
659 // BASE64 encoding table
661 static const unsigned char utf7enb64
[] =
663 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
664 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
665 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
666 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
667 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
668 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
669 'w', 'x', 'y', 'z', '0', '1', '2', '3',
670 '4', '5', '6', '7', '8', '9', '+', '/'
// UTF-7 encoding table
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The table is indexed by the 7 bit character code, 16 entries per row.
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if wc has class 0 (Set D) in utf7encode, i.e. can be written
// to UTF-7 output as is, and false for all the other characters, including
// everything outside of the 7 bit range.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms, so compare the value as
    // unsigned: otherwise a negative wc would pass the range check and be
    // used as a negative index into the table
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
698 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
699 const wchar_t *src
, size_t srcLen
) const
701 EncoderState stateOrig
,
703 if ( srcLen
== wxNO_LEN
)
705 // we don't apply the stored state when operating on entire strings at
707 statePtr
= &stateOrig
;
709 srcLen
= wxWcslen(src
) + 1;
711 else // do use the mode we left the output in previously
713 stateOrig
= m_stateEncoder
;
714 statePtr
= wx_const_cast(EncoderState
*, &m_stateEncoder
);
717 EncoderState
& state
= *statePtr
;
722 const wchar_t * const srcEnd
= src
+ srcLen
;
723 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
726 if ( wxIsUTF7Direct(cc
) )
728 if ( state
.IsShifted() )
730 // pad with zeros the last encoded block if necessary
734 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
749 else if ( cc
== '+' && state
.IsDirect() )
760 else if (((wxUint32
)cc
) > 0xffff)
762 // no surrogate pair generation (yet?)
763 return wxCONV_FAILED
;
768 if ( state
.IsDirect() )
777 // BASE64 encode string
780 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
783 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
785 for (state
.bit
+= 8; state
.bit
>= 6; )
789 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
794 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
802 // we need to restore the original encoder state if we were called just to
803 // calculate the amount of space needed as we will presumably be called
804 // again to really convert the data now
811 // ----------------------------------------------------------------------------
813 // ----------------------------------------------------------------------------
815 static const wxUint32 utf8_max
[]=
816 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
818 // boundaries of the private use area we use to (temporarily) remap
819 // characters which are invalid in a UTF-8 encoded string
820 const wxUint32 wxUnicodePUA
= 0x100000;
821 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
823 // this table gives the length of the UTF-8 encoding from its first character:
824 const unsigned char tableUtf8Lengths
[256] = {
825 // single-byte sequences (ASCII):
826 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
827 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
828 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
829 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
830 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
831 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
832 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
833 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
835 // these are invalid:
836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
837 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
838 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
839 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
842 // two-byte sequences:
843 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
844 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
846 // three-byte sequences:
847 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
849 // four-byte sequences:
850 4, 4, 4, 4, 4, // F0..F4
852 // these are invalid again (5- or 6-byte
853 // sequences and sequences for code points
854 // above U+10FFFF, as restricted by RFC 3629):
855 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
859 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
860 const char *src
, size_t srcLen
) const
862 wchar_t *out
= dstLen
? dst
: NULL
;
865 if ( srcLen
== wxNO_LEN
)
866 srcLen
= strlen(src
) + 1;
868 for ( const char *p
= src
; ; p
++ )
870 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
872 // all done successfully, just add the trailing NULL if we are not
873 // using explicit length
874 if ( srcLen
== wxNO_LEN
)
890 if ( out
&& !dstLen
-- )
894 unsigned char c
= *p
;
898 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
901 if ( srcLen
!= wxNO_LEN
)
908 unsigned len
= tableUtf8Lengths
[c
];
912 if ( srcLen
< len
) // the test works for wxNO_LEN too
915 if ( srcLen
!= wxNO_LEN
)
918 // Char. number range | UTF-8 octet sequence
919 // (hexadecimal) | (binary)
920 // ----------------------+----------------------------------------
921 // 0000 0000 - 0000 007F | 0xxxxxxx
922 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
923 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
924 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
926 // Code point value is stored in bits marked with 'x',
927 // lowest-order bit of the value on the right side in the diagram
928 // above. (from RFC 3629)
930 // mask to extract lead byte's value ('x' bits above), by sequence
932 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
934 // mask and value of lead byte's most significant bits, by length:
935 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
936 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
938 len
--; // it's more convenient to work with 0-based length here
940 // extract the lead byte's value bits:
941 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
944 code
= c
& leadValueMask
[len
];
946 // all remaining bytes, if any, are handled in the same way
947 // regardless of sequence's length:
951 if ( (c
& 0xC0) != 0x80 )
952 return wxCONV_FAILED
;
960 // cast is ok because wchar_t == wxUint16 if WC_UTF16
961 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
970 #endif // WC_UTF16/!WC_UTF16
978 return wxCONV_FAILED
;
982 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
983 const wchar_t *src
, size_t srcLen
) const
985 char *out
= dstLen
? dst
: NULL
;
988 for ( const wchar_t *wp
= src
; ; wp
++ )
990 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
--) )
992 // all done successfully, just add the trailing NULL if we are not
993 // using explicit length
994 if ( srcLen
== wxNO_LEN
)
1013 // cast is ok for WC_UTF16
1014 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1016 // skip the next char too as we decoded a surrogate
1019 #else // wchar_t is UTF-32
1020 code
= *wp
& 0x7fffffff;
1032 out
[0] = (char)code
;
1035 else if ( code
<= 0x07FF )
1043 // NB: this line takes 6 least significant bits, encodes them as
1044 // 10xxxxxx and discards them so that the next byte can be encoded:
1045 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1046 out
[0] = 0xC0 | code
;
1049 else if ( code
< 0xFFFF )
1057 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1058 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1059 out
[0] = 0xE0 | code
;
1062 else if ( code
<= 0x10FFFF )
1070 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1071 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1072 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1073 out
[0] = 0xF0 | code
;
1078 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1091 // we only get here if an error occurs during decoding
1092 return wxCONV_FAILED
;
1095 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1096 const char *psz
, size_t srcLen
) const
1098 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1099 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1103 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1105 const char *opsz
= psz
;
1106 bool invalid
= false;
1107 unsigned char cc
= *psz
++, fc
= cc
;
1109 for (cnt
= 0; fc
& 0x80; cnt
++)
1119 // escape the escape character for octal escapes
1120 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1121 && cc
== '\\' && (!buf
|| len
< n
))
1133 // invalid UTF-8 sequence
1138 unsigned ocnt
= cnt
- 1;
1139 wxUint32 res
= cc
& (0x3f >> cnt
);
1143 if ((cc
& 0xC0) != 0x80)
1145 // invalid UTF-8 sequence
1151 res
= (res
<< 6) | (cc
& 0x3f);
1154 if (invalid
|| res
<= utf8_max
[ocnt
])
1156 // illegal UTF-8 encoding
1159 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1160 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1162 // if one of our PUA characters turns up externally
1163 // it must also be treated as an illegal sequence
1164 // (a bit like you have to escape an escape character)
1170 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1171 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1172 if (pa
== wxCONV_FAILED
)
1184 *buf
++ = (wchar_t)res
;
1186 #endif // WC_UTF16/!WC_UTF16
1192 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1194 while (opsz
< psz
&& (!buf
|| len
< n
))
1197 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1198 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1199 wxASSERT(pa
!= wxCONV_FAILED
);
1206 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1212 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1214 while (opsz
< psz
&& (!buf
|| len
< n
))
1216 if ( buf
&& len
+ 3 < n
)
1218 unsigned char on
= *opsz
;
1220 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1221 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1222 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1229 else // MAP_INVALID_UTF8_NOT
1231 return wxCONV_FAILED
;
1237 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
// Returns true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1248 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1249 const wchar_t *psz
, size_t srcLen
) const
1251 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1252 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1256 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1261 // cast is ok for WC_UTF16
1262 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1263 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1265 cc
= (*psz
++) & 0x7fffffff;
1268 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1269 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1272 *buf
++ = (char)(cc
- wxUnicodePUA
);
1275 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1276 && cc
== L
'\\' && psz
[0] == L
'\\' )
1283 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1285 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1289 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1290 (psz
[1] - L
'0') * 010 +
1300 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1316 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1318 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1324 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1330 // ============================================================================
1332 // ============================================================================
1334 #ifdef WORDS_BIGENDIAN
1335 #define wxMBConvUTF16straight wxMBConvUTF16BE
1336 #define wxMBConvUTF16swap wxMBConvUTF16LE
1338 #define wxMBConvUTF16swap wxMBConvUTF16BE
1339 #define wxMBConvUTF16straight wxMBConvUTF16LE
1343 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1345 if ( srcLen
== wxNO_LEN
)
1347 // count the number of bytes in input, including the trailing NULs
1348 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1349 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1352 srcLen
*= BYTES_PER_CHAR
;
1354 else // we already have the length
1356 // we can only convert an entire number of UTF-16 characters
1357 if ( srcLen
% BYTES_PER_CHAR
)
1358 return wxCONV_FAILED
;
1364 // case when in-memory representation is UTF-16 too
1367 // ----------------------------------------------------------------------------
1368 // conversions without endianness change
1369 // ----------------------------------------------------------------------------
1372 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1373 const char *src
, size_t srcLen
) const
1375 // set up the scene for using memcpy() (which is presumably more efficient
1376 // than copying the bytes one by one)
1377 srcLen
= GetLength(src
, srcLen
);
1378 if ( srcLen
== wxNO_LEN
)
1379 return wxCONV_FAILED
;
1381 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1384 if ( dstLen
< inLen
)
1385 return wxCONV_FAILED
;
1387 memcpy(dst
, src
, srcLen
);
1394 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1395 const wchar_t *src
, size_t srcLen
) const
1397 if ( srcLen
== wxNO_LEN
)
1398 srcLen
= wxWcslen(src
) + 1;
1400 srcLen
*= BYTES_PER_CHAR
;
1404 if ( dstLen
< srcLen
)
1405 return wxCONV_FAILED
;
1407 memcpy(dst
, src
, srcLen
);
1413 // ----------------------------------------------------------------------------
1414 // endian-reversing conversions
1415 // ----------------------------------------------------------------------------
1418 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1419 const char *src
, size_t srcLen
) const
1421 srcLen
= GetLength(src
, srcLen
);
1422 if ( srcLen
== wxNO_LEN
)
1423 return wxCONV_FAILED
;
1425 srcLen
/= BYTES_PER_CHAR
;
1429 if ( dstLen
< srcLen
)
1430 return wxCONV_FAILED
;
1432 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1433 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1435 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1443 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1444 const wchar_t *src
, size_t srcLen
) const
1446 if ( srcLen
== wxNO_LEN
)
1447 srcLen
= wxWcslen(src
) + 1;
1449 srcLen
*= BYTES_PER_CHAR
;
1453 if ( dstLen
< srcLen
)
1454 return wxCONV_FAILED
;
1456 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1457 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1459 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1466 #else // !WC_UTF16: wchar_t is UTF-32
1468 // ----------------------------------------------------------------------------
1469 // conversions without endianness change
1470 // ----------------------------------------------------------------------------
1473 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1474 const char *src
, size_t srcLen
) const
1476 srcLen
= GetLength(src
, srcLen
);
1477 if ( srcLen
== wxNO_LEN
)
1478 return wxCONV_FAILED
;
1480 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1483 // optimization: return maximal space which could be needed for this
1484 // string even if the real size could be smaller if the buffer contains
1490 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1491 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1493 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1495 return wxCONV_FAILED
;
1497 if ( ++outLen
> dstLen
)
1498 return wxCONV_FAILED
;
1508 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1509 const wchar_t *src
, size_t srcLen
) const
1511 if ( srcLen
== wxNO_LEN
)
1512 srcLen
= wxWcslen(src
) + 1;
1515 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1516 for ( size_t n
= 0; n
< srcLen
; n
++ )
1519 const size_t numChars
= encode_utf16(*src
++, cc
);
1520 if ( numChars
== wxCONV_FAILED
)
1521 return wxCONV_FAILED
;
1523 outLen
+= numChars
* BYTES_PER_CHAR
;
1526 if ( outLen
> dstLen
)
1527 return wxCONV_FAILED
;
1530 if ( numChars
== 2 )
1532 // second character of a surrogate
1541 // ----------------------------------------------------------------------------
1542 // endian-reversing conversions
1543 // ----------------------------------------------------------------------------
1546 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1547 const char *src
, size_t srcLen
) const
1549 srcLen
= GetLength(src
, srcLen
);
1550 if ( srcLen
== wxNO_LEN
)
1551 return wxCONV_FAILED
;
1553 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1556 // optimization: return maximal space which could be needed for this
1557 // string even if the real size could be smaller if the buffer contains
1563 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1564 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1569 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1571 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1573 const size_t numChars
= decode_utf16(tmp
, ch
);
1574 if ( numChars
== wxCONV_FAILED
)
1575 return wxCONV_FAILED
;
1577 if ( numChars
== 2 )
1580 if ( ++outLen
> dstLen
)
1581 return wxCONV_FAILED
;
1591 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1592 const wchar_t *src
, size_t srcLen
) const
1594 if ( srcLen
== wxNO_LEN
)
1595 srcLen
= wxWcslen(src
) + 1;
1598 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1599 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1602 const size_t numChars
= encode_utf16(*src
, cc
);
1603 if ( numChars
== wxCONV_FAILED
)
1604 return wxCONV_FAILED
;
1606 outLen
+= numChars
* BYTES_PER_CHAR
;
1609 if ( outLen
> dstLen
)
1610 return wxCONV_FAILED
;
1612 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1613 if ( numChars
== 2 )
1615 // second character of a surrogate
1616 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1624 #endif // WC_UTF16/!WC_UTF16
1627 // ============================================================================
1629 // ============================================================================
1631 #ifdef WORDS_BIGENDIAN
1632 #define wxMBConvUTF32straight wxMBConvUTF32BE
1633 #define wxMBConvUTF32swap wxMBConvUTF32LE
1635 #define wxMBConvUTF32swap wxMBConvUTF32BE
1636 #define wxMBConvUTF32straight wxMBConvUTF32LE
1640 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1641 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1644 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1646 if ( srcLen
== wxNO_LEN
)
1648 // count the number of bytes in input, including the trailing NULs
1649 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1650 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1653 srcLen
*= BYTES_PER_CHAR
;
1655 else // we already have the length
1657 // we can only convert an entire number of UTF-32 characters
1658 if ( srcLen
% BYTES_PER_CHAR
)
1659 return wxCONV_FAILED
;
1665 // case when in-memory representation is UTF-16
1668 // ----------------------------------------------------------------------------
1669 // conversions without endianness change
1670 // ----------------------------------------------------------------------------
1673 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1674 const char *src
, size_t srcLen
) const
1676 srcLen
= GetLength(src
, srcLen
);
1677 if ( srcLen
== wxNO_LEN
)
1678 return wxCONV_FAILED
;
1680 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1681 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1683 for ( size_t n
= 0; n
< inLen
; n
++ )
1686 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1687 if ( numChars
== wxCONV_FAILED
)
1688 return wxCONV_FAILED
;
1693 if ( outLen
> dstLen
)
1694 return wxCONV_FAILED
;
1697 if ( numChars
== 2 )
1699 // second character of a surrogate
1709 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1710 const wchar_t *src
, size_t srcLen
) const
1712 if ( srcLen
== wxNO_LEN
)
1713 srcLen
= wxWcslen(src
) + 1;
1717 // optimization: return maximal space which could be needed for this
1718 // string instead of the exact amount which could be less if there are
1719 // any surrogates in the input
1721 // we consider that surrogates are rare enough to make it worthwhile to
1722 // avoid running the loop below at the cost of slightly extra memory
1724 return srcLen
* BYTES_PER_CHAR
;
1727 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1729 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1731 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1733 return wxCONV_FAILED
;
1735 outLen
+= BYTES_PER_CHAR
;
1737 if ( outLen
> dstLen
)
1738 return wxCONV_FAILED
;
1746 // ----------------------------------------------------------------------------
1747 // endian-reversing conversions
1748 // ----------------------------------------------------------------------------
1751 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1752 const char *src
, size_t srcLen
) const
1754 srcLen
= GetLength(src
, srcLen
);
1755 if ( srcLen
== wxNO_LEN
)
1756 return wxCONV_FAILED
;
1758 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1759 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1761 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1764 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1765 if ( numChars
== wxCONV_FAILED
)
1766 return wxCONV_FAILED
;
1771 if ( outLen
> dstLen
)
1772 return wxCONV_FAILED
;
1775 if ( numChars
== 2 )
1777 // second character of a surrogate
1787 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1788 const wchar_t *src
, size_t srcLen
) const
1790 if ( srcLen
== wxNO_LEN
)
1791 srcLen
= wxWcslen(src
) + 1;
1795 // optimization: return maximal space which could be needed for this
1796 // string instead of the exact amount which could be less if there are
1797 // any surrogates in the input
1799 // we consider that surrogates are rare enough to make it worthwhile to
1800 // avoid running the loop below at the cost of slightly extra memory
1802 return srcLen
*BYTES_PER_CHAR
;
1805 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1807 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1809 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1811 return wxCONV_FAILED
;
1813 outLen
+= BYTES_PER_CHAR
;
1815 if ( outLen
> dstLen
)
1816 return wxCONV_FAILED
;
1818 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1824 #else // !WC_UTF16: wchar_t is UTF-32
1826 // ----------------------------------------------------------------------------
1827 // conversions without endianness change
1828 // ----------------------------------------------------------------------------
1831 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1832 const char *src
, size_t srcLen
) const
1834 // use memcpy() as it should be much faster than hand-written loop
1835 srcLen
= GetLength(src
, srcLen
);
1836 if ( srcLen
== wxNO_LEN
)
1837 return wxCONV_FAILED
;
1839 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1842 if ( dstLen
< inLen
)
1843 return wxCONV_FAILED
;
1845 memcpy(dst
, src
, srcLen
);
1852 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1853 const wchar_t *src
, size_t srcLen
) const
1855 if ( srcLen
== wxNO_LEN
)
1856 srcLen
= wxWcslen(src
) + 1;
1858 srcLen
*= BYTES_PER_CHAR
;
1862 if ( dstLen
< srcLen
)
1863 return wxCONV_FAILED
;
1865 memcpy(dst
, src
, srcLen
);
1871 // ----------------------------------------------------------------------------
1872 // endian-reversing conversions
1873 // ----------------------------------------------------------------------------
1876 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1877 const char *src
, size_t srcLen
) const
1879 srcLen
= GetLength(src
, srcLen
);
1880 if ( srcLen
== wxNO_LEN
)
1881 return wxCONV_FAILED
;
1883 srcLen
/= BYTES_PER_CHAR
;
1887 if ( dstLen
< srcLen
)
1888 return wxCONV_FAILED
;
1890 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1891 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1893 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1901 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1902 const wchar_t *src
, size_t srcLen
) const
1904 if ( srcLen
== wxNO_LEN
)
1905 srcLen
= wxWcslen(src
) + 1;
1907 srcLen
*= BYTES_PER_CHAR
;
1911 if ( dstLen
< srcLen
)
1912 return wxCONV_FAILED
;
1914 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1915 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1917 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1924 #endif // WC_UTF16/!WC_UTF16
1927 // ============================================================================
1928 // The classes doing conversion using the iconv_xxx() functions
1929 // ============================================================================
1933 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1934 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1935 // (unless there's yet another bug in glibc) the only case when iconv()
1936 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1937 // left in the input buffer -- when _real_ error occurs,
1938 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1940 // [This bug does not appear in glibc 2.2.]
1941 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1942 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1943 (errno != E2BIG || bufLeft != 0))
1945 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1948 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1950 #define ICONV_T_INVALID ((iconv_t)-1)
1952 #if SIZEOF_WCHAR_T == 4
1953 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1954 #define WC_ENC wxFONTENCODING_UTF32
1955 #elif SIZEOF_WCHAR_T == 2
1956 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1957 #define WC_ENC wxFONTENCODING_UTF16
1958 #else // sizeof(wchar_t) != 2 nor 4
1959 // does this ever happen?
1960 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1963 // ----------------------------------------------------------------------------
1964 // wxMBConv_iconv: encapsulates an iconv character set
1965 // ----------------------------------------------------------------------------
1967 class wxMBConv_iconv
: public wxMBConv
1970 wxMBConv_iconv(const char *name
);
1971 virtual ~wxMBConv_iconv();
1973 // implement base class virtual methods
1974 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
1975 const char *src
, size_t srcLen
= wxNO_LEN
) const;
1976 virtual size_t FromWChar(char *dst
, size_t dstLen
,
1977 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
1978 virtual size_t GetMBNulLen() const;
1980 #if wxUSE_UNICODE_UTF8
1981 virtual bool IsUTF8() const;
1984 virtual wxMBConv
*Clone() const
1986 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1987 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1992 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1995 // the iconv handlers used to translate from multibyte
1996 // to wide char and in the other direction
2001 // guards access to m2w and w2m objects
2002 wxMutex m_iconvMutex
;
2006 // the name (for iconv_open()) of a wide char charset -- if none is
2007 // available on this machine, it will remain NULL
2008 static wxString ms_wcCharsetName
;
2010 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2011 // different endian-ness than the native one
2012 static bool ms_wcNeedsSwap
;
2015 // name of the encoding handled by this conversion
2018 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2020 size_t m_minMBCharWidth
;
2023 // make the constructor available for unit testing
2024 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2026 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2027 if ( !result
->IsOk() )
2036 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2037 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
2039 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2042 m_minMBCharWidth
= 0;
2044 // check for charset that represents wchar_t:
2045 if ( ms_wcCharsetName
.empty() )
2047 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
2050 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2051 #else // !wxUSE_FONTMAP
2052 static const wxChar
*names_static
[] =
2054 #if SIZEOF_WCHAR_T == 4
2056 #elif SIZEOF_WCHAR_T == 2 // comparison, not '=': assignment is invalid in a preprocessor expression
2061 const wxChar
**names
= names_static
;
2062 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2064 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2066 const wxString
nameCS(*names
);
2068 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2069 wxString
nameXE(nameCS
);
2071 #ifdef WORDS_BIGENDIAN
2073 #else // little endian
2077 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2080 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2081 if ( m2w
== ICONV_T_INVALID
)
2083 // try charset w/o bytesex info (e.g. "UCS4")
2084 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2086 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2088 // and check for bytesex ourselves:
2089 if ( m2w
!= ICONV_T_INVALID
)
2091 char buf
[2], *bufPtr
;
2100 outsz
= SIZEOF_WCHAR_T
* 2;
2101 char* wbufPtr
= (char*)wbuf
;
2105 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2108 if (ICONV_FAILED(res
, insz
))
2110 wxLogLastError(wxT("iconv"));
2111 wxLogError(_("Conversion to charset '%s' doesn't work."),
2114 else // ok, can convert to this encoding, remember it
2116 ms_wcCharsetName
= nameCS
;
2117 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2121 else // use charset not requiring byte swapping
2123 ms_wcCharsetName
= nameXE
;
2127 wxLogTrace(TRACE_STRCONV
,
2128 wxT("iconv wchar_t charset is \"%s\"%s"),
2129 ms_wcCharsetName
.empty() ? wxString("<none>")
2131 ms_wcNeedsSwap
? _T(" (needs swap)")
2134 else // we already have ms_wcCharsetName
2136 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2139 if ( ms_wcCharsetName
.empty() )
2141 w2m
= ICONV_T_INVALID
;
2145 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2146 if ( w2m
== ICONV_T_INVALID
)
2148 wxLogTrace(TRACE_STRCONV
,
2149 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2150 ms_wcCharsetName
.c_str(), name
);
2155 wxMBConv_iconv::~wxMBConv_iconv()
2157 if ( m2w
!= ICONV_T_INVALID
)
2159 if ( w2m
!= ICONV_T_INVALID
)
2164 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2165 const char *src
, size_t srcLen
) const
2167 if ( srcLen
== wxNO_LEN
)
2169 // find the string length: notice that this must be done differently for
2170 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2172 const size_t nulLen
= GetMBNulLen();
2176 return wxCONV_FAILED
;
2179 srcLen
= strlen(src
); // arguably more optimized than our version
2184 // for UTF-16/32 not only do we need to have 2/4 consecutive NULs
2185 // but they also have to start at character boundary and not
2186 // span two adjacent characters
2188 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2194 // when we're determining the length of the string ourselves we count
2195 // the terminating NUL(s) as part of it and always NUL-terminate the
2200 // we express length in the number of (wide) characters but iconv always
2201 // counts buffer sizes in bytes
2202 dstLen
*= SIZEOF_WCHAR_T
;
2205 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2206 // Unfortunately there are a couple of global wxCSConv objects such as
2207 // wxConvLocal that are used all over wx code, so we have to make sure
2208 // the handle is used by at most one thread at a time. Otherwise
2209 // only a few wx classes would be safe to use from non-main threads
2210 // as MB<->WC conversion would fail "randomly".
2211 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2212 #endif // wxUSE_THREADS
2215 const char *pszPtr
= src
;
2219 char* bufPtr
= (char*)dst
;
2221 // have destination buffer, convert there
2222 size_t dstLenOrig
= dstLen
;
2224 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2227 // convert the number of bytes converted as returned by iconv to the
2228 // number of (wide) characters converted that we need
2229 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2233 // convert to native endianness
2234 for ( unsigned i
= 0; i
< res
; i
++ )
2235 dst
[i
] = WC_BSWAP(dst
[i
]);
2238 else // no destination buffer
2240 // convert using temp buffer to calculate the size of the buffer needed
2246 char* bufPtr
= (char*)tbuf
;
2247 dstLen
= 8 * SIZEOF_WCHAR_T
;
2250 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2253 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2255 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2258 if (ICONV_FAILED(cres
, srcLen
))
2260 //VS: it is ok if iconv fails, hence trace only
2261 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2262 return wxCONV_FAILED
;
2268 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2269 const wchar_t *src
, size_t srcLen
) const
2272 // NB: explained in wxMBConv_iconv::ToWChar()
2273 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2276 if ( srcLen
== wxNO_LEN
)
2277 srcLen
= wxWcslen(src
) + 1;
2279 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2280 size_t outbuflen
= dstLen
;
2283 wchar_t *tmpbuf
= 0;
2287 // need to copy to temp buffer to switch endianness
2288 // (doing WC_BSWAP twice on the original buffer won't help, as it
2289 // could be in read-only memory, or be accessed in some other thread)
2290 tmpbuf
= (wchar_t *)malloc(inbuflen
+ SIZEOF_WCHAR_T
);
2291 for ( size_t i
= 0; i
< srcLen
; i
++ )
2292 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2294 tmpbuf
[srcLen
] = L
'\0';
2298 char* inbuf
= (char*)src
;
2301 // have destination buffer, convert there
2302 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2304 res
= dstLen
- outbuflen
;
2306 else // no destination buffer
2308 // convert using temp buffer to calculate the size of the buffer needed
2316 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2318 res
+= 16 - outbuflen
;
2320 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2328 if (ICONV_FAILED(cres
, inbuflen
))
2330 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2331 return wxCONV_FAILED
;
2337 size_t wxMBConv_iconv::GetMBNulLen() const
2339 if ( m_minMBCharWidth
== 0 )
2341 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2344 // NB: explained in wxMBConv_iconv::ToWChar()
2345 wxMutexLocker
lock(self
->m_iconvMutex
);
2348 const wchar_t *wnul
= L
"";
2349 char buf
[8]; // should be enough for NUL in any encoding
2350 size_t inLen
= sizeof(wchar_t),
2351 outLen
= WXSIZEOF(buf
);
2352 char *inBuff
= (char *)wnul
;
2353 char *outBuff
= buf
;
2354 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2356 self
->m_minMBCharWidth
= (size_t)-1;
2360 self
->m_minMBCharWidth
= outBuff
- buf
;
2364 return m_minMBCharWidth
;
2367 #if wxUSE_UNICODE_UTF8
// Tell whether this converter handles UTF-8: true when the stored charset
// name (m_name) compares equal, via wxStricmp(), to either "UTF-8" or
// "UTF8". NOTE(review): wxStricmp() is presumably a case-insensitive
// comparison -- confirm against its declaration.
2368 bool wxMBConv_iconv::IsUTF8() const
2370 return wxStricmp(m_name
, "UTF-8") == 0 ||
2371 wxStricmp(m_name
, "UTF8") == 0;
2375 #endif // HAVE_ICONV
2378 // ============================================================================
2379 // Win32 conversion classes
2380 // ============================================================================
2382 #ifdef wxHAVE_WIN32_MB2WC
2386 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2387 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2390 class wxMBConv_win32
: public wxMBConv
2395 m_CodePage
= CP_ACP
;
2396 m_minMBCharWidth
= 0;
2399 wxMBConv_win32(const wxMBConv_win32
& conv
)
2402 m_CodePage
= conv
.m_CodePage
;
2403 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2407 wxMBConv_win32(const char* name
)
2409 m_CodePage
= wxCharsetToCodepage(name
);
2410 m_minMBCharWidth
= 0;
2413 wxMBConv_win32(wxFontEncoding encoding
)
2415 m_CodePage
= wxEncodingToCodepage(encoding
);
2416 m_minMBCharWidth
= 0;
2418 #endif // wxUSE_FONTMAP
2420 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2422 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2423 // the behaviour is not compatible with the Unix version (using iconv)
2424 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2425 // wouldn't work if reading an incomplete MB char didn't result in an
2428 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2429 // Win XP or newer and it is not supported for UTF-[78] so we always
2430 // use our own conversions in this case. See
2431 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2432 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2433 if ( m_CodePage
== CP_UTF8
)
2435 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2438 if ( m_CodePage
== CP_UTF7
)
2440 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2444 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2445 IsAtLeastWin2kSP4() )
2447 flags
= MB_ERR_INVALID_CHARS
;
2450 const size_t len
= ::MultiByteToWideChar
2452 m_CodePage
, // code page
2453 flags
, // flags: fall on error
2454 psz
, // input string
2455 -1, // its length (NUL-terminated)
2456 buf
, // output string
2457 buf
? n
: 0 // size of output buffer
2461 // function totally failed
2462 return wxCONV_FAILED
;
2465 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2466 // check if we succeeded, by doing a double trip:
2467 if ( !flags
&& buf
)
2469 const size_t mbLen
= strlen(psz
);
2470 wxCharBuffer
mbBuf(mbLen
);
2471 if ( ::WideCharToMultiByte
2478 mbLen
+ 1, // size in bytes, not length
2482 strcmp(mbBuf
, psz
) != 0 )
2484 // we didn't obtain the same thing we started from, hence
2485 // the conversion was lossy and we consider that it failed
2486 return wxCONV_FAILED
;
2490 // note that it returns count of written chars for buf != NULL and size
2491 // of the needed buffer for buf == NULL so in either case the length of
2492 // the string (which never includes the terminating NUL) is one less
2496 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2499 we have a problem here: by default, WideCharToMultiByte() may
2500 replace characters unrepresentable in the target code page with bad
2501 quality approximations such as turning "1/2" symbol (U+00BD) into
2502 "1" for the code pages which don't have it and we, obviously, want
2503 to avoid this at any price
2505 the trouble is that this function does it _silently_, i.e. it won't
2506 even tell us whether it did or not... Win98/2000 and higher provide
2507 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2508 we have to resort to a round trip, i.e. check that converting back
2509 results in the same string -- this is, of course, expensive but
2510 otherwise we simply can't be sure to not garble the data.
2513 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2514 // it doesn't work with CJK encodings (which we test for rather roughly
2515 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2517 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2520 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2522 // it's our lucky day
2523 flags
= WC_NO_BEST_FIT_CHARS
;
2524 pUsedDef
= &usedDef
;
2526 else // old system or unsupported encoding
2532 const size_t len
= ::WideCharToMultiByte
2534 m_CodePage
, // code page
2535 flags
, // either none or no best fit
2536 pwz
, // input string
2537 -1, // it is (wide) NUL-terminated
2538 buf
, // output buffer
2539 buf
? n
: 0, // and its size
2540 NULL
, // default "replacement" char
2541 pUsedDef
// [out] was it used?
2546 // function totally failed
2547 return wxCONV_FAILED
;
2550 // we did something, check if we really succeeded
2553 // check if the conversion failed, i.e. if any replacements
2556 return wxCONV_FAILED
;
2558 else // we must resort to double tripping...
2560 // first we need to ensure that we really have the MB data: this is
2561 // not the case if we're called with NULL buffer, in which case we
2562 // need to do the conversion yet again
2563 wxCharBuffer bufDef
;
2566 bufDef
= wxCharBuffer(len
);
2567 buf
= bufDef
.data();
2568 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2569 buf
, len
, NULL
, NULL
) )
2570 return wxCONV_FAILED
;
2575 wxWCharBuffer
wcBuf(n
);
2576 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2577 wcscmp(wcBuf
, pwz
) != 0 )
2579 // we didn't obtain the same thing we started from, hence
2580 // the conversion was lossy and we consider that it failed
2581 return wxCONV_FAILED
;
2585 // see the comment above for the reason of "len - 1"
2589 virtual size_t GetMBNulLen() const
2591 if ( m_minMBCharWidth
== 0 )
2593 int len
= ::WideCharToMultiByte
2595 m_CodePage
, // code page
2597 L
"", // input string
2598 1, // translate just the NUL
2599 NULL
, // output buffer
2601 NULL
, // no replacement char
2602 NULL
// [out] don't care if it was used
2605 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2609 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2610 self
->m_minMBCharWidth
= (size_t)-1;
2614 self
->m_minMBCharWidth
= (size_t)-1;
2620 self
->m_minMBCharWidth
= len
;
2625 return m_minMBCharWidth
;
2628 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2630 bool IsOk() const { return m_CodePage
!= -1; }
2633 static bool CanUseNoBestFit()
2635 static int s_isWin98Or2k
= -1;
2637 if ( s_isWin98Or2k
== -1 )
2640 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2642 case wxOS_WINDOWS_9X
:
2643 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2646 case wxOS_WINDOWS_NT
:
2647 s_isWin98Or2k
= verMaj
>= 5;
2651 // unknown: be conservative by default
2656 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2659 return s_isWin98Or2k
== 1;
2662 static bool IsAtLeastWin2kSP4()
2667 static int s_isAtLeastWin2kSP4
= -1;
2669 if ( s_isAtLeastWin2kSP4
== -1 )
2671 OSVERSIONINFOEX ver
;
2673 memset(&ver
, 0, sizeof(ver
));
2674 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2675 GetVersionEx((OSVERSIONINFO
*)&ver
);
2677 s_isAtLeastWin2kSP4
=
2678 ((ver
.dwMajorVersion
> 5) || // Vista+
2679 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2680 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2681 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2685 return s_isAtLeastWin2kSP4
== 1;
2690 // the code page we're working with
2693 // cached result of GetMBNulLen(), set to 0 initially meaning
2695 size_t m_minMBCharWidth
;
2698 #endif // wxHAVE_WIN32_MB2WC
2701 // ============================================================================
2702 // wxEncodingConverter based conversion classes
2703 // ============================================================================
2707 class wxMBConv_wxwin
: public wxMBConv
2712 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2713 // The wxMBConv_cf class does a better job.
2714 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2715 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2716 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2720 // temporarily just use wxEncodingConverter stuff,
2721 // so that it works while a better implementation is built
2722 wxMBConv_wxwin(const char* name
)
2725 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2727 m_enc
= wxFONTENCODING_SYSTEM
;
2732 wxMBConv_wxwin(wxFontEncoding enc
)
2739 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2741 size_t inbuf
= strlen(psz
);
2744 if (!m2w
.Convert(psz
, buf
))
2745 return wxCONV_FAILED
;
2750 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2752 const size_t inbuf
= wxWcslen(psz
);
2755 if (!w2m
.Convert(psz
, buf
))
2756 return wxCONV_FAILED
;
2762 virtual size_t GetMBNulLen() const
2766 case wxFONTENCODING_UTF16BE
:
2767 case wxFONTENCODING_UTF16LE
:
2770 case wxFONTENCODING_UTF32BE
:
2771 case wxFONTENCODING_UTF32LE
:
2779 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2781 bool IsOk() const { return m_ok
; }
2784 wxFontEncoding m_enc
;
2785 wxEncodingConverter m2w
, w2m
;
2788 // were we initialized successfully?
2791 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2794 // make the constructors available for unit testing
2795 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2797 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2798 if ( !result
->IsOk() )
2807 #endif // wxUSE_FONTMAP
2809 // ============================================================================
2810 // wxCSConv implementation
2811 // ============================================================================
2813 void wxCSConv::Init()
2820 wxCSConv::wxCSConv(const wxString
& charset
)
2824 if ( !charset
.empty() )
2826 SetName(charset
.ToAscii());
2830 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2832 m_encoding
= wxFONTENCODING_SYSTEM
;
2836 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2838 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2840 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2842 encoding
= wxFONTENCODING_SYSTEM
;
2847 m_encoding
= encoding
;
2850 wxCSConv::~wxCSConv()
2855 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2860 SetName(conv
.m_name
);
2861 m_encoding
= conv
.m_encoding
;
2864 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2868 SetName(conv
.m_name
);
2869 m_encoding
= conv
.m_encoding
;
2874 void wxCSConv::Clear()
2883 void wxCSConv::SetName(const char *charset
)
2887 m_name
= wxStrdup(charset
);
2894 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2895 wxEncodingNameCache
);
2897 static wxEncodingNameCache gs_nameCache
;
2900 wxMBConv
*wxCSConv::DoCreate() const
2903 wxLogTrace(TRACE_STRCONV
,
2904 wxT("creating conversion for %s"),
2906 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2907 #endif // wxUSE_FONTMAP
2909 // check for the special case of ASCII or ISO8859-1 charset: as we have
2910 // special knowledge of it anyhow, we don't need to create a special
2911 // conversion object
2912 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2913 m_encoding
== wxFONTENCODING_DEFAULT
)
2915 // don't convert at all
2919 // we trust OS to do conversion better than we can so try external
2920 // conversion methods first
2922 // the full order is:
2923 // 1. OS conversion (iconv() under Unix or Win32 API)
2924 // 2. hard coded conversions for UTF
2925 // 3. wxEncodingConverter as fall back
2931 #endif // !wxUSE_FONTMAP
2934 wxFontEncoding
encoding(m_encoding
);
2939 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2947 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2948 #endif // wxUSE_FONTMAP
2952 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2953 if ( it
!= gs_nameCache
.end() )
2955 if ( it
->second
.empty() )
2958 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2965 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2966 // CS : in case this does not return valid names (eg for MacRoman)
2967 // encoding got a 'failure' entry in the cache all the same,
2968 // although it just has to be created using a different method, so
2969 // only store failed iconv creation attempts (or perhaps we
2970 // shouldn't do this at all ?)
2971 if ( names
[0] != NULL
)
2973 for ( ; *names
; ++names
)
2975 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2976 // will need changes that will obsolete this
2977 wxString
name(*names
);
2978 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2981 gs_nameCache
[encoding
] = *names
;
2988 gs_nameCache
[encoding
] = _T(""); // cache the failure
2991 #endif // wxUSE_FONTMAP
2993 #endif // HAVE_ICONV
2995 #ifdef wxHAVE_WIN32_MB2WC
2998 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2999 : new wxMBConv_win32(m_encoding
);
3008 #endif // wxHAVE_WIN32_MB2WC
3012 // leave UTF16 and UTF32 to the built-ins of wx
3013 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3014 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3017 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3018 : new wxMBConv_cf(m_encoding
);
3020 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3029 #endif // __DARWIN__
3032 wxFontEncoding enc
= m_encoding
;
3034 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3036 // use "false" to suppress interactive dialogs -- we can be called from
3037 // anywhere and popping up a dialog from here is the last thing we want to
3039 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3041 #endif // wxUSE_FONTMAP
3045 case wxFONTENCODING_UTF7
:
3046 return new wxMBConvUTF7
;
3048 case wxFONTENCODING_UTF8
:
3049 return new wxMBConvUTF8
;
3051 case wxFONTENCODING_UTF16BE
:
3052 return new wxMBConvUTF16BE
;
3054 case wxFONTENCODING_UTF16LE
:
3055 return new wxMBConvUTF16LE
;
3057 case wxFONTENCODING_UTF32BE
:
3058 return new wxMBConvUTF32BE
;
3060 case wxFONTENCODING_UTF32LE
:
3061 return new wxMBConvUTF32LE
;
3064 // nothing to do but put here to suppress gcc warnings
3071 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3072 : new wxMBConv_wxwin(m_encoding
);
3078 #endif // wxUSE_FONTMAP
3080 // NB: This is a hack to prevent deadlock. What could otherwise happen
3081 // in a Unicode build: wxConvLocal creation ends up here because of
3082 // some failure and logs the error. But wxLog will try to attach a
3083 // timestamp, for which it needs wxConvLocal (to convert the time to
3084 // char* and then wchar_t*); that fails too and tries to log the
3085 // error, but wxLog has an (already locked) critical section that
3086 // guards the static buffer.
3087 static bool alreadyLoggingError
= false;
3088 if (!alreadyLoggingError
)
3090 alreadyLoggingError
= true;
3091 wxLogError(_("Cannot convert from the charset '%s'!"),
3095 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3096 #else // !wxUSE_FONTMAP
3097 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3098 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3101 alreadyLoggingError
= false;
// Creates the real converter by calling DoCreate() and stores the result in
// m_convReal, then clears m_deferred.
//
// The method is const because it is called from the const conversion
// functions of this class; the members are updated through the const-cast
// "self" pointer.
// NOTE(review): the check that makes this a no-op once m_deferred has been
// cleared is presumably an enclosing "if ( m_deferred )" -- confirm.
3107 void wxCSConv::CreateConvIfNeeded() const
3111 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3113 // if we have neither the name nor the encoding, use the default
3114 // encoding for this system
3115 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
// ask the locale code for the system encoding
3118 self
->m_encoding
= wxLocale::GetSystemEncoding();
3120 // fallback to some reasonable default:
3121 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3122 #endif // wxUSE_INTL
// create the underlying converter and remember that this has been done
3125 self
->m_convReal
= DoCreate();
3126 self
->m_deferred
= false;
// Returns true if this converter can be used.
//
// Triggers the creation of the real converter first. ISO-8859-1 is always
// reported as usable; for any other encoding the result is whether
// m_convReal is non-NULL.
3130 bool wxCSConv::IsOk() const
3132 CreateConvIfNeeded();
3134 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3135 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3136 return true; // always ok as we do it ourselves
3138 // m_convReal->IsOk() is called at its own creation, so we know it must
3139 // be ok if m_convReal is non-NULL
3140 return m_convReal
!= NULL
;
// Converts the multibyte string src (of length srcLen) to wide characters
// written to dst (of capacity dstLen).
//
// The call is forwarded to the real converter m_convReal; the second return
// uses the base class wxMBConv::ToWChar() implementation instead.
// NOTE(review): the choice between the two is presumably made by testing
// whether m_convReal is non-NULL -- confirm.
3143 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3144 const char *src
, size_t srcLen
) const
// make sure the real converter has been created before using it
3146 CreateConvIfNeeded();
3149 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
// no real converter: use the base class implementation
3152 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
// Converts the wide character string src (of length srcLen) to a multibyte
// string written to dst (of capacity dstLen).
//
// The call is forwarded to the real converter m_convReal; the second return
// uses the base class wxMBConv::FromWChar() implementation instead.
// NOTE(review): the choice between the two is presumably made by testing
// whether m_convReal is non-NULL -- confirm.
3155 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3156 const wchar_t *src
, size_t srcLen
) const
// make sure the real converter has been created before using it
3158 CreateConvIfNeeded();
3161 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
// no real converter: use the base class implementation
3164 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
// Converts the NUL-terminated multibyte string psz to wide characters
// stored in buf; n is passed through to the real converter.
//
// The call is forwarded to m_convReal; the code after that return converts
// directly, copying every byte to the wide character with the same value.
// NOTE(review): the test selecting the real converter and any check of buf
// before the copy loop are presumably "if ( m_convReal )" and "if ( buf )"
// -- confirm.
3167 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3169 CreateConvIfNeeded();
3172 return m_convReal
->MB2WC(buf
, psz
, n
);
// direct conversion: the length is the number of bytes before the NUL
3175 size_t len
= strlen(psz
);
// "<=" so that the terminating NUL is copied too; going through unsigned
// char maps each byte to a value in the 0..255 range
3179 for (size_t c
= 0; c
<= len
; c
++)
3180 buf
[c
] = (unsigned char)(psz
[c
]);
// Converts the NUL-terminated wide character string psz to a multibyte
// string stored in buf; n is passed through to the real converter.
//
// The call is forwarded to m_convReal; the code after that return converts
// directly by narrowing each wide character to a char and can fail with
// wxCONV_FAILED.
// NOTE(review): the conditions guarding the wxCONV_FAILED returns are
// presumably tests for characters that do not fit in a single byte, and the
// two loops presumably correspond to the buf != NULL and buf == NULL cases
// -- confirm.
3186 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3188 CreateConvIfNeeded();
3191 return m_convReal
->WC2MB(buf
, psz
, n
);
// direct conversion: the length is the number of wide chars before the NUL
3194 const size_t len
= wxWcslen(psz
);
// first loop: validate and copy; "<=" so that the terminating NUL is
// copied as well
3197 for (size_t c
= 0; c
<= len
; c
++)
3200 return wxCONV_FAILED
;
3202 buf
[c
] = (char)psz
[c
];
// second loop: same iteration range but nothing is written to buf
3207 for (size_t c
= 0; c
<= len
; c
++)
3210 return wxCONV_FAILED
;
// Returns the length of the terminating NUL in the multibyte encoding.
//
// The answer comes from the real converter m_convReal; the remaining code
// path is the ISO-8859-1 case handled by this class itself.
// NOTE(review): the value returned for ISO-8859-1 is presumably 1 --
// confirm.
3217 size_t wxCSConv::GetMBNulLen() const
// make sure the real converter has been created before querying it
3219 CreateConvIfNeeded();
3223 return m_convReal
->GetMBNulLen();
3226 // otherwise, we are ISO-8859-1
3230 #if wxUSE_UNICODE_UTF8
// Tells whether this converter converts to/from UTF-8 (only compiled when
// wxUSE_UNICODE_UTF8 is set).
//
// The answer comes from the real converter m_convReal; the remaining code
// path is the ISO-8859-1 case handled by this class itself.
// NOTE(review): the value returned for ISO-8859-1 is presumably false --
// confirm.
3231 bool wxCSConv::IsUTF8() const
// make sure the real converter has been created before querying it
3233 CreateConvIfNeeded();
3237 return m_convReal
->IsUTF8();
3240 // otherwise, we are ISO-8859-1
// Converts the multibyte string s to a wide character buffer using a chain
// of converters: wxConvLibc first, then UTF-8, then ISO-8859-1.
//
// An empty wxWCharBuffer is returned early in one case.
// NOTE(review): the early return is presumably for a NULL s, and each later
// converter is presumably tried only if the previous one produced no result
// -- confirm.
3248 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3251 return wxWCharBuffer();
// first attempt: the libc converter
3253 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
// second attempt: interpret the input as UTF-8
3255 wbuf
= wxMBConvUTF8().cMB2WX(s
);
// last attempt: ISO-8859-1
3257 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Converts the wide character string ws to a multibyte buffer using
// wxConvLibc first and then a UTF-8 converter constructed with the
// MAP_INVALID_UTF8_TO_OCTAL option.
//
// An empty wxCharBuffer is returned early in one case.
// NOTE(review): the early return is presumably for a NULL ws, and the UTF-8
// converter is presumably used only if the libc conversion produced no
// result -- confirm.
3262 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3265 return wxCharBuffer();
// first attempt: the libc converter
3267 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
// second attempt: UTF-8
3269 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3274 #endif // wxUSE_UNICODE
3276 // ----------------------------------------------------------------------------
3278 // ----------------------------------------------------------------------------
3280 // NB: The reason why we create converter objects in this convoluted way,
3281 // using a factory function instead of a global variable, is that they
3282 // may be used at static initialization time (some of them are used by
3283 // wxString ctors and there may be a global wxString object). In other
3284 // words, possibly _before_ the converter global object would be initialized.
3291 #undef wxConvISO8859_1
// Defines a global converter "name" of class "klass" implemented by
// "impl_klass":
//  - the exported pointer variable name##Ptr, initialized to NULL
//  - the exported accessor wxGet_##name##Ptr(), which returns the address of
//    a function-local static impl_klass object constructed with ctor_args
//  - a file-static gs_##name##instance initialized by calling the accessor
//    (see the comment inside the macro for why)
3293 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3294 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3295 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3297 static impl_klass name##Obj ctor_args; \
3298 return &name##Obj; \
3300 /* this ensures that all global converter objects are created */ \
3301 /* by the time static initialization is done, i.e. before any */ \
3302 /* thread is launched: */ \
3303 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for WX_DEFINE_GLOBAL_CONV2 when the implementation class is the
// same as the class of the exported pointer.
3305 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3306 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// Definition of the global wxConvLibc converter, exported as a wxMBConv.
// It appears twice with different implementation classes (wxMBConv_win32
// and wxMBConvLibc).
// NOTE(review): the two definitions are presumably the alternative branches
// of a preprocessor conditional -- confirm.
3309 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3311 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3314 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3315 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3316 // provokes an error message about "not enough macro parameters"; and we
3317 // can't use "()" here as the name##Obj declaration would be parsed as a
3318 // function declaration then, so use a semicolon and live with an extra
3319 // empty statement (and hope that no compiler warns about this)
// UTF-8 and UTF-7 converters: default-constructed, hence the ";" passed as
// the constructor arguments
3320 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3321 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
// wxCSConv-based converters for the system encoding and for ISO-8859-1
3323 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3324 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// wxConvCurrent initially points to the libc converter and wxConvUI to the
// system encoding converter; both are obtained through the accessor
// functions generated by the macros above
3326 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3327 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3330 // The xnu kernel always communicates file paths in decomposed UTF-8.
3331 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3332 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// Converter used for file names: outside of Darwin it is the libc converter.
// NOTE(review): the Darwin branch presumably uses wxConvMacUTF8DObj defined
// above -- confirm.
3335 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3338 #else // !__DARWIN__
3339 wxGet_wxConvLibcPtr();
3340 #endif // __DARWIN__/!__DARWIN__
3342 #else // !wxUSE_WCHAR_T
3344 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3345 // stand-ins used in the absence of wchar_t support: these are declared as
// plain wxMBConv objects
3346 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3351 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T