1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
// Helper of cMB2WC(): reports whether the n bytes starting at p are NOT all
// NUL.
//
//  p - start of the byte range to inspect
//  n - number of bytes to inspect
//
// Returns true as soon as a non-NUL byte is met, false if all n bytes are NUL
// (and therefore also false for n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    // consume leading NUL bytes; the count only reaches 0 if every byte in the
    // range was NUL
    while ( n && *p++ == '\0' )
        n--;

    return n != 0;
}
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
// Encodes the code point `input` as UTF-16, storing the resulting 16-bit
// unit(s) through `output`.
//
// Two store patterns are visible below: a single unit holding `input` as is,
// and a surrogate pair (high unit first, low unit second). Values from
// 0x110000 up are singled out by their own branch.
// NOTE(review): the branch conditions' bodies and the return statements are
// not visible in this excerpt -- confirm the returned unit count / error value
// against the complete file.
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
// single-unit case: the value is stored unchanged, truncated to 16 bits
91 *output
= (wxUint16
) input
;
// anything at or above 0x110000 is handled separately
95 else if (input
>= 0x110000)
// high surrogate: (input >> 10) + 0xd7c0, which equals
// 0xd800 + ((input - 0x10000) >> 10)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
// low surrogate: the 10 least significant bits offset by 0xdc00
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
// Decodes one code point from the UTF-16 units at `input` into `output`.
//
// Visible logic: a first unit outside 0xd800..0xdfff is handled by the first
// branch; otherwise the second unit must lie in 0xdc00..0xdfff or
// wxCONV_FAILED is returned; otherwise the two units are combined into one
// code point.
// NOTE(review): the first test does not tell high (0xd800..0xdbff) from low
// (0xdc00..0xdfff) surrogates, so a leading low surrogate followed by another
// low surrogate reaches the combining expression -- confirm whether that is
// intended.
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
// not a surrogate at all
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
// surrogate not followed by a low surrogate: input[1] is read here, so the
// caller must make at least two units readable whenever *input is a surrogate
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
// inverse of the encoder: (hi - 0xd7c0) << 10 restores the upper bits
// (including the 0x10000 offset), lo - 0xdc00 the lower 10 bits
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten
= 0;
172 // the number of NULs terminating this string
173 size_t nulLen
= 0; // not really needed, but just to avoid warnings
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
181 if ( srcLen
!= wxNO_LEN
)
183 // we need to know how to find the end of this string
184 nulLen
= GetMBNulLen();
185 if ( nulLen
== wxCONV_FAILED
)
186 return wxCONV_FAILED
;
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
193 char * const p
= bufTmp
.data();
194 memcpy(p
, src
, srcLen
);
195 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
201 srcEnd
= src
+ srcLen
;
203 else // quit after the first loop iteration
210 // try to convert the current chunk
211 size_t lenChunk
= MB2WC(NULL
, src
, 0);
212 if ( lenChunk
== wxCONV_FAILED
)
213 return wxCONV_FAILED
;
215 dstWritten
+= lenChunk
;
221 // nothing left in the input string, conversion succeeded
227 if ( dstWritten
> dstLen
)
228 return wxCONV_FAILED
;
230 // +1 is for trailing NUL
231 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
232 return wxCONV_FAILED
;
241 // we convert just one chunk in this case as this is the entire
246 // advance the input pointer past the end of this chunk
247 while ( NotAllNULs(src
, nulLen
) )
249 // notice that we must skip over multiple bytes here as we suppose
250 // that if NUL takes 2 or 4 bytes, then all the other characters do
251 // too and so if advanced by a single byte we might erroneously
252 // detect sequences of NUL bytes in the middle of the input
256 src
+= nulLen
; // skipping over its terminator as well
258 // note that ">=" (and not just "==") is needed here as the terminator
259 // we skipped just above could be inside or just after the buffer
260 // delimited by srcEnd
269 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
270 const wchar_t *src
, size_t srcLen
) const
272 // the number of chars [which would be] written to dst [if it were not NULL]
273 size_t dstWritten
= 0;
275 // if we don't know its length we have no choice but to assume that it is
276 // NUL-terminated (notice that it can still be NUL-terminated even if
277 // explicit length is given but it doesn't change our return value)
278 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
280 // make a copy of the input string unless it is already properly
282 wxWCharBuffer bufTmp
;
283 if ( isNulTerminated
)
285 srcLen
= wxWcslen(src
) + 1;
287 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
289 // make a copy in order to properly NUL-terminate the string
290 bufTmp
= wxWCharBuffer(srcLen
);
291 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
295 const size_t lenNul
= GetMBNulLen();
296 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
298 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
300 // try to convert the current chunk
301 size_t lenChunk
= WC2MB(NULL
, src
, 0);
303 if ( lenChunk
== wxCONV_FAILED
)
304 return wxCONV_FAILED
;
306 dstWritten
+= lenChunk
;
307 if ( isNulTerminated
)
308 dstWritten
+= lenNul
;
312 if ( dstWritten
> dstLen
)
313 return wxCONV_FAILED
;
315 if ( WC2MB(dst
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
316 return wxCONV_FAILED
;
319 if ( isNulTerminated
)
327 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
329 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
330 if ( rc
!= wxCONV_FAILED
)
332 // ToWChar() returns the buffer length, i.e. including the trailing
333 // NUL, while this method doesn't take it into account
340 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
342 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
343 if ( rc
!= wxCONV_FAILED
)
351 wxMBConv::~wxMBConv()
353 // nothing to do here (necessary for Darwin linking probably)
356 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
360 // calculate the length of the buffer needed first
361 const size_t nLen
= ToWChar(NULL
, 0, psz
);
362 if ( nLen
!= wxCONV_FAILED
)
364 // now do the actual conversion
365 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
367 // +1 for the trailing NULL
368 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
373 return wxWCharBuffer();
376 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
380 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
381 if ( nLen
!= wxCONV_FAILED
)
383 wxCharBuffer
buf(nLen
- 1);
384 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
389 return wxCharBuffer();
393 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
395 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
396 if ( dstLen
!= wxCONV_FAILED
)
398 // notice that we allocate space for dstLen+1 wide characters here
399 // because we want the buffer to always be NUL-terminated, even if the
400 // input isn't (as otherwise the caller has no way to know its length)
401 wxWCharBuffer
wbuf(dstLen
);
402 wbuf
.data()[dstLen
] = L
'\0';
403 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
409 // we also need to handle NUL-terminated input strings
410 // specially: for them the output is the length of the string
411 // excluding the trailing NUL, however if we're asked to
412 // convert a specific number of characters we return the length
413 // of the resulting output even if it's NUL-terminated
414 if ( inLen
== wxNO_LEN
)
425 return wxWCharBuffer();
429 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
431 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
432 if ( dstLen
!= wxCONV_FAILED
)
434 const size_t nulLen
= GetMBNulLen();
436 // as above, ensure that the buffer is always NUL-terminated, even if
438 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
439 memset(buf
.data() + dstLen
, 0, nulLen
);
440 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
446 if ( inLen
== wxNO_LEN
)
448 // in this case both input and output are NUL-terminated
449 // and we're not supposed to count NUL
461 return wxCharBuffer();
464 // ----------------------------------------------------------------------------
466 // ----------------------------------------------------------------------------
468 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
470 return wxMB2WC(buf
, psz
, n
);
473 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
475 return wxWC2MB(buf
, psz
, n
);
478 // ----------------------------------------------------------------------------
479 // wxConvBrokenFileNames
480 // ----------------------------------------------------------------------------
484 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
486 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
487 wxStricmp(charset
, _T("UTF8")) == 0 )
488 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
490 m_conv
= new wxCSConv(charset
);
495 // ----------------------------------------------------------------------------
497 // ----------------------------------------------------------------------------
499 // Implementation (C) 2004 Fredrik Roubert
501 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
// BASE64 decoding table: indexed by the input byte, yields the 6 bit value of
// a BASE64 digit ('A'-'Z' -> 0x00-0x19, 'a'-'z' -> 0x1a-0x33,
// '0'-'9' -> 0x34-0x3d, '+' -> 0x3e, '/' -> 0x3f) and 0xff for every byte
// which is not a BASE64 digit.
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x00..0x07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x08..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x10..0x17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x18..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x20..0x27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 0x28..0x2f: '+', '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // 0x30..0x37: '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x38..0x3f: '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 0x40..0x47: 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 0x48..0x4f: 'H'..'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 0x50..0x57: 'P'..'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x58..0x5f: 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 0x60..0x67: 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 0x68..0x6f: 'h'..'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 0x70..0x77: 'p'..'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x78..0x7f: 'x'..'z'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x80..0x87
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x88..0x8f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x90..0x97
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x98..0x9f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa0..0xa7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa8..0xaf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb0..0xb7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb8..0xbf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc0..0xc7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc8..0xcf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd0..0xd7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd8..0xdf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe0..0xe7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe8..0xef
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xf0..0xf7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // 0xf8..0xff
};
542 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
543 const char *src
, size_t srcLen
) const
545 DecoderState stateOrig
,
547 if ( srcLen
== wxNO_LEN
)
549 // convert the entire string, up to and including the trailing NUL
550 srcLen
= strlen(src
) + 1;
552 // when working on the entire strings we don't update nor use the shift
553 // state from the previous call
554 statePtr
= &stateOrig
;
556 else // when working with partial strings we do use the shift state
558 statePtr
= wx_const_cast(DecoderState
*, &m_stateDecoder
);
560 // also save the old state to be able to rollback to it on error
561 stateOrig
= m_stateDecoder
;
564 // but to simplify the code below we use this variable in both cases
565 DecoderState
& state
= *statePtr
;
568 // number of characters [which would have been] written to dst [if it were
572 const char * const srcEnd
= src
+ srcLen
;
574 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
576 const unsigned char cc
= *src
++;
578 if ( state
.IsShifted() )
580 const unsigned char dc
= utf7unb64
[cc
];
583 // end of encoded part, check that nothing was left: the bit
584 // field cycles through 0,6,4,2 sequence so check that we're at
586 if ( state
.bit
!= 2 )
587 return wxCONV_FAILED
;
591 // re-parse this character normally below unless it's '-' which
592 // is consumed by the decoder
596 else // valid encoded character
598 // mini base64 decoder: each character is 6 bits
603 if ( state
.bit
>= 8 )
605 // got the full byte, consume it
607 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
611 // we've got the full word, output it
613 *dst
++ = (state
.msb
<< 8) | b
;
619 // just store it while we wait for LSB
627 if ( state
.IsDirect() )
629 // start of an encoded segment?
634 // just the encoded plus sign, don't switch to shifted mode
647 // only printable 7 bit ASCII characters (with the exception of
648 // NUL, TAB, CR and LF) can be used directly
649 if ( cc
>= 0x7f || (cc
< ' ' &&
650 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
651 return wxCONV_FAILED
;
662 // as we didn't read any characters we should be called with the same
663 // data (followed by some more new data) again later so don't save our
667 return wxCONV_FAILED
;
// BASE64 encoding table: maps a 6 bit value (0..63) to the character used to
// represent it, i.e. 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/' in this order.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
// UTF-7 encoding table, indexed by the (7 bit) character code; each entry is
// the class of the character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 0x70..0x7f
};

// Returns true if wc belongs to Set D, i.e. is written to UTF-7 output as is
// (table class 0); every other character, including all codes from 0x80 up,
// yields false.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on many platforms: widen to an unsigned type
    // before the range check so that a negative value is rejected instead of
    // being used as a negative index into the table
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
713 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
714 const wchar_t *src
, size_t srcLen
) const
716 EncoderState stateOrig
,
718 if ( srcLen
== wxNO_LEN
)
720 // we don't apply the stored state when operating on entire strings at
722 statePtr
= &stateOrig
;
724 srcLen
= wxWcslen(src
) + 1;
726 else // do use the mode we left the output in previously
728 stateOrig
= m_stateEncoder
;
729 statePtr
= wx_const_cast(EncoderState
*, &m_stateEncoder
);
732 EncoderState
& state
= *statePtr
;
737 const wchar_t * const srcEnd
= src
+ srcLen
;
738 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
741 if ( wxIsUTF7Direct(cc
) )
743 if ( state
.IsShifted() )
745 // pad with zeros the last encoded block if necessary
749 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
764 else if ( cc
== '+' && state
.IsDirect() )
775 else if (((wxUint32
)cc
) > 0xffff)
777 // no surrogate pair generation (yet?)
778 return wxCONV_FAILED
;
783 if ( state
.IsDirect() )
792 // BASE64 encode string
795 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
798 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
800 for (state
.bit
+= 8; state
.bit
>= 6; )
804 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
809 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
817 // we need to restore the original encoder state if we were called just to
818 // calculate the amount of space needed as we will presumably be called
819 // again to really convert the data now
826 // ----------------------------------------------------------------------------
828 // ----------------------------------------------------------------------------
830 static const wxUint32 utf8_max
[]=
831 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
833 // boundaries of the private use area we use to (temporarily) remap
834 // bytes that are invalid in a UTF-8 encoded string
835 const wxUint32 wxUnicodePUA
= 0x100000;
836 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
// this table gives the length of the UTF-8 encoding from its first character:
// the entry is the total number of bytes in the sequence started by this lead
// byte or 0 if the byte can't start a sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
    0, 0,                                           // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                  // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F5..FF
};
874 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
875 const char *src
, size_t srcLen
) const
877 wchar_t *out
= dstLen
? dst
: NULL
;
880 if ( srcLen
== wxNO_LEN
)
881 srcLen
= strlen(src
) + 1;
883 for ( const char *p
= src
; ; p
++ )
885 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
887 // all done successfully, just add the trailing NULL if we are not
888 // using explicit length
889 if ( srcLen
== wxNO_LEN
)
905 if ( out
&& !dstLen
-- )
909 unsigned char c
= *p
;
913 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
916 if ( srcLen
!= wxNO_LEN
)
923 unsigned len
= tableUtf8Lengths
[c
];
927 if ( srcLen
< len
) // the test works for wxNO_LEN too
930 if ( srcLen
!= wxNO_LEN
)
933 // Char. number range | UTF-8 octet sequence
934 // (hexadecimal) | (binary)
935 // ----------------------+----------------------------------------
936 // 0000 0000 - 0000 007F | 0xxxxxxx
937 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
938 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
939 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
941 // Code point value is stored in bits marked with 'x',
942 // lowest-order bit of the value on the right side in the diagram
943 // above. (from RFC 3629)
945 // mask to extract lead byte's value ('x' bits above), by sequence
947 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
949 // mask and value of lead byte's most significant bits, by length:
950 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
951 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
953 len
--; // it's more convenient to work with 0-based length here
955 // extract the lead byte's value bits:
956 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
959 code
= c
& leadValueMask
[len
];
961 // all remaining bytes, if any, are handled in the same way
962 // regardless of sequence's length:
966 if ( (c
& 0xC0) != 0x80 )
967 return wxCONV_FAILED
;
975 // cast is ok because wchar_t == wxUint16 if WC_UTF16
976 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
985 #endif // WC_UTF16/!WC_UTF16
993 return wxCONV_FAILED
;
// Strict UTF-8 encoder: walks the wide string `src` (cursor `wp`) and stores
// the UTF-8 bytes of each code point through `out`.
//
//  dst, dstLen - output buffer and its size; `out` is NULL when dstLen is 0,
//                otherwise it starts at dst
//  src, srcLen - input; when srcLen is wxNO_LEN the loop test looks at *wp,
//                otherwise at the remaining count in srcLen
//
// The statement closing the function returns wxCONV_FAILED; according to the
// comment next to it, it is reached only when an error occurs.
997 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
998 const wchar_t *src
, size_t srcLen
) const
1000 char *out
= dstLen
? dst
: NULL
;
1003 for ( const wchar_t *wp
= src
; ; wp
++ )
// end of input: a NUL character for wxNO_LEN, an exhausted count otherwise
1005 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1007 // all done successfully, just add the trailing NULL if we are not
1008 // using explicit length
1009 if ( srcLen
== wxNO_LEN
)
1025 if ( srcLen
!= wxNO_LEN
)
1030 // cast is ok for WC_UTF16
1031 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1033 // skip the next char too as we decoded a surrogate
1036 #else // wchar_t is UTF-32
// the top bit of the 32 bit value is discarded
1037 code
= *wp
& 0x7fffffff;
// single byte form: the code point is stored as is
1049 out
[0] = (char)code
;
// two byte form: lead byte 0xC0 | high bits, then one 10xxxxxx byte
1052 else if ( code
<= 0x07FF )
1060 // NB: this line takes 6 least significant bits, encodes them as
1061 // 10xxxxxx and discards them so that the next byte can be encoded:
1062 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1063 out
[0] = 0xC0 | code
;
// three byte form: lead byte 0xE0 | high bits, then two 10xxxxxx bytes
// NOTE(review): the test below is '<', not '<=', so U+FFFF itself is not
// handled here and falls through to the "<= 0x10FFFF" four byte branch, whereas
// the neighbouring range tests all use '<='; "<= 0xFFFF" looks intended --
// confirm and fix
1066 else if ( code
< 0xFFFF )
1074 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1075 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1076 out
[0] = 0xE0 | code
;
// four byte form: lead byte 0xF0 | high bits, then three 10xxxxxx bytes
1079 else if ( code
<= 0x10FFFF )
1087 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1088 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1089 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1090 out
[0] = 0xF0 | code
;
// anything above U+10FFFF is reported as a programming error
1095 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1108 // we only get here if an error occurs during decoding
1109 return wxCONV_FAILED
;
1112 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1113 const char *psz
, size_t srcLen
) const
1115 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1116 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1120 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1122 const char *opsz
= psz
;
1123 bool invalid
= false;
1124 unsigned char cc
= *psz
++, fc
= cc
;
1126 for (cnt
= 0; fc
& 0x80; cnt
++)
1136 // escape the escape character for octal escapes
1137 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1138 && cc
== '\\' && (!buf
|| len
< n
))
1150 // invalid UTF-8 sequence
1155 unsigned ocnt
= cnt
- 1;
1156 wxUint32 res
= cc
& (0x3f >> cnt
);
1160 if ((cc
& 0xC0) != 0x80)
1162 // invalid UTF-8 sequence
1168 res
= (res
<< 6) | (cc
& 0x3f);
1171 if (invalid
|| res
<= utf8_max
[ocnt
])
1173 // illegal UTF-8 encoding
1176 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1177 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1179 // if one of our PUA characters turns up externally
1180 // it must also be treated as an illegal sequence
1181 // (a bit like you have to escape an escape character)
1187 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1188 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1189 if (pa
== wxCONV_FAILED
)
1201 *buf
++ = (wchar_t)res
;
1203 #endif // WC_UTF16/!WC_UTF16
1209 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1211 while (opsz
< psz
&& (!buf
|| len
< n
))
1214 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1215 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1216 wxASSERT(pa
!= wxCONV_FAILED
);
1223 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1229 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1231 while (opsz
< psz
&& (!buf
|| len
< n
))
1233 if ( buf
&& len
+ 3 < n
)
1235 unsigned char on
= *opsz
;
1237 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1238 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1239 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1246 else // MAP_INVALID_UTF8_NOT
1248 return wxCONV_FAILED
;
1254 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
// Returns true if wch is one of the octal digits L'0'..L'7', false otherwise.
static inline bool isoctal(wchar_t wch)
{
    return L'0' <= wch && wch <= L'7';
}
1265 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1266 const wchar_t *psz
, size_t srcLen
) const
1268 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1269 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1273 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1278 // cast is ok for WC_UTF16
1279 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1280 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1282 cc
= (*psz
++) & 0x7fffffff;
1285 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1286 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1289 *buf
++ = (char)(cc
- wxUnicodePUA
);
1292 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1293 && cc
== L
'\\' && psz
[0] == L
'\\' )
1300 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1302 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1306 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1307 (psz
[1] - L
'0') * 010 +
1317 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1333 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1335 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1341 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1347 // ============================================================================
1349 // ============================================================================
1351 #ifdef WORDS_BIGENDIAN
1352 #define wxMBConvUTF16straight wxMBConvUTF16BE
1353 #define wxMBConvUTF16swap wxMBConvUTF16LE
1355 #define wxMBConvUTF16swap wxMBConvUTF16BE
1356 #define wxMBConvUTF16straight wxMBConvUTF16LE
1360 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1362 if ( srcLen
== wxNO_LEN
)
1364 // count the number of bytes in input, including the trailing NULs
1365 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1366 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1369 srcLen
*= BYTES_PER_CHAR
;
1371 else // we already have the length
1373 // we can only convert an entire number of UTF-16 characters
1374 if ( srcLen
% BYTES_PER_CHAR
)
1375 return wxCONV_FAILED
;
1381 // case when in-memory representation is UTF-16 too
1384 // ----------------------------------------------------------------------------
1385 // conversions without endianness change
1386 // ----------------------------------------------------------------------------
1389 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1390 const char *src
, size_t srcLen
) const
1392 // set up the scene for using memcpy() (which is presumably more efficient
1393 // than copying the bytes one by one)
1394 srcLen
= GetLength(src
, srcLen
);
1395 if ( srcLen
== wxNO_LEN
)
1396 return wxCONV_FAILED
;
1398 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1401 if ( dstLen
< inLen
)
1402 return wxCONV_FAILED
;
1404 memcpy(dst
, src
, srcLen
);
1411 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1412 const wchar_t *src
, size_t srcLen
) const
1414 if ( srcLen
== wxNO_LEN
)
1415 srcLen
= wxWcslen(src
) + 1;
1417 srcLen
*= BYTES_PER_CHAR
;
1421 if ( dstLen
< srcLen
)
1422 return wxCONV_FAILED
;
1424 memcpy(dst
, src
, srcLen
);
1430 // ----------------------------------------------------------------------------
1431 // endian-reversing conversions
1432 // ----------------------------------------------------------------------------
1435 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1436 const char *src
, size_t srcLen
) const
1438 srcLen
= GetLength(src
, srcLen
);
1439 if ( srcLen
== wxNO_LEN
)
1440 return wxCONV_FAILED
;
1442 srcLen
/= BYTES_PER_CHAR
;
1446 if ( dstLen
< srcLen
)
1447 return wxCONV_FAILED
;
1449 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1450 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1452 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1460 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1461 const wchar_t *src
, size_t srcLen
) const
1463 if ( srcLen
== wxNO_LEN
)
1464 srcLen
= wxWcslen(src
) + 1;
1466 srcLen
*= BYTES_PER_CHAR
;
1470 if ( dstLen
< srcLen
)
1471 return wxCONV_FAILED
;
1473 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1474 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1476 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1483 #else // !WC_UTF16: wchar_t is UTF-32
1485 // ----------------------------------------------------------------------------
1486 // conversions without endianness change
1487 // ----------------------------------------------------------------------------
1490 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1491 const char *src
, size_t srcLen
) const
1493 srcLen
= GetLength(src
, srcLen
);
1494 if ( srcLen
== wxNO_LEN
)
1495 return wxCONV_FAILED
;
1497 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1500 // optimization: return maximal space which could be needed for this
1501 // string even if the real size could be smaller if the buffer contains
1507 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1508 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1510 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1512 return wxCONV_FAILED
;
1514 if ( ++outLen
> dstLen
)
1515 return wxCONV_FAILED
;
1525 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1526 const wchar_t *src
, size_t srcLen
) const
1528 if ( srcLen
== wxNO_LEN
)
1529 srcLen
= wxWcslen(src
) + 1;
1532 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1533 for ( size_t n
= 0; n
< srcLen
; n
++ )
1536 const size_t numChars
= encode_utf16(*src
++, cc
);
1537 if ( numChars
== wxCONV_FAILED
)
1538 return wxCONV_FAILED
;
1540 outLen
+= numChars
* BYTES_PER_CHAR
;
1543 if ( outLen
> dstLen
)
1544 return wxCONV_FAILED
;
1547 if ( numChars
== 2 )
1549 // second character of a surrogate
1558 // ----------------------------------------------------------------------------
1559 // endian-reversing conversions
1560 // ----------------------------------------------------------------------------
1563 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1564 const char *src
, size_t srcLen
) const
1566 srcLen
= GetLength(src
, srcLen
);
1567 if ( srcLen
== wxNO_LEN
)
1568 return wxCONV_FAILED
;
1570 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1573 // optimization: return maximal space which could be needed for this
1574 // string even if the real size could be smaller if the buffer contains
1580 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1581 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1586 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1588 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1590 const size_t numChars
= decode_utf16(tmp
, ch
);
1591 if ( numChars
== wxCONV_FAILED
)
1592 return wxCONV_FAILED
;
1594 if ( numChars
== 2 )
1597 if ( ++outLen
> dstLen
)
1598 return wxCONV_FAILED
;
1608 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1609 const wchar_t *src
, size_t srcLen
) const
1611 if ( srcLen
== wxNO_LEN
)
1612 srcLen
= wxWcslen(src
) + 1;
1615 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1616 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1619 const size_t numChars
= encode_utf16(*src
, cc
);
1620 if ( numChars
== wxCONV_FAILED
)
1621 return wxCONV_FAILED
;
1623 outLen
+= numChars
* BYTES_PER_CHAR
;
1626 if ( outLen
> dstLen
)
1627 return wxCONV_FAILED
;
1629 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1630 if ( numChars
== 2 )
1632 // second character of a surrogate
1633 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1641 #endif // WC_UTF16/!WC_UTF16
1644 // ============================================================================
1646 // ============================================================================
1648 #ifdef WORDS_BIGENDIAN
1649 #define wxMBConvUTF32straight wxMBConvUTF32BE
1650 #define wxMBConvUTF32swap wxMBConvUTF32LE
1652 #define wxMBConvUTF32swap wxMBConvUTF32BE
1653 #define wxMBConvUTF32straight wxMBConvUTF32LE
1657 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1658 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1661 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1663 if ( srcLen
== wxNO_LEN
)
1665 // count the number of bytes in input, including the trailing NULs
1666 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1667 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1670 srcLen
*= BYTES_PER_CHAR
;
1672 else // we already have the length
1674 // we can only convert an entire number of UTF-32 characters
1675 if ( srcLen
% BYTES_PER_CHAR
)
1676 return wxCONV_FAILED
;
1682 // case when in-memory representation is UTF-16
1685 // ----------------------------------------------------------------------------
1686 // conversions without endianness change
1687 // ----------------------------------------------------------------------------
1690 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1691 const char *src
, size_t srcLen
) const
1693 srcLen
= GetLength(src
, srcLen
);
1694 if ( srcLen
== wxNO_LEN
)
1695 return wxCONV_FAILED
;
1697 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1698 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1700 for ( size_t n
= 0; n
< inLen
; n
++ )
1703 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1704 if ( numChars
== wxCONV_FAILED
)
1705 return wxCONV_FAILED
;
1710 if ( outLen
> dstLen
)
1711 return wxCONV_FAILED
;
1714 if ( numChars
== 2 )
1716 // second character of a surrogate
1726 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1727 const wchar_t *src
, size_t srcLen
) const
1729 if ( srcLen
== wxNO_LEN
)
1730 srcLen
= wxWcslen(src
) + 1;
1734 // optimization: return maximal space which could be needed for this
1735 // string instead of the exact amount which could be less if there are
1736 // any surrogates in the input
1738 // we consider that surrogates are rare enough to make it worthwhile to
1739 // avoid running the loop below at the cost of slightly extra memory
1741 return srcLen
* BYTES_PER_CHAR
;
1744 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1746 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1748 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1750 return wxCONV_FAILED
;
1752 outLen
+= BYTES_PER_CHAR
;
1754 if ( outLen
> dstLen
)
1755 return wxCONV_FAILED
;
1763 // ----------------------------------------------------------------------------
1764 // endian-reversing conversions
1765 // ----------------------------------------------------------------------------
1768 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1769 const char *src
, size_t srcLen
) const
1771 srcLen
= GetLength(src
, srcLen
);
1772 if ( srcLen
== wxNO_LEN
)
1773 return wxCONV_FAILED
;
1775 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1776 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1778 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1781 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1782 if ( numChars
== wxCONV_FAILED
)
1783 return wxCONV_FAILED
;
1788 if ( outLen
> dstLen
)
1789 return wxCONV_FAILED
;
1792 if ( numChars
== 2 )
1794 // second character of a surrogate
1804 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1805 const wchar_t *src
, size_t srcLen
) const
1807 if ( srcLen
== wxNO_LEN
)
1808 srcLen
= wxWcslen(src
) + 1;
1812 // optimization: return maximal space which could be needed for this
1813 // string instead of the exact amount which could be less if there are
1814 // any surrogates in the input
1816 // we consider that surrogates are rare enough to make it worthwhile to
1817 // avoid running the loop below at the cost of slightly extra memory
1819 return srcLen
*BYTES_PER_CHAR
;
1822 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1824 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1826 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1828 return wxCONV_FAILED
;
1830 outLen
+= BYTES_PER_CHAR
;
1832 if ( outLen
> dstLen
)
1833 return wxCONV_FAILED
;
1835 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1841 #else // !WC_UTF16: wchar_t is UTF-32
1843 // ----------------------------------------------------------------------------
1844 // conversions without endianness change
1845 // ----------------------------------------------------------------------------
1848 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1849 const char *src
, size_t srcLen
) const
1851 // use memcpy() as it should be much faster than hand-written loop
1852 srcLen
= GetLength(src
, srcLen
);
1853 if ( srcLen
== wxNO_LEN
)
1854 return wxCONV_FAILED
;
1856 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1859 if ( dstLen
< inLen
)
1860 return wxCONV_FAILED
;
1862 memcpy(dst
, src
, srcLen
);
1869 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1870 const wchar_t *src
, size_t srcLen
) const
1872 if ( srcLen
== wxNO_LEN
)
1873 srcLen
= wxWcslen(src
) + 1;
1875 srcLen
*= BYTES_PER_CHAR
;
1879 if ( dstLen
< srcLen
)
1880 return wxCONV_FAILED
;
1882 memcpy(dst
, src
, srcLen
);
1888 // ----------------------------------------------------------------------------
1889 // endian-reversing conversions
1890 // ----------------------------------------------------------------------------
1893 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1894 const char *src
, size_t srcLen
) const
1896 srcLen
= GetLength(src
, srcLen
);
1897 if ( srcLen
== wxNO_LEN
)
1898 return wxCONV_FAILED
;
1900 srcLen
/= BYTES_PER_CHAR
;
1904 if ( dstLen
< srcLen
)
1905 return wxCONV_FAILED
;
1907 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1908 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1910 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1918 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1919 const wchar_t *src
, size_t srcLen
) const
1921 if ( srcLen
== wxNO_LEN
)
1922 srcLen
= wxWcslen(src
) + 1;
1924 srcLen
*= BYTES_PER_CHAR
;
1928 if ( dstLen
< srcLen
)
1929 return wxCONV_FAILED
;
1931 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1932 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1934 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1941 #endif // WC_UTF16/!WC_UTF16
1944 // ============================================================================
1945 // The classes doing conversion using the iconv_xxx() functions
1946 // ============================================================================
1950 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1951 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1952 // (unless there's yet another bug in glibc) the only case when iconv()
1953 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1954 // left in the input buffer -- when _real_ error occurs,
1955 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1957 // [This bug does not appear in glibc 2.2.]
1958 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1959 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1960 (errno != E2BIG || bufLeft != 0))
1962 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1965 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1967 #define ICONV_T_INVALID ((iconv_t)-1)
1969 #if SIZEOF_WCHAR_T == 4
1970 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1971 #define WC_ENC wxFONTENCODING_UTF32
1972 #elif SIZEOF_WCHAR_T == 2
1973 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1974 #define WC_ENC wxFONTENCODING_UTF16
1975 #else // sizeof(wchar_t) != 2 nor 4
1976 // does this ever happen?
1977 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1980 // ----------------------------------------------------------------------------
1981 // wxMBConv_iconv: encapsulates an iconv character set
1982 // ----------------------------------------------------------------------------
1984 class wxMBConv_iconv
: public wxMBConv
1987 wxMBConv_iconv(const char *name
);
1988 virtual ~wxMBConv_iconv();
1990 // implement base class virtual methods
1991 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
1992 const char *src
, size_t srcLen
= wxNO_LEN
) const;
1993 virtual size_t FromWChar(char *dst
, size_t dstLen
,
1994 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
1995 virtual size_t GetMBNulLen() const;
1997 #if wxUSE_UNICODE_UTF8
1998 virtual bool IsUTF8() const;
2001 virtual wxMBConv
*Clone() const
2003 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
2004 p
->m_minMBCharWidth
= m_minMBCharWidth
;
2009 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2012 // the iconv handlers used to translate from multibyte
2013 // to wide char and in the other direction
2018 // guards access to m2w and w2m objects
2019 wxMutex m_iconvMutex
;
2023 // the name (for iconv_open()) of a wide char charset -- if none is
2024 // available on this machine, it will remain empty
2025 static wxString ms_wcCharsetName
;
2027 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2028 // different endian-ness than the native one
2029 static bool ms_wcNeedsSwap
;
2032 // name of the encoding handled by this conversion
2035 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2037 size_t m_minMBCharWidth
;
2040 // make the constructor available for unit testing
2041 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2043 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2044 if ( !result
->IsOk() )
2053 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2054 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
2056 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2059 m_minMBCharWidth
= 0;
2061 // check for charset that represents wchar_t:
2062 if ( ms_wcCharsetName
.empty() )
2064 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
2067 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2068 #else // !wxUSE_FONTMAP
2069 static const wxChar
*names_static
[] =
// choose the fallback wide char charset name(s) according to the size of
// wchar_t; preprocessor conditions must use "==" as "=" is not a valid
// operator in #if/#elif expressions
2071 #if SIZEOF_WCHAR_T == 4
2073 #elif SIZEOF_WCHAR_T == 2
2078 const wxChar
**names
= names_static
;
2079 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2081 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2083 const wxString
nameCS(*names
);
2085 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2086 wxString
nameXE(nameCS
);
2088 #ifdef WORDS_BIGENDIAN
2090 #else // little endian
2094 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2097 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2098 if ( m2w
== ICONV_T_INVALID
)
2100 // try charset w/o bytesex info (e.g. "UCS4")
2101 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2103 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2105 // and check for bytesex ourselves:
2106 if ( m2w
!= ICONV_T_INVALID
)
2108 char buf
[2], *bufPtr
;
2117 outsz
= SIZEOF_WCHAR_T
* 2;
2118 char* wbufPtr
= (char*)wbuf
;
2122 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2125 if (ICONV_FAILED(res
, insz
))
2127 wxLogLastError(wxT("iconv"));
2128 wxLogError(_("Conversion to charset '%s' doesn't work."),
2131 else // ok, can convert to this encoding, remember it
2133 ms_wcCharsetName
= nameCS
;
2134 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2138 else // use charset not requiring byte swapping
2140 ms_wcCharsetName
= nameXE
;
2144 wxLogTrace(TRACE_STRCONV
,
2145 wxT("iconv wchar_t charset is \"%s\"%s"),
2146 ms_wcCharsetName
.empty() ? wxString("<none>")
2148 ms_wcNeedsSwap
? _T(" (needs swap)")
2151 else // we already have ms_wcCharsetName
2153 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2156 if ( ms_wcCharsetName
.empty() )
2158 w2m
= ICONV_T_INVALID
;
2162 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2163 if ( w2m
== ICONV_T_INVALID
)
2165 wxLogTrace(TRACE_STRCONV
,
2166 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2167 ms_wcCharsetName
.c_str(), name
);
2172 wxMBConv_iconv::~wxMBConv_iconv()
2174 if ( m2w
!= ICONV_T_INVALID
)
2176 if ( w2m
!= ICONV_T_INVALID
)
2181 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2182 const char *src
, size_t srcLen
) const
2184 if ( srcLen
== wxNO_LEN
)
2186 // find the string length: notice that this must be done differently for
2187 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2189 const size_t nulLen
= GetMBNulLen();
2193 return wxCONV_FAILED
;
2196 srcLen
= strlen(src
); // arguably more optimized than our version
2201 // for UTF-16/32 not only do we need to have 2/4 consecutive NULs
2202 // but they also have to start at character boundary and not
2203 // span two adjacent characters
2205 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2211 // when we're determining the length of the string ourselves we count
2212 // the terminating NUL(s) as part of it and always NUL-terminate the
2217 // we express length in the number of (wide) characters but iconv always
2218 // counts buffer sizes in bytes
2219 dstLen
*= SIZEOF_WCHAR_T
;
2222 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2223 // Unfortunately there are a couple of global wxCSConv objects such as
2224 // wxConvLocal that are used all over wx code, so we have to make sure
2225 // the handle is used by at most one thread at a time. Otherwise
2226 // only a few wx classes would be safe to use from non-main threads
2227 // as MB<->WC conversion would fail "randomly".
2228 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2229 #endif // wxUSE_THREADS
2232 const char *pszPtr
= src
;
2236 char* bufPtr
= (char*)dst
;
2238 // have destination buffer, convert there
2239 size_t dstLenOrig
= dstLen
;
2241 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2244 // convert the number of bytes converted as returned by iconv to the
2245 // number of (wide) characters converted that we need
2246 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2250 // convert to native endianness
2251 for ( unsigned i
= 0; i
< res
; i
++ )
2252 dst
[i
] = WC_BSWAP(dst
[i
]);
2255 else // no destination buffer
2257 // convert using temp buffer to calculate the size of the buffer needed
2263 char* bufPtr
= (char*)tbuf
;
2264 dstLen
= 8 * SIZEOF_WCHAR_T
;
2267 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2270 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2272 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2275 if (ICONV_FAILED(cres
, srcLen
))
2277 //VS: it is ok if iconv fails, hence trace only
2278 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2279 return wxCONV_FAILED
;
2285 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2286 const wchar_t *src
, size_t srcLen
) const
2289 // NB: explained in ToWChar()
2290 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2293 if ( srcLen
== wxNO_LEN
)
2294 srcLen
= wxWcslen(src
) + 1;
2296 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2297 size_t outbuflen
= dstLen
;
2300 wchar_t *tmpbuf
= 0;
2304 // need to copy to temp buffer to switch endianness
2305 // (doing WC_BSWAP twice on the original buffer won't help, as it
2306 // could be in read-only memory, or be accessed in some other thread)
2307 tmpbuf
= (wchar_t *)malloc(inbuflen
+ SIZEOF_WCHAR_T
);
2308 for ( size_t i
= 0; i
< srcLen
; i
++ )
2309 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2311 tmpbuf
[srcLen
] = L
'\0';
2315 char* inbuf
= (char*)src
;
2318 // have destination buffer, convert there
2319 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2321 res
= dstLen
- outbuflen
;
2323 else // no destination buffer
2325 // convert using temp buffer to calculate the size of the buffer needed
2333 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2335 res
+= 16 - outbuflen
;
2337 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2345 if (ICONV_FAILED(cres
, inbuflen
))
2347 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2348 return wxCONV_FAILED
;
2354 size_t wxMBConv_iconv::GetMBNulLen() const
2356 if ( m_minMBCharWidth
== 0 )
2358 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2361 // NB: explained in ToWChar()
2362 wxMutexLocker
lock(self
->m_iconvMutex
);
2365 const wchar_t *wnul
= L
"";
2366 char buf
[8]; // should be enough for NUL in any encoding
2367 size_t inLen
= sizeof(wchar_t),
2368 outLen
= WXSIZEOF(buf
);
2369 char *inBuff
= (char *)wnul
;
2370 char *outBuff
= buf
;
2371 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2373 self
->m_minMBCharWidth
= (size_t)-1;
2377 self
->m_minMBCharWidth
= outBuff
- buf
;
2381 return m_minMBCharWidth
;
2384 #if wxUSE_UNICODE_UTF8
2385 bool wxMBConv_iconv::IsUTF8() const
2387 return wxStricmp(m_name
, "UTF-8") == 0 ||
2388 wxStricmp(m_name
, "UTF8") == 0;
2392 #endif // HAVE_ICONV
2395 // ============================================================================
2396 // Win32 conversion classes
2397 // ============================================================================
2399 #ifdef wxHAVE_WIN32_MB2WC
2403 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2404 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2407 class wxMBConv_win32
: public wxMBConv
2412 m_CodePage
= CP_ACP
;
2413 m_minMBCharWidth
= 0;
2416 wxMBConv_win32(const wxMBConv_win32
& conv
)
2419 m_CodePage
= conv
.m_CodePage
;
2420 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2424 wxMBConv_win32(const char* name
)
2426 m_CodePage
= wxCharsetToCodepage(name
);
2427 m_minMBCharWidth
= 0;
2430 wxMBConv_win32(wxFontEncoding encoding
)
2432 m_CodePage
= wxEncodingToCodepage(encoding
);
2433 m_minMBCharWidth
= 0;
2435 #endif // wxUSE_FONTMAP
2437 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2439 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2440 // the behaviour is not compatible with the Unix version (using iconv)
2441 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2442 // wouldn't work if reading an incomplete MB char didn't result in an
2445 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2446 // Win XP or newer and it is not supported for UTF-[78] so we always
2447 // use our own conversions in this case. See
2448 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2449 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2450 if ( m_CodePage
== CP_UTF8
)
2452 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2455 if ( m_CodePage
== CP_UTF7
)
2457 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2461 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2462 IsAtLeastWin2kSP4() )
2464 flags
= MB_ERR_INVALID_CHARS
;
2467 const size_t len
= ::MultiByteToWideChar
2469 m_CodePage
, // code page
2470 flags
, // flags: fall on error
2471 psz
, // input string
2472 -1, // its length (NUL-terminated)
2473 buf
, // output string
2474 buf
? n
: 0 // size of output buffer
2478 // function totally failed
2479 return wxCONV_FAILED
;
2482 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2483 // check if we succeeded, by doing a double trip:
2484 if ( !flags
&& buf
)
2486 const size_t mbLen
= strlen(psz
);
2487 wxCharBuffer
mbBuf(mbLen
);
2488 if ( ::WideCharToMultiByte
2495 mbLen
+ 1, // size in bytes, not length
2499 strcmp(mbBuf
, psz
) != 0 )
2501 // we didn't obtain the same thing we started from, hence
2502 // the conversion was lossy and we consider that it failed
2503 return wxCONV_FAILED
;
2507 // note that it returns count of written chars for buf != NULL and size
2508 // of the needed buffer for buf == NULL so in either case the length of
2509 // the string (which never includes the terminating NUL) is one less
2513 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2516 we have a problem here: by default, WideCharToMultiByte() may
2517 replace characters unrepresentable in the target code page with bad
2518 quality approximations such as turning "1/2" symbol (U+00BD) into
2519 "1" for the code pages which don't have it and we, obviously, want
2520 to avoid this at any price
2522 the trouble is that this function does it _silently_, i.e. it won't
2523 even tell us whether it did or not... Win98/2000 and higher provide
2524 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2525 we have to resort to a round trip, i.e. check that converting back
2526 results in the same string -- this is, of course, expensive but
2527 otherwise we simply can't be sure to not garble the data.
2530 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2531 // it doesn't work with CJK encodings (which we test for rather roughly
2532 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2534 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2537 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2539 // it's our lucky day
2540 flags
= WC_NO_BEST_FIT_CHARS
;
2541 pUsedDef
= &usedDef
;
2543 else // old system or unsupported encoding
2549 const size_t len
= ::WideCharToMultiByte
2551 m_CodePage
, // code page
2552 flags
, // either none or no best fit
2553 pwz
, // input string
2554 -1, // it is (wide) NUL-terminated
2555 buf
, // output buffer
2556 buf
? n
: 0, // and its size
2557 NULL
, // default "replacement" char
2558 pUsedDef
// [out] was it used?
2563 // function totally failed
2564 return wxCONV_FAILED
;
2567 // we did something, check if we really succeeded
2570 // check if the conversion failed, i.e. if any replacements
2573 return wxCONV_FAILED
;
2575 else // we must resort to double tripping...
2577 // first we need to ensure that we really have the MB data: this is
2578 // not the case if we're called with NULL buffer, in which case we
2579 // need to do the conversion yet again
2580 wxCharBuffer bufDef
;
2583 bufDef
= wxCharBuffer(len
);
2584 buf
= bufDef
.data();
2585 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2586 buf
, len
, NULL
, NULL
) )
2587 return wxCONV_FAILED
;
2592 wxWCharBuffer
wcBuf(n
);
2593 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2594 wcscmp(wcBuf
, pwz
) != 0 )
2596 // we didn't obtain the same thing we started from, hence
2597 // the conversion was lossy and we consider that it failed
2598 return wxCONV_FAILED
;
2602 // see the comment above for the reason of "len - 1"
2606 virtual size_t GetMBNulLen() const
2608 if ( m_minMBCharWidth
== 0 )
2610 int len
= ::WideCharToMultiByte
2612 m_CodePage
, // code page
2614 L
"", // input string
2615 1, // translate just the NUL
2616 NULL
, // output buffer
2618 NULL
, // no replacement char
2619 NULL
// [out] don't care if it was used
2622 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2626 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2627 self
->m_minMBCharWidth
= (size_t)-1;
2631 self
->m_minMBCharWidth
= (size_t)-1;
2637 self
->m_minMBCharWidth
= len
;
2642 return m_minMBCharWidth
;
2645 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2647 bool IsOk() const { return m_CodePage
!= -1; }
2650 static bool CanUseNoBestFit()
2652 static int s_isWin98Or2k
= -1;
2654 if ( s_isWin98Or2k
== -1 )
2657 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2659 case wxOS_WINDOWS_9X
:
2660 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2663 case wxOS_WINDOWS_NT
:
2664 s_isWin98Or2k
= verMaj
>= 5;
2668 // unknown: be conservative by default
2673 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2676 return s_isWin98Or2k
== 1;
2679 static bool IsAtLeastWin2kSP4()
2684 static int s_isAtLeastWin2kSP4
= -1;
2686 if ( s_isAtLeastWin2kSP4
== -1 )
2688 OSVERSIONINFOEX ver
;
2690 memset(&ver
, 0, sizeof(ver
));
2691 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2692 GetVersionEx((OSVERSIONINFO
*)&ver
);
2694 s_isAtLeastWin2kSP4
=
2695 ((ver
.dwMajorVersion
> 5) || // Vista+
2696 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2697 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2698 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2702 return s_isAtLeastWin2kSP4
== 1;
2707 // the code page we're working with
2710 // cached result of GetMBNulLen(), set to 0 initially meaning
2712 size_t m_minMBCharWidth
;
2715 #endif // wxHAVE_WIN32_MB2WC
2718 // ============================================================================
2719 // wxEncodingConverter based conversion classes
2720 // ============================================================================
2724 class wxMBConv_wxwin
: public wxMBConv
2729 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2730 // The wxMBConv_cf class does a better job.
2731 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2732 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2733 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2737 // temporarily just use wxEncodingConverter stuff,
2738 // so that it works while a better implementation is built
2739 wxMBConv_wxwin(const char* name
)
2742 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2744 m_enc
= wxFONTENCODING_SYSTEM
;
2749 wxMBConv_wxwin(wxFontEncoding enc
)
2756 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2758 size_t inbuf
= strlen(psz
);
2761 if (!m2w
.Convert(psz
, buf
))
2762 return wxCONV_FAILED
;
2767 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2769 const size_t inbuf
= wxWcslen(psz
);
2772 if (!w2m
.Convert(psz
, buf
))
2773 return wxCONV_FAILED
;
2779 virtual size_t GetMBNulLen() const
2783 case wxFONTENCODING_UTF16BE
:
2784 case wxFONTENCODING_UTF16LE
:
2787 case wxFONTENCODING_UTF32BE
:
2788 case wxFONTENCODING_UTF32LE
:
2796 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2798 bool IsOk() const { return m_ok
; }
2801 wxFontEncoding m_enc
;
2802 wxEncodingConverter m2w
, w2m
;
2805 // were we initialized successfully?
2808 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2811 // make the constructors available for unit testing
2812 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2814 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2815 if ( !result
->IsOk() )
2824 #endif // wxUSE_FONTMAP
2826 // ============================================================================
2827 // wxCSConv implementation
2828 // ============================================================================
2830 void wxCSConv::Init()
2837 wxCSConv::wxCSConv(const wxString
& charset
)
2841 if ( !charset
.empty() )
2843 SetName(charset
.ToAscii());
2847 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2849 m_encoding
= wxFONTENCODING_SYSTEM
;
2853 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2855 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2857 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2859 encoding
= wxFONTENCODING_SYSTEM
;
2864 m_encoding
= encoding
;
2867 wxCSConv::~wxCSConv()
2872 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2877 SetName(conv
.m_name
);
2878 m_encoding
= conv
.m_encoding
;
2881 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2885 SetName(conv
.m_name
);
2886 m_encoding
= conv
.m_encoding
;
2891 void wxCSConv::Clear()
2900 void wxCSConv::SetName(const char *charset
)
2904 m_name
= wxStrdup(charset
);
2911 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2912 wxEncodingNameCache
);
2914 static wxEncodingNameCache gs_nameCache
;
2917 wxMBConv
*wxCSConv::DoCreate() const
2920 wxLogTrace(TRACE_STRCONV
,
2921 wxT("creating conversion for %s"),
2923 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2924 #endif // wxUSE_FONTMAP
2926 // check for the special case of ASCII or ISO8859-1 charset: as we have
2927 // special knowledge of it anyhow, we don't need to create a special
2928 // conversion object
2929 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2930 m_encoding
== wxFONTENCODING_DEFAULT
)
2932 // don't convert at all
2936 // we trust OS to do conversion better than we can so try external
2937 // conversion methods first
2939 // the full order is:
2940 // 1. OS conversion (iconv() under Unix or Win32 API)
2941 // 2. hard coded conversions for UTF
2942 // 3. wxEncodingConverter as fall back
2948 #endif // !wxUSE_FONTMAP
2951 wxFontEncoding
encoding(m_encoding
);
2956 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2964 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2965 #endif // wxUSE_FONTMAP
2969 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2970 if ( it
!= gs_nameCache
.end() )
2972 if ( it
->second
.empty() )
2975 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2982 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2983 // CS : in case this does not return valid names (eg for MacRoman)
2984 // encoding got a 'failure' entry in the cache all the same,
2985 // although it just has to be created using a different method, so
2986 // only store failed iconv creation attempts (or perhaps we
2987 // shouldn't do this at all ?)
2988 if ( names
[0] != NULL
)
2990 for ( ; *names
; ++names
)
2992 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2993 // will need changes that will obsolete this
2994 wxString
name(*names
);
2995 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2998 gs_nameCache
[encoding
] = *names
;
3005 gs_nameCache
[encoding
] = _T(""); // cache the failure
3008 #endif // wxUSE_FONTMAP
3010 #endif // HAVE_ICONV
3012 #ifdef wxHAVE_WIN32_MB2WC
3015 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3016 : new wxMBConv_win32(m_encoding
);
3025 #endif // wxHAVE_WIN32_MB2WC
3029 // leave UTF16 and UTF32 to the built-ins of wx
3030 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3031 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3034 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3035 : new wxMBConv_cf(m_encoding
);
3037 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3046 #endif // __DARWIN__
3049 wxFontEncoding enc
= m_encoding
;
3051 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3053 // use "false" to suppress interactive dialogs -- we can be called from
3054 // anywhere and popping up a dialog from here is the last thing we want to
3056 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3058 #endif // wxUSE_FONTMAP
3062 case wxFONTENCODING_UTF7
:
3063 return new wxMBConvUTF7
;
3065 case wxFONTENCODING_UTF8
:
3066 return new wxMBConvUTF8
;
3068 case wxFONTENCODING_UTF16BE
:
3069 return new wxMBConvUTF16BE
;
3071 case wxFONTENCODING_UTF16LE
:
3072 return new wxMBConvUTF16LE
;
3074 case wxFONTENCODING_UTF32BE
:
3075 return new wxMBConvUTF32BE
;
3077 case wxFONTENCODING_UTF32LE
:
3078 return new wxMBConvUTF32LE
;
3081 // nothing to do but put here to suppress gcc warnings
3088 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3089 : new wxMBConv_wxwin(m_encoding
);
3095 #endif // wxUSE_FONTMAP
3097 // NB: This is a hack to prevent deadlock. What could otherwise happen
3098 // in Unicode build: wxConvLocal creation ends up being here
3099 // because of some failure and logs the error. But wxLog will try to
3100 // attach a timestamp, for which it will need wxConvLocal (to convert
3101 // time to char* and then wchar_t*), but that fails, tries to log the
3102 // error, but wxLog has an (already locked) critical section that
3103 // guards the static buffer.
3104 static bool alreadyLoggingError
= false;
3105 if (!alreadyLoggingError
)
3107 alreadyLoggingError
= true;
3108 wxLogError(_("Cannot convert from the charset '%s'!"),
3112 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3113 #else // !wxUSE_FONTMAP
3114 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3115 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3118 alreadyLoggingError
= false;
3124 void wxCSConv::CreateConvIfNeeded() const
3128 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3130 // if we don't have neither the name nor the encoding, use the default
3131 // encoding for this system
3132 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3135 self
->m_encoding
= wxLocale::GetSystemEncoding();
3137 // fallback to some reasonable default:
3138 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3139 #endif // wxUSE_INTL
3142 self
->m_convReal
= DoCreate();
3143 self
->m_deferred
= false;
3147 bool wxCSConv::IsOk() const
3149 CreateConvIfNeeded();
3151 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3152 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3153 return true; // always ok as we do it ourselves
3155 // m_convReal->IsOk() is called at its own creation, so we know it must
3156 // be ok if m_convReal is non-NULL
3157 return m_convReal
!= NULL
;
3160 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3161 const char *src
, size_t srcLen
) const
3163 CreateConvIfNeeded();
3166 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3169 if ( srcLen
== wxNO_LEN
)
3170 srcLen
= strlen(src
) + 1; // take trailing NUL too
3174 if ( dstLen
< srcLen
)
3175 return wxCONV_FAILED
;
3177 for ( size_t n
= 0; n
< srcLen
; n
++ )
3178 dst
[n
] = (unsigned char)(src
[n
]);
3184 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3185 const wchar_t *src
, size_t srcLen
) const
3187 CreateConvIfNeeded();
3190 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3193 if ( srcLen
== wxNO_LEN
)
3194 srcLen
= wxWcslen(src
) + 1;
3198 if ( dstLen
< srcLen
)
3199 return wxCONV_FAILED
;
3201 for ( size_t n
= 0; n
< srcLen
; n
++ )
3203 if ( src
[n
] > 0xFF )
3204 return wxCONV_FAILED
;
3206 dst
[n
] = (char)src
[n
];
3210 else // still need to check the input validity
3212 for ( size_t n
= 0; n
< srcLen
; n
++ )
3214 if ( src
[n
] > 0xFF )
3215 return wxCONV_FAILED
;
3222 size_t wxCSConv::GetMBNulLen() const
3224 CreateConvIfNeeded();
3228 return m_convReal
->GetMBNulLen();
3231 // otherwise, we are ISO-8859-1
3235 #if wxUSE_UNICODE_UTF8
3236 bool wxCSConv::IsUTF8() const
3238 CreateConvIfNeeded();
3242 return m_convReal
->IsUTF8();
3245 // otherwise, we are ISO-8859-1
3253 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3256 return wxWCharBuffer();
3258 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3260 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3262 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
3267 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3270 return wxCharBuffer();
3272 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3274 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3279 #endif // wxUSE_UNICODE
3281 // ----------------------------------------------------------------------------
3283 // ----------------------------------------------------------------------------
3285 // NB: The reason why we create converter objects in this convoluted way,
3286 // using a factory function instead of global variable, is that they
3287 // may be used at static initialization time (some of them are used by
3288 // wxString ctors and there may be a global wxString object). In other
3289 // words, possibly _before_ the converter global object would be
3296 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines, for the
// global converter called name:
//  - the pointer variable name##Ptr of type klass*, initially NULL
//  - the accessor function wxGet_##name##Ptr() returning the address of its
//    function-local static object name##Obj, which is of type impl_klass and
//    is declared with ctor_args pasted directly after its name
//  - the file-static pointer gs_##name##instance initialized by calling the
//    accessor
// The expansion does not end with a semicolon, it is supplied where the macro
// is used.
3298 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3299 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3300 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3302 static impl_klass name##Obj ctor_args; \
3303 return &name##Obj; \
3305 /* this ensures that all global converter objects are created */ \
3306 /* by the time static initialization is done, i.e. before any */ \
3307 /* thread is launched: */ \
3308 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for WX_DEFINE_GLOBAL_CONV2() for the case when the object is of
// the same class as the one used for the pointer type: klass is passed for
// both klass and impl_klass.
3310 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3311 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// wxConvLibc is accessed as a wxMBConv but implemented either by
// wxMBConv_win32 or by wxMBConvLibc, in both cases without any constructor
// arguments.
// NOTE(review): the two definitions below use the same name, so they are
// presumably alternatives selected by a preprocessor conditional not visible
// here -- confirm.
3314 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3316 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3319 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3320 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3321 // provokes an error message about "not enough macro parameters"; and we
3322 // can't use "()" here as the name##Obj declaration would be parsed as a
3323 // function declaration then, so use a semicolon and live with an extra
3324 // empty statement (and hope that no compiler warns about this)
// UTF-8 and UTF-7 converters: ";" is passed as ctor_args, i.e. the objects
// are declared without any constructor arguments
3325 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3326 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
// wxCSConv-based converters, constructed from the system encoding and from
// ISO-8859-1 respectively
3328 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3329 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// Global converter pointers: wxConvCurrent initially refers to the
// wxConvLibc object and wxConvUI to the wxConvLocal one. Both are obtained
// by calling the accessor functions generated by WX_DEFINE_GLOBAL_CONV2()
// rather than by taking the address of a global object.
3331 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3332 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3335 // The xnu kernel always communicates file paths in decomposed UTF-8.
3336 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3337 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// wxConvFileName refers to the wxConvLibc object when __DARWIN__ is not
// defined.
// NOTE(review): the initializer used under __DARWIN__ is not visible here,
// presumably it is the address of wxConvMacUTF8DObj defined above -- confirm.
3340 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3343 #else // !__DARWIN__
3344 wxGet_wxConvLibcPtr();
3345 #endif // __DARWIN__/!__DARWIN__
3347 #else // !wxUSE_WCHAR_T
3349 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3350 // stand-ins in absence of wchar_t
3351 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3356 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T