1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
74 static bool NotAllNULs(const char *p
, size_t n
)
76 while ( n
&& *p
++ == '\0' )
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten
= 0;
172 // the number of NULs terminating this string
173 size_t nulLen
= 0; // not really needed, but just to avoid warnings
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
181 if ( srcLen
!= wxNO_LEN
)
183 // we need to know how to find the end of this string
184 nulLen
= GetMBNulLen();
185 if ( nulLen
== wxCONV_FAILED
)
186 return wxCONV_FAILED
;
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
193 char * const p
= bufTmp
.data();
194 memcpy(p
, src
, srcLen
);
195 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
201 srcEnd
= src
+ srcLen
;
203 else // quit after the first loop iteration
210 // try to convert the current chunk
211 size_t lenChunk
= MB2WC(NULL
, src
, 0);
212 if ( lenChunk
== wxCONV_FAILED
)
213 return wxCONV_FAILED
;
215 dstWritten
+= lenChunk
;
221 // nothing left in the input string, conversion succeeded
227 if ( dstWritten
> dstLen
)
228 return wxCONV_FAILED
;
230 // +1 is for trailing NUL
231 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
232 return wxCONV_FAILED
;
241 // we convert just one chunk in this case as this is the entire
246 // advance the input pointer past the end of this chunk
247 while ( NotAllNULs(src
, nulLen
) )
249 // notice that we must skip over multiple bytes here as we suppose
250 // that if NUL takes 2 or 4 bytes, then all the other characters do
251 // too and so if advanced by a single byte we might erroneously
252 // detect sequences of NUL bytes in the middle of the input
256 src
+= nulLen
; // skipping over its terminator as well
258 // note that ">=" (and not just "==") is needed here as the terminator
259 // we skipped just above could be inside or just after the buffer
260 // delimited by srcEnd
269 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
270 const wchar_t *src
, size_t srcLen
) const
272 // the number of chars [which would be] written to dst [if it were not NULL]
273 size_t dstWritten
= 0;
275 // if we don't know its length we have no choice but to assume that it is
276 // NUL-terminated (notice that it can still be NUL-terminated even if
277 // explicit length is given but it doesn't change our return value)
278 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
280 // make a copy of the input string unless it is already properly
282 wxWCharBuffer bufTmp
;
283 if ( isNulTerminated
)
285 srcLen
= wxWcslen(src
) + 1;
287 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
289 // make a copy in order to properly NUL-terminate the string
290 bufTmp
= wxWCharBuffer(srcLen
);
291 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
295 const size_t lenNul
= GetMBNulLen();
296 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
298 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
300 // try to convert the current chunk
301 size_t lenChunk
= WC2MB(NULL
, src
, 0);
303 if ( lenChunk
== wxCONV_FAILED
)
304 return wxCONV_FAILED
;
306 dstWritten
+= lenChunk
;
307 if ( isNulTerminated
)
308 dstWritten
+= lenNul
;
312 if ( dstWritten
> dstLen
)
313 return wxCONV_FAILED
;
315 if ( WC2MB(dst
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
316 return wxCONV_FAILED
;
319 if ( isNulTerminated
)
327 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
329 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
330 if ( rc
!= wxCONV_FAILED
)
332 // ToWChar() returns the buffer length, i.e. including the trailing
333 // NUL, while this method doesn't take it into account
340 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
342 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
343 if ( rc
!= wxCONV_FAILED
)
351 wxMBConv::~wxMBConv()
353 // nothing to do here (necessary for Darwin linking probably)
356 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
360 // calculate the length of the buffer needed first
361 const size_t nLen
= ToWChar(NULL
, 0, psz
);
362 if ( nLen
!= wxCONV_FAILED
)
364 // now do the actual conversion
365 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
367 // +1 for the trailing NULL
368 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
373 return wxWCharBuffer();
376 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
380 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
381 if ( nLen
!= wxCONV_FAILED
)
383 wxCharBuffer
buf(nLen
- 1);
384 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
389 return wxCharBuffer();
393 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
395 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
396 if ( dstLen
!= wxCONV_FAILED
)
398 // notice that we allocate space for dstLen+1 wide characters here
399 // because we want the buffer to always be NUL-terminated, even if the
400 // input isn't (as otherwise the caller has no way to know its length)
401 wxWCharBuffer
wbuf(dstLen
);
402 wbuf
.data()[dstLen
] = L
'\0';
403 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
409 // we also need to handle NUL-terminated input strings
410 // specially: for them the output is the length of the string
411 // excluding the trailing NUL, however if we're asked to
412 // convert a specific number of characters we return the length
413 // of the resulting output even if it's NUL-terminated
414 if ( inLen
== wxNO_LEN
)
425 return wxWCharBuffer();
429 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
431 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
432 if ( dstLen
!= wxCONV_FAILED
)
434 const size_t nulLen
= GetMBNulLen();
436 // as above, ensure that the buffer is always NUL-terminated, even if
438 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
439 memset(buf
.data() + dstLen
, 0, nulLen
);
440 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
446 if ( inLen
== wxNO_LEN
)
448 // in this case both input and output are NUL-terminated
449 // and we're not supposed to count NUL
461 return wxCharBuffer();
464 // ----------------------------------------------------------------------------
466 // ----------------------------------------------------------------------------
468 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
470 return wxMB2WC(buf
, psz
, n
);
473 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
475 return wxWC2MB(buf
, psz
, n
);
478 // ----------------------------------------------------------------------------
479 // wxConvBrokenFileNames
480 // ----------------------------------------------------------------------------
484 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
486 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
487 wxStricmp(charset
, _T("UTF8")) == 0 )
488 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
490 m_conv
= new wxCSConv(charset
);
495 // ----------------------------------------------------------------------------
497 // ----------------------------------------------------------------------------
499 // Implementation (C) 2004 Fredrik Roubert
501 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
504 // BASE64 decoding table
// Lookup table indexed by byte value (256 entries, 8 per row) giving the
// 6-bit value of a base64 digit:
//      'A'..'Z' -> 0x00..0x19
//      'a'..'z' -> 0x1a..0x33
//      '0'..'9' -> 0x34..0x3d
//      '+'      -> 0x3e
//      '/'      -> 0x3f
// Every other byte, including all bytes >= 0x80, maps to 0xff.
506 static const unsigned char utf7unb64
[] =
508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
514 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
515 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
517 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
518 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
519 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
521 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
522 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
523 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
533 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
535 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
536 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
539 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
542 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
543 const char *src
, size_t srcLen
) const
545 DecoderState stateOrig
,
547 if ( srcLen
== wxNO_LEN
)
549 // convert the entire string, up to and including the trailing NUL
550 srcLen
= strlen(src
) + 1;
552 // when working on the entire strings we don't update nor use the shift
553 // state from the previous call
554 statePtr
= &stateOrig
;
556 else // when working with partial strings we do use the shift state
558 statePtr
= wx_const_cast(DecoderState
*, &m_stateDecoder
);
560 // also save the old state to be able to rollback to it on error
561 stateOrig
= m_stateDecoder
;
564 // but to simplify the code below we use this variable in both cases
565 DecoderState
& state
= *statePtr
;
568 // number of characters [which would have been] written to dst [if it were
572 const char * const srcEnd
= src
+ srcLen
;
574 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
576 const unsigned char cc
= *src
++;
578 if ( state
.IsShifted() )
580 const unsigned char dc
= utf7unb64
[cc
];
583 // end of encoded part
586 // re-parse this character normally below unless it's '-' which
587 // is consumed by the decoder
591 else // valid encoded character
593 // mini base64 decoder: each character is 6 bits
598 if ( state
.bit
>= 8 )
600 // got the full byte, consume it
602 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
606 // we've got the full word, output it
608 *dst
++ = (state
.msb
<< 8) | b
;
614 // just store it while we wait for LSB
622 if ( state
.IsDirect() )
624 // start of an encoded segment?
628 return wxCONV_FAILED
; // can't have '+' at the end
632 // just the encoded plus sign, don't switch to shifted mode
645 // only printable 7 bit ASCII characters (with the exception of
646 // NUL, TAB, CR and LF) can be used directly
647 if ( cc
>= 0x7f || (cc
< ' ' &&
648 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
649 return wxCONV_FAILED
;
660 // as we didn't read any characters we should be called with the same
661 // data (followed by some more new data) again later so don't save our
665 return wxCONV_FAILED
;
672 // BASE64 encoding table
// Lookup table indexed by a 6-bit value (0..63) giving the corresponding
// base64 digit: 'A'..'Z', then 'a'..'z', then '0'..'9', then '+' and '/'.
// The encoder always reduces its index modulo 64 before the lookup.
674 static const unsigned char utf7enb64
[] =
676 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
677 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
678 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
679 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
680 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
681 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
682 'w', 'x', 'y', 'z', '0', '1', '2', '3',
683 '4', '5', '6', '7', '8', '9', '+', '/'
687 // UTF-7 encoding table
689 // 0 - Set D (directly encoded characters)
690 // 1 - Set O (optional direct characters)
691 // 2 - whitespace characters (optional)
692 // 3 - special characters
// Classification of each 7-bit character code (128 entries, 16 per row, so
// each row covers codes 0xN0..0xNF). wxIsUTF7Direct() accepts only the
// entries equal to 0 as directly encodable; note that NUL (index 0) is one of
// them while '+' (0x2B) is classified as 3.
694 static const unsigned char utf7encode
[128] =
696 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
697 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
698 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
699 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
700 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
701 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
702 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
703 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
706 static inline bool wxIsUTF7Direct(wchar_t wc
)
708 return wc
< 0x80 && utf7encode
[wc
] < 1;
711 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
712 const wchar_t *src
, size_t srcLen
) const
714 EncoderState stateOrig
,
716 if ( srcLen
== wxNO_LEN
)
718 // we don't apply the stored state when operating on entire strings at
720 statePtr
= &stateOrig
;
722 srcLen
= wxWcslen(src
) + 1;
724 else // do use the mode we left the output in previously
726 stateOrig
= m_stateEncoder
;
727 statePtr
= wx_const_cast(EncoderState
*, &m_stateEncoder
);
730 EncoderState
& state
= *statePtr
;
735 const wchar_t * const srcEnd
= src
+ srcLen
;
736 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
739 if ( wxIsUTF7Direct(cc
) )
741 if ( state
.IsShifted() )
743 // pad with zeros the last encoded block if necessary
747 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
762 else if ( cc
== '+' && state
.IsDirect() )
773 else if (((wxUint32
)cc
) > 0xffff)
775 // no surrogate pair generation (yet?)
776 return wxCONV_FAILED
;
781 if ( state
.IsDirect() )
790 // BASE64 encode string
793 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
796 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
798 for (state
.bit
+= 8; state
.bit
>= 6; )
802 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
807 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
815 // we need to restore the original encoder state if we were called just to
816 // calculate the amount of space needed as we will presumably be called
817 // again to really convert the data now
824 // ----------------------------------------------------------------------------
826 // ----------------------------------------------------------------------------
// Upper bounds by sequence length: the first six entries are the largest
// values representable with 7, 11, 16, 21, 26 and 31 bits respectively, i.e.
// the payload sizes of 1- to 6-byte sequences in the original UTF-8 scheme.
// The table is indexed with (number of bytes - 1).
// NOTE(review): the final 0xffffffff entry presumably acts as a sentinel so
// that a "cc > utf8_max[cnt]" search always terminates -- confirm.
828 static const wxUint32 utf8_max
[]=
829 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
831 // boundaries of the private use area we use to (temporarily) remap bytes
832 // that are invalid in a UTF-8 encoded string
// Half-open range [wxUnicodePUA, wxUnicodePUAEnd) of 256 code points, one per
// possible byte value: the UTF-8 decoder in this file maps a byte b to
// wxUnicodePUA + b and the encoder maps such a code point back to the byte by
// subtracting wxUnicodePUA.
833 const wxUint32 wxUnicodePUA
= 0x100000;
834 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
836 // this table gives the length of the UTF-8 encoding from its first character:
837 const unsigned char tableUtf8Lengths
[256] = {
838 // single-byte sequences (ASCII):
839 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
840 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
841 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
842 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
843 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
844 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
845 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
846 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
848 // these are invalid:
849 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
851 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
852 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
855 // two-byte sequences:
856 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
857 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
859 // three-byte sequences:
860 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
862 // four-byte sequences:
863 4, 4, 4, 4, 4, // F0..F4
865 // these are invalid again (5- or 6-byte
866 // sequences and sequences for code points
867 // above U+10FFFF, as restricted by RFC 3629):
868 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
872 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
873 const char *src
, size_t srcLen
) const
875 wchar_t *out
= dstLen
? dst
: NULL
;
878 if ( srcLen
== wxNO_LEN
)
879 srcLen
= strlen(src
) + 1;
881 for ( const char *p
= src
; ; p
++ )
883 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
885 // all done successfully, just add the trailing NULL if we are not
886 // using explicit length
887 if ( srcLen
== wxNO_LEN
)
903 if ( out
&& !dstLen
-- )
907 unsigned char c
= *p
;
911 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
914 if ( srcLen
!= wxNO_LEN
)
921 unsigned len
= tableUtf8Lengths
[c
];
925 if ( srcLen
< len
) // the test works for wxNO_LEN too
928 if ( srcLen
!= wxNO_LEN
)
931 // Char. number range | UTF-8 octet sequence
932 // (hexadecimal) | (binary)
933 // ----------------------+----------------------------------------
934 // 0000 0000 - 0000 007F | 0xxxxxxx
935 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
936 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
937 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
939 // Code point value is stored in bits marked with 'x',
940 // lowest-order bit of the value on the right side in the diagram
941 // above. (from RFC 3629)
943 // mask to extract lead byte's value ('x' bits above), by sequence
945 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
947 // mask and value of lead byte's most significant bits, by length:
948 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
949 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
951 len
--; // it's more convenient to work with 0-based length here
953 // extract the lead byte's value bits:
954 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
957 code
= c
& leadValueMask
[len
];
959 // all remaining bytes, if any, are handled in the same way
960 // regardless of sequence's length:
964 if ( (c
& 0xC0) != 0x80 )
965 return wxCONV_FAILED
;
973 // cast is ok because wchar_t == wxUint16 if WC_UTF16
974 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
983 #endif // WC_UTF16/!WC_UTF16
991 return wxCONV_FAILED
;
995 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
996 const wchar_t *src
, size_t srcLen
) const
998 char *out
= dstLen
? dst
: NULL
;
1001 for ( const wchar_t *wp
= src
; ; wp
++ )
1003 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1005 // all done successfully, just add the trailing NULL if we are not
1006 // using explicit length
1007 if ( srcLen
== wxNO_LEN
)
1023 if ( srcLen
!= wxNO_LEN
)
1028 // cast is ok for WC_UTF16
1029 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1031 // skip the next char too as we decoded a surrogate
1034 #else // wchar_t is UTF-32
1035 code
= *wp
& 0x7fffffff;
1047 out
[0] = (char)code
;
1050 else if ( code
<= 0x07FF )
1058 // NB: this line takes 6 least significant bits, encodes them as
1059 // 10xxxxxx and discards them so that the next byte can be encoded:
1060 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1061 out
[0] = 0xC0 | code
;
1064 else if ( code
< 0xFFFF )
1072 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1073 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1074 out
[0] = 0xE0 | code
;
1077 else if ( code
<= 0x10FFFF )
1085 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1086 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1087 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1088 out
[0] = 0xF0 | code
;
1093 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1106 // we only get here if an error occurs during decoding
1107 return wxCONV_FAILED
;
1110 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1111 const char *psz
, size_t srcLen
) const
1113 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1114 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1118 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1120 const char *opsz
= psz
;
1121 bool invalid
= false;
1122 unsigned char cc
= *psz
++, fc
= cc
;
1124 for (cnt
= 0; fc
& 0x80; cnt
++)
1134 // escape the escape character for octal escapes
1135 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1136 && cc
== '\\' && (!buf
|| len
< n
))
1148 // invalid UTF-8 sequence
1153 unsigned ocnt
= cnt
- 1;
1154 wxUint32 res
= cc
& (0x3f >> cnt
);
1158 if ((cc
& 0xC0) != 0x80)
1160 // invalid UTF-8 sequence
1166 res
= (res
<< 6) | (cc
& 0x3f);
1169 if (invalid
|| res
<= utf8_max
[ocnt
])
1171 // illegal UTF-8 encoding
1174 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1175 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1177 // if one of our PUA characters turns up externally
1178 // it must also be treated as an illegal sequence
1179 // (a bit like you have to escape an escape character)
1185 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1186 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1187 if (pa
== wxCONV_FAILED
)
1199 *buf
++ = (wchar_t)res
;
1201 #endif // WC_UTF16/!WC_UTF16
1207 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1209 while (opsz
< psz
&& (!buf
|| len
< n
))
1212 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1213 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1214 wxASSERT(pa
!= wxCONV_FAILED
);
1221 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1227 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1229 while (opsz
< psz
&& (!buf
|| len
< n
))
1231 if ( buf
&& len
+ 3 < n
)
1233 unsigned char on
= *opsz
;
1235 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1236 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1237 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1244 else // MAP_INVALID_UTF8_NOT
1246 return wxCONV_FAILED
;
1252 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
// Checks whether wch is an octal digit, i.e. one of L'0' through L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1263 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1264 const wchar_t *psz
, size_t srcLen
) const
1266 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1267 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1271 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1276 // cast is ok for WC_UTF16
1277 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1278 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1280 cc
= (*psz
++) & 0x7fffffff;
1283 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1284 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1287 *buf
++ = (char)(cc
- wxUnicodePUA
);
1290 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1291 && cc
== L
'\\' && psz
[0] == L
'\\' )
1298 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1300 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1304 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1305 (psz
[1] - L
'0') * 010 +
1315 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1331 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1333 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1339 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1345 // ============================================================================
1347 // ============================================================================
1349 #ifdef WORDS_BIGENDIAN
1350 #define wxMBConvUTF16straight wxMBConvUTF16BE
1351 #define wxMBConvUTF16swap wxMBConvUTF16LE
1353 #define wxMBConvUTF16swap wxMBConvUTF16BE
1354 #define wxMBConvUTF16straight wxMBConvUTF16LE
1358 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1360 if ( srcLen
== wxNO_LEN
)
1362 // count the number of bytes in input, including the trailing NULs
1363 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1364 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1367 srcLen
*= BYTES_PER_CHAR
;
1369 else // we already have the length
1371 // we can only convert an entire number of UTF-16 characters
1372 if ( srcLen
% BYTES_PER_CHAR
)
1373 return wxCONV_FAILED
;
1379 // case when in-memory representation is UTF-16 too
1382 // ----------------------------------------------------------------------------
1383 // conversions without endianness change
1384 // ----------------------------------------------------------------------------
1387 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1388 const char *src
, size_t srcLen
) const
1390 // set up the scene for using memcpy() (which is presumably more efficient
1391 // than copying the bytes one by one)
1392 srcLen
= GetLength(src
, srcLen
);
1393 if ( srcLen
== wxNO_LEN
)
1394 return wxCONV_FAILED
;
1396 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1399 if ( dstLen
< inLen
)
1400 return wxCONV_FAILED
;
1402 memcpy(dst
, src
, srcLen
);
1409 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1410 const wchar_t *src
, size_t srcLen
) const
1412 if ( srcLen
== wxNO_LEN
)
1413 srcLen
= wxWcslen(src
) + 1;
1415 srcLen
*= BYTES_PER_CHAR
;
1419 if ( dstLen
< srcLen
)
1420 return wxCONV_FAILED
;
1422 memcpy(dst
, src
, srcLen
);
1428 // ----------------------------------------------------------------------------
1429 // endian-reversing conversions
1430 // ----------------------------------------------------------------------------
1433 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1434 const char *src
, size_t srcLen
) const
1436 srcLen
= GetLength(src
, srcLen
);
1437 if ( srcLen
== wxNO_LEN
)
1438 return wxCONV_FAILED
;
1440 srcLen
/= BYTES_PER_CHAR
;
1444 if ( dstLen
< srcLen
)
1445 return wxCONV_FAILED
;
1447 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1448 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1450 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1458 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1459 const wchar_t *src
, size_t srcLen
) const
1461 if ( srcLen
== wxNO_LEN
)
1462 srcLen
= wxWcslen(src
) + 1;
1464 srcLen
*= BYTES_PER_CHAR
;
1468 if ( dstLen
< srcLen
)
1469 return wxCONV_FAILED
;
1471 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1472 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1474 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1481 #else // !WC_UTF16: wchar_t is UTF-32
1483 // ----------------------------------------------------------------------------
1484 // conversions without endianness change
1485 // ----------------------------------------------------------------------------
1488 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1489 const char *src
, size_t srcLen
) const
1491 srcLen
= GetLength(src
, srcLen
);
1492 if ( srcLen
== wxNO_LEN
)
1493 return wxCONV_FAILED
;
1495 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1498 // optimization: return maximal space which could be needed for this
1499 // string even if the real size could be smaller if the buffer contains
1505 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1506 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1508 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1510 return wxCONV_FAILED
;
1512 if ( ++outLen
> dstLen
)
1513 return wxCONV_FAILED
;
1523 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1524 const wchar_t *src
, size_t srcLen
) const
1526 if ( srcLen
== wxNO_LEN
)
1527 srcLen
= wxWcslen(src
) + 1;
1530 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1531 for ( size_t n
= 0; n
< srcLen
; n
++ )
1534 const size_t numChars
= encode_utf16(*src
++, cc
);
1535 if ( numChars
== wxCONV_FAILED
)
1536 return wxCONV_FAILED
;
1538 outLen
+= numChars
* BYTES_PER_CHAR
;
1541 if ( outLen
> dstLen
)
1542 return wxCONV_FAILED
;
1545 if ( numChars
== 2 )
1547 // second character of a surrogate
1556 // ----------------------------------------------------------------------------
1557 // endian-reversing conversions
1558 // ----------------------------------------------------------------------------
1561 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1562 const char *src
, size_t srcLen
) const
1564 srcLen
= GetLength(src
, srcLen
);
1565 if ( srcLen
== wxNO_LEN
)
1566 return wxCONV_FAILED
;
1568 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1571 // optimization: return maximal space which could be needed for this
1572 // string even if the real size could be smaller if the buffer contains
1578 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1579 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1584 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1586 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1588 const size_t numChars
= decode_utf16(tmp
, ch
);
1589 if ( numChars
== wxCONV_FAILED
)
1590 return wxCONV_FAILED
;
1592 if ( numChars
== 2 )
1595 if ( ++outLen
> dstLen
)
1596 return wxCONV_FAILED
;
1606 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1607 const wchar_t *src
, size_t srcLen
) const
1609 if ( srcLen
== wxNO_LEN
)
1610 srcLen
= wxWcslen(src
) + 1;
1613 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1614 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1617 const size_t numChars
= encode_utf16(*src
, cc
);
1618 if ( numChars
== wxCONV_FAILED
)
1619 return wxCONV_FAILED
;
1621 outLen
+= numChars
* BYTES_PER_CHAR
;
1624 if ( outLen
> dstLen
)
1625 return wxCONV_FAILED
;
1627 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1628 if ( numChars
== 2 )
1630 // second character of a surrogate
1631 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1639 #endif // WC_UTF16/!WC_UTF16
1642 // ============================================================================
1644 // ============================================================================
1646 #ifdef WORDS_BIGENDIAN
1647 #define wxMBConvUTF32straight wxMBConvUTF32BE
1648 #define wxMBConvUTF32swap wxMBConvUTF32LE
1650 #define wxMBConvUTF32swap wxMBConvUTF32BE
1651 #define wxMBConvUTF32straight wxMBConvUTF32LE
1655 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1656 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1659 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1661 if ( srcLen
== wxNO_LEN
)
1663 // count the number of bytes in input, including the trailing NULs
1664 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1665 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1668 srcLen
*= BYTES_PER_CHAR
;
1670 else // we already have the length
1672 // we can only convert an entire number of UTF-32 characters
1673 if ( srcLen
% BYTES_PER_CHAR
)
1674 return wxCONV_FAILED
;
1680 // case when in-memory representation is UTF-16
1683 // ----------------------------------------------------------------------------
1684 // conversions without endianness change
1685 // ----------------------------------------------------------------------------
1688 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1689 const char *src
, size_t srcLen
) const
1691 srcLen
= GetLength(src
, srcLen
);
1692 if ( srcLen
== wxNO_LEN
)
1693 return wxCONV_FAILED
;
1695 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1696 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1698 for ( size_t n
= 0; n
< inLen
; n
++ )
1701 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1702 if ( numChars
== wxCONV_FAILED
)
1703 return wxCONV_FAILED
;
1708 if ( outLen
> dstLen
)
1709 return wxCONV_FAILED
;
1712 if ( numChars
== 2 )
1714 // second character of a surrogate
1724 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1725 const wchar_t *src
, size_t srcLen
) const
1727 if ( srcLen
== wxNO_LEN
)
1728 srcLen
= wxWcslen(src
) + 1;
1732 // optimization: return maximal space which could be needed for this
1733 // string instead of the exact amount which could be less if there are
1734 // any surrogates in the input
1736 // we consider that surrogates are rare enough to make it worthwhile to
1737 // avoid running the loop below at the cost of slightly extra memory
1739 return srcLen
* BYTES_PER_CHAR
;
1742 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1744 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1746 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1748 return wxCONV_FAILED
;
1750 outLen
+= BYTES_PER_CHAR
;
1752 if ( outLen
> dstLen
)
1753 return wxCONV_FAILED
;
1761 // ----------------------------------------------------------------------------
1762 // endian-reversing conversions
1763 // ----------------------------------------------------------------------------
1766 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1767 const char *src
, size_t srcLen
) const
1769 srcLen
= GetLength(src
, srcLen
);
1770 if ( srcLen
== wxNO_LEN
)
1771 return wxCONV_FAILED
;
1773 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1774 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1776 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1779 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1780 if ( numChars
== wxCONV_FAILED
)
1781 return wxCONV_FAILED
;
1786 if ( outLen
> dstLen
)
1787 return wxCONV_FAILED
;
1790 if ( numChars
== 2 )
1792 // second character of a surrogate
1802 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1803 const wchar_t *src
, size_t srcLen
) const
1805 if ( srcLen
== wxNO_LEN
)
1806 srcLen
= wxWcslen(src
) + 1;
1810 // optimization: return maximal space which could be needed for this
1811 // string instead of the exact amount which could be less if there are
1812 // any surrogates in the input
1814 // we consider that surrogates are rare enough to make it worthwhile to
1815 // avoid running the loop below at the cost of slightly extra memory
1817 return srcLen
*BYTES_PER_CHAR
;
1820 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1822 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1824 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1826 return wxCONV_FAILED
;
1828 outLen
+= BYTES_PER_CHAR
;
1830 if ( outLen
> dstLen
)
1831 return wxCONV_FAILED
;
1833 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1839 #else // !WC_UTF16: wchar_t is UTF-32
1841 // ----------------------------------------------------------------------------
1842 // conversions without endianness change
1843 // ----------------------------------------------------------------------------
1846 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1847 const char *src
, size_t srcLen
) const
1849 // use memcpy() as it should be much faster than hand-written loop
1850 srcLen
= GetLength(src
, srcLen
);
1851 if ( srcLen
== wxNO_LEN
)
1852 return wxCONV_FAILED
;
1854 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1857 if ( dstLen
< inLen
)
1858 return wxCONV_FAILED
;
1860 memcpy(dst
, src
, srcLen
);
1867 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1868 const wchar_t *src
, size_t srcLen
) const
1870 if ( srcLen
== wxNO_LEN
)
1871 srcLen
= wxWcslen(src
) + 1;
1873 srcLen
*= BYTES_PER_CHAR
;
1877 if ( dstLen
< srcLen
)
1878 return wxCONV_FAILED
;
1880 memcpy(dst
, src
, srcLen
);
1886 // ----------------------------------------------------------------------------
1887 // endian-reversing conversions
1888 // ----------------------------------------------------------------------------
1891 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1892 const char *src
, size_t srcLen
) const
1894 srcLen
= GetLength(src
, srcLen
);
1895 if ( srcLen
== wxNO_LEN
)
1896 return wxCONV_FAILED
;
1898 srcLen
/= BYTES_PER_CHAR
;
1902 if ( dstLen
< srcLen
)
1903 return wxCONV_FAILED
;
1905 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1906 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1908 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1916 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1917 const wchar_t *src
, size_t srcLen
) const
1919 if ( srcLen
== wxNO_LEN
)
1920 srcLen
= wxWcslen(src
) + 1;
1922 srcLen
*= BYTES_PER_CHAR
;
1926 if ( dstLen
< srcLen
)
1927 return wxCONV_FAILED
;
1929 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1930 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1932 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1939 #endif // WC_UTF16/!WC_UTF16
1942 // ============================================================================
1943 // The classes doing conversion using the iconv_xxx() functions
1944 // ============================================================================
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// NB: the macro arguments are parenthesized so that passing a compound
//     expression (e.g. "a & b") can't change the meaning of the comparisons
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif
1963 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1965 #define ICONV_T_INVALID ((iconv_t)-1)
1967 #if SIZEOF_WCHAR_T == 4
1968 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1969 #define WC_ENC wxFONTENCODING_UTF32
1970 #elif SIZEOF_WCHAR_T == 2
1971 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1972 #define WC_ENC wxFONTENCODING_UTF16
1973 #else // sizeof(wchar_t) != 2 nor 4
1974 // does this ever happen?
1975 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1978 // ----------------------------------------------------------------------------
1979 // wxMBConv_iconv: encapsulates an iconv character set
1980 // ----------------------------------------------------------------------------
1982 class wxMBConv_iconv
: public wxMBConv
1985 wxMBConv_iconv(const char *name
);
1986 virtual ~wxMBConv_iconv();
1988 // implement base class virtual methods
1989 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
1990 const char *src
, size_t srcLen
= wxNO_LEN
) const;
1991 virtual size_t FromWChar(char *dst
, size_t dstLen
,
1992 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
1993 virtual size_t GetMBNulLen() const;
1995 #if wxUSE_UNICODE_UTF8
1996 virtual bool IsUTF8() const;
1999 virtual wxMBConv
*Clone() const
2001 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
2002 p
->m_minMBCharWidth
= m_minMBCharWidth
;
2007 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2010 // the iconv handlers used to translate from multibyte
2011 // to wide char and in the other direction
2016 // guards access to m2w and w2m objects
2017 wxMutex m_iconvMutex
;
2021 // the name (for iconv_open()) of a wide char charset -- if none is
2022 // available on this machine, it will remain NULL
2023 static wxString ms_wcCharsetName
;
2025 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2026 // different endian-ness than the native one
2027 static bool ms_wcNeedsSwap
;
2030 // name of the encoding handled by this conversion
2033 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2035 size_t m_minMBCharWidth
;
2038 // make the constructor available for unit testing
2039 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2041 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2042 if ( !result
->IsOk() )
2051 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2052 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
2054 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2057 m_minMBCharWidth
= 0;
2059 // check for charset that represents wchar_t:
2060 if ( ms_wcCharsetName
.empty() )
2062 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
2065 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2066 #else // !wxUSE_FONTMAP
2067 static const wxChar
*names_static
[] =
2069 #if SIZEOF_WCHAR_T == 4
2071 #elif SIZEOF_WCHAR_T = 2
2076 const wxChar
**names
= names_static
;
2077 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2079 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2081 const wxString
nameCS(*names
);
2083 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2084 wxString
nameXE(nameCS
);
2086 #ifdef WORDS_BIGENDIAN
2088 #else // little endian
2092 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2095 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2096 if ( m2w
== ICONV_T_INVALID
)
2098 // try charset w/o bytesex info (e.g. "UCS4")
2099 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2101 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2103 // and check for bytesex ourselves:
2104 if ( m2w
!= ICONV_T_INVALID
)
2106 char buf
[2], *bufPtr
;
2115 outsz
= SIZEOF_WCHAR_T
* 2;
2116 char* wbufPtr
= (char*)wbuf
;
2120 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2123 if (ICONV_FAILED(res
, insz
))
2125 wxLogLastError(wxT("iconv"));
2126 wxLogError(_("Conversion to charset '%s' doesn't work."),
2129 else // ok, can convert to this encoding, remember it
2131 ms_wcCharsetName
= nameCS
;
2132 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2136 else // use charset not requiring byte swapping
2138 ms_wcCharsetName
= nameXE
;
2142 wxLogTrace(TRACE_STRCONV
,
2143 wxT("iconv wchar_t charset is \"%s\"%s"),
2144 ms_wcCharsetName
.empty() ? wxString("<none>")
2146 ms_wcNeedsSwap
? _T(" (needs swap)")
2149 else // we already have ms_wcCharsetName
2151 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2154 if ( ms_wcCharsetName
.empty() )
2156 w2m
= ICONV_T_INVALID
;
2160 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2161 if ( w2m
== ICONV_T_INVALID
)
2163 wxLogTrace(TRACE_STRCONV
,
2164 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2165 ms_wcCharsetName
.c_str(), name
);
2170 wxMBConv_iconv::~wxMBConv_iconv()
2172 if ( m2w
!= ICONV_T_INVALID
)
2174 if ( w2m
!= ICONV_T_INVALID
)
2179 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2180 const char *src
, size_t srcLen
) const
2182 if ( srcLen
== wxNO_LEN
)
2184 // find the string length: notice that must be done differently for
2185 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2187 const size_t nulLen
= GetMBNulLen();
2191 return wxCONV_FAILED
;
2194 srcLen
= strlen(src
); // arguably more optimized than our version
2199 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2200 // but they also have to start at character boundary and not
2201 // span two adjacent characters
2203 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2209 // when we're determining the length of the string ourselves we count
2210 // the terminating NUL(s) as part of it and always NUL-terminate the
2215 // we express length in the number of (wide) characters but iconv always
2216 // counts buffer sizes it in bytes
2217 dstLen
*= SIZEOF_WCHAR_T
;
2220 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2221 // Unfortunately there are a couple of global wxCSConv objects such as
2222 // wxConvLocal that are used all over wx code, so we have to make sure
2223 // the handle is used by at most one thread at the time. Otherwise
2224 // only a few wx classes would be safe to use from non-main threads
2225 // as MB<->WC conversion would fail "randomly".
2226 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2227 #endif // wxUSE_THREADS
2230 const char *pszPtr
= src
;
2234 char* bufPtr
= (char*)dst
;
2236 // have destination buffer, convert there
2237 size_t dstLenOrig
= dstLen
;
2239 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2242 // convert the number of bytes converted as returned by iconv to the
2243 // number of (wide) characters converted that we need
2244 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2248 // convert to native endianness
2249 for ( unsigned i
= 0; i
< res
; i
++ )
2250 dst
[i
] = WC_BSWAP(dst
[i
]);
2253 else // no destination buffer
2255 // convert using temp buffer to calculate the size of the buffer needed
2261 char* bufPtr
= (char*)tbuf
;
2262 dstLen
= 8 * SIZEOF_WCHAR_T
;
2265 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2268 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2270 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2273 if (ICONV_FAILED(cres
, srcLen
))
2275 //VS: it is ok if iconv fails, hence trace only
2276 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2277 return wxCONV_FAILED
;
2283 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2284 const wchar_t *src
, size_t srcLen
) const
2287 // NB: explained in MB2WC
2288 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2291 if ( srcLen
== wxNO_LEN
)
2292 srcLen
= wxWcslen(src
) + 1;
2294 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2295 size_t outbuflen
= dstLen
;
2298 wchar_t *tmpbuf
= 0;
2302 // need to copy to temp buffer to switch endianness
2303 // (doing WC_BSWAP twice on the original buffer won't help, as it
2304 // could be in read-only memory, or be accessed in some other thread)
2305 tmpbuf
= (wchar_t *)malloc(inbuflen
+ SIZEOF_WCHAR_T
);
2306 for ( size_t i
= 0; i
< srcLen
; i
++ )
2307 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2309 tmpbuf
[srcLen
] = L
'\0';
2313 char* inbuf
= (char*)src
;
2316 // have destination buffer, convert there
2317 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2319 res
= dstLen
- outbuflen
;
2321 else // no destination buffer
2323 // convert using temp buffer to calculate the size of the buffer needed
2331 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2333 res
+= 16 - outbuflen
;
2335 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2343 if (ICONV_FAILED(cres
, inbuflen
))
2345 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2346 return wxCONV_FAILED
;
2352 size_t wxMBConv_iconv::GetMBNulLen() const
2354 if ( m_minMBCharWidth
== 0 )
2356 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2359 // NB: explained in MB2WC
2360 wxMutexLocker
lock(self
->m_iconvMutex
);
2363 const wchar_t *wnul
= L
"";
2364 char buf
[8]; // should be enough for NUL in any encoding
2365 size_t inLen
= sizeof(wchar_t),
2366 outLen
= WXSIZEOF(buf
);
2367 char *inBuff
= (char *)wnul
;
2368 char *outBuff
= buf
;
2369 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2371 self
->m_minMBCharWidth
= (size_t)-1;
2375 self
->m_minMBCharWidth
= outBuff
- buf
;
2379 return m_minMBCharWidth
;
2382 #if wxUSE_UNICODE_UTF8
2383 bool wxMBConv_iconv::IsUTF8() const
2385 return wxStricmp(m_name
, "UTF-8") == 0 ||
2386 wxStricmp(m_name
, "UTF8") == 0;
2390 #endif // HAVE_ICONV
2393 // ============================================================================
2394 // Win32 conversion classes
2395 // ============================================================================
2397 #ifdef wxHAVE_WIN32_MB2WC
2401 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2402 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2405 class wxMBConv_win32
: public wxMBConv
2410 m_CodePage
= CP_ACP
;
2411 m_minMBCharWidth
= 0;
2414 wxMBConv_win32(const wxMBConv_win32
& conv
)
2417 m_CodePage
= conv
.m_CodePage
;
2418 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2422 wxMBConv_win32(const char* name
)
2424 m_CodePage
= wxCharsetToCodepage(name
);
2425 m_minMBCharWidth
= 0;
2428 wxMBConv_win32(wxFontEncoding encoding
)
2430 m_CodePage
= wxEncodingToCodepage(encoding
);
2431 m_minMBCharWidth
= 0;
2433 #endif // wxUSE_FONTMAP
2435 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2437 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2438 // the behaviour is not compatible with the Unix version (using iconv)
2439 // and break the library itself, e.g. wxTextInputStream::NextChar()
2440 // wouldn't work if reading an incomplete MB char didn't result in an
2443 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2444 // Win XP or newer and it is not supported for UTF-[78] so we always
2445 // use our own conversions in this case. See
2446 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2447 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2448 if ( m_CodePage
== CP_UTF8
)
2450 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2453 if ( m_CodePage
== CP_UTF7
)
2455 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2459 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2460 IsAtLeastWin2kSP4() )
2462 flags
= MB_ERR_INVALID_CHARS
;
2465 const size_t len
= ::MultiByteToWideChar
2467 m_CodePage
, // code page
2468 flags
, // flags: fall on error
2469 psz
, // input string
2470 -1, // its length (NUL-terminated)
2471 buf
, // output string
2472 buf
? n
: 0 // size of output buffer
2476 // function totally failed
2477 return wxCONV_FAILED
;
2480 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2481 // check if we succeeded, by doing a double trip:
2482 if ( !flags
&& buf
)
2484 const size_t mbLen
= strlen(psz
);
2485 wxCharBuffer
mbBuf(mbLen
);
2486 if ( ::WideCharToMultiByte
2493 mbLen
+ 1, // size in bytes, not length
2497 strcmp(mbBuf
, psz
) != 0 )
2499 // we didn't obtain the same thing we started from, hence
2500 // the conversion was lossy and we consider that it failed
2501 return wxCONV_FAILED
;
2505 // note that it returns count of written chars for buf != NULL and size
2506 // of the needed buffer for buf == NULL so in either case the length of
2507 // the string (which never includes the terminating NUL) is one less
2511 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2514 we have a problem here: by default, WideCharToMultiByte() may
2515 replace characters unrepresentable in the target code page with bad
2516 quality approximations such as turning "1/2" symbol (U+00BD) into
2517 "1" for the code pages which don't have it and we, obviously, want
2518 to avoid this at any price
2520 the trouble is that this function does it _silently_, i.e. it won't
2521 even tell us whether it did or not... Win98/2000 and higher provide
2522 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2523 we have to resort to a round trip, i.e. check that converting back
2524 results in the same string -- this is, of course, expensive but
2525 otherwise we simply can't be sure to not garble the data.
2528 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2529 // it doesn't work with CJK encodings (which we test for rather roughly
2530 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2532 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2535 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2537 // it's our lucky day
2538 flags
= WC_NO_BEST_FIT_CHARS
;
2539 pUsedDef
= &usedDef
;
2541 else // old system or unsupported encoding
2547 const size_t len
= ::WideCharToMultiByte
2549 m_CodePage
, // code page
2550 flags
, // either none or no best fit
2551 pwz
, // input string
2552 -1, // it is (wide) NUL-terminated
2553 buf
, // output buffer
2554 buf
? n
: 0, // and its size
2555 NULL
, // default "replacement" char
2556 pUsedDef
// [out] was it used?
2561 // function totally failed
2562 return wxCONV_FAILED
;
2565 // we did something, check if we really succeeded
2568 // check if the conversion failed, i.e. if any replacements
2571 return wxCONV_FAILED
;
2573 else // we must resort to double tripping...
2575 // first we need to ensure that we really have the MB data: this is
2576 // not the case if we're called with NULL buffer, in which case we
2577 // need to do the conversion yet again
2578 wxCharBuffer bufDef
;
2581 bufDef
= wxCharBuffer(len
);
2582 buf
= bufDef
.data();
2583 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2584 buf
, len
, NULL
, NULL
) )
2585 return wxCONV_FAILED
;
2590 wxWCharBuffer
wcBuf(n
);
2591 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2592 wcscmp(wcBuf
, pwz
) != 0 )
2594 // we didn't obtain the same thing we started from, hence
2595 // the conversion was lossy and we consider that it failed
2596 return wxCONV_FAILED
;
2600 // see the comment above for the reason of "len - 1"
2604 virtual size_t GetMBNulLen() const
2606 if ( m_minMBCharWidth
== 0 )
2608 int len
= ::WideCharToMultiByte
2610 m_CodePage
, // code page
2612 L
"", // input string
2613 1, // translate just the NUL
2614 NULL
, // output buffer
2616 NULL
, // no replacement char
2617 NULL
// [out] don't care if it was used
2620 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2624 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2625 self
->m_minMBCharWidth
= (size_t)-1;
2629 self
->m_minMBCharWidth
= (size_t)-1;
2635 self
->m_minMBCharWidth
= len
;
2640 return m_minMBCharWidth
;
2643 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2645 bool IsOk() const { return m_CodePage
!= -1; }
2648 static bool CanUseNoBestFit()
2650 static int s_isWin98Or2k
= -1;
2652 if ( s_isWin98Or2k
== -1 )
2655 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2657 case wxOS_WINDOWS_9X
:
2658 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2661 case wxOS_WINDOWS_NT
:
2662 s_isWin98Or2k
= verMaj
>= 5;
2666 // unknown: be conservative by default
2671 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2674 return s_isWin98Or2k
== 1;
2677 static bool IsAtLeastWin2kSP4()
2682 static int s_isAtLeastWin2kSP4
= -1;
2684 if ( s_isAtLeastWin2kSP4
== -1 )
2686 OSVERSIONINFOEX ver
;
2688 memset(&ver
, 0, sizeof(ver
));
2689 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2690 GetVersionEx((OSVERSIONINFO
*)&ver
);
2692 s_isAtLeastWin2kSP4
=
2693 ((ver
.dwMajorVersion
> 5) || // Vista+
2694 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2695 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2696 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2700 return s_isAtLeastWin2kSP4
== 1;
2705 // the code page we're working with
2708 // cached result of GetMBNulLen(), set to 0 initially meaning
2710 size_t m_minMBCharWidth
;
2713 #endif // wxHAVE_WIN32_MB2WC
2716 // ============================================================================
2717 // wxEncodingConverter based conversion classes
2718 // ============================================================================
2722 class wxMBConv_wxwin
: public wxMBConv
2727 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2728 // The wxMBConv_cf class does a better job.
2729 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2730 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2731 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2735 // temporarily just use wxEncodingConverter stuff,
2736 // so that it works while a better implementation is built
2737 wxMBConv_wxwin(const char* name
)
2740 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2742 m_enc
= wxFONTENCODING_SYSTEM
;
2747 wxMBConv_wxwin(wxFontEncoding enc
)
2754 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2756 size_t inbuf
= strlen(psz
);
2759 if (!m2w
.Convert(psz
, buf
))
2760 return wxCONV_FAILED
;
2765 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2767 const size_t inbuf
= wxWcslen(psz
);
2770 if (!w2m
.Convert(psz
, buf
))
2771 return wxCONV_FAILED
;
2777 virtual size_t GetMBNulLen() const
2781 case wxFONTENCODING_UTF16BE
:
2782 case wxFONTENCODING_UTF16LE
:
2785 case wxFONTENCODING_UTF32BE
:
2786 case wxFONTENCODING_UTF32LE
:
2794 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2796 bool IsOk() const { return m_ok
; }
2799 wxFontEncoding m_enc
;
2800 wxEncodingConverter m2w
, w2m
;
2803 // were we initialized successfully?
2806 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2809 // make the constructors available for unit testing
2810 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2812 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2813 if ( !result
->IsOk() )
2822 #endif // wxUSE_FONTMAP
2824 // ============================================================================
2825 // wxCSConv implementation
2826 // ============================================================================
2828 void wxCSConv::Init()
2835 wxCSConv::wxCSConv(const wxString
& charset
)
2839 if ( !charset
.empty() )
2841 SetName(charset
.ToAscii());
2845 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2847 m_encoding
= wxFONTENCODING_SYSTEM
;
2851 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2853 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2855 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2857 encoding
= wxFONTENCODING_SYSTEM
;
2862 m_encoding
= encoding
;
2865 wxCSConv::~wxCSConv()
2870 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2875 SetName(conv
.m_name
);
2876 m_encoding
= conv
.m_encoding
;
2879 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2883 SetName(conv
.m_name
);
2884 m_encoding
= conv
.m_encoding
;
2889 void wxCSConv::Clear()
2898 void wxCSConv::SetName(const char *charset
)
2902 m_name
= wxStrdup(charset
);
2909 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2910 wxEncodingNameCache
);
2912 static wxEncodingNameCache gs_nameCache
;
2915 wxMBConv
*wxCSConv::DoCreate() const
2918 wxLogTrace(TRACE_STRCONV
,
2919 wxT("creating conversion for %s"),
2921 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2922 #endif // wxUSE_FONTMAP
2924 // check for the special case of ASCII or ISO8859-1 charset: as we have
2925 // special knowledge of it anyhow, we don't need to create a special
2926 // conversion object
2927 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2928 m_encoding
== wxFONTENCODING_DEFAULT
)
2930 // don't convert at all
2934 // we trust OS to do conversion better than we can so try external
2935 // conversion methods first
2937 // the full order is:
2938 // 1. OS conversion (iconv() under Unix or Win32 API)
2939 // 2. hard coded conversions for UTF
2940 // 3. wxEncodingConverter as fall back
2946 #endif // !wxUSE_FONTMAP
2949 wxFontEncoding
encoding(m_encoding
);
2954 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2962 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2963 #endif // wxUSE_FONTMAP
2967 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2968 if ( it
!= gs_nameCache
.end() )
2970 if ( it
->second
.empty() )
2973 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2980 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2981 // CS : in case this does not return valid names (eg for MacRoman)
2982 // encoding got a 'failure' entry in the cache all the same,
2983 // although it just has to be created using a different method, so
2984 // only store failed iconv creation attempts (or perhaps we
2985 // shoulnd't do this at all ?)
2986 if ( names
[0] != NULL
)
2988 for ( ; *names
; ++names
)
2990 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2991 // will need changes that will obsolete this
2992 wxString
name(*names
);
2993 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2996 gs_nameCache
[encoding
] = *names
;
3003 gs_nameCache
[encoding
] = _T(""); // cache the failure
3006 #endif // wxUSE_FONTMAP
3008 #endif // HAVE_ICONV
3010 #ifdef wxHAVE_WIN32_MB2WC
3013 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3014 : new wxMBConv_win32(m_encoding
);
3023 #endif // wxHAVE_WIN32_MB2WC
3027 // leave UTF16 and UTF32 to the built-ins of wx
3028 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3029 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3032 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3033 : new wxMBConv_cf(m_encoding
);
3035 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3044 #endif // __DARWIN__
3047 wxFontEncoding enc
= m_encoding
;
3049 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3051 // use "false" to suppress interactive dialogs -- we can be called from
3052 // anywhere and popping up a dialog from here is the last thing we want to
3054 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3056 #endif // wxUSE_FONTMAP
3060 case wxFONTENCODING_UTF7
:
3061 return new wxMBConvUTF7
;
3063 case wxFONTENCODING_UTF8
:
3064 return new wxMBConvUTF8
;
3066 case wxFONTENCODING_UTF16BE
:
3067 return new wxMBConvUTF16BE
;
3069 case wxFONTENCODING_UTF16LE
:
3070 return new wxMBConvUTF16LE
;
3072 case wxFONTENCODING_UTF32BE
:
3073 return new wxMBConvUTF32BE
;
3075 case wxFONTENCODING_UTF32LE
:
3076 return new wxMBConvUTF32LE
;
3079 // nothing to do but put here to suppress gcc warnings
3086 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3087 : new wxMBConv_wxwin(m_encoding
);
3093 #endif // wxUSE_FONTMAP
3095 // NB: This is a hack to prevent deadlock. What could otherwise happen
3096 // in Unicode build: wxConvLocal creation ends up being here
3097 // because of some failure and logs the error. But wxLog will try to
3098 // attach a timestamp, for which it will need wxConvLocal (to convert
3099 // time to char* and then wchar_t*), but that fails, tries to log the
3100 // error, but wxLog has an (already locked) critical section that
3101 // guards the static buffer.
3102 static bool alreadyLoggingError
= false;
3103 if (!alreadyLoggingError
)
3105 alreadyLoggingError
= true;
3106 wxLogError(_("Cannot convert from the charset '%s'!"),
3110 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3111 #else // !wxUSE_FONTMAP
3112 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3113 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3116 alreadyLoggingError
= false;
// Creates the real conversion object used by this wxCSConv.
//
// The method is const but has to update the object, so it writes through a
// non-const alias of "this": it may replace m_encoding, stores the result of
// DoCreate() in m_convReal and sets m_deferred to false.
//
// NOTE(review): judging by the name and by m_deferred being cleared at the
// end, the work is presumably done only while m_deferred is still set --
// TODO confirm.
3122 void wxCSConv::CreateConvIfNeeded() const
3126 wxCSConv
*self
= (wxCSConv
*)this; // const_cast: lets this const method modify the members below
3128 // if we have neither the name nor the encoding, use the default
3129 // encoding for this system
3130 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3133 self
->m_encoding
= wxLocale::GetSystemEncoding();
3135 // fallback to some reasonable default:
3136 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3137 #endif // wxUSE_INTL
// build the converter for the (possibly just updated) encoding and record
// that the deferred creation has now taken place
3140 self
->m_convReal
= DoCreate();
3141 self
->m_deferred
= false;
// Tells whether this converter is usable.
//
// The real converter is created first (see CreateConvIfNeeded()); the result
// is then true either because the encoding is wxFONTENCODING_ISO8859_1, for
// which no m_convReal is created, or because m_convReal is non-NULL.
3145 bool wxCSConv::IsOk() const
3147 CreateConvIfNeeded();
3149 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3150 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3151 return true; // always ok as we do it ourselves
3153 // m_convReal->IsOk() is called at its own creation, so we know it must
3154 // be ok if m_convReal is non-NULL
3155 return m_convReal
!= NULL
;
// Converts the multibyte string src into wide characters stored in dst.
//
// Parameters:
//  dst     buffer receiving the wide characters
//  dstLen  number of wide characters dst can hold
//  src     multibyte input
//  srcLen  number of input bytes, or wxNO_LEN to measure src with strlen()
//          (the trailing NUL is then counted as well)
//
// The call is forwarded to m_convReal->ToWChar(). The code following that
// return is the built-in conversion, which turns every input byte into one
// wide character with the same value; it returns wxCONV_FAILED when dstLen
// is smaller than srcLen.
//
// NOTE(review): the built-in path is presumably taken only when there is no
// m_convReal (the ISO8859-1 case) -- TODO confirm.
3158 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3159 const char *src
, size_t srcLen
) const
3161 CreateConvIfNeeded();
// hand the work over to the real converter
3164 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
// built-in conversion: work out how many bytes have to be converted
3167 if ( srcLen
== wxNO_LEN
)
3168 srcLen
= strlen(src
) + 1; // take trailing NUL too
// one wide character is written per input byte, so dst must hold srcLen
3172 if ( dstLen
< srcLen
)
3173 return wxCONV_FAILED
;
3175 for ( size_t n
= 0; n
< srcLen
; n
++ )
// the cast to unsigned char makes each byte widen to a non-negative value
3176 dst
[n
] = (unsigned char)(src
[n
]);
// Converts the wide character string src into multibyte characters in dst.
//
// Parameters:
//  dst     buffer receiving the converted bytes
//  dstLen  number of bytes dst can hold
//  src     wide character input
//  srcLen  number of input characters, or wxNO_LEN to measure src with
//          wxWcslen() (one more is added for the terminator)
//
// The call is forwarded to m_convReal->FromWChar(). The code following that
// return is the built-in conversion: each wide character is narrowed to a
// single char, and wxCONV_FAILED is returned if dstLen is smaller than srcLen
// or if any character is greater than 0xFF.
//
// NOTE(review): the built-in path is presumably taken only when there is no
// m_convReal (the ISO8859-1 case), and the "else" loop presumably runs when
// no output buffer was supplied -- TODO confirm both.
3182 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3183 const wchar_t *src
, size_t srcLen
) const
3185 CreateConvIfNeeded();
// hand the work over to the real converter
3188 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
// built-in conversion: work out how many characters have to be converted
3191 if ( srcLen
== wxNO_LEN
)
3192 srcLen
= wxWcslen(src
) + 1;
// one byte is written per input character, so dst must hold srcLen bytes
3196 if ( dstLen
< srcLen
)
3197 return wxCONV_FAILED
;
3199 for ( size_t n
= 0; n
< srcLen
; n
++ )
// characters above 0xFF cannot be stored in a single byte
3201 if ( src
[n
] > 0xFF )
3202 return wxCONV_FAILED
;
3204 dst
[n
] = (char)src
[n
];
3208 else // still need to check the input validity
3210 for ( size_t n
= 0; n
< srcLen
; n
++ )
// same range check as above, but nothing is written
3212 if ( src
[n
] > 0xFF )
3213 return wxCONV_FAILED
;
// Makes sure the real converter has been created and forwards the query to
// m_convReal->GetMBNulLen(); the ISO-8859-1 case (for which no m_convReal is
// created) is handled after that.
3220 size_t wxCSConv::GetMBNulLen() const
3222 CreateConvIfNeeded();
3226 return m_convReal
->GetMBNulLen();
3229 // otherwise, we are ISO-8859-1
3233 #if wxUSE_UNICODE_UTF8
// Makes sure the real converter has been created and forwards the query to
// m_convReal->IsUTF8(); the ISO-8859-1 case (for which no m_convReal is
// created) is handled after that.
3234 bool wxCSConv::IsUTF8() const
3236 CreateConvIfNeeded();
3240 return m_convReal
->IsUTF8();
3243 // otherwise, we are ISO-8859-1
// Converts the multibyte string s to a wide character buffer using, in this
// order, wxConvLibc, a wxMBConvUTF8 object and wxConvISO8859_1.
//
// An empty wxWCharBuffer is returned early in one case (presumably when s is
// NULL -- TODO confirm).
//
// NOTE(review): each later converter is presumably tried only if the
// previous one did not produce a result -- TODO confirm.
3251 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3254 return wxWCharBuffer();
// first attempt: the C library converter
3256 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
// second attempt: interpret the input as UTF-8
3258 wbuf
= wxMBConvUTF8().cMB2WX(s
);
// last attempt: interpret the input as ISO8859-1
3260 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Converts the wide character string ws to a multibyte buffer using
// wxConvLibc and then a wxMBConvUTF8 object created with the
// MAP_INVALID_UTF8_TO_OCTAL option.
//
// An empty wxCharBuffer is returned early in one case (presumably when ws is
// NULL -- TODO confirm).
//
// NOTE(review): the UTF-8 converter is presumably tried only if wxConvLibc
// did not produce a result -- TODO confirm.
3265 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3268 return wxCharBuffer();
// first attempt: the C library converter
3270 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
// second attempt: UTF-8 with the MAP_INVALID_UTF8_TO_OCTAL option
3272 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3277 #endif // wxUSE_UNICODE
3279 // ----------------------------------------------------------------------------
3281 // ----------------------------------------------------------------------------
3283 // NB: The reason why we create converter objects in this convoluted way,
3284 // using a factory function instead of global variable, is that they
3285 // may be used at static initialization time (some of them are used by
3286 // wxString ctors and there may be a global wxString object). In other
3287 // words, possibly _before_ the converter global object would be
3294 #undef wxConvISO8859_1
// Defines everything needed for one global converter called "name":
//  - name##Ptr: an exported klass* variable, initialized to NULL;
//  - wxGet_##name##Ptr(): an exported function returning the address of a
//    function-local static impl_klass object constructed with ctor_args;
//  - gs_##name##instance: a file-static klass* initialized by calling that
//    function.
//
// klass is the type used for the pointers, impl_klass the type of the object
// actually created. The expansion does not end with a semicolon, so the macro
// invocation has to supply it.
3296 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3297 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3298 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3300 static impl_klass name##Obj ctor_args; \
3301 return &name##Obj; \
3303 /* this ensures that all global converter objects are created */ \
3304 /* by the time static initialization is done, i.e. before any */ \
3305 /* thread is launched: */ \
3306 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for WX_DEFINE_GLOBAL_CONV2 for the case where the same class
// (klass) serves both as the pointer type and as the type of the object
// created.
3308 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3309 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// wxConvLibc: accessed as a wxMBConv and created without constructor
// arguments; the object is either a wxMBConv_win32 or a wxMBConvLibc.
// NOTE(review): the two definitions below are presumably alternatives
// selected by the preprocessor -- TODO confirm.
3312 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3314 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3317 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3318 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3319 // provokes an error message about "not enough macro parameters"; and we
3320 // can't use "()" here as the name##Obj declaration would be parsed as a
3321 // function declaration then, so use a semicolon and live with an extra
3322 // empty statement (and hope that no compiler warns about this)
3323 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3324 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
// the two wxCSConv-based converters are constructed from a font encoding
3326 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3327 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// Exported converter pointers; they are initialized through the wxGet_*Ptr()
// accessor functions rather than from the converter objects directly.
// wxConvCurrent starts out as the C library converter and wxConvUI as the
// wxConvLocal converter.
3329 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3330 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3333 // The xnu kernel always communicates file paths in decomposed UTF-8.
3334 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3335 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// wxConvFileName: outside of Darwin this is the C library converter.
// NOTE(review): on Darwin it presumably points to wxConvMacUTF8DObj defined
// just above -- TODO confirm.
3338 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3341 #else // !__DARWIN__
3342 wxGet_wxConvLibcPtr();
3343 #endif // __DARWIN__/!__DARWIN__
3345 #else // !wxUSE_WCHAR_T
3347 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3348 // stand-ins in absence of wchar_t
3349 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3354 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T