1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
53 #include "wx/thread.h"
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
65 #include <ATSUnicode.h>
66 #include <TextCommon.h>
67 #include <TextEncodingConverter.h>
70 // includes Mac headers
71 #include "wx/mac/private.h"
75 #define TRACE_STRCONV _T("strconv")
77 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
79 #if SIZEOF_WCHAR_T == 2
84 // ============================================================================
86 // ============================================================================
88 // helper function of ToWChar() and cWC2MB(): check whether the n bytes at this location are not all NUL
89 static bool NotAllNULs(const char *p
, size_t n
)
91 while ( n
&& *p
++ == '\0' )
97 // ----------------------------------------------------------------------------
98 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
99 // ----------------------------------------------------------------------------
101 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
106 *output
= (wxUint16
) input
;
110 else if (input
>= 0x110000)
112 return wxCONV_FAILED
;
118 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
119 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
126 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
128 if ((*input
< 0xd800) || (*input
> 0xdfff))
133 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
136 return wxCONV_FAILED
;
140 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
146 typedef wchar_t wxDecodeSurrogate_t
;
148 typedef wxUint16 wxDecodeSurrogate_t
;
149 #endif // WC_UTF16/!WC_UTF16
151 // returns the next UTF-32 character from the wchar_t buffer and advances the
152 // pointer to the character after this one
154 // if an invalid character is found, *pSrc is set to NULL, the caller must
156 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
160 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
161 if ( n
== wxCONV_FAILED
)
169 // ----------------------------------------------------------------------------
171 // ----------------------------------------------------------------------------
174 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
175 const char *src
, size_t srcLen
) const
177 // although new conversion classes are supposed to implement this function
178 // directly, the existing ones only implement the old MB2WC() and so, to
179 // avoid having to rewrite all conversion classes at once, we provide a
180 // default (but not efficient) implementation of this one in terms of the
181 // old function by copying the input to ensure that it's NUL-terminated and
182 // then using MB2WC() to convert it
184 // the number of chars [which would be] written to dst [if it were not NULL]
185 size_t dstWritten
= 0;
187 // the number of NULs terminating this string
188 size_t nulLen
= 0; // not really needed, but just to avoid warnings
190 // if we were not given the input size we just have to assume that the
191 // string is properly terminated as we have no way of knowing how long it
192 // is anyhow, but if we do have the size check whether there are enough
196 if ( srcLen
!= wxNO_LEN
)
198 // we need to know how to find the end of this string
199 nulLen
= GetMBNulLen();
200 if ( nulLen
== wxCONV_FAILED
)
201 return wxCONV_FAILED
;
203 // if there are enough NULs we can avoid the copy
204 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
206 // make a copy in order to properly NUL-terminate the string
207 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
208 char * const p
= bufTmp
.data();
209 memcpy(p
, src
, srcLen
);
210 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
216 srcEnd
= src
+ srcLen
;
218 else // quit after the first loop iteration
225 // try to convert the current chunk
226 size_t lenChunk
= MB2WC(NULL
, src
, 0);
227 if ( lenChunk
== wxCONV_FAILED
)
228 return wxCONV_FAILED
;
230 lenChunk
++; // for the L'\0' at the end of this chunk
232 dstWritten
+= lenChunk
;
236 // nothing left in the input string, conversion succeeded
242 if ( dstWritten
> dstLen
)
243 return wxCONV_FAILED
;
245 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
246 return wxCONV_FAILED
;
253 // we convert just one chunk in this case as this is the entire
258 // advance the input pointer past the end of this chunk
259 while ( NotAllNULs(src
, nulLen
) )
261 // notice that we must skip over multiple bytes here as we suppose
262 // that if NUL takes 2 or 4 bytes, then all the other characters do
263 // too and so if advanced by a single byte we might erroneously
264 // detect sequences of NUL bytes in the middle of the input
268 src
+= nulLen
; // skipping over its terminator as well
270 // note that ">=" (and not just "==") is needed here as the terminator
271 // we skipped just above could be inside or just after the buffer
272 // delimited by srcEnd
281 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
282 const wchar_t *src
, size_t srcLen
) const
284 // the number of chars [which would be] written to dst [if it were not NULL]
285 size_t dstWritten
= 0;
287 // make a copy of the input string unless it is already properly
290 // if we don't know its length we have no choice but to assume that it is,
291 // indeed, properly terminated
292 wxWCharBuffer bufTmp
;
293 if ( srcLen
== wxNO_LEN
)
295 srcLen
= wxWcslen(src
) + 1;
297 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
299 // make a copy in order to properly NUL-terminate the string
300 bufTmp
= wxWCharBuffer(srcLen
);
301 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
305 const size_t lenNul
= GetMBNulLen();
306 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
308 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
310 // try to convert the current chunk
311 size_t lenChunk
= WC2MB(NULL
, src
, 0);
313 if ( lenChunk
== wxCONV_FAILED
)
314 return wxCONV_FAILED
;
317 dstWritten
+= lenChunk
;
321 if ( dstWritten
> dstLen
)
322 return wxCONV_FAILED
;
324 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
325 return wxCONV_FAILED
;
334 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
336 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
337 if ( rc
!= wxCONV_FAILED
)
339 // ToWChar() returns the buffer length, i.e. including the trailing
340 // NUL, while this method doesn't take it into account
347 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
349 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
350 if ( rc
!= wxCONV_FAILED
)
358 wxMBConv::~wxMBConv()
360 // nothing to do here (necessary for Darwin linking probably)
363 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
367 // calculate the length of the buffer needed first
368 const size_t nLen
= MB2WC(NULL
, psz
, 0);
369 if ( nLen
!= wxCONV_FAILED
)
371 // now do the actual conversion
372 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
374 // +1 for the trailing NULL
375 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
380 return wxWCharBuffer();
383 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
387 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
388 if ( nLen
!= wxCONV_FAILED
)
390 // extra space for trailing NUL(s)
391 static const size_t extraLen
= GetMaxMBNulLen();
393 wxCharBuffer
buf(nLen
+ extraLen
- 1);
394 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
399 return wxCharBuffer();
403 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
405 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
406 if ( dstLen
!= wxCONV_FAILED
)
408 wxWCharBuffer
wbuf(dstLen
- 1);
409 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
414 if ( wbuf
[dstLen
- 1] == L
'\0' )
425 return wxWCharBuffer();
429 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
431 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
432 if ( dstLen
!= wxCONV_FAILED
)
434 // special case of empty input: can't allocate 0 size buffer below as
435 // wxCharBuffer insists on NUL-terminating it
436 wxCharBuffer
buf(dstLen
? dstLen
- 1 : 1);
437 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
443 const size_t nulLen
= GetMBNulLen();
444 if ( dstLen
>= nulLen
&&
445 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
447 // in this case the output is NUL-terminated and we're not
448 // supposed to count NUL
460 return wxCharBuffer();
463 // ----------------------------------------------------------------------------
465 // ----------------------------------------------------------------------------
467 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
469 return wxMB2WC(buf
, psz
, n
);
472 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
474 return wxWC2MB(buf
, psz
, n
);
477 // ----------------------------------------------------------------------------
478 // wxConvBrokenFileNames
479 // ----------------------------------------------------------------------------
483 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
485 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
486 wxStricmp(charset
, _T("UTF8")) == 0 )
487 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
489 m_conv
= new wxCSConv(charset
);
494 // ----------------------------------------------------------------------------
496 // ----------------------------------------------------------------------------
498 // Implementation (C) 2004 Fredrik Roubert
// BASE64 decoding table
//
// The table is indexed by the value of an input byte (0..255). The entries
// for the 64 characters of the BASE64 alphabet ('A'-'Z', 'a'-'z', '0'-'9',
// '+' and '/') contain the corresponding 6 bit value, all the other entries
// are 0xff which the UTF-7 decoder uses to detect the end of a BASE64 run.
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x27: control characters and punctuation, not BASE64
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x28 - 0x2f: '+' is 62 and '/' is 63
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: digits '0'-'9' are 52..61
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x5f: upper case letters 'A'-'Z' are 0..25
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x7f: lower case letters 'a'-'z' are 26..51
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: non-ASCII bytes, never BASE64
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
539 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
543 while ( *psz
&& (!buf
|| (len
< n
)) )
545 unsigned char cc
= *psz
++;
553 else if (*psz
== '-')
561 else // start of BASE64 encoded string
565 for ( ok
= lsb
= false, d
= 0, l
= 0;
566 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
571 for (l
+= 6; l
>= 8; lsb
= !lsb
)
573 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
583 *buf
= (wchar_t)(c
<< 8);
592 // in valid UTF7 we should have valid characters after '+'
593 return wxCONV_FAILED
;
601 if ( buf
&& (len
< n
) )
// BASE64 encoding table
//
// Maps a 6 bit value (0..63) to the character of the BASE64 alphabet used to
// represent it: 'A'-'Z', then 'a'-'z', then '0'-'9' and finally '+' and '/'.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
// UTF-7 encoding table
//
// Classifies each of the 128 ASCII characters, the table is indexed by the
// character code:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The encoder only writes a character directly if its class is 0, i.e. it
// tests for "utf7encode[cc] < 1".
static const unsigned char utf7encode[128] =
{
    // 0x00 - 0x1f: control characters (TAB, LF and CR count as whitespace)
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0x20 - 0x3f: space, punctuation and digits
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    // 0x40 - 0x5f: '@', upper case letters and brackets
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    // 0x60 - 0x7f: '`', lower case letters, braces, '~' and DEL
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
642 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
646 while (*psz
&& ((!buf
) || (len
< n
)))
649 if (cc
< 0x80 && utf7encode
[cc
] < 1)
658 else if (((wxUint32
)cc
) > 0xffff)
660 // no surrogate pair generation (yet?)
661 return wxCONV_FAILED
;
672 // BASE64 encode string
673 unsigned int lsb
, d
, l
;
674 for (d
= 0, l
= 0; /*nothing*/; psz
++)
676 for (lsb
= 0; lsb
< 2; lsb
++)
679 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
681 for (l
+= 8; l
>= 6; )
685 *buf
++ = utf7enb64
[(d
>> l
) % 64];
691 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
698 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
710 if (buf
&& (len
< n
))
716 // ----------------------------------------------------------------------------
718 // ----------------------------------------------------------------------------
720 static wxUint32 utf8_max
[]=
721 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
723 // boundaries of the private use area range we use to (temporarily) remap
724 // the bytes which are invalid in a UTF-8 encoded string
//
// The range is half-open: wxUnicodePUA is its first value and wxUnicodePUAEnd
// is one past its last one. It contains 256 values because an invalid byte is
// mapped to wxUnicodePUA plus the value of this byte.
725 const wxUint32 wxUnicodePUA
= 0x100000;
726 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
728 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
732 while (*psz
&& ((!buf
) || (len
< n
)))
734 const char *opsz
= psz
;
735 bool invalid
= false;
736 unsigned char cc
= *psz
++, fc
= cc
;
738 for (cnt
= 0; fc
& 0x80; cnt
++)
748 // escape the escape character for octal escapes
749 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
750 && cc
== '\\' && (!buf
|| len
< n
))
762 // invalid UTF-8 sequence
767 unsigned ocnt
= cnt
- 1;
768 wxUint32 res
= cc
& (0x3f >> cnt
);
772 if ((cc
& 0xC0) != 0x80)
774 // invalid UTF-8 sequence
780 res
= (res
<< 6) | (cc
& 0x3f);
783 if (invalid
|| res
<= utf8_max
[ocnt
])
785 // illegal UTF-8 encoding
788 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
789 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
791 // if one of our PUA characters turns up externally
792 // it must also be treated as an illegal sequence
793 // (a bit like you have to escape an escape character)
799 // cast is ok because wchar_t == wxUint16 if WC_UTF16
800 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
801 if (pa
== wxCONV_FAILED
)
813 *buf
++ = (wchar_t)res
;
815 #endif // WC_UTF16/!WC_UTF16
821 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
823 while (opsz
< psz
&& (!buf
|| len
< n
))
826 // cast is ok because wchar_t == wxUint16 if WC_UTF16
827 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
828 wxASSERT(pa
!= wxCONV_FAILED
);
835 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
841 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
843 while (opsz
< psz
&& (!buf
|| len
< n
))
845 if ( buf
&& len
+ 3 < n
)
847 unsigned char on
= *opsz
;
849 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
850 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
851 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
858 else // MAP_INVALID_UTF8_NOT
860 return wxCONV_FAILED
;
866 if (buf
&& (len
< n
))
// Returns true if wch is one of the octal digit characters, i.e. lies in the
// range L'0'..L'7' inclusive, and false for any other character.
static inline bool isoctal(wchar_t wch)
{
    return L'0' <= wch && wch <= L'7';
}
877 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
881 while (*psz
&& ((!buf
) || (len
< n
)))
886 // cast is ok for WC_UTF16
887 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
888 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
890 cc
= (*psz
++) & 0x7fffffff;
893 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
894 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
897 *buf
++ = (char)(cc
- wxUnicodePUA
);
900 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
901 && cc
== L
'\\' && psz
[0] == L
'\\' )
908 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
910 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
914 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
915 (psz
[1] - L
'0') * 010 +
925 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
941 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
943 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
949 if (buf
&& (len
< n
))
955 // ============================================================================
957 // ============================================================================
959 #ifdef WORDS_BIGENDIAN
960 #define wxMBConvUTF16straight wxMBConvUTF16BE
961 #define wxMBConvUTF16swap wxMBConvUTF16LE
963 #define wxMBConvUTF16swap wxMBConvUTF16BE
964 #define wxMBConvUTF16straight wxMBConvUTF16LE
968 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
970 if ( srcLen
== wxNO_LEN
)
972 // count the number of bytes in input, including the trailing NULs
973 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
974 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
977 srcLen
*= BYTES_PER_CHAR
;
979 else // we already have the length
981 // we can only convert an entire number of UTF-16 characters
982 if ( srcLen
% BYTES_PER_CHAR
)
983 return wxCONV_FAILED
;
989 // case when in-memory representation is UTF-16 too
992 // ----------------------------------------------------------------------------
993 // conversions without endianness change
994 // ----------------------------------------------------------------------------
997 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
998 const char *src
, size_t srcLen
) const
1000 // set up the scene for using memcpy() (which is presumably more efficient
1001 // than copying the bytes one by one)
1002 srcLen
= GetLength(src
, srcLen
);
1003 if ( srcLen
== wxNO_LEN
)
1004 return wxCONV_FAILED
;
1006 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1009 if ( dstLen
< inLen
)
1010 return wxCONV_FAILED
;
1012 memcpy(dst
, src
, srcLen
);
1019 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1020 const wchar_t *src
, size_t srcLen
) const
1022 if ( srcLen
== wxNO_LEN
)
1023 srcLen
= wxWcslen(src
) + 1;
1025 srcLen
*= BYTES_PER_CHAR
;
1029 if ( dstLen
< srcLen
)
1030 return wxCONV_FAILED
;
1032 memcpy(dst
, src
, srcLen
);
1038 // ----------------------------------------------------------------------------
1039 // endian-reversing conversions
1040 // ----------------------------------------------------------------------------
1043 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1044 const char *src
, size_t srcLen
) const
1046 srcLen
= GetLength(src
, srcLen
);
1047 if ( srcLen
== wxNO_LEN
)
1048 return wxCONV_FAILED
;
1050 srcLen
/= BYTES_PER_CHAR
;
1054 if ( dstLen
< srcLen
)
1055 return wxCONV_FAILED
;
1057 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1058 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1060 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1068 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1069 const wchar_t *src
, size_t srcLen
) const
1071 if ( srcLen
== wxNO_LEN
)
1072 srcLen
= wxWcslen(src
) + 1;
1074 srcLen
*= BYTES_PER_CHAR
;
1078 if ( dstLen
< srcLen
)
1079 return wxCONV_FAILED
;
1081 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1082 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1084 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1091 #else // !WC_UTF16: wchar_t is UTF-32
1093 // ----------------------------------------------------------------------------
1094 // conversions without endianness change
1095 // ----------------------------------------------------------------------------
1098 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1099 const char *src
, size_t srcLen
) const
1101 srcLen
= GetLength(src
, srcLen
);
1102 if ( srcLen
== wxNO_LEN
)
1103 return wxCONV_FAILED
;
1105 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1108 // optimization: return maximal space which could be needed for this
1109 // string even if the real size could be smaller if the buffer contains
1115 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1116 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1118 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1120 return wxCONV_FAILED
;
1122 if ( ++outLen
> dstLen
)
1123 return wxCONV_FAILED
;
1133 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1134 const wchar_t *src
, size_t srcLen
) const
1136 if ( srcLen
== wxNO_LEN
)
1137 srcLen
= wxWcslen(src
) + 1;
1140 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1141 for ( size_t n
= 0; n
< srcLen
; n
++ )
1144 const size_t numChars
= encode_utf16(*src
++, cc
);
1145 if ( numChars
== wxCONV_FAILED
)
1146 return wxCONV_FAILED
;
1148 outLen
+= numChars
* BYTES_PER_CHAR
;
1151 if ( outLen
> dstLen
)
1152 return wxCONV_FAILED
;
1155 if ( numChars
== 2 )
1157 // second character of a surrogate
1166 // ----------------------------------------------------------------------------
1167 // endian-reversing conversions
1168 // ----------------------------------------------------------------------------
1171 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1172 const char *src
, size_t srcLen
) const
1174 srcLen
= GetLength(src
, srcLen
);
1175 if ( srcLen
== wxNO_LEN
)
1176 return wxCONV_FAILED
;
1178 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1181 // optimization: return maximal space which could be needed for this
1182 // string even if the real size could be smaller if the buffer contains
1188 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1189 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1194 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1196 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1198 const size_t numChars
= decode_utf16(tmp
, ch
);
1199 if ( numChars
== wxCONV_FAILED
)
1200 return wxCONV_FAILED
;
1202 if ( numChars
== 2 )
1205 if ( ++outLen
> dstLen
)
1206 return wxCONV_FAILED
;
1216 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1217 const wchar_t *src
, size_t srcLen
) const
1219 if ( srcLen
== wxNO_LEN
)
1220 srcLen
= wxWcslen(src
) + 1;
1223 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1224 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1227 const size_t numChars
= encode_utf16(*src
, cc
);
1228 if ( numChars
== wxCONV_FAILED
)
1229 return wxCONV_FAILED
;
1231 outLen
+= numChars
* BYTES_PER_CHAR
;
1234 if ( outLen
> dstLen
)
1235 return wxCONV_FAILED
;
1237 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1238 if ( numChars
== 2 )
1240 // second character of a surrogate
1241 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1249 #endif // WC_UTF16/!WC_UTF16
1252 // ============================================================================
1254 // ============================================================================
1256 #ifdef WORDS_BIGENDIAN
1257 #define wxMBConvUTF32straight wxMBConvUTF32BE
1258 #define wxMBConvUTF32swap wxMBConvUTF32LE
1260 #define wxMBConvUTF32swap wxMBConvUTF32BE
1261 #define wxMBConvUTF32straight wxMBConvUTF32LE
1265 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1266 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1269 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1271 if ( srcLen
== wxNO_LEN
)
1273 // count the number of bytes in input, including the trailing NULs
1274 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1275 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1278 srcLen
*= BYTES_PER_CHAR
;
1280 else // we already have the length
1282 // we can only convert an entire number of UTF-32 characters
1283 if ( srcLen
% BYTES_PER_CHAR
)
1284 return wxCONV_FAILED
;
1290 // case when in-memory representation is UTF-16
1293 // ----------------------------------------------------------------------------
1294 // conversions without endianness change
1295 // ----------------------------------------------------------------------------
1298 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1299 const char *src
, size_t srcLen
) const
1301 srcLen
= GetLength(src
, srcLen
);
1302 if ( srcLen
== wxNO_LEN
)
1303 return wxCONV_FAILED
;
1305 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1306 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1308 for ( size_t n
= 0; n
< inLen
; n
++ )
1311 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1312 if ( numChars
== wxCONV_FAILED
)
1313 return wxCONV_FAILED
;
1318 if ( outLen
> dstLen
)
1319 return wxCONV_FAILED
;
1322 if ( numChars
== 2 )
1324 // second character of a surrogate
1334 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1335 const wchar_t *src
, size_t srcLen
) const
1337 if ( srcLen
== wxNO_LEN
)
1338 srcLen
= wxWcslen(src
) + 1;
1342 // optimization: return maximal space which could be needed for this
1343 // string instead of the exact amount which could be less if there are
1344 // any surrogates in the input
1346 // we consider that surrogates are rare enough to make it worthwhile to
1347 // avoid running the loop below at the cost of slightly extra memory
1349 return srcLen
* BYTES_PER_CHAR
;
1352 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1354 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1356 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1358 return wxCONV_FAILED
;
1360 outLen
+= BYTES_PER_CHAR
;
1362 if ( outLen
> dstLen
)
1363 return wxCONV_FAILED
;
1371 // ----------------------------------------------------------------------------
1372 // endian-reversing conversions
1373 // ----------------------------------------------------------------------------
1376 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1377 const char *src
, size_t srcLen
) const
1379 srcLen
= GetLength(src
, srcLen
);
1380 if ( srcLen
== wxNO_LEN
)
1381 return wxCONV_FAILED
;
1383 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1384 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1386 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1389 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1390 if ( numChars
== wxCONV_FAILED
)
1391 return wxCONV_FAILED
;
1396 if ( outLen
> dstLen
)
1397 return wxCONV_FAILED
;
1400 if ( numChars
== 2 )
1402 // second character of a surrogate
1412 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1413 const wchar_t *src
, size_t srcLen
) const
1415 if ( srcLen
== wxNO_LEN
)
1416 srcLen
= wxWcslen(src
) + 1;
1420 // optimization: return maximal space which could be needed for this
1421 // string instead of the exact amount which could be less if there are
1422 // any surrogates in the input
1424 // we consider that surrogates are rare enough to make it worthwhile to
1425 // avoid running the loop below at the cost of slightly extra memory
1427 return srcLen
*BYTES_PER_CHAR
;
1430 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1432 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1434 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1436 return wxCONV_FAILED
;
1438 outLen
+= BYTES_PER_CHAR
;
1440 if ( outLen
> dstLen
)
1441 return wxCONV_FAILED
;
1443 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1449 #else // !WC_UTF16: wchar_t is UTF-32
1451 // ----------------------------------------------------------------------------
1452 // conversions without endianness change
1453 // ----------------------------------------------------------------------------
1456 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1457 const char *src
, size_t srcLen
) const
1459 // use memcpy() as it should be much faster than hand-written loop
1460 srcLen
= GetLength(src
, srcLen
);
1461 if ( srcLen
== wxNO_LEN
)
1462 return wxCONV_FAILED
;
1464 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1467 if ( dstLen
< inLen
)
1468 return wxCONV_FAILED
;
1470 memcpy(dst
, src
, srcLen
);
1477 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1478 const wchar_t *src
, size_t srcLen
) const
1480 if ( srcLen
== wxNO_LEN
)
1481 srcLen
= wxWcslen(src
) + 1;
1483 srcLen
*= BYTES_PER_CHAR
;
1487 if ( dstLen
< srcLen
)
1488 return wxCONV_FAILED
;
1490 memcpy(dst
, src
, srcLen
);
1496 // ----------------------------------------------------------------------------
1497 // endian-reversing conversions
1498 // ----------------------------------------------------------------------------
1501 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1502 const char *src
, size_t srcLen
) const
1504 srcLen
= GetLength(src
, srcLen
);
1505 if ( srcLen
== wxNO_LEN
)
1506 return wxCONV_FAILED
;
1508 srcLen
/= BYTES_PER_CHAR
;
1512 if ( dstLen
< srcLen
)
1513 return wxCONV_FAILED
;
1515 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1516 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1518 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1526 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1527 const wchar_t *src
, size_t srcLen
) const
1529 if ( srcLen
== wxNO_LEN
)
1530 srcLen
= wxWcslen(src
) + 1;
1532 srcLen
*= BYTES_PER_CHAR
;
1536 if ( dstLen
< srcLen
)
1537 return wxCONV_FAILED
;
1539 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1540 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1542 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1549 #endif // WC_UTF16/!WC_UTF16
1552 // ============================================================================
1553 // The classes doing conversion using the iconv_xxx() functions
1554 // ============================================================================
1558 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1559 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1560 // (unless there's yet another bug in glibc) the only case when iconv()
1561 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1562 // left in the input buffer -- when _real_ error occurs,
1563 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1565 // [This bug does not appear in glibc 2.2.]
1566 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1567 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1568 (errno != E2BIG || bufLeft != 0))
1570 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1573 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1575 #define ICONV_T_INVALID ((iconv_t)-1)
1577 #if SIZEOF_WCHAR_T == 4
1578 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1579 #define WC_ENC wxFONTENCODING_UTF32
1580 #elif SIZEOF_WCHAR_T == 2
1581 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1582 #define WC_ENC wxFONTENCODING_UTF16
1583 #else // sizeof(wchar_t) != 2 nor 4
1584 // does this ever happen?
1585 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1588 // ----------------------------------------------------------------------------
1589 // wxMBConv_iconv: encapsulates an iconv character set
1590 // ----------------------------------------------------------------------------
1592 class wxMBConv_iconv
: public wxMBConv
1595 wxMBConv_iconv(const char *name
);
1596 virtual ~wxMBConv_iconv();
1598 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1599 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1601 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1602 virtual size_t GetMBNulLen() const;
1604 #if wxUSE_UNICODE_UTF8
1605 virtual bool IsUTF8() const;
1608 virtual wxMBConv
*Clone() const
1610 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1611 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1616 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1619 // the iconv handlers used to translate from multibyte
1620 // to wide char and in the other direction
1625 // guards access to m2w and w2m objects
1626 wxMutex m_iconvMutex
;
1630 // the name (for iconv_open()) of a wide char charset -- if none is
1631 // available on this machine, it will remain NULL
1632 static wxString ms_wcCharsetName
;
1634 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1635 // different endian-ness than the native one
1636 static bool ms_wcNeedsSwap
;
1639 // name of the encoding handled by this conversion
1642 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1644 size_t m_minMBCharWidth
;
1647 // make the constructor available for unit testing
1648 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
1650 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1651 if ( !result
->IsOk() )
1660 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1661 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1663 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
1666 m_minMBCharWidth
= 0;
1668 // check for charset that represents wchar_t:
1669 if ( ms_wcCharsetName
.empty() )
1671 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1674 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1675 #else // !wxUSE_FONTMAP
1676 static const wxChar
*names_static
[] =
1678 #if SIZEOF_WCHAR_T == 4
1680 #elif SIZEOF_WCHAR_T = 2
1685 const wxChar
**names
= names_static
;
1686 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1688 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1690 const wxString
nameCS(*names
);
1692 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1693 wxString
nameXE(nameCS
);
1695 #ifdef WORDS_BIGENDIAN
1697 #else // little endian
1701 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1704 m2w
= iconv_open(nameXE
.ToAscii(), name
);
1705 if ( m2w
== ICONV_T_INVALID
)
1707 // try charset w/o bytesex info (e.g. "UCS4")
1708 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1710 m2w
= iconv_open(nameCS
.ToAscii(), name
);
1712 // and check for bytesex ourselves:
1713 if ( m2w
!= ICONV_T_INVALID
)
1715 char buf
[2], *bufPtr
;
1716 wchar_t wbuf
[2], *wbufPtr
;
1724 outsz
= SIZEOF_WCHAR_T
* 2;
1729 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1730 (char**)&wbufPtr
, &outsz
);
1732 if (ICONV_FAILED(res
, insz
))
1734 wxLogLastError(wxT("iconv"));
1735 wxLogError(_("Conversion to charset '%s' doesn't work."),
1738 else // ok, can convert to this encoding, remember it
1740 ms_wcCharsetName
= nameCS
;
1741 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1745 else // use charset not requiring byte swapping
1747 ms_wcCharsetName
= nameXE
;
1751 wxLogTrace(TRACE_STRCONV
,
1752 wxT("iconv wchar_t charset is \"%s\"%s"),
1753 ms_wcCharsetName
.empty() ? wxString("<none>")
1755 ms_wcNeedsSwap
? _T(" (needs swap)")
1758 else // we already have ms_wcCharsetName
1760 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
1763 if ( ms_wcCharsetName
.empty() )
1765 w2m
= ICONV_T_INVALID
;
1769 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
1770 if ( w2m
== ICONV_T_INVALID
)
1772 wxLogTrace(TRACE_STRCONV
,
1773 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1774 ms_wcCharsetName
.c_str(), name
);
1779 wxMBConv_iconv::~wxMBConv_iconv()
1781 if ( m2w
!= ICONV_T_INVALID
)
1783 if ( w2m
!= ICONV_T_INVALID
)
1787 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1789 // find the string length: notice that must be done differently for
1790 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1792 const size_t nulLen
= GetMBNulLen();
1796 return wxCONV_FAILED
;
1799 inbuf
= strlen(psz
); // arguably more optimized than our version
1804 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1805 // they also have to start at character boundary and not span two
1806 // adjacent characters
1808 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1815 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1816 // Unfortunately there are a couple of global wxCSConv objects such as
1817 // wxConvLocal that are used all over wx code, so we have to make sure
1818 // the handle is used by at most one thread at the time. Otherwise
1819 // only a few wx classes would be safe to use from non-main threads
1820 // as MB<->WC conversion would fail "randomly".
1821 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1822 #endif // wxUSE_THREADS
1824 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1826 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1827 wchar_t *bufPtr
= buf
;
1828 const char *pszPtr
= psz
;
1832 // have destination buffer, convert there
1834 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1835 (char**)&bufPtr
, &outbuf
);
1836 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1840 // convert to native endianness
1841 for ( unsigned i
= 0; i
< res
; i
++ )
1842 buf
[n
] = WC_BSWAP(buf
[i
]);
1845 // NUL-terminate the string if there is any space left
1851 // no destination buffer... convert using temp buffer
1852 // to calculate destination buffer requirement
1859 outbuf
= 8 * SIZEOF_WCHAR_T
;
1862 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1863 (char**)&bufPtr
, &outbuf
);
1865 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
1867 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1870 if (ICONV_FAILED(cres
, inbuf
))
1872 //VS: it is ok if iconv fails, hence trace only
1873 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1874 return wxCONV_FAILED
;
1880 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1883 // NB: explained in MB2WC
1884 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1887 size_t inlen
= wxWcslen(psz
);
1888 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1892 wchar_t *tmpbuf
= 0;
1896 // need to copy to temp buffer to switch endianness
1897 // (doing WC_BSWAP twice on the original buffer won't help, as it
1898 // could be in read-only memory, or be accessed in some other thread)
1899 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1900 for ( size_t i
= 0; i
< inlen
; i
++ )
1901 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1903 tmpbuf
[inlen
] = L
'\0';
1909 // have destination buffer, convert there
1910 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1914 // NB: iconv was given only wcslen(psz) characters on input, and so
1915 // it couldn't convert the trailing zero. Let's do it ourselves
1916 // if there's some room left for it in the output buffer.
1922 // no destination buffer: convert using temp buffer
1923 // to calculate destination buffer requirement
1931 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1935 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1943 if (ICONV_FAILED(cres
, inbuf
))
1945 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1946 return wxCONV_FAILED
;
1952 size_t wxMBConv_iconv::GetMBNulLen() const
1954 if ( m_minMBCharWidth
== 0 )
1956 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1959 // NB: explained in MB2WC
1960 wxMutexLocker
lock(self
->m_iconvMutex
);
1963 const wchar_t *wnul
= L
"";
1964 char buf
[8]; // should be enough for NUL in any encoding
1965 size_t inLen
= sizeof(wchar_t),
1966 outLen
= WXSIZEOF(buf
);
1967 char *inBuff
= (char *)wnul
;
1968 char *outBuff
= buf
;
1969 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
1971 self
->m_minMBCharWidth
= (size_t)-1;
1975 self
->m_minMBCharWidth
= outBuff
- buf
;
1979 return m_minMBCharWidth
;
1982 #if wxUSE_UNICODE_UTF8
1983 bool wxMBConv_iconv::IsUTF8() const
1985 return wxStricmp(m_name
, "UTF-8") == 0 ||
1986 wxStricmp(m_name
, "UTF8") == 0;
1990 #endif // HAVE_ICONV
1993 // ============================================================================
1994 // Win32 conversion classes
1995 // ============================================================================
1997 #ifdef wxHAVE_WIN32_MB2WC
2001 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2002 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2005 class wxMBConv_win32
: public wxMBConv
2010 m_CodePage
= CP_ACP
;
2011 m_minMBCharWidth
= 0;
2014 wxMBConv_win32(const wxMBConv_win32
& conv
)
2017 m_CodePage
= conv
.m_CodePage
;
2018 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2022 wxMBConv_win32(const char* name
)
2024 m_CodePage
= wxCharsetToCodepage(name
);
2025 m_minMBCharWidth
= 0;
2028 wxMBConv_win32(wxFontEncoding encoding
)
2030 m_CodePage
= wxEncodingToCodepage(encoding
);
2031 m_minMBCharWidth
= 0;
2033 #endif // wxUSE_FONTMAP
2035 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2037 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2038 // the behaviour is not compatible with the Unix version (using iconv)
2039 // and break the library itself, e.g. wxTextInputStream::NextChar()
2040 // wouldn't work if reading an incomplete MB char didn't result in an
2043 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2044 // Win XP or newer and it is not supported for UTF-[78] so we always
2045 // use our own conversions in this case. See
2046 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2047 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2048 if ( m_CodePage
== CP_UTF8
)
2050 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2053 if ( m_CodePage
== CP_UTF7
)
2055 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2059 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2060 IsAtLeastWin2kSP4() )
2062 flags
= MB_ERR_INVALID_CHARS
;
2065 const size_t len
= ::MultiByteToWideChar
2067 m_CodePage
, // code page
2068 flags
, // flags: fall on error
2069 psz
, // input string
2070 -1, // its length (NUL-terminated)
2071 buf
, // output string
2072 buf
? n
: 0 // size of output buffer
2076 // function totally failed
2077 return wxCONV_FAILED
;
2080 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2081 // check if we succeeded, by doing a double trip:
2082 if ( !flags
&& buf
)
2084 const size_t mbLen
= strlen(psz
);
2085 wxCharBuffer
mbBuf(mbLen
);
2086 if ( ::WideCharToMultiByte
2093 mbLen
+ 1, // size in bytes, not length
2097 strcmp(mbBuf
, psz
) != 0 )
2099 // we didn't obtain the same thing we started from, hence
2100 // the conversion was lossy and we consider that it failed
2101 return wxCONV_FAILED
;
2105 // note that it returns count of written chars for buf != NULL and size
2106 // of the needed buffer for buf == NULL so in either case the length of
2107 // the string (which never includes the terminating NUL) is one less
2111 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2114 we have a problem here: by default, WideCharToMultiByte() may
2115 replace characters unrepresentable in the target code page with bad
2116 quality approximations such as turning "1/2" symbol (U+00BD) into
2117 "1" for the code pages which don't have it and we, obviously, want
2118 to avoid this at any price
2120 the trouble is that this function does it _silently_, i.e. it won't
2121 even tell us whether it did or not... Win98/2000 and higher provide
2122 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2123 we have to resort to a round trip, i.e. check that converting back
2124 results in the same string -- this is, of course, expensive but
2125 otherwise we simply can't be sure to not garble the data.
2128 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2129 // it doesn't work with CJK encodings (which we test for rather roughly
2130 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2132 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2135 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2137 // it's our lucky day
2138 flags
= WC_NO_BEST_FIT_CHARS
;
2139 pUsedDef
= &usedDef
;
2141 else // old system or unsupported encoding
2147 const size_t len
= ::WideCharToMultiByte
2149 m_CodePage
, // code page
2150 flags
, // either none or no best fit
2151 pwz
, // input string
2152 -1, // it is (wide) NUL-terminated
2153 buf
, // output buffer
2154 buf
? n
: 0, // and its size
2155 NULL
, // default "replacement" char
2156 pUsedDef
// [out] was it used?
2161 // function totally failed
2162 return wxCONV_FAILED
;
2165 // if we were really converting, check if we succeeded
2170 // check if the conversion failed, i.e. if any replacements
2173 return wxCONV_FAILED
;
2175 else // we must resort to double tripping...
2177 wxWCharBuffer
wcBuf(n
);
2178 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2179 wcscmp(wcBuf
, pwz
) != 0 )
2181 // we didn't obtain the same thing we started from, hence
2182 // the conversion was lossy and we consider that it failed
2183 return wxCONV_FAILED
;
2188 // see the comment above for the reason of "len - 1"
2192 virtual size_t GetMBNulLen() const
2194 if ( m_minMBCharWidth
== 0 )
2196 int len
= ::WideCharToMultiByte
2198 m_CodePage
, // code page
2200 L
"", // input string
2201 1, // translate just the NUL
2202 NULL
, // output buffer
2204 NULL
, // no replacement char
2205 NULL
// [out] don't care if it was used
2208 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2212 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2213 self
->m_minMBCharWidth
= (size_t)-1;
2217 self
->m_minMBCharWidth
= (size_t)-1;
2223 self
->m_minMBCharWidth
= len
;
2228 return m_minMBCharWidth
;
2231 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2233 bool IsOk() const { return m_CodePage
!= -1; }
2236 static bool CanUseNoBestFit()
2238 static int s_isWin98Or2k
= -1;
2240 if ( s_isWin98Or2k
== -1 )
2243 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2245 case wxOS_WINDOWS_9X
:
2246 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2249 case wxOS_WINDOWS_NT
:
2250 s_isWin98Or2k
= verMaj
>= 5;
2254 // unknown: be conservative by default
2259 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2262 return s_isWin98Or2k
== 1;
2265 static bool IsAtLeastWin2kSP4()
2270 static int s_isAtLeastWin2kSP4
= -1;
2272 if ( s_isAtLeastWin2kSP4
== -1 )
2274 OSVERSIONINFOEX ver
;
2276 memset(&ver
, 0, sizeof(ver
));
2277 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2278 GetVersionEx((OSVERSIONINFO
*)&ver
);
2280 s_isAtLeastWin2kSP4
=
2281 ((ver
.dwMajorVersion
> 5) || // Vista+
2282 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2283 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2284 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2288 return s_isAtLeastWin2kSP4
== 1;
2293 // the code page we're working with
2296 // cached result of GetMBNulLen(), set to 0 initially meaning
2298 size_t m_minMBCharWidth
;
2301 #endif // wxHAVE_WIN32_MB2WC
2304 // ============================================================================
2305 // Mac conversion classes
2306 // ============================================================================
2308 /* Although we are in the base library we currently have this wxMac
2309 * conditional. This is not generally good but fortunately does not affect
2310 * the ABI of the base library, only what encodings might work.
2311 * It does mean that a wxBase built as part of wxMac has slightly more support
2312 * than one built for wxCocoa or even wxGtk.
2314 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2316 class wxMBConv_mac
: public wxMBConv
2321 Init(CFStringGetSystemEncoding()) ;
2324 wxMBConv_mac(const wxMBConv_mac
& conv
)
2326 Init(conv
.m_char_encoding
);
2330 wxMBConv_mac(const char* name
)
2332 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) );
2336 wxMBConv_mac(wxFontEncoding encoding
)
2338 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2341 virtual ~wxMBConv_mac()
2343 OSStatus status
= noErr
;
2344 if (m_MB2WC_converter
)
2345 status
= TECDisposeConverter(m_MB2WC_converter
);
2346 if (m_WC2MB_converter
)
2347 status
= TECDisposeConverter(m_WC2MB_converter
);
2350 void Init( TextEncodingBase encoding
,TextEncodingVariant encodingVariant
= kTextEncodingDefaultVariant
,
2351 TextEncodingFormat encodingFormat
= kTextEncodingDefaultFormat
)
2353 m_MB2WC_converter
= NULL
;
2354 m_WC2MB_converter
= NULL
;
2355 m_char_encoding
= CreateTextEncoding(encoding
, encodingVariant
, encodingFormat
) ;
2356 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
, 0, kUnicode16BitFormat
) ;
2359 virtual void CreateIfNeeded() const
2361 if ( m_MB2WC_converter
== NULL
&& m_WC2MB_converter
== NULL
)
2363 OSStatus status
= noErr
;
2364 status
= TECCreateConverter(&m_MB2WC_converter
,
2366 m_unicode_encoding
);
2367 wxASSERT_MSG( status
== noErr
, _("Unable to create TextEncodingConverter")) ;
2368 status
= TECCreateConverter(&m_WC2MB_converter
,
2371 wxASSERT_MSG( status
== noErr
, _("Unable to create TextEncodingConverter")) ;
2375 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2378 OSStatus status
= noErr
;
2379 ByteCount byteOutLen
;
2380 ByteCount byteInLen
= strlen(psz
) + 1;
2381 wchar_t *tbuf
= NULL
;
2382 UniChar
* ubuf
= NULL
;
2387 // Apple specs say at least 32
2388 n
= wxMax( 32, byteInLen
) ;
2389 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2392 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2394 #if SIZEOF_WCHAR_T == 4
2395 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2397 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2400 status
= TECConvertText(
2401 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2402 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2404 #if SIZEOF_WCHAR_T == 4
2405 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2406 // is not properly terminated we get random characters at the end
2407 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2408 wxMBConvUTF16 converter
;
2409 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
2412 res
= byteOutLen
/ sizeof( UniChar
) ;
2418 if ( buf
&& res
< n
)
2424 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2427 OSStatus status
= noErr
;
2428 ByteCount byteOutLen
;
2429 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2435 // Apple specs say at least 32
2436 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2437 tbuf
= (char*) malloc( n
) ;
2440 ByteCount byteBufferLen
= n
;
2441 UniChar
* ubuf
= NULL
;
2443 #if SIZEOF_WCHAR_T == 4
2444 wxMBConvUTF16 converter
;
2445 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2446 byteInLen
= unicharlen
;
2447 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2448 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2450 ubuf
= (UniChar
*) psz
;
2453 status
= TECConvertText(
2454 m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2455 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2457 #if SIZEOF_WCHAR_T == 4
2464 size_t res
= byteOutLen
;
2465 if ( buf
&& res
< n
)
2469 //we need to double-trip to verify it didn't insert any ? in place
2470 //of bogus characters
2471 wxWCharBuffer
wcBuf(n
);
2472 size_t pszlen
= wxWcslen(psz
);
2473 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2474 wxWcslen(wcBuf
) != pszlen
||
2475 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2477 // we didn't obtain the same thing we started from, hence
2478 // the conversion was lossy and we consider that it failed
2479 return wxCONV_FAILED
;
2486 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
2491 return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
;
2495 mutable TECObjectRef m_MB2WC_converter
;
2496 mutable TECObjectRef m_WC2MB_converter
;
2498 TextEncodingBase m_char_encoding
;
2499 TextEncodingBase m_unicode_encoding
;
2502 // MB is decomposed (D) normalized UTF8
2504 class wxMBConv_macUTF8D
: public wxMBConv_mac
2509 Init( kTextEncodingUnicodeDefault
, kUnicodeNoSubset
, kUnicodeUTF8Format
) ;
2514 virtual ~wxMBConv_macUTF8D()
2517 DisposeUnicodeToTextInfo(&m_uni
);
2518 if (m_uniBack
!=NULL
)
2519 DisposeUnicodeToTextInfo(&m_uniBack
);
2522 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2525 OSStatus status
= noErr
;
2526 ByteCount byteOutLen
;
2527 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2533 // Apple specs say at least 32
2534 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2535 tbuf
= (char*) malloc( n
) ;
2538 ByteCount byteBufferLen
= n
;
2539 UniChar
* ubuf
= NULL
;
2541 #if SIZEOF_WCHAR_T == 4
2542 wxMBConvUTF16 converter
;
2543 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2544 byteInLen
= unicharlen
;
2545 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2546 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2548 ubuf
= (UniChar
*) psz
;
2551 // ubuf is a non-decomposed UniChar buffer
2553 ByteCount dcubuflen
= byteInLen
* 2 + 2 ;
2554 ByteCount dcubufread
, dcubufwritten
;
2555 UniChar
*dcubuf
= (UniChar
*) malloc( dcubuflen
) ;
2557 ConvertFromUnicodeToText( m_uni
, byteInLen
, ubuf
,
2558 kUnicodeDefaultDirectionMask
, 0, NULL
, NULL
, NULL
, dcubuflen
, &dcubufread
, &dcubufwritten
, dcubuf
) ;
2560 // we now convert that decomposed buffer into UTF8
2562 status
= TECConvertText(
2563 m_WC2MB_converter
, (ConstTextPtr
) dcubuf
, dcubufwritten
, &dcubufread
,
2564 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2568 #if SIZEOF_WCHAR_T == 4
2575 size_t res
= byteOutLen
;
2576 if ( buf
&& res
< n
)
2579 // don't test for round-trip fidelity yet, we cannot guarantee it yet
2585 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2588 OSStatus status
= noErr
;
2589 ByteCount byteOutLen
;
2590 ByteCount byteInLen
= strlen(psz
) + 1;
2591 wchar_t *tbuf
= NULL
;
2592 UniChar
* ubuf
= NULL
;
2597 // Apple specs say at least 32
2598 n
= wxMax( 32, byteInLen
) ;
2599 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2602 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2604 #if SIZEOF_WCHAR_T == 4
2605 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2607 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2610 ByteCount dcubuflen
= byteBufferLen
* 2 + 2 ;
2611 ByteCount dcubufread
, dcubufwritten
;
2612 UniChar
*dcubuf
= (UniChar
*) malloc( dcubuflen
) ;
2614 status
= TECConvertText(
2615 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2616 (TextPtr
) dcubuf
, dcubuflen
, &byteOutLen
);
2617 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2618 // is not properly terminated we get random characters at the end
2619 dcubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2621 // now from the decomposed UniChar to properly composed uniChar
2622 ConvertFromUnicodeToText( m_uniBack
, byteOutLen
, dcubuf
,
2623 kUnicodeDefaultDirectionMask
, 0, NULL
, NULL
, NULL
, dcubuflen
, &dcubufread
, &dcubufwritten
, ubuf
) ;
2626 byteOutLen
= dcubufwritten
;
2627 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2630 #if SIZEOF_WCHAR_T == 4
2631 wxMBConvUTF16 converter
;
2632 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
2635 res
= byteOutLen
/ sizeof( UniChar
) ;
2641 if ( buf
&& res
< n
)
2647 virtual void CreateIfNeeded() const
2649 wxMBConv_mac::CreateIfNeeded() ;
2650 if ( m_uni
== NULL
)
2652 m_map
.unicodeEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
2653 kUnicodeNoSubset
, kTextEncodingDefaultFormat
);
2654 m_map
.otherEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
2655 kUnicodeCanonicalDecompVariant
, kTextEncodingDefaultFormat
);
2656 m_map
.mappingVersion
= kUnicodeUseLatestMapping
;
2658 OSStatus err
= CreateUnicodeToTextInfo(&m_map
, &m_uni
);
2659 wxASSERT_MSG( err
== noErr
, _(" Couldn't create the UnicodeConverter")) ;
2661 m_map
.unicodeEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
2662 kUnicodeNoSubset
, kTextEncodingDefaultFormat
);
2663 m_map
.otherEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
2664 kUnicodeCanonicalCompVariant
, kTextEncodingDefaultFormat
);
2665 m_map
.mappingVersion
= kUnicodeUseLatestMapping
;
2666 err
= CreateUnicodeToTextInfo(&m_map
, &m_uniBack
);
2667 wxASSERT_MSG( err
== noErr
, _(" Couldn't create the UnicodeConverter")) ;
2671 mutable UnicodeToTextInfo m_uni
;
2672 mutable UnicodeToTextInfo m_uniBack
;
2673 mutable UnicodeMapping m_map
;
2675 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2677 // ============================================================================
2678 // wxEncodingConverter based conversion classes
2679 // ============================================================================
2683 class wxMBConv_wxwin
: public wxMBConv
2688 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2689 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2693 // temporarily just use wxEncodingConverter stuff,
2694 // so that it works while a better implementation is built
2695 wxMBConv_wxwin(const char* name
)
2698 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2700 m_enc
= wxFONTENCODING_SYSTEM
;
2705 wxMBConv_wxwin(wxFontEncoding enc
)
2712 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2714 size_t inbuf
= strlen(psz
);
2717 if (!m2w
.Convert(psz
, buf
))
2718 return wxCONV_FAILED
;
2723 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2725 const size_t inbuf
= wxWcslen(psz
);
2728 if (!w2m
.Convert(psz
, buf
))
2729 return wxCONV_FAILED
;
2735 virtual size_t GetMBNulLen() const
2739 case wxFONTENCODING_UTF16BE
:
2740 case wxFONTENCODING_UTF16LE
:
2743 case wxFONTENCODING_UTF32BE
:
2744 case wxFONTENCODING_UTF32LE
:
2752 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2754 bool IsOk() const { return m_ok
; }
2757 wxFontEncoding m_enc
;
2758 wxEncodingConverter m2w
, w2m
;
2761 // were we initialized successfully?
2764 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2767 // make the constructors available for unit testing
2768 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2770 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2771 if ( !result
->IsOk() )
2780 #endif // wxUSE_FONTMAP
2782 // ============================================================================
2783 // wxCSConv implementation
2784 // ============================================================================
2786 void wxCSConv::Init()
2793 wxCSConv::wxCSConv(const wxString
& charset
)
2797 if ( !charset
.empty() )
2799 SetName(charset
.ToAscii());
2803 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2805 m_encoding
= wxFONTENCODING_SYSTEM
;
2809 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2811 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2813 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2815 encoding
= wxFONTENCODING_SYSTEM
;
2820 m_encoding
= encoding
;
2823 wxCSConv::~wxCSConv()
2828 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2833 SetName(conv
.m_name
);
2834 m_encoding
= conv
.m_encoding
;
2837 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2841 SetName(conv
.m_name
);
2842 m_encoding
= conv
.m_encoding
;
2847 void wxCSConv::Clear()
2856 void wxCSConv::SetName(const char *charset
)
2860 m_name
= strdup(charset
);
2867 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2868 wxEncodingNameCache
);
2870 static wxEncodingNameCache gs_nameCache
;
2873 wxMBConv
*wxCSConv::DoCreate() const
2876 wxLogTrace(TRACE_STRCONV
,
2877 wxT("creating conversion for %s"),
2879 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2880 #endif // wxUSE_FONTMAP
2882 // check for the special case of ASCII or ISO8859-1 charset: as we have
2883 // special knowledge of it anyhow, we don't need to create a special
2884 // conversion object
2885 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2886 m_encoding
== wxFONTENCODING_DEFAULT
)
2888 // don't convert at all
2892 // we trust OS to do conversion better than we can so try external
2893 // conversion methods first
2895 // the full order is:
2896 // 1. OS conversion (iconv() under Unix or Win32 API)
2897 // 2. hard coded conversions for UTF
2898 // 3. wxEncodingConverter as fall back
2904 #endif // !wxUSE_FONTMAP
2907 wxFontEncoding
encoding(m_encoding
);
2912 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2920 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2921 #endif // wxUSE_FONTMAP
2925 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2926 if ( it
!= gs_nameCache
.end() )
2928 if ( it
->second
.empty() )
2931 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2938 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2939 // CS : in case this does not return valid names (eg for MacRoman)
2940 // encoding got a 'failure' entry in the cache all the same,
2941 // although it just has to be created using a different method, so
2942 // only store failed iconv creation attempts (or perhaps we
2943 // shoulnd't do this at all ?)
2944 if ( names
[0] != NULL
)
2946 for ( ; *names
; ++names
)
2948 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2949 // will need changes that will obsolete this
2950 wxString
name(*names
);
2951 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2954 gs_nameCache
[encoding
] = *names
;
2961 gs_nameCache
[encoding
] = _T(""); // cache the failure
2964 #endif // wxUSE_FONTMAP
2966 #endif // HAVE_ICONV
2968 #ifdef wxHAVE_WIN32_MB2WC
2971 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2972 : new wxMBConv_win32(m_encoding
);
2981 #endif // wxHAVE_WIN32_MB2WC
2983 #if defined(__WXMAC__)
2985 // leave UTF16 and UTF32 to the built-ins of wx
2986 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2987 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2990 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2991 : new wxMBConv_mac(m_encoding
);
2993 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
3005 // leave UTF16 and UTF32 to the built-ins of wx
3006 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3007 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3010 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3011 : new wxMBConv_cf(m_encoding
);
3013 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3022 #endif // __DARWIN__
3025 wxFontEncoding enc
= m_encoding
;
3027 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3029 // use "false" to suppress interactive dialogs -- we can be called from
3030 // anywhere and popping up a dialog from here is the last thing we want to
3032 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3034 #endif // wxUSE_FONTMAP
3038 case wxFONTENCODING_UTF7
:
3039 return new wxMBConvUTF7
;
3041 case wxFONTENCODING_UTF8
:
3042 return new wxMBConvUTF8
;
3044 case wxFONTENCODING_UTF16BE
:
3045 return new wxMBConvUTF16BE
;
3047 case wxFONTENCODING_UTF16LE
:
3048 return new wxMBConvUTF16LE
;
3050 case wxFONTENCODING_UTF32BE
:
3051 return new wxMBConvUTF32BE
;
3053 case wxFONTENCODING_UTF32LE
:
3054 return new wxMBConvUTF32LE
;
3057 // nothing to do but put here to suppress gcc warnings
3064 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3065 : new wxMBConv_wxwin(m_encoding
);
3071 #endif // wxUSE_FONTMAP
3073 // NB: This is a hack to prevent deadlock. What could otherwise happen
3074 // in Unicode build: wxConvLocal creation ends up being here
3075 // because of some failure and logs the error. But wxLog will try to
3076 // attach a timestamp, for which it will need wxConvLocal (to convert
3077 // time to char* and then wchar_t*), but that fails, tries to log the
3078 // error, but wxLog has an (already locked) critical section that
3079 // guards the static buffer.
3080 static bool alreadyLoggingError
= false;
3081 if (!alreadyLoggingError
)
3083 alreadyLoggingError
= true;
3084 wxLogError(_("Cannot convert from the charset '%s'!"),
3088 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3089 #else // !wxUSE_FONTMAP
3090 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3091 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3094 alreadyLoggingError
= false;
3100 void wxCSConv::CreateConvIfNeeded() const
3104 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3106 // if we don't have neither the name nor the encoding, use the default
3107 // encoding for this system
3108 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3111 self
->m_encoding
= wxLocale::GetSystemEncoding();
3113 // fallback to some reasonable default:
3114 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3115 #endif // wxUSE_INTL
3118 self
->m_convReal
= DoCreate();
3119 self
->m_deferred
= false;
3123 bool wxCSConv::IsOk() const
3125 CreateConvIfNeeded();
3127 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3128 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3129 return true; // always ok as we do it ourselves
3131 // m_convReal->IsOk() is called at its own creation, so we know it must
3132 // be ok if m_convReal is non-NULL
3133 return m_convReal
!= NULL
;
3136 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3137 const char *src
, size_t srcLen
) const
3139 CreateConvIfNeeded();
3142 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3145 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
3148 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3149 const wchar_t *src
, size_t srcLen
) const
3151 CreateConvIfNeeded();
3154 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3157 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
3160 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3162 CreateConvIfNeeded();
3165 return m_convReal
->MB2WC(buf
, psz
, n
);
3168 size_t len
= strlen(psz
);
3172 for (size_t c
= 0; c
<= len
; c
++)
3173 buf
[c
] = (unsigned char)(psz
[c
]);
3179 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3181 CreateConvIfNeeded();
3184 return m_convReal
->WC2MB(buf
, psz
, n
);
3187 const size_t len
= wxWcslen(psz
);
3190 for (size_t c
= 0; c
<= len
; c
++)
3193 return wxCONV_FAILED
;
3195 buf
[c
] = (char)psz
[c
];
3200 for (size_t c
= 0; c
<= len
; c
++)
3203 return wxCONV_FAILED
;
3210 size_t wxCSConv::GetMBNulLen() const
3212 CreateConvIfNeeded();
3216 return m_convReal
->GetMBNulLen();
3219 // otherwise, we are ISO-8859-1
3223 #if wxUSE_UNICODE_UTF8
3224 bool wxCSConv::IsUTF8() const
3226 CreateConvIfNeeded();
3230 return m_convReal
->IsUTF8();
3233 // otherwise, we are ISO-8859-1
3241 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3244 return wxWCharBuffer();
3246 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3248 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3250 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
3255 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3258 return wxCharBuffer();
3260 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3262 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3267 #endif // wxUSE_UNICODE
3269 // ----------------------------------------------------------------------------
3271 // ----------------------------------------------------------------------------
// NB: The reason why we create converter objects in this convoluted way,
3274 // using a factory function instead of global variable, is that they
3275 // may be used at static initialization time (some of them are used by
3276 // wxString ctors and there may be a global wxString object). In other
3277 // words, possibly _before_ the converter global object would be
3284 #undef wxConvISO8859_1
// Define everything needed for the global converter "conv":
//  - the exported conv##Ptr variable, initially NULL;
//  - the exported accessor wxGet_##conv##Ptr() returning the address of a
//    function-local static object of class implClass constructed with
//    ctorArgs;
//  - a file-local variable initialized by calling this accessor.
#define WX_DEFINE_GLOBAL_CONV2(baseClass, implClass, conv, ctorArgs)    \
    WXDLLIMPEXP_DATA_BASE(baseClass*) conv##Ptr = NULL;                 \
    WXDLLIMPEXP_BASE baseClass* wxGet_##conv##Ptr()                     \
    {                                                                   \
        static implClass conv##Obj ctorArgs;                            \
        return &conv##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static baseClass* gs_##conv##instance = wxGet_##conv##Ptr()

// Same as above for the common case when the object is of the same class as
// the pointer to it.
#define WX_DEFINE_GLOBAL_CONV(convClass, conv, ctorArgs) \
    WX_DEFINE_GLOBAL_CONV2(convClass, convClass, conv, ctorArgs)
3302 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3303 #elif defined(__WXMAC__) && !defined(__MACH__)
3304 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_mac
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3306 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3309 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8
, wxConvUTF8
, wxEMPTY_PARAMETER_VALUE
);
3310 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, wxEMPTY_PARAMETER_VALUE
);
3312 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3313 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
3315 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3316 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3318 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3319 static wxMBConv_macUTF8D wxConvMacUTF8DObj
;
3321 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3323 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3326 wxGet_wxConvUTF8Ptr();
3329 wxGet_wxConvLibcPtr();
3330 #endif // __WXOSX__/!__WXOSX__
3332 #else // !wxUSE_WCHAR_T
3334 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3335 // stand-ins in absence of wchar_t
3336 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3341 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T