1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
53 #include "wx/thread.h"
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
60 #include <CoreFoundation/CFString.h>
61 #include <CoreFoundation/CFStringEncodingExt.h>
63 #include "wx/mac/corefoundation/cfref.h"
64 #endif //def __DARWIN__
68 #include <ATSUnicode.h>
69 #include <TextCommon.h>
70 #include <TextEncodingConverter.h>
73 // includes Mac headers
74 #include "wx/mac/private.h"
78 #define TRACE_STRCONV _T("strconv")
80 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
82 #if SIZEOF_WCHAR_T == 2
87 // ============================================================================
89 // ============================================================================
91 // helper function of cMB2WC(): check if n bytes at this location are all NUL
92 static bool NotAllNULs(const char *p
, size_t n
)
94 while ( n
&& *p
++ == '\0' )
100 // ----------------------------------------------------------------------------
101 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
102 // ----------------------------------------------------------------------------
104 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
109 *output
= (wxUint16
) input
;
113 else if (input
>= 0x110000)
115 return wxCONV_FAILED
;
121 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
122 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
129 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
131 if ((*input
< 0xd800) || (*input
> 0xdfff))
136 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
139 return wxCONV_FAILED
;
143 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
149 typedef wchar_t wxDecodeSurrogate_t
;
151 typedef wxUint16 wxDecodeSurrogate_t
;
152 #endif // WC_UTF16/!WC_UTF16
154 // returns the next UTF-32 character from the wchar_t buffer and advances the
155 // pointer to the character after this one
157 // if an invalid character is found, *pSrc is set to NULL, the caller must
159 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
163 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
164 if ( n
== wxCONV_FAILED
)
172 // ----------------------------------------------------------------------------
174 // ----------------------------------------------------------------------------
177 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
178 const char *src
, size_t srcLen
) const
180 // although new conversion classes are supposed to implement this function
181 // directly, the existing ones only implement the old MB2WC() and so, to
182 // avoid having to rewrite all conversion classes at once, we provide a
183 // default (but not efficient) implementation of this one in terms of the
184 // old function by copying the input to ensure that it's NUL-terminated and
185 // then using MB2WC() to convert it
187 // the number of chars [which would be] written to dst [if it were not NULL]
188 size_t dstWritten
= 0;
190 // the number of NULs terminating this string
191 size_t nulLen
= 0; // not really needed, but just to avoid warnings
193 // if we were not given the input size we just have to assume that the
194 // string is properly terminated as we have no way of knowing how long it
195 // is anyhow, but if we do have the size check whether there are enough
199 if ( srcLen
!= wxNO_LEN
)
201 // we need to know how to find the end of this string
202 nulLen
= GetMBNulLen();
203 if ( nulLen
== wxCONV_FAILED
)
204 return wxCONV_FAILED
;
206 // if there are enough NULs we can avoid the copy
207 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
209 // make a copy in order to properly NUL-terminate the string
210 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
211 char * const p
= bufTmp
.data();
212 memcpy(p
, src
, srcLen
);
213 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
219 srcEnd
= src
+ srcLen
;
221 else // quit after the first loop iteration
228 // try to convert the current chunk
229 size_t lenChunk
= MB2WC(NULL
, src
, 0);
230 if ( lenChunk
== wxCONV_FAILED
)
231 return wxCONV_FAILED
;
233 lenChunk
++; // for the L'\0' at the end of this chunk
235 dstWritten
+= lenChunk
;
239 // nothing left in the input string, conversion succeeded
245 if ( dstWritten
> dstLen
)
246 return wxCONV_FAILED
;
248 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
249 return wxCONV_FAILED
;
256 // we convert just one chunk in this case as this is the entire
261 // advance the input pointer past the end of this chunk
262 while ( NotAllNULs(src
, nulLen
) )
264 // notice that we must skip over multiple bytes here as we suppose
265 // that if NUL takes 2 or 4 bytes, then all the other characters do
266 // too and so if advanced by a single byte we might erroneously
267 // detect sequences of NUL bytes in the middle of the input
271 src
+= nulLen
; // skipping over its terminator as well
273 // note that ">=" (and not just "==") is needed here as the terminator
274 // we skipped just above could be inside or just after the buffer
275 // delimited by srcEnd
284 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
285 const wchar_t *src
, size_t srcLen
) const
287 // the number of chars [which would be] written to dst [if it were not NULL]
288 size_t dstWritten
= 0;
290 // make a copy of the input string unless it is already properly
293 // if we don't know its length we have no choice but to assume that it is,
294 // indeed, properly terminated
295 wxWCharBuffer bufTmp
;
296 if ( srcLen
== wxNO_LEN
)
298 srcLen
= wxWcslen(src
) + 1;
300 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
302 // make a copy in order to properly NUL-terminate the string
303 bufTmp
= wxWCharBuffer(srcLen
);
304 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
308 const size_t lenNul
= GetMBNulLen();
309 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
311 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
313 // try to convert the current chunk
314 size_t lenChunk
= WC2MB(NULL
, src
, 0);
316 if ( lenChunk
== wxCONV_FAILED
)
317 return wxCONV_FAILED
;
320 dstWritten
+= lenChunk
;
324 if ( dstWritten
> dstLen
)
325 return wxCONV_FAILED
;
327 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
328 return wxCONV_FAILED
;
337 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
339 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
340 if ( rc
!= wxCONV_FAILED
)
342 // ToWChar() returns the buffer length, i.e. including the trailing
343 // NUL, while this method doesn't take it into account
350 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
352 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
353 if ( rc
!= wxCONV_FAILED
)
361 wxMBConv::~wxMBConv()
363 // nothing to do here (necessary for Darwin linking probably)
366 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
370 // calculate the length of the buffer needed first
371 const size_t nLen
= MB2WC(NULL
, psz
, 0);
372 if ( nLen
!= wxCONV_FAILED
)
374 // now do the actual conversion
375 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
377 // +1 for the trailing NUL
378 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
383 return wxWCharBuffer();
386 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
390 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
391 if ( nLen
!= wxCONV_FAILED
)
393 // extra space for trailing NUL(s)
394 static const size_t extraLen
= GetMaxMBNulLen();
396 wxCharBuffer
buf(nLen
+ extraLen
- 1);
397 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
402 return wxCharBuffer();
406 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
408 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
409 if ( dstLen
!= wxCONV_FAILED
)
411 wxWCharBuffer
wbuf(dstLen
- 1);
412 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
417 if ( wbuf
[dstLen
- 1] == L
'\0' )
428 return wxWCharBuffer();
432 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
434 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
435 if ( dstLen
!= wxCONV_FAILED
)
437 // special case of empty input: can't allocate 0 size buffer below as
438 // wxCharBuffer insists on NUL-terminating it
439 wxCharBuffer
buf(dstLen
? dstLen
- 1 : 1);
440 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
446 const size_t nulLen
= GetMBNulLen();
447 if ( dstLen
>= nulLen
&&
448 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
450 // in this case the output is NUL-terminated and we're not
451 // supposed to count NUL
463 return wxCharBuffer();
466 // ----------------------------------------------------------------------------
468 // ----------------------------------------------------------------------------
470 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
472 return wxMB2WC(buf
, psz
, n
);
475 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
477 return wxWC2MB(buf
, psz
, n
);
480 // ----------------------------------------------------------------------------
481 // wxConvBrokenFileNames
482 // ----------------------------------------------------------------------------
486 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
488 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
489 wxStricmp(charset
, _T("UTF8")) == 0 )
490 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
492 m_conv
= new wxCSConv(charset
);
497 // ----------------------------------------------------------------------------
499 // ----------------------------------------------------------------------------
501 // Implementation (C) 2004 Fredrik Roubert
504 // BASE64 decoding table
506 static const unsigned char utf7unb64
[] =
508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
514 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
515 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
517 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
518 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
519 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
521 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
522 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
523 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
533 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
535 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
536 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
539 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
542 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
546 while ( *psz
&& (!buf
|| (len
< n
)) )
548 unsigned char cc
= *psz
++;
556 else if (*psz
== '-')
564 else // start of BASE64 encoded string
568 for ( ok
= lsb
= false, d
= 0, l
= 0;
569 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
574 for (l
+= 6; l
>= 8; lsb
= !lsb
)
576 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
586 *buf
= (wchar_t)(c
<< 8);
595 // in valid UTF7 we should have valid characters after '+'
596 return wxCONV_FAILED
;
604 if ( buf
&& (len
< n
) )
611 // BASE64 encoding table
613 static const unsigned char utf7enb64
[] =
615 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
616 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
617 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
618 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
619 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
620 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
621 'w', 'x', 'y', 'z', '0', '1', '2', '3',
622 '4', '5', '6', '7', '8', '9', '+', '/'
626 // UTF-7 encoding table
628 // 0 - Set D (directly encoded characters)
629 // 1 - Set O (optional direct characters)
630 // 2 - whitespace characters (optional)
631 // 3 - special characters
633 static const unsigned char utf7encode
[128] =
635 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
636 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
637 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
639 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
641 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
642 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
645 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
649 while (*psz
&& ((!buf
) || (len
< n
)))
652 if (cc
< 0x80 && utf7encode
[cc
] < 1)
661 else if (((wxUint32
)cc
) > 0xffff)
663 // no surrogate pair generation (yet?)
664 return wxCONV_FAILED
;
675 // BASE64 encode string
676 unsigned int lsb
, d
, l
;
677 for (d
= 0, l
= 0; /*nothing*/; psz
++)
679 for (lsb
= 0; lsb
< 2; lsb
++)
682 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
684 for (l
+= 8; l
>= 6; )
688 *buf
++ = utf7enb64
[(d
>> l
) % 64];
694 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
701 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
713 if (buf
&& (len
< n
))
719 // ----------------------------------------------------------------------------
721 // ----------------------------------------------------------------------------
723 static wxUint32 utf8_max
[]=
724 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
726 // boundaries of the private use area we use to (temporarily) remap
727 // characters which are invalid in a UTF-8 encoded string
728 const wxUint32 wxUnicodePUA
= 0x100000;
729 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
731 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
735 while (*psz
&& ((!buf
) || (len
< n
)))
737 const char *opsz
= psz
;
738 bool invalid
= false;
739 unsigned char cc
= *psz
++, fc
= cc
;
741 for (cnt
= 0; fc
& 0x80; cnt
++)
751 // escape the escape character for octal escapes
752 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
753 && cc
== '\\' && (!buf
|| len
< n
))
765 // invalid UTF-8 sequence
770 unsigned ocnt
= cnt
- 1;
771 wxUint32 res
= cc
& (0x3f >> cnt
);
775 if ((cc
& 0xC0) != 0x80)
777 // invalid UTF-8 sequence
783 res
= (res
<< 6) | (cc
& 0x3f);
786 if (invalid
|| res
<= utf8_max
[ocnt
])
788 // illegal UTF-8 encoding
791 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
792 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
794 // if one of our PUA characters turns up externally
795 // it must also be treated as an illegal sequence
796 // (a bit like you have to escape an escape character)
802 // cast is ok because wchar_t == wxUint16 if WC_UTF16
803 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
804 if (pa
== wxCONV_FAILED
)
816 *buf
++ = (wchar_t)res
;
818 #endif // WC_UTF16/!WC_UTF16
824 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
826 while (opsz
< psz
&& (!buf
|| len
< n
))
829 // cast is ok because wchar_t == wxUint16 if WC_UTF16
830 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
831 wxASSERT(pa
!= wxCONV_FAILED
);
838 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
844 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
846 while (opsz
< psz
&& (!buf
|| len
< n
))
848 if ( buf
&& len
+ 3 < n
)
850 unsigned char on
= *opsz
;
852 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
853 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
854 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
861 else // MAP_INVALID_UTF8_NOT
863 return wxCONV_FAILED
;
869 if (buf
&& (len
< n
))
875 static inline bool isoctal(wchar_t wch
)
877 return L
'0' <= wch
&& wch
<= L
'7';
880 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
884 while (*psz
&& ((!buf
) || (len
< n
)))
889 // cast is ok for WC_UTF16
890 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
891 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
893 cc
= (*psz
++) & 0x7fffffff;
896 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
897 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
900 *buf
++ = (char)(cc
- wxUnicodePUA
);
903 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
904 && cc
== L
'\\' && psz
[0] == L
'\\' )
911 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
913 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
917 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
918 (psz
[1] - L
'0') * 010 +
928 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
944 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
946 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
952 if (buf
&& (len
< n
))
958 // ============================================================================
960 // ============================================================================
962 #ifdef WORDS_BIGENDIAN
963 #define wxMBConvUTF16straight wxMBConvUTF16BE
964 #define wxMBConvUTF16swap wxMBConvUTF16LE
966 #define wxMBConvUTF16swap wxMBConvUTF16BE
967 #define wxMBConvUTF16straight wxMBConvUTF16LE
971 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
973 if ( srcLen
== wxNO_LEN
)
975 // count the number of bytes in input, including the trailing NULs
976 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
977 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
980 srcLen
*= BYTES_PER_CHAR
;
982 else // we already have the length
984 // we can only convert an entire number of UTF-16 characters
985 if ( srcLen
% BYTES_PER_CHAR
)
986 return wxCONV_FAILED
;
992 // case when in-memory representation is UTF-16 too
995 // ----------------------------------------------------------------------------
996 // conversions without endianness change
997 // ----------------------------------------------------------------------------
1000 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1001 const char *src
, size_t srcLen
) const
1003 // set up the scene for using memcpy() (which is presumably more efficient
1004 // than copying the bytes one by one)
1005 srcLen
= GetLength(src
, srcLen
);
1006 if ( srcLen
== wxNO_LEN
)
1007 return wxCONV_FAILED
;
1009 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1012 if ( dstLen
< inLen
)
1013 return wxCONV_FAILED
;
1015 memcpy(dst
, src
, srcLen
);
1022 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1023 const wchar_t *src
, size_t srcLen
) const
1025 if ( srcLen
== wxNO_LEN
)
1026 srcLen
= wxWcslen(src
) + 1;
1028 srcLen
*= BYTES_PER_CHAR
;
1032 if ( dstLen
< srcLen
)
1033 return wxCONV_FAILED
;
1035 memcpy(dst
, src
, srcLen
);
1041 // ----------------------------------------------------------------------------
1042 // endian-reversing conversions
1043 // ----------------------------------------------------------------------------
1046 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1047 const char *src
, size_t srcLen
) const
1049 srcLen
= GetLength(src
, srcLen
);
1050 if ( srcLen
== wxNO_LEN
)
1051 return wxCONV_FAILED
;
1053 srcLen
/= BYTES_PER_CHAR
;
1057 if ( dstLen
< srcLen
)
1058 return wxCONV_FAILED
;
1060 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1061 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1063 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1071 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1072 const wchar_t *src
, size_t srcLen
) const
1074 if ( srcLen
== wxNO_LEN
)
1075 srcLen
= wxWcslen(src
) + 1;
1077 srcLen
*= BYTES_PER_CHAR
;
1081 if ( dstLen
< srcLen
)
1082 return wxCONV_FAILED
;
1084 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1085 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1087 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1094 #else // !WC_UTF16: wchar_t is UTF-32
1096 // ----------------------------------------------------------------------------
1097 // conversions without endianness change
1098 // ----------------------------------------------------------------------------
1101 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1102 const char *src
, size_t srcLen
) const
1104 srcLen
= GetLength(src
, srcLen
);
1105 if ( srcLen
== wxNO_LEN
)
1106 return wxCONV_FAILED
;
1108 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1111 // optimization: return maximal space which could be needed for this
1112 // string even if the real size could be smaller if the buffer contains
1118 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1119 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1121 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1123 return wxCONV_FAILED
;
1125 if ( ++outLen
> dstLen
)
1126 return wxCONV_FAILED
;
1136 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1137 const wchar_t *src
, size_t srcLen
) const
1139 if ( srcLen
== wxNO_LEN
)
1140 srcLen
= wxWcslen(src
) + 1;
1143 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1144 for ( size_t n
= 0; n
< srcLen
; n
++ )
1147 const size_t numChars
= encode_utf16(*src
++, cc
);
1148 if ( numChars
== wxCONV_FAILED
)
1149 return wxCONV_FAILED
;
1151 outLen
+= numChars
* BYTES_PER_CHAR
;
1154 if ( outLen
> dstLen
)
1155 return wxCONV_FAILED
;
1158 if ( numChars
== 2 )
1160 // second character of a surrogate
1169 // ----------------------------------------------------------------------------
1170 // endian-reversing conversions
1171 // ----------------------------------------------------------------------------
1174 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1175 const char *src
, size_t srcLen
) const
1177 srcLen
= GetLength(src
, srcLen
);
1178 if ( srcLen
== wxNO_LEN
)
1179 return wxCONV_FAILED
;
1181 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1184 // optimization: return maximal space which could be needed for this
1185 // string even if the real size could be smaller if the buffer contains
1191 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1192 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1197 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1199 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1201 const size_t numChars
= decode_utf16(tmp
, ch
);
1202 if ( numChars
== wxCONV_FAILED
)
1203 return wxCONV_FAILED
;
1205 if ( numChars
== 2 )
1208 if ( ++outLen
> dstLen
)
1209 return wxCONV_FAILED
;
1219 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1220 const wchar_t *src
, size_t srcLen
) const
1222 if ( srcLen
== wxNO_LEN
)
1223 srcLen
= wxWcslen(src
) + 1;
1226 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1227 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1230 const size_t numChars
= encode_utf16(*src
, cc
);
1231 if ( numChars
== wxCONV_FAILED
)
1232 return wxCONV_FAILED
;
1234 outLen
+= numChars
* BYTES_PER_CHAR
;
1237 if ( outLen
> dstLen
)
1238 return wxCONV_FAILED
;
1240 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1241 if ( numChars
== 2 )
1243 // second character of a surrogate
1244 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1252 #endif // WC_UTF16/!WC_UTF16
1255 // ============================================================================
1257 // ============================================================================
1259 #ifdef WORDS_BIGENDIAN
1260 #define wxMBConvUTF32straight wxMBConvUTF32BE
1261 #define wxMBConvUTF32swap wxMBConvUTF32LE
1263 #define wxMBConvUTF32swap wxMBConvUTF32BE
1264 #define wxMBConvUTF32straight wxMBConvUTF32LE
1268 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1269 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1272 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1274 if ( srcLen
== wxNO_LEN
)
1276 // count the number of bytes in input, including the trailing NULs
1277 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1278 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1281 srcLen
*= BYTES_PER_CHAR
;
1283 else // we already have the length
1285 // we can only convert an entire number of UTF-32 characters
1286 if ( srcLen
% BYTES_PER_CHAR
)
1287 return wxCONV_FAILED
;
1293 // case when in-memory representation is UTF-16
1296 // ----------------------------------------------------------------------------
1297 // conversions without endianness change
1298 // ----------------------------------------------------------------------------
1301 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1302 const char *src
, size_t srcLen
) const
1304 srcLen
= GetLength(src
, srcLen
);
1305 if ( srcLen
== wxNO_LEN
)
1306 return wxCONV_FAILED
;
1308 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1309 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1311 for ( size_t n
= 0; n
< inLen
; n
++ )
1314 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1315 if ( numChars
== wxCONV_FAILED
)
1316 return wxCONV_FAILED
;
1321 if ( outLen
> dstLen
)
1322 return wxCONV_FAILED
;
1325 if ( numChars
== 2 )
1327 // second character of a surrogate
1337 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1338 const wchar_t *src
, size_t srcLen
) const
1340 if ( srcLen
== wxNO_LEN
)
1341 srcLen
= wxWcslen(src
) + 1;
1345 // optimization: return maximal space which could be needed for this
1346 // string instead of the exact amount which could be less if there are
1347 // any surrogates in the input
1349 // we consider that surrogates are rare enough to make it worthwhile to
1350 // avoid running the loop below at the cost of slightly extra memory
1352 return srcLen
* BYTES_PER_CHAR
;
1355 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1357 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1359 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1361 return wxCONV_FAILED
;
1363 outLen
+= BYTES_PER_CHAR
;
1365 if ( outLen
> dstLen
)
1366 return wxCONV_FAILED
;
1374 // ----------------------------------------------------------------------------
1375 // endian-reversing conversions
1376 // ----------------------------------------------------------------------------
1379 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1380 const char *src
, size_t srcLen
) const
1382 srcLen
= GetLength(src
, srcLen
);
1383 if ( srcLen
== wxNO_LEN
)
1384 return wxCONV_FAILED
;
1386 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1387 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1389 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1392 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1393 if ( numChars
== wxCONV_FAILED
)
1394 return wxCONV_FAILED
;
1399 if ( outLen
> dstLen
)
1400 return wxCONV_FAILED
;
1403 if ( numChars
== 2 )
1405 // second character of a surrogate
1415 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1416 const wchar_t *src
, size_t srcLen
) const
1418 if ( srcLen
== wxNO_LEN
)
1419 srcLen
= wxWcslen(src
) + 1;
1423 // optimization: return maximal space which could be needed for this
1424 // string instead of the exact amount which could be less if there are
1425 // any surrogates in the input
1427 // we consider that surrogates are rare enough to make it worthwhile to
1428 // avoid running the loop below at the cost of slightly extra memory
1430 return srcLen
*BYTES_PER_CHAR
;
1433 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1435 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1437 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1439 return wxCONV_FAILED
;
1441 outLen
+= BYTES_PER_CHAR
;
1443 if ( outLen
> dstLen
)
1444 return wxCONV_FAILED
;
1446 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1452 #else // !WC_UTF16: wchar_t is UTF-32
1454 // ----------------------------------------------------------------------------
1455 // conversions without endianness change
1456 // ----------------------------------------------------------------------------
1459 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1460 const char *src
, size_t srcLen
) const
1462 // use memcpy() as it should be much faster than hand-written loop
1463 srcLen
= GetLength(src
, srcLen
);
1464 if ( srcLen
== wxNO_LEN
)
1465 return wxCONV_FAILED
;
1467 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1470 if ( dstLen
< inLen
)
1471 return wxCONV_FAILED
;
1473 memcpy(dst
, src
, srcLen
);
1480 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1481 const wchar_t *src
, size_t srcLen
) const
1483 if ( srcLen
== wxNO_LEN
)
1484 srcLen
= wxWcslen(src
) + 1;
1486 srcLen
*= BYTES_PER_CHAR
;
1490 if ( dstLen
< srcLen
)
1491 return wxCONV_FAILED
;
1493 memcpy(dst
, src
, srcLen
);
1499 // ----------------------------------------------------------------------------
1500 // endian-reversing conversions
1501 // ----------------------------------------------------------------------------
1504 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1505 const char *src
, size_t srcLen
) const
1507 srcLen
= GetLength(src
, srcLen
);
1508 if ( srcLen
== wxNO_LEN
)
1509 return wxCONV_FAILED
;
1511 srcLen
/= BYTES_PER_CHAR
;
1515 if ( dstLen
< srcLen
)
1516 return wxCONV_FAILED
;
1518 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1519 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1521 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1529 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1530 const wchar_t *src
, size_t srcLen
) const
1532 if ( srcLen
== wxNO_LEN
)
1533 srcLen
= wxWcslen(src
) + 1;
1535 srcLen
*= BYTES_PER_CHAR
;
1539 if ( dstLen
< srcLen
)
1540 return wxCONV_FAILED
;
1542 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1543 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1545 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1552 #endif // WC_UTF16/!WC_UTF16
1555 // ============================================================================
1556 // The classes doing conversion using the iconv_xxx() functions
1557 // ============================================================================
1561 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1562 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1563 // (unless there's yet another bug in glibc) the only case when iconv()
1564 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1565 // left in the input buffer -- when _real_ error occurs,
1566 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1568 // [This bug does not appear in glibc 2.2.]
1569 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1570 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1571 (errno != E2BIG || bufLeft != 0))
1573 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1576 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1578 #define ICONV_T_INVALID ((iconv_t)-1)
1580 #if SIZEOF_WCHAR_T == 4
1581 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1582 #define WC_ENC wxFONTENCODING_UTF32
1583 #elif SIZEOF_WCHAR_T == 2
1584 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1585 #define WC_ENC wxFONTENCODING_UTF16
1586 #else // sizeof(wchar_t) != 2 nor 4
1587 // does this ever happen?
1588 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1591 // ----------------------------------------------------------------------------
1592 // wxMBConv_iconv: encapsulates an iconv character set
1593 // ----------------------------------------------------------------------------
1595 class wxMBConv_iconv
: public wxMBConv
1598 wxMBConv_iconv(const char *name
);
1599 virtual ~wxMBConv_iconv();
1601 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1602 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1604 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1605 virtual size_t GetMBNulLen() const;
1607 #if wxUSE_UNICODE_UTF8
1608 virtual bool IsUTF8() const;
1611 virtual wxMBConv
*Clone() const
1613 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1614 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1619 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1622 // the iconv handlers used to translate from multibyte
1623 // to wide char and in the other direction
1628 // guards access to m2w and w2m objects
1629 wxMutex m_iconvMutex
;
1633 // the name (for iconv_open()) of a wide char charset -- if none is
1634 // available on this machine, it will remain NULL
1635 static wxString ms_wcCharsetName
;
1637 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1638 // different endian-ness than the native one
1639 static bool ms_wcNeedsSwap
;
1642 // name of the encoding handled by this conversion
1645 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1647 size_t m_minMBCharWidth
;
1650 // make the constructor available for unit testing
1651 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
1653 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1654 if ( !result
->IsOk() )
1663 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1664 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1666 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
1669 m_minMBCharWidth
= 0;
1671 // check for charset that represents wchar_t:
1672 if ( ms_wcCharsetName
.empty() )
1674 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1677 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1678 #else // !wxUSE_FONTMAP
1679 static const wxChar
*names_static
[] =
1681 #if SIZEOF_WCHAR_T == 4
1683 #elif SIZEOF_WCHAR_T = 2
1688 const wxChar
**names
= names_static
;
1689 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1691 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1693 const wxString
nameCS(*names
);
1695 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1696 wxString
nameXE(nameCS
);
1698 #ifdef WORDS_BIGENDIAN
1700 #else // little endian
1704 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1707 m2w
= iconv_open(nameXE
.ToAscii(), name
);
1708 if ( m2w
== ICONV_T_INVALID
)
1710 // try charset w/o bytesex info (e.g. "UCS4")
1711 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1713 m2w
= iconv_open(nameCS
.ToAscii(), name
);
1715 // and check for bytesex ourselves:
1716 if ( m2w
!= ICONV_T_INVALID
)
1718 char buf
[2], *bufPtr
;
1719 wchar_t wbuf
[2], *wbufPtr
;
1727 outsz
= SIZEOF_WCHAR_T
* 2;
1732 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1733 (char**)&wbufPtr
, &outsz
);
1735 if (ICONV_FAILED(res
, insz
))
1737 wxLogLastError(wxT("iconv"));
1738 wxLogError(_("Conversion to charset '%s' doesn't work."),
1741 else // ok, can convert to this encoding, remember it
1743 ms_wcCharsetName
= nameCS
;
1744 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1748 else // use charset not requiring byte swapping
1750 ms_wcCharsetName
= nameXE
;
1754 wxLogTrace(TRACE_STRCONV
,
1755 wxT("iconv wchar_t charset is \"%s\"%s"),
1756 ms_wcCharsetName
.empty() ? wxString("<none>")
1758 ms_wcNeedsSwap
? _T(" (needs swap)")
1761 else // we already have ms_wcCharsetName
1763 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
1766 if ( ms_wcCharsetName
.empty() )
1768 w2m
= ICONV_T_INVALID
;
1772 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
1773 if ( w2m
== ICONV_T_INVALID
)
1775 wxLogTrace(TRACE_STRCONV
,
1776 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1777 ms_wcCharsetName
.c_str(), name
);
1782 wxMBConv_iconv::~wxMBConv_iconv()
1784 if ( m2w
!= ICONV_T_INVALID
)
1786 if ( w2m
!= ICONV_T_INVALID
)
1790 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1792 // find the string length: notice that this must be done differently for
1793 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1795 const size_t nulLen
= GetMBNulLen();
1799 return wxCONV_FAILED
;
1802 inbuf
= strlen(psz
); // arguably more optimized than our version
1807 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1808 // they also have to start at character boundary and not span two
1809 // adjacent characters
1811 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1818 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1819 // Unfortunately there are a couple of global wxCSConv objects such as
1820 // wxConvLocal that are used all over wx code, so we have to make sure
1821 // the handle is used by at most one thread at the time. Otherwise
1822 // only a few wx classes would be safe to use from non-main threads
1823 // as MB<->WC conversion would fail "randomly".
1824 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1825 #endif // wxUSE_THREADS
1827 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1829 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1830 wchar_t *bufPtr
= buf
;
1831 const char *pszPtr
= psz
;
1835 // have destination buffer, convert there
1837 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1838 (char**)&bufPtr
, &outbuf
);
1839 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1843 // convert to native endianness
1844 for ( unsigned i
= 0; i
< res
; i
++ )
1845 buf
[n
] = WC_BSWAP(buf
[i
]);
1848 // NUL-terminate the string if there is any space left
1854 // no destination buffer... convert using temp buffer
1855 // to calculate destination buffer requirement
1862 outbuf
= 8 * SIZEOF_WCHAR_T
;
1865 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1866 (char**)&bufPtr
, &outbuf
);
1868 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
1870 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1873 if (ICONV_FAILED(cres
, inbuf
))
1875 //VS: it is ok if iconv fails, hence trace only
1876 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1877 return wxCONV_FAILED
;
1883 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1886 // NB: explained in MB2WC
1887 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1890 size_t inlen
= wxWcslen(psz
);
1891 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1895 wchar_t *tmpbuf
= 0;
1899 // need to copy to temp buffer to switch endianness
1900 // (doing WC_BSWAP twice on the original buffer won't help, as it
1901 // could be in read-only memory, or be accessed in some other thread)
1902 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1903 for ( size_t i
= 0; i
< inlen
; i
++ )
1904 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1906 tmpbuf
[inlen
] = L
'\0';
1912 // have destination buffer, convert there
1913 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1917 // NB: iconv was given only wcslen(psz) characters on input, and so
1918 // it couldn't convert the trailing zero. Let's do it ourselves
1919 // if there's some room left for it in the output buffer.
1925 // no destination buffer: convert using temp buffer
1926 // to calculate destination buffer requirement
1934 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1938 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1946 if (ICONV_FAILED(cres
, inbuf
))
1948 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1949 return wxCONV_FAILED
;
1955 size_t wxMBConv_iconv::GetMBNulLen() const
1957 if ( m_minMBCharWidth
== 0 )
1959 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1962 // NB: explained in MB2WC
1963 wxMutexLocker
lock(self
->m_iconvMutex
);
1966 const wchar_t *wnul
= L
"";
1967 char buf
[8]; // should be enough for NUL in any encoding
1968 size_t inLen
= sizeof(wchar_t),
1969 outLen
= WXSIZEOF(buf
);
1970 char *inBuff
= (char *)wnul
;
1971 char *outBuff
= buf
;
1972 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
1974 self
->m_minMBCharWidth
= (size_t)-1;
1978 self
->m_minMBCharWidth
= outBuff
- buf
;
1982 return m_minMBCharWidth
;
1985 #if wxUSE_UNICODE_UTF8
1986 bool wxMBConv_iconv::IsUTF8() const
1988 return wxStricmp(m_name
, "UTF-8") == 0 ||
1989 wxStricmp(m_name
, "UTF8") == 0;
1993 #endif // HAVE_ICONV
1996 // ============================================================================
1997 // Win32 conversion classes
1998 // ============================================================================
2000 #ifdef wxHAVE_WIN32_MB2WC
2004 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2005 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2008 class wxMBConv_win32
: public wxMBConv
2013 m_CodePage
= CP_ACP
;
2014 m_minMBCharWidth
= 0;
2017 wxMBConv_win32(const wxMBConv_win32
& conv
)
2020 m_CodePage
= conv
.m_CodePage
;
2021 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2025 wxMBConv_win32(const char* name
)
2027 m_CodePage
= wxCharsetToCodepage(name
);
2028 m_minMBCharWidth
= 0;
2031 wxMBConv_win32(wxFontEncoding encoding
)
2033 m_CodePage
= wxEncodingToCodepage(encoding
);
2034 m_minMBCharWidth
= 0;
2036 #endif // wxUSE_FONTMAP
2038 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2040 // note that we have to use the MB_ERR_INVALID_CHARS flag as without it
2041 // the behaviour is not compatible with the Unix version (using iconv)
2042 // and break the library itself, e.g. wxTextInputStream::NextChar()
2043 // wouldn't work if reading an incomplete MB char didn't result in an
2046 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2047 // Win XP or newer and it is not supported for UTF-[78] so we always
2048 // use our own conversions in this case. See
2049 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2050 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2051 if ( m_CodePage
== CP_UTF8
)
2053 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2056 if ( m_CodePage
== CP_UTF7
)
2058 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2062 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2063 IsAtLeastWin2kSP4() )
2065 flags
= MB_ERR_INVALID_CHARS
;
2068 const size_t len
= ::MultiByteToWideChar
2070 m_CodePage
, // code page
2071 flags
, // flags: fall on error
2072 psz
, // input string
2073 -1, // its length (NUL-terminated)
2074 buf
, // output string
2075 buf
? n
: 0 // size of output buffer
2079 // function totally failed
2080 return wxCONV_FAILED
;
2083 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2084 // check if we succeeded, by doing a double trip:
2085 if ( !flags
&& buf
)
2087 const size_t mbLen
= strlen(psz
);
2088 wxCharBuffer
mbBuf(mbLen
);
2089 if ( ::WideCharToMultiByte
2096 mbLen
+ 1, // size in bytes, not length
2100 strcmp(mbBuf
, psz
) != 0 )
2102 // we didn't obtain the same thing we started from, hence
2103 // the conversion was lossy and we consider that it failed
2104 return wxCONV_FAILED
;
2108 // note that it returns count of written chars for buf != NULL and size
2109 // of the needed buffer for buf == NULL so in either case the length of
2110 // the string (which never includes the terminating NUL) is one less
2114 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2117 we have a problem here: by default, WideCharToMultiByte() may
2118 replace characters unrepresentable in the target code page with bad
2119 quality approximations such as turning "1/2" symbol (U+00BD) into
2120 "1" for the code pages which don't have it and we, obviously, want
2121 to avoid this at any price
2123 the trouble is that this function does it _silently_, i.e. it won't
2124 even tell us whether it did or not... Win98/2000 and higher provide
2125 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2126 we have to resort to a round trip, i.e. check that converting back
2127 results in the same string -- this is, of course, expensive but
2128 otherwise we simply can't be sure to not garble the data.
2131 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2132 // it doesn't work with CJK encodings (which we test for rather roughly
2133 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2135 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2138 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2140 // it's our lucky day
2141 flags
= WC_NO_BEST_FIT_CHARS
;
2142 pUsedDef
= &usedDef
;
2144 else // old system or unsupported encoding
2150 const size_t len
= ::WideCharToMultiByte
2152 m_CodePage
, // code page
2153 flags
, // either none or no best fit
2154 pwz
, // input string
2155 -1, // it is (wide) NUL-terminated
2156 buf
, // output buffer
2157 buf
? n
: 0, // and its size
2158 NULL
, // default "replacement" char
2159 pUsedDef
// [out] was it used?
2164 // function totally failed
2165 return wxCONV_FAILED
;
2168 // if we were really converting, check if we succeeded
2173 // check if the conversion failed, i.e. if any replacements
2176 return wxCONV_FAILED
;
2178 else // we must resort to double tripping...
2180 wxWCharBuffer
wcBuf(n
);
2181 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2182 wcscmp(wcBuf
, pwz
) != 0 )
2184 // we didn't obtain the same thing we started from, hence
2185 // the conversion was lossy and we consider that it failed
2186 return wxCONV_FAILED
;
2191 // see the comment above for the reason of "len - 1"
2195 virtual size_t GetMBNulLen() const
2197 if ( m_minMBCharWidth
== 0 )
2199 int len
= ::WideCharToMultiByte
2201 m_CodePage
, // code page
2203 L
"", // input string
2204 1, // translate just the NUL
2205 NULL
, // output buffer
2207 NULL
, // no replacement char
2208 NULL
// [out] don't care if it was used
2211 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2215 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2216 self
->m_minMBCharWidth
= (size_t)-1;
2220 self
->m_minMBCharWidth
= (size_t)-1;
2226 self
->m_minMBCharWidth
= len
;
2231 return m_minMBCharWidth
;
2234 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2236 bool IsOk() const { return m_CodePage
!= -1; }
2239 static bool CanUseNoBestFit()
2241 static int s_isWin98Or2k
= -1;
2243 if ( s_isWin98Or2k
== -1 )
2246 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2248 case wxOS_WINDOWS_9X
:
2249 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2252 case wxOS_WINDOWS_NT
:
2253 s_isWin98Or2k
= verMaj
>= 5;
2257 // unknown: be conservative by default
2262 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2265 return s_isWin98Or2k
== 1;
2268 static bool IsAtLeastWin2kSP4()
2273 static int s_isAtLeastWin2kSP4
= -1;
2275 if ( s_isAtLeastWin2kSP4
== -1 )
2277 OSVERSIONINFOEX ver
;
2279 memset(&ver
, 0, sizeof(ver
));
2280 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2281 GetVersionEx((OSVERSIONINFO
*)&ver
);
2283 s_isAtLeastWin2kSP4
=
2284 ((ver
.dwMajorVersion
> 5) || // Vista+
2285 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2286 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2287 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2291 return s_isAtLeastWin2kSP4
== 1;
2296 // the code page we're working with
2299 // cached result of GetMBNulLen(), set to 0 initially meaning
2301 size_t m_minMBCharWidth
;
2304 #endif // wxHAVE_WIN32_MB2WC
2306 // ============================================================================
2307 // CoreFoundation conversion classes
2308 // ============================================================================
2312 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2314 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2318 case wxFONTENCODING_DEFAULT
:
2319 enc
= CFStringGetSystemEncoding();
2322 case wxFONTENCODING_ISO8859_1
:
2323 enc
= kCFStringEncodingISOLatin1
;
2325 case wxFONTENCODING_ISO8859_2
:
2326 enc
= kCFStringEncodingISOLatin2
;
2328 case wxFONTENCODING_ISO8859_3
:
2329 enc
= kCFStringEncodingISOLatin3
;
2331 case wxFONTENCODING_ISO8859_4
:
2332 enc
= kCFStringEncodingISOLatin4
;
2334 case wxFONTENCODING_ISO8859_5
:
2335 enc
= kCFStringEncodingISOLatinCyrillic
;
2337 case wxFONTENCODING_ISO8859_6
:
2338 enc
= kCFStringEncodingISOLatinArabic
;
2340 case wxFONTENCODING_ISO8859_7
:
2341 enc
= kCFStringEncodingISOLatinGreek
;
2343 case wxFONTENCODING_ISO8859_8
:
2344 enc
= kCFStringEncodingISOLatinHebrew
;
2346 case wxFONTENCODING_ISO8859_9
:
2347 enc
= kCFStringEncodingISOLatin5
;
2349 case wxFONTENCODING_ISO8859_10
:
2350 enc
= kCFStringEncodingISOLatin6
;
2352 case wxFONTENCODING_ISO8859_11
:
2353 enc
= kCFStringEncodingISOLatinThai
;
2355 case wxFONTENCODING_ISO8859_13
:
2356 enc
= kCFStringEncodingISOLatin7
;
2358 case wxFONTENCODING_ISO8859_14
:
2359 enc
= kCFStringEncodingISOLatin8
;
2361 case wxFONTENCODING_ISO8859_15
:
2362 enc
= kCFStringEncodingISOLatin9
;
2365 case wxFONTENCODING_KOI8
:
2366 enc
= kCFStringEncodingKOI8_R
;
2368 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2369 enc
= kCFStringEncodingDOSRussian
;
2372 // case wxFONTENCODING_BULGARIAN :
2376 case wxFONTENCODING_CP437
:
2377 enc
= kCFStringEncodingDOSLatinUS
;
2379 case wxFONTENCODING_CP850
:
2380 enc
= kCFStringEncodingDOSLatin1
;
2382 case wxFONTENCODING_CP852
:
2383 enc
= kCFStringEncodingDOSLatin2
;
2385 case wxFONTENCODING_CP855
:
2386 enc
= kCFStringEncodingDOSCyrillic
;
2388 case wxFONTENCODING_CP866
:
2389 enc
= kCFStringEncodingDOSRussian
;
2391 case wxFONTENCODING_CP874
:
2392 enc
= kCFStringEncodingDOSThai
;
2394 case wxFONTENCODING_CP932
:
2395 enc
= kCFStringEncodingDOSJapanese
;
2397 case wxFONTENCODING_CP936
:
2398 enc
= kCFStringEncodingDOSChineseSimplif
;
2400 case wxFONTENCODING_CP949
:
2401 enc
= kCFStringEncodingDOSKorean
;
2403 case wxFONTENCODING_CP950
:
2404 enc
= kCFStringEncodingDOSChineseTrad
;
2406 case wxFONTENCODING_CP1250
:
2407 enc
= kCFStringEncodingWindowsLatin2
;
2409 case wxFONTENCODING_CP1251
:
2410 enc
= kCFStringEncodingWindowsCyrillic
;
2412 case wxFONTENCODING_CP1252
:
2413 enc
= kCFStringEncodingWindowsLatin1
;
2415 case wxFONTENCODING_CP1253
:
2416 enc
= kCFStringEncodingWindowsGreek
;
2418 case wxFONTENCODING_CP1254
:
2419 enc
= kCFStringEncodingWindowsLatin5
;
2421 case wxFONTENCODING_CP1255
:
2422 enc
= kCFStringEncodingWindowsHebrew
;
2424 case wxFONTENCODING_CP1256
:
2425 enc
= kCFStringEncodingWindowsArabic
;
2427 case wxFONTENCODING_CP1257
:
2428 enc
= kCFStringEncodingWindowsBalticRim
;
2430 // This only really encodes to UTF7 (if that) evidently
2431 // case wxFONTENCODING_UTF7 :
2432 // enc = kCFStringEncodingNonLossyASCII ;
2434 case wxFONTENCODING_UTF8
:
2435 enc
= kCFStringEncodingUTF8
;
2437 case wxFONTENCODING_EUC_JP
:
2438 enc
= kCFStringEncodingEUC_JP
;
2440 /* Don't support conversion to/from UTF16 as wxWidgets can do this better.
2441 * In particular, ToWChar would fail miserably using strlen on an input UTF16.
2442 case wxFONTENCODING_UTF16 :
2443 enc = kCFStringEncodingUnicode ;
2446 case wxFONTENCODING_MACROMAN
:
2447 enc
= kCFStringEncodingMacRoman
;
2449 case wxFONTENCODING_MACJAPANESE
:
2450 enc
= kCFStringEncodingMacJapanese
;
2452 case wxFONTENCODING_MACCHINESETRAD
:
2453 enc
= kCFStringEncodingMacChineseTrad
;
2455 case wxFONTENCODING_MACKOREAN
:
2456 enc
= kCFStringEncodingMacKorean
;
2458 case wxFONTENCODING_MACARABIC
:
2459 enc
= kCFStringEncodingMacArabic
;
2461 case wxFONTENCODING_MACHEBREW
:
2462 enc
= kCFStringEncodingMacHebrew
;
2464 case wxFONTENCODING_MACGREEK
:
2465 enc
= kCFStringEncodingMacGreek
;
2467 case wxFONTENCODING_MACCYRILLIC
:
2468 enc
= kCFStringEncodingMacCyrillic
;
2470 case wxFONTENCODING_MACDEVANAGARI
:
2471 enc
= kCFStringEncodingMacDevanagari
;
2473 case wxFONTENCODING_MACGURMUKHI
:
2474 enc
= kCFStringEncodingMacGurmukhi
;
2476 case wxFONTENCODING_MACGUJARATI
:
2477 enc
= kCFStringEncodingMacGujarati
;
2479 case wxFONTENCODING_MACORIYA
:
2480 enc
= kCFStringEncodingMacOriya
;
2482 case wxFONTENCODING_MACBENGALI
:
2483 enc
= kCFStringEncodingMacBengali
;
2485 case wxFONTENCODING_MACTAMIL
:
2486 enc
= kCFStringEncodingMacTamil
;
2488 case wxFONTENCODING_MACTELUGU
:
2489 enc
= kCFStringEncodingMacTelugu
;
2491 case wxFONTENCODING_MACKANNADA
:
2492 enc
= kCFStringEncodingMacKannada
;
2494 case wxFONTENCODING_MACMALAJALAM
:
2495 enc
= kCFStringEncodingMacMalayalam
;
2497 case wxFONTENCODING_MACSINHALESE
:
2498 enc
= kCFStringEncodingMacSinhalese
;
2500 case wxFONTENCODING_MACBURMESE
:
2501 enc
= kCFStringEncodingMacBurmese
;
2503 case wxFONTENCODING_MACKHMER
:
2504 enc
= kCFStringEncodingMacKhmer
;
2506 case wxFONTENCODING_MACTHAI
:
2507 enc
= kCFStringEncodingMacThai
;
2509 case wxFONTENCODING_MACLAOTIAN
:
2510 enc
= kCFStringEncodingMacLaotian
;
2512 case wxFONTENCODING_MACGEORGIAN
:
2513 enc
= kCFStringEncodingMacGeorgian
;
2515 case wxFONTENCODING_MACARMENIAN
:
2516 enc
= kCFStringEncodingMacArmenian
;
2518 case wxFONTENCODING_MACCHINESESIMP
:
2519 enc
= kCFStringEncodingMacChineseSimp
;
2521 case wxFONTENCODING_MACTIBETAN
:
2522 enc
= kCFStringEncodingMacTibetan
;
2524 case wxFONTENCODING_MACMONGOLIAN
:
2525 enc
= kCFStringEncodingMacMongolian
;
2527 case wxFONTENCODING_MACETHIOPIC
:
2528 enc
= kCFStringEncodingMacEthiopic
;
2530 case wxFONTENCODING_MACCENTRALEUR
:
2531 enc
= kCFStringEncodingMacCentralEurRoman
;
2533 case wxFONTENCODING_MACVIATNAMESE
:
2534 enc
= kCFStringEncodingMacVietnamese
;
2536 case wxFONTENCODING_MACARABICEXT
:
2537 enc
= kCFStringEncodingMacExtArabic
;
2539 case wxFONTENCODING_MACSYMBOL
:
2540 enc
= kCFStringEncodingMacSymbol
;
2542 case wxFONTENCODING_MACDINGBATS
:
2543 enc
= kCFStringEncodingMacDingbats
;
2545 case wxFONTENCODING_MACTURKISH
:
2546 enc
= kCFStringEncodingMacTurkish
;
2548 case wxFONTENCODING_MACCROATIAN
:
2549 enc
= kCFStringEncodingMacCroatian
;
2551 case wxFONTENCODING_MACICELANDIC
:
2552 enc
= kCFStringEncodingMacIcelandic
;
2554 case wxFONTENCODING_MACROMANIAN
:
2555 enc
= kCFStringEncodingMacRomanian
;
2557 case wxFONTENCODING_MACCELTIC
:
2558 enc
= kCFStringEncodingMacCeltic
;
2560 case wxFONTENCODING_MACGAELIC
:
2561 enc
= kCFStringEncodingMacGaelic
;
2563 // case wxFONTENCODING_MACKEYBOARD :
2564 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2568 // because gcc is picky
2575 #if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4
2576 // Provide a constant for the wchar_t encoding used by the host platform.
2577 #ifdef WORDS_BIGENDIAN
2578 static const CFStringEncoding wxCFStringEncodingWcharT
= kCFStringEncodingUTF32BE
;
2580 static const CFStringEncoding wxCFStringEncodingWcharT
= kCFStringEncodingUTF32LE
;
2583 #endif /* MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4 */
2585 class wxMBConv_cf
: public wxMBConv
2590 Init(CFStringGetSystemEncoding()) ;
2593 wxMBConv_cf(const wxMBConv_cf
& conv
)
2595 m_encoding
= conv
.m_encoding
;
2599 wxMBConv_cf(const char* name
)
2601 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2605 wxMBConv_cf(wxFontEncoding encoding
)
2607 Init( wxCFStringEncFromFontEnc(encoding
) );
2610 virtual ~wxMBConv_cf()
2614 void Init( CFStringEncoding encoding
)
2616 m_encoding
= encoding
;
2619 virtual size_t ToWChar(wchar_t * dst
, size_t dstSize
, const char * src
, size_t srcSize
= wxNO_LEN
) const
2621 wxCHECK(src
, wxCONV_FAILED
);
2623 /* NOTE: This is wrong if the source encoding has an element size
2624 * other than char (e.g. it's kCFStringEncodingUnicode)
2625 * If the user specifies it, it's presumably right though.
2626 * Right now we don't support UTF-16 in any way since wx can do a better job.
2628 if(srcSize
== wxNO_LEN
)
2629 srcSize
= strlen(src
) + 1;
2631 // First create the temporary CFString
2632 wxCFRef
<CFStringRef
> theString( CFStringCreateWithBytes (
2633 NULL
, //the allocator
2637 false //no BOM/external representation
2640 wxCHECK(theString
!= NULL
, wxCONV_FAILED
);
2642 /* NOTE: The string content includes the NULL element if the source string did
2643 * That means we have to do nothing special because the destination will have
2644 * the NULL element iff the source did and the NULL element will be included
2645 * in the count iff it was included in the source count.
2649 /* If we're compiling against Tiger headers we can support direct conversion
2650 * to UTF32. If we are then run against a pre-Tiger system, the encoding
2651 * won't be available so we'll defer to the string->UTF-16->UTF-32 conversion.
2653 #if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4
2654 if(CFStringIsEncodingAvailable(wxCFStringEncodingWcharT
))
2656 CFRange fullStringRange
= CFRangeMake(0, CFStringGetLength(theString
));
2659 CFIndex charsConverted
= CFStringGetBytes(
2662 wxCFStringEncodingWcharT
,
2665 // if dstSize is 0 then pass NULL to get required length in usedBufLen
2666 dstSize
!= 0?(UInt8
*)dst
:NULL
,
2667 dstSize
* sizeof(wchar_t),
2670 // charsConverted is > 0 iff conversion succeeded
2671 if(charsConverted
<= 0)
2672 return wxCONV_FAILED
;
2674 /* usedBufLen is the number of bytes written, so we divide by
2675 * sizeof(wchar_t) to get the number of elements written.
2677 wxASSERT( (usedBufLen
% sizeof(wchar_t)) == 0 );
2679 // CFStringGetBytes does exactly the right thing when buffer
2680 // pointer is NULL and returns the number of bytes required
2681 return usedBufLen
/ sizeof(wchar_t);
2684 #endif /* MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4 */
2686 // NOTE: Includes NULL iff source did
2687 /* NOTE: This is an approximation. The eventual UTF-32 will
2688 * possibly have less elements but certainly not more.
2690 size_t returnSize
= CFStringGetLength(theString
);
2692 if (dstSize
== 0 || dst
== NULL
)
2697 // Convert the entire string.. too hard to figure out how many UTF-16 we'd need
2698 // for an undersized UTF-32 destination buffer.
2699 CFRange fullStringRange
= CFRangeMake(0, CFStringGetLength(theString
));
2700 UniChar
*szUniCharBuffer
= new UniChar
[fullStringRange
.length
];
2702 CFStringGetCharacters(theString
, fullStringRange
, szUniCharBuffer
);
2704 wxMBConvUTF16 converter
;
2705 returnSize
= converter
.ToWChar( dst
, dstSize
, (const char*)szUniCharBuffer
, fullStringRange
.length
);
2706 delete [] szUniCharBuffer
;
2713 virtual size_t FromWChar(char *dst
, size_t dstSize
, const wchar_t *src
, size_t srcSize
) const
2715 wxCHECK(src
, wxCONV_FAILED
);
2717 if(srcSize
== wxNO_LEN
)
2718 srcSize
= wxStrlen(src
) + 1;
2720 // Temporary CFString
2721 wxCFRef
<CFStringRef
> theString
;
2723 /* If we're compiling against Tiger headers we can support direct conversion
2724 * from UTF32. If we are then run against a pre-Tiger system, the encoding
2725 * won't be available so we'll defer to the UTF-32->UTF-16->string conversion.
2727 #if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4
2728 if(CFStringIsEncodingAvailable(wxCFStringEncodingWcharT
))
2730 theString
= wxCFRef
<CFStringRef
>(CFStringCreateWithBytes(
2731 kCFAllocatorDefault
,
2733 srcSize
* sizeof(wchar_t),
2734 wxCFStringEncodingWcharT
,
2738 #endif /* MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4 */
2740 wxMBConvUTF16 converter
;
2741 size_t cbUniBuffer
= converter
.FromWChar( NULL
, 0, src
, srcSize
);
2742 wxASSERT(cbUniBuffer
% sizeof(UniChar
));
2744 // Will be free'd by kCFAllocatorMalloc when CFString is released
2745 UniChar
*tmpUniBuffer
= (UniChar
*)malloc(cbUniBuffer
);
2747 cbUniBuffer
= converter
.FromWChar( (char*) tmpUniBuffer
, cbUniBuffer
, src
, srcSize
);
2748 wxASSERT(cbUniBuffer
% sizeof(UniChar
));
2750 theString
= wxCFRef
<CFStringRef
>(CFStringCreateWithCharactersNoCopy(
2751 kCFAllocatorDefault
,
2753 cbUniBuffer
/ sizeof(UniChar
),
2759 wxCHECK(theString
!= NULL
, wxCONV_FAILED
);
2763 CFIndex charsConverted
= CFStringGetBytes(
2765 CFRangeMake(0, CFStringGetLength(theString
)),
2767 0, // FAIL on unconvertible characters
2768 false, // not an external representation
2769 // if dstSize is 0 then pass NULL to get required length in usedBufLen
2770 (dstSize
!= 0)?(UInt8
*)dst
:NULL
,
2775 // charsConverted is > 0 iff conversion succeeded
2776 if(charsConverted
<= 0)
2777 return wxCONV_FAILED
;
2782 virtual wxMBConv
*Clone() const { return new wxMBConv_cf(*this); }
2786 return m_encoding
!= kCFStringEncodingInvalidId
&&
2787 CFStringIsEncodingAvailable(m_encoding
);
2791 CFStringEncoding m_encoding
;
2794 #endif // __DARWIN__
2796 // ============================================================================
2797 // Mac conversion classes
2798 // ============================================================================
2800 /* Although we are in the base library we currently have this wxMac
2801 * conditional. This is not generally good but fortunately does not affect
2802 * the ABI of the base library, only what encodings might work.
2803 * It does mean that a wxBase built as part of wxMac has slightly more support
2804 * than one built for wxCocoa or even wxGtk.
2806 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2808 class wxMBConv_mac
: public wxMBConv
2813 Init(CFStringGetSystemEncoding()) ;
2816 wxMBConv_mac(const wxMBConv_mac
& conv
)
2818 Init(conv
.m_char_encoding
);
2822 wxMBConv_mac(const char* name
)
2824 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) );
2828 wxMBConv_mac(wxFontEncoding encoding
)
2830 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2833 virtual ~wxMBConv_mac()
2835 OSStatus status
= noErr
;
2836 if (m_MB2WC_converter
)
2837 status
= TECDisposeConverter(m_MB2WC_converter
);
2838 if (m_WC2MB_converter
)
2839 status
= TECDisposeConverter(m_WC2MB_converter
);
2842 void Init( TextEncodingBase encoding
,TextEncodingVariant encodingVariant
= kTextEncodingDefaultVariant
,
2843 TextEncodingFormat encodingFormat
= kTextEncodingDefaultFormat
)
2845 m_MB2WC_converter
= NULL
;
2846 m_WC2MB_converter
= NULL
;
2847 m_char_encoding
= CreateTextEncoding(encoding
, encodingVariant
, encodingFormat
) ;
2848 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
, 0, kUnicode16BitFormat
) ;
2851 virtual void CreateIfNeeded() const
2853 if ( m_MB2WC_converter
== NULL
&& m_WC2MB_converter
== NULL
)
2855 OSStatus status
= noErr
;
2856 status
= TECCreateConverter(&m_MB2WC_converter
,
2858 m_unicode_encoding
);
2859 wxASSERT_MSG( status
== noErr
, _("Unable to create TextEncodingConverter")) ;
2860 status
= TECCreateConverter(&m_WC2MB_converter
,
2863 wxASSERT_MSG( status
== noErr
, _("Unable to create TextEncodingConverter")) ;
// Convert the NUL-terminated multibyte string psz to wide characters using
// the MB to WC TEC converter.
//   buf  destination; where it is NULL the temporary tbuf is used instead
//        (see the buf ? buf : tbuf selections below)
//   psz  input; strlen(psz) + 1 bytes are given to TEC, terminator included
//   n    destination capacity counted in elements
// NOTE(review): the guard around the tbuf allocation, the release of the
// temporary buffers, the declaration of res and the final return are not
// visible here - confirm against the full source.
2867 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2870 OSStatus status
= noErr
;
2871 ByteCount byteOutLen
;
2872 ByteCount byteInLen
= strlen(psz
) + 1;
2873 wchar_t *tbuf
= NULL
;
2874 UniChar
* ubuf
= NULL
;
2879 // Apple specs say at least 32
2880 n
= wxMax( 32, byteInLen
) ;
2881 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
// size in bytes of the UTF-16 output area given to TEC
2884 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
// with a 4 byte wchar_t TEC writes into a separate UniChar scratch buffer;
// the 2 extra bytes leave room for the terminator stored after the conversion
2886 #if SIZEOF_WCHAR_T == 4
2887 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
// presumably the other preprocessor branch: TEC writes straight into the destination
2889 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
// NOTE(review): status is not examined in the visible code
2892 status
= TECConvertText(
2893 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2894 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2896 #if SIZEOF_WCHAR_T == 4
2897 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2898 // is not properly terminated we get random characters at the end
2899 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
// widen the UTF-16 scratch data to wchar_t
2900 wxMBConvUTF16 converter
;
2901 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
// result is the number of UniChar units produced by TEC
2904 res
= byteOutLen
/ sizeof( UniChar
) ;
2910 if ( buf
&& res
< n
)
// Convert the NUL-terminated wide string psz to the multibyte encoding using
// the WC to MB TEC converter, then convert the result back with MB2WC() and
// fail if the round trip does not reproduce psz.
//   buf  destination; where it is NULL the temporary tbuf is used instead
//   psz  wide input
//   n    destination capacity in bytes
// Returns byteOutLen as reported by TEC, or wxCONV_FAILED for a lossy conversion.
// NOTE(review): the guard around the tbuf allocation, the release of the
// temporary buffers and the final return are not visible here - confirm.
2916 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2919 OSStatus status
= noErr
;
2920 ByteCount byteOutLen
;
2921 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2927 // Apple specs say at least 32
// 8 output bytes are reserved per input character
2928 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2929 tbuf
= (char*) malloc( n
) ;
2932 ByteCount byteBufferLen
= n
;
2933 UniChar
* ubuf
= NULL
;
// with a 4 byte wchar_t the input is first converted to UTF-16: the first
// WC2MB call (NULL destination) yields the length, the second fills ubuf
2935 #if SIZEOF_WCHAR_T == 4
2936 wxMBConvUTF16 converter
;
2937 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2938 byteInLen
= unicharlen
;
2939 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2940 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
// presumably the other preprocessor branch: psz is used directly as UniChar data
2942 ubuf
= (UniChar
*) psz
;
// NOTE(review): status is not examined in the visible code
2945 status
= TECConvertText(
2946 m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2947 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2949 #if SIZEOF_WCHAR_T == 4
2956 size_t res
= byteOutLen
;
2957 if ( buf
&& res
< n
)
2961 //we need to double-trip to verify it didn't insert any ? in place
2962 //of bogus characters
2963 wxWCharBuffer
wcBuf(n
);
2964 size_t pszlen
= wxWcslen(psz
);
2965 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2966 wxWcslen(wcBuf
) != pszlen
||
2967 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2969 // we didn't obtain the same thing we started from, hence
2970 // the conversion was lossy and we consider that it failed
2971 return wxCONV_FAILED
;
// Return a new heap-allocated copy of this converter made with the copy constructor.
2978 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
// True only when both TEC converters exist.
// NOTE(review): the signature of the method this statement belongs to is not visible here.
2983 return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
;
// TEC converter handles; mutable because the const CreateIfNeeded() fills them in
2987 mutable TECObjectRef m_MB2WC_converter
;
2988 mutable TECObjectRef m_WC2MB_converter
;
// encodings computed by Init() for the multibyte and the Unicode side
2990 TextEncodingBase m_char_encoding
;
2991 TextEncodingBase m_unicode_encoding
;
2994 // MB is decomposed (D) normalized UTF8
// Converter derived from wxMBConv_mac. In addition to the TEC converters of
// the base class it owns two Unicode converter objects created in
// CreateIfNeeded(): m_uni targets the canonical decomposition variant and
// m_uniBack the canonical composition variant.
2996 class wxMBConv_macUTF8D
: public wxMBConv_mac
// select the UTF-8 format of default Unicode as the multibyte encoding
3001 Init( kTextEncodingUnicodeDefault
, kUnicodeNoSubset
, kUnicodeUTF8Format
) ;
// Dispose of the Unicode converter objects.
3006 virtual ~wxMBConv_macUTF8D()
3009 DisposeUnicodeToTextInfo(&m_uni
);
3010 if (m_uniBack
!=NULL
)
3011 DisposeUnicodeToTextInfo(&m_uniBack
);
// Convert the wide string psz to UTF-8: the UTF-16 form of the input is
// first run through m_uni and the resulting buffer is then converted with
// the WC to MB TEC converter.
//   buf  destination; where it is NULL the temporary tbuf is used instead
//   n    destination capacity in bytes
// NOTE(review): guards, buffer releases and the final return are not visible
// here - confirm against the full source.
3014 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3017 OSStatus status
= noErr
;
3018 ByteCount byteOutLen
;
3019 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
3025 // Apple specs say at least 32
3026 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
3027 tbuf
= (char*) malloc( n
) ;
3030 ByteCount byteBufferLen
= n
;
3031 UniChar
* ubuf
= NULL
;
// with a 4 byte wchar_t the input is first converted to UTF-16
3033 #if SIZEOF_WCHAR_T == 4
3034 wxMBConvUTF16 converter
;
3035 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
3036 byteInLen
= unicharlen
;
3037 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
3038 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
3040 ubuf
= (UniChar
*) psz
;
3043 // ubuf is a non-decomposed UniChar buffer
// scratch buffer for the decomposed text: twice the input size plus 2 bytes
3045 ByteCount dcubuflen
= byteInLen
* 2 + 2 ;
3046 ByteCount dcubufread
, dcubufwritten
;
3047 UniChar
*dcubuf
= (UniChar
*) malloc( dcubuflen
) ;
3049 ConvertFromUnicodeToText( m_uni
, byteInLen
, ubuf
,
3050 kUnicodeDefaultDirectionMask
, 0, NULL
, NULL
, NULL
, dcubuflen
, &dcubufread
, &dcubufwritten
, dcubuf
) ;
3052 // we now convert that decomposed buffer into UTF8
3054 status
= TECConvertText(
3055 m_WC2MB_converter
, (ConstTextPtr
) dcubuf
, dcubufwritten
, &dcubufread
,
3056 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
3060 #if SIZEOF_WCHAR_T == 4
3067 size_t res
= byteOutLen
;
3068 if ( buf
&& res
< n
)
3071 // don't test for round-trip fidelity yet, we cannot guarantee it yet
// Convert the NUL-terminated UTF-8 string psz to wide characters: TEC first
// produces UniChar data in dcubuf, which is then run through m_uniBack into ubuf.
//   buf  destination; where it is NULL the temporary tbuf is used instead
//   n    destination capacity counted in elements
// NOTE(review): guards, buffer releases, the declaration of res and the
// final return are not visible here - confirm against the full source.
3077 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3080 OSStatus status
= noErr
;
3081 ByteCount byteOutLen
;
3082 ByteCount byteInLen
= strlen(psz
) + 1;
3083 wchar_t *tbuf
= NULL
;
3084 UniChar
* ubuf
= NULL
;
3089 // Apple specs say at least 32
3090 n
= wxMax( 32, byteInLen
) ;
3091 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
3094 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
3096 #if SIZEOF_WCHAR_T == 4
3097 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
3099 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
// scratch buffer receiving the TEC output before it goes through m_uniBack
3102 ByteCount dcubuflen
= byteBufferLen
* 2 + 2 ;
3103 ByteCount dcubufread
, dcubufwritten
;
3104 UniChar
*dcubuf
= (UniChar
*) malloc( dcubuflen
) ;
3106 status
= TECConvertText(
3107 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
3108 (TextPtr
) dcubuf
, dcubuflen
, &byteOutLen
);
3109 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3110 // is not properly terminated we get random characters at the end
3111 dcubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
3113 // now from the decomposed UniChar to properly composed uniChar
// NOTE(review): dcubuflen is passed as the capacity of ubuf here, although
// ubuf was sized from byteBufferLen above - confirm this cannot overrun.
3114 ConvertFromUnicodeToText( m_uniBack
, byteOutLen
, dcubuf
,
3115 kUnicodeDefaultDirectionMask
, 0, NULL
, NULL
, NULL
, dcubuflen
, &dcubufread
, &dcubufwritten
, ubuf
) ;
3118 byteOutLen
= dcubufwritten
;
3119 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
3122 #if SIZEOF_WCHAR_T == 4
3123 wxMBConvUTF16 converter
;
3124 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
3127 res
= byteOutLen
/ sizeof( UniChar
) ;
3133 if ( buf
&& res
< n
)
// Create the base class converters, then (only while m_uni is NULL) the two
// Unicode converter objects. Failures are only reported through wxASSERT_MSG.
3139 virtual void CreateIfNeeded() const
3141 wxMBConv_mac::CreateIfNeeded() ;
3142 if ( m_uni
== NULL
)
// mapping used for m_uni: default Unicode to its canonical decomposition variant
3144 m_map
.unicodeEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
3145 kUnicodeNoSubset
, kTextEncodingDefaultFormat
);
3146 m_map
.otherEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
3147 kUnicodeCanonicalDecompVariant
, kTextEncodingDefaultFormat
);
3148 m_map
.mappingVersion
= kUnicodeUseLatestMapping
;
3150 OSStatus err
= CreateUnicodeToTextInfo(&m_map
, &m_uni
);
3151 wxASSERT_MSG( err
== noErr
, _(" Couldn't create the UnicodeConverter")) ;
// m_map is reused for m_uniBack: default Unicode to its canonical composition variant
3153 m_map
.unicodeEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
3154 kUnicodeNoSubset
, kTextEncodingDefaultFormat
);
3155 m_map
.otherEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
3156 kUnicodeCanonicalCompVariant
, kTextEncodingDefaultFormat
);
3157 m_map
.mappingVersion
= kUnicodeUseLatestMapping
;
3158 err
= CreateUnicodeToTextInfo(&m_map
, &m_uniBack
);
3159 wxASSERT_MSG( err
== noErr
, _(" Couldn't create the UnicodeConverter")) ;
// converter objects and the mapping they are created from; mutable because
// the const CreateIfNeeded() assigns them
3163 mutable UnicodeToTextInfo m_uni
;
3164 mutable UnicodeToTextInfo m_uniBack
;
3165 mutable UnicodeMapping m_map
;
3167 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3169 // ============================================================================
3170 // wxEncodingConverter based conversion classes
3171 // ============================================================================
// wxMBConv implementation built on two wxEncodingConverter objects: m2w
// converts from m_enc to wxFONTENCODING_UNICODE and w2m the other way round.
3175 class wxMBConv_wxwin
: public wxMBConv
// m_ok is true only if both converters could be initialised for m_enc
// NOTE(review): the header of the function containing this statement is not visible here.
3180 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
3181 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
3185 // temporarily just use wxEncodingConverter stuff,
3186 // so that it works while a better implementation is built
// Construct from a charset name.
// NOTE(review): the condition selecting between the two m_enc assignments
// below is not visible here.
3187 wxMBConv_wxwin(const char* name
)
3190 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
3192 m_enc
= wxFONTENCODING_SYSTEM
;
// Construct from a font encoding value.
3197 wxMBConv_wxwin(wxFontEncoding enc
)
// Convert psz to wide characters with m2w; the size argument is unused.
// Returns wxCONV_FAILED when the conversion fails.
// NOTE(review): the guard before Convert() and the success return are not visible here.
3204 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
3206 size_t inbuf
= strlen(psz
);
3209 if (!m2w
.Convert(psz
, buf
))
3210 return wxCONV_FAILED
;
// Convert psz to the multibyte encoding with w2m; the size argument is unused.
// Returns wxCONV_FAILED when the conversion fails.
// NOTE(review): the guard before Convert() and the success return are not visible here.
3215 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
3217 const size_t inbuf
= wxWcslen(psz
);
3220 if (!w2m
.Convert(psz
, buf
))
3221 return wxCONV_FAILED
;
// The UTF-16 and UTF-32 encodings are singled out below.
// NOTE(review): the switch header and the values returned are not visible here.
3227 virtual size_t GetMBNulLen() const
3231 case wxFONTENCODING_UTF16BE
:
3232 case wxFONTENCODING_UTF16LE
:
3235 case wxFONTENCODING_UTF32BE
:
3236 case wxFONTENCODING_UTF32LE
:
// a clone is a fresh converter constructed for the same encoding
3244 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
3246 bool IsOk() const { return m_ok
; }
// encoding on the multibyte side and the converter for each direction
3249 wxFontEncoding m_enc
;
3250 wxEncodingConverter m2w
, w2m
;
3253 // were we initialized successfully?
3256 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
3259 // make the constructors available for unit testing
// Creates a wxMBConv_wxwin for the charset called name and checks IsOk() on it.
// NOTE(review): what happens when IsOk() fails and the value returned are
// not visible here.
3260 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
3262 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
3263 if ( !result
->IsOk() )
3272 #endif // wxUSE_FONTMAP
3274 // ============================================================================
3275 // wxCSConv implementation
3276 // ============================================================================
// Common initialisation helper of wxCSConv.
// NOTE(review): the body is not visible here.
3278 void wxCSConv::Init()
// Construct from a charset name. A non-empty name is stored through
// SetName() after conversion with ToAscii().
// NOTE(review): the condition selecting between the two m_encoding
// assignments below is not visible here.
3285 wxCSConv::wxCSConv(const wxString
& charset
)
3289 if ( !charset
.empty() )
3291 SetName(charset
.ToAscii());
3295 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
3297 m_encoding
= wxFONTENCODING_SYSTEM
;
// Construct from an encoding value. wxFONTENCODING_MAX and
// wxFONTENCODING_DEFAULT are rejected with wxFAIL_MSG and replaced by
// wxFONTENCODING_SYSTEM before the value is stored in m_encoding.
3301 wxCSConv::wxCSConv(wxFontEncoding encoding
)
3303 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
3305 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3307 encoding
= wxFONTENCODING_SYSTEM
;
3312 m_encoding
= encoding
;
// Destructor.
// NOTE(review): the body is not visible here.
3315 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name (through SetName()) and the
// encoding of conv.
// NOTE(review): any statements before SetName() are not visible here.
3320 wxCSConv::wxCSConv(const wxCSConv
& conv
)
3325 SetName(conv
.m_name
);
3326 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the charset name (through SetName()) and the
// encoding of conv.
// NOTE(review): statements before SetName() and the return are not visible
// here; verify that assigning an object to itself is safe.
3329 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
3333 SetName(conv
.m_name
);
3334 m_encoding
= conv
.m_encoding
;
// NOTE(review): the body is not visible here; document what is released
// once confirmed.
3339 void wxCSConv::Clear()
// Store a private copy of the charset name: m_name receives a strdup()
// duplicate of charset.
// NOTE(review): any guard around the copy is not visible here; the copy
// presumably has to be released with free() - confirm.
3348 void wxCSConv::SetName(const char *charset
)
3352 m_name
= strdup(charset
);
// Hash map type from wxFontEncoding to wxString and the file-static instance
// used by wxCSConv::DoCreate() to remember, per encoding, the charset name
// that worked with iconv (an empty string records a failure).
3359 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3360 wxEncodingNameCache
);
3362 static wxEncodingNameCache gs_nameCache
;
// Create the converter object that does the real work for this wxCSConv.
// The candidates are tried in the order listed in the comment below (OS
// conversion, hard coded UTF conversions, wxEncodingConverter); an error is
// logged when none of them applies.
// NOTE(review): many preprocessor branches, IsOk() checks and return
// statements of this function are not visible here - confirm details
// against the full source.
3365 wxMBConv
*wxCSConv::DoCreate() const
3368 wxLogTrace(TRACE_STRCONV
,
3369 wxT("creating conversion for %s"),
3371 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
3372 #endif // wxUSE_FONTMAP
3374 // check for the special case of ASCII or ISO8859-1 charset: as we have
3375 // special knowledge of it anyhow, we don't need to create a special
3376 // conversion object
3377 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3378 m_encoding
== wxFONTENCODING_DEFAULT
)
3380 // don't convert at all
3384 // we trust OS to do conversion better than we can so try external
3385 // conversion methods first
3387 // the full order is:
3388 // 1. OS conversion (iconv() under Unix or Win32 API)
3389 // 2. hard coded conversions for UTF
3390 // 3. wxEncodingConverter as fall back
3396 #endif // !wxUSE_FONTMAP
3399 wxFontEncoding
encoding(m_encoding
);
// iconv attempt using the charset name given by the user
3404 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3412 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3413 #endif // wxUSE_FONTMAP
// iconv attempt by encoding: consult the cache of names first
3417 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3418 if ( it
!= gs_nameCache
.end() )
3420 if ( it
->second
.empty() )
3423 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
// nothing cached: try every known name of the encoding
3430 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3431 // CS : in case this does not return valid names (eg for MacRoman)
3432 // encoding got a 'failure' entry in the cache all the same,
3433 // although it just has to be created using a different method, so
3434 // only store failed iconv creation attempts (or perhaps we
3435 // shouldn't do this at all ?)
3436 if ( names
[0] != NULL
)
3438 for ( ; *names
; ++names
)
3440 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3441 // will need changes that will obsolete this
3442 wxString
name(*names
);
3443 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
3446 gs_nameCache
[encoding
] = *names
;
3453 gs_nameCache
[encoding
] = _T(""); // cache the failure
3456 #endif // wxUSE_FONTMAP
3458 #endif // HAVE_ICONV
3460 #ifdef wxHAVE_WIN32_MB2WC
// Win32 converter, created from the name when there is one, else from the encoding
3463 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3464 : new wxMBConv_win32(m_encoding
);
3473 #endif // wxHAVE_WIN32_MB2WC
3475 #if defined(__WXMAC__)
3477 // leave UTF16 and UTF32 to the built-ins of wx
3478 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3479 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
// NOTE(review): the two declarations of conv below presumably sit in
// alternative preprocessor branches that are not visible here
3482 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
3483 : new wxMBConv_mac(m_encoding
);
3485 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
3497 // leave UTF16 and UTF32 to the built-ins of wx
3498 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3499 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
// NOTE(review): same remark as for the wxMBConv_mac declarations above
3502 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3503 : new wxMBConv_cf(m_encoding
);
3505 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3514 #endif // __DARWIN__
// hard coded UTF conversions, selected by encoding
3517 wxFontEncoding enc
= m_encoding
;
3519 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3521 // use "false" to suppress interactive dialogs -- we can be called from
3522 // anywhere and popping up a dialog from here is the last thing we want to
3524 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3526 #endif // wxUSE_FONTMAP
3530 case wxFONTENCODING_UTF7
:
3531 return new wxMBConvUTF7
;
3533 case wxFONTENCODING_UTF8
:
3534 return new wxMBConvUTF8
;
3536 case wxFONTENCODING_UTF16BE
:
3537 return new wxMBConvUTF16BE
;
3539 case wxFONTENCODING_UTF16LE
:
3540 return new wxMBConvUTF16LE
;
3542 case wxFONTENCODING_UTF32BE
:
3543 return new wxMBConvUTF32BE
;
3545 case wxFONTENCODING_UTF32LE
:
3546 return new wxMBConvUTF32LE
;
3549 // nothing to do but put here to suppress gcc warnings
// wxEncodingConverter based fall back
3556 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3557 : new wxMBConv_wxwin(m_encoding
);
3563 #endif // wxUSE_FONTMAP
3565 // NB: This is a hack to prevent deadlock. What could otherwise happen
3566 // in Unicode build: wxConvLocal creation ends up being here
3567 // because of some failure and logs the error. But wxLog will try to
3568 // attach a timestamp, for which it will need wxConvLocal (to convert
3569 // time to char* and then wchar_t*), but that fails, tries to log the
3570 // error, but wxLog has an (already locked) critical section that
3571 // guards the static buffer.
// the flag is set while the error is being logged and cleared afterwards,
// so a nested call arriving here does not log again
3572 static bool alreadyLoggingError
= false;
3573 if (!alreadyLoggingError
)
3575 alreadyLoggingError
= true;
3576 wxLogError(_("Cannot convert from the charset '%s'!"),
3580 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3581 #else // !wxUSE_FONTMAP
3582 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3583 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3586 alreadyLoggingError
= false;
// Create the real converter through DoCreate() and clear m_deferred. The
// method is const, so all writes go through a non-const alias of this.
// NOTE(review): the guard that makes this a one-time operation and the
// preprocessor condition around the two m_encoding assignments are not
// visible here.
3592 void wxCSConv::CreateConvIfNeeded() const
3596 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3598 // if we have neither the name nor the encoding, use the default
3599 // encoding for this system
3600 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3603 self
->m_encoding
= wxLocale::GetSystemEncoding();
3605 // fallback to some reasonable default:
3606 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3607 #endif // wxUSE_INTL
3610 self
->m_convReal
= DoCreate();
3611 self
->m_deferred
= false;
// Report whether this object can convert: always true for
// wxFONTENCODING_ISO8859_1, otherwise true when a real converter exists.
// The converter is created first if that has not happened yet.
3615 bool wxCSConv::IsOk() const
3617 CreateConvIfNeeded();
3619 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3620 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3621 return true; // always ok as we do it ourselves
3623 // m_convReal->IsOk() is called at its own creation, so we know it must
3624 // be ok if m_convReal is non-NULL
3625 return m_convReal
!= NULL
;
// Multibyte to wide conversion with explicit lengths: forwarded to the real
// converter, with the wxMBConv base class implementation as the alternative.
// NOTE(review): the condition choosing between the two returns is not
// visible here - presumably whether m_convReal is set.
3628 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3629 const char *src
, size_t srcLen
) const
3631 CreateConvIfNeeded();
3634 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3637 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
// Wide to multibyte conversion with explicit lengths: forwarded to the real
// converter, with the wxMBConv base class implementation as the alternative.
// NOTE(review): the condition choosing between the two returns is not
// visible here - presumably whether m_convReal is set.
3640 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3641 const wchar_t *src
, size_t srcLen
) const
3643 CreateConvIfNeeded();
3646 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3649 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
// Convert psz to wide characters: forwarded to the real converter; the
// visible fall back widens each byte of psz as an unsigned char.
// NOTE(review): the guards around the forwarding return and the copy loop,
// and the final return, are not visible here.
3652 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3654 CreateConvIfNeeded();
3657 return m_convReal
->MB2WC(buf
, psz
, n
);
3660 size_t len
= strlen(psz
);
// c <= len: the terminating NUL is copied as well
3664 for (size_t c
= 0; c
<= len
; c
++)
3665 buf
[c
] = (unsigned char)(psz
[c
]);
// Convert psz to the multibyte encoding: forwarded to the real converter;
// the visible fall back narrows each wide character with a plain cast.
// Both loops can return wxCONV_FAILED.
// NOTE(review): the guards around the forwarding return, the conditions
// leading to wxCONV_FAILED and the final return are not visible here.
3671 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3673 CreateConvIfNeeded();
3676 return m_convReal
->WC2MB(buf
, psz
, n
);
3679 const size_t len
= wxWcslen(psz
);
// c <= len: the terminating NUL is processed as well
3682 for (size_t c
= 0; c
<= len
; c
++)
3685 return wxCONV_FAILED
;
3687 buf
[c
] = (char)psz
[c
];
// second pass over the same range that does not write to buf in the visible code
3692 for (size_t c
= 0; c
<= len
; c
++)
3695 return wxCONV_FAILED
;
// Length of the multibyte terminator: taken from the real converter.
// NOTE(review): the guard on the forwarding return and the value returned
// for the ISO-8859-1 case are not visible here.
3702 size_t wxCSConv::GetMBNulLen() const
3704 CreateConvIfNeeded();
3708 return m_convReal
->GetMBNulLen();
3711 // otherwise, we are ISO-8859-1
3715 #if wxUSE_UNICODE_UTF8
// Whether the conversion is UTF-8: the answer comes from the real converter.
// NOTE(review): the guard on the forwarding return and the value returned
// for the ISO-8859-1 case are not visible here.
3716 bool wxCSConv::IsUTF8() const
3718 CreateConvIfNeeded();
3722 return m_convReal
->IsUTF8();
3725 // otherwise, we are ISO-8859-1
// Convert s to a wide buffer using wxConvLibc first; the later assignments
// retry with UTF-8 and then with ISO-8859-1.
// NOTE(review): the condition for the early empty return and the tests
// between the attempts are not visible here - presumably each retry runs
// only when the previous attempt produced nothing.
3733 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3736 return wxWCharBuffer();
3738 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3740 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3742 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Convert ws to a multibyte buffer using wxConvLibc first; the later
// assignment retries with a UTF-8 converter constructed with
// MAP_INVALID_UTF8_TO_OCTAL.
// NOTE(review): the condition for the early empty return, the test before
// the retry and the final return are not visible here.
3747 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3750 return wxCharBuffer();
3752 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3754 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3759 #endif // wxUSE_UNICODE
3761 // ----------------------------------------------------------------------------
3763 // ----------------------------------------------------------------------------
3765 // NB: The reason why we create converter objects in this convoluted way,
3766 // using a factory function instead of global variable, is that they
3767 // may be used at static initialization time (some of them are used by
3768 // wxString ctors and there may be a global wxString object). In other
3769 // words, possibly _before_ the converter global object would be
3776 #undef wxConvISO8859_1
// For a converter called name this macro emits three things: an exported
// pointer name-Ptr initialised to NULL, an accessor wxGet_-name-Ptr() that
// returns the address of a function-local static impl_klass object built
// with ctor_args, and a file-static pointer initialised by calling that
// accessor. klass is the type seen by callers, impl_klass the type created.
3778 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3779 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3780 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3782 static impl_klass name##Obj ctor_args; \
3783 return &name##Obj; \
3785 /* this ensures that all global converter objects are created */ \
3786 /* by the time static initialization is done, i.e. before any */ \
3787 /* thread is launched: */ \
3788 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for WX_DEFINE_GLOBAL_CONV2 when the public type and the
// implementation type are the same class.
3790 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3791 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// Definitions of the global converter objects. wxConvLibc is implemented by
// wxMBConv_win32, wxMBConv_mac or wxMBConvLibc depending on the platform.
// NOTE(review): the opening condition of this preprocessor chain is not visible here.
3794 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3795 #elif defined(__WXMAC__) && !defined(__MACH__)
3796 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_mac
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3798 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3801 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8
, wxConvUTF8
, wxEMPTY_PARAMETER_VALUE
);
3802 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, wxEMPTY_PARAMETER_VALUE
);
// wxCSConv based converters for the system encoding and for ISO-8859-1
3804 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3805 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// wxConvCurrent starts out as the libc converter, wxConvUI as the local one
3807 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3808 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3810 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3811 static wxMBConv_macUTF8D wxConvMacUTF8DObj
;
// converter used for file names; the initialiser depends on the platform
// NOTE(review): not every branch of the initialiser is visible here
3813 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3815 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3818 wxGet_wxConvUTF8Ptr();
3821 wxGet_wxConvLibcPtr();
3822 #endif // __WXOSX__/!__WXOSX__
3824 #else // !wxUSE_WCHAR_T
3826 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3827 // stand-ins in absence of wchar_t
// NOTE(review): the remainder of this declaration is not visible here
3828 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3833 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T