1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
53 #include "wx/thread.h"
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
60 #include <CoreFoundation/CFString.h>
61 #include <CoreFoundation/CFStringEncodingExt.h>
62 #endif //def __DARWIN__
66 #include <ATSUnicode.h>
67 #include <TextCommon.h>
68 #include <TextEncodingConverter.h>
71 // includes Mac headers
72 #include "wx/mac/private.h"
76 #define TRACE_STRCONV _T("strconv")
78 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
80 #if SIZEOF_WCHAR_T == 2
85 // ============================================================================
87 // ============================================================================
89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
// Judging by its name and by its call sites (which treat a false result as
// "the buffer ends with a proper NUL terminator"), this reports whether at
// least one of the n bytes starting at p is not NUL.
//
// The visible loop keeps consuming bytes for as long as there are bytes left
// and each byte read is '\0'; the loop body and the return statement are not
// part of this excerpt.
90 static bool NotAllNULs(const char *p
, size_t n
)
92 while ( n
&& *p
++ == '\0' )
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
100 // ----------------------------------------------------------------------------
// Encodes the single UCS-4 value `input` as UTF-16 into `output`.
//
// Three cases are visible below:
//  - the value is stored as one 16 bit unit (the condition guarding this
//    branch is not part of this excerpt),
//  - values >= 0x110000 are rejected with wxCONV_FAILED,
//  - otherwise two units (a surrogate pair) are written.
//
// NOTE(review): the success return statements are not visible here; callers
// in this file compare the result with 2 to detect the surrogate pair case,
// so presumably the number of 16 bit units produced is returned -- confirm.
102 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
107 *output
= (wxUint16
) input
;
111 else if (input
>= 0x110000)
113 return wxCONV_FAILED
;
// first unit of the pair: 0xd7c0 is 0xd800 - (0x10000 >> 10), so adding it to
// (input >> 10) subtracts the 0x10000 offset and adds the 0xd800 base at once
119 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
// second unit of the pair: the low 10 bits of the value on top of 0xdc00
120 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
// Decodes one UCS-4 value from the UTF-16 unit(s) starting at `input` and
// stores it in `output`.
//
// A first unit outside of 0xd800..0xdfff is handled by the first branch (its
// body is not part of this excerpt). Otherwise the second unit must lie in
// 0xdc00..0xdfff, else wxCONV_FAILED is returned.
//
// NOTE(review): the visible checks accept any first unit in 0xd800..0xdfff,
// i.e. they do not reject a first unit which is itself in the 0xdc00..0xdfff
// range -- confirm whether this is intended.
// NOTE(review): the success return statements are not visible here; callers
// in this file compare the result with 2, so presumably the number of units
// consumed is returned -- confirm.
127 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
129 if ((*input
< 0xd800) || (*input
> 0xdfff))
134 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
137 return wxCONV_FAILED
;
// inverse of the pair construction in encode_utf16(): subtracting 0xd7c0
// (rather than 0xd800) from the first unit restores the 0x10000 offset
141 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
147 typedef wchar_t wxDecodeSurrogate_t
;
149 typedef wxUint16 wxDecodeSurrogate_t
;
150 #endif // WC_UTF16/!WC_UTF16
152 // returns the next UTF-32 character from the wchar_t buffer and advances the
153 // pointer to the character after this one
155 // if an invalid character is found, *pSrc is set to NULL, the caller must
157 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
161 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
162 if ( n
== wxCONV_FAILED
)
170 // ----------------------------------------------------------------------------
172 // ----------------------------------------------------------------------------
175 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
176 const char *src
, size_t srcLen
) const
178 // although new conversion classes are supposed to implement this function
179 // directly, the existing ones only implement the old MB2WC() and so, to
180 // avoid having to rewrite all conversion classes at once, we provide a
181 // default (but not efficient) implementation of this one in terms of the
182 // old function by copying the input to ensure that it's NUL-terminated and
183 // then using MB2WC() to convert it
185 // the number of chars [which would be] written to dst [if it were not NULL]
186 size_t dstWritten
= 0;
188 // the number of NULs terminating this string
189 size_t nulLen
= 0; // not really needed, but just to avoid warnings
191 // if we were not given the input size we just have to assume that the
192 // string is properly terminated as we have no way of knowing how long it
193 // is anyhow, but if we do have the size check whether there are enough
197 if ( srcLen
!= wxNO_LEN
)
199 // we need to know how to find the end of this string
200 nulLen
= GetMBNulLen();
201 if ( nulLen
== wxCONV_FAILED
)
202 return wxCONV_FAILED
;
204 // if there are enough NULs we can avoid the copy
205 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
207 // make a copy in order to properly NUL-terminate the string
208 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
209 char * const p
= bufTmp
.data();
210 memcpy(p
, src
, srcLen
);
211 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
217 srcEnd
= src
+ srcLen
;
219 else // quit after the first loop iteration
226 // try to convert the current chunk
227 size_t lenChunk
= MB2WC(NULL
, src
, 0);
228 if ( lenChunk
== wxCONV_FAILED
)
229 return wxCONV_FAILED
;
231 lenChunk
++; // for the L'\0' at the end of this chunk
233 dstWritten
+= lenChunk
;
237 // nothing left in the input string, conversion succeeded
243 if ( dstWritten
> dstLen
)
244 return wxCONV_FAILED
;
246 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
247 return wxCONV_FAILED
;
254 // we convert just one chunk in this case as this is the entire
259 // advance the input pointer past the end of this chunk
260 while ( NotAllNULs(src
, nulLen
) )
262 // notice that we must skip over multiple bytes here as we suppose
263 // that if NUL takes 2 or 4 bytes, then all the other characters do
264 // too and so if advanced by a single byte we might erroneously
265 // detect sequences of NUL bytes in the middle of the input
269 src
+= nulLen
; // skipping over its terminator as well
271 // note that ">=" (and not just "==") is needed here as the terminator
272 // we skipped just above could be inside or just after the buffer
273 // delimited by srcEnd
282 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
283 const wchar_t *src
, size_t srcLen
) const
285 // the number of chars [which would be] written to dst [if it were not NULL]
286 size_t dstWritten
= 0;
288 // make a copy of the input string unless it is already properly
291 // if we don't know its length we have no choice but to assume that it is,
292 // indeed, properly terminated
293 wxWCharBuffer bufTmp
;
294 if ( srcLen
== wxNO_LEN
)
296 srcLen
= wxWcslen(src
) + 1;
298 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
300 // make a copy in order to properly NUL-terminate the string
301 bufTmp
= wxWCharBuffer(srcLen
);
302 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
306 const size_t lenNul
= GetMBNulLen();
307 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
309 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
311 // try to convert the current chunk
312 size_t lenChunk
= WC2MB(NULL
, src
, 0);
314 if ( lenChunk
== wxCONV_FAILED
)
315 return wxCONV_FAILED
;
318 dstWritten
+= lenChunk
;
322 if ( dstWritten
> dstLen
)
323 return wxCONV_FAILED
;
325 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
326 return wxCONV_FAILED
;
// Default implementation of the old-style conversion function in terms of
// ToWChar(): outBuff, outLen and inBuff are forwarded and the source length
// argument is not passed, so whatever default ToWChar() declares for it is
// used.
//
// On success the result is adjusted as explained in the comment below (the
// adjusting statement and the final return are not part of this excerpt).
335 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
337 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
338 if ( rc
!= wxCONV_FAILED
)
340 // ToWChar() returns the buffer length, i.e. including the trailing
341 // NUL, while this method doesn't take it into account
// Default implementation of the old-style conversion function in terms of
// FromWChar(): outBuff, outLen and inBuff are forwarded and the source length
// argument is not passed, so whatever default FromWChar() declares for it is
// used.
//
// NOTE(review): what is done with a successful result is not visible in this
// excerpt; presumably it mirrors the adjustment made in MB2WC() -- confirm.
348 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
350 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
351 if ( rc
!= wxCONV_FAILED
)
359 wxMBConv::~wxMBConv()
361 // nothing to do here (necessary for Darwin linking probably)
364 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
368 // calculate the length of the buffer needed first
369 const size_t nLen
= MB2WC(NULL
, psz
, 0);
370 if ( nLen
!= wxCONV_FAILED
)
372 // now do the actual conversion
373 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
375 // +1 for the trailing NUL
376 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
381 return wxWCharBuffer();
384 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
388 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
389 if ( nLen
!= wxCONV_FAILED
)
391 // extra space for trailing NUL(s)
392 static const size_t extraLen
= GetMaxMBNulLen();
394 wxCharBuffer
buf(nLen
+ extraLen
- 1);
395 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
400 return wxCharBuffer();
404 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
406 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
407 if ( dstLen
!= wxCONV_FAILED
)
409 wxWCharBuffer
wbuf(dstLen
- 1);
410 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
415 if ( wbuf
[dstLen
- 1] == L
'\0' )
426 return wxWCharBuffer();
430 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
432 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
433 if ( dstLen
!= wxCONV_FAILED
)
435 // special case of empty input: can't allocate 0 size buffer below as
436 // wxCharBuffer insists on NUL-terminating it
437 wxCharBuffer
buf(dstLen
? dstLen
- 1 : 1);
438 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
444 const size_t nulLen
= GetMBNulLen();
445 if ( dstLen
>= nulLen
&&
446 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
448 // in this case the output is NUL-terminated and we're not
449 // supposed to count NUL
461 return wxCharBuffer();
464 // ----------------------------------------------------------------------------
466 // ----------------------------------------------------------------------------
// Thin wrapper: passes buf, psz and n unchanged to wxMB2WC() and returns its
// result.
468 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
470 return wxMB2WC(buf
, psz
, n
);
// Thin wrapper: passes buf, psz and n unchanged to wxWC2MB() and returns its
// result.
473 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
475 return wxWC2MB(buf
, psz
, n
);
478 // ----------------------------------------------------------------------------
479 // wxConvBrokenFileNames
480 // ----------------------------------------------------------------------------
// Selects the converter stored in m_conv from the charset name.
//
// If wxStricmp() reports charset as equal to "UTF-8" or "UTF8", a
// wxMBConvUTF8 created with the MAP_INVALID_UTF8_TO_PUA option is used;
// the other visible assignment creates a wxCSConv for the given charset (the
// line separating the two branches is not part of this excerpt).
//
// NOTE(review): m_conv is allocated with new here; where it is released is
// not visible in this excerpt.
484 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
486 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
487 wxStricmp(charset
, _T("UTF8")) == 0 )
488 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
490 m_conv
= new wxCSConv(charset
);
495 // ----------------------------------------------------------------------------
497 // ----------------------------------------------------------------------------
499 // Implementation (C) 2004 Fredrik Roubert
502 // BASE64 decoding table
504 static const unsigned char utf7unb64
[] =
506 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
512 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
513 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
515 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
516 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
517 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
519 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
520 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
521 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
533 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
535 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
536 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
540 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
544 while ( *psz
&& (!buf
|| (len
< n
)) )
546 unsigned char cc
= *psz
++;
554 else if (*psz
== '-')
562 else // start of BASE64 encoded string
566 for ( ok
= lsb
= false, d
= 0, l
= 0;
567 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
572 for (l
+= 6; l
>= 8; lsb
= !lsb
)
574 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
584 *buf
= (wchar_t)(c
<< 8);
593 // in valid UTF7 we should have valid characters after '+'
594 return wxCONV_FAILED
;
602 if ( buf
&& (len
< n
) )
609 // BASE64 encoding table
611 static const unsigned char utf7enb64
[] =
613 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
614 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
615 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
616 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
617 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
618 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
619 'w', 'x', 'y', 'z', '0', '1', '2', '3',
620 '4', '5', '6', '7', '8', '9', '+', '/'
624 // UTF-7 encoding table
626 // 0 - Set D (directly encoded characters)
627 // 1 - Set O (optional direct characters)
628 // 2 - whitespace characters (optional)
629 // 3 - special characters
631 static const unsigned char utf7encode
[128] =
633 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
634 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
635 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
636 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
637 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
639 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
643 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
647 while (*psz
&& ((!buf
) || (len
< n
)))
650 if (cc
< 0x80 && utf7encode
[cc
] < 1)
659 else if (((wxUint32
)cc
) > 0xffff)
661 // no surrogate pair generation (yet?)
662 return wxCONV_FAILED
;
673 // BASE64 encode string
674 unsigned int lsb
, d
, l
;
675 for (d
= 0, l
= 0; /*nothing*/; psz
++)
677 for (lsb
= 0; lsb
< 2; lsb
++)
680 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
682 for (l
+= 8; l
>= 6; )
686 *buf
++ = utf7enb64
[(d
>> l
) % 64];
692 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
699 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
711 if (buf
&& (len
< n
))
717 // ----------------------------------------------------------------------------
719 // ----------------------------------------------------------------------------
721 static wxUint32 utf8_max
[]=
722 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
724 // boundaries of the private use area we use to (temporarily) remap bytes
725 // which are invalid in a UTF-8 encoded string
726 const wxUint32 wxUnicodePUA
= 0x100000;
727 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
729 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
733 while (*psz
&& ((!buf
) || (len
< n
)))
735 const char *opsz
= psz
;
736 bool invalid
= false;
737 unsigned char cc
= *psz
++, fc
= cc
;
739 for (cnt
= 0; fc
& 0x80; cnt
++)
749 // escape the escape character for octal escapes
750 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
751 && cc
== '\\' && (!buf
|| len
< n
))
763 // invalid UTF-8 sequence
768 unsigned ocnt
= cnt
- 1;
769 wxUint32 res
= cc
& (0x3f >> cnt
);
773 if ((cc
& 0xC0) != 0x80)
775 // invalid UTF-8 sequence
781 res
= (res
<< 6) | (cc
& 0x3f);
784 if (invalid
|| res
<= utf8_max
[ocnt
])
786 // illegal UTF-8 encoding
789 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
790 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
792 // if one of our PUA characters turns up externally
793 // it must also be treated as an illegal sequence
794 // (a bit like you have to escape an escape character)
800 // cast is ok because wchar_t == wxUint16 if WC_UTF16
801 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
802 if (pa
== wxCONV_FAILED
)
814 *buf
++ = (wchar_t)res
;
816 #endif // WC_UTF16/!WC_UTF16
822 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
824 while (opsz
< psz
&& (!buf
|| len
< n
))
827 // cast is ok because wchar_t == wxUint16 if WC_UTF16
828 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
829 wxASSERT(pa
!= wxCONV_FAILED
);
836 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
842 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
844 while (opsz
< psz
&& (!buf
|| len
< n
))
846 if ( buf
&& len
+ 3 < n
)
848 unsigned char on
= *opsz
;
850 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
851 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
852 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
859 else // MAP_INVALID_UTF8_NOT
861 return wxCONV_FAILED
;
867 if (buf
&& (len
< n
))
// Returns true if wch is one of the wide characters L'0' to L'7' inclusive,
// i.e. an octal digit.
873 static inline bool isoctal(wchar_t wch
)
875 return L
'0' <= wch
&& wch
<= L
'7';
878 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
882 while (*psz
&& ((!buf
) || (len
< n
)))
887 // cast is ok for WC_UTF16
888 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
889 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
891 cc
= (*psz
++) & 0x7fffffff;
894 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
895 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
898 *buf
++ = (char)(cc
- wxUnicodePUA
);
901 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
902 && cc
== L
'\\' && psz
[0] == L
'\\' )
909 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
911 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
915 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
916 (psz
[1] - L
'0') * 010 +
926 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
942 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
944 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
950 if (buf
&& (len
< n
))
956 // ============================================================================
958 // ============================================================================
960 #ifdef WORDS_BIGENDIAN
961 #define wxMBConvUTF16straight wxMBConvUTF16BE
962 #define wxMBConvUTF16swap wxMBConvUTF16LE
964 #define wxMBConvUTF16swap wxMBConvUTF16BE
965 #define wxMBConvUTF16straight wxMBConvUTF16LE
// Works out the length of the UTF-16 input.
//
// If srcLen is wxNO_LEN, the input is scanned as 16 bit units up to the first
// zero unit and the resulting count is multiplied by BYTES_PER_CHAR.
// Otherwise the given length is validated: a value which is not a multiple of
// BYTES_PER_CHAR yields wxCONV_FAILED.
//
// NOTE(review): the final return statement is not part of this excerpt.
// NOTE(review): the callers in this file compare the result with wxNO_LEN
// while the failure value returned here is wxCONV_FAILED; this only works if
// both constants have the same value -- confirm.
969 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
971 if ( srcLen
== wxNO_LEN
)
973 // count the number of bytes in input, including the trailing NULs
974 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
// the count starts at 1 so that the terminating zero unit is included
975 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
// convert the number of 16 bit units to the number of bytes
978 srcLen
*= BYTES_PER_CHAR
;
980 else // we already have the length
982 // we can only convert an entire number of UTF-16 characters
983 if ( srcLen
% BYTES_PER_CHAR
)
984 return wxCONV_FAILED
;
990 // case when in-memory representation is UTF-16 too
993 // ----------------------------------------------------------------------------
994 // conversions without endianness change
995 // ----------------------------------------------------------------------------
998 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
999 const char *src
, size_t srcLen
) const
1001 // set up the scene for using memcpy() (which is presumably more efficient
1002 // than copying the bytes one by one)
1003 srcLen
= GetLength(src
, srcLen
);
1004 if ( srcLen
== wxNO_LEN
)
1005 return wxCONV_FAILED
;
1007 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1010 if ( dstLen
< inLen
)
1011 return wxCONV_FAILED
;
1013 memcpy(dst
, src
, srcLen
);
1020 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1021 const wchar_t *src
, size_t srcLen
) const
1023 if ( srcLen
== wxNO_LEN
)
1024 srcLen
= wxWcslen(src
) + 1;
1026 srcLen
*= BYTES_PER_CHAR
;
1030 if ( dstLen
< srcLen
)
1031 return wxCONV_FAILED
;
1033 memcpy(dst
, src
, srcLen
);
1039 // ----------------------------------------------------------------------------
1040 // endian-reversing conversions
1041 // ----------------------------------------------------------------------------
1044 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1045 const char *src
, size_t srcLen
) const
1047 srcLen
= GetLength(src
, srcLen
);
1048 if ( srcLen
== wxNO_LEN
)
1049 return wxCONV_FAILED
;
1051 srcLen
/= BYTES_PER_CHAR
;
1055 if ( dstLen
< srcLen
)
1056 return wxCONV_FAILED
;
1058 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1059 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1061 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1069 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1070 const wchar_t *src
, size_t srcLen
) const
1072 if ( srcLen
== wxNO_LEN
)
1073 srcLen
= wxWcslen(src
) + 1;
1075 srcLen
*= BYTES_PER_CHAR
;
1079 if ( dstLen
< srcLen
)
1080 return wxCONV_FAILED
;
1082 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1083 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1085 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1092 #else // !WC_UTF16: wchar_t is UTF-32
1094 // ----------------------------------------------------------------------------
1095 // conversions without endianness change
1096 // ----------------------------------------------------------------------------
1099 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1100 const char *src
, size_t srcLen
) const
1102 srcLen
= GetLength(src
, srcLen
);
1103 if ( srcLen
== wxNO_LEN
)
1104 return wxCONV_FAILED
;
1106 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1109 // optimization: return maximal space which could be needed for this
1110 // string even if the real size could be smaller if the buffer contains
1116 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1117 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1119 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1121 return wxCONV_FAILED
;
1123 if ( ++outLen
> dstLen
)
1124 return wxCONV_FAILED
;
1134 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1135 const wchar_t *src
, size_t srcLen
) const
1137 if ( srcLen
== wxNO_LEN
)
1138 srcLen
= wxWcslen(src
) + 1;
1141 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1142 for ( size_t n
= 0; n
< srcLen
; n
++ )
1145 const size_t numChars
= encode_utf16(*src
++, cc
);
1146 if ( numChars
== wxCONV_FAILED
)
1147 return wxCONV_FAILED
;
1149 outLen
+= numChars
* BYTES_PER_CHAR
;
1152 if ( outLen
> dstLen
)
1153 return wxCONV_FAILED
;
1156 if ( numChars
== 2 )
1158 // second character of a surrogate
1167 // ----------------------------------------------------------------------------
1168 // endian-reversing conversions
1169 // ----------------------------------------------------------------------------
1172 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1173 const char *src
, size_t srcLen
) const
1175 srcLen
= GetLength(src
, srcLen
);
1176 if ( srcLen
== wxNO_LEN
)
1177 return wxCONV_FAILED
;
1179 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1182 // optimization: return maximal space which could be needed for this
1183 // string even if the real size could be smaller if the buffer contains
1189 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1190 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1195 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1197 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1199 const size_t numChars
= decode_utf16(tmp
, ch
);
1200 if ( numChars
== wxCONV_FAILED
)
1201 return wxCONV_FAILED
;
1203 if ( numChars
== 2 )
1206 if ( ++outLen
> dstLen
)
1207 return wxCONV_FAILED
;
1217 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1218 const wchar_t *src
, size_t srcLen
) const
1220 if ( srcLen
== wxNO_LEN
)
1221 srcLen
= wxWcslen(src
) + 1;
1224 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1225 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1228 const size_t numChars
= encode_utf16(*src
, cc
);
1229 if ( numChars
== wxCONV_FAILED
)
1230 return wxCONV_FAILED
;
1232 outLen
+= numChars
* BYTES_PER_CHAR
;
1235 if ( outLen
> dstLen
)
1236 return wxCONV_FAILED
;
1238 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1239 if ( numChars
== 2 )
1241 // second character of a surrogate
1242 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1250 #endif // WC_UTF16/!WC_UTF16
1253 // ============================================================================
1255 // ============================================================================
1257 #ifdef WORDS_BIGENDIAN
1258 #define wxMBConvUTF32straight wxMBConvUTF32BE
1259 #define wxMBConvUTF32swap wxMBConvUTF32LE
1261 #define wxMBConvUTF32swap wxMBConvUTF32BE
1262 #define wxMBConvUTF32straight wxMBConvUTF32LE
1266 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1267 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
// Works out the length of the UTF-32 input.
//
// If srcLen is wxNO_LEN, the input is scanned as 32 bit units up to the first
// zero unit and the resulting count is multiplied by BYTES_PER_CHAR.
// Otherwise the given length is validated: a value which is not a multiple of
// BYTES_PER_CHAR yields wxCONV_FAILED.
//
// NOTE(review): the final return statement is not part of this excerpt.
// NOTE(review): the callers in this file compare the result with wxNO_LEN
// while the failure value returned here is wxCONV_FAILED; this only works if
// both constants have the same value -- confirm.
1270 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1272 if ( srcLen
== wxNO_LEN
)
1274 // count the number of bytes in input, including the trailing NULs
1275 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
// the count starts at 1 so that the terminating zero unit is included
1276 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
// convert the number of 32 bit units to the number of bytes
1279 srcLen
*= BYTES_PER_CHAR
;
1281 else // we already have the length
1283 // we can only convert an entire number of UTF-32 characters
1284 if ( srcLen
% BYTES_PER_CHAR
)
1285 return wxCONV_FAILED
;
1291 // case when in-memory representation is UTF-16
1294 // ----------------------------------------------------------------------------
1295 // conversions without endianness change
1296 // ----------------------------------------------------------------------------
1299 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1300 const char *src
, size_t srcLen
) const
1302 srcLen
= GetLength(src
, srcLen
);
1303 if ( srcLen
== wxNO_LEN
)
1304 return wxCONV_FAILED
;
1306 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1307 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1309 for ( size_t n
= 0; n
< inLen
; n
++ )
1312 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1313 if ( numChars
== wxCONV_FAILED
)
1314 return wxCONV_FAILED
;
1319 if ( outLen
> dstLen
)
1320 return wxCONV_FAILED
;
1323 if ( numChars
== 2 )
1325 // second character of a surrogate
1335 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1336 const wchar_t *src
, size_t srcLen
) const
1338 if ( srcLen
== wxNO_LEN
)
1339 srcLen
= wxWcslen(src
) + 1;
1343 // optimization: return maximal space which could be needed for this
1344 // string instead of the exact amount which could be less if there are
1345 // any surrogates in the input
1347 // we consider that surrogates are rare enough to make it worthwhile to
1348 // avoid running the loop below at the cost of slightly extra memory
1350 return srcLen
* BYTES_PER_CHAR
;
1353 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1355 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1357 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1359 return wxCONV_FAILED
;
1361 outLen
+= BYTES_PER_CHAR
;
1363 if ( outLen
> dstLen
)
1364 return wxCONV_FAILED
;
1372 // ----------------------------------------------------------------------------
1373 // endian-reversing conversions
1374 // ----------------------------------------------------------------------------
1377 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1378 const char *src
, size_t srcLen
) const
1380 srcLen
= GetLength(src
, srcLen
);
1381 if ( srcLen
== wxNO_LEN
)
1382 return wxCONV_FAILED
;
1384 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1385 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1387 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1390 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1391 if ( numChars
== wxCONV_FAILED
)
1392 return wxCONV_FAILED
;
1397 if ( outLen
> dstLen
)
1398 return wxCONV_FAILED
;
1401 if ( numChars
== 2 )
1403 // second character of a surrogate
1413 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1414 const wchar_t *src
, size_t srcLen
) const
1416 if ( srcLen
== wxNO_LEN
)
1417 srcLen
= wxWcslen(src
) + 1;
1421 // optimization: return maximal space which could be needed for this
1422 // string instead of the exact amount which could be less if there are
1423 // any surrogates in the input
1425 // we consider that surrogates are rare enough to make it worthwhile to
1426 // avoid running the loop below at the cost of slightly extra memory
1428 return srcLen
*BYTES_PER_CHAR
;
1431 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1433 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1435 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1437 return wxCONV_FAILED
;
1439 outLen
+= BYTES_PER_CHAR
;
1441 if ( outLen
> dstLen
)
1442 return wxCONV_FAILED
;
1444 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1450 #else // !WC_UTF16: wchar_t is UTF-32
1452 // ----------------------------------------------------------------------------
1453 // conversions without endianness change
1454 // ----------------------------------------------------------------------------
1457 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1458 const char *src
, size_t srcLen
) const
1460 // use memcpy() as it should be much faster than hand-written loop
1461 srcLen
= GetLength(src
, srcLen
);
1462 if ( srcLen
== wxNO_LEN
)
1463 return wxCONV_FAILED
;
1465 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1468 if ( dstLen
< inLen
)
1469 return wxCONV_FAILED
;
1471 memcpy(dst
, src
, srcLen
);
1478 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1479 const wchar_t *src
, size_t srcLen
) const
1481 if ( srcLen
== wxNO_LEN
)
1482 srcLen
= wxWcslen(src
) + 1;
1484 srcLen
*= BYTES_PER_CHAR
;
1488 if ( dstLen
< srcLen
)
1489 return wxCONV_FAILED
;
1491 memcpy(dst
, src
, srcLen
);
1497 // ----------------------------------------------------------------------------
1498 // endian-reversing conversions
1499 // ----------------------------------------------------------------------------
1502 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1503 const char *src
, size_t srcLen
) const
1505 srcLen
= GetLength(src
, srcLen
);
1506 if ( srcLen
== wxNO_LEN
)
1507 return wxCONV_FAILED
;
1509 srcLen
/= BYTES_PER_CHAR
;
1513 if ( dstLen
< srcLen
)
1514 return wxCONV_FAILED
;
1516 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1517 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1519 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1527 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1528 const wchar_t *src
, size_t srcLen
) const
1530 if ( srcLen
== wxNO_LEN
)
1531 srcLen
= wxWcslen(src
) + 1;
1533 srcLen
*= BYTES_PER_CHAR
;
1537 if ( dstLen
< srcLen
)
1538 return wxCONV_FAILED
;
1540 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1541 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1543 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1550 #endif // WC_UTF16/!WC_UTF16
1553 // ============================================================================
1554 // The classes doing conversion using the iconv_xxx() functions
1555 // ============================================================================
1559 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1560 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1561 // (unless there's yet another bug in glibc) the only case when iconv()
1562 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1563 // left in the input buffer -- when _real_ error occurs,
1564 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1566 // [This bug does not appear in glibc 2.2.]
1567 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1568 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1569 (errno != E2BIG || bufLeft != 0))
1571 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1574 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1576 #define ICONV_T_INVALID ((iconv_t)-1)
1578 #if SIZEOF_WCHAR_T == 4
1579 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1580 #define WC_ENC wxFONTENCODING_UTF32
1581 #elif SIZEOF_WCHAR_T == 2
1582 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1583 #define WC_ENC wxFONTENCODING_UTF16
1584 #else // sizeof(wchar_t) != 2 nor 4
1585 // does this ever happen?
1586 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1589 // ----------------------------------------------------------------------------
1590 // wxMBConv_iconv: encapsulates an iconv character set
1591 // ----------------------------------------------------------------------------
1593 class wxMBConv_iconv
: public wxMBConv
1596 wxMBConv_iconv(const char *name
);
1597 virtual ~wxMBConv_iconv();
1599 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1600 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1602 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1603 virtual size_t GetMBNulLen() const;
1605 #if wxUSE_UNICODE_UTF8
1606 virtual bool IsUTF8() const;
1609 virtual wxMBConv
*Clone() const
1611 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1612 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1617 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1620 // the iconv handlers used to translate from multibyte
1621 // to wide char and in the other direction
1626 // guards access to m2w and w2m objects
1627 wxMutex m_iconvMutex
;
1631 // the name (for iconv_open()) of a wide char charset -- if none is
1632 // available on this machine, it will remain NULL
1633 static wxString ms_wcCharsetName
;
1635 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1636 // different endian-ness than the native one
1637 static bool ms_wcNeedsSwap
;
1640 // name of the encoding handled by this conversion
1643 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1645 size_t m_minMBCharWidth
;
1648 // make the constructor available for unit testing
1649 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
1651 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1652 if ( !result
->IsOk() )
1661 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1662 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1664 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
1667 m_minMBCharWidth
= 0;
1669 // check for charset that represents wchar_t:
1670 if ( ms_wcCharsetName
.empty() )
1672 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1675 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1676 #else // !wxUSE_FONTMAP
1677 static const wxChar
*names_static
[] =
1679 #if SIZEOF_WCHAR_T == 4
1681 #elif SIZEOF_WCHAR_T = 2
1686 const wxChar
**names
= names_static
;
1687 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1689 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1691 const wxString
nameCS(*names
);
1693 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1694 wxString
nameXE(nameCS
);
1696 #ifdef WORDS_BIGENDIAN
1698 #else // little endian
1702 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1705 m2w
= iconv_open(nameXE
.ToAscii(), name
);
1706 if ( m2w
== ICONV_T_INVALID
)
1708 // try charset w/o bytesex info (e.g. "UCS4")
1709 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1711 m2w
= iconv_open(nameCS
.ToAscii(), name
);
1713 // and check for bytesex ourselves:
1714 if ( m2w
!= ICONV_T_INVALID
)
1716 char buf
[2], *bufPtr
;
1717 wchar_t wbuf
[2], *wbufPtr
;
1725 outsz
= SIZEOF_WCHAR_T
* 2;
1730 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1731 (char**)&wbufPtr
, &outsz
);
1733 if (ICONV_FAILED(res
, insz
))
1735 wxLogLastError(wxT("iconv"));
1736 wxLogError(_("Conversion to charset '%s' doesn't work."),
1739 else // ok, can convert to this encoding, remember it
1741 ms_wcCharsetName
= nameCS
;
1742 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1746 else // use charset not requiring byte swapping
1748 ms_wcCharsetName
= nameXE
;
1752 wxLogTrace(TRACE_STRCONV
,
1753 wxT("iconv wchar_t charset is \"%s\"%s"),
1754 ms_wcCharsetName
.empty() ? wxString("<none>")
1756 ms_wcNeedsSwap
? _T(" (needs swap)")
1759 else // we already have ms_wcCharsetName
1761 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
1764 if ( ms_wcCharsetName
.empty() )
1766 w2m
= ICONV_T_INVALID
;
1770 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
1771 if ( w2m
== ICONV_T_INVALID
)
1773 wxLogTrace(TRACE_STRCONV
,
1774 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1775 ms_wcCharsetName
.c_str(), name
);
1780 wxMBConv_iconv::~wxMBConv_iconv()
1782 if ( m2w
!= ICONV_T_INVALID
)
1784 if ( w2m
!= ICONV_T_INVALID
)
1788 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1790 // find the string length: notice that must be done differently for
1791 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1793 const size_t nulLen
= GetMBNulLen();
1797 return wxCONV_FAILED
;
1800 inbuf
= strlen(psz
); // arguably more optimized than our version
1805 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1806 // they also have to start at character boundary and not span two
1807 // adjacent characters
1809 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1816 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1817 // Unfortunately there are a couple of global wxCSConv objects such as
1818 // wxConvLocal that are used all over wx code, so we have to make sure
1819 // the handle is used by at most one thread at the time. Otherwise
1820 // only a few wx classes would be safe to use from non-main threads
1821 // as MB<->WC conversion would fail "randomly".
1822 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1823 #endif // wxUSE_THREADS
1825 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1827 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1828 wchar_t *bufPtr
= buf
;
1829 const char *pszPtr
= psz
;
1833 // have destination buffer, convert there
1835 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1836 (char**)&bufPtr
, &outbuf
);
1837 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1841 // convert to native endianness
1842 for ( unsigned i
= 0; i
< res
; i
++ )
1843 buf
[n
] = WC_BSWAP(buf
[i
]);
1846 // NUL-terminate the string if there is any space left
1852 // no destination buffer... convert using temp buffer
1853 // to calculate destination buffer requirement
1860 outbuf
= 8 * SIZEOF_WCHAR_T
;
1863 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1864 (char**)&bufPtr
, &outbuf
);
1866 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
1868 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1871 if (ICONV_FAILED(cres
, inbuf
))
1873 //VS: it is ok if iconv fails, hence trace only
1874 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1875 return wxCONV_FAILED
;
1881 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1884 // NB: explained in MB2WC
1885 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1888 size_t inlen
= wxWcslen(psz
);
1889 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1893 wchar_t *tmpbuf
= 0;
1897 // need to copy to temp buffer to switch endianness
1898 // (doing WC_BSWAP twice on the original buffer won't help, as it
1899 // could be in read-only memory, or be accessed in some other thread)
1900 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1901 for ( size_t i
= 0; i
< inlen
; i
++ )
1902 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1904 tmpbuf
[inlen
] = L
'\0';
1910 // have destination buffer, convert there
1911 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1915 // NB: iconv was given only wcslen(psz) characters on input, and so
1916 // it couldn't convert the trailing zero. Let's do it ourselves
1917 // if there's some room left for it in the output buffer.
1923 // no destination buffer: convert using temp buffer
1924 // to calculate destination buffer requirement
1932 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1936 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1944 if (ICONV_FAILED(cres
, inbuf
))
1946 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1947 return wxCONV_FAILED
;
1953 size_t wxMBConv_iconv::GetMBNulLen() const
1955 if ( m_minMBCharWidth
== 0 )
1957 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1960 // NB: explained in MB2WC
1961 wxMutexLocker
lock(self
->m_iconvMutex
);
1964 const wchar_t *wnul
= L
"";
1965 char buf
[8]; // should be enough for NUL in any encoding
1966 size_t inLen
= sizeof(wchar_t),
1967 outLen
= WXSIZEOF(buf
);
1968 char *inBuff
= (char *)wnul
;
1969 char *outBuff
= buf
;
1970 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
1972 self
->m_minMBCharWidth
= (size_t)-1;
1976 self
->m_minMBCharWidth
= outBuff
- buf
;
1980 return m_minMBCharWidth
;
1983 #if wxUSE_UNICODE_UTF8
1984 bool wxMBConv_iconv::IsUTF8() const
1986 return wxStricmp(m_name
, "UTF-8") == 0 ||
1987 wxStricmp(m_name
, "UTF8") == 0;
1991 #endif // HAVE_ICONV
1994 // ============================================================================
1995 // Win32 conversion classes
1996 // ============================================================================
1998 #ifdef wxHAVE_WIN32_MB2WC
2002 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2003 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2006 class wxMBConv_win32
: public wxMBConv
2011 m_CodePage
= CP_ACP
;
2012 m_minMBCharWidth
= 0;
2015 wxMBConv_win32(const wxMBConv_win32
& conv
)
2018 m_CodePage
= conv
.m_CodePage
;
2019 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2023 wxMBConv_win32(const char* name
)
2025 m_CodePage
= wxCharsetToCodepage(name
);
2026 m_minMBCharWidth
= 0;
2029 wxMBConv_win32(wxFontEncoding encoding
)
2031 m_CodePage
= wxEncodingToCodepage(encoding
);
2032 m_minMBCharWidth
= 0;
2034 #endif // wxUSE_FONTMAP
2036 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2038 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2039 // the behaviour is not compatible with the Unix version (using iconv)
2040 // and break the library itself, e.g. wxTextInputStream::NextChar()
2041 // wouldn't work if reading an incomplete MB char didn't result in an
2044 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2045 // Win XP or newer and it is not supported for UTF-[78] so we always
2046 // use our own conversions in this case. See
2047 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2048 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2049 if ( m_CodePage
== CP_UTF8
)
2051 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2054 if ( m_CodePage
== CP_UTF7
)
2056 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2060 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2061 IsAtLeastWin2kSP4() )
2063 flags
= MB_ERR_INVALID_CHARS
;
2066 const size_t len
= ::MultiByteToWideChar
2068 m_CodePage
, // code page
2069 flags
, // flags: fall on error
2070 psz
, // input string
2071 -1, // its length (NUL-terminated)
2072 buf
, // output string
2073 buf
? n
: 0 // size of output buffer
2077 // function totally failed
2078 return wxCONV_FAILED
;
2081 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2082 // check if we succeeded, by doing a double trip:
2083 if ( !flags
&& buf
)
2085 const size_t mbLen
= strlen(psz
);
2086 wxCharBuffer
mbBuf(mbLen
);
2087 if ( ::WideCharToMultiByte
2094 mbLen
+ 1, // size in bytes, not length
2098 strcmp(mbBuf
, psz
) != 0 )
2100 // we didn't obtain the same thing we started from, hence
2101 // the conversion was lossy and we consider that it failed
2102 return wxCONV_FAILED
;
2106 // note that it returns count of written chars for buf != NULL and size
2107 // of the needed buffer for buf == NULL so in either case the length of
2108 // the string (which never includes the terminating NUL) is one less
2112 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2115 we have a problem here: by default, WideCharToMultiByte() may
2116 replace characters unrepresentable in the target code page with bad
2117 quality approximations such as turning "1/2" symbol (U+00BD) into
2118 "1" for the code pages which don't have it and we, obviously, want
2119 to avoid this at any price
2121 the trouble is that this function does it _silently_, i.e. it won't
2122 even tell us whether it did or not... Win98/2000 and higher provide
2123 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2124 we have to resort to a round trip, i.e. check that converting back
2125 results in the same string -- this is, of course, expensive but
2126 otherwise we simply can't be sure to not garble the data.
2129 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2130 // it doesn't work with CJK encodings (which we test for rather roughly
2131 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2133 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2136 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2138 // it's our lucky day
2139 flags
= WC_NO_BEST_FIT_CHARS
;
2140 pUsedDef
= &usedDef
;
2142 else // old system or unsupported encoding
2148 const size_t len
= ::WideCharToMultiByte
2150 m_CodePage
, // code page
2151 flags
, // either none or no best fit
2152 pwz
, // input string
2153 -1, // it is (wide) NUL-terminated
2154 buf
, // output buffer
2155 buf
? n
: 0, // and its size
2156 NULL
, // default "replacement" char
2157 pUsedDef
// [out] was it used?
2162 // function totally failed
2163 return wxCONV_FAILED
;
2166 // if we were really converting, check if we succeeded
2171 // check if the conversion failed, i.e. if any replacements
2174 return wxCONV_FAILED
;
2176 else // we must resort to double tripping...
2178 wxWCharBuffer
wcBuf(n
);
2179 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2180 wcscmp(wcBuf
, pwz
) != 0 )
2182 // we didn't obtain the same thing we started from, hence
2183 // the conversion was lossy and we consider that it failed
2184 return wxCONV_FAILED
;
2189 // see the comment above for the reason of "len - 1"
2193 virtual size_t GetMBNulLen() const
2195 if ( m_minMBCharWidth
== 0 )
2197 int len
= ::WideCharToMultiByte
2199 m_CodePage
, // code page
2201 L
"", // input string
2202 1, // translate just the NUL
2203 NULL
, // output buffer
2205 NULL
, // no replacement char
2206 NULL
// [out] don't care if it was used
2209 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2213 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2214 self
->m_minMBCharWidth
= (size_t)-1;
2218 self
->m_minMBCharWidth
= (size_t)-1;
2224 self
->m_minMBCharWidth
= len
;
2229 return m_minMBCharWidth
;
2232 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2234 bool IsOk() const { return m_CodePage
!= -1; }
2237 static bool CanUseNoBestFit()
2239 static int s_isWin98Or2k
= -1;
2241 if ( s_isWin98Or2k
== -1 )
2244 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2246 case wxOS_WINDOWS_9X
:
2247 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2250 case wxOS_WINDOWS_NT
:
2251 s_isWin98Or2k
= verMaj
>= 5;
2255 // unknown: be conservative by default
2260 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2263 return s_isWin98Or2k
== 1;
2266 static bool IsAtLeastWin2kSP4()
2271 static int s_isAtLeastWin2kSP4
= -1;
2273 if ( s_isAtLeastWin2kSP4
== -1 )
2275 OSVERSIONINFOEX ver
;
2277 memset(&ver
, 0, sizeof(ver
));
2278 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2279 GetVersionEx((OSVERSIONINFO
*)&ver
);
2281 s_isAtLeastWin2kSP4
=
2282 ((ver
.dwMajorVersion
> 5) || // Vista+
2283 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2284 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2285 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2289 return s_isAtLeastWin2kSP4
== 1;
2294 // the code page we're working with
2297 // cached result of GetMBNulLen(), set to 0 initially meaning
2299 size_t m_minMBCharWidth
;
2302 #endif // wxHAVE_WIN32_MB2WC
2304 // ============================================================================
2305 // CoreFoundation conversion classes
2306 // ============================================================================
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a bit -
// it's just not public (yet).
2314 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2316 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2320 case wxFONTENCODING_DEFAULT
:
2321 enc
= CFStringGetSystemEncoding();
2324 case wxFONTENCODING_ISO8859_1
:
2325 enc
= kCFStringEncodingISOLatin1
;
2327 case wxFONTENCODING_ISO8859_2
:
2328 enc
= kCFStringEncodingISOLatin2
;
2330 case wxFONTENCODING_ISO8859_3
:
2331 enc
= kCFStringEncodingISOLatin3
;
2333 case wxFONTENCODING_ISO8859_4
:
2334 enc
= kCFStringEncodingISOLatin4
;
2336 case wxFONTENCODING_ISO8859_5
:
2337 enc
= kCFStringEncodingISOLatinCyrillic
;
2339 case wxFONTENCODING_ISO8859_6
:
2340 enc
= kCFStringEncodingISOLatinArabic
;
2342 case wxFONTENCODING_ISO8859_7
:
2343 enc
= kCFStringEncodingISOLatinGreek
;
2345 case wxFONTENCODING_ISO8859_8
:
2346 enc
= kCFStringEncodingISOLatinHebrew
;
2348 case wxFONTENCODING_ISO8859_9
:
2349 enc
= kCFStringEncodingISOLatin5
;
2351 case wxFONTENCODING_ISO8859_10
:
2352 enc
= kCFStringEncodingISOLatin6
;
2354 case wxFONTENCODING_ISO8859_11
:
2355 enc
= kCFStringEncodingISOLatinThai
;
2357 case wxFONTENCODING_ISO8859_13
:
2358 enc
= kCFStringEncodingISOLatin7
;
2360 case wxFONTENCODING_ISO8859_14
:
2361 enc
= kCFStringEncodingISOLatin8
;
2363 case wxFONTENCODING_ISO8859_15
:
2364 enc
= kCFStringEncodingISOLatin9
;
2367 case wxFONTENCODING_KOI8
:
2368 enc
= kCFStringEncodingKOI8_R
;
2370 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2371 enc
= kCFStringEncodingDOSRussian
;
2374 // case wxFONTENCODING_BULGARIAN :
2378 case wxFONTENCODING_CP437
:
2379 enc
= kCFStringEncodingDOSLatinUS
;
2381 case wxFONTENCODING_CP850
:
2382 enc
= kCFStringEncodingDOSLatin1
;
2384 case wxFONTENCODING_CP852
:
2385 enc
= kCFStringEncodingDOSLatin2
;
2387 case wxFONTENCODING_CP855
:
2388 enc
= kCFStringEncodingDOSCyrillic
;
2390 case wxFONTENCODING_CP866
:
2391 enc
= kCFStringEncodingDOSRussian
;
2393 case wxFONTENCODING_CP874
:
2394 enc
= kCFStringEncodingDOSThai
;
2396 case wxFONTENCODING_CP932
:
2397 enc
= kCFStringEncodingDOSJapanese
;
2399 case wxFONTENCODING_CP936
:
2400 enc
= kCFStringEncodingDOSChineseSimplif
;
2402 case wxFONTENCODING_CP949
:
2403 enc
= kCFStringEncodingDOSKorean
;
2405 case wxFONTENCODING_CP950
:
2406 enc
= kCFStringEncodingDOSChineseTrad
;
2408 case wxFONTENCODING_CP1250
:
2409 enc
= kCFStringEncodingWindowsLatin2
;
2411 case wxFONTENCODING_CP1251
:
2412 enc
= kCFStringEncodingWindowsCyrillic
;
2414 case wxFONTENCODING_CP1252
:
2415 enc
= kCFStringEncodingWindowsLatin1
;
2417 case wxFONTENCODING_CP1253
:
2418 enc
= kCFStringEncodingWindowsGreek
;
2420 case wxFONTENCODING_CP1254
:
2421 enc
= kCFStringEncodingWindowsLatin5
;
2423 case wxFONTENCODING_CP1255
:
2424 enc
= kCFStringEncodingWindowsHebrew
;
2426 case wxFONTENCODING_CP1256
:
2427 enc
= kCFStringEncodingWindowsArabic
;
2429 case wxFONTENCODING_CP1257
:
2430 enc
= kCFStringEncodingWindowsBalticRim
;
2432 // This only really encodes to UTF7 (if that) evidently
2433 // case wxFONTENCODING_UTF7 :
2434 // enc = kCFStringEncodingNonLossyASCII ;
2436 case wxFONTENCODING_UTF8
:
2437 enc
= kCFStringEncodingUTF8
;
2439 case wxFONTENCODING_EUC_JP
:
2440 enc
= kCFStringEncodingEUC_JP
;
2442 case wxFONTENCODING_UTF16
:
2443 enc
= kCFStringEncodingUnicode
;
2445 case wxFONTENCODING_MACROMAN
:
2446 enc
= kCFStringEncodingMacRoman
;
2448 case wxFONTENCODING_MACJAPANESE
:
2449 enc
= kCFStringEncodingMacJapanese
;
2451 case wxFONTENCODING_MACCHINESETRAD
:
2452 enc
= kCFStringEncodingMacChineseTrad
;
2454 case wxFONTENCODING_MACKOREAN
:
2455 enc
= kCFStringEncodingMacKorean
;
2457 case wxFONTENCODING_MACARABIC
:
2458 enc
= kCFStringEncodingMacArabic
;
2460 case wxFONTENCODING_MACHEBREW
:
2461 enc
= kCFStringEncodingMacHebrew
;
2463 case wxFONTENCODING_MACGREEK
:
2464 enc
= kCFStringEncodingMacGreek
;
2466 case wxFONTENCODING_MACCYRILLIC
:
2467 enc
= kCFStringEncodingMacCyrillic
;
2469 case wxFONTENCODING_MACDEVANAGARI
:
2470 enc
= kCFStringEncodingMacDevanagari
;
2472 case wxFONTENCODING_MACGURMUKHI
:
2473 enc
= kCFStringEncodingMacGurmukhi
;
2475 case wxFONTENCODING_MACGUJARATI
:
2476 enc
= kCFStringEncodingMacGujarati
;
2478 case wxFONTENCODING_MACORIYA
:
2479 enc
= kCFStringEncodingMacOriya
;
2481 case wxFONTENCODING_MACBENGALI
:
2482 enc
= kCFStringEncodingMacBengali
;
2484 case wxFONTENCODING_MACTAMIL
:
2485 enc
= kCFStringEncodingMacTamil
;
2487 case wxFONTENCODING_MACTELUGU
:
2488 enc
= kCFStringEncodingMacTelugu
;
2490 case wxFONTENCODING_MACKANNADA
:
2491 enc
= kCFStringEncodingMacKannada
;
2493 case wxFONTENCODING_MACMALAJALAM
:
2494 enc
= kCFStringEncodingMacMalayalam
;
2496 case wxFONTENCODING_MACSINHALESE
:
2497 enc
= kCFStringEncodingMacSinhalese
;
2499 case wxFONTENCODING_MACBURMESE
:
2500 enc
= kCFStringEncodingMacBurmese
;
2502 case wxFONTENCODING_MACKHMER
:
2503 enc
= kCFStringEncodingMacKhmer
;
2505 case wxFONTENCODING_MACTHAI
:
2506 enc
= kCFStringEncodingMacThai
;
2508 case wxFONTENCODING_MACLAOTIAN
:
2509 enc
= kCFStringEncodingMacLaotian
;
2511 case wxFONTENCODING_MACGEORGIAN
:
2512 enc
= kCFStringEncodingMacGeorgian
;
2514 case wxFONTENCODING_MACARMENIAN
:
2515 enc
= kCFStringEncodingMacArmenian
;
2517 case wxFONTENCODING_MACCHINESESIMP
:
2518 enc
= kCFStringEncodingMacChineseSimp
;
2520 case wxFONTENCODING_MACTIBETAN
:
2521 enc
= kCFStringEncodingMacTibetan
;
2523 case wxFONTENCODING_MACMONGOLIAN
:
2524 enc
= kCFStringEncodingMacMongolian
;
2526 case wxFONTENCODING_MACETHIOPIC
:
2527 enc
= kCFStringEncodingMacEthiopic
;
2529 case wxFONTENCODING_MACCENTRALEUR
:
2530 enc
= kCFStringEncodingMacCentralEurRoman
;
2532 case wxFONTENCODING_MACVIATNAMESE
:
2533 enc
= kCFStringEncodingMacVietnamese
;
2535 case wxFONTENCODING_MACARABICEXT
:
2536 enc
= kCFStringEncodingMacExtArabic
;
2538 case wxFONTENCODING_MACSYMBOL
:
2539 enc
= kCFStringEncodingMacSymbol
;
2541 case wxFONTENCODING_MACDINGBATS
:
2542 enc
= kCFStringEncodingMacDingbats
;
2544 case wxFONTENCODING_MACTURKISH
:
2545 enc
= kCFStringEncodingMacTurkish
;
2547 case wxFONTENCODING_MACCROATIAN
:
2548 enc
= kCFStringEncodingMacCroatian
;
2550 case wxFONTENCODING_MACICELANDIC
:
2551 enc
= kCFStringEncodingMacIcelandic
;
2553 case wxFONTENCODING_MACROMANIAN
:
2554 enc
= kCFStringEncodingMacRomanian
;
2556 case wxFONTENCODING_MACCELTIC
:
2557 enc
= kCFStringEncodingMacCeltic
;
2559 case wxFONTENCODING_MACGAELIC
:
2560 enc
= kCFStringEncodingMacGaelic
;
2562 // case wxFONTENCODING_MACKEYBOARD :
2563 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2567 // because gcc is picky
2574 class wxMBConv_cf
: public wxMBConv
2579 Init(CFStringGetSystemEncoding()) ;
2582 wxMBConv_cf(const wxMBConv_cf
& conv
)
2584 m_encoding
= conv
.m_encoding
;
2588 wxMBConv_cf(const char* name
)
2590 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2594 wxMBConv_cf(wxFontEncoding encoding
)
2596 Init( wxCFStringEncFromFontEnc(encoding
) );
2599 virtual ~wxMBConv_cf()
2603 void Init( CFStringEncoding encoding
)
2605 m_encoding
= encoding
;
2608 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2612 CFStringRef theString
= CFStringCreateWithBytes (
2613 NULL
, //the allocator
2614 (const UInt8
*)szUnConv
,
2617 false //no BOM/external representation
2620 wxASSERT(theString
);
2622 size_t nOutLength
= CFStringGetLength(theString
);
2626 CFRelease(theString
);
2630 CFRange theRange
= { 0, nOutSize
};
2632 #if SIZEOF_WCHAR_T == 4
2633 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2636 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2638 CFRelease(theString
);
2640 szUniCharBuffer
[nOutLength
] = '\0';
2642 #if SIZEOF_WCHAR_T == 4
2643 wxMBConvUTF16 converter
;
2644 converter
.MB2WC( szOut
, (const char*)szUniCharBuffer
, nOutSize
);
2645 delete [] szUniCharBuffer
;
2651 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2655 size_t nRealOutSize
;
2656 size_t nBufSize
= wxWcslen(szUnConv
);
2657 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2659 #if SIZEOF_WCHAR_T == 4
2660 wxMBConvUTF16 converter
;
2661 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2662 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1];
2663 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
));
2664 nBufSize
/= sizeof(UniChar
);
2667 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2671 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2674 wxASSERT(theString
);
2676 //Note that CER puts a BOM when converting to unicode
2677 //so we check and use getchars instead in that case
2678 if (m_encoding
== kCFStringEncodingUnicode
)
2681 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2683 nRealOutSize
= CFStringGetLength(theString
) + 1;
2689 CFRangeMake(0, CFStringGetLength(theString
)),
2691 0, //what to put in characters that can't be converted -
2692 //0 tells CFString to return NULL if it meets such a character
2693 false, //not an external representation
2696 (CFIndex
*) &nRealOutSize
2700 CFRelease(theString
);
2702 #if SIZEOF_WCHAR_T == 4
2703 delete[] szUniBuffer
;
2706 return nRealOutSize
- 1;
2709 virtual wxMBConv
*Clone() const { return new wxMBConv_cf(*this); }
2713 return m_encoding
!= kCFStringEncodingInvalidId
&&
2714 CFStringIsEncodingAvailable(m_encoding
);
2718 CFStringEncoding m_encoding
;
2721 #endif // __DARWIN__
2723 // ============================================================================
2724 // Mac conversion classes
2725 // ============================================================================
2727 /* Although we are in the base library we currently have this wxMac
2728 * conditional. This is not generally good but fortunately does not affect
2729 * the ABI of the base library, only what encodings might work.
2730 * It does mean that a wxBase built as part of wxMac has slightly more support
2731 * than one built for wxCocoa or even wxGtk.
2733 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2735 class wxMBConv_mac
: public wxMBConv
2740 Init(CFStringGetSystemEncoding()) ;
2743 wxMBConv_mac(const wxMBConv_mac
& conv
)
2745 Init(conv
.m_char_encoding
);
2749 wxMBConv_mac(const char* name
)
2751 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) );
2755 wxMBConv_mac(wxFontEncoding encoding
)
2757 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2760 virtual ~wxMBConv_mac()
2762 OSStatus status
= noErr
;
2763 if (m_MB2WC_converter
)
2764 status
= TECDisposeConverter(m_MB2WC_converter
);
2765 if (m_WC2MB_converter
)
2766 status
= TECDisposeConverter(m_WC2MB_converter
);
2769 void Init( TextEncodingBase encoding
,TextEncodingVariant encodingVariant
= kTextEncodingDefaultVariant
,
2770 TextEncodingFormat encodingFormat
= kTextEncodingDefaultFormat
)
2772 m_MB2WC_converter
= NULL
;
2773 m_WC2MB_converter
= NULL
;
2774 m_char_encoding
= CreateTextEncoding(encoding
, encodingVariant
, encodingFormat
) ;
2775 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
, 0, kUnicode16BitFormat
) ;
2778 virtual void CreateIfNeeded() const
2780 if ( m_MB2WC_converter
== NULL
&& m_WC2MB_converter
== NULL
)
2782 OSStatus status
= noErr
;
2783 status
= TECCreateConverter(&m_MB2WC_converter
,
2785 m_unicode_encoding
);
2786 wxASSERT_MSG( status
== noErr
, _("Unable to create TextEncodingConverter")) ;
2787 status
= TECCreateConverter(&m_WC2MB_converter
,
2790 wxASSERT_MSG( status
== noErr
, _("Unable to create TextEncodingConverter")) ;
2794 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2797 OSStatus status
= noErr
;
2798 ByteCount byteOutLen
;
2799 ByteCount byteInLen
= strlen(psz
) + 1;
2800 wchar_t *tbuf
= NULL
;
2801 UniChar
* ubuf
= NULL
;
2806 // Apple specs say at least 32
2807 n
= wxMax( 32, byteInLen
) ;
2808 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2811 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2813 #if SIZEOF_WCHAR_T == 4
2814 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2816 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2819 status
= TECConvertText(
2820 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2821 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2823 #if SIZEOF_WCHAR_T == 4
2824 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2825 // is not properly terminated we get random characters at the end
2826 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2827 wxMBConvUTF16 converter
;
2828 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
2831 res
= byteOutLen
/ sizeof( UniChar
) ;
2837 if ( buf
&& res
< n
)
2843 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2846 OSStatus status
= noErr
;
2847 ByteCount byteOutLen
;
2848 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2854 // Apple specs say at least 32
2855 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2856 tbuf
= (char*) malloc( n
) ;
2859 ByteCount byteBufferLen
= n
;
2860 UniChar
* ubuf
= NULL
;
2862 #if SIZEOF_WCHAR_T == 4
2863 wxMBConvUTF16 converter
;
2864 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2865 byteInLen
= unicharlen
;
2866 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2867 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2869 ubuf
= (UniChar
*) psz
;
2872 status
= TECConvertText(
2873 m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2874 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2876 #if SIZEOF_WCHAR_T == 4
2883 size_t res
= byteOutLen
;
2884 if ( buf
&& res
< n
)
2888 //we need to double-trip to verify it didn't insert any ? in place
2889 //of bogus characters
2890 wxWCharBuffer
wcBuf(n
);
2891 size_t pszlen
= wxWcslen(psz
);
2892 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2893 wxWcslen(wcBuf
) != pszlen
||
2894 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2896 // we didn't obtain the same thing we started from, hence
2897 // the conversion was lossy and we consider that it failed
2898 return wxCONV_FAILED
;
2905 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
2910 return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
;
2914 mutable TECObjectRef m_MB2WC_converter
;
2915 mutable TECObjectRef m_WC2MB_converter
;
2917 TextEncodingBase m_char_encoding
;
2918 TextEncodingBase m_unicode_encoding
;
2921 // MB is decomposed (D) normalized UTF8
2923 class wxMBConv_macUTF8D
: public wxMBConv_mac
2928 Init( kTextEncodingUnicodeDefault
, kUnicodeNoSubset
, kUnicodeUTF8Format
) ;
2933 virtual ~wxMBConv_macUTF8D()
2936 DisposeUnicodeToTextInfo(&m_uni
);
2937 if (m_uniBack
!=NULL
)
2938 DisposeUnicodeToTextInfo(&m_uniBack
);
// Convert the wide string psz to UTF-8 in two stages: the UTF-16 form of the
// input is first passed through ConvertFromUnicodeToText() using m_uni, and
// the resulting buffer is then passed to TECConvertText() using
// m_WC2MB_converter.  The output is written to buf if it is non-NULL and to
// the temporary buffer tbuf otherwise; n is the size of the output buffer.
2941 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2944 OSStatus status
= noErr
;
2945 ByteCount byteOutLen
;
// length of the input in bytes, not counting the terminator
2946 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
// size the temporary output buffer: 8 bytes per input character plus
// SIZEOF_WCHAR_T, but never less than 32 bytes
2952 // Apple specs say at least 32
2953 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2954 tbuf
= (char*) malloc( n
) ;
2957 ByteCount byteBufferLen
= n
;
2958 UniChar
* ubuf
= NULL
;
// with a 4 byte wchar_t the input is converted to UTF-16 first: the first
// WC2MB() call only computes the required length, the second one fills ubuf
2960 #if SIZEOF_WCHAR_T == 4
2961 wxMBConvUTF16 converter
;
2962 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2963 byteInLen
= unicharlen
;
2964 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2965 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
// presumably the alternative branch of the #if above (the #else line is not
// visible here): the input is used directly as UniChar data
2967 ubuf
= (UniChar
*) psz
;
2970 // ubuf is a non-decomposed UniChar buffer
// intermediate buffer of twice the input size plus 2 bytes
2972 ByteCount dcubuflen
= byteInLen
* 2 + 2 ;
2973 ByteCount dcubufread
, dcubufwritten
;
2974 UniChar
*dcubuf
= (UniChar
*) malloc( dcubuflen
) ;
// first stage: ubuf -> dcubuf, dcubufwritten receives the output byte count
2976 ConvertFromUnicodeToText( m_uni
, byteInLen
, ubuf
,
2977 kUnicodeDefaultDirectionMask
, 0, NULL
, NULL
, NULL
, dcubuflen
, &dcubufread
, &dcubufwritten
, dcubuf
) ;
2979 // we now convert that decomposed buffer into UTF8
// second stage: dcubuf -> buf (or tbuf), byteOutLen receives the byte count
2981 status
= TECConvertText(
2982 m_WC2MB_converter
, (ConstTextPtr
) dcubuf
, dcubufwritten
, &dcubufread
,
2983 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2987 #if SIZEOF_WCHAR_T == 4
// the result is the number of bytes produced by the second stage
2994 size_t res
= byteOutLen
;
2995 if ( buf
&& res
< n
)
2998 // don't test for round-trip fidelity yet, we cannot guarantee it yet
// Convert the multibyte string psz to wide characters in two stages: the
// input is first passed to TECConvertText() using m_MB2WC_converter, and the
// resulting UniChar data is then passed through ConvertFromUnicodeToText()
// using m_uniBack.  The output goes to buf if it is non-NULL and to the
// temporary buffer tbuf otherwise.
3004 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3007 OSStatus status
= noErr
;
3008 ByteCount byteOutLen
;
// the input length includes the terminating NUL
3009 ByteCount byteInLen
= strlen(psz
) + 1;
3010 wchar_t *tbuf
= NULL
;
3011 UniChar
* ubuf
= NULL
;
// size the temporary output buffer: one wchar_t per input byte, but never
// less than 32 elements
3016 // Apple specs say at least 32
3017 n
= wxMax( 32, byteInLen
) ;
3018 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
3021 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
// with a 4 byte wchar_t a separate UniChar buffer is allocated
3023 #if SIZEOF_WCHAR_T == 4
3024 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
// presumably the alternative branch of the #if above (the #else line is not
// visible here): the output buffer itself is used as the UniChar buffer
3026 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
// intermediate buffer receiving the output of the first stage
3029 ByteCount dcubuflen
= byteBufferLen
* 2 + 2 ;
3030 ByteCount dcubufread
, dcubufwritten
;
3031 UniChar
*dcubuf
= (UniChar
*) malloc( dcubuflen
) ;
// first stage: psz -> dcubuf, byteOutLen receives the output byte count
3033 status
= TECConvertText(
3034 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
3035 (TextPtr
) dcubuf
, dcubuflen
, &byteOutLen
);
3036 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3037 // is not properly terminated we get random characters at the end
// NOTE(review): if byteOutLen could ever equal dcubuflen this index would be
// one element past the end of dcubuf -- confirm that this cannot happen
3038 dcubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
3040 // now from the decomposed UniChar to properly composed uniChar
// NOTE(review): the output capacity passed below is dcubuflen, whereas ubuf
// was allocated with only byteBufferLen + 2 bytes in the SIZEOF_WCHAR_T == 4
// branch above -- confirm that the output cannot overrun ubuf
3041 ConvertFromUnicodeToText( m_uniBack
, byteOutLen
, dcubuf
,
3042 kUnicodeDefaultDirectionMask
, 0, NULL
, NULL
, NULL
, dcubuflen
, &dcubufread
, &dcubufwritten
, ubuf
) ;
// from here on byteOutLen is the byte count produced by the second stage
3045 byteOutLen
= dcubufwritten
;
3046 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
// with a 4 byte wchar_t the UTF-16 data in ubuf is converted to wchar_t
3049 #if SIZEOF_WCHAR_T == 4
3050 wxMBConvUTF16 converter
;
3051 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
// otherwise the result is simply the number of UniChar elements written
3054 res
= byteOutLen
/ sizeof( UniChar
) ;
3060 if ( buf
&& res
< n
)
// Calls the base class CreateIfNeeded() and then, if m_uni is still NULL,
// creates the two UnicodeToTextInfo objects used by this class: m_uni and
// m_uniBack.
3066 virtual void CreateIfNeeded() const
3068 wxMBConv_mac::CreateIfNeeded() ;
3069 if ( m_uni
== NULL
)
// mapping for m_uni: default Unicode to its canonical decomposition variant
3071 m_map
.unicodeEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
3072 kUnicodeNoSubset
, kTextEncodingDefaultFormat
);
3073 m_map
.otherEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
3074 kUnicodeCanonicalDecompVariant
, kTextEncodingDefaultFormat
);
3075 m_map
.mappingVersion
= kUnicodeUseLatestMapping
;
// a failure here is only reported through the assertion below
3077 OSStatus err
= CreateUnicodeToTextInfo(&m_map
, &m_uni
);
3078 wxASSERT_MSG( err
== noErr
, _(" Couldn't create the UnicodeConverter")) ;
// mapping for m_uniBack: default Unicode to its canonical composition
// variant; the same m_map object is reused for it
3080 m_map
.unicodeEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
3081 kUnicodeNoSubset
, kTextEncodingDefaultFormat
);
3082 m_map
.otherEncoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,
3083 kUnicodeCanonicalCompVariant
, kTextEncodingDefaultFormat
);
3084 m_map
.mappingVersion
= kUnicodeUseLatestMapping
;
3085 err
= CreateUnicodeToTextInfo(&m_map
, &m_uniBack
);
3086 wxASSERT_MSG( err
== noErr
, _(" Couldn't create the UnicodeConverter")) ;
// These members are mutable because they are assigned in CreateIfNeeded(),
// which is a const member function.
// m_uni is created with the canonical decomposition mapping and is used by
// WC2MB(); m_uniBack is created with the canonical composition mapping and
// is used by MB2WC(); m_map is the mapping description used to create both.
3090 mutable UnicodeToTextInfo m_uni
;
3091 mutable UnicodeToTextInfo m_uniBack
;
3092 mutable UnicodeMapping m_map
;
3094 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3096 // ============================================================================
3097 // wxEncodingConverter based conversion classes
3098 // ============================================================================
// wxMBConv implementation built on two wxEncodingConverter objects: m2w
// converts from the encoding m_enc to Unicode and w2m converts back.
3102 class wxMBConv_wxwin
: public wxMBConv
// initialization succeeds only if both converters could be initialized
3107 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
3108 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
3112 // temporarily just use wxEncodingConverter stuff,
3113 // so that it works while a better implementation is built
// construct from a charset name
3114 wxMBConv_wxwin(const char* name
)
// the encoding is looked up from the name, passing "false" as the second
// argument of CharsetToEncoding()
3117 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
// alternative assignment of the system encoding; the condition selecting
// between the two assignments is not visible in this excerpt
3119 m_enc
= wxFONTENCODING_SYSTEM
;
// construct from an encoding value
3124 wxMBConv_wxwin(wxFontEncoding enc
)
// multibyte to wide conversion using m2w; the buffer size n is not used
3131 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
3133 size_t inbuf
= strlen(psz
);
// a failure of the underlying converter is reported as wxCONV_FAILED
3136 if (!m2w
.Convert(psz
, buf
))
3137 return wxCONV_FAILED
;
// wide to multibyte conversion using w2m; the buffer size n is not used
3142 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
3144 const size_t inbuf
= wxWcslen(psz
);
// a failure of the underlying converter is reported as wxCONV_FAILED
3147 if (!w2m
.Convert(psz
, buf
))
3148 return wxCONV_FAILED
;
// the UTF-16 and UTF-32 encodings are treated specially here; the values
// returned for each group are not visible in this excerpt
3154 virtual size_t GetMBNulLen() const
3158 case wxFONTENCODING_UTF16BE
:
3159 case wxFONTENCODING_UTF16LE
:
3162 case wxFONTENCODING_UTF32BE
:
3163 case wxFONTENCODING_UTF32LE
:
// the clone is a new converter created for the same encoding
3171 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
// returns the result of the converters' initialization
3173 bool IsOk() const { return m_ok
; }
// the encoding this object converts from/to
3176 wxFontEncoding m_enc
;
// converters for the two directions: m_enc -> Unicode and Unicode -> m_enc
3177 wxEncodingConverter m2w
, w2m
;
3180 // were we initialized successfully?
3183 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
3186 // make the constructors available for unit testing
// Creates a wxMBConv_wxwin for the given charset name and checks it with
// IsOk(); the handling of a converter which is not ok is not visible in this
// excerpt.
3187 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
3189 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
3190 if ( !result
->IsOk() )
3199 #endif // wxUSE_FONTMAP
3201 // ============================================================================
3202 // wxCSConv implementation
3203 // ============================================================================
3205 void wxCSConv::Init()
// Construct the converter from a charset name.
3212 wxCSConv::wxCSConv(const wxString
& charset
)
// a non-empty name is stored, in its ASCII form, using SetName()
3216 if ( !charset
.empty() )
3218 SetName(charset
.ToAscii());
// m_encoding is either looked up from the name or set to the system
// encoding; the condition selecting between these two assignments is not
// visible in this excerpt (presumably wxUSE_FONTMAP -- TODO confirm)
3222 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
3224 m_encoding
= wxFONTENCODING_SYSTEM
;
// Construct the converter from an encoding value.
3228 wxCSConv::wxCSConv(wxFontEncoding encoding
)
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not accepted here: they
// trigger a debug failure message and are replaced by the system encoding
3230 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
3232 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3234 encoding
= wxFONTENCODING_SYSTEM
;
// remember the (possibly corrected) encoding
3239 m_encoding
= encoding
;
3242 wxCSConv::~wxCSConv()
// Copy constructor: takes over the name and the encoding of conv.
3247 wxCSConv::wxCSConv(const wxCSConv
& conv
)
// the name is passed to SetName() which stores a copy made with strdup()
3252 SetName(conv
.m_name
);
3253 m_encoding
= conv
.m_encoding
;
// Assignment operator: takes over the name and the encoding of conv.
3256 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
// the name is passed to SetName() which stores a copy made with strdup()
3260 SetName(conv
.m_name
);
3261 m_encoding
= conv
.m_encoding
;
3266 void wxCSConv::Clear()
// Store the charset name: m_name receives a copy of charset allocated with
// strdup().
3275 void wxCSConv::SetName(const char *charset
)
3279 m_name
= strdup(charset
);
// Hash map from wxFontEncoding to wxString.  wxCSConv::DoCreate() uses the
// static instance below to remember which name was successfully used to
// create an iconv converter for an encoding; an empty string is stored to
// record a failure.
3286 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3287 wxEncodingNameCache
);
3289 static wxEncodingNameCache gs_nameCache
;
// Create the conversion object for the name (m_name) and/or the encoding
// (m_encoding) of this wxCSConv.  The candidates visible below are tried in
// this order: wxMBConv_iconv, wxMBConv_win32, wxMBConv_mac / wxMBConv_cf, the
// built-in UTF converters and finally wxMBConv_wxwin.  Which of them are
// compiled in depends on preprocessor conditions, several of which are not
// visible in this excerpt.  If nothing could be created an error is logged.
3292 wxMBConv
*wxCSConv::DoCreate() const
3295 wxLogTrace(TRACE_STRCONV
,
3296 wxT("creating conversion for %s"),
3298 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
3299 #endif // wxUSE_FONTMAP
3301 // check for the special case of ASCII or ISO8859-1 charset: as we have
3302 // special knowledge of it anyhow, we don't need to create a special
3303 // conversion object
3304 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3305 m_encoding
== wxFONTENCODING_DEFAULT
)
3307 // don't convert at all
3311 // we trust OS to do conversion better than we can so try external
3312 // conversion methods first
3314 // the full order is:
3315 // 1. OS conversion (iconv() under Unix or Win32 API)
3316 // 2. hard coded conversions for UTF
3317 // 3. wxEncodingConverter as fall back
3323 #endif // !wxUSE_FONTMAP
// local copy of the encoding used for the iconv name lookup below
3326 wxFontEncoding
encoding(m_encoding
);
// first iconv attempt: create the converter directly from the charset name
3331 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3339 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3340 #endif // wxUSE_FONTMAP
// check whether a name for this encoding was already cached
3344 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3345 if ( it
!= gs_nameCache
.end() )
// an empty cached name is the marker stored for an earlier failure
3347 if ( it
->second
.empty() )
// otherwise the cached name is used to create the converter
3350 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
// nothing cached yet: try all the names known for this encoding
3357 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3358 // CS : in case this does not return valid names (eg for MacRoman)
3359 // encoding got a 'failure' entry in the cache all the same,
3360 // although it just has to be created using a different method, so
3361 // only store failed iconv creation attempts (or perhaps we
3362 // shouldn't do this at all ?)
3363 if ( names
[0] != NULL
)
3365 for ( ; *names
; ++names
)
3367 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3368 // will need changes that will obsolete this
3369 wxString
name(*names
);
3370 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
// remember the name used for this encoding
3373 gs_nameCache
[encoding
] = *names
;
3380 gs_nameCache
[encoding
] = _T(""); // cache the failure
3383 #endif // wxUSE_FONTMAP
3385 #endif // HAVE_ICONV
// Win32: create the converter from the name if there is one and from the
// encoding otherwise
3387 #ifdef wxHAVE_WIN32_MB2WC
3390 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3391 : new wxMBConv_win32(m_encoding
);
3400 #endif // wxHAVE_WIN32_MB2WC
3402 #if defined(__WXMAC__)
3404 // leave UTF16 and UTF32 to the built-ins of wx
3405 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3406 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
// two alternative ways of creating the Mac converter; the preprocessor
// condition selecting between them is not visible in this excerpt
3409 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
3410 : new wxMBConv_mac(m_encoding
);
3412 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
// CoreFoundation based converter (see the "#endif // __DARWIN__" below),
// tried only when there is a name or for the encodings up to
// wxFONTENCODING_UTF16
3424 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
// as above, the condition selecting between these two forms is not visible
3427 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3428 : new wxMBConv_cf(m_encoding
);
3430 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3439 #endif // __DARWIN__
// encoding used to select one of the built-in UTF converters
3442 wxFontEncoding enc
= m_encoding
;
// if only the name is known, try to map it to an encoding
3444 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3446 // use "false" to suppress interactive dialogs -- we can be called from
3447 // anywhere and popping up a dialog from here is the last thing we want to
3449 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3451 #endif // wxUSE_FONTMAP
// hard coded conversions for the UTF encodings
3455 case wxFONTENCODING_UTF7
:
3456 return new wxMBConvUTF7
;
3458 case wxFONTENCODING_UTF8
:
3459 return new wxMBConvUTF8
;
3461 case wxFONTENCODING_UTF16BE
:
3462 return new wxMBConvUTF16BE
;
3464 case wxFONTENCODING_UTF16LE
:
3465 return new wxMBConvUTF16LE
;
3467 case wxFONTENCODING_UTF32BE
:
3468 return new wxMBConvUTF32BE
;
3470 case wxFONTENCODING_UTF32LE
:
3471 return new wxMBConvUTF32LE
;
3474 // nothing to do but put here to suppress gcc warnings
// wxEncodingConverter based fall back
3481 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3482 : new wxMBConv_wxwin(m_encoding
);
3488 #endif // wxUSE_FONTMAP
3490 // NB: This is a hack to prevent deadlock. What could otherwise happen
3491 // in Unicode build: wxConvLocal creation ends up being here
3492 // because of some failure and logs the error. But wxLog will try to
3493 // attach a timestamp, for which it will need wxConvLocal (to convert
3494 // time to char* and then wchar_t*), but that fails, tries to log the
3495 // error, but wxLog has an (already locked) critical section that
3496 // guards the static buffer.
// the flag is set while the error is being logged so that a nested call
// arriving here does not try to log again
3497 static bool alreadyLoggingError
= false;
3498 if (!alreadyLoggingError
)
3500 alreadyLoggingError
= true;
3501 wxLogError(_("Cannot convert from the charset '%s'!"),
3505 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3506 #else // !wxUSE_FONTMAP
3507 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3508 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3511 alreadyLoggingError
= false;
// Performs the deferred creation of the real conversion object: the result
// of DoCreate() is stored in m_convReal and m_deferred is reset to false.
3517 void wxCSConv::CreateConvIfNeeded() const
// this is a const member function, so the members are modified through a
// non-const pointer to this object
3521 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3523 // if we have neither the name nor the encoding, use the default
3524 // encoding for this system
3525 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3528 self
->m_encoding
= wxLocale::GetSystemEncoding();
3530 // fallback to some reasonable default:
3531 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3532 #endif // wxUSE_INTL
3535 self
->m_convReal
= DoCreate();
3536 self
->m_deferred
= false;
3540 bool wxCSConv::IsOk() const
3542 CreateConvIfNeeded();
3544 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3545 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3546 return true; // always ok as we do it ourselves
3548 // m_convReal->IsOk() is called at its own creation, so we know it must
3549 // be ok if m_convReal is non-NULL
3550 return m_convReal
!= NULL
;
// Multibyte to wide conversion: after the deferred creation step the call is
// forwarded either to m_convReal or to the base class implementation (the
// condition selecting between the two returns is not visible in this
// excerpt).
3553 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3554 const char *src
, size_t srcLen
) const
3556 CreateConvIfNeeded();
3559 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3562 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
// Wide to multibyte conversion: after the deferred creation step the call is
// forwarded either to m_convReal or to the base class implementation (the
// condition selecting between the two returns is not visible in this
// excerpt).
3565 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3566 const wchar_t *src
, size_t srcLen
) const
3568 CreateConvIfNeeded();
3571 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3574 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
// Multibyte to wide conversion of a NUL-terminated string: the call is
// either forwarded to m_convReal or done here by a direct byte to wchar_t
// copy (the condition selecting between the two paths is not visible in this
// excerpt).
3577 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3579 CreateConvIfNeeded();
3582 return m_convReal
->MB2WC(buf
, psz
, n
);
3585 size_t len
= strlen(psz
);
// "c <= len" makes the loop copy the terminating NUL as well; the cast to
// unsigned char keeps bytes above 0x7f from being sign-extended
3589 for (size_t c
= 0; c
<= len
; c
++)
3590 buf
[c
] = (unsigned char)(psz
[c
]);
// Wide to multibyte conversion of a NUL-terminated string: the call is
// either forwarded to m_convReal or done here by a direct wchar_t to char
// copy (the condition selecting between the two paths is not visible in this
// excerpt).
3596 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3598 CreateConvIfNeeded();
3601 return m_convReal
->WC2MB(buf
, psz
, n
);
3604 const size_t len
= wxWcslen(psz
);
// first loop: copies the characters, including the terminating NUL, into
// buf; it can fail with wxCONV_FAILED (the test triggering the failure is
// not visible in this excerpt)
3607 for (size_t c
= 0; c
<= len
; c
++)
3610 return wxCONV_FAILED
;
3612 buf
[c
] = (char)psz
[c
];
// second loop: can fail in the same way but nothing is seen to be written
// by it here -- presumably the variant used when there is no output buffer
3617 for (size_t c
= 0; c
<= len
; c
++)
3620 return wxCONV_FAILED
;
// Forwards to m_convReal->GetMBNulLen() after the deferred creation step;
// the value returned in the ISO-8859-1 case mentioned below is not visible
// in this excerpt.
3627 size_t wxCSConv::GetMBNulLen() const
3629 CreateConvIfNeeded();
3633 return m_convReal
->GetMBNulLen();
3636 // otherwise, we are ISO-8859-1
3640 #if wxUSE_UNICODE_UTF8
// Forwards to m_convReal->IsUTF8() after the deferred creation step; the
// value returned in the ISO-8859-1 case mentioned below is not visible in
// this excerpt.
3641 bool wxCSConv::IsUTF8() const
3643 CreateConvIfNeeded();
3647 return m_convReal
->IsUTF8();
3650 // otherwise, we are ISO-8859-1
// Convert the multibyte string s to a wide character buffer using several
// converters: wxConvLibc first, then wxMBConvUTF8 and then wxConvISO8859_1.
// The conditions under which the later converters are tried and under which
// the empty buffer is returned are not visible in this excerpt.
3658 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3661 return wxWCharBuffer();
3663 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3665 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3667 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Convert the wide string ws to a multibyte buffer: wxConvLibc is used first
// and a wxMBConvUTF8 created with MAP_INVALID_UTF8_TO_OCTAL is the
// alternative.  The conditions under which the alternative is tried and
// under which the empty buffer is returned are not visible in this excerpt.
3672 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3675 return wxCharBuffer();
3677 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3679 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3684 #endif // wxUSE_UNICODE
3686 // ----------------------------------------------------------------------------
3688 // ----------------------------------------------------------------------------
3690 // NB: The reason why we create converter objects in this convoluted way,
3691 // using a factory function instead of global variable, is that they
3692 // may be used at static initialization time (some of them are used by
3693 // wxString ctors and there may be a global wxString object). In other
3694 // words, possibly _before_ the converter global object would be
// initialized.
3701 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines:
//  - an exported pointer name##Ptr, initially NULL,
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args,
//  - a file-static pointer gs_##name##instance initialized by calling the
//    accessor.
// The invocation must be followed by a semicolon as the last line of the
// macro does not end with one.
3703 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3704 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3705 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3707 static impl_klass name##Obj ctor_args; \
3708 return &name##Obj; \
3710 /* this ensures that all global converter objects are created */ \
3711 /* by the time static initialization is done, i.e. before any */ \
3712 /* thread is launched: */ \
3713 static klass* gs_##name##instance = wxGet_##name##Ptr()
// shorthand for the case when the implementation class is the same as the
// class of the pointer
3715 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3716 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// Definitions of the global converter objects.  wxConvLibc is implemented by
// wxMBConv_win32, wxMBConv_mac or wxMBConvLibc depending on the platform
// (the first condition of this #if chain is not visible in this excerpt).
3719 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3720 #elif defined(__WXMAC__) && !defined(__MACH__)
3721 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_mac
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3723 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
// UTF-8 and UTF-7 converters, constructed without arguments
3726 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8
, wxConvUTF8
, wxEMPTY_PARAMETER_VALUE
);
3727 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, wxEMPTY_PARAMETER_VALUE
);
// wxCSConv based converters for the system encoding and for ISO-8859-1
3729 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3730 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// wxConvCurrent initially points to wxConvLibc and wxConvUI to wxConvLocal
3732 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3733 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
// Under Carbon a static wxMBConv_macUTF8D object is defined in this file.
3735 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3736 static wxMBConv_macUTF8D wxConvMacUTF8DObj
;
// wxConvFileName is initialized with a platform dependent converter; the
// initializers visible here are the UTF-8 and the libc converters (the
// initializer of the Carbon branch and some of the preprocessor lines are
// not visible in this excerpt).
3738 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3740 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3743 wxGet_wxConvUTF8Ptr();
3746 wxGet_wxConvLibcPtr();
3747 #endif // __WXOSX__/!__WXOSX__
3749 #else // !wxUSE_WCHAR_T
3751 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3752 // stand-ins in absence of wchar_t
// without wchar_t support the converters are defined as plain wxMBConv
// objects
3753 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3758 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T