1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
35 #include "wx/strconv.h"
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
62 #include "wx/thread.h"
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
76 #include "wx/mac/private.h" // includes mac headers
79 #define TRACE_STRCONV _T("strconv")
81 #if SIZEOF_WCHAR_T == 2
85 // ============================================================================
87 // ============================================================================
89 // ----------------------------------------------------------------------------
90 // UTF-16 en/decoding to/from UCS-4
91 // ----------------------------------------------------------------------------
94 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
99 *output
= (wxUint16
) input
;
102 else if (input
>=0x110000)
110 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
111 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
117 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
119 if ((*input
<0xd800) || (*input
>0xdfff))
124 else if ((input
[1]<0xdc00) || (input
[1]>0xdfff))
131 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
137 // ----------------------------------------------------------------------------
139 // ----------------------------------------------------------------------------
141 wxMBConv::~wxMBConv()
143 // nothing to do here (necessary for Darwin linking probably)
146 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
150 // calculate the length of the buffer needed first
151 size_t nLen
= MB2WC(NULL
, psz
, 0);
152 if ( nLen
!= (size_t)-1 )
154 // now do the actual conversion
155 wxWCharBuffer
buf(nLen
);
156 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
157 if ( nLen
!= (size_t)-1 )
164 wxWCharBuffer
buf((wchar_t *)NULL
);
169 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
173 size_t nLen
= WC2MB(NULL
, pwz
, 0);
174 if ( nLen
!= (size_t)-1 )
176 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
177 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
178 if ( nLen
!= (size_t)-1 )
185 wxCharBuffer
buf((char *)NULL
);
191 wxMBConv::cMB2WC(const char *in
, size_t inLen
, size_t *outLen
) const
193 // the currently accumulated wide characters
196 // the current length of wbuf
199 // we need to know the representation of L'\0' for this conversion
201 const char * const nul
= GetMBNul(&nulLen
);
202 if ( nulLen
== (size_t)-1 || nulLen
== 0 )
203 return wxWCharBuffer();
205 // make a copy of the input string unless it is already properly
209 // now we can compute the input size if we were not given it: notice that
210 // in this case the string must be properly NUL-terminated, of course, as
211 // otherwise we have no way of knowing how long it is
212 if ( inLen
== (size_t)-1 )
214 // not the most efficient algorithm but it shouldn't matter as normally
215 // there are not many NULs in the string and so normally memcmp()
216 // should stop on the first character
218 while ( memcmp(p
, nul
, nulLen
) != 0 )
221 inLen
= p
- in
+ nulLen
;
223 else // we already have the size
225 // check if it's not already NUL-terminated too to avoid the copy
226 if ( inLen
< nulLen
|| memcmp(in
+ inLen
- nulLen
, nul
, nulLen
) != 0 )
228 // make a copy in order to properly NUL-terminate the string
229 bufTmp
= wxCharBuffer(inLen
+ nulLen
- 1 /* 1 will be added */);
230 memcpy(bufTmp
.data(), in
, inLen
);
231 memcpy(bufTmp
.data() + inLen
, nul
, nulLen
);
238 for ( const char * const inEnd
= in
+ inLen
;; )
240 // try to convert the current chunk if anything left
241 size_t lenChunk
= in
< inEnd
? MB2WC(NULL
, in
, 0) : 0;
244 // nothing left in the input string, conversion succeeded
247 // we shouldn't include the last NUL in the result length
248 *outLen
= lenBuf
? lenBuf
- 1 : 0;
254 if ( lenChunk
== (size_t)-1 )
257 const size_t lenBufNew
= lenBuf
+ lenChunk
;
258 if ( !wbuf
.extend(lenBufNew
) )
261 lenChunk
= MB2WC(wbuf
.data() + lenBuf
, in
, lenChunk
+ 1 /* for NUL */);
262 if ( lenChunk
== (size_t)-1 )
265 // +1 for the embedded NUL (if something follows)
266 lenBuf
= lenBufNew
+ 1;
268 // advance the input pointer past the end of this chunk
269 while ( memcmp(in
, nul
, nulLen
) != 0 )
272 in
+= nulLen
; // skipping over its terminator as well
279 return wxWCharBuffer();
283 wxMBConv::cWC2MB(const wchar_t *in
, size_t inLen
, size_t *outLen
) const
285 // the currently accumulated multibyte characters
288 // the current length of buf
291 // make a copy of the input string unless it is already properly
294 // if we don't know its length we have no choice but to assume that it is,
295 // indeed, properly terminated
296 wxWCharBuffer bufTmp
;
297 if ( inLen
== (size_t)-1 )
299 inLen
= wxWcslen(in
) + 1;
301 else if ( inLen
!= 0 && in
[inLen
- 1] != L
'\0' )
303 // make a copy in order to properly NUL-terminate the string
304 bufTmp
= wxWCharBuffer(inLen
);
305 memcpy(bufTmp
.data(), in
, inLen
*sizeof(wchar_t));
311 for ( const wchar_t * const inEnd
= in
+ inLen
;; )
313 // try to convert the current chunk, if anything left
314 size_t lenChunk
= in
< inEnd
? WC2MB(NULL
, in
, 0) : 0;
317 // nothing left in the input string, conversion succeeded
319 *outLen
= lenBuf
? lenBuf
- 1 : lenBuf
;
324 if ( lenChunk
== (size_t)-1 )
327 const size_t lenBufNew
= lenBuf
+ lenChunk
;
328 if ( !buf
.extend(lenBufNew
) )
331 lenChunk
= WC2MB(buf
.data() + lenBuf
, in
, lenChunk
+ 1 /* for NUL */);
332 if ( lenChunk
== (size_t)-1 )
335 // chunk successfully converted, go to the next one
336 in
+= wxWcslen(in
) + 1 /* skip NUL too */;
337 lenBuf
= lenBufNew
+ 1;
344 return wxCharBuffer();
347 // ----------------------------------------------------------------------------
349 // ----------------------------------------------------------------------------
351 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
353 return wxMB2WC(buf
, psz
, n
);
356 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
358 return wxWC2MB(buf
, psz
, n
);
361 // ----------------------------------------------------------------------------
362 // wxConvBrokenFileNames
363 // ----------------------------------------------------------------------------
367 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
369 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
370 || wxStricmp(charset
, _T("UTF8")) == 0 )
371 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
373 m_conv
= new wxCSConv(charset
);
378 // ----------------------------------------------------------------------------
380 // ----------------------------------------------------------------------------
382 // Implementation (C) 2004 Fredrik Roubert
385 // BASE64 decoding table
387 static const unsigned char utf7unb64
[] =
389 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
390 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
391 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
392 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
393 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
394 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
395 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
396 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
397 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
398 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
399 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
400 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
401 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
402 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
403 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
404 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
405 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
406 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
407 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
408 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
409 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
410 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
411 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
412 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
413 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
414 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
415 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
416 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
417 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
418 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
419 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
420 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
423 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
427 while ( *psz
&& (!buf
|| (len
< n
)) )
429 unsigned char cc
= *psz
++;
437 else if (*psz
== '-')
445 else // start of BASE64 encoded string
449 for ( ok
= lsb
= false, d
= 0, l
= 0;
450 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
455 for (l
+= 6; l
>= 8; lsb
= !lsb
)
457 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
467 *buf
= (wchar_t)(c
<< 8);
476 // in valid UTF7 we should have valid characters after '+'
485 if ( buf
&& (len
< n
) )
492 // BASE64 encoding table
494 static const unsigned char utf7enb64
[] =
496 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
497 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
498 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
499 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
500 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
501 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
502 'w', 'x', 'y', 'z', '0', '1', '2', '3',
503 '4', '5', '6', '7', '8', '9', '+', '/'
507 // UTF-7 encoding table
509 // 0 - Set D (directly encoded characters)
510 // 1 - Set O (optional direct characters)
511 // 2 - whitespace characters (optional)
512 // 3 - special characters
514 static const unsigned char utf7encode
[128] =
516 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
517 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
518 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
519 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
520 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
521 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
522 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
523 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
526 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
530 while (*psz
&& ((!buf
) || (len
< n
)))
533 if (cc
< 0x80 && utf7encode
[cc
] < 1)
541 else if (((wxUint32
)cc
) > 0xffff)
543 // no surrogate pair generation (yet?)
554 // BASE64 encode string
555 unsigned int lsb
, d
, l
;
556 for (d
= 0, l
= 0; /*nothing*/; psz
++)
558 for (lsb
= 0; lsb
< 2; lsb
++)
561 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
563 for (l
+= 8; l
>= 6; )
567 *buf
++ = utf7enb64
[(d
>> l
) % 64];
572 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
578 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
587 if (buf
&& (len
< n
))
592 // ----------------------------------------------------------------------------
594 // ----------------------------------------------------------------------------
596 static wxUint32 utf8_max
[]=
597 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
599 // boundaries of the private use area we use to (temporarily) remap
600 // characters invalid in a UTF-8 encoded string
601 const wxUint32 wxUnicodePUA
= 0x100000;
602 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
604 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
608 while (*psz
&& ((!buf
) || (len
< n
)))
610 const char *opsz
= psz
;
611 bool invalid
= false;
612 unsigned char cc
= *psz
++, fc
= cc
;
614 for (cnt
= 0; fc
& 0x80; cnt
++)
623 // escape the escape character for octal escapes
624 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
625 && cc
== '\\' && (!buf
|| len
< n
))
637 // invalid UTF-8 sequence
642 unsigned ocnt
= cnt
- 1;
643 wxUint32 res
= cc
& (0x3f >> cnt
);
647 if ((cc
& 0xC0) != 0x80)
649 // invalid UTF-8 sequence
654 res
= (res
<< 6) | (cc
& 0x3f);
656 if (invalid
|| res
<= utf8_max
[ocnt
])
658 // illegal UTF-8 encoding
661 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
662 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
664 // if one of our PUA characters turns up externally
665 // it must also be treated as an illegal sequence
666 // (a bit like you have to escape an escape character)
672 // cast is ok because wchar_t == wxUint16 if WC_UTF16
673 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
674 if (pa
== (size_t)-1)
686 *buf
++ = (wchar_t)res
;
688 #endif // WC_UTF16/!WC_UTF16
693 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
695 while (opsz
< psz
&& (!buf
|| len
< n
))
698 // cast is ok because wchar_t == wxUint16 if WC_UTF16
699 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
700 wxASSERT(pa
!= (size_t)-1);
707 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
713 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
715 while (opsz
< psz
&& (!buf
|| len
< n
))
717 if ( buf
&& len
+ 3 < n
)
719 unsigned char on
= *opsz
;
721 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
722 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
723 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
729 else // MAP_INVALID_UTF8_NOT
736 if (buf
&& (len
< n
))
741 static inline bool isoctal(wchar_t wch
)
743 return L
'0' <= wch
&& wch
<= L
'7';
746 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
750 while (*psz
&& ((!buf
) || (len
< n
)))
754 // cast is ok for WC_UTF16
755 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
756 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
758 cc
=(*psz
++) & 0x7fffffff;
761 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
762 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
765 *buf
++ = (char)(cc
- wxUnicodePUA
);
768 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
769 && cc
== L
'\\' && psz
[0] == L
'\\' )
776 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
778 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
782 *buf
++ = (char) ((psz
[0] - L
'0')*0100 +
783 (psz
[1] - L
'0')*010 +
793 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
807 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
809 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
821 // ----------------------------------------------------------------------------
823 // ----------------------------------------------------------------------------
825 #ifdef WORDS_BIGENDIAN
826 #define wxMBConvUTF16straight wxMBConvUTF16BE
827 #define wxMBConvUTF16swap wxMBConvUTF16LE
829 #define wxMBConvUTF16swap wxMBConvUTF16BE
830 #define wxMBConvUTF16straight wxMBConvUTF16LE
836 // copy 16bit MB to 16bit String
837 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
841 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
844 *buf
++ = *(wxUint16
*)psz
;
847 psz
+= sizeof(wxUint16
);
849 if (buf
&& len
<n
) *buf
=0;
855 // copy 16bit String to 16bit MB
856 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
860 while (*psz
&& (!buf
|| len
< n
))
864 *(wxUint16
*)buf
= *psz
;
865 buf
+= sizeof(wxUint16
);
867 len
+= sizeof(wxUint16
);
870 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
876 // swap 16bit MB to 16bit String
877 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
881 // UTF16 string must be terminated by 2 NULs as single NULs may occur
883 while ( (psz
[0] || psz
[1]) && (!buf
|| len
< n
) )
887 ((char *)buf
)[0] = psz
[1];
888 ((char *)buf
)[1] = psz
[0];
895 if ( buf
&& len
< n
)
902 // swap 16bit String to 16bit MB
903 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
907 while ( *psz
&& (!buf
|| len
< n
) )
911 *buf
++ = ((char*)psz
)[1];
912 *buf
++ = ((char*)psz
)[0];
918 if ( buf
&& len
< n
)
928 // copy 16bit MB to 32bit String
929 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
933 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
936 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
937 if (pa
== (size_t)-1)
941 *buf
++ = (wchar_t)cc
;
943 psz
+= pa
* sizeof(wxUint16
);
945 if (buf
&& len
<n
) *buf
=0;
951 // copy 32bit String to 16bit MB
952 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
956 while (*psz
&& (!buf
|| len
< n
))
959 size_t pa
=encode_utf16(*psz
, cc
);
961 if (pa
== (size_t)-1)
966 *(wxUint16
*)buf
= cc
[0];
967 buf
+= sizeof(wxUint16
);
970 *(wxUint16
*)buf
= cc
[1];
971 buf
+= sizeof(wxUint16
);
975 len
+= pa
*sizeof(wxUint16
);
978 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
984 // swap 16bit MB to 32bit String
985 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
989 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
993 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
994 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
996 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
997 if (pa
== (size_t)-1)
1001 *buf
++ = (wchar_t)cc
;
1004 psz
+= pa
* sizeof(wxUint16
);
1006 if (buf
&& len
<n
) *buf
=0;
1012 // swap 32bit String to 16bit MB
1013 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1017 while (*psz
&& (!buf
|| len
< n
))
1020 size_t pa
=encode_utf16(*psz
, cc
);
1022 if (pa
== (size_t)-1)
1027 *buf
++ = ((char*)cc
)[1];
1028 *buf
++ = ((char*)cc
)[0];
1031 *buf
++ = ((char*)cc
)[3];
1032 *buf
++ = ((char*)cc
)[2];
1036 len
+= pa
*sizeof(wxUint16
);
1039 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
1047 // ----------------------------------------------------------------------------
1049 // ----------------------------------------------------------------------------
1051 #ifdef WORDS_BIGENDIAN
1052 #define wxMBConvUTF32straight wxMBConvUTF32BE
1053 #define wxMBConvUTF32swap wxMBConvUTF32LE
1055 #define wxMBConvUTF32swap wxMBConvUTF32BE
1056 #define wxMBConvUTF32straight wxMBConvUTF32LE
1060 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1061 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1066 // copy 32bit MB to 16bit String
1067 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1071 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1075 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
1076 if (pa
== (size_t)-1)
1086 psz
+= sizeof(wxUint32
);
1088 if (buf
&& len
<n
) *buf
=0;
1094 // copy 16bit String to 32bit MB
1095 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1099 while (*psz
&& (!buf
|| len
< n
))
1103 // cast is ok for WC_UTF16
1104 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1105 if (pa
== (size_t)-1)
1110 *(wxUint32
*)buf
= cc
;
1111 buf
+= sizeof(wxUint32
);
1113 len
+= sizeof(wxUint32
);
1117 if (buf
&& len
<=n
-sizeof(wxUint32
))
1125 // swap 32bit MB to 16bit String
1126 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1130 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1133 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
1134 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
1139 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
1140 if (pa
== (size_t)-1)
1150 psz
+= sizeof(wxUint32
);
1160 // swap 16bit String to 32bit MB
1161 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1165 while (*psz
&& (!buf
|| len
< n
))
1169 // cast is ok for WC_UTF16
1170 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1171 if (pa
== (size_t)-1)
1181 len
+= sizeof(wxUint32
);
1185 if (buf
&& len
<=n
-sizeof(wxUint32
))
1194 // copy 32bit MB to 32bit String
1195 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1199 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1202 *buf
++ = (wchar_t)(*(wxUint32
*)psz
);
1204 psz
+= sizeof(wxUint32
);
1214 // copy 32bit String to 32bit MB
1215 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1219 while (*psz
&& (!buf
|| len
< n
))
1223 *(wxUint32
*)buf
= *psz
;
1224 buf
+= sizeof(wxUint32
);
1227 len
+= sizeof(wxUint32
);
1231 if (buf
&& len
<=n
-sizeof(wxUint32
))
1238 // swap 32bit MB to 32bit String
1239 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1243 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1247 ((char *)buf
)[0] = psz
[3];
1248 ((char *)buf
)[1] = psz
[2];
1249 ((char *)buf
)[2] = psz
[1];
1250 ((char *)buf
)[3] = psz
[0];
1254 psz
+= sizeof(wxUint32
);
1264 // swap 32bit String to 32bit MB
1265 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1269 while (*psz
&& (!buf
|| len
< n
))
1273 *buf
++ = ((char *)psz
)[3];
1274 *buf
++ = ((char *)psz
)[2];
1275 *buf
++ = ((char *)psz
)[1];
1276 *buf
++ = ((char *)psz
)[0];
1278 len
+= sizeof(wxUint32
);
1282 if (buf
&& len
<=n
-sizeof(wxUint32
))
1292 // ============================================================================
1293 // The classes doing conversion using the iconv_xxx() functions
1294 // ============================================================================
1298 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1299 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1300 // (unless there's yet another bug in glibc) the only case when iconv()
1301 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1302 // left in the input buffer -- when _real_ error occurs,
1303 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1305 // [This bug does not appear in glibc 2.2.]
1306 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1307 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1308 (errno != E2BIG || bufLeft != 0))
1310 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1313 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1315 #define ICONV_T_INVALID ((iconv_t)-1)
1317 #if SIZEOF_WCHAR_T == 4
1318 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1319 #define WC_ENC wxFONTENCODING_UTF32
1320 #elif SIZEOF_WCHAR_T == 2
1321 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1322 #define WC_ENC wxFONTENCODING_UTF16
1323 #else // sizeof(wchar_t) != 2 nor 4
1324 // does this ever happen?
1325 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1328 // ----------------------------------------------------------------------------
1329 // wxMBConv_iconv: encapsulates an iconv character set
1330 // ----------------------------------------------------------------------------
1332 class wxMBConv_iconv
: public wxMBConv
1335 wxMBConv_iconv(const wxChar
*name
);
1336 virtual ~wxMBConv_iconv();
1338 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1339 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1342 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1345 // the iconv handlers used to translate from multibyte to wide char and in
1346 // the other direction
1350 // guards access to m2w and w2m objects
1351 wxMutex m_iconvMutex
;
1355 virtual const char *GetMBNul(size_t *nulLen
) const;
1357 // the name (for iconv_open()) of a wide char charset -- if none is
1358 // available on this machine, it will remain NULL
1359 static wxString ms_wcCharsetName
;
1361 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1362 // different endian-ness than the native one
1363 static bool ms_wcNeedsSwap
;
1365 // NUL representation
1370 // make the constructor available for unit testing
1371 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1373 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1374 if ( !result
->IsOk() )
1382 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1383 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1385 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1387 m_nulLen
= (size_t)-2;
1389 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1390 // names for the charsets
1391 const wxCharBuffer
cname(wxString(name
).ToAscii());
1393 // check for charset that represents wchar_t:
1394 if ( ms_wcCharsetName
.empty() )
1396 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1399 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1400 #else // !wxUSE_FONTMAP
1401 static const wxChar
*names
[] =
1403 #if SIZEOF_WCHAR_T == 4
1405 #elif SIZEOF_WCHAR_T = 2
1410 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1412 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1414 const wxString
nameCS(*names
);
1416 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1417 wxString
nameXE(nameCS
);
1418 #ifdef WORDS_BIGENDIAN
1420 #else // little endian
1424 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1427 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1428 if ( m2w
== ICONV_T_INVALID
)
1430 // try charset w/o bytesex info (e.g. "UCS4")
1431 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1433 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1435 // and check for bytesex ourselves:
1436 if ( m2w
!= ICONV_T_INVALID
)
1438 char buf
[2], *bufPtr
;
1439 wchar_t wbuf
[2], *wbufPtr
;
1447 outsz
= SIZEOF_WCHAR_T
* 2;
1451 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1452 (char**)&wbufPtr
, &outsz
);
1454 if (ICONV_FAILED(res
, insz
))
1456 wxLogLastError(wxT("iconv"));
1457 wxLogError(_("Conversion to charset '%s' doesn't work."),
1460 else // ok, can convert to this encoding, remember it
1462 ms_wcCharsetName
= nameCS
;
1463 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1467 else // use charset not requiring byte swapping
1469 ms_wcCharsetName
= nameXE
;
1473 wxLogTrace(TRACE_STRCONV
,
1474 wxT("iconv wchar_t charset is \"%s\"%s"),
1475 ms_wcCharsetName
.empty() ? _T("<none>")
1476 : ms_wcCharsetName
.c_str(),
1477 ms_wcNeedsSwap
? _T(" (needs swap)")
1480 else // we already have ms_wcCharsetName
1482 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1485 if ( ms_wcCharsetName
.empty() )
1487 w2m
= ICONV_T_INVALID
;
1491 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1492 if ( w2m
== ICONV_T_INVALID
)
1494 wxLogTrace(TRACE_STRCONV
,
1495 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1496 ms_wcCharsetName
.c_str(), cname
.data());
1501 wxMBConv_iconv::~wxMBConv_iconv()
1503 if ( m2w
!= ICONV_T_INVALID
)
1505 if ( w2m
!= ICONV_T_INVALID
)
1509 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1512 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1513 // Unfortunately there are a couple of global wxCSConv objects such as
1514 // wxConvLocal that are used all over wx code, so we have to make sure
1515 // the handle is used by at most one thread at a time. Otherwise
1516 // only a few wx classes would be safe to use from non-main threads
1517 // as MB<->WC conversion would fail "randomly".
1518 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1521 size_t inbuf
= strlen(psz
);
1522 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1524 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1525 wchar_t *bufPtr
= buf
;
1526 const char *pszPtr
= psz
;
1530 // have destination buffer, convert there
1532 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1533 (char**)&bufPtr
, &outbuf
);
1534 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1538 // convert to native endianness
1539 for ( unsigned i
= 0; i
< res
; i
++ )
1540 buf
[n
] = WC_BSWAP(buf
[i
]);
1543 // NB: iconv was given only strlen(psz) characters on input, and so
1544 // it couldn't convert the trailing zero. Let's do it ourselves
1545 // if there's some room left for it in the output buffer.
1551 // no destination buffer... convert using temp buffer
1552 // to calculate destination buffer requirement
1557 outbuf
= 8*SIZEOF_WCHAR_T
;
1560 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1561 (char**)&bufPtr
, &outbuf
);
1563 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1564 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1567 if (ICONV_FAILED(cres
, inbuf
))
1569 //VS: it is ok if iconv fails, hence trace only
1570 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1577 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1580 // NB: explained in MB2WC
1581 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1584 size_t inlen
= wxWcslen(psz
);
1585 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1589 wchar_t *tmpbuf
= 0;
1593 // need to copy to temp buffer to switch endianness
1594 // (doing WC_BSWAP twice on the original buffer won't help, as it
1595 // could be in read-only memory, or be accessed in some other thread)
1596 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1597 for ( size_t i
= 0; i
< inlen
; i
++ )
1598 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1599 tmpbuf
[inlen
] = L
'\0';
1605 // have destination buffer, convert there
1606 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1610 // NB: iconv was given only wcslen(psz) characters on input, and so
1611 // it couldn't convert the trailing zero. Let's do it ourselves
1612 // if there's some room left for it in the output buffer.
1618 // no destination buffer... convert using temp buffer
1619 // to calculate destination buffer requirement
1623 buf
= tbuf
; outbuf
= 16;
1625 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1628 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1636 if (ICONV_FAILED(cres
, inbuf
))
1638 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1645 const char *wxMBConv_iconv::GetMBNul(size_t *nulLen
) const
1647 if ( m_nulLen
== (size_t)-2 )
1649 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1652 // NB: explained in MB2WC
1653 wxMutexLocker
lock(self
->m_iconvMutex
);
1656 wchar_t *wnul
= L
"";
1657 size_t inLen
= sizeof(wchar_t),
1658 outLen
= WXSIZEOF(m_nulBuf
);
1659 char *in
= (char *)wnul
,
1660 *out
= self
->m_nulBuf
;
1661 if ( iconv(w2m
, &in
, &inLen
, &out
, &outLen
) == (size_t)-1 )
1663 self
->m_nulLen
= (size_t)-1;
1667 self
->m_nulLen
= out
- m_nulBuf
;
1675 #endif // HAVE_ICONV
1678 // ============================================================================
1679 // Win32 conversion classes
1680 // ============================================================================
1682 #ifdef wxHAVE_WIN32_MB2WC
1686 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1687 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1690 class wxMBConv_win32
: public wxMBConv
1695 m_CodePage
= CP_ACP
;
1696 m_nulLen
= (size_t)-2;
1700 wxMBConv_win32(const wxChar
* name
)
1702 m_CodePage
= wxCharsetToCodepage(name
);
1703 m_nulLen
= (size_t)-2;
1706 wxMBConv_win32(wxFontEncoding encoding
)
1708 m_CodePage
= wxEncodingToCodepage(encoding
);
1709 m_nulLen
= (size_t)-2;
1711 #endif // wxUSE_FONTMAP
1713 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1715 // note that we have to use the MB_ERR_INVALID_CHARS flag as without it
1716 // the behaviour is not compatible with the Unix version (using iconv)
1717 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1718 // wouldn't work if reading an incomplete MB char didn't result in an
1721 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1722 // an error (tested under Windows Server 2003) and apparently it is
1723 // done on purpose, i.e. the function accepts any input in this case
1724 // and although I'd prefer to return error on ill-formed output, our
1725 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1726 // explicitly ill-formed according to RFC 2152) neither so we don't
1727 // even have any fallback here...
1729 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1730 // Win XP or newer and if it is specified on older versions, conversion
1731 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1732 // fails. So we can only use the flag on newer Windows versions.
1733 // Additionally, the flag is not supported by UTF7, symbol and CJK
1734 // encodings. See here:
1735 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1736 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1738 if ( m_CodePage
!= CP_UTF7
&& m_CodePage
!= CP_SYMBOL
&&
1739 m_CodePage
< 50000 &&
1740 IsAtLeastWin2kSP4() )
1742 flags
= MB_ERR_INVALID_CHARS
;
1744 else if ( m_CodePage
== CP_UTF8
)
1746 // Avoid round-trip in the special case of UTF-8 by using our
1747 // own UTF-8 conversion code:
1748 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
1751 const size_t len
= ::MultiByteToWideChar
1753 m_CodePage
, // code page
1754 flags
, // flags: fall on error
1755 psz
, // input string
1756 -1, // its length (NUL-terminated)
1757 buf
, // output string
1758 buf
? n
: 0 // size of output buffer
1762 // function totally failed
1766 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1767 // check if we succeeded, by doing a double trip:
1768 if ( !flags
&& buf
)
1770 const size_t mbLen
= strlen(psz
);
1771 wxCharBuffer
mbBuf(mbLen
);
1772 if ( ::WideCharToMultiByte
1779 mbLen
+ 1, // size in bytes, not length
1783 strcmp(mbBuf
, psz
) != 0 )
1785 // we didn't obtain the same thing we started from, hence
1786 // the conversion was lossy and we consider that it failed
1791 // note that it returns count of written chars for buf != NULL and size
1792 // of the needed buffer for buf == NULL so in either case the length of
1793 // the string (which never includes the terminating NUL) is one less
1797 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1800 we have a problem here: by default, WideCharToMultiByte() may
1801 replace characters unrepresentable in the target code page with bad
1802 quality approximations such as turning "1/2" symbol (U+00BD) into
1803 "1" for the code pages which don't have it and we, obviously, want
1804 to avoid this at any price
1806 the trouble is that this function does it _silently_, i.e. it won't
1807 even tell us whether it did or not... Win98/2000 and higher provide
1808 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1809 we have to resort to a round trip, i.e. check that converting back
1810 results in the same string -- this is, of course, expensive but
1811 otherwise we simply can't be sure to not garble the data.
1814 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1815 // it doesn't work with CJK encodings (which we test for rather roughly
1816 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1818 BOOL usedDef
wxDUMMY_INITIALIZE(false);
1821 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1823 // it's our lucky day
1824 flags
= WC_NO_BEST_FIT_CHARS
;
1825 pUsedDef
= &usedDef
;
1827 else // old system or unsupported encoding
1833 const size_t len
= ::WideCharToMultiByte
1835 m_CodePage
, // code page
1836 flags
, // either none or no best fit
1837 pwz
, // input string
1838 -1, // it is (wide) NUL-terminated
1839 buf
, // output buffer
1840 buf
? n
: 0, // and its size
1841 NULL
, // default "replacement" char
1842 pUsedDef
// [out] was it used?
1847 // function totally failed
1851 // if we were really converting, check if we succeeded
1856 // check if the conversion failed, i.e. if any replacements
1861 else // we must resort to double tripping...
1863 wxWCharBuffer
wcBuf(n
);
1864 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1865 wcscmp(wcBuf
, pwz
) != 0 )
1867 // we didn't obtain the same thing we started from, hence
1868 // the conversion was lossy and we consider that it failed
1874 // see the comment above for the reason of "len - 1"
1878 bool IsOk() const { return m_CodePage
!= -1; }
1881 static bool CanUseNoBestFit()
1883 static int s_isWin98Or2k
= -1;
1885 if ( s_isWin98Or2k
== -1 )
1888 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
1891 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
1895 s_isWin98Or2k
= verMaj
>= 5;
1899 // unknown, be conseravtive by default
1903 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1906 return s_isWin98Or2k
== 1;
1909 static bool IsAtLeastWin2kSP4()
1914 static int s_isAtLeastWin2kSP4
= -1;
1916 if ( s_isAtLeastWin2kSP4
== -1 )
1918 OSVERSIONINFOEX ver
;
1920 memset(&ver
, 0, sizeof(ver
));
1921 ver
.dwOSVersionInfoSize
= sizeof(ver
);
1922 GetVersionEx((OSVERSIONINFO
*)&ver
);
1924 s_isAtLeastWin2kSP4
=
1925 ((ver
.dwMajorVersion
> 5) || // Vista+
1926 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
1927 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
1928 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
1932 return s_isAtLeastWin2kSP4
== 1;
1936 virtual const char *GetMBNul(size_t *nulLen
) const
1938 if ( m_nulLen
== (size_t)-2 )
1940 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
1942 self
->m_nulLen
= ::WideCharToMultiByte
1944 m_CodePage
, // code page
1946 L
"", // input string
1947 1, // translate just NUL
1948 self
->m_nulBuf
, // output buffer
1949 WXSIZEOF(m_nulBuf
), // and its size
1950 NULL
, // "replacement" char
1951 NULL
// [out] was it used?
1954 if ( m_nulLen
== 0 )
1955 self
->m_nulLen
= (size_t)-1;
1967 #endif // wxHAVE_WIN32_MB2WC
1969 // ============================================================================
1970 // Cocoa conversion classes
1971 // ============================================================================
1973 #if defined(__WXCOCOA__)
1975 // RN: There is no UTF-32 support in either Core Foundation or
1976 // Cocoa. Strangely enough, internally Core Foundation uses
1977 // UTF 32 internally quite a bit - its just not public (yet).
1979 #include <CoreFoundation/CFString.h>
1980 #include <CoreFoundation/CFStringEncodingExt.h>
1982 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
1984 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
1985 if ( encoding
== wxFONTENCODING_DEFAULT
)
1987 enc
= CFStringGetSystemEncoding();
1989 else switch( encoding
)
1991 case wxFONTENCODING_ISO8859_1
:
1992 enc
= kCFStringEncodingISOLatin1
;
1994 case wxFONTENCODING_ISO8859_2
:
1995 enc
= kCFStringEncodingISOLatin2
;
1997 case wxFONTENCODING_ISO8859_3
:
1998 enc
= kCFStringEncodingISOLatin3
;
2000 case wxFONTENCODING_ISO8859_4
:
2001 enc
= kCFStringEncodingISOLatin4
;
2003 case wxFONTENCODING_ISO8859_5
:
2004 enc
= kCFStringEncodingISOLatinCyrillic
;
2006 case wxFONTENCODING_ISO8859_6
:
2007 enc
= kCFStringEncodingISOLatinArabic
;
2009 case wxFONTENCODING_ISO8859_7
:
2010 enc
= kCFStringEncodingISOLatinGreek
;
2012 case wxFONTENCODING_ISO8859_8
:
2013 enc
= kCFStringEncodingISOLatinHebrew
;
2015 case wxFONTENCODING_ISO8859_9
:
2016 enc
= kCFStringEncodingISOLatin5
;
2018 case wxFONTENCODING_ISO8859_10
:
2019 enc
= kCFStringEncodingISOLatin6
;
2021 case wxFONTENCODING_ISO8859_11
:
2022 enc
= kCFStringEncodingISOLatinThai
;
2024 case wxFONTENCODING_ISO8859_13
:
2025 enc
= kCFStringEncodingISOLatin7
;
2027 case wxFONTENCODING_ISO8859_14
:
2028 enc
= kCFStringEncodingISOLatin8
;
2030 case wxFONTENCODING_ISO8859_15
:
2031 enc
= kCFStringEncodingISOLatin9
;
2034 case wxFONTENCODING_KOI8
:
2035 enc
= kCFStringEncodingKOI8_R
;
2037 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2038 enc
= kCFStringEncodingDOSRussian
;
2041 // case wxFONTENCODING_BULGARIAN :
2045 case wxFONTENCODING_CP437
:
2046 enc
=kCFStringEncodingDOSLatinUS
;
2048 case wxFONTENCODING_CP850
:
2049 enc
= kCFStringEncodingDOSLatin1
;
2051 case wxFONTENCODING_CP852
:
2052 enc
= kCFStringEncodingDOSLatin2
;
2054 case wxFONTENCODING_CP855
:
2055 enc
= kCFStringEncodingDOSCyrillic
;
2057 case wxFONTENCODING_CP866
:
2058 enc
=kCFStringEncodingDOSRussian
;
2060 case wxFONTENCODING_CP874
:
2061 enc
= kCFStringEncodingDOSThai
;
2063 case wxFONTENCODING_CP932
:
2064 enc
= kCFStringEncodingDOSJapanese
;
2066 case wxFONTENCODING_CP936
:
2067 enc
=kCFStringEncodingDOSChineseSimplif
;
2069 case wxFONTENCODING_CP949
:
2070 enc
= kCFStringEncodingDOSKorean
;
2072 case wxFONTENCODING_CP950
:
2073 enc
= kCFStringEncodingDOSChineseTrad
;
2075 case wxFONTENCODING_CP1250
:
2076 enc
= kCFStringEncodingWindowsLatin2
;
2078 case wxFONTENCODING_CP1251
:
2079 enc
=kCFStringEncodingWindowsCyrillic
;
2081 case wxFONTENCODING_CP1252
:
2082 enc
=kCFStringEncodingWindowsLatin1
;
2084 case wxFONTENCODING_CP1253
:
2085 enc
= kCFStringEncodingWindowsGreek
;
2087 case wxFONTENCODING_CP1254
:
2088 enc
= kCFStringEncodingWindowsLatin5
;
2090 case wxFONTENCODING_CP1255
:
2091 enc
=kCFStringEncodingWindowsHebrew
;
2093 case wxFONTENCODING_CP1256
:
2094 enc
=kCFStringEncodingWindowsArabic
;
2096 case wxFONTENCODING_CP1257
:
2097 enc
= kCFStringEncodingWindowsBalticRim
;
2099 // This only really encodes to UTF7 (if that) evidently
2100 // case wxFONTENCODING_UTF7 :
2101 // enc = kCFStringEncodingNonLossyASCII ;
2103 case wxFONTENCODING_UTF8
:
2104 enc
= kCFStringEncodingUTF8
;
2106 case wxFONTENCODING_EUC_JP
:
2107 enc
= kCFStringEncodingEUC_JP
;
2109 case wxFONTENCODING_UTF16
:
2110 enc
= kCFStringEncodingUnicode
;
2112 case wxFONTENCODING_MACROMAN
:
2113 enc
= kCFStringEncodingMacRoman
;
2115 case wxFONTENCODING_MACJAPANESE
:
2116 enc
= kCFStringEncodingMacJapanese
;
2118 case wxFONTENCODING_MACCHINESETRAD
:
2119 enc
= kCFStringEncodingMacChineseTrad
;
2121 case wxFONTENCODING_MACKOREAN
:
2122 enc
= kCFStringEncodingMacKorean
;
2124 case wxFONTENCODING_MACARABIC
:
2125 enc
= kCFStringEncodingMacArabic
;
2127 case wxFONTENCODING_MACHEBREW
:
2128 enc
= kCFStringEncodingMacHebrew
;
2130 case wxFONTENCODING_MACGREEK
:
2131 enc
= kCFStringEncodingMacGreek
;
2133 case wxFONTENCODING_MACCYRILLIC
:
2134 enc
= kCFStringEncodingMacCyrillic
;
2136 case wxFONTENCODING_MACDEVANAGARI
:
2137 enc
= kCFStringEncodingMacDevanagari
;
2139 case wxFONTENCODING_MACGURMUKHI
:
2140 enc
= kCFStringEncodingMacGurmukhi
;
2142 case wxFONTENCODING_MACGUJARATI
:
2143 enc
= kCFStringEncodingMacGujarati
;
2145 case wxFONTENCODING_MACORIYA
:
2146 enc
= kCFStringEncodingMacOriya
;
2148 case wxFONTENCODING_MACBENGALI
:
2149 enc
= kCFStringEncodingMacBengali
;
2151 case wxFONTENCODING_MACTAMIL
:
2152 enc
= kCFStringEncodingMacTamil
;
2154 case wxFONTENCODING_MACTELUGU
:
2155 enc
= kCFStringEncodingMacTelugu
;
2157 case wxFONTENCODING_MACKANNADA
:
2158 enc
= kCFStringEncodingMacKannada
;
2160 case wxFONTENCODING_MACMALAJALAM
:
2161 enc
= kCFStringEncodingMacMalayalam
;
2163 case wxFONTENCODING_MACSINHALESE
:
2164 enc
= kCFStringEncodingMacSinhalese
;
2166 case wxFONTENCODING_MACBURMESE
:
2167 enc
= kCFStringEncodingMacBurmese
;
2169 case wxFONTENCODING_MACKHMER
:
2170 enc
= kCFStringEncodingMacKhmer
;
2172 case wxFONTENCODING_MACTHAI
:
2173 enc
= kCFStringEncodingMacThai
;
2175 case wxFONTENCODING_MACLAOTIAN
:
2176 enc
= kCFStringEncodingMacLaotian
;
2178 case wxFONTENCODING_MACGEORGIAN
:
2179 enc
= kCFStringEncodingMacGeorgian
;
2181 case wxFONTENCODING_MACARMENIAN
:
2182 enc
= kCFStringEncodingMacArmenian
;
2184 case wxFONTENCODING_MACCHINESESIMP
:
2185 enc
= kCFStringEncodingMacChineseSimp
;
2187 case wxFONTENCODING_MACTIBETAN
:
2188 enc
= kCFStringEncodingMacTibetan
;
2190 case wxFONTENCODING_MACMONGOLIAN
:
2191 enc
= kCFStringEncodingMacMongolian
;
2193 case wxFONTENCODING_MACETHIOPIC
:
2194 enc
= kCFStringEncodingMacEthiopic
;
2196 case wxFONTENCODING_MACCENTRALEUR
:
2197 enc
= kCFStringEncodingMacCentralEurRoman
;
2199 case wxFONTENCODING_MACVIATNAMESE
:
2200 enc
= kCFStringEncodingMacVietnamese
;
2202 case wxFONTENCODING_MACARABICEXT
:
2203 enc
= kCFStringEncodingMacExtArabic
;
2205 case wxFONTENCODING_MACSYMBOL
:
2206 enc
= kCFStringEncodingMacSymbol
;
2208 case wxFONTENCODING_MACDINGBATS
:
2209 enc
= kCFStringEncodingMacDingbats
;
2211 case wxFONTENCODING_MACTURKISH
:
2212 enc
= kCFStringEncodingMacTurkish
;
2214 case wxFONTENCODING_MACCROATIAN
:
2215 enc
= kCFStringEncodingMacCroatian
;
2217 case wxFONTENCODING_MACICELANDIC
:
2218 enc
= kCFStringEncodingMacIcelandic
;
2220 case wxFONTENCODING_MACROMANIAN
:
2221 enc
= kCFStringEncodingMacRomanian
;
2223 case wxFONTENCODING_MACCELTIC
:
2224 enc
= kCFStringEncodingMacCeltic
;
2226 case wxFONTENCODING_MACGAELIC
:
2227 enc
= kCFStringEncodingMacGaelic
;
2229 // case wxFONTENCODING_MACKEYBOARD :
2230 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2233 // because gcc is picky
2239 class wxMBConv_cocoa
: public wxMBConv
2244 Init(CFStringGetSystemEncoding()) ;
2248 wxMBConv_cocoa(const wxChar
* name
)
2250 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2254 wxMBConv_cocoa(wxFontEncoding encoding
)
2256 Init( wxCFStringEncFromFontEnc(encoding
) );
2263 void Init( CFStringEncoding encoding
)
2265 m_encoding
= encoding
;
2268 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2272 CFStringRef theString
= CFStringCreateWithBytes (
2273 NULL
, //the allocator
2274 (const UInt8
*)szUnConv
,
2277 false //no BOM/external representation
2280 wxASSERT(theString
);
2282 size_t nOutLength
= CFStringGetLength(theString
);
2286 CFRelease(theString
);
2290 CFRange theRange
= { 0, nOutSize
};
2292 #if SIZEOF_WCHAR_T == 4
2293 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2296 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2298 CFRelease(theString
);
2300 szUniCharBuffer
[nOutLength
] = '\0' ;
2302 #if SIZEOF_WCHAR_T == 4
2303 wxMBConvUTF16 converter
;
2304 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
2305 delete[] szUniCharBuffer
;
2311 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2315 size_t nRealOutSize
;
2316 size_t nBufSize
= wxWcslen(szUnConv
);
2317 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2319 #if SIZEOF_WCHAR_T == 4
2320 wxMBConvUTF16 converter
;
2321 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2322 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
2323 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
2324 nBufSize
/= sizeof(UniChar
);
2327 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2331 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2334 wxASSERT(theString
);
2336 //Note that CER puts a BOM when converting to unicode
2337 //so we check and use getchars instead in that case
2338 if (m_encoding
== kCFStringEncodingUnicode
)
2341 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2343 nRealOutSize
= CFStringGetLength(theString
) + 1;
2349 CFRangeMake(0, CFStringGetLength(theString
)),
2351 0, //what to put in characters that can't be converted -
2352 //0 tells CFString to return NULL if it meets such a character
2353 false, //not an external representation
2356 (CFIndex
*) &nRealOutSize
2360 CFRelease(theString
);
2362 #if SIZEOF_WCHAR_T == 4
2363 delete[] szUniBuffer
;
2366 return nRealOutSize
- 1;
2371 return m_encoding
!= kCFStringEncodingInvalidId
&&
2372 CFStringIsEncodingAvailable(m_encoding
);
2376 CFStringEncoding m_encoding
;
2379 #endif // defined(__WXCOCOA__)
2381 // ============================================================================
2382 // Mac conversion classes
2383 // ============================================================================
2385 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2387 class wxMBConv_mac
: public wxMBConv
2392 Init(CFStringGetSystemEncoding()) ;
2396 wxMBConv_mac(const wxChar
* name
)
2398 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2402 wxMBConv_mac(wxFontEncoding encoding
)
2404 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2409 OSStatus status
= noErr
;
2410 status
= TECDisposeConverter(m_MB2WC_converter
);
2411 status
= TECDisposeConverter(m_WC2MB_converter
);
2415 void Init( TextEncodingBase encoding
)
2417 OSStatus status
= noErr
;
2418 m_char_encoding
= encoding
;
2419 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2421 status
= TECCreateConverter(&m_MB2WC_converter
,
2423 m_unicode_encoding
);
2424 status
= TECCreateConverter(&m_WC2MB_converter
,
2429 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2431 OSStatus status
= noErr
;
2432 ByteCount byteOutLen
;
2433 ByteCount byteInLen
= strlen(psz
) ;
2434 wchar_t *tbuf
= NULL
;
2435 UniChar
* ubuf
= NULL
;
2440 //apple specs say at least 32
2441 n
= wxMax( 32 , byteInLen
) ;
2442 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2444 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2445 #if SIZEOF_WCHAR_T == 4
2446 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2448 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2450 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2451 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2452 #if SIZEOF_WCHAR_T == 4
2453 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2454 // is not properly terminated we get random characters at the end
2455 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2456 wxMBConvUTF16 converter
;
2457 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2460 res
= byteOutLen
/ sizeof( UniChar
) ;
2465 if ( buf
&& res
< n
)
2471 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2473 OSStatus status
= noErr
;
2474 ByteCount byteOutLen
;
2475 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2481 //apple specs say at least 32
2482 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2483 tbuf
= (char*) malloc( n
) ;
2486 ByteCount byteBufferLen
= n
;
2487 UniChar
* ubuf
= NULL
;
2488 #if SIZEOF_WCHAR_T == 4
2489 wxMBConvUTF16 converter
;
2490 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2491 byteInLen
= unicharlen
;
2492 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2493 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2495 ubuf
= (UniChar
*) psz
;
2497 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2498 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2499 #if SIZEOF_WCHAR_T == 4
2505 size_t res
= byteOutLen
;
2506 if ( buf
&& res
< n
)
2510 //we need to double-trip to verify it didn't insert any ? in place
2511 //of bogus characters
2512 wxWCharBuffer
wcBuf(n
);
2513 size_t pszlen
= wxWcslen(psz
);
2514 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2515 wxWcslen(wcBuf
) != pszlen
||
2516 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2518 // we didn't obtain the same thing we started from, hence
2519 // the conversion was lossy and we consider that it failed
2528 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2531 TECObjectRef m_MB2WC_converter
;
2532 TECObjectRef m_WC2MB_converter
;
2534 TextEncodingBase m_char_encoding
;
2535 TextEncodingBase m_unicode_encoding
;
2538 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2540 // ============================================================================
2541 // wxEncodingConverter based conversion classes
2542 // ============================================================================
2546 class wxMBConv_wxwin
: public wxMBConv
2551 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2552 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2556 // temporarily just use wxEncodingConverter stuff,
2557 // so that it works while a better implementation is built
2558 wxMBConv_wxwin(const wxChar
* name
)
2561 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2563 m_enc
= wxFONTENCODING_SYSTEM
;
2568 wxMBConv_wxwin(wxFontEncoding enc
)
2575 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2577 size_t inbuf
= strlen(psz
);
2580 if (!m2w
.Convert(psz
,buf
))
2586 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2588 const size_t inbuf
= wxWcslen(psz
);
2591 if (!w2m
.Convert(psz
,buf
))
2598 bool IsOk() const { return m_ok
; }
2601 wxFontEncoding m_enc
;
2602 wxEncodingConverter m2w
, w2m
;
2605 virtual const char *GetMBNul(size_t *nulLen
) const
2609 case wxFONTENCODING_UTF16BE
:
2610 case wxFONTENCODING_UTF16LE
:
2614 case wxFONTENCODING_UTF32BE
:
2615 case wxFONTENCODING_UTF32LE
:
2625 // were we initialized successfully?
2628 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2631 // make the constructors available for unit testing
2632 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2634 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2635 if ( !result
->IsOk() )
2643 #endif // wxUSE_FONTMAP
2645 // ============================================================================
2646 // wxCSConv implementation
2647 // ============================================================================
2649 void wxCSConv::Init()
2656 wxCSConv::wxCSConv(const wxChar
*charset
)
2666 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2668 m_encoding
= wxFONTENCODING_SYSTEM
;
2672 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2674 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2676 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2678 encoding
= wxFONTENCODING_SYSTEM
;
2683 m_encoding
= encoding
;
2686 wxCSConv::~wxCSConv()
2691 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2696 SetName(conv
.m_name
);
2697 m_encoding
= conv
.m_encoding
;
2700 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2704 SetName(conv
.m_name
);
2705 m_encoding
= conv
.m_encoding
;
2710 void wxCSConv::Clear()
2719 void wxCSConv::SetName(const wxChar
*charset
)
2723 m_name
= wxStrdup(charset
);
2729 #include "wx/hashmap.h"
2731 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2732 wxEncodingNameCache
);
2734 static wxEncodingNameCache gs_nameCache
;
2737 wxMBConv
*wxCSConv::DoCreate() const
2740 wxLogTrace(TRACE_STRCONV
,
2741 wxT("creating conversion for %s"),
2743 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
2744 #endif // wxUSE_FONTMAP
2746 // check for the special case of ASCII or ISO8859-1 charset: as we have
2747 // special knowledge of it anyhow, we don't need to create a special
2748 // conversion object
2749 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2750 m_encoding
== wxFONTENCODING_DEFAULT
)
2752 // don't convert at all
2756 // we trust OS to do conversion better than we can so try external
2757 // conversion methods first
2759 // the full order is:
2760 // 1. OS conversion (iconv() under Unix or Win32 API)
2761 // 2. hard coded conversions for UTF
2762 // 3. wxEncodingConverter as fall back
2768 #endif // !wxUSE_FONTMAP
2770 wxString
name(m_name
);
2771 wxFontEncoding
encoding(m_encoding
);
2773 if ( !name
.empty() )
2775 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2783 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2784 #endif // wxUSE_FONTMAP
2788 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2789 if ( it
!= gs_nameCache
.end() )
2791 if ( it
->second
.empty() )
2794 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
2801 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2803 for ( ; *names
; ++names
)
2805 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
2808 gs_nameCache
[encoding
] = *names
;
2815 gs_nameCache
[encoding
] = _T(""); // cache the failure
2817 #endif // wxUSE_FONTMAP
2819 #endif // HAVE_ICONV
2821 #ifdef wxHAVE_WIN32_MB2WC
2824 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2825 : new wxMBConv_win32(m_encoding
);
2834 #endif // wxHAVE_WIN32_MB2WC
2835 #if defined(__WXMAC__)
2837 // leave UTF16 and UTF32 to the built-ins of wx
2838 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2839 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2843 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2844 : new wxMBConv_mac(m_encoding
);
2846 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
2855 #if defined(__WXCOCOA__)
2857 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
2861 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
2862 : new wxMBConv_cocoa(m_encoding
);
2864 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
2874 wxFontEncoding enc
= m_encoding
;
2876 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2878 // use "false" to suppress interactive dialogs -- we can be called from
2879 // anywhere and popping up a dialog from here is the last thing we want to
2881 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2883 #endif // wxUSE_FONTMAP
2887 case wxFONTENCODING_UTF7
:
2888 return new wxMBConvUTF7
;
2890 case wxFONTENCODING_UTF8
:
2891 return new wxMBConvUTF8
;
2893 case wxFONTENCODING_UTF16BE
:
2894 return new wxMBConvUTF16BE
;
2896 case wxFONTENCODING_UTF16LE
:
2897 return new wxMBConvUTF16LE
;
2899 case wxFONTENCODING_UTF32BE
:
2900 return new wxMBConvUTF32BE
;
2902 case wxFONTENCODING_UTF32LE
:
2903 return new wxMBConvUTF32LE
;
2906 // nothing to do but put here to suppress gcc warnings
2913 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2914 : new wxMBConv_wxwin(m_encoding
);
2920 #endif // wxUSE_FONTMAP
2922 // NB: This is a hack to prevent deadlock. What could otherwise happen
2923 // in Unicode build: wxConvLocal creation ends up being here
2924 // because of some failure and logs the error. But wxLog will try to
2925 // attach timestamp, for which it will need wxConvLocal (to convert
2926 // time to char* and then wchar_t*), but that fails, tries to log
2927 // error, but wxLog has a (already locked) critical section that
2928 // guards static buffer.
2929 static bool alreadyLoggingError
= false;
2930 if (!alreadyLoggingError
)
2932 alreadyLoggingError
= true;
2933 wxLogError(_("Cannot convert from the charset '%s'!"),
2937 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
2938 #else // !wxUSE_FONTMAP
2939 wxString::Format(_("encoding %s"), m_encoding
).c_str()
2940 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2942 alreadyLoggingError
= false;
2948 void wxCSConv::CreateConvIfNeeded() const
2952 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
2955 // if we don't have neither the name nor the encoding, use the default
2956 // encoding for this system
2957 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
2959 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
2961 #endif // wxUSE_INTL
2963 self
->m_convReal
= DoCreate();
2964 self
->m_deferred
= false;
2968 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2970 CreateConvIfNeeded();
2973 return m_convReal
->MB2WC(buf
, psz
, n
);
2976 size_t len
= strlen(psz
);
2980 for (size_t c
= 0; c
<= len
; c
++)
2981 buf
[c
] = (unsigned char)(psz
[c
]);
2987 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2989 CreateConvIfNeeded();
2992 return m_convReal
->WC2MB(buf
, psz
, n
);
2995 const size_t len
= wxWcslen(psz
);
2998 for (size_t c
= 0; c
<= len
; c
++)
3002 buf
[c
] = (char)psz
[c
];
3007 for (size_t c
= 0; c
<= len
; c
++)
3017 const char *wxCSConv::GetMBNul(size_t *nulLen
) const
3019 CreateConvIfNeeded();
3023 // cast needed just to call private function of m_convReal
3024 return ((wxCSConv
*)m_convReal
)->GetMBNul(nulLen
);
3031 // ----------------------------------------------------------------------------
3033 // ----------------------------------------------------------------------------
3036 static wxMBConv_win32 wxConvLibcObj
;
3037 #elif defined(__WXMAC__) && !defined(__MACH__)
3038 static wxMBConv_mac wxConvLibcObj
;
3040 static wxMBConvLibc wxConvLibcObj
;
3043 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3044 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3045 static wxMBConvUTF7 wxConvUTF7Obj
;
3046 static wxMBConvUTF8 wxConvUTF8Obj
;
3048 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3049 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3050 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3051 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3052 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
3053 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3054 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3062 #else // !wxUSE_WCHAR_T
3064 // stand-ins in absence of wchar_t
3065 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3070 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T