1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
35 #include "wx/strconv.h"
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
62 #include "wx/thread.h"
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
76 #include "wx/mac/private.h" // includes mac headers
79 #define TRACE_STRCONV _T("strconv")
81 #if SIZEOF_WCHAR_T == 2
85 // ============================================================================
87 // ============================================================================
89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
90 static bool NotAllNULs(const char *p
, size_t n
)
92 while ( n
&& *p
++ == '\0' )
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
103 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
108 *output
= (wxUint16
) input
;
111 else if (input
>=0x110000)
119 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
120 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
126 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
128 if ((*input
<0xd800) || (*input
>0xdfff))
133 else if ((input
[1]<0xdc00) || (input
[1]>0xdfff))
140 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
146 // ----------------------------------------------------------------------------
148 // ----------------------------------------------------------------------------
151 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
152 const char *src
, size_t srcLen
) const
154 // although new conversion classes are supposed to implement this function
155 // directly, the existing ones only implement the old MB2WC() and so, to
156 // avoid having to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten
= 0;
164 // the number of NULs terminating this string
165 size_t nulLen
wxDUMMY_INITIALIZE(0);
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
173 if ( srcLen
!= (size_t)-1 )
175 // we need to know how to find the end of this string
176 nulLen
= GetMBNulLen();
177 if ( nulLen
== wxCONV_FAILED
)
178 return wxCONV_FAILED
;
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
185 char * const p
= bufTmp
.data();
186 memcpy(p
, src
, srcLen
);
187 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
193 srcEnd
= src
+ srcLen
;
195 else // quit after the first loop iteration
202 // try to convert the current chunk
203 size_t lenChunk
= MB2WC(NULL
, src
, 0);
206 // nothing left in the input string, conversion succeeded
210 if ( lenChunk
== wxCONV_FAILED
)
211 return wxCONV_FAILED
;
213 // if we already have a previous chunk, leave the NUL separating it
222 dstWritten
+= lenChunk
;
226 if ( dstWritten
> dstLen
)
227 return wxCONV_FAILED
;
229 lenChunk
= MB2WC(dst
, src
, lenChunk
+ 1 /* for NUL */);
230 if ( lenChunk
== wxCONV_FAILED
)
231 return wxCONV_FAILED
;
238 // we convert the entire string in this case, as we suppose that the
239 // string is NUL-terminated and so srcEnd is not used at all
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src
, nulLen
) )
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
253 src
+= nulLen
; // skipping over its terminator as well
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by srcEnd
266 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
267 const wchar_t *src
, size_t srcLen
) const
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten
= 0;
272 // make a copy of the input string unless it is already properly
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp
;
278 if ( srcLen
== (size_t)-1 )
280 srcLen
= wxWcslen(src
) + 1;
282 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp
= wxWCharBuffer(srcLen
);
286 memcpy(bufTmp
.data(), src
, srcLen
*sizeof(wchar_t));
290 const size_t lenNul
= GetMBNulLen();
291 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
293 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
295 // try to convert the current chunk
296 size_t lenChunk
= WC2MB(NULL
, src
, 0);
298 if ( lenChunk
== wxCONV_FAILED
)
299 return wxCONV_FAILED
;
302 dstWritten
+= lenChunk
;
306 if ( dstWritten
> dstLen
)
307 return wxCONV_FAILED
;
309 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
310 return wxCONV_FAILED
;
319 wxMBConv::~wxMBConv()
321 // nothing to do here (necessary for Darwin linking probably)
324 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
328 // calculate the length of the buffer needed first
329 const size_t nLen
= MB2WC(NULL
, psz
, 0);
330 if ( nLen
!= wxCONV_FAILED
)
332 // now do the actual conversion
333 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
335 // +1 for the trailing NULL
336 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
341 return wxWCharBuffer();
344 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
348 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
349 if ( nLen
!= wxCONV_FAILED
)
351 // extra space for trailing NUL(s)
352 static const size_t extraLen
= GetMaxMBNulLen();
354 wxCharBuffer
buf(nLen
+ extraLen
- 1);
355 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
360 return wxCharBuffer();
364 wxMBConv::cMB2WC(const char *in
, size_t inLen
, size_t *outLen
) const
366 const size_t dstLen
= ToWChar(NULL
, 0, in
, inLen
);
367 if ( dstLen
!= wxCONV_FAILED
)
369 wxWCharBuffer
wbuf(dstLen
);
370 if ( ToWChar(wbuf.data(), dstLen, in, inLen) != wxCONV_FAILED ) // compare against wxCONV_FAILED like the other conversion call sites; a bare truth test cannot tell failure from success
381 return wxWCharBuffer();
385 wxMBConv::cWC2MB(const wchar_t *in
, size_t inLen
, size_t *outLen
) const
387 const size_t dstLen
= FromWChar(NULL
, 0, in
, inLen
);
388 if ( dstLen
!= wxCONV_FAILED
)
390 wxCharBuffer
buf(dstLen
);
391 if ( FromWChar(buf.data(), dstLen, in, inLen) != wxCONV_FAILED ) // compare against wxCONV_FAILED like the other conversion call sites; a bare truth test cannot tell failure from success
402 return wxCharBuffer();
405 // ----------------------------------------------------------------------------
407 // ----------------------------------------------------------------------------
409 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
411 return wxMB2WC(buf
, psz
, n
);
414 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
416 return wxWC2MB(buf
, psz
, n
);
419 // ----------------------------------------------------------------------------
420 // wxConvBrokenFileNames
421 // ----------------------------------------------------------------------------
425 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
427 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
428 || wxStricmp(charset
, _T("UTF8")) == 0 )
429 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
431 m_conv
= new wxCSConv(charset
);
436 // ----------------------------------------------------------------------------
438 // ----------------------------------------------------------------------------
440 // Implementation (C) 2004 Fredrik Roubert
443 // BASE64 decoding table
445 static const unsigned char utf7unb64
[] =
447 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
448 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
449 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
450 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
451 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
452 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
453 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
454 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
455 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
456 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
457 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
458 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
459 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
460 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
461 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
462 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
463 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
464 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
465 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
466 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
467 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
468 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
469 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
470 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
471 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
472 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
473 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
474 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
475 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
476 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
477 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
478 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
481 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
485 while ( *psz
&& (!buf
|| (len
< n
)) )
487 unsigned char cc
= *psz
++;
495 else if (*psz
== '-')
503 else // start of BASE64 encoded string
507 for ( ok
= lsb
= false, d
= 0, l
= 0;
508 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
513 for (l
+= 6; l
>= 8; lsb
= !lsb
)
515 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
525 *buf
= (wchar_t)(c
<< 8);
534 // in valid UTF7 we should have valid characters after '+'
543 if ( buf
&& (len
< n
) )
550 // BASE64 encoding table
552 static const unsigned char utf7enb64
[] =
554 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
555 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
556 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
557 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
558 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
559 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
560 'w', 'x', 'y', 'z', '0', '1', '2', '3',
561 '4', '5', '6', '7', '8', '9', '+', '/'
565 // UTF-7 encoding table
567 // 0 - Set D (directly encoded characters)
568 // 1 - Set O (optional direct characters)
569 // 2 - whitespace characters (optional)
570 // 3 - special characters
572 static const unsigned char utf7encode
[128] =
574 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
575 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
576 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
577 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
578 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
580 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
584 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
588 while (*psz
&& ((!buf
) || (len
< n
)))
591 if (cc
< 0x80 && utf7encode
[cc
] < 1)
599 else if (((wxUint32
)cc
) > 0xffff)
601 // no surrogate pair generation (yet?)
612 // BASE64 encode string
613 unsigned int lsb
, d
, l
;
614 for (d
= 0, l
= 0; /*nothing*/; psz
++)
616 for (lsb
= 0; lsb
< 2; lsb
++)
619 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
621 for (l
+= 8; l
>= 6; )
625 *buf
++ = utf7enb64
[(d
>> l
) % 64];
630 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
636 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
645 if (buf
&& (len
< n
))
650 // ----------------------------------------------------------------------------
652 // ----------------------------------------------------------------------------
654 static wxUint32 utf8_max
[]=
655 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
657 // boundaries of the private use area we use to (temporarily) remap invalid
658 // characters in a UTF-8 encoded string
659 const wxUint32 wxUnicodePUA
= 0x100000;
660 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
662 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
666 while (*psz
&& ((!buf
) || (len
< n
)))
668 const char *opsz
= psz
;
669 bool invalid
= false;
670 unsigned char cc
= *psz
++, fc
= cc
;
672 for (cnt
= 0; fc
& 0x80; cnt
++)
681 // escape the escape character for octal escapes
682 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
683 && cc
== '\\' && (!buf
|| len
< n
))
695 // invalid UTF-8 sequence
700 unsigned ocnt
= cnt
- 1;
701 wxUint32 res
= cc
& (0x3f >> cnt
);
705 if ((cc
& 0xC0) != 0x80)
707 // invalid UTF-8 sequence
712 res
= (res
<< 6) | (cc
& 0x3f);
714 if (invalid
|| res
<= utf8_max
[ocnt
])
716 // illegal UTF-8 encoding
719 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
720 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
722 // if one of our PUA characters turns up externally
723 // it must also be treated as an illegal sequence
724 // (a bit like you have to escape an escape character)
730 // cast is ok because wchar_t == wxUint16 if WC_UTF16
731 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
732 if (pa
== (size_t)-1)
744 *buf
++ = (wchar_t)res
;
746 #endif // WC_UTF16/!WC_UTF16
751 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
753 while (opsz
< psz
&& (!buf
|| len
< n
))
756 // cast is ok because wchar_t == wxUint16 if WC_UTF16
757 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
758 wxASSERT(pa
!= (size_t)-1);
765 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
771 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
773 while (opsz
< psz
&& (!buf
|| len
< n
))
775 if ( buf
&& len
+ 3 < n
)
777 unsigned char on
= *opsz
;
779 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
780 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
781 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
787 else // MAP_INVALID_UTF8_NOT
794 if (buf
&& (len
< n
))
799 static inline bool isoctal(wchar_t wch
)
801 return L
'0' <= wch
&& wch
<= L
'7';
804 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
808 while (*psz
&& ((!buf
) || (len
< n
)))
812 // cast is ok for WC_UTF16
813 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
814 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
816 cc
=(*psz
++) & 0x7fffffff;
819 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
820 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
823 *buf
++ = (char)(cc
- wxUnicodePUA
);
826 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
827 && cc
== L
'\\' && psz
[0] == L
'\\' )
834 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
836 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
840 *buf
++ = (char) ((psz
[0] - L
'0')*0100 +
841 (psz
[1] - L
'0')*010 +
851 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
865 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
867 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
879 // ----------------------------------------------------------------------------
881 // ----------------------------------------------------------------------------
883 #ifdef WORDS_BIGENDIAN
884 #define wxMBConvUTF16straight wxMBConvUTF16BE
885 #define wxMBConvUTF16swap wxMBConvUTF16LE
887 #define wxMBConvUTF16swap wxMBConvUTF16BE
888 #define wxMBConvUTF16straight wxMBConvUTF16LE
894 // copy 16bit MB to 16bit String
895 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
899 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
902 *buf
++ = *(wxUint16
*)psz
;
905 psz
+= sizeof(wxUint16
);
907 if (buf
&& len
<n
) *buf
=0;
913 // copy 16bit String to 16bit MB
914 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
918 while (*psz
&& (!buf
|| len
< n
))
922 *(wxUint16
*)buf
= *psz
;
923 buf
+= sizeof(wxUint16
);
925 len
+= sizeof(wxUint16
);
928 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
934 // swap 16bit MB to 16bit String
935 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
939 // UTF16 string must be terminated by 2 NULs as single NULs may occur
941 while ( (psz
[0] || psz
[1]) && (!buf
|| len
< n
) )
945 ((char *)buf
)[0] = psz
[1];
946 ((char *)buf
)[1] = psz
[0];
953 if ( buf
&& len
< n
)
960 // swap 16bit String to 16bit MB
961 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
965 while ( *psz
&& (!buf
|| len
< n
) )
969 *buf
++ = ((char*)psz
)[1];
970 *buf
++ = ((char*)psz
)[0];
976 if ( buf
&& len
< n
)
986 // copy 16bit MB to 32bit String
987 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
991 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
994 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
995 if (pa
== (size_t)-1)
999 *buf
++ = (wchar_t)cc
;
1001 psz
+= pa
* sizeof(wxUint16
);
1003 if (buf
&& len
<n
) *buf
=0;
1009 // copy 32bit String to 16bit MB
1010 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1014 while (*psz
&& (!buf
|| len
< n
))
1017 size_t pa
=encode_utf16(*psz
, cc
);
1019 if (pa
== (size_t)-1)
1024 *(wxUint16
*)buf
= cc
[0];
1025 buf
+= sizeof(wxUint16
);
1028 *(wxUint16
*)buf
= cc
[1];
1029 buf
+= sizeof(wxUint16
);
1033 len
+= pa
*sizeof(wxUint16
);
1036 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
1042 // swap 16bit MB to 32bit String
1043 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1047 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1051 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
1052 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
1054 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
1055 if (pa
== (size_t)-1)
1059 *buf
++ = (wchar_t)cc
;
1062 psz
+= pa
* sizeof(wxUint16
);
1064 if (buf
&& len
<n
) *buf
=0;
1070 // swap 32bit String to 16bit MB
1071 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1075 while (*psz
&& (!buf
|| len
< n
))
1078 size_t pa
=encode_utf16(*psz
, cc
);
1080 if (pa
== (size_t)-1)
1085 *buf
++ = ((char*)cc
)[1];
1086 *buf
++ = ((char*)cc
)[0];
1089 *buf
++ = ((char*)cc
)[3];
1090 *buf
++ = ((char*)cc
)[2];
1094 len
+= pa
*sizeof(wxUint16
);
1097 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
1105 // ----------------------------------------------------------------------------
1107 // ----------------------------------------------------------------------------
1109 #ifdef WORDS_BIGENDIAN
1110 #define wxMBConvUTF32straight wxMBConvUTF32BE
1111 #define wxMBConvUTF32swap wxMBConvUTF32LE
1113 #define wxMBConvUTF32swap wxMBConvUTF32BE
1114 #define wxMBConvUTF32straight wxMBConvUTF32LE
1118 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1119 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1124 // copy 32bit MB to 16bit String
1125 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1129 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1133 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
1134 if (pa
== (size_t)-1)
1144 psz
+= sizeof(wxUint32
);
1146 if (buf
&& len
<n
) *buf
=0;
1152 // copy 16bit String to 32bit MB
1153 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1157 while (*psz
&& (!buf
|| len
< n
))
1161 // cast is ok for WC_UTF16
1162 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1163 if (pa
== (size_t)-1)
1168 *(wxUint32
*)buf
= cc
;
1169 buf
+= sizeof(wxUint32
);
1171 len
+= sizeof(wxUint32
);
1175 if (buf
&& len
<=n
-sizeof(wxUint32
))
1183 // swap 32bit MB to 16bit String
1184 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1188 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1191 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
1192 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
1197 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
1198 if (pa
== (size_t)-1)
1208 psz
+= sizeof(wxUint32
);
1218 // swap 16bit String to 32bit MB
1219 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1223 while (*psz
&& (!buf
|| len
< n
))
1227 // cast is ok for WC_UTF16
1228 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1229 if (pa
== (size_t)-1)
1239 len
+= sizeof(wxUint32
);
1243 if (buf
&& len
<=n
-sizeof(wxUint32
))
1252 // copy 32bit MB to 32bit String
1253 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1257 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1260 *buf
++ = (wchar_t)(*(wxUint32
*)psz
);
1262 psz
+= sizeof(wxUint32
);
1272 // copy 32bit String to 32bit MB
1273 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1277 while (*psz
&& (!buf
|| len
< n
))
1281 *(wxUint32
*)buf
= *psz
;
1282 buf
+= sizeof(wxUint32
);
1285 len
+= sizeof(wxUint32
);
1289 if (buf
&& len
<=n
-sizeof(wxUint32
))
1296 // swap 32bit MB to 32bit String
1297 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1301 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1305 ((char *)buf
)[0] = psz
[3];
1306 ((char *)buf
)[1] = psz
[2];
1307 ((char *)buf
)[2] = psz
[1];
1308 ((char *)buf
)[3] = psz
[0];
1312 psz
+= sizeof(wxUint32
);
1322 // swap 32bit String to 32bit MB
1323 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1327 while (*psz
&& (!buf
|| len
< n
))
1331 *buf
++ = ((char *)psz
)[3];
1332 *buf
++ = ((char *)psz
)[2];
1333 *buf
++ = ((char *)psz
)[1];
1334 *buf
++ = ((char *)psz
)[0];
1336 len
+= sizeof(wxUint32
);
1340 if (buf
&& len
<=n
-sizeof(wxUint32
))
1350 // ============================================================================
1351 // The classes doing conversion using the iconv_xxx() functions
1352 // ============================================================================
1356 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1357 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1358 // (unless there's yet another bug in glibc) the only case when iconv()
1359 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1360 // left in the input buffer -- when _real_ error occurs,
1361 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1363 // [This bug does not appear in glibc 2.2.]
1364 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1365 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1366 (errno != E2BIG || bufLeft != 0))
1368 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1371 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1373 #define ICONV_T_INVALID ((iconv_t)-1)
1375 #if SIZEOF_WCHAR_T == 4
1376 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1377 #define WC_ENC wxFONTENCODING_UTF32
1378 #elif SIZEOF_WCHAR_T == 2
1379 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1380 #define WC_ENC wxFONTENCODING_UTF16
1381 #else // sizeof(wchar_t) != 2 nor 4
1382 // does this ever happen?
1383 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1386 // ----------------------------------------------------------------------------
1387 // wxMBConv_iconv: encapsulates an iconv character set
1388 // ----------------------------------------------------------------------------
1390 class wxMBConv_iconv
: public wxMBConv
1393 wxMBConv_iconv(const wxChar
*name
);
1394 virtual ~wxMBConv_iconv();
1396 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1397 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1399 // classify this encoding as explained in wxMBConv::GetMBNulLen()
1401 virtual size_t GetMBNulLen() const;
1404 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1407 // the iconv handlers used to translate from multibyte to wide char and in
1408 // the other direction
1412 // guards access to m2w and w2m objects
1413 wxMutex m_iconvMutex
;
1417 // the name (for iconv_open()) of a wide char charset -- if none is
1418 // available on this machine, it will remain empty
1419 static wxString ms_wcCharsetName
;
1421 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1422 // different endian-ness than the native one
1423 static bool ms_wcNeedsSwap
;
1425 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1427 size_t m_minMBCharWidth
;
1430 // make the constructor available for unit testing
1431 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1433 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1434 if ( !result
->IsOk() )
1442 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1443 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1445 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1447 m_minMBCharWidth
= 0;
1449 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1450 // names for the charsets
1451 const wxCharBuffer
cname(wxString(name
).ToAscii());
1453 // check for charset that represents wchar_t:
1454 if ( ms_wcCharsetName
.empty() )
1456 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1459 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1460 #else // !wxUSE_FONTMAP
1461 static const wxChar
*names
[] =
1463 #if SIZEOF_WCHAR_T == 4
1465 #elif SIZEOF_WCHAR_T == 2
1470 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1472 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1474 const wxString
nameCS(*names
);
1476 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1477 wxString
nameXE(nameCS
);
1478 #ifdef WORDS_BIGENDIAN
1480 #else // little endian
1484 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1487 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1488 if ( m2w
== ICONV_T_INVALID
)
1490 // try charset w/o bytesex info (e.g. "UCS4")
1491 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1493 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1495 // and check for bytesex ourselves:
1496 if ( m2w
!= ICONV_T_INVALID
)
1498 char buf
[2], *bufPtr
;
1499 wchar_t wbuf
[2], *wbufPtr
;
1507 outsz
= SIZEOF_WCHAR_T
* 2;
1511 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1512 (char**)&wbufPtr
, &outsz
);
1514 if (ICONV_FAILED(res
, insz
))
1516 wxLogLastError(wxT("iconv"));
1517 wxLogError(_("Conversion to charset '%s' doesn't work."),
1520 else // ok, can convert to this encoding, remember it
1522 ms_wcCharsetName
= nameCS
;
1523 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1527 else // use charset not requiring byte swapping
1529 ms_wcCharsetName
= nameXE
;
1533 wxLogTrace(TRACE_STRCONV
,
1534 wxT("iconv wchar_t charset is \"%s\"%s"),
1535 ms_wcCharsetName
.empty() ? _T("<none>")
1536 : ms_wcCharsetName
.c_str(),
1537 ms_wcNeedsSwap
? _T(" (needs swap)")
1540 else // we already have ms_wcCharsetName
1542 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1545 if ( ms_wcCharsetName
.empty() )
1547 w2m
= ICONV_T_INVALID
;
1551 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1552 if ( w2m
== ICONV_T_INVALID
)
1554 wxLogTrace(TRACE_STRCONV
,
1555 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1556 ms_wcCharsetName
.c_str(), cname
.data());
1561 wxMBConv_iconv::~wxMBConv_iconv()
1563 if ( m2w
!= ICONV_T_INVALID
)
1565 if ( w2m
!= ICONV_T_INVALID
)
1569 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1571 // find the string length: notice that must be done differently for
1572 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1574 const size_t nulLen
= GetMBNulLen();
1581 inbuf
= strlen(psz
); // arguably more optimized than our version
1586 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1587 // they also have to start at character boundary and not span two
1588 // adjacent characters
1590 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1597 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1598 // Unfortunately there are a couple of global wxCSConv objects such as
1599 // wxConvLocal that are used all over wx code, so we have to make sure
1600 // the handle is used by at most one thread at a time. Otherwise
1601 // only a few wx classes would be safe to use from non-main threads
1602 // as MB<->WC conversion would fail "randomly".
1603 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1604 #endif // wxUSE_THREADS
1607 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1609 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1610 wchar_t *bufPtr
= buf
;
1611 const char *pszPtr
= psz
;
1615 // have destination buffer, convert there
1617 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1618 (char**)&bufPtr
, &outbuf
);
1619 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1623 // convert to native endianness
1624 for ( unsigned i = 0; i < res; i++ )
1625 buf[i] = WC_BSWAP(buf[i]); // swap each converted element in place; the target index was n (the output capacity), so no element was actually swapped
1628 // NUL-terminate the string if there is any space left
1634 // no destination buffer... convert using temp buffer
1635 // to calculate destination buffer requirement
1640 outbuf
= 8*SIZEOF_WCHAR_T
;
1643 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1644 (char**)&bufPtr
, &outbuf
);
1646 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1647 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1650 if (ICONV_FAILED(cres
, inbuf
))
1652 //VS: it is ok if iconv fails, hence trace only
1653 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1660 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1663 // NB: explained in MB2WC
1664 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1667 size_t inlen
= wxWcslen(psz
);
1668 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1672 wchar_t *tmpbuf
= 0;
1676 // need to copy to temp buffer to switch endianness
1677 // (doing WC_BSWAP twice on the original buffer won't help, as it
1678 // could be in read-only memory, or be accessed in some other thread)
1679 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1680 for ( size_t i = 0; i < inlen; i++ )
1681 tmpbuf[i] = WC_BSWAP(psz[i]); // fill the byte-swapped copy element by element; the target index was n (the caller's size argument), not the loop index
1682 tmpbuf
[inlen
] = L
'\0';
1688 // have destination buffer, convert there
1689 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1693 // NB: iconv was given only wcslen(psz) characters on input, and so
1694 // it couldn't convert the trailing zero. Let's do it ourselves
1695 // if there's some room left for it in the output buffer.
1701 // no destination buffer... convert using temp buffer
1702 // to calculate destination buffer requirement
1706 buf
= tbuf
; outbuf
= 16;
1708 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1711 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1719 if (ICONV_FAILED(cres
, inbuf
))
1721 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
// Return the cached value of m_minMBCharWidth. While it is still 0 the value
// is computed by converting a single wide NUL with iconv(): the number of
// bytes produced is stored, or (size_t)-1 if the conversion fails.
1728 size_t wxMBConv_iconv::GetMBNulLen() const
1730 if ( m_minMBCharWidth
== 0 )
// the method is const, so the cache is updated through a non-const pointer
1732 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1735 // NB: explained in MB2WC
1736 wxMutexLocker
lock(self
->m_iconvMutex
);
// a string literal may only be bound to a pointer to const (the implicit
// conversion to a non-const pointer is ill-formed since C++11); the
// C-style cast below still yields the char* that iconv() expects
1739 const wchar_t *wnul
= L
"";
1740 char buf
[8]; // should be enough for NUL in any encoding
1741 size_t inLen
= sizeof(wchar_t),
1742 outLen
= WXSIZEOF(buf
);
1743 char *in
= (char *)wnul
;
1745 if ( iconv(w2m
, ICONV_CHAR_CAST(&in
), &inLen
, &out
, &outLen
) == (size_t)-1 )
// conversion failed: store (size_t)-1 instead of a width
1747 self
->m_minMBCharWidth
= (size_t)-1;
// the distance between out and the start of buf is the number of bytes
// written for the NUL
1751 self
->m_minMBCharWidth
= out
- buf
;
1755 return m_minMBCharWidth
;
1758 #endif // HAVE_ICONV
1761 // ============================================================================
1762 // Win32 conversion classes
1763 // ============================================================================
1765 #ifdef wxHAVE_WIN32_MB2WC
1769 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1770 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1773 class wxMBConv_win32
: public wxMBConv
1778 m_CodePage
= CP_ACP
;
1779 m_minMBCharWidth
= 0;
1783 wxMBConv_win32(const wxChar
* name
)
1785 m_CodePage
= wxCharsetToCodepage(name
);
1786 m_minMBCharWidth
= 0;
1789 wxMBConv_win32(wxFontEncoding encoding
)
1791 m_CodePage
= wxEncodingToCodepage(encoding
);
1792 m_minMBCharWidth
= 0;
1794 #endif // wxUSE_FONTMAP
// Convert the NUL-terminated multibyte string psz from the code page
// m_CodePage to wide characters using ::MultiByteToWideChar(). When buf is
// NULL, 0 is passed as the output size instead of n. Without
// MB_ERR_INVALID_CHARS, UTF-8 input is handed to wxMBConvUTF8 and any other
// real conversion is verified by converting back and comparing with psz.
1796 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1798 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
1799 // the behaviour is not compatible with the Unix version (using iconv)
1800 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1801 // wouldn't work if reading an incomplete MB char didn't result in an
1804 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1805 // an error (tested under Windows Server 2003) and apparently it is
1806 // done on purpose, i.e. the function accepts any input in this case
1807 // and although I'd prefer to return error on ill-formed output, our
1808 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1809 // explicitly ill-formed according to RFC 2152) either so we don't
1810 // even have any fallback here...
1812 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1813 // Win XP or newer and if it is specified on older versions, conversion
1814 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1815 // fails. So we can only use the flag on newer Windows versions.
1816 // Additionally, the flag is not supported by UTF7, symbol and CJK
1817 // encodings. See here:
1818 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1819 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1821 if ( m_CodePage
!= CP_UTF7
&& m_CodePage
!= CP_SYMBOL
&&
1822 m_CodePage
< 50000 &&
1823 IsAtLeastWin2kSP4() )
1825 flags
= MB_ERR_INVALID_CHARS
;
1827 else if ( m_CodePage
== CP_UTF8
)
1829 // Avoid round-trip in the special case of UTF-8 by using our
1830 // own UTF-8 conversion code:
1831 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
1834 const size_t len
= ::MultiByteToWideChar
1836 m_CodePage
, // code page
1837 flags
, // flags: fail on error
1838 psz
, // input string
1839 -1, // its length (NUL-terminated)
1840 buf
, // output string
1841 buf
? n
: 0 // size of output buffer
1845 // function totally failed
1849 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1850 // check if we succeeded, by doing a double trip:
1851 if ( !flags
&& buf
)
1853 const size_t mbLen
= strlen(psz
);
1854 wxCharBuffer
mbBuf(mbLen
);
1855 if ( ::WideCharToMultiByte
1862 mbLen
+ 1, // size in bytes, not length
1866 strcmp(mbBuf
, psz
) != 0 )
1868 // we didn't obtain the same thing we started from, hence
1869 // the conversion was lossy and we consider that it failed
1874 // note that it returns count of written chars for buf != NULL and size
1875 // of the needed buffer for buf == NULL so in either case the length of
1876 // the string (which never includes the terminating NUL) is one less
1880 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1883 we have a problem here: by default, WideCharToMultiByte() may
1884 replace characters unrepresentable in the target code page with bad
1885 quality approximations such as turning "1/2" symbol (U+00BD) into
1886 "1" for the code pages which don't have it and we, obviously, want
1887 to avoid this at any price
1889 the trouble is that this function does it _silently_, i.e. it won't
1890 even tell us whether it did or not... Win98/2000 and higher provide
1891 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1892 we have to resort to a round trip, i.e. check that converting back
1893 results in the same string -- this is, of course, expensive but
1894 otherwise we simply can't be sure to not garble the data.
1897 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1898 // it doesn't work with CJK encodings (which we test for rather roughly
1899 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1901 BOOL usedDef
wxDUMMY_INITIALIZE(false);
1904 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1906 // it's our lucky day
1907 flags
= WC_NO_BEST_FIT_CHARS
;
1908 pUsedDef
= &usedDef
;
1910 else // old system or unsupported encoding
1916 const size_t len
= ::WideCharToMultiByte
1918 m_CodePage
, // code page
1919 flags
, // either none or no best fit
1920 pwz
, // input string
1921 -1, // it is (wide) NUL-terminated
1922 buf
, // output buffer
1923 buf
? n
: 0, // and its size
1924 NULL
, // default "replacement" char
1925 pUsedDef
// [out] was it used?
1930 // function totally failed
1934 // if we were really converting, check if we succeeded
1939 // check if the conversion failed, i.e. if any replacements
1944 else // we must resort to double tripping...
1946 wxWCharBuffer
wcBuf(n
);
1947 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1948 wcscmp(wcBuf
, pwz
) != 0 )
1950 // we didn't obtain the same thing we started from, hence
1951 // the conversion was lossy and we consider that it failed
1957 // see the comment above for the reason of "len - 1"
1961 virtual size_t GetMBNulLen() const
1963 if ( m_minMBCharWidth
== 0 )
1965 int len
= ::WideCharToMultiByte
1967 m_CodePage
, // code page
1969 L
"", // input string
1970 1, // translate just the NUL
1971 NULL
, // output buffer
1973 NULL
, // no replacement char
1974 NULL
// [out] don't care if it was used
1977 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
1981 wxLogDebug(_T("Unexpected NUL length %d"), len
);
1985 self
->m_minMBCharWidth
= (size_t)-1;
1991 self
->m_minMBCharWidth
= len
;
1996 return m_minMBCharWidth
;
1999 bool IsOk() const { return m_CodePage
!= -1; }
// Tell whether the running Windows version is recent enough for WC2MB() to
// rely on WC_NO_BEST_FIT_CHARS. The answer is computed from
// wxGetOsVersion() on the first call and kept in a function-local static,
// where -1 means "not determined yet".
2002 static bool CanUseNoBestFit()
2004 static int s_isWin98Or2k
= -1;
2006 if ( s_isWin98Or2k
== -1 )
2009 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
// version 4.10 or later
// NOTE(review): the case labels are not visible here; presumably this is
// the Win9x branch -- confirm
2012 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
// major version 5 or later
// NOTE(review): presumably the NT branch -- confirm
2016 s_isWin98Or2k
= verMaj
>= 5;
2020 // unknown, be conservative by default
2024 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2027 return s_isWin98Or2k
== 1;
2030 static bool IsAtLeastWin2kSP4()
2035 static int s_isAtLeastWin2kSP4
= -1;
2037 if ( s_isAtLeastWin2kSP4
== -1 )
2039 OSVERSIONINFOEX ver
;
2041 memset(&ver
, 0, sizeof(ver
));
2042 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2043 GetVersionEx((OSVERSIONINFO
*)&ver
);
2045 s_isAtLeastWin2kSP4
=
2046 ((ver
.dwMajorVersion
> 5) || // Vista+
2047 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2048 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2049 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2053 return s_isAtLeastWin2kSP4
== 1;
2058 // the code page we're working with
2061 // cached result of GetMBNulLen(), set to 0 initially meaning
2063 size_t m_minMBCharWidth
;
2066 #endif // wxHAVE_WIN32_MB2WC
2068 // ============================================================================
2069 // Cocoa conversion classes
2070 // ============================================================================
2072 #if defined(__WXCOCOA__)
2074 // RN: There is no UTF-32 support in either Core Foundation or
2075 // Cocoa. Strangely enough, Core Foundation uses UTF-32
2076 // internally quite a bit - it's just not public (yet).
2078 #include <CoreFoundation/CFString.h>
2079 #include <CoreFoundation/CFStringEncodingExt.h>
2081 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2083 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2084 if ( encoding
== wxFONTENCODING_DEFAULT
)
2086 enc
= CFStringGetSystemEncoding();
2088 else switch( encoding
)
2090 case wxFONTENCODING_ISO8859_1
:
2091 enc
= kCFStringEncodingISOLatin1
;
2093 case wxFONTENCODING_ISO8859_2
:
2094 enc
= kCFStringEncodingISOLatin2
;
2096 case wxFONTENCODING_ISO8859_3
:
2097 enc
= kCFStringEncodingISOLatin3
;
2099 case wxFONTENCODING_ISO8859_4
:
2100 enc
= kCFStringEncodingISOLatin4
;
2102 case wxFONTENCODING_ISO8859_5
:
2103 enc
= kCFStringEncodingISOLatinCyrillic
;
2105 case wxFONTENCODING_ISO8859_6
:
2106 enc
= kCFStringEncodingISOLatinArabic
;
2108 case wxFONTENCODING_ISO8859_7
:
2109 enc
= kCFStringEncodingISOLatinGreek
;
2111 case wxFONTENCODING_ISO8859_8
:
2112 enc
= kCFStringEncodingISOLatinHebrew
;
2114 case wxFONTENCODING_ISO8859_9
:
2115 enc
= kCFStringEncodingISOLatin5
;
2117 case wxFONTENCODING_ISO8859_10
:
2118 enc
= kCFStringEncodingISOLatin6
;
2120 case wxFONTENCODING_ISO8859_11
:
2121 enc
= kCFStringEncodingISOLatinThai
;
2123 case wxFONTENCODING_ISO8859_13
:
2124 enc
= kCFStringEncodingISOLatin7
;
2126 case wxFONTENCODING_ISO8859_14
:
2127 enc
= kCFStringEncodingISOLatin8
;
2129 case wxFONTENCODING_ISO8859_15
:
2130 enc
= kCFStringEncodingISOLatin9
;
2133 case wxFONTENCODING_KOI8
:
2134 enc
= kCFStringEncodingKOI8_R
;
2136 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2137 enc
= kCFStringEncodingDOSRussian
;
2140 // case wxFONTENCODING_BULGARIAN :
2144 case wxFONTENCODING_CP437
:
2145 enc
=kCFStringEncodingDOSLatinUS
;
2147 case wxFONTENCODING_CP850
:
2148 enc
= kCFStringEncodingDOSLatin1
;
2150 case wxFONTENCODING_CP852
:
2151 enc
= kCFStringEncodingDOSLatin2
;
2153 case wxFONTENCODING_CP855
:
2154 enc
= kCFStringEncodingDOSCyrillic
;
2156 case wxFONTENCODING_CP866
:
2157 enc
=kCFStringEncodingDOSRussian
;
2159 case wxFONTENCODING_CP874
:
2160 enc
= kCFStringEncodingDOSThai
;
2162 case wxFONTENCODING_CP932
:
2163 enc
= kCFStringEncodingDOSJapanese
;
2165 case wxFONTENCODING_CP936
:
2166 enc
=kCFStringEncodingDOSChineseSimplif
;
2168 case wxFONTENCODING_CP949
:
2169 enc
= kCFStringEncodingDOSKorean
;
2171 case wxFONTENCODING_CP950
:
2172 enc
= kCFStringEncodingDOSChineseTrad
;
2174 case wxFONTENCODING_CP1250
:
2175 enc
= kCFStringEncodingWindowsLatin2
;
2177 case wxFONTENCODING_CP1251
:
2178 enc
=kCFStringEncodingWindowsCyrillic
;
2180 case wxFONTENCODING_CP1252
:
2181 enc
=kCFStringEncodingWindowsLatin1
;
2183 case wxFONTENCODING_CP1253
:
2184 enc
= kCFStringEncodingWindowsGreek
;
2186 case wxFONTENCODING_CP1254
:
2187 enc
= kCFStringEncodingWindowsLatin5
;
2189 case wxFONTENCODING_CP1255
:
2190 enc
=kCFStringEncodingWindowsHebrew
;
2192 case wxFONTENCODING_CP1256
:
2193 enc
=kCFStringEncodingWindowsArabic
;
2195 case wxFONTENCODING_CP1257
:
2196 enc
= kCFStringEncodingWindowsBalticRim
;
2198 // This only really encodes to UTF7 (if that) evidently
2199 // case wxFONTENCODING_UTF7 :
2200 // enc = kCFStringEncodingNonLossyASCII ;
2202 case wxFONTENCODING_UTF8
:
2203 enc
= kCFStringEncodingUTF8
;
2205 case wxFONTENCODING_EUC_JP
:
2206 enc
= kCFStringEncodingEUC_JP
;
2208 case wxFONTENCODING_UTF16
:
2209 enc
= kCFStringEncodingUnicode
;
2211 case wxFONTENCODING_MACROMAN
:
2212 enc
= kCFStringEncodingMacRoman
;
2214 case wxFONTENCODING_MACJAPANESE
:
2215 enc
= kCFStringEncodingMacJapanese
;
2217 case wxFONTENCODING_MACCHINESETRAD
:
2218 enc
= kCFStringEncodingMacChineseTrad
;
2220 case wxFONTENCODING_MACKOREAN
:
2221 enc
= kCFStringEncodingMacKorean
;
2223 case wxFONTENCODING_MACARABIC
:
2224 enc
= kCFStringEncodingMacArabic
;
2226 case wxFONTENCODING_MACHEBREW
:
2227 enc
= kCFStringEncodingMacHebrew
;
2229 case wxFONTENCODING_MACGREEK
:
2230 enc
= kCFStringEncodingMacGreek
;
2232 case wxFONTENCODING_MACCYRILLIC
:
2233 enc
= kCFStringEncodingMacCyrillic
;
2235 case wxFONTENCODING_MACDEVANAGARI
:
2236 enc
= kCFStringEncodingMacDevanagari
;
2238 case wxFONTENCODING_MACGURMUKHI
:
2239 enc
= kCFStringEncodingMacGurmukhi
;
2241 case wxFONTENCODING_MACGUJARATI
:
2242 enc
= kCFStringEncodingMacGujarati
;
2244 case wxFONTENCODING_MACORIYA
:
2245 enc
= kCFStringEncodingMacOriya
;
2247 case wxFONTENCODING_MACBENGALI
:
2248 enc
= kCFStringEncodingMacBengali
;
2250 case wxFONTENCODING_MACTAMIL
:
2251 enc
= kCFStringEncodingMacTamil
;
2253 case wxFONTENCODING_MACTELUGU
:
2254 enc
= kCFStringEncodingMacTelugu
;
2256 case wxFONTENCODING_MACKANNADA
:
2257 enc
= kCFStringEncodingMacKannada
;
2259 case wxFONTENCODING_MACMALAJALAM
:
2260 enc
= kCFStringEncodingMacMalayalam
;
2262 case wxFONTENCODING_MACSINHALESE
:
2263 enc
= kCFStringEncodingMacSinhalese
;
2265 case wxFONTENCODING_MACBURMESE
:
2266 enc
= kCFStringEncodingMacBurmese
;
2268 case wxFONTENCODING_MACKHMER
:
2269 enc
= kCFStringEncodingMacKhmer
;
2271 case wxFONTENCODING_MACTHAI
:
2272 enc
= kCFStringEncodingMacThai
;
2274 case wxFONTENCODING_MACLAOTIAN
:
2275 enc
= kCFStringEncodingMacLaotian
;
2277 case wxFONTENCODING_MACGEORGIAN
:
2278 enc
= kCFStringEncodingMacGeorgian
;
2280 case wxFONTENCODING_MACARMENIAN
:
2281 enc
= kCFStringEncodingMacArmenian
;
2283 case wxFONTENCODING_MACCHINESESIMP
:
2284 enc
= kCFStringEncodingMacChineseSimp
;
2286 case wxFONTENCODING_MACTIBETAN
:
2287 enc
= kCFStringEncodingMacTibetan
;
2289 case wxFONTENCODING_MACMONGOLIAN
:
2290 enc
= kCFStringEncodingMacMongolian
;
2292 case wxFONTENCODING_MACETHIOPIC
:
2293 enc
= kCFStringEncodingMacEthiopic
;
2295 case wxFONTENCODING_MACCENTRALEUR
:
2296 enc
= kCFStringEncodingMacCentralEurRoman
;
2298 case wxFONTENCODING_MACVIATNAMESE
:
2299 enc
= kCFStringEncodingMacVietnamese
;
2301 case wxFONTENCODING_MACARABICEXT
:
2302 enc
= kCFStringEncodingMacExtArabic
;
2304 case wxFONTENCODING_MACSYMBOL
:
2305 enc
= kCFStringEncodingMacSymbol
;
2307 case wxFONTENCODING_MACDINGBATS
:
2308 enc
= kCFStringEncodingMacDingbats
;
2310 case wxFONTENCODING_MACTURKISH
:
2311 enc
= kCFStringEncodingMacTurkish
;
2313 case wxFONTENCODING_MACCROATIAN
:
2314 enc
= kCFStringEncodingMacCroatian
;
2316 case wxFONTENCODING_MACICELANDIC
:
2317 enc
= kCFStringEncodingMacIcelandic
;
2319 case wxFONTENCODING_MACROMANIAN
:
2320 enc
= kCFStringEncodingMacRomanian
;
2322 case wxFONTENCODING_MACCELTIC
:
2323 enc
= kCFStringEncodingMacCeltic
;
2325 case wxFONTENCODING_MACGAELIC
:
2326 enc
= kCFStringEncodingMacGaelic
;
2328 // case wxFONTENCODING_MACKEYBOARD :
2329 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2332 // because gcc is picky
2338 class wxMBConv_cocoa
: public wxMBConv
2343 Init(CFStringGetSystemEncoding()) ;
2347 wxMBConv_cocoa(const wxChar
* name
)
2349 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2353 wxMBConv_cocoa(wxFontEncoding encoding
)
2355 Init( wxCFStringEncFromFontEnc(encoding
) );
2362 void Init( CFStringEncoding encoding
)
2364 m_encoding
= encoding
;
2367 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2371 CFStringRef theString
= CFStringCreateWithBytes (
2372 NULL
, //the allocator
2373 (const UInt8
*)szUnConv
,
2376 false //no BOM/external representation
2379 wxASSERT(theString
);
2381 size_t nOutLength
= CFStringGetLength(theString
);
2385 CFRelease(theString
);
2389 CFRange theRange
= { 0, nOutSize
};
2391 #if SIZEOF_WCHAR_T == 4
2392 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2395 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2397 CFRelease(theString
);
2399 szUniCharBuffer
[nOutLength
] = '\0' ;
2401 #if SIZEOF_WCHAR_T == 4
2402 wxMBConvUTF16 converter
;
2403 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
2404 delete[] szUniCharBuffer
;
2410 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2414 size_t nRealOutSize
;
2415 size_t nBufSize
= wxWcslen(szUnConv
);
2416 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2418 #if SIZEOF_WCHAR_T == 4
2419 wxMBConvUTF16 converter
;
2420 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2421 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
2422 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
2423 nBufSize
/= sizeof(UniChar
);
2426 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2430 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2433 wxASSERT(theString
);
2435 //Note that CER puts a BOM when converting to unicode
2436 //so we check and use getchars instead in that case
2437 if (m_encoding
== kCFStringEncodingUnicode
)
2440 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2442 nRealOutSize
= CFStringGetLength(theString
) + 1;
2448 CFRangeMake(0, CFStringGetLength(theString
)),
2450 0, //what to put in characters that can't be converted -
2451 //0 tells CFString to return NULL if it meets such a character
2452 false, //not an external representation
2455 (CFIndex
*) &nRealOutSize
2459 CFRelease(theString
);
2461 #if SIZEOF_WCHAR_T == 4
2462 delete[] szUniBuffer
;
2465 return nRealOutSize
- 1;
2470 return m_encoding
!= kCFStringEncodingInvalidId
&&
2471 CFStringIsEncodingAvailable(m_encoding
);
2475 CFStringEncoding m_encoding
;
2478 #endif // defined(__WXCOCOA__)
2480 // ============================================================================
2481 // Mac conversion classes
2482 // ============================================================================
2484 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2486 class wxMBConv_mac
: public wxMBConv
2491 Init(CFStringGetSystemEncoding()) ;
2495 wxMBConv_mac(const wxChar
* name
)
2497 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2501 wxMBConv_mac(wxFontEncoding encoding
)
2503 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2508 OSStatus status
= noErr
;
2509 status
= TECDisposeConverter(m_MB2WC_converter
);
2510 status
= TECDisposeConverter(m_WC2MB_converter
);
2514 void Init( TextEncodingBase encoding
)
2516 OSStatus status
= noErr
;
2517 m_char_encoding
= encoding
;
2518 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2520 status
= TECCreateConverter(&m_MB2WC_converter
,
2522 m_unicode_encoding
);
2523 status
= TECCreateConverter(&m_WC2MB_converter
,
2528 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2530 OSStatus status
= noErr
;
2531 ByteCount byteOutLen
;
2532 ByteCount byteInLen
= strlen(psz
) ;
2533 wchar_t *tbuf
= NULL
;
2534 UniChar
* ubuf
= NULL
;
2539 //apple specs say at least 32
2540 n
= wxMax( 32 , byteInLen
) ;
2541 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2543 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2544 #if SIZEOF_WCHAR_T == 4
2545 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2547 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2549 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2550 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2551 #if SIZEOF_WCHAR_T == 4
2552 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2553 // is not properly terminated we get random characters at the end
2554 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2555 wxMBConvUTF16 converter
;
2556 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2559 res
= byteOutLen
/ sizeof( UniChar
) ;
2564 if ( buf
&& res
< n
)
2570 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2572 OSStatus status
= noErr
;
2573 ByteCount byteOutLen
;
2574 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2580 //apple specs say at least 32
2581 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2582 tbuf
= (char*) malloc( n
) ;
2585 ByteCount byteBufferLen
= n
;
2586 UniChar
* ubuf
= NULL
;
2587 #if SIZEOF_WCHAR_T == 4
2588 wxMBConvUTF16 converter
;
2589 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2590 byteInLen
= unicharlen
;
2591 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2592 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2594 ubuf
= (UniChar
*) psz
;
2596 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2597 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2598 #if SIZEOF_WCHAR_T == 4
2604 size_t res
= byteOutLen
;
2605 if ( buf
&& res
< n
)
2609 //we need to double-trip to verify it didn't insert any ? in place
2610 //of bogus characters
2611 wxWCharBuffer
wcBuf(n
);
2612 size_t pszlen
= wxWcslen(psz
);
2613 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2614 wxWcslen(wcBuf
) != pszlen
||
2615 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2617 // we didn't obtain the same thing we started from, hence
2618 // the conversion was lossy and we consider that it failed
2627 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2630 TECObjectRef m_MB2WC_converter
;
2631 TECObjectRef m_WC2MB_converter
;
2633 TextEncodingBase m_char_encoding
;
2634 TextEncodingBase m_unicode_encoding
;
2637 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2639 // ============================================================================
2640 // wxEncodingConverter based conversion classes
2641 // ============================================================================
2645 class wxMBConv_wxwin
: public wxMBConv
2650 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2651 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2655 // temporarily just use wxEncodingConverter stuff,
2656 // so that it works while a better implementation is built
2657 wxMBConv_wxwin(const wxChar
* name
)
2660 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2662 m_enc
= wxFONTENCODING_SYSTEM
;
2667 wxMBConv_wxwin(wxFontEncoding enc
)
2674 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2676 size_t inbuf
= strlen(psz
);
2679 if (!m2w
.Convert(psz
,buf
))
2685 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2687 const size_t inbuf
= wxWcslen(psz
);
2690 if (!w2m
.Convert(psz
,buf
))
2697 virtual size_t GetMBNulLen() const
2701 case wxFONTENCODING_UTF16BE
:
2702 case wxFONTENCODING_UTF16LE
:
2705 case wxFONTENCODING_UTF32BE
:
2706 case wxFONTENCODING_UTF32LE
:
2714 bool IsOk() const { return m_ok
; }
2717 wxFontEncoding m_enc
;
2718 wxEncodingConverter m2w
, w2m
;
2721 // were we initialized successfully?
2724 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2727 // make the constructors available for unit testing
2728 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2730 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2731 if ( !result
->IsOk() )
2739 #endif // wxUSE_FONTMAP
2741 // ============================================================================
2742 // wxCSConv implementation
2743 // ============================================================================
2745 void wxCSConv::Init()
2752 wxCSConv::wxCSConv(const wxChar
*charset
)
2762 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2764 m_encoding
= wxFONTENCODING_SYSTEM
;
2768 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2770 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2772 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2774 encoding
= wxFONTENCODING_SYSTEM
;
2779 m_encoding
= encoding
;
2782 wxCSConv::~wxCSConv()
2787 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2792 SetName(conv
.m_name
);
2793 m_encoding
= conv
.m_encoding
;
2796 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2800 SetName(conv
.m_name
);
2801 m_encoding
= conv
.m_encoding
;
2806 void wxCSConv::Clear()
2815 void wxCSConv::SetName(const wxChar
*charset
)
2819 m_name
= wxStrdup(charset
);
2825 #include "wx/hashmap.h"
2827 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2828 wxEncodingNameCache
);
2830 static wxEncodingNameCache gs_nameCache
;
// Create the converter object that does the real work for this wxCSConv,
// chosen from m_name and/or m_encoding. The candidates are tried in the
// order listed in the comment below; if none of them can be used, an error
// is logged (with a guard against re-entering the logging code).
2833 wxMBConv
*wxCSConv::DoCreate() const
2836 wxLogTrace(TRACE_STRCONV
,
2837 wxT("creating conversion for %s"),
2839 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
2840 #endif // wxUSE_FONTMAP
2842 // check for the special case of ASCII or ISO8859-1 charset: as we have
2843 // special knowledge of it anyhow, we don't need to create a special
2844 // conversion object
2845 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2846 m_encoding
== wxFONTENCODING_DEFAULT
)
2848 // don't convert at all
2852 // we trust OS to do conversion better than we can so try external
2853 // conversion methods first
2855 // the full order is:
2856 // 1. OS conversion (iconv() under Unix or Win32 API)
2857 // 2. hard coded conversions for UTF
2858 // 3. wxEncodingConverter as fall back
2864 #endif // !wxUSE_FONTMAP
2866 wxString
name(m_name
);
2867 wxFontEncoding
encoding(m_encoding
);
// an explicitly given charset name is tried with iconv first
2869 if ( !name
.empty() )
2871 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2879 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2880 #endif // wxUSE_FONTMAP
// gs_nameCache maps an encoding to the charset name that worked with
// iconv; an empty string records that no name worked
2884 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2885 if ( it
!= gs_nameCache
.end() )
2887 if ( it
->second
.empty() )
2890 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
// not cached yet: try every known name of the encoding in turn
2897 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2899 for ( ; *names
; ++names
)
2901 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
2904 gs_nameCache
[encoding
] = *names
;
2911 gs_nameCache
[encoding
] = _T(""); // cache the failure
2913 #endif // wxUSE_FONTMAP
2915 #endif // HAVE_ICONV
2917 #ifdef wxHAVE_WIN32_MB2WC
2920 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2921 : new wxMBConv_win32(m_encoding
);
2930 #endif // wxHAVE_WIN32_MB2WC
2931 #if defined(__WXMAC__)
2933 // leave UTF16 and UTF32 to the built-ins of wx
2934 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2935 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2939 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2940 : new wxMBConv_mac(m_encoding
);
2942 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
2951 #if defined(__WXCOCOA__)
2953 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
2957 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
2958 : new wxMBConv_cocoa(m_encoding
);
2960 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
2970 wxFontEncoding enc
= m_encoding
;
2972 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2974 // use "false" to suppress interactive dialogs -- we can be called from
2975 // anywhere and popping up a dialog from here is the last thing we want to
2977 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2979 #endif // wxUSE_FONTMAP
// the UTF encodings are handled by wx's own converter classes
2983 case wxFONTENCODING_UTF7
:
2984 return new wxMBConvUTF7
;
2986 case wxFONTENCODING_UTF8
:
2987 return new wxMBConvUTF8
;
2989 case wxFONTENCODING_UTF16BE
:
2990 return new wxMBConvUTF16BE
;
2992 case wxFONTENCODING_UTF16LE
:
2993 return new wxMBConvUTF16LE
;
2995 case wxFONTENCODING_UTF32BE
:
2996 return new wxMBConvUTF32BE
;
2998 case wxFONTENCODING_UTF32LE
:
2999 return new wxMBConvUTF32LE
;
3002 // nothing to do but put here to suppress gcc warnings
3009 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3010 : new wxMBConv_wxwin(m_encoding
);
3016 #endif // wxUSE_FONTMAP
3018 // NB: This is a hack to prevent deadlock. What could otherwise happen
3019 // in Unicode build: wxConvLocal creation ends up being here
3020 // because of some failure and logs the error. But wxLog will try to
3021 // attach timestamp, for which it will need wxConvLocal (to convert
3022 // time to char* and then wchar_t*), but that fails, tries to log
3023 // error, but wxLog has a (already locked) critical section that
3024 // guards static buffer.
3025 static bool alreadyLoggingError
= false;
3026 if (!alreadyLoggingError
)
3028 alreadyLoggingError
= true;
3029 wxLogError(_("Cannot convert from the charset '%s'!"),
3033 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3034 #else // !wxUSE_FONTMAP
// m_encoding is an enum value, so it must not be passed to a %s
// conversion directly (undefined behaviour): format it as a number first
// and insert the resulting string, which keeps the translatable message
// unchanged
3035 wxString::Format(_("encoding %s"), wxString::Format(wxT("%d"), (int)m_encoding
).c_str()).c_str()
3036 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3038 alreadyLoggingError
= false;
// Create the real converter with DoCreate(), store it in m_convReal and
// clear the m_deferred flag. The method is const, so the members are
// written through a non-const pointer to this object.
// NOTE(review): the condition guarding this work is not visible here;
// presumably it only runs while m_deferred is set -- confirm
3044 void wxCSConv::CreateConvIfNeeded() const
3048 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3051 // if we have neither the name nor the encoding, use the default
3052 // encoding for this system
3053 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
// wxStrdup() makes a copy of the system encoding name which is then
// kept in m_name
3055 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3057 #endif // wxUSE_INTL
3059 self
->m_convReal
= DoCreate();
3060 self
->m_deferred
= false;
3064 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3066 CreateConvIfNeeded();
3069 return m_convReal
->MB2WC(buf
, psz
, n
);
3072 size_t len
= strlen(psz
);
3076 for (size_t c
= 0; c
<= len
; c
++)
3077 buf
[c
] = (unsigned char)(psz
[c
]);
3083 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3085 CreateConvIfNeeded();
3088 return m_convReal
->WC2MB(buf
, psz
, n
);
3091 const size_t len
= wxWcslen(psz
);
3094 for (size_t c
= 0; c
<= len
; c
++)
3098 buf
[c
] = (char)psz
[c
];
3103 for (size_t c
= 0; c
<= len
; c
++)
3113 size_t wxCSConv::GetMBNulLen() const
3115 CreateConvIfNeeded();
3119 return m_convReal
->GetMBNulLen();
3125 // ----------------------------------------------------------------------------
3127 // ----------------------------------------------------------------------------
3130 static wxMBConv_win32 wxConvLibcObj
;
3131 #elif defined(__WXMAC__) && !defined(__MACH__)
3132 static wxMBConv_mac wxConvLibcObj
;
3134 static wxMBConvLibc wxConvLibcObj
;
3137 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3138 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3139 static wxMBConvUTF7 wxConvUTF7Obj
;
3140 static wxMBConvUTF8 wxConvUTF8Obj
;
3142 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3143 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3144 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3145 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3146 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
3147 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3148 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3156 #else // !wxUSE_WCHAR_T
3158 // stand-ins in absence of wchar_t
3159 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3164 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T