1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
35 #include "wx/strconv.h"
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
62 #include "wx/thread.h"
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
76 #include "wx/mac/private.h" // includes mac headers
79 #define TRACE_STRCONV _T("strconv")
81 #if SIZEOF_WCHAR_T == 2
85 // ============================================================================
87 // ============================================================================
89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
90 static bool NotAllNULs(const char *p
, size_t n
)
92 while ( n
&& *p
++ == '\0' )
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
103 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
108 *output
= (wxUint16
) input
;
111 else if (input
>=0x110000)
119 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
120 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
126 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
128 if ((*input
<0xd800) || (*input
>0xdfff))
133 else if ((input
[1]<0xdc00) || (input
[1]>0xdfff))
140 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
146 // ----------------------------------------------------------------------------
148 // ----------------------------------------------------------------------------
151 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
152 const char *src
, size_t srcLen
) const
154 // although new conversion classes are supposed to implement this function
155 // directly, the existing ones only implement the old MB2WC() and so, to
156 // avoid having to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten
= 0;
164 // the number of NULs terminating this string
165 size_t nulLen
wxDUMMY_INITIALIZE(0);
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
173 if ( srcLen
!= (size_t)-1 )
175 // we need to know how to find the end of this string
176 nulLen
= GetMBNulLen();
177 if ( nulLen
== wxCONV_FAILED
)
178 return wxCONV_FAILED
;
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
185 char * const p
= bufTmp
.data();
186 memcpy(p
, src
, srcLen
);
187 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
193 srcEnd
= src
+ srcLen
;
195 else // quit after the first loop iteration
202 // try to convert the current chunk
203 size_t lenChunk
= MB2WC(NULL
, src
, 0);
206 // nothing left in the input string, conversion succeeded;
207 // but still account for the trailing NULL
212 if ( lenChunk
== wxCONV_FAILED
)
213 return wxCONV_FAILED
;
215 lenChunk
++; // for trailing NUL
217 dstWritten
+= lenChunk
;
221 if ( dstWritten
> dstLen
)
222 return wxCONV_FAILED
;
224 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
225 return wxCONV_FAILED
;
232 // we convert the entire string in this case, as we suppose that the
233 // string is NUL-terminated and so srcEnd is not used at all
237 // advance the input pointer past the end of this chunk
238 while ( NotAllNULs(src
, nulLen
) )
240 // notice that we must skip over multiple bytes here as we suppose
241 // that if NUL takes 2 or 4 bytes, then all the other characters do
242 // too and so if advanced by a single byte we might erroneously
243 // detect sequences of NUL bytes in the middle of the input
247 src
+= nulLen
; // skipping over its terminator as well
249 // note that ">=" (and not just "==") is needed here as the terminator
250 // we skipped just above could be inside or just after the buffer
251 // delimited by srcEnd
260 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
261 const wchar_t *src
, size_t srcLen
) const
263 // the number of chars [which would be] written to dst [if it were not NULL]
264 size_t dstWritten
= 0;
266 // make a copy of the input string unless it is already properly
269 // if we don't know its length we have no choice but to assume that it is,
270 // indeed, properly terminated
271 wxWCharBuffer bufTmp
;
272 if ( srcLen
== (size_t)-1 )
274 srcLen
= wxWcslen(src
) + 1;
276 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
278 // make a copy in order to properly NUL-terminate the string
279 bufTmp
= wxWCharBuffer(srcLen
);
280 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
284 const size_t lenNul
= GetMBNulLen();
285 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
287 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
289 // try to convert the current chunk
290 size_t lenChunk
= WC2MB(NULL
, src
, 0);
292 if ( lenChunk
== wxCONV_FAILED
)
293 return wxCONV_FAILED
;
296 dstWritten
+= lenChunk
;
300 if ( dstWritten
> dstLen
)
301 return wxCONV_FAILED
;
303 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
304 return wxCONV_FAILED
;
313 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
315 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
316 if ( rc
!= (size_t)wxCONV_FAILED
)
318 // ToWChar() returns the buffer length, i.e. including the trailing
319 // NUL, while this method doesn't take it into account
326 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
328 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
329 if ( rc
!= (size_t)wxCONV_FAILED
)
337 wxMBConv::~wxMBConv()
339 // nothing to do here (necessary for Darwin linking probably)
342 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
346 // calculate the length of the buffer needed first
347 const size_t nLen
= MB2WC(NULL
, psz
, 0);
348 if ( nLen
!= (size_t)wxCONV_FAILED
)
350 // now do the actual conversion
351 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
353 // +1 for the trailing NULL
354 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
359 return wxWCharBuffer();
362 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
366 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
367 if ( nLen
!= (size_t)wxCONV_FAILED
)
369 // extra space for trailing NUL(s)
370 static const size_t extraLen
= GetMaxMBNulLen();
372 wxCharBuffer
buf(nLen
+ extraLen
- 1);
373 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
378 return wxCharBuffer();
382 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
384 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
385 if ( dstLen
!= (size_t)wxCONV_FAILED
)
387 wxWCharBuffer
wbuf(dstLen
- 1);
388 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) )
391 *outLen
= dstLen
- 1;
399 return wxWCharBuffer();
403 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
405 const size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
406 if ( dstLen
!= (size_t)wxCONV_FAILED
)
408 wxCharBuffer
buf(dstLen
- 1);
409 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) )
412 *outLen
= dstLen
- 1;
421 return wxCharBuffer();
424 // ----------------------------------------------------------------------------
426 // ----------------------------------------------------------------------------
428 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
430 return wxMB2WC(buf
, psz
, n
);
433 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
435 return wxWC2MB(buf
, psz
, n
);
438 // ----------------------------------------------------------------------------
439 // wxConvBrokenFileNames
440 // ----------------------------------------------------------------------------
444 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
446 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
447 || wxStricmp(charset
, _T("UTF8")) == 0 )
448 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
450 m_conv
= new wxCSConv(charset
);
455 // ----------------------------------------------------------------------------
457 // ----------------------------------------------------------------------------
459 // Implementation (C) 2004 Fredrik Roubert
462 // BASE64 decoding table
464 static const unsigned char utf7unb64
[] =
466 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
467 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
468 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
469 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
470 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
471 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
472 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
473 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
474 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
475 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
476 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
477 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
478 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
479 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
480 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
481 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
482 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
483 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
484 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
485 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
486 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
487 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
488 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
489 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
490 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
491 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
492 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
500 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
504 while ( *psz
&& (!buf
|| (len
< n
)) )
506 unsigned char cc
= *psz
++;
514 else if (*psz
== '-')
522 else // start of BASE64 encoded string
526 for ( ok
= lsb
= false, d
= 0, l
= 0;
527 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
532 for (l
+= 6; l
>= 8; lsb
= !lsb
)
534 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
544 *buf
= (wchar_t)(c
<< 8);
553 // in valid UTF7 we should have valid characters after '+'
562 if ( buf
&& (len
< n
) )
569 // BASE64 encoding table
571 static const unsigned char utf7enb64
[] =
573 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
574 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
575 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
576 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
577 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
578 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
579 'w', 'x', 'y', 'z', '0', '1', '2', '3',
580 '4', '5', '6', '7', '8', '9', '+', '/'
584 // UTF-7 encoding table
586 // 0 - Set D (directly encoded characters)
587 // 1 - Set O (optional direct characters)
588 // 2 - whitespace characters (optional)
589 // 3 - special characters
591 static const unsigned char utf7encode
[128] =
593 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
594 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
595 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
596 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
597 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
598 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
599 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
600 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
603 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
607 while (*psz
&& ((!buf
) || (len
< n
)))
610 if (cc
< 0x80 && utf7encode
[cc
] < 1)
619 else if (((wxUint32
)cc
) > 0xffff)
621 // no surrogate pair generation (yet?)
632 // BASE64 encode string
633 unsigned int lsb
, d
, l
;
634 for (d
= 0, l
= 0; /*nothing*/; psz
++)
636 for (lsb
= 0; lsb
< 2; lsb
++)
639 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
641 for (l
+= 8; l
>= 6; )
645 *buf
++ = utf7enb64
[(d
>> l
) % 64];
650 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
656 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
667 if (buf
&& (len
< n
))
673 // ----------------------------------------------------------------------------
675 // ----------------------------------------------------------------------------
677 static wxUint32 utf8_max
[]=
678 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
680 // boundaries of the private use area we use to (temporarily) remap
681 // characters which are invalid in a UTF-8 encoded string
682 const wxUint32 wxUnicodePUA
= 0x100000;
683 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
685 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
689 while (*psz
&& ((!buf
) || (len
< n
)))
691 const char *opsz
= psz
;
692 bool invalid
= false;
693 unsigned char cc
= *psz
++, fc
= cc
;
695 for (cnt
= 0; fc
& 0x80; cnt
++)
705 // escape the escape character for octal escapes
706 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
707 && cc
== '\\' && (!buf
|| len
< n
))
719 // invalid UTF-8 sequence
724 unsigned ocnt
= cnt
- 1;
725 wxUint32 res
= cc
& (0x3f >> cnt
);
729 if ((cc
& 0xC0) != 0x80)
731 // invalid UTF-8 sequence
737 res
= (res
<< 6) | (cc
& 0x3f);
739 if (invalid
|| res
<= utf8_max
[ocnt
])
741 // illegal UTF-8 encoding
744 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
745 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
747 // if one of our PUA characters turns up externally
748 // it must also be treated as an illegal sequence
749 // (a bit like you have to escape an escape character)
755 // cast is ok because wchar_t == wxUint16 if WC_UTF16
756 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
757 if (pa
== (size_t)-1)
769 *buf
++ = (wchar_t)res
;
771 #endif // WC_UTF16/!WC_UTF16
776 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
778 while (opsz
< psz
&& (!buf
|| len
< n
))
781 // cast is ok because wchar_t == wxUint16 if WC_UTF16
782 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
783 wxASSERT(pa
!= (size_t)-1);
790 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
796 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
798 while (opsz
< psz
&& (!buf
|| len
< n
))
800 if ( buf
&& len
+ 3 < n
)
802 unsigned char on
= *opsz
;
804 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
805 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
806 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
813 else // MAP_INVALID_UTF8_NOT
821 if (buf
&& (len
< n
))
// Returns true if wch is an octal digit, i.e. lies in the range L'0'..L'7'.
//
// wxMBConvUTF8::WC2MB() uses this to recognize the three digits of an octal
// escape when the MAP_INVALID_UTF8_TO_OCTAL option is in effect.
static inline bool isoctal(wchar_t wch)
{
    return L'0' <= wch && wch <= L'7';
}
832 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
836 while (*psz
&& ((!buf
) || (len
< n
)))
841 // cast is ok for WC_UTF16
842 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
843 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
845 cc
= (*psz
++) & 0x7fffffff;
848 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
849 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
852 *buf
++ = (char)(cc
- wxUnicodePUA
);
855 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
856 && cc
== L
'\\' && psz
[0] == L
'\\' )
863 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
865 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
869 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
870 (psz
[1] - L
'0') * 010 +
880 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
897 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
899 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
905 if (buf
&& (len
< n
))
911 // ----------------------------------------------------------------------------
913 // ----------------------------------------------------------------------------
915 #ifdef WORDS_BIGENDIAN
916 #define wxMBConvUTF16straight wxMBConvUTF16BE
917 #define wxMBConvUTF16swap wxMBConvUTF16LE
919 #define wxMBConvUTF16swap wxMBConvUTF16BE
920 #define wxMBConvUTF16straight wxMBConvUTF16LE
926 // copy 16bit MB to 16bit String
927 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
931 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
934 *buf
++ = *(wxUint16
*)psz
;
937 psz
+= sizeof(wxUint16
);
947 // copy 16bit String to 16bit MB
948 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
952 while (*psz
&& (!buf
|| len
< n
))
956 *(wxUint16
*)buf
= *psz
;
957 buf
+= sizeof(wxUint16
);
960 len
+= sizeof(wxUint16
);
964 if (buf
&& len
<= n
- sizeof(wxUint16
))
971 // swap 16bit MB to 16bit String
972 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
976 // UTF16 string must be terminated by 2 NULs as single NULs may occur
978 while ( (psz
[0] || psz
[1]) && (!buf
|| len
< n
) )
982 ((char *)buf
)[0] = psz
[1];
983 ((char *)buf
)[1] = psz
[0];
990 if ( buf
&& len
< n
)
997 // swap 16bit String to 16bit MB
998 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1002 while ( *psz
&& (!buf
|| len
< n
) )
1006 *buf
++ = ((char*)psz
)[1];
1007 *buf
++ = ((char*)psz
)[0];
1014 if ( buf
&& len
< n
- 1 )
1027 // copy 16bit MB to 32bit String
1028 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1032 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1035 size_t pa
= decode_utf16((wxUint16
*)psz
, cc
);
1036 if (pa
== (size_t)-1)
1040 *buf
++ = (wchar_t)cc
;
1042 psz
+= pa
* sizeof(wxUint16
);
1052 // copy 32bit String to 16bit MB
1053 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1057 while (*psz
&& (!buf
|| len
< n
))
1060 size_t pa
= encode_utf16(*psz
, cc
);
1062 if (pa
== (size_t)-1)
1067 *(wxUint16
*)buf
= cc
[0];
1068 buf
+= sizeof(wxUint16
);
1071 *(wxUint16
*)buf
= cc
[1];
1072 buf
+= sizeof(wxUint16
);
1076 len
+= pa
*sizeof(wxUint16
);
1080 if (buf
&& len
<= n
- sizeof(wxUint16
))
1081 *(wxUint16
*)buf
= 0;
1087 // swap 16bit MB to 32bit String
1088 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1092 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1102 size_t pa
= decode_utf16((wxUint16
*)tmp
, cc
);
1103 if (pa
== (size_t)-1)
1107 *buf
++ = (wchar_t)cc
;
1110 psz
+= pa
* sizeof(wxUint16
);
1120 // swap 32bit String to 16bit MB
1121 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1125 while (*psz
&& (!buf
|| len
< n
))
1128 size_t pa
= encode_utf16(*psz
, cc
);
1130 if (pa
== (size_t)-1)
1135 *buf
++ = ((char*)cc
)[1];
1136 *buf
++ = ((char*)cc
)[0];
1139 *buf
++ = ((char*)cc
)[3];
1140 *buf
++ = ((char*)cc
)[2];
1144 len
+= pa
* sizeof(wxUint16
);
1148 if (buf
&& len
<= n
- sizeof(wxUint16
))
1149 *(wxUint16
*)buf
= 0;
1157 // ----------------------------------------------------------------------------
1159 // ----------------------------------------------------------------------------
1161 #ifdef WORDS_BIGENDIAN
1162 #define wxMBConvUTF32straight wxMBConvUTF32BE
1163 #define wxMBConvUTF32swap wxMBConvUTF32LE
1165 #define wxMBConvUTF32swap wxMBConvUTF32BE
1166 #define wxMBConvUTF32straight wxMBConvUTF32LE
1170 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1171 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1176 // copy 32bit MB to 16bit String
1177 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1181 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1185 size_t pa
= encode_utf16(*(wxUint32
*)psz
, cc
);
1186 if (pa
== (size_t)-1)
1197 psz
+= sizeof(wxUint32
);
1207 // copy 16bit String to 32bit MB
1208 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1212 while (*psz
&& (!buf
|| len
< n
))
1216 // cast is ok for WC_UTF16
1217 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1218 if (pa
== (size_t)-1)
1223 *(wxUint32
*)buf
= cc
;
1224 buf
+= sizeof(wxUint32
);
1227 len
+= sizeof(wxUint32
);
1231 if (buf
&& len
<= n
- sizeof(wxUint32
))
1232 *(wxUint32
*)buf
= 0;
1238 // swap 32bit MB to 16bit String
1239 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1243 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1253 size_t pa
= encode_utf16(*(wxUint32
*)tmp
, cc
);
1254 if (pa
== (size_t)-1)
1265 psz
+= sizeof(wxUint32
);
1275 // swap 16bit String to 32bit MB
1276 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1280 while (*psz
&& (!buf
|| len
< n
))
1284 // cast is ok for WC_UTF16
1285 size_t pa
= decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1286 if (pa
== (size_t)-1)
1297 len
+= sizeof(wxUint32
);
1301 if (buf
&& len
<= n
- sizeof(wxUint32
))
1302 *(wxUint32
*)buf
= 0;
1310 // copy 32bit MB to 32bit String
1311 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1315 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1318 *buf
++ = (wchar_t)(*(wxUint32
*)psz
);
1320 psz
+= sizeof(wxUint32
);
1330 // copy 32bit String to 32bit MB
1331 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1335 while (*psz
&& (!buf
|| len
< n
))
1339 *(wxUint32
*)buf
= *psz
;
1340 buf
+= sizeof(wxUint32
);
1343 len
+= sizeof(wxUint32
);
1347 if (buf
&& len
<= n
- sizeof(wxUint32
))
1348 *(wxUint32
*)buf
= 0;
1354 // swap 32bit MB to 32bit String
1355 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1359 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1363 ((char *)buf
)[0] = psz
[3];
1364 ((char *)buf
)[1] = psz
[2];
1365 ((char *)buf
)[2] = psz
[1];
1366 ((char *)buf
)[3] = psz
[0];
1371 psz
+= sizeof(wxUint32
);
1381 // swap 32bit String to 32bit MB
1382 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1386 while (*psz
&& (!buf
|| len
< n
))
1390 *buf
++ = ((char *)psz
)[3];
1391 *buf
++ = ((char *)psz
)[2];
1392 *buf
++ = ((char *)psz
)[1];
1393 *buf
++ = ((char *)psz
)[0];
1396 len
+= sizeof(wxUint32
);
1400 if (buf
&& len
<= n
- sizeof(wxUint32
))
1401 *(wxUint32
*)buf
= 0;
1410 // ============================================================================
1411 // The classes doing conversion using the iconv_xxx() functions
1412 // ============================================================================
1416 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1417 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1418 // (unless there's yet another bug in glibc) the only case when iconv()
1419 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1420 // left in the input buffer -- when _real_ error occurs,
1421 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1423 // [This bug does not appear in glibc 2.2.]
1424 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1425 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1426 (errno != E2BIG || bufLeft != 0))
1428 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1431 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1433 #define ICONV_T_INVALID ((iconv_t)-1)
1435 #if SIZEOF_WCHAR_T == 4
1436 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1437 #define WC_ENC wxFONTENCODING_UTF32
1438 #elif SIZEOF_WCHAR_T == 2
1439 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1440 #define WC_ENC wxFONTENCODING_UTF16
1441 #else // sizeof(wchar_t) != 2 nor 4
1442 // does this ever happen?
1443 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1446 // ----------------------------------------------------------------------------
1447 // wxMBConv_iconv: encapsulates an iconv character set
1448 // ----------------------------------------------------------------------------
1450 class wxMBConv_iconv
: public wxMBConv
1453 wxMBConv_iconv(const wxChar
*name
);
1454 virtual ~wxMBConv_iconv();
1456 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1457 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1459 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1460 virtual size_t GetMBNulLen() const;
1462 virtual wxMBConv
*Clone() const
1464 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
1465 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1470 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1473 // the iconv handlers used to translate from multibyte to wide char and in
1474 // the other direction
1479 // guards access to m2w and w2m objects
1480 wxMutex m_iconvMutex
;
1484 // the name (for iconv_open()) of a wide char charset -- if none is
1485 // available on this machine, it will remain empty
1486 static wxString ms_wcCharsetName
;
1488 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1489 // different endian-ness than the native one
1490 static bool ms_wcNeedsSwap
;
1493 // name of the encoding handled by this conversion
1496 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1498 size_t m_minMBCharWidth
;
1501 // make the constructor available for unit testing
1502 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1504 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1505 if ( !result
->IsOk() )
1514 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1515 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1517 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1520 m_minMBCharWidth
= 0;
1522 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1523 // names for the charsets
1524 const wxCharBuffer
cname(wxString(name
).ToAscii());
1526 // check for charset that represents wchar_t:
1527 if ( ms_wcCharsetName
.empty() )
1529 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1532 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1533 #else // !wxUSE_FONTMAP
1534 static const wxChar
*names
[] =
1536 #if SIZEOF_WCHAR_T == 4
1538 #elif SIZEOF_WCHAR_T = 2
1543 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1545 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1547 const wxString
nameCS(*names
);
1549 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1550 wxString
nameXE(nameCS
);
1551 #ifdef WORDS_BIGENDIAN
1553 #else // little endian
1557 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1560 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1561 if ( m2w
== ICONV_T_INVALID
)
1563 // try charset w/o bytesex info (e.g. "UCS4")
1564 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1566 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1568 // and check for bytesex ourselves:
1569 if ( m2w
!= ICONV_T_INVALID
)
1571 char buf
[2], *bufPtr
;
1572 wchar_t wbuf
[2], *wbufPtr
;
1580 outsz
= SIZEOF_WCHAR_T
* 2;
1584 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1585 (char**)&wbufPtr
, &outsz
);
1587 if (ICONV_FAILED(res
, insz
))
1589 wxLogLastError(wxT("iconv"));
1590 wxLogError(_("Conversion to charset '%s' doesn't work."),
1593 else // ok, can convert to this encoding, remember it
1595 ms_wcCharsetName
= nameCS
;
1596 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1600 else // use charset not requiring byte swapping
1602 ms_wcCharsetName
= nameXE
;
1606 wxLogTrace(TRACE_STRCONV
,
1607 wxT("iconv wchar_t charset is \"%s\"%s"),
1608 ms_wcCharsetName
.empty() ? _T("<none>")
1609 : ms_wcCharsetName
.c_str(),
1610 ms_wcNeedsSwap
? _T(" (needs swap)")
1613 else // we already have ms_wcCharsetName
1615 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1618 if ( ms_wcCharsetName
.empty() )
1620 w2m
= ICONV_T_INVALID
;
1624 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1625 if ( w2m
== ICONV_T_INVALID
)
1627 wxLogTrace(TRACE_STRCONV
,
1628 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1629 ms_wcCharsetName
.c_str(), cname
.data());
1634 wxMBConv_iconv::~wxMBConv_iconv()
1636 if ( m2w
!= ICONV_T_INVALID
)
1638 if ( w2m
!= ICONV_T_INVALID
)
1642 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1644 // find the string length: notice that must be done differently for
1645 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1647 const size_t nulLen
= GetMBNulLen();
1654 inbuf
= strlen(psz
); // arguably more optimized than our version
1659 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1660 // they also have to start at character boundary and not span two
1661 // adjacent characters
1663 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1670 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1671 // Unfortunately there are a couple of global wxCSConv objects such as
1672 // wxConvLocal that are used all over wx code, so we have to make sure
1673 // the handle is used by at most one thread at a time. Otherwise
1674 // only a few wx classes would be safe to use from non-main threads
1675 // as MB<->WC conversion would fail "randomly".
1676 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1677 #endif // wxUSE_THREADS
1679 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1681 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1682 wchar_t *bufPtr
= buf
;
1683 const char *pszPtr
= psz
;
1687 // have destination buffer, convert there
1689 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1690 (char**)&bufPtr
, &outbuf
);
1691 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1695 // convert to native endianness
1696 for ( unsigned i
= 0; i
< res
; i
++ )
1697 buf
[n
] = WC_BSWAP(buf
[i
]);
1700 // NUL-terminate the string if there is any space left
1706 // no destination buffer... convert using temp buffer
1707 // to calculate destination buffer requirement
1714 outbuf
= 8 * SIZEOF_WCHAR_T
;
1717 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1718 (char**)&bufPtr
, &outbuf
);
1720 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
1722 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1725 if (ICONV_FAILED(cres
, inbuf
))
1727 //VS: it is ok if iconv fails, hence trace only
1728 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
// Convert the NUL-terminated wide string psz to this converter's multibyte
// encoding by calling iconv() with the w2m handle; the input length is taken
// from wxWcslen(psz).  When a destination buffer is supplied the output goes
// there, otherwise a temporary buffer is used to work out the size needed.
// If iconv fails, a trace message is logged under TRACE_STRCONV.
1735 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1738 // NB: explained in MB2WC
1739 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1742 size_t inlen
= wxWcslen(psz
);
1743 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1747 wchar_t *tmpbuf
= 0;
1751 // need to copy to temp buffer to switch endianness
1752 // (doing WC_BSWAP twice on the original buffer won't help, as it
1753 // could be in read-only memory, or be accessed in some other thread)
1754 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1755 for ( size_t i
= 0; i
< inlen
; i
++ )
// store each swapped character in slot i of the copy: the former index n
// (the caller's size argument, unrelated to inlen) put every character in
// one and the same slot, which could lie outside the allocation made above,
// and left the rest of the copy unfilled
1756 tmpbuf
[i
] = WC_BSWAP(psz
[i
]);
1758 tmpbuf
[inlen
] = L
'\0';
1764 // have destination buffer, convert there
1765 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1769 // NB: iconv was given only wcslen(psz) characters on input, and so
1770 // it couldn't convert the trailing zero. Let's do it ourselves
1771 // if there's some room left for it in the output buffer.
1777 // no destination buffer... convert using temp buffer
1778 // to calculate destination buffer requirement
1786 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1790 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1798 if (ICONV_FAILED(cres
, inbuf
))
1800 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1807 size_t wxMBConv_iconv::GetMBNulLen() const
1809 if ( m_minMBCharWidth
== 0 )
1811 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1814 // NB: explained in MB2WC
1815 wxMutexLocker
lock(self
->m_iconvMutex
);
1818 wchar_t *wnul
= L
"";
1819 char buf
[8]; // should be enough for NUL in any encoding
1820 size_t inLen
= sizeof(wchar_t),
1821 outLen
= WXSIZEOF(buf
);
1822 char *inBuff
= (char *)wnul
;
1823 char *outBuff
= buf
;
1824 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
1826 self
->m_minMBCharWidth
= (size_t)-1;
1830 self
->m_minMBCharWidth
= outBuff
- buf
;
1834 return m_minMBCharWidth
;
1837 #endif // HAVE_ICONV
1840 // ============================================================================
1841 // Win32 conversion classes
1842 // ============================================================================
1844 #ifdef wxHAVE_WIN32_MB2WC
1848 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1849 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1852 class wxMBConv_win32
: public wxMBConv
1857 m_CodePage
= CP_ACP
;
1858 m_minMBCharWidth
= 0;
1861 wxMBConv_win32(const wxMBConv_win32
& conv
)
1863 m_CodePage
= conv
.m_CodePage
;
1864 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
1868 wxMBConv_win32(const wxChar
* name
)
1870 m_CodePage
= wxCharsetToCodepage(name
);
1871 m_minMBCharWidth
= 0;
1874 wxMBConv_win32(wxFontEncoding encoding
)
1876 m_CodePage
= wxEncodingToCodepage(encoding
);
1877 m_minMBCharWidth
= 0;
1879 #endif // wxUSE_FONTMAP
1881 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1883 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
1884 // the behaviour is not compatible with the Unix version (using iconv)
1885 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1886 // wouldn't work if reading an incomplete MB char didn't result in an
1889 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1890 // Win XP or newer and it is not supported for UTF-[78] so we always
1891 // use our own conversions in this case. See
1892 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1893 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1894 if ( m_CodePage
== CP_UTF8
)
1896 return wxConvUTF8
.MB2WC(buf
, psz
, n
);
1899 if ( m_CodePage
== CP_UTF7
)
1901 return wxConvUTF7
.MB2WC(buf
, psz
, n
);
1905 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
1906 IsAtLeastWin2kSP4() )
1908 flags
= MB_ERR_INVALID_CHARS
;
1911 const size_t len
= ::MultiByteToWideChar
1913 m_CodePage
, // code page
1914 flags
, // flags: fall on error
1915 psz
, // input string
1916 -1, // its length (NUL-terminated)
1917 buf
, // output string
1918 buf
? n
: 0 // size of output buffer
1922 // function totally failed
1926 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1927 // check if we succeeded, by doing a double trip:
1928 if ( !flags
&& buf
)
1930 const size_t mbLen
= strlen(psz
);
1931 wxCharBuffer
mbBuf(mbLen
);
1932 if ( ::WideCharToMultiByte
1939 mbLen
+ 1, // size in bytes, not length
1943 strcmp(mbBuf
, psz
) != 0 )
1945 // we didn't obtain the same thing we started from, hence
1946 // the conversion was lossy and we consider that it failed
1951 // note that it returns count of written chars for buf != NULL and size
1952 // of the needed buffer for buf == NULL so in either case the length of
1953 // the string (which never includes the terminating NUL) is one less
1957 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1960 we have a problem here: by default, WideCharToMultiByte() may
1961 replace characters unrepresentable in the target code page with bad
1962 quality approximations such as turning "1/2" symbol (U+00BD) into
1963 "1" for the code pages which don't have it and we, obviously, want
1964 to avoid this at any price
1966 the trouble is that this function does it _silently_, i.e. it won't
1967 even tell us whether it did or not... Win98/2000 and higher provide
1968 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1969 we have to resort to a round trip, i.e. check that converting back
1970 results in the same string -- this is, of course, expensive but
1971 otherwise we simply can't be sure to not garble the data.
1974 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1975 // it doesn't work with CJK encodings (which we test for rather roughly
1976 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1978 BOOL usedDef
wxDUMMY_INITIALIZE(false);
1981 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1983 // it's our lucky day
1984 flags
= WC_NO_BEST_FIT_CHARS
;
1985 pUsedDef
= &usedDef
;
1987 else // old system or unsupported encoding
1993 const size_t len
= ::WideCharToMultiByte
1995 m_CodePage
, // code page
1996 flags
, // either none or no best fit
1997 pwz
, // input string
1998 -1, // it is (wide) NUL-terminated
1999 buf
, // output buffer
2000 buf
? n
: 0, // and its size
2001 NULL
, // default "replacement" char
2002 pUsedDef
// [out] was it used?
2007 // function totally failed
2011 // if we were really converting, check if we succeeded
2016 // check if the conversion failed, i.e. if any replacements
2021 else // we must resort to double tripping...
2023 wxWCharBuffer
wcBuf(n
);
2024 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2025 wcscmp(wcBuf
, pwz
) != 0 )
2027 // we didn't obtain the same thing we started from, hence
2028 // the conversion was lossy and we consider that it failed
2034 // see the comment above for the reason of "len - 1"
2038 virtual size_t GetMBNulLen() const
2040 if ( m_minMBCharWidth
== 0 )
2042 int len
= ::WideCharToMultiByte
2044 m_CodePage
, // code page
2046 L
"", // input string
2047 1, // translate just the NUL
2048 NULL
, // output buffer
2050 NULL
, // no replacement char
2051 NULL
// [out] don't care if it was used
2054 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2058 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2059 self
->m_minMBCharWidth
= (size_t)-1;
2063 self
->m_minMBCharWidth
= (size_t)-1;
2069 self
->m_minMBCharWidth
= len
;
2074 return m_minMBCharWidth
;
2077 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2079 bool IsOk() const { return m_CodePage
!= -1; }
2082 static bool CanUseNoBestFit()
2084 static int s_isWin98Or2k
= -1;
2086 if ( s_isWin98Or2k
== -1 )
2089 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2092 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2096 s_isWin98Or2k
= verMaj
>= 5;
2100 // unknown, be conservative by default
2105 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2108 return s_isWin98Or2k
== 1;
2111 static bool IsAtLeastWin2kSP4()
2116 static int s_isAtLeastWin2kSP4
= -1;
2118 if ( s_isAtLeastWin2kSP4
== -1 )
2120 OSVERSIONINFOEX ver
;
2122 memset(&ver
, 0, sizeof(ver
));
2123 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2124 GetVersionEx((OSVERSIONINFO
*)&ver
);
2126 s_isAtLeastWin2kSP4
=
2127 ((ver
.dwMajorVersion
> 5) || // Vista+
2128 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2129 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2130 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2134 return s_isAtLeastWin2kSP4
== 1;
2139 // the code page we're working with
2142 // cached result of GetMBNulLen(), set to 0 initially meaning
2144 size_t m_minMBCharWidth
;
2147 #endif // wxHAVE_WIN32_MB2WC
2149 // ============================================================================
2150 // Cocoa conversion classes
2151 // ============================================================================
2153 #if defined(__WXCOCOA__)
2155 // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2156 // Strangely enough, Core Foundation uses
2157 // UTF-32 internally quite a bit - it's just not public (yet).
2159 #include <CoreFoundation/CFString.h>
2160 #include <CoreFoundation/CFStringEncodingExt.h>
2162 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2164 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2168 case wxFONTENCODING_DEFAULT
:
2169 enc
= CFStringGetSystemEncoding();
2172 case wxFONTENCODING_ISO8859_1
:
2173 enc
= kCFStringEncodingISOLatin1
;
2175 case wxFONTENCODING_ISO8859_2
:
2176 enc
= kCFStringEncodingISOLatin2
;
2178 case wxFONTENCODING_ISO8859_3
:
2179 enc
= kCFStringEncodingISOLatin3
;
2181 case wxFONTENCODING_ISO8859_4
:
2182 enc
= kCFStringEncodingISOLatin4
;
2184 case wxFONTENCODING_ISO8859_5
:
2185 enc
= kCFStringEncodingISOLatinCyrillic
;
2187 case wxFONTENCODING_ISO8859_6
:
2188 enc
= kCFStringEncodingISOLatinArabic
;
2190 case wxFONTENCODING_ISO8859_7
:
2191 enc
= kCFStringEncodingISOLatinGreek
;
2193 case wxFONTENCODING_ISO8859_8
:
2194 enc
= kCFStringEncodingISOLatinHebrew
;
2196 case wxFONTENCODING_ISO8859_9
:
2197 enc
= kCFStringEncodingISOLatin5
;
2199 case wxFONTENCODING_ISO8859_10
:
2200 enc
= kCFStringEncodingISOLatin6
;
2202 case wxFONTENCODING_ISO8859_11
:
2203 enc
= kCFStringEncodingISOLatinThai
;
2205 case wxFONTENCODING_ISO8859_13
:
2206 enc
= kCFStringEncodingISOLatin7
;
2208 case wxFONTENCODING_ISO8859_14
:
2209 enc
= kCFStringEncodingISOLatin8
;
2211 case wxFONTENCODING_ISO8859_15
:
2212 enc
= kCFStringEncodingISOLatin9
;
2215 case wxFONTENCODING_KOI8
:
2216 enc
= kCFStringEncodingKOI8_R
;
2218 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2219 enc
= kCFStringEncodingDOSRussian
;
2222 // case wxFONTENCODING_BULGARIAN :
2226 case wxFONTENCODING_CP437
:
2227 enc
= kCFStringEncodingDOSLatinUS
;
2229 case wxFONTENCODING_CP850
:
2230 enc
= kCFStringEncodingDOSLatin1
;
2232 case wxFONTENCODING_CP852
:
2233 enc
= kCFStringEncodingDOSLatin2
;
2235 case wxFONTENCODING_CP855
:
2236 enc
= kCFStringEncodingDOSCyrillic
;
2238 case wxFONTENCODING_CP866
:
2239 enc
= kCFStringEncodingDOSRussian
;
2241 case wxFONTENCODING_CP874
:
2242 enc
= kCFStringEncodingDOSThai
;
2244 case wxFONTENCODING_CP932
:
2245 enc
= kCFStringEncodingDOSJapanese
;
2247 case wxFONTENCODING_CP936
:
2248 enc
= kCFStringEncodingDOSChineseSimplif
;
2250 case wxFONTENCODING_CP949
:
2251 enc
= kCFStringEncodingDOSKorean
;
2253 case wxFONTENCODING_CP950
:
2254 enc
= kCFStringEncodingDOSChineseTrad
;
2256 case wxFONTENCODING_CP1250
:
2257 enc
= kCFStringEncodingWindowsLatin2
;
2259 case wxFONTENCODING_CP1251
:
2260 enc
= kCFStringEncodingWindowsCyrillic
;
2262 case wxFONTENCODING_CP1252
:
2263 enc
= kCFStringEncodingWindowsLatin1
;
2265 case wxFONTENCODING_CP1253
:
2266 enc
= kCFStringEncodingWindowsGreek
;
2268 case wxFONTENCODING_CP1254
:
2269 enc
= kCFStringEncodingWindowsLatin5
;
2271 case wxFONTENCODING_CP1255
:
2272 enc
= kCFStringEncodingWindowsHebrew
;
2274 case wxFONTENCODING_CP1256
:
2275 enc
= kCFStringEncodingWindowsArabic
;
2277 case wxFONTENCODING_CP1257
:
2278 enc
= kCFStringEncodingWindowsBalticRim
;
2280 // This only really encodes to UTF7 (if that) evidently
2281 // case wxFONTENCODING_UTF7 :
2282 // enc = kCFStringEncodingNonLossyASCII ;
2284 case wxFONTENCODING_UTF8
:
2285 enc
= kCFStringEncodingUTF8
;
2287 case wxFONTENCODING_EUC_JP
:
2288 enc
= kCFStringEncodingEUC_JP
;
2290 case wxFONTENCODING_UTF16
:
2291 enc
= kCFStringEncodingUnicode
;
2293 case wxFONTENCODING_MACROMAN
:
2294 enc
= kCFStringEncodingMacRoman
;
2296 case wxFONTENCODING_MACJAPANESE
:
2297 enc
= kCFStringEncodingMacJapanese
;
2299 case wxFONTENCODING_MACCHINESETRAD
:
2300 enc
= kCFStringEncodingMacChineseTrad
;
2302 case wxFONTENCODING_MACKOREAN
:
2303 enc
= kCFStringEncodingMacKorean
;
2305 case wxFONTENCODING_MACARABIC
:
2306 enc
= kCFStringEncodingMacArabic
;
2308 case wxFONTENCODING_MACHEBREW
:
2309 enc
= kCFStringEncodingMacHebrew
;
2311 case wxFONTENCODING_MACGREEK
:
2312 enc
= kCFStringEncodingMacGreek
;
2314 case wxFONTENCODING_MACCYRILLIC
:
2315 enc
= kCFStringEncodingMacCyrillic
;
2317 case wxFONTENCODING_MACDEVANAGARI
:
2318 enc
= kCFStringEncodingMacDevanagari
;
2320 case wxFONTENCODING_MACGURMUKHI
:
2321 enc
= kCFStringEncodingMacGurmukhi
;
2323 case wxFONTENCODING_MACGUJARATI
:
2324 enc
= kCFStringEncodingMacGujarati
;
2326 case wxFONTENCODING_MACORIYA
:
2327 enc
= kCFStringEncodingMacOriya
;
2329 case wxFONTENCODING_MACBENGALI
:
2330 enc
= kCFStringEncodingMacBengali
;
2332 case wxFONTENCODING_MACTAMIL
:
2333 enc
= kCFStringEncodingMacTamil
;
2335 case wxFONTENCODING_MACTELUGU
:
2336 enc
= kCFStringEncodingMacTelugu
;
2338 case wxFONTENCODING_MACKANNADA
:
2339 enc
= kCFStringEncodingMacKannada
;
2341 case wxFONTENCODING_MACMALAJALAM
:
2342 enc
= kCFStringEncodingMacMalayalam
;
2344 case wxFONTENCODING_MACSINHALESE
:
2345 enc
= kCFStringEncodingMacSinhalese
;
2347 case wxFONTENCODING_MACBURMESE
:
2348 enc
= kCFStringEncodingMacBurmese
;
2350 case wxFONTENCODING_MACKHMER
:
2351 enc
= kCFStringEncodingMacKhmer
;
2353 case wxFONTENCODING_MACTHAI
:
2354 enc
= kCFStringEncodingMacThai
;
2356 case wxFONTENCODING_MACLAOTIAN
:
2357 enc
= kCFStringEncodingMacLaotian
;
2359 case wxFONTENCODING_MACGEORGIAN
:
2360 enc
= kCFStringEncodingMacGeorgian
;
2362 case wxFONTENCODING_MACARMENIAN
:
2363 enc
= kCFStringEncodingMacArmenian
;
2365 case wxFONTENCODING_MACCHINESESIMP
:
2366 enc
= kCFStringEncodingMacChineseSimp
;
2368 case wxFONTENCODING_MACTIBETAN
:
2369 enc
= kCFStringEncodingMacTibetan
;
2371 case wxFONTENCODING_MACMONGOLIAN
:
2372 enc
= kCFStringEncodingMacMongolian
;
2374 case wxFONTENCODING_MACETHIOPIC
:
2375 enc
= kCFStringEncodingMacEthiopic
;
2377 case wxFONTENCODING_MACCENTRALEUR
:
2378 enc
= kCFStringEncodingMacCentralEurRoman
;
2380 case wxFONTENCODING_MACVIATNAMESE
:
2381 enc
= kCFStringEncodingMacVietnamese
;
2383 case wxFONTENCODING_MACARABICEXT
:
2384 enc
= kCFStringEncodingMacExtArabic
;
2386 case wxFONTENCODING_MACSYMBOL
:
2387 enc
= kCFStringEncodingMacSymbol
;
2389 case wxFONTENCODING_MACDINGBATS
:
2390 enc
= kCFStringEncodingMacDingbats
;
2392 case wxFONTENCODING_MACTURKISH
:
2393 enc
= kCFStringEncodingMacTurkish
;
2395 case wxFONTENCODING_MACCROATIAN
:
2396 enc
= kCFStringEncodingMacCroatian
;
2398 case wxFONTENCODING_MACICELANDIC
:
2399 enc
= kCFStringEncodingMacIcelandic
;
2401 case wxFONTENCODING_MACROMANIAN
:
2402 enc
= kCFStringEncodingMacRomanian
;
2404 case wxFONTENCODING_MACCELTIC
:
2405 enc
= kCFStringEncodingMacCeltic
;
2407 case wxFONTENCODING_MACGAELIC
:
2408 enc
= kCFStringEncodingMacGaelic
;
2410 // case wxFONTENCODING_MACKEYBOARD :
2411 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2415 // because gcc is picky
2422 class wxMBConv_cocoa
: public wxMBConv
2427 Init(CFStringGetSystemEncoding()) ;
2430 wxMBConv_cocoa(const wxMBConv_cocoa
& conv
)
2432 m_encoding
= conv
.m_encoding
;
2436 wxMBConv_cocoa(const wxChar
* name
)
2438 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2442 wxMBConv_cocoa(wxFontEncoding encoding
)
2444 Init( wxCFStringEncFromFontEnc(encoding
) );
2451 void Init( CFStringEncoding encoding
)
2453 m_encoding
= encoding
;
2456 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2460 CFStringRef theString
= CFStringCreateWithBytes (
2461 NULL
, //the allocator
2462 (const UInt8
*)szUnConv
,
2465 false //no BOM/external representation
2468 wxASSERT(theString
);
2470 size_t nOutLength
= CFStringGetLength(theString
);
2474 CFRelease(theString
);
2478 CFRange theRange
= { 0, nOutSize
};
2480 #if SIZEOF_WCHAR_T == 4
2481 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2484 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2486 CFRelease(theString
);
2488 szUniCharBuffer
[nOutLength
] = '\0' ;
2490 #if SIZEOF_WCHAR_T == 4
2491 wxMBConvUTF16 converter
;
2492 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
2493 delete[] szUniCharBuffer
;
2499 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2503 size_t nRealOutSize
;
2504 size_t nBufSize
= wxWcslen(szUnConv
);
2505 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2507 #if SIZEOF_WCHAR_T == 4
2508 wxMBConvUTF16 converter
;
2509 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2510 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
2511 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
2512 nBufSize
/= sizeof(UniChar
);
2515 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2519 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2522 wxASSERT(theString
);
2524 //Note that CER puts a BOM when converting to unicode
2525 //so we check and use getchars instead in that case
2526 if (m_encoding
== kCFStringEncodingUnicode
)
2529 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2531 nRealOutSize
= CFStringGetLength(theString
) + 1;
2537 CFRangeMake(0, CFStringGetLength(theString
)),
2539 0, //what to put in characters that can't be converted -
2540 //0 tells CFString to return NULL if it meets such a character
2541 false, //not an external representation
2544 (CFIndex
*) &nRealOutSize
2548 CFRelease(theString
);
2550 #if SIZEOF_WCHAR_T == 4
2551 delete[] szUniBuffer
;
2554 return nRealOutSize
- 1;
2557 virtual wxMBConv
*Clone() const { return new wxMBConv_cocoa(*this); }
2561 return m_encoding
!= kCFStringEncodingInvalidId
&&
2562 CFStringIsEncodingAvailable(m_encoding
);
2566 CFStringEncoding m_encoding
;
2569 #endif // defined(__WXCOCOA__)
2571 // ============================================================================
2572 // Mac conversion classes
2573 // ============================================================================
2575 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2577 class wxMBConv_mac
: public wxMBConv
2582 Init(CFStringGetSystemEncoding()) ;
2585 wxMBConv_mac(const wxMBConv_mac
& conv
)
2587 Init(conv
.m_char_encoding
);
2591 wxMBConv_mac(const wxChar
* name
)
2593 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2597 wxMBConv_mac(wxFontEncoding encoding
)
2599 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2604 OSStatus status
= noErr
;
2605 status
= TECDisposeConverter(m_MB2WC_converter
);
2606 status
= TECDisposeConverter(m_WC2MB_converter
);
2610 void Init( TextEncodingBase encoding
)
2612 OSStatus status
= noErr
;
2613 m_char_encoding
= encoding
;
2614 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
, 0, kUnicode16BitFormat
) ;
2616 status
= TECCreateConverter(&m_MB2WC_converter
,
2618 m_unicode_encoding
);
2619 status
= TECCreateConverter(&m_WC2MB_converter
,
2624 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2626 OSStatus status
= noErr
;
2627 ByteCount byteOutLen
;
2628 ByteCount byteInLen
= strlen(psz
) ;
2629 wchar_t *tbuf
= NULL
;
2630 UniChar
* ubuf
= NULL
;
2635 //apple specs say at least 32
2636 n
= wxMax( 32 , byteInLen
) ;
2637 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2640 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2642 #if SIZEOF_WCHAR_T == 4
2643 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2645 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2647 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2648 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2649 #if SIZEOF_WCHAR_T == 4
2650 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2651 // is not properly terminated we get random characters at the end
2652 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2653 wxMBConvUTF16 converter
;
2654 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2657 res
= byteOutLen
/ sizeof( UniChar
) ;
2663 if ( buf
&& res
< n
)
2669 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2671 OSStatus status
= noErr
;
2672 ByteCount byteOutLen
;
2673 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2679 //apple specs say at least 32
2680 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2681 tbuf
= (char*) malloc( n
) ;
2684 ByteCount byteBufferLen
= n
;
2685 UniChar
* ubuf
= NULL
;
2687 #if SIZEOF_WCHAR_T == 4
2688 wxMBConvUTF16 converter
;
2689 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2690 byteInLen
= unicharlen
;
2691 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2692 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2694 ubuf
= (UniChar
*) psz
;
2697 status
= TECConvertText(
2698 m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2699 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2701 #if SIZEOF_WCHAR_T == 4
2708 size_t res
= byteOutLen
;
2709 if ( buf
&& res
< n
)
2713 //we need to double-trip to verify it didn't insert any ? in place
2714 //of bogus characters
2715 wxWCharBuffer
wcBuf(n
);
2716 size_t pszlen
= wxWcslen(psz
);
2717 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2718 wxWcslen(wcBuf
) != pszlen
||
2719 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2721 // we didn't obtain the same thing we started from, hence
2722 // the conversion was lossy and we consider that it failed
2730 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
2733 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2736 TECObjectRef m_MB2WC_converter
;
2737 TECObjectRef m_WC2MB_converter
;
2739 TextEncodingBase m_char_encoding
;
2740 TextEncodingBase m_unicode_encoding
;
2743 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2745 // ============================================================================
2746 // wxEncodingConverter based conversion classes
2747 // ============================================================================
2751 class wxMBConv_wxwin
: public wxMBConv
2756 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2757 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2761 // temporarily just use wxEncodingConverter stuff,
2762 // so that it works while a better implementation is built
2763 wxMBConv_wxwin(const wxChar
* name
)
2766 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2768 m_enc
= wxFONTENCODING_SYSTEM
;
2773 wxMBConv_wxwin(wxFontEncoding enc
)
2780 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2782 size_t inbuf
= strlen(psz
);
2785 if (!m2w
.Convert(psz
, buf
))
2791 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2793 const size_t inbuf
= wxWcslen(psz
);
2796 if (!w2m
.Convert(psz
, buf
))
2803 virtual size_t GetMBNulLen() const
2807 case wxFONTENCODING_UTF16BE
:
2808 case wxFONTENCODING_UTF16LE
:
2811 case wxFONTENCODING_UTF32BE
:
2812 case wxFONTENCODING_UTF32LE
:
2820 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2822 bool IsOk() const { return m_ok
; }
2825 wxFontEncoding m_enc
;
2826 wxEncodingConverter m2w
, w2m
;
2829 // were we initialized successfully?
2832 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2835 // make the constructors available for unit testing
2836 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2838 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2839 if ( !result
->IsOk() )
2848 #endif // wxUSE_FONTMAP
2850 // ============================================================================
2851 // wxCSConv implementation
2852 // ============================================================================
2854 void wxCSConv::Init()
2861 wxCSConv::wxCSConv(const wxChar
*charset
)
2871 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2873 m_encoding
= wxFONTENCODING_SYSTEM
;
2877 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2879 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2881 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2883 encoding
= wxFONTENCODING_SYSTEM
;
2888 m_encoding
= encoding
;
2891 wxCSConv::~wxCSConv()
2896 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2901 SetName(conv
.m_name
);
2902 m_encoding
= conv
.m_encoding
;
2905 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2909 SetName(conv
.m_name
);
2910 m_encoding
= conv
.m_encoding
;
2915 void wxCSConv::Clear()
2924 void wxCSConv::SetName(const wxChar
*charset
)
2928 m_name
= wxStrdup(charset
);
2934 #include "wx/hashmap.h"
2936 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2937 wxEncodingNameCache
);
2939 static wxEncodingNameCache gs_nameCache
;
// Create the converter object that does the real work for this wxCSConv.
// The back ends are tried in the order given by the "the full order is"
// comment in the body; if none of them yields a converter, an error is
// logged (guarded against re-entrancy by alreadyLoggingError).
2942 wxMBConv
*wxCSConv::DoCreate() const
2945 wxLogTrace(TRACE_STRCONV
,
2946 wxT("creating conversion for %s"),
2948 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
2949 #endif // wxUSE_FONTMAP
2951 // check for the special case of ASCII or ISO8859-1 charset: as we have
2952 // special knowledge of it anyhow, we don't need to create a special
2953 // conversion object
2954 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2955 m_encoding
== wxFONTENCODING_DEFAULT
)
2957 // don't convert at all
2961 // we trust OS to do conversion better than we can so try external
2962 // conversion methods first
2964 // the full order is:
2965 // 1. OS conversion (iconv() under Unix or Win32 API)
2966 // 2. hard coded conversions for UTF
2967 // 3. wxEncodingConverter as fall back
2973 #endif // !wxUSE_FONTMAP
2975 wxString
name(m_name
);
2976 wxFontEncoding
encoding(m_encoding
);
2978 if ( !name
.empty() )
2980 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2988 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2989 #endif // wxUSE_FONTMAP
2993 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2994 if ( it
!= gs_nameCache
.end() )
2996 if ( it
->second
.empty() )
2999 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
3006 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3008 for ( ; *names
; ++names
)
3010 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
3013 gs_nameCache
[encoding
] = *names
;
3020 gs_nameCache
[encoding
] = _T(""); // cache the failure
3022 #endif // wxUSE_FONTMAP
3024 #endif // HAVE_ICONV
3026 #ifdef wxHAVE_WIN32_MB2WC
3029 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3030 : new wxMBConv_win32(m_encoding
);
3039 #endif // wxHAVE_WIN32_MB2WC
3041 #if defined(__WXMAC__)
3043 // leave UTF16 and UTF32 to the built-ins of wx
3044 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3045 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3048 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
3049 : new wxMBConv_mac(m_encoding
);
3051 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
3061 #if defined(__WXCOCOA__)
3063 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
3066 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
3067 : new wxMBConv_cocoa(m_encoding
);
3069 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
3080 wxFontEncoding enc
= m_encoding
;
3082 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3084 // use "false" to suppress interactive dialogs -- we can be called from
3085 // anywhere and popping up a dialog from here is the last thing we want to
3087 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3089 #endif // wxUSE_FONTMAP
3093 case wxFONTENCODING_UTF7
:
3094 return new wxMBConvUTF7
;
3096 case wxFONTENCODING_UTF8
:
3097 return new wxMBConvUTF8
;
3099 case wxFONTENCODING_UTF16BE
:
3100 return new wxMBConvUTF16BE
;
3102 case wxFONTENCODING_UTF16LE
:
3103 return new wxMBConvUTF16LE
;
3105 case wxFONTENCODING_UTF32BE
:
3106 return new wxMBConvUTF32BE
;
3108 case wxFONTENCODING_UTF32LE
:
3109 return new wxMBConvUTF32LE
;
3112 // nothing to do but put here to suppress gcc warnings
3119 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3120 : new wxMBConv_wxwin(m_encoding
);
3126 #endif // wxUSE_FONTMAP
3128 // NB: This is a hack to prevent deadlock. What could otherwise happen
3129 // in Unicode build: wxConvLocal creation ends up being here
3130 // because of some failure and logs the error. But wxLog will try to
3131 // attach timestamp, for which it will need wxConvLocal (to convert
3132 // time to char* and then wchar_t*), but that fails, tries to log
3133 // error, but wxLog has a (already locked) critical section that
3134 // guards static buffer.
3135 static bool alreadyLoggingError
= false;
3136 if (!alreadyLoggingError
)
3138 alreadyLoggingError
= true;
3139 wxLogError(_("Cannot convert from the charset '%s'!"),
3143 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3144 #else // !wxUSE_FONTMAP
// m_encoding is an enum value, not a string, so it is formatted with %i:
// handing it to a %s conversion makes the formatter treat the number as a
// string pointer
3145 wxString::Format(_("encoding %i"), m_encoding
).c_str()
3146 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3149 alreadyLoggingError
= false;
3155 void wxCSConv::CreateConvIfNeeded() const
3159 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3162 // if we have neither the name nor the encoding, use the default
3163 // encoding for this system
3164 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3166 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3168 #endif // wxUSE_INTL
3170 self
->m_convReal
= DoCreate();
3171 self
->m_deferred
= false;
3175 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3177 CreateConvIfNeeded();
3180 return m_convReal
->MB2WC(buf
, psz
, n
);
3183 size_t len
= strlen(psz
);
3187 for (size_t c
= 0; c
<= len
; c
++)
3188 buf
[c
] = (unsigned char)(psz
[c
]);
3194 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3196 CreateConvIfNeeded();
3199 return m_convReal
->WC2MB(buf
, psz
, n
);
3202 const size_t len
= wxWcslen(psz
);
3205 for (size_t c
= 0; c
<= len
; c
++)
3210 buf
[c
] = (char)psz
[c
];
3215 for (size_t c
= 0; c
<= len
; c
++)
3225 size_t wxCSConv::GetMBNulLen() const
3227 CreateConvIfNeeded();
3231 return m_convReal
->GetMBNulLen();
3237 // ----------------------------------------------------------------------------
3239 // ----------------------------------------------------------------------------
3242 static wxMBConv_win32 wxConvLibcObj
;
3243 #elif defined(__WXMAC__) && !defined(__MACH__)
3244 static wxMBConv_mac wxConvLibcObj
;
3246 static wxMBConvLibc wxConvLibcObj
;
3249 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3250 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3251 static wxMBConvUTF7 wxConvUTF7Obj
;
3252 static wxMBConvUTF8 wxConvUTF8Obj
;
3254 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3255 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3256 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3257 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3258 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
3259 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3260 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3268 #else // !wxUSE_WCHAR_T
3270 // stand-ins in absence of wchar_t
3271 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3276 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T