1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
35 #include "wx/strconv.h"
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
62 #include "wx/thread.h"
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
76 #include "wx/mac/private.h" // includes mac headers
79 #define TRACE_STRCONV _T("strconv")
81 #if SIZEOF_WCHAR_T == 2
85 // ============================================================================
87 // ============================================================================
89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
90 static bool NotAllNULs(const char *p
, size_t n
)
92 while ( n
&& *p
++ == '\0' )
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
103 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
108 *output
= (wxUint16
) input
;
111 else if (input
>=0x110000)
119 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
120 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
126 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
128 if ((*input
<0xd800) || (*input
>0xdfff))
133 else if ((input
[1]<0xdc00) || (input
[1]>0xdfff))
140 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
146 // ----------------------------------------------------------------------------
148 // ----------------------------------------------------------------------------
151 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
152 const char *src
, size_t srcLen
) const
154 // although new conversion classes are supposed to implement this function
155 // directly, the existing ones only implement the old MB2WC() and so, to
156 // avoid having to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten
= 0;
164 // the number of NULs terminating this string
165 size_t nulLen
wxDUMMY_INITIALIZE(0);
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
173 if ( srcLen
!= (size_t)-1 )
175 // we need to know how to find the end of this string
176 nulLen
= GetMBNulLen();
177 if ( nulLen
== wxCONV_FAILED
)
178 return wxCONV_FAILED
;
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
185 char * const p
= bufTmp
.data();
186 memcpy(p
, src
, srcLen
);
187 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
193 srcEnd
= src
+ srcLen
;
195 else // quit after the first loop iteration
202 // try to convert the current chunk
203 size_t lenChunk
= MB2WC(NULL
, src
, 0);
206 // nothing left in the input string, conversion succeeded
210 if ( lenChunk
== wxCONV_FAILED
)
211 return wxCONV_FAILED
;
213 // if we already have a previous chunk, leave the NUL separating it
222 dstWritten
+= lenChunk
;
226 if ( dstWritten
> dstLen
)
227 return wxCONV_FAILED
;
229 lenChunk
= MB2WC(dst
, src
, lenChunk
+ 1 /* for NUL */);
230 if ( lenChunk
== wxCONV_FAILED
)
231 return wxCONV_FAILED
;
238 // we convert the entire string in this case, as we suppose that the
239 // string is NUL-terminated and so srcEnd is not used at all
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src
, nulLen
) )
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
253 src
+= nulLen
; // skipping over its terminator as well
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by srcEnd
266 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
267 const wchar_t *src
, size_t srcLen
) const
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten
= 0;
272 // make a copy of the input string unless it is already properly
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp
;
278 if ( srcLen
== (size_t)-1 )
280 srcLen
= wxWcslen(src
) + 1;
282 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp
= wxWCharBuffer(srcLen
);
286 memcpy(bufTmp
.data(), src
, srcLen
*sizeof(wchar_t));
290 const size_t lenNul
= GetMBNulLen();
291 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
293 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
295 // try to convert the current chunk
296 size_t lenChunk
= WC2MB(NULL
, src
, 0);
298 if ( lenChunk
== wxCONV_FAILED
)
299 return wxCONV_FAILED
;
302 dstWritten
+= lenChunk
;
306 if ( dstWritten
> dstLen
)
307 return wxCONV_FAILED
;
309 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
310 return wxCONV_FAILED
;
319 size_t wxMBConv::MB2WC(wchar_t *out
, const char *in
, size_t outLen
) const
321 size_t rc
= ToWChar(out
, outLen
, in
);
322 if ( rc
!= wxCONV_FAILED
)
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
332 size_t wxMBConv::WC2MB(char *out
, const wchar_t *in
, size_t outLen
) const
334 size_t rc
= FromWChar(out
, outLen
, in
);
335 if ( rc
!= wxCONV_FAILED
)
343 wxMBConv::~wxMBConv()
345 // nothing to do here (necessary for Darwin linking probably)
348 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
352 // calculate the length of the buffer needed first
353 const size_t nLen
= MB2WC(NULL
, psz
, 0);
354 if ( nLen
!= wxCONV_FAILED
)
356 // now do the actual conversion
357 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
359 // +1 for the trailing NUL
360 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
365 return wxWCharBuffer();
368 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
372 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
373 if ( nLen
!= wxCONV_FAILED
)
375 // extra space for trailing NUL(s)
376 static const size_t extraLen
= GetMaxMBNulLen();
378 wxCharBuffer
buf(nLen
+ extraLen
- 1);
379 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
384 return wxCharBuffer();
388 wxMBConv::cMB2WC(const char *in
, size_t inLen
, size_t *outLen
) const
390 const size_t dstLen
= ToWChar(NULL
, 0, in
, inLen
);
391 if ( dstLen
!= wxCONV_FAILED
)
393 wxWCharBuffer
wbuf(dstLen
);
// wxCONV_FAILED is a non-zero value, so the result must be compared against it
// explicitly: a plain truth test would treat a failed conversion as success
394 if ( ToWChar(wbuf.data(), dstLen, in, inLen) != wxCONV_FAILED )
405 return wxWCharBuffer();
409 wxMBConv::cWC2MB(const wchar_t *in
, size_t inLen
, size_t *outLen
) const
411 const size_t dstLen
= FromWChar(NULL
, 0, in
, inLen
);
412 if ( dstLen
!= wxCONV_FAILED
)
414 wxCharBuffer
buf(dstLen
);
// wxCONV_FAILED is a non-zero value, so the result must be compared against it
// explicitly: a plain truth test would treat a failed conversion as success
415 if ( FromWChar(buf.data(), dstLen, in, inLen) != wxCONV_FAILED )
426 return wxCharBuffer();
429 // ----------------------------------------------------------------------------
431 // ----------------------------------------------------------------------------
433 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
435 return wxMB2WC(buf
, psz
, n
);
438 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
440 return wxWC2MB(buf
, psz
, n
);
443 // ----------------------------------------------------------------------------
444 // wxConvBrokenFileNames
445 // ----------------------------------------------------------------------------
449 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
451 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
452 || wxStricmp(charset
, _T("UTF8")) == 0 )
453 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
455 m_conv
= new wxCSConv(charset
);
460 // ----------------------------------------------------------------------------
462 // ----------------------------------------------------------------------------
464 // Implementation (C) 2004 Fredrik Roubert
467 // BASE64 decoding table
469 static const unsigned char utf7unb64
[] =
471 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
472 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
473 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
474 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
475 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
476 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
477 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
478 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
479 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
480 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
481 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
482 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
483 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
484 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
485 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
486 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
487 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
488 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
489 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
490 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
491 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
492 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
502 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
505 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
509 while ( *psz
&& (!buf
|| (len
< n
)) )
511 unsigned char cc
= *psz
++;
519 else if (*psz
== '-')
527 else // start of BASE64 encoded string
531 for ( ok
= lsb
= false, d
= 0, l
= 0;
532 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
537 for (l
+= 6; l
>= 8; lsb
= !lsb
)
539 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
549 *buf
= (wchar_t)(c
<< 8);
558 // in valid UTF7 we should have valid characters after '+'
567 if ( buf
&& (len
< n
) )
574 // BASE64 encoding table
576 static const unsigned char utf7enb64
[] =
578 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
579 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
580 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
581 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
582 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
583 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
584 'w', 'x', 'y', 'z', '0', '1', '2', '3',
585 '4', '5', '6', '7', '8', '9', '+', '/'
589 // UTF-7 encoding table
591 // 0 - Set D (directly encoded characters)
592 // 1 - Set O (optional direct characters)
593 // 2 - whitespace characters (optional)
594 // 3 - special characters
596 static const unsigned char utf7encode
[128] =
598 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
599 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
600 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
601 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
602 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
603 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
604 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
605 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
608 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
612 while (*psz
&& ((!buf
) || (len
< n
)))
615 if (cc
< 0x80 && utf7encode
[cc
] < 1)
623 else if (((wxUint32
)cc
) > 0xffff)
625 // no surrogate pair generation (yet?)
636 // BASE64 encode string
637 unsigned int lsb
, d
, l
;
638 for (d
= 0, l
= 0; /*nothing*/; psz
++)
640 for (lsb
= 0; lsb
< 2; lsb
++)
643 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
645 for (l
+= 8; l
>= 6; )
649 *buf
++ = utf7enb64
[(d
>> l
) % 64];
654 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
660 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
669 if (buf
&& (len
< n
))
674 // ----------------------------------------------------------------------------
676 // ----------------------------------------------------------------------------
678 static wxUint32 utf8_max
[]=
679 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
681 // boundaries of the private use area we use to (temporarily) remap
682 // characters invalid in a UTF-8 encoded string
683 const wxUint32 wxUnicodePUA
= 0x100000;
684 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
686 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
690 while (*psz
&& ((!buf
) || (len
< n
)))
692 const char *opsz
= psz
;
693 bool invalid
= false;
694 unsigned char cc
= *psz
++, fc
= cc
;
696 for (cnt
= 0; fc
& 0x80; cnt
++)
705 // escape the escape character for octal escapes
706 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
707 && cc
== '\\' && (!buf
|| len
< n
))
719 // invalid UTF-8 sequence
724 unsigned ocnt
= cnt
- 1;
725 wxUint32 res
= cc
& (0x3f >> cnt
);
729 if ((cc
& 0xC0) != 0x80)
731 // invalid UTF-8 sequence
736 res
= (res
<< 6) | (cc
& 0x3f);
738 if (invalid
|| res
<= utf8_max
[ocnt
])
740 // illegal UTF-8 encoding
743 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
744 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
746 // if one of our PUA characters turns up externally
747 // it must also be treated as an illegal sequence
748 // (a bit like you have to escape an escape character)
754 // cast is ok because wchar_t == wxUint16 if WC_UTF16
755 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
756 if (pa
== (size_t)-1)
768 *buf
++ = (wchar_t)res
;
770 #endif // WC_UTF16/!WC_UTF16
775 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
777 while (opsz
< psz
&& (!buf
|| len
< n
))
780 // cast is ok because wchar_t == wxUint16 if WC_UTF16
781 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
782 wxASSERT(pa
!= (size_t)-1);
789 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
795 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
797 while (opsz
< psz
&& (!buf
|| len
< n
))
799 if ( buf
&& len
+ 3 < n
)
801 unsigned char on
= *opsz
;
803 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
804 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
805 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
811 else // MAP_INVALID_UTF8_NOT
818 if (buf
&& (len
< n
))
823 static inline bool isoctal(wchar_t wch
)
825 return L
'0' <= wch
&& wch
<= L
'7';
828 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
832 while (*psz
&& ((!buf
) || (len
< n
)))
836 // cast is ok for WC_UTF16
837 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
838 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
840 cc
=(*psz
++) & 0x7fffffff;
843 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
844 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
847 *buf
++ = (char)(cc
- wxUnicodePUA
);
850 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
851 && cc
== L
'\\' && psz
[0] == L
'\\' )
858 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
860 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
864 *buf
++ = (char) ((psz
[0] - L
'0')*0100 +
865 (psz
[1] - L
'0')*010 +
875 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
889 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
891 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
903 // ----------------------------------------------------------------------------
905 // ----------------------------------------------------------------------------
907 #ifdef WORDS_BIGENDIAN
908 #define wxMBConvUTF16straight wxMBConvUTF16BE
909 #define wxMBConvUTF16swap wxMBConvUTF16LE
911 #define wxMBConvUTF16swap wxMBConvUTF16BE
912 #define wxMBConvUTF16straight wxMBConvUTF16LE
918 // copy 16bit MB to 16bit String
919 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
923 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
926 *buf
++ = *(wxUint16
*)psz
;
929 psz
+= sizeof(wxUint16
);
931 if (buf
&& len
<n
) *buf
=0;
937 // copy 16bit String to 16bit MB
938 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
942 while (*psz
&& (!buf
|| len
< n
))
946 *(wxUint16
*)buf
= *psz
;
947 buf
+= sizeof(wxUint16
);
949 len
+= sizeof(wxUint16
);
// NUL-terminate only if a whole 16 bit unit still fits; written as an addition
// so that n < sizeof(wxUint16) cannot wrap around and allow an overflowing write
952 if (buf && len + sizeof(wxUint16) <= n) *(wxUint16*)buf=0;
958 // swap 16bit MB to 16bit String
959 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
963 // UTF16 string must be terminated by 2 NULs as single NULs may occur
965 while ( (psz
[0] || psz
[1]) && (!buf
|| len
< n
) )
969 ((char *)buf
)[0] = psz
[1];
970 ((char *)buf
)[1] = psz
[0];
977 if ( buf
&& len
< n
)
984 // swap 16bit String to 16bit MB
985 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
989 while ( *psz
&& (!buf
|| len
< n
) )
993 *buf
++ = ((char*)psz
)[1];
994 *buf
++ = ((char*)psz
)[0];
1000 if ( buf
&& len
< n
)
1010 // copy 16bit MB to 32bit String
1011 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1015 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1018 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
1019 if (pa
== (size_t)-1)
1023 *buf
++ = (wchar_t)cc
;
1025 psz
+= pa
* sizeof(wxUint16
);
1027 if (buf
&& len
<n
) *buf
=0;
1033 // copy 32bit String to 16bit MB
1034 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1038 while (*psz
&& (!buf
|| len
< n
))
1041 size_t pa
=encode_utf16(*psz
, cc
);
1043 if (pa
== (size_t)-1)
1048 *(wxUint16
*)buf
= cc
[0];
1049 buf
+= sizeof(wxUint16
);
1052 *(wxUint16
*)buf
= cc
[1];
1053 buf
+= sizeof(wxUint16
);
1057 len
+= pa
*sizeof(wxUint16
);
// NUL-terminate only if a whole 16 bit unit still fits; written as an addition
// so that n < sizeof(wxUint16) cannot wrap around and allow an overflowing write
1060 if (buf && len + sizeof(wxUint16) <= n) *(wxUint16*)buf=0;
1066 // swap 16bit MB to 32bit String
1067 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1071 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1075 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
1076 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
1078 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
1079 if (pa
== (size_t)-1)
1083 *buf
++ = (wchar_t)cc
;
1086 psz
+= pa
* sizeof(wxUint16
);
1088 if (buf
&& len
<n
) *buf
=0;
1094 // swap 32bit String to 16bit MB
1095 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1099 while (*psz
&& (!buf
|| len
< n
))
1102 size_t pa
=encode_utf16(*psz
, cc
);
1104 if (pa
== (size_t)-1)
1109 *buf
++ = ((char*)cc
)[1];
1110 *buf
++ = ((char*)cc
)[0];
1113 *buf
++ = ((char*)cc
)[3];
1114 *buf
++ = ((char*)cc
)[2];
1118 len
+= pa
*sizeof(wxUint16
);
// NUL-terminate only if a whole 16 bit unit still fits; written as an addition
// so that n < sizeof(wxUint16) cannot wrap around and allow an overflowing write
1121 if (buf && len + sizeof(wxUint16) <= n) *(wxUint16*)buf=0;
1129 // ----------------------------------------------------------------------------
1131 // ----------------------------------------------------------------------------
1133 #ifdef WORDS_BIGENDIAN
1134 #define wxMBConvUTF32straight wxMBConvUTF32BE
1135 #define wxMBConvUTF32swap wxMBConvUTF32LE
1137 #define wxMBConvUTF32swap wxMBConvUTF32BE
1138 #define wxMBConvUTF32straight wxMBConvUTF32LE
1142 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1143 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1148 // copy 32bit MB to 16bit String
1149 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1153 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1157 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
1158 if (pa
== (size_t)-1)
1168 psz
+= sizeof(wxUint32
);
1170 if (buf
&& len
<n
) *buf
=0;
1176 // copy 16bit String to 32bit MB
1177 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1181 while (*psz
&& (!buf
|| len
< n
))
1185 // cast is ok for WC_UTF16
1186 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1187 if (pa
== (size_t)-1)
1192 *(wxUint32
*)buf
= cc
;
1193 buf
+= sizeof(wxUint32
);
1195 len
+= sizeof(wxUint32
);
// terminate only if a whole 32 bit unit still fits; written as an addition so
// that n < sizeof(wxUint32) cannot wrap around and allow an overflowing write
1199 if (buf && len + sizeof(wxUint32) <= n)
1207 // swap 32bit MB to 16bit String
1208 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1212 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1215 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
1216 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
1221 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
1222 if (pa
== (size_t)-1)
1232 psz
+= sizeof(wxUint32
);
1242 // swap 16bit String to 32bit MB
1243 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1247 while (*psz
&& (!buf
|| len
< n
))
1251 // cast is ok for WC_UTF16
1252 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1253 if (pa
== (size_t)-1)
1263 len
+= sizeof(wxUint32
);
// terminate only if a whole 32 bit unit still fits; written as an addition so
// that n < sizeof(wxUint32) cannot wrap around and allow an overflowing write
1267 if (buf && len + sizeof(wxUint32) <= n)
1276 // copy 32bit MB to 32bit String
1277 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1281 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1284 *buf
++ = (wchar_t)(*(wxUint32
*)psz
);
1286 psz
+= sizeof(wxUint32
);
1296 // copy 32bit String to 32bit MB
1297 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1301 while (*psz
&& (!buf
|| len
< n
))
1305 *(wxUint32
*)buf
= *psz
;
1306 buf
+= sizeof(wxUint32
);
1309 len
+= sizeof(wxUint32
);
// terminate only if a whole 32 bit unit still fits; written as an addition so
// that n < sizeof(wxUint32) cannot wrap around and allow an overflowing write
1313 if (buf && len + sizeof(wxUint32) <= n)
1320 // swap 32bit MB to 32bit String
1321 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1325 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1329 ((char *)buf
)[0] = psz
[3];
1330 ((char *)buf
)[1] = psz
[2];
1331 ((char *)buf
)[2] = psz
[1];
1332 ((char *)buf
)[3] = psz
[0];
1336 psz
+= sizeof(wxUint32
);
1346 // swap 32bit String to 32bit MB
1347 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1351 while (*psz
&& (!buf
|| len
< n
))
1355 *buf
++ = ((char *)psz
)[3];
1356 *buf
++ = ((char *)psz
)[2];
1357 *buf
++ = ((char *)psz
)[1];
1358 *buf
++ = ((char *)psz
)[0];
1360 len
+= sizeof(wxUint32
);
// terminate only if a whole 32 bit unit still fits; written as an addition so
// that n < sizeof(wxUint32) cannot wrap around and allow an overflowing write
1364 if (buf && len + sizeof(wxUint32) <= n)
1374 // ============================================================================
1375 // The classes doing conversion using the iconv_xxx() functions
1376 // ============================================================================
1380 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1381 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1382 // (unless there's yet another bug in glibc) the only case when iconv()
1383 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1384 // left in the input buffer -- when _real_ error occurs,
1385 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1387 // [This bug does not appear in glibc 2.2.]
1388 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1389 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1390 (errno != E2BIG || bufLeft != 0))
1392 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1395 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1397 #define ICONV_T_INVALID ((iconv_t)-1)
1399 #if SIZEOF_WCHAR_T == 4
1400 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1401 #define WC_ENC wxFONTENCODING_UTF32
1402 #elif SIZEOF_WCHAR_T == 2
1403 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1404 #define WC_ENC wxFONTENCODING_UTF16
1405 #else // sizeof(wchar_t) != 2 nor 4
1406 // does this ever happen?
1407 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1410 // ----------------------------------------------------------------------------
1411 // wxMBConv_iconv: encapsulates an iconv character set
1412 // ----------------------------------------------------------------------------
1414 class wxMBConv_iconv
: public wxMBConv
1417 wxMBConv_iconv(const wxChar
*name
);
1418 virtual ~wxMBConv_iconv();
1420 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1421 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1423 // classify this encoding as explained in wxMBConv::GetMBNulLen()
1425 virtual size_t GetMBNulLen() const;
1428 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1431 // the iconv handlers used to translate from multibyte to wide char and in
1432 // the other direction
1436 // guards access to m2w and w2m objects
1437 wxMutex m_iconvMutex
;
1441 // the name (for iconv_open()) of a wide char charset -- if none is
1442 // available on this machine, it will remain empty
1443 static wxString ms_wcCharsetName
;
1445 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1446 // different endian-ness than the native one
1447 static bool ms_wcNeedsSwap
;
1449 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1451 size_t m_minMBCharWidth
;
1454 // make the constructor available for unit testing
1455 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1457 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1458 if ( !result
->IsOk() )
1466 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1467 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1469 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1471 m_minMBCharWidth
= 0;
1473 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1474 // names for the charsets
1475 const wxCharBuffer
cname(wxString(name
).ToAscii());
1477 // check for charset that represents wchar_t:
1478 if ( ms_wcCharsetName
.empty() )
1480 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1483 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1484 #else // !wxUSE_FONTMAP
1485 static const wxChar
*names
[] =
1487 #if SIZEOF_WCHAR_T == 4
// comparison ("==") is required here: "=" is not a valid operator in a
// preprocessor conditional expression
1489 #elif SIZEOF_WCHAR_T == 2
1494 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1496 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1498 const wxString
nameCS(*names
);
1500 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1501 wxString
nameXE(nameCS
);
1502 #ifdef WORDS_BIGENDIAN
1504 #else // little endian
1508 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1511 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1512 if ( m2w
== ICONV_T_INVALID
)
1514 // try charset w/o bytesex info (e.g. "UCS4")
1515 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1517 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1519 // and check for bytesex ourselves:
1520 if ( m2w
!= ICONV_T_INVALID
)
1522 char buf
[2], *bufPtr
;
1523 wchar_t wbuf
[2], *wbufPtr
;
1531 outsz
= SIZEOF_WCHAR_T
* 2;
1535 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1536 (char**)&wbufPtr
, &outsz
);
1538 if (ICONV_FAILED(res
, insz
))
1540 wxLogLastError(wxT("iconv"));
1541 wxLogError(_("Conversion to charset '%s' doesn't work."),
1544 else // ok, can convert to this encoding, remember it
1546 ms_wcCharsetName
= nameCS
;
1547 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1551 else // use charset not requiring byte swapping
1553 ms_wcCharsetName
= nameXE
;
1557 wxLogTrace(TRACE_STRCONV
,
1558 wxT("iconv wchar_t charset is \"%s\"%s"),
1559 ms_wcCharsetName
.empty() ? _T("<none>")
1560 : ms_wcCharsetName
.c_str(),
1561 ms_wcNeedsSwap
? _T(" (needs swap)")
1564 else // we already have ms_wcCharsetName
1566 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1569 if ( ms_wcCharsetName
.empty() )
1571 w2m
= ICONV_T_INVALID
;
1575 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1576 if ( w2m
== ICONV_T_INVALID
)
1578 wxLogTrace(TRACE_STRCONV
,
1579 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1580 ms_wcCharsetName
.c_str(), cname
.data());
1585 wxMBConv_iconv::~wxMBConv_iconv()
1587 if ( m2w
!= ICONV_T_INVALID
)
1589 if ( w2m
!= ICONV_T_INVALID
)
1593 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1595 // find the string length: notice that this must be done differently for
1596 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1598 const size_t nulLen
= GetMBNulLen();
1605 inbuf
= strlen(psz
); // arguably more optimized than our version
1610 // for UTF-16/32 not only do we need to have 2/4 consecutive NULs but
1611 // they also have to start at character boundary and not span two
1612 // adjacent characters
1614 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1621 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1622 // Unfortunately there are a couple of global wxCSConv objects such as
1623 // wxConvLocal that are used all over wx code, so we have to make sure
1624 // the handle is used by at most one thread at a time. Otherwise
1625 // only a few wx classes would be safe to use from non-main threads
1626 // as MB<->WC conversion would fail "randomly".
1627 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1628 #endif // wxUSE_THREADS
1631 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1633 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1634 wchar_t *bufPtr
= buf
;
1635 const char *pszPtr
= psz
;
1639 // have destination buffer, convert there
1641 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1642 (char**)&bufPtr
, &outbuf
);
1643 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1647 // convert to native endianness
// swap every converted character in place; the element written must be
// buf[i] (indexing with n would write outside the converted data on each pass)
1648 for ( unsigned i = 0; i < res; i++ )
1649 buf[i] = WC_BSWAP(buf[i]);
1652 // NUL-terminate the string if there is any space left
1658 // no destination buffer... convert using temp buffer
1659 // to calculate destination buffer requirement
1664 outbuf
= 8*SIZEOF_WCHAR_T
;
1667 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1668 (char**)&bufPtr
, &outbuf
);
1670 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1671 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1674 if (ICONV_FAILED(cres
, inbuf
))
1676 //VS: it is ok if iconv fails, hence trace only
1677 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1684 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1687 // NB: explained in MB2WC
1688 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1691 size_t inlen
= wxWcslen(psz
);
1692 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1696 wchar_t *tmpbuf
= 0;
1700 // need to copy to temp buffer to switch endianness
1701 // (doing WC_BSWAP twice on the original buffer won't help, as it
1702 // could be in read-only memory, or be accessed in some other thread)
1703 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
// fill the temporary copy with the byte-swapped input, one character per
// iteration; the destination index must be i (n is the output size, not an
// index into tmpbuf)
1704 for ( size_t i = 0; i < inlen; i++ )
1705 tmpbuf[i] = WC_BSWAP(psz[i]);
1706 tmpbuf
[inlen
] = L
'\0';
1712 // have destination buffer, convert there
1713 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1717 // NB: iconv was given only wcslen(psz) characters on input, and so
1718 // it couldn't convert the trailing zero. Let's do it ourselves
1719 // if there's some room left for it in the output buffer.
1725 // no destination buffer... convert using temp buffer
1726 // to calculate destination buffer requirement
1730 buf
= tbuf
; outbuf
= 16;
1732 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1735 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1743 if (ICONV_FAILED(cres
, inbuf
))
1745 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1752 size_t wxMBConv_iconv::GetMBNulLen() const
1754 if ( m_minMBCharWidth
== 0 )
1756 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1759 // NB: explained in MB2WC
1760 wxMutexLocker
lock(self
->m_iconvMutex
);
1763 wchar_t *wnul
= L
"";
1764 char buf
[8]; // should be enough for NUL in any encoding
1765 size_t inLen
= sizeof(wchar_t),
1766 outLen
= WXSIZEOF(buf
);
1767 char *in
= (char *)wnul
;
1769 if ( iconv(w2m
, ICONV_CHAR_CAST(&in
), &inLen
, &out
, &outLen
) == (size_t)-1 )
1771 self
->m_minMBCharWidth
= (size_t)-1;
1775 self
->m_minMBCharWidth
= out
- buf
;
1779 return m_minMBCharWidth
;
1782 #endif // HAVE_ICONV
1785 // ============================================================================
1786 // Win32 conversion classes
1787 // ============================================================================
1789 #ifdef wxHAVE_WIN32_MB2WC
// Code page lookup helpers, only declared here. wxMBConv_win32 stores their
// results in m_CodePage and its IsOk() treats -1 as "no usable code page".
// NOTE(review): presumably they return -1 for an unknown charset/encoding --
// confirm against their definitions.
1793 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1794 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
// wxMBConv implementation built on the Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions for the code page stored in m_CodePage.
// m_minMBCharWidth caches the result of GetMBNulLen() (0 = not computed yet).
1797 class wxMBConv_win32
: public wxMBConv
// default constructor: use the ANSI code page
1802 m_CodePage
= CP_ACP
;
1803 m_minMBCharWidth
= 0;
// construct from a charset name, mapped to a code page by wxCharsetToCodepage()
1807 wxMBConv_win32(const wxChar
* name
)
1809 m_CodePage
= wxCharsetToCodepage(name
);
1810 m_minMBCharWidth
= 0;
// construct from a wxFontEncoding, mapped by wxEncodingToCodepage()
1813 wxMBConv_win32(wxFontEncoding encoding
)
1815 m_CodePage
= wxEncodingToCodepage(encoding
);
1816 m_minMBCharWidth
= 0;
1818 #endif // wxUSE_FONTMAP
// Multibyte -> wide conversion of the NUL-terminated string psz.
// buf may be NULL, in which case 0 is passed as the output size so that
// only the required length is computed.
1820 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1822 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
1823 // the behaviour is not compatible with the Unix version (using iconv)
1824 // and break the library itself, e.g. wxTextInputStream::NextChar()
1825 // wouldn't work if reading an incomplete MB char didn't result in an
1828 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1829 // an error (tested under Windows Server 2003) and apparently it is
1830 // done on purpose, i.e. the function accepts any input in this case
1831 // and although I'd prefer to return error on ill-formed output, our
1832 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1833 // explicitly ill-formed according to RFC 2152) neither so we don't
1834 // even have any fallback here...
1836 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1837 // Win XP or newer and if it is specified on older versions, conversion
1838 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1839 // fails. So we can only use the flag on newer Windows versions.
1840 // Additionally, the flag is not supported by UTF7, symbol and CJK
1841 // encodings. See here:
1842 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1843 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1845 if ( m_CodePage
!= CP_UTF7
&& m_CodePage
!= CP_SYMBOL
&&
1846 m_CodePage
< 50000 &&
1847 IsAtLeastWin2kSP4() )
1849 flags
= MB_ERR_INVALID_CHARS
;
1851 else if ( m_CodePage
== CP_UTF8
)
1853 // Avoid round-trip in the special case of UTF-8 by using our
1854 // own UTF-8 conversion code:
1855 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
1858 const size_t len
= ::MultiByteToWideChar
1860 m_CodePage
, // code page
1861 flags
, // flags: fall on error
1862 psz
, // input string
1863 -1, // its length (NUL-terminated)
1864 buf
, // output string
1865 buf
? n
: 0 // size of output buffer
1869 // function totally failed
1873 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1874 // check if we succeeded, by doing a double trip:
1875 if ( !flags
&& buf
)
1877 const size_t mbLen
= strlen(psz
);
1878 wxCharBuffer
mbBuf(mbLen
);
1879 if ( ::WideCharToMultiByte
1886 mbLen
+ 1, // size in bytes, not length
1890 strcmp(mbBuf
, psz
) != 0 )
1892 // we didn't obtain the same thing we started from, hence
1893 // the conversion was lossy and we consider that it failed
1898 // note that it returns count of written chars for buf != NULL and size
1899 // of the needed buffer for buf == NULL so in either case the length of
1900 // the string (which never includes the terminating NUL) is one less
// Wide -> multibyte conversion of the NUL-terminated string pwz.
// As in MB2WC(), a NULL buf means "compute the needed size only".
1904 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1907 we have a problem here: by default, WideCharToMultiByte() may
1908 replace characters unrepresentable in the target code page with bad
1909 quality approximations such as turning "1/2" symbol (U+00BD) into
1910 "1" for the code pages which don't have it and we, obviously, want
1911 to avoid this at any price
1913 the trouble is that this function does it _silently_, i.e. it won't
1914 even tell us whether it did or not... Win98/2000 and higher provide
1915 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1916 we have to resort to a round trip, i.e. check that converting back
1917 results in the same string -- this is, of course, expensive but
1918 otherwise we simply can't be sure to not garble the data.
1921 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1922 // it doesn't work with CJK encodings (which we test for rather roughly
1923 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1925 BOOL usedDef
wxDUMMY_INITIALIZE(false);
1928 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1930 // it's our lucky day
1931 flags
= WC_NO_BEST_FIT_CHARS
;
1932 pUsedDef
= &usedDef
;
1934 else // old system or unsupported encoding
1940 const size_t len
= ::WideCharToMultiByte
1942 m_CodePage
, // code page
1943 flags
, // either none or no best fit
1944 pwz
, // input string
1945 -1, // it is (wide) NUL-terminated
1946 buf
, // output buffer
1947 buf
? n
: 0, // and its size
1948 NULL
, // default "replacement" char
1949 pUsedDef
// [out] was it used?
1954 // function totally failed
1958 // if we were really converting, check if we succeeded
1963 // check if the conversion failed, i.e. if any replacements
1968 else // we must resort to double tripping...
// convert the result back and compare it with the input
1970 wxWCharBuffer
wcBuf(n
);
1971 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1972 wcscmp(wcBuf
, pwz
) != 0 )
1974 // we didn't obtain the same thing we started from, hence
1975 // the conversion was lossy and we consider that it failed
1981 // see the comment above for the reason of "len - 1"
// Returns the cached length of NUL in this code page, computing it on first
// use by converting a single wide NUL with WideCharToMultiByte().
1985 virtual size_t GetMBNulLen() const
1987 if ( m_minMBCharWidth
== 0 )
1989 int len
= ::WideCharToMultiByte
1991 m_CodePage
, // code page
1993 L
"", // input string
1994 1, // translate just the NUL
1995 NULL
, // output buffer
1997 NULL
, // no replacement char
1998 NULL
// [out] don't care if it was used
// the method is const, so the cache is updated through a non-const alias
2001 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2005 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2009 self
->m_minMBCharWidth
= (size_t)-1;
2015 self
->m_minMBCharWidth
= len
;
2020 return m_minMBCharWidth
;
// the converter is usable unless the code page lookup yielded -1
2023 bool IsOk() const { return m_CodePage
!= -1; }
// Tells whether WC_NO_BEST_FIT_CHARS may be used; the OS version check is
// done once and cached in a function-local static (-1 = not checked yet).
2026 static bool CanUseNoBestFit()
2028 static int s_isWin98Or2k
= -1;
2030 if ( s_isWin98Or2k
== -1 )
2033 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2036 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2040 s_isWin98Or2k
= verMaj
>= 5;
2044 // unknown, be conservative by default
2048 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2051 return s_isWin98Or2k
== 1;
// Tells whether the system is Windows 2000 SP4 or later, using GetVersionEx();
// the answer is cached in a function-local static (-1 = not checked yet).
2054 static bool IsAtLeastWin2kSP4()
2059 static int s_isAtLeastWin2kSP4
= -1;
2061 if ( s_isAtLeastWin2kSP4
== -1 )
2063 OSVERSIONINFOEX ver
;
2065 memset(&ver
, 0, sizeof(ver
));
2066 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2067 GetVersionEx((OSVERSIONINFO
*)&ver
);
2069 s_isAtLeastWin2kSP4
=
2070 ((ver
.dwMajorVersion
> 5) || // Vista+
2071 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2072 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2073 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2077 return s_isAtLeastWin2kSP4
== 1;
2082 // the code page we're working with
2085 // cached result of GetMBNulLen(), set to 0 initially meaning
2087 size_t m_minMBCharWidth
;
2090 #endif // wxHAVE_WIN32_MB2WC
2092 // ============================================================================
2093 // Cocoa conversion classes
2094 // ============================================================================
2096 #if defined(__WXCOCOA__)
2098 // RN: There is no UTF-32 support in either Core Foundation or
2099 // Cocoa. Strangely enough, internally Core Foundation uses
2100 // UTF 32 internally quite a bit - its just not public (yet).
2102 #include <CoreFoundation/CFString.h>
2103 #include <CoreFoundation/CFStringEncodingExt.h>
2105 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2107 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2108 if ( encoding
== wxFONTENCODING_DEFAULT
)
2110 enc
= CFStringGetSystemEncoding();
2112 else switch( encoding
)
2114 case wxFONTENCODING_ISO8859_1
:
2115 enc
= kCFStringEncodingISOLatin1
;
2117 case wxFONTENCODING_ISO8859_2
:
2118 enc
= kCFStringEncodingISOLatin2
;
2120 case wxFONTENCODING_ISO8859_3
:
2121 enc
= kCFStringEncodingISOLatin3
;
2123 case wxFONTENCODING_ISO8859_4
:
2124 enc
= kCFStringEncodingISOLatin4
;
2126 case wxFONTENCODING_ISO8859_5
:
2127 enc
= kCFStringEncodingISOLatinCyrillic
;
2129 case wxFONTENCODING_ISO8859_6
:
2130 enc
= kCFStringEncodingISOLatinArabic
;
2132 case wxFONTENCODING_ISO8859_7
:
2133 enc
= kCFStringEncodingISOLatinGreek
;
2135 case wxFONTENCODING_ISO8859_8
:
2136 enc
= kCFStringEncodingISOLatinHebrew
;
2138 case wxFONTENCODING_ISO8859_9
:
2139 enc
= kCFStringEncodingISOLatin5
;
2141 case wxFONTENCODING_ISO8859_10
:
2142 enc
= kCFStringEncodingISOLatin6
;
2144 case wxFONTENCODING_ISO8859_11
:
2145 enc
= kCFStringEncodingISOLatinThai
;
2147 case wxFONTENCODING_ISO8859_13
:
2148 enc
= kCFStringEncodingISOLatin7
;
2150 case wxFONTENCODING_ISO8859_14
:
2151 enc
= kCFStringEncodingISOLatin8
;
2153 case wxFONTENCODING_ISO8859_15
:
2154 enc
= kCFStringEncodingISOLatin9
;
2157 case wxFONTENCODING_KOI8
:
2158 enc
= kCFStringEncodingKOI8_R
;
2160 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2161 enc
= kCFStringEncodingDOSRussian
;
2164 // case wxFONTENCODING_BULGARIAN :
2168 case wxFONTENCODING_CP437
:
2169 enc
=kCFStringEncodingDOSLatinUS
;
2171 case wxFONTENCODING_CP850
:
2172 enc
= kCFStringEncodingDOSLatin1
;
2174 case wxFONTENCODING_CP852
:
2175 enc
= kCFStringEncodingDOSLatin2
;
2177 case wxFONTENCODING_CP855
:
2178 enc
= kCFStringEncodingDOSCyrillic
;
2180 case wxFONTENCODING_CP866
:
2181 enc
=kCFStringEncodingDOSRussian
;
2183 case wxFONTENCODING_CP874
:
2184 enc
= kCFStringEncodingDOSThai
;
2186 case wxFONTENCODING_CP932
:
2187 enc
= kCFStringEncodingDOSJapanese
;
2189 case wxFONTENCODING_CP936
:
2190 enc
=kCFStringEncodingDOSChineseSimplif
;
2192 case wxFONTENCODING_CP949
:
2193 enc
= kCFStringEncodingDOSKorean
;
2195 case wxFONTENCODING_CP950
:
2196 enc
= kCFStringEncodingDOSChineseTrad
;
2198 case wxFONTENCODING_CP1250
:
2199 enc
= kCFStringEncodingWindowsLatin2
;
2201 case wxFONTENCODING_CP1251
:
2202 enc
=kCFStringEncodingWindowsCyrillic
;
2204 case wxFONTENCODING_CP1252
:
2205 enc
=kCFStringEncodingWindowsLatin1
;
2207 case wxFONTENCODING_CP1253
:
2208 enc
= kCFStringEncodingWindowsGreek
;
2210 case wxFONTENCODING_CP1254
:
2211 enc
= kCFStringEncodingWindowsLatin5
;
2213 case wxFONTENCODING_CP1255
:
2214 enc
=kCFStringEncodingWindowsHebrew
;
2216 case wxFONTENCODING_CP1256
:
2217 enc
=kCFStringEncodingWindowsArabic
;
2219 case wxFONTENCODING_CP1257
:
2220 enc
= kCFStringEncodingWindowsBalticRim
;
2222 // This only really encodes to UTF7 (if that) evidently
2223 // case wxFONTENCODING_UTF7 :
2224 // enc = kCFStringEncodingNonLossyASCII ;
2226 case wxFONTENCODING_UTF8
:
2227 enc
= kCFStringEncodingUTF8
;
2229 case wxFONTENCODING_EUC_JP
:
2230 enc
= kCFStringEncodingEUC_JP
;
2232 case wxFONTENCODING_UTF16
:
2233 enc
= kCFStringEncodingUnicode
;
2235 case wxFONTENCODING_MACROMAN
:
2236 enc
= kCFStringEncodingMacRoman
;
2238 case wxFONTENCODING_MACJAPANESE
:
2239 enc
= kCFStringEncodingMacJapanese
;
2241 case wxFONTENCODING_MACCHINESETRAD
:
2242 enc
= kCFStringEncodingMacChineseTrad
;
2244 case wxFONTENCODING_MACKOREAN
:
2245 enc
= kCFStringEncodingMacKorean
;
2247 case wxFONTENCODING_MACARABIC
:
2248 enc
= kCFStringEncodingMacArabic
;
2250 case wxFONTENCODING_MACHEBREW
:
2251 enc
= kCFStringEncodingMacHebrew
;
2253 case wxFONTENCODING_MACGREEK
:
2254 enc
= kCFStringEncodingMacGreek
;
2256 case wxFONTENCODING_MACCYRILLIC
:
2257 enc
= kCFStringEncodingMacCyrillic
;
2259 case wxFONTENCODING_MACDEVANAGARI
:
2260 enc
= kCFStringEncodingMacDevanagari
;
2262 case wxFONTENCODING_MACGURMUKHI
:
2263 enc
= kCFStringEncodingMacGurmukhi
;
2265 case wxFONTENCODING_MACGUJARATI
:
2266 enc
= kCFStringEncodingMacGujarati
;
2268 case wxFONTENCODING_MACORIYA
:
2269 enc
= kCFStringEncodingMacOriya
;
2271 case wxFONTENCODING_MACBENGALI
:
2272 enc
= kCFStringEncodingMacBengali
;
2274 case wxFONTENCODING_MACTAMIL
:
2275 enc
= kCFStringEncodingMacTamil
;
2277 case wxFONTENCODING_MACTELUGU
:
2278 enc
= kCFStringEncodingMacTelugu
;
2280 case wxFONTENCODING_MACKANNADA
:
2281 enc
= kCFStringEncodingMacKannada
;
2283 case wxFONTENCODING_MACMALAJALAM
:
2284 enc
= kCFStringEncodingMacMalayalam
;
2286 case wxFONTENCODING_MACSINHALESE
:
2287 enc
= kCFStringEncodingMacSinhalese
;
2289 case wxFONTENCODING_MACBURMESE
:
2290 enc
= kCFStringEncodingMacBurmese
;
2292 case wxFONTENCODING_MACKHMER
:
2293 enc
= kCFStringEncodingMacKhmer
;
2295 case wxFONTENCODING_MACTHAI
:
2296 enc
= kCFStringEncodingMacThai
;
2298 case wxFONTENCODING_MACLAOTIAN
:
2299 enc
= kCFStringEncodingMacLaotian
;
2301 case wxFONTENCODING_MACGEORGIAN
:
2302 enc
= kCFStringEncodingMacGeorgian
;
2304 case wxFONTENCODING_MACARMENIAN
:
2305 enc
= kCFStringEncodingMacArmenian
;
2307 case wxFONTENCODING_MACCHINESESIMP
:
2308 enc
= kCFStringEncodingMacChineseSimp
;
2310 case wxFONTENCODING_MACTIBETAN
:
2311 enc
= kCFStringEncodingMacTibetan
;
2313 case wxFONTENCODING_MACMONGOLIAN
:
2314 enc
= kCFStringEncodingMacMongolian
;
2316 case wxFONTENCODING_MACETHIOPIC
:
2317 enc
= kCFStringEncodingMacEthiopic
;
2319 case wxFONTENCODING_MACCENTRALEUR
:
2320 enc
= kCFStringEncodingMacCentralEurRoman
;
2322 case wxFONTENCODING_MACVIATNAMESE
:
2323 enc
= kCFStringEncodingMacVietnamese
;
2325 case wxFONTENCODING_MACARABICEXT
:
2326 enc
= kCFStringEncodingMacExtArabic
;
2328 case wxFONTENCODING_MACSYMBOL
:
2329 enc
= kCFStringEncodingMacSymbol
;
2331 case wxFONTENCODING_MACDINGBATS
:
2332 enc
= kCFStringEncodingMacDingbats
;
2334 case wxFONTENCODING_MACTURKISH
:
2335 enc
= kCFStringEncodingMacTurkish
;
2337 case wxFONTENCODING_MACCROATIAN
:
2338 enc
= kCFStringEncodingMacCroatian
;
2340 case wxFONTENCODING_MACICELANDIC
:
2341 enc
= kCFStringEncodingMacIcelandic
;
2343 case wxFONTENCODING_MACROMANIAN
:
2344 enc
= kCFStringEncodingMacRomanian
;
2346 case wxFONTENCODING_MACCELTIC
:
2347 enc
= kCFStringEncodingMacCeltic
;
2349 case wxFONTENCODING_MACGAELIC
:
2350 enc
= kCFStringEncodingMacGaelic
;
2352 // case wxFONTENCODING_MACKEYBOARD :
2353 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2356 // because gcc is picky
// wxMBConv implementation for wxCocoa which goes through CFString: the
// multibyte side uses the CFStringEncoding stored in m_encoding, the wide
// side uses UniChar data (converted with wxMBConvUTF16 when wchar_t is
// 4 bytes wide).
2362 class wxMBConv_cocoa
: public wxMBConv
// default constructor: use the system encoding
2367 Init(CFStringGetSystemEncoding()) ;
// construct from a charset name, resolved through the font mapper
2371 wxMBConv_cocoa(const wxChar
* name
)
2373 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
// construct from a wxFontEncoding
2377 wxMBConv_cocoa(wxFontEncoding encoding
)
2379 Init( wxCFStringEncFromFontEnc(encoding
) );
// common part of all constructors: remember the encoding to use
2386 void Init( CFStringEncoding encoding
)
2388 m_encoding
= encoding
;
// Multibyte -> wide conversion via a temporary CFString built from szUnConv.
2391 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2395 CFStringRef theString
= CFStringCreateWithBytes (
2396 NULL
, //the allocator
2397 (const UInt8
*)szUnConv
,
2400 false //no BOM/external representation
2403 wxASSERT(theString
);
2405 size_t nOutLength
= CFStringGetLength(theString
);
2409 CFRelease(theString
);
// NOTE(review): the range below is nOutSize characters long, not nOutLength
// -- confirm it cannot exceed the length of theString.
2413 CFRange theRange
= { 0, nOutSize
};
2415 #if SIZEOF_WCHAR_T == 4
2416 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2419 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2421 CFRelease(theString
);
// NOTE(review): the buffer has nOutSize elements but is indexed with
// nOutLength here -- verify that nOutLength < nOutSize always holds.
2423 szUniCharBuffer
[nOutLength
] = '\0' ;
2425 #if SIZEOF_WCHAR_T == 4
// expand the UTF-16 data into the caller's wchar_t buffer
2426 wxMBConvUTF16 converter
;
2427 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
2428 delete[] szUniCharBuffer
;
// Wide -> multibyte conversion via a temporary CFString wrapping the
// UniChar form of szUnConv.
2434 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2438 size_t nRealOutSize
;
2439 size_t nBufSize
= wxWcslen(szUnConv
);
2440 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2442 #if SIZEOF_WCHAR_T == 4
// wchar_t is wider than UniChar: convert to UTF-16 in a temporary buffer
2443 wxMBConvUTF16 converter
;
2444 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2445 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
2446 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
2447 nBufSize
/= sizeof(UniChar
);
2450 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2454 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2457 wxASSERT(theString
);
2459 //Note that CER puts a BOM when converting to unicode
2460 //so we check and use getchars instead in that case
2461 if (m_encoding
== kCFStringEncodingUnicode
)
2464 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2466 nRealOutSize
= CFStringGetLength(theString
) + 1;
2472 CFRangeMake(0, CFStringGetLength(theString
)),
2474 0, //what to put in characters that can't be converted -
2475 //0 tells CFString to return NULL if it meets such a character
2476 false, //not an external representation
2479 (CFIndex
*) &nRealOutSize
2483 CFRelease(theString
);
2485 #if SIZEOF_WCHAR_T == 4
2486 delete[] szUniBuffer
;
2489 return nRealOutSize
- 1;
// usable only if the encoding is valid and available in Core Foundation
2494 return m_encoding
!= kCFStringEncodingInvalidId
&&
2495 CFStringIsEncodingAvailable(m_encoding
);
// the multibyte encoding this converter works with
2499 CFStringEncoding m_encoding
;
2502 #endif // defined(__WXCOCOA__)
2504 // ============================================================================
2505 // Mac conversion classes
2506 // ============================================================================
2508 #if defined(__WXMAC__) && defined(TARGET_CARBON)
// wxMBConv implementation for Carbon based on the Text Encoding Converter:
// two TEC converter objects are kept, one per direction, between
// m_char_encoding and 16 bit Unicode (m_unicode_encoding).
2510 class wxMBConv_mac
: public wxMBConv
// default constructor: use the system encoding
2515 Init(CFStringGetSystemEncoding()) ;
// construct from a charset name, resolved through the font mapper
2519 wxMBConv_mac(const wxChar
* name
)
2521 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
// construct from a wxFontEncoding
2525 wxMBConv_mac(wxFontEncoding encoding
)
2527 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
// dispose of both converters; the returned status is stored but not checked
// in the lines visible here
2532 OSStatus status
= noErr
;
2533 status
= TECDisposeConverter(m_MB2WC_converter
);
2534 status
= TECDisposeConverter(m_WC2MB_converter
);
// common part of all constructors: create one converter per direction
2538 void Init( TextEncodingBase encoding
)
2540 OSStatus status
= noErr
;
2541 m_char_encoding
= encoding
;
2542 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2544 status
= TECCreateConverter(&m_MB2WC_converter
,
2546 m_unicode_encoding
);
2547 status
= TECCreateConverter(&m_WC2MB_converter
,
// Multibyte -> wide conversion of the NUL-terminated string psz.
2552 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2554 OSStatus status
= noErr
;
2555 ByteCount byteOutLen
;
2556 ByteCount byteInLen
= strlen(psz
) ;
2557 wchar_t *tbuf
= NULL
;
2558 UniChar
* ubuf
= NULL
;
// NOTE(review): the temporary buffer below is presumably only allocated
// when buf is NULL -- the guarding condition is not visible here.
2563 //apple specs say at least 32
2564 n
= wxMax( 32 , byteInLen
) ;
2565 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2567 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2568 #if SIZEOF_WCHAR_T == 4
2569 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2571 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2573 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2574 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2575 #if SIZEOF_WCHAR_T == 4
2576 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2577 // is not properly terminated we get random characters at the end
2578 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2579 wxMBConvUTF16 converter
;
2580 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2583 res
= byteOutLen
/ sizeof( UniChar
) ;
2588 if ( buf
&& res
< n
)
// Wide -> multibyte conversion of the NUL-terminated string psz.
2594 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2596 OSStatus status
= noErr
;
2597 ByteCount byteOutLen
;
2598 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2604 //apple specs say at least 32
2605 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2606 tbuf
= (char*) malloc( n
) ;
2609 ByteCount byteBufferLen
= n
;
2610 UniChar
* ubuf
= NULL
;
2611 #if SIZEOF_WCHAR_T == 4
// wchar_t is wider than UniChar: convert the input to UTF-16 first
2612 wxMBConvUTF16 converter
;
2613 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2614 byteInLen
= unicharlen
;
2615 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2616 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2618 ubuf
= (UniChar
*) psz
;
2620 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2621 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2622 #if SIZEOF_WCHAR_T == 4
2628 size_t res
= byteOutLen
;
2629 if ( buf
&& res
< n
)
2633 //we need to double-trip to verify it didn't insert any ? in place
2634 //of bogus characters
2635 wxWCharBuffer
wcBuf(n
);
2636 size_t pszlen
= wxWcslen(psz
);
2637 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2638 wxWcslen(wcBuf
) != pszlen
||
2639 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2641 // we didn't obtain the same thing we started from, hence
2642 // the conversion was lossy and we consider that it failed
// usable only if both converters were created
2651 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
// TEC converter objects, one per conversion direction
2654 TECObjectRef m_MB2WC_converter
;
2655 TECObjectRef m_WC2MB_converter
;
// the multibyte encoding and the Unicode encoding converted between
2657 TextEncodingBase m_char_encoding
;
2658 TextEncodingBase m_unicode_encoding
;
2661 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2663 // ============================================================================
2664 // wxEncodingConverter based conversion classes
2665 // ============================================================================
// wxMBConv implementation based on wxEncodingConverter: one converter
// object per direction (m2w and w2m) between m_enc and
// wxFONTENCODING_UNICODE.
2669 class wxMBConv_wxwin
: public wxMBConv
// initialize both converters; m_ok is true only if both succeed
2674 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2675 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2679 // temporarily just use wxEncodingConverter stuff,
2680 // so that it works while a better implementation is built
2681 wxMBConv_wxwin(const wxChar
* name
)
2684 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2686 m_enc
= wxFONTENCODING_SYSTEM
;
2691 wxMBConv_wxwin(wxFontEncoding enc
)
// Multibyte -> wide conversion. The output size parameter is unused
// (WXUNUSED), so the caller must provide a buffer large enough for the
// whole input.
2698 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2700 size_t inbuf
= strlen(psz
);
2703 if (!m2w
.Convert(psz
,buf
))
// Wide -> multibyte conversion; the output size parameter is unused here too.
2709 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2711 const size_t inbuf
= wxWcslen(psz
);
2714 if (!w2m
.Convert(psz
,buf
))
// NUL length depends on m_enc: the UTF-16 and UTF-32 encodings are handled
// as special cases (the returned values are not visible in this excerpt)
2721 virtual size_t GetMBNulLen() const
2725 case wxFONTENCODING_UTF16BE
:
2726 case wxFONTENCODING_UTF16LE
:
2729 case wxFONTENCODING_UTF32BE
:
2730 case wxFONTENCODING_UTF32LE
:
2738 bool IsOk() const { return m_ok
; }
// the multibyte encoding and the converters for both directions
2741 wxFontEncoding m_enc
;
2742 wxEncodingConverter m2w
, w2m
;
2745 // were we initialized successfully?
2748 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2751 // make the constructors available for unit testing
2752 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2754 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2755 if ( !result
->IsOk() )
2763 #endif // wxUSE_FONTMAP
2765 // ============================================================================
2766 // wxCSConv implementation
2767 // ============================================================================
// Common initialization helper of wxCSConv.
// NOTE(review): the body is not visible in this excerpt; presumably it
// resets m_name, m_convReal and m_deferred -- confirm.
2769 void wxCSConv::Init()
// Constructs a converter for the named charset. With wxUSE_FONTMAP the
// encoding is looked up from the name; otherwise m_encoding is set to
// wxFONTENCODING_SYSTEM.
2776 wxCSConv::wxCSConv(const wxChar
*charset
)
2786 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2788 m_encoding
= wxFONTENCODING_SYSTEM
;
// Constructs a converter for the given encoding. wxFONTENCODING_MAX and
// wxFONTENCODING_DEFAULT are invalid here: they trigger a debug failure
// message and are replaced with wxFONTENCODING_SYSTEM.
2792 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2794 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2796 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2798 encoding
= wxFONTENCODING_SYSTEM
;
2803 m_encoding
= encoding
;
// Destructor.
// NOTE(review): the body is not visible in this excerpt; presumably it
// releases the resources via Clear() -- confirm.
2806 wxCSConv::~wxCSConv()
// Copy constructor: copies the charset name (via SetName(), which
// duplicates the string) and the encoding of conv.
2811 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2816 SetName(conv
.m_name
);
2817 m_encoding
= conv
.m_encoding
;
2820 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2824 SetName(conv
.m_name
);
2825 m_encoding
= conv
.m_encoding
;
// Resets the converter state.
// NOTE(review): the body is not visible in this excerpt; presumably it
// frees m_name and deletes m_convReal -- confirm.
2830 void wxCSConv::Clear()
2839 void wxCSConv::SetName(const wxChar
*charset
)
2843 m_name
= wxStrdup(charset
);
2849 #include "wx/hashmap.h"
// Hash map from wxFontEncoding to the charset name cached for it.
// wxCSConv::DoCreate() stores the name that produced a working iconv
// converter, or an empty string to remember that none did.
2851 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2852 wxEncodingNameCache
);
2854 static wxEncodingNameCache gs_nameCache
;
// Creates the real converter for this object's charset name / encoding.
// Candidates are tried in the order listed in the comment below (OS
// conversion, hard coded UTF conversions, wxEncodingConverter); if nothing
// works an error is logged.
// NOTE(review): several early returns and #if blocks are not visible in this
// excerpt; presumably NULL is returned when no conversion is needed or
// possible -- confirm.
2857 wxMBConv
*wxCSConv::DoCreate() const
2860 wxLogTrace(TRACE_STRCONV
,
2861 wxT("creating conversion for %s"),
2863 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
2864 #endif // wxUSE_FONTMAP
2866 // check for the special case of ASCII or ISO8859-1 charset: as we have
2867 // special knowledge of it anyhow, we don't need to create a special
2868 // conversion object
2869 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2870 m_encoding
== wxFONTENCODING_DEFAULT
)
2872 // don't convert at all
2876 // we trust OS to do conversion better than we can so try external
2877 // conversion methods first
2879 // the full order is:
2880 // 1. OS conversion (iconv() under Unix or Win32 API)
2881 // 2. hard coded conversions for UTF
2882 // 3. wxEncodingConverter as fall back
2888 #endif // !wxUSE_FONTMAP
2890 wxString
name(m_name
);
2891 wxFontEncoding
encoding(m_encoding
);
// an explicit charset name takes precedence: try it with iconv first
2893 if ( !name
.empty() )
2895 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2903 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2904 #endif // wxUSE_FONTMAP
// look in the cache for a name already tried for this encoding; an empty
// cached name records an earlier failure
2908 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2909 if ( it
!= gs_nameCache
.end() )
2911 if ( it
->second
.empty() )
2914 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
// not cached yet: try every known name of the encoding in turn
2921 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2923 for ( ; *names
; ++names
)
2925 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
2928 gs_nameCache
[encoding
] = *names
;
2935 gs_nameCache
[encoding
] = _T(""); // cache the failure
2937 #endif // wxUSE_FONTMAP
2939 #endif // HAVE_ICONV
2941 #ifdef wxHAVE_WIN32_MB2WC
2944 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2945 : new wxMBConv_win32(m_encoding
);
2954 #endif // wxHAVE_WIN32_MB2WC
2955 #if defined(__WXMAC__)
2957 // leave UTF16 and UTF32 to the built-ins of wx
2958 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2959 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2963 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2964 : new wxMBConv_mac(m_encoding
);
2966 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
2975 #if defined(__WXCOCOA__)
2977 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
2981 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
2982 : new wxMBConv_cocoa(m_encoding
);
2984 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
// hard coded UTF conversions: determine the encoding to switch on
2994 wxFontEncoding enc
= m_encoding
;
2996 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2998 // use "false" to suppress interactive dialogs -- we can be called from
2999 // anywhere and popping up a dialog from here is the last thing we want to
3001 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3003 #endif // wxUSE_FONTMAP
3007 case wxFONTENCODING_UTF7
:
3008 return new wxMBConvUTF7
;
3010 case wxFONTENCODING_UTF8
:
3011 return new wxMBConvUTF8
;
3013 case wxFONTENCODING_UTF16BE
:
3014 return new wxMBConvUTF16BE
;
3016 case wxFONTENCODING_UTF16LE
:
3017 return new wxMBConvUTF16LE
;
3019 case wxFONTENCODING_UTF32BE
:
3020 return new wxMBConvUTF32BE
;
3022 case wxFONTENCODING_UTF32LE
:
3023 return new wxMBConvUTF32LE
;
3026 // nothing to do but put here to suppress gcc warnings
// last resort: wxEncodingConverter based conversion
3033 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3034 : new wxMBConv_wxwin(m_encoding
);
3040 #endif // wxUSE_FONTMAP
3042 // NB: This is a hack to prevent deadlock. What could otherwise happen
3043 // in Unicode build: wxConvLocal creation ends up being here
3044 // because of some failure and logs the error. But wxLog will try to
3045 // attach timestamp, for which it will need wxConvLocal (to convert
3046 // time to char* and then wchar_t*), but that fails, tries to log
3047 // error, but wxLog has a (already locked) critical section that
3048 // guards static buffer.
3049 static bool alreadyLoggingError
= false;
3050 if (!alreadyLoggingError
)
3052 alreadyLoggingError
= true;
3053 wxLogError(_("Cannot convert from the charset '%s'!"),
3057 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3058 #else // !wxUSE_FONTMAP
3059 wxString::Format(_("encoding %s"), m_encoding
).c_str()
3060 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3062 alreadyLoggingError
= false;
3068 void wxCSConv::CreateConvIfNeeded() const
3072 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3075 // if we don't have neither the name nor the encoding, use the default
3076 // encoding for this system
3077 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3079 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3081 #endif // wxUSE_INTL
3083 self
->m_convReal
= DoCreate();
3084 self
->m_deferred
= false;
3088 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3090 CreateConvIfNeeded();
3093 return m_convReal
->MB2WC(buf
, psz
, n
);
3096 size_t len
= strlen(psz
);
3100 for (size_t c
= 0; c
<= len
; c
++)
3101 buf
[c
] = (unsigned char)(psz
[c
]);
3107 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3109 CreateConvIfNeeded();
3112 return m_convReal
->WC2MB(buf
, psz
, n
);
3115 const size_t len
= wxWcslen(psz
);
3118 for (size_t c
= 0; c
<= len
; c
++)
3122 buf
[c
] = (char)psz
[c
];
3127 for (size_t c
= 0; c
<= len
; c
++)
3137 size_t wxCSConv::GetMBNulLen() const
3139 CreateConvIfNeeded();
3143 return m_convReal
->GetMBNulLen();
3149 // ----------------------------------------------------------------------------
3151 // ----------------------------------------------------------------------------
// The global converter objects. The file-static *Obj instances are the
// actual objects; the exported wxConvXXX names are references (or, for
// wxConvCurrent, a pointer) bound to them.
// The class of the libc converter object depends on the platform.
3154 static wxMBConv_win32 wxConvLibcObj
;
3155 #elif defined(__WXMAC__) && !defined(__MACH__)
3156 static wxMBConv_mac wxConvLibcObj
;
3158 static wxMBConvLibc wxConvLibcObj
;
3161 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3162 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3163 static wxMBConvUTF7 wxConvUTF7Obj
;
3164 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported names bound to the objects above
3166 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3167 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3168 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3169 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3170 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
// wxConvCurrent initially points at the libc converter
3171 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3172 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3180 #else // !wxUSE_WCHAR_T
3182 // stand-ins in absence of wchar_t
3183 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3188 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T