1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
35 #include "wx/strconv.h"
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
62 #include "wx/thread.h"
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
76 #include "wx/mac/private.h" // includes mac headers
79 #define TRACE_STRCONV _T("strconv")
81 #if SIZEOF_WCHAR_T == 2
85 // ============================================================================
87 // ============================================================================
89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
90 static bool NotAllNULs(const char *p
, size_t n
)
92 while ( n
&& *p
++ == '\0' )
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
103 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
108 *output
= (wxUint16
) input
;
111 else if (input
>=0x110000)
119 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
120 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
126 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
128 if ((*input
<0xd800) || (*input
>0xdfff))
133 else if ((input
[1]<0xdc00) || (input
[1]>0xdfff))
140 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
146 // ----------------------------------------------------------------------------
148 // ----------------------------------------------------------------------------
151 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
152 const char *src
, size_t srcLen
) const
154 // although new conversion classes are supposed to implement this function
155 // directly, the existing ones only implement the old MB2WC() and so, to
156 // avoid to have to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten
= 0;
164 // the number of NULs terminating this string
165 size_t nulLen
wxDUMMY_INITIALIZE(0);
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
173 if ( srcLen
!= (size_t)-1 )
175 // we need to know how to find the end of this string
176 nulLen
= GetMBNulLen();
177 if ( nulLen
== wxCONV_FAILED
)
178 return wxCONV_FAILED
;
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
185 char * const p
= bufTmp
.data();
186 memcpy(p
, src
, srcLen
);
187 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
193 srcEnd
= src
+ srcLen
;
195 else // quit after the first loop iteration
202 // try to convert the current chunk
203 size_t lenChunk
= MB2WC(NULL
, src
, 0);
206 // nothing left in the input string, conversion succeeded; but
207 // still account for the trailing NUL
212 if ( lenChunk
== wxCONV_FAILED
)
213 return wxCONV_FAILED
;
215 lenChunk
++; // for trailing NUL
217 dstWritten
+= lenChunk
;
221 if ( dstWritten
> dstLen
)
222 return wxCONV_FAILED
;
224 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
225 return wxCONV_FAILED
;
232 // we convert the entire string in this case, as we suppose that the
233 // string is NUL-terminated and so srcEnd is not used at all
237 // advance the input pointer past the end of this chunk
238 while ( NotAllNULs(src
, nulLen
) )
240 // notice that we must skip over multiple bytes here as we suppose
241 // that if NUL takes 2 or 4 bytes, then all the other characters do
242 // too and so if advanced by a single byte we might erroneously
243 // detect sequences of NUL bytes in the middle of the input
247 src
+= nulLen
; // skipping over its terminator as well
249 // note that ">=" (and not just "==") is needed here as the terminator
250 // we skipped just above could be inside or just after the buffer
251 // delimited by srcEnd
260 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
261 const wchar_t *src
, size_t srcLen
) const
263 // the number of chars [which would be] written to dst [if it were not NULL]
264 size_t dstWritten
= 0;
266 // make a copy of the input string unless it is already properly
269 // if we don't know its length we have no choice but to assume that it is,
270 // indeed, properly terminated
271 wxWCharBuffer bufTmp
;
272 if ( srcLen
== (size_t)-1 )
274 srcLen
= wxWcslen(src
) + 1;
276 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
278 // make a copy in order to properly NUL-terminate the string
279 bufTmp
= wxWCharBuffer(srcLen
);
280 memcpy(bufTmp
.data(), src
, srcLen
*sizeof(wchar_t));
284 const size_t lenNul
= GetMBNulLen();
285 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
287 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
289 // try to convert the current chunk
290 size_t lenChunk
= WC2MB(NULL
, src
, 0);
292 if ( lenChunk
== wxCONV_FAILED
)
293 return wxCONV_FAILED
;
296 dstWritten
+= lenChunk
;
300 if ( dstWritten
> dstLen
)
301 return wxCONV_FAILED
;
303 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
304 return wxCONV_FAILED
;
313 size_t wxMBConv::MB2WC(wchar_t *out
, const char *in
, size_t outLen
) const
315 size_t rc
= ToWChar(out
, outLen
, in
);
316 if ( rc
!= wxCONV_FAILED
)
318 // ToWChar() returns the buffer length, i.e. including the trailing
319 // NUL, while this method doesn't take it into account
326 size_t wxMBConv::WC2MB(char *out
, const wchar_t *in
, size_t outLen
) const
328 size_t rc
= FromWChar(out
, outLen
, in
);
329 if ( rc
!= wxCONV_FAILED
)
337 wxMBConv::~wxMBConv()
339 // nothing to do here (necessary for Darwin linking probably)
342 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
346 // calculate the length of the buffer needed first
347 const size_t nLen
= MB2WC(NULL
, psz
, 0);
348 if ( nLen
!= wxCONV_FAILED
)
350 // now do the actual conversion
351 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
353 // +1 for the trailing NUL
354 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
359 return wxWCharBuffer();
362 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
366 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
367 if ( nLen
!= wxCONV_FAILED
)
369 // extra space for trailing NUL(s)
370 static const size_t extraLen
= GetMaxMBNulLen();
372 wxCharBuffer
buf(nLen
+ extraLen
- 1);
373 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
378 return wxCharBuffer();
382 wxMBConv::cMB2WC(const char *in
, size_t inLen
, size_t *outLen
) const
384 const size_t dstLen
= ToWChar(NULL
, 0, in
, inLen
);
385 if ( dstLen
!= wxCONV_FAILED
)
387 wxWCharBuffer
wbuf(dstLen
- 1);
388 if ( ToWChar(wbuf
.data(), dstLen
, in
, inLen
) )
391 *outLen
= dstLen
- 1;
399 return wxWCharBuffer();
403 wxMBConv::cWC2MB(const wchar_t *in
, size_t inLen
, size_t *outLen
) const
405 const size_t dstLen
= FromWChar(NULL
, 0, in
, inLen
);
406 if ( dstLen
!= wxCONV_FAILED
)
408 wxCharBuffer
buf(dstLen
- 1);
409 if ( FromWChar(buf
.data(), dstLen
, in
, inLen
) )
412 *outLen
= dstLen
- 1;
420 return wxCharBuffer();
423 // ----------------------------------------------------------------------------
425 // ----------------------------------------------------------------------------
427 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
429 return wxMB2WC(buf
, psz
, n
);
432 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
434 return wxWC2MB(buf
, psz
, n
);
437 // ----------------------------------------------------------------------------
438 // wxConvBrokenFileNames
439 // ----------------------------------------------------------------------------
443 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
445 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
446 || wxStricmp(charset
, _T("UTF8")) == 0 )
447 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
449 m_conv
= new wxCSConv(charset
);
454 // ----------------------------------------------------------------------------
456 // ----------------------------------------------------------------------------
458 // Implementation (C) 2004 Fredrik Roubert
461 // BASE64 decoding table
463 static const unsigned char utf7unb64
[] =
465 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
466 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
467 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
468 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
469 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
470 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
471 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
472 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
473 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
474 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
475 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
476 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
477 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
478 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
479 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
480 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
481 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
482 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
483 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
484 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
485 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
486 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
487 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
488 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
489 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
490 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
491 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
492 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
499 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
503 while ( *psz
&& (!buf
|| (len
< n
)) )
505 unsigned char cc
= *psz
++;
513 else if (*psz
== '-')
521 else // start of BASE64 encoded string
525 for ( ok
= lsb
= false, d
= 0, l
= 0;
526 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
531 for (l
+= 6; l
>= 8; lsb
= !lsb
)
533 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
543 *buf
= (wchar_t)(c
<< 8);
552 // in valid UTF7 we should have valid characters after '+'
561 if ( buf
&& (len
< n
) )
568 // BASE64 encoding table
570 static const unsigned char utf7enb64
[] =
572 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
573 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
574 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
575 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
576 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
577 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
578 'w', 'x', 'y', 'z', '0', '1', '2', '3',
579 '4', '5', '6', '7', '8', '9', '+', '/'
583 // UTF-7 encoding table
585 // 0 - Set D (directly encoded characters)
586 // 1 - Set O (optional direct characters)
587 // 2 - whitespace characters (optional)
588 // 3 - special characters
590 static const unsigned char utf7encode
[128] =
592 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
593 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
594 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
595 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
596 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
597 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
598 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
599 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
602 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
606 while (*psz
&& ((!buf
) || (len
< n
)))
609 if (cc
< 0x80 && utf7encode
[cc
] < 1)
617 else if (((wxUint32
)cc
) > 0xffff)
619 // no surrogate pair generation (yet?)
630 // BASE64 encode string
631 unsigned int lsb
, d
, l
;
632 for (d
= 0, l
= 0; /*nothing*/; psz
++)
634 for (lsb
= 0; lsb
< 2; lsb
++)
637 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
639 for (l
+= 8; l
>= 6; )
643 *buf
++ = utf7enb64
[(d
>> l
) % 64];
648 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
654 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
663 if (buf
&& (len
< n
))
668 // ----------------------------------------------------------------------------
670 // ----------------------------------------------------------------------------
672 static wxUint32 utf8_max
[]=
673 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
675 // boundaries of the private use area we use to (temporarily) remap invalid
676 // characters in a UTF-8 encoded string
677 const wxUint32 wxUnicodePUA
= 0x100000;
678 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
680 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
684 while (*psz
&& ((!buf
) || (len
< n
)))
686 const char *opsz
= psz
;
687 bool invalid
= false;
688 unsigned char cc
= *psz
++, fc
= cc
;
690 for (cnt
= 0; fc
& 0x80; cnt
++)
699 // escape the escape character for octal escapes
700 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
701 && cc
== '\\' && (!buf
|| len
< n
))
713 // invalid UTF-8 sequence
718 unsigned ocnt
= cnt
- 1;
719 wxUint32 res
= cc
& (0x3f >> cnt
);
723 if ((cc
& 0xC0) != 0x80)
725 // invalid UTF-8 sequence
730 res
= (res
<< 6) | (cc
& 0x3f);
732 if (invalid
|| res
<= utf8_max
[ocnt
])
734 // illegal UTF-8 encoding
737 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
738 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
740 // if one of our PUA characters turns up externally
741 // it must also be treated as an illegal sequence
742 // (a bit like you have to escape an escape character)
748 // cast is ok because wchar_t == wxUint16 if WC_UTF16
749 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
750 if (pa
== (size_t)-1)
762 *buf
++ = (wchar_t)res
;
764 #endif // WC_UTF16/!WC_UTF16
769 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
771 while (opsz
< psz
&& (!buf
|| len
< n
))
774 // cast is ok because wchar_t == wxUint16 if WC_UTF16
775 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
776 wxASSERT(pa
!= (size_t)-1);
783 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
789 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
791 while (opsz
< psz
&& (!buf
|| len
< n
))
793 if ( buf
&& len
+ 3 < n
)
795 unsigned char on
= *opsz
;
797 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
798 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
799 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
805 else // MAP_INVALID_UTF8_NOT
812 if (buf
&& (len
< n
))
// Tell whether the given wide character is an octal digit, i.e. lies in the
// range L'0'..L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
822 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
826 while (*psz
&& ((!buf
) || (len
< n
)))
830 // cast is ok for WC_UTF16
831 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
832 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
834 cc
=(*psz
++) & 0x7fffffff;
837 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
838 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
841 *buf
++ = (char)(cc
- wxUnicodePUA
);
844 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
845 && cc
== L
'\\' && psz
[0] == L
'\\' )
852 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
854 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
858 *buf
++ = (char) ((psz
[0] - L
'0')*0100 +
859 (psz
[1] - L
'0')*010 +
869 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
883 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
885 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
897 // ----------------------------------------------------------------------------
899 // ----------------------------------------------------------------------------
901 #ifdef WORDS_BIGENDIAN
902 #define wxMBConvUTF16straight wxMBConvUTF16BE
903 #define wxMBConvUTF16swap wxMBConvUTF16LE
905 #define wxMBConvUTF16swap wxMBConvUTF16BE
906 #define wxMBConvUTF16straight wxMBConvUTF16LE
912 // copy 16bit MB to 16bit String
913 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
917 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
920 *buf
++ = *(wxUint16
*)psz
;
923 psz
+= sizeof(wxUint16
);
925 if (buf
&& len
<n
) *buf
=0;
931 // copy 16bit String to 16bit MB
932 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
936 while (*psz
&& (!buf
|| len
< n
))
940 *(wxUint16
*)buf
= *psz
;
941 buf
+= sizeof(wxUint16
);
943 len
+= sizeof(wxUint16
);
946 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
952 // swap 16bit MB to 16bit String
953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
957 // UTF16 string must be terminated by 2 NULs as single NULs may occur
959 while ( (psz
[0] || psz
[1]) && (!buf
|| len
< n
) )
963 ((char *)buf
)[0] = psz
[1];
964 ((char *)buf
)[1] = psz
[0];
971 if ( buf
&& len
< n
)
978 // swap 16bit String to 16bit MB
979 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
983 while ( *psz
&& (!buf
|| len
< n
) )
987 *buf
++ = ((char*)psz
)[1];
988 *buf
++ = ((char*)psz
)[0];
994 if ( buf
&& len
< n
- 1 )
1007 // copy 16bit MB to 32bit String
1008 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1012 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1015 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
1016 if (pa
== (size_t)-1)
1020 *buf
++ = (wchar_t)cc
;
1022 psz
+= pa
* sizeof(wxUint16
);
1024 if (buf
&& len
<n
) *buf
=0;
1030 // copy 32bit String to 16bit MB
1031 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1035 while (*psz
&& (!buf
|| len
< n
))
1038 size_t pa
=encode_utf16(*psz
, cc
);
1040 if (pa
== (size_t)-1)
1045 *(wxUint16
*)buf
= cc
[0];
1046 buf
+= sizeof(wxUint16
);
1049 *(wxUint16
*)buf
= cc
[1];
1050 buf
+= sizeof(wxUint16
);
1054 len
+= pa
*sizeof(wxUint16
);
1057 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
1063 // swap 16bit MB to 32bit String
1064 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1068 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1072 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
1073 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
1075 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
1076 if (pa
== (size_t)-1)
1080 *buf
++ = (wchar_t)cc
;
1083 psz
+= pa
* sizeof(wxUint16
);
1085 if (buf
&& len
<n
) *buf
=0;
1091 // swap 32bit String to 16bit MB
1092 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1096 while (*psz
&& (!buf
|| len
< n
))
1099 size_t pa
=encode_utf16(*psz
, cc
);
1101 if (pa
== (size_t)-1)
1106 *buf
++ = ((char*)cc
)[1];
1107 *buf
++ = ((char*)cc
)[0];
1110 *buf
++ = ((char*)cc
)[3];
1111 *buf
++ = ((char*)cc
)[2];
1115 len
+= pa
*sizeof(wxUint16
);
1118 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
1126 // ----------------------------------------------------------------------------
1128 // ----------------------------------------------------------------------------
1130 #ifdef WORDS_BIGENDIAN
1131 #define wxMBConvUTF32straight wxMBConvUTF32BE
1132 #define wxMBConvUTF32swap wxMBConvUTF32LE
1134 #define wxMBConvUTF32swap wxMBConvUTF32BE
1135 #define wxMBConvUTF32straight wxMBConvUTF32LE
1139 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1140 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1145 // copy 32bit MB to 16bit String
1146 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1150 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1154 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
1155 if (pa
== (size_t)-1)
1165 psz
+= sizeof(wxUint32
);
1167 if (buf
&& len
<n
) *buf
=0;
1173 // copy 16bit String to 32bit MB
1174 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1178 while (*psz
&& (!buf
|| len
< n
))
1182 // cast is ok for WC_UTF16
1183 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1184 if (pa
== (size_t)-1)
1189 *(wxUint32
*)buf
= cc
;
1190 buf
+= sizeof(wxUint32
);
1192 len
+= sizeof(wxUint32
);
1196 if (buf
&& len
<=n
-sizeof(wxUint32
))
1204 // swap 32bit MB to 16bit String
1205 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1209 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1212 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
1213 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
1218 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
1219 if (pa
== (size_t)-1)
1229 psz
+= sizeof(wxUint32
);
1239 // swap 16bit String to 32bit MB
1240 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1244 while (*psz
&& (!buf
|| len
< n
))
1248 // cast is ok for WC_UTF16
1249 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1250 if (pa
== (size_t)-1)
1260 len
+= sizeof(wxUint32
);
1264 if (buf
&& len
<=n
-sizeof(wxUint32
))
1273 // copy 32bit MB to 32bit String
1274 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1278 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1281 *buf
++ = (wchar_t)(*(wxUint32
*)psz
);
1283 psz
+= sizeof(wxUint32
);
1293 // copy 32bit String to 32bit MB
1294 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1298 while (*psz
&& (!buf
|| len
< n
))
1302 *(wxUint32
*)buf
= *psz
;
1303 buf
+= sizeof(wxUint32
);
1306 len
+= sizeof(wxUint32
);
1310 if (buf
&& len
<=n
-sizeof(wxUint32
))
1317 // swap 32bit MB to 32bit String
1318 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1322 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1326 ((char *)buf
)[0] = psz
[3];
1327 ((char *)buf
)[1] = psz
[2];
1328 ((char *)buf
)[2] = psz
[1];
1329 ((char *)buf
)[3] = psz
[0];
1333 psz
+= sizeof(wxUint32
);
1343 // swap 32bit String to 32bit MB
1344 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1348 while (*psz
&& (!buf
|| len
< n
))
1352 *buf
++ = ((char *)psz
)[3];
1353 *buf
++ = ((char *)psz
)[2];
1354 *buf
++ = ((char *)psz
)[1];
1355 *buf
++ = ((char *)psz
)[0];
1357 len
+= sizeof(wxUint32
);
1361 if (buf
&& len
<=n
-sizeof(wxUint32
))
1371 // ============================================================================
1372 // The classes doing conversion using the iconv_xxx() functions
1373 // ============================================================================
1377 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1378 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1379 // (unless there's yet another bug in glibc) the only case when iconv()
1380 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1381 // left in the input buffer -- when _real_ error occurs,
1382 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1384 // [This bug does not appear in glibc 2.2.]
1385 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1386 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1387 (errno != E2BIG || bufLeft != 0))
1389 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1392 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1394 #define ICONV_T_INVALID ((iconv_t)-1)
1396 #if SIZEOF_WCHAR_T == 4
1397 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1398 #define WC_ENC wxFONTENCODING_UTF32
1399 #elif SIZEOF_WCHAR_T == 2
1400 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1401 #define WC_ENC wxFONTENCODING_UTF16
1402 #else // sizeof(wchar_t) != 2 nor 4
1403 // does this ever happen?
1404 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1407 // ----------------------------------------------------------------------------
1408 // wxMBConv_iconv: encapsulates an iconv character set
1409 // ----------------------------------------------------------------------------
1411 class wxMBConv_iconv
: public wxMBConv
1414 wxMBConv_iconv(const wxChar
*name
);
1415 virtual ~wxMBConv_iconv();
1417 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1418 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1420 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1421 virtual size_t GetMBNulLen() const;
1423 virtual wxMBConv
*Clone() const
1425 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
1426 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1431 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1434 // the iconv handlers used to translate from multibyte to wide char and in
1435 // the other direction
1439 // guards access to m2w and w2m objects
1440 wxMutex m_iconvMutex
;
1444 // the name (for iconv_open()) of a wide char charset -- if none is
1445 // available on this machine, it will remain NULL
1446 static wxString ms_wcCharsetName
;
1448 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1449 // different endian-ness than the native one
1450 static bool ms_wcNeedsSwap
;
1453 // name of the encoding handled by this conversion
1456 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1458 size_t m_minMBCharWidth
;
1461 // make the constructor available for unit testing
1462 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1464 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1465 if ( !result
->IsOk() )
1473 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1474 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1476 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1479 m_minMBCharWidth
= 0;
1481 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1482 // names for the charsets
1483 const wxCharBuffer
cname(wxString(name
).ToAscii());
1485 // check for charset that represents wchar_t:
1486 if ( ms_wcCharsetName
.empty() )
1488 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1491 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1492 #else // !wxUSE_FONTMAP
1493 static const wxChar
*names
[] =
1495 #if SIZEOF_WCHAR_T == 4
1497 #elif SIZEOF_WCHAR_T = 2
1502 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1504 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1506 const wxString
nameCS(*names
);
1508 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1509 wxString
nameXE(nameCS
);
1510 #ifdef WORDS_BIGENDIAN
1512 #else // little endian
1516 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1519 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1520 if ( m2w
== ICONV_T_INVALID
)
1522 // try charset w/o bytesex info (e.g. "UCS4")
1523 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1525 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1527 // and check for bytesex ourselves:
1528 if ( m2w
!= ICONV_T_INVALID
)
1530 char buf
[2], *bufPtr
;
1531 wchar_t wbuf
[2], *wbufPtr
;
1539 outsz
= SIZEOF_WCHAR_T
* 2;
1543 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1544 (char**)&wbufPtr
, &outsz
);
1546 if (ICONV_FAILED(res
, insz
))
1548 wxLogLastError(wxT("iconv"));
1549 wxLogError(_("Conversion to charset '%s' doesn't work."),
1552 else // ok, can convert to this encoding, remember it
1554 ms_wcCharsetName
= nameCS
;
1555 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1559 else // use charset not requiring byte swapping
1561 ms_wcCharsetName
= nameXE
;
1565 wxLogTrace(TRACE_STRCONV
,
1566 wxT("iconv wchar_t charset is \"%s\"%s"),
1567 ms_wcCharsetName
.empty() ? _T("<none>")
1568 : ms_wcCharsetName
.c_str(),
1569 ms_wcNeedsSwap
? _T(" (needs swap)")
1572 else // we already have ms_wcCharsetName
1574 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1577 if ( ms_wcCharsetName
.empty() )
1579 w2m
= ICONV_T_INVALID
;
1583 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1584 if ( w2m
== ICONV_T_INVALID
)
1586 wxLogTrace(TRACE_STRCONV
,
1587 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1588 ms_wcCharsetName
.c_str(), cname
.data());
1593 wxMBConv_iconv::~wxMBConv_iconv()
1595 if ( m2w
!= ICONV_T_INVALID
)
1597 if ( w2m
!= ICONV_T_INVALID
)
1601 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1603 // find the string length: notice that must be done differently for
1604 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1606 const size_t nulLen
= GetMBNulLen();
1613 inbuf
= strlen(psz
); // arguably more optimized than our version
1618 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1619 // they also have to start at character boundary and not span two
1620 // adjacent characters
1622 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1629 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1630 // Unfortunately there are a couple of global wxCSConv objects such as
1631 // wxConvLocal that are used all over wx code, so we have to make sure
1632 // the handle is used by at most one thread at the time. Otherwise
1633 // only a few wx classes would be safe to use from non-main threads
1634 // as MB<->WC conversion would fail "randomly".
1635 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1636 #endif // wxUSE_THREADS
1639 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1641 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1642 wchar_t *bufPtr
= buf
;
1643 const char *pszPtr
= psz
;
1647 // have destination buffer, convert there
1649 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1650 (char**)&bufPtr
, &outbuf
);
1651 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1655 // convert to native endianness
1656 for ( unsigned i
= 0; i
< res
; i
++ )
1657 buf
[n
] = WC_BSWAP(buf
[i
]);
1660 // NUL-terminate the string if there is any space left
1666 // no destination buffer... convert using temp buffer
1667 // to calculate destination buffer requirement
1672 outbuf
= 8*SIZEOF_WCHAR_T
;
1675 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1676 (char**)&bufPtr
, &outbuf
);
1678 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1679 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1682 if (ICONV_FAILED(cres
, inbuf
))
1684 //VS: it is ok if iconv fails, hence trace only
1685 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1692 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1695 // NB: explained in MB2WC
1696 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1699 size_t inlen
= wxWcslen(psz
);
1700 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1704 wchar_t *tmpbuf
= 0;
1708 // need to copy to temp buffer to switch endianness
1709 // (doing WC_BSWAP twice on the original buffer won't help, as it
1710 // could be in read-only memory, or be accessed in some other thread)
1711 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1712 for ( size_t i
= 0; i
< inlen
; i
++ )
1713 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1714 tmpbuf
[inlen
] = L
'\0';
1720 // have destination buffer, convert there
1721 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1725 // NB: iconv was given only wcslen(psz) characters on input, and so
1726 // it couldn't convert the trailing zero. Let's do it ourselves
1727 // if there's some room left for it in the output buffer.
1733 // no destination buffer... convert using temp buffer
1734 // to calculate destination buffer requirement
1738 buf
= tbuf
; outbuf
= 16;
1740 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1743 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1751 if (ICONV_FAILED(cres
, inbuf
))
1753 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1760 size_t wxMBConv_iconv::GetMBNulLen() const
1762 if ( m_minMBCharWidth
== 0 )
1764 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1767 // NB: explained in MB2WC
1768 wxMutexLocker
lock(self
->m_iconvMutex
);
1771 wchar_t *wnul
= L
"";
1772 char buf
[8]; // should be enough for NUL in any encoding
1773 size_t inLen
= sizeof(wchar_t),
1774 outLen
= WXSIZEOF(buf
);
1775 char *in
= (char *)wnul
;
1777 if ( iconv(w2m
, ICONV_CHAR_CAST(&in
), &inLen
, &out
, &outLen
) == (size_t)-1 )
1779 self
->m_minMBCharWidth
= (size_t)-1;
1783 self
->m_minMBCharWidth
= out
- buf
;
1787 return m_minMBCharWidth
;
1790 #endif // HAVE_ICONV
1793 // ============================================================================
1794 // Win32 conversion classes
1795 // ============================================================================
1797 #ifdef wxHAVE_WIN32_MB2WC
1801 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1802 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1805 class wxMBConv_win32
: public wxMBConv
1810 m_CodePage
= CP_ACP
;
1811 m_minMBCharWidth
= 0;
1814 wxMBConv_win32(const wxMBConv_win32
& conv
)
1816 m_CodePage
= conv
.m_CodePage
;
1817 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
1821 wxMBConv_win32(const wxChar
* name
)
1823 m_CodePage
= wxCharsetToCodepage(name
);
1824 m_minMBCharWidth
= 0;
1827 wxMBConv_win32(wxFontEncoding encoding
)
1829 m_CodePage
= wxEncodingToCodepage(encoding
);
1830 m_minMBCharWidth
= 0;
1832 #endif // wxUSE_FONTMAP
1834 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1836 // note that we have to use the MB_ERR_INVALID_CHARS flag as without it
1837 // the behaviour is not compatible with the Unix version (using iconv)
1838 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1839 // wouldn't work if reading an incomplete MB char didn't result in an
1842 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1843 // Win XP or newer and it is not supported for UTF-[78] so we always
1844 // use our own conversions in this case. See
1845 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1846 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1847 if ( m_CodePage
== CP_UTF8
)
1849 return wxConvUTF8
.MB2WC(buf
, psz
, n
);
1852 if ( m_CodePage
== CP_UTF7
)
1854 return wxConvUTF7
.MB2WC(buf
, psz
, n
);
1858 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
1859 IsAtLeastWin2kSP4() )
1861 flags
= MB_ERR_INVALID_CHARS
;
1864 const size_t len
= ::MultiByteToWideChar
1866 m_CodePage
, // code page
1867 flags
, // flags: fall on error
1868 psz
, // input string
1869 -1, // its length (NUL-terminated)
1870 buf
, // output string
1871 buf
? n
: 0 // size of output buffer
1875 // function totally failed
1879 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1880 // check if we succeeded, by doing a double trip:
1881 if ( !flags
&& buf
)
1883 const size_t mbLen
= strlen(psz
);
1884 wxCharBuffer
mbBuf(mbLen
);
1885 if ( ::WideCharToMultiByte
1892 mbLen
+ 1, // size in bytes, not length
1896 strcmp(mbBuf
, psz
) != 0 )
1898 // we didn't obtain the same thing we started from, hence
1899 // the conversion was lossy and we consider that it failed
1904 // note that it returns count of written chars for buf != NULL and size
1905 // of the needed buffer for buf == NULL so in either case the length of
1906 // the string (which never includes the terminating NUL) is one less
1910 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1913 we have a problem here: by default, WideCharToMultiByte() may
1914 replace characters unrepresentable in the target code page with bad
1915 quality approximations such as turning "1/2" symbol (U+00BD) into
1916 "1" for the code pages which don't have it and we, obviously, want
1917 to avoid this at any price
1919 the trouble is that this function does it _silently_, i.e. it won't
1920 even tell us whether it did or not... Win98/2000 and higher provide
1921 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1922 we have to resort to a round trip, i.e. check that converting back
1923 results in the same string -- this is, of course, expensive but
1924 otherwise we simply can't be sure to not garble the data.
1927 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1928 // it doesn't work with CJK encodings (which we test for rather roughly
1929 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1931 BOOL usedDef
wxDUMMY_INITIALIZE(false);
1934 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1936 // it's our lucky day
1937 flags
= WC_NO_BEST_FIT_CHARS
;
1938 pUsedDef
= &usedDef
;
1940 else // old system or unsupported encoding
1946 const size_t len
= ::WideCharToMultiByte
1948 m_CodePage
, // code page
1949 flags
, // either none or no best fit
1950 pwz
, // input string
1951 -1, // it is (wide) NUL-terminated
1952 buf
, // output buffer
1953 buf
? n
: 0, // and its size
1954 NULL
, // default "replacement" char
1955 pUsedDef
// [out] was it used?
1960 // function totally failed
1964 // if we were really converting, check if we succeeded
1969 // check if the conversion failed, i.e. if any replacements
1974 else // we must resort to double tripping...
1976 wxWCharBuffer
wcBuf(n
);
1977 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1978 wcscmp(wcBuf
, pwz
) != 0 )
1980 // we didn't obtain the same thing we started from, hence
1981 // the conversion was lossy and we consider that it failed
1987 // see the comment above for the reason of "len - 1"
1991 virtual size_t GetMBNulLen() const
1993 if ( m_minMBCharWidth
== 0 )
1995 int len
= ::WideCharToMultiByte
1997 m_CodePage
, // code page
1999 L
"", // input string
2000 1, // translate just the NUL
2001 NULL
, // output buffer
2003 NULL
, // no replacement char
2004 NULL
// [out] don't care if it was used
2007 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2011 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2015 self
->m_minMBCharWidth
= (size_t)-1;
2021 self
->m_minMBCharWidth
= len
;
2026 return m_minMBCharWidth
;
2029 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2031 bool IsOk() const { return m_CodePage
!= -1; }
2034 static bool CanUseNoBestFit()
2036 static int s_isWin98Or2k
= -1;
2038 if ( s_isWin98Or2k
== -1 )
2041 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2044 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2048 s_isWin98Or2k
= verMaj
>= 5;
2052 // unknown, be conservative by default
2056 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2059 return s_isWin98Or2k
== 1;
2062 static bool IsAtLeastWin2kSP4()
2067 static int s_isAtLeastWin2kSP4
= -1;
2069 if ( s_isAtLeastWin2kSP4
== -1 )
2071 OSVERSIONINFOEX ver
;
2073 memset(&ver
, 0, sizeof(ver
));
2074 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2075 GetVersionEx((OSVERSIONINFO
*)&ver
);
2077 s_isAtLeastWin2kSP4
=
2078 ((ver
.dwMajorVersion
> 5) || // Vista+
2079 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2080 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2081 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2085 return s_isAtLeastWin2kSP4
== 1;
2090 // the code page we're working with
2093 // cached result of GetMBNulLen(), set to 0 initially meaning
2095 size_t m_minMBCharWidth
;
2098 #endif // wxHAVE_WIN32_MB2WC
2100 // ============================================================================
2101 // Cocoa conversion classes
2102 // ============================================================================
2104 #if defined(__WXCOCOA__)
2106 // RN: There is no UTF-32 support in either Core Foundation or
2107 // Cocoa. Strangely enough, Core Foundation uses UTF-32
2108 // internally quite a bit - it's just not public (yet).
2110 #include <CoreFoundation/CFString.h>
2111 #include <CoreFoundation/CFStringEncodingExt.h>
2113 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2115 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2116 if ( encoding
== wxFONTENCODING_DEFAULT
)
2118 enc
= CFStringGetSystemEncoding();
2120 else switch( encoding
)
2122 case wxFONTENCODING_ISO8859_1
:
2123 enc
= kCFStringEncodingISOLatin1
;
2125 case wxFONTENCODING_ISO8859_2
:
2126 enc
= kCFStringEncodingISOLatin2
;
2128 case wxFONTENCODING_ISO8859_3
:
2129 enc
= kCFStringEncodingISOLatin3
;
2131 case wxFONTENCODING_ISO8859_4
:
2132 enc
= kCFStringEncodingISOLatin4
;
2134 case wxFONTENCODING_ISO8859_5
:
2135 enc
= kCFStringEncodingISOLatinCyrillic
;
2137 case wxFONTENCODING_ISO8859_6
:
2138 enc
= kCFStringEncodingISOLatinArabic
;
2140 case wxFONTENCODING_ISO8859_7
:
2141 enc
= kCFStringEncodingISOLatinGreek
;
2143 case wxFONTENCODING_ISO8859_8
:
2144 enc
= kCFStringEncodingISOLatinHebrew
;
2146 case wxFONTENCODING_ISO8859_9
:
2147 enc
= kCFStringEncodingISOLatin5
;
2149 case wxFONTENCODING_ISO8859_10
:
2150 enc
= kCFStringEncodingISOLatin6
;
2152 case wxFONTENCODING_ISO8859_11
:
2153 enc
= kCFStringEncodingISOLatinThai
;
2155 case wxFONTENCODING_ISO8859_13
:
2156 enc
= kCFStringEncodingISOLatin7
;
2158 case wxFONTENCODING_ISO8859_14
:
2159 enc
= kCFStringEncodingISOLatin8
;
2161 case wxFONTENCODING_ISO8859_15
:
2162 enc
= kCFStringEncodingISOLatin9
;
2165 case wxFONTENCODING_KOI8
:
2166 enc
= kCFStringEncodingKOI8_R
;
2168 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2169 enc
= kCFStringEncodingDOSRussian
;
2172 // case wxFONTENCODING_BULGARIAN :
2176 case wxFONTENCODING_CP437
:
2177 enc
=kCFStringEncodingDOSLatinUS
;
2179 case wxFONTENCODING_CP850
:
2180 enc
= kCFStringEncodingDOSLatin1
;
2182 case wxFONTENCODING_CP852
:
2183 enc
= kCFStringEncodingDOSLatin2
;
2185 case wxFONTENCODING_CP855
:
2186 enc
= kCFStringEncodingDOSCyrillic
;
2188 case wxFONTENCODING_CP866
:
2189 enc
=kCFStringEncodingDOSRussian
;
2191 case wxFONTENCODING_CP874
:
2192 enc
= kCFStringEncodingDOSThai
;
2194 case wxFONTENCODING_CP932
:
2195 enc
= kCFStringEncodingDOSJapanese
;
2197 case wxFONTENCODING_CP936
:
2198 enc
=kCFStringEncodingDOSChineseSimplif
;
2200 case wxFONTENCODING_CP949
:
2201 enc
= kCFStringEncodingDOSKorean
;
2203 case wxFONTENCODING_CP950
:
2204 enc
= kCFStringEncodingDOSChineseTrad
;
2206 case wxFONTENCODING_CP1250
:
2207 enc
= kCFStringEncodingWindowsLatin2
;
2209 case wxFONTENCODING_CP1251
:
2210 enc
=kCFStringEncodingWindowsCyrillic
;
2212 case wxFONTENCODING_CP1252
:
2213 enc
=kCFStringEncodingWindowsLatin1
;
2215 case wxFONTENCODING_CP1253
:
2216 enc
= kCFStringEncodingWindowsGreek
;
2218 case wxFONTENCODING_CP1254
:
2219 enc
= kCFStringEncodingWindowsLatin5
;
2221 case wxFONTENCODING_CP1255
:
2222 enc
=kCFStringEncodingWindowsHebrew
;
2224 case wxFONTENCODING_CP1256
:
2225 enc
=kCFStringEncodingWindowsArabic
;
2227 case wxFONTENCODING_CP1257
:
2228 enc
= kCFStringEncodingWindowsBalticRim
;
2230 // This only really encodes to UTF7 (if that) evidently
2231 // case wxFONTENCODING_UTF7 :
2232 // enc = kCFStringEncodingNonLossyASCII ;
2234 case wxFONTENCODING_UTF8
:
2235 enc
= kCFStringEncodingUTF8
;
2237 case wxFONTENCODING_EUC_JP
:
2238 enc
= kCFStringEncodingEUC_JP
;
2240 case wxFONTENCODING_UTF16
:
2241 enc
= kCFStringEncodingUnicode
;
2243 case wxFONTENCODING_MACROMAN
:
2244 enc
= kCFStringEncodingMacRoman
;
2246 case wxFONTENCODING_MACJAPANESE
:
2247 enc
= kCFStringEncodingMacJapanese
;
2249 case wxFONTENCODING_MACCHINESETRAD
:
2250 enc
= kCFStringEncodingMacChineseTrad
;
2252 case wxFONTENCODING_MACKOREAN
:
2253 enc
= kCFStringEncodingMacKorean
;
2255 case wxFONTENCODING_MACARABIC
:
2256 enc
= kCFStringEncodingMacArabic
;
2258 case wxFONTENCODING_MACHEBREW
:
2259 enc
= kCFStringEncodingMacHebrew
;
2261 case wxFONTENCODING_MACGREEK
:
2262 enc
= kCFStringEncodingMacGreek
;
2264 case wxFONTENCODING_MACCYRILLIC
:
2265 enc
= kCFStringEncodingMacCyrillic
;
2267 case wxFONTENCODING_MACDEVANAGARI
:
2268 enc
= kCFStringEncodingMacDevanagari
;
2270 case wxFONTENCODING_MACGURMUKHI
:
2271 enc
= kCFStringEncodingMacGurmukhi
;
2273 case wxFONTENCODING_MACGUJARATI
:
2274 enc
= kCFStringEncodingMacGujarati
;
2276 case wxFONTENCODING_MACORIYA
:
2277 enc
= kCFStringEncodingMacOriya
;
2279 case wxFONTENCODING_MACBENGALI
:
2280 enc
= kCFStringEncodingMacBengali
;
2282 case wxFONTENCODING_MACTAMIL
:
2283 enc
= kCFStringEncodingMacTamil
;
2285 case wxFONTENCODING_MACTELUGU
:
2286 enc
= kCFStringEncodingMacTelugu
;
2288 case wxFONTENCODING_MACKANNADA
:
2289 enc
= kCFStringEncodingMacKannada
;
2291 case wxFONTENCODING_MACMALAJALAM
:
2292 enc
= kCFStringEncodingMacMalayalam
;
2294 case wxFONTENCODING_MACSINHALESE
:
2295 enc
= kCFStringEncodingMacSinhalese
;
2297 case wxFONTENCODING_MACBURMESE
:
2298 enc
= kCFStringEncodingMacBurmese
;
2300 case wxFONTENCODING_MACKHMER
:
2301 enc
= kCFStringEncodingMacKhmer
;
2303 case wxFONTENCODING_MACTHAI
:
2304 enc
= kCFStringEncodingMacThai
;
2306 case wxFONTENCODING_MACLAOTIAN
:
2307 enc
= kCFStringEncodingMacLaotian
;
2309 case wxFONTENCODING_MACGEORGIAN
:
2310 enc
= kCFStringEncodingMacGeorgian
;
2312 case wxFONTENCODING_MACARMENIAN
:
2313 enc
= kCFStringEncodingMacArmenian
;
2315 case wxFONTENCODING_MACCHINESESIMP
:
2316 enc
= kCFStringEncodingMacChineseSimp
;
2318 case wxFONTENCODING_MACTIBETAN
:
2319 enc
= kCFStringEncodingMacTibetan
;
2321 case wxFONTENCODING_MACMONGOLIAN
:
2322 enc
= kCFStringEncodingMacMongolian
;
2324 case wxFONTENCODING_MACETHIOPIC
:
2325 enc
= kCFStringEncodingMacEthiopic
;
2327 case wxFONTENCODING_MACCENTRALEUR
:
2328 enc
= kCFStringEncodingMacCentralEurRoman
;
2330 case wxFONTENCODING_MACVIATNAMESE
:
2331 enc
= kCFStringEncodingMacVietnamese
;
2333 case wxFONTENCODING_MACARABICEXT
:
2334 enc
= kCFStringEncodingMacExtArabic
;
2336 case wxFONTENCODING_MACSYMBOL
:
2337 enc
= kCFStringEncodingMacSymbol
;
2339 case wxFONTENCODING_MACDINGBATS
:
2340 enc
= kCFStringEncodingMacDingbats
;
2342 case wxFONTENCODING_MACTURKISH
:
2343 enc
= kCFStringEncodingMacTurkish
;
2345 case wxFONTENCODING_MACCROATIAN
:
2346 enc
= kCFStringEncodingMacCroatian
;
2348 case wxFONTENCODING_MACICELANDIC
:
2349 enc
= kCFStringEncodingMacIcelandic
;
2351 case wxFONTENCODING_MACROMANIAN
:
2352 enc
= kCFStringEncodingMacRomanian
;
2354 case wxFONTENCODING_MACCELTIC
:
2355 enc
= kCFStringEncodingMacCeltic
;
2357 case wxFONTENCODING_MACGAELIC
:
2358 enc
= kCFStringEncodingMacGaelic
;
2360 // case wxFONTENCODING_MACKEYBOARD :
2361 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2364 // because gcc is picky
2370 class wxMBConv_cocoa
: public wxMBConv
2375 Init(CFStringGetSystemEncoding()) ;
2378 wxMBConv_cocoa(const wxMBConv_cocoa
& conv
)
2380 m_encoding
= conv
.m_encoding
;
2384 wxMBConv_cocoa(const wxChar
* name
)
2386 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2390 wxMBConv_cocoa(wxFontEncoding encoding
)
2392 Init( wxCFStringEncFromFontEnc(encoding
) );
2399 void Init( CFStringEncoding encoding
)
2401 m_encoding
= encoding
;
2404 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2408 CFStringRef theString
= CFStringCreateWithBytes (
2409 NULL
, //the allocator
2410 (const UInt8
*)szUnConv
,
2413 false //no BOM/external representation
2416 wxASSERT(theString
);
2418 size_t nOutLength
= CFStringGetLength(theString
);
2422 CFRelease(theString
);
2426 CFRange theRange
= { 0, nOutSize
};
2428 #if SIZEOF_WCHAR_T == 4
2429 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2432 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2434 CFRelease(theString
);
2436 szUniCharBuffer
[nOutLength
] = '\0' ;
2438 #if SIZEOF_WCHAR_T == 4
2439 wxMBConvUTF16 converter
;
2440 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
2441 delete[] szUniCharBuffer
;
2447 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2451 size_t nRealOutSize
;
2452 size_t nBufSize
= wxWcslen(szUnConv
);
2453 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2455 #if SIZEOF_WCHAR_T == 4
2456 wxMBConvUTF16 converter
;
2457 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2458 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
2459 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
2460 nBufSize
/= sizeof(UniChar
);
2463 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2467 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2470 wxASSERT(theString
);
2472 //Note that CER puts a BOM when converting to unicode
2473 //so we check and use getchars instead in that case
2474 if (m_encoding
== kCFStringEncodingUnicode
)
2477 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2479 nRealOutSize
= CFStringGetLength(theString
) + 1;
2485 CFRangeMake(0, CFStringGetLength(theString
)),
2487 0, //what to put in characters that can't be converted -
2488 //0 tells CFString to return NULL if it meets such a character
2489 false, //not an external representation
2492 (CFIndex
*) &nRealOutSize
2496 CFRelease(theString
);
2498 #if SIZEOF_WCHAR_T == 4
2499 delete[] szUniBuffer
;
2502 return nRealOutSize
- 1;
2505 virtual wxMBConv
*Clone() const { return new wxMBConv_cocoa(*this); }
2509 return m_encoding
!= kCFStringEncodingInvalidId
&&
2510 CFStringIsEncodingAvailable(m_encoding
);
2514 CFStringEncoding m_encoding
;
2517 #endif // defined(__WXCOCOA__)
2519 // ============================================================================
2520 // Mac conversion classes
2521 // ============================================================================
2523 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2525 class wxMBConv_mac
: public wxMBConv
2530 Init(CFStringGetSystemEncoding()) ;
2533 wxMBConv_mac(const wxMBConv_mac
& conv
)
2535 Init(conv
.m_char_encoding
);
2539 wxMBConv_mac(const wxChar
* name
)
2541 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2545 wxMBConv_mac(wxFontEncoding encoding
)
2547 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2552 OSStatus status
= noErr
;
2553 status
= TECDisposeConverter(m_MB2WC_converter
);
2554 status
= TECDisposeConverter(m_WC2MB_converter
);
2558 void Init( TextEncodingBase encoding
)
2560 OSStatus status
= noErr
;
2561 m_char_encoding
= encoding
;
2562 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2564 status
= TECCreateConverter(&m_MB2WC_converter
,
2566 m_unicode_encoding
);
2567 status
= TECCreateConverter(&m_WC2MB_converter
,
2572 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2574 OSStatus status
= noErr
;
2575 ByteCount byteOutLen
;
2576 ByteCount byteInLen
= strlen(psz
) ;
2577 wchar_t *tbuf
= NULL
;
2578 UniChar
* ubuf
= NULL
;
2583 //apple specs say at least 32
2584 n
= wxMax( 32 , byteInLen
) ;
2585 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2587 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2588 #if SIZEOF_WCHAR_T == 4
2589 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2591 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2593 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2594 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2595 #if SIZEOF_WCHAR_T == 4
2596 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2597 // is not properly terminated we get random characters at the end
2598 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2599 wxMBConvUTF16 converter
;
2600 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2603 res
= byteOutLen
/ sizeof( UniChar
) ;
2608 if ( buf
&& res
< n
)
2614 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2616 OSStatus status
= noErr
;
2617 ByteCount byteOutLen
;
2618 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2624 //apple specs say at least 32
2625 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2626 tbuf
= (char*) malloc( n
) ;
2629 ByteCount byteBufferLen
= n
;
2630 UniChar
* ubuf
= NULL
;
2631 #if SIZEOF_WCHAR_T == 4
2632 wxMBConvUTF16 converter
;
2633 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2634 byteInLen
= unicharlen
;
2635 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2636 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2638 ubuf
= (UniChar
*) psz
;
2640 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2641 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2642 #if SIZEOF_WCHAR_T == 4
2648 size_t res
= byteOutLen
;
2649 if ( buf
&& res
< n
)
2653 //we need to double-trip to verify it didn't insert any ? in place
2654 //of bogus characters
2655 wxWCharBuffer
wcBuf(n
);
2656 size_t pszlen
= wxWcslen(psz
);
2657 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2658 wxWcslen(wcBuf
) != pszlen
||
2659 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2661 // we didn't obtain the same thing we started from, hence
2662 // the conversion was lossy and we consider that it failed
2670 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
2673 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2676 TECObjectRef m_MB2WC_converter
;
2677 TECObjectRef m_WC2MB_converter
;
2679 TextEncodingBase m_char_encoding
;
2680 TextEncodingBase m_unicode_encoding
;
2683 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2685 // ============================================================================
2686 // wxEncodingConverter based conversion classes
2687 // ============================================================================
2691 class wxMBConv_wxwin
: public wxMBConv
2696 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2697 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2701 // temporarily just use wxEncodingConverter stuff,
2702 // so that it works while a better implementation is built
2703 wxMBConv_wxwin(const wxChar
* name
)
2706 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2708 m_enc
= wxFONTENCODING_SYSTEM
;
2713 wxMBConv_wxwin(wxFontEncoding enc
)
2720 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2722 size_t inbuf
= strlen(psz
);
2725 if (!m2w
.Convert(psz
,buf
))
2731 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2733 const size_t inbuf
= wxWcslen(psz
);
2736 if (!w2m
.Convert(psz
,buf
))
2743 virtual size_t GetMBNulLen() const
2747 case wxFONTENCODING_UTF16BE
:
2748 case wxFONTENCODING_UTF16LE
:
2751 case wxFONTENCODING_UTF32BE
:
2752 case wxFONTENCODING_UTF32LE
:
2760 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2762 bool IsOk() const { return m_ok
; }
2765 wxFontEncoding m_enc
;
2766 wxEncodingConverter m2w
, w2m
;
2769 // were we initialized successfully?
2772 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2775 // make the constructors available for unit testing
2776 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2778 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2779 if ( !result
->IsOk() )
2787 #endif // wxUSE_FONTMAP
2789 // ============================================================================
2790 // wxCSConv implementation
2791 // ============================================================================
2793 void wxCSConv::Init()
2800 wxCSConv::wxCSConv(const wxChar
*charset
)
2810 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2812 m_encoding
= wxFONTENCODING_SYSTEM
;
2816 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2818 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2820 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2822 encoding
= wxFONTENCODING_SYSTEM
;
2827 m_encoding
= encoding
;
2830 wxCSConv::~wxCSConv()
2835 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2840 SetName(conv
.m_name
);
2841 m_encoding
= conv
.m_encoding
;
2844 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2848 SetName(conv
.m_name
);
2849 m_encoding
= conv
.m_encoding
;
2854 void wxCSConv::Clear()
2863 void wxCSConv::SetName(const wxChar
*charset
)
2867 m_name
= wxStrdup(charset
);
2873 #include "wx/hashmap.h"
2875 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2876 wxEncodingNameCache
);
2878 static wxEncodingNameCache gs_nameCache
;
2881 wxMBConv
*wxCSConv::DoCreate() const
2884 wxLogTrace(TRACE_STRCONV
,
2885 wxT("creating conversion for %s"),
2887 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
2888 #endif // wxUSE_FONTMAP
2890 // check for the special case of ASCII or ISO8859-1 charset: as we have
2891 // special knowledge of it anyhow, we don't need to create a special
2892 // conversion object
2893 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2894 m_encoding
== wxFONTENCODING_DEFAULT
)
2896 // don't convert at all
2900 // we trust OS to do conversion better than we can so try external
2901 // conversion methods first
2903 // the full order is:
2904 // 1. OS conversion (iconv() under Unix or Win32 API)
2905 // 2. hard coded conversions for UTF
2906 // 3. wxEncodingConverter as fall back
2912 #endif // !wxUSE_FONTMAP
2914 wxString
name(m_name
);
2915 wxFontEncoding
encoding(m_encoding
);
2917 if ( !name
.empty() )
2919 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2927 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2928 #endif // wxUSE_FONTMAP
2932 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2933 if ( it
!= gs_nameCache
.end() )
2935 if ( it
->second
.empty() )
2938 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
2945 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2947 for ( ; *names
; ++names
)
2949 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
2952 gs_nameCache
[encoding
] = *names
;
2959 gs_nameCache
[encoding
] = _T(""); // cache the failure
2961 #endif // wxUSE_FONTMAP
2963 #endif // HAVE_ICONV
2965 #ifdef wxHAVE_WIN32_MB2WC
2968 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2969 : new wxMBConv_win32(m_encoding
);
2978 #endif // wxHAVE_WIN32_MB2WC
2979 #if defined(__WXMAC__)
2981 // leave UTF16 and UTF32 to the built-ins of wx
2982 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2983 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2987 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2988 : new wxMBConv_mac(m_encoding
);
2990 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
2999 #if defined(__WXCOCOA__)
3001 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
3005 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
3006 : new wxMBConv_cocoa(m_encoding
);
3008 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
3018 wxFontEncoding enc
= m_encoding
;
3020 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3022 // use "false" to suppress interactive dialogs -- we can be called from
3023 // anywhere and popping up a dialog from here is the last thing we want to
3025 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3027 #endif // wxUSE_FONTMAP
3031 case wxFONTENCODING_UTF7
:
3032 return new wxMBConvUTF7
;
3034 case wxFONTENCODING_UTF8
:
3035 return new wxMBConvUTF8
;
3037 case wxFONTENCODING_UTF16BE
:
3038 return new wxMBConvUTF16BE
;
3040 case wxFONTENCODING_UTF16LE
:
3041 return new wxMBConvUTF16LE
;
3043 case wxFONTENCODING_UTF32BE
:
3044 return new wxMBConvUTF32BE
;
3046 case wxFONTENCODING_UTF32LE
:
3047 return new wxMBConvUTF32LE
;
3050 // nothing to do but put here to suppress gcc warnings
3057 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3058 : new wxMBConv_wxwin(m_encoding
);
3064 #endif // wxUSE_FONTMAP
3066 // NB: This is a hack to prevent deadlock. What could otherwise happen
3067 // in Unicode build: wxConvLocal creation ends up being here
3068 // because of some failure and logs the error. But wxLog will try to
3069 // attach timestamp, for which it will need wxConvLocal (to convert
3070 // time to char* and then wchar_t*), but that fails, tries to log
3071 // error, but wxLog has a (already locked) critical section that
3072 // guards static buffer.
3073 static bool alreadyLoggingError
= false;
3074 if (!alreadyLoggingError
)
3076 alreadyLoggingError
= true;
3077 wxLogError(_("Cannot convert from the charset '%s'!"),
3081 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3082 #else // !wxUSE_FONTMAP
3083 wxString::Format(_("encoding %s"), m_encoding
).c_str()
3084 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3086 alreadyLoggingError
= false;
3092 void wxCSConv::CreateConvIfNeeded() const
3096 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3099 // if we have neither the name nor the encoding, use the default
3100 // encoding for this system
3101 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3103 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3105 #endif // wxUSE_INTL
3107 self
->m_convReal
= DoCreate();
3108 self
->m_deferred
= false;
3112 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3114 CreateConvIfNeeded();
3117 return m_convReal
->MB2WC(buf
, psz
, n
);
3120 size_t len
= strlen(psz
);
3124 for (size_t c
= 0; c
<= len
; c
++)
3125 buf
[c
] = (unsigned char)(psz
[c
]);
3131 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3133 CreateConvIfNeeded();
3136 return m_convReal
->WC2MB(buf
, psz
, n
);
3139 const size_t len
= wxWcslen(psz
);
3142 for (size_t c
= 0; c
<= len
; c
++)
3146 buf
[c
] = (char)psz
[c
];
3151 for (size_t c
= 0; c
<= len
; c
++)
3161 size_t wxCSConv::GetMBNulLen() const
3163 CreateConvIfNeeded();
3167 return m_convReal
->GetMBNulLen();
3173 // ----------------------------------------------------------------------------
3175 // ----------------------------------------------------------------------------
3178 static wxMBConv_win32 wxConvLibcObj
;
3179 #elif defined(__WXMAC__) && !defined(__MACH__)
3180 static wxMBConv_mac wxConvLibcObj
;
3182 static wxMBConvLibc wxConvLibcObj
;
3185 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3186 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3187 static wxMBConvUTF7 wxConvUTF7Obj
;
3188 static wxMBConvUTF8 wxConvUTF8Obj
;
3190 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3191 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3192 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3193 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3194 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
3195 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3196 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3204 #else // !wxUSE_WCHAR_T
3206 // stand-ins in absence of wchar_t
3207 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3212 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T