1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
35 #include "wx/strconv.h"
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
62 #include "wx/thread.h"
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
76 #include "wx/mac/private.h" // includes mac headers
79 #define TRACE_STRCONV _T("strconv")
81 #if SIZEOF_WCHAR_T == 2
85 // ============================================================================
87 // ============================================================================
89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
90 static bool NotAllNULs(const char *p
, size_t n
)
92 while ( n
&& *p
++ == '\0' )
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
103 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
108 *output
= (wxUint16
) input
;
111 else if (input
>=0x110000)
119 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
120 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
126 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
128 if ((*input
<0xd800) || (*input
>0xdfff))
133 else if ((input
[1]<0xdc00) || (input
[1]>0xdfff))
140 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
146 // ----------------------------------------------------------------------------
148 // ----------------------------------------------------------------------------
150 wxMBConv::~wxMBConv()
152 // nothing to do here (necessary for Darwin linking probably)
155 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
159 // calculate the length of the buffer needed first
160 size_t nLen
= MB2WC(NULL
, psz
, 0);
161 if ( nLen
!= (size_t)-1 )
163 // now do the actual conversion
164 wxWCharBuffer
buf(nLen
);
165 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
166 if ( nLen
!= (size_t)-1 )
173 wxWCharBuffer
buf((wchar_t *)NULL
);
178 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
182 size_t nLen
= WC2MB(NULL
, pwz
, 0);
183 if ( nLen
!= (size_t)-1 )
185 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
186 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
187 if ( nLen
!= (size_t)-1 )
194 wxCharBuffer
buf((char *)NULL
);
200 wxMBConv::cMB2WC(const char *in
, size_t inLen
, size_t *outLen
) const
202 // the currently accumulated wide characters
205 // the current length of wbuf
208 // the number of NULs terminating this string
209 size_t nulLen
wxDUMMY_INITIALIZE(0);
211 // make a copy of the input string unless it is already properly
215 // if we were not given the input size we just have to assume that the
216 // string is properly terminated as we have no way of knowing how long it
217 // is anyhow, but if we do have the size check whether there are enough
219 if ( inLen
!= (size_t)-1 )
221 // we need to know how to find the end of this string
222 nulLen
= GetMinMBCharWidth();
223 if ( nulLen
== (size_t)-1 )
226 // if there are enough NULs we can avoid the copy
227 if ( inLen
< nulLen
|| NotAllNULs(in
+ inLen
- nulLen
, nulLen
) )
229 // make a copy in order to properly NUL-terminate the string
230 bufTmp
= wxCharBuffer(inLen
+ nulLen
- 1 /* 1 will be added */);
231 char * const p
= bufTmp
.data();
232 memcpy(p
, in
, inLen
);
233 for ( char *s
= p
+ inLen
; s
< p
+ inLen
+ nulLen
; s
++ )
242 for ( const char * const inEnd
= in
+ inLen
;; )
244 // try to convert the current chunk
245 lenChunk
= MB2WC(NULL
, in
, 0);
248 // nothing left in the input string, conversion succeeded
252 if ( lenChunk
== (size_t)-1 )
255 // if we already have a previous chunk, leave the NUL separating it
260 const size_t lenBufNew
= lenBuf
+ lenChunk
;
261 if ( !wbuf
.extend(lenBufNew
) )
263 lenChunk
= (size_t)-1;
267 lenChunk
= MB2WC(wbuf
.data() + lenBuf
, in
, lenChunk
+ 1 /* for NUL */);
268 if ( lenChunk
== (size_t)-1 )
273 if ( inLen
== (size_t)-1 )
275 // convert only one chunk in this case, as we suppose that the
276 // string is NUL-terminated and so inEnd is not used at all
280 // advance the input pointer past the end of this chunk
281 while ( NotAllNULs(in
, nulLen
) )
283 // notice that we must skip over multiple bytes here as we suppose
284 // that if NUL takes 2 or 4 bytes, then all the other characters do
285 // too and so if advanced by a single byte we might erroneously
286 // detect sequences of NUL bytes in the middle of the input
290 in
+= nulLen
; // skipping over its terminator as well
292 // note that ">=" (and not just "==") is needed here as the terminator
293 // we skipped just above could be inside or just after the buffer
294 // delimited by inEnd
299 if ( lenChunk
== (size_t)-1 )
313 wxMBConv::cWC2MB(const wchar_t *in
, size_t inLen
, size_t *outLen
) const
315 // the currently accumulated multibyte characters
318 // the current length of buf
321 // make a copy of the input string unless it is already properly
324 // if we don't know its length we have no choice but to assume that it is,
325 // indeed, properly terminated
326 wxWCharBuffer bufTmp
;
327 if ( inLen
== (size_t)-1 )
329 inLen
= wxWcslen(in
) + 1;
331 else if ( inLen
!= 0 && in
[inLen
- 1] != L
'\0' )
333 // make a copy in order to properly NUL-terminate the string
334 bufTmp
= wxWCharBuffer(inLen
);
335 memcpy(bufTmp
.data(), in
, inLen
*sizeof(wchar_t));
341 for ( const wchar_t * const inEnd
= in
+ inLen
;; )
343 // try to convert the current chunk, if anything left
344 size_t lenChunk
= in
< inEnd
? WC2MB(NULL
, in
, 0) : 0;
347 // nothing left in the input string, conversion succeeded
349 *outLen
= lenBuf
? lenBuf
- 1 : lenBuf
;
354 if ( lenChunk
== (size_t)-1 )
357 const size_t lenBufNew
= lenBuf
+ lenChunk
;
358 if ( !buf
.extend(lenBufNew
) )
361 lenChunk
= WC2MB(buf
.data() + lenBuf
, in
, lenChunk
+ 1 /* for NUL */);
362 if ( lenChunk
== (size_t)-1 )
365 // chunk successfully converted, go to the next one
366 in
+= wxWcslen(in
) + 1 /* skip NUL too */;
367 lenBuf
= lenBufNew
+ 1;
374 return wxCharBuffer();
377 // ----------------------------------------------------------------------------
379 // ----------------------------------------------------------------------------
381 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
383 return wxMB2WC(buf
, psz
, n
);
386 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
388 return wxWC2MB(buf
, psz
, n
);
391 // ----------------------------------------------------------------------------
392 // wxConvBrokenFileNames
393 // ----------------------------------------------------------------------------
397 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
399 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
400 || wxStricmp(charset
, _T("UTF8")) == 0 )
401 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
403 m_conv
= new wxCSConv(charset
);
408 // ----------------------------------------------------------------------------
410 // ----------------------------------------------------------------------------
412 // Implementation (C) 2004 Fredrik Roubert
415 // BASE64 decoding table
417 static const unsigned char utf7unb64
[] =
419 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
420 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
421 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
422 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
423 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
424 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
425 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
426 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
427 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
428 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
429 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
430 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
431 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
432 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
433 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
434 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
435 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
436 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
437 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
438 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
439 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
440 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
441 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
442 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
443 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
444 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
445 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
446 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
447 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
448 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
449 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
450 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
453 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
457 while ( *psz
&& (!buf
|| (len
< n
)) )
459 unsigned char cc
= *psz
++;
467 else if (*psz
== '-')
475 else // start of BASE64 encoded string
479 for ( ok
= lsb
= false, d
= 0, l
= 0;
480 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
485 for (l
+= 6; l
>= 8; lsb
= !lsb
)
487 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
497 *buf
= (wchar_t)(c
<< 8);
506 // in valid UTF7 we should have valid characters after '+'
515 if ( buf
&& (len
< n
) )
522 // BASE64 encoding table
524 static const unsigned char utf7enb64
[] =
526 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
527 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
528 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
529 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
530 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
531 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
532 'w', 'x', 'y', 'z', '0', '1', '2', '3',
533 '4', '5', '6', '7', '8', '9', '+', '/'
537 // UTF-7 encoding table
539 // 0 - Set D (directly encoded characters)
540 // 1 - Set O (optional direct characters)
541 // 2 - whitespace characters (optional)
542 // 3 - special characters
544 static const unsigned char utf7encode
[128] =
546 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
547 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
548 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
549 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
550 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
552 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
553 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
556 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
560 while (*psz
&& ((!buf
) || (len
< n
)))
563 if (cc
< 0x80 && utf7encode
[cc
] < 1)
571 else if (((wxUint32
)cc
) > 0xffff)
573 // no surrogate pair generation (yet?)
584 // BASE64 encode string
585 unsigned int lsb
, d
, l
;
586 for (d
= 0, l
= 0; /*nothing*/; psz
++)
588 for (lsb
= 0; lsb
< 2; lsb
++)
591 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
593 for (l
+= 8; l
>= 6; )
597 *buf
++ = utf7enb64
[(d
>> l
) % 64];
602 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
608 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
617 if (buf
&& (len
< n
))
622 // ----------------------------------------------------------------------------
624 // ----------------------------------------------------------------------------
626 static wxUint32 utf8_max
[]=
627 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
// boundaries of the private use area we use to (temporarily) remap
// characters which are invalid in a UTF-8 encoded string
631 const wxUint32 wxUnicodePUA
= 0x100000;
632 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
634 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
638 while (*psz
&& ((!buf
) || (len
< n
)))
640 const char *opsz
= psz
;
641 bool invalid
= false;
642 unsigned char cc
= *psz
++, fc
= cc
;
644 for (cnt
= 0; fc
& 0x80; cnt
++)
653 // escape the escape character for octal escapes
654 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
655 && cc
== '\\' && (!buf
|| len
< n
))
667 // invalid UTF-8 sequence
672 unsigned ocnt
= cnt
- 1;
673 wxUint32 res
= cc
& (0x3f >> cnt
);
677 if ((cc
& 0xC0) != 0x80)
679 // invalid UTF-8 sequence
684 res
= (res
<< 6) | (cc
& 0x3f);
686 if (invalid
|| res
<= utf8_max
[ocnt
])
688 // illegal UTF-8 encoding
691 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
692 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
694 // if one of our PUA characters turns up externally
695 // it must also be treated as an illegal sequence
696 // (a bit like you have to escape an escape character)
// cast is ok because wchar_t == wxUint16 if WC_UTF16
703 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
704 if (pa
== (size_t)-1)
716 *buf
++ = (wchar_t)res
;
718 #endif // WC_UTF16/!WC_UTF16
723 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
725 while (opsz
< psz
&& (!buf
|| len
< n
))
// cast is ok because wchar_t == wxUint16 if WC_UTF16
729 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
730 wxASSERT(pa
!= (size_t)-1);
737 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
743 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
745 while (opsz
< psz
&& (!buf
|| len
< n
))
747 if ( buf
&& len
+ 3 < n
)
749 unsigned char on
= *opsz
;
751 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
752 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
753 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
759 else // MAP_INVALID_UTF8_NOT
766 if (buf
&& (len
< n
))
// Tells whether wch is one of the octal digits L'0' .. L'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
776 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
780 while (*psz
&& ((!buf
) || (len
< n
)))
784 // cast is ok for WC_UTF16
785 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
786 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
788 cc
=(*psz
++) & 0x7fffffff;
791 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
792 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
795 *buf
++ = (char)(cc
- wxUnicodePUA
);
798 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
799 && cc
== L
'\\' && psz
[0] == L
'\\' )
806 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
808 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
812 *buf
++ = (char) ((psz
[0] - L
'0')*0100 +
813 (psz
[1] - L
'0')*010 +
823 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
837 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
839 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
851 // ----------------------------------------------------------------------------
853 // ----------------------------------------------------------------------------
855 #ifdef WORDS_BIGENDIAN
856 #define wxMBConvUTF16straight wxMBConvUTF16BE
857 #define wxMBConvUTF16swap wxMBConvUTF16LE
859 #define wxMBConvUTF16swap wxMBConvUTF16BE
860 #define wxMBConvUTF16straight wxMBConvUTF16LE
866 // copy 16bit MB to 16bit String
867 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
871 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
874 *buf
++ = *(wxUint16
*)psz
;
877 psz
+= sizeof(wxUint16
);
879 if (buf
&& len
<n
) *buf
=0;
885 // copy 16bit String to 16bit MB
886 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
890 while (*psz
&& (!buf
|| len
< n
))
894 *(wxUint16
*)buf
= *psz
;
895 buf
+= sizeof(wxUint16
);
897 len
+= sizeof(wxUint16
);
900 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
906 // swap 16bit MB to 16bit String
907 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
911 // UTF16 string must be terminated by 2 NULs as single NULs may occur
913 while ( (psz
[0] || psz
[1]) && (!buf
|| len
< n
) )
917 ((char *)buf
)[0] = psz
[1];
918 ((char *)buf
)[1] = psz
[0];
925 if ( buf
&& len
< n
)
// swap 16bit String to 16bit MB
933 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
937 while ( *psz
&& (!buf
|| len
< n
) )
941 *buf
++ = ((char*)psz
)[1];
942 *buf
++ = ((char*)psz
)[0];
948 if ( buf
&& len
< n
)
958 // copy 16bit MB to 32bit String
959 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
963 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
966 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
967 if (pa
== (size_t)-1)
971 *buf
++ = (wchar_t)cc
;
973 psz
+= pa
* sizeof(wxUint16
);
975 if (buf
&& len
<n
) *buf
=0;
981 // copy 32bit String to 16bit MB
982 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
986 while (*psz
&& (!buf
|| len
< n
))
989 size_t pa
=encode_utf16(*psz
, cc
);
991 if (pa
== (size_t)-1)
996 *(wxUint16
*)buf
= cc
[0];
997 buf
+= sizeof(wxUint16
);
1000 *(wxUint16
*)buf
= cc
[1];
1001 buf
+= sizeof(wxUint16
);
1005 len
+= pa
*sizeof(wxUint16
);
1008 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
1014 // swap 16bit MB to 32bit String
1015 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1019 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1023 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
1024 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
1026 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
1027 if (pa
== (size_t)-1)
1031 *buf
++ = (wchar_t)cc
;
1034 psz
+= pa
* sizeof(wxUint16
);
1036 if (buf
&& len
<n
) *buf
=0;
1042 // swap 32bit String to 16bit MB
1043 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1047 while (*psz
&& (!buf
|| len
< n
))
1050 size_t pa
=encode_utf16(*psz
, cc
);
1052 if (pa
== (size_t)-1)
1057 *buf
++ = ((char*)cc
)[1];
1058 *buf
++ = ((char*)cc
)[0];
1061 *buf
++ = ((char*)cc
)[3];
1062 *buf
++ = ((char*)cc
)[2];
1066 len
+= pa
*sizeof(wxUint16
);
1069 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
1077 // ----------------------------------------------------------------------------
1079 // ----------------------------------------------------------------------------
1081 #ifdef WORDS_BIGENDIAN
1082 #define wxMBConvUTF32straight wxMBConvUTF32BE
1083 #define wxMBConvUTF32swap wxMBConvUTF32LE
1085 #define wxMBConvUTF32swap wxMBConvUTF32BE
1086 #define wxMBConvUTF32straight wxMBConvUTF32LE
1090 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1091 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1096 // copy 32bit MB to 16bit String
1097 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1101 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1105 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
1106 if (pa
== (size_t)-1)
1116 psz
+= sizeof(wxUint32
);
1118 if (buf
&& len
<n
) *buf
=0;
1124 // copy 16bit String to 32bit MB
1125 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1129 while (*psz
&& (!buf
|| len
< n
))
1133 // cast is ok for WC_UTF16
1134 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1135 if (pa
== (size_t)-1)
1140 *(wxUint32
*)buf
= cc
;
1141 buf
+= sizeof(wxUint32
);
1143 len
+= sizeof(wxUint32
);
1147 if (buf
&& len
<=n
-sizeof(wxUint32
))
1155 // swap 32bit MB to 16bit String
1156 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1160 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1163 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
1164 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
1169 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
1170 if (pa
== (size_t)-1)
1180 psz
+= sizeof(wxUint32
);
1190 // swap 16bit String to 32bit MB
1191 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1195 while (*psz
&& (!buf
|| len
< n
))
1199 // cast is ok for WC_UTF16
1200 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1201 if (pa
== (size_t)-1)
1211 len
+= sizeof(wxUint32
);
1215 if (buf
&& len
<=n
-sizeof(wxUint32
))
1224 // copy 32bit MB to 32bit String
1225 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1229 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1232 *buf
++ = (wchar_t)(*(wxUint32
*)psz
);
1234 psz
+= sizeof(wxUint32
);
1244 // copy 32bit String to 32bit MB
1245 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1249 while (*psz
&& (!buf
|| len
< n
))
1253 *(wxUint32
*)buf
= *psz
;
1254 buf
+= sizeof(wxUint32
);
1257 len
+= sizeof(wxUint32
);
1261 if (buf
&& len
<=n
-sizeof(wxUint32
))
1268 // swap 32bit MB to 32bit String
1269 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1273 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1277 ((char *)buf
)[0] = psz
[3];
1278 ((char *)buf
)[1] = psz
[2];
1279 ((char *)buf
)[2] = psz
[1];
1280 ((char *)buf
)[3] = psz
[0];
1284 psz
+= sizeof(wxUint32
);
1294 // swap 32bit String to 32bit MB
1295 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1299 while (*psz
&& (!buf
|| len
< n
))
1303 *buf
++ = ((char *)psz
)[3];
1304 *buf
++ = ((char *)psz
)[2];
1305 *buf
++ = ((char *)psz
)[1];
1306 *buf
++ = ((char *)psz
)[0];
1308 len
+= sizeof(wxUint32
);
1312 if (buf
&& len
<=n
-sizeof(wxUint32
))
1322 // ============================================================================
1323 // The classes doing conversion using the iconv_xxx() functions
1324 // ============================================================================
1328 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1329 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1330 // (unless there's yet another bug in glibc) the only case when iconv()
1331 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1332 // left in the input buffer -- when _real_ error occurs,
1333 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1335 // [This bug does not appear in glibc 2.2.]
1336 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1337 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1338 (errno != E2BIG || bufLeft != 0))
1340 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1343 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1345 #define ICONV_T_INVALID ((iconv_t)-1)
1347 #if SIZEOF_WCHAR_T == 4
1348 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1349 #define WC_ENC wxFONTENCODING_UTF32
1350 #elif SIZEOF_WCHAR_T == 2
1351 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1352 #define WC_ENC wxFONTENCODING_UTF16
1353 #else // sizeof(wchar_t) != 2 nor 4
1354 // does this ever happen?
1355 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1358 // ----------------------------------------------------------------------------
1359 // wxMBConv_iconv: encapsulates an iconv character set
1360 // ----------------------------------------------------------------------------
1362 class wxMBConv_iconv
: public wxMBConv
1365 wxMBConv_iconv(const wxChar
*name
);
1366 virtual ~wxMBConv_iconv();
1368 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1369 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1372 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1375 // the iconv handlers used to translate from multibyte to wide char and in
1376 // the other direction
1380 // guards access to m2w and w2m objects
1381 wxMutex m_iconvMutex
;
1385 // classify this encoding as explained in wxMBConv::GetMinMBCharWidth()
1387 virtual size_t GetMinMBCharWidth() const;
// the name (for iconv_open()) of a wide char charset -- if none is
// available on this machine, it will remain empty
1391 static wxString ms_wcCharsetName
;
1393 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1394 // different endian-ness than the native one
1395 static bool ms_wcNeedsSwap
;
1397 // cached result of GetMinMBCharWidth(); set to 0 meaning "unknown"
1399 size_t m_minMBCharWidth
;
1402 // make the constructor available for unit testing
1403 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1405 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1406 if ( !result
->IsOk() )
1414 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1415 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1417 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1419 m_minMBCharWidth
= 0;
1421 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1422 // names for the charsets
1423 const wxCharBuffer
cname(wxString(name
).ToAscii());
1425 // check for charset that represents wchar_t:
1426 if ( ms_wcCharsetName
.empty() )
1428 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1431 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1432 #else // !wxUSE_FONTMAP
1433 static const wxChar
*names
[] =
1435 #if SIZEOF_WCHAR_T == 4
1437 #elif SIZEOF_WCHAR_T = 2
1442 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1444 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1446 const wxString
nameCS(*names
);
1448 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1449 wxString
nameXE(nameCS
);
1450 #ifdef WORDS_BIGENDIAN
1452 #else // little endian
1456 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1459 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1460 if ( m2w
== ICONV_T_INVALID
)
1462 // try charset w/o bytesex info (e.g. "UCS4")
1463 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1465 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1467 // and check for bytesex ourselves:
1468 if ( m2w
!= ICONV_T_INVALID
)
1470 char buf
[2], *bufPtr
;
1471 wchar_t wbuf
[2], *wbufPtr
;
1479 outsz
= SIZEOF_WCHAR_T
* 2;
1483 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1484 (char**)&wbufPtr
, &outsz
);
1486 if (ICONV_FAILED(res
, insz
))
1488 wxLogLastError(wxT("iconv"));
1489 wxLogError(_("Conversion to charset '%s' doesn't work."),
1492 else // ok, can convert to this encoding, remember it
1494 ms_wcCharsetName
= nameCS
;
1495 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1499 else // use charset not requiring byte swapping
1501 ms_wcCharsetName
= nameXE
;
1505 wxLogTrace(TRACE_STRCONV
,
1506 wxT("iconv wchar_t charset is \"%s\"%s"),
1507 ms_wcCharsetName
.empty() ? _T("<none>")
1508 : ms_wcCharsetName
.c_str(),
1509 ms_wcNeedsSwap
? _T(" (needs swap)")
1512 else // we already have ms_wcCharsetName
1514 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1517 if ( ms_wcCharsetName
.empty() )
1519 w2m
= ICONV_T_INVALID
;
1523 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1524 if ( w2m
== ICONV_T_INVALID
)
1526 wxLogTrace(TRACE_STRCONV
,
1527 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1528 ms_wcCharsetName
.c_str(), cname
.data());
1533 wxMBConv_iconv::~wxMBConv_iconv()
1535 if ( m2w
!= ICONV_T_INVALID
)
1537 if ( w2m
!= ICONV_T_INVALID
)
1541 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// find the string length: notice that this must be done differently for
1544 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1546 const size_t nulLen
= GetMinMBCharWidth();
1553 inbuf
= strlen(psz
); // arguably more optimized than our version
1558 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1559 // they also have to start at character boundary and not span two
1560 // adjacent characters
1562 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
// NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
//     Unfortunately there are a couple of global wxCSConv objects such as
1571 // wxConvLocal that are used all over wx code, so we have to make sure
1572 // the handle is used by at most one thread at the time. Otherwise
1573 // only a few wx classes would be safe to use from non-main threads
1574 // as MB<->WC conversion would fail "randomly".
1575 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1576 #endif // wxUSE_THREADS
1579 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1581 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1582 wchar_t *bufPtr
= buf
;
1583 const char *pszPtr
= psz
;
1587 // have destination buffer, convert there
1589 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1590 (char**)&bufPtr
, &outbuf
);
1591 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1595 // convert to native endianness
1596 for ( unsigned i
= 0; i
< res
; i
++ )
1597 buf
[n
] = WC_BSWAP(buf
[i
]);
1600 // NUL-terminate the string if there is any space left
1606 // no destination buffer... convert using temp buffer
1607 // to calculate destination buffer requirement
1612 outbuf
= 8*SIZEOF_WCHAR_T
;
1615 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1616 (char**)&bufPtr
, &outbuf
);
1618 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1619 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1622 if (ICONV_FAILED(cres
, inbuf
))
1624 //VS: it is ok if iconv fails, hence trace only
1625 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1632 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1635 // NB: explained in MB2WC
1636 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1639 size_t inlen
= wxWcslen(psz
);
1640 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1644 wchar_t *tmpbuf
= 0;
1648 // need to copy to temp buffer to switch endianness
1649 // (doing WC_BSWAP twice on the original buffer won't help, as it
1650 // could be in read-only memory, or be accessed in some other thread)
1651 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1652 for ( size_t i
= 0; i
< inlen
; i
++ )
1653 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1654 tmpbuf
[inlen
] = L
'\0';
1660 // have destination buffer, convert there
1661 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1665 // NB: iconv was given only wcslen(psz) characters on input, and so
1666 // it couldn't convert the trailing zero. Let's do it ourselves
1667 // if there's some room left for it in the output buffer.
1673 // no destination buffer... convert using temp buffer
1674 // to calculate destination buffer requirement
1678 buf
= tbuf
; outbuf
= 16;
1680 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1683 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1691 if (ICONV_FAILED(cres
, inbuf
))
1693 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1700 size_t wxMBConv_iconv::GetMinMBCharWidth() const
1702 if ( m_minMBCharWidth
== 0 )
1704 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1707 // NB: explained in MB2WC
1708 wxMutexLocker
lock(self
->m_iconvMutex
);
1711 wchar_t *wnul
= L
"";
1712 char buf
[8]; // should be enough for NUL in any encoding
1713 size_t inLen
= sizeof(wchar_t),
1714 outLen
= WXSIZEOF(buf
);
1715 char *in
= (char *)wnul
;
1717 if ( iconv(w2m
, ICONV_CHAR_CAST(&in
), &inLen
, &out
, &outLen
) == (size_t)-1 )
1719 self
->m_minMBCharWidth
= (size_t)-1;
1723 self
->m_minMBCharWidth
= out
- buf
;
1727 return m_minMBCharWidth
;
1730 #endif // HAVE_ICONV
1733 // ============================================================================
1734 // Win32 conversion classes
1735 // ============================================================================
1737 #ifdef wxHAVE_WIN32_MB2WC
1741 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1742 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1745 class wxMBConv_win32
: public wxMBConv
1750 m_CodePage
= CP_ACP
;
1751 m_minMBCharWidth
= 0;
1755 wxMBConv_win32(const wxChar
* name
)
1757 m_CodePage
= wxCharsetToCodepage(name
);
1758 m_minMBCharWidth
= 0;
1761 wxMBConv_win32(wxFontEncoding encoding
)
1763 m_CodePage
= wxEncodingToCodepage(encoding
);
1764 m_minMBCharWidth
= 0;
1766 #endif // wxUSE_FONTMAP
1768 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1770 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
1771 // the behaviour is not compatible with the Unix version (using iconv)
1772 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1773 // wouldn't work if reading an incomplete MB char didn't result in an
1776 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1777 // an error (tested under Windows Server 2003) and apparently it is
1778 // done on purpose, i.e. the function accepts any input in this case
1779 // and although I'd prefer to return error on ill-formed output, our
1780 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1781 // explicitly ill-formed according to RFC 2152) either so we don't
1782 // even have any fallback here...
1784 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1785 // Win XP or newer and if it is specified on older versions, conversion
1786 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1787 // fails. So we can only use the flag on newer Windows versions.
1788 // Additionally, the flag is not supported by UTF7, symbol and CJK
1789 // encodings. See here:
1790 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1791 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1793 if ( m_CodePage
!= CP_UTF7
&& m_CodePage
!= CP_SYMBOL
&&
1794 m_CodePage
< 50000 &&
1795 IsAtLeastWin2kSP4() )
1797 flags
= MB_ERR_INVALID_CHARS
;
1799 else if ( m_CodePage
== CP_UTF8
)
1801 // Avoid round-trip in the special case of UTF-8 by using our
1802 // own UTF-8 conversion code:
1803 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
1806 const size_t len
= ::MultiByteToWideChar
1808 m_CodePage
, // code page
1809 flags
, // flags: fail on error
1810 psz
, // input string
1811 -1, // its length (NUL-terminated)
1812 buf
, // output string
1813 buf
? n
: 0 // size of output buffer
1817 // function totally failed
1821 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1822 // check if we succeeded, by doing a double trip:
1823 if ( !flags
&& buf
)
1825 const size_t mbLen
= strlen(psz
);
1826 wxCharBuffer
mbBuf(mbLen
);
1827 if ( ::WideCharToMultiByte
1834 mbLen
+ 1, // size in bytes, not length
1838 strcmp(mbBuf
, psz
) != 0 )
1840 // we didn't obtain the same thing we started from, hence
1841 // the conversion was lossy and we consider that it failed
1846 // note that it returns count of written chars for buf != NULL and size
1847 // of the needed buffer for buf == NULL so in either case the length of
1848 // the string (which never includes the terminating NUL) is one less
1852 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1855 we have a problem here: by default, WideCharToMultiByte() may
1856 replace characters unrepresentable in the target code page with bad
1857 quality approximations such as turning "1/2" symbol (U+00BD) into
1858 "1" for the code pages which don't have it and we, obviously, want
1859 to avoid this at any price
1861 the trouble is that this function does it _silently_, i.e. it won't
1862 even tell us whether it did or not... Win98/2000 and higher provide
1863 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1864 we have to resort to a round trip, i.e. check that converting back
1865 results in the same string -- this is, of course, expensive but
1866 otherwise we simply can't be sure to not garble the data.
1869 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1870 // it doesn't work with CJK encodings (which we test for rather roughly
1871 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1873 BOOL usedDef
wxDUMMY_INITIALIZE(false);
1876 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1878 // it's our lucky day
1879 flags
= WC_NO_BEST_FIT_CHARS
;
1880 pUsedDef
= &usedDef
;
1882 else // old system or unsupported encoding
1888 const size_t len
= ::WideCharToMultiByte
1890 m_CodePage
, // code page
1891 flags
, // either none or no best fit
1892 pwz
, // input string
1893 -1, // it is (wide) NUL-terminated
1894 buf
, // output buffer
1895 buf
? n
: 0, // and its size
1896 NULL
, // default "replacement" char
1897 pUsedDef
// [out] was it used?
1902 // function totally failed
1906 // if we were really converting, check if we succeeded
1911 // check if the conversion failed, i.e. if any replacements
1916 else // we must resort to double tripping...
1918 wxWCharBuffer
wcBuf(n
);
1919 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1920 wcscmp(wcBuf
, pwz
) != 0 )
1922 // we didn't obtain the same thing we started from, hence
1923 // the conversion was lossy and we consider that it failed
1929 // see the comment above for the reason of "len - 1"
// Tell whether this converter is usable: false only when m_CodePage is -1.
// NOTE(review): presumably -1 is what wxCharsetToCodepage() and
// wxEncodingToCodepage() yield for an unknown charset -- confirm against
// those helpers.
1933 bool IsOk() const { return m_CodePage
!= -1; }
1936 static bool CanUseNoBestFit()
1938 static int s_isWin98Or2k
= -1;
1940 if ( s_isWin98Or2k
== -1 )
1943 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
1946 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
1950 s_isWin98Or2k
= verMaj
>= 5;
1954 // unknown, be conservative by default
1958 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1961 return s_isWin98Or2k
== 1;
1964 static bool IsAtLeastWin2kSP4()
1969 static int s_isAtLeastWin2kSP4
= -1;
1971 if ( s_isAtLeastWin2kSP4
== -1 )
1973 OSVERSIONINFOEX ver
;
1975 memset(&ver
, 0, sizeof(ver
));
1976 ver
.dwOSVersionInfoSize
= sizeof(ver
);
1977 GetVersionEx((OSVERSIONINFO
*)&ver
);
1979 s_isAtLeastWin2kSP4
=
1980 ((ver
.dwMajorVersion
> 5) || // Vista+
1981 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
1982 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
1983 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
1987 return s_isAtLeastWin2kSP4
== 1;
1991 virtual size_t GetMinMBCharWidth() const
1993 if ( m_minMBCharWidth
== 0 )
1995 int len
= ::WideCharToMultiByte
1997 m_CodePage
, // code page
1999 L
"", // input string
2000 1, // translate just the NUL
2001 NULL
, // output buffer
2003 NULL
, // no replacement char
2004 NULL
// [out] don't care if it was used
2007 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2011 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2015 self
->m_minMBCharWidth
= (size_t)-1;
2021 self
->m_minMBCharWidth
= len
;
2026 return m_minMBCharWidth
;
2029 // the code page we're working with
2032 // cached result of GetMinMBCharWidth(), set to 0 initially meaning
2034 size_t m_minMBCharWidth
;
2037 #endif // wxHAVE_WIN32_MB2WC
2039 // ============================================================================
2040 // Cocoa conversion classes
2041 // ============================================================================
2043 #if defined(__WXCOCOA__)
2045 // RN: There is no UTF-32 support in either Core Foundation or
2046 // Cocoa. Strangely enough, Core Foundation uses UTF-32
2047 // internally quite a bit - it's just not public (yet).
2049 #include <CoreFoundation/CFString.h>
2050 #include <CoreFoundation/CFStringEncodingExt.h>
2052 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2054 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2055 if ( encoding
== wxFONTENCODING_DEFAULT
)
2057 enc
= CFStringGetSystemEncoding();
2059 else switch( encoding
)
2061 case wxFONTENCODING_ISO8859_1
:
2062 enc
= kCFStringEncodingISOLatin1
;
2064 case wxFONTENCODING_ISO8859_2
:
2065 enc
= kCFStringEncodingISOLatin2
;
2067 case wxFONTENCODING_ISO8859_3
:
2068 enc
= kCFStringEncodingISOLatin3
;
2070 case wxFONTENCODING_ISO8859_4
:
2071 enc
= kCFStringEncodingISOLatin4
;
2073 case wxFONTENCODING_ISO8859_5
:
2074 enc
= kCFStringEncodingISOLatinCyrillic
;
2076 case wxFONTENCODING_ISO8859_6
:
2077 enc
= kCFStringEncodingISOLatinArabic
;
2079 case wxFONTENCODING_ISO8859_7
:
2080 enc
= kCFStringEncodingISOLatinGreek
;
2082 case wxFONTENCODING_ISO8859_8
:
2083 enc
= kCFStringEncodingISOLatinHebrew
;
2085 case wxFONTENCODING_ISO8859_9
:
2086 enc
= kCFStringEncodingISOLatin5
;
2088 case wxFONTENCODING_ISO8859_10
:
2089 enc
= kCFStringEncodingISOLatin6
;
2091 case wxFONTENCODING_ISO8859_11
:
2092 enc
= kCFStringEncodingISOLatinThai
;
2094 case wxFONTENCODING_ISO8859_13
:
2095 enc
= kCFStringEncodingISOLatin7
;
2097 case wxFONTENCODING_ISO8859_14
:
2098 enc
= kCFStringEncodingISOLatin8
;
2100 case wxFONTENCODING_ISO8859_15
:
2101 enc
= kCFStringEncodingISOLatin9
;
2104 case wxFONTENCODING_KOI8
:
2105 enc
= kCFStringEncodingKOI8_R
;
2107 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2108 enc
= kCFStringEncodingDOSRussian
;
2111 // case wxFONTENCODING_BULGARIAN :
2115 case wxFONTENCODING_CP437
:
2116 enc
=kCFStringEncodingDOSLatinUS
;
2118 case wxFONTENCODING_CP850
:
2119 enc
= kCFStringEncodingDOSLatin1
;
2121 case wxFONTENCODING_CP852
:
2122 enc
= kCFStringEncodingDOSLatin2
;
2124 case wxFONTENCODING_CP855
:
2125 enc
= kCFStringEncodingDOSCyrillic
;
2127 case wxFONTENCODING_CP866
:
2128 enc
=kCFStringEncodingDOSRussian
;
2130 case wxFONTENCODING_CP874
:
2131 enc
= kCFStringEncodingDOSThai
;
2133 case wxFONTENCODING_CP932
:
2134 enc
= kCFStringEncodingDOSJapanese
;
2136 case wxFONTENCODING_CP936
:
2137 enc
=kCFStringEncodingDOSChineseSimplif
;
2139 case wxFONTENCODING_CP949
:
2140 enc
= kCFStringEncodingDOSKorean
;
2142 case wxFONTENCODING_CP950
:
2143 enc
= kCFStringEncodingDOSChineseTrad
;
2145 case wxFONTENCODING_CP1250
:
2146 enc
= kCFStringEncodingWindowsLatin2
;
2148 case wxFONTENCODING_CP1251
:
2149 enc
=kCFStringEncodingWindowsCyrillic
;
2151 case wxFONTENCODING_CP1252
:
2152 enc
=kCFStringEncodingWindowsLatin1
;
2154 case wxFONTENCODING_CP1253
:
2155 enc
= kCFStringEncodingWindowsGreek
;
2157 case wxFONTENCODING_CP1254
:
2158 enc
= kCFStringEncodingWindowsLatin5
;
2160 case wxFONTENCODING_CP1255
:
2161 enc
=kCFStringEncodingWindowsHebrew
;
2163 case wxFONTENCODING_CP1256
:
2164 enc
=kCFStringEncodingWindowsArabic
;
2166 case wxFONTENCODING_CP1257
:
2167 enc
= kCFStringEncodingWindowsBalticRim
;
2169 // This only really encodes to UTF7 (if that) evidently
2170 // case wxFONTENCODING_UTF7 :
2171 // enc = kCFStringEncodingNonLossyASCII ;
2173 case wxFONTENCODING_UTF8
:
2174 enc
= kCFStringEncodingUTF8
;
2176 case wxFONTENCODING_EUC_JP
:
2177 enc
= kCFStringEncodingEUC_JP
;
2179 case wxFONTENCODING_UTF16
:
2180 enc
= kCFStringEncodingUnicode
;
2182 case wxFONTENCODING_MACROMAN
:
2183 enc
= kCFStringEncodingMacRoman
;
2185 case wxFONTENCODING_MACJAPANESE
:
2186 enc
= kCFStringEncodingMacJapanese
;
2188 case wxFONTENCODING_MACCHINESETRAD
:
2189 enc
= kCFStringEncodingMacChineseTrad
;
2191 case wxFONTENCODING_MACKOREAN
:
2192 enc
= kCFStringEncodingMacKorean
;
2194 case wxFONTENCODING_MACARABIC
:
2195 enc
= kCFStringEncodingMacArabic
;
2197 case wxFONTENCODING_MACHEBREW
:
2198 enc
= kCFStringEncodingMacHebrew
;
2200 case wxFONTENCODING_MACGREEK
:
2201 enc
= kCFStringEncodingMacGreek
;
2203 case wxFONTENCODING_MACCYRILLIC
:
2204 enc
= kCFStringEncodingMacCyrillic
;
2206 case wxFONTENCODING_MACDEVANAGARI
:
2207 enc
= kCFStringEncodingMacDevanagari
;
2209 case wxFONTENCODING_MACGURMUKHI
:
2210 enc
= kCFStringEncodingMacGurmukhi
;
2212 case wxFONTENCODING_MACGUJARATI
:
2213 enc
= kCFStringEncodingMacGujarati
;
2215 case wxFONTENCODING_MACORIYA
:
2216 enc
= kCFStringEncodingMacOriya
;
2218 case wxFONTENCODING_MACBENGALI
:
2219 enc
= kCFStringEncodingMacBengali
;
2221 case wxFONTENCODING_MACTAMIL
:
2222 enc
= kCFStringEncodingMacTamil
;
2224 case wxFONTENCODING_MACTELUGU
:
2225 enc
= kCFStringEncodingMacTelugu
;
2227 case wxFONTENCODING_MACKANNADA
:
2228 enc
= kCFStringEncodingMacKannada
;
2230 case wxFONTENCODING_MACMALAJALAM
:
2231 enc
= kCFStringEncodingMacMalayalam
;
2233 case wxFONTENCODING_MACSINHALESE
:
2234 enc
= kCFStringEncodingMacSinhalese
;
2236 case wxFONTENCODING_MACBURMESE
:
2237 enc
= kCFStringEncodingMacBurmese
;
2239 case wxFONTENCODING_MACKHMER
:
2240 enc
= kCFStringEncodingMacKhmer
;
2242 case wxFONTENCODING_MACTHAI
:
2243 enc
= kCFStringEncodingMacThai
;
2245 case wxFONTENCODING_MACLAOTIAN
:
2246 enc
= kCFStringEncodingMacLaotian
;
2248 case wxFONTENCODING_MACGEORGIAN
:
2249 enc
= kCFStringEncodingMacGeorgian
;
2251 case wxFONTENCODING_MACARMENIAN
:
2252 enc
= kCFStringEncodingMacArmenian
;
2254 case wxFONTENCODING_MACCHINESESIMP
:
2255 enc
= kCFStringEncodingMacChineseSimp
;
2257 case wxFONTENCODING_MACTIBETAN
:
2258 enc
= kCFStringEncodingMacTibetan
;
2260 case wxFONTENCODING_MACMONGOLIAN
:
2261 enc
= kCFStringEncodingMacMongolian
;
2263 case wxFONTENCODING_MACETHIOPIC
:
2264 enc
= kCFStringEncodingMacEthiopic
;
2266 case wxFONTENCODING_MACCENTRALEUR
:
2267 enc
= kCFStringEncodingMacCentralEurRoman
;
2269 case wxFONTENCODING_MACVIATNAMESE
:
2270 enc
= kCFStringEncodingMacVietnamese
;
2272 case wxFONTENCODING_MACARABICEXT
:
2273 enc
= kCFStringEncodingMacExtArabic
;
2275 case wxFONTENCODING_MACSYMBOL
:
2276 enc
= kCFStringEncodingMacSymbol
;
2278 case wxFONTENCODING_MACDINGBATS
:
2279 enc
= kCFStringEncodingMacDingbats
;
2281 case wxFONTENCODING_MACTURKISH
:
2282 enc
= kCFStringEncodingMacTurkish
;
2284 case wxFONTENCODING_MACCROATIAN
:
2285 enc
= kCFStringEncodingMacCroatian
;
2287 case wxFONTENCODING_MACICELANDIC
:
2288 enc
= kCFStringEncodingMacIcelandic
;
2290 case wxFONTENCODING_MACROMANIAN
:
2291 enc
= kCFStringEncodingMacRomanian
;
2293 case wxFONTENCODING_MACCELTIC
:
2294 enc
= kCFStringEncodingMacCeltic
;
2296 case wxFONTENCODING_MACGAELIC
:
2297 enc
= kCFStringEncodingMacGaelic
;
2299 // case wxFONTENCODING_MACKEYBOARD :
2300 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2303 // because gcc is picky
2309 class wxMBConv_cocoa
: public wxMBConv
2314 Init(CFStringGetSystemEncoding()) ;
2318 wxMBConv_cocoa(const wxChar
* name
)
2320 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2324 wxMBConv_cocoa(wxFontEncoding encoding
)
2326 Init( wxCFStringEncFromFontEnc(encoding
) );
2333 void Init( CFStringEncoding encoding
)
2335 m_encoding
= encoding
;
2338 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2342 CFStringRef theString
= CFStringCreateWithBytes (
2343 NULL
, //the allocator
2344 (const UInt8
*)szUnConv
,
2347 false //no BOM/external representation
2350 wxASSERT(theString
);
2352 size_t nOutLength
= CFStringGetLength(theString
);
2356 CFRelease(theString
);
2360 CFRange theRange
= { 0, nOutSize
};
2362 #if SIZEOF_WCHAR_T == 4
2363 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2366 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2368 CFRelease(theString
);
2370 szUniCharBuffer
[nOutLength
] = '\0' ;
2372 #if SIZEOF_WCHAR_T == 4
2373 wxMBConvUTF16 converter
;
2374 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
2375 delete[] szUniCharBuffer
;
2381 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2385 size_t nRealOutSize
;
2386 size_t nBufSize
= wxWcslen(szUnConv
);
2387 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2389 #if SIZEOF_WCHAR_T == 4
2390 wxMBConvUTF16 converter
;
2391 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2392 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
2393 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
2394 nBufSize
/= sizeof(UniChar
);
2397 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2401 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2404 wxASSERT(theString
);
2406 //Note that CER puts a BOM when converting to unicode
2407 //so we check and use getchars instead in that case
2408 if (m_encoding
== kCFStringEncodingUnicode
)
2411 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2413 nRealOutSize
= CFStringGetLength(theString
) + 1;
2419 CFRangeMake(0, CFStringGetLength(theString
)),
2421 0, //what to put in characters that can't be converted -
2422 //0 tells CFString to return NULL if it meets such a character
2423 false, //not an external representation
2426 (CFIndex
*) &nRealOutSize
2430 CFRelease(theString
);
2432 #if SIZEOF_WCHAR_T == 4
2433 delete[] szUniBuffer
;
2436 return nRealOutSize
- 1;
2441 return m_encoding
!= kCFStringEncodingInvalidId
&&
2442 CFStringIsEncodingAvailable(m_encoding
);
2446 CFStringEncoding m_encoding
;
2449 #endif // defined(__WXCOCOA__)
2451 // ============================================================================
2452 // Mac conversion classes
2453 // ============================================================================
2455 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2457 class wxMBConv_mac
: public wxMBConv
2462 Init(CFStringGetSystemEncoding()) ;
2466 wxMBConv_mac(const wxChar
* name
)
2468 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2472 wxMBConv_mac(wxFontEncoding encoding
)
2474 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2479 OSStatus status
= noErr
;
2480 status
= TECDisposeConverter(m_MB2WC_converter
);
2481 status
= TECDisposeConverter(m_WC2MB_converter
);
2485 void Init( TextEncodingBase encoding
)
2487 OSStatus status
= noErr
;
2488 m_char_encoding
= encoding
;
2489 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2491 status
= TECCreateConverter(&m_MB2WC_converter
,
2493 m_unicode_encoding
);
2494 status
= TECCreateConverter(&m_WC2MB_converter
,
2499 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2501 OSStatus status
= noErr
;
2502 ByteCount byteOutLen
;
2503 ByteCount byteInLen
= strlen(psz
) ;
2504 wchar_t *tbuf
= NULL
;
2505 UniChar
* ubuf
= NULL
;
2510 //apple specs say at least 32
2511 n
= wxMax( 32 , byteInLen
) ;
2512 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2514 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2515 #if SIZEOF_WCHAR_T == 4
2516 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2518 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2520 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2521 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2522 #if SIZEOF_WCHAR_T == 4
2523 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2524 // is not properly terminated we get random characters at the end
2525 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2526 wxMBConvUTF16 converter
;
2527 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2530 res
= byteOutLen
/ sizeof( UniChar
) ;
2535 if ( buf
&& res
< n
)
2541 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2543 OSStatus status
= noErr
;
2544 ByteCount byteOutLen
;
2545 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2551 //apple specs say at least 32
2552 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2553 tbuf
= (char*) malloc( n
) ;
2556 ByteCount byteBufferLen
= n
;
2557 UniChar
* ubuf
= NULL
;
2558 #if SIZEOF_WCHAR_T == 4
2559 wxMBConvUTF16 converter
;
2560 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2561 byteInLen
= unicharlen
;
2562 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2563 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2565 ubuf
= (UniChar
*) psz
;
2567 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2568 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2569 #if SIZEOF_WCHAR_T == 4
2575 size_t res
= byteOutLen
;
2576 if ( buf
&& res
< n
)
2580 //we need to double-trip to verify it didn't insert any ? in place
2581 //of bogus characters
2582 wxWCharBuffer
wcBuf(n
);
2583 size_t pszlen
= wxWcslen(psz
);
2584 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2585 wxWcslen(wcBuf
) != pszlen
||
2586 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2588 // we didn't obtain the same thing we started from, hence
2589 // the conversion was lossy and we consider that it failed
2598 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2601 TECObjectRef m_MB2WC_converter
;
2602 TECObjectRef m_WC2MB_converter
;
2604 TextEncodingBase m_char_encoding
;
2605 TextEncodingBase m_unicode_encoding
;
2608 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2610 // ============================================================================
2611 // wxEncodingConverter based conversion classes
2612 // ============================================================================
2616 class wxMBConv_wxwin
: public wxMBConv
2621 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2622 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2626 // temporarily just use wxEncodingConverter stuff,
2627 // so that it works while a better implementation is built
2628 wxMBConv_wxwin(const wxChar
* name
)
2631 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2633 m_enc
= wxFONTENCODING_SYSTEM
;
2638 wxMBConv_wxwin(wxFontEncoding enc
)
2645 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2647 size_t inbuf
= strlen(psz
);
2650 if (!m2w
.Convert(psz
,buf
))
2656 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2658 const size_t inbuf
= wxWcslen(psz
);
2661 if (!w2m
.Convert(psz
,buf
))
// Report whether the converter was initialised successfully, i.e. the value
// of m_ok, which is set from the result of initialising both the m2w and
// w2m encoding converters.
2668 bool IsOk() const { return m_ok
; }
2671 wxFontEncoding m_enc
;
2672 wxEncodingConverter m2w
, w2m
;
2675 virtual size_t GetMinMBCharWidth() const
2679 case wxFONTENCODING_UTF16BE
:
2680 case wxFONTENCODING_UTF16LE
:
2683 case wxFONTENCODING_UTF32BE
:
2684 case wxFONTENCODING_UTF32LE
:
2692 // were we initialized successfully?
2695 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2698 // make the constructors available for unit testing
2699 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2701 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2702 if ( !result
->IsOk() )
2710 #endif // wxUSE_FONTMAP
2712 // ============================================================================
2713 // wxCSConv implementation
2714 // ============================================================================
2716 void wxCSConv::Init()
2723 wxCSConv::wxCSConv(const wxChar
*charset
)
2733 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2735 m_encoding
= wxFONTENCODING_SYSTEM
;
2739 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2741 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2743 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2745 encoding
= wxFONTENCODING_SYSTEM
;
2750 m_encoding
= encoding
;
2753 wxCSConv::~wxCSConv()
2758 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2763 SetName(conv
.m_name
);
2764 m_encoding
= conv
.m_encoding
;
2767 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2771 SetName(conv
.m_name
);
2772 m_encoding
= conv
.m_encoding
;
2777 void wxCSConv::Clear()
2786 void wxCSConv::SetName(const wxChar
*charset
)
2790 m_name
= wxStrdup(charset
);
2796 #include "wx/hashmap.h"
2798 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2799 wxEncodingNameCache
);
2801 static wxEncodingNameCache gs_nameCache
;
2804 wxMBConv
*wxCSConv::DoCreate() const
2807 wxLogTrace(TRACE_STRCONV
,
2808 wxT("creating conversion for %s"),
2810 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
2811 #endif // wxUSE_FONTMAP
2813 // check for the special case of ASCII or ISO8859-1 charset: as we have
2814 // special knowledge of it anyhow, we don't need to create a special
2815 // conversion object
2816 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2817 m_encoding
== wxFONTENCODING_DEFAULT
)
2819 // don't convert at all
2823 // we trust OS to do conversion better than we can so try external
2824 // conversion methods first
2826 // the full order is:
2827 // 1. OS conversion (iconv() under Unix or Win32 API)
2828 // 2. hard coded conversions for UTF
2829 // 3. wxEncodingConverter as fall back
2835 #endif // !wxUSE_FONTMAP
2837 wxString
name(m_name
);
2838 wxFontEncoding
encoding(m_encoding
);
2840 if ( !name
.empty() )
2842 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2850 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2851 #endif // wxUSE_FONTMAP
2855 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2856 if ( it
!= gs_nameCache
.end() )
2858 if ( it
->second
.empty() )
2861 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
2868 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2870 for ( ; *names
; ++names
)
2872 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
2875 gs_nameCache
[encoding
] = *names
;
2882 gs_nameCache
[encoding
] = _T(""); // cache the failure
2884 #endif // wxUSE_FONTMAP
2886 #endif // HAVE_ICONV
2888 #ifdef wxHAVE_WIN32_MB2WC
2891 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2892 : new wxMBConv_win32(m_encoding
);
2901 #endif // wxHAVE_WIN32_MB2WC
2902 #if defined(__WXMAC__)
2904 // leave UTF16 and UTF32 to the built-ins of wx
2905 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2906 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2910 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2911 : new wxMBConv_mac(m_encoding
);
2913 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
2922 #if defined(__WXCOCOA__)
2924 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
2928 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
2929 : new wxMBConv_cocoa(m_encoding
);
2931 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
2941 wxFontEncoding enc
= m_encoding
;
2943 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2945 // use "false" to suppress interactive dialogs -- we can be called from
2946 // anywhere and popping up a dialog from here is the last thing we want to
2948 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2950 #endif // wxUSE_FONTMAP
2954 case wxFONTENCODING_UTF7
:
2955 return new wxMBConvUTF7
;
2957 case wxFONTENCODING_UTF8
:
2958 return new wxMBConvUTF8
;
2960 case wxFONTENCODING_UTF16BE
:
2961 return new wxMBConvUTF16BE
;
2963 case wxFONTENCODING_UTF16LE
:
2964 return new wxMBConvUTF16LE
;
2966 case wxFONTENCODING_UTF32BE
:
2967 return new wxMBConvUTF32BE
;
2969 case wxFONTENCODING_UTF32LE
:
2970 return new wxMBConvUTF32LE
;
2973 // nothing to do but put here to suppress gcc warnings
2980 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2981 : new wxMBConv_wxwin(m_encoding
);
2987 #endif // wxUSE_FONTMAP
2989 // NB: This is a hack to prevent deadlock. What could otherwise happen
2990 // in Unicode build: wxConvLocal creation ends up being here
2991 // because of some failure and logs the error. But wxLog will try to
2992 // attach timestamp, for which it will need wxConvLocal (to convert
2993 // time to char* and then wchar_t*), but that fails, tries to log
2994 // error, but wxLog has a (already locked) critical section that
2995 // guards static buffer.
2996 static bool alreadyLoggingError
= false;
2997 if (!alreadyLoggingError
)
2999 alreadyLoggingError
= true;
3000 wxLogError(_("Cannot convert from the charset '%s'!"),
3004 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3005 #else // !wxUSE_FONTMAP
3006 wxString::Format(_("encoding %s"), m_encoding
).c_str()
3007 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3009 alreadyLoggingError
= false;
3015 void wxCSConv::CreateConvIfNeeded() const
3019 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3022 // if we have neither the name nor the encoding, use the default
3023 // encoding for this system
3024 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3026 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3028 #endif // wxUSE_INTL
3030 self
->m_convReal
= DoCreate();
3031 self
->m_deferred
= false;
3035 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3037 CreateConvIfNeeded();
3040 return m_convReal
->MB2WC(buf
, psz
, n
);
3043 size_t len
= strlen(psz
);
3047 for (size_t c
= 0; c
<= len
; c
++)
3048 buf
[c
] = (unsigned char)(psz
[c
]);
3054 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3056 CreateConvIfNeeded();
3059 return m_convReal
->WC2MB(buf
, psz
, n
);
3062 const size_t len
= wxWcslen(psz
);
3065 for (size_t c
= 0; c
<= len
; c
++)
3069 buf
[c
] = (char)psz
[c
];
3074 for (size_t c
= 0; c
<= len
; c
++)
3084 size_t wxCSConv::GetMinMBCharWidth() const
3086 CreateConvIfNeeded();
3090 // cast needed just to call private function of m_convReal
3091 return ((wxCSConv
*)m_convReal
)->GetMinMBCharWidth();
3097 // ----------------------------------------------------------------------------
3099 // ----------------------------------------------------------------------------
3102 static wxMBConv_win32 wxConvLibcObj
;
3103 #elif defined(__WXMAC__) && !defined(__MACH__)
3104 static wxMBConv_mac wxConvLibcObj
;
3106 static wxMBConvLibc wxConvLibcObj
;
3109 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3110 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3111 static wxMBConvUTF7 wxConvUTF7Obj
;
3112 static wxMBConvUTF8 wxConvUTF8Obj
;
3114 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3115 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3116 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3117 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3118 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
3119 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3120 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3128 #else // !wxUSE_WCHAR_T
3130 // stand-ins in absence of wchar_t
3131 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3136 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T