1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
35 #include "wx/strconv.h"
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
62 #include "wx/thread.h"
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
76 #include "wx/mac/private.h" // includes mac headers
79 #define TRACE_STRCONV _T("strconv")
81 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
83 #if SIZEOF_WCHAR_T == 2
87 // ============================================================================
89 // ============================================================================
91 // helper function of cMB2WC(): check if n bytes at this location are all NUL
92 static bool NotAllNULs(const char *p
, size_t n
)
94 while ( n
&& *p
++ == '\0' )
100 // ----------------------------------------------------------------------------
101 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
102 // ----------------------------------------------------------------------------
// Encodes the UCS-4 value `input` as UTF-16 code unit(s) stored at `output`.
//
// From the visible code:
//  - a value that fits in a single unit is stored as-is (the exact range
//    test is not shown here);
//  - values >= 0x110000 are outside the Unicode range and are rejected with
//    wxCONV_FAILED;
//  - any other value is written as a surrogate pair: the high surrogate is
//    (input >> 10) + 0xd7c0 and the low surrogate is (input & 0x3ff) + 0xdc00.
//
// NOTE(review): callers compare the result with 2 for the surrogate case, so
// the return value is presumably the number of 16-bit units produced -- the
// return statements are not visible here, confirm.
// NOTE(review): some callers pass a buffer that may be NULL; any NULL check
// is not visible in this fragment -- confirm.
104 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
109 *output
= (wxUint16
) input
;
112 else if (input
>=0x110000)
114 return wxCONV_FAILED
;
120 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
121 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
// Decodes one character from the UTF-16 sequence starting at `input` and
// stores the resulting UCS-4 value in `output`.
//
// From the visible code:
//  - a first unit outside 0xd800..0xdfff is not a surrogate and is handled
//    as a single-unit character;
//  - otherwise the second unit must lie in 0xdc00..0xdfff, else the function
//    returns wxCONV_FAILED;
//  - a valid pair is combined as
//    ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00).
//
// NOTE(review): callers advance their input pointer by the returned value,
// so it is presumably the number of 16-bit units consumed -- the return
// statements are not visible here, confirm.
127 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
129 if ((*input
<0xd800) || (*input
>0xdfff))
134 else if ((input
[1]<0xdc00) || (input
[1]>0xdfff))
137 return wxCONV_FAILED
;
141 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
148 // returns the next UTF-32 character from the wchar_t buffer and advances the
149 // pointer to the character after this one
151 // if an invalid character is found, *pSrc is set to NULL, the caller must
153 static wxUint32
wxDecodeSurrogate(const wchar_t **pSrc
)
156 const size_t n
= decode_utf16(*pSrc
, out
);
157 if ( n
== wxCONV_FAILED
)
167 // ----------------------------------------------------------------------------
169 // ----------------------------------------------------------------------------
172 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
173 const char *src
, size_t srcLen
) const
175 // although new conversion classes are supposed to implement this function
176 // directly, the existing ones only implement the old MB2WC() and so, to
177 // avoid having to rewrite all conversion classes at once, we provide a
178 // default (but not efficient) implementation of this one in terms of the
179 // old function by copying the input to ensure that it's NUL-terminated and
180 // then using MB2WC() to convert it
182 // the number of chars [which would be] written to dst [if it were not NULL]
183 size_t dstWritten
= 0;
185 // the number of NULs terminating this string
186 size_t nulLen
wxDUMMY_INITIALIZE(0);
188 // if we were not given the input size we just have to assume that the
189 // string is properly terminated as we have no way of knowing how long it
190 // is anyhow, but if we do have the size check whether there are enough
194 if ( srcLen
!= wxNO_LEN
)
196 // we need to know how to find the end of this string
197 nulLen
= GetMBNulLen();
198 if ( nulLen
== wxCONV_FAILED
)
199 return wxCONV_FAILED
;
201 // if there are enough NULs we can avoid the copy
202 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
204 // make a copy in order to properly NUL-terminate the string
205 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
206 char * const p
= bufTmp
.data();
207 memcpy(p
, src
, srcLen
);
208 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
214 srcEnd
= src
+ srcLen
;
216 else // quit after the first loop iteration
223 // try to convert the current chunk
224 size_t lenChunk
= MB2WC(NULL
, src
, 0);
225 if ( lenChunk
== wxCONV_FAILED
)
226 return wxCONV_FAILED
;
228 lenChunk
++; // for the L'\0' at the end of this chunk
230 dstWritten
+= lenChunk
;
234 // nothing left in the input string, conversion succeeded
240 if ( dstWritten
> dstLen
)
241 return wxCONV_FAILED
;
243 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
244 return wxCONV_FAILED
;
251 // we convert just one chunk in this case as this is the entire
256 // advance the input pointer past the end of this chunk
257 while ( NotAllNULs(src
, nulLen
) )
259 // notice that we must skip over multiple bytes here as we suppose
260 // that if NUL takes 2 or 4 bytes, then all the other characters do
261 // too and so if we advanced by a single byte we might erroneously
262 // detect sequences of NUL bytes in the middle of the input
266 src
+= nulLen
; // skipping over its terminator as well
268 // note that ">=" (and not just "==") is needed here as the terminator
269 // we skipped just above could be inside or just after the buffer
270 // delimited by srcEnd
279 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
280 const wchar_t *src
, size_t srcLen
) const
282 // the number of chars [which would be] written to dst [if it were not NULL]
283 size_t dstWritten
= 0;
285 // make a copy of the input string unless it is already properly
288 // if we don't know its length we have no choice but to assume that it is,
289 // indeed, properly terminated
290 wxWCharBuffer bufTmp
;
291 if ( srcLen
== wxNO_LEN
)
293 srcLen
= wxWcslen(src
) + 1;
295 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
297 // make a copy in order to properly NUL-terminate the string
298 bufTmp
= wxWCharBuffer(srcLen
);
299 memcpy(bufTmp
.data(), src
, srcLen
*sizeof(wchar_t));
303 const size_t lenNul
= GetMBNulLen();
304 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
306 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
308 // try to convert the current chunk
309 size_t lenChunk
= WC2MB(NULL
, src
, 0);
311 if ( lenChunk
== wxCONV_FAILED
)
312 return wxCONV_FAILED
;
315 dstWritten
+= lenChunk
;
319 if ( dstWritten
> dstLen
)
320 return wxCONV_FAILED
;
322 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
323 return wxCONV_FAILED
;
332 size_t wxMBConv::MB2WC(wchar_t *out
, const char *in
, size_t outLen
) const
334 size_t rc
= ToWChar(out
, outLen
, in
);
335 if ( rc
!= wxCONV_FAILED
)
337 // ToWChar() returns the buffer length, i.e. including the trailing
338 // NUL, while this method doesn't take it into account
345 size_t wxMBConv::WC2MB(char *out
, const wchar_t *in
, size_t outLen
) const
347 size_t rc
= FromWChar(out
, outLen
, in
);
348 if ( rc
!= wxCONV_FAILED
)
356 wxMBConv::~wxMBConv()
358 // nothing to do here (necessary for Darwin linking probably)
361 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
365 // calculate the length of the buffer needed first
366 const size_t nLen
= MB2WC(NULL
, psz
, 0);
367 if ( nLen
!= wxCONV_FAILED
)
369 // now do the actual conversion
370 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
372 // +1 for the trailing NUL
373 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
378 return wxWCharBuffer();
381 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
385 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
386 if ( nLen
!= wxCONV_FAILED
)
388 // extra space for trailing NUL(s)
389 static const size_t extraLen
= GetMaxMBNulLen();
391 wxCharBuffer
buf(nLen
+ extraLen
- 1);
392 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
397 return wxCharBuffer();
401 wxMBConv::cMB2WC(const char *in
, size_t inLen
, size_t *outLen
) const
403 const size_t dstLen
= ToWChar(NULL
, 0, in
, inLen
);
404 if ( dstLen
!= wxCONV_FAILED
)
406 wxWCharBuffer
wbuf(dstLen
- 1);
407 if ( ToWChar(wbuf
.data(), dstLen
, in
, inLen
) != wxCONV_FAILED
)
412 if ( wbuf
[dstLen
- 1] == L
'\0' )
423 return wxWCharBuffer();
427 wxMBConv::cWC2MB(const wchar_t *in
, size_t inLen
, size_t *outLen
) const
429 const size_t dstLen
= FromWChar(NULL
, 0, in
, inLen
);
430 if ( dstLen
!= wxCONV_FAILED
)
432 wxCharBuffer
buf(dstLen
- 1);
433 if ( FromWChar(buf
.data(), dstLen
, in
, inLen
) != wxCONV_FAILED
)
439 const size_t nulLen
= GetMBNulLen();
440 if ( !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
442 // in this case the output is NUL-terminated and we're not
443 // supposed to count NUL
455 return wxCharBuffer();
458 // ----------------------------------------------------------------------------
460 // ----------------------------------------------------------------------------
462 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
464 return wxMB2WC(buf
, psz
, n
);
467 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
469 return wxWC2MB(buf
, psz
, n
);
472 // ----------------------------------------------------------------------------
473 // wxConvBrokenFileNames
474 // ----------------------------------------------------------------------------
478 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
480 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
481 || wxStricmp(charset
, _T("UTF8")) == 0 )
482 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
484 m_conv
= new wxCSConv(charset
);
489 // ----------------------------------------------------------------------------
491 // ----------------------------------------------------------------------------
493 // Implementation (C) 2004 Fredrik Roubert
496 // BASE64 decoding table
498 static const unsigned char utf7unb64
[] =
500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
502 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
504 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
506 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
507 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
509 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
510 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
511 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
513 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
514 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
515 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
534 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
538 while ( *psz
&& (!buf
|| (len
< n
)) )
540 unsigned char cc
= *psz
++;
548 else if (*psz
== '-')
556 else // start of BASE64 encoded string
560 for ( ok
= lsb
= false, d
= 0, l
= 0;
561 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
566 for (l
+= 6; l
>= 8; lsb
= !lsb
)
568 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
578 *buf
= (wchar_t)(c
<< 8);
587 // in valid UTF7 we should have valid characters after '+'
588 return wxCONV_FAILED
;
596 if ( buf
&& (len
< n
) )
603 // BASE64 encoding table
605 static const unsigned char utf7enb64
[] =
607 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
608 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
609 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
610 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
611 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
612 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
613 'w', 'x', 'y', 'z', '0', '1', '2', '3',
614 '4', '5', '6', '7', '8', '9', '+', '/'
618 // UTF-7 encoding table
620 // 0 - Set D (directly encoded characters)
621 // 1 - Set O (optional direct characters)
622 // 2 - whitespace characters (optional)
623 // 3 - special characters
625 static const unsigned char utf7encode
[128] =
627 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
628 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
629 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
630 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
631 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
632 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
633 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
634 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
637 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
641 while (*psz
&& ((!buf
) || (len
< n
)))
644 if (cc
< 0x80 && utf7encode
[cc
] < 1)
652 else if (((wxUint32
)cc
) > 0xffff)
654 // no surrogate pair generation (yet?)
655 return wxCONV_FAILED
;
665 // BASE64 encode string
666 unsigned int lsb
, d
, l
;
667 for (d
= 0, l
= 0; /*nothing*/; psz
++)
669 for (lsb
= 0; lsb
< 2; lsb
++)
672 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
674 for (l
+= 8; l
>= 6; )
678 *buf
++ = utf7enb64
[(d
>> l
) % 64];
683 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
689 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
698 if (buf
&& (len
< n
))
703 // ----------------------------------------------------------------------------
705 // ----------------------------------------------------------------------------
707 static wxUint32 utf8_max
[]=
708 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
710 // boundaries of the private use area we use to (temporarily) remap
711 // characters which are invalid in a UTF-8 encoded string
712 const wxUint32 wxUnicodePUA
= 0x100000;
713 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
715 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
719 while (*psz
&& ((!buf
) || (len
< n
)))
721 const char *opsz
= psz
;
722 bool invalid
= false;
723 unsigned char cc
= *psz
++, fc
= cc
;
725 for (cnt
= 0; fc
& 0x80; cnt
++)
734 // escape the escape character for octal escapes
735 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
736 && cc
== '\\' && (!buf
|| len
< n
))
748 // invalid UTF-8 sequence
753 unsigned ocnt
= cnt
- 1;
754 wxUint32 res
= cc
& (0x3f >> cnt
);
758 if ((cc
& 0xC0) != 0x80)
760 // invalid UTF-8 sequence
765 res
= (res
<< 6) | (cc
& 0x3f);
767 if (invalid
|| res
<= utf8_max
[ocnt
])
769 // illegal UTF-8 encoding
772 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
773 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
775 // if one of our PUA characters turns up externally
776 // it must also be treated as an illegal sequence
777 // (a bit like you have to escape an escape character)
783 // cast is ok because wchar_t == wxUint16 if WC_UTF16
784 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
785 if (pa
== wxCONV_FAILED
)
797 *buf
++ = (wchar_t)res
;
799 #endif // WC_UTF16/!WC_UTF16
804 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
806 while (opsz
< psz
&& (!buf
|| len
< n
))
809 // cast is ok because wchar_t == wxUint16 if WC_UTF16
810 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
811 wxASSERT(pa
!= wxCONV_FAILED
);
818 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
824 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
826 while (opsz
< psz
&& (!buf
|| len
< n
))
828 if ( buf
&& len
+ 3 < n
)
830 unsigned char on
= *opsz
;
832 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
833 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
834 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
840 else // MAP_INVALID_UTF8_NOT
842 return wxCONV_FAILED
;
847 if (buf
&& (len
< n
))
// Returns true if wch is one of the octal digit characters L'0' .. L'7'.
852 static inline bool isoctal(wchar_t wch
)
854 return L
'0' <= wch
&& wch
<= L
'7';
857 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
861 while (*psz
&& ((!buf
) || (len
< n
)))
865 // cast is ok for WC_UTF16
866 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
867 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
869 cc
=(*psz
++) & 0x7fffffff;
872 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
873 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
876 *buf
++ = (char)(cc
- wxUnicodePUA
);
879 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
880 && cc
== L
'\\' && psz
[0] == L
'\\' )
887 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
889 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
893 *buf
++ = (char) ((psz
[0] - L
'0')*0100 +
894 (psz
[1] - L
'0')*010 +
904 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
918 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
920 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
932 // ============================================================================
934 // ============================================================================
936 #ifdef WORDS_BIGENDIAN
937 #define wxMBConvUTF16straight wxMBConvUTF16BE
938 #define wxMBConvUTF16swap wxMBConvUTF16LE
940 #define wxMBConvUTF16swap wxMBConvUTF16BE
941 #define wxMBConvUTF16straight wxMBConvUTF16LE
945 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
947 if ( srcLen
== wxNO_LEN
)
949 // count the number of bytes in input, including the trailing NULs
950 const wxUint16
*in
= wx_reinterpret_cast(const wxUint16
*, src
);
951 for ( srcLen
= 1; *in
++; srcLen
++ )
954 srcLen
*= BYTES_PER_CHAR
;
956 else // we already have the length
958 // we can only convert an entire number of UTF-16 characters
959 if ( srcLen
% BYTES_PER_CHAR
)
960 return wxCONV_FAILED
;
966 // case when in-memory representation is UTF-16 too
969 // ----------------------------------------------------------------------------
970 // conversions without endianness change
971 // ----------------------------------------------------------------------------
974 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
975 const char *src
, size_t srcLen
) const
977 // set up the scene for using memcpy() (which is presumably more efficient
978 // than copying the bytes one by one)
979 srcLen
= GetLength(src
, srcLen
);
980 if ( srcLen
== wxNO_LEN
)
981 return wxCONV_FAILED
;
983 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
986 if ( dstLen
< inLen
)
987 return wxCONV_FAILED
;
989 memcpy(dst
, src
, srcLen
);
996 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
997 const wchar_t *src
, size_t srcLen
) const
999 if ( srcLen
== wxNO_LEN
)
1000 srcLen
= wxWcslen(src
) + 1;
1002 srcLen
*= BYTES_PER_CHAR
;
1006 if ( dstLen
< srcLen
)
1007 return wxCONV_FAILED
;
1009 memcpy(dst
, src
, srcLen
);
1015 // ----------------------------------------------------------------------------
1016 // endian-reversing conversions
1017 // ----------------------------------------------------------------------------
1020 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1021 const char *src
, size_t srcLen
) const
1023 srcLen
= GetLength(src
, srcLen
);
1024 if ( srcLen
== wxNO_LEN
)
1025 return wxCONV_FAILED
;
1027 srcLen
/= BYTES_PER_CHAR
;
1031 if ( dstLen
< srcLen
)
1032 return wxCONV_FAILED
;
1034 const wxUint16
*in
= wx_reinterpret_cast(const wxUint16
*, src
);
1035 for ( size_t n
= 0; n
< srcLen
; n
++, in
++ )
1037 *dst
++ = wxUINT16_SWAP_ALWAYS(*in
);
1045 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1046 const wchar_t *src
, size_t srcLen
) const
1048 if ( srcLen
== wxNO_LEN
)
1049 srcLen
= wxWcslen(src
) + 1;
1051 srcLen
*= BYTES_PER_CHAR
;
1055 if ( dstLen
< srcLen
)
1056 return wxCONV_FAILED
;
1058 wxUint16
*out
= wx_reinterpret_cast(wxUint16
*, dst
);
1059 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1061 *out
++ = wxUINT16_SWAP_ALWAYS(*src
);
1068 #else // !WC_UTF16: wchar_t is UTF-32
1070 // ----------------------------------------------------------------------------
1071 // conversions without endianness change
1072 // ----------------------------------------------------------------------------
1074 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1078 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1081 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
1082 if (pa
== wxCONV_FAILED
)
1086 *buf
++ = (wchar_t)cc
;
1088 psz
+= pa
* sizeof(wxUint16
);
1090 if (buf
&& len
<n
) *buf
=0;
1096 // copy 32bit String to 16bit MB
1097 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1101 while (*psz
&& (!buf
|| len
< n
))
1104 size_t pa
=encode_utf16(*psz
, cc
);
1106 if (pa
== wxCONV_FAILED
)
1111 *(wxUint16
*)buf
= cc
[0];
1112 buf
+= sizeof(wxUint16
);
1115 *(wxUint16
*)buf
= cc
[1];
1116 buf
+= sizeof(wxUint16
);
1120 len
+= pa
*sizeof(wxUint16
);
1123 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
1128 // ----------------------------------------------------------------------------
1129 // endian-reversing conversions
1130 // ----------------------------------------------------------------------------
1132 // swap 16bit MB to 32bit String
1133 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1137 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
1141 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
1142 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
1144 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
1145 if (pa
== wxCONV_FAILED
)
1149 *buf
++ = (wchar_t)cc
;
1152 psz
+= pa
* sizeof(wxUint16
);
1154 if (buf
&& len
<n
) *buf
=0;
1160 // swap 32bit String to 16bit MB
1161 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1165 while (*psz
&& (!buf
|| len
< n
))
1168 size_t pa
=encode_utf16(*psz
, cc
);
1170 if (pa
== wxCONV_FAILED
)
1175 *buf
++ = ((char*)cc
)[1];
1176 *buf
++ = ((char*)cc
)[0];
1179 *buf
++ = ((char*)cc
)[3];
1180 *buf
++ = ((char*)cc
)[2];
1184 len
+= pa
*sizeof(wxUint16
);
1187 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
1192 #endif // WC_UTF16/!WC_UTF16
1195 // ----------------------------------------------------------------------------
1197 // ----------------------------------------------------------------------------
1199 #ifdef WORDS_BIGENDIAN
1200 #define wxMBConvUTF32straight wxMBConvUTF32BE
1201 #define wxMBConvUTF32swap wxMBConvUTF32LE
1203 #define wxMBConvUTF32swap wxMBConvUTF32BE
1204 #define wxMBConvUTF32straight wxMBConvUTF32LE
1208 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1209 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1212 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1214 if ( srcLen
== wxNO_LEN
)
1216 // count the number of bytes in input, including the trailing NULs
1217 const wxUint32
*in
= wx_reinterpret_cast(const wxUint32
*, src
);
1218 for ( srcLen
= 1; *in
++; srcLen
++ )
1221 srcLen
*= BYTES_PER_CHAR
;
1223 else // we already have the length
1225 // we can only convert an entire number of UTF-32 characters
1226 if ( srcLen
% BYTES_PER_CHAR
)
1227 return wxCONV_FAILED
;
1233 // case when in-memory representation is UTF-16
1236 // ----------------------------------------------------------------------------
1237 // conversions without endianness change
1238 // ----------------------------------------------------------------------------
1241 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1242 const char *src
, size_t srcLen
) const
1244 srcLen
= GetLength(src
, srcLen
);
1245 if ( srcLen
== wxNO_LEN
)
1246 return wxCONV_FAILED
;
1248 const wxUint32
*in
= wx_reinterpret_cast(const wxUint32
*, src
);
1249 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1251 for ( size_t n
= 0; n
< inLen
; n
++ )
1254 const size_t numChars
= encode_utf16(*in
++, cc
);
1255 if ( numChars
== wxCONV_FAILED
)
1256 return wxCONV_FAILED
;
1261 if ( outLen
> dstLen
)
1262 return wxCONV_FAILED
;
1265 if ( numChars
== 2 )
1267 // second character of a surrogate
1277 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1278 const wchar_t *src
, size_t srcLen
) const
1280 if ( srcLen
== wxNO_LEN
)
1281 srcLen
= wxWcslen(src
) + 1;
1285 // optimization: return maximal space which could be needed for this
1286 // string instead of the exact amount which could be less if there are
1287 // any surrogates in the input
1289 // we consider that surrogates are rare enough to make it worthwhile to
1290 // avoid running the loop below at the cost of slightly extra memory
1292 return srcLen
*BYTES_PER_CHAR
;
1295 wxUint32
*out
= wx_reinterpret_cast(wxUint32
*, dst
);
1297 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1299 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1301 return wxCONV_FAILED
;
1303 outLen
+= BYTES_PER_CHAR
;
1305 if ( outLen
> dstLen
)
1306 return wxCONV_FAILED
;
1314 // ----------------------------------------------------------------------------
1315 // endian-reversing conversions
1316 // ----------------------------------------------------------------------------
1319 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1320 const char *src
, size_t srcLen
) const
1322 srcLen
= GetLength(src
, srcLen
);
1323 if ( srcLen
== wxNO_LEN
)
1324 return wxCONV_FAILED
;
1326 const wxUint32
*in
= wx_reinterpret_cast(const wxUint32
*, src
);
1327 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1329 for ( size_t n
= 0; n
< inLen
; n
++, in
++ )
1332 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*in
), cc
);
1333 if ( numChars
== wxCONV_FAILED
)
1334 return wxCONV_FAILED
;
1339 if ( outLen
> dstLen
)
1340 return wxCONV_FAILED
;
1343 if ( numChars
== 2 )
1345 // second character of a surrogate
1355 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1356 const wchar_t *src
, size_t srcLen
) const
1358 if ( srcLen
== wxNO_LEN
)
1359 srcLen
= wxWcslen(src
) + 1;
1363 // optimization: return maximal space which could be needed for this
1364 // string instead of the exact amount which could be less if there are
1365 // any surrogates in the input
1367 // we consider that surrogates are rare enough to make it worthwhile to
1368 // avoid running the loop below at the cost of slightly extra memory
1370 return srcLen
*BYTES_PER_CHAR
;
1373 wxUint32
*out
= wx_reinterpret_cast(wxUint32
*, dst
);
1375 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1377 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1379 return wxCONV_FAILED
;
1381 outLen
+= BYTES_PER_CHAR
;
1383 if ( outLen
> dstLen
)
1384 return wxCONV_FAILED
;
1386 *out
++ = wxUINT32_SWAP_ALWAYS(ch
);
1392 #else // !WC_UTF16: wchar_t is UTF-32
1394 // copy 32bit MB to 32bit String
1395 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1399 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1402 *buf
++ = (wchar_t)(*(wxUint32
*)psz
);
1404 psz
+= sizeof(wxUint32
);
1414 // copy 32bit String to 32bit MB
1415 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1419 while (*psz
&& (!buf
|| len
< n
))
1423 *(wxUint32
*)buf
= *psz
;
1424 buf
+= sizeof(wxUint32
);
1427 len
+= sizeof(wxUint32
);
1431 if (buf
&& len
<=n
-sizeof(wxUint32
))
1438 // swap 32bit MB to 32bit String
1439 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1443 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1447 ((char *)buf
)[0] = psz
[3];
1448 ((char *)buf
)[1] = psz
[2];
1449 ((char *)buf
)[2] = psz
[1];
1450 ((char *)buf
)[3] = psz
[0];
1454 psz
+= sizeof(wxUint32
);
1464 // swap 32bit String to 32bit MB
1465 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1469 while (*psz
&& (!buf
|| len
< n
))
1473 *buf
++ = ((char *)psz
)[3];
1474 *buf
++ = ((char *)psz
)[2];
1475 *buf
++ = ((char *)psz
)[1];
1476 *buf
++ = ((char *)psz
)[0];
1478 len
+= sizeof(wxUint32
);
1482 if (buf
&& len
<=n
-sizeof(wxUint32
))
1489 #endif // WC_UTF16/!WC_UTF16
1492 // ============================================================================
1493 // The classes doing conversion using the iconv_xxx() functions
1494 // ============================================================================
1498 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1499 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1500 // (unless there's yet another bug in glibc) the only case when iconv()
1501 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1502 // left in the input buffer -- when _real_ error occurs,
1503 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1505 // [This bug does not appear in glibc 2.2.]
1506 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1507 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1508 (errno != E2BIG || bufLeft != 0))
1510 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1513 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1515 #define ICONV_T_INVALID ((iconv_t)-1)
1517 #if SIZEOF_WCHAR_T == 4
1518 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1519 #define WC_ENC wxFONTENCODING_UTF32
1520 #elif SIZEOF_WCHAR_T == 2
1521 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1522 #define WC_ENC wxFONTENCODING_UTF16
1523 #else // sizeof(wchar_t) != 2 nor 4
1524 // does this ever happen?
1525 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1528 // ----------------------------------------------------------------------------
1529 // wxMBConv_iconv: encapsulates an iconv character set
1530 // ----------------------------------------------------------------------------
1532 class wxMBConv_iconv
: public wxMBConv
1535 wxMBConv_iconv(const wxChar
*name
);
1536 virtual ~wxMBConv_iconv();
1538 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1539 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1541 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1542 virtual size_t GetMBNulLen() const;
1544 virtual wxMBConv
*Clone() const
1546 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
1547 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1552 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1555 // the iconv handlers used to translate from multibyte to wide char and in
1556 // the other direction
1560 // guards access to m2w and w2m objects
1561 wxMutex m_iconvMutex
;
1565 // the name (for iconv_open()) of a wide char charset -- if none is
1566 // available on this machine, it will remain empty
1567 static wxString ms_wcCharsetName
;
1569 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1570 // different endian-ness than the native one
1571 static bool ms_wcNeedsSwap
;
1574 // name of the encoding handled by this conversion
1577 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1579 size_t m_minMBCharWidth
;
1582 // make the constructor available for unit testing
1583 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1585 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1586 if ( !result
->IsOk() )
1594 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1595 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1597 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1600 m_minMBCharWidth
= 0;
1602 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1603 // names for the charsets
1604 const wxCharBuffer
cname(wxString(name
).ToAscii());
1606 // check for charset that represents wchar_t:
1607 if ( ms_wcCharsetName
.empty() )
1609 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1612 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1613 #else // !wxUSE_FONTMAP
1614 static const wxChar
*names
[] =
1616 #if SIZEOF_WCHAR_T == 4
1618 #elif SIZEOF_WCHAR_T = 2
1623 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1625 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1627 const wxString
nameCS(*names
);
1629 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1630 wxString
nameXE(nameCS
);
1631 #ifdef WORDS_BIGENDIAN
1633 #else // little endian
1637 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1640 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1641 if ( m2w
== ICONV_T_INVALID
)
1643 // try charset w/o bytesex info (e.g. "UCS4")
1644 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1646 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1648 // and check for bytesex ourselves:
1649 if ( m2w
!= ICONV_T_INVALID
)
1651 char buf
[2], *bufPtr
;
1652 wchar_t wbuf
[2], *wbufPtr
;
1660 outsz
= SIZEOF_WCHAR_T
* 2;
1664 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1665 (char**)&wbufPtr
, &outsz
);
1667 if (ICONV_FAILED(res
, insz
))
1669 wxLogLastError(wxT("iconv"));
1670 wxLogError(_("Conversion to charset '%s' doesn't work."),
1673 else // ok, can convert to this encoding, remember it
1675 ms_wcCharsetName
= nameCS
;
1676 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1680 else // use charset not requiring byte swapping
1682 ms_wcCharsetName
= nameXE
;
1686 wxLogTrace(TRACE_STRCONV
,
1687 wxT("iconv wchar_t charset is \"%s\"%s"),
1688 ms_wcCharsetName
.empty() ? _T("<none>")
1689 : ms_wcCharsetName
.c_str(),
1690 ms_wcNeedsSwap
? _T(" (needs swap)")
1693 else // we already have ms_wcCharsetName
1695 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1698 if ( ms_wcCharsetName
.empty() )
1700 w2m
= ICONV_T_INVALID
;
1704 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1705 if ( w2m
== ICONV_T_INVALID
)
1707 wxLogTrace(TRACE_STRCONV
,
1708 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1709 ms_wcCharsetName
.c_str(), cname
.data());
1714 wxMBConv_iconv::~wxMBConv_iconv()
1716 if ( m2w
!= ICONV_T_INVALID
)
1718 if ( w2m
!= ICONV_T_INVALID
)
1722 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1724 // find the string length: notice that this must be done differently for
1725 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1727 const size_t nulLen
= GetMBNulLen();
1731 return wxCONV_FAILED
;
1734 inbuf
= strlen(psz
); // arguably more optimized than our version
1739 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1740 // they also have to start at character boundary and not span two
1741 // adjacent characters
1743 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1750 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1751 // Unfortunately there are a couple of global wxCSConv objects such as
1752 // wxConvLocal that are used all over wx code, so we have to make sure
1753 // the handle is used by at most one thread at a time. Otherwise
1754 // only a few wx classes would be safe to use from non-main threads
1755 // as MB<->WC conversion would fail "randomly".
1756 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1757 #endif // wxUSE_THREADS
1760 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1762 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1763 wchar_t *bufPtr
= buf
;
1764 const char *pszPtr
= psz
;
1768 // have destination buffer, convert there
1770 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1771 (char**)&bufPtr
, &outbuf
);
1772 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1776 // convert to native endianness
1777 for ( unsigned i
= 0; i
< res
; i
++ )
1778 buf
[n
] = WC_BSWAP(buf
[i
]);
1781 // NUL-terminate the string if there is any space left
1787 // no destination buffer... convert using temp buffer
1788 // to calculate destination buffer requirement
1793 outbuf
= 8*SIZEOF_WCHAR_T
;
1796 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1797 (char**)&bufPtr
, &outbuf
);
1799 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1800 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1803 if (ICONV_FAILED(cres
, inbuf
))
1805 //VS: it is ok if iconv fails, hence trace only
1806 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1807 return wxCONV_FAILED
;
1813 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1816 // NB: explained in MB2WC
1817 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1820 size_t inlen
= wxWcslen(psz
);
1821 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1825 wchar_t *tmpbuf
= 0;
1829 // need to copy to temp buffer to switch endianness
1830 // (doing WC_BSWAP twice on the original buffer won't help, as it
1831 // could be in read-only memory, or be accessed in some other thread)
1832 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1833 for ( size_t i
= 0; i
< inlen
; i
++ )
1834 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1835 tmpbuf
[inlen
] = L
'\0';
1841 // have destination buffer, convert there
1842 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1846 // NB: iconv was given only wcslen(psz) characters on input, and so
1847 // it couldn't convert the trailing zero. Let's do it ourselves
1848 // if there's some room left for it in the output buffer.
1854 // no destination buffer... convert using temp buffer
1855 // to calculate destination buffer requirement
1859 buf
= tbuf
; outbuf
= 16;
1861 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1864 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1872 if (ICONV_FAILED(cres
, inbuf
))
1874 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1875 return wxCONV_FAILED
;
1881 size_t wxMBConv_iconv::GetMBNulLen() const
1883 if ( m_minMBCharWidth
== 0 )
1885 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1888 // NB: explained in MB2WC
1889 wxMutexLocker
lock(self
->m_iconvMutex
);
1892 wchar_t *wnul
= L
"";
1893 char buf
[8]; // should be enough for NUL in any encoding
1894 size_t inLen
= sizeof(wchar_t),
1895 outLen
= WXSIZEOF(buf
);
1896 char *in
= (char *)wnul
;
1898 if ( iconv(w2m
, ICONV_CHAR_CAST(&in
), &inLen
, &out
, &outLen
) == (size_t)-1 )
1900 self
->m_minMBCharWidth
= (size_t)-1;
1904 self
->m_minMBCharWidth
= out
- buf
;
1908 return m_minMBCharWidth
;
1911 #endif // HAVE_ICONV
1914 // ============================================================================
1915 // Win32 conversion classes
1916 // ============================================================================
1918 #ifdef wxHAVE_WIN32_MB2WC
1922 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1923 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1926 class wxMBConv_win32
: public wxMBConv
1931 m_CodePage
= CP_ACP
;
1932 m_minMBCharWidth
= 0;
1935 wxMBConv_win32(const wxMBConv_win32
& conv
)
1937 m_CodePage
= conv
.m_CodePage
;
1938 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
1942 wxMBConv_win32(const wxChar
* name
)
1944 m_CodePage
= wxCharsetToCodepage(name
);
1945 m_minMBCharWidth
= 0;
1948 wxMBConv_win32(wxFontEncoding encoding
)
1950 m_CodePage
= wxEncodingToCodepage(encoding
);
1951 m_minMBCharWidth
= 0;
1953 #endif // wxUSE_FONTMAP
1955 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1957 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
1958 // the behaviour is not compatible with the Unix version (using iconv)
1959 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1960 // wouldn't work if reading an incomplete MB char didn't result in an
1963 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1964 // Win XP or newer and it is not supported for UTF-[78] so we always
1965 // use our own conversions in this case. See
1966 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1967 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1968 if ( m_CodePage
== CP_UTF8
)
1970 return wxConvUTF8
.MB2WC(buf
, psz
, n
);
1973 if ( m_CodePage
== CP_UTF7
)
1975 return wxConvUTF7
.MB2WC(buf
, psz
, n
);
1979 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
1980 IsAtLeastWin2kSP4() )
1982 flags
= MB_ERR_INVALID_CHARS
;
1985 const size_t len
= ::MultiByteToWideChar
1987 m_CodePage
, // code page
1988 flags
, // flags: fall on error
1989 psz
, // input string
1990 -1, // its length (NUL-terminated)
1991 buf
, // output string
1992 buf
? n
: 0 // size of output buffer
1996 // function totally failed
1997 return wxCONV_FAILED
;
2000 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2001 // check if we succeeded, by doing a double trip:
2002 if ( !flags
&& buf
)
2004 const size_t mbLen
= strlen(psz
);
2005 wxCharBuffer
mbBuf(mbLen
);
2006 if ( ::WideCharToMultiByte
2013 mbLen
+ 1, // size in bytes, not length
2017 strcmp(mbBuf
, psz
) != 0 )
2019 // we didn't obtain the same thing we started from, hence
2020 // the conversion was lossy and we consider that it failed
2021 return wxCONV_FAILED
;
2025 // note that it returns count of written chars for buf != NULL and size
2026 // of the needed buffer for buf == NULL so in either case the length of
2027 // the string (which never includes the terminating NUL) is one less
2031 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2034 we have a problem here: by default, WideCharToMultiByte() may
2035 replace characters unrepresentable in the target code page with bad
2036 quality approximations such as turning "1/2" symbol (U+00BD) into
2037 "1" for the code pages which don't have it and we, obviously, want
2038 to avoid this at any price
2040 the trouble is that this function does it _silently_, i.e. it won't
2041 even tell us whether it did or not... Win98/2000 and higher provide
2042 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2043 we have to resort to a round trip, i.e. check that converting back
2044 results in the same string -- this is, of course, expensive but
2045 otherwise we simply can't be sure to not garble the data.
2048 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2049 // it doesn't work with CJK encodings (which we test for rather roughly
2050 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2052 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2055 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2057 // it's our lucky day
2058 flags
= WC_NO_BEST_FIT_CHARS
;
2059 pUsedDef
= &usedDef
;
2061 else // old system or unsupported encoding
2067 const size_t len
= ::WideCharToMultiByte
2069 m_CodePage
, // code page
2070 flags
, // either none or no best fit
2071 pwz
, // input string
2072 -1, // it is (wide) NUL-terminated
2073 buf
, // output buffer
2074 buf
? n
: 0, // and its size
2075 NULL
, // default "replacement" char
2076 pUsedDef
// [out] was it used?
2081 // function totally failed
2082 return wxCONV_FAILED
;
2085 // if we were really converting, check if we succeeded
2090 // check if the conversion failed, i.e. if any replacements
2093 return wxCONV_FAILED
;
2095 else // we must resort to double tripping...
2097 wxWCharBuffer
wcBuf(n
);
2098 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2099 wcscmp(wcBuf
, pwz
) != 0 )
2101 // we didn't obtain the same thing we started from, hence
2102 // the conversion was lossy and we consider that it failed
2103 return wxCONV_FAILED
;
2108 // see the comment above for the reason of "len - 1"
2112 virtual size_t GetMBNulLen() const
2114 if ( m_minMBCharWidth
== 0 )
2116 int len
= ::WideCharToMultiByte
2118 m_CodePage
, // code page
2120 L
"", // input string
2121 1, // translate just the NUL
2122 NULL
, // output buffer
2124 NULL
, // no replacement char
2125 NULL
// [out] don't care if it was used
2128 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2132 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2136 self
->m_minMBCharWidth
= (size_t)-1;
2142 self
->m_minMBCharWidth
= len
;
2147 return m_minMBCharWidth
;
2150 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2152 bool IsOk() const { return m_CodePage
!= -1; }
2155 static bool CanUseNoBestFit()
2157 static int s_isWin98Or2k
= -1;
2159 if ( s_isWin98Or2k
== -1 )
2162 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2165 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2169 s_isWin98Or2k
= verMaj
>= 5;
2173 // unknown, be conservative by default
2177 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2180 return s_isWin98Or2k
== 1;
2183 static bool IsAtLeastWin2kSP4()
2188 static int s_isAtLeastWin2kSP4
= -1;
2190 if ( s_isAtLeastWin2kSP4
== -1 )
2192 OSVERSIONINFOEX ver
;
2194 memset(&ver
, 0, sizeof(ver
));
2195 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2196 GetVersionEx((OSVERSIONINFO
*)&ver
);
2198 s_isAtLeastWin2kSP4
=
2199 ((ver
.dwMajorVersion
> 5) || // Vista+
2200 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2201 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2202 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2206 return s_isAtLeastWin2kSP4
== 1;
2211 // the code page we're working with
2214 // cached result of GetMBNulLen(), set to 0 initially meaning
2216 size_t m_minMBCharWidth
;
2219 #endif // wxHAVE_WIN32_MB2WC
2221 // ============================================================================
2222 // Cocoa conversion classes
2223 // ============================================================================
2225 #if defined(__WXCOCOA__)
2227 // RN: There is no UTF-32 support in either Core Foundation or
2228 // Cocoa. Strangely enough, Core Foundation uses
2229 // UTF-32 internally quite a bit - it's just not public (yet).
2231 #include <CoreFoundation/CFString.h>
2232 #include <CoreFoundation/CFStringEncodingExt.h>
2234 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2236 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2237 if ( encoding
== wxFONTENCODING_DEFAULT
)
2239 enc
= CFStringGetSystemEncoding();
2241 else switch( encoding
)
2243 case wxFONTENCODING_ISO8859_1
:
2244 enc
= kCFStringEncodingISOLatin1
;
2246 case wxFONTENCODING_ISO8859_2
:
2247 enc
= kCFStringEncodingISOLatin2
;
2249 case wxFONTENCODING_ISO8859_3
:
2250 enc
= kCFStringEncodingISOLatin3
;
2252 case wxFONTENCODING_ISO8859_4
:
2253 enc
= kCFStringEncodingISOLatin4
;
2255 case wxFONTENCODING_ISO8859_5
:
2256 enc
= kCFStringEncodingISOLatinCyrillic
;
2258 case wxFONTENCODING_ISO8859_6
:
2259 enc
= kCFStringEncodingISOLatinArabic
;
2261 case wxFONTENCODING_ISO8859_7
:
2262 enc
= kCFStringEncodingISOLatinGreek
;
2264 case wxFONTENCODING_ISO8859_8
:
2265 enc
= kCFStringEncodingISOLatinHebrew
;
2267 case wxFONTENCODING_ISO8859_9
:
2268 enc
= kCFStringEncodingISOLatin5
;
2270 case wxFONTENCODING_ISO8859_10
:
2271 enc
= kCFStringEncodingISOLatin6
;
2273 case wxFONTENCODING_ISO8859_11
:
2274 enc
= kCFStringEncodingISOLatinThai
;
2276 case wxFONTENCODING_ISO8859_13
:
2277 enc
= kCFStringEncodingISOLatin7
;
2279 case wxFONTENCODING_ISO8859_14
:
2280 enc
= kCFStringEncodingISOLatin8
;
2282 case wxFONTENCODING_ISO8859_15
:
2283 enc
= kCFStringEncodingISOLatin9
;
2286 case wxFONTENCODING_KOI8
:
2287 enc
= kCFStringEncodingKOI8_R
;
2289 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2290 enc
= kCFStringEncodingDOSRussian
;
2293 // case wxFONTENCODING_BULGARIAN :
2297 case wxFONTENCODING_CP437
:
2298 enc
=kCFStringEncodingDOSLatinUS
;
2300 case wxFONTENCODING_CP850
:
2301 enc
= kCFStringEncodingDOSLatin1
;
2303 case wxFONTENCODING_CP852
:
2304 enc
= kCFStringEncodingDOSLatin2
;
2306 case wxFONTENCODING_CP855
:
2307 enc
= kCFStringEncodingDOSCyrillic
;
2309 case wxFONTENCODING_CP866
:
2310 enc
=kCFStringEncodingDOSRussian
;
2312 case wxFONTENCODING_CP874
:
2313 enc
= kCFStringEncodingDOSThai
;
2315 case wxFONTENCODING_CP932
:
2316 enc
= kCFStringEncodingDOSJapanese
;
2318 case wxFONTENCODING_CP936
:
2319 enc
=kCFStringEncodingDOSChineseSimplif
;
2321 case wxFONTENCODING_CP949
:
2322 enc
= kCFStringEncodingDOSKorean
;
2324 case wxFONTENCODING_CP950
:
2325 enc
= kCFStringEncodingDOSChineseTrad
;
2327 case wxFONTENCODING_CP1250
:
2328 enc
= kCFStringEncodingWindowsLatin2
;
2330 case wxFONTENCODING_CP1251
:
2331 enc
=kCFStringEncodingWindowsCyrillic
;
2333 case wxFONTENCODING_CP1252
:
2334 enc
=kCFStringEncodingWindowsLatin1
;
2336 case wxFONTENCODING_CP1253
:
2337 enc
= kCFStringEncodingWindowsGreek
;
2339 case wxFONTENCODING_CP1254
:
2340 enc
= kCFStringEncodingWindowsLatin5
;
2342 case wxFONTENCODING_CP1255
:
2343 enc
=kCFStringEncodingWindowsHebrew
;
2345 case wxFONTENCODING_CP1256
:
2346 enc
=kCFStringEncodingWindowsArabic
;
2348 case wxFONTENCODING_CP1257
:
2349 enc
= kCFStringEncodingWindowsBalticRim
;
2351 // This only really encodes to UTF7 (if that) evidently
2352 // case wxFONTENCODING_UTF7 :
2353 // enc = kCFStringEncodingNonLossyASCII ;
2355 case wxFONTENCODING_UTF8
:
2356 enc
= kCFStringEncodingUTF8
;
2358 case wxFONTENCODING_EUC_JP
:
2359 enc
= kCFStringEncodingEUC_JP
;
2361 case wxFONTENCODING_UTF16
:
2362 enc
= kCFStringEncodingUnicode
;
2364 case wxFONTENCODING_MACROMAN
:
2365 enc
= kCFStringEncodingMacRoman
;
2367 case wxFONTENCODING_MACJAPANESE
:
2368 enc
= kCFStringEncodingMacJapanese
;
2370 case wxFONTENCODING_MACCHINESETRAD
:
2371 enc
= kCFStringEncodingMacChineseTrad
;
2373 case wxFONTENCODING_MACKOREAN
:
2374 enc
= kCFStringEncodingMacKorean
;
2376 case wxFONTENCODING_MACARABIC
:
2377 enc
= kCFStringEncodingMacArabic
;
2379 case wxFONTENCODING_MACHEBREW
:
2380 enc
= kCFStringEncodingMacHebrew
;
2382 case wxFONTENCODING_MACGREEK
:
2383 enc
= kCFStringEncodingMacGreek
;
2385 case wxFONTENCODING_MACCYRILLIC
:
2386 enc
= kCFStringEncodingMacCyrillic
;
2388 case wxFONTENCODING_MACDEVANAGARI
:
2389 enc
= kCFStringEncodingMacDevanagari
;
2391 case wxFONTENCODING_MACGURMUKHI
:
2392 enc
= kCFStringEncodingMacGurmukhi
;
2394 case wxFONTENCODING_MACGUJARATI
:
2395 enc
= kCFStringEncodingMacGujarati
;
2397 case wxFONTENCODING_MACORIYA
:
2398 enc
= kCFStringEncodingMacOriya
;
2400 case wxFONTENCODING_MACBENGALI
:
2401 enc
= kCFStringEncodingMacBengali
;
2403 case wxFONTENCODING_MACTAMIL
:
2404 enc
= kCFStringEncodingMacTamil
;
2406 case wxFONTENCODING_MACTELUGU
:
2407 enc
= kCFStringEncodingMacTelugu
;
2409 case wxFONTENCODING_MACKANNADA
:
2410 enc
= kCFStringEncodingMacKannada
;
2412 case wxFONTENCODING_MACMALAJALAM
:
2413 enc
= kCFStringEncodingMacMalayalam
;
2415 case wxFONTENCODING_MACSINHALESE
:
2416 enc
= kCFStringEncodingMacSinhalese
;
2418 case wxFONTENCODING_MACBURMESE
:
2419 enc
= kCFStringEncodingMacBurmese
;
2421 case wxFONTENCODING_MACKHMER
:
2422 enc
= kCFStringEncodingMacKhmer
;
2424 case wxFONTENCODING_MACTHAI
:
2425 enc
= kCFStringEncodingMacThai
;
2427 case wxFONTENCODING_MACLAOTIAN
:
2428 enc
= kCFStringEncodingMacLaotian
;
2430 case wxFONTENCODING_MACGEORGIAN
:
2431 enc
= kCFStringEncodingMacGeorgian
;
2433 case wxFONTENCODING_MACARMENIAN
:
2434 enc
= kCFStringEncodingMacArmenian
;
2436 case wxFONTENCODING_MACCHINESESIMP
:
2437 enc
= kCFStringEncodingMacChineseSimp
;
2439 case wxFONTENCODING_MACTIBETAN
:
2440 enc
= kCFStringEncodingMacTibetan
;
2442 case wxFONTENCODING_MACMONGOLIAN
:
2443 enc
= kCFStringEncodingMacMongolian
;
2445 case wxFONTENCODING_MACETHIOPIC
:
2446 enc
= kCFStringEncodingMacEthiopic
;
2448 case wxFONTENCODING_MACCENTRALEUR
:
2449 enc
= kCFStringEncodingMacCentralEurRoman
;
2451 case wxFONTENCODING_MACVIATNAMESE
:
2452 enc
= kCFStringEncodingMacVietnamese
;
2454 case wxFONTENCODING_MACARABICEXT
:
2455 enc
= kCFStringEncodingMacExtArabic
;
2457 case wxFONTENCODING_MACSYMBOL
:
2458 enc
= kCFStringEncodingMacSymbol
;
2460 case wxFONTENCODING_MACDINGBATS
:
2461 enc
= kCFStringEncodingMacDingbats
;
2463 case wxFONTENCODING_MACTURKISH
:
2464 enc
= kCFStringEncodingMacTurkish
;
2466 case wxFONTENCODING_MACCROATIAN
:
2467 enc
= kCFStringEncodingMacCroatian
;
2469 case wxFONTENCODING_MACICELANDIC
:
2470 enc
= kCFStringEncodingMacIcelandic
;
2472 case wxFONTENCODING_MACROMANIAN
:
2473 enc
= kCFStringEncodingMacRomanian
;
2475 case wxFONTENCODING_MACCELTIC
:
2476 enc
= kCFStringEncodingMacCeltic
;
2478 case wxFONTENCODING_MACGAELIC
:
2479 enc
= kCFStringEncodingMacGaelic
;
2481 // case wxFONTENCODING_MACKEYBOARD :
2482 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2485 // because gcc is picky
2491 class wxMBConv_cocoa
: public wxMBConv
2496 Init(CFStringGetSystemEncoding()) ;
2499 wxMBConv_cocoa(const wxMBConv_cocoa
& conv
)
2501 m_encoding
= conv
.m_encoding
;
2505 wxMBConv_cocoa(const wxChar
* name
)
2507 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2511 wxMBConv_cocoa(wxFontEncoding encoding
)
2513 Init( wxCFStringEncFromFontEnc(encoding
) );
2520 void Init( CFStringEncoding encoding
)
2522 m_encoding
= encoding
;
2525 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2529 CFStringRef theString
= CFStringCreateWithBytes (
2530 NULL
, //the allocator
2531 (const UInt8
*)szUnConv
,
2534 false //no BOM/external representation
2537 wxASSERT(theString
);
2539 size_t nOutLength
= CFStringGetLength(theString
);
2543 CFRelease(theString
);
2547 CFRange theRange
= { 0, nOutSize
};
2549 #if SIZEOF_WCHAR_T == 4
2550 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2553 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2555 CFRelease(theString
);
2557 szUniCharBuffer
[nOutLength
] = '\0' ;
2559 #if SIZEOF_WCHAR_T == 4
2560 wxMBConvUTF16 converter
;
2561 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
2562 delete[] szUniCharBuffer
;
2568 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2572 size_t nRealOutSize
;
2573 size_t nBufSize
= wxWcslen(szUnConv
);
2574 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2576 #if SIZEOF_WCHAR_T == 4
2577 wxMBConvUTF16 converter
;
2578 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2579 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
2580 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
2581 nBufSize
/= sizeof(UniChar
);
2584 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2588 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2591 wxASSERT(theString
);
2593 //Note that CER puts a BOM when converting to unicode
2594 //so we check and use getchars instead in that case
2595 if (m_encoding
== kCFStringEncodingUnicode
)
2598 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2600 nRealOutSize
= CFStringGetLength(theString
) + 1;
2606 CFRangeMake(0, CFStringGetLength(theString
)),
2608 0, //what to put in characters that can't be converted -
2609 //0 tells CFString to return NULL if it meets such a character
2610 false, //not an external representation
2613 (CFIndex
*) &nRealOutSize
2617 CFRelease(theString
);
2619 #if SIZEOF_WCHAR_T == 4
2620 delete[] szUniBuffer
;
2623 return nRealOutSize
- 1;
2626 virtual wxMBConv
*Clone() const { return new wxMBConv_cocoa(*this); }
2630 return m_encoding
!= kCFStringEncodingInvalidId
&&
2631 CFStringIsEncodingAvailable(m_encoding
);
2635 CFStringEncoding m_encoding
;
2638 #endif // defined(__WXCOCOA__)
2640 // ============================================================================
2641 // Mac conversion classes
2642 // ============================================================================
2644 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2646 class wxMBConv_mac
: public wxMBConv
2651 Init(CFStringGetSystemEncoding()) ;
2654 wxMBConv_mac(const wxMBConv_mac
& conv
)
2656 Init(conv
.m_char_encoding
);
2660 wxMBConv_mac(const wxChar
* name
)
2662 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2666 wxMBConv_mac(wxFontEncoding encoding
)
2668 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2673 OSStatus status
= noErr
;
2674 status
= TECDisposeConverter(m_MB2WC_converter
);
2675 status
= TECDisposeConverter(m_WC2MB_converter
);
2679 void Init( TextEncodingBase encoding
)
2681 OSStatus status
= noErr
;
2682 m_char_encoding
= encoding
;
2683 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2685 status
= TECCreateConverter(&m_MB2WC_converter
,
2687 m_unicode_encoding
);
2688 status
= TECCreateConverter(&m_WC2MB_converter
,
2693 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2695 OSStatus status
= noErr
;
2696 ByteCount byteOutLen
;
2697 ByteCount byteInLen
= strlen(psz
) ;
2698 wchar_t *tbuf
= NULL
;
2699 UniChar
* ubuf
= NULL
;
2704 //apple specs say at least 32
2705 n
= wxMax( 32 , byteInLen
) ;
2706 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2708 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2709 #if SIZEOF_WCHAR_T == 4
2710 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2712 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2714 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2715 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2716 #if SIZEOF_WCHAR_T == 4
2717 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2718 // is not properly terminated we get random characters at the end
2719 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2720 wxMBConvUTF16 converter
;
2721 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2724 res
= byteOutLen
/ sizeof( UniChar
) ;
2729 if ( buf
&& res
< n
)
2735 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2737 OSStatus status
= noErr
;
2738 ByteCount byteOutLen
;
2739 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2745 //apple specs say at least 32
2746 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2747 tbuf
= (char*) malloc( n
) ;
2750 ByteCount byteBufferLen
= n
;
2751 UniChar
* ubuf
= NULL
;
2752 #if SIZEOF_WCHAR_T == 4
2753 wxMBConvUTF16 converter
;
2754 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2755 byteInLen
= unicharlen
;
2756 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2757 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2759 ubuf
= (UniChar
*) psz
;
2761 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2762 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2763 #if SIZEOF_WCHAR_T == 4
2769 size_t res
= byteOutLen
;
2770 if ( buf
&& res
< n
)
2774 //we need to double-trip to verify it didn't insert any ? in place
2775 //of bogus characters
2776 wxWCharBuffer
wcBuf(n
);
2777 size_t pszlen
= wxWcslen(psz
);
2778 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2779 wxWcslen(wcBuf
) != pszlen
||
2780 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2782 // we didn't obtain the same thing we started from, hence
2783 // the conversion was lossy and we consider that it failed
2784 return wxCONV_FAILED
;
2791 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
2794 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2797 TECObjectRef m_MB2WC_converter
;
2798 TECObjectRef m_WC2MB_converter
;
2800 TextEncodingBase m_char_encoding
;
2801 TextEncodingBase m_unicode_encoding
;
2804 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2806 // ============================================================================
2807 // wxEncodingConverter based conversion classes
2808 // ============================================================================
2812 class wxMBConv_wxwin
: public wxMBConv
2817 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2818 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2822 // temporarily just use wxEncodingConverter stuff,
2823 // so that it works while a better implementation is built
2824 wxMBConv_wxwin(const wxChar
* name
)
2827 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2829 m_enc
= wxFONTENCODING_SYSTEM
;
2834 wxMBConv_wxwin(wxFontEncoding enc
)
2841 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2843 size_t inbuf
= strlen(psz
);
2846 if (!m2w
.Convert(psz
,buf
))
2847 return wxCONV_FAILED
;
2852 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2854 const size_t inbuf
= wxWcslen(psz
);
2857 if (!w2m
.Convert(psz
,buf
))
2858 return wxCONV_FAILED
;
2864 virtual size_t GetMBNulLen() const
2868 case wxFONTENCODING_UTF16BE
:
2869 case wxFONTENCODING_UTF16LE
:
2872 case wxFONTENCODING_UTF32BE
:
2873 case wxFONTENCODING_UTF32LE
:
2881 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2883 bool IsOk() const { return m_ok
; }
2886 wxFontEncoding m_enc
;
2887 wxEncodingConverter m2w
, w2m
;
2890 // were we initialized successfully?
2893 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2896 // make the constructors available for unit testing
2897 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2899 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2900 if ( !result
->IsOk() )
2908 #endif // wxUSE_FONTMAP
2910 // ============================================================================
2911 // wxCSConv implementation
2912 // ============================================================================
2914 void wxCSConv::Init()
2921 wxCSConv::wxCSConv(const wxChar
*charset
)
2931 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2933 m_encoding
= wxFONTENCODING_SYSTEM
;
2937 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2939 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2941 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2943 encoding
= wxFONTENCODING_SYSTEM
;
2948 m_encoding
= encoding
;
2951 wxCSConv::~wxCSConv()
2956 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2961 SetName(conv
.m_name
);
2962 m_encoding
= conv
.m_encoding
;
2965 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2969 SetName(conv
.m_name
);
2970 m_encoding
= conv
.m_encoding
;
2975 void wxCSConv::Clear()
2984 void wxCSConv::SetName(const wxChar
*charset
)
2988 m_name
= wxStrdup(charset
);
2994 #include "wx/hashmap.h"
2996 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2997 wxEncodingNameCache
);
2999 static wxEncodingNameCache gs_nameCache
;
// Create the real conversion object for this wxCSConv from m_name and/or
// m_encoding and return it as a wxMBConv pointer.
//
// The candidate implementations appear in this order in the code below:
//   - wxMBConv_iconv   (HAVE_ICONV section), tried with m_name, with a name
//                      cached in gs_nameCache and with every name returned
//                      by wxFontMapperBase::GetAllEncodingNames()
//   - wxMBConv_win32   (wxHAVE_WIN32_MB2WC section)
//   - wxMBConv_mac     (__WXMAC__ section)
//   - wxMBConv_cocoa   (__WXCOCOA__ section)
//   - the built-in wxMBConvUTF7/UTF8/UTF16BE/UTF16LE/UTF32BE/UTF32LE objects
//   - wxMBConv_wxwin
// The tail of the function logs "Cannot convert from the charset" and uses a
// static flag to avoid re-entering the logging code.
3002 wxMBConv
*wxCSConv::DoCreate() const
3005 wxLogTrace(TRACE_STRCONV
,
3006 wxT("creating conversion for %s"),
3008 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
3009 #endif // wxUSE_FONTMAP
3011 // check for the special case of ASCII or ISO8859-1 charset: as we have
3012 // special knowledge of it anyhow, we don't need to create a special
3013 // conversion object
// (the values actually tested are wxFONTENCODING_ISO8859_1 and
// wxFONTENCODING_DEFAULT)
3014 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3015 m_encoding
== wxFONTENCODING_DEFAULT
)
3017 // don't convert at all
3021 // we trust OS to do conversion better than we can so try external
3022 // conversion methods first
3024 // the full order is:
3025 // 1. OS conversion (iconv() under Unix or Win32 API)
3026 // 2. hard coded conversions for UTF
3027 // 3. wxEncodingConverter as fall back
3033 #endif // !wxUSE_FONTMAP
// work on local copies of the name and encoding: the local encoding may be
// replaced below by the result of CharsetToEncoding()
3035 wxString
name(m_name
);
3036 wxFontEncoding
encoding(m_encoding
);
// first attempt: hand the explicit charset name to iconv
3038 if ( !name
.empty() )
3040 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
// map the name to an encoding; "false" is passed as the second argument
3048 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
3049 #endif // wxUSE_FONTMAP
// second attempt: consult the cache for this encoding; an empty cached name
// is the marker stored further down to remember a failure
3053 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3054 if ( it
!= gs_nameCache
.end() )
3056 if ( it
->second
.empty() )
3059 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
// third attempt: walk all names known for the encoding (the list is
// terminated by a null entry) and record the name in the cache
3066 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3068 for ( ; *names
; ++names
)
3070 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
3073 gs_nameCache
[encoding
] = *names
;
3080 gs_nameCache
[encoding
] = _T(""); // cache the failure
3082 #endif // wxUSE_FONTMAP
3084 #endif // HAVE_ICONV
3086 #ifdef wxHAVE_WIN32_MB2WC
// Win32 converter: built from the name when there is one, otherwise from
// the encoding
3089 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3090 : new wxMBConv_win32(m_encoding
);
3099 #endif // wxHAVE_WIN32_MB2WC
3100 #if defined(__WXMAC__)
3102 // leave UTF16 and UTF32 to the built-ins of wx
3103 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3104 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3108 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
3109 : new wxMBConv_mac(m_encoding
);
3111 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
3120 #if defined(__WXCOCOA__)
// Cocoa converter: used when a name is set or the encoding is not above
// wxFONTENCODING_UTF16
3122 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
3126 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
3127 : new wxMBConv_cocoa(m_encoding
);
3129 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
// built-in UTF converters: start from m_encoding and, when it is
// wxFONTENCODING_SYSTEM and a name is set, resolve the name to an encoding
3139 wxFontEncoding enc
= m_encoding
;
3141 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3143 // use "false" to suppress interactive dialogs -- we can be called from
3144 // anywhere and popping up a dialog from here is the last thing we want to
3146 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3148 #endif // wxUSE_FONTMAP
// each UTF encoding gets a freshly allocated converter of the matching class
3152 case wxFONTENCODING_UTF7
:
3153 return new wxMBConvUTF7
;
3155 case wxFONTENCODING_UTF8
:
3156 return new wxMBConvUTF8
;
3158 case wxFONTENCODING_UTF16BE
:
3159 return new wxMBConvUTF16BE
;
3161 case wxFONTENCODING_UTF16LE
:
3162 return new wxMBConvUTF16LE
;
3164 case wxFONTENCODING_UTF32BE
:
3165 return new wxMBConvUTF32BE
;
3167 case wxFONTENCODING_UTF32LE
:
3168 return new wxMBConvUTF32LE
;
3171 // nothing to do but put here to suppress gcc warnings
// last candidate: wxMBConv_wxwin, built from the name or from the encoding
3178 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3179 : new wxMBConv_wxwin(m_encoding
);
3185 #endif // wxUSE_FONTMAP
3187 // NB: This is a hack to prevent deadlock. What could otherwise happen
3188 // in Unicode build: wxConvLocal creation ends up being here
3189 // because of some failure and logs the error. But wxLog will try to
3190 // attach timestamp, for which it will need wxConvLocal (to convert
3191 // time to char* and then wchar_t*), but that fails, tries to log
3192 // error, but wxLog has a (already locked) critical section that
3193 // guards static buffer.
// the flag is set before wxLogError() is called and reset afterwards
3194 static bool alreadyLoggingError
= false;
3195 if (!alreadyLoggingError
)
3197 alreadyLoggingError
= true;
3198 wxLogError(_("Cannot convert from the charset '%s'!"),
3202 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3203 #else // !wxUSE_FONTMAP
// NOTE(review): "%s" is paired with m_encoding, which is a wxFontEncoding
// value rather than a string -- confirm the intended format specifier
3204 wxString::Format(_("encoding %s"), m_encoding
).c_str()
3205 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3207 alreadyLoggingError
= false;
// Create the real converter: stores the result of DoCreate() in m_convReal
// and sets m_deferred to false.
//
// The method is const, so all writes go through "self", a non-const alias of
// this object.
// NOTE(review): presumably the work is done only while m_deferred is set --
// confirm against the guard around this code.
3213 void wxCSConv::CreateConvIfNeeded() const
3217 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3220 // if we have neither the name nor the encoding, use the default
3221 // encoding for this system
3222 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
// m_name receives its own copy of the system encoding name
3224 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3226 #endif // wxUSE_INTL
3228 self
->m_convReal
= DoCreate();
3229 self
->m_deferred
= false;
// Convert the multibyte string psz to wide characters in buf; n is passed
// on unchanged to the real converter.
//
// The real converter is created first, then the call is forwarded to
// m_convReal->MB2WC(). The remaining code is a built-in fallback that widens
// each byte of psz through unsigned char.
// NOTE(review): the conditions selecting between the two paths, and the
// fallback's return value, should be confirmed against the full body.
3233 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3235 CreateConvIfNeeded();
3238 return m_convReal
->MB2WC(buf
, psz
, n
);
3241 size_t len
= strlen(psz
);
// "<=" makes the loop copy len + 1 elements, i.e. the terminating NUL too;
// going through unsigned char keeps bytes >= 0x80 from being sign-extended
3245 for (size_t c
= 0; c
<= len
; c
++)
3246 buf
[c
] = (unsigned char)(psz
[c
]);
// Convert the wide string psz to multibyte characters in buf; n is passed
// on unchanged to the real converter.
//
// The real converter is created first, then the call is forwarded to
// m_convReal->WC2MB(). The remaining code is a built-in fallback made of two
// loops over the len + 1 characters of psz; either loop can return
// wxCONV_FAILED, and the first one stores each character narrowed to char.
// NOTE(review): the test that triggers wxCONV_FAILED and the condition
// choosing between the two loops should be confirmed against the full body.
3252 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3254 CreateConvIfNeeded();
3257 return m_convReal
->WC2MB(buf
, psz
, n
);
3260 const size_t len
= wxWcslen(psz
);
// first loop: writes into buf, including the terminating NUL ("<=")
3263 for (size_t c
= 0; c
<= len
; c
++)
3266 return wxCONV_FAILED
;
3267 buf
[c
] = (char)psz
[c
];
// second loop: same range, but no store into buf appears in it
3272 for (size_t c
= 0; c
<= len
; c
++)
3275 return wxCONV_FAILED
;
// Return the value reported by m_convReal->GetMBNulLen(), after making sure
// the real converter has been created.
// NOTE(review): confirm what is returned when there is no real converter.
3282 size_t wxCSConv::GetMBNulLen() const
3284 CreateConvIfNeeded();
3288 return m_convReal
->GetMBNulLen();
3294 // ----------------------------------------------------------------------------
3296 // ----------------------------------------------------------------------------
// File-local converter instances. The type of wxConvLibcObj depends on the
// platform branch taken: wxMBConv_win32, wxMBConv_mac or wxMBConvLibc.
3299 static wxMBConv_win32 wxConvLibcObj
;
3300 #elif defined(__WXMAC__) && !defined(__MACH__)
3301 static wxMBConv_mac wxConvLibcObj
;
3303 static wxMBConvLibc wxConvLibcObj
;
// converters for the system encoding, ISO8859-1, UTF-7 and UTF-8
3306 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3307 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3308 static wxMBConvUTF7 wxConvUTF7Obj
;
3309 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported references bound to the static objects above
3311 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3312 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3313 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3314 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3315 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
// exported pointers; wxConvCurrent initially points at wxConvLibcObj
3316 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3317 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3325 #else // !wxUSE_WCHAR_T
3327 // stand-ins in absence of wchar_t
3328 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3333 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T