1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
24 #include "wx/strconv.h"
29 #include "wx/msw/private.h"
30 #include "wx/msw/missing.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #define wxHAVE_WIN32_MB2WC
51 #include "wx/thread.h"
54 #include "wx/encconv.h"
55 #include "wx/fontmap.h"
59 #include <ATSUnicode.h>
60 #include <TextCommon.h>
61 #include <TextEncodingConverter.h>
64 // includes Mac headers
65 #include "wx/mac/private.h"
69 #define TRACE_STRCONV _T("strconv")
71 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
73 #if SIZEOF_WCHAR_T == 2
78 // ============================================================================
80 // ============================================================================
82 // helper function used by ToWChar() and cWC2MB() below: check whether the n bytes at this location are not all NUL
83 static bool NotAllNULs(const char *p
, size_t n
)
85 while ( n
&& *p
++ == '\0' )
91 // ----------------------------------------------------------------------------
92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
93 // ----------------------------------------------------------------------------
95 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
100 *output
= (wxUint16
) input
;
104 else if (input
>= 0x110000)
106 return wxCONV_FAILED
;
112 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
113 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
120 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
122 if ((*input
< 0xd800) || (*input
> 0xdfff))
127 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
130 return wxCONV_FAILED
;
134 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
140 typedef wchar_t wxDecodeSurrogate_t
;
142 typedef wxUint16 wxDecodeSurrogate_t
;
143 #endif // WC_UTF16/!WC_UTF16
145 // returns the next UTF-32 character from the wchar_t buffer and advances the
146 // pointer to the character after this one
148 // if an invalid character is found, *pSrc is set to NULL, the caller must
150 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
154 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
155 if ( n
== wxCONV_FAILED
)
163 // ----------------------------------------------------------------------------
165 // ----------------------------------------------------------------------------
168 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
169 const char *src
, size_t srcLen
) const
171 // although new conversion classes are supposed to implement this function
172 // directly, the existing ones only implement the old MB2WC() and so, to
173 // avoid having to rewrite all conversion classes at once, we provide a
174 // default (but not efficient) implementation of this one in terms of the
175 // old function by copying the input to ensure that it's NUL-terminated and
176 // then using MB2WC() to convert it
178 // the number of chars [which would be] written to dst [if it were not NULL]
179 size_t dstWritten
= 0;
181 // the number of NULs terminating this string
182 size_t nulLen
= 0; // not really needed, but just to avoid warnings
184 // if we were not given the input size we just have to assume that the
185 // string is properly terminated as we have no way of knowing how long it
186 // is anyhow, but if we do have the size check whether there are enough
190 if ( srcLen
!= wxNO_LEN
)
192 // we need to know how to find the end of this string
193 nulLen
= GetMBNulLen();
194 if ( nulLen
== wxCONV_FAILED
)
195 return wxCONV_FAILED
;
197 // if there are enough NULs we can avoid the copy
198 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
200 // make a copy in order to properly NUL-terminate the string
201 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
202 char * const p
= bufTmp
.data();
203 memcpy(p
, src
, srcLen
);
204 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
210 srcEnd
= src
+ srcLen
;
212 else // quit after the first loop iteration
219 // try to convert the current chunk
220 size_t lenChunk
= MB2WC(NULL
, src
, 0);
221 if ( lenChunk
== wxCONV_FAILED
)
222 return wxCONV_FAILED
;
224 lenChunk
++; // for the L'\0' at the end of this chunk
226 dstWritten
+= lenChunk
;
230 // nothing left in the input string, conversion succeeded
236 if ( dstWritten
> dstLen
)
237 return wxCONV_FAILED
;
239 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
240 return wxCONV_FAILED
;
247 // we convert just one chunk in this case as this is the entire
252 // advance the input pointer past the end of this chunk
253 while ( NotAllNULs(src
, nulLen
) )
255 // notice that we must skip over multiple bytes here as we suppose
256 // that if NUL takes 2 or 4 bytes, then all the other characters do
257 // too and so if advanced by a single byte we might erroneously
258 // detect sequences of NUL bytes in the middle of the input
262 src
+= nulLen
; // skipping over its terminator as well
264 // note that ">=" (and not just "==") is needed here as the terminator
265 // we skipped just above could be inside or just after the buffer
266 // delimited by srcEnd
275 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
276 const wchar_t *src
, size_t srcLen
) const
278 // the number of chars [which would be] written to dst [if it were not NULL]
279 size_t dstWritten
= 0;
281 // make a copy of the input string unless it is already properly
284 // if we don't know its length we have no choice but to assume that it is,
285 // indeed, properly terminated
286 wxWCharBuffer bufTmp
;
287 if ( srcLen
== wxNO_LEN
)
289 srcLen
= wxWcslen(src
) + 1;
291 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
293 // make a copy in order to properly NUL-terminate the string
294 bufTmp
= wxWCharBuffer(srcLen
);
295 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
299 const size_t lenNul
= GetMBNulLen();
300 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
302 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
304 // try to convert the current chunk
305 size_t lenChunk
= WC2MB(NULL
, src
, 0);
307 if ( lenChunk
== wxCONV_FAILED
)
308 return wxCONV_FAILED
;
311 dstWritten
+= lenChunk
;
315 if ( dstWritten
> dstLen
)
316 return wxCONV_FAILED
;
318 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
319 return wxCONV_FAILED
;
328 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
330 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
331 if ( rc
!= wxCONV_FAILED
)
333 // ToWChar() returns the buffer length, i.e. including the trailing
334 // NUL, while this method doesn't take it into account
341 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
343 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
344 if ( rc
!= wxCONV_FAILED
)
352 wxMBConv::~wxMBConv()
354 // nothing to do here (necessary for Darwin linking probably)
357 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
361 // calculate the length of the buffer needed first
362 const size_t nLen
= MB2WC(NULL
, psz
, 0);
363 if ( nLen
!= wxCONV_FAILED
)
365 // now do the actual conversion
366 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
368 // +1 for the trailing NUL
369 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
374 return wxWCharBuffer();
377 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
381 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
382 if ( nLen
!= wxCONV_FAILED
)
384 // extra space for trailing NUL(s)
385 static const size_t extraLen
= GetMaxMBNulLen();
387 wxCharBuffer
buf(nLen
+ extraLen
- 1);
388 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
393 return wxCharBuffer();
397 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
399 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
400 if ( dstLen
!= wxCONV_FAILED
)
402 wxWCharBuffer
wbuf(dstLen
- 1);
403 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
408 if ( wbuf
[dstLen
- 1] == L
'\0' )
419 return wxWCharBuffer();
423 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
425 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
426 if ( dstLen
!= wxCONV_FAILED
)
428 // special case of empty input: can't allocate 0 size buffer below as
429 // wxCharBuffer insists on NUL-terminating it
430 wxCharBuffer
buf(dstLen
? dstLen
- 1 : 1);
431 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
437 const size_t nulLen
= GetMBNulLen();
438 if ( dstLen
>= nulLen
&&
439 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
441 // in this case the output is NUL-terminated and we're not
442 // supposed to count NUL
454 return wxCharBuffer();
457 // ----------------------------------------------------------------------------
459 // ----------------------------------------------------------------------------
461 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
463 return wxMB2WC(buf
, psz
, n
);
466 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
468 return wxWC2MB(buf
, psz
, n
);
471 // ----------------------------------------------------------------------------
472 // wxConvBrokenFileNames
473 // ----------------------------------------------------------------------------
477 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
479 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
480 || wxStricmp(charset
, _T("UTF8")) == 0 )
481 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
483 m_conv
= new wxCSConv(charset
);
488 // ----------------------------------------------------------------------------
490 // ----------------------------------------------------------------------------
492 // Implementation (C) 2004 Fredrik Roubert
495 // BASE64 decoding table
497 static const unsigned char utf7unb64
[] =
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
502 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
504 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
505 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
506 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
508 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
509 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
510 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
512 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
513 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
514 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
533 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
537 while ( *psz
&& (!buf
|| (len
< n
)) )
539 unsigned char cc
= *psz
++;
547 else if (*psz
== '-')
555 else // start of BASE64 encoded string
559 for ( ok
= lsb
= false, d
= 0, l
= 0;
560 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
565 for (l
+= 6; l
>= 8; lsb
= !lsb
)
567 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
577 *buf
= (wchar_t)(c
<< 8);
586 // in valid UTF7 we should have valid characters after '+'
587 return wxCONV_FAILED
;
595 if ( buf
&& (len
< n
) )
602 // BASE64 encoding table
604 static const unsigned char utf7enb64
[] =
606 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
607 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
608 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
609 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
610 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
611 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
612 'w', 'x', 'y', 'z', '0', '1', '2', '3',
613 '4', '5', '6', '7', '8', '9', '+', '/'
617 // UTF-7 encoding table
619 // 0 - Set D (directly encoded characters)
620 // 1 - Set O (optional direct characters)
621 // 2 - whitespace characters (optional)
622 // 3 - special characters
624 static const unsigned char utf7encode
[128] =
626 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
627 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
628 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
629 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
630 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
631 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
632 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
633 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
636 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
640 while (*psz
&& ((!buf
) || (len
< n
)))
643 if (cc
< 0x80 && utf7encode
[cc
] < 1)
652 else if (((wxUint32
)cc
) > 0xffff)
654 // no surrogate pair generation (yet?)
655 return wxCONV_FAILED
;
666 // BASE64 encode string
667 unsigned int lsb
, d
, l
;
668 for (d
= 0, l
= 0; /*nothing*/; psz
++)
670 for (lsb
= 0; lsb
< 2; lsb
++)
673 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
675 for (l
+= 8; l
>= 6; )
679 *buf
++ = utf7enb64
[(d
>> l
) % 64];
685 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
692 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
704 if (buf
&& (len
< n
))
710 // ----------------------------------------------------------------------------
712 // ----------------------------------------------------------------------------
714 static wxUint32 utf8_max
[]=
715 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
717 // boundaries of the private use area we use to (temporarily) remap
718 // characters invalid in a UTF-8 encoded string
719 const wxUint32 wxUnicodePUA
= 0x100000;
720 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
722 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
726 while (*psz
&& ((!buf
) || (len
< n
)))
728 const char *opsz
= psz
;
729 bool invalid
= false;
730 unsigned char cc
= *psz
++, fc
= cc
;
732 for (cnt
= 0; fc
& 0x80; cnt
++)
742 // escape the escape character for octal escapes
743 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
744 && cc
== '\\' && (!buf
|| len
< n
))
756 // invalid UTF-8 sequence
761 unsigned ocnt
= cnt
- 1;
762 wxUint32 res
= cc
& (0x3f >> cnt
);
766 if ((cc
& 0xC0) != 0x80)
768 // invalid UTF-8 sequence
774 res
= (res
<< 6) | (cc
& 0x3f);
777 if (invalid
|| res
<= utf8_max
[ocnt
])
779 // illegal UTF-8 encoding
782 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
783 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
785 // if one of our PUA characters turns up externally
786 // it must also be treated as an illegal sequence
787 // (a bit like you have to escape an escape character)
793 // cast is ok because wchar_t == wxUint16 if WC_UTF16
794 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
795 if (pa
== wxCONV_FAILED
)
807 *buf
++ = (wchar_t)res
;
809 #endif // WC_UTF16/!WC_UTF16
815 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
817 while (opsz
< psz
&& (!buf
|| len
< n
))
820 // cast is ok because wchar_t == wxUint16 if WC_UTF16
821 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
822 wxASSERT(pa
!= wxCONV_FAILED
);
829 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
835 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
837 while (opsz
< psz
&& (!buf
|| len
< n
))
839 if ( buf
&& len
+ 3 < n
)
841 unsigned char on
= *opsz
;
843 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
844 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
845 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
852 else // MAP_INVALID_UTF8_NOT
854 return wxCONV_FAILED
;
860 if (buf
&& (len
< n
))
866 static inline bool isoctal(wchar_t wch
)
868 return L
'0' <= wch
&& wch
<= L
'7';
871 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
875 while (*psz
&& ((!buf
) || (len
< n
)))
880 // cast is ok for WC_UTF16
881 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
882 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
884 cc
= (*psz
++) & 0x7fffffff;
887 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
888 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
891 *buf
++ = (char)(cc
- wxUnicodePUA
);
894 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
895 && cc
== L
'\\' && psz
[0] == L
'\\' )
902 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
904 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
908 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
909 (psz
[1] - L
'0') * 010 +
919 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
935 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
937 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
943 if (buf
&& (len
< n
))
949 // ============================================================================
951 // ============================================================================
953 #ifdef WORDS_BIGENDIAN
954 #define wxMBConvUTF16straight wxMBConvUTF16BE
955 #define wxMBConvUTF16swap wxMBConvUTF16LE
957 #define wxMBConvUTF16swap wxMBConvUTF16BE
958 #define wxMBConvUTF16straight wxMBConvUTF16LE
962 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
964 if ( srcLen
== wxNO_LEN
)
966 // count the number of bytes in input, including the trailing NULs
967 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
968 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
971 srcLen
*= BYTES_PER_CHAR
;
973 else // we already have the length
975 // we can only convert an entire number of UTF-16 characters
976 if ( srcLen
% BYTES_PER_CHAR
)
977 return wxCONV_FAILED
;
983 // case when in-memory representation is UTF-16 too
986 // ----------------------------------------------------------------------------
987 // conversions without endianness change
988 // ----------------------------------------------------------------------------
991 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
992 const char *src
, size_t srcLen
) const
994 // set up the scene for using memcpy() (which is presumably more efficient
995 // than copying the bytes one by one)
996 srcLen
= GetLength(src
, srcLen
);
997 if ( srcLen
== wxNO_LEN
)
998 return wxCONV_FAILED
;
1000 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1003 if ( dstLen
< inLen
)
1004 return wxCONV_FAILED
;
1006 memcpy(dst
, src
, srcLen
);
1013 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1014 const wchar_t *src
, size_t srcLen
) const
1016 if ( srcLen
== wxNO_LEN
)
1017 srcLen
= wxWcslen(src
) + 1;
1019 srcLen
*= BYTES_PER_CHAR
;
1023 if ( dstLen
< srcLen
)
1024 return wxCONV_FAILED
;
1026 memcpy(dst
, src
, srcLen
);
1032 // ----------------------------------------------------------------------------
1033 // endian-reversing conversions
1034 // ----------------------------------------------------------------------------
1037 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1038 const char *src
, size_t srcLen
) const
1040 srcLen
= GetLength(src
, srcLen
);
1041 if ( srcLen
== wxNO_LEN
)
1042 return wxCONV_FAILED
;
1044 srcLen
/= BYTES_PER_CHAR
;
1048 if ( dstLen
< srcLen
)
1049 return wxCONV_FAILED
;
1051 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1052 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1054 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1062 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1063 const wchar_t *src
, size_t srcLen
) const
1065 if ( srcLen
== wxNO_LEN
)
1066 srcLen
= wxWcslen(src
) + 1;
1068 srcLen
*= BYTES_PER_CHAR
;
1072 if ( dstLen
< srcLen
)
1073 return wxCONV_FAILED
;
1075 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1076 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1078 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1085 #else // !WC_UTF16: wchar_t is UTF-32
1087 // ----------------------------------------------------------------------------
1088 // conversions without endianness change
1089 // ----------------------------------------------------------------------------
1092 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1093 const char *src
, size_t srcLen
) const
1095 srcLen
= GetLength(src
, srcLen
);
1096 if ( srcLen
== wxNO_LEN
)
1097 return wxCONV_FAILED
;
1099 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1102 // optimization: return maximal space which could be needed for this
1103 // string even if the real size could be smaller if the buffer contains
1109 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1110 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1112 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1114 return wxCONV_FAILED
;
1116 if ( ++outLen
> dstLen
)
1117 return wxCONV_FAILED
;
1127 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1128 const wchar_t *src
, size_t srcLen
) const
1130 if ( srcLen
== wxNO_LEN
)
1131 srcLen
= wxWcslen(src
) + 1;
1134 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1135 for ( size_t n
= 0; n
< srcLen
; n
++ )
1138 const size_t numChars
= encode_utf16(*src
++, cc
);
1139 if ( numChars
== wxCONV_FAILED
)
1140 return wxCONV_FAILED
;
1142 outLen
+= numChars
* BYTES_PER_CHAR
;
1145 if ( outLen
> dstLen
)
1146 return wxCONV_FAILED
;
1149 if ( numChars
== 2 )
1151 // second character of a surrogate
1160 // ----------------------------------------------------------------------------
1161 // endian-reversing conversions
1162 // ----------------------------------------------------------------------------
1165 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1166 const char *src
, size_t srcLen
) const
1168 srcLen
= GetLength(src
, srcLen
);
1169 if ( srcLen
== wxNO_LEN
)
1170 return wxCONV_FAILED
;
1172 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1175 // optimization: return maximal space which could be needed for this
1176 // string even if the real size could be smaller if the buffer contains
1182 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1183 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1188 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1190 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1192 const size_t numChars
= decode_utf16(tmp
, ch
);
1193 if ( numChars
== wxCONV_FAILED
)
1194 return wxCONV_FAILED
;
1196 if ( numChars
== 2 )
1199 if ( ++outLen
> dstLen
)
1200 return wxCONV_FAILED
;
1210 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1211 const wchar_t *src
, size_t srcLen
) const
1213 if ( srcLen
== wxNO_LEN
)
1214 srcLen
= wxWcslen(src
) + 1;
1217 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1218 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1221 const size_t numChars
= encode_utf16(*src
, cc
);
1222 if ( numChars
== wxCONV_FAILED
)
1223 return wxCONV_FAILED
;
1225 outLen
+= numChars
* BYTES_PER_CHAR
;
1228 if ( outLen
> dstLen
)
1229 return wxCONV_FAILED
;
1231 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1232 if ( numChars
== 2 )
1234 // second character of a surrogate
1235 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1243 #endif // WC_UTF16/!WC_UTF16
1246 // ============================================================================
1248 // ============================================================================
1250 #ifdef WORDS_BIGENDIAN
1251 #define wxMBConvUTF32straight wxMBConvUTF32BE
1252 #define wxMBConvUTF32swap wxMBConvUTF32LE
1254 #define wxMBConvUTF32swap wxMBConvUTF32BE
1255 #define wxMBConvUTF32straight wxMBConvUTF32LE
1259 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1260 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1263 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1265 if ( srcLen
== wxNO_LEN
)
1267 // count the number of bytes in input, including the trailing NULs
1268 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1269 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1272 srcLen
*= BYTES_PER_CHAR
;
1274 else // we already have the length
1276 // we can only convert an entire number of UTF-32 characters
1277 if ( srcLen
% BYTES_PER_CHAR
)
1278 return wxCONV_FAILED
;
1284 // case when in-memory representation is UTF-16
1287 // ----------------------------------------------------------------------------
1288 // conversions without endianness change
1289 // ----------------------------------------------------------------------------
1292 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1293 const char *src
, size_t srcLen
) const
1295 srcLen
= GetLength(src
, srcLen
);
1296 if ( srcLen
== wxNO_LEN
)
1297 return wxCONV_FAILED
;
1299 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1300 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1302 for ( size_t n
= 0; n
< inLen
; n
++ )
1305 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1306 if ( numChars
== wxCONV_FAILED
)
1307 return wxCONV_FAILED
;
1312 if ( outLen
> dstLen
)
1313 return wxCONV_FAILED
;
1316 if ( numChars
== 2 )
1318 // second character of a surrogate
1328 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1329 const wchar_t *src
, size_t srcLen
) const
1331 if ( srcLen
== wxNO_LEN
)
1332 srcLen
= wxWcslen(src
) + 1;
1336 // optimization: return maximal space which could be needed for this
1337 // string instead of the exact amount which could be less if there are
1338 // any surrogates in the input
1340 // we consider that surrogates are rare enough to make it worthwhile to
1341 // avoid running the loop below at the cost of slightly extra memory
1343 return srcLen
* BYTES_PER_CHAR
;
1346 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1348 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1350 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1352 return wxCONV_FAILED
;
1354 outLen
+= BYTES_PER_CHAR
;
1356 if ( outLen
> dstLen
)
1357 return wxCONV_FAILED
;
1365 // ----------------------------------------------------------------------------
1366 // endian-reversing conversions
1367 // ----------------------------------------------------------------------------
1370 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1371 const char *src
, size_t srcLen
) const
1373 srcLen
= GetLength(src
, srcLen
);
1374 if ( srcLen
== wxNO_LEN
)
1375 return wxCONV_FAILED
;
1377 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1378 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1380 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1383 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1384 if ( numChars
== wxCONV_FAILED
)
1385 return wxCONV_FAILED
;
1390 if ( outLen
> dstLen
)
1391 return wxCONV_FAILED
;
1394 if ( numChars
== 2 )
1396 // second character of a surrogate
1406 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1407 const wchar_t *src
, size_t srcLen
) const
1409 if ( srcLen
== wxNO_LEN
)
1410 srcLen
= wxWcslen(src
) + 1;
1414 // optimization: return maximal space which could be needed for this
1415 // string instead of the exact amount which could be less if there are
1416 // any surrogates in the input
1418 // we consider that surrogates are rare enough to make it worthwhile to
1419 // avoid running the loop below at the cost of slightly extra memory
1421 return srcLen
*BYTES_PER_CHAR
;
1424 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1426 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1428 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1430 return wxCONV_FAILED
;
1432 outLen
+= BYTES_PER_CHAR
;
1434 if ( outLen
> dstLen
)
1435 return wxCONV_FAILED
;
1437 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1443 #else // !WC_UTF16: wchar_t is UTF-32
1445 // ----------------------------------------------------------------------------
1446 // conversions without endianness change
1447 // ----------------------------------------------------------------------------
1450 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1451 const char *src
, size_t srcLen
) const
1453 // use memcpy() as it should be much faster than hand-written loop
1454 srcLen
= GetLength(src
, srcLen
);
1455 if ( srcLen
== wxNO_LEN
)
1456 return wxCONV_FAILED
;
1458 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1461 if ( dstLen
< inLen
)
1462 return wxCONV_FAILED
;
1464 memcpy(dst
, src
, srcLen
);
1471 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1472 const wchar_t *src
, size_t srcLen
) const
1474 if ( srcLen
== wxNO_LEN
)
1475 srcLen
= wxWcslen(src
) + 1;
1477 srcLen
*= BYTES_PER_CHAR
;
1481 if ( dstLen
< srcLen
)
1482 return wxCONV_FAILED
;
1484 memcpy(dst
, src
, srcLen
);
1490 // ----------------------------------------------------------------------------
1491 // endian-reversing conversions
1492 // ----------------------------------------------------------------------------
1495 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1496 const char *src
, size_t srcLen
) const
1498 srcLen
= GetLength(src
, srcLen
);
1499 if ( srcLen
== wxNO_LEN
)
1500 return wxCONV_FAILED
;
1502 srcLen
/= BYTES_PER_CHAR
;
1506 if ( dstLen
< srcLen
)
1507 return wxCONV_FAILED
;
1509 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1510 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1512 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1520 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1521 const wchar_t *src
, size_t srcLen
) const
1523 if ( srcLen
== wxNO_LEN
)
1524 srcLen
= wxWcslen(src
) + 1;
1526 srcLen
*= BYTES_PER_CHAR
;
1530 if ( dstLen
< srcLen
)
1531 return wxCONV_FAILED
;
1533 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1534 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1536 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1543 #endif // WC_UTF16/!WC_UTF16
1546 // ============================================================================
1547 // The classes doing conversion using the iconv_xxx() functions
1548 // ============================================================================
1552 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1553 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1554 // (unless there's yet another bug in glibc) the only case when iconv()
1555 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1556 // left in the input buffer -- when _real_ error occurs,
1557 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1559 // [This bug does not appear in glibc 2.2.]
1560 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1561 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1562 (errno != E2BIG || bufLeft != 0))
1564 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1567 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1569 #define ICONV_T_INVALID ((iconv_t)-1)
1571 #if SIZEOF_WCHAR_T == 4
1572 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1573 #define WC_ENC wxFONTENCODING_UTF32
1574 #elif SIZEOF_WCHAR_T == 2
1575 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1576 #define WC_ENC wxFONTENCODING_UTF16
1577 #else // sizeof(wchar_t) != 2 nor 4
1578 // does this ever happen?
1579 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1582 // ----------------------------------------------------------------------------
1583 // wxMBConv_iconv: encapsulates an iconv character set
1584 // ----------------------------------------------------------------------------
1586 class wxMBConv_iconv
: public wxMBConv
1589 wxMBConv_iconv(const wxChar
*name
);
1590 virtual ~wxMBConv_iconv();
1592 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1593 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1595 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1596 virtual size_t GetMBNulLen() const;
1598 virtual wxMBConv
*Clone() const
1600 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
1601 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1606 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1609 // the iconv handlers used to translate from multibyte
1610 // to wide char and in the other direction
1615 // guards access to m2w and w2m objects
1616 wxMutex m_iconvMutex
;
1620 // the name (for iconv_open()) of a wide char charset -- if none is
1621 // available on this machine, it will remain NULL
1622 static wxString ms_wcCharsetName
;
1624 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1625 // different endian-ness than the native one
1626 static bool ms_wcNeedsSwap
;
1629 // name of the encoding handled by this conversion
1632 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1634 size_t m_minMBCharWidth
;
1637 // make the constructor available for unit testing
1638 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1640 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1641 if ( !result
->IsOk() )
1650 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1651 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1653 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1656 m_minMBCharWidth
= 0;
1658 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1659 // names for the charsets
1660 const wxCharBuffer
cname(wxString(name
).ToAscii());
1662 // check for charset that represents wchar_t:
1663 if ( ms_wcCharsetName
.empty() )
1665 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1668 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1669 #else // !wxUSE_FONTMAP
1670 static const wxChar
*names
[] =
1672 #if SIZEOF_WCHAR_T == 4
1674 #elif SIZEOF_WCHAR_T = 2
1679 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1681 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1683 const wxString
nameCS(*names
);
1685 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1686 wxString
nameXE(nameCS
);
1688 #ifdef WORDS_BIGENDIAN
1690 #else // little endian
1694 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1697 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1698 if ( m2w
== ICONV_T_INVALID
)
1700 // try charset w/o bytesex info (e.g. "UCS4")
1701 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1703 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1705 // and check for bytesex ourselves:
1706 if ( m2w
!= ICONV_T_INVALID
)
1708 char buf
[2], *bufPtr
;
1709 wchar_t wbuf
[2], *wbufPtr
;
1717 outsz
= SIZEOF_WCHAR_T
* 2;
1722 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1723 (char**)&wbufPtr
, &outsz
);
1725 if (ICONV_FAILED(res
, insz
))
1727 wxLogLastError(wxT("iconv"));
1728 wxLogError(_("Conversion to charset '%s' doesn't work."),
1731 else // ok, can convert to this encoding, remember it
1733 ms_wcCharsetName
= nameCS
;
1734 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1738 else // use charset not requiring byte swapping
1740 ms_wcCharsetName
= nameXE
;
1744 wxLogTrace(TRACE_STRCONV
,
1745 wxT("iconv wchar_t charset is \"%s\"%s"),
1746 ms_wcCharsetName
.empty() ? _T("<none>")
1747 : ms_wcCharsetName
.c_str(),
1748 ms_wcNeedsSwap
? _T(" (needs swap)")
1751 else // we already have ms_wcCharsetName
1753 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1756 if ( ms_wcCharsetName
.empty() )
1758 w2m
= ICONV_T_INVALID
;
1762 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1763 if ( w2m
== ICONV_T_INVALID
)
1765 wxLogTrace(TRACE_STRCONV
,
1766 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1767 ms_wcCharsetName
.c_str(), cname
.data());
1772 wxMBConv_iconv::~wxMBConv_iconv()
1774 if ( m2w
!= ICONV_T_INVALID
)
1776 if ( w2m
!= ICONV_T_INVALID
)
1780 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1782 // find the string length: notice that must be done differently for
1783 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1785 const size_t nulLen
= GetMBNulLen();
1789 return wxCONV_FAILED
;
1792 inbuf
= strlen(psz
); // arguably more optimized than our version
1797 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1798 // they also have to start at character boundary and not span two
1799 // adjacent characters
1801 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1808 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1809 // Unfortunately there are a couple of global wxCSConv objects such as
1810 // wxConvLocal that are used all over wx code, so we have to make sure
1811 // the handle is used by at most one thread at a time. Otherwise
1812 // only a few wx classes would be safe to use from non-main threads
1813 // as MB<->WC conversion would fail "randomly".
1814 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1815 #endif // wxUSE_THREADS
1817 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1819 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1820 wchar_t *bufPtr
= buf
;
1821 const char *pszPtr
= psz
;
1825 // have destination buffer, convert there
1827 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1828 (char**)&bufPtr
, &outbuf
);
1829 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1833 // convert to native endianness
1834 for ( unsigned i
= 0; i
< res
; i
++ )
1835 buf
[n
] = WC_BSWAP(buf
[i
]);
1838 // NUL-terminate the string if there is any space left
1844 // no destination buffer... convert using temp buffer
1845 // to calculate destination buffer requirement
1852 outbuf
= 8 * SIZEOF_WCHAR_T
;
1855 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1856 (char**)&bufPtr
, &outbuf
);
1858 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
1860 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1863 if (ICONV_FAILED(cres
, inbuf
))
1865 //VS: it is ok if iconv fails, hence trace only
1866 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1867 return wxCONV_FAILED
;
1873 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1876 // NB: explained in MB2WC
1877 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1880 size_t inlen
= wxWcslen(psz
);
1881 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1885 wchar_t *tmpbuf
= 0;
1889 // need to copy to temp buffer to switch endianness
1890 // (doing WC_BSWAP twice on the original buffer won't help, as it
1891 // could be in read-only memory, or be accessed in some other thread)
1892 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1893 for ( size_t i
= 0; i
< inlen
; i
++ )
1894 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1896 tmpbuf
[inlen
] = L
'\0';
1902 // have destination buffer, convert there
1903 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1907 // NB: iconv was given only wcslen(psz) characters on input, and so
1908 // it couldn't convert the trailing zero. Let's do it ourselves
1909 // if there's some room left for it in the output buffer.
1915 // no destination buffer: convert using temp buffer
1916 // to calculate destination buffer requirement
1924 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1928 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1936 if (ICONV_FAILED(cres
, inbuf
))
1938 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1939 return wxCONV_FAILED
;
1945 size_t wxMBConv_iconv::GetMBNulLen() const
1947 if ( m_minMBCharWidth
== 0 )
1949 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1952 // NB: explained in MB2WC
1953 wxMutexLocker
lock(self
->m_iconvMutex
);
1956 wchar_t *wnul
= L
"";
1957 char buf
[8]; // should be enough for NUL in any encoding
1958 size_t inLen
= sizeof(wchar_t),
1959 outLen
= WXSIZEOF(buf
);
1960 char *inBuff
= (char *)wnul
;
1961 char *outBuff
= buf
;
1962 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
1964 self
->m_minMBCharWidth
= (size_t)-1;
1968 self
->m_minMBCharWidth
= outBuff
- buf
;
1972 return m_minMBCharWidth
;
1975 #endif // HAVE_ICONV
1978 // ============================================================================
1979 // Win32 conversion classes
1980 // ============================================================================
1982 #ifdef wxHAVE_WIN32_MB2WC
1986 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1987 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1990 class wxMBConv_win32
: public wxMBConv
1995 m_CodePage
= CP_ACP
;
1996 m_minMBCharWidth
= 0;
1999 wxMBConv_win32(const wxMBConv_win32
& conv
)
2002 m_CodePage
= conv
.m_CodePage
;
2003 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2007 wxMBConv_win32(const wxChar
* name
)
2009 m_CodePage
= wxCharsetToCodepage(name
);
2010 m_minMBCharWidth
= 0;
2013 wxMBConv_win32(wxFontEncoding encoding
)
2015 m_CodePage
= wxEncodingToCodepage(encoding
);
2016 m_minMBCharWidth
= 0;
2018 #endif // wxUSE_FONTMAP
2020 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2022 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2023 // the behaviour is not compatible with the Unix version (using iconv)
2024 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2025 // wouldn't work if reading an incomplete MB char didn't result in an error.
2028 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2029 // Win XP or newer and it is not supported for UTF-[78] so we always
2030 // use our own conversions in this case. See
2031 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2032 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2033 if ( m_CodePage
== CP_UTF8
)
2035 return wxConvUTF8
.MB2WC(buf
, psz
, n
);
2038 if ( m_CodePage
== CP_UTF7
)
2040 return wxConvUTF7
.MB2WC(buf
, psz
, n
);
2044 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2045 IsAtLeastWin2kSP4() )
2047 flags
= MB_ERR_INVALID_CHARS
;
2050 const size_t len
= ::MultiByteToWideChar
2052 m_CodePage
, // code page
2053 flags
, // flags: fall on error
2054 psz
, // input string
2055 -1, // its length (NUL-terminated)
2056 buf
, // output string
2057 buf
? n
: 0 // size of output buffer
2061 // function totally failed
2062 return wxCONV_FAILED
;
2065 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2066 // check if we succeeded, by doing a double trip:
2067 if ( !flags
&& buf
)
2069 const size_t mbLen
= strlen(psz
);
2070 wxCharBuffer
mbBuf(mbLen
);
2071 if ( ::WideCharToMultiByte
2078 mbLen
+ 1, // size in bytes, not length
2082 strcmp(mbBuf
, psz
) != 0 )
2084 // we didn't obtain the same thing we started from, hence
2085 // the conversion was lossy and we consider that it failed
2086 return wxCONV_FAILED
;
2090 // note that it returns count of written chars for buf != NULL and size
2091 // of the needed buffer for buf == NULL so in either case the length of
2092 // the string (which never includes the terminating NUL) is one less
2096 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2099 we have a problem here: by default, WideCharToMultiByte() may
2100 replace characters unrepresentable in the target code page with bad
2101 quality approximations such as turning "1/2" symbol (U+00BD) into
2102 "1" for the code pages which don't have it and we, obviously, want
2103 to avoid this at any price
2105 the trouble is that this function does it _silently_, i.e. it won't
2106 even tell us whether it did or not... Win98/2000 and higher provide
2107 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2108 we have to resort to a round trip, i.e. check that converting back
2109 results in the same string -- this is, of course, expensive but
2110 otherwise we simply can't be sure to not garble the data.
2113 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2114 // it doesn't work with CJK encodings (which we test for rather roughly
2115 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2117 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2120 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2122 // it's our lucky day
2123 flags
= WC_NO_BEST_FIT_CHARS
;
2124 pUsedDef
= &usedDef
;
2126 else // old system or unsupported encoding
2132 const size_t len
= ::WideCharToMultiByte
2134 m_CodePage
, // code page
2135 flags
, // either none or no best fit
2136 pwz
, // input string
2137 -1, // it is (wide) NUL-terminated
2138 buf
, // output buffer
2139 buf
? n
: 0, // and its size
2140 NULL
, // default "replacement" char
2141 pUsedDef
// [out] was it used?
2146 // function totally failed
2147 return wxCONV_FAILED
;
2150 // if we were really converting, check if we succeeded
2155 // check if the conversion failed, i.e. if any replacements
2158 return wxCONV_FAILED
;
2160 else // we must resort to double tripping...
2162 wxWCharBuffer
wcBuf(n
);
2163 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2164 wcscmp(wcBuf
, pwz
) != 0 )
2166 // we didn't obtain the same thing we started from, hence
2167 // the conversion was lossy and we consider that it failed
2168 return wxCONV_FAILED
;
2173 // see the comment above for the reason of "len - 1"
2177 virtual size_t GetMBNulLen() const
2179 if ( m_minMBCharWidth
== 0 )
2181 int len
= ::WideCharToMultiByte
2183 m_CodePage
, // code page
2185 L
"", // input string
2186 1, // translate just the NUL
2187 NULL
, // output buffer
2189 NULL
, // no replacement char
2190 NULL
// [out] don't care if it was used
2193 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2197 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2198 self
->m_minMBCharWidth
= (size_t)-1;
2202 self
->m_minMBCharWidth
= (size_t)-1;
2208 self
->m_minMBCharWidth
= len
;
2213 return m_minMBCharWidth
;
2216 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2218 bool IsOk() const { return m_CodePage
!= -1; }
2221 static bool CanUseNoBestFit()
2223 static int s_isWin98Or2k
= -1;
2225 if ( s_isWin98Or2k
== -1 )
2228 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2231 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2235 s_isWin98Or2k
= verMaj
>= 5;
2239 // unknown: be conservative by default
2244 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2247 return s_isWin98Or2k
== 1;
2250 static bool IsAtLeastWin2kSP4()
2255 static int s_isAtLeastWin2kSP4
= -1;
2257 if ( s_isAtLeastWin2kSP4
== -1 )
2259 OSVERSIONINFOEX ver
;
2261 memset(&ver
, 0, sizeof(ver
));
2262 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2263 GetVersionEx((OSVERSIONINFO
*)&ver
);
2265 s_isAtLeastWin2kSP4
=
2266 ((ver
.dwMajorVersion
> 5) || // Vista+
2267 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2268 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2269 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2273 return s_isAtLeastWin2kSP4
== 1;
2278 // the code page we're working with
2281 // cached result of GetMBNulLen(), set to 0 initially meaning
2283 size_t m_minMBCharWidth
;
2286 #endif // wxHAVE_WIN32_MB2WC
2288 // ============================================================================
2289 // Cocoa conversion classes
2290 // ============================================================================
2292 #if defined(__WXCOCOA__)
2294 // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2295 // Strangely enough, Core Foundation uses
2296 // UTF-32 internally quite a bit - it's just not public (yet).
2298 #include <CoreFoundation/CFString.h>
2299 #include <CoreFoundation/CFStringEncodingExt.h>
2301 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2303 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2307 case wxFONTENCODING_DEFAULT
:
2308 enc
= CFStringGetSystemEncoding();
2311 case wxFONTENCODING_ISO8859_1
:
2312 enc
= kCFStringEncodingISOLatin1
;
2314 case wxFONTENCODING_ISO8859_2
:
2315 enc
= kCFStringEncodingISOLatin2
;
2317 case wxFONTENCODING_ISO8859_3
:
2318 enc
= kCFStringEncodingISOLatin3
;
2320 case wxFONTENCODING_ISO8859_4
:
2321 enc
= kCFStringEncodingISOLatin4
;
2323 case wxFONTENCODING_ISO8859_5
:
2324 enc
= kCFStringEncodingISOLatinCyrillic
;
2326 case wxFONTENCODING_ISO8859_6
:
2327 enc
= kCFStringEncodingISOLatinArabic
;
2329 case wxFONTENCODING_ISO8859_7
:
2330 enc
= kCFStringEncodingISOLatinGreek
;
2332 case wxFONTENCODING_ISO8859_8
:
2333 enc
= kCFStringEncodingISOLatinHebrew
;
2335 case wxFONTENCODING_ISO8859_9
:
2336 enc
= kCFStringEncodingISOLatin5
;
2338 case wxFONTENCODING_ISO8859_10
:
2339 enc
= kCFStringEncodingISOLatin6
;
2341 case wxFONTENCODING_ISO8859_11
:
2342 enc
= kCFStringEncodingISOLatinThai
;
2344 case wxFONTENCODING_ISO8859_13
:
2345 enc
= kCFStringEncodingISOLatin7
;
2347 case wxFONTENCODING_ISO8859_14
:
2348 enc
= kCFStringEncodingISOLatin8
;
2350 case wxFONTENCODING_ISO8859_15
:
2351 enc
= kCFStringEncodingISOLatin9
;
2354 case wxFONTENCODING_KOI8
:
2355 enc
= kCFStringEncodingKOI8_R
;
2357 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2358 enc
= kCFStringEncodingDOSRussian
;
2361 // case wxFONTENCODING_BULGARIAN :
2365 case wxFONTENCODING_CP437
:
2366 enc
= kCFStringEncodingDOSLatinUS
;
2368 case wxFONTENCODING_CP850
:
2369 enc
= kCFStringEncodingDOSLatin1
;
2371 case wxFONTENCODING_CP852
:
2372 enc
= kCFStringEncodingDOSLatin2
;
2374 case wxFONTENCODING_CP855
:
2375 enc
= kCFStringEncodingDOSCyrillic
;
2377 case wxFONTENCODING_CP866
:
2378 enc
= kCFStringEncodingDOSRussian
;
2380 case wxFONTENCODING_CP874
:
2381 enc
= kCFStringEncodingDOSThai
;
2383 case wxFONTENCODING_CP932
:
2384 enc
= kCFStringEncodingDOSJapanese
;
2386 case wxFONTENCODING_CP936
:
2387 enc
= kCFStringEncodingDOSChineseSimplif
;
2389 case wxFONTENCODING_CP949
:
2390 enc
= kCFStringEncodingDOSKorean
;
2392 case wxFONTENCODING_CP950
:
2393 enc
= kCFStringEncodingDOSChineseTrad
;
2395 case wxFONTENCODING_CP1250
:
2396 enc
= kCFStringEncodingWindowsLatin2
;
2398 case wxFONTENCODING_CP1251
:
2399 enc
= kCFStringEncodingWindowsCyrillic
;
2401 case wxFONTENCODING_CP1252
:
2402 enc
= kCFStringEncodingWindowsLatin1
;
2404 case wxFONTENCODING_CP1253
:
2405 enc
= kCFStringEncodingWindowsGreek
;
2407 case wxFONTENCODING_CP1254
:
2408 enc
= kCFStringEncodingWindowsLatin5
;
2410 case wxFONTENCODING_CP1255
:
2411 enc
= kCFStringEncodingWindowsHebrew
;
2413 case wxFONTENCODING_CP1256
:
2414 enc
= kCFStringEncodingWindowsArabic
;
2416 case wxFONTENCODING_CP1257
:
2417 enc
= kCFStringEncodingWindowsBalticRim
;
2419 // This only really encodes to UTF7 (if that) evidently
2420 // case wxFONTENCODING_UTF7 :
2421 // enc = kCFStringEncodingNonLossyASCII ;
2423 case wxFONTENCODING_UTF8
:
2424 enc
= kCFStringEncodingUTF8
;
2426 case wxFONTENCODING_EUC_JP
:
2427 enc
= kCFStringEncodingEUC_JP
;
2429 case wxFONTENCODING_UTF16
:
2430 enc
= kCFStringEncodingUnicode
;
2432 case wxFONTENCODING_MACROMAN
:
2433 enc
= kCFStringEncodingMacRoman
;
2435 case wxFONTENCODING_MACJAPANESE
:
2436 enc
= kCFStringEncodingMacJapanese
;
2438 case wxFONTENCODING_MACCHINESETRAD
:
2439 enc
= kCFStringEncodingMacChineseTrad
;
2441 case wxFONTENCODING_MACKOREAN
:
2442 enc
= kCFStringEncodingMacKorean
;
2444 case wxFONTENCODING_MACARABIC
:
2445 enc
= kCFStringEncodingMacArabic
;
2447 case wxFONTENCODING_MACHEBREW
:
2448 enc
= kCFStringEncodingMacHebrew
;
2450 case wxFONTENCODING_MACGREEK
:
2451 enc
= kCFStringEncodingMacGreek
;
2453 case wxFONTENCODING_MACCYRILLIC
:
2454 enc
= kCFStringEncodingMacCyrillic
;
2456 case wxFONTENCODING_MACDEVANAGARI
:
2457 enc
= kCFStringEncodingMacDevanagari
;
2459 case wxFONTENCODING_MACGURMUKHI
:
2460 enc
= kCFStringEncodingMacGurmukhi
;
2462 case wxFONTENCODING_MACGUJARATI
:
2463 enc
= kCFStringEncodingMacGujarati
;
2465 case wxFONTENCODING_MACORIYA
:
2466 enc
= kCFStringEncodingMacOriya
;
2468 case wxFONTENCODING_MACBENGALI
:
2469 enc
= kCFStringEncodingMacBengali
;
2471 case wxFONTENCODING_MACTAMIL
:
2472 enc
= kCFStringEncodingMacTamil
;
2474 case wxFONTENCODING_MACTELUGU
:
2475 enc
= kCFStringEncodingMacTelugu
;
2477 case wxFONTENCODING_MACKANNADA
:
2478 enc
= kCFStringEncodingMacKannada
;
2480 case wxFONTENCODING_MACMALAJALAM
:
2481 enc
= kCFStringEncodingMacMalayalam
;
2483 case wxFONTENCODING_MACSINHALESE
:
2484 enc
= kCFStringEncodingMacSinhalese
;
2486 case wxFONTENCODING_MACBURMESE
:
2487 enc
= kCFStringEncodingMacBurmese
;
2489 case wxFONTENCODING_MACKHMER
:
2490 enc
= kCFStringEncodingMacKhmer
;
2492 case wxFONTENCODING_MACTHAI
:
2493 enc
= kCFStringEncodingMacThai
;
2495 case wxFONTENCODING_MACLAOTIAN
:
2496 enc
= kCFStringEncodingMacLaotian
;
2498 case wxFONTENCODING_MACGEORGIAN
:
2499 enc
= kCFStringEncodingMacGeorgian
;
2501 case wxFONTENCODING_MACARMENIAN
:
2502 enc
= kCFStringEncodingMacArmenian
;
2504 case wxFONTENCODING_MACCHINESESIMP
:
2505 enc
= kCFStringEncodingMacChineseSimp
;
2507 case wxFONTENCODING_MACTIBETAN
:
2508 enc
= kCFStringEncodingMacTibetan
;
2510 case wxFONTENCODING_MACMONGOLIAN
:
2511 enc
= kCFStringEncodingMacMongolian
;
2513 case wxFONTENCODING_MACETHIOPIC
:
2514 enc
= kCFStringEncodingMacEthiopic
;
2516 case wxFONTENCODING_MACCENTRALEUR
:
2517 enc
= kCFStringEncodingMacCentralEurRoman
;
2519 case wxFONTENCODING_MACVIATNAMESE
:
2520 enc
= kCFStringEncodingMacVietnamese
;
2522 case wxFONTENCODING_MACARABICEXT
:
2523 enc
= kCFStringEncodingMacExtArabic
;
2525 case wxFONTENCODING_MACSYMBOL
:
2526 enc
= kCFStringEncodingMacSymbol
;
2528 case wxFONTENCODING_MACDINGBATS
:
2529 enc
= kCFStringEncodingMacDingbats
;
2531 case wxFONTENCODING_MACTURKISH
:
2532 enc
= kCFStringEncodingMacTurkish
;
2534 case wxFONTENCODING_MACCROATIAN
:
2535 enc
= kCFStringEncodingMacCroatian
;
2537 case wxFONTENCODING_MACICELANDIC
:
2538 enc
= kCFStringEncodingMacIcelandic
;
2540 case wxFONTENCODING_MACROMANIAN
:
2541 enc
= kCFStringEncodingMacRomanian
;
2543 case wxFONTENCODING_MACCELTIC
:
2544 enc
= kCFStringEncodingMacCeltic
;
2546 case wxFONTENCODING_MACGAELIC
:
2547 enc
= kCFStringEncodingMacGaelic
;
2549 // case wxFONTENCODING_MACKEYBOARD :
2550 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2554 // because gcc is picky
2561 class wxMBConv_cocoa
: public wxMBConv
2566 Init(CFStringGetSystemEncoding()) ;
2569 wxMBConv_cocoa(const wxMBConv_cocoa
& conv
)
2571 m_encoding
= conv
.m_encoding
;
2575 wxMBConv_cocoa(const wxChar
* name
)
2577 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2581 wxMBConv_cocoa(wxFontEncoding encoding
)
2583 Init( wxCFStringEncFromFontEnc(encoding
) );
2590 void Init( CFStringEncoding encoding
)
2592 m_encoding
= encoding
;
2595 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2599 CFStringRef theString
= CFStringCreateWithBytes (
2600 NULL
, //the allocator
2601 (const UInt8
*)szUnConv
,
2604 false //no BOM/external representation
2607 wxASSERT(theString
);
2609 size_t nOutLength
= CFStringGetLength(theString
);
2613 CFRelease(theString
);
2617 CFRange theRange
= { 0, nOutSize
};
2619 #if SIZEOF_WCHAR_T == 4
2620 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2623 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2625 CFRelease(theString
);
2627 szUniCharBuffer
[nOutLength
] = '\0';
2629 #if SIZEOF_WCHAR_T == 4
2630 wxMBConvUTF16 converter
;
2631 converter
.MB2WC( szOut
, (const char*)szUniCharBuffer
, nOutSize
);
2632 delete [] szUniCharBuffer
;
2638 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2642 size_t nRealOutSize
;
2643 size_t nBufSize
= wxWcslen(szUnConv
);
2644 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2646 #if SIZEOF_WCHAR_T == 4
2647 wxMBConvUTF16 converter
;
2648 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2649 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1];
2650 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
));
2651 nBufSize
/= sizeof(UniChar
);
2654 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2658 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2661 wxASSERT(theString
);
2663 //Note that CER puts a BOM when converting to unicode
2664 //so we check and use getchars instead in that case
2665 if (m_encoding
== kCFStringEncodingUnicode
)
2668 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2670 nRealOutSize
= CFStringGetLength(theString
) + 1;
2676 CFRangeMake(0, CFStringGetLength(theString
)),
2678 0, //what to put in characters that can't be converted -
2679 //0 tells CFString to return NULL if it meets such a character
2680 false, //not an external representation
2683 (CFIndex
*) &nRealOutSize
2687 CFRelease(theString
);
2689 #if SIZEOF_WCHAR_T == 4
2690 delete[] szUniBuffer
;
2693 return nRealOutSize
- 1;
2696 virtual wxMBConv
*Clone() const { return new wxMBConv_cocoa(*this); }
2700 return m_encoding
!= kCFStringEncodingInvalidId
&&
2701 CFStringIsEncodingAvailable(m_encoding
);
2705 CFStringEncoding m_encoding
;
2708 #endif // defined(__WXCOCOA__)
2710 // ============================================================================
2711 // Mac conversion classes
2712 // ============================================================================
2714 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2716 class wxMBConv_mac
: public wxMBConv
2721 Init(CFStringGetSystemEncoding()) ;
2724 wxMBConv_mac(const wxMBConv_mac
& conv
)
2726 Init(conv
.m_char_encoding
);
2730 wxMBConv_mac(const wxChar
* name
)
2732 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) );
2736 wxMBConv_mac(wxFontEncoding encoding
)
2738 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2743 OSStatus status
= noErr
;
2744 status
= TECDisposeConverter(m_MB2WC_converter
);
2745 status
= TECDisposeConverter(m_WC2MB_converter
);
2749 void Init( TextEncodingBase encoding
)
2751 OSStatus status
= noErr
;
2752 m_char_encoding
= encoding
;
2753 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
, 0, kUnicode16BitFormat
) ;
2755 status
= TECCreateConverter(&m_MB2WC_converter
,
2757 m_unicode_encoding
);
2758 status
= TECCreateConverter(&m_WC2MB_converter
,
2763 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2765 OSStatus status
= noErr
;
2766 ByteCount byteOutLen
;
2767 ByteCount byteInLen
= strlen(psz
) + 1;
2768 wchar_t *tbuf
= NULL
;
2769 UniChar
* ubuf
= NULL
;
2774 // Apple specs say at least 32
2775 n
= wxMax( 32, byteInLen
) ;
2776 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2779 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2781 #if SIZEOF_WCHAR_T == 4
2782 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2784 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2787 status
= TECConvertText(
2788 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2789 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2791 #if SIZEOF_WCHAR_T == 4
2792 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2793 // is not properly terminated we get random characters at the end
2794 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2795 wxMBConvUTF16 converter
;
2796 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
2799 res
= byteOutLen
/ sizeof( UniChar
) ;
2805 if ( buf
&& res
< n
)
2811 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2813 OSStatus status
= noErr
;
2814 ByteCount byteOutLen
;
2815 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2821 // Apple specs say at least 32
2822 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2823 tbuf
= (char*) malloc( n
) ;
2826 ByteCount byteBufferLen
= n
;
2827 UniChar
* ubuf
= NULL
;
2829 #if SIZEOF_WCHAR_T == 4
2830 wxMBConvUTF16 converter
;
2831 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2832 byteInLen
= unicharlen
;
2833 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2834 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2836 ubuf
= (UniChar
*) psz
;
2839 status
= TECConvertText(
2840 m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2841 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2843 #if SIZEOF_WCHAR_T == 4
2850 size_t res
= byteOutLen
;
2851 if ( buf
&& res
< n
)
2855 //we need to double-trip to verify it didn't insert any ? in place
2856 //of bogus characters
2857 wxWCharBuffer
wcBuf(n
);
2858 size_t pszlen
= wxWcslen(psz
);
2859 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2860 wxWcslen(wcBuf
) != pszlen
||
2861 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2863 // we didn't obtain the same thing we started from, hence
2864 // the conversion was lossy and we consider that it failed
2865 return wxCONV_FAILED
;
2872 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
2875 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2878 TECObjectRef m_MB2WC_converter
;
2879 TECObjectRef m_WC2MB_converter
;
2881 TextEncodingBase m_char_encoding
;
2882 TextEncodingBase m_unicode_encoding
;
2885 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2887 // ============================================================================
2888 // wxEncodingConverter based conversion classes
2889 // ============================================================================
2893 class wxMBConv_wxwin
: public wxMBConv
2898 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2899 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2903 // temporarily just use wxEncodingConverter stuff,
2904 // so that it works while a better implementation is built
2905 wxMBConv_wxwin(const wxChar
* name
)
2908 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2910 m_enc
= wxFONTENCODING_SYSTEM
;
2915 wxMBConv_wxwin(wxFontEncoding enc
)
2922 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2924 size_t inbuf
= strlen(psz
);
2927 if (!m2w
.Convert(psz
, buf
))
2928 return wxCONV_FAILED
;
2933 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2935 const size_t inbuf
= wxWcslen(psz
);
2938 if (!w2m
.Convert(psz
, buf
))
2939 return wxCONV_FAILED
;
2945 virtual size_t GetMBNulLen() const
2949 case wxFONTENCODING_UTF16BE
:
2950 case wxFONTENCODING_UTF16LE
:
2953 case wxFONTENCODING_UTF32BE
:
2954 case wxFONTENCODING_UTF32LE
:
2962 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2964 bool IsOk() const { return m_ok
; }
2967 wxFontEncoding m_enc
;
2968 wxEncodingConverter m2w
, w2m
;
2971 // were we initialized successfully?
2974 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2977 // make the constructors available for unit testing
2978 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2980 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2981 if ( !result
->IsOk() )
2990 #endif // wxUSE_FONTMAP
2992 // ============================================================================
2993 // wxCSConv implementation
2994 // ============================================================================
2996 void wxCSConv::Init()
// Construct a conversion from a charset name; m_encoding is derived from
// the name here.
3003 wxCSConv::wxCSConv(const wxChar
*charset
)
// ask the font mapper which encoding corresponds to this charset name
3013 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
// NOTE(review): alternative assignment, presumably used when the font
// mapper is not available in this build -- confirm
3015 m_encoding
= wxFONTENCODING_SYSTEM
;
// Construct a conversion from an encoding value.
3019 wxCSConv::wxCSConv(wxFontEncoding encoding
)
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not accepted here: they
// trigger an assertion failure message and are replaced by
// wxFONTENCODING_SYSTEM
3021 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
3023 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3025 encoding
= wxFONTENCODING_SYSTEM
;
// remember the (possibly corrected) encoding
3030 m_encoding
= encoding
;
3033 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name and the encoding of conv.
3038 wxCSConv::wxCSConv(const wxCSConv
& conv
)
// the name is passed through SetName() rather than assigned directly
3043 SetName(conv
.m_name
);
3044 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the charset name and the encoding of conv.
3047 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
// the name is passed through SetName() rather than assigned directly
3051 SetName(conv
.m_name
);
3052 m_encoding
= conv
.m_encoding
;
3057 void wxCSConv::Clear()
// Store the charset name: m_name receives its own copy of the string made
// with wxStrdup().
3066 void wxCSConv::SetName(const wxChar
*charset
)
3070 m_name
= wxStrdup(charset
);
3076 #include "wx/hashmap.h"
// hash map type associating a wxFontEncoding key with a wxString value
3078 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3079 wxEncodingNameCache
);
// file-local cache of charset names indexed by encoding
3081 static wxEncodingNameCache gs_nameCache
;
// Create the converter object which does the real work for this wxCSConv,
// based on m_name and m_encoding. The candidates are tried in the order
// given by the "full order" comment below; the code at the end logs an
// error about the charset.
3084 wxMBConv
*wxCSConv::DoCreate() const
3087 wxLogTrace(TRACE_STRCONV
,
3088 wxT("creating conversion for %s"),
3090 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
3091 #endif // wxUSE_FONTMAP
3093 // check for the special case of ASCII or ISO8859-1 charset: as we have
3094 // special knowledge of it anyhow, we don't need to create a special
3095 // conversion object
3096 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3097 m_encoding
== wxFONTENCODING_DEFAULT
)
3099 // don't convert at all
3103 // we trust OS to do conversion better than we can so try external
3104 // conversion methods first
3106 // the full order is:
3107 // 1. OS conversion (iconv() under Unix or Win32 API)
3108 // 2. hard coded conversions for UTF
3109 // 3. wxEncodingConverter as fall back
3115 #endif // !wxUSE_FONTMAP
// local copies of the name and the encoding used by the iconv code below
3117 wxString
name(m_name
);
3118 wxFontEncoding
encoding(m_encoding
);
// if we have a charset name, try creating an iconv converter directly from it
3120 if ( !name
.empty() )
3122 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
3130 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
3131 #endif // wxUSE_FONTMAP
// check whether a charset name was already cached for this encoding
3135 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3136 if ( it
!= gs_nameCache
.end() )
// an empty cached name is how a failure is recorded (see "cache the
// failure" below)
3138 if ( it
->second
.empty() )
// otherwise create the iconv converter from the cached name
3141 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
// nothing cached: go through all the names known for this encoding
// (NULL-terminated array) and try an iconv converter for each of them
3148 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3150 for ( ; *names
; ++names
)
3152 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
// remember this name for the encoding
3155 gs_nameCache
[encoding
] = *names
;
3162 gs_nameCache
[encoding
] = _T(""); // cache the failure
3164 #endif // wxUSE_FONTMAP
3166 #endif // HAVE_ICONV
3168 #ifdef wxHAVE_WIN32_MB2WC
// Win32 converter: created from the charset name if we have one, from the
// encoding otherwise
3171 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3172 : new wxMBConv_win32(m_encoding
);
3181 #endif // wxHAVE_WIN32_MB2WC
3183 #if defined(__WXMAC__)
3185 // leave UTF16 and UTF32 to the built-ins of wx
3186 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3187 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
// two variants of creating the Mac converter follow: by name or encoding,
// or by encoding only
3190 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
3191 : new wxMBConv_mac(m_encoding
);
3193 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
3203 #if defined(__WXCOCOA__)
3205 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
// two variants of creating the Cocoa converter follow: by name or encoding,
// or by encoding only
3208 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
3209 : new wxMBConv_cocoa(m_encoding
);
3211 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
// determine the encoding to use for selecting one of the built-in UTF
// converters below
3222 wxFontEncoding enc
= m_encoding
;
3224 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3226 // use "false" to suppress interactive dialogs -- we can be called from
3227 // anywhere and popping up a dialog from here is the last thing we want to
3229 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3231 #endif // wxUSE_FONTMAP
// hard coded conversions for the UTF encodings: each case returns a newly
// allocated converter of the matching class
3235 case wxFONTENCODING_UTF7
:
3236 return new wxMBConvUTF7
;
3238 case wxFONTENCODING_UTF8
:
3239 return new wxMBConvUTF8
;
3241 case wxFONTENCODING_UTF16BE
:
3242 return new wxMBConvUTF16BE
;
3244 case wxFONTENCODING_UTF16LE
:
3245 return new wxMBConvUTF16LE
;
3247 case wxFONTENCODING_UTF32BE
:
3248 return new wxMBConvUTF32BE
;
3250 case wxFONTENCODING_UTF32LE
:
3251 return new wxMBConvUTF32LE
;
3254 // nothing to do but put here to suppress gcc warnings
// wxEncodingConverter-based converter (the fall back in the order above):
// created from the charset name if we have one, from the encoding otherwise
3261 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3262 : new wxMBConv_wxwin(m_encoding
);
3268 #endif // wxUSE_FONTMAP
3270 // NB: This is a hack to prevent deadlock. What could otherwise happen
3271 // in Unicode build: wxConvLocal creation ends up being here
3272 // because of some failure and logs the error. But wxLog will try to
3273 // attach timestamp, for which it will need wxConvLocal (to convert
3274 // time to char* and then wchar_t*), but that fails, tries to log
3275 // error, but wxLog has an (already locked) critical section that
3276 // guards static buffer.
// the flag is set for the duration of the wxLogError() call and reset after
// it, so a nested call arriving here meanwhile does not log again
// NOTE(review): it is a plain static bool, no synchronization is visible here
3277 static bool alreadyLoggingError
= false;
3278 if (!alreadyLoggingError
)
3280 alreadyLoggingError
= true;
3281 wxLogError(_("Cannot convert from the charset '%s'!"),
3285 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3286 #else // !wxUSE_FONTMAP
// NOTE(review): m_encoding is a wxFontEncoding value (it is compared with
// the wxFONTENCODING_XXX constants above) but is formatted with "%s" here,
// this looks like a format/argument mismatch -- confirm and use an integer
// format specifier instead
3287 wxString::Format(_("encoding %s"), m_encoding
).c_str()
3288 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3291 alreadyLoggingError
= false;
// Create the real converter object and store it in m_convReal. This is a
// const method, so the members are modified through a non-const pointer to
// this object.
3297 void wxCSConv::CreateConvIfNeeded() const
3301 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3304 // if we have neither the name nor the encoding, use the default
3305 // encoding for this system
3306 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
// m_name gets its own copy of the system encoding name
3308 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3310 #endif // wxUSE_INTL
// create the converter and clear the m_deferred flag
3312 self
->m_convReal
= DoCreate();
3313 self
->m_deferred
= false;
// Multibyte -> wide conversion. After making sure the real converter was
// created, the call is forwarded to m_convReal; the code after that return
// is a direct conversion which copies the bytes of psz to buf one by one.
3317 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3319 CreateConvIfNeeded();
3322 return m_convReal
->MB2WC(buf
, psz
, n
);
3325 size_t len
= strlen(psz
);
// "<=" and not "<": the terminating NUL is copied as well; the cast to
// unsigned char prevents bytes with the high bit set from being
// sign-extended
// NOTE(review): the loop is bounded by len and not by n, confirm that buf
// always has room for len + 1 elements
3329 for (size_t c
= 0; c
<= len
; c
++)
3330 buf
[c
] = (unsigned char)(psz
[c
]);
// Wide -> multibyte conversion. After making sure the real converter was
// created, the call is forwarded to m_convReal; the code after that return
// is a direct conversion which narrows the characters of psz one by one and
// can fail with wxCONV_FAILED.
3336 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3338 CreateConvIfNeeded();
3341 return m_convReal
->WC2MB(buf
, psz
, n
);
3344 const size_t len
= wxWcslen(psz
);
// first loop: stores each character, including the terminating NUL ("<="),
// into buf as a char
// NOTE(review): presumably wxCONV_FAILED is returned for characters which
// don't fit into a char -- confirm
3347 for (size_t c
= 0; c
<= len
; c
++)
3350 return wxCONV_FAILED
;
3352 buf
[c
] = (char)psz
[c
];
// second loop: same range and failure return but nothing is stored
// NOTE(review): presumably the variant used when there is no output buffer
// -- confirm
3357 for (size_t c
= 0; c
<= len
; c
++)
3360 return wxCONV_FAILED
;
// Size of the terminating NUL: make sure the real converter was created and
// ask it.
3367 size_t wxCSConv::GetMBNulLen() const
3369 CreateConvIfNeeded();
3373 return m_convReal
->GetMBNulLen();
3379 // ----------------------------------------------------------------------------
3381 // ----------------------------------------------------------------------------
// File-local converter objects backing the exported references and pointers
// defined below. The type of wxConvLibcObj is selected by the preprocessor.
3384 static wxMBConv_win32 wxConvLibcObj
;
3385 #elif defined(__WXMAC__) && !defined(__MACH__)
3386 static wxMBConv_mac wxConvLibcObj
;
3388 static wxMBConvLibc wxConvLibcObj
;
// wxCSConv objects for the system encoding and for ISO8859-1
3391 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3392 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3393 static wxMBConvUTF7 wxConvUTF7Obj
;
3394 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported references bound to the objects above
3396 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3397 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3398 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3399 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3400 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
// exported pointers: wxConvCurrent initially points to the libc converter
// and wxConvUI to wxConvLocal
3401 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3402 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= &wxConvLocal
;
3403 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3410 #else // !wxUSE_WCHAR_T
3412 // stand-ins in absence of wchar_t
3413 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3418 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T