1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
22 #include "wx/hashmap.h"
25 #include "wx/strconv.h"
30 #include "wx/msw/private.h"
31 #include "wx/msw/missing.h"
42 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
43 #define wxHAVE_WIN32_MB2WC
52 #include "wx/thread.h"
55 #include "wx/encconv.h"
56 #include "wx/fontmap.h"
60 #include <ATSUnicode.h>
61 #include <TextCommon.h>
62 #include <TextEncodingConverter.h>
65 // includes Mac headers
66 #include "wx/mac/private.h"
70 #define TRACE_STRCONV _T("strconv")
72 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
74 #if SIZEOF_WCHAR_T == 2
79 // ============================================================================
81 // ============================================================================
83 // helper function of cMB2WC(): check if n bytes at this location are all NUL
// Helper used to test for a run of NUL bytes: the loop below consumes bytes
// starting at p for as long as n is non-zero and each byte equals '\0'.
// NOTE(review): the loop body and the return statement are not visible in
// this excerpt; going by the name, the result is presumably true when a
// non-NUL byte is met before n bytes have been consumed -- confirm.
84 static bool NotAllNULs(const char *p
, size_t n
)
86 while ( n
&& *p
++ == '\0' )
92 // ----------------------------------------------------------------------------
93 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
94 // ----------------------------------------------------------------------------
// Encodes the UCS-4 value `input` as UTF-16 into `output`.
//  - one branch stores the value unchanged as a single 16-bit unit (its
//    guarding condition is not visible in this excerpt)
//  - values >= 0x110000 are rejected with wxCONV_FAILED
//  - otherwise two units (lead + trail surrogate) are written
// NOTE(review): the returned unit counts are not visible here; callers in
// this file compare the result against 2 and against wxCONV_FAILED.
96 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
// single unit: the value is stored as is
101 *output
= (wxUint16
) input
;
105 else if (input
>= 0x110000)
107 return wxCONV_FAILED
;
// lead surrogate: 0xd7c0 == 0xd800 - (0x10000 >> 10), i.e. the subtraction
// of 0x10000 from input is folded into the base added to the top bits
113 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
// trail surrogate: the low 10 bits of input offset by 0xdc00
114 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
// Decodes one character from the UTF-16 units at `input` into `output`.
// The inverse of the surrogate arithmetic used by encode_utf16().
// NOTE(review): the return values of the successful branches are not
// visible in this excerpt.
121 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
// first unit lies outside the surrogate range 0xd800..0xdfff
123 if ((*input
< 0xd800) || (*input
> 0xdfff))
// first unit is a surrogate, so the second one must be a trail surrogate
// (0xdc00..0xdfff); anything else is an error.
// NOTE(review): as far as the visible conditions show, the first unit is
// never checked to be a *lead* surrogate (0xd800..0xdbff), so a trail
// surrogate followed by another trail surrogate would be accepted -- confirm.
128 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
131 return wxCONV_FAILED
;
// combine the pair: 0xd7c0 == 0xd800 - (0x10000 >> 10), so the shifted
// lead part already includes the 0x10000 offset
135 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
141 typedef wchar_t wxDecodeSurrogate_t
;
143 typedef wxUint16 wxDecodeSurrogate_t
;
144 #endif // WC_UTF16/!WC_UTF16
146 // returns the next UTF-32 character from the wchar_t buffer and advances the
147 // pointer to the character after this one
149 // if an invalid character is found, *pSrc is set to NULL, the caller must
// Wrapper around decode_utf16() working on a pointer-to-pointer so that the
// caller's position can be updated (the update itself is not visible in this
// excerpt).
151 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
// the buffer is reinterpreted as 16-bit units whatever wxDecodeSurrogate_t
// is typedef'd to; the decoded character is received in `out`
155 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
// decoding error reported by decode_utf16()
156 if ( n
== wxCONV_FAILED
)
164 // ----------------------------------------------------------------------------
166 // ----------------------------------------------------------------------------
169 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
170 const char *src
, size_t srcLen
) const
172 // although new conversion classes are supposed to implement this function
173 // directly, the existing ones only implement the old MB2WC() and so, to
174 // avoid having to rewrite all conversion classes at once, we provide a
175 // default (but not efficient) implementation of this one in terms of the
176 // old function by copying the input to ensure that it's NUL-terminated and
177 // then using MB2WC() to convert it
179 // the number of chars [which would be] written to dst [if it were not NULL]
180 size_t dstWritten
= 0;
182 // the number of NULs terminating this string
183 size_t nulLen
= 0; // not really needed, but just to avoid warnings
185 // if we were not given the input size we just have to assume that the
186 // string is properly terminated as we have no way of knowing how long it
187 // is anyhow, but if we do have the size check whether there are enough
191 if ( srcLen
!= wxNO_LEN
)
193 // we need to know how to find the end of this string
194 nulLen
= GetMBNulLen();
195 if ( nulLen
== wxCONV_FAILED
)
196 return wxCONV_FAILED
;
198 // if there are enough NULs we can avoid the copy
199 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
201 // make a copy in order to properly NUL-terminate the string
202 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
203 char * const p
= bufTmp
.data();
204 memcpy(p
, src
, srcLen
);
205 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
211 srcEnd
= src
+ srcLen
;
213 else // quit after the first loop iteration
220 // try to convert the current chunk
221 size_t lenChunk
= MB2WC(NULL
, src
, 0);
222 if ( lenChunk
== wxCONV_FAILED
)
223 return wxCONV_FAILED
;
225 lenChunk
++; // for the L'\0' at the end of this chunk
227 dstWritten
+= lenChunk
;
231 // nothing left in the input string, conversion succeeded
237 if ( dstWritten
> dstLen
)
238 return wxCONV_FAILED
;
240 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
241 return wxCONV_FAILED
;
248 // we convert just one chunk in this case as this is the entire
253 // advance the input pointer past the end of this chunk
254 while ( NotAllNULs(src
, nulLen
) )
256 // notice that we must skip over multiple bytes here as we suppose
257 // that if NUL takes 2 or 4 bytes, then all the other characters do
258 // too and so if advanced by a single byte we might erroneously
259 // detect sequences of NUL bytes in the middle of the input
263 src
+= nulLen
; // skipping over its terminator as well
265 // note that ">=" (and not just "==") is needed here as the terminator
266 // we skipped just above could be inside or just after the buffer
267 // delimited by srcEnd
276 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
277 const wchar_t *src
, size_t srcLen
) const
279 // the number of chars [which would be] written to dst [if it were not NULL]
280 size_t dstWritten
= 0;
282 // make a copy of the input string unless it is already properly
285 // if we don't know its length we have no choice but to assume that it is,
286 // indeed, properly terminated
287 wxWCharBuffer bufTmp
;
288 if ( srcLen
== wxNO_LEN
)
290 srcLen
= wxWcslen(src
) + 1;
292 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
294 // make a copy in order to properly NUL-terminate the string
295 bufTmp
= wxWCharBuffer(srcLen
);
296 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
300 const size_t lenNul
= GetMBNulLen();
301 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
303 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
305 // try to convert the current chunk
306 size_t lenChunk
= WC2MB(NULL
, src
, 0);
308 if ( lenChunk
== wxCONV_FAILED
)
309 return wxCONV_FAILED
;
312 dstWritten
+= lenChunk
;
316 if ( dstWritten
> dstLen
)
317 return wxCONV_FAILED
;
319 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
320 return wxCONV_FAILED
;
// Default implementation of the old-style conversion function in terms of
// ToWChar(). The source length argument is omitted from the call below, so
// ToWChar() is invoked with whatever default its declaration supplies
// (declared elsewhere).
329 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
331 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
// the result is only adjusted when the conversion succeeded
332 if ( rc
!= wxCONV_FAILED
)
334 // ToWChar() returns the buffer length, i.e. including the trailing
335 // NUL, while this method doesn't take it into account
// Default implementation of the old-style conversion function in terms of
// FromWChar(). The source length argument is omitted from the call below, so
// FromWChar() is invoked with whatever default its declaration supplies
// (declared elsewhere).
342 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
344 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
// the handling of a successful result is not visible in this excerpt
345 if ( rc
!= wxCONV_FAILED
)
353 wxMBConv::~wxMBConv()
355 // nothing to do here (necessary for Darwin linking probably)
// Converts the multibyte string psz to a newly allocated wide character
// buffer using two MB2WC() calls: one to measure, one to convert. An empty
// wxWCharBuffer is returned by the final statement visible below.
358 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
362 // calculate the length of the buffer needed first
363 const size_t nLen
= MB2WC(NULL
, psz
, 0);
364 if ( nLen
!= wxCONV_FAILED
)
366 // now do the actual conversion
367 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
369 // +1 for the trailing NUL
370 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
// fallback result: an empty buffer
375 return wxWCharBuffer();
// Converts the wide string pwz to a newly allocated multibyte buffer using
// two WC2MB() calls: one to measure, one to convert. An empty wxCharBuffer
// is returned by the final statement visible below.
378 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
// first pass: NULL destination, only the required length is computed
382 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
383 if ( nLen
!= wxCONV_FAILED
)
385 // extra space for trailing NUL(s)
// function-local static: GetMaxMBNulLen() is evaluated once, on the first
// call reaching this point
386 static const size_t extraLen
= GetMaxMBNulLen();
// NOTE(review): the "- 1" presumably compensates for the one byte that
// wxCharBuffer adds itself (cf. the "1 will be added" remark in ToWChar())
388 wxCharBuffer
buf(nLen
+ extraLen
- 1);
389 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
// fallback result: an empty buffer
394 return wxCharBuffer();
// Length-aware overload: converts inLen bytes of inBuff with two ToWChar()
// calls (measure, then convert). How *outLen is filled in is not visible in
// this excerpt.
398 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
// first pass: NULL destination, only the required length is computed
400 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
401 if ( dstLen
!= wxCONV_FAILED
)
// NOTE(review): unlike the cWC2MB() overload, which uses
// "dstLen ? dstLen - 1 : 1", a dstLen of 0 is not special-cased here and
// "dstLen - 1" would wrap around for size_t -- confirm whether ToWChar()
// can return 0.
403 wxWCharBuffer
wbuf(dstLen
- 1);
404 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
// checks whether the last converted character is a terminating NUL
409 if ( wbuf
[dstLen
- 1] == L
'\0' )
// fallback result: an empty buffer
420 return wxWCharBuffer();
// Length-aware overload: converts inLen wide characters of inBuff with two
// FromWChar() calls (measure, then convert). How *outLen is filled in is not
// visible in this excerpt.
424 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
// first pass: NULL destination, only the required length is computed
426 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
427 if ( dstLen
!= wxCONV_FAILED
)
429 // special case of empty input: can't allocate 0 size buffer below as
430 // wxCharBuffer insists on NUL-terminating it
431 wxCharBuffer
buf(dstLen
? dstLen
- 1 : 1);
432 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
// the terminator width depends on the encoding, hence GetMBNulLen()
// NOTE(review): unlike in ToWChar(), the value is not compared with
// wxCONV_FAILED before being used here.
438 const size_t nulLen
= GetMBNulLen();
// true when the last nulLen bytes of the output are all NUL
439 if ( dstLen
>= nulLen
&&
440 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
442 // in this case the output is NUL-terminated and we're not
443 // supposed to count NUL
// fallback result: an empty buffer
455 return wxCharBuffer();
458 // ----------------------------------------------------------------------------
460 // ----------------------------------------------------------------------------
// Forwards unchanged to wxMB2WC() (defined elsewhere) and returns its result.
462 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
464 return wxMB2WC(buf
, psz
, n
);
// Forwards unchanged to wxWC2MB() (defined elsewhere) and returns its result.
467 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
469 return wxWC2MB(buf
, psz
, n
);
472 // ----------------------------------------------------------------------------
473 // wxConvBrokenFileNames
474 // ----------------------------------------------------------------------------
// Selects the converter stored in m_conv from the charset name:
//  - NULL, "UTF-8" or "UTF8" (compared with wxStricmp()): a wxMBConvUTF8
//    created with the MAP_INVALID_UTF8_TO_OCTAL option
//  - anything else: a wxCSConv for that charset
// m_conv is allocated with new; where it is released is not visible in this
// excerpt.
478 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
480 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
481 || wxStricmp(charset
, _T("UTF8")) == 0 )
482 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
484 m_conv
= new wxCSConv(charset
);
489 // ----------------------------------------------------------------------------
491 // ----------------------------------------------------------------------------
493 // Implementation (C) 2004 Fredrik Roubert
496 // BASE64 decoding table
498 static const unsigned char utf7unb64
[] =
500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
502 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
504 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
506 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
507 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
509 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
510 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
511 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
513 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
514 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
515 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
534 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
538 while ( *psz
&& (!buf
|| (len
< n
)) )
540 unsigned char cc
= *psz
++;
548 else if (*psz
== '-')
556 else // start of BASE64 encoded string
560 for ( ok
= lsb
= false, d
= 0, l
= 0;
561 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
566 for (l
+= 6; l
>= 8; lsb
= !lsb
)
568 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
578 *buf
= (wchar_t)(c
<< 8);
587 // in valid UTF7 we should have valid characters after '+'
588 return wxCONV_FAILED
;
596 if ( buf
&& (len
< n
) )
603 // BASE64 encoding table
605 static const unsigned char utf7enb64
[] =
607 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
608 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
609 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
610 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
611 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
612 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
613 'w', 'x', 'y', 'z', '0', '1', '2', '3',
614 '4', '5', '6', '7', '8', '9', '+', '/'
618 // UTF-7 encoding table
620 // 0 - Set D (directly encoded characters)
621 // 1 - Set O (optional direct characters)
622 // 2 - whitespace characters (optional)
623 // 3 - special characters
625 static const unsigned char utf7encode
[128] =
627 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
628 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
629 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
630 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
631 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
632 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
633 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
634 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
637 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
641 while (*psz
&& ((!buf
) || (len
< n
)))
644 if (cc
< 0x80 && utf7encode
[cc
] < 1)
653 else if (((wxUint32
)cc
) > 0xffff)
655 // no surrogate pair generation (yet?)
656 return wxCONV_FAILED
;
667 // BASE64 encode string
668 unsigned int lsb
, d
, l
;
669 for (d
= 0, l
= 0; /*nothing*/; psz
++)
671 for (lsb
= 0; lsb
< 2; lsb
++)
674 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
676 for (l
+= 8; l
>= 6; )
680 *buf
++ = utf7enb64
[(d
>> l
) % 64];
686 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
693 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
705 if (buf
&& (len
< n
))
711 // ----------------------------------------------------------------------------
713 // ----------------------------------------------------------------------------
715 static wxUint32 utf8_max
[]=
716 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
718 // boundaries of the private use area we use to (temporarily) remap
719 // characters that are invalid in a UTF-8 encoded string
720 const wxUint32 wxUnicodePUA
= 0x100000;
721 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
723 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
727 while (*psz
&& ((!buf
) || (len
< n
)))
729 const char *opsz
= psz
;
730 bool invalid
= false;
731 unsigned char cc
= *psz
++, fc
= cc
;
733 for (cnt
= 0; fc
& 0x80; cnt
++)
743 // escape the escape character for octal escapes
744 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
745 && cc
== '\\' && (!buf
|| len
< n
))
757 // invalid UTF-8 sequence
762 unsigned ocnt
= cnt
- 1;
763 wxUint32 res
= cc
& (0x3f >> cnt
);
767 if ((cc
& 0xC0) != 0x80)
769 // invalid UTF-8 sequence
775 res
= (res
<< 6) | (cc
& 0x3f);
778 if (invalid
|| res
<= utf8_max
[ocnt
])
780 // illegal UTF-8 encoding
783 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
784 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
786 // if one of our PUA characters turns up externally
787 // it must also be treated as an illegal sequence
788 // (a bit like you have to escape an escape character)
794 // cast is ok because wchar_t == wxUint16 if WC_UTF16
795 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
796 if (pa
== wxCONV_FAILED
)
808 *buf
++ = (wchar_t)res
;
810 #endif // WC_UTF16/!WC_UTF16
816 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
818 while (opsz
< psz
&& (!buf
|| len
< n
))
821 // cast is ok because wchar_t == wxUint16 if WC_UTF16
822 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
823 wxASSERT(pa
!= wxCONV_FAILED
);
830 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
836 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
838 while (opsz
< psz
&& (!buf
|| len
< n
))
840 if ( buf
&& len
+ 3 < n
)
842 unsigned char on
= *opsz
;
844 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
845 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
846 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
853 else // MAP_INVALID_UTF8_NOT
855 return wxCONV_FAILED
;
861 if (buf
&& (len
< n
))
// Returns true if wch is one of the octal digits L'0'..L'7'.
867 static inline bool isoctal(wchar_t wch
)
869 return L
'0' <= wch
&& wch
<= L
'7';
872 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
876 while (*psz
&& ((!buf
) || (len
< n
)))
881 // cast is ok for WC_UTF16
882 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
883 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
885 cc
= (*psz
++) & 0x7fffffff;
888 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
889 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
892 *buf
++ = (char)(cc
- wxUnicodePUA
);
895 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
896 && cc
== L
'\\' && psz
[0] == L
'\\' )
903 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
905 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
909 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
910 (psz
[1] - L
'0') * 010 +
920 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
936 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
938 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
944 if (buf
&& (len
< n
))
950 // ============================================================================
952 // ============================================================================
954 #ifdef WORDS_BIGENDIAN
955 #define wxMBConvUTF16straight wxMBConvUTF16BE
956 #define wxMBConvUTF16swap wxMBConvUTF16LE
958 #define wxMBConvUTF16swap wxMBConvUTF16BE
959 #define wxMBConvUTF16straight wxMBConvUTF16LE
// Determines the length in bytes of the UTF-16 input at src.
//  - srcLen == wxNO_LEN: the input is scanned for a zero 16-bit unit
//  - otherwise srcLen must be a multiple of BYTES_PER_CHAR, else
//    wxCONV_FAILED is returned
// The final return statement is not visible in this excerpt.
963 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
965 if ( srcLen
== wxNO_LEN
)
967 // count the number of bytes in input, including the trailing NULs
968 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
// srcLen starts at 1 so that the terminating zero unit is counted as well;
// at this point it is a count of 16-bit units, not of bytes
969 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
// convert the unit count to bytes
972 srcLen
*= BYTES_PER_CHAR
;
974 else // we already have the length
976 // we can only convert an entire number of UTF-16 characters
977 if ( srcLen
% BYTES_PER_CHAR
)
978 return wxCONV_FAILED
;
984 // case when in-memory representation is UTF-16 too
987 // ----------------------------------------------------------------------------
988 // conversions without endianness change
989 // ----------------------------------------------------------------------------
// UTF-16 input in native byte order to wchar_t: the input is copied
// verbatim with memcpy().
// NOTE(review): GetLength() signals an error with wxCONV_FAILED while the
// check below compares with wxNO_LEN; this relies on both constants having
// the same value -- confirm.
992 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
993 const char *src
, size_t srcLen
) const
995 // set up the scene for using memcpy() (which is presumably more efficient
996 // than copying the bytes one by one)
997 srcLen
= GetLength(src
, srcLen
);
998 if ( srcLen
== wxNO_LEN
)
999 return wxCONV_FAILED
;
// srcLen is in bytes, inLen is the corresponding number of 16-bit units
1001 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
// dstLen is compared with the unit count, not with the byte count
1004 if ( dstLen
< inLen
)
1005 return wxCONV_FAILED
;
1007 memcpy(dst
, src
, srcLen
);
// wchar_t to UTF-16 in native byte order: the input is copied verbatim with
// memcpy().
1014 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1015 const wchar_t *src
, size_t srcLen
) const
// unknown length: measure the string and include its terminating NUL
1017 if ( srcLen
== wxNO_LEN
)
1018 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count
1020 srcLen
*= BYTES_PER_CHAR
;
// the destination capacity is compared with the byte count
1024 if ( dstLen
< srcLen
)
1025 return wxCONV_FAILED
;
1027 memcpy(dst
, src
, srcLen
);
1033 // ----------------------------------------------------------------------------
1034 // endian-reversing conversions
1035 // ----------------------------------------------------------------------------
// UTF-16 input in the opposite byte order to wchar_t: every 16-bit unit is
// byte-swapped while being copied.
// NOTE(review): GetLength() signals an error with wxCONV_FAILED while the
// check below compares with wxNO_LEN; this relies on both constants having
// the same value -- confirm.
1038 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1039 const char *src
, size_t srcLen
) const
1041 srcLen
= GetLength(src
, srcLen
);
1042 if ( srcLen
== wxNO_LEN
)
1043 return wxCONV_FAILED
;
// from here on srcLen is a count of 16-bit units, not of bytes
1045 srcLen
/= BYTES_PER_CHAR
;
1049 if ( dstLen
< srcLen
)
1050 return wxCONV_FAILED
;
1052 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1053 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
// wxUINT16_SWAP_ALWAYS swaps unconditionally, whatever the host endianness
1055 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
// wchar_t to UTF-16 in the opposite byte order: every character is
// byte-swapped while being copied.
1063 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1064 const wchar_t *src
, size_t srcLen
) const
// unknown length: measure the string and include its terminating NUL
1066 if ( srcLen
== wxNO_LEN
)
1067 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count
1069 srcLen
*= BYTES_PER_CHAR
;
1073 if ( dstLen
< srcLen
)
1074 return wxCONV_FAILED
;
1076 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
// n counts bytes (srcLen was multiplied above), hence the BYTES_PER_CHAR
// step for each wchar_t consumed from src
1077 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1079 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1086 #else // !WC_UTF16: wchar_t is UTF-32
1088 // ----------------------------------------------------------------------------
1089 // conversions without endianness change
1090 // ----------------------------------------------------------------------------
1093 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1094 const char *src
, size_t srcLen
) const
1096 srcLen
= GetLength(src
, srcLen
);
1097 if ( srcLen
== wxNO_LEN
)
1098 return wxCONV_FAILED
;
1100 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1103 // optimization: return maximal space which could be needed for this
1104 // string even if the real size could be smaller if the buffer contains
1110 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1111 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1113 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1115 return wxCONV_FAILED
;
1117 if ( ++outLen
> dstLen
)
1118 return wxCONV_FAILED
;
1128 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1129 const wchar_t *src
, size_t srcLen
) const
1131 if ( srcLen
== wxNO_LEN
)
1132 srcLen
= wxWcslen(src
) + 1;
1135 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1136 for ( size_t n
= 0; n
< srcLen
; n
++ )
1139 const size_t numChars
= encode_utf16(*src
++, cc
);
1140 if ( numChars
== wxCONV_FAILED
)
1141 return wxCONV_FAILED
;
1143 outLen
+= numChars
* BYTES_PER_CHAR
;
1146 if ( outLen
> dstLen
)
1147 return wxCONV_FAILED
;
1150 if ( numChars
== 2 )
1152 // second character of a surrogate
1161 // ----------------------------------------------------------------------------
1162 // endian-reversing conversions
1163 // ----------------------------------------------------------------------------
1166 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1167 const char *src
, size_t srcLen
) const
1169 srcLen
= GetLength(src
, srcLen
);
1170 if ( srcLen
== wxNO_LEN
)
1171 return wxCONV_FAILED
;
1173 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1176 // optimization: return maximal space which could be needed for this
1177 // string even if the real size could be smaller if the buffer contains
1183 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1184 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1189 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1191 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1193 const size_t numChars
= decode_utf16(tmp
, ch
);
1194 if ( numChars
== wxCONV_FAILED
)
1195 return wxCONV_FAILED
;
1197 if ( numChars
== 2 )
1200 if ( ++outLen
> dstLen
)
1201 return wxCONV_FAILED
;
1211 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1212 const wchar_t *src
, size_t srcLen
) const
1214 if ( srcLen
== wxNO_LEN
)
1215 srcLen
= wxWcslen(src
) + 1;
1218 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1219 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1222 const size_t numChars
= encode_utf16(*src
, cc
);
1223 if ( numChars
== wxCONV_FAILED
)
1224 return wxCONV_FAILED
;
1226 outLen
+= numChars
* BYTES_PER_CHAR
;
1229 if ( outLen
> dstLen
)
1230 return wxCONV_FAILED
;
1232 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1233 if ( numChars
== 2 )
1235 // second character of a surrogate
1236 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1244 #endif // WC_UTF16/!WC_UTF16
1247 // ============================================================================
1249 // ============================================================================
1251 #ifdef WORDS_BIGENDIAN
1252 #define wxMBConvUTF32straight wxMBConvUTF32BE
1253 #define wxMBConvUTF32swap wxMBConvUTF32LE
1255 #define wxMBConvUTF32swap wxMBConvUTF32BE
1256 #define wxMBConvUTF32straight wxMBConvUTF32LE
1260 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1261 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
// Determines the length in bytes of the UTF-32 input at src.
//  - srcLen == wxNO_LEN: the input is scanned for a zero 32-bit unit
//  - otherwise srcLen must be a multiple of BYTES_PER_CHAR, else
//    wxCONV_FAILED is returned
// The final return statement is not visible in this excerpt.
1264 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1266 if ( srcLen
== wxNO_LEN
)
1268 // count the number of bytes in input, including the trailing NULs
1269 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
// srcLen starts at 1 so that the terminating zero unit is counted as well;
// at this point it is a count of 32-bit units, not of bytes
1270 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
// convert the unit count to bytes
1273 srcLen
*= BYTES_PER_CHAR
;
1275 else // we already have the length
1277 // we can only convert an entire number of UTF-32 characters
1278 if ( srcLen
% BYTES_PER_CHAR
)
1279 return wxCONV_FAILED
;
1285 // case when in-memory representation is UTF-16
1288 // ----------------------------------------------------------------------------
1289 // conversions without endianness change
1290 // ----------------------------------------------------------------------------
1293 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1294 const char *src
, size_t srcLen
) const
1296 srcLen
= GetLength(src
, srcLen
);
1297 if ( srcLen
== wxNO_LEN
)
1298 return wxCONV_FAILED
;
1300 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1301 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1303 for ( size_t n
= 0; n
< inLen
; n
++ )
1306 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1307 if ( numChars
== wxCONV_FAILED
)
1308 return wxCONV_FAILED
;
1313 if ( outLen
> dstLen
)
1314 return wxCONV_FAILED
;
1317 if ( numChars
== 2 )
1319 // second character of a surrogate
1329 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1330 const wchar_t *src
, size_t srcLen
) const
1332 if ( srcLen
== wxNO_LEN
)
1333 srcLen
= wxWcslen(src
) + 1;
1337 // optimization: return maximal space which could be needed for this
1338 // string instead of the exact amount which could be less if there are
1339 // any surrogates in the input
1341 // we consider that surrogates are rare enough to make it worthwhile to
1342 // avoid running the loop below at the cost of slightly extra memory
1344 return srcLen
* BYTES_PER_CHAR
;
1347 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1349 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1351 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1353 return wxCONV_FAILED
;
1355 outLen
+= BYTES_PER_CHAR
;
1357 if ( outLen
> dstLen
)
1358 return wxCONV_FAILED
;
1366 // ----------------------------------------------------------------------------
1367 // endian-reversing conversions
1368 // ----------------------------------------------------------------------------
1371 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1372 const char *src
, size_t srcLen
) const
1374 srcLen
= GetLength(src
, srcLen
);
1375 if ( srcLen
== wxNO_LEN
)
1376 return wxCONV_FAILED
;
1378 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1379 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1381 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1384 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1385 if ( numChars
== wxCONV_FAILED
)
1386 return wxCONV_FAILED
;
1391 if ( outLen
> dstLen
)
1392 return wxCONV_FAILED
;
1395 if ( numChars
== 2 )
1397 // second character of a surrogate
1407 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1408 const wchar_t *src
, size_t srcLen
) const
1410 if ( srcLen
== wxNO_LEN
)
1411 srcLen
= wxWcslen(src
) + 1;
1415 // optimization: return maximal space which could be needed for this
1416 // string instead of the exact amount which could be less if there are
1417 // any surrogates in the input
1419 // we consider that surrogates are rare enough to make it worthwhile to
1420 // avoid running the loop below at the cost of slightly extra memory
1422 return srcLen
*BYTES_PER_CHAR
;
1425 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1427 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1429 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1431 return wxCONV_FAILED
;
1433 outLen
+= BYTES_PER_CHAR
;
1435 if ( outLen
> dstLen
)
1436 return wxCONV_FAILED
;
1438 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1444 #else // !WC_UTF16: wchar_t is UTF-32
1446 // ----------------------------------------------------------------------------
1447 // conversions without endianness change
1448 // ----------------------------------------------------------------------------
// UTF-32 input in native byte order to a UTF-32 wchar_t: the input is
// copied verbatim with memcpy().
// NOTE(review): GetLength() signals an error with wxCONV_FAILED while the
// check below compares with wxNO_LEN; this relies on both constants having
// the same value -- confirm.
1451 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1452 const char *src
, size_t srcLen
) const
1454 // use memcpy() as it should be much faster than hand-written loop
1455 srcLen
= GetLength(src
, srcLen
);
1456 if ( srcLen
== wxNO_LEN
)
1457 return wxCONV_FAILED
;
// srcLen is in bytes, inLen is the corresponding number of 32-bit units
1459 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
// dstLen is compared with the unit count, not with the byte count
1462 if ( dstLen
< inLen
)
1463 return wxCONV_FAILED
;
1465 memcpy(dst
, src
, srcLen
);
// UTF-32 wchar_t to UTF-32 in native byte order: the input is copied
// verbatim with memcpy().
1472 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1473 const wchar_t *src
, size_t srcLen
) const
// unknown length: measure the string and include its terminating NUL
1475 if ( srcLen
== wxNO_LEN
)
1476 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count
1478 srcLen
*= BYTES_PER_CHAR
;
// the destination capacity is compared with the byte count
1482 if ( dstLen
< srcLen
)
1483 return wxCONV_FAILED
;
1485 memcpy(dst
, src
, srcLen
);
1491 // ----------------------------------------------------------------------------
1492 // endian-reversing conversions
1493 // ----------------------------------------------------------------------------
// UTF-32 input in the opposite byte order to a UTF-32 wchar_t: every 32-bit
// unit is byte-swapped while being copied.
// NOTE(review): GetLength() signals an error with wxCONV_FAILED while the
// check below compares with wxNO_LEN; this relies on both constants having
// the same value -- confirm.
1496 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1497 const char *src
, size_t srcLen
) const
1499 srcLen
= GetLength(src
, srcLen
);
1500 if ( srcLen
== wxNO_LEN
)
1501 return wxCONV_FAILED
;
// from here on srcLen is a count of 32-bit units, not of bytes
1503 srcLen
/= BYTES_PER_CHAR
;
1507 if ( dstLen
< srcLen
)
1508 return wxCONV_FAILED
;
1510 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1511 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
// wxUINT32_SWAP_ALWAYS swaps unconditionally, whatever the host endianness
1513 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
// UTF-32 wchar_t to UTF-32 in the opposite byte order: every character is
// byte-swapped while being copied.
1521 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1522 const wchar_t *src
, size_t srcLen
) const
// unknown length: measure the string and include its terminating NUL
1524 if ( srcLen
== wxNO_LEN
)
1525 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count
1527 srcLen
*= BYTES_PER_CHAR
;
1531 if ( dstLen
< srcLen
)
1532 return wxCONV_FAILED
;
1534 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
// n counts bytes (srcLen was multiplied above), hence the BYTES_PER_CHAR
// step for each wchar_t consumed from src
1535 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1537 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1544 #endif // WC_UTF16/!WC_UTF16
1547 // ============================================================================
1548 // The classes doing conversion using the iconv_xxx() functions
1549 // ============================================================================
1553 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1554 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1555 // (unless there's yet another bug in glibc) the only case when iconv()
1556 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1557 // left in the input buffer -- when _real_ error occurs,
1558 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1560 // [This bug does not appear in glibc 2.2.]
1561 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1562 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1563 (errno != E2BIG || bufLeft != 0))
1565 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1568 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1570 #define ICONV_T_INVALID ((iconv_t)-1)
1572 #if SIZEOF_WCHAR_T == 4
1573 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1574 #define WC_ENC wxFONTENCODING_UTF32
1575 #elif SIZEOF_WCHAR_T == 2
1576 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1577 #define WC_ENC wxFONTENCODING_UTF16
1578 #else // sizeof(wchar_t) != 2 nor 4
1579 // does this ever happen?
1580 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1583 // ----------------------------------------------------------------------------
1584 // wxMBConv_iconv: encapsulates an iconv character set
1585 // ----------------------------------------------------------------------------
1587 class wxMBConv_iconv
: public wxMBConv
1590 wxMBConv_iconv(const wxChar
*name
);
1591 virtual ~wxMBConv_iconv();
1593 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1594 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1596 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1597 virtual size_t GetMBNulLen() const;
1599 virtual wxMBConv
*Clone() const
1601 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
1602 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1607 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1610 // the iconv handlers used to translate from multibyte
1611 // to wide char and in the other direction
1616 // guards access to m2w and w2m objects
1617 wxMutex m_iconvMutex
;
1621 // the name (for iconv_open()) of a wide char charset -- if none is
1622 // available on this machine, it will remain NULL
1623 static wxString ms_wcCharsetName
;
1625 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1626 // different endian-ness than the native one
1627 static bool ms_wcNeedsSwap
;
1630 // name of the encoding handled by this conversion
1633 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1635 size_t m_minMBCharWidth
;
1638 // make the constructor available for unit testing
1639 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1641 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1642 if ( !result
->IsOk() )
1651 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1652 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1654 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1657 m_minMBCharWidth
= 0;
1659 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1660 // names for the charsets
1661 const wxCharBuffer
cname(wxString(name
).ToAscii());
1663 // check for charset that represents wchar_t:
1664 if ( ms_wcCharsetName
.empty() )
1666 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1669 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1670 #else // !wxUSE_FONTMAP
1671 static const wxChar
*names
[] =
1673 #if SIZEOF_WCHAR_T == 4
1675 #elif SIZEOF_WCHAR_T = 2
1680 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1682 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1684 const wxString
nameCS(*names
);
1686 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1687 wxString
nameXE(nameCS
);
1689 #ifdef WORDS_BIGENDIAN
1691 #else // little endian
1695 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1698 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1699 if ( m2w
== ICONV_T_INVALID
)
1701 // try charset w/o bytesex info (e.g. "UCS4")
1702 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1704 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1706 // and check for bytesex ourselves:
1707 if ( m2w
!= ICONV_T_INVALID
)
1709 char buf
[2], *bufPtr
;
1710 wchar_t wbuf
[2], *wbufPtr
;
1718 outsz
= SIZEOF_WCHAR_T
* 2;
1723 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1724 (char**)&wbufPtr
, &outsz
);
1726 if (ICONV_FAILED(res
, insz
))
1728 wxLogLastError(wxT("iconv"));
1729 wxLogError(_("Conversion to charset '%s' doesn't work."),
1732 else // ok, can convert to this encoding, remember it
1734 ms_wcCharsetName
= nameCS
;
1735 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1739 else // use charset not requiring byte swapping
1741 ms_wcCharsetName
= nameXE
;
1745 wxLogTrace(TRACE_STRCONV
,
1746 wxT("iconv wchar_t charset is \"%s\"%s"),
1747 ms_wcCharsetName
.empty() ? _T("<none>")
1748 : ms_wcCharsetName
.c_str(),
1749 ms_wcNeedsSwap
? _T(" (needs swap)")
1752 else // we already have ms_wcCharsetName
1754 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1757 if ( ms_wcCharsetName
.empty() )
1759 w2m
= ICONV_T_INVALID
;
1763 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1764 if ( w2m
== ICONV_T_INVALID
)
1766 wxLogTrace(TRACE_STRCONV
,
1767 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1768 ms_wcCharsetName
.c_str(), cname
.data());
1773 wxMBConv_iconv::~wxMBConv_iconv()
1775 if ( m2w
!= ICONV_T_INVALID
)
1777 if ( w2m
!= ICONV_T_INVALID
)
1781 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1783 // find the string length: notice that must be done differently for
1784 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1786 const size_t nulLen
= GetMBNulLen();
1790 return wxCONV_FAILED
;
1793 inbuf
= strlen(psz
); // arguably more optimized than our version
// for UTF-16/32 not only do we need to have 2/4 consecutive NULs but
// they also have to start at a character boundary and not span two
// adjacent characters
1802 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
// NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
// Unfortunately there are a couple of global wxCSConv objects such as
// wxConvLocal that are used all over wx code, so we have to make sure
// the handle is used by at most one thread at a time. Otherwise
// only a few wx classes would be safe to use from non-main threads
// as MB<->WC conversion would fail "randomly".
1815 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1816 #endif // wxUSE_THREADS
1818 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1820 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1821 wchar_t *bufPtr
= buf
;
1822 const char *pszPtr
= psz
;
1826 // have destination buffer, convert there
1828 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1829 (char**)&bufPtr
, &outbuf
);
1830 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1834 // convert to native endianness
1835 for ( unsigned i
= 0; i
< res
; i
++ )
1836 buf
[n
] = WC_BSWAP(buf
[i
]);
1839 // NUL-terminate the string if there is any space left
1845 // no destination buffer... convert using temp buffer
1846 // to calculate destination buffer requirement
1853 outbuf
= 8 * SIZEOF_WCHAR_T
;
1856 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1857 (char**)&bufPtr
, &outbuf
);
1859 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
1861 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1864 if (ICONV_FAILED(cres
, inbuf
))
1866 //VS: it is ok if iconv fails, hence trace only
1867 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1868 return wxCONV_FAILED
;
1874 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1877 // NB: explained in MB2WC
1878 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1881 size_t inlen
= wxWcslen(psz
);
1882 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1886 wchar_t *tmpbuf
= 0;
1890 // need to copy to temp buffer to switch endianness
1891 // (doing WC_BSWAP twice on the original buffer won't help, as it
1892 // could be in read-only memory, or be accessed in some other thread)
1893 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1894 for ( size_t i
= 0; i
< inlen
; i
++ )
1895 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1897 tmpbuf
[inlen
] = L
'\0';
1903 // have destination buffer, convert there
1904 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1908 // NB: iconv was given only wcslen(psz) characters on input, and so
1909 // it couldn't convert the trailing zero. Let's do it ourselves
1910 // if there's some room left for it in the output buffer.
1916 // no destination buffer: convert using temp buffer
1917 // to calculate destination buffer requirement
1925 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1929 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1937 if (ICONV_FAILED(cres
, inbuf
))
1939 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1940 return wxCONV_FAILED
;
1946 size_t wxMBConv_iconv::GetMBNulLen() const
1948 if ( m_minMBCharWidth
== 0 )
1950 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1953 // NB: explained in MB2WC
1954 wxMutexLocker
lock(self
->m_iconvMutex
);
// wide NUL used as the conversion input; a string literal may only be
// bound to a pointer-to-const (the conversion to plain wchar_t* is
// ill-formed since C++11)
const wchar_t *wnul = L"";
1958 char buf
[8]; // should be enough for NUL in any encoding
1959 size_t inLen
= sizeof(wchar_t),
1960 outLen
= WXSIZEOF(buf
);
1961 char *inBuff
= (char *)wnul
;
1962 char *outBuff
= buf
;
1963 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
1965 self
->m_minMBCharWidth
= (size_t)-1;
1969 self
->m_minMBCharWidth
= outBuff
- buf
;
1973 return m_minMBCharWidth
;
1976 #endif // HAVE_ICONV
1979 // ============================================================================
1980 // Win32 conversion classes
1981 // ============================================================================
1983 #ifdef wxHAVE_WIN32_MB2WC
1987 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1988 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1991 class wxMBConv_win32
: public wxMBConv
1996 m_CodePage
= CP_ACP
;
1997 m_minMBCharWidth
= 0;
2000 wxMBConv_win32(const wxMBConv_win32
& conv
)
2003 m_CodePage
= conv
.m_CodePage
;
2004 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2008 wxMBConv_win32(const wxChar
* name
)
2010 m_CodePage
= wxCharsetToCodepage(name
);
2011 m_minMBCharWidth
= 0;
2014 wxMBConv_win32(wxFontEncoding encoding
)
2016 m_CodePage
= wxEncodingToCodepage(encoding
);
2017 m_minMBCharWidth
= 0;
2019 #endif // wxUSE_FONTMAP
2021 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// note that we have to use MB_ERR_INVALID_CHARS flag as without it
// the behaviour is not compatible with the Unix version (using iconv)
// and breaks the library itself, e.g. wxTextInputStream::NextChar()
// wouldn't work if reading an incomplete MB char didn't result in an
2029 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2030 // Win XP or newer and it is not supported for UTF-[78] so we always
2031 // use our own conversions in this case. See
2032 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2033 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2034 if ( m_CodePage
== CP_UTF8
)
2036 return wxConvUTF8
.MB2WC(buf
, psz
, n
);
2039 if ( m_CodePage
== CP_UTF7
)
2041 return wxConvUTF7
.MB2WC(buf
, psz
, n
);
2045 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2046 IsAtLeastWin2kSP4() )
2048 flags
= MB_ERR_INVALID_CHARS
;
2051 const size_t len
= ::MultiByteToWideChar
2053 m_CodePage
, // code page
2054 flags
, // flags: fall on error
2055 psz
, // input string
2056 -1, // its length (NUL-terminated)
2057 buf
, // output string
2058 buf
? n
: 0 // size of output buffer
2062 // function totally failed
2063 return wxCONV_FAILED
;
2066 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2067 // check if we succeeded, by doing a double trip:
2068 if ( !flags
&& buf
)
2070 const size_t mbLen
= strlen(psz
);
2071 wxCharBuffer
mbBuf(mbLen
);
2072 if ( ::WideCharToMultiByte
2079 mbLen
+ 1, // size in bytes, not length
2083 strcmp(mbBuf
, psz
) != 0 )
2085 // we didn't obtain the same thing we started from, hence
2086 // the conversion was lossy and we consider that it failed
2087 return wxCONV_FAILED
;
2091 // note that it returns count of written chars for buf != NULL and size
2092 // of the needed buffer for buf == NULL so in either case the length of
2093 // the string (which never includes the terminating NUL) is one less
2097 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2100 we have a problem here: by default, WideCharToMultiByte() may
2101 replace characters unrepresentable in the target code page with bad
2102 quality approximations such as turning "1/2" symbol (U+00BD) into
2103 "1" for the code pages which don't have it and we, obviously, want
2104 to avoid this at any price
2106 the trouble is that this function does it _silently_, i.e. it won't
2107 even tell us whether it did or not... Win98/2000 and higher provide
2108 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2109 we have to resort to a round trip, i.e. check that converting back
2110 results in the same string -- this is, of course, expensive but
2111 otherwise we simply can't be sure to not garble the data.
2114 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2115 // it doesn't work with CJK encodings (which we test for rather roughly
2116 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2118 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2121 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2123 // it's our lucky day
2124 flags
= WC_NO_BEST_FIT_CHARS
;
2125 pUsedDef
= &usedDef
;
2127 else // old system or unsupported encoding
2133 const size_t len
= ::WideCharToMultiByte
2135 m_CodePage
, // code page
2136 flags
, // either none or no best fit
2137 pwz
, // input string
2138 -1, // it is (wide) NUL-terminated
2139 buf
, // output buffer
2140 buf
? n
: 0, // and its size
2141 NULL
, // default "replacement" char
2142 pUsedDef
// [out] was it used?
2147 // function totally failed
2148 return wxCONV_FAILED
;
2151 // if we were really converting, check if we succeeded
2156 // check if the conversion failed, i.e. if any replacements
2159 return wxCONV_FAILED
;
2161 else // we must resort to double tripping...
2163 wxWCharBuffer
wcBuf(n
);
2164 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2165 wcscmp(wcBuf
, pwz
) != 0 )
2167 // we didn't obtain the same thing we started from, hence
2168 // the conversion was lossy and we consider that it failed
2169 return wxCONV_FAILED
;
2174 // see the comment above for the reason of "len - 1"
2178 virtual size_t GetMBNulLen() const
2180 if ( m_minMBCharWidth
== 0 )
2182 int len
= ::WideCharToMultiByte
2184 m_CodePage
, // code page
2186 L
"", // input string
2187 1, // translate just the NUL
2188 NULL
, // output buffer
2190 NULL
, // no replacement char
2191 NULL
// [out] don't care if it was used
2194 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2198 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2199 self
->m_minMBCharWidth
= (size_t)-1;
2203 self
->m_minMBCharWidth
= (size_t)-1;
2209 self
->m_minMBCharWidth
= len
;
2214 return m_minMBCharWidth
;
2217 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2219 bool IsOk() const { return m_CodePage
!= -1; }
2222 static bool CanUseNoBestFit()
2224 static int s_isWin98Or2k
= -1;
2226 if ( s_isWin98Or2k
== -1 )
2229 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2232 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2236 s_isWin98Or2k
= verMaj
>= 5;
2240 // unknown: be conservative by default
2245 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2248 return s_isWin98Or2k
== 1;
2251 static bool IsAtLeastWin2kSP4()
2256 static int s_isAtLeastWin2kSP4
= -1;
2258 if ( s_isAtLeastWin2kSP4
== -1 )
2260 OSVERSIONINFOEX ver
;
2262 memset(&ver
, 0, sizeof(ver
));
2263 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2264 GetVersionEx((OSVERSIONINFO
*)&ver
);
2266 s_isAtLeastWin2kSP4
=
2267 ((ver
.dwMajorVersion
> 5) || // Vista+
2268 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2269 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2270 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2274 return s_isAtLeastWin2kSP4
== 1;
2279 // the code page we're working with
2282 // cached result of GetMBNulLen(), set to 0 initially meaning
2284 size_t m_minMBCharWidth
;
2287 #endif // wxHAVE_WIN32_MB2WC
2289 // ============================================================================
2290 // Cocoa conversion classes
2291 // ============================================================================
2293 #if defined(__WXCOCOA__)
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2299 #include <CoreFoundation/CFString.h>
2300 #include <CoreFoundation/CFStringEncodingExt.h>
2302 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2304 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2308 case wxFONTENCODING_DEFAULT
:
2309 enc
= CFStringGetSystemEncoding();
2312 case wxFONTENCODING_ISO8859_1
:
2313 enc
= kCFStringEncodingISOLatin1
;
2315 case wxFONTENCODING_ISO8859_2
:
2316 enc
= kCFStringEncodingISOLatin2
;
2318 case wxFONTENCODING_ISO8859_3
:
2319 enc
= kCFStringEncodingISOLatin3
;
2321 case wxFONTENCODING_ISO8859_4
:
2322 enc
= kCFStringEncodingISOLatin4
;
2324 case wxFONTENCODING_ISO8859_5
:
2325 enc
= kCFStringEncodingISOLatinCyrillic
;
2327 case wxFONTENCODING_ISO8859_6
:
2328 enc
= kCFStringEncodingISOLatinArabic
;
2330 case wxFONTENCODING_ISO8859_7
:
2331 enc
= kCFStringEncodingISOLatinGreek
;
2333 case wxFONTENCODING_ISO8859_8
:
2334 enc
= kCFStringEncodingISOLatinHebrew
;
2336 case wxFONTENCODING_ISO8859_9
:
2337 enc
= kCFStringEncodingISOLatin5
;
2339 case wxFONTENCODING_ISO8859_10
:
2340 enc
= kCFStringEncodingISOLatin6
;
2342 case wxFONTENCODING_ISO8859_11
:
2343 enc
= kCFStringEncodingISOLatinThai
;
2345 case wxFONTENCODING_ISO8859_13
:
2346 enc
= kCFStringEncodingISOLatin7
;
2348 case wxFONTENCODING_ISO8859_14
:
2349 enc
= kCFStringEncodingISOLatin8
;
2351 case wxFONTENCODING_ISO8859_15
:
2352 enc
= kCFStringEncodingISOLatin9
;
2355 case wxFONTENCODING_KOI8
:
2356 enc
= kCFStringEncodingKOI8_R
;
2358 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2359 enc
= kCFStringEncodingDOSRussian
;
2362 // case wxFONTENCODING_BULGARIAN :
2366 case wxFONTENCODING_CP437
:
2367 enc
= kCFStringEncodingDOSLatinUS
;
2369 case wxFONTENCODING_CP850
:
2370 enc
= kCFStringEncodingDOSLatin1
;
2372 case wxFONTENCODING_CP852
:
2373 enc
= kCFStringEncodingDOSLatin2
;
2375 case wxFONTENCODING_CP855
:
2376 enc
= kCFStringEncodingDOSCyrillic
;
2378 case wxFONTENCODING_CP866
:
2379 enc
= kCFStringEncodingDOSRussian
;
2381 case wxFONTENCODING_CP874
:
2382 enc
= kCFStringEncodingDOSThai
;
2384 case wxFONTENCODING_CP932
:
2385 enc
= kCFStringEncodingDOSJapanese
;
2387 case wxFONTENCODING_CP936
:
2388 enc
= kCFStringEncodingDOSChineseSimplif
;
2390 case wxFONTENCODING_CP949
:
2391 enc
= kCFStringEncodingDOSKorean
;
2393 case wxFONTENCODING_CP950
:
2394 enc
= kCFStringEncodingDOSChineseTrad
;
2396 case wxFONTENCODING_CP1250
:
2397 enc
= kCFStringEncodingWindowsLatin2
;
2399 case wxFONTENCODING_CP1251
:
2400 enc
= kCFStringEncodingWindowsCyrillic
;
2402 case wxFONTENCODING_CP1252
:
2403 enc
= kCFStringEncodingWindowsLatin1
;
2405 case wxFONTENCODING_CP1253
:
2406 enc
= kCFStringEncodingWindowsGreek
;
2408 case wxFONTENCODING_CP1254
:
2409 enc
= kCFStringEncodingWindowsLatin5
;
2411 case wxFONTENCODING_CP1255
:
2412 enc
= kCFStringEncodingWindowsHebrew
;
2414 case wxFONTENCODING_CP1256
:
2415 enc
= kCFStringEncodingWindowsArabic
;
2417 case wxFONTENCODING_CP1257
:
2418 enc
= kCFStringEncodingWindowsBalticRim
;
2420 // This only really encodes to UTF7 (if that) evidently
2421 // case wxFONTENCODING_UTF7 :
2422 // enc = kCFStringEncodingNonLossyASCII ;
2424 case wxFONTENCODING_UTF8
:
2425 enc
= kCFStringEncodingUTF8
;
2427 case wxFONTENCODING_EUC_JP
:
2428 enc
= kCFStringEncodingEUC_JP
;
2430 case wxFONTENCODING_UTF16
:
2431 enc
= kCFStringEncodingUnicode
;
2433 case wxFONTENCODING_MACROMAN
:
2434 enc
= kCFStringEncodingMacRoman
;
2436 case wxFONTENCODING_MACJAPANESE
:
2437 enc
= kCFStringEncodingMacJapanese
;
2439 case wxFONTENCODING_MACCHINESETRAD
:
2440 enc
= kCFStringEncodingMacChineseTrad
;
2442 case wxFONTENCODING_MACKOREAN
:
2443 enc
= kCFStringEncodingMacKorean
;
2445 case wxFONTENCODING_MACARABIC
:
2446 enc
= kCFStringEncodingMacArabic
;
2448 case wxFONTENCODING_MACHEBREW
:
2449 enc
= kCFStringEncodingMacHebrew
;
2451 case wxFONTENCODING_MACGREEK
:
2452 enc
= kCFStringEncodingMacGreek
;
2454 case wxFONTENCODING_MACCYRILLIC
:
2455 enc
= kCFStringEncodingMacCyrillic
;
2457 case wxFONTENCODING_MACDEVANAGARI
:
2458 enc
= kCFStringEncodingMacDevanagari
;
2460 case wxFONTENCODING_MACGURMUKHI
:
2461 enc
= kCFStringEncodingMacGurmukhi
;
2463 case wxFONTENCODING_MACGUJARATI
:
2464 enc
= kCFStringEncodingMacGujarati
;
2466 case wxFONTENCODING_MACORIYA
:
2467 enc
= kCFStringEncodingMacOriya
;
2469 case wxFONTENCODING_MACBENGALI
:
2470 enc
= kCFStringEncodingMacBengali
;
2472 case wxFONTENCODING_MACTAMIL
:
2473 enc
= kCFStringEncodingMacTamil
;
2475 case wxFONTENCODING_MACTELUGU
:
2476 enc
= kCFStringEncodingMacTelugu
;
2478 case wxFONTENCODING_MACKANNADA
:
2479 enc
= kCFStringEncodingMacKannada
;
2481 case wxFONTENCODING_MACMALAJALAM
:
2482 enc
= kCFStringEncodingMacMalayalam
;
2484 case wxFONTENCODING_MACSINHALESE
:
2485 enc
= kCFStringEncodingMacSinhalese
;
2487 case wxFONTENCODING_MACBURMESE
:
2488 enc
= kCFStringEncodingMacBurmese
;
2490 case wxFONTENCODING_MACKHMER
:
2491 enc
= kCFStringEncodingMacKhmer
;
2493 case wxFONTENCODING_MACTHAI
:
2494 enc
= kCFStringEncodingMacThai
;
2496 case wxFONTENCODING_MACLAOTIAN
:
2497 enc
= kCFStringEncodingMacLaotian
;
2499 case wxFONTENCODING_MACGEORGIAN
:
2500 enc
= kCFStringEncodingMacGeorgian
;
2502 case wxFONTENCODING_MACARMENIAN
:
2503 enc
= kCFStringEncodingMacArmenian
;
2505 case wxFONTENCODING_MACCHINESESIMP
:
2506 enc
= kCFStringEncodingMacChineseSimp
;
2508 case wxFONTENCODING_MACTIBETAN
:
2509 enc
= kCFStringEncodingMacTibetan
;
2511 case wxFONTENCODING_MACMONGOLIAN
:
2512 enc
= kCFStringEncodingMacMongolian
;
2514 case wxFONTENCODING_MACETHIOPIC
:
2515 enc
= kCFStringEncodingMacEthiopic
;
2517 case wxFONTENCODING_MACCENTRALEUR
:
2518 enc
= kCFStringEncodingMacCentralEurRoman
;
2520 case wxFONTENCODING_MACVIATNAMESE
:
2521 enc
= kCFStringEncodingMacVietnamese
;
2523 case wxFONTENCODING_MACARABICEXT
:
2524 enc
= kCFStringEncodingMacExtArabic
;
2526 case wxFONTENCODING_MACSYMBOL
:
2527 enc
= kCFStringEncodingMacSymbol
;
2529 case wxFONTENCODING_MACDINGBATS
:
2530 enc
= kCFStringEncodingMacDingbats
;
2532 case wxFONTENCODING_MACTURKISH
:
2533 enc
= kCFStringEncodingMacTurkish
;
2535 case wxFONTENCODING_MACCROATIAN
:
2536 enc
= kCFStringEncodingMacCroatian
;
2538 case wxFONTENCODING_MACICELANDIC
:
2539 enc
= kCFStringEncodingMacIcelandic
;
2541 case wxFONTENCODING_MACROMANIAN
:
2542 enc
= kCFStringEncodingMacRomanian
;
2544 case wxFONTENCODING_MACCELTIC
:
2545 enc
= kCFStringEncodingMacCeltic
;
2547 case wxFONTENCODING_MACGAELIC
:
2548 enc
= kCFStringEncodingMacGaelic
;
2550 // case wxFONTENCODING_MACKEYBOARD :
2551 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2555 // because gcc is picky
2562 class wxMBConv_cocoa
: public wxMBConv
2567 Init(CFStringGetSystemEncoding()) ;
2570 wxMBConv_cocoa(const wxMBConv_cocoa
& conv
)
2572 m_encoding
= conv
.m_encoding
;
2576 wxMBConv_cocoa(const wxChar
* name
)
2578 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2582 wxMBConv_cocoa(wxFontEncoding encoding
)
2584 Init( wxCFStringEncFromFontEnc(encoding
) );
2591 void Init( CFStringEncoding encoding
)
2593 m_encoding
= encoding
;
2596 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2600 CFStringRef theString
= CFStringCreateWithBytes (
2601 NULL
, //the allocator
2602 (const UInt8
*)szUnConv
,
2605 false //no BOM/external representation
2608 wxASSERT(theString
);
2610 size_t nOutLength
= CFStringGetLength(theString
);
2614 CFRelease(theString
);
2618 CFRange theRange
= { 0, nOutSize
};
2620 #if SIZEOF_WCHAR_T == 4
2621 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2624 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2626 CFRelease(theString
);
2628 szUniCharBuffer
[nOutLength
] = '\0';
2630 #if SIZEOF_WCHAR_T == 4
2631 wxMBConvUTF16 converter
;
2632 converter
.MB2WC( szOut
, (const char*)szUniCharBuffer
, nOutSize
);
2633 delete [] szUniCharBuffer
;
2639 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2643 size_t nRealOutSize
;
2644 size_t nBufSize
= wxWcslen(szUnConv
);
2645 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2647 #if SIZEOF_WCHAR_T == 4
2648 wxMBConvUTF16 converter
;
2649 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2650 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1];
2651 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
));
2652 nBufSize
/= sizeof(UniChar
);
2655 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2659 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2662 wxASSERT(theString
);
2664 //Note that CER puts a BOM when converting to unicode
2665 //so we check and use getchars instead in that case
2666 if (m_encoding
== kCFStringEncodingUnicode
)
2669 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2671 nRealOutSize
= CFStringGetLength(theString
) + 1;
2677 CFRangeMake(0, CFStringGetLength(theString
)),
2679 0, //what to put in characters that can't be converted -
2680 //0 tells CFString to return NULL if it meets such a character
2681 false, //not an external representation
2684 (CFIndex
*) &nRealOutSize
2688 CFRelease(theString
);
2690 #if SIZEOF_WCHAR_T == 4
2691 delete[] szUniBuffer
;
2694 return nRealOutSize
- 1;
2697 virtual wxMBConv
*Clone() const { return new wxMBConv_cocoa(*this); }
2701 return m_encoding
!= kCFStringEncodingInvalidId
&&
2702 CFStringIsEncodingAvailable(m_encoding
);
2706 CFStringEncoding m_encoding
;
2709 #endif // defined(__WXCOCOA__)
2711 // ============================================================================
2712 // Mac conversion classes
2713 // ============================================================================
2715 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2717 class wxMBConv_mac
: public wxMBConv
2722 Init(CFStringGetSystemEncoding()) ;
2725 wxMBConv_mac(const wxMBConv_mac
& conv
)
2727 Init(conv
.m_char_encoding
);
2731 wxMBConv_mac(const wxChar
* name
)
2733 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) );
2737 wxMBConv_mac(wxFontEncoding encoding
)
2739 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2744 OSStatus status
= noErr
;
2745 status
= TECDisposeConverter(m_MB2WC_converter
);
2746 status
= TECDisposeConverter(m_WC2MB_converter
);
2750 void Init( TextEncodingBase encoding
)
2752 OSStatus status
= noErr
;
2753 m_char_encoding
= encoding
;
2754 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
, 0, kUnicode16BitFormat
) ;
2756 status
= TECCreateConverter(&m_MB2WC_converter
,
2758 m_unicode_encoding
);
2759 status
= TECCreateConverter(&m_WC2MB_converter
,
2764 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2766 OSStatus status
= noErr
;
2767 ByteCount byteOutLen
;
2768 ByteCount byteInLen
= strlen(psz
) + 1;
2769 wchar_t *tbuf
= NULL
;
2770 UniChar
* ubuf
= NULL
;
2775 // Apple specs say at least 32
2776 n
= wxMax( 32, byteInLen
) ;
2777 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2780 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2782 #if SIZEOF_WCHAR_T == 4
2783 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2785 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2788 status
= TECConvertText(
2789 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2790 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2792 #if SIZEOF_WCHAR_T == 4
2793 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2794 // is not properly terminated we get random characters at the end
2795 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2796 wxMBConvUTF16 converter
;
2797 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
2800 res
= byteOutLen
/ sizeof( UniChar
) ;
2806 if ( buf
&& res
< n
)
2812 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2814 OSStatus status
= noErr
;
2815 ByteCount byteOutLen
;
2816 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2822 // Apple specs say at least 32
2823 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2824 tbuf
= (char*) malloc( n
) ;
2827 ByteCount byteBufferLen
= n
;
2828 UniChar
* ubuf
= NULL
;
2830 #if SIZEOF_WCHAR_T == 4
2831 wxMBConvUTF16 converter
;
2832 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2833 byteInLen
= unicharlen
;
2834 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2835 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2837 ubuf
= (UniChar
*) psz
;
2840 status
= TECConvertText(
2841 m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2842 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2844 #if SIZEOF_WCHAR_T == 4
2851 size_t res
= byteOutLen
;
2852 if ( buf
&& res
< n
)
2856 //we need to double-trip to verify it didn't insert any ? in place
2857 //of bogus characters
2858 wxWCharBuffer
wcBuf(n
);
2859 size_t pszlen
= wxWcslen(psz
);
2860 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2861 wxWcslen(wcBuf
) != pszlen
||
2862 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2864 // we didn't obtain the same thing we started from, hence
2865 // the conversion was lossy and we consider that it failed
2866 return wxCONV_FAILED
;
2873 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
2876 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2879 TECObjectRef m_MB2WC_converter
;
2880 TECObjectRef m_WC2MB_converter
;
2882 TextEncodingBase m_char_encoding
;
2883 TextEncodingBase m_unicode_encoding
;
2886 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2888 // ============================================================================
2889 // wxEncodingConverter based conversion classes
2890 // ============================================================================
2894 class wxMBConv_wxwin
: public wxMBConv
2899 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2900 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2904 // temporarily just use wxEncodingConverter stuff,
2905 // so that it works while a better implementation is built
2906 wxMBConv_wxwin(const wxChar
* name
)
2909 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2911 m_enc
= wxFONTENCODING_SYSTEM
;
2916 wxMBConv_wxwin(wxFontEncoding enc
)
2923 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2925 size_t inbuf
= strlen(psz
);
2928 if (!m2w
.Convert(psz
, buf
))
2929 return wxCONV_FAILED
;
2934 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2936 const size_t inbuf
= wxWcslen(psz
);
2939 if (!w2m
.Convert(psz
, buf
))
2940 return wxCONV_FAILED
;
2946 virtual size_t GetMBNulLen() const
2950 case wxFONTENCODING_UTF16BE
:
2951 case wxFONTENCODING_UTF16LE
:
2954 case wxFONTENCODING_UTF32BE
:
2955 case wxFONTENCODING_UTF32LE
:
2963 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2965 bool IsOk() const { return m_ok
; }
2968 wxFontEncoding m_enc
;
2969 wxEncodingConverter m2w
, w2m
;
2972 // were we initialized successfully?
2975 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2978 // make the constructors available for unit testing
2979 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2981 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2982 if ( !result
->IsOk() )
2991 #endif // wxUSE_FONTMAP
2993 // ============================================================================
2994 // wxCSConv implementation
2995 // ============================================================================
2997 void wxCSConv::Init()
// Construct a converter from a charset name. m_encoding is either looked up
// with wxFontMapperBase::GetEncodingFromName(charset) or set to
// wxFONTENCODING_SYSTEM.
// NOTE(review): what selects between the two assignments is not part of this
// listing -- presumably a wxUSE_FONTMAP check; confirm.
3004 wxCSConv::wxCSConv(const wxChar
*charset
)
3014 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
3016 m_encoding
= wxFONTENCODING_SYSTEM
;
// Construct a converter from a wxFontEncoding value.
3020 wxCSConv::wxCSConv(wxFontEncoding encoding
)
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not accepted here: they
// are reported with wxFAIL_MSG and replaced by wxFONTENCODING_SYSTEM
3022 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
3024 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3026 encoding
= wxFONTENCODING_SYSTEM
;
// remember the (possibly corrected) encoding
3031 m_encoding
= encoding
;
3034 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name of conv through SetName() and
// copies its encoding.
3039 wxCSConv::wxCSConv(const wxCSConv
& conv
)
3044 SetName(conv
.m_name
);
3045 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the charset name of conv through SetName() and copies
// its encoding.
// NOTE(review): SetName() reads conv.m_name, so self-assignment is only safe
// if nothing releases m_name before that call -- check the statements that
// are not part of this listing.
3048 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
3052 SetName(conv
.m_name
);
3053 m_encoding
= conv
.m_encoding
;
3058 void wxCSConv::Clear()
// Stores the charset name: m_name receives a copy of charset made with
// wxStrdup(), not the caller's pointer.
// NOTE(review): presumably guarded against a null charset -- confirm.
3067 void wxCSConv::SetName(const wxChar
*charset
)
3071 m_name
= wxStrdup(charset
);
// Hash map type from wxFontEncoding to wxString, and the file-local instance
// of it. wxCSConv::DoCreate() stores here, per encoding, a charset name it
// handed to wxMBConv_iconv, or an empty string to cache a failure.
3078 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3079 wxEncodingNameCache
);
3081 static wxEncodingNameCache gs_nameCache
;
// Creates the conversion object actually used for m_name / m_encoding.
// Candidates are tried in the order given in the comment below: OS facilities
// first, then the built-in UTF converters, then wxEncodingConverter; if all of
// them fail an error is logged.
3084 wxMBConv
*wxCSConv::DoCreate() const
3087 wxLogTrace(TRACE_STRCONV
,
3088 wxT("creating conversion for %s"),
3090 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
3091 #endif // wxUSE_FONTMAP
3093 // check for the special case of ASCII or ISO8859-1 charset: as we have
3094 // special knowledge of it anyhow, we don't need to create a special
3095 // conversion object
3096 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3097 m_encoding
== wxFONTENCODING_DEFAULT
)
3099 // don't convert at all
3103 // we trust OS to do conversion better than we can so try external
3104 // conversion methods first
3106 // the full order is:
3107 // 1. OS conversion (iconv() under Unix or Win32 API)
3108 // 2. hard coded conversions for UTF
3109 // 3. wxEncodingConverter as fall back
3115 #endif // !wxUSE_FONTMAP
// work on local copies of the name and the encoding
3117 wxString
name(m_name
);
3118 wxFontEncoding
encoding(m_encoding
);
// iconv, attempt 1: an explicitly given charset name is passed to iconv as is
3120 if ( !name
.empty() )
3122 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
// NOTE(review): presumably the result is stored in `encoding` so that the
// per-encoding lookup below can be used when the name itself did not work --
// the assignment target is not part of this listing, confirm
3130 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
3131 #endif // wxUSE_FONTMAP
// iconv, attempt 2: see whether a name is already cached for this encoding
3135 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3136 if ( it
!= gs_nameCache
.end() )
// an empty cached name is the marker of an earlier failure (see below)
3138 if ( it
->second
.empty() )
3141 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
// iconv, attempt 3: nothing cached, go through all known names of the encoding
3148 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3150 for ( ; *names
; ++names
)
3152 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
// remember the name for the next time
// NOTE(review): presumably done only for a converter that is ok -- confirm
3155 gs_nameCache
[encoding
] = *names
;
3162 gs_nameCache
[encoding
] = _T(""); // cache the failure
3164 #endif // wxUSE_FONTMAP
3166 #endif // HAVE_ICONV
3168 #ifdef wxHAVE_WIN32_MB2WC
// Win32 converter: built from the name if there is one, else from the encoding
3171 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3172 : new wxMBConv_win32(m_encoding
);
3181 #endif // wxHAVE_WIN32_MB2WC
3183 #if defined(__WXMAC__)
3185 // leave UTF16 and UTF32 to the built-ins of wx
3186 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3187 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
// Mac converter: built from the name if there is one, else from the encoding
3190 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
3191 : new wxMBConv_mac(m_encoding
);
// variant of the above constructing from the encoding only
3193 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
3203 #if defined(__WXCOCOA__)
3205 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
// Cocoa converter: built from the name if there is one, else from the encoding
3208 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
3209 : new wxMBConv_cocoa(m_encoding
);
// variant of the above constructing from the encoding only
3211 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
// built-in UTF converters: start from the stored encoding ...
3222 wxFontEncoding enc
= m_encoding
;
// ... and resolve it from the charset name if only the name is known
3224 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3226 // use "false" to suppress interactive dialogs -- we can be called from
3227 // anywhere and popping up a dialog from here is the last thing we want to do
3229 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3231 #endif // wxUSE_FONTMAP
// each UTF encoding has its own hard coded converter class
3235 case wxFONTENCODING_UTF7
:
3236 return new wxMBConvUTF7
;
3238 case wxFONTENCODING_UTF8
:
3239 return new wxMBConvUTF8
;
3241 case wxFONTENCODING_UTF16BE
:
3242 return new wxMBConvUTF16BE
;
3244 case wxFONTENCODING_UTF16LE
:
3245 return new wxMBConvUTF16LE
;
3247 case wxFONTENCODING_UTF32BE
:
3248 return new wxMBConvUTF32BE
;
3250 case wxFONTENCODING_UTF32LE
:
3251 return new wxMBConvUTF32LE
;
3254 // nothing to do but put here to suppress gcc warnings
// last resort: the wxEncodingConverter based class, built from the name if
// there is one, else from the encoding
3261 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3262 : new wxMBConv_wxwin(m_encoding
);
3268 #endif // wxUSE_FONTMAP
3270 // NB: This is a hack to prevent deadlock. What could otherwise happen
3271 // in Unicode build: wxConvLocal creation ends up being here
3272 // because of some failure and logs the error. But wxLog will try to
3273 // attach timestamp, for which it will need wxConvLocal (to convert
3274 // time to char* and then wchar_t*), but that fails, tries to log
3275 // error, but wxLog has a (already locked) critical section that
3276 // guards static buffer.
// the flag is set for the duration of the wxLogError() call so that a nested
// call arriving here while the error is being logged does not log again
3277 static bool alreadyLoggingError
= false;
3278 if (!alreadyLoggingError
)
3280 alreadyLoggingError
= true;
// NOTE(review): in the !wxUSE_FONTMAP branch below m_encoding (a
// wxFontEncoding value) is formatted with "%s" -- this looks like a format
// specifier / argument mismatch, confirm and fix
3281 wxLogError(_("Cannot convert from the charset '%s'!"),
3285 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3286 #else // !wxUSE_FONTMAP
3287 wxString::Format(_("encoding %s"), m_encoding
).c_str()
3288 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3291 alreadyLoggingError
= false;
// Creates the real conversion object on demand: the result of DoCreate() is
// stored in m_convReal and m_deferred is reset. The method is const, so the
// members are modified through a non-const pointer to this object.
3297 void wxCSConv::CreateConvIfNeeded() const
3301 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3304 // if we have neither the name nor the encoding, use the default
3305 // encoding for this system
3306 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
// m_name receives its own copy of the system encoding name
3308 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3310 #endif // wxUSE_INTL
3312 self
->m_convReal
= DoCreate();
// creation has been attempted, it is no longer pending
3313 self
->m_deferred
= false;
// Multibyte -> wide conversion. The real converter is created first if that
// has not happened yet.
3317 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3319 CreateConvIfNeeded();
// delegate to the real converter
// NOTE(review): presumably only when m_convReal is not null -- confirm
3322 return m_convReal
->MB2WC(buf
, psz
, n
);
// fallback without a converter object: length of the input in bytes
3325 size_t len
= strlen(psz
);
// copy byte by byte; c <= len also copies the terminating NUL, and the cast to
// unsigned char keeps bytes >= 0x80 from being sign-extended where char is
// signed
3329 for (size_t c
= 0; c
<= len
; c
++)
3330 buf
[c
] = (unsigned char)(psz
[c
]);
// Wide -> multibyte conversion. The real converter is created first if that
// has not happened yet.
3336 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3338 CreateConvIfNeeded();
// delegate to the real converter
// NOTE(review): presumably only when m_convReal is not null -- confirm
3341 return m_convReal
->WC2MB(buf
, psz
, n
);
// fallback without a converter object: length of the input in wide characters
3344 const size_t len
= wxWcslen(psz
);
// first loop: narrows each character into buf; c <= len includes the
// terminating NUL
// NOTE(review): wxCONV_FAILED is presumably returned for characters that do
// not fit into a single byte -- the condition is not part of this listing
3347 for (size_t c
= 0; c
<= len
; c
++)
3350 return wxCONV_FAILED
;
3352 buf
[c
] = (char)psz
[c
];
// second loop: same failure check but nothing is written
// NOTE(review): presumably the variant used when buf is null -- confirm
3357 for (size_t c
= 0; c
<= len
; c
++)
3360 return wxCONV_FAILED
;
// Length of the terminating NUL: the real converter is created first if
// needed and then asked for its own GetMBNulLen().
// NOTE(review): the result used when there is no real converter is not part
// of this listing.
3367 size_t wxCSConv::GetMBNulLen() const
3369 CreateConvIfNeeded();
3373 return m_convReal
->GetMBNulLen();
3379 // ----------------------------------------------------------------------------
3381 // ----------------------------------------------------------------------------
// File-local converter objects behind the exported references and pointers
// defined further down. The class of wxConvLibcObj depends on the platform:
// wxMBConv_win32, wxMBConv_mac or wxMBConvLibc.
3384 static wxMBConv_win32 wxConvLibcObj
;
3385 #elif defined(__WXMAC__) && !defined(__MACH__)
3386 static wxMBConv_mac wxConvLibcObj
;
3388 static wxMBConvLibc wxConvLibcObj
;
// wxCSConv objects for the system encoding and for ISO8859-1
3391 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3392 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
// the UTF-7 and UTF-8 converters
3393 static wxMBConvUTF7 wxConvUTF7Obj
;
3394 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported references, each bound to the matching static object above
3396 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3397 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3398 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3399 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3400 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
// exported pointers: wxConvCurrent starts out pointing to the libc converter,
// wxConvUI to wxConvLocal
3401 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3402 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= &wxConvLocal
;
// NOTE(review): the initializer of wxConvFileName is cut off in this listing
3403 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3410 #else // !wxUSE_WCHAR_T
3412 // stand-ins in absence of wchar_t
3413 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3418 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T