1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
35 #include "wx/strconv.h"
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
62 #include "wx/thread.h"
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
76 #include "wx/mac/private.h" // includes mac headers
79 #define TRACE_STRCONV _T("strconv")
81 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
83 #if SIZEOF_WCHAR_T == 2
87 // ============================================================================
89 // ============================================================================
91 // helper function of cMB2WC(): check if n bytes at this location are all NUL
92 static bool NotAllNULs(const char *p
, size_t n
)
94 while ( n
&& *p
++ == '\0' )
100 // ----------------------------------------------------------------------------
101 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
102 // ----------------------------------------------------------------------------
104 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
109 *output
= (wxUint16
) input
;
112 else if (input
>=0x110000)
114 return wxCONV_FAILED
;
120 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
121 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
127 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
129 if ((*input
<0xd800) || (*input
>0xdfff))
134 else if ((input
[1]<0xdc00) || (input
[1]>0xdfff))
137 return wxCONV_FAILED
;
141 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
147 typedef wchar_t wxDecodeSurrogate_t
;
149 typedef wxUint16 wxDecodeSurrogate_t
;
150 #endif // WC_UTF16/!WC_UTF16
152 // returns the next UTF-32 character from the wchar_t buffer and advances the
153 // pointer to the character after this one
155 // if an invalid character is found, *pSrc is set to NULL, the caller must
157 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
160 const size_t n
= decode_utf16(*pSrc
, out
);
161 if ( n
== wxCONV_FAILED
)
169 // ----------------------------------------------------------------------------
171 // ----------------------------------------------------------------------------
174 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
175 const char *src
, size_t srcLen
) const
177 // although new conversion classes are supposed to implement this function
178 // directly, the existing ones only implement the old MB2WC() and so, to
179 // avoid having to rewrite all conversion classes at once, we provide a
180 // default (but not efficient) implementation of this one in terms of the
181 // old function by copying the input to ensure that it's NUL-terminated and
182 // then using MB2WC() to convert it
184 // the number of chars [which would be] written to dst [if it were not NULL]
185 size_t dstWritten
= 0;
187 // the number of NULs terminating this string
188 size_t nulLen
wxDUMMY_INITIALIZE(0);
190 // if we were not given the input size we just have to assume that the
191 // string is properly terminated as we have no way of knowing how long it
192 // is anyhow, but if we do have the size check whether there are enough
196 if ( srcLen
!= wxNO_LEN
)
198 // we need to know how to find the end of this string
199 nulLen
= GetMBNulLen();
200 if ( nulLen
== wxCONV_FAILED
)
201 return wxCONV_FAILED
;
203 // if there are enough NULs we can avoid the copy
204 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
206 // make a copy in order to properly NUL-terminate the string
207 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
208 char * const p
= bufTmp
.data();
209 memcpy(p
, src
, srcLen
);
210 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
216 srcEnd
= src
+ srcLen
;
218 else // quit after the first loop iteration
225 // try to convert the current chunk
226 size_t lenChunk
= MB2WC(NULL
, src
, 0);
227 if ( lenChunk
== wxCONV_FAILED
)
228 return wxCONV_FAILED
;
230 lenChunk
++; // for the L'\0' at the end of this chunk
232 dstWritten
+= lenChunk
;
236 // nothing left in the input string, conversion succeeded
242 if ( dstWritten
> dstLen
)
243 return wxCONV_FAILED
;
245 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
246 return wxCONV_FAILED
;
253 // we convert just one chunk in this case as this is the entire
258 // advance the input pointer past the end of this chunk
259 while ( NotAllNULs(src
, nulLen
) )
261 // notice that we must skip over multiple bytes here as we suppose
262 // that if NUL takes 2 or 4 bytes, then all the other characters do
263 // too and so if advanced by a single byte we might erroneously
264 // detect sequences of NUL bytes in the middle of the input
268 src
+= nulLen
; // skipping over its terminator as well
270 // note that ">=" (and not just "==") is needed here as the terminator
271 // we skipped just above could be inside or just after the buffer
272 // delimited by srcEnd
281 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
282 const wchar_t *src
, size_t srcLen
) const
284 // the number of chars [which would be] written to dst [if it were not NULL]
285 size_t dstWritten
= 0;
287 // make a copy of the input string unless it is already properly
290 // if we don't know its length we have no choice but to assume that it is,
291 // indeed, properly terminated
292 wxWCharBuffer bufTmp
;
293 if ( srcLen
== wxNO_LEN
)
295 srcLen
= wxWcslen(src
) + 1;
297 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
299 // make a copy in order to properly NUL-terminate the string
300 bufTmp
= wxWCharBuffer(srcLen
);
301 memcpy(bufTmp
.data(), src
, srcLen
*sizeof(wchar_t));
305 const size_t lenNul
= GetMBNulLen();
306 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
308 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
310 // try to convert the current chunk
311 size_t lenChunk
= WC2MB(NULL
, src
, 0);
313 if ( lenChunk
== wxCONV_FAILED
)
314 return wxCONV_FAILED
;
317 dstWritten
+= lenChunk
;
321 if ( dstWritten
> dstLen
)
322 return wxCONV_FAILED
;
324 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
325 return wxCONV_FAILED
;
334 size_t wxMBConv::MB2WC(wchar_t *out
, const char *in
, size_t outLen
) const
336 size_t rc
= ToWChar(out
, outLen
, in
);
337 if ( rc
!= wxCONV_FAILED
)
339 // ToWChar() returns the buffer length, i.e. including the trailing
340 // NUL, while this method doesn't take it into account
347 size_t wxMBConv::WC2MB(char *out
, const wchar_t *in
, size_t outLen
) const
349 size_t rc
= FromWChar(out
, outLen
, in
);
350 if ( rc
!= wxCONV_FAILED
)
358 wxMBConv::~wxMBConv()
360 // nothing to do here (necessary for Darwin linking probably)
363 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
367 // calculate the length of the buffer needed first
368 const size_t nLen
= MB2WC(NULL
, psz
, 0);
369 if ( nLen
!= wxCONV_FAILED
)
371 // now do the actual conversion
372 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
374 // +1 for the trailing NULL
375 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
380 return wxWCharBuffer();
383 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
387 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
388 if ( nLen
!= wxCONV_FAILED
)
390 // extra space for trailing NUL(s)
391 static const size_t extraLen
= GetMaxMBNulLen();
393 wxCharBuffer
buf(nLen
+ extraLen
- 1);
394 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
399 return wxCharBuffer();
403 wxMBConv::cMB2WC(const char *in
, size_t inLen
, size_t *outLen
) const
405 const size_t dstLen
= ToWChar(NULL
, 0, in
, inLen
);
406 if ( dstLen
!= wxCONV_FAILED
)
408 wxWCharBuffer
wbuf(dstLen
- 1);
409 if ( ToWChar(wbuf
.data(), dstLen
, in
, inLen
) != wxCONV_FAILED
)
414 if ( wbuf
[dstLen
- 1] == L
'\0' )
425 return wxWCharBuffer();
429 wxMBConv::cWC2MB(const wchar_t *in
, size_t inLen
, size_t *outLen
) const
431 const size_t dstLen
= FromWChar(NULL
, 0, in
, inLen
);
432 if ( dstLen
!= wxCONV_FAILED
)
434 wxCharBuffer
buf(dstLen
- 1);
435 if ( FromWChar(buf
.data(), dstLen
, in
, inLen
) != wxCONV_FAILED
)
441 const size_t nulLen
= GetMBNulLen();
442 if ( !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
444 // in this case the output is NUL-terminated and we're not
445 // supposed to count NUL
457 return wxCharBuffer();
460 // ----------------------------------------------------------------------------
462 // ----------------------------------------------------------------------------
464 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
466 return wxMB2WC(buf
, psz
, n
);
469 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
471 return wxWC2MB(buf
, psz
, n
);
474 // ----------------------------------------------------------------------------
475 // wxConvBrokenFileNames
476 // ----------------------------------------------------------------------------
480 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
482 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
483 || wxStricmp(charset
, _T("UTF8")) == 0 )
484 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
486 m_conv
= new wxCSConv(charset
);
491 // ----------------------------------------------------------------------------
493 // ----------------------------------------------------------------------------
495 // Implementation (C) 2004 Fredrik Roubert
498 // BASE64 decoding table
500 static const unsigned char utf7unb64
[] =
502 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
504 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
506 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
508 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
509 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
511 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
512 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
513 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
515 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
516 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
517 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
533 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
536 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
540 while ( *psz
&& (!buf
|| (len
< n
)) )
542 unsigned char cc
= *psz
++;
550 else if (*psz
== '-')
558 else // start of BASE64 encoded string
562 for ( ok
= lsb
= false, d
= 0, l
= 0;
563 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
568 for (l
+= 6; l
>= 8; lsb
= !lsb
)
570 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
580 *buf
= (wchar_t)(c
<< 8);
589 // in valid UTF7 we should have valid characters after '+'
590 return wxCONV_FAILED
;
598 if ( buf
&& (len
< n
) )
605 // BASE64 encoding table
607 static const unsigned char utf7enb64
[] =
609 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
610 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
611 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
612 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
613 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
614 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
615 'w', 'x', 'y', 'z', '0', '1', '2', '3',
616 '4', '5', '6', '7', '8', '9', '+', '/'
620 // UTF-7 encoding table
622 // 0 - Set D (directly encoded characters)
623 // 1 - Set O (optional direct characters)
624 // 2 - whitespace characters (optional)
625 // 3 - special characters
627 static const unsigned char utf7encode
[128] =
629 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
630 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
631 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
632 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
633 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
634 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
635 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
636 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
639 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
643 while (*psz
&& ((!buf
) || (len
< n
)))
646 if (cc
< 0x80 && utf7encode
[cc
] < 1)
654 else if (((wxUint32
)cc
) > 0xffff)
656 // no surrogate pair generation (yet?)
657 return wxCONV_FAILED
;
667 // BASE64 encode string
668 unsigned int lsb
, d
, l
;
669 for (d
= 0, l
= 0; /*nothing*/; psz
++)
671 for (lsb
= 0; lsb
< 2; lsb
++)
674 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
676 for (l
+= 8; l
>= 6; )
680 *buf
++ = utf7enb64
[(d
>> l
) % 64];
685 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
691 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
700 if (buf
&& (len
< n
))
705 // ----------------------------------------------------------------------------
707 // ----------------------------------------------------------------------------
709 static wxUint32 utf8_max
[]=
710 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
712 // boundaries of the private use area we use to (temporarily) remap
713 // characters which are invalid in a UTF-8 encoded string
714 const wxUint32 wxUnicodePUA
= 0x100000;
715 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
717 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
721 while (*psz
&& ((!buf
) || (len
< n
)))
723 const char *opsz
= psz
;
724 bool invalid
= false;
725 unsigned char cc
= *psz
++, fc
= cc
;
727 for (cnt
= 0; fc
& 0x80; cnt
++)
736 // escape the escape character for octal escapes
737 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
738 && cc
== '\\' && (!buf
|| len
< n
))
750 // invalid UTF-8 sequence
755 unsigned ocnt
= cnt
- 1;
756 wxUint32 res
= cc
& (0x3f >> cnt
);
760 if ((cc
& 0xC0) != 0x80)
762 // invalid UTF-8 sequence
767 res
= (res
<< 6) | (cc
& 0x3f);
769 if (invalid
|| res
<= utf8_max
[ocnt
])
771 // illegal UTF-8 encoding
774 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
775 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
777 // if one of our PUA characters turns up externally
778 // it must also be treated as an illegal sequence
779 // (a bit like you have to escape an escape character)
785 // cast is ok because wchar_t == wxUint16 if WC_UTF16
786 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
787 if (pa
== wxCONV_FAILED
)
799 *buf
++ = (wchar_t)res
;
801 #endif // WC_UTF16/!WC_UTF16
806 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
808 while (opsz
< psz
&& (!buf
|| len
< n
))
811 // cast is ok because wchar_t == wxUint16 if WC_UTF16
812 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
813 wxASSERT(pa
!= wxCONV_FAILED
);
820 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
826 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
828 while (opsz
< psz
&& (!buf
|| len
< n
))
830 if ( buf
&& len
+ 3 < n
)
832 unsigned char on
= *opsz
;
834 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
835 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
836 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
842 else // MAP_INVALID_UTF8_NOT
844 return wxCONV_FAILED
;
849 if (buf
&& (len
< n
))
854 static inline bool isoctal(wchar_t wch
)
856 return L
'0' <= wch
&& wch
<= L
'7';
859 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
863 while (*psz
&& ((!buf
) || (len
< n
)))
867 // cast is ok for WC_UTF16
868 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
869 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
871 cc
=(*psz
++) & 0x7fffffff;
874 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
875 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
878 *buf
++ = (char)(cc
- wxUnicodePUA
);
881 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
882 && cc
== L
'\\' && psz
[0] == L
'\\' )
889 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
891 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
895 *buf
++ = (char) ((psz
[0] - L
'0')*0100 +
896 (psz
[1] - L
'0')*010 +
906 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
920 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
922 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
934 // ============================================================================
936 // ============================================================================
938 #ifdef WORDS_BIGENDIAN
939 #define wxMBConvUTF16straight wxMBConvUTF16BE
940 #define wxMBConvUTF16swap wxMBConvUTF16LE
942 #define wxMBConvUTF16swap wxMBConvUTF16BE
943 #define wxMBConvUTF16straight wxMBConvUTF16LE
947 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
949 if ( srcLen
== wxNO_LEN
)
951 // count the number of bytes in input, including the trailing NULs
952 const wxUint16
*in
= wx_reinterpret_cast(const wxUint16
*, src
);
953 for ( srcLen
= 1; *in
++; srcLen
++ )
956 srcLen
*= BYTES_PER_CHAR
;
958 else // we already have the length
960 // we can only convert an entire number of UTF-16 characters
961 if ( srcLen
% BYTES_PER_CHAR
)
962 return wxCONV_FAILED
;
968 // case when in-memory representation is UTF-16 too
971 // ----------------------------------------------------------------------------
972 // conversions without endianness change
973 // ----------------------------------------------------------------------------
976 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
977 const char *src
, size_t srcLen
) const
979 // set up the scene for using memcpy() (which is presumably more efficient
980 // than copying the bytes one by one)
981 srcLen
= GetLength(src
, srcLen
);
982 if ( srcLen
== wxNO_LEN
)
983 return wxCONV_FAILED
;
985 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
988 if ( dstLen
< inLen
)
989 return wxCONV_FAILED
;
991 memcpy(dst
, src
, srcLen
);
998 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
999 const wchar_t *src
, size_t srcLen
) const
1001 if ( srcLen
== wxNO_LEN
)
1002 srcLen
= wxWcslen(src
) + 1;
1004 srcLen
*= BYTES_PER_CHAR
;
1008 if ( dstLen
< srcLen
)
1009 return wxCONV_FAILED
;
1011 memcpy(dst
, src
, srcLen
);
1017 // ----------------------------------------------------------------------------
1018 // endian-reversing conversions
1019 // ----------------------------------------------------------------------------
1022 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1023 const char *src
, size_t srcLen
) const
1025 srcLen
= GetLength(src
, srcLen
);
1026 if ( srcLen
== wxNO_LEN
)
1027 return wxCONV_FAILED
;
1029 srcLen
/= BYTES_PER_CHAR
;
1033 if ( dstLen
< srcLen
)
1034 return wxCONV_FAILED
;
1036 const wxUint16
*in
= wx_reinterpret_cast(const wxUint16
*, src
);
1037 for ( size_t n
= 0; n
< srcLen
; n
++, in
++ )
1039 *dst
++ = wxUINT16_SWAP_ALWAYS(*in
);
1047 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1048 const wchar_t *src
, size_t srcLen
) const
1050 if ( srcLen
== wxNO_LEN
)
1051 srcLen
= wxWcslen(src
) + 1;
1053 srcLen
*= BYTES_PER_CHAR
;
1057 if ( dstLen
< srcLen
)
1058 return wxCONV_FAILED
;
1060 wxUint16
*out
= wx_reinterpret_cast(wxUint16
*, dst
);
1061 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1063 *out
++ = wxUINT16_SWAP_ALWAYS(*src
);
1070 #else // !WC_UTF16: wchar_t is UTF-32
1072 // ----------------------------------------------------------------------------
1073 // conversions without endianness change
1074 // ----------------------------------------------------------------------------
1077 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1078 const char *src
, size_t srcLen
) const
1080 srcLen
= GetLength(src
, srcLen
);
1081 if ( srcLen
== wxNO_LEN
)
1082 return wxCONV_FAILED
;
1084 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1087 // optimization: return maximal space which could be needed for this
1088 // string even if the real size could be smaller if the buffer contains
1094 const wxUint16
*in
= wx_reinterpret_cast(const wxUint16
*, src
);
1095 for ( const wxUint16
* const inEnd
= in
+ inLen
; in
< inEnd
; )
1097 const wxUint32 ch
= wxDecodeSurrogate(&in
);
1099 return wxCONV_FAILED
;
1101 if ( ++outLen
> dstLen
)
1102 return wxCONV_FAILED
;
1112 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1113 const wchar_t *src
, size_t srcLen
) const
1115 if ( srcLen
== wxNO_LEN
)
1116 srcLen
= wxWcslen(src
) + 1;
1119 wxUint16
*out
= wx_reinterpret_cast(wxUint16
*, dst
);
1120 for ( size_t n
= 0; n
< srcLen
; n
++ )
1123 const size_t numChars
= encode_utf16(*src
++, cc
);
1124 if ( numChars
== wxCONV_FAILED
)
1125 return wxCONV_FAILED
;
1127 outLen
+= numChars
*BYTES_PER_CHAR
;
1130 if ( outLen
> dstLen
)
1131 return wxCONV_FAILED
;
1134 if ( numChars
== 2 )
1136 // second character of a surrogate
1145 // ----------------------------------------------------------------------------
1146 // endian-reversing conversions
1147 // ----------------------------------------------------------------------------
1150 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1151 const char *src
, size_t srcLen
) const
1153 srcLen
= GetLength(src
, srcLen
);
1154 if ( srcLen
== wxNO_LEN
)
1155 return wxCONV_FAILED
;
1157 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1160 // optimization: return maximal space which could be needed for this
1161 // string even if the real size could be smaller if the buffer contains
1167 const wxUint16
*in
= wx_reinterpret_cast(const wxUint16
*, src
);
1168 for ( const wxUint16
* const inEnd
= in
+ inLen
; in
< inEnd
; )
1172 tmp
[0] = wxUINT16_SWAP_ALWAYS(*in
);
1174 tmp
[1] = wxUINT16_SWAP_ALWAYS(*in
);
1176 const size_t numChars
= decode_utf16(tmp
, ch
);
1177 if ( numChars
== wxCONV_FAILED
)
1178 return wxCONV_FAILED
;
1180 if ( numChars
== 2 )
1183 if ( ++outLen
> dstLen
)
1184 return wxCONV_FAILED
;
1194 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1195 const wchar_t *src
, size_t srcLen
) const
1197 if ( srcLen
== wxNO_LEN
)
1198 srcLen
= wxWcslen(src
) + 1;
1201 wxUint16
*out
= wx_reinterpret_cast(wxUint16
*, dst
);
1202 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1205 const size_t numChars
= encode_utf16(*src
, cc
);
1206 if ( numChars
== wxCONV_FAILED
)
1207 return wxCONV_FAILED
;
1209 outLen
+= numChars
*BYTES_PER_CHAR
;
1212 if ( outLen
> dstLen
)
1213 return wxCONV_FAILED
;
1215 *out
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1216 if ( numChars
== 2 )
1218 // second character of a surrogate
1219 *out
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1227 #endif // WC_UTF16/!WC_UTF16
1230 // ============================================================================
1232 // ============================================================================
1234 #ifdef WORDS_BIGENDIAN
1235 #define wxMBConvUTF32straight wxMBConvUTF32BE
1236 #define wxMBConvUTF32swap wxMBConvUTF32LE
1238 #define wxMBConvUTF32swap wxMBConvUTF32BE
1239 #define wxMBConvUTF32straight wxMBConvUTF32LE
1243 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1244 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1247 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1249 if ( srcLen
== wxNO_LEN
)
1251 // count the number of bytes in input, including the trailing NULs
1252 const wxUint32
*in
= wx_reinterpret_cast(const wxUint32
*, src
);
1253 for ( srcLen
= 1; *in
++; srcLen
++ )
1256 srcLen
*= BYTES_PER_CHAR
;
1258 else // we already have the length
1260 // we can only convert an entire number of UTF-32 characters
1261 if ( srcLen
% BYTES_PER_CHAR
)
1262 return wxCONV_FAILED
;
1268 // case when in-memory representation is UTF-16
1271 // ----------------------------------------------------------------------------
1272 // conversions without endianness change
1273 // ----------------------------------------------------------------------------
1276 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1277 const char *src
, size_t srcLen
) const
1279 srcLen
= GetLength(src
, srcLen
);
1280 if ( srcLen
== wxNO_LEN
)
1281 return wxCONV_FAILED
;
1283 const wxUint32
*in
= wx_reinterpret_cast(const wxUint32
*, src
);
1284 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1286 for ( size_t n
= 0; n
< inLen
; n
++ )
1289 const size_t numChars
= encode_utf16(*in
++, cc
);
1290 if ( numChars
== wxCONV_FAILED
)
1291 return wxCONV_FAILED
;
1296 if ( outLen
> dstLen
)
1297 return wxCONV_FAILED
;
1300 if ( numChars
== 2 )
1302 // second character of a surrogate
1312 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1313 const wchar_t *src
, size_t srcLen
) const
1315 if ( srcLen
== wxNO_LEN
)
1316 srcLen
= wxWcslen(src
) + 1;
1320 // optimization: return maximal space which could be needed for this
1321 // string instead of the exact amount which could be less if there are
1322 // any surrogates in the input
1324 // we consider that surrogates are rare enough to make it worthwhile to
1325 // avoid running the loop below at the cost of slightly extra memory
1327 return srcLen
*BYTES_PER_CHAR
;
1330 wxUint32
*out
= wx_reinterpret_cast(wxUint32
*, dst
);
1332 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1334 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1336 return wxCONV_FAILED
;
1338 outLen
+= BYTES_PER_CHAR
;
1340 if ( outLen
> dstLen
)
1341 return wxCONV_FAILED
;
1349 // ----------------------------------------------------------------------------
1350 // endian-reversing conversions
1351 // ----------------------------------------------------------------------------
1354 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1355 const char *src
, size_t srcLen
) const
1357 srcLen
= GetLength(src
, srcLen
);
1358 if ( srcLen
== wxNO_LEN
)
1359 return wxCONV_FAILED
;
1361 const wxUint32
*in
= wx_reinterpret_cast(const wxUint32
*, src
);
1362 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1364 for ( size_t n
= 0; n
< inLen
; n
++, in
++ )
1367 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*in
), cc
);
1368 if ( numChars
== wxCONV_FAILED
)
1369 return wxCONV_FAILED
;
1374 if ( outLen
> dstLen
)
1375 return wxCONV_FAILED
;
1378 if ( numChars
== 2 )
1380 // second character of a surrogate
1390 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1391 const wchar_t *src
, size_t srcLen
) const
1393 if ( srcLen
== wxNO_LEN
)
1394 srcLen
= wxWcslen(src
) + 1;
1398 // optimization: return maximal space which could be needed for this
1399 // string instead of the exact amount which could be less if there are
1400 // any surrogates in the input
1402 // we consider that surrogates are rare enough to make it worthwhile to
1403 // avoid running the loop below at the cost of slightly extra memory
1405 return srcLen
*BYTES_PER_CHAR
;
1408 wxUint32
*out
= wx_reinterpret_cast(wxUint32
*, dst
);
1410 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1412 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1414 return wxCONV_FAILED
;
1416 outLen
+= BYTES_PER_CHAR
;
1418 if ( outLen
> dstLen
)
1419 return wxCONV_FAILED
;
1421 *out
++ = wxUINT32_SWAP_ALWAYS(ch
);
1427 #else // !WC_UTF16: wchar_t is UTF-32
1429 // ----------------------------------------------------------------------------
1430 // conversions without endianness change
1431 // ----------------------------------------------------------------------------
1434 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1435 const char *src
, size_t srcLen
) const
1437 // use memcpy() as it should be much faster than hand-written loop
1438 srcLen
= GetLength(src
, srcLen
);
1439 if ( srcLen
== wxNO_LEN
)
1440 return wxCONV_FAILED
;
1442 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1445 if ( dstLen
< inLen
)
1446 return wxCONV_FAILED
;
1448 memcpy(dst
, src
, srcLen
);
1455 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1456 const wchar_t *src
, size_t srcLen
) const
1458 if ( srcLen
== wxNO_LEN
)
1459 srcLen
= wxWcslen(src
) + 1;
1461 srcLen
*= BYTES_PER_CHAR
;
1465 if ( dstLen
< srcLen
)
1466 return wxCONV_FAILED
;
1468 memcpy(dst
, src
, srcLen
);
1474 // ----------------------------------------------------------------------------
1475 // endian-reversing conversions
1476 // ----------------------------------------------------------------------------
1479 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1480 const char *src
, size_t srcLen
) const
1482 srcLen
= GetLength(src
, srcLen
);
1483 if ( srcLen
== wxNO_LEN
)
1484 return wxCONV_FAILED
;
1486 srcLen
/= BYTES_PER_CHAR
;
1490 if ( dstLen
< srcLen
)
1491 return wxCONV_FAILED
;
1493 const wxUint32
*in
= wx_reinterpret_cast(const wxUint32
*, src
);
1494 for ( size_t n
= 0; n
< srcLen
; n
++, in
++ )
1496 *dst
++ = wxUINT32_SWAP_ALWAYS(*in
);
1504 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1505 const wchar_t *src
, size_t srcLen
) const
1507 if ( srcLen
== wxNO_LEN
)
1508 srcLen
= wxWcslen(src
) + 1;
1510 srcLen
*= BYTES_PER_CHAR
;
1514 if ( dstLen
< srcLen
)
1515 return wxCONV_FAILED
;
1517 wxUint32
*out
= wx_reinterpret_cast(wxUint32
*, dst
);
1518 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1520 *out
++ = wxUINT32_SWAP_ALWAYS(*src
);
1527 #endif // WC_UTF16/!WC_UTF16
1530 // ============================================================================
1531 // The classes doing conversion using the iconv_xxx() functions
1532 // ============================================================================
1536 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1537 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1538 // (unless there's yet another bug in glibc) the only case when iconv()
1539 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1540 // left in the input buffer -- when _real_ error occurs,
1541 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1543 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call returning cres and
// leaving bufLeft bytes of unconsumed input must be treated as an error.
//
// glibc 2.0/2.1 variant: (size_t)-1 with errno == E2BIG and no input left is
// NOT counted as a failure.
1544 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1545 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1546 (errno != E2BIG || bufLeft != 0))
// generic variant: any (size_t)-1 result is a failure, bufLeft is unused
1548 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
// cast applied to the input buffer pointer passed to iconv(), whose exact
// type depends on ICONV_CONST
1551 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
// value used in this file to mark an iconv_t handle as unusable
1553 #define ICONV_T_INVALID ((iconv_t)-1)
// WC_BSWAP swaps the bytes of one wchar_t and WC_ENC is the wxFontEncoding
// matching wchar_t, both selected by the size of wchar_t
1555 #if SIZEOF_WCHAR_T == 4
1556 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1557 #define WC_ENC wxFONTENCODING_UTF32
1558 #elif SIZEOF_WCHAR_T == 2
1559 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1560 #define WC_ENC wxFONTENCODING_UTF16
1561 #else // sizeof(wchar_t) != 2 nor 4
1562 // does this ever happen?
1563 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1566 // ----------------------------------------------------------------------------
1567 // wxMBConv_iconv: encapsulates an iconv character set
1568 // ----------------------------------------------------------------------------
// wxMBConv implementation converting between the multibyte charset given to
// the constructor and wchar_t using two iconv descriptors (m2w and w2m).
1570 class wxMBConv_iconv
: public wxMBConv
// name is the charset name; it is passed on to iconv_open()
1573 wxMBConv_iconv(const wxChar
*name
);
1574 virtual ~wxMBConv_iconv();
// the conversion functions: multibyte -> wide and wide -> multibyte
1576 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1577 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1579 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1580 virtual size_t GetMBNulLen() const;
// create a new converter for the same charset, copying the cached NUL
// length over to it
1582 virtual wxMBConv
*Clone() const
1584 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
1585 p
->m_minMBCharWidth
= m_minMBCharWidth
;
// the object is usable only if both conversion directions are available
1590 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1593 // the iconv handlers used to translate from multibyte to wide char and in
1594 // the other direction
1598 // guards access to m2w and w2m objects
1599 wxMutex m_iconvMutex
;
1603 // the name (for iconv_open()) of a wide char charset -- if none is
1604 // available on this machine, it will remain empty
1605 static wxString ms_wcCharsetName
;
1607 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1608 // different endian-ness than the native one
1609 static bool ms_wcNeedsSwap
;
1612 // name of the encoding handled by this conversion
1615 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1617 size_t m_minMBCharWidth
;
1620 // make the constructor available for unit testing
1621 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1623 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1624 if ( !result
->IsOk() )
// definitions of the static members of wxMBConv_iconv: the wide char charset
// name starts out empty and is filled in by the constructor, no byte swapping
// is assumed until the constructor detects that it is needed
1632 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1633 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1635 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1638 m_minMBCharWidth
= 0;
1640 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1641 // names for the charsets
1642 const wxCharBuffer
cname(wxString(name
).ToAscii());
1644 // check for charset that represents wchar_t:
1645 if ( ms_wcCharsetName
.empty() )
1647 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1650 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1651 #else // !wxUSE_FONTMAP
1652 static const wxChar
*names
[] =
1654 #if SIZEOF_WCHAR_T == 4
1656 #elif SIZEOF_WCHAR_T = 2
1661 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1663 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1665 const wxString
nameCS(*names
);
1667 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1668 wxString
nameXE(nameCS
);
1669 #ifdef WORDS_BIGENDIAN
1671 #else // little endian
1675 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1678 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1679 if ( m2w
== ICONV_T_INVALID
)
1681 // try charset w/o bytesex info (e.g. "UCS4")
1682 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1684 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1686 // and check for bytesex ourselves:
1687 if ( m2w
!= ICONV_T_INVALID
)
1689 char buf
[2], *bufPtr
;
1690 wchar_t wbuf
[2], *wbufPtr
;
1698 outsz
= SIZEOF_WCHAR_T
* 2;
1702 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1703 (char**)&wbufPtr
, &outsz
);
1705 if (ICONV_FAILED(res
, insz
))
1707 wxLogLastError(wxT("iconv"));
1708 wxLogError(_("Conversion to charset '%s' doesn't work."),
1711 else // ok, can convert to this encoding, remember it
1713 ms_wcCharsetName
= nameCS
;
1714 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1718 else // use charset not requiring byte swapping
1720 ms_wcCharsetName
= nameXE
;
1724 wxLogTrace(TRACE_STRCONV
,
1725 wxT("iconv wchar_t charset is \"%s\"%s"),
1726 ms_wcCharsetName
.empty() ? _T("<none>")
1727 : ms_wcCharsetName
.c_str(),
1728 ms_wcNeedsSwap
? _T(" (needs swap)")
1731 else // we already have ms_wcCharsetName
1733 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1736 if ( ms_wcCharsetName
.empty() )
1738 w2m
= ICONV_T_INVALID
;
1742 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1743 if ( w2m
== ICONV_T_INVALID
)
1745 wxLogTrace(TRACE_STRCONV
,
1746 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1747 ms_wcCharsetName
.c_str(), cname
.data());
1752 wxMBConv_iconv::~wxMBConv_iconv()
1754 if ( m2w
!= ICONV_T_INVALID
)
1756 if ( w2m
!= ICONV_T_INVALID
)
1760 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1762 // find the string length: notice that must be done differently for
1763 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1765 const size_t nulLen
= GetMBNulLen();
1769 return wxCONV_FAILED
;
1772 inbuf
= strlen(psz
); // arguably more optimized than our version
1777 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1778 // they also have to start at character boundary and not span two
1779 // adjacent characters
1781 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1788 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1789 // Unfortunately there is a couple of global wxCSConv objects such as
1790 // wxConvLocal that are used all over wx code, so we have to make sure
1791 // the handle is used by at most one thread at the time. Otherwise
1792 // only a few wx classes would be safe to use from non-main threads
1793 // as MB<->WC conversion would fail "randomly".
1794 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1795 #endif // wxUSE_THREADS
1798 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1800 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1801 wchar_t *bufPtr
= buf
;
1802 const char *pszPtr
= psz
;
1806 // have destination buffer, convert there
1808 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1809 (char**)&bufPtr
, &outbuf
);
1810 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1814 // convert to native endianness
1815 for ( unsigned i
= 0; i
< res
; i
++ )
1816 buf
[n
] = WC_BSWAP(buf
[i
]);
1819 // NUL-terminate the string if there is any space left
1825 // no destination buffer... convert using temp buffer
1826 // to calculate destination buffer requirement
1831 outbuf
= 8*SIZEOF_WCHAR_T
;
1834 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1835 (char**)&bufPtr
, &outbuf
);
1837 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1838 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1841 if (ICONV_FAILED(cres
, inbuf
))
1843 //VS: it is ok if iconv fails, hence trace only
1844 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1845 return wxCONV_FAILED
;
1851 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1854 // NB: explained in MB2WC
1855 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1858 size_t inlen
= wxWcslen(psz
);
1859 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1863 wchar_t *tmpbuf
= 0;
1867 // need to copy to temp buffer to switch endianness
1868 // (doing WC_BSWAP twice on the original buffer won't help, as it
1869 // could be in read-only memory, or be accessed in some other thread)
1870 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1871 for ( size_t i
= 0; i
< inlen
; i
++ )
1872 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1873 tmpbuf
[inlen
] = L
'\0';
1879 // have destination buffer, convert there
1880 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1884 // NB: iconv was given only wcslen(psz) characters on input, and so
1885 // it couldn't convert the trailing zero. Let's do it ourselves
1886 // if there's some room left for it in the output buffer.
1892 // no destination buffer... convert using temp buffer
1893 // to calculate destination buffer requirement
1897 buf
= tbuf
; outbuf
= 16;
1899 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1902 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1910 if (ICONV_FAILED(cres
, inbuf
))
1912 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1913 return wxCONV_FAILED
;
1919 size_t wxMBConv_iconv::GetMBNulLen() const
1921 if ( m_minMBCharWidth
== 0 )
1923 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1926 // NB: explained in MB2WC
1927 wxMutexLocker
lock(self
->m_iconvMutex
);
1930 wchar_t *wnul
= L
"";
1931 char buf
[8]; // should be enough for NUL in any encoding
1932 size_t inLen
= sizeof(wchar_t),
1933 outLen
= WXSIZEOF(buf
);
1934 char *in
= (char *)wnul
;
1936 if ( iconv(w2m
, ICONV_CHAR_CAST(&in
), &inLen
, &out
, &outLen
) == (size_t)-1 )
1938 self
->m_minMBCharWidth
= (size_t)-1;
1942 self
->m_minMBCharWidth
= out
- buf
;
1946 return m_minMBCharWidth
;
1949 #endif // HAVE_ICONV
1952 // ============================================================================
1953 // Win32 conversion classes
1954 // ============================================================================
1956 #ifdef wxHAVE_WIN32_MB2WC
// functions mapping a charset name or a wxFontEncoding to a code page number
// (declared here and used by wxMBConv_win32 constructors; not defined in the
// part of the file visible here)
1960 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1961 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
// wxMBConv implementation using the Win32 functions MultiByteToWideChar()
// and WideCharToMultiByte() with the code page stored in m_CodePage.
1964 class wxMBConv_win32
: public wxMBConv
// default constructor: use the ANSI code page
1969 m_CodePage
= CP_ACP
;
1970 m_minMBCharWidth
= 0;
// copy constructor: copies the code page and the cached NUL length
1973 wxMBConv_win32(const wxMBConv_win32
& conv
)
1975 m_CodePage
= conv
.m_CodePage
;
1976 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
// create the converter from a charset name
1980 wxMBConv_win32(const wxChar
* name
)
1982 m_CodePage
= wxCharsetToCodepage(name
);
1983 m_minMBCharWidth
= 0;
// create the converter from a font encoding
1986 wxMBConv_win32(wxFontEncoding encoding
)
1988 m_CodePage
= wxEncodingToCodepage(encoding
);
1989 m_minMBCharWidth
= 0;
1991 #endif // wxUSE_FONTMAP
// Convert the NUL-terminated multibyte string psz to wide characters,
// storing at most n of them in buf if it is not NULL. Returns wxCONV_FAILED
// if the conversion fails or is detected to be lossy.
1993 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1995 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
1996 // the behaviour is not compatible with the Unix version (using iconv)
1997 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1998 // wouldn't work if reading an incomplete MB char didn't result in an
2001 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2002 // Win XP or newer and it is not supported for UTF-[78] so we always
2003 // use our own conversions in this case. See
2004 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2005 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2006 if ( m_CodePage
== CP_UTF8
)
2008 return wxConvUTF8
.MB2WC(buf
, psz
, n
);
2011 if ( m_CodePage
== CP_UTF7
)
2013 return wxConvUTF7
.MB2WC(buf
, psz
, n
);
// the flag is only used for code pages below 50000 other than CP_SYMBOL
// and only if the system is recent enough
2017 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2018 IsAtLeastWin2kSP4() )
2020 flags
= MB_ERR_INVALID_CHARS
;
2023 const size_t len
= ::MultiByteToWideChar
2025 m_CodePage
, // code page
2026 flags
, // flags: fail on error
2027 psz
, // input string
2028 -1, // its length (NUL-terminated)
2029 buf
, // output string
2030 buf
? n
: 0 // size of output buffer
2034 // function totally failed
2035 return wxCONV_FAILED
;
2038 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2039 // check if we succeeded, by doing a double trip:
2040 if ( !flags
&& buf
)
2042 const size_t mbLen
= strlen(psz
);
2043 wxCharBuffer
mbBuf(mbLen
);
2044 if ( ::WideCharToMultiByte
2051 mbLen
+ 1, // size in bytes, not length
2055 strcmp(mbBuf
, psz
) != 0 )
2057 // we didn't obtain the same thing we started from, hence
2058 // the conversion was lossy and we consider that it failed
2059 return wxCONV_FAILED
;
2063 // note that it returns count of written chars for buf != NULL and size
2064 // of the needed buffer for buf == NULL so in either case the length of
2065 // the string (which never includes the terminating NUL) is one less
// Convert the NUL-terminated wide string pwz to the multibyte code page,
// storing at most n bytes in buf if it is not NULL. Returns wxCONV_FAILED if
// the conversion fails or is detected to be lossy.
2069 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2072 we have a problem here: by default, WideCharToMultiByte() may
2073 replace characters unrepresentable in the target code page with bad
2074 quality approximations such as turning "1/2" symbol (U+00BD) into
2075 "1" for the code pages which don't have it and we, obviously, want
2076 to avoid this at any price
2078 the trouble is that this function does it _silently_, i.e. it won't
2079 even tell us whether it did or not... Win98/2000 and higher provide
2080 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2081 we have to resort to a round trip, i.e. check that converting back
2082 results in the same string -- this is, of course, expensive but
2083 otherwise we simply can't be sure to not garble the data.
2086 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2087 // it doesn't work with CJK encodings (which we test for rather roughly
2088 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2090 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2093 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2095 // it's our lucky day
2096 flags
= WC_NO_BEST_FIT_CHARS
;
2097 pUsedDef
= &usedDef
;
2099 else // old system or unsupported encoding
2105 const size_t len
= ::WideCharToMultiByte
2107 m_CodePage
, // code page
2108 flags
, // either none or no best fit
2109 pwz
, // input string
2110 -1, // it is (wide) NUL-terminated
2111 buf
, // output buffer
2112 buf
? n
: 0, // and its size
2113 NULL
, // default "replacement" char
2114 pUsedDef
// [out] was it used?
2119 // function totally failed
2120 return wxCONV_FAILED
;
2123 // if we were really converting, check if we succeeded
2128 // check if the conversion failed, i.e. if any replacements
2131 return wxCONV_FAILED
;
2133 else // we must resort to double tripping...
2135 wxWCharBuffer
wcBuf(n
);
2136 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2137 wcscmp(wcBuf
, pwz
) != 0 )
2139 // we didn't obtain the same thing we started from, hence
2140 // the conversion was lossy and we consider that it failed
2141 return wxCONV_FAILED
;
2146 // see the comment above for the reason of "len - 1"
// Return the length of NUL in this code page: computed with
// WideCharToMultiByte() when m_minMBCharWidth is still 0 and cached in
// m_minMBCharWidth ((size_t)-1 is stored for an unexpected length).
2150 virtual size_t GetMBNulLen() const
2152 if ( m_minMBCharWidth
== 0 )
2154 int len
= ::WideCharToMultiByte
2156 m_CodePage
, // code page
2158 L
"", // input string
2159 1, // translate just the NUL
2160 NULL
, // output buffer
2162 NULL
, // no replacement char
2163 NULL
// [out] don't care if it was used
// we need to modify the cached value from this const function
2166 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2170 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2174 self
->m_minMBCharWidth
= (size_t)-1;
2180 self
->m_minMBCharWidth
= len
;
2185 return m_minMBCharWidth
;
2188 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
// -1 is the code page value which marks the object as unusable
2190 bool IsOk() const { return m_CodePage
!= -1; }
// Check whether WC_NO_BEST_FIT_CHARS can be used, judging by the OS version;
// the answer is computed once and kept in a static variable.
2193 static bool CanUseNoBestFit()
2195 static int s_isWin98Or2k
= -1;
2197 if ( s_isWin98Or2k
== -1 )
2200 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2203 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2207 s_isWin98Or2k
= verMaj
>= 5;
2211 // unknown, be conservative by default
2215 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2218 return s_isWin98Or2k
== 1;
// Check whether the system is Windows 2000 SP4 or later (this is tested
// before using MB_ERR_INVALID_CHARS in MB2WC()); the answer is computed once
// and kept in a static variable.
2221 static bool IsAtLeastWin2kSP4()
2226 static int s_isAtLeastWin2kSP4
= -1;
2228 if ( s_isAtLeastWin2kSP4
== -1 )
2230 OSVERSIONINFOEX ver
;
2232 memset(&ver
, 0, sizeof(ver
));
2233 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2234 GetVersionEx((OSVERSIONINFO
*)&ver
);
2236 s_isAtLeastWin2kSP4
=
2237 ((ver
.dwMajorVersion
> 5) || // Vista+
2238 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2239 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2240 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2244 return s_isAtLeastWin2kSP4
== 1;
2249 // the code page we're working with
2252 // cached result of GetMBNulLen(), set to 0 initially meaning
2254 size_t m_minMBCharWidth
;
2257 #endif // wxHAVE_WIN32_MB2WC
2259 // ============================================================================
2260 // Cocoa conversion classes
2261 // ============================================================================
2263 #if defined(__WXCOCOA__)
2265 // RN: There is no UTF-32 support in either Core Foundation or
2266 // Cocoa. Strangely enough, internally Core Foundation uses
2267 // UTF 32 internally quite a bit - its just not public (yet).
2269 #include <CoreFoundation/CFString.h>
2270 #include <CoreFoundation/CFStringEncodingExt.h>
2272 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2274 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2275 if ( encoding
== wxFONTENCODING_DEFAULT
)
2277 enc
= CFStringGetSystemEncoding();
2279 else switch( encoding
)
2281 case wxFONTENCODING_ISO8859_1
:
2282 enc
= kCFStringEncodingISOLatin1
;
2284 case wxFONTENCODING_ISO8859_2
:
2285 enc
= kCFStringEncodingISOLatin2
;
2287 case wxFONTENCODING_ISO8859_3
:
2288 enc
= kCFStringEncodingISOLatin3
;
2290 case wxFONTENCODING_ISO8859_4
:
2291 enc
= kCFStringEncodingISOLatin4
;
2293 case wxFONTENCODING_ISO8859_5
:
2294 enc
= kCFStringEncodingISOLatinCyrillic
;
2296 case wxFONTENCODING_ISO8859_6
:
2297 enc
= kCFStringEncodingISOLatinArabic
;
2299 case wxFONTENCODING_ISO8859_7
:
2300 enc
= kCFStringEncodingISOLatinGreek
;
2302 case wxFONTENCODING_ISO8859_8
:
2303 enc
= kCFStringEncodingISOLatinHebrew
;
2305 case wxFONTENCODING_ISO8859_9
:
2306 enc
= kCFStringEncodingISOLatin5
;
2308 case wxFONTENCODING_ISO8859_10
:
2309 enc
= kCFStringEncodingISOLatin6
;
2311 case wxFONTENCODING_ISO8859_11
:
2312 enc
= kCFStringEncodingISOLatinThai
;
2314 case wxFONTENCODING_ISO8859_13
:
2315 enc
= kCFStringEncodingISOLatin7
;
2317 case wxFONTENCODING_ISO8859_14
:
2318 enc
= kCFStringEncodingISOLatin8
;
2320 case wxFONTENCODING_ISO8859_15
:
2321 enc
= kCFStringEncodingISOLatin9
;
2324 case wxFONTENCODING_KOI8
:
2325 enc
= kCFStringEncodingKOI8_R
;
2327 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2328 enc
= kCFStringEncodingDOSRussian
;
2331 // case wxFONTENCODING_BULGARIAN :
2335 case wxFONTENCODING_CP437
:
2336 enc
=kCFStringEncodingDOSLatinUS
;
2338 case wxFONTENCODING_CP850
:
2339 enc
= kCFStringEncodingDOSLatin1
;
2341 case wxFONTENCODING_CP852
:
2342 enc
= kCFStringEncodingDOSLatin2
;
2344 case wxFONTENCODING_CP855
:
2345 enc
= kCFStringEncodingDOSCyrillic
;
2347 case wxFONTENCODING_CP866
:
2348 enc
=kCFStringEncodingDOSRussian
;
2350 case wxFONTENCODING_CP874
:
2351 enc
= kCFStringEncodingDOSThai
;
2353 case wxFONTENCODING_CP932
:
2354 enc
= kCFStringEncodingDOSJapanese
;
2356 case wxFONTENCODING_CP936
:
2357 enc
=kCFStringEncodingDOSChineseSimplif
;
2359 case wxFONTENCODING_CP949
:
2360 enc
= kCFStringEncodingDOSKorean
;
2362 case wxFONTENCODING_CP950
:
2363 enc
= kCFStringEncodingDOSChineseTrad
;
2365 case wxFONTENCODING_CP1250
:
2366 enc
= kCFStringEncodingWindowsLatin2
;
2368 case wxFONTENCODING_CP1251
:
2369 enc
=kCFStringEncodingWindowsCyrillic
;
2371 case wxFONTENCODING_CP1252
:
2372 enc
=kCFStringEncodingWindowsLatin1
;
2374 case wxFONTENCODING_CP1253
:
2375 enc
= kCFStringEncodingWindowsGreek
;
2377 case wxFONTENCODING_CP1254
:
2378 enc
= kCFStringEncodingWindowsLatin5
;
2380 case wxFONTENCODING_CP1255
:
2381 enc
=kCFStringEncodingWindowsHebrew
;
2383 case wxFONTENCODING_CP1256
:
2384 enc
=kCFStringEncodingWindowsArabic
;
2386 case wxFONTENCODING_CP1257
:
2387 enc
= kCFStringEncodingWindowsBalticRim
;
2389 // This only really encodes to UTF7 (if that) evidently
2390 // case wxFONTENCODING_UTF7 :
2391 // enc = kCFStringEncodingNonLossyASCII ;
2393 case wxFONTENCODING_UTF8
:
2394 enc
= kCFStringEncodingUTF8
;
2396 case wxFONTENCODING_EUC_JP
:
2397 enc
= kCFStringEncodingEUC_JP
;
2399 case wxFONTENCODING_UTF16
:
2400 enc
= kCFStringEncodingUnicode
;
2402 case wxFONTENCODING_MACROMAN
:
2403 enc
= kCFStringEncodingMacRoman
;
2405 case wxFONTENCODING_MACJAPANESE
:
2406 enc
= kCFStringEncodingMacJapanese
;
2408 case wxFONTENCODING_MACCHINESETRAD
:
2409 enc
= kCFStringEncodingMacChineseTrad
;
2411 case wxFONTENCODING_MACKOREAN
:
2412 enc
= kCFStringEncodingMacKorean
;
2414 case wxFONTENCODING_MACARABIC
:
2415 enc
= kCFStringEncodingMacArabic
;
2417 case wxFONTENCODING_MACHEBREW
:
2418 enc
= kCFStringEncodingMacHebrew
;
2420 case wxFONTENCODING_MACGREEK
:
2421 enc
= kCFStringEncodingMacGreek
;
2423 case wxFONTENCODING_MACCYRILLIC
:
2424 enc
= kCFStringEncodingMacCyrillic
;
2426 case wxFONTENCODING_MACDEVANAGARI
:
2427 enc
= kCFStringEncodingMacDevanagari
;
2429 case wxFONTENCODING_MACGURMUKHI
:
2430 enc
= kCFStringEncodingMacGurmukhi
;
2432 case wxFONTENCODING_MACGUJARATI
:
2433 enc
= kCFStringEncodingMacGujarati
;
2435 case wxFONTENCODING_MACORIYA
:
2436 enc
= kCFStringEncodingMacOriya
;
2438 case wxFONTENCODING_MACBENGALI
:
2439 enc
= kCFStringEncodingMacBengali
;
2441 case wxFONTENCODING_MACTAMIL
:
2442 enc
= kCFStringEncodingMacTamil
;
2444 case wxFONTENCODING_MACTELUGU
:
2445 enc
= kCFStringEncodingMacTelugu
;
2447 case wxFONTENCODING_MACKANNADA
:
2448 enc
= kCFStringEncodingMacKannada
;
2450 case wxFONTENCODING_MACMALAJALAM
:
2451 enc
= kCFStringEncodingMacMalayalam
;
2453 case wxFONTENCODING_MACSINHALESE
:
2454 enc
= kCFStringEncodingMacSinhalese
;
2456 case wxFONTENCODING_MACBURMESE
:
2457 enc
= kCFStringEncodingMacBurmese
;
2459 case wxFONTENCODING_MACKHMER
:
2460 enc
= kCFStringEncodingMacKhmer
;
2462 case wxFONTENCODING_MACTHAI
:
2463 enc
= kCFStringEncodingMacThai
;
2465 case wxFONTENCODING_MACLAOTIAN
:
2466 enc
= kCFStringEncodingMacLaotian
;
2468 case wxFONTENCODING_MACGEORGIAN
:
2469 enc
= kCFStringEncodingMacGeorgian
;
2471 case wxFONTENCODING_MACARMENIAN
:
2472 enc
= kCFStringEncodingMacArmenian
;
2474 case wxFONTENCODING_MACCHINESESIMP
:
2475 enc
= kCFStringEncodingMacChineseSimp
;
2477 case wxFONTENCODING_MACTIBETAN
:
2478 enc
= kCFStringEncodingMacTibetan
;
2480 case wxFONTENCODING_MACMONGOLIAN
:
2481 enc
= kCFStringEncodingMacMongolian
;
2483 case wxFONTENCODING_MACETHIOPIC
:
2484 enc
= kCFStringEncodingMacEthiopic
;
2486 case wxFONTENCODING_MACCENTRALEUR
:
2487 enc
= kCFStringEncodingMacCentralEurRoman
;
2489 case wxFONTENCODING_MACVIATNAMESE
:
2490 enc
= kCFStringEncodingMacVietnamese
;
2492 case wxFONTENCODING_MACARABICEXT
:
2493 enc
= kCFStringEncodingMacExtArabic
;
2495 case wxFONTENCODING_MACSYMBOL
:
2496 enc
= kCFStringEncodingMacSymbol
;
2498 case wxFONTENCODING_MACDINGBATS
:
2499 enc
= kCFStringEncodingMacDingbats
;
2501 case wxFONTENCODING_MACTURKISH
:
2502 enc
= kCFStringEncodingMacTurkish
;
2504 case wxFONTENCODING_MACCROATIAN
:
2505 enc
= kCFStringEncodingMacCroatian
;
2507 case wxFONTENCODING_MACICELANDIC
:
2508 enc
= kCFStringEncodingMacIcelandic
;
2510 case wxFONTENCODING_MACROMANIAN
:
2511 enc
= kCFStringEncodingMacRomanian
;
2513 case wxFONTENCODING_MACCELTIC
:
2514 enc
= kCFStringEncodingMacCeltic
;
2516 case wxFONTENCODING_MACGAELIC
:
2517 enc
= kCFStringEncodingMacGaelic
;
2519 // case wxFONTENCODING_MACKEYBOARD :
2520 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2523 // because gcc is picky
// wxMBConv implementation based on Core Foundation CFString functions; the
// multibyte encoding used is stored in m_encoding.
2529 class wxMBConv_cocoa
: public wxMBConv
// default: use the system encoding
2534 Init(CFStringGetSystemEncoding()) ;
2537 wxMBConv_cocoa(const wxMBConv_cocoa
& conv
)
2539 m_encoding
= conv
.m_encoding
;
// create the converter from a charset name
2543 wxMBConv_cocoa(const wxChar
* name
)
2545 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
// create the converter from a font encoding
2549 wxMBConv_cocoa(wxFontEncoding encoding
)
2551 Init( wxCFStringEncFromFontEnc(encoding
) );
// common part of the constructors: remember the encoding to use
2558 void Init( CFStringEncoding encoding
)
2560 m_encoding
= encoding
;
// Convert szUnConv to wide characters stored in szOut: a temporary CFString
// is created from the input bytes and its characters are then extracted.
2563 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2567 CFStringRef theString
= CFStringCreateWithBytes (
2568 NULL
, //the allocator
2569 (const UInt8
*)szUnConv
,
2572 false //no BOM/external representation
2575 wxASSERT(theString
);
2577 size_t nOutLength
= CFStringGetLength(theString
);
2581 CFRelease(theString
);
// NOTE(review): the range covers nOutSize characters, not nOutLength --
// confirm this can't exceed the length of theString
2585 CFRange theRange
= { 0, nOutSize
};
// with 4 byte wchar_t the UniChar data goes to a temporary buffer first and
// is then converted using wxMBConvUTF16
2587 #if SIZEOF_WCHAR_T == 4
2588 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2591 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2593 CFRelease(theString
);
// NOTE(review): this writes at index nOutLength of a buffer of nOutSize
// elements -- confirm nOutLength < nOutSize always holds here
2595 szUniCharBuffer
[nOutLength
] = '\0' ;
2597 #if SIZEOF_WCHAR_T == 4
2598 wxMBConvUTF16 converter
;
2599 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
2600 delete[] szUniCharBuffer
;
// Convert the wide string szUnConv to the multibyte encoding, storing the
// result in szOut; returns nRealOutSize - 1.
2606 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2610 size_t nRealOutSize
;
2611 size_t nBufSize
= wxWcslen(szUnConv
);
2612 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
// with 4 byte wchar_t the input has to be converted to UTF-16 first
2614 #if SIZEOF_WCHAR_T == 4
2615 wxMBConvUTF16 converter
;
2616 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2617 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
2618 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
2619 nBufSize
/= sizeof(UniChar
);
2622 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2626 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2629 wxASSERT(theString
);
2631 //Note that CER puts a BOM when converting to unicode
2632 //so we check and use getchars instead in that case
2633 if (m_encoding
== kCFStringEncodingUnicode
)
2636 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2638 nRealOutSize
= CFStringGetLength(theString
) + 1;
2644 CFRangeMake(0, CFStringGetLength(theString
)),
2646 0, //what to put in characters that can't be converted -
2647 //0 tells CFString to return NULL if it meets such a character
2648 false, //not an external representation
2651 (CFIndex
*) &nRealOutSize
2655 CFRelease(theString
);
2657 #if SIZEOF_WCHAR_T == 4
2658 delete[] szUniBuffer
;
2661 return nRealOutSize
- 1;
2664 virtual wxMBConv
*Clone() const { return new wxMBConv_cocoa(*this); }
// the object is usable if the encoding is valid and available on this system
2668 return m_encoding
!= kCFStringEncodingInvalidId
&&
2669 CFStringIsEncodingAvailable(m_encoding
);
// the multibyte encoding this object converts from/to
2673 CFStringEncoding m_encoding
;
2676 #endif // defined(__WXCOCOA__)
2678 // ============================================================================
2679 // Mac conversion classes
2680 // ============================================================================
2682 #if defined(__WXMAC__) && defined(TARGET_CARBON)
// wxMBConv implementation using Text Encoding Converter (TEC) objects: one
// converter is created for each direction in Init().
2684 class wxMBConv_mac
: public wxMBConv
// default: use the system encoding
2689 Init(CFStringGetSystemEncoding()) ;
// copying creates new converters for the same encoding
2692 wxMBConv_mac(const wxMBConv_mac
& conv
)
2694 Init(conv
.m_char_encoding
);
// create the converter from a charset name
2698 wxMBConv_mac(const wxChar
* name
)
2700 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
// create the converter from a font encoding
2704 wxMBConv_mac(wxFontEncoding encoding
)
2706 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
// dispose of both converters (the status is assigned but not checked in the
// code visible here)
2711 OSStatus status
= noErr
;
2712 status
= TECDisposeConverter(m_MB2WC_converter
);
2713 status
= TECDisposeConverter(m_WC2MB_converter
);
// common part of the constructors: remember the multibyte encoding, set up
// the 16 bit Unicode encoding and create the converters
2717 void Init( TextEncodingBase encoding
)
2719 OSStatus status
= noErr
;
2720 m_char_encoding
= encoding
;
2721 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2723 status
= TECCreateConverter(&m_MB2WC_converter
,
2725 m_unicode_encoding
);
2726 status
= TECCreateConverter(&m_WC2MB_converter
,
// Convert the NUL-terminated multibyte string psz to wide characters using
// m_MB2WC_converter.
2731 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2733 OSStatus status
= noErr
;
2734 ByteCount byteOutLen
;
2735 ByteCount byteInLen
= strlen(psz
) ;
2736 wchar_t *tbuf
= NULL
;
2737 UniChar
* ubuf
= NULL
;
2742 //apple specs say at least 32
2743 n
= wxMax( 32 , byteInLen
) ;
2744 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2746 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
// with 4 byte wchar_t the converter output (UniChar) goes to a separate
// buffer, with room for a terminating UniChar
2747 #if SIZEOF_WCHAR_T == 4
2748 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2750 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2752 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2753 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2754 #if SIZEOF_WCHAR_T == 4
2755 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2756 // is not properly terminated we get random characters at the end
2757 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2758 wxMBConvUTF16 converter
;
2759 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2762 res
= byteOutLen
/ sizeof( UniChar
) ;
2767 if ( buf
&& res
< n
)
// Convert the NUL-terminated wide string psz to the multibyte encoding using
// m_WC2MB_converter; when converting to a real buffer the result is converted
// back to check that the conversion was not lossy.
2773 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2775 OSStatus status
= noErr
;
2776 ByteCount byteOutLen
;
2777 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2783 //apple specs say at least 32
2784 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2785 tbuf
= (char*) malloc( n
) ;
2788 ByteCount byteBufferLen
= n
;
2789 UniChar
* ubuf
= NULL
;
// with 4 byte wchar_t the input has to be converted to UTF-16 first
2790 #if SIZEOF_WCHAR_T == 4
2791 wxMBConvUTF16 converter
;
2792 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2793 byteInLen
= unicharlen
;
2794 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2795 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2797 ubuf
= (UniChar
*) psz
;
2799 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2800 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2801 #if SIZEOF_WCHAR_T == 4
2807 size_t res
= byteOutLen
;
2808 if ( buf
&& res
< n
)
2812 //we need to double-trip to verify it didn't insert any ? in place
2813 //of bogus characters
2814 wxWCharBuffer
wcBuf(n
);
2815 size_t pszlen
= wxWcslen(psz
);
2816 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2817 wxWcslen(wcBuf
) != pszlen
||
2818 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2820 // we didn't obtain the same thing we started from, hence
2821 // the conversion was lossy and we consider that it failed
2822 return wxCONV_FAILED
;
2829 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
// the object is usable only if both converters could be created
2832 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
// the converters for each direction
2835 TECObjectRef m_MB2WC_converter
;
2836 TECObjectRef m_WC2MB_converter
;
// the multibyte encoding and the Unicode encoding set up in Init()
2838 TextEncodingBase m_char_encoding
;
2839 TextEncodingBase m_unicode_encoding
;
2842 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2844 // ============================================================================
2845 // wxEncodingConverter based conversion classes
2846 // ============================================================================
// wxMBConv implementation based on wxEncodingConverter: two converters are
// used, m2w from m_enc to Unicode and w2m from Unicode to m_enc.
2850 class wxMBConv_wxwin
: public wxMBConv
// initialize both converters, m_ok is true only if both of them can be used
2855 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2856 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2860 // temporarily just use wxEncodingConverter stuff,
2861 // so that it works while a better implementation is built
2862 wxMBConv_wxwin(const wxChar
* name
)
2865 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2867 m_enc
= wxFONTENCODING_SYSTEM
;
2872 wxMBConv_wxwin(wxFontEncoding enc
)
// Convert the NUL-terminated string psz to wide characters; the size of the
// output buffer is not used.
2879 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2881 size_t inbuf
= strlen(psz
);
2884 if (!m2w
.Convert(psz
,buf
))
2885 return wxCONV_FAILED
;
// Convert the NUL-terminated wide string psz to multibyte; the size of the
// output buffer is not used.
2890 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2892 const size_t inbuf
= wxWcslen(psz
);
2895 if (!w2m
.Convert(psz
,buf
))
2896 return wxCONV_FAILED
;
// the length of NUL depends on the encoding: UTF-16 and UTF-32 variants are
// handled separately
2902 virtual size_t GetMBNulLen() const
2906 case wxFONTENCODING_UTF16BE
:
2907 case wxFONTENCODING_UTF16LE
:
2910 case wxFONTENCODING_UTF32BE
:
2911 case wxFONTENCODING_UTF32LE
:
2919 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2921 bool IsOk() const { return m_ok
; }
// the encoding we convert from/to and the converters for each direction
2924 wxFontEncoding m_enc
;
2925 wxEncodingConverter m2w
, w2m
;
2928 // were we initialized successfully?
2931 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2934 // make the constructors available for unit testing
2935 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2937 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2938 if ( !result
->IsOk() )
2946 #endif // wxUSE_FONTMAP
2948 // ============================================================================
2949 // wxCSConv implementation
2950 // ============================================================================
2952 void wxCSConv::Init()
// Create the conversion from a charset name. m_encoding is set either from
// the name, using wxFontMapperBase::GetEncodingFromName(), or to
// wxFONTENCODING_SYSTEM (the condition choosing between the two is not part
// of the code visible here).
2959 wxCSConv::wxCSConv(const wxChar
*charset
)
2969 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2971 m_encoding
= wxFONTENCODING_SYSTEM
;
// Create the conversion for a font encoding. wxFONTENCODING_MAX and
// wxFONTENCODING_DEFAULT are not valid here: they trigger a debug failure
// message and wxFONTENCODING_SYSTEM is used instead.
2975 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2977 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2979 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
// fall back to the system encoding
2981 encoding
= wxFONTENCODING_SYSTEM
;
2986 m_encoding
= encoding
;
// Destructor: body not visible in this excerpt; presumably releases m_name
// and m_convReal -- TODO confirm against the full file.
2989 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name of conv via SetName() and
// copies its encoding. The real converter (m_convReal) is not copied in the
// visible lines.
2994 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2999 SetName(conv
.m_name
);
3000 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the charset name of conv via SetName() and copies
// its encoding.
// NOTE(review): the statements before SetName() and the return are not
// visible here; if the old name is released first, self-assignment
// (&conv == this) would read conv.m_name after it was cleared -- TODO
// confirm.
3003 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
3007 SetName(conv
.m_name
);
3008 m_encoding
= conv
.m_encoding
;
// wxCSConv::Clear(): only the signature is visible in this excerpt;
// presumably frees the name and the real converter -- TODO confirm.
3013 void wxCSConv::Clear()
// Store a private copy (made with wxStrdup()) of the charset name in
// m_name. Any guard around the assignment (e.g. for a NULL charset) is not
// visible in this excerpt.
3022 void wxCSConv::SetName(const wxChar
*charset
)
3026 m_name
= wxStrdup(charset
);
3032 #include "wx/hashmap.h"
// hash map type from wxFontEncoding to wxString
3034 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3035 wxEncodingNameCache
);
// file-level cache used by wxCSConv::DoCreate(): for an encoding it holds
// the charset name that was stored after constructing a wxMBConv_iconv, or
// an empty string to remember that the lookup failed
// NOTE(review): no locking around this cache is visible in this excerpt --
// TODO confirm whether DoCreate() can be called from several threads
3037 static wxEncodingNameCache gs_nameCache
;
// Create the conversion object actually used by this wxCSConv (the result
// is stored in m_convReal by CreateConvIfNeeded()).
//
// Candidates are tried in the order given in the comment below: OS
// conversion (iconv, Win32, Mac, Cocoa -- each guarded by the preprocessor),
// then the hard coded UTF converters, then wxMBConv_wxwin. If nothing
// works, an error is logged. NOTE: the success checks, deletes and several
// returns/conditions are not visible in this excerpt.
3040 wxMBConv
*wxCSConv::DoCreate() const
3043 wxLogTrace(TRACE_STRCONV
,
3044 wxT("creating conversion for %s"),
3046 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
3047 #endif // wxUSE_FONTMAP
3049 // check for the special case of ASCII or ISO8859-1 charset: as we have
3050 // special knowledge of it anyhow, we don't need to create a special
3051 // conversion object
3052 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3053 m_encoding
== wxFONTENCODING_DEFAULT
)
3055 // don't convert at all
3059 // we trust OS to do conversion better than we can so try external
3060 // conversion methods first
3062 // the full order is:
3063 // 1. OS conversion (iconv() under Unix or Win32 API)
3064 // 2. hard coded conversions for UTF
3065 // 3. wxEncodingConverter as fall back
3071 #endif // !wxUSE_FONTMAP
// iconv step: work on local copies of the name and the encoding
3073 wxString
name(m_name
);
3074 wxFontEncoding
encoding(m_encoding
);
// an explicit charset name is tried with iconv first
3076 if ( !name
.empty() )
3078 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
// map the name to an encoding without interactive dialogs (false)
3086 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
3087 #endif // wxUSE_FONTMAP
// look for a previously cached iconv name for this encoding
3091 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3092 if ( it
!= gs_nameCache
.end() )
// an empty cached name is the marker for a cached failure
3094 if ( it
->second
.empty() )
3097 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
// nothing cached: try every known name of this encoding in turn
3104 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3106 for ( ; *names
; ++names
)
3108 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
// remember this name for the encoding (the success test guarding this
// assignment is not visible in this excerpt)
3111 gs_nameCache
[encoding
] = *names
;
3118 gs_nameCache
[encoding
] = _T(""); // cache the failure
3120 #endif // wxUSE_FONTMAP
3122 #endif // HAVE_ICONV
3124 #ifdef wxHAVE_WIN32_MB2WC
// Win32: construct from the name if we have one, else from the encoding
3127 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3128 : new wxMBConv_win32(m_encoding
);
3137 #endif // wxHAVE_WIN32_MB2WC
3138 #if defined(__WXMAC__)
3140 // leave UTF16 and UTF32 to the built-ins of wx
3141 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3142 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
// two alternative constructions follow (with and without name support);
// the preprocessor test selecting between them is not visible here
3146 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
3147 : new wxMBConv_mac(m_encoding
);
3149 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
3158 #if defined(__WXCOCOA__)
// Cocoa converter is used when a name is given or for encodings up to
// wxFONTENCODING_UTF16
3160 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
// two alternative constructions follow, as in the Mac case above
3164 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
3165 : new wxMBConv_cocoa(m_encoding
);
3167 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
// hard coded UTF conversions: determine the encoding to switch on
3177 wxFontEncoding enc
= m_encoding
;
3179 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3181 // use "false" to suppress interactive dialogs -- we can be called from
3182 // anywhere and popping up a dialog from here is the last thing we want to
3184 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3186 #endif // wxUSE_FONTMAP
// each UTF encoding has a dedicated converter class
3190 case wxFONTENCODING_UTF7
:
3191 return new wxMBConvUTF7
;
3193 case wxFONTENCODING_UTF8
:
3194 return new wxMBConvUTF8
;
3196 case wxFONTENCODING_UTF16BE
:
3197 return new wxMBConvUTF16BE
;
3199 case wxFONTENCODING_UTF16LE
:
3200 return new wxMBConvUTF16LE
;
3202 case wxFONTENCODING_UTF32BE
:
3203 return new wxMBConvUTF32BE
;
3205 case wxFONTENCODING_UTF32LE
:
3206 return new wxMBConvUTF32LE
;
3209 // nothing to do but put here to suppress gcc warnings
// fall back to the wxEncodingConverter-based class
3216 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3217 : new wxMBConv_wxwin(m_encoding
);
3223 #endif // wxUSE_FONTMAP
3225 // NB: This is a hack to prevent a deadlock. What could otherwise happen
3226 // in a Unicode build: creation of wxConvLocal ends up here because of
3227 // some failure and logs the error. But wxLog will try to attach a
3228 // timestamp, for which it needs wxConvLocal (to convert the time to
3229 // char* and then wchar_t*); that fails again and tries to log the
3230 // error, but wxLog has an (already locked) critical section that
3231 // guards its static buffer.
// NOTE(review): the flag is a plain function-local static with no
// synchronisation visible here -- TODO confirm this is only a re-entrancy
// guard and not relied upon across threads
3232 static bool alreadyLoggingError
= false;
3233 if (!alreadyLoggingError
)
3235 alreadyLoggingError
= true;
3236 wxLogError(_("Cannot convert from the charset '%s'!"),
3240 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3241 #else // !wxUSE_FONTMAP
// NOTE(review): m_encoding is compared against wxFontEncoding enumerators
// throughout this function, yet it is passed for a %s specifier below;
// this looks like a format/argument mismatch -- TODO confirm and use an
// integer specifier
3242 wxString::Format(_("encoding %s"), m_encoding
).c_str()
3243 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3245 alreadyLoggingError
= false;
// Create the real converter on demand.
//
// The method is const, so members are updated through a non-const alias of
// this: m_convReal receives the result of DoCreate() and m_deferred is
// cleared. The test that makes this happen only when needed (presumably on
// m_deferred) is not visible in this excerpt.
3251 void wxCSConv::CreateConvIfNeeded() const
3255 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3258 // if we have neither the name nor the encoding, use the default
3259 // encoding for this system
3260 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
// m_name gets a wxStrdup()'d copy of the system encoding name
3262 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3264 #endif // wxUSE_INTL
3266 self
->m_convReal
= DoCreate();
3267 self
->m_deferred
= false;
// Convert a multibyte string to wide characters.
//
// After making sure the real converter exists, the call is forwarded to
// m_convReal->MB2WC() (the test for a usable m_convReal is not visible
// here). Otherwise a fallback copies each byte of psz into buf, widening it
// through unsigned char.
3271 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3273 CreateConvIfNeeded();
3276 return m_convReal
->MB2WC(buf
, psz
, n
);
// fallback path: input length in bytes
3279 size_t len
= strlen(psz
);
// "<=" so that the terminating NUL is copied as well
// NOTE(review): n is not consulted in the visible part of this loop --
// TODO confirm the caller guarantees buf holds len + 1 elements
3283 for (size_t c
= 0; c
<= len
; c
++)
3284 buf
[c
] = (unsigned char)(psz
[c
]);
// Convert a wide character string to multibyte.
//
// After making sure the real converter exists, the call is forwarded to
// m_convReal->WC2MB() (the test for a usable m_convReal is not visible
// here). Otherwise a fallback narrows each wide character to char; it can
// fail with wxCONV_FAILED.
3290 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3292 CreateConvIfNeeded();
3295 return m_convReal
->WC2MB(buf
, psz
, n
);
// fallback path: input length in wide characters
3298 const size_t len
= wxWcslen(psz
);
// first loop: writes to buf, including the terminating NUL ("<="); the
// per-character test that leads to wxCONV_FAILED is not visible here --
// presumably it rejects characters that do not fit in one byte, TODO
// confirm
3301 for (size_t c
= 0; c
<= len
; c
++)
3304 return wxCONV_FAILED
;
3305 buf
[c
] = (char)psz
[c
];
// second loop: same range but no write to buf is visible -- presumably the
// validation-only pass used when buf is NULL, TODO confirm
3310 for (size_t c
= 0; c
<= len
; c
++)
3313 return wxCONV_FAILED
;
// Length of the terminating NUL in the multibyte encoding: forwarded to the
// real converter once it has been created. The value returned when there is
// no real converter is not visible in this excerpt.
3320 size_t wxCSConv::GetMBNulLen() const
3322 CreateConvIfNeeded();
3326 return m_convReal
->GetMBNulLen();
3332 // ----------------------------------------------------------------------------
3334 // ----------------------------------------------------------------------------
// The object behind wxConvLibc: its class depends on the platform (the
// opening #if of this chain is not visible in this excerpt)
3337 static wxMBConv_win32 wxConvLibcObj
;
3338 #elif defined(__WXMAC__) && !defined(__MACH__)
3339 static wxMBConv_mac wxConvLibcObj
;
3341 static wxMBConvLibc wxConvLibcObj
;
// file-local objects backing the other exported converters
3344 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3345 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3346 static wxMBConvUTF7 wxConvUTF7Obj
;
3347 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported references, each bound to the corresponding static object above
3349 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3350 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3351 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3352 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3353 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
// exported pointers: wxConvCurrent initially points at the libc converter
3354 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
// the object wxConvFileName points to is chosen on lines not visible in
// this excerpt
3355 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3363 #else // !wxUSE_WCHAR_T
3365 // stand-ins in absence of wchar_t
3366 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3371 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T