]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
check for comctl32.dll version and not OS one as an app running XP without manifest...
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
b040e242 47#ifdef HAVE_ICONV
373658eb 48 #include <iconv.h>
b1d547eb 49 #include "wx/thread.h"
1cd52418 50#endif
1cd52418 51
373658eb
VZ
52#include "wx/encconv.h"
53#include "wx/fontmap.h"
54
5c4ed98d 55#ifdef __DARWIN__
c933e267 56#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
57#endif //def __DARWIN__
58
ef199164 59
ce6f8d6f
VZ
60#define TRACE_STRCONV _T("strconv")
61
467e0479
VZ
62// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63// be 4 bytes
4948c2b6 64#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
65 #define WC_UTF16
66#endif
67
ef199164 68
373658eb
VZ
69// ============================================================================
70// implementation
71// ============================================================================
72
69373110
VZ
// helper function of cMB2WC(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are NUL (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
373658eb 82// ----------------------------------------------------------------------------
467e0479 83// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 84// ----------------------------------------------------------------------------
6001e347 85
c91830cb 86static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 87{
ef199164 88 if (input <= 0xffff)
4def3b35 89 {
999836aa
VZ
90 if (output)
91 *output = (wxUint16) input;
ef199164 92
4def3b35 93 return 1;
dccce9ea 94 }
ef199164 95 else if (input >= 0x110000)
4def3b35 96 {
467e0479 97 return wxCONV_FAILED;
dccce9ea
VZ
98 }
99 else
4def3b35 100 {
dccce9ea 101 if (output)
4def3b35 102 {
ef199164
DS
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 105 }
ef199164 106
4def3b35 107 return 2;
1cd52418 108 }
1cd52418
OK
109}
110
c91830cb 111static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 112{
ef199164 113 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
114 {
115 output = *input;
116 return 1;
dccce9ea 117 }
ef199164 118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
119 {
120 output = *input;
467e0479 121 return wxCONV_FAILED;
dccce9ea
VZ
122 }
123 else
4def3b35
VS
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
1cd52418
OK
128}
129
467e0479 130#ifdef WC_UTF16
35d11700
VZ
131 typedef wchar_t wxDecodeSurrogate_t;
132#else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
135
136// returns the next UTF-32 character from the wchar_t buffer and advances the
137// pointer to the character after this one
138//
139// if an invalid character is found, *pSrc is set to NULL, the caller must
140// check for this
35d11700 141static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
142{
143 wxUint32 out;
8d3dd069
VZ
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152}
153
f6bcfd97 154// ----------------------------------------------------------------------------
6001e347 155// wxMBConv
f6bcfd97 156// ----------------------------------------------------------------------------
2c53a80a 157
483b0434
VZ
158size_t
159wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
6001e347 161{
483b0434
VZ
162 // although new conversion classes are supposed to implement this function
163 // directly, the existins ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
6001e347 168
483b0434
VZ
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten = 0;
eec47cc6 171
c1464d9d 172 // the number of NULs terminating this string
a78c43f1 173 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 174
c1464d9d
VZ
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
178 // NULs at the end
483b0434
VZ
179 wxCharBuffer bufTmp;
180 const char *srcEnd;
467e0479 181 if ( srcLen != wxNO_LEN )
eec47cc6 182 {
c1464d9d 183 // we need to know how to find the end of this string
7ef3ab50 184 nulLen = GetMBNulLen();
483b0434
VZ
185 if ( nulLen == wxCONV_FAILED )
186 return wxCONV_FAILED;
e4e3bbb4 187
c1464d9d 188 // if there are enough NULs we can avoid the copy
483b0434 189 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
190 {
191 // make a copy in order to properly NUL-terminate the string
483b0434 192 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 193 char * const p = bufTmp.data();
483b0434
VZ
194 memcpy(p, src, srcLen);
195 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 196 *s = '\0';
483b0434
VZ
197
198 src = bufTmp;
eec47cc6 199 }
e4e3bbb4 200
483b0434
VZ
201 srcEnd = src + srcLen;
202 }
203 else // quit after the first loop iteration
204 {
205 srcEnd = NULL;
206 }
e4e3bbb4 207
483b0434 208 for ( ;; )
eec47cc6 209 {
c1464d9d 210 // try to convert the current chunk
483b0434 211 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
e4e3bbb4 214
483b0434 215 dstWritten += lenChunk;
f6a02087
VZ
216 if ( !srcEnd )
217 dstWritten++;
f5fb6871 218
f6a02087 219 if ( !lenChunk )
467e0479
VZ
220 {
221 // nothing left in the input string, conversion succeeded
222 break;
223 }
224
483b0434
VZ
225 if ( dst )
226 {
227 if ( dstWritten > dstLen )
228 return wxCONV_FAILED;
229
f6a02087
VZ
230 // +1 is for trailing NUL
231 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
232 return wxCONV_FAILED;
233
234 dst += lenChunk;
f6a02087
VZ
235 if ( !srcEnd )
236 dst++;
483b0434 237 }
c1464d9d 238
483b0434 239 if ( !srcEnd )
c1464d9d 240 {
467e0479
VZ
241 // we convert just one chunk in this case as this is the entire
242 // string anyhow
c1464d9d
VZ
243 break;
244 }
eec47cc6
VZ
245
246 // advance the input pointer past the end of this chunk
483b0434 247 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
248 {
249 // notice that we must skip over multiple bytes here as we suppose
250 // that if NUL takes 2 or 4 bytes, then all the other characters do
251 // too and so if advanced by a single byte we might erroneously
252 // detect sequences of NUL bytes in the middle of the input
483b0434 253 src += nulLen;
c1464d9d 254 }
e4e3bbb4 255
483b0434 256 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
257
258 // note that ">=" (and not just "==") is needed here as the terminator
259 // we skipped just above could be inside or just after the buffer
260 // delimited by inEnd
483b0434 261 if ( src >= srcEnd )
c1464d9d
VZ
262 break;
263 }
264
483b0434 265 return dstWritten;
e4e3bbb4
RN
266}
267
483b0434
VZ
268size_t
269wxMBConv::FromWChar(char *dst, size_t dstLen,
270 const wchar_t *src, size_t srcLen) const
e4e3bbb4 271{
483b0434
VZ
272 // the number of chars [which would be] written to dst [if it were not NULL]
273 size_t dstWritten = 0;
e4e3bbb4 274
f6a02087
VZ
275 // if we don't know its length we have no choice but to assume that it is
276 // NUL-terminated (notice that it can still be NUL-terminated even if
277 // explicit length is given but it doesn't change our return value)
278 const bool isNulTerminated = srcLen == wxNO_LEN;
279
eec47cc6
VZ
280 // make a copy of the input string unless it is already properly
281 // NUL-terminated
eec47cc6 282 wxWCharBuffer bufTmp;
f6a02087 283 if ( isNulTerminated )
e4e3bbb4 284 {
483b0434 285 srcLen = wxWcslen(src) + 1;
eec47cc6 286 }
483b0434 287 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
288 {
289 // make a copy in order to properly NUL-terminate the string
483b0434 290 bufTmp = wxWCharBuffer(srcLen);
ef199164 291 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
292 src = bufTmp;
293 }
294
295 const size_t lenNul = GetMBNulLen();
296 for ( const wchar_t * const srcEnd = src + srcLen;
297 src < srcEnd;
298 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
299 {
300 // try to convert the current chunk
301 size_t lenChunk = WC2MB(NULL, src, 0);
302
303 if ( lenChunk == wxCONV_FAILED )
304 return wxCONV_FAILED;
305
483b0434 306 dstWritten += lenChunk;
f6a02087
VZ
307 if ( isNulTerminated )
308 dstWritten += lenNul;
483b0434
VZ
309
310 if ( dst )
311 {
312 if ( dstWritten > dstLen )
313 return wxCONV_FAILED;
314
f6a02087 315 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
316 return wxCONV_FAILED;
317
318 dst += lenChunk;
f6a02087
VZ
319 if ( isNulTerminated )
320 dst += lenNul;
483b0434 321 }
eec47cc6 322 }
e4e3bbb4 323
483b0434
VZ
324 return dstWritten;
325}
326
ef199164 327size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 328{
ef199164 329 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 330 if ( rc != wxCONV_FAILED )
509da451
VZ
331 {
332 // ToWChar() returns the buffer length, i.e. including the trailing
333 // NUL, while this method doesn't take it into account
334 rc--;
335 }
336
337 return rc;
338}
339
ef199164 340size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 341{
ef199164 342 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 343 if ( rc != wxCONV_FAILED )
509da451
VZ
344 {
345 rc -= GetMBNulLen();
346 }
347
348 return rc;
349}
350
483b0434
VZ
351wxMBConv::~wxMBConv()
352{
353 // nothing to do here (necessary for Darwin linking probably)
354}
e4e3bbb4 355
483b0434
VZ
356const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
357{
358 if ( psz )
eec47cc6 359 {
483b0434 360 // calculate the length of the buffer needed first
a2db25a1 361 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 362 if ( nLen != wxCONV_FAILED )
f5fb6871 363 {
483b0434 364 // now do the actual conversion
a2db25a1 365 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 366
483b0434 367 // +1 for the trailing NULL
a2db25a1 368 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 369 return buf;
f5fb6871 370 }
483b0434 371 }
e4e3bbb4 372
483b0434
VZ
373 return wxWCharBuffer();
374}
3698ae71 375
483b0434
VZ
376const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
377{
378 if ( pwz )
379 {
a2db25a1 380 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 381 if ( nLen != wxCONV_FAILED )
483b0434 382 {
a2db25a1
VZ
383 wxCharBuffer buf(nLen - 1);
384 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
385 return buf;
386 }
387 }
388
389 return wxCharBuffer();
390}
e4e3bbb4 391
483b0434 392const wxWCharBuffer
ef199164 393wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 394{
ef199164 395 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 396 if ( dstLen != wxCONV_FAILED )
483b0434 397 {
0dd13d21
VZ
398 // notice that we allocate space for dstLen+1 wide characters here
399 // because we want the buffer to always be NUL-terminated, even if the
400 // input isn't (as otherwise the caller has no way to know its length)
401 wxWCharBuffer wbuf(dstLen);
f6a02087 402 wbuf.data()[dstLen] = L'\0';
ef199164 403 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
404 {
405 if ( outLen )
467e0479
VZ
406 {
407 *outLen = dstLen;
f6a02087
VZ
408
409 // we also need to handle NUL-terminated input strings
410 // specially: for them the output is the length of the string
411 // excluding the trailing NUL, however if we're asked to
412 // convert a specific number of characters we return the length
413 // of the resulting output even if it's NUL-terminated
414 if ( inLen == wxNO_LEN )
467e0479
VZ
415 (*outLen)--;
416 }
417
483b0434
VZ
418 return wbuf;
419 }
420 }
421
422 if ( outLen )
423 *outLen = 0;
424
425 return wxWCharBuffer();
426}
427
428const wxCharBuffer
ef199164 429wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 430{
13d92ad6 431 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 432 if ( dstLen != wxCONV_FAILED )
483b0434 433 {
0dd13d21
VZ
434 const size_t nulLen = GetMBNulLen();
435
436 // as above, ensure that the buffer is always NUL-terminated, even if
437 // the input is not
438 wxCharBuffer buf(dstLen + nulLen - 1);
439 memset(buf.data() + dstLen, 0, nulLen);
ef199164 440 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
441 {
442 if ( outLen )
467e0479
VZ
443 {
444 *outLen = dstLen;
445
f6a02087 446 if ( inLen == wxNO_LEN )
467e0479 447 {
f6a02087
VZ
448 // in this case both input and output are NUL-terminated
449 // and we're not supposed to count NUL
13d92ad6 450 *outLen -= nulLen;
467e0479
VZ
451 }
452 }
d32a507d 453
483b0434
VZ
454 return buf;
455 }
e4e3bbb4
RN
456 }
457
eec47cc6
VZ
458 if ( outLen )
459 *outLen = 0;
460
461 return wxCharBuffer();
e4e3bbb4
RN
462}
463
6001e347 464// ----------------------------------------------------------------------------
bde4baac 465// wxMBConvLibc
6001e347
RR
466// ----------------------------------------------------------------------------
467
bde4baac
VZ
468size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
469{
470 return wxMB2WC(buf, psz, n);
471}
472
473size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
474{
475 return wxWC2MB(buf, psz, n);
476}
e1bfe89e
RR
477
478// ----------------------------------------------------------------------------
532d575b 479// wxConvBrokenFileNames
e1bfe89e
RR
480// ----------------------------------------------------------------------------
481
eec47cc6
VZ
482#ifdef __UNIX__
483
86501081 484wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 485{
86501081
VS
486 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
487 wxStricmp(charset, _T("UTF8")) == 0 )
5deedd6e 488 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
489 else
490 m_conv = new wxCSConv(charset);
ea8ce907
RR
491}
492
eec47cc6 493#endif // __UNIX__
c12b7f79 494
bde4baac 495// ----------------------------------------------------------------------------
3698ae71 496// UTF-7
bde4baac 497// ----------------------------------------------------------------------------
6001e347 498
15f2ee32 499// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
500//
501// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 502
15f2ee32
RN
503//
504// BASE64 decoding table
505//
506static const unsigned char utf7unb64[] =
6001e347 507{
15f2ee32
RN
508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
514 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
515 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
517 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
518 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
519 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
521 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
522 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
523 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
533 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
535 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
536 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
9d653e81 539 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
15f2ee32
RN
540};
541
9d653e81
VZ
542size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
543 const char *src, size_t srcLen) const
15f2ee32 544{
9d653e81
VZ
545 DecoderState stateOrig,
546 *statePtr;
547 if ( srcLen == wxNO_LEN )
548 {
549 // convert the entire string, up to and including the trailing NUL
550 srcLen = strlen(src) + 1;
551
552 // when working on the entire strings we don't update nor use the shift
553 // state from the previous call
554 statePtr = &stateOrig;
555 }
556 else // when working with partial strings we do use the shift state
557 {
558 statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
559
560 // also save the old state to be able to rollback to it on error
561 stateOrig = m_stateDecoder;
562 }
563
564 // but to simplify the code below we use this variable in both cases
565 DecoderState& state = *statePtr;
566
567
568 // number of characters [which would have been] written to dst [if it were
569 // not NULL]
15f2ee32
RN
570 size_t len = 0;
571
9d653e81
VZ
572 const char * const srcEnd = src + srcLen;
573
574 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 575 {
9d653e81
VZ
576 const unsigned char cc = *src++;
577
578 if ( state.IsShifted() )
15f2ee32 579 {
9d653e81
VZ
580 const unsigned char dc = utf7unb64[cc];
581 if ( dc == 0xff )
15f2ee32 582 {
9d653e81
VZ
583 // end of encoded part
584 state.ToDirect();
585
586 // re-parse this character normally below unless it's '-' which
587 // is consumed by the decoder
588 if ( cc == '-' )
589 continue;
590 }
591 else // valid encoded character
592 {
593 // mini base64 decoder: each character is 6 bits
594 state.bit += 6;
595 state.accum <<= 6;
596 state.accum += dc;
597
598 if ( state.bit >= 8 )
15f2ee32 599 {
9d653e81
VZ
600 // got the full byte, consume it
601 state.bit -= 8;
602 unsigned char b = (state.accum >> state.bit) & 0x00ff;
603
604 if ( state.isLSB )
15f2ee32 605 {
9d653e81
VZ
606 // we've got the full word, output it
607 if ( dst )
608 *dst++ = (state.msb << 8) | b;
609 len++;
610 state.isLSB = false;
15f2ee32 611 }
9d653e81 612 else // MSB
04a37834 613 {
9d653e81
VZ
614 // just store it while we wait for LSB
615 state.msb = b;
616 state.isLSB = true;
04a37834 617 }
15f2ee32
RN
618 }
619 }
9d653e81 620 }
04a37834 621
9d653e81
VZ
622 if ( state.IsDirect() )
623 {
624 // start of an encoded segment?
625 if ( cc == '+' )
04a37834 626 {
9d653e81
VZ
627 if ( src == srcEnd )
628 return wxCONV_FAILED; // can't have '+' at the end
04a37834 629
9d653e81
VZ
630 if ( *src == '-' )
631 {
632 // just the encoded plus sign, don't switch to shifted mode
633 if ( dst )
634 *dst++ = '+';
635 len++;
636 src++;
637 }
638 else
639 {
640 state.ToShifted();
641 }
642 }
643 else // not '+'
644 {
645 // only printable 7 bit ASCII characters (with the exception of
646 // NUL, TAB, CR and LF) can be used directly
647 if ( cc >= 0x7f || (cc < ' ' &&
648 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
649 return wxCONV_FAILED;
650
651 if ( dst )
652 *dst++ = cc;
653 len++;
654 }
15f2ee32
RN
655 }
656 }
04a37834 657
9d653e81
VZ
658 if ( !len )
659 {
660 // as we didn't read any characters we should be called with the same
661 // data (followed by some more new data) again later so don't save our
662 // state
663 state = stateOrig;
664
665 return wxCONV_FAILED;
666 }
04a37834 667
15f2ee32 668 return len;
6001e347
RR
669}
670
15f2ee32
RN
671//
672// BASE64 encoding table
673//
674static const unsigned char utf7enb64[] =
675{
676 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
677 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
678 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
679 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
680 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
681 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
682 'w', 'x', 'y', 'z', '0', '1', '2', '3',
683 '4', '5', '6', '7', '8', '9', '+', '/'
684};
685
686//
687// UTF-7 encoding table
688//
689// 0 - Set D (directly encoded characters)
690// 1 - Set O (optional direct characters)
691// 2 - whitespace characters (optional)
692// 3 - special characters
693//
694static const unsigned char utf7encode[128] =
6001e347 695{
9d653e81 696 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
15f2ee32
RN
697 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
698 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
699 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
700 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
701 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
702 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
703 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
704};
705
9d653e81
VZ
706static inline bool wxIsUTF7Direct(wchar_t wc)
707{
708 return wc < 0x80 && utf7encode[wc] < 1;
709}
710
711size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
712 const wchar_t *src, size_t srcLen) const
15f2ee32 713{
9d653e81
VZ
714 EncoderState stateOrig,
715 *statePtr;
716 if ( srcLen == wxNO_LEN )
717 {
718 // we don't apply the stored state when operating on entire strings at
719 // once
720 statePtr = &stateOrig;
721
722 srcLen = wxWcslen(src) + 1;
723 }
724 else // do use the mode we left the output in previously
725 {
726 stateOrig = m_stateEncoder;
727 statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
728 }
729
730 EncoderState& state = *statePtr;
731
732
15f2ee32
RN
733 size_t len = 0;
734
9d653e81
VZ
735 const wchar_t * const srcEnd = src + srcLen;
736 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 737 {
9d653e81
VZ
738 wchar_t cc = *src++;
739 if ( wxIsUTF7Direct(cc) )
15f2ee32 740 {
9d653e81
VZ
741 if ( state.IsShifted() )
742 {
743 // pad with zeros the last encoded block if necessary
744 if ( state.bit )
745 {
746 if ( dst )
747 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
748 len++;
749 }
ef199164 750
9d653e81
VZ
751 state.ToDirect();
752
753 if ( dst )
754 *dst++ = '-';
755 len++;
756 }
757
758 if ( dst )
759 *dst++ = (char)cc;
15f2ee32
RN
760 len++;
761 }
9d653e81
VZ
762 else if ( cc == '+' && state.IsDirect() )
763 {
764 if ( dst )
765 {
766 *dst++ = '+';
767 *dst++ = '-';
768 }
769
770 len += 2;
771 }
15f2ee32 772#ifndef WC_UTF16
79c78d42 773 else if (((wxUint32)cc) > 0xffff)
b2c13097 774 {
15f2ee32 775 // no surrogate pair generation (yet?)
467e0479 776 return wxCONV_FAILED;
15f2ee32
RN
777 }
778#endif
779 else
780 {
9d653e81
VZ
781 if ( state.IsDirect() )
782 {
783 state.ToShifted();
ef199164 784
9d653e81
VZ
785 if ( dst )
786 *dst++ = '+';
787 len++;
788 }
789
790 // BASE64 encode string
791 for ( ;; )
15f2ee32 792 {
9d653e81 793 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 794 {
9d653e81
VZ
795 state.accum <<= 8;
796 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
797
798 for (state.bit += 8; state.bit >= 6; )
15f2ee32 799 {
9d653e81
VZ
800 state.bit -= 6;
801 if ( dst )
802 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
803 len++;
15f2ee32 804 }
15f2ee32 805 }
ef199164 806
9d653e81
VZ
807 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
808 break;
ef199164 809
9d653e81 810 src++;
15f2ee32 811 }
15f2ee32
RN
812 }
813 }
ef199164 814
9d653e81
VZ
815 // we need to restore the original encoder state if we were called just to
816 // calculate the amount of space needed as we will presumably be called
817 // again to really convert the data now
818 if ( !dst )
819 state = stateOrig;
ef199164 820
15f2ee32 821 return len;
6001e347
RR
822}
823
f6bcfd97 824// ----------------------------------------------------------------------------
6001e347 825// UTF-8
f6bcfd97 826// ----------------------------------------------------------------------------
6001e347 827
1774c3c5 828static const wxUint32 utf8_max[]=
4def3b35 829 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 830
3698ae71
VZ
831// boundaries of the private use area we use to (temporarily) remap invalid
832// characters invalid in a UTF-8 encoded string
ea8ce907
RR
833const wxUint32 wxUnicodePUA = 0x100000;
834const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
835
0286d08d 836// this table gives the length of the UTF-8 encoding from its first character:
1774c3c5 837const unsigned char tableUtf8Lengths[256] = {
0286d08d
VZ
838 // single-byte sequences (ASCII):
839 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
840 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
841 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
842 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
843 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
844 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
845 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
846 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
847
848 // these are invalid:
849 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
851 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
852 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
853 0, 0, // C0,C1
854
855 // two-byte sequences:
856 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
857 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
858
859 // three-byte sequences:
860 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
861
862 // four-byte sequences:
863 4, 4, 4, 4, 4, // F0..F4
864
865 // these are invalid again (5- or 6-byte
866 // sequences and sequences for code points
867 // above U+10FFFF, as restricted by RFC 3629):
868 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
869};
870
871size_t
872wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
873 const char *src, size_t srcLen) const
874{
875 wchar_t *out = dstLen ? dst : NULL;
876 size_t written = 0;
877
878 if ( srcLen == wxNO_LEN )
879 srcLen = strlen(src) + 1;
880
881 for ( const char *p = src; ; p++ )
882 {
883 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
884 {
885 // all done successfully, just add the trailing NULL if we are not
886 // using explicit length
887 if ( srcLen == wxNO_LEN )
888 {
889 if ( out )
890 {
891 if ( !dstLen )
892 break;
893
894 *out = L'\0';
895 }
896
897 written++;
898 }
899
900 return written;
901 }
902
0286d08d
VZ
903 if ( out && !dstLen-- )
904 break;
905
5367a38a
VS
906 wxUint32 code;
907 unsigned char c = *p;
0286d08d 908
5367a38a
VS
909 if ( c < 0x80 )
910 {
911 if ( srcLen == 0 ) // the test works for wxNO_LEN too
912 break;
0286d08d 913
5367a38a
VS
914 if ( srcLen != wxNO_LEN )
915 srcLen--;
0286d08d 916
5367a38a
VS
917 code = c;
918 }
919 else
0286d08d 920 {
5367a38a
VS
921 unsigned len = tableUtf8Lengths[c];
922 if ( !len )
923 break;
924
925 if ( srcLen < len ) // the test works for wxNO_LEN too
926 break;
927
928 if ( srcLen != wxNO_LEN )
929 srcLen -= len;
930
931 // Char. number range | UTF-8 octet sequence
932 // (hexadecimal) | (binary)
933 // ----------------------+----------------------------------------
934 // 0000 0000 - 0000 007F | 0xxxxxxx
935 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
936 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
937 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
938 //
939 // Code point value is stored in bits marked with 'x',
940 // lowest-order bit of the value on the right side in the diagram
941 // above. (from RFC 3629)
942
943 // mask to extract lead byte's value ('x' bits above), by sequence
944 // length:
945 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
946
947 // mask and value of lead byte's most significant bits, by length:
948 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
949 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
950
951 len--; // it's more convenient to work with 0-based length here
952
953 // extract the lead byte's value bits:
954 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
955 break;
956
957 code = c & leadValueMask[len];
958
959 // all remaining bytes, if any, are handled in the same way
960 // regardless of sequence's length:
961 for ( ; len; --len )
962 {
963 c = *++p;
964 if ( (c & 0xC0) != 0x80 )
965 return wxCONV_FAILED;
0286d08d 966
5367a38a
VS
967 code <<= 6;
968 code |= c & 0x3F;
969 }
0286d08d
VZ
970 }
971
972#ifdef WC_UTF16
973 // cast is ok because wchar_t == wxUint16 if WC_UTF16
974 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
975 {
976 if ( out )
977 out++;
978 written++;
979 }
980#else // !WC_UTF16
981 if ( out )
982 *out = code;
983#endif // WC_UTF16/!WC_UTF16
984
985 if ( out )
986 out++;
987
988 written++;
989 }
990
991 return wxCONV_FAILED;
992}
993
994size_t
995wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
996 const wchar_t *src, size_t srcLen) const
997{
998 char *out = dstLen ? dst : NULL;
999 size_t written = 0;
1000
1001 for ( const wchar_t *wp = src; ; wp++ )
1002 {
a964d3ed 1003 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
0286d08d
VZ
1004 {
1005 // all done successfully, just add the trailing NULL if we are not
1006 // using explicit length
1007 if ( srcLen == wxNO_LEN )
1008 {
1009 if ( out )
1010 {
1011 if ( !dstLen )
1012 break;
1013
1014 *out = '\0';
1015 }
1016
1017 written++;
1018 }
1019
1020 return written;
1021 }
1022
a964d3ed
VZ
1023 if ( srcLen != wxNO_LEN )
1024 srcLen--;
0286d08d
VZ
1025
1026 wxUint32 code;
1027#ifdef WC_UTF16
1028 // cast is ok for WC_UTF16
1029 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1030 {
1031 // skip the next char too as we decoded a surrogate
1032 wp++;
1033 }
1034#else // wchar_t is UTF-32
1035 code = *wp & 0x7fffffff;
1036#endif
1037
1038 unsigned len;
1039 if ( code <= 0x7F )
1040 {
1041 len = 1;
1042 if ( out )
1043 {
1044 if ( dstLen < len )
1045 break;
1046
1047 out[0] = (char)code;
1048 }
1049 }
1050 else if ( code <= 0x07FF )
1051 {
1052 len = 2;
1053 if ( out )
1054 {
1055 if ( dstLen < len )
1056 break;
1057
1058 // NB: this line takes 6 least significant bits, encodes them as
1059 // 10xxxxxx and discards them so that the next byte can be encoded:
1060 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1061 out[0] = 0xC0 | code;
1062 }
1063 }
1064 else if ( code < 0xFFFF )
1065 {
1066 len = 3;
1067 if ( out )
1068 {
1069 if ( dstLen < len )
1070 break;
1071
1072 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1073 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1074 out[0] = 0xE0 | code;
1075 }
1076 }
1077 else if ( code <= 0x10FFFF )
1078 {
1079 len = 4;
1080 if ( out )
1081 {
1082 if ( dstLen < len )
1083 break;
1084
1085 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1086 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1087 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1088 out[0] = 0xF0 | code;
1089 }
1090 }
1091 else
1092 {
1093 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1094 break;
1095 }
1096
1097 if ( out )
1098 {
1099 out += len;
1100 dstLen -= len;
1101 }
1102
1103 written += len;
1104 }
1105
1106 // we only get here if an error occurs during decoding
1107 return wxCONV_FAILED;
1108}
1109
d16d0917
VZ
1110size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1111 const char *psz, size_t srcLen) const
6001e347 1112{
0286d08d 1113 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1114 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1115
4def3b35
VS
1116 size_t len = 0;
1117
d16d0917 1118 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1119 {
ea8ce907
RR
1120 const char *opsz = psz;
1121 bool invalid = false;
4def3b35
VS
1122 unsigned char cc = *psz++, fc = cc;
1123 unsigned cnt;
dccce9ea 1124 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1125 fc <<= 1;
ef199164 1126
dccce9ea 1127 if (!cnt)
4def3b35
VS
1128 {
1129 // plain ASCII char
dccce9ea 1130 if (buf)
4def3b35
VS
1131 *buf++ = cc;
1132 len++;
561488ef
MW
1133
1134 // escape the escape character for octal escapes
1135 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1136 && cc == '\\' && (!buf || len < n))
1137 {
1138 if (buf)
1139 *buf++ = cc;
1140 len++;
1141 }
dccce9ea
VZ
1142 }
1143 else
4def3b35
VS
1144 {
1145 cnt--;
dccce9ea 1146 if (!cnt)
4def3b35
VS
1147 {
1148 // invalid UTF-8 sequence
ea8ce907 1149 invalid = true;
dccce9ea
VZ
1150 }
1151 else
4def3b35
VS
1152 {
1153 unsigned ocnt = cnt - 1;
1154 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1155 while (cnt--)
4def3b35 1156 {
ea8ce907 1157 cc = *psz;
dccce9ea 1158 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1159 {
1160 // invalid UTF-8 sequence
ea8ce907
RR
1161 invalid = true;
1162 break;
4def3b35 1163 }
ef199164 1164
ea8ce907 1165 psz++;
4def3b35
VS
1166 res = (res << 6) | (cc & 0x3f);
1167 }
ef199164 1168
ea8ce907 1169 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1170 {
1171 // illegal UTF-8 encoding
ea8ce907 1172 invalid = true;
4def3b35 1173 }
ea8ce907
RR
1174 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1175 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1176 {
1177 // if one of our PUA characters turns up externally
1178 // it must also be treated as an illegal sequence
1179 // (a bit like you have to escape an escape character)
1180 invalid = true;
1181 }
1182 else
1183 {
1cd52418 1184#ifdef WC_UTF16
0286d08d 1185 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1186 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1187 if (pa == wxCONV_FAILED)
ea8ce907
RR
1188 {
1189 invalid = true;
1190 }
1191 else
1192 {
1193 if (buf)
1194 buf += pa;
1195 len += pa;
1196 }
373658eb 1197#else // !WC_UTF16
ea8ce907 1198 if (buf)
38d4b1e4 1199 *buf++ = (wchar_t)res;
ea8ce907 1200 len++;
373658eb 1201#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1202 }
1203 }
ef199164 1204
ea8ce907
RR
1205 if (invalid)
1206 {
1207 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1208 {
1209 while (opsz < psz && (!buf || len < n))
1210 {
1211#ifdef WC_UTF16
1212 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1213 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1214 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1215 if (buf)
1216 buf += pa;
1217 opsz++;
1218 len += pa;
1219#else
1220 if (buf)
38d4b1e4 1221 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1222 opsz++;
1223 len++;
1224#endif
1225 }
1226 }
3698ae71 1227 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1228 {
1229 while (opsz < psz && (!buf || len < n))
1230 {
3698ae71
VZ
1231 if ( buf && len + 3 < n )
1232 {
17a1ebd1 1233 unsigned char on = *opsz;
3698ae71 1234 *buf++ = L'\\';
17a1ebd1
VZ
1235 *buf++ = (wchar_t)( L'0' + on / 0100 );
1236 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1237 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1238 }
ef199164 1239
ea8ce907
RR
1240 opsz++;
1241 len += 4;
1242 }
1243 }
3698ae71 1244 else // MAP_INVALID_UTF8_NOT
ea8ce907 1245 {
467e0479 1246 return wxCONV_FAILED;
ea8ce907 1247 }
4def3b35
VS
1248 }
1249 }
6001e347 1250 }
ef199164 1251
d16d0917 1252 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1253 *buf = 0;
ef199164 1254
d16d0917 1255 return len + 1;
6001e347
RR
1256}
1257
3698ae71
VZ
// Return true if wch is one of the octal digits L'0' .. L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1262
d16d0917
VZ
1263size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1264 const wchar_t *psz, size_t srcLen) const
6001e347 1265{
0286d08d 1266 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1267 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1268
4def3b35 1269 size_t len = 0;
6001e347 1270
d16d0917 1271 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1272 {
1273 wxUint32 cc;
ef199164 1274
1cd52418 1275#ifdef WC_UTF16
b5153fd8
VZ
1276 // cast is ok for WC_UTF16
1277 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1278 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1279#else
ef199164 1280 cc = (*psz++) & 0x7fffffff;
4def3b35 1281#endif
3698ae71
VZ
1282
1283 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1284 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1285 {
dccce9ea 1286 if (buf)
ea8ce907 1287 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1288 len++;
3698ae71 1289 }
561488ef
MW
1290 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1291 && cc == L'\\' && psz[0] == L'\\' )
1292 {
1293 if (buf)
1294 *buf++ = (char)cc;
1295 psz++;
1296 len++;
1297 }
3698ae71
VZ
1298 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1299 cc == L'\\' &&
1300 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1301 {
dccce9ea 1302 if (buf)
3698ae71 1303 {
ef199164
DS
1304 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1305 (psz[1] - L'0') * 010 +
b2c13097 1306 (psz[2] - L'0'));
3698ae71
VZ
1307 }
1308
1309 psz += 3;
ea8ce907
RR
1310 len++;
1311 }
1312 else
1313 {
1314 unsigned cnt;
ef199164
DS
1315 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1316 {
1317 }
1318
ea8ce907 1319 if (!cnt)
4def3b35 1320 {
ea8ce907
RR
1321 // plain ASCII char
1322 if (buf)
1323 *buf++ = (char) cc;
1324 len++;
1325 }
ea8ce907
RR
1326 else
1327 {
1328 len += cnt + 1;
1329 if (buf)
1330 {
1331 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1332 while (cnt--)
1333 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1334 }
4def3b35
VS
1335 }
1336 }
6001e347 1337 }
4def3b35 1338
d16d0917 1339 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1340 *buf = 0;
adb45366 1341
d16d0917 1342 return len + 1;
6001e347
RR
1343}
1344
467e0479 1345// ============================================================================
c91830cb 1346// UTF-16
467e0479 1347// ============================================================================
c91830cb
VZ
1348
1349#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1350 #define wxMBConvUTF16straight wxMBConvUTF16BE
1351 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1352#else
bde4baac
VZ
1353 #define wxMBConvUTF16swap wxMBConvUTF16BE
1354 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1355#endif
1356
467e0479
VZ
1357/* static */
1358size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1359{
1360 if ( srcLen == wxNO_LEN )
1361 {
1362 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1363 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1364 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1365 ;
c91830cb 1366
467e0479
VZ
1367 srcLen *= BYTES_PER_CHAR;
1368 }
1369 else // we already have the length
1370 {
1371 // we can only convert an entire number of UTF-16 characters
1372 if ( srcLen % BYTES_PER_CHAR )
1373 return wxCONV_FAILED;
1374 }
1375
1376 return srcLen;
1377}
1378
1379// case when in-memory representation is UTF-16 too
c91830cb
VZ
1380#ifdef WC_UTF16
1381
467e0479
VZ
1382// ----------------------------------------------------------------------------
1383// conversions without endianness change
1384// ----------------------------------------------------------------------------
1385
1386size_t
1387wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1388 const char *src, size_t srcLen) const
c91830cb 1389{
467e0479
VZ
1390 // set up the scene for using memcpy() (which is presumably more efficient
1391 // than copying the bytes one by one)
1392 srcLen = GetLength(src, srcLen);
1393 if ( srcLen == wxNO_LEN )
1394 return wxCONV_FAILED;
c91830cb 1395
ef199164 1396 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1397 if ( dst )
c91830cb 1398 {
467e0479
VZ
1399 if ( dstLen < inLen )
1400 return wxCONV_FAILED;
c91830cb 1401
467e0479 1402 memcpy(dst, src, srcLen);
c91830cb 1403 }
d32a507d 1404
467e0479 1405 return inLen;
c91830cb
VZ
1406}
1407
467e0479
VZ
1408size_t
1409wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1410 const wchar_t *src, size_t srcLen) const
c91830cb 1411{
467e0479
VZ
1412 if ( srcLen == wxNO_LEN )
1413 srcLen = wxWcslen(src) + 1;
c91830cb 1414
467e0479
VZ
1415 srcLen *= BYTES_PER_CHAR;
1416
1417 if ( dst )
c91830cb 1418 {
467e0479
VZ
1419 if ( dstLen < srcLen )
1420 return wxCONV_FAILED;
d32a507d 1421
467e0479 1422 memcpy(dst, src, srcLen);
c91830cb 1423 }
d32a507d 1424
467e0479 1425 return srcLen;
c91830cb
VZ
1426}
1427
467e0479
VZ
1428// ----------------------------------------------------------------------------
1429// endian-reversing conversions
1430// ----------------------------------------------------------------------------
c91830cb 1431
467e0479
VZ
1432size_t
1433wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1434 const char *src, size_t srcLen) const
c91830cb 1435{
467e0479
VZ
1436 srcLen = GetLength(src, srcLen);
1437 if ( srcLen == wxNO_LEN )
1438 return wxCONV_FAILED;
c91830cb 1439
467e0479
VZ
1440 srcLen /= BYTES_PER_CHAR;
1441
1442 if ( dst )
c91830cb 1443 {
467e0479
VZ
1444 if ( dstLen < srcLen )
1445 return wxCONV_FAILED;
1446
ef199164
DS
1447 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1448 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1449 {
ef199164 1450 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1451 }
c91830cb 1452 }
bfab25d4 1453
467e0479 1454 return srcLen;
c91830cb
VZ
1455}
1456
467e0479
VZ
1457size_t
1458wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1459 const wchar_t *src, size_t srcLen) const
c91830cb 1460{
467e0479
VZ
1461 if ( srcLen == wxNO_LEN )
1462 srcLen = wxWcslen(src) + 1;
c91830cb 1463
467e0479
VZ
1464 srcLen *= BYTES_PER_CHAR;
1465
1466 if ( dst )
c91830cb 1467 {
467e0479
VZ
1468 if ( dstLen < srcLen )
1469 return wxCONV_FAILED;
1470
ef199164 1471 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1472 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1473 {
ef199164 1474 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1475 }
c91830cb 1476 }
eec47cc6 1477
467e0479 1478 return srcLen;
c91830cb
VZ
1479}
1480
467e0479 1481#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1482
467e0479
VZ
1483// ----------------------------------------------------------------------------
1484// conversions without endianness change
1485// ----------------------------------------------------------------------------
c91830cb 1486
35d11700
VZ
1487size_t
1488wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1489 const char *src, size_t srcLen) const
c91830cb 1490{
35d11700
VZ
1491 srcLen = GetLength(src, srcLen);
1492 if ( srcLen == wxNO_LEN )
1493 return wxCONV_FAILED;
c91830cb 1494
ef199164 1495 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1496 if ( !dst )
c91830cb 1497 {
35d11700
VZ
1498 // optimization: return maximal space which could be needed for this
1499 // string even if the real size could be smaller if the buffer contains
1500 // any surrogates
1501 return inLen;
c91830cb 1502 }
c91830cb 1503
35d11700 1504 size_t outLen = 0;
ef199164
DS
1505 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1506 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1507 {
ef199164
DS
1508 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1509 if ( !inBuff )
35d11700
VZ
1510 return wxCONV_FAILED;
1511
1512 if ( ++outLen > dstLen )
1513 return wxCONV_FAILED;
c91830cb 1514
35d11700
VZ
1515 *dst++ = ch;
1516 }
1517
1518
1519 return outLen;
1520}
c91830cb 1521
35d11700
VZ
1522size_t
1523wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1524 const wchar_t *src, size_t srcLen) const
c91830cb 1525{
35d11700
VZ
1526 if ( srcLen == wxNO_LEN )
1527 srcLen = wxWcslen(src) + 1;
c91830cb 1528
35d11700 1529 size_t outLen = 0;
ef199164 1530 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1531 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1532 {
1533 wxUint16 cc[2];
35d11700
VZ
1534 const size_t numChars = encode_utf16(*src++, cc);
1535 if ( numChars == wxCONV_FAILED )
1536 return wxCONV_FAILED;
c91830cb 1537
ef199164
DS
1538 outLen += numChars * BYTES_PER_CHAR;
1539 if ( outBuff )
c91830cb 1540 {
35d11700
VZ
1541 if ( outLen > dstLen )
1542 return wxCONV_FAILED;
1543
ef199164 1544 *outBuff++ = cc[0];
35d11700 1545 if ( numChars == 2 )
69b80d28 1546 {
35d11700 1547 // second character of a surrogate
ef199164 1548 *outBuff++ = cc[1];
69b80d28 1549 }
c91830cb 1550 }
c91830cb 1551 }
c91830cb 1552
35d11700 1553 return outLen;
c91830cb
VZ
1554}
1555
467e0479
VZ
1556// ----------------------------------------------------------------------------
1557// endian-reversing conversions
1558// ----------------------------------------------------------------------------
c91830cb 1559
35d11700
VZ
1560size_t
1561wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1562 const char *src, size_t srcLen) const
c91830cb 1563{
35d11700
VZ
1564 srcLen = GetLength(src, srcLen);
1565 if ( srcLen == wxNO_LEN )
1566 return wxCONV_FAILED;
1567
ef199164 1568 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1569 if ( !dst )
1570 {
1571 // optimization: return maximal space which could be needed for this
1572 // string even if the real size could be smaller if the buffer contains
1573 // any surrogates
1574 return inLen;
1575 }
c91830cb 1576
35d11700 1577 size_t outLen = 0;
ef199164
DS
1578 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1579 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1580 {
35d11700
VZ
1581 wxUint32 ch;
1582 wxUint16 tmp[2];
ef199164
DS
1583
1584 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1585 inBuff++;
1586 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1587
35d11700
VZ
1588 const size_t numChars = decode_utf16(tmp, ch);
1589 if ( numChars == wxCONV_FAILED )
1590 return wxCONV_FAILED;
c91830cb 1591
35d11700 1592 if ( numChars == 2 )
ef199164 1593 inBuff++;
35d11700
VZ
1594
1595 if ( ++outLen > dstLen )
1596 return wxCONV_FAILED;
c91830cb 1597
35d11700 1598 *dst++ = ch;
c91830cb 1599 }
c91830cb 1600
c91830cb 1601
35d11700
VZ
1602 return outLen;
1603}
c91830cb 1604
35d11700
VZ
1605size_t
1606wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1607 const wchar_t *src, size_t srcLen) const
c91830cb 1608{
35d11700
VZ
1609 if ( srcLen == wxNO_LEN )
1610 srcLen = wxWcslen(src) + 1;
c91830cb 1611
35d11700 1612 size_t outLen = 0;
ef199164 1613 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1614 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1615 {
1616 wxUint16 cc[2];
35d11700
VZ
1617 const size_t numChars = encode_utf16(*src, cc);
1618 if ( numChars == wxCONV_FAILED )
1619 return wxCONV_FAILED;
c91830cb 1620
ef199164
DS
1621 outLen += numChars * BYTES_PER_CHAR;
1622 if ( outBuff )
c91830cb 1623 {
35d11700
VZ
1624 if ( outLen > dstLen )
1625 return wxCONV_FAILED;
1626
ef199164 1627 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1628 if ( numChars == 2 )
c91830cb 1629 {
35d11700 1630 // second character of a surrogate
ef199164 1631 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1632 }
1633 }
c91830cb 1634 }
c91830cb 1635
35d11700 1636 return outLen;
c91830cb
VZ
1637}
1638
467e0479 1639#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1640
1641
35d11700 1642// ============================================================================
c91830cb 1643// UTF-32
35d11700 1644// ============================================================================
c91830cb
VZ
1645
1646#ifdef WORDS_BIGENDIAN
467e0479
VZ
1647 #define wxMBConvUTF32straight wxMBConvUTF32BE
1648 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1649#else
467e0479
VZ
1650 #define wxMBConvUTF32swap wxMBConvUTF32BE
1651 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1652#endif
1653
1654
1655WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1656WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1657
467e0479
VZ
1658/* static */
1659size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1660{
1661 if ( srcLen == wxNO_LEN )
1662 {
1663 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1664 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1665 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1666 ;
c91830cb 1667
467e0479
VZ
1668 srcLen *= BYTES_PER_CHAR;
1669 }
1670 else // we already have the length
1671 {
1672 // we can only convert an entire number of UTF-32 characters
1673 if ( srcLen % BYTES_PER_CHAR )
1674 return wxCONV_FAILED;
1675 }
1676
1677 return srcLen;
1678}
1679
1680// case when in-memory representation is UTF-16
c91830cb
VZ
1681#ifdef WC_UTF16
1682
467e0479
VZ
1683// ----------------------------------------------------------------------------
1684// conversions without endianness change
1685// ----------------------------------------------------------------------------
1686
1687size_t
1688wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1689 const char *src, size_t srcLen) const
c91830cb 1690{
467e0479
VZ
1691 srcLen = GetLength(src, srcLen);
1692 if ( srcLen == wxNO_LEN )
1693 return wxCONV_FAILED;
c91830cb 1694
ef199164
DS
1695 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1696 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1697 size_t outLen = 0;
1698 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1699 {
1700 wxUint16 cc[2];
ef199164 1701 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1702 if ( numChars == wxCONV_FAILED )
1703 return wxCONV_FAILED;
c91830cb 1704
467e0479
VZ
1705 outLen += numChars;
1706 if ( dst )
c91830cb 1707 {
467e0479
VZ
1708 if ( outLen > dstLen )
1709 return wxCONV_FAILED;
d32a507d 1710
467e0479
VZ
1711 *dst++ = cc[0];
1712 if ( numChars == 2 )
1713 {
1714 // second character of a surrogate
1715 *dst++ = cc[1];
1716 }
1717 }
c91830cb 1718 }
d32a507d 1719
467e0479 1720 return outLen;
c91830cb
VZ
1721}
1722
467e0479
VZ
1723size_t
1724wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1725 const wchar_t *src, size_t srcLen) const
c91830cb 1726{
467e0479
VZ
1727 if ( srcLen == wxNO_LEN )
1728 srcLen = wxWcslen(src) + 1;
c91830cb 1729
467e0479 1730 if ( !dst )
c91830cb 1731 {
467e0479
VZ
1732 // optimization: return maximal space which could be needed for this
1733 // string instead of the exact amount which could be less if there are
1734 // any surrogates in the input
1735 //
1736 // we consider that surrogates are rare enough to make it worthwhile to
1737 // avoid running the loop below at the cost of slightly extra memory
1738 // consumption
ef199164 1739 return srcLen * BYTES_PER_CHAR;
467e0479 1740 }
c91830cb 1741
ef199164 1742 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1743 size_t outLen = 0;
1744 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1745 {
1746 const wxUint32 ch = wxDecodeSurrogate(&src);
1747 if ( !src )
1748 return wxCONV_FAILED;
c91830cb 1749
467e0479 1750 outLen += BYTES_PER_CHAR;
d32a507d 1751
467e0479
VZ
1752 if ( outLen > dstLen )
1753 return wxCONV_FAILED;
b5153fd8 1754
ef199164 1755 *outBuff++ = ch;
467e0479 1756 }
c91830cb 1757
467e0479 1758 return outLen;
c91830cb
VZ
1759}
1760
467e0479
VZ
1761// ----------------------------------------------------------------------------
1762// endian-reversing conversions
1763// ----------------------------------------------------------------------------
c91830cb 1764
467e0479
VZ
1765size_t
1766wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1767 const char *src, size_t srcLen) const
c91830cb 1768{
467e0479
VZ
1769 srcLen = GetLength(src, srcLen);
1770 if ( srcLen == wxNO_LEN )
1771 return wxCONV_FAILED;
c91830cb 1772
ef199164
DS
1773 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1774 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1775 size_t outLen = 0;
ef199164 1776 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1777 {
c91830cb 1778 wxUint16 cc[2];
ef199164 1779 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1780 if ( numChars == wxCONV_FAILED )
1781 return wxCONV_FAILED;
c91830cb 1782
467e0479
VZ
1783 outLen += numChars;
1784 if ( dst )
c91830cb 1785 {
467e0479
VZ
1786 if ( outLen > dstLen )
1787 return wxCONV_FAILED;
d32a507d 1788
467e0479
VZ
1789 *dst++ = cc[0];
1790 if ( numChars == 2 )
1791 {
1792 // second character of a surrogate
1793 *dst++ = cc[1];
1794 }
1795 }
c91830cb 1796 }
b5153fd8 1797
467e0479 1798 return outLen;
c91830cb
VZ
1799}
1800
467e0479
VZ
1801size_t
1802wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1803 const wchar_t *src, size_t srcLen) const
c91830cb 1804{
467e0479
VZ
1805 if ( srcLen == wxNO_LEN )
1806 srcLen = wxWcslen(src) + 1;
c91830cb 1807
467e0479 1808 if ( !dst )
c91830cb 1809 {
467e0479
VZ
1810 // optimization: return maximal space which could be needed for this
1811 // string instead of the exact amount which could be less if there are
1812 // any surrogates in the input
1813 //
1814 // we consider that surrogates are rare enough to make it worthwhile to
1815 // avoid running the loop below at the cost of slightly extra memory
1816 // consumption
1817 return srcLen*BYTES_PER_CHAR;
1818 }
c91830cb 1819
ef199164 1820 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1821 size_t outLen = 0;
1822 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1823 {
1824 const wxUint32 ch = wxDecodeSurrogate(&src);
1825 if ( !src )
1826 return wxCONV_FAILED;
c91830cb 1827
467e0479 1828 outLen += BYTES_PER_CHAR;
d32a507d 1829
467e0479
VZ
1830 if ( outLen > dstLen )
1831 return wxCONV_FAILED;
b5153fd8 1832
ef199164 1833 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1834 }
c91830cb 1835
467e0479 1836 return outLen;
c91830cb
VZ
1837}
1838
467e0479 1839#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1840
35d11700
VZ
1841// ----------------------------------------------------------------------------
1842// conversions without endianness change
1843// ----------------------------------------------------------------------------
1844
1845size_t
1846wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1847 const char *src, size_t srcLen) const
c91830cb 1848{
35d11700
VZ
1849 // use memcpy() as it should be much faster than hand-written loop
1850 srcLen = GetLength(src, srcLen);
1851 if ( srcLen == wxNO_LEN )
1852 return wxCONV_FAILED;
c91830cb 1853
35d11700
VZ
1854 const size_t inLen = srcLen/BYTES_PER_CHAR;
1855 if ( dst )
c91830cb 1856 {
35d11700
VZ
1857 if ( dstLen < inLen )
1858 return wxCONV_FAILED;
b5153fd8 1859
35d11700
VZ
1860 memcpy(dst, src, srcLen);
1861 }
c91830cb 1862
35d11700 1863 return inLen;
c91830cb
VZ
1864}
1865
35d11700
VZ
1866size_t
1867wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1868 const wchar_t *src, size_t srcLen) const
c91830cb 1869{
35d11700
VZ
1870 if ( srcLen == wxNO_LEN )
1871 srcLen = wxWcslen(src) + 1;
1872
1873 srcLen *= BYTES_PER_CHAR;
c91830cb 1874
35d11700 1875 if ( dst )
c91830cb 1876 {
35d11700
VZ
1877 if ( dstLen < srcLen )
1878 return wxCONV_FAILED;
c91830cb 1879
35d11700 1880 memcpy(dst, src, srcLen);
c91830cb
VZ
1881 }
1882
35d11700 1883 return srcLen;
c91830cb
VZ
1884}
1885
35d11700
VZ
1886// ----------------------------------------------------------------------------
1887// endian-reversing conversions
1888// ----------------------------------------------------------------------------
c91830cb 1889
35d11700
VZ
1890size_t
1891wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1892 const char *src, size_t srcLen) const
c91830cb 1893{
35d11700
VZ
1894 srcLen = GetLength(src, srcLen);
1895 if ( srcLen == wxNO_LEN )
1896 return wxCONV_FAILED;
1897
1898 srcLen /= BYTES_PER_CHAR;
c91830cb 1899
35d11700 1900 if ( dst )
c91830cb 1901 {
35d11700
VZ
1902 if ( dstLen < srcLen )
1903 return wxCONV_FAILED;
1904
ef199164
DS
1905 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1906 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1907 {
ef199164 1908 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1909 }
c91830cb 1910 }
b5153fd8 1911
35d11700 1912 return srcLen;
c91830cb
VZ
1913}
1914
35d11700
VZ
1915size_t
1916wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1917 const wchar_t *src, size_t srcLen) const
c91830cb 1918{
35d11700
VZ
1919 if ( srcLen == wxNO_LEN )
1920 srcLen = wxWcslen(src) + 1;
1921
1922 srcLen *= BYTES_PER_CHAR;
c91830cb 1923
35d11700 1924 if ( dst )
c91830cb 1925 {
35d11700
VZ
1926 if ( dstLen < srcLen )
1927 return wxCONV_FAILED;
1928
ef199164 1929 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1930 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1931 {
ef199164 1932 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1933 }
c91830cb 1934 }
b5153fd8 1935
35d11700 1936 return srcLen;
c91830cb
VZ
1937}
1938
467e0479 1939#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1940
1941
36acb880
VZ
1942// ============================================================================
1943// The classes doing conversion using the iconv_xxx() functions
1944// ============================================================================
3caec1bb 1945
b040e242 1946#ifdef HAVE_ICONV
3a0d76bc 1947
b1d547eb
VS
1948// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1949// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1950// (unless there's yet another bug in glibc) the only case when iconv()
1951// returns with (size_t)-1 (which means error) and says there are 0 bytes
1952// left in the input buffer -- when _real_ error occurs,
1953// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1954// iconv() failure.
3caec1bb
VS
1955// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) is true if the iconv() call which returned cres
// and left bufLeft bytes in its input buffer really failed.
//
// NB: the macro arguments are parenthesized so that passing expressions
//     containing operators with lower precedence than "==" works as expected
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif
1962
ab217dba 1963#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1964
74a7eb0b
VZ
1965#define ICONV_T_INVALID ((iconv_t)-1)
1966
1967#if SIZEOF_WCHAR_T == 4
1968 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1969 #define WC_ENC wxFONTENCODING_UTF32
1970#elif SIZEOF_WCHAR_T == 2
1971 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1972 #define WC_ENC wxFONTENCODING_UTF16
1973#else // sizeof(wchar_t) != 2 nor 4
1974 // does this ever happen?
1975 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1976#endif
1977
36acb880 1978// ----------------------------------------------------------------------------
e95354ec 1979// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1980// ----------------------------------------------------------------------------
1981
// wxMBConv implementation using the iconv library: it keeps one iconv
// descriptor for each conversion direction.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the charset name, it is passed to iconv_open() by the
    // constructor; use IsOk() to check if the descriptors could be opened
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // implement base class virtual methods
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t GetMBNulLen() const;

#if wxUSE_UNICODE_UTF8
    virtual bool IsUTF8() const;
#endif

    // create a new converter for the same charset name, the already cached
    // value of m_minMBCharWidth is copied to it
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // return true only if both conversion descriptors are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion
    wxString m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
2037
8f115891 2038// make the constructor available for unit testing
86501081 2039WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2040{
2041 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2042 if ( !result->IsOk() )
2043 {
2044 delete result;
2045 return 0;
2046 }
ef199164 2047
8f115891
MW
2048 return result;
2049}
2050
// definitions of the static members: the wide char charset name is initially
// empty and no byte swapping is assumed until the constructor has found a
// working charset
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2053
86501081 2054wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 2055 : m_name(name)
36acb880 2056{
c1464d9d 2057 m_minMBCharWidth = 0;
eec47cc6 2058
36acb880 2059 // check for charset that represents wchar_t:
74a7eb0b 2060 if ( ms_wcCharsetName.empty() )
f1339c56 2061 {
c2b83fdd
VZ
2062 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2063
74a7eb0b
VZ
2064#if wxUSE_FONTMAP
2065 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2066#else // !wxUSE_FONTMAP
91cb7f52 2067 static const wxChar *names_static[] =
36acb880 2068 {
74a7eb0b
VZ
2069#if SIZEOF_WCHAR_T == 4
2070 _T("UCS-4"),
2071#elif SIZEOF_WCHAR_T = 2
2072 _T("UCS-2"),
2073#endif
2074 NULL
2075 };
91cb7f52 2076 const wxChar **names = names_static;
74a7eb0b 2077#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2078
d1f024a8 2079 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2080 {
17a1ebd1 2081 const wxString nameCS(*names);
74a7eb0b
VZ
2082
2083 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2084 wxString nameXE(nameCS);
ef199164
DS
2085
2086#ifdef WORDS_BIGENDIAN
74a7eb0b 2087 nameXE += _T("BE");
ef199164 2088#else // little endian
74a7eb0b 2089 nameXE += _T("LE");
ef199164 2090#endif
74a7eb0b 2091
c2b83fdd
VZ
2092 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2093 nameXE.c_str());
2094
86501081 2095 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2096 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2097 {
74a7eb0b 2098 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
2099 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2100 nameCS.c_str());
86501081 2101 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2102
74a7eb0b
VZ
2103 // and check for bytesex ourselves:
2104 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2105 {
74a7eb0b 2106 char buf[2], *bufPtr;
e8769ed1 2107 wchar_t wbuf[2];
74a7eb0b
VZ
2108 size_t insz, outsz;
2109 size_t res;
2110
2111 buf[0] = 'A';
2112 buf[1] = 0;
2113 wbuf[0] = 0;
2114 insz = 2;
2115 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2116 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2117 bufPtr = buf;
2118
ef199164
DS
2119 res = iconv(
2120 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2121 &wbufPtr, &outsz);
74a7eb0b
VZ
2122
2123 if (ICONV_FAILED(res, insz))
2124 {
2125 wxLogLastError(wxT("iconv"));
422e411e 2126 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2127 nameCS.c_str());
74a7eb0b
VZ
2128 }
2129 else // ok, can convert to this encoding, remember it
2130 {
17a1ebd1 2131 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2132 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2133 }
3a0d76bc
VS
2134 }
2135 }
74a7eb0b 2136 else // use charset not requiring byte swapping
36acb880 2137 {
74a7eb0b 2138 ms_wcCharsetName = nameXE;
36acb880 2139 }
3a0d76bc 2140 }
74a7eb0b 2141
0944fceb 2142 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2143 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2144 ms_wcCharsetName.empty() ? wxString("<none>")
2145 : ms_wcCharsetName,
74a7eb0b
VZ
2146 ms_wcNeedsSwap ? _T(" (needs swap)")
2147 : _T(""));
3a0d76bc 2148 }
36acb880 2149 else // we already have ms_wcCharsetName
3caec1bb 2150 {
86501081 2151 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2152 }
dccce9ea 2153
74a7eb0b 2154 if ( ms_wcCharsetName.empty() )
f1339c56 2155 {
74a7eb0b 2156 w2m = ICONV_T_INVALID;
36acb880 2157 }
405d8f46
VZ
2158 else
2159 {
86501081 2160 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2161 if ( w2m == ICONV_T_INVALID )
2162 {
2163 wxLogTrace(TRACE_STRCONV,
2164 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2165 ms_wcCharsetName.c_str(), name);
74a7eb0b 2166 }
405d8f46 2167 }
36acb880 2168}
3caec1bb 2169
e95354ec 2170wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2171{
74a7eb0b 2172 if ( m2w != ICONV_T_INVALID )
36acb880 2173 iconv_close(m2w);
74a7eb0b 2174 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2175 iconv_close(w2m);
2176}
3a0d76bc 2177
8f4b0f43
VZ
2178size_t
2179wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2180 const char *src, size_t srcLen) const
36acb880 2181{
8f4b0f43 2182 if ( srcLen == wxNO_LEN )
69373110 2183 {
8f4b0f43
VZ
2184 // find the string length: notice that must be done differently for
2185 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2186 // consecutive NULs
2187 const size_t nulLen = GetMBNulLen();
2188 switch ( nulLen )
2189 {
2190 default:
2191 return wxCONV_FAILED;
69373110 2192
8f4b0f43
VZ
2193 case 1:
2194 srcLen = strlen(src); // arguably more optimized than our version
2195 break;
69373110 2196
8f4b0f43
VZ
2197 case 2:
2198 case 4:
2199 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2200 // but they also have to start at character boundary and not
2201 // span two adjacent characters
2202 const char *p;
2203 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2204 ;
2205 srcLen = p - src;
2206 break;
2207 }
d50c0831
VZ
2208
2209 // when we're determining the length of the string ourselves we count
2210 // the terminating NUL(s) as part of it and always NUL-terminate the
2211 // output
2212 srcLen += nulLen;
69373110
VZ
2213 }
2214
8f4b0f43
VZ
2215 // we express length in the number of (wide) characters but iconv always
2216 // counts buffer sizes it in bytes
2217 dstLen *= SIZEOF_WCHAR_T;
2218
b1d547eb 2219#if wxUSE_THREADS
6a17b868
SN
2220 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2221 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2222 // wxConvLocal that are used all over wx code, so we have to make sure
2223 // the handle is used by at most one thread at the time. Otherwise
2224 // only a few wx classes would be safe to use from non-main threads
2225 // as MB<->WC conversion would fail "randomly".
2226 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2227#endif // wxUSE_THREADS
2228
36acb880 2229 size_t res, cres;
8f4b0f43 2230 const char *pszPtr = src;
36acb880 2231
8f4b0f43 2232 if ( dst )
36acb880 2233 {
8f4b0f43 2234 char* bufPtr = (char*)dst;
e8769ed1 2235
36acb880 2236 // have destination buffer, convert there
1752fda6 2237 size_t dstLenOrig = dstLen;
36acb880 2238 cres = iconv(m2w,
8f4b0f43
VZ
2239 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2240 &bufPtr, &dstLen);
1752fda6
VZ
2241
2242 // convert the number of bytes converted as returned by iconv to the
2243 // number of (wide) characters converted that we need
2244 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2245
36acb880 2246 if (ms_wcNeedsSwap)
3a0d76bc 2247 {
36acb880 2248 // convert to native endianness
17a1ebd1 2249 for ( unsigned i = 0; i < res; i++ )
467a2982 2250 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2251 }
36acb880 2252 }
8f4b0f43 2253 else // no destination buffer
36acb880 2254 {
8f4b0f43 2255 // convert using temp buffer to calculate the size of the buffer needed
36acb880
VZ
2256 wchar_t tbuf[8];
2257 res = 0;
ef199164
DS
2258
2259 do
2260 {
e8769ed1 2261 char* bufPtr = (char*)tbuf;
8f4b0f43 2262 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2263
2264 cres = iconv(m2w,
8f4b0f43
VZ
2265 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2266 &bufPtr, &dstLen );
36acb880 2267
8f4b0f43 2268 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2269 }
2270 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2271 }
dccce9ea 2272
8f4b0f43 2273 if (ICONV_FAILED(cres, srcLen))
f1339c56 2274 {
36acb880 2275 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2276 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2277 return wxCONV_FAILED;
36acb880
VZ
2278 }
2279
2280 return res;
2281}
2282
8f4b0f43
VZ
2283size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2284 const wchar_t *src, size_t srcLen) const
36acb880 2285{
b1d547eb
VS
2286#if wxUSE_THREADS
2287 // NB: explained in MB2WC
2288 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2289#endif
3698ae71 2290
8f4b0f43 2291 if ( srcLen == wxNO_LEN )
2588ee86 2292 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2293
2294 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2295 size_t outbuflen = dstLen;
36acb880 2296 size_t res, cres;
3a0d76bc 2297
36acb880 2298 wchar_t *tmpbuf = 0;
3caec1bb 2299
36acb880
VZ
2300 if (ms_wcNeedsSwap)
2301 {
2302 // need to copy to temp buffer to switch endianness
74a7eb0b 2303 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 2304 // could be in read-only memory, or be accessed in some other thread)
e8769ed1 2305 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
8f4b0f43
VZ
2306 for ( size_t i = 0; i < srcLen; i++ )
2307 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2308
8f4b0f43
VZ
2309 tmpbuf[srcLen] = L'\0';
2310 src = tmpbuf;
36acb880 2311 }
3a0d76bc 2312
8f4b0f43
VZ
2313 char* inbuf = (char*)src;
2314 if ( dst )
36acb880
VZ
2315 {
2316 // have destination buffer, convert there
8f4b0f43 2317 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2318
8f4b0f43 2319 res = dstLen - outbuflen;
36acb880 2320 }
8f4b0f43 2321 else // no destination buffer
36acb880 2322 {
8f4b0f43 2323 // convert using temp buffer to calculate the size of the buffer needed
36acb880
VZ
2324 char tbuf[16];
2325 res = 0;
ef199164
DS
2326 do
2327 {
8f4b0f43 2328 dst = tbuf;
e8769ed1 2329 outbuflen = 16;
36acb880 2330
8f4b0f43 2331 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2332
e8769ed1 2333 res += 16 - outbuflen;
ef199164
DS
2334 }
2335 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2336 }
dccce9ea 2337
36acb880
VZ
2338 if (ms_wcNeedsSwap)
2339 {
2340 free(tmpbuf);
2341 }
dccce9ea 2342
e8769ed1 2343 if (ICONV_FAILED(cres, inbuflen))
36acb880 2344 {
ce6f8d6f 2345 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2346 return wxCONV_FAILED;
36acb880
VZ
2347 }
2348
2349 return res;
2350}
2351
7ef3ab50 2352size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2353{
c1464d9d 2354 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2355 {
2356 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2357
2358#if wxUSE_THREADS
2359 // NB: explained in MB2WC
2360 wxMutexLocker lock(self->m_iconvMutex);
2361#endif
2362
999020e1 2363 const wchar_t *wnul = L"";
c1464d9d 2364 char buf[8]; // should be enough for NUL in any encoding
356410fc 2365 size_t inLen = sizeof(wchar_t),
c1464d9d 2366 outLen = WXSIZEOF(buf);
ef199164
DS
2367 char *inBuff = (char *)wnul;
2368 char *outBuff = buf;
2369 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2370 {
c1464d9d 2371 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2372 }
2373 else // ok
2374 {
ef199164 2375 self->m_minMBCharWidth = outBuff - buf;
356410fc 2376 }
eec47cc6
VZ
2377 }
2378
c1464d9d 2379 return m_minMBCharWidth;
eec47cc6
VZ
2380}
2381
ba98e032
VS
2382#if wxUSE_UNICODE_UTF8
2383bool wxMBConv_iconv::IsUTF8() const
2384{
86501081
VS
2385 return wxStricmp(m_name, "UTF-8") == 0 ||
2386 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2387}
2388#endif
2389
b040e242 2390#endif // HAVE_ICONV
36acb880 2391
e95354ec 2392
36acb880
VZ
2393// ============================================================================
2394// Win32 conversion classes
2395// ============================================================================
1cd52418 2396
e95354ec 2397#ifdef wxHAVE_WIN32_MB2WC
373658eb 2398
8b04d4c4 2399// from utils.cpp
d775fa82 2400#if wxUSE_FONTMAP
86501081 2401extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2402extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2403#endif
373658eb 2404
e95354ec 2405class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2406{
2407public:
bde4baac
VZ
2408 wxMBConv_win32()
2409 {
2410 m_CodePage = CP_ACP;
c1464d9d 2411 m_minMBCharWidth = 0;
bde4baac
VZ
2412 }
2413
d36c9347 2414 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2415 : wxMBConv()
d36c9347
VZ
2416 {
2417 m_CodePage = conv.m_CodePage;
2418 m_minMBCharWidth = conv.m_minMBCharWidth;
2419 }
2420
7608a683 2421#if wxUSE_FONTMAP
86501081 2422 wxMBConv_win32(const char* name)
bde4baac
VZ
2423 {
2424 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2425 m_minMBCharWidth = 0;
bde4baac 2426 }
dccce9ea 2427
e95354ec 2428 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2429 {
2430 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2431 m_minMBCharWidth = 0;
bde4baac 2432 }
eec47cc6 2433#endif // wxUSE_FONTMAP
8b04d4c4 2434
d36c9347 2435 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2436 {
02272c9c
VZ
2437 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2438 // the behaviour is not compatible with the Unix version (using iconv)
2439 // and break the library itself, e.g. wxTextInputStream::NextChar()
2440 // wouldn't work if reading an incomplete MB char didn't result in an
2441 // error
667e5b3e 2442 //
89028980 2443 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2444 // Win XP or newer and it is not supported for UTF-[78] so we always
2445 // use our own conversions in this case. See
89028980
VS
2446 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2447 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2448 if ( m_CodePage == CP_UTF8 )
89028980 2449 {
5487ff0f 2450 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2451 }
830f8f11
VZ
2452
2453 if ( m_CodePage == CP_UTF7 )
2454 {
5487ff0f 2455 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2456 }
2457
2458 int flags = 0;
2459 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2460 IsAtLeastWin2kSP4() )
89028980 2461 {
830f8f11 2462 flags = MB_ERR_INVALID_CHARS;
89028980 2463 }
667e5b3e 2464
2b5f62a0
VZ
2465 const size_t len = ::MultiByteToWideChar
2466 (
2467 m_CodePage, // code page
667e5b3e 2468 flags, // flags: fall on error
2b5f62a0
VZ
2469 psz, // input string
2470 -1, // its length (NUL-terminated)
b4da152e 2471 buf, // output string
2b5f62a0
VZ
2472 buf ? n : 0 // size of output buffer
2473 );
89028980
VS
2474 if ( !len )
2475 {
2476 // function totally failed
467e0479 2477 return wxCONV_FAILED;
89028980
VS
2478 }
2479
2480 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2481 // check if we succeeded, by doing a double trip:
2482 if ( !flags && buf )
2483 {
53c174fc
VZ
2484 const size_t mbLen = strlen(psz);
2485 wxCharBuffer mbBuf(mbLen);
89028980
VS
2486 if ( ::WideCharToMultiByte
2487 (
2488 m_CodePage,
2489 0,
2490 buf,
2491 -1,
2492 mbBuf.data(),
53c174fc 2493 mbLen + 1, // size in bytes, not length
89028980
VS
2494 NULL,
2495 NULL
2496 ) == 0 ||
2497 strcmp(mbBuf, psz) != 0 )
2498 {
2499 // we didn't obtain the same thing we started from, hence
2500 // the conversion was lossy and we consider that it failed
467e0479 2501 return wxCONV_FAILED;
89028980
VS
2502 }
2503 }
2b5f62a0 2504
03a991bc
VZ
2505 // note that it returns count of written chars for buf != NULL and size
2506 // of the needed buffer for buf == NULL so in either case the length of
2507 // the string (which never includes the terminating NUL) is one less
89028980 2508 return len - 1;
f1339c56 2509 }
dccce9ea 2510
d36c9347 2511 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2512 {
13dd924a
VZ
2513 /*
2514 we have a problem here: by default, WideCharToMultiByte() may
2515 replace characters unrepresentable in the target code page with bad
2516 quality approximations such as turning "1/2" symbol (U+00BD) into
2517 "1" for the code pages which don't have it and we, obviously, want
2518 to avoid this at any price
d775fa82 2519
13dd924a
VZ
2520 the trouble is that this function does it _silently_, i.e. it won't
2521 even tell us whether it did or not... Win98/2000 and higher provide
2522 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2523 we have to resort to a round trip, i.e. check that converting back
2524 results in the same string -- this is, of course, expensive but
2525 otherwise we simply can't be sure to not garble the data.
2526 */
2527
2528 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2529 // it doesn't work with CJK encodings (which we test for rather roughly
2530 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2531 // supporting it
907173e5
WS
2532 BOOL usedDef wxDUMMY_INITIALIZE(false);
2533 BOOL *pUsedDef;
13dd924a
VZ
2534 int flags;
2535 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2536 {
2537 // it's our lucky day
2538 flags = WC_NO_BEST_FIT_CHARS;
2539 pUsedDef = &usedDef;
2540 }
2541 else // old system or unsupported encoding
2542 {
2543 flags = 0;
2544 pUsedDef = NULL;
2545 }
2546
2b5f62a0
VZ
2547 const size_t len = ::WideCharToMultiByte
2548 (
2549 m_CodePage, // code page
13dd924a
VZ
2550 flags, // either none or no best fit
2551 pwz, // input string
2b5f62a0
VZ
2552 -1, // it is (wide) NUL-terminated
2553 buf, // output buffer
2554 buf ? n : 0, // and its size
2555 NULL, // default "replacement" char
13dd924a 2556 pUsedDef // [out] was it used?
2b5f62a0
VZ
2557 );
2558
13dd924a
VZ
2559 if ( !len )
2560 {
2561 // function totally failed
467e0479 2562 return wxCONV_FAILED;
13dd924a
VZ
2563 }
2564
765bdb4a
VZ
2565 // we did something, check if we really succeeded
2566 if ( flags )
13dd924a 2567 {
765bdb4a
VZ
2568 // check if the conversion failed, i.e. if any replacements
2569 // were done
2570 if ( usedDef )
2571 return wxCONV_FAILED;
2572 }
2573 else // we must resort to double tripping...
2574 {
2575 // first we need to ensure that we really have the MB data: this is
2576 // not the case if we're called with NULL buffer, in which case we
2577 // need to do the conversion yet again
2578 wxCharBuffer bufDef;
2579 if ( !buf )
13dd924a 2580 {
765bdb4a
VZ
2581 bufDef = wxCharBuffer(len);
2582 buf = bufDef.data();
2583 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2584 buf, len, NULL, NULL) )
467e0479 2585 return wxCONV_FAILED;
13dd924a 2586 }
765bdb4a 2587
564da6ff
VZ
2588 if ( !n )
2589 n = wcslen(pwz);
765bdb4a 2590 wxWCharBuffer wcBuf(n);
564da6ff 2591 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2592 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2593 {
765bdb4a
VZ
2594 // we didn't obtain the same thing we started from, hence
2595 // the conversion was lossy and we consider that it failed
2596 return wxCONV_FAILED;
13dd924a
VZ
2597 }
2598 }
2599
03a991bc 2600 // see the comment above for the reason of "len - 1"
13dd924a 2601 return len - 1;
f1339c56 2602 }
dccce9ea 2603
7ef3ab50
VZ
2604 virtual size_t GetMBNulLen() const
2605 {
2606 if ( m_minMBCharWidth == 0 )
2607 {
2608 int len = ::WideCharToMultiByte
2609 (
2610 m_CodePage, // code page
2611 0, // no flags
2612 L"", // input string
2613 1, // translate just the NUL
2614 NULL, // output buffer
2615 0, // and its size
2616 NULL, // no replacement char
2617 NULL // [out] don't care if it was used
2618 );
2619
2620 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2621 switch ( len )
2622 {
2623 default:
2624 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2625 self->m_minMBCharWidth = (size_t)-1;
2626 break;
7ef3ab50
VZ
2627
2628 case 0:
2629 self->m_minMBCharWidth = (size_t)-1;
2630 break;
2631
2632 case 1:
2633 case 2:
2634 case 4:
2635 self->m_minMBCharWidth = len;
2636 break;
2637 }
2638 }
2639
2640 return m_minMBCharWidth;
2641 }
2642
d36c9347
VZ
2643 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2644
13dd924a
VZ
2645 bool IsOk() const { return m_CodePage != -1; }
2646
2647private:
2648 static bool CanUseNoBestFit()
2649 {
2650 static int s_isWin98Or2k = -1;
2651
2652 if ( s_isWin98Or2k == -1 )
2653 {
2654 int verMaj, verMin;
2655 switch ( wxGetOsVersion(&verMaj, &verMin) )
2656 {
406d283a 2657 case wxOS_WINDOWS_9X:
13dd924a
VZ
2658 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2659 break;
2660
406d283a 2661 case wxOS_WINDOWS_NT:
13dd924a
VZ
2662 s_isWin98Or2k = verMaj >= 5;
2663 break;
2664
2665 default:
ef199164 2666 // unknown: be conservative by default
13dd924a 2667 s_isWin98Or2k = 0;
ef199164 2668 break;
13dd924a
VZ
2669 }
2670
2671 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2672 }
2673
2674 return s_isWin98Or2k == 1;
2675 }
f1339c56 2676
89028980
VS
2677 static bool IsAtLeastWin2kSP4()
2678 {
8942f83a
WS
2679#ifdef __WXWINCE__
2680 return false;
2681#else
89028980
VS
2682 static int s_isAtLeastWin2kSP4 = -1;
2683
2684 if ( s_isAtLeastWin2kSP4 == -1 )
2685 {
2686 OSVERSIONINFOEX ver;
2687
2688 memset(&ver, 0, sizeof(ver));
2689 ver.dwOSVersionInfoSize = sizeof(ver);
2690 GetVersionEx((OSVERSIONINFO*)&ver);
2691
2692 s_isAtLeastWin2kSP4 =
2693 ((ver.dwMajorVersion > 5) || // Vista+
2694 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2695 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2696 ver.wServicePackMajor >= 4)) // 2000 SP4+
2697 ? 1 : 0;
2698 }
2699
2700 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2701#endif
89028980
VS
2702 }
2703
eec47cc6 2704
c1464d9d 2705 // the code page we're working with
b1d66b54 2706 long m_CodePage;
c1464d9d 2707
7ef3ab50 2708 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2709 // "unknown"
2710 size_t m_minMBCharWidth;
1cd52418 2711};
e95354ec
VZ
2712
2713#endif // wxHAVE_WIN32_MB2WC
2714
f7e98dee 2715
36acb880
VZ
2716// ============================================================================
2717// wxEncodingConverter based conversion classes
2718// ============================================================================
2719
1e6feb95 2720#if wxUSE_FONTMAP
1cd52418 2721
e95354ec 2722class wxMBConv_wxwin : public wxMBConv
1cd52418 2723{
8b04d4c4
VZ
2724private:
2725 void Init()
2726 {
6ac84a78
DE
2727 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2728 // The wxMBConv_cf class does a better job.
2729 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2730 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2731 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2732 }
2733
6001e347 2734public:
f1339c56
RR
2735 // temporarily just use wxEncodingConverter stuff,
2736 // so that it works while a better implementation is built
86501081 2737 wxMBConv_wxwin(const char* name)
f1339c56
RR
2738 {
2739 if (name)
267e11c5 2740 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2741 else
2742 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2743
8b04d4c4
VZ
2744 Init();
2745 }
2746
e95354ec 2747 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2748 {
2749 m_enc = enc;
2750
2751 Init();
f1339c56 2752 }
dccce9ea 2753
bde4baac 2754 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2755 {
2756 size_t inbuf = strlen(psz);
dccce9ea 2757 if (buf)
c643a977 2758 {
ef199164 2759 if (!m2w.Convert(psz, buf))
467e0479 2760 return wxCONV_FAILED;
c643a977 2761 }
f1339c56
RR
2762 return inbuf;
2763 }
dccce9ea 2764
bde4baac 2765 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2766 {
f8d791e0 2767 const size_t inbuf = wxWcslen(psz);
f1339c56 2768 if (buf)
c643a977 2769 {
ef199164 2770 if (!w2m.Convert(psz, buf))
467e0479 2771 return wxCONV_FAILED;
c643a977 2772 }
dccce9ea 2773
f1339c56
RR
2774 return inbuf;
2775 }
dccce9ea 2776
7ef3ab50 2777 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2778 {
2779 switch ( m_enc )
2780 {
2781 case wxFONTENCODING_UTF16BE:
2782 case wxFONTENCODING_UTF16LE:
c1464d9d 2783 return 2;
eec47cc6
VZ
2784
2785 case wxFONTENCODING_UTF32BE:
2786 case wxFONTENCODING_UTF32LE:
c1464d9d 2787 return 4;
eec47cc6
VZ
2788
2789 default:
c1464d9d 2790 return 1;
eec47cc6
VZ
2791 }
2792 }
2793
d36c9347
VZ
2794 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2795
7ef3ab50
VZ
2796 bool IsOk() const { return m_ok; }
2797
2798public:
2799 wxFontEncoding m_enc;
2800 wxEncodingConverter m2w, w2m;
2801
2802private:
cafbf6fb
VZ
2803 // were we initialized successfully?
2804 bool m_ok;
fc7a2a60 2805
e95354ec 2806 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2807};
6001e347 2808
8f115891 2809// make the constructors available for unit testing
86501081 2810WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2811{
2812 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2813 if ( !result->IsOk() )
2814 {
2815 delete result;
2816 return 0;
2817 }
ef199164 2818
8f115891
MW
2819 return result;
2820}
2821
1e6feb95
VZ
2822#endif // wxUSE_FONTMAP
2823
36acb880
VZ
2824// ============================================================================
2825// wxCSConv implementation
2826// ============================================================================
2827
8b04d4c4 2828void wxCSConv::Init()
6001e347 2829{
e95354ec
VZ
2830 m_name = NULL;
2831 m_convReal = NULL;
2832 m_deferred = true;
2833}
2834
86501081 2835wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2836{
2837 Init();
82713003 2838
86501081 2839 if ( !charset.empty() )
e95354ec 2840 {
86501081 2841 SetName(charset.ToAscii());
e95354ec 2842 }
bda3d86a 2843
e4277538
VZ
2844#if wxUSE_FONTMAP
2845 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2846#else
bda3d86a 2847 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2848#endif
6001e347
RR
2849}
2850
8b04d4c4
VZ
2851wxCSConv::wxCSConv(wxFontEncoding encoding)
2852{
bda3d86a 2853 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2854 {
2855 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2856
2857 encoding = wxFONTENCODING_SYSTEM;
2858 }
2859
8b04d4c4
VZ
2860 Init();
2861
bda3d86a 2862 m_encoding = encoding;
8b04d4c4
VZ
2863}
2864
6001e347
RR
2865wxCSConv::~wxCSConv()
2866{
65e50848
JS
2867 Clear();
2868}
2869
54380f29 2870wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2871 : wxMBConv()
54380f29 2872{
8b04d4c4
VZ
2873 Init();
2874
54380f29 2875 SetName(conv.m_name);
8b04d4c4 2876 m_encoding = conv.m_encoding;
54380f29
GD
2877}
2878
2879wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2880{
2881 Clear();
8b04d4c4 2882
54380f29 2883 SetName(conv.m_name);
8b04d4c4
VZ
2884 m_encoding = conv.m_encoding;
2885
54380f29
GD
2886 return *this;
2887}
2888
65e50848
JS
2889void wxCSConv::Clear()
2890{
8b04d4c4 2891 free(m_name);
e95354ec 2892 delete m_convReal;
8b04d4c4 2893
65e50848 2894 m_name = NULL;
e95354ec 2895 m_convReal = NULL;
6001e347
RR
2896}
2897
86501081 2898void wxCSConv::SetName(const char *charset)
6001e347 2899{
f1339c56
RR
2900 if (charset)
2901 {
d6f2a891 2902 m_name = wxStrdup(charset);
e95354ec 2903 m_deferred = true;
f1339c56 2904 }
6001e347
RR
2905}
2906
8b3eb85d 2907#if wxUSE_FONTMAP
8b3eb85d
VZ
2908
2909WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2910 wxEncodingNameCache );
8b3eb85d
VZ
2911
2912static wxEncodingNameCache gs_nameCache;
2913#endif
2914
e95354ec
VZ
2915wxMBConv *wxCSConv::DoCreate() const
2916{
ce6f8d6f
VZ
2917#if wxUSE_FONTMAP
2918 wxLogTrace(TRACE_STRCONV,
2919 wxT("creating conversion for %s"),
2920 (m_name ? m_name
86501081 2921 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2922#endif // wxUSE_FONTMAP
2923
c547282d
VZ
2924 // check for the special case of ASCII or ISO8859-1 charset: as we have
2925 // special knowledge of it anyhow, we don't need to create a special
2926 // conversion object
e4277538
VZ
2927 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2928 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2929 {
e95354ec
VZ
2930 // don't convert at all
2931 return NULL;
2932 }
dccce9ea 2933
e95354ec
VZ
2934 // we trust OS to do conversion better than we can so try external
2935 // conversion methods first
2936 //
2937 // the full order is:
2938 // 1. OS conversion (iconv() under Unix or Win32 API)
2939 // 2. hard coded conversions for UTF
2940 // 3. wxEncodingConverter as fall back
2941
2942 // step (1)
2943#ifdef HAVE_ICONV
c547282d 2944#if !wxUSE_FONTMAP
e95354ec 2945 if ( m_name )
c547282d 2946#endif // !wxUSE_FONTMAP
e95354ec 2947 {
3ef10cfc 2948#if wxUSE_FONTMAP
8b3eb85d 2949 wxFontEncoding encoding(m_encoding);
3ef10cfc 2950#endif
8b3eb85d 2951
86501081 2952 if ( m_name )
8b3eb85d 2953 {
86501081 2954 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
2955 if ( conv->IsOk() )
2956 return conv;
2957
2958 delete conv;
c547282d
VZ
2959
2960#if wxUSE_FONTMAP
8b3eb85d 2961 encoding =
86501081 2962 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2963#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2964 }
2965#if wxUSE_FONTMAP
2966 {
2967 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2968 if ( it != gs_nameCache.end() )
2969 {
2970 if ( it->second.empty() )
2971 return NULL;
c547282d 2972
86501081 2973 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
2974 if ( conv->IsOk() )
2975 return conv;
e95354ec 2976
8b3eb85d
VZ
2977 delete conv;
2978 }
2979
2980 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
2981 // CS : in case this does not return valid names (eg for MacRoman)
2982 // encoding got a 'failure' entry in the cache all the same,
2983 // although it just has to be created using a different method, so
2984 // only store failed iconv creation attempts (or perhaps we
2985 // shoulnd't do this at all ?)
3c67ec06 2986 if ( names[0] != NULL )
8b3eb85d 2987 {
3c67ec06 2988 for ( ; *names; ++names )
8b3eb85d 2989 {
86501081
VS
2990 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2991 // will need changes that will obsolete this
2992 wxString name(*names);
2993 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
2994 if ( conv->IsOk() )
2995 {
2996 gs_nameCache[encoding] = *names;
2997 return conv;
2998 }
2999
3000 delete conv;
8b3eb85d
VZ
3001 }
3002
3c67ec06 3003 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 3004 }
8b3eb85d
VZ
3005 }
3006#endif // wxUSE_FONTMAP
e95354ec
VZ
3007 }
3008#endif // HAVE_ICONV
3009
3010#ifdef wxHAVE_WIN32_MB2WC
3011 {
7608a683 3012#if wxUSE_FONTMAP
e95354ec
VZ
3013 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3014 : new wxMBConv_win32(m_encoding);
3015 if ( conv->IsOk() )
3016 return conv;
3017
3018 delete conv;
7608a683
WS
3019#else
3020 return NULL;
3021#endif
e95354ec
VZ
3022 }
3023#endif // wxHAVE_WIN32_MB2WC
ef199164 3024
5c4ed98d 3025#ifdef __DARWIN__
f7e98dee 3026 {
6ff49cbc
DE
3027 // leave UTF16 and UTF32 to the built-ins of wx
3028 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3029 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3030 {
a6900d10 3031#if wxUSE_FONTMAP
5c4ed98d
DE
3032 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3033 : new wxMBConv_cf(m_encoding);
a6900d10 3034#else
5c4ed98d 3035 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3036#endif
ef199164 3037
f7e98dee 3038 if ( conv->IsOk() )
d775fa82
WS
3039 return conv;
3040
3041 delete conv;
3042 }
335d31e0 3043 }
5c4ed98d
DE
3044#endif // __DARWIN__
3045
e95354ec
VZ
3046 // step (2)
3047 wxFontEncoding enc = m_encoding;
3048#if wxUSE_FONTMAP
c547282d
VZ
3049 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3050 {
3051 // use "false" to suppress interactive dialogs -- we can be called from
3052 // anywhere and popping up a dialog from here is the last thing we want to
3053 // do
267e11c5 3054 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3055 }
e95354ec
VZ
3056#endif // wxUSE_FONTMAP
3057
3058 switch ( enc )
3059 {
3060 case wxFONTENCODING_UTF7:
3061 return new wxMBConvUTF7;
3062
3063 case wxFONTENCODING_UTF8:
3064 return new wxMBConvUTF8;
3065
e95354ec
VZ
3066 case wxFONTENCODING_UTF16BE:
3067 return new wxMBConvUTF16BE;
3068
3069 case wxFONTENCODING_UTF16LE:
3070 return new wxMBConvUTF16LE;
3071
e95354ec
VZ
3072 case wxFONTENCODING_UTF32BE:
3073 return new wxMBConvUTF32BE;
3074
3075 case wxFONTENCODING_UTF32LE:
3076 return new wxMBConvUTF32LE;
3077
3078 default:
3079 // nothing to do but put here to suppress gcc warnings
ef199164 3080 break;
e95354ec
VZ
3081 }
3082
3083 // step (3)
3084#if wxUSE_FONTMAP
3085 {
3086 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3087 : new wxMBConv_wxwin(m_encoding);
3088 if ( conv->IsOk() )
3089 return conv;
3090
3091 delete conv;
3092 }
3093#endif // wxUSE_FONTMAP
3094
a58d4f4d
VS
3095 // NB: This is a hack to prevent deadlock. What could otherwise happen
3096 // in Unicode build: wxConvLocal creation ends up being here
3097 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
3098 // attach a timestamp, for which it will need wxConvLocal (to convert
3099 // time to char* and then wchar_t*), but that fails, tries to log the
3100 // error, but wxLog has an (already locked) critical section that
3101 // guards the static buffer.
a58d4f4d
VS
3102 static bool alreadyLoggingError = false;
3103 if (!alreadyLoggingError)
3104 {
3105 alreadyLoggingError = true;
3106 wxLogError(_("Cannot convert from the charset '%s'!"),
3107 m_name ? m_name
e95354ec
VZ
3108 :
3109#if wxUSE_FONTMAP
86501081 3110 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 3111#else // !wxUSE_FONTMAP
86501081 3112 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
3113#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3114 );
ef199164 3115
a58d4f4d
VS
3116 alreadyLoggingError = false;
3117 }
e95354ec
VZ
3118
3119 return NULL;
3120}
3121
3122void wxCSConv::CreateConvIfNeeded() const
3123{
3124 if ( m_deferred )
3125 {
3126 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 3127
bda3d86a
VZ
3128 // if we don't have neither the name nor the encoding, use the default
3129 // encoding for this system
3130 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3131 {
4c75209f 3132#if wxUSE_INTL
02c7347b 3133 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3134#else
3135 // fallback to some reasonable default:
3136 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3137#endif // wxUSE_INTL
4c75209f 3138 }
bda3d86a 3139
e95354ec
VZ
3140 self->m_convReal = DoCreate();
3141 self->m_deferred = false;
6001e347 3142 }
6001e347
RR
3143}
3144
0f0298b1
VZ
3145bool wxCSConv::IsOk() const
3146{
3147 CreateConvIfNeeded();
3148
3149 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3150 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3151 return true; // always ok as we do it ourselves
3152
3153 // m_convReal->IsOk() is called at its own creation, so we know it must
3154 // be ok if m_convReal is non-NULL
3155 return m_convReal != NULL;
3156}
3157
1c714a5d
VZ
3158size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3159 const char *src, size_t srcLen) const
3160{
3161 CreateConvIfNeeded();
3162
2c74c558
VS
3163 if (m_convReal)
3164 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3165
3166 // latin-1 (direct)
05392dc8
VZ
3167 if ( srcLen == wxNO_LEN )
3168 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3169
05392dc8
VZ
3170 if ( dst )
3171 {
3172 if ( dstLen < srcLen )
3173 return wxCONV_FAILED;
1c714a5d 3174
05392dc8
VZ
3175 for ( size_t n = 0; n < srcLen; n++ )
3176 dst[n] = (unsigned char)(src[n]);
3177 }
2c74c558 3178
05392dc8 3179 return srcLen;
1c714a5d
VZ
3180}
3181
05392dc8
VZ
3182size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3183 const wchar_t *src, size_t srcLen) const
6001e347 3184{
e95354ec 3185 CreateConvIfNeeded();
dccce9ea 3186
e95354ec 3187 if (m_convReal)
05392dc8 3188 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3189
3190 // latin-1 (direct)
05392dc8
VZ
3191 if ( srcLen == wxNO_LEN )
3192 srcLen = wxWcslen(src) + 1;
dccce9ea 3193
05392dc8 3194 if ( dst )
f1339c56 3195 {
05392dc8
VZ
3196 if ( dstLen < srcLen )
3197 return wxCONV_FAILED;
1cd52418 3198
05392dc8 3199 for ( size_t n = 0; n < srcLen; n++ )
24642831 3200 {
05392dc8 3201 if ( src[n] > 0xFF )
467e0479 3202 return wxCONV_FAILED;
ef199164 3203
05392dc8 3204 dst[n] = (char)src[n];
24642831 3205 }
05392dc8 3206
24642831 3207 }
05392dc8 3208 else // still need to check the input validity
24642831 3209 {
05392dc8 3210 for ( size_t n = 0; n < srcLen; n++ )
24642831 3211 {
05392dc8 3212 if ( src[n] > 0xFF )
467e0479 3213 return wxCONV_FAILED;
24642831 3214 }
f1339c56 3215 }
dccce9ea 3216
05392dc8 3217 return srcLen;
6001e347
RR
3218}
3219
7ef3ab50 3220size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3221{
3222 CreateConvIfNeeded();
3223
3224 if ( m_convReal )
3225 {
7ef3ab50 3226 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3227 }
3228
ba98e032 3229 // otherwise, we are ISO-8859-1
c1464d9d 3230 return 1;
eec47cc6
VZ
3231}
3232
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// Returns true if the real converter reports itself as being UTF-8.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without the real converter we are ISO-8859-1, i.e. never UTF-8
    return m_convReal && m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3247
69c928ef
VZ
3248
3249#if wxUSE_UNICODE
3250
3251wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3252{
3253 if ( !s )
3254 return wxWCharBuffer();
3255
3256 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3257 if ( !wbuf )
5487ff0f 3258 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3259 if ( !wbuf )
3260 wbuf = wxConvISO8859_1.cMB2WX(s);
3261
3262 return wbuf;
3263}
3264
3265wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3266{
3267 if ( !ws )
3268 return wxCharBuffer();
3269
3270 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3271 if ( !buf )
3272 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3273
3274 return buf;
3275}
3276
3277#endif // wxUSE_UNICODE
f5a1953b 3278
1e50d914
VS
3279// ----------------------------------------------------------------------------
3280// globals
3281// ----------------------------------------------------------------------------
3282
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3289
3290#undef wxConvLibc
3291#undef wxConvUTF8
3292#undef wxConvUTF7
3293#undef wxConvLocal
3294#undef wxConvISO8859_1
3295
// Defines everything needed for the global converter called conv:
//  - the conv##Ptr pointer, initially NULL
//  - the wxGet_##conv##Ptr() accessor returning the address of a function
//    static object of type real_klass constructed with init
//  - a file static variable whose initialization calls the accessor
//
// base_klass is the type exposed by the pointer and the accessor.
#define WX_DEFINE_GLOBAL_CONV2(base_klass, real_klass, conv, init)          \
    WXDLLIMPEXP_DATA_BASE(base_klass*) conv##Ptr = NULL;                    \
    WXDLLIMPEXP_BASE base_klass* wxGet_##conv##Ptr()                        \
    {                                                                       \
        static real_klass conv##Obj init;                                   \
        return &conv##Obj;                                                  \
    }                                                                       \
    /* calling the accessor here ensures that all global converter */      \
    /* objects are created by the time static initialization is done, */   \
    /* i.e. before any thread is launched: */                               \
    static base_klass* gs_##conv##instance = wxGet_##conv##Ptr()

// Same as above for the common case when the exposed type is also the
// implementation type.
#define WX_DEFINE_GLOBAL_CONV(conv_klass, conv, init)                       \
    WX_DEFINE_GLOBAL_CONV2(conv_klass, conv_klass, conv, init)
3310
3311#ifdef __WINDOWS__
3312 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3313#else
3314 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3315#endif
3316
e1079eda
VZ
3317// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3318// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3319// provokes an error message about "not enough macro parameters"; and we
3320// can't use "()" here as the name##Obj declaration would be parsed as a
3321// function declaration then, so use a semicolon and live with an extra
3322// empty statement (and hope that no compilers warns about this)
3323WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3324WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3325
3326WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3327WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3328
3329WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3330WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3331
6ac84a78
DE
3332#ifdef __DARWIN__
3333// The xnu kernel always communicates file paths in decomposed UTF-8.
3334// WARNING: Are we sure that CFString's conversion will cause decomposition?
3335static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3336#endif
6ac84a78 3337
1e50d914 3338WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3339#ifdef __DARWIN__
1e50d914 3340 &wxConvMacUTF8DObj;
6ac84a78 3341#else // !__DARWIN__
1e50d914 3342 wxGet_wxConvLibcPtr();
6ac84a78 3343#endif // __DARWIN__/!__DARWIN__
1e50d914 3344
bde4baac
VZ
3345#else // !wxUSE_WCHAR_T
3346
1e50d914 3347// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3348// stand-ins in absence of wchar_t
3349WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3350 wxConvISO8859_1,
3351 wxConvLocal,
3352 wxConvUTF8;
3353
3354#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T