]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
No changes, just convert file to UTF-8.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
b040e242 47#ifdef HAVE_ICONV
373658eb 48 #include <iconv.h>
b1d547eb 49 #include "wx/thread.h"
1cd52418 50#endif
1cd52418 51
373658eb
VZ
52#include "wx/encconv.h"
53#include "wx/fontmap.h"
54
5c4ed98d 55#ifdef __DARWIN__
c933e267 56#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
57#endif //def __DARWIN__
58
ef199164 59
9a83f860 60#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 61
467e0479
VZ
62// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63// be 4 bytes
4948c2b6 64#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
65 #define WC_UTF16
66#endif
67
ef199164 68
373658eb
VZ
69// ============================================================================
70// implementation
71// ============================================================================
72
69373110
VZ
// helper of wxMBConv::ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
373658eb 82// ----------------------------------------------------------------------------
467e0479 83// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 84// ----------------------------------------------------------------------------
6001e347 85
c91830cb 86static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 87{
ef199164 88 if (input <= 0xffff)
4def3b35 89 {
999836aa
VZ
90 if (output)
91 *output = (wxUint16) input;
ef199164 92
4def3b35 93 return 1;
dccce9ea 94 }
ef199164 95 else if (input >= 0x110000)
4def3b35 96 {
467e0479 97 return wxCONV_FAILED;
dccce9ea
VZ
98 }
99 else
4def3b35 100 {
dccce9ea 101 if (output)
4def3b35 102 {
ef199164
DS
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 105 }
ef199164 106
4def3b35 107 return 2;
1cd52418 108 }
1cd52418
OK
109}
110
c91830cb 111static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 112{
ef199164 113 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
114 {
115 output = *input;
116 return 1;
dccce9ea 117 }
ef199164 118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
119 {
120 output = *input;
467e0479 121 return wxCONV_FAILED;
dccce9ea
VZ
122 }
123 else
4def3b35
VS
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
1cd52418
OK
128}
129
467e0479 130#ifdef WC_UTF16
35d11700
VZ
131 typedef wchar_t wxDecodeSurrogate_t;
132#else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
135
136// returns the next UTF-32 character from the wchar_t buffer and advances the
137// pointer to the character after this one
138//
139// if an invalid character is found, *pSrc is set to NULL, the caller must
140// check for this
35d11700 141static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
142{
143 wxUint32 out;
8d3dd069 144 const size_t
5c33522f 145 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152}
153
f6bcfd97 154// ----------------------------------------------------------------------------
6001e347 155// wxMBConv
f6bcfd97 156// ----------------------------------------------------------------------------
2c53a80a 157
483b0434
VZ
158size_t
159wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
6001e347 161{
483b0434 162 // although new conversion classes are supposed to implement this function
36f93678 163 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
36f93678
VZ
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
36f93678
VZ
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complication come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
483b0434 227 for ( ;; )
eec47cc6 228 {
c1464d9d 229 // try to convert the current chunk
483b0434 230 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
e4e3bbb4 233
483b0434 234 dstWritten += lenChunk;
f6a02087
VZ
235 if ( !srcEnd )
236 dstWritten++;
f5fb6871 237
f6a02087 238 if ( !lenChunk )
467e0479
VZ
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
483b0434
VZ
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
f6a02087
VZ
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
f6a02087
VZ
254 if ( !srcEnd )
255 dst++;
483b0434 256 }
c1464d9d 257
483b0434 258 if ( !srcEnd )
c1464d9d 259 {
467e0479
VZ
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow
c1464d9d
VZ
262 break;
263 }
eec47cc6
VZ
264
265 // advance the input pointer past the end of this chunk
483b0434 266 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
483b0434 272 src += nulLen;
c1464d9d 273 }
e4e3bbb4 274
483b0434 275 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
276
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
36f93678 279 // delimited by srcEnd
483b0434 280 if ( src >= srcEnd )
c1464d9d 281 break;
36f93678
VZ
282
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
286 if ( srcEnd )
287 {
288 dstWritten++;
289 if ( dst )
290 dst++;
291 }
c1464d9d
VZ
292 }
293
483b0434 294 return dstWritten;
e4e3bbb4
RN
295}
296
483b0434
VZ
297size_t
298wxMBConv::FromWChar(char *dst, size_t dstLen,
299 const wchar_t *src, size_t srcLen) const
e4e3bbb4 300{
483b0434
VZ
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten = 0;
e4e3bbb4 303
f6a02087
VZ
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated = srcLen == wxNO_LEN;
308
eec47cc6
VZ
309 // make a copy of the input string unless it is already properly
310 // NUL-terminated
eec47cc6 311 wxWCharBuffer bufTmp;
f6a02087 312 if ( isNulTerminated )
e4e3bbb4 313 {
483b0434 314 srcLen = wxWcslen(src) + 1;
eec47cc6 315 }
483b0434 316 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
317 {
318 // make a copy in order to properly NUL-terminate the string
483b0434 319 bufTmp = wxWCharBuffer(srcLen);
ef199164 320 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
321 src = bufTmp;
322 }
323
324 const size_t lenNul = GetMBNulLen();
325 for ( const wchar_t * const srcEnd = src + srcLen;
326 src < srcEnd;
327 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
328 {
329 // try to convert the current chunk
330 size_t lenChunk = WC2MB(NULL, src, 0);
331
332 if ( lenChunk == wxCONV_FAILED )
333 return wxCONV_FAILED;
334
483b0434 335 dstWritten += lenChunk;
c45fad9a 336 if ( src+lenChunk < srcEnd || isNulTerminated )
f6a02087 337 dstWritten += lenNul;
483b0434
VZ
338
339 if ( dst )
340 {
341 if ( dstWritten > dstLen )
342 return wxCONV_FAILED;
343
f6a02087 344 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
345 return wxCONV_FAILED;
346
347 dst += lenChunk;
c45fad9a 348 if ( src+lenChunk < srcEnd || isNulTerminated )
f6a02087 349 dst += lenNul;
483b0434 350 }
eec47cc6 351 }
e4e3bbb4 352
483b0434
VZ
353 return dstWritten;
354}
355
ef199164 356size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 357{
51725fc0 358 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 359 if ( rc != wxCONV_FAILED )
509da451
VZ
360 {
361 // ToWChar() returns the buffer length, i.e. including the trailing
362 // NUL, while this method doesn't take it into account
363 rc--;
364 }
365
366 return rc;
367}
368
ef199164 369size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 370{
51725fc0 371 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 372 if ( rc != wxCONV_FAILED )
509da451 373 {
51725fc0 374 rc -= GetMBNulLen();
509da451
VZ
375 }
376
377 return rc;
378}
379
483b0434
VZ
380wxMBConv::~wxMBConv()
381{
382 // nothing to do here (necessary for Darwin linking probably)
383}
e4e3bbb4 384
483b0434
VZ
385const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
386{
387 if ( psz )
eec47cc6 388 {
483b0434 389 // calculate the length of the buffer needed first
a2db25a1 390 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 391 if ( nLen != wxCONV_FAILED )
f5fb6871 392 {
483b0434 393 // now do the actual conversion
a2db25a1 394 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 395
483b0434 396 // +1 for the trailing NULL
a2db25a1 397 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 398 return buf;
f5fb6871 399 }
483b0434 400 }
e4e3bbb4 401
483b0434
VZ
402 return wxWCharBuffer();
403}
3698ae71 404
483b0434
VZ
405const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
406{
407 if ( pwz )
408 {
a2db25a1 409 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 410 if ( nLen != wxCONV_FAILED )
483b0434 411 {
a2db25a1
VZ
412 wxCharBuffer buf(nLen - 1);
413 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
414 return buf;
415 }
416 }
417
418 return wxCharBuffer();
419}
e4e3bbb4 420
483b0434 421const wxWCharBuffer
ef199164 422wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 423{
ef199164 424 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 425 if ( dstLen != wxCONV_FAILED )
483b0434 426 {
0dd13d21
VZ
427 // notice that we allocate space for dstLen+1 wide characters here
428 // because we want the buffer to always be NUL-terminated, even if the
429 // input isn't (as otherwise the caller has no way to know its length)
430 wxWCharBuffer wbuf(dstLen);
f6a02087 431 wbuf.data()[dstLen] = L'\0';
ef199164 432 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
433 {
434 if ( outLen )
467e0479
VZ
435 {
436 *outLen = dstLen;
f6a02087
VZ
437
438 // we also need to handle NUL-terminated input strings
439 // specially: for them the output is the length of the string
440 // excluding the trailing NUL, however if we're asked to
441 // convert a specific number of characters we return the length
442 // of the resulting output even if it's NUL-terminated
443 if ( inLen == wxNO_LEN )
467e0479
VZ
444 (*outLen)--;
445 }
446
483b0434
VZ
447 return wbuf;
448 }
449 }
450
451 if ( outLen )
452 *outLen = 0;
453
454 return wxWCharBuffer();
455}
456
457const wxCharBuffer
ef199164 458wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 459{
13d92ad6 460 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 461 if ( dstLen != wxCONV_FAILED )
483b0434 462 {
0dd13d21
VZ
463 const size_t nulLen = GetMBNulLen();
464
465 // as above, ensure that the buffer is always NUL-terminated, even if
466 // the input is not
467 wxCharBuffer buf(dstLen + nulLen - 1);
468 memset(buf.data() + dstLen, 0, nulLen);
ef199164 469 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
470 {
471 if ( outLen )
467e0479
VZ
472 {
473 *outLen = dstLen;
474
f6a02087 475 if ( inLen == wxNO_LEN )
467e0479 476 {
f6a02087
VZ
477 // in this case both input and output are NUL-terminated
478 // and we're not supposed to count NUL
13d92ad6 479 *outLen -= nulLen;
467e0479
VZ
480 }
481 }
d32a507d 482
483b0434
VZ
483 return buf;
484 }
e4e3bbb4
RN
485 }
486
eec47cc6
VZ
487 if ( outLen )
488 *outLen = 0;
489
490 return wxCharBuffer();
e4e3bbb4
RN
491}
492
40ac5040
VZ
493const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
494{
495 const size_t srcLen = buf.length();
496 if ( srcLen )
497 {
498 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
499 if ( dstLen != wxCONV_FAILED )
500 {
501 wxWCharBuffer wbuf(dstLen);
502 wbuf.data()[dstLen] = L'\0';
503 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
504 return wbuf;
505 }
506 }
507
508 return wxWCharBuffer();
509}
510
511const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
512{
513 const size_t srcLen = wbuf.length();
514 if ( srcLen )
515 {
516 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
517 if ( dstLen != wxCONV_FAILED )
518 {
519 wxCharBuffer buf(dstLen);
520 buf.data()[dstLen] = '\0';
521 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
522 return buf;
523 }
524 }
525
526 return wxCharBuffer();
527}
528
6001e347 529// ----------------------------------------------------------------------------
bde4baac 530// wxMBConvLibc
6001e347
RR
531// ----------------------------------------------------------------------------
532
bde4baac
VZ
533size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
534{
535 return wxMB2WC(buf, psz, n);
536}
537
538size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
539{
540 return wxWC2MB(buf, psz, n);
541}
e1bfe89e
RR
542
543// ----------------------------------------------------------------------------
532d575b 544// wxConvBrokenFileNames
e1bfe89e
RR
545// ----------------------------------------------------------------------------
546
eec47cc6
VZ
547#ifdef __UNIX__
548
86501081 549wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 550{
9a83f860
VZ
551 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
552 wxStricmp(charset, wxT("UTF8")) == 0 )
5deedd6e 553 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
554 else
555 m_conv = new wxCSConv(charset);
ea8ce907
RR
556}
557
eec47cc6 558#endif // __UNIX__
c12b7f79 559
bde4baac 560// ----------------------------------------------------------------------------
3698ae71 561// UTF-7
bde4baac 562// ----------------------------------------------------------------------------
6001e347 563
15f2ee32 564// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
565//
566// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 567
15f2ee32
RN
568//
569// BASE64 decoding table
570//
// maps each of the 256 possible byte values to the 6 bit value of the
// corresponding base64 digit or to 0xff if the byte is not a base64 digit
static const unsigned char utf7unb64[] =
{
    // 00..0F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 10..1F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 20..2F: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 30..3F: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 40..4F: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 50..5F: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 60..6F: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 70..7F: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 80..8F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 90..9F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // A0..AF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // B0..BF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // C0..CF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // D0..DF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // E0..EF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // F0..FF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
606
9d653e81
VZ
607size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
608 const char *src, size_t srcLen) const
15f2ee32 609{
9d653e81 610 DecoderState stateOrig,
852dcba5 611 *statePtr;
9d653e81
VZ
612 if ( srcLen == wxNO_LEN )
613 {
614 // convert the entire string, up to and including the trailing NUL
615 srcLen = strlen(src) + 1;
616
617 // when working on the entire strings we don't update nor use the shift
618 // state from the previous call
619 statePtr = &stateOrig;
620 }
621 else // when working with partial strings we do use the shift state
622 {
5c33522f 623 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
624
625 // also save the old state to be able to rollback to it on error
626 stateOrig = m_stateDecoder;
627 }
628
629 // but to simplify the code below we use this variable in both cases
630 DecoderState& state = *statePtr;
631
632
633 // number of characters [which would have been] written to dst [if it were
634 // not NULL]
15f2ee32
RN
635 size_t len = 0;
636
9d653e81
VZ
637 const char * const srcEnd = src + srcLen;
638
639 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 640 {
9d653e81
VZ
641 const unsigned char cc = *src++;
642
643 if ( state.IsShifted() )
15f2ee32 644 {
9d653e81
VZ
645 const unsigned char dc = utf7unb64[cc];
646 if ( dc == 0xff )
15f2ee32 647 {
ccaa848d
VZ
648 // end of encoded part, check that nothing was left: there can
649 // be up to 4 bits of 0 padding but nothing else (we also need
650 // to check isLSB as we count bits modulo 8 while a valid UTF-7
651 // encoded sequence must contain an integral number of UTF-16
652 // characters)
653 if ( state.isLSB || state.bit > 4 ||
654 (state.accum & ((1 << state.bit) - 1)) )
655 {
656 if ( !len )
657 state = stateOrig;
658
852dcba5 659 return wxCONV_FAILED;
ccaa848d 660 }
852dcba5 661
9d653e81
VZ
662 state.ToDirect();
663
664 // re-parse this character normally below unless it's '-' which
665 // is consumed by the decoder
666 if ( cc == '-' )
667 continue;
668 }
669 else // valid encoded character
670 {
671 // mini base64 decoder: each character is 6 bits
672 state.bit += 6;
673 state.accum <<= 6;
674 state.accum += dc;
675
676 if ( state.bit >= 8 )
15f2ee32 677 {
9d653e81
VZ
678 // got the full byte, consume it
679 state.bit -= 8;
680 unsigned char b = (state.accum >> state.bit) & 0x00ff;
681
682 if ( state.isLSB )
15f2ee32 683 {
9d653e81
VZ
684 // we've got the full word, output it
685 if ( dst )
686 *dst++ = (state.msb << 8) | b;
687 len++;
688 state.isLSB = false;
15f2ee32 689 }
9d653e81 690 else // MSB
04a37834 691 {
9d653e81
VZ
692 // just store it while we wait for LSB
693 state.msb = b;
694 state.isLSB = true;
04a37834 695 }
15f2ee32
RN
696 }
697 }
9d653e81 698 }
04a37834 699
9d653e81
VZ
700 if ( state.IsDirect() )
701 {
702 // start of an encoded segment?
703 if ( cc == '+' )
04a37834 704 {
9d653e81
VZ
705 if ( *src == '-' )
706 {
707 // just the encoded plus sign, don't switch to shifted mode
708 if ( dst )
709 *dst++ = '+';
710 len++;
711 src++;
712 }
ccaa848d
VZ
713 else if ( utf7unb64[(unsigned)*src] == 0xff )
714 {
715 // empty encoded chunks are not allowed
716 if ( !len )
717 state = stateOrig;
718
719 return wxCONV_FAILED;
720 }
721 else // base-64 encoded chunk follows
9d653e81
VZ
722 {
723 state.ToShifted();
724 }
725 }
726 else // not '+'
727 {
728 // only printable 7 bit ASCII characters (with the exception of
729 // NUL, TAB, CR and LF) can be used directly
730 if ( cc >= 0x7f || (cc < ' ' &&
731 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
732 return wxCONV_FAILED;
733
734 if ( dst )
735 *dst++ = cc;
736 len++;
737 }
15f2ee32
RN
738 }
739 }
04a37834 740
9d653e81
VZ
741 if ( !len )
742 {
743 // as we didn't read any characters we should be called with the same
744 // data (followed by some more new data) again later so don't save our
745 // state
746 state = stateOrig;
747
748 return wxCONV_FAILED;
749 }
04a37834 750
15f2ee32 751 return len;
6001e347
RR
752}
753
15f2ee32
RN
754//
755// BASE64 encoding table
756//
// the 64 base64 digits, indexed by the 6 bit value they represent
static const unsigned char utf7enb64[] =
{
    // 0..25: upper case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51: lower case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61: digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
768
769//
770// UTF-7 encoding table
771//
772// 0 - Set D (directly encoded characters)
773// 1 - Set O (optional direct characters)
774// 2 - whitespace characters (optional)
775// 3 - special characters
776//
// class of each of the 128 ASCII characters for the purposes of UTF-7
// encoding, using the values 0..3 with the meaning explained above
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// returns true if wc can be represented by itself in UTF-7, i.e. if it is an
// ASCII character with the class 0 in the table above
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t can be a signed type, so convert the character to an unsigned
    // value before comparing it: otherwise a negative wc would pass the range
    // check and be used as a negative index into the table
    const unsigned long uc = static_cast<unsigned long>(wc);

    return uc < 0x80 && utf7encode[uc] < 1;
}
793
794size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
795 const wchar_t *src, size_t srcLen) const
15f2ee32 796{
9d653e81
VZ
797 EncoderState stateOrig,
798 *statePtr;
799 if ( srcLen == wxNO_LEN )
800 {
801 // we don't apply the stored state when operating on entire strings at
802 // once
803 statePtr = &stateOrig;
804
805 srcLen = wxWcslen(src) + 1;
806 }
807 else // do use the mode we left the output in previously
808 {
809 stateOrig = m_stateEncoder;
5c33522f 810 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
811 }
812
813 EncoderState& state = *statePtr;
814
815
15f2ee32
RN
816 size_t len = 0;
817
9d653e81
VZ
818 const wchar_t * const srcEnd = src + srcLen;
819 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 820 {
9d653e81
VZ
821 wchar_t cc = *src++;
822 if ( wxIsUTF7Direct(cc) )
15f2ee32 823 {
9d653e81
VZ
824 if ( state.IsShifted() )
825 {
826 // pad with zeros the last encoded block if necessary
827 if ( state.bit )
828 {
829 if ( dst )
830 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
831 len++;
832 }
ef199164 833
9d653e81
VZ
834 state.ToDirect();
835
836 if ( dst )
837 *dst++ = '-';
838 len++;
839 }
840
841 if ( dst )
842 *dst++ = (char)cc;
15f2ee32
RN
843 len++;
844 }
9d653e81
VZ
845 else if ( cc == '+' && state.IsDirect() )
846 {
847 if ( dst )
848 {
849 *dst++ = '+';
850 *dst++ = '-';
851 }
852
853 len += 2;
854 }
15f2ee32 855#ifndef WC_UTF16
79c78d42 856 else if (((wxUint32)cc) > 0xffff)
b2c13097 857 {
15f2ee32 858 // no surrogate pair generation (yet?)
467e0479 859 return wxCONV_FAILED;
15f2ee32
RN
860 }
861#endif
862 else
863 {
9d653e81
VZ
864 if ( state.IsDirect() )
865 {
866 state.ToShifted();
ef199164 867
9d653e81
VZ
868 if ( dst )
869 *dst++ = '+';
870 len++;
871 }
872
873 // BASE64 encode string
874 for ( ;; )
15f2ee32 875 {
9d653e81 876 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 877 {
9d653e81
VZ
878 state.accum <<= 8;
879 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
880
881 for (state.bit += 8; state.bit >= 6; )
15f2ee32 882 {
9d653e81
VZ
883 state.bit -= 6;
884 if ( dst )
885 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
886 len++;
15f2ee32 887 }
15f2ee32 888 }
ef199164 889
9d653e81
VZ
890 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
891 break;
ef199164 892
9d653e81 893 src++;
15f2ee32 894 }
15f2ee32
RN
895 }
896 }
ef199164 897
9d653e81
VZ
898 // we need to restore the original encoder state if we were called just to
899 // calculate the amount of space needed as we will presumably be called
900 // again to really convert the data now
901 if ( !dst )
902 state = stateOrig;
ef199164 903
15f2ee32 904 return len;
6001e347
RR
905}
906
f6bcfd97 907// ----------------------------------------------------------------------------
6001e347 908// UTF-8
f6bcfd97 909// ----------------------------------------------------------------------------
6001e347 910
1774c3c5 911static const wxUint32 utf8_max[]=
4def3b35 912 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 913
3698ae71
VZ
914// boundaries of the private use area we use to (temporarily) remap invalid
915// characters invalid in a UTF-8 encoded string
ea8ce907
RR
916const wxUint32 wxUnicodePUA = 0x100000;
917const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
918
0286d08d 919// this table gives the length of the UTF-8 encoding from its first character:
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded by a single byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid too, C2..CF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: two-byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: F0..F4 start four-byte sequences, F5..FF are invalid again
    // (5- or 6-byte sequences and sequences for code points above U+10FFFF,
    // as restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
953
954size_t
955wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
956 const char *src, size_t srcLen) const
957{
958 wchar_t *out = dstLen ? dst : NULL;
959 size_t written = 0;
960
961 if ( srcLen == wxNO_LEN )
962 srcLen = strlen(src) + 1;
963
964 for ( const char *p = src; ; p++ )
965 {
966 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
967 {
968 // all done successfully, just add the trailing NULL if we are not
969 // using explicit length
970 if ( srcLen == wxNO_LEN )
971 {
972 if ( out )
973 {
974 if ( !dstLen )
975 break;
976
977 *out = L'\0';
978 }
979
980 written++;
981 }
982
983 return written;
984 }
985
0286d08d
VZ
986 if ( out && !dstLen-- )
987 break;
988
5367a38a
VS
989 wxUint32 code;
990 unsigned char c = *p;
0286d08d 991
5367a38a
VS
992 if ( c < 0x80 )
993 {
994 if ( srcLen == 0 ) // the test works for wxNO_LEN too
995 break;
0286d08d 996
5367a38a
VS
997 if ( srcLen != wxNO_LEN )
998 srcLen--;
0286d08d 999
5367a38a
VS
1000 code = c;
1001 }
1002 else
0286d08d 1003 {
5367a38a
VS
1004 unsigned len = tableUtf8Lengths[c];
1005 if ( !len )
1006 break;
1007
1008 if ( srcLen < len ) // the test works for wxNO_LEN too
1009 break;
1010
1011 if ( srcLen != wxNO_LEN )
1012 srcLen -= len;
1013
1014 // Char. number range | UTF-8 octet sequence
1015 // (hexadecimal) | (binary)
1016 // ----------------------+----------------------------------------
1017 // 0000 0000 - 0000 007F | 0xxxxxxx
1018 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1019 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1020 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1021 //
1022 // Code point value is stored in bits marked with 'x',
1023 // lowest-order bit of the value on the right side in the diagram
1024 // above. (from RFC 3629)
1025
1026 // mask to extract lead byte's value ('x' bits above), by sequence
1027 // length:
1028 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1029
1030 // mask and value of lead byte's most significant bits, by length:
1031 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1032 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1033
1034 len--; // it's more convenient to work with 0-based length here
1035
1036 // extract the lead byte's value bits:
1037 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1038 break;
1039
1040 code = c & leadValueMask[len];
1041
1042 // all remaining bytes, if any, are handled in the same way
1043 // regardless of sequence's length:
1044 for ( ; len; --len )
1045 {
1046 c = *++p;
1047 if ( (c & 0xC0) != 0x80 )
1048 return wxCONV_FAILED;
0286d08d 1049
5367a38a
VS
1050 code <<= 6;
1051 code |= c & 0x3F;
1052 }
0286d08d
VZ
1053 }
1054
1055#ifdef WC_UTF16
1056 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1057 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1058 {
1059 if ( out )
1060 out++;
1061 written++;
1062 }
1063#else // !WC_UTF16
1064 if ( out )
1065 *out = code;
1066#endif // WC_UTF16/!WC_UTF16
1067
1068 if ( out )
1069 out++;
1070
1071 written++;
1072 }
1073
1074 return wxCONV_FAILED;
1075}
1076
1077size_t
1078wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1079 const wchar_t *src, size_t srcLen) const
1080{
1081 char *out = dstLen ? dst : NULL;
1082 size_t written = 0;
1083
1084 for ( const wchar_t *wp = src; ; wp++ )
1085 {
a964d3ed 1086 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
0286d08d
VZ
1087 {
1088 // all done successfully, just add the trailing NULL if we are not
1089 // using explicit length
1090 if ( srcLen == wxNO_LEN )
1091 {
1092 if ( out )
1093 {
1094 if ( !dstLen )
1095 break;
1096
1097 *out = '\0';
1098 }
1099
1100 written++;
1101 }
1102
1103 return written;
1104 }
1105
a964d3ed
VZ
1106 if ( srcLen != wxNO_LEN )
1107 srcLen--;
0286d08d
VZ
1108
1109 wxUint32 code;
1110#ifdef WC_UTF16
1111 // cast is ok for WC_UTF16
1112 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1113 {
1114 // skip the next char too as we decoded a surrogate
1115 wp++;
1116 }
1117#else // wchar_t is UTF-32
1118 code = *wp & 0x7fffffff;
1119#endif
1120
1121 unsigned len;
1122 if ( code <= 0x7F )
1123 {
1124 len = 1;
1125 if ( out )
1126 {
1127 if ( dstLen < len )
1128 break;
1129
1130 out[0] = (char)code;
1131 }
1132 }
1133 else if ( code <= 0x07FF )
1134 {
1135 len = 2;
1136 if ( out )
1137 {
1138 if ( dstLen < len )
1139 break;
1140
1141 // NB: this line takes 6 least significant bits, encodes them as
1142 // 10xxxxxx and discards them so that the next byte can be encoded:
1143 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1144 out[0] = 0xC0 | code;
1145 }
1146 }
1147 else if ( code < 0xFFFF )
1148 {
1149 len = 3;
1150 if ( out )
1151 {
1152 if ( dstLen < len )
1153 break;
1154
1155 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1156 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1157 out[0] = 0xE0 | code;
1158 }
1159 }
1160 else if ( code <= 0x10FFFF )
1161 {
1162 len = 4;
1163 if ( out )
1164 {
1165 if ( dstLen < len )
1166 break;
1167
1168 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1169 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1170 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1171 out[0] = 0xF0 | code;
1172 }
1173 }
1174 else
1175 {
9a83f860 1176 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1177 break;
1178 }
1179
1180 if ( out )
1181 {
1182 out += len;
1183 dstLen -= len;
1184 }
1185
1186 written += len;
1187 }
1188
1189 // we only get here if an error occurs during decoding
1190 return wxCONV_FAILED;
1191}
1192
d16d0917
VZ
1193size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1194 const char *psz, size_t srcLen) const
6001e347 1195{
0286d08d 1196 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1197 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1198
4def3b35
VS
1199 size_t len = 0;
1200
d16d0917 1201 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1202 {
ea8ce907
RR
1203 const char *opsz = psz;
1204 bool invalid = false;
4def3b35
VS
1205 unsigned char cc = *psz++, fc = cc;
1206 unsigned cnt;
dccce9ea 1207 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1208 fc <<= 1;
ef199164 1209
dccce9ea 1210 if (!cnt)
4def3b35
VS
1211 {
1212 // plain ASCII char
dccce9ea 1213 if (buf)
4def3b35
VS
1214 *buf++ = cc;
1215 len++;
561488ef
MW
1216
1217 // escape the escape character for octal escapes
1218 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1219 && cc == '\\' && (!buf || len < n))
1220 {
1221 if (buf)
1222 *buf++ = cc;
1223 len++;
1224 }
dccce9ea
VZ
1225 }
1226 else
4def3b35
VS
1227 {
1228 cnt--;
dccce9ea 1229 if (!cnt)
4def3b35
VS
1230 {
1231 // invalid UTF-8 sequence
ea8ce907 1232 invalid = true;
dccce9ea
VZ
1233 }
1234 else
4def3b35
VS
1235 {
1236 unsigned ocnt = cnt - 1;
1237 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1238 while (cnt--)
4def3b35 1239 {
ea8ce907 1240 cc = *psz;
dccce9ea 1241 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1242 {
1243 // invalid UTF-8 sequence
ea8ce907
RR
1244 invalid = true;
1245 break;
4def3b35 1246 }
ef199164 1247
ea8ce907 1248 psz++;
4def3b35
VS
1249 res = (res << 6) | (cc & 0x3f);
1250 }
ef199164 1251
ea8ce907 1252 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1253 {
1254 // illegal UTF-8 encoding
ea8ce907 1255 invalid = true;
4def3b35 1256 }
ea8ce907
RR
1257 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1258 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1259 {
1260 // if one of our PUA characters turns up externally
1261 // it must also be treated as an illegal sequence
1262 // (a bit like you have to escape an escape character)
1263 invalid = true;
1264 }
1265 else
1266 {
1cd52418 1267#ifdef WC_UTF16
0286d08d 1268 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1269 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1270 if (pa == wxCONV_FAILED)
ea8ce907
RR
1271 {
1272 invalid = true;
1273 }
1274 else
1275 {
1276 if (buf)
1277 buf += pa;
1278 len += pa;
1279 }
373658eb 1280#else // !WC_UTF16
ea8ce907 1281 if (buf)
38d4b1e4 1282 *buf++ = (wchar_t)res;
ea8ce907 1283 len++;
373658eb 1284#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1285 }
1286 }
ef199164 1287
ea8ce907
RR
1288 if (invalid)
1289 {
1290 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1291 {
1292 while (opsz < psz && (!buf || len < n))
1293 {
1294#ifdef WC_UTF16
1295 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1296 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1297 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1298 if (buf)
1299 buf += pa;
1300 opsz++;
1301 len += pa;
1302#else
1303 if (buf)
38d4b1e4 1304 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1305 opsz++;
1306 len++;
1307#endif
1308 }
1309 }
3698ae71 1310 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1311 {
1312 while (opsz < psz && (!buf || len < n))
1313 {
3698ae71
VZ
1314 if ( buf && len + 3 < n )
1315 {
17a1ebd1 1316 unsigned char on = *opsz;
3698ae71 1317 *buf++ = L'\\';
17a1ebd1
VZ
1318 *buf++ = (wchar_t)( L'0' + on / 0100 );
1319 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1320 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1321 }
ef199164 1322
ea8ce907
RR
1323 opsz++;
1324 len += 4;
1325 }
1326 }
3698ae71 1327 else // MAP_INVALID_UTF8_NOT
ea8ce907 1328 {
467e0479 1329 return wxCONV_FAILED;
ea8ce907 1330 }
4def3b35
VS
1331 }
1332 }
6001e347 1333 }
ef199164 1334
d16d0917 1335 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1336 *buf = 0;
ef199164 1337
d16d0917 1338 return len + 1;
6001e347
RR
1339}
1340
3698ae71
VZ
// Return true if the given wide character is an octal digit (L'0'..L'7').
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1345
d16d0917
VZ
1346size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1347 const wchar_t *psz, size_t srcLen) const
6001e347 1348{
0286d08d 1349 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1350 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1351
4def3b35 1352 size_t len = 0;
6001e347 1353
d16d0917 1354 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1355 {
1356 wxUint32 cc;
ef199164 1357
1cd52418 1358#ifdef WC_UTF16
b5153fd8
VZ
1359 // cast is ok for WC_UTF16
1360 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1361 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1362#else
ef199164 1363 cc = (*psz++) & 0x7fffffff;
4def3b35 1364#endif
3698ae71
VZ
1365
1366 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1367 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1368 {
dccce9ea 1369 if (buf)
ea8ce907 1370 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1371 len++;
3698ae71 1372 }
561488ef
MW
1373 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1374 && cc == L'\\' && psz[0] == L'\\' )
1375 {
1376 if (buf)
1377 *buf++ = (char)cc;
1378 psz++;
1379 len++;
1380 }
3698ae71
VZ
1381 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1382 cc == L'\\' &&
1383 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1384 {
dccce9ea 1385 if (buf)
3698ae71 1386 {
ef199164
DS
1387 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1388 (psz[1] - L'0') * 010 +
b2c13097 1389 (psz[2] - L'0'));
3698ae71
VZ
1390 }
1391
1392 psz += 3;
ea8ce907
RR
1393 len++;
1394 }
1395 else
1396 {
1397 unsigned cnt;
ef199164
DS
1398 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1399 {
1400 }
1401
ea8ce907 1402 if (!cnt)
4def3b35 1403 {
ea8ce907
RR
1404 // plain ASCII char
1405 if (buf)
1406 *buf++ = (char) cc;
1407 len++;
1408 }
ea8ce907
RR
1409 else
1410 {
1411 len += cnt + 1;
1412 if (buf)
1413 {
1414 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1415 while (cnt--)
1416 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1417 }
4def3b35
VS
1418 }
1419 }
6001e347 1420 }
4def3b35 1421
d16d0917 1422 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1423 *buf = 0;
adb45366 1424
d16d0917 1425 return len + 1;
6001e347
RR
1426}
1427
467e0479 1428// ============================================================================
c91830cb 1429// UTF-16
467e0479 1430// ============================================================================
c91830cb
VZ
1431
// "straight" is the UTF-16 converter which doesn't change the byte order on
// this platform and "swap" is the one which reverses it
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else // little endian
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#endif
1439
467e0479
VZ
1440/* static */
1441size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1442{
1443 if ( srcLen == wxNO_LEN )
1444 {
1445 // count the number of bytes in input, including the trailing NULs
5c33522f 1446 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1447 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1448 ;
c91830cb 1449
467e0479
VZ
1450 srcLen *= BYTES_PER_CHAR;
1451 }
1452 else // we already have the length
1453 {
1454 // we can only convert an entire number of UTF-16 characters
1455 if ( srcLen % BYTES_PER_CHAR )
1456 return wxCONV_FAILED;
1457 }
1458
1459 return srcLen;
1460}
1461
1462// case when in-memory representation is UTF-16 too
c91830cb
VZ
1463#ifdef WC_UTF16
1464
467e0479
VZ
1465// ----------------------------------------------------------------------------
1466// conversions without endianness change
1467// ----------------------------------------------------------------------------
1468
1469size_t
1470wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1471 const char *src, size_t srcLen) const
c91830cb 1472{
467e0479
VZ
1473 // set up the scene for using memcpy() (which is presumably more efficient
1474 // than copying the bytes one by one)
1475 srcLen = GetLength(src, srcLen);
1476 if ( srcLen == wxNO_LEN )
1477 return wxCONV_FAILED;
c91830cb 1478
ef199164 1479 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1480 if ( dst )
c91830cb 1481 {
467e0479
VZ
1482 if ( dstLen < inLen )
1483 return wxCONV_FAILED;
c91830cb 1484
467e0479 1485 memcpy(dst, src, srcLen);
c91830cb 1486 }
d32a507d 1487
467e0479 1488 return inLen;
c91830cb
VZ
1489}
1490
467e0479
VZ
1491size_t
1492wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1493 const wchar_t *src, size_t srcLen) const
c91830cb 1494{
467e0479
VZ
1495 if ( srcLen == wxNO_LEN )
1496 srcLen = wxWcslen(src) + 1;
c91830cb 1497
467e0479
VZ
1498 srcLen *= BYTES_PER_CHAR;
1499
1500 if ( dst )
c91830cb 1501 {
467e0479
VZ
1502 if ( dstLen < srcLen )
1503 return wxCONV_FAILED;
d32a507d 1504
467e0479 1505 memcpy(dst, src, srcLen);
c91830cb 1506 }
d32a507d 1507
467e0479 1508 return srcLen;
c91830cb
VZ
1509}
1510
467e0479
VZ
1511// ----------------------------------------------------------------------------
1512// endian-reversing conversions
1513// ----------------------------------------------------------------------------
c91830cb 1514
467e0479
VZ
1515size_t
1516wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1517 const char *src, size_t srcLen) const
c91830cb 1518{
467e0479
VZ
1519 srcLen = GetLength(src, srcLen);
1520 if ( srcLen == wxNO_LEN )
1521 return wxCONV_FAILED;
c91830cb 1522
467e0479
VZ
1523 srcLen /= BYTES_PER_CHAR;
1524
1525 if ( dst )
c91830cb 1526 {
467e0479
VZ
1527 if ( dstLen < srcLen )
1528 return wxCONV_FAILED;
1529
5c33522f 1530 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1531 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1532 {
ef199164 1533 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1534 }
c91830cb 1535 }
bfab25d4 1536
467e0479 1537 return srcLen;
c91830cb
VZ
1538}
1539
467e0479
VZ
1540size_t
1541wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1542 const wchar_t *src, size_t srcLen) const
c91830cb 1543{
467e0479
VZ
1544 if ( srcLen == wxNO_LEN )
1545 srcLen = wxWcslen(src) + 1;
c91830cb 1546
467e0479
VZ
1547 srcLen *= BYTES_PER_CHAR;
1548
1549 if ( dst )
c91830cb 1550 {
467e0479
VZ
1551 if ( dstLen < srcLen )
1552 return wxCONV_FAILED;
1553
5c33522f 1554 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1555 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1556 {
ef199164 1557 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1558 }
c91830cb 1559 }
eec47cc6 1560
467e0479 1561 return srcLen;
c91830cb
VZ
1562}
1563
467e0479 1564#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1565
467e0479
VZ
1566// ----------------------------------------------------------------------------
1567// conversions without endianness change
1568// ----------------------------------------------------------------------------
c91830cb 1569
35d11700
VZ
1570size_t
1571wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1572 const char *src, size_t srcLen) const
c91830cb 1573{
35d11700
VZ
1574 srcLen = GetLength(src, srcLen);
1575 if ( srcLen == wxNO_LEN )
1576 return wxCONV_FAILED;
c91830cb 1577
ef199164 1578 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1579 if ( !dst )
c91830cb 1580 {
35d11700
VZ
1581 // optimization: return maximal space which could be needed for this
1582 // string even if the real size could be smaller if the buffer contains
1583 // any surrogates
1584 return inLen;
c91830cb 1585 }
c91830cb 1586
35d11700 1587 size_t outLen = 0;
5c33522f 1588 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1589 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1590 {
ef199164
DS
1591 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1592 if ( !inBuff )
35d11700
VZ
1593 return wxCONV_FAILED;
1594
1595 if ( ++outLen > dstLen )
1596 return wxCONV_FAILED;
c91830cb 1597
35d11700
VZ
1598 *dst++ = ch;
1599 }
1600
1601
1602 return outLen;
1603}
c91830cb 1604
35d11700
VZ
1605size_t
1606wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1607 const wchar_t *src, size_t srcLen) const
c91830cb 1608{
35d11700
VZ
1609 if ( srcLen == wxNO_LEN )
1610 srcLen = wxWcslen(src) + 1;
c91830cb 1611
35d11700 1612 size_t outLen = 0;
5c33522f 1613 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1614 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1615 {
1616 wxUint16 cc[2];
35d11700
VZ
1617 const size_t numChars = encode_utf16(*src++, cc);
1618 if ( numChars == wxCONV_FAILED )
1619 return wxCONV_FAILED;
c91830cb 1620
ef199164
DS
1621 outLen += numChars * BYTES_PER_CHAR;
1622 if ( outBuff )
c91830cb 1623 {
35d11700
VZ
1624 if ( outLen > dstLen )
1625 return wxCONV_FAILED;
1626
ef199164 1627 *outBuff++ = cc[0];
35d11700 1628 if ( numChars == 2 )
69b80d28 1629 {
35d11700 1630 // second character of a surrogate
ef199164 1631 *outBuff++ = cc[1];
69b80d28 1632 }
c91830cb 1633 }
c91830cb 1634 }
c91830cb 1635
35d11700 1636 return outLen;
c91830cb
VZ
1637}
1638
467e0479
VZ
1639// ----------------------------------------------------------------------------
1640// endian-reversing conversions
1641// ----------------------------------------------------------------------------
c91830cb 1642
35d11700
VZ
1643size_t
1644wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1645 const char *src, size_t srcLen) const
c91830cb 1646{
35d11700
VZ
1647 srcLen = GetLength(src, srcLen);
1648 if ( srcLen == wxNO_LEN )
1649 return wxCONV_FAILED;
1650
ef199164 1651 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1652 if ( !dst )
1653 {
1654 // optimization: return maximal space which could be needed for this
1655 // string even if the real size could be smaller if the buffer contains
1656 // any surrogates
1657 return inLen;
1658 }
c91830cb 1659
35d11700 1660 size_t outLen = 0;
5c33522f 1661 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1662 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1663 {
35d11700
VZ
1664 wxUint32 ch;
1665 wxUint16 tmp[2];
ef199164
DS
1666
1667 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1668 inBuff++;
1669 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1670
35d11700
VZ
1671 const size_t numChars = decode_utf16(tmp, ch);
1672 if ( numChars == wxCONV_FAILED )
1673 return wxCONV_FAILED;
c91830cb 1674
35d11700 1675 if ( numChars == 2 )
ef199164 1676 inBuff++;
35d11700
VZ
1677
1678 if ( ++outLen > dstLen )
1679 return wxCONV_FAILED;
c91830cb 1680
35d11700 1681 *dst++ = ch;
c91830cb 1682 }
c91830cb 1683
c91830cb 1684
35d11700
VZ
1685 return outLen;
1686}
c91830cb 1687
35d11700
VZ
1688size_t
1689wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1690 const wchar_t *src, size_t srcLen) const
c91830cb 1691{
35d11700
VZ
1692 if ( srcLen == wxNO_LEN )
1693 srcLen = wxWcslen(src) + 1;
c91830cb 1694
35d11700 1695 size_t outLen = 0;
5c33522f 1696 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1697 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1698 {
1699 wxUint16 cc[2];
35d11700
VZ
1700 const size_t numChars = encode_utf16(*src, cc);
1701 if ( numChars == wxCONV_FAILED )
1702 return wxCONV_FAILED;
c91830cb 1703
ef199164
DS
1704 outLen += numChars * BYTES_PER_CHAR;
1705 if ( outBuff )
c91830cb 1706 {
35d11700
VZ
1707 if ( outLen > dstLen )
1708 return wxCONV_FAILED;
1709
ef199164 1710 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1711 if ( numChars == 2 )
c91830cb 1712 {
35d11700 1713 // second character of a surrogate
ef199164 1714 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1715 }
1716 }
c91830cb 1717 }
c91830cb 1718
35d11700 1719 return outLen;
c91830cb
VZ
1720}
1721
467e0479 1722#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1723
1724
35d11700 1725// ============================================================================
c91830cb 1726// UTF-32
35d11700 1727// ============================================================================
c91830cb
VZ
1728
1729#ifdef WORDS_BIGENDIAN
467e0479
VZ
1730 #define wxMBConvUTF32straight wxMBConvUTF32BE
1731 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1732#else
467e0479
VZ
1733 #define wxMBConvUTF32swap wxMBConvUTF32BE
1734 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1735#endif
1736
1737
1738WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1739WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1740
467e0479
VZ
1741/* static */
1742size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1743{
1744 if ( srcLen == wxNO_LEN )
1745 {
1746 // count the number of bytes in input, including the trailing NULs
5c33522f 1747 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1748 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1749 ;
c91830cb 1750
467e0479
VZ
1751 srcLen *= BYTES_PER_CHAR;
1752 }
1753 else // we already have the length
1754 {
1755 // we can only convert an entire number of UTF-32 characters
1756 if ( srcLen % BYTES_PER_CHAR )
1757 return wxCONV_FAILED;
1758 }
1759
1760 return srcLen;
1761}
1762
1763// case when in-memory representation is UTF-16
c91830cb
VZ
1764#ifdef WC_UTF16
1765
467e0479
VZ
1766// ----------------------------------------------------------------------------
1767// conversions without endianness change
1768// ----------------------------------------------------------------------------
1769
1770size_t
1771wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1772 const char *src, size_t srcLen) const
c91830cb 1773{
467e0479
VZ
1774 srcLen = GetLength(src, srcLen);
1775 if ( srcLen == wxNO_LEN )
1776 return wxCONV_FAILED;
c91830cb 1777
5c33522f 1778 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1779 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1780 size_t outLen = 0;
1781 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1782 {
1783 wxUint16 cc[2];
ef199164 1784 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1785 if ( numChars == wxCONV_FAILED )
1786 return wxCONV_FAILED;
c91830cb 1787
467e0479
VZ
1788 outLen += numChars;
1789 if ( dst )
c91830cb 1790 {
467e0479
VZ
1791 if ( outLen > dstLen )
1792 return wxCONV_FAILED;
d32a507d 1793
467e0479
VZ
1794 *dst++ = cc[0];
1795 if ( numChars == 2 )
1796 {
1797 // second character of a surrogate
1798 *dst++ = cc[1];
1799 }
1800 }
c91830cb 1801 }
d32a507d 1802
467e0479 1803 return outLen;
c91830cb
VZ
1804}
1805
467e0479
VZ
1806size_t
1807wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1808 const wchar_t *src, size_t srcLen) const
c91830cb 1809{
467e0479
VZ
1810 if ( srcLen == wxNO_LEN )
1811 srcLen = wxWcslen(src) + 1;
c91830cb 1812
467e0479 1813 if ( !dst )
c91830cb 1814 {
467e0479
VZ
1815 // optimization: return maximal space which could be needed for this
1816 // string instead of the exact amount which could be less if there are
1817 // any surrogates in the input
1818 //
1819 // we consider that surrogates are rare enough to make it worthwhile to
1820 // avoid running the loop below at the cost of slightly extra memory
1821 // consumption
ef199164 1822 return srcLen * BYTES_PER_CHAR;
467e0479 1823 }
c91830cb 1824
5c33522f 1825 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1826 size_t outLen = 0;
1827 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1828 {
1829 const wxUint32 ch = wxDecodeSurrogate(&src);
1830 if ( !src )
1831 return wxCONV_FAILED;
c91830cb 1832
467e0479 1833 outLen += BYTES_PER_CHAR;
d32a507d 1834
467e0479
VZ
1835 if ( outLen > dstLen )
1836 return wxCONV_FAILED;
b5153fd8 1837
ef199164 1838 *outBuff++ = ch;
467e0479 1839 }
c91830cb 1840
467e0479 1841 return outLen;
c91830cb
VZ
1842}
1843
467e0479
VZ
1844// ----------------------------------------------------------------------------
1845// endian-reversing conversions
1846// ----------------------------------------------------------------------------
c91830cb 1847
467e0479
VZ
1848size_t
1849wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1850 const char *src, size_t srcLen) const
c91830cb 1851{
467e0479
VZ
1852 srcLen = GetLength(src, srcLen);
1853 if ( srcLen == wxNO_LEN )
1854 return wxCONV_FAILED;
c91830cb 1855
5c33522f 1856 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1857 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1858 size_t outLen = 0;
ef199164 1859 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1860 {
c91830cb 1861 wxUint16 cc[2];
ef199164 1862 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1863 if ( numChars == wxCONV_FAILED )
1864 return wxCONV_FAILED;
c91830cb 1865
467e0479
VZ
1866 outLen += numChars;
1867 if ( dst )
c91830cb 1868 {
467e0479
VZ
1869 if ( outLen > dstLen )
1870 return wxCONV_FAILED;
d32a507d 1871
467e0479
VZ
1872 *dst++ = cc[0];
1873 if ( numChars == 2 )
1874 {
1875 // second character of a surrogate
1876 *dst++ = cc[1];
1877 }
1878 }
c91830cb 1879 }
b5153fd8 1880
467e0479 1881 return outLen;
c91830cb
VZ
1882}
1883
467e0479
VZ
1884size_t
1885wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1886 const wchar_t *src, size_t srcLen) const
c91830cb 1887{
467e0479
VZ
1888 if ( srcLen == wxNO_LEN )
1889 srcLen = wxWcslen(src) + 1;
c91830cb 1890
467e0479 1891 if ( !dst )
c91830cb 1892 {
467e0479
VZ
1893 // optimization: return maximal space which could be needed for this
1894 // string instead of the exact amount which could be less if there are
1895 // any surrogates in the input
1896 //
1897 // we consider that surrogates are rare enough to make it worthwhile to
1898 // avoid running the loop below at the cost of slightly extra memory
1899 // consumption
1900 return srcLen*BYTES_PER_CHAR;
1901 }
c91830cb 1902
5c33522f 1903 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1904 size_t outLen = 0;
1905 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1906 {
1907 const wxUint32 ch = wxDecodeSurrogate(&src);
1908 if ( !src )
1909 return wxCONV_FAILED;
c91830cb 1910
467e0479 1911 outLen += BYTES_PER_CHAR;
d32a507d 1912
467e0479
VZ
1913 if ( outLen > dstLen )
1914 return wxCONV_FAILED;
b5153fd8 1915
ef199164 1916 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1917 }
c91830cb 1918
467e0479 1919 return outLen;
c91830cb
VZ
1920}
1921
467e0479 1922#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1923
35d11700
VZ
1924// ----------------------------------------------------------------------------
1925// conversions without endianness change
1926// ----------------------------------------------------------------------------
1927
1928size_t
1929wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1930 const char *src, size_t srcLen) const
c91830cb 1931{
35d11700
VZ
1932 // use memcpy() as it should be much faster than hand-written loop
1933 srcLen = GetLength(src, srcLen);
1934 if ( srcLen == wxNO_LEN )
1935 return wxCONV_FAILED;
c91830cb 1936
35d11700
VZ
1937 const size_t inLen = srcLen/BYTES_PER_CHAR;
1938 if ( dst )
c91830cb 1939 {
35d11700
VZ
1940 if ( dstLen < inLen )
1941 return wxCONV_FAILED;
b5153fd8 1942
35d11700
VZ
1943 memcpy(dst, src, srcLen);
1944 }
c91830cb 1945
35d11700 1946 return inLen;
c91830cb
VZ
1947}
1948
35d11700
VZ
1949size_t
1950wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1951 const wchar_t *src, size_t srcLen) const
c91830cb 1952{
35d11700
VZ
1953 if ( srcLen == wxNO_LEN )
1954 srcLen = wxWcslen(src) + 1;
1955
1956 srcLen *= BYTES_PER_CHAR;
c91830cb 1957
35d11700 1958 if ( dst )
c91830cb 1959 {
35d11700
VZ
1960 if ( dstLen < srcLen )
1961 return wxCONV_FAILED;
c91830cb 1962
35d11700 1963 memcpy(dst, src, srcLen);
c91830cb
VZ
1964 }
1965
35d11700 1966 return srcLen;
c91830cb
VZ
1967}
1968
35d11700
VZ
1969// ----------------------------------------------------------------------------
1970// endian-reversing conversions
1971// ----------------------------------------------------------------------------
c91830cb 1972
35d11700
VZ
1973size_t
1974wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1975 const char *src, size_t srcLen) const
c91830cb 1976{
35d11700
VZ
1977 srcLen = GetLength(src, srcLen);
1978 if ( srcLen == wxNO_LEN )
1979 return wxCONV_FAILED;
1980
1981 srcLen /= BYTES_PER_CHAR;
c91830cb 1982
35d11700 1983 if ( dst )
c91830cb 1984 {
35d11700
VZ
1985 if ( dstLen < srcLen )
1986 return wxCONV_FAILED;
1987
5c33522f 1988 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1989 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1990 {
ef199164 1991 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1992 }
c91830cb 1993 }
b5153fd8 1994
35d11700 1995 return srcLen;
c91830cb
VZ
1996}
1997
35d11700
VZ
1998size_t
1999wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2000 const wchar_t *src, size_t srcLen) const
c91830cb 2001{
35d11700
VZ
2002 if ( srcLen == wxNO_LEN )
2003 srcLen = wxWcslen(src) + 1;
2004
2005 srcLen *= BYTES_PER_CHAR;
c91830cb 2006
35d11700 2007 if ( dst )
c91830cb 2008 {
35d11700
VZ
2009 if ( dstLen < srcLen )
2010 return wxCONV_FAILED;
2011
5c33522f 2012 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2013 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2014 {
ef199164 2015 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2016 }
c91830cb 2017 }
b5153fd8 2018
35d11700 2019 return srcLen;
c91830cb
VZ
2020}
2021
467e0479 2022#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2023
2024
36acb880
VZ
2025// ============================================================================
2026// The classes doing conversion using the iconv_xxx() functions
2027// ============================================================================
3caec1bb 2028
b040e242 2029#ifdef HAVE_ICONV
3a0d76bc 2030
b1d547eb
VS
2031// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2032// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2033// (unless there's yet another bug in glibc) the only case when iconv()
2034// returns with (size_t)-1 (which means error) and says there are 0 bytes
2035// left in the input buffer -- when _real_ error occurs,
2036// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2037// iconv() failure.
3caec1bb
VS
2038// [This bug does not appear in glibc 2.2.]
2039#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2040#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2041 (errno != E2BIG || bufLeft != 0))
2042#else
2043#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2044#endif
2045
ab217dba 2046#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 2047
74a7eb0b
VZ
2048#define ICONV_T_INVALID ((iconv_t)-1)
2049
2050#if SIZEOF_WCHAR_T == 4
2051 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2052 #define WC_ENC wxFONTENCODING_UTF32
2053#elif SIZEOF_WCHAR_T == 2
2054 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2055 #define WC_ENC wxFONTENCODING_UTF16
2056#else // sizeof(wchar_t) != 2 nor 4
2057 // does this ever happen?
2058 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2059#endif
2060
36acb880 2061// ----------------------------------------------------------------------------
e95354ec 2062// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2063// ----------------------------------------------------------------------------
2064
// Conversion between a multibyte charset and wchar_t implemented with a pair
// of iconv() conversion descriptors, one for each direction.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the charset name passed to iconv_open(); check IsOk() after
    // construction to find out whether both descriptors could be opened
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // implement base class virtual methods
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t GetMBNulLen() const;

#if wxUSE_UNICODE_UTF8
    virtual bool IsUTF8() const;
#endif

    // create a new converter for the same charset name, also copying the
    // cached NUL length so that it doesn't need to be computed again
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // true only if the conversion descriptors for both directions are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion
    wxString m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
2120
8f115891 2121// make the constructor available for unit testing
86501081 2122WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2123{
2124 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2125 if ( !result->IsOk() )
2126 {
2127 delete result;
2128 return 0;
2129 }
ef199164 2130
8f115891
MW
2131 return result;
2132}
2133
// definitions of the static members shared by all wxMBConv_iconv objects: the
// name starts empty and is filled in by the constructor once a usable wide
// char charset has been found
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2136
86501081 2137wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 2138 : m_name(name)
36acb880 2139{
c1464d9d 2140 m_minMBCharWidth = 0;
eec47cc6 2141
36acb880 2142 // check for charset that represents wchar_t:
74a7eb0b 2143 if ( ms_wcCharsetName.empty() )
f1339c56 2144 {
9a83f860 2145 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2146
74a7eb0b
VZ
2147#if wxUSE_FONTMAP
2148 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2149#else // !wxUSE_FONTMAP
91cb7f52 2150 static const wxChar *names_static[] =
36acb880 2151 {
74a7eb0b 2152#if SIZEOF_WCHAR_T == 4
9a83f860 2153 wxT("UCS-4"),
74a7eb0b 2154#elif SIZEOF_WCHAR_T = 2
9a83f860 2155 wxT("UCS-2"),
74a7eb0b
VZ
2156#endif
2157 NULL
2158 };
91cb7f52 2159 const wxChar **names = names_static;
74a7eb0b 2160#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2161
d1f024a8 2162 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2163 {
17a1ebd1 2164 const wxString nameCS(*names);
74a7eb0b
VZ
2165
2166 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2167 wxString nameXE(nameCS);
ef199164
DS
2168
2169#ifdef WORDS_BIGENDIAN
9a83f860 2170 nameXE += wxT("BE");
ef199164 2171#else // little endian
9a83f860 2172 nameXE += wxT("LE");
ef199164 2173#endif
74a7eb0b 2174
9a83f860 2175 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2176 nameXE.c_str());
2177
86501081 2178 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2179 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2180 {
74a7eb0b 2181 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2182 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2183 nameCS.c_str());
86501081 2184 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2185
74a7eb0b
VZ
2186 // and check for bytesex ourselves:
2187 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2188 {
74a7eb0b 2189 char buf[2], *bufPtr;
e8769ed1 2190 wchar_t wbuf[2];
74a7eb0b
VZ
2191 size_t insz, outsz;
2192 size_t res;
2193
2194 buf[0] = 'A';
2195 buf[1] = 0;
2196 wbuf[0] = 0;
2197 insz = 2;
2198 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2199 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2200 bufPtr = buf;
2201
ef199164
DS
2202 res = iconv(
2203 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2204 &wbufPtr, &outsz);
74a7eb0b
VZ
2205
2206 if (ICONV_FAILED(res, insz))
2207 {
2208 wxLogLastError(wxT("iconv"));
422e411e 2209 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2210 nameCS.c_str());
74a7eb0b
VZ
2211 }
2212 else // ok, can convert to this encoding, remember it
2213 {
17a1ebd1 2214 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2215 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2216 }
3a0d76bc
VS
2217 }
2218 }
74a7eb0b 2219 else // use charset not requiring byte swapping
36acb880 2220 {
74a7eb0b 2221 ms_wcCharsetName = nameXE;
36acb880 2222 }
3a0d76bc 2223 }
74a7eb0b 2224
0944fceb 2225 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2226 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2227 ms_wcCharsetName.empty() ? wxString("<none>")
2228 : ms_wcCharsetName,
9a83f860
VZ
2229 ms_wcNeedsSwap ? wxT(" (needs swap)")
2230 : wxT(""));
3a0d76bc 2231 }
36acb880 2232 else // we already have ms_wcCharsetName
3caec1bb 2233 {
86501081 2234 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2235 }
dccce9ea 2236
74a7eb0b 2237 if ( ms_wcCharsetName.empty() )
f1339c56 2238 {
74a7eb0b 2239 w2m = ICONV_T_INVALID;
36acb880 2240 }
405d8f46
VZ
2241 else
2242 {
86501081 2243 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2244 if ( w2m == ICONV_T_INVALID )
2245 {
2246 wxLogTrace(TRACE_STRCONV,
2247 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2248 ms_wcCharsetName.c_str(), name);
74a7eb0b 2249 }
405d8f46 2250 }
36acb880 2251}
3caec1bb 2252
e95354ec 2253wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2254{
74a7eb0b 2255 if ( m2w != ICONV_T_INVALID )
36acb880 2256 iconv_close(m2w);
74a7eb0b 2257 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2258 iconv_close(w2m);
2259}
3a0d76bc 2260
8f4b0f43
VZ
2261size_t
2262wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2263 const char *src, size_t srcLen) const
36acb880 2264{
8f4b0f43 2265 if ( srcLen == wxNO_LEN )
69373110 2266 {
8f4b0f43
VZ
2267 // find the string length: notice that must be done differently for
2268 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2269 // consecutive NULs
2270 const size_t nulLen = GetMBNulLen();
2271 switch ( nulLen )
2272 {
2273 default:
2274 return wxCONV_FAILED;
69373110 2275
8f4b0f43
VZ
2276 case 1:
2277 srcLen = strlen(src); // arguably more optimized than our version
2278 break;
69373110 2279
8f4b0f43
VZ
2280 case 2:
2281 case 4:
2282 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2283 // but they also have to start at character boundary and not
2284 // span two adjacent characters
2285 const char *p;
2286 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2287 ;
2288 srcLen = p - src;
2289 break;
2290 }
d50c0831
VZ
2291
2292 // when we're determining the length of the string ourselves we count
2293 // the terminating NUL(s) as part of it and always NUL-terminate the
2294 // output
2295 srcLen += nulLen;
69373110
VZ
2296 }
2297
8f4b0f43
VZ
2298 // we express length in the number of (wide) characters but iconv always
2299 // counts buffer sizes it in bytes
2300 dstLen *= SIZEOF_WCHAR_T;
2301
b1d547eb 2302#if wxUSE_THREADS
6a17b868
SN
2303 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2304 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2305 // wxConvLocal that are used all over wx code, so we have to make sure
2306 // the handle is used by at most one thread at the time. Otherwise
2307 // only a few wx classes would be safe to use from non-main threads
2308 // as MB<->WC conversion would fail "randomly".
2309 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2310#endif // wxUSE_THREADS
2311
36acb880 2312 size_t res, cres;
8f4b0f43 2313 const char *pszPtr = src;
36acb880 2314
8f4b0f43 2315 if ( dst )
36acb880 2316 {
8f4b0f43 2317 char* bufPtr = (char*)dst;
e8769ed1 2318
36acb880 2319 // have destination buffer, convert there
1752fda6 2320 size_t dstLenOrig = dstLen;
36acb880 2321 cres = iconv(m2w,
8f4b0f43
VZ
2322 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2323 &bufPtr, &dstLen);
1752fda6
VZ
2324
2325 // convert the number of bytes converted as returned by iconv to the
2326 // number of (wide) characters converted that we need
2327 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2328
36acb880 2329 if (ms_wcNeedsSwap)
3a0d76bc 2330 {
36acb880 2331 // convert to native endianness
17a1ebd1 2332 for ( unsigned i = 0; i < res; i++ )
467a2982 2333 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2334 }
36acb880 2335 }
8f4b0f43 2336 else // no destination buffer
36acb880 2337 {
8f4b0f43 2338 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2339 wchar_t tbuf[256];
36acb880 2340 res = 0;
ef199164
DS
2341
2342 do
2343 {
e8769ed1 2344 char* bufPtr = (char*)tbuf;
8f4b0f43 2345 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2346
2347 cres = iconv(m2w,
8f4b0f43
VZ
2348 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2349 &bufPtr, &dstLen );
36acb880 2350
8f4b0f43 2351 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2352 }
2353 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2354 }
dccce9ea 2355
8f4b0f43 2356 if (ICONV_FAILED(cres, srcLen))
f1339c56 2357 {
36acb880 2358 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2359 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2360 return wxCONV_FAILED;
36acb880
VZ
2361 }
2362
2363 return res;
2364}
2365
8f4b0f43
VZ
2366size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2367 const wchar_t *src, size_t srcLen) const
36acb880 2368{
b1d547eb
VS
2369#if wxUSE_THREADS
2370 // NB: explained in MB2WC
2371 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2372#endif
3698ae71 2373
8f4b0f43 2374 if ( srcLen == wxNO_LEN )
2588ee86 2375 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2376
2377 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2378 size_t outbuflen = dstLen;
36acb880 2379 size_t res, cres;
3a0d76bc 2380
36acb880 2381 wchar_t *tmpbuf = 0;
3caec1bb 2382
36acb880
VZ
2383 if (ms_wcNeedsSwap)
2384 {
2385 // need to copy to temp buffer to switch endianness
51725fc0 2386 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2387 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2388 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2389 for ( size_t i = 0; i < srcLen; i++ )
2390 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2391
8f4b0f43 2392 src = tmpbuf;
36acb880 2393 }
3a0d76bc 2394
8f4b0f43
VZ
2395 char* inbuf = (char*)src;
2396 if ( dst )
36acb880
VZ
2397 {
2398 // have destination buffer, convert there
8f4b0f43 2399 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2400
8f4b0f43 2401 res = dstLen - outbuflen;
36acb880 2402 }
8f4b0f43 2403 else // no destination buffer
36acb880 2404 {
8f4b0f43 2405 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2406 char tbuf[256];
36acb880 2407 res = 0;
ef199164
DS
2408 do
2409 {
8f4b0f43 2410 dst = tbuf;
51725fc0 2411 outbuflen = WXSIZEOF(tbuf);
36acb880 2412
8f4b0f43 2413 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2414
51725fc0 2415 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2416 }
2417 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2418 }
dccce9ea 2419
36acb880
VZ
2420 if (ms_wcNeedsSwap)
2421 {
2422 free(tmpbuf);
2423 }
dccce9ea 2424
e8769ed1 2425 if (ICONV_FAILED(cres, inbuflen))
36acb880 2426 {
ce6f8d6f 2427 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2428 return wxCONV_FAILED;
36acb880
VZ
2429 }
2430
2431 return res;
2432}
2433
7ef3ab50 2434size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2435{
c1464d9d 2436 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2437 {
2438 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2439
2440#if wxUSE_THREADS
2441 // NB: explained in MB2WC
2442 wxMutexLocker lock(self->m_iconvMutex);
2443#endif
2444
999020e1 2445 const wchar_t *wnul = L"";
c1464d9d 2446 char buf[8]; // should be enough for NUL in any encoding
356410fc 2447 size_t inLen = sizeof(wchar_t),
c1464d9d 2448 outLen = WXSIZEOF(buf);
ef199164
DS
2449 char *inBuff = (char *)wnul;
2450 char *outBuff = buf;
2451 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2452 {
c1464d9d 2453 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2454 }
2455 else // ok
2456 {
ef199164 2457 self->m_minMBCharWidth = outBuff - buf;
356410fc 2458 }
eec47cc6
VZ
2459 }
2460
c1464d9d 2461 return m_minMBCharWidth;
eec47cc6
VZ
2462}
2463
ba98e032
VS
2464#if wxUSE_UNICODE_UTF8
2465bool wxMBConv_iconv::IsUTF8() const
2466{
86501081
VS
2467 return wxStricmp(m_name, "UTF-8") == 0 ||
2468 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2469}
2470#endif
2471
b040e242 2472#endif // HAVE_ICONV
36acb880 2473
e95354ec 2474
36acb880
VZ
2475// ============================================================================
2476// Win32 conversion classes
2477// ============================================================================
1cd52418 2478
e95354ec 2479#ifdef wxHAVE_WIN32_MB2WC
373658eb 2480
8b04d4c4 2481// from utils.cpp
d775fa82 2482#if wxUSE_FONTMAP
86501081 2483extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2484extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2485#endif
373658eb 2486
// Conversion class implemented using the Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions for the code page stored in m_CodePage.
class wxMBConv_win32 : public wxMBConv
{
public:
    // use the CP_ACP code page
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
        m_minMBCharWidth = 0;
    }

    // copy the code page and the cached NUL length of the other object
    wxMBConv_win32(const wxMBConv_win32& conv)
        : wxMBConv()
    {
        m_CodePage = conv.m_CodePage;
        m_minMBCharWidth = conv.m_minMBCharWidth;
    }

#if wxUSE_FONTMAP
    // use the code page corresponding to the given charset name, check
    // IsOk() afterwards
    wxMBConv_win32(const char* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
        m_minMBCharWidth = 0;
    }

    // use the code page corresponding to the given encoding, check IsOk()
    // afterwards
    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
        m_minMBCharWidth = 0;
    }
#endif // wxUSE_FONTMAP

    // Convert the NUL-terminated string psz to wide characters.
    //
    // If buf is non-NULL, the result is stored in it, n being its size,
    // otherwise only the length of the result is computed.
    //
    // Returns the length of the result, not counting the terminating NUL, or
    // wxCONV_FAILED if the conversion failed or was lossy.
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and break the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
        // Win XP or newer and it is not supported for UTF-[78] so we always
        // use our own conversions in this case. See
        //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
        //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
        if ( m_CodePage == CP_UTF8 )
        {
            return wxMBConvUTF8().MB2WC(buf, psz, n);
        }

        if ( m_CodePage == CP_UTF7 )
        {
            return wxMBConvUTF7().MB2WC(buf, psz, n);
        }

        int flags = 0;
        if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
                IsAtLeastWin2kSP4() )
        {
            flags = MB_ERR_INVALID_CHARS;
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                flags,          // flags: fall on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
        // check if we succeeded, by doing a double trip:
        if ( !flags && buf )
        {
            const size_t mbLen = strlen(psz);
            wxCharBuffer mbBuf(mbLen);
            if ( ::WideCharToMultiByte
                   (
                      m_CodePage,
                      0,
                      buf,
                      -1,
                      mbBuf.data(),
                      mbLen + 1,        // size in bytes, not length
                      NULL,
                      NULL
                   ) == 0 ||
                  strcmp(mbBuf, psz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }

    // Convert the NUL-terminated wide string pwz to the multibyte encoding.
    //
    // If buf is non-NULL, the result is stored in it, n being its size,
    // otherwise only the length of the result is computed.
    //
    // Returns the length of the result, not counting the terminating NUL, or
    // wxCONV_FAILED if the conversion failed or was lossy.
    virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // we did something, check if we really succeeded
        if ( flags )
        {
            // check if the conversion failed, i.e. if any replacements
            // were done
            if ( usedDef )
                return wxCONV_FAILED;
        }
        else // we must resort to double tripping...
        {
            // first we need to ensure that we really have the MB data: this is
            // not the case if we're called with NULL buffer, in which case we
            // need to do the conversion yet again
            wxCharBuffer bufDef;
            if ( !buf )
            {
                bufDef = wxCharBuffer(len);
                buf = bufDef.data();
                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
                                            buf, len, NULL, NULL) )
                    return wxCONV_FAILED;
            }

            // n is used as the size of the wide buffer for the conversion
            // back, fall back to the length of the input if it wasn't given
            if ( !n )
                n = wcslen(pwz);
            wxWCharBuffer wcBuf(n);
            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
                    wcscmp(wcBuf, pwz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // see the comment above for the reason of "len - 1"
        return len - 1;
    }

    // Return the number of bytes used for NUL in this code page (1, 2 or 4)
    // or (size_t)-1 if it couldn't be determined; the value is computed when
    // the function is first called and cached in m_minMBCharWidth.
    virtual size_t GetMBNulLen() const
    {
        if ( m_minMBCharWidth == 0 )
        {
            int len = ::WideCharToMultiByte
                        (
                            m_CodePage,     // code page
                            0,              // no flags
                            L"",            // input string
                            1,              // translate just the NUL
                            NULL,           // output buffer
                            0,              // and its size
                            NULL,           // no replacement char
                            NULL            // [out] don't care if it was used
                        );

            // the cache has to be updated from this const method
            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
            switch ( len )
            {
                default:
                    wxLogDebug(wxT("Unexpected NUL length %d"), len);
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 0:
                    // the conversion failed
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 1:
                case 2:
                case 4:
                    self->m_minMBCharWidth = len;
                    break;
            }
        }

        return m_minMBCharWidth;
    }

    virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }

    // -1 is presumably what wxCharsetToCodepage() and wxEncodingToCodepage()
    // return for the charsets they don't know about -- TODO confirm
    bool IsOk() const { return m_CodePage != -1; }

private:
    // check (once, the result is cached in a static variable) whether the
    // Windows version we're running under is recent enough for WC2MB() to use
    // WC_NO_BEST_FIT_CHARS
    static bool CanUseNoBestFit()
    {
        static int s_isWin98Or2k = -1;

        if ( s_isWin98Or2k == -1 )
        {
            int verMaj, verMin;
            switch ( wxGetOsVersion(&verMaj, &verMin) )
            {
                case wxOS_WINDOWS_9X:
                    s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
                    break;

                case wxOS_WINDOWS_NT:
                    s_isWin98Or2k = verMaj >= 5;
                    break;

                default:
                    // unknown: be conservative by default
                    s_isWin98Or2k = 0;
                    break;
            }

            wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
        }

        return s_isWin98Or2k == 1;
    }

    // check (once, the result is cached in a static variable) whether we're
    // running under Windows 2000 SP4 or later, which MB2WC() requires for
    // using MB_ERR_INVALID_CHARS; always false under Windows CE
    static bool IsAtLeastWin2kSP4()
    {
#ifdef __WXWINCE__
        return false;
#else
        static int s_isAtLeastWin2kSP4 = -1;

        if ( s_isAtLeastWin2kSP4 == -1 )
        {
            OSVERSIONINFOEX ver;

            memset(&ver, 0, sizeof(ver));
            ver.dwOSVersionInfoSize = sizeof(ver);
            GetVersionEx((OSVERSIONINFO*)&ver);

            s_isAtLeastWin2kSP4 =
              ((ver.dwMajorVersion > 5) || // Vista+
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
               ver.wServicePackMajor >= 4)) // 2000 SP4+
              ? 1 : 0;
        }

        return s_isAtLeastWin2kSP4 == 1;
#endif
    }


    // the code page we're working with
    long m_CodePage;

    // cached result of GetMBNulLen(), set to 0 initially meaning
    // "unknown"
    size_t m_minMBCharWidth;
};
e95354ec
VZ
2794
2795#endif // wxHAVE_WIN32_MB2WC
2796
f7e98dee 2797
36acb880
VZ
2798// ============================================================================
2799// wxEncodingConverter based conversion classes
2800// ============================================================================
2801
1e6feb95 2802#if wxUSE_FONTMAP
1cd52418 2803
e95354ec 2804class wxMBConv_wxwin : public wxMBConv
1cd52418 2805{
8b04d4c4
VZ
2806private:
2807 void Init()
2808 {
6ac84a78
DE
2809 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2810 // The wxMBConv_cf class does a better job.
2811 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2812 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2813 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2814 }
2815
6001e347 2816public:
f1339c56
RR
2817 // temporarily just use wxEncodingConverter stuff,
2818 // so that it works while a better implementation is built
86501081 2819 wxMBConv_wxwin(const char* name)
f1339c56
RR
2820 {
2821 if (name)
267e11c5 2822 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2823 else
2824 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2825
8b04d4c4
VZ
2826 Init();
2827 }
2828
e95354ec 2829 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2830 {
2831 m_enc = enc;
2832
2833 Init();
f1339c56 2834 }
dccce9ea 2835
bde4baac 2836 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2837 {
2838 size_t inbuf = strlen(psz);
dccce9ea 2839 if (buf)
c643a977 2840 {
ef199164 2841 if (!m2w.Convert(psz, buf))
467e0479 2842 return wxCONV_FAILED;
c643a977 2843 }
f1339c56
RR
2844 return inbuf;
2845 }
dccce9ea 2846
bde4baac 2847 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2848 {
f8d791e0 2849 const size_t inbuf = wxWcslen(psz);
f1339c56 2850 if (buf)
c643a977 2851 {
ef199164 2852 if (!w2m.Convert(psz, buf))
467e0479 2853 return wxCONV_FAILED;
c643a977 2854 }
dccce9ea 2855
f1339c56
RR
2856 return inbuf;
2857 }
dccce9ea 2858
7ef3ab50 2859 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2860 {
2861 switch ( m_enc )
2862 {
2863 case wxFONTENCODING_UTF16BE:
2864 case wxFONTENCODING_UTF16LE:
c1464d9d 2865 return 2;
eec47cc6
VZ
2866
2867 case wxFONTENCODING_UTF32BE:
2868 case wxFONTENCODING_UTF32LE:
c1464d9d 2869 return 4;
eec47cc6
VZ
2870
2871 default:
c1464d9d 2872 return 1;
eec47cc6
VZ
2873 }
2874 }
2875
d36c9347
VZ
2876 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2877
7ef3ab50
VZ
2878 bool IsOk() const { return m_ok; }
2879
2880public:
2881 wxFontEncoding m_enc;
2882 wxEncodingConverter m2w, w2m;
2883
2884private:
cafbf6fb
VZ
2885 // were we initialized successfully?
2886 bool m_ok;
fc7a2a60 2887
c0c133e1 2888 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2889};
6001e347 2890
8f115891 2891// make the constructors available for unit testing
86501081 2892WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2893{
2894 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2895 if ( !result->IsOk() )
2896 {
2897 delete result;
2898 return 0;
2899 }
ef199164 2900
8f115891
MW
2901 return result;
2902}
2903
1e6feb95
VZ
2904#endif // wxUSE_FONTMAP
2905
36acb880
VZ
2906// ============================================================================
2907// wxCSConv implementation
2908// ============================================================================
2909
8b04d4c4 2910void wxCSConv::Init()
6001e347 2911{
e95354ec
VZ
2912 m_name = NULL;
2913 m_convReal = NULL;
2914 m_deferred = true;
2915}
2916
86501081 2917wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2918{
2919 Init();
82713003 2920
86501081 2921 if ( !charset.empty() )
e95354ec 2922 {
86501081 2923 SetName(charset.ToAscii());
e95354ec 2924 }
bda3d86a 2925
e4277538
VZ
2926#if wxUSE_FONTMAP
2927 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
e3276230
VZ
2928 if ( m_encoding == wxFONTENCODING_MAX )
2929 {
2930 // set to unknown/invalid value
2931 m_encoding = wxFONTENCODING_SYSTEM;
2932 }
2933 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2934 {
2935 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2936 m_encoding = wxFONTENCODING_ISO8859_1;
2937 }
e4277538 2938#else
bda3d86a 2939 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2940#endif
6001e347
RR
2941}
2942
8b04d4c4
VZ
2943wxCSConv::wxCSConv(wxFontEncoding encoding)
2944{
bda3d86a 2945 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 2946 {
9a83f860 2947 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
2948
2949 encoding = wxFONTENCODING_SYSTEM;
2950 }
2951
8b04d4c4
VZ
2952 Init();
2953
bda3d86a 2954 m_encoding = encoding;
8b04d4c4
VZ
2955}
2956
6001e347
RR
// Free the charset name and destroy the real converter, if any: all of this
// is done by Clear().
wxCSConv::~wxCSConv()
{
    Clear();
}
2961
54380f29 2962wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2963 : wxMBConv()
54380f29 2964{
8b04d4c4
VZ
2965 Init();
2966
54380f29 2967 SetName(conv.m_name);
8b04d4c4 2968 m_encoding = conv.m_encoding;
54380f29
GD
2969}
2970
2971wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2972{
2973 Clear();
8b04d4c4 2974
54380f29 2975 SetName(conv.m_name);
8b04d4c4
VZ
2976 m_encoding = conv.m_encoding;
2977
54380f29
GD
2978 return *this;
2979}
2980
65e50848
JS
2981void wxCSConv::Clear()
2982{
8b04d4c4 2983 free(m_name);
e95354ec 2984 delete m_convReal;
8b04d4c4 2985
65e50848 2986 m_name = NULL;
e95354ec 2987 m_convReal = NULL;
6001e347
RR
2988}
2989
86501081 2990void wxCSConv::SetName(const char *charset)
6001e347 2991{
f1339c56
RR
2992 if (charset)
2993 {
d6f2a891 2994 m_name = wxStrdup(charset);
e95354ec 2995 m_deferred = true;
f1339c56 2996 }
6001e347
RR
2997}
2998
8b3eb85d 2999#if wxUSE_FONTMAP
8b3eb85d
VZ
3000
// hash map associating a charset name with an encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache of the charset names which worked with iconv for the encodings
// already tried by wxCSConv::DoCreate(); an empty name is stored for an
// encoding if none of its names worked
static wxEncodingNameCache gs_nameCache;
3005#endif
3006
e95354ec
VZ
3007wxMBConv *wxCSConv::DoCreate() const
3008{
ce6f8d6f
VZ
3009#if wxUSE_FONTMAP
3010 wxLogTrace(TRACE_STRCONV,
3011 wxT("creating conversion for %s"),
3012 (m_name ? m_name
86501081 3013 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3014#endif // wxUSE_FONTMAP
3015
c547282d
VZ
3016 // check for the special case of ASCII or ISO8859-1 charset: as we have
3017 // special knowledge of it anyhow, we don't need to create a special
3018 // conversion object
e4277538
VZ
3019 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3020 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 3021 {
e95354ec
VZ
3022 // don't convert at all
3023 return NULL;
3024 }
dccce9ea 3025
e95354ec
VZ
3026 // we trust OS to do conversion better than we can so try external
3027 // conversion methods first
3028 //
3029 // the full order is:
3030 // 1. OS conversion (iconv() under Unix or Win32 API)
3031 // 2. hard coded conversions for UTF
3032 // 3. wxEncodingConverter as fall back
3033
3034 // step (1)
3035#ifdef HAVE_ICONV
c547282d 3036#if !wxUSE_FONTMAP
e95354ec 3037 if ( m_name )
c547282d 3038#endif // !wxUSE_FONTMAP
e95354ec 3039 {
3ef10cfc 3040#if wxUSE_FONTMAP
8b3eb85d 3041 wxFontEncoding encoding(m_encoding);
3ef10cfc 3042#endif
8b3eb85d 3043
86501081 3044 if ( m_name )
8b3eb85d 3045 {
86501081 3046 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3047 if ( conv->IsOk() )
3048 return conv;
3049
3050 delete conv;
c547282d
VZ
3051
3052#if wxUSE_FONTMAP
8b3eb85d 3053 encoding =
86501081 3054 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3055#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3056 }
3057#if wxUSE_FONTMAP
3058 {
3059 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3060 if ( it != gs_nameCache.end() )
3061 {
3062 if ( it->second.empty() )
3063 return NULL;
c547282d 3064
86501081 3065 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3066 if ( conv->IsOk() )
3067 return conv;
e95354ec 3068
8b3eb85d
VZ
3069 delete conv;
3070 }
3071
3072 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3073 // CS : in case this does not return valid names (eg for MacRoman)
3074 // encoding got a 'failure' entry in the cache all the same,
3075 // although it just has to be created using a different method, so
3076 // only store failed iconv creation attempts (or perhaps we
3077 // shoulnd't do this at all ?)
3c67ec06 3078 if ( names[0] != NULL )
8b3eb85d 3079 {
3c67ec06 3080 for ( ; *names; ++names )
8b3eb85d 3081 {
86501081
VS
3082 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3083 // will need changes that will obsolete this
3084 wxString name(*names);
3085 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3086 if ( conv->IsOk() )
3087 {
3088 gs_nameCache[encoding] = *names;
3089 return conv;
3090 }
3091
3092 delete conv;
8b3eb85d
VZ
3093 }
3094
9a83f860 3095 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3096 }
8b3eb85d
VZ
3097 }
3098#endif // wxUSE_FONTMAP
e95354ec
VZ
3099 }
3100#endif // HAVE_ICONV
3101
3102#ifdef wxHAVE_WIN32_MB2WC
3103 {
7608a683 3104#if wxUSE_FONTMAP
e95354ec
VZ
3105 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3106 : new wxMBConv_win32(m_encoding);
3107 if ( conv->IsOk() )
3108 return conv;
3109
3110 delete conv;
7608a683
WS
3111#else
3112 return NULL;
3113#endif
e95354ec
VZ
3114 }
3115#endif // wxHAVE_WIN32_MB2WC
ef199164 3116
5c4ed98d 3117#ifdef __DARWIN__
f7e98dee 3118 {
6ff49cbc
DE
3119 // leave UTF16 and UTF32 to the built-ins of wx
3120 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3121 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3122 {
a6900d10 3123#if wxUSE_FONTMAP
5c4ed98d
DE
3124 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3125 : new wxMBConv_cf(m_encoding);
a6900d10 3126#else
5c4ed98d 3127 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3128#endif
ef199164 3129
f7e98dee 3130 if ( conv->IsOk() )
d775fa82
WS
3131 return conv;
3132
3133 delete conv;
3134 }
335d31e0 3135 }
5c4ed98d
DE
3136#endif // __DARWIN__
3137
e95354ec
VZ
3138 // step (2)
3139 wxFontEncoding enc = m_encoding;
3140#if wxUSE_FONTMAP
c547282d
VZ
3141 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3142 {
3143 // use "false" to suppress interactive dialogs -- we can be called from
3144 // anywhere and popping up a dialog from here is the last thing we want to
3145 // do
267e11c5 3146 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3147 }
e95354ec
VZ
3148#endif // wxUSE_FONTMAP
3149
3150 switch ( enc )
3151 {
3152 case wxFONTENCODING_UTF7:
3153 return new wxMBConvUTF7;
3154
3155 case wxFONTENCODING_UTF8:
3156 return new wxMBConvUTF8;
3157
e95354ec
VZ
3158 case wxFONTENCODING_UTF16BE:
3159 return new wxMBConvUTF16BE;
3160
3161 case wxFONTENCODING_UTF16LE:
3162 return new wxMBConvUTF16LE;
3163
e95354ec
VZ
3164 case wxFONTENCODING_UTF32BE:
3165 return new wxMBConvUTF32BE;
3166
3167 case wxFONTENCODING_UTF32LE:
3168 return new wxMBConvUTF32LE;
3169
3170 default:
3171 // nothing to do but put here to suppress gcc warnings
ef199164 3172 break;
e95354ec
VZ
3173 }
3174
3175 // step (3)
3176#if wxUSE_FONTMAP
3177 {
3178 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3179 : new wxMBConv_wxwin(m_encoding);
3180 if ( conv->IsOk() )
3181 return conv;
3182
3183 delete conv;
3184 }
ef199164 3185
3df31b2d
VZ
3186 wxLogTrace(TRACE_STRCONV,
3187 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3188 (m_name ? wxString(m_name)
3df31b2d
VZ
3189 : wxFontMapperBase::GetEncodingName(m_encoding)));
3190#endif // wxUSE_FONTMAP
e95354ec
VZ
3191
3192 return NULL;
3193}
3194
3195void wxCSConv::CreateConvIfNeeded() const
3196{
3197 if ( m_deferred )
3198 {
3199 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 3200
bda3d86a
VZ
3201 // if we don't have neither the name nor the encoding, use the default
3202 // encoding for this system
3203 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3204 {
4c75209f 3205#if wxUSE_INTL
02c7347b 3206 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3207#else
3208 // fallback to some reasonable default:
3209 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3210#endif // wxUSE_INTL
4c75209f 3211 }
bda3d86a 3212
e95354ec
VZ
3213 self->m_convReal = DoCreate();
3214 self->m_deferred = false;
6001e347 3215 }
6001e347
RR
3216}
3217
0f0298b1
VZ
3218bool wxCSConv::IsOk() const
3219{
3220 CreateConvIfNeeded();
3221
3222 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3223 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3224 return true; // always ok as we do it ourselves
3225
3226 // m_convReal->IsOk() is called at its own creation, so we know it must
3227 // be ok if m_convReal is non-NULL
3228 return m_convReal != NULL;
3229}
3230
1c714a5d
VZ
3231size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3232 const char *src, size_t srcLen) const
3233{
3234 CreateConvIfNeeded();
3235
2c74c558
VS
3236 if (m_convReal)
3237 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3238
3239 // latin-1 (direct)
05392dc8
VZ
3240 if ( srcLen == wxNO_LEN )
3241 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3242
05392dc8
VZ
3243 if ( dst )
3244 {
3245 if ( dstLen < srcLen )
3246 return wxCONV_FAILED;
1c714a5d 3247
05392dc8
VZ
3248 for ( size_t n = 0; n < srcLen; n++ )
3249 dst[n] = (unsigned char)(src[n]);
3250 }
2c74c558 3251
05392dc8 3252 return srcLen;
1c714a5d
VZ
3253}
3254
05392dc8
VZ
3255size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3256 const wchar_t *src, size_t srcLen) const
6001e347 3257{
e95354ec 3258 CreateConvIfNeeded();
dccce9ea 3259
e95354ec 3260 if (m_convReal)
05392dc8 3261 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3262
3263 // latin-1 (direct)
05392dc8
VZ
3264 if ( srcLen == wxNO_LEN )
3265 srcLen = wxWcslen(src) + 1;
dccce9ea 3266
05392dc8 3267 if ( dst )
f1339c56 3268 {
05392dc8
VZ
3269 if ( dstLen < srcLen )
3270 return wxCONV_FAILED;
1cd52418 3271
05392dc8 3272 for ( size_t n = 0; n < srcLen; n++ )
24642831 3273 {
05392dc8 3274 if ( src[n] > 0xFF )
467e0479 3275 return wxCONV_FAILED;
ef199164 3276
05392dc8 3277 dst[n] = (char)src[n];
24642831 3278 }
05392dc8 3279
24642831 3280 }
05392dc8 3281 else // still need to check the input validity
24642831 3282 {
05392dc8 3283 for ( size_t n = 0; n < srcLen; n++ )
24642831 3284 {
05392dc8 3285 if ( src[n] > 0xFF )
467e0479 3286 return wxCONV_FAILED;
24642831 3287 }
f1339c56 3288 }
dccce9ea 3289
05392dc8 3290 return srcLen;
6001e347
RR
3291}
3292
7ef3ab50 3293size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3294{
3295 CreateConvIfNeeded();
3296
3297 if ( m_convReal )
3298 {
7ef3ab50 3299 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3300 }
3301
ba98e032 3302 // otherwise, we are ISO-8859-1
c1464d9d 3303 return 1;
eec47cc6
VZ
3304}
3305
ba98e032
VS
3306#if wxUSE_UNICODE_UTF8
3307bool wxCSConv::IsUTF8() const
3308{
3309 CreateConvIfNeeded();
3310
3311 if ( m_convReal )
3312 {
3313 return m_convReal->IsUTF8();
3314 }
3315
3316 // otherwise, we are ISO-8859-1
3317 return false;
3318}
3319#endif
3320
69c928ef
VZ
3321
3322#if wxUSE_UNICODE
3323
3324wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3325{
3326 if ( !s )
3327 return wxWCharBuffer();
3328
3329 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3330 if ( !wbuf )
5487ff0f 3331 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3332 if ( !wbuf )
3333 wbuf = wxConvISO8859_1.cMB2WX(s);
3334
3335 return wbuf;
3336}
3337
3338wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3339{
3340 if ( !ws )
3341 return wxCharBuffer();
3342
3343 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3344 if ( !buf )
3345 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3346
3347 return buf;
3348}
3349
3350#endif // wxUSE_UNICODE
f5a1953b 3351
1e50d914
VS
3352// ----------------------------------------------------------------------------
3353// globals
3354// ----------------------------------------------------------------------------
3355
3356// NB: The reason why we create converter objects in this convoluted way,
3357// using a factory function instead of global variable, is that they
3358// may be used at static initialization time (some of them are used by
3359// wxString ctors and there may be a global wxString object). In other
3360// words, possibly _before_ the converter global object would be
3361// initialized.
3362
3363#undef wxConvLibc
3364#undef wxConvUTF8
3365#undef wxConvUTF7
3366#undef wxConvLocal
3367#undef wxConvISO8859_1
3368
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) expands to:
//  - an exported pointer variable name##Ptr of type klass*, initially NULL
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass, declared with
//    ctor_args pasted directly after the object name
//  - a file-static pointer gs_##name##instance initialized by calling the
//    accessor
// The expansion does not end with a semicolon, the macro user supplies it.
3369#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3370 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3371 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3372 { \
3373 static impl_klass name##Obj ctor_args; \
3374 return &name##Obj; \
3375 } \
3376 /* this ensures that all global converter objects are created */ \
3377 /* by the time static initialization is done, i.e. before any */ \
3378 /* thread is launched: */ \
3379 static klass* gs_##name##instance = wxGet_##name##Ptr()
3380
// Shorthand for the case when the declared pointer type and the type of the
// object actually created are the same class.
3381#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3382 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3383
5c69ef61
VZ
3384#ifdef __INTELC__
3385 // disable warning "variable 'xxx' was declared but never referenced"
3386 #pragma warning(disable: 177)
3387#endif // Intel C++
3388
1e50d914
VS
// wxConvLibc: always declared as wxMBConv, the class of the object actually
// created depends on the platform (the __WXOSX__ branch is disabled by
// "#elif 0")
3389#ifdef __WINDOWS__
3390 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3391#elif 0 // defined(__WXOSX__)
3392 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3393#else
3394 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3395#endif
3396
e1079eda
VZ
3397// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3398// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3399// provokes an error message about "not enough macro parameters"; and we
3400// can't use "()" here as the name##Obj declaration would be parsed as a
3401// function declaration then, so use a semicolon and live with an extra
3402// empty statement (and hope that no compiler warns about this)
3403WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3404WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3405
// both wxCSConv-based converters get their encoding as constructor argument
3406WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3407WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3408
// these pointers are initialized from the accessor functions generated by
// the macros above and so refer to the wxConvLibc and wxConvLocal objects
3409WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3410WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3411
6ac84a78
DE
3412#ifdef __DARWIN__
3413// The xnu kernel always communicates file paths in decomposed UTF-8.
3414// WARNING: Are we sure that CFString's conversion will cause decomposition?
3415static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3416#endif
6ac84a78 3417
// wxConvFileName points to the wxMBConv_cf UTF-8 object defined above under
// Darwin and to the wxConvLibc object everywhere else
1e50d914 3418WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3419#ifdef __DARWIN__
1e50d914 3420 &wxConvMacUTF8DObj;
6ac84a78 3421#else // !__DARWIN__
1e50d914 3422 wxGet_wxConvLibcPtr();
6ac84a78 3423#endif // __DARWIN__/!__DARWIN__
1e50d914 3424
bde4baac
VZ
3425#else // !wxUSE_WCHAR_T
3426
1e50d914 3427// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3428// stand-ins in absence of wchar_t
// (these are plain wxMBConv objects declared under the usual converter
// names, unlike the pointer/accessor pairs of the wxUSE_WCHAR_T branch)
3429WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3430 wxConvISO8859_1,
3431 wxConvLocal,
3432 wxConvUTF8;
3433
3434#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T