]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Merge SOC2009_FSWATCHER branch into trunk.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
b040e242 47#ifdef HAVE_ICONV
373658eb 48 #include <iconv.h>
b1d547eb 49 #include "wx/thread.h"
1cd52418 50#endif
1cd52418 51
373658eb
VZ
52#include "wx/encconv.h"
53#include "wx/fontmap.h"
54
5c4ed98d 55#ifdef __DARWIN__
c933e267 56#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
57#endif //def __DARWIN__
58
ef199164 59
9a83f860 60#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 61
467e0479
VZ
62// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63// be 4 bytes
4948c2b6 64#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
65 #define WC_UTF16
66#endif
67
ef199164 68
373658eb
VZ
69// ============================================================================
70// implementation
71// ============================================================================
72
69373110
VZ
// helper function of wxMBConv::ToWChar(): returns true if at least one of the
// n bytes starting at p is not NUL and false if all of them are NUL (which is
// trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; ++i )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
81
373658eb 82// ----------------------------------------------------------------------------
467e0479 83// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 84// ----------------------------------------------------------------------------
6001e347 85
c91830cb 86static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 87{
ef199164 88 if (input <= 0xffff)
4def3b35 89 {
999836aa
VZ
90 if (output)
91 *output = (wxUint16) input;
ef199164 92
4def3b35 93 return 1;
dccce9ea 94 }
ef199164 95 else if (input >= 0x110000)
4def3b35 96 {
467e0479 97 return wxCONV_FAILED;
dccce9ea
VZ
98 }
99 else
4def3b35 100 {
dccce9ea 101 if (output)
4def3b35 102 {
ef199164
DS
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 105 }
ef199164 106
4def3b35 107 return 2;
1cd52418 108 }
1cd52418
OK
109}
110
c91830cb 111static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 112{
ef199164 113 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
114 {
115 output = *input;
116 return 1;
dccce9ea 117 }
ef199164 118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
119 {
120 output = *input;
467e0479 121 return wxCONV_FAILED;
dccce9ea
VZ
122 }
123 else
4def3b35
VS
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
1cd52418
OK
128}
129
467e0479 130#ifdef WC_UTF16
35d11700
VZ
131 typedef wchar_t wxDecodeSurrogate_t;
132#else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
135
136// returns the next UTF-32 character from the wchar_t buffer and advances the
137// pointer to the character after this one
138//
139// if an invalid character is found, *pSrc is set to NULL, the caller must
140// check for this
35d11700 141static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
142{
143 wxUint32 out;
8d3dd069 144 const size_t
5c33522f 145 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152}
153
f6bcfd97 154// ----------------------------------------------------------------------------
6001e347 155// wxMBConv
f6bcfd97 156// ----------------------------------------------------------------------------
2c53a80a 157
483b0434
VZ
158size_t
159wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
6001e347 161{
483b0434 162 // although new conversion classes are supposed to implement this function
36f93678 163 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
36f93678
VZ
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
36f93678
VZ
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complication come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
bbb0ff36 219 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
36f93678
VZ
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
483b0434 227 for ( ;; )
eec47cc6 228 {
c1464d9d 229 // try to convert the current chunk
483b0434 230 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
e4e3bbb4 233
483b0434 234 dstWritten += lenChunk;
f6a02087
VZ
235 if ( !srcEnd )
236 dstWritten++;
f5fb6871 237
f6a02087 238 if ( !lenChunk )
467e0479
VZ
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
483b0434
VZ
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
f6a02087
VZ
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
f6a02087
VZ
254 if ( !srcEnd )
255 dst++;
483b0434 256 }
c1464d9d 257
483b0434 258 if ( !srcEnd )
c1464d9d 259 {
467e0479 260 // we convert just one chunk in this case as this is the entire
bbb0ff36 261 // string anyhow (and we don't count the trailing NUL in this case)
c1464d9d
VZ
262 break;
263 }
eec47cc6 264
bbb0ff36
VZ
265 // advance the input pointer past the end of this chunk: notice that we
266 // will always stop before srcEnd because we know that the chunk is
267 // always properly NUL-terminated
483b0434 268 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
269 {
270 // notice that we must skip over multiple bytes here as we suppose
271 // that if NUL takes 2 or 4 bytes, then all the other characters do
272 // too and so if advanced by a single byte we might erroneously
273 // detect sequences of NUL bytes in the middle of the input
483b0434 274 src += nulLen;
c1464d9d 275 }
e4e3bbb4 276
bbb0ff36
VZ
277 // if the buffer ends before this NUL, we shouldn't count it in our
278 // output so skip the code below
279 if ( src == srcEnd )
280 break;
281
282 // do count this terminator as it's inside the buffer we convert
283 dstWritten++;
284 if ( dst )
285 dst++;
286
287 src += nulLen; // skip the terminator itself
c1464d9d 288
483b0434 289 if ( src >= srcEnd )
c1464d9d
VZ
290 break;
291 }
292
483b0434 293 return dstWritten;
e4e3bbb4
RN
294}
295
483b0434
VZ
296size_t
297wxMBConv::FromWChar(char *dst, size_t dstLen,
298 const wchar_t *src, size_t srcLen) const
e4e3bbb4 299{
483b0434
VZ
300 // the number of chars [which would be] written to dst [if it were not NULL]
301 size_t dstWritten = 0;
e4e3bbb4 302
f6a02087
VZ
303 // if we don't know its length we have no choice but to assume that it is
304 // NUL-terminated (notice that it can still be NUL-terminated even if
305 // explicit length is given but it doesn't change our return value)
306 const bool isNulTerminated = srcLen == wxNO_LEN;
307
eec47cc6
VZ
308 // make a copy of the input string unless it is already properly
309 // NUL-terminated
eec47cc6 310 wxWCharBuffer bufTmp;
f6a02087 311 if ( isNulTerminated )
e4e3bbb4 312 {
483b0434 313 srcLen = wxWcslen(src) + 1;
eec47cc6 314 }
483b0434 315 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
316 {
317 // make a copy in order to properly NUL-terminate the string
483b0434 318 bufTmp = wxWCharBuffer(srcLen);
ef199164 319 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
320 src = bufTmp;
321 }
322
323 const size_t lenNul = GetMBNulLen();
324 for ( const wchar_t * const srcEnd = src + srcLen;
325 src < srcEnd;
326 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
327 {
328 // try to convert the current chunk
329 size_t lenChunk = WC2MB(NULL, src, 0);
330
331 if ( lenChunk == wxCONV_FAILED )
332 return wxCONV_FAILED;
333
483b0434 334 dstWritten += lenChunk;
bbb0ff36 335 if ( src + lenChunk < srcEnd || isNulTerminated )
f6a02087 336 dstWritten += lenNul;
483b0434
VZ
337
338 if ( dst )
339 {
340 if ( dstWritten > dstLen )
341 return wxCONV_FAILED;
342
f6a02087 343 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
344 return wxCONV_FAILED;
345
346 dst += lenChunk;
bbb0ff36 347 if ( src + lenChunk < srcEnd || isNulTerminated )
f6a02087 348 dst += lenNul;
483b0434 349 }
eec47cc6 350 }
e4e3bbb4 351
483b0434
VZ
352 return dstWritten;
353}
354
ef199164 355size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 356{
51725fc0 357 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 358 if ( rc != wxCONV_FAILED )
509da451
VZ
359 {
360 // ToWChar() returns the buffer length, i.e. including the trailing
361 // NUL, while this method doesn't take it into account
362 rc--;
363 }
364
365 return rc;
366}
367
ef199164 368size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 369{
51725fc0 370 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 371 if ( rc != wxCONV_FAILED )
509da451 372 {
51725fc0 373 rc -= GetMBNulLen();
509da451
VZ
374 }
375
376 return rc;
377}
378
483b0434
VZ
// The destructor is intentionally empty but is defined here, out of line.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
e4e3bbb4 383
483b0434
VZ
384const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
385{
386 if ( psz )
eec47cc6 387 {
483b0434 388 // calculate the length of the buffer needed first
a2db25a1 389 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 390 if ( nLen != wxCONV_FAILED )
f5fb6871 391 {
483b0434 392 // now do the actual conversion
a2db25a1 393 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 394
483b0434 395 // +1 for the trailing NULL
a2db25a1 396 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 397 return buf;
f5fb6871 398 }
483b0434 399 }
e4e3bbb4 400
483b0434
VZ
401 return wxWCharBuffer();
402}
3698ae71 403
483b0434
VZ
404const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
405{
406 if ( pwz )
407 {
a2db25a1 408 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 409 if ( nLen != wxCONV_FAILED )
483b0434 410 {
a2db25a1
VZ
411 wxCharBuffer buf(nLen - 1);
412 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
413 return buf;
414 }
415 }
416
417 return wxCharBuffer();
418}
e4e3bbb4 419
483b0434 420const wxWCharBuffer
ef199164 421wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 422{
ef199164 423 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 424 if ( dstLen != wxCONV_FAILED )
483b0434 425 {
0dd13d21
VZ
426 // notice that we allocate space for dstLen+1 wide characters here
427 // because we want the buffer to always be NUL-terminated, even if the
428 // input isn't (as otherwise the caller has no way to know its length)
429 wxWCharBuffer wbuf(dstLen);
f6a02087 430 wbuf.data()[dstLen] = L'\0';
ef199164 431 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
432 {
433 if ( outLen )
467e0479
VZ
434 {
435 *outLen = dstLen;
f6a02087
VZ
436
437 // we also need to handle NUL-terminated input strings
438 // specially: for them the output is the length of the string
439 // excluding the trailing NUL, however if we're asked to
440 // convert a specific number of characters we return the length
441 // of the resulting output even if it's NUL-terminated
442 if ( inLen == wxNO_LEN )
467e0479
VZ
443 (*outLen)--;
444 }
445
483b0434
VZ
446 return wbuf;
447 }
448 }
449
450 if ( outLen )
451 *outLen = 0;
452
453 return wxWCharBuffer();
454}
455
456const wxCharBuffer
ef199164 457wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 458{
13d92ad6 459 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 460 if ( dstLen != wxCONV_FAILED )
483b0434 461 {
0dd13d21
VZ
462 const size_t nulLen = GetMBNulLen();
463
464 // as above, ensure that the buffer is always NUL-terminated, even if
465 // the input is not
466 wxCharBuffer buf(dstLen + nulLen - 1);
467 memset(buf.data() + dstLen, 0, nulLen);
ef199164 468 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
469 {
470 if ( outLen )
467e0479
VZ
471 {
472 *outLen = dstLen;
473
f6a02087 474 if ( inLen == wxNO_LEN )
467e0479 475 {
f6a02087
VZ
476 // in this case both input and output are NUL-terminated
477 // and we're not supposed to count NUL
13d92ad6 478 *outLen -= nulLen;
467e0479
VZ
479 }
480 }
d32a507d 481
483b0434
VZ
482 return buf;
483 }
e4e3bbb4
RN
484 }
485
eec47cc6
VZ
486 if ( outLen )
487 *outLen = 0;
488
489 return wxCharBuffer();
e4e3bbb4
RN
490}
491
40ac5040
VZ
492const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
493{
494 const size_t srcLen = buf.length();
495 if ( srcLen )
496 {
497 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
498 if ( dstLen != wxCONV_FAILED )
499 {
500 wxWCharBuffer wbuf(dstLen);
501 wbuf.data()[dstLen] = L'\0';
502 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
503 return wbuf;
504 }
505 }
506
507 return wxWCharBuffer();
508}
509
510const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
511{
512 const size_t srcLen = wbuf.length();
513 if ( srcLen )
514 {
515 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
516 if ( dstLen != wxCONV_FAILED )
517 {
518 wxCharBuffer buf(dstLen);
519 buf.data()[dstLen] = '\0';
520 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
521 return buf;
522 }
523 }
524
525 return wxCharBuffer();
526}
527
6001e347 528// ----------------------------------------------------------------------------
bde4baac 529// wxMBConvLibc
6001e347
RR
530// ----------------------------------------------------------------------------
531
bde4baac
VZ
532size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
533{
534 return wxMB2WC(buf, psz, n);
535}
536
537size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
538{
539 return wxWC2MB(buf, psz, n);
540}
e1bfe89e
RR
541
542// ----------------------------------------------------------------------------
532d575b 543// wxConvBrokenFileNames
e1bfe89e
RR
544// ----------------------------------------------------------------------------
545
eec47cc6
VZ
546#ifdef __UNIX__
547
86501081 548wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 549{
9a83f860
VZ
550 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
551 wxStricmp(charset, wxT("UTF8")) == 0 )
5deedd6e 552 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
553 else
554 m_conv = new wxCSConv(charset);
ea8ce907
RR
555}
556
eec47cc6 557#endif // __UNIX__
c12b7f79 558
bde4baac 559// ----------------------------------------------------------------------------
3698ae71 560// UTF-7
bde4baac 561// ----------------------------------------------------------------------------
6001e347 562
15f2ee32 563// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
564//
565// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 566
15f2ee32
RN
//
// BASE64 decoding table: indexed by the byte value, gives the 6 bit value
// encoded by this byte or 0xff if it is not a base64 character
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0x8f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x90..0x9f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xa0..0xaf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xb0..0xbf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xc0..0xcf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xd0..0xdf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xe0..0xef
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xf0..0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
605
9d653e81
VZ
606size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
607 const char *src, size_t srcLen) const
15f2ee32 608{
9d653e81 609 DecoderState stateOrig,
852dcba5 610 *statePtr;
9d653e81
VZ
611 if ( srcLen == wxNO_LEN )
612 {
613 // convert the entire string, up to and including the trailing NUL
614 srcLen = strlen(src) + 1;
615
616 // when working on the entire strings we don't update nor use the shift
617 // state from the previous call
618 statePtr = &stateOrig;
619 }
620 else // when working with partial strings we do use the shift state
621 {
5c33522f 622 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
623
624 // also save the old state to be able to rollback to it on error
625 stateOrig = m_stateDecoder;
626 }
627
628 // but to simplify the code below we use this variable in both cases
629 DecoderState& state = *statePtr;
630
631
632 // number of characters [which would have been] written to dst [if it were
633 // not NULL]
15f2ee32
RN
634 size_t len = 0;
635
9d653e81
VZ
636 const char * const srcEnd = src + srcLen;
637
638 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 639 {
9d653e81
VZ
640 const unsigned char cc = *src++;
641
642 if ( state.IsShifted() )
15f2ee32 643 {
9d653e81
VZ
644 const unsigned char dc = utf7unb64[cc];
645 if ( dc == 0xff )
15f2ee32 646 {
ccaa848d
VZ
647 // end of encoded part, check that nothing was left: there can
648 // be up to 4 bits of 0 padding but nothing else (we also need
649 // to check isLSB as we count bits modulo 8 while a valid UTF-7
650 // encoded sequence must contain an integral number of UTF-16
651 // characters)
652 if ( state.isLSB || state.bit > 4 ||
653 (state.accum & ((1 << state.bit) - 1)) )
654 {
655 if ( !len )
656 state = stateOrig;
657
852dcba5 658 return wxCONV_FAILED;
ccaa848d 659 }
852dcba5 660
9d653e81
VZ
661 state.ToDirect();
662
663 // re-parse this character normally below unless it's '-' which
664 // is consumed by the decoder
665 if ( cc == '-' )
666 continue;
667 }
668 else // valid encoded character
669 {
670 // mini base64 decoder: each character is 6 bits
671 state.bit += 6;
672 state.accum <<= 6;
673 state.accum += dc;
674
675 if ( state.bit >= 8 )
15f2ee32 676 {
9d653e81
VZ
677 // got the full byte, consume it
678 state.bit -= 8;
679 unsigned char b = (state.accum >> state.bit) & 0x00ff;
680
681 if ( state.isLSB )
15f2ee32 682 {
9d653e81
VZ
683 // we've got the full word, output it
684 if ( dst )
685 *dst++ = (state.msb << 8) | b;
686 len++;
687 state.isLSB = false;
15f2ee32 688 }
9d653e81 689 else // MSB
04a37834 690 {
9d653e81
VZ
691 // just store it while we wait for LSB
692 state.msb = b;
693 state.isLSB = true;
04a37834 694 }
15f2ee32
RN
695 }
696 }
9d653e81 697 }
04a37834 698
9d653e81
VZ
699 if ( state.IsDirect() )
700 {
701 // start of an encoded segment?
702 if ( cc == '+' )
04a37834 703 {
9d653e81
VZ
704 if ( *src == '-' )
705 {
706 // just the encoded plus sign, don't switch to shifted mode
707 if ( dst )
708 *dst++ = '+';
709 len++;
710 src++;
711 }
ccaa848d
VZ
712 else if ( utf7unb64[(unsigned)*src] == 0xff )
713 {
714 // empty encoded chunks are not allowed
715 if ( !len )
716 state = stateOrig;
717
718 return wxCONV_FAILED;
719 }
720 else // base-64 encoded chunk follows
9d653e81
VZ
721 {
722 state.ToShifted();
723 }
724 }
725 else // not '+'
726 {
727 // only printable 7 bit ASCII characters (with the exception of
728 // NUL, TAB, CR and LF) can be used directly
729 if ( cc >= 0x7f || (cc < ' ' &&
730 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
731 return wxCONV_FAILED;
732
733 if ( dst )
734 *dst++ = cc;
735 len++;
736 }
15f2ee32
RN
737 }
738 }
04a37834 739
9d653e81
VZ
740 if ( !len )
741 {
742 // as we didn't read any characters we should be called with the same
743 // data (followed by some more new data) again later so don't save our
744 // state
745 state = stateOrig;
746
747 return wxCONV_FAILED;
748 }
04a37834 749
15f2ee32 750 return len;
6001e347
RR
751}
752
15f2ee32
RN
//
// BASE64 encoding table: gives the character used to encode each of the 64
// possible 6 bit values
//
static const unsigned char utf7enb64[] =
{
    // 0..15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16..31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32..47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48..63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
767
//
// UTF-7 encoding table, indexed by the 7 bit character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 70..7F
};

// returns true if the character can be output as is in UTF-7, i.e. if it is a
// 7 bit character marked as belonging to the set D in the table above
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t can be a signed type and a negative value would pass a simple
    // "wc < 0x80" check and then be used as a negative index into the table,
    // so do the comparison using an unsigned type
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
792
793size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
794 const wchar_t *src, size_t srcLen) const
15f2ee32 795{
9d653e81
VZ
796 EncoderState stateOrig,
797 *statePtr;
798 if ( srcLen == wxNO_LEN )
799 {
800 // we don't apply the stored state when operating on entire strings at
801 // once
802 statePtr = &stateOrig;
803
804 srcLen = wxWcslen(src) + 1;
805 }
806 else // do use the mode we left the output in previously
807 {
808 stateOrig = m_stateEncoder;
5c33522f 809 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
810 }
811
812 EncoderState& state = *statePtr;
813
814
15f2ee32
RN
815 size_t len = 0;
816
9d653e81
VZ
817 const wchar_t * const srcEnd = src + srcLen;
818 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 819 {
9d653e81
VZ
820 wchar_t cc = *src++;
821 if ( wxIsUTF7Direct(cc) )
15f2ee32 822 {
9d653e81
VZ
823 if ( state.IsShifted() )
824 {
825 // pad with zeros the last encoded block if necessary
826 if ( state.bit )
827 {
828 if ( dst )
829 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
830 len++;
831 }
ef199164 832
9d653e81
VZ
833 state.ToDirect();
834
835 if ( dst )
836 *dst++ = '-';
837 len++;
838 }
839
840 if ( dst )
841 *dst++ = (char)cc;
15f2ee32
RN
842 len++;
843 }
9d653e81
VZ
844 else if ( cc == '+' && state.IsDirect() )
845 {
846 if ( dst )
847 {
848 *dst++ = '+';
849 *dst++ = '-';
850 }
851
852 len += 2;
853 }
15f2ee32 854#ifndef WC_UTF16
79c78d42 855 else if (((wxUint32)cc) > 0xffff)
b2c13097 856 {
15f2ee32 857 // no surrogate pair generation (yet?)
467e0479 858 return wxCONV_FAILED;
15f2ee32
RN
859 }
860#endif
861 else
862 {
9d653e81
VZ
863 if ( state.IsDirect() )
864 {
865 state.ToShifted();
ef199164 866
9d653e81
VZ
867 if ( dst )
868 *dst++ = '+';
869 len++;
870 }
871
872 // BASE64 encode string
873 for ( ;; )
15f2ee32 874 {
9d653e81 875 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 876 {
9d653e81
VZ
877 state.accum <<= 8;
878 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
879
880 for (state.bit += 8; state.bit >= 6; )
15f2ee32 881 {
9d653e81
VZ
882 state.bit -= 6;
883 if ( dst )
884 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
885 len++;
15f2ee32 886 }
15f2ee32 887 }
ef199164 888
9d653e81
VZ
889 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
890 break;
ef199164 891
9d653e81 892 src++;
15f2ee32 893 }
15f2ee32
RN
894 }
895 }
ef199164 896
9d653e81
VZ
897 // we need to restore the original encoder state if we were called just to
898 // calculate the amount of space needed as we will presumably be called
899 // again to really convert the data now
900 if ( !dst )
901 state = stateOrig;
ef199164 902
15f2ee32 903 return len;
6001e347
RR
904}
905
f6bcfd97 906// ----------------------------------------------------------------------------
6001e347 907// UTF-8
f6bcfd97 908// ----------------------------------------------------------------------------
6001e347 909
// NOTE(review): these look like the maximal values which can be encoded using
// 1, 2, ... 6 bytes in (original, unrestricted) UTF-8, followed by the maximal
// 32 bit value -- confirm against the code using this table
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap characters
// which are invalid in a UTF-8 encoded string: it starts at wxUnicodePUA and
// contains 256 values, wxUnicodePUAEnd being one past the last of them
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
917
// length of the UTF-8 sequence starting with the given byte, indexed by the
// value of this byte; 0 means that the byte can't start a sequence
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters, encoded as a single byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid too, the rest starts two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: two-byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: F0..F4 start four-byte sequences, F5..FF are invalid again (5-
    // or 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
952
953size_t
954wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
955 const char *src, size_t srcLen) const
956{
957 wchar_t *out = dstLen ? dst : NULL;
958 size_t written = 0;
959
960 if ( srcLen == wxNO_LEN )
961 srcLen = strlen(src) + 1;
962
963 for ( const char *p = src; ; p++ )
964 {
965 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
966 {
967 // all done successfully, just add the trailing NULL if we are not
968 // using explicit length
969 if ( srcLen == wxNO_LEN )
970 {
971 if ( out )
972 {
973 if ( !dstLen )
974 break;
975
976 *out = L'\0';
977 }
978
979 written++;
980 }
981
982 return written;
983 }
984
0286d08d
VZ
985 if ( out && !dstLen-- )
986 break;
987
5367a38a
VS
988 wxUint32 code;
989 unsigned char c = *p;
0286d08d 990
5367a38a
VS
991 if ( c < 0x80 )
992 {
993 if ( srcLen == 0 ) // the test works for wxNO_LEN too
994 break;
0286d08d 995
5367a38a
VS
996 if ( srcLen != wxNO_LEN )
997 srcLen--;
0286d08d 998
5367a38a
VS
999 code = c;
1000 }
1001 else
0286d08d 1002 {
5367a38a
VS
1003 unsigned len = tableUtf8Lengths[c];
1004 if ( !len )
1005 break;
1006
1007 if ( srcLen < len ) // the test works for wxNO_LEN too
1008 break;
1009
1010 if ( srcLen != wxNO_LEN )
1011 srcLen -= len;
1012
1013 // Char. number range | UTF-8 octet sequence
1014 // (hexadecimal) | (binary)
1015 // ----------------------+----------------------------------------
1016 // 0000 0000 - 0000 007F | 0xxxxxxx
1017 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1018 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1019 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1020 //
1021 // Code point value is stored in bits marked with 'x',
1022 // lowest-order bit of the value on the right side in the diagram
1023 // above. (from RFC 3629)
1024
1025 // mask to extract lead byte's value ('x' bits above), by sequence
1026 // length:
1027 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1028
1029 // mask and value of lead byte's most significant bits, by length:
1030 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1031 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1032
1033 len--; // it's more convenient to work with 0-based length here
1034
1035 // extract the lead byte's value bits:
1036 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1037 break;
1038
1039 code = c & leadValueMask[len];
1040
1041 // all remaining bytes, if any, are handled in the same way
1042 // regardless of sequence's length:
1043 for ( ; len; --len )
1044 {
1045 c = *++p;
1046 if ( (c & 0xC0) != 0x80 )
1047 return wxCONV_FAILED;
0286d08d 1048
5367a38a
VS
1049 code <<= 6;
1050 code |= c & 0x3F;
1051 }
0286d08d
VZ
1052 }
1053
1054#ifdef WC_UTF16
1055 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1056 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1057 {
1058 if ( out )
1059 out++;
1060 written++;
1061 }
1062#else // !WC_UTF16
1063 if ( out )
1064 *out = code;
1065#endif // WC_UTF16/!WC_UTF16
1066
1067 if ( out )
1068 out++;
1069
1070 written++;
1071 }
1072
1073 return wxCONV_FAILED;
1074}
1075
1076size_t
1077wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1078 const wchar_t *src, size_t srcLen) const
1079{
1080 char *out = dstLen ? dst : NULL;
1081 size_t written = 0;
1082
1083 for ( const wchar_t *wp = src; ; wp++ )
1084 {
a964d3ed 1085 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
0286d08d
VZ
1086 {
1087 // all done successfully, just add the trailing NULL if we are not
1088 // using explicit length
1089 if ( srcLen == wxNO_LEN )
1090 {
1091 if ( out )
1092 {
1093 if ( !dstLen )
1094 break;
1095
1096 *out = '\0';
1097 }
1098
1099 written++;
1100 }
1101
1102 return written;
1103 }
1104
a964d3ed
VZ
1105 if ( srcLen != wxNO_LEN )
1106 srcLen--;
0286d08d
VZ
1107
1108 wxUint32 code;
1109#ifdef WC_UTF16
1110 // cast is ok for WC_UTF16
1111 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1112 {
1113 // skip the next char too as we decoded a surrogate
1114 wp++;
1115 }
1116#else // wchar_t is UTF-32
1117 code = *wp & 0x7fffffff;
1118#endif
1119
1120 unsigned len;
1121 if ( code <= 0x7F )
1122 {
1123 len = 1;
1124 if ( out )
1125 {
1126 if ( dstLen < len )
1127 break;
1128
1129 out[0] = (char)code;
1130 }
1131 }
1132 else if ( code <= 0x07FF )
1133 {
1134 len = 2;
1135 if ( out )
1136 {
1137 if ( dstLen < len )
1138 break;
1139
1140 // NB: this line takes 6 least significant bits, encodes them as
1141 // 10xxxxxx and discards them so that the next byte can be encoded:
1142 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1143 out[0] = 0xC0 | code;
1144 }
1145 }
1146 else if ( code < 0xFFFF )
1147 {
1148 len = 3;
1149 if ( out )
1150 {
1151 if ( dstLen < len )
1152 break;
1153
1154 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1155 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1156 out[0] = 0xE0 | code;
1157 }
1158 }
1159 else if ( code <= 0x10FFFF )
1160 {
1161 len = 4;
1162 if ( out )
1163 {
1164 if ( dstLen < len )
1165 break;
1166
1167 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1168 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1169 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1170 out[0] = 0xF0 | code;
1171 }
1172 }
1173 else
1174 {
9a83f860 1175 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1176 break;
1177 }
1178
1179 if ( out )
1180 {
1181 out += len;
1182 dstLen -= len;
1183 }
1184
1185 written += len;
1186 }
1187
1188 // we only get here if an error occurs during decoding
1189 return wxCONV_FAILED;
1190}
1191
d16d0917
VZ
1192size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1193 const char *psz, size_t srcLen) const
6001e347 1194{
0286d08d 1195 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1196 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1197
4def3b35
VS
1198 size_t len = 0;
1199
d16d0917 1200 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1201 {
ea8ce907
RR
1202 const char *opsz = psz;
1203 bool invalid = false;
4def3b35
VS
1204 unsigned char cc = *psz++, fc = cc;
1205 unsigned cnt;
dccce9ea 1206 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1207 fc <<= 1;
ef199164 1208
dccce9ea 1209 if (!cnt)
4def3b35
VS
1210 {
1211 // plain ASCII char
dccce9ea 1212 if (buf)
4def3b35
VS
1213 *buf++ = cc;
1214 len++;
561488ef
MW
1215
1216 // escape the escape character for octal escapes
1217 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1218 && cc == '\\' && (!buf || len < n))
1219 {
1220 if (buf)
1221 *buf++ = cc;
1222 len++;
1223 }
dccce9ea
VZ
1224 }
1225 else
4def3b35
VS
1226 {
1227 cnt--;
dccce9ea 1228 if (!cnt)
4def3b35
VS
1229 {
1230 // invalid UTF-8 sequence
ea8ce907 1231 invalid = true;
dccce9ea
VZ
1232 }
1233 else
4def3b35
VS
1234 {
1235 unsigned ocnt = cnt - 1;
1236 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1237 while (cnt--)
4def3b35 1238 {
ea8ce907 1239 cc = *psz;
dccce9ea 1240 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1241 {
1242 // invalid UTF-8 sequence
ea8ce907
RR
1243 invalid = true;
1244 break;
4def3b35 1245 }
ef199164 1246
ea8ce907 1247 psz++;
4def3b35
VS
1248 res = (res << 6) | (cc & 0x3f);
1249 }
ef199164 1250
ea8ce907 1251 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1252 {
1253 // illegal UTF-8 encoding
ea8ce907 1254 invalid = true;
4def3b35 1255 }
ea8ce907
RR
1256 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1257 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1258 {
1259 // if one of our PUA characters turns up externally
1260 // it must also be treated as an illegal sequence
1261 // (a bit like you have to escape an escape character)
1262 invalid = true;
1263 }
1264 else
1265 {
1cd52418 1266#ifdef WC_UTF16
0286d08d 1267 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1268 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1269 if (pa == wxCONV_FAILED)
ea8ce907
RR
1270 {
1271 invalid = true;
1272 }
1273 else
1274 {
1275 if (buf)
1276 buf += pa;
1277 len += pa;
1278 }
373658eb 1279#else // !WC_UTF16
ea8ce907 1280 if (buf)
38d4b1e4 1281 *buf++ = (wchar_t)res;
ea8ce907 1282 len++;
373658eb 1283#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1284 }
1285 }
ef199164 1286
ea8ce907
RR
1287 if (invalid)
1288 {
1289 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1290 {
1291 while (opsz < psz && (!buf || len < n))
1292 {
1293#ifdef WC_UTF16
1294 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1295 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1296 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1297 if (buf)
1298 buf += pa;
1299 opsz++;
1300 len += pa;
1301#else
1302 if (buf)
38d4b1e4 1303 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1304 opsz++;
1305 len++;
1306#endif
1307 }
1308 }
3698ae71 1309 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1310 {
1311 while (opsz < psz && (!buf || len < n))
1312 {
3698ae71
VZ
1313 if ( buf && len + 3 < n )
1314 {
17a1ebd1 1315 unsigned char on = *opsz;
3698ae71 1316 *buf++ = L'\\';
17a1ebd1
VZ
1317 *buf++ = (wchar_t)( L'0' + on / 0100 );
1318 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1319 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1320 }
ef199164 1321
ea8ce907
RR
1322 opsz++;
1323 len += 4;
1324 }
1325 }
3698ae71 1326 else // MAP_INVALID_UTF8_NOT
ea8ce907 1327 {
467e0479 1328 return wxCONV_FAILED;
ea8ce907 1329 }
4def3b35
VS
1330 }
1331 }
6001e347 1332 }
ef199164 1333
d16d0917 1334 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1335 *buf = 0;
ef199164 1336
d16d0917 1337 return len + 1;
6001e347
RR
1338}
1339
3698ae71
VZ
// Return true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1344
d16d0917
VZ
1345size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1346 const wchar_t *psz, size_t srcLen) const
6001e347 1347{
0286d08d 1348 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1349 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1350
4def3b35 1351 size_t len = 0;
6001e347 1352
d16d0917 1353 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1354 {
1355 wxUint32 cc;
ef199164 1356
1cd52418 1357#ifdef WC_UTF16
b5153fd8
VZ
1358 // cast is ok for WC_UTF16
1359 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1360 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1361#else
ef199164 1362 cc = (*psz++) & 0x7fffffff;
4def3b35 1363#endif
3698ae71
VZ
1364
1365 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1366 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1367 {
dccce9ea 1368 if (buf)
ea8ce907 1369 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1370 len++;
3698ae71 1371 }
561488ef
MW
1372 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1373 && cc == L'\\' && psz[0] == L'\\' )
1374 {
1375 if (buf)
1376 *buf++ = (char)cc;
1377 psz++;
1378 len++;
1379 }
3698ae71
VZ
1380 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1381 cc == L'\\' &&
1382 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1383 {
dccce9ea 1384 if (buf)
3698ae71 1385 {
ef199164
DS
1386 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1387 (psz[1] - L'0') * 010 +
b2c13097 1388 (psz[2] - L'0'));
3698ae71
VZ
1389 }
1390
1391 psz += 3;
ea8ce907
RR
1392 len++;
1393 }
1394 else
1395 {
1396 unsigned cnt;
ef199164
DS
1397 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1398 {
1399 }
1400
ea8ce907 1401 if (!cnt)
4def3b35 1402 {
ea8ce907
RR
1403 // plain ASCII char
1404 if (buf)
1405 *buf++ = (char) cc;
1406 len++;
1407 }
ea8ce907
RR
1408 else
1409 {
1410 len += cnt + 1;
1411 if (buf)
1412 {
1413 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1414 while (cnt--)
1415 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1416 }
4def3b35
VS
1417 }
1418 }
6001e347 1419 }
4def3b35 1420
d16d0917 1421 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1422 *buf = 0;
adb45366 1423
d16d0917 1424 return len + 1;
6001e347
RR
1425}
1426
467e0479 1427// ============================================================================
c91830cb 1428// UTF-16
467e0479 1429// ============================================================================
c91830cb
VZ
1430
// wxMBConvUTF16straight names the UTF-16 converter selected for this build's
// byte order and wxMBConvUTF16swap the other one
#if defined(WORDS_BIGENDIAN)
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else // !WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#endif // WORDS_BIGENDIAN/!WORDS_BIGENDIAN
1438
467e0479
VZ
1439/* static */
1440size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1441{
1442 if ( srcLen == wxNO_LEN )
1443 {
1444 // count the number of bytes in input, including the trailing NULs
5c33522f 1445 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1446 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1447 ;
c91830cb 1448
467e0479
VZ
1449 srcLen *= BYTES_PER_CHAR;
1450 }
1451 else // we already have the length
1452 {
1453 // we can only convert an entire number of UTF-16 characters
1454 if ( srcLen % BYTES_PER_CHAR )
1455 return wxCONV_FAILED;
1456 }
1457
1458 return srcLen;
1459}
1460
1461// case when in-memory representation is UTF-16 too
c91830cb
VZ
1462#ifdef WC_UTF16
1463
467e0479
VZ
1464// ----------------------------------------------------------------------------
1465// conversions without endianness change
1466// ----------------------------------------------------------------------------
1467
1468size_t
1469wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1470 const char *src, size_t srcLen) const
c91830cb 1471{
467e0479
VZ
1472 // set up the scene for using memcpy() (which is presumably more efficient
1473 // than copying the bytes one by one)
1474 srcLen = GetLength(src, srcLen);
1475 if ( srcLen == wxNO_LEN )
1476 return wxCONV_FAILED;
c91830cb 1477
ef199164 1478 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1479 if ( dst )
c91830cb 1480 {
467e0479
VZ
1481 if ( dstLen < inLen )
1482 return wxCONV_FAILED;
c91830cb 1483
467e0479 1484 memcpy(dst, src, srcLen);
c91830cb 1485 }
d32a507d 1486
467e0479 1487 return inLen;
c91830cb
VZ
1488}
1489
467e0479
VZ
1490size_t
1491wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1492 const wchar_t *src, size_t srcLen) const
c91830cb 1493{
467e0479
VZ
1494 if ( srcLen == wxNO_LEN )
1495 srcLen = wxWcslen(src) + 1;
c91830cb 1496
467e0479
VZ
1497 srcLen *= BYTES_PER_CHAR;
1498
1499 if ( dst )
c91830cb 1500 {
467e0479
VZ
1501 if ( dstLen < srcLen )
1502 return wxCONV_FAILED;
d32a507d 1503
467e0479 1504 memcpy(dst, src, srcLen);
c91830cb 1505 }
d32a507d 1506
467e0479 1507 return srcLen;
c91830cb
VZ
1508}
1509
467e0479
VZ
1510// ----------------------------------------------------------------------------
1511// endian-reversing conversions
1512// ----------------------------------------------------------------------------
c91830cb 1513
467e0479
VZ
1514size_t
1515wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1516 const char *src, size_t srcLen) const
c91830cb 1517{
467e0479
VZ
1518 srcLen = GetLength(src, srcLen);
1519 if ( srcLen == wxNO_LEN )
1520 return wxCONV_FAILED;
c91830cb 1521
467e0479
VZ
1522 srcLen /= BYTES_PER_CHAR;
1523
1524 if ( dst )
c91830cb 1525 {
467e0479
VZ
1526 if ( dstLen < srcLen )
1527 return wxCONV_FAILED;
1528
5c33522f 1529 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1530 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1531 {
ef199164 1532 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1533 }
c91830cb 1534 }
bfab25d4 1535
467e0479 1536 return srcLen;
c91830cb
VZ
1537}
1538
467e0479
VZ
1539size_t
1540wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1541 const wchar_t *src, size_t srcLen) const
c91830cb 1542{
467e0479
VZ
1543 if ( srcLen == wxNO_LEN )
1544 srcLen = wxWcslen(src) + 1;
c91830cb 1545
467e0479
VZ
1546 srcLen *= BYTES_PER_CHAR;
1547
1548 if ( dst )
c91830cb 1549 {
467e0479
VZ
1550 if ( dstLen < srcLen )
1551 return wxCONV_FAILED;
1552
5c33522f 1553 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1554 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1555 {
ef199164 1556 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1557 }
c91830cb 1558 }
eec47cc6 1559
467e0479 1560 return srcLen;
c91830cb
VZ
1561}
1562
467e0479 1563#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1564
467e0479
VZ
1565// ----------------------------------------------------------------------------
1566// conversions without endianness change
1567// ----------------------------------------------------------------------------
c91830cb 1568
35d11700
VZ
1569size_t
1570wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1571 const char *src, size_t srcLen) const
c91830cb 1572{
35d11700
VZ
1573 srcLen = GetLength(src, srcLen);
1574 if ( srcLen == wxNO_LEN )
1575 return wxCONV_FAILED;
c91830cb 1576
ef199164 1577 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1578 if ( !dst )
c91830cb 1579 {
35d11700
VZ
1580 // optimization: return maximal space which could be needed for this
1581 // string even if the real size could be smaller if the buffer contains
1582 // any surrogates
1583 return inLen;
c91830cb 1584 }
c91830cb 1585
35d11700 1586 size_t outLen = 0;
5c33522f 1587 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1588 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1589 {
ef199164
DS
1590 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1591 if ( !inBuff )
35d11700
VZ
1592 return wxCONV_FAILED;
1593
1594 if ( ++outLen > dstLen )
1595 return wxCONV_FAILED;
c91830cb 1596
35d11700
VZ
1597 *dst++ = ch;
1598 }
1599
1600
1601 return outLen;
1602}
c91830cb 1603
35d11700
VZ
1604size_t
1605wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1606 const wchar_t *src, size_t srcLen) const
c91830cb 1607{
35d11700
VZ
1608 if ( srcLen == wxNO_LEN )
1609 srcLen = wxWcslen(src) + 1;
c91830cb 1610
35d11700 1611 size_t outLen = 0;
5c33522f 1612 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1613 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1614 {
1615 wxUint16 cc[2];
35d11700
VZ
1616 const size_t numChars = encode_utf16(*src++, cc);
1617 if ( numChars == wxCONV_FAILED )
1618 return wxCONV_FAILED;
c91830cb 1619
ef199164
DS
1620 outLen += numChars * BYTES_PER_CHAR;
1621 if ( outBuff )
c91830cb 1622 {
35d11700
VZ
1623 if ( outLen > dstLen )
1624 return wxCONV_FAILED;
1625
ef199164 1626 *outBuff++ = cc[0];
35d11700 1627 if ( numChars == 2 )
69b80d28 1628 {
35d11700 1629 // second character of a surrogate
ef199164 1630 *outBuff++ = cc[1];
69b80d28 1631 }
c91830cb 1632 }
c91830cb 1633 }
c91830cb 1634
35d11700 1635 return outLen;
c91830cb
VZ
1636}
1637
467e0479
VZ
1638// ----------------------------------------------------------------------------
1639// endian-reversing conversions
1640// ----------------------------------------------------------------------------
c91830cb 1641
35d11700
VZ
1642size_t
1643wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1644 const char *src, size_t srcLen) const
c91830cb 1645{
35d11700
VZ
1646 srcLen = GetLength(src, srcLen);
1647 if ( srcLen == wxNO_LEN )
1648 return wxCONV_FAILED;
1649
ef199164 1650 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1651 if ( !dst )
1652 {
1653 // optimization: return maximal space which could be needed for this
1654 // string even if the real size could be smaller if the buffer contains
1655 // any surrogates
1656 return inLen;
1657 }
c91830cb 1658
35d11700 1659 size_t outLen = 0;
5c33522f 1660 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1661 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1662 {
35d11700
VZ
1663 wxUint32 ch;
1664 wxUint16 tmp[2];
ef199164
DS
1665
1666 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1667 inBuff++;
1668 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1669
35d11700
VZ
1670 const size_t numChars = decode_utf16(tmp, ch);
1671 if ( numChars == wxCONV_FAILED )
1672 return wxCONV_FAILED;
c91830cb 1673
35d11700 1674 if ( numChars == 2 )
ef199164 1675 inBuff++;
35d11700
VZ
1676
1677 if ( ++outLen > dstLen )
1678 return wxCONV_FAILED;
c91830cb 1679
35d11700 1680 *dst++ = ch;
c91830cb 1681 }
c91830cb 1682
c91830cb 1683
35d11700
VZ
1684 return outLen;
1685}
c91830cb 1686
35d11700
VZ
1687size_t
1688wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1689 const wchar_t *src, size_t srcLen) const
c91830cb 1690{
35d11700
VZ
1691 if ( srcLen == wxNO_LEN )
1692 srcLen = wxWcslen(src) + 1;
c91830cb 1693
35d11700 1694 size_t outLen = 0;
5c33522f 1695 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1696 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1697 {
1698 wxUint16 cc[2];
35d11700
VZ
1699 const size_t numChars = encode_utf16(*src, cc);
1700 if ( numChars == wxCONV_FAILED )
1701 return wxCONV_FAILED;
c91830cb 1702
ef199164
DS
1703 outLen += numChars * BYTES_PER_CHAR;
1704 if ( outBuff )
c91830cb 1705 {
35d11700
VZ
1706 if ( outLen > dstLen )
1707 return wxCONV_FAILED;
1708
ef199164 1709 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1710 if ( numChars == 2 )
c91830cb 1711 {
35d11700 1712 // second character of a surrogate
ef199164 1713 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1714 }
1715 }
c91830cb 1716 }
c91830cb 1717
35d11700 1718 return outLen;
c91830cb
VZ
1719}
1720
467e0479 1721#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1722
1723
35d11700 1724// ============================================================================
c91830cb 1725// UTF-32
35d11700 1726// ============================================================================
c91830cb
VZ
1727
1728#ifdef WORDS_BIGENDIAN
467e0479
VZ
1729 #define wxMBConvUTF32straight wxMBConvUTF32BE
1730 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1731#else
467e0479
VZ
1732 #define wxMBConvUTF32swap wxMBConvUTF32BE
1733 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1734#endif
1735
1736
1737WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1738WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1739
467e0479
VZ
1740/* static */
1741size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1742{
1743 if ( srcLen == wxNO_LEN )
1744 {
1745 // count the number of bytes in input, including the trailing NULs
5c33522f 1746 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1747 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1748 ;
c91830cb 1749
467e0479
VZ
1750 srcLen *= BYTES_PER_CHAR;
1751 }
1752 else // we already have the length
1753 {
1754 // we can only convert an entire number of UTF-32 characters
1755 if ( srcLen % BYTES_PER_CHAR )
1756 return wxCONV_FAILED;
1757 }
1758
1759 return srcLen;
1760}
1761
1762// case when in-memory representation is UTF-16
c91830cb
VZ
1763#ifdef WC_UTF16
1764
467e0479
VZ
1765// ----------------------------------------------------------------------------
1766// conversions without endianness change
1767// ----------------------------------------------------------------------------
1768
1769size_t
1770wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1771 const char *src, size_t srcLen) const
c91830cb 1772{
467e0479
VZ
1773 srcLen = GetLength(src, srcLen);
1774 if ( srcLen == wxNO_LEN )
1775 return wxCONV_FAILED;
c91830cb 1776
5c33522f 1777 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1778 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1779 size_t outLen = 0;
1780 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1781 {
1782 wxUint16 cc[2];
ef199164 1783 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1784 if ( numChars == wxCONV_FAILED )
1785 return wxCONV_FAILED;
c91830cb 1786
467e0479
VZ
1787 outLen += numChars;
1788 if ( dst )
c91830cb 1789 {
467e0479
VZ
1790 if ( outLen > dstLen )
1791 return wxCONV_FAILED;
d32a507d 1792
467e0479
VZ
1793 *dst++ = cc[0];
1794 if ( numChars == 2 )
1795 {
1796 // second character of a surrogate
1797 *dst++ = cc[1];
1798 }
1799 }
c91830cb 1800 }
d32a507d 1801
467e0479 1802 return outLen;
c91830cb
VZ
1803}
1804
467e0479
VZ
1805size_t
1806wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1807 const wchar_t *src, size_t srcLen) const
c91830cb 1808{
467e0479
VZ
1809 if ( srcLen == wxNO_LEN )
1810 srcLen = wxWcslen(src) + 1;
c91830cb 1811
467e0479 1812 if ( !dst )
c91830cb 1813 {
467e0479
VZ
1814 // optimization: return maximal space which could be needed for this
1815 // string instead of the exact amount which could be less if there are
1816 // any surrogates in the input
1817 //
1818 // we consider that surrogates are rare enough to make it worthwhile to
1819 // avoid running the loop below at the cost of slightly extra memory
1820 // consumption
ef199164 1821 return srcLen * BYTES_PER_CHAR;
467e0479 1822 }
c91830cb 1823
5c33522f 1824 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1825 size_t outLen = 0;
1826 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1827 {
1828 const wxUint32 ch = wxDecodeSurrogate(&src);
1829 if ( !src )
1830 return wxCONV_FAILED;
c91830cb 1831
467e0479 1832 outLen += BYTES_PER_CHAR;
d32a507d 1833
467e0479
VZ
1834 if ( outLen > dstLen )
1835 return wxCONV_FAILED;
b5153fd8 1836
ef199164 1837 *outBuff++ = ch;
467e0479 1838 }
c91830cb 1839
467e0479 1840 return outLen;
c91830cb
VZ
1841}
1842
467e0479
VZ
1843// ----------------------------------------------------------------------------
1844// endian-reversing conversions
1845// ----------------------------------------------------------------------------
c91830cb 1846
467e0479
VZ
1847size_t
1848wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1849 const char *src, size_t srcLen) const
c91830cb 1850{
467e0479
VZ
1851 srcLen = GetLength(src, srcLen);
1852 if ( srcLen == wxNO_LEN )
1853 return wxCONV_FAILED;
c91830cb 1854
5c33522f 1855 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1856 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1857 size_t outLen = 0;
ef199164 1858 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1859 {
c91830cb 1860 wxUint16 cc[2];
ef199164 1861 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1862 if ( numChars == wxCONV_FAILED )
1863 return wxCONV_FAILED;
c91830cb 1864
467e0479
VZ
1865 outLen += numChars;
1866 if ( dst )
c91830cb 1867 {
467e0479
VZ
1868 if ( outLen > dstLen )
1869 return wxCONV_FAILED;
d32a507d 1870
467e0479
VZ
1871 *dst++ = cc[0];
1872 if ( numChars == 2 )
1873 {
1874 // second character of a surrogate
1875 *dst++ = cc[1];
1876 }
1877 }
c91830cb 1878 }
b5153fd8 1879
467e0479 1880 return outLen;
c91830cb
VZ
1881}
1882
467e0479
VZ
1883size_t
1884wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1885 const wchar_t *src, size_t srcLen) const
c91830cb 1886{
467e0479
VZ
1887 if ( srcLen == wxNO_LEN )
1888 srcLen = wxWcslen(src) + 1;
c91830cb 1889
467e0479 1890 if ( !dst )
c91830cb 1891 {
467e0479
VZ
1892 // optimization: return maximal space which could be needed for this
1893 // string instead of the exact amount which could be less if there are
1894 // any surrogates in the input
1895 //
1896 // we consider that surrogates are rare enough to make it worthwhile to
1897 // avoid running the loop below at the cost of slightly extra memory
1898 // consumption
1899 return srcLen*BYTES_PER_CHAR;
1900 }
c91830cb 1901
5c33522f 1902 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1903 size_t outLen = 0;
1904 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1905 {
1906 const wxUint32 ch = wxDecodeSurrogate(&src);
1907 if ( !src )
1908 return wxCONV_FAILED;
c91830cb 1909
467e0479 1910 outLen += BYTES_PER_CHAR;
d32a507d 1911
467e0479
VZ
1912 if ( outLen > dstLen )
1913 return wxCONV_FAILED;
b5153fd8 1914
ef199164 1915 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1916 }
c91830cb 1917
467e0479 1918 return outLen;
c91830cb
VZ
1919}
1920
467e0479 1921#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1922
35d11700
VZ
1923// ----------------------------------------------------------------------------
1924// conversions without endianness change
1925// ----------------------------------------------------------------------------
1926
1927size_t
1928wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1929 const char *src, size_t srcLen) const
c91830cb 1930{
35d11700
VZ
1931 // use memcpy() as it should be much faster than hand-written loop
1932 srcLen = GetLength(src, srcLen);
1933 if ( srcLen == wxNO_LEN )
1934 return wxCONV_FAILED;
c91830cb 1935
35d11700
VZ
1936 const size_t inLen = srcLen/BYTES_PER_CHAR;
1937 if ( dst )
c91830cb 1938 {
35d11700
VZ
1939 if ( dstLen < inLen )
1940 return wxCONV_FAILED;
b5153fd8 1941
35d11700
VZ
1942 memcpy(dst, src, srcLen);
1943 }
c91830cb 1944
35d11700 1945 return inLen;
c91830cb
VZ
1946}
1947
35d11700
VZ
1948size_t
1949wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1950 const wchar_t *src, size_t srcLen) const
c91830cb 1951{
35d11700
VZ
1952 if ( srcLen == wxNO_LEN )
1953 srcLen = wxWcslen(src) + 1;
1954
1955 srcLen *= BYTES_PER_CHAR;
c91830cb 1956
35d11700 1957 if ( dst )
c91830cb 1958 {
35d11700
VZ
1959 if ( dstLen < srcLen )
1960 return wxCONV_FAILED;
c91830cb 1961
35d11700 1962 memcpy(dst, src, srcLen);
c91830cb
VZ
1963 }
1964
35d11700 1965 return srcLen;
c91830cb
VZ
1966}
1967
35d11700
VZ
1968// ----------------------------------------------------------------------------
1969// endian-reversing conversions
1970// ----------------------------------------------------------------------------
c91830cb 1971
35d11700
VZ
1972size_t
1973wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1974 const char *src, size_t srcLen) const
c91830cb 1975{
35d11700
VZ
1976 srcLen = GetLength(src, srcLen);
1977 if ( srcLen == wxNO_LEN )
1978 return wxCONV_FAILED;
1979
1980 srcLen /= BYTES_PER_CHAR;
c91830cb 1981
35d11700 1982 if ( dst )
c91830cb 1983 {
35d11700
VZ
1984 if ( dstLen < srcLen )
1985 return wxCONV_FAILED;
1986
5c33522f 1987 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1988 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1989 {
ef199164 1990 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1991 }
c91830cb 1992 }
b5153fd8 1993
35d11700 1994 return srcLen;
c91830cb
VZ
1995}
1996
35d11700
VZ
1997size_t
1998wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1999 const wchar_t *src, size_t srcLen) const
c91830cb 2000{
35d11700
VZ
2001 if ( srcLen == wxNO_LEN )
2002 srcLen = wxWcslen(src) + 1;
2003
2004 srcLen *= BYTES_PER_CHAR;
c91830cb 2005
35d11700 2006 if ( dst )
c91830cb 2007 {
35d11700
VZ
2008 if ( dstLen < srcLen )
2009 return wxCONV_FAILED;
2010
5c33522f 2011 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2012 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2013 {
ef199164 2014 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2015 }
c91830cb 2016 }
b5153fd8 2017
35d11700 2018 return srcLen;
c91830cb
VZ
2019}
2020
467e0479 2021#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2022
2023
36acb880
VZ
2024// ============================================================================
2025// The classes doing conversion using the iconv_xxx() functions
2026// ============================================================================
3caec1bb 2027
b040e242 2028#ifdef HAVE_ICONV
3a0d76bc 2029
b1d547eb
VS
2030// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2031// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2032// (unless there's yet another bug in glibc) the only case when iconv()
2033// returns with (size_t)-1 (which means error) and says there are 0 bytes
2034// left in the input buffer -- when _real_ error occurs,
2035// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2036// iconv() failure.
3caec1bb
VS
2037// [This bug does not appear in glibc 2.2.]
2038#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2039#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2040 (errno != E2BIG || bufLeft != 0))
2041#else
2042#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2043#endif
2044
ab217dba 2045#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 2046
74a7eb0b
VZ
2047#define ICONV_T_INVALID ((iconv_t)-1)
2048
2049#if SIZEOF_WCHAR_T == 4
2050 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2051 #define WC_ENC wxFONTENCODING_UTF32
2052#elif SIZEOF_WCHAR_T == 2
2053 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2054 #define WC_ENC wxFONTENCODING_UTF16
2055#else // sizeof(wchar_t) != 2 nor 4
2056 // does this ever happen?
2057 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2058#endif
2059
36acb880 2060// ----------------------------------------------------------------------------
e95354ec 2061// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2062// ----------------------------------------------------------------------------
2063
e95354ec 2064class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2065{
2066public:
86501081 2067 wxMBConv_iconv(const char *name);
e95354ec 2068 virtual ~wxMBConv_iconv();
36acb880 2069
8f4b0f43
VZ
2070 // implement base class virtual methods
2071 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2072 const char *src, size_t srcLen = wxNO_LEN) const;
2073 virtual size_t FromWChar(char *dst, size_t dstLen,
2074 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2075 virtual size_t GetMBNulLen() const;
2076
ba98e032
VS
2077#if wxUSE_UNICODE_UTF8
2078 virtual bool IsUTF8() const;
2079#endif
2080
d36c9347
VZ
2081 virtual wxMBConv *Clone() const
2082 {
86501081 2083 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
2084 p->m_minMBCharWidth = m_minMBCharWidth;
2085 return p;
2086 }
2087
e95354ec 2088 bool IsOk() const
74a7eb0b 2089 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2090
2091protected:
ef199164
DS
2092 // the iconv handlers used to translate from multibyte
2093 // to wide char and in the other direction
36acb880
VZ
2094 iconv_t m2w,
2095 w2m;
ef199164 2096
b1d547eb
VS
2097#if wxUSE_THREADS
2098 // guards access to m2w and w2m objects
2099 wxMutex m_iconvMutex;
2100#endif
36acb880
VZ
2101
2102private:
e95354ec 2103 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2104 // available on this machine, it will remain NULL
74a7eb0b 2105 static wxString ms_wcCharsetName;
36acb880
VZ
2106
2107 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2108 // different endian-ness than the native one
405d8f46 2109 static bool ms_wcNeedsSwap;
eec47cc6 2110
d36c9347
VZ
2111
2112 // name of the encoding handled by this conversion
2113 wxString m_name;
2114
7ef3ab50 2115 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2116 // initially
2117 size_t m_minMBCharWidth;
36acb880
VZ
2118};
2119
8f115891 2120// make the constructor available for unit testing
86501081 2121WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2122{
2123 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2124 if ( !result->IsOk() )
2125 {
2126 delete result;
2127 return 0;
2128 }
ef199164 2129
8f115891
MW
2130 return result;
2131}
2132
422e411e 2133wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2134bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2135
86501081 2136wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 2137 : m_name(name)
36acb880 2138{
c1464d9d 2139 m_minMBCharWidth = 0;
eec47cc6 2140
36acb880 2141 // check for charset that represents wchar_t:
74a7eb0b 2142 if ( ms_wcCharsetName.empty() )
f1339c56 2143 {
9a83f860 2144 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2145
74a7eb0b
VZ
2146#if wxUSE_FONTMAP
2147 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2148#else // !wxUSE_FONTMAP
91cb7f52 2149 static const wxChar *names_static[] =
36acb880 2150 {
74a7eb0b 2151#if SIZEOF_WCHAR_T == 4
9a83f860 2152 wxT("UCS-4"),
74a7eb0b 2153#elif SIZEOF_WCHAR_T = 2
9a83f860 2154 wxT("UCS-2"),
74a7eb0b
VZ
2155#endif
2156 NULL
2157 };
91cb7f52 2158 const wxChar **names = names_static;
74a7eb0b 2159#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2160
d1f024a8 2161 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2162 {
17a1ebd1 2163 const wxString nameCS(*names);
74a7eb0b
VZ
2164
2165 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2166 wxString nameXE(nameCS);
ef199164
DS
2167
2168#ifdef WORDS_BIGENDIAN
9a83f860 2169 nameXE += wxT("BE");
ef199164 2170#else // little endian
9a83f860 2171 nameXE += wxT("LE");
ef199164 2172#endif
74a7eb0b 2173
9a83f860 2174 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2175 nameXE.c_str());
2176
86501081 2177 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2178 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2179 {
74a7eb0b 2180 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2181 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2182 nameCS.c_str());
86501081 2183 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2184
74a7eb0b
VZ
2185 // and check for bytesex ourselves:
2186 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2187 {
74a7eb0b 2188 char buf[2], *bufPtr;
e8769ed1 2189 wchar_t wbuf[2];
74a7eb0b
VZ
2190 size_t insz, outsz;
2191 size_t res;
2192
2193 buf[0] = 'A';
2194 buf[1] = 0;
2195 wbuf[0] = 0;
2196 insz = 2;
2197 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2198 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2199 bufPtr = buf;
2200
ef199164
DS
2201 res = iconv(
2202 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2203 &wbufPtr, &outsz);
74a7eb0b
VZ
2204
2205 if (ICONV_FAILED(res, insz))
2206 {
2207 wxLogLastError(wxT("iconv"));
422e411e 2208 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2209 nameCS.c_str());
74a7eb0b
VZ
2210 }
2211 else // ok, can convert to this encoding, remember it
2212 {
17a1ebd1 2213 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2214 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2215 }
3a0d76bc
VS
2216 }
2217 }
74a7eb0b 2218 else // use charset not requiring byte swapping
36acb880 2219 {
74a7eb0b 2220 ms_wcCharsetName = nameXE;
36acb880 2221 }
3a0d76bc 2222 }
74a7eb0b 2223
0944fceb 2224 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2225 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2226 ms_wcCharsetName.empty() ? wxString("<none>")
2227 : ms_wcCharsetName,
9a83f860
VZ
2228 ms_wcNeedsSwap ? wxT(" (needs swap)")
2229 : wxT(""));
3a0d76bc 2230 }
36acb880 2231 else // we already have ms_wcCharsetName
3caec1bb 2232 {
86501081 2233 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2234 }
dccce9ea 2235
74a7eb0b 2236 if ( ms_wcCharsetName.empty() )
f1339c56 2237 {
74a7eb0b 2238 w2m = ICONV_T_INVALID;
36acb880 2239 }
405d8f46
VZ
2240 else
2241 {
86501081 2242 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2243 if ( w2m == ICONV_T_INVALID )
2244 {
2245 wxLogTrace(TRACE_STRCONV,
2246 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2247 ms_wcCharsetName.c_str(), name);
74a7eb0b 2248 }
405d8f46 2249 }
36acb880 2250}
3caec1bb 2251
e95354ec 2252wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2253{
74a7eb0b 2254 if ( m2w != ICONV_T_INVALID )
36acb880 2255 iconv_close(m2w);
74a7eb0b 2256 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2257 iconv_close(w2m);
2258}
3a0d76bc 2259
8f4b0f43
VZ
2260size_t
2261wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2262 const char *src, size_t srcLen) const
36acb880 2263{
8f4b0f43 2264 if ( srcLen == wxNO_LEN )
69373110 2265 {
8f4b0f43
VZ
2266 // find the string length: notice that must be done differently for
2267 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2268 // consecutive NULs
2269 const size_t nulLen = GetMBNulLen();
2270 switch ( nulLen )
2271 {
2272 default:
2273 return wxCONV_FAILED;
69373110 2274
8f4b0f43
VZ
2275 case 1:
2276 srcLen = strlen(src); // arguably more optimized than our version
2277 break;
69373110 2278
8f4b0f43
VZ
2279 case 2:
2280 case 4:
2281 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2282 // but they also have to start at character boundary and not
2283 // span two adjacent characters
2284 const char *p;
2285 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2286 ;
2287 srcLen = p - src;
2288 break;
2289 }
d50c0831
VZ
2290
2291 // when we're determining the length of the string ourselves we count
2292 // the terminating NUL(s) as part of it and always NUL-terminate the
2293 // output
2294 srcLen += nulLen;
69373110
VZ
2295 }
2296
8f4b0f43
VZ
2297 // we express length in the number of (wide) characters but iconv always
2298 // counts buffer sizes it in bytes
2299 dstLen *= SIZEOF_WCHAR_T;
2300
b1d547eb 2301#if wxUSE_THREADS
6a17b868
SN
2302 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2303 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2304 // wxConvLocal that are used all over wx code, so we have to make sure
2305 // the handle is used by at most one thread at the time. Otherwise
2306 // only a few wx classes would be safe to use from non-main threads
2307 // as MB<->WC conversion would fail "randomly".
2308 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2309#endif // wxUSE_THREADS
2310
36acb880 2311 size_t res, cres;
8f4b0f43 2312 const char *pszPtr = src;
36acb880 2313
8f4b0f43 2314 if ( dst )
36acb880 2315 {
8f4b0f43 2316 char* bufPtr = (char*)dst;
e8769ed1 2317
36acb880 2318 // have destination buffer, convert there
1752fda6 2319 size_t dstLenOrig = dstLen;
36acb880 2320 cres = iconv(m2w,
8f4b0f43
VZ
2321 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2322 &bufPtr, &dstLen);
1752fda6
VZ
2323
2324 // convert the number of bytes converted as returned by iconv to the
2325 // number of (wide) characters converted that we need
2326 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2327
36acb880 2328 if (ms_wcNeedsSwap)
3a0d76bc 2329 {
36acb880 2330 // convert to native endianness
17a1ebd1 2331 for ( unsigned i = 0; i < res; i++ )
467a2982 2332 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2333 }
36acb880 2334 }
8f4b0f43 2335 else // no destination buffer
36acb880 2336 {
8f4b0f43 2337 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2338 wchar_t tbuf[256];
36acb880 2339 res = 0;
ef199164
DS
2340
2341 do
2342 {
e8769ed1 2343 char* bufPtr = (char*)tbuf;
8f4b0f43 2344 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2345
2346 cres = iconv(m2w,
8f4b0f43
VZ
2347 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2348 &bufPtr, &dstLen );
36acb880 2349
8f4b0f43 2350 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2351 }
2352 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2353 }
dccce9ea 2354
8f4b0f43 2355 if (ICONV_FAILED(cres, srcLen))
f1339c56 2356 {
36acb880 2357 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2358 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2359 return wxCONV_FAILED;
36acb880
VZ
2360 }
2361
2362 return res;
2363}
2364
8f4b0f43
VZ
2365size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2366 const wchar_t *src, size_t srcLen) const
36acb880 2367{
b1d547eb
VS
2368#if wxUSE_THREADS
2369 // NB: explained in MB2WC
2370 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2371#endif
3698ae71 2372
8f4b0f43 2373 if ( srcLen == wxNO_LEN )
2588ee86 2374 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2375
2376 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2377 size_t outbuflen = dstLen;
36acb880 2378 size_t res, cres;
3a0d76bc 2379
36acb880 2380 wchar_t *tmpbuf = 0;
3caec1bb 2381
36acb880
VZ
2382 if (ms_wcNeedsSwap)
2383 {
2384 // need to copy to temp buffer to switch endianness
51725fc0 2385 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2386 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2387 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2388 for ( size_t i = 0; i < srcLen; i++ )
2389 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2390
8f4b0f43 2391 src = tmpbuf;
36acb880 2392 }
3a0d76bc 2393
8f4b0f43
VZ
2394 char* inbuf = (char*)src;
2395 if ( dst )
36acb880
VZ
2396 {
2397 // have destination buffer, convert there
8f4b0f43 2398 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2399
8f4b0f43 2400 res = dstLen - outbuflen;
36acb880 2401 }
8f4b0f43 2402 else // no destination buffer
36acb880 2403 {
8f4b0f43 2404 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2405 char tbuf[256];
36acb880 2406 res = 0;
ef199164
DS
2407 do
2408 {
8f4b0f43 2409 dst = tbuf;
51725fc0 2410 outbuflen = WXSIZEOF(tbuf);
36acb880 2411
8f4b0f43 2412 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2413
51725fc0 2414 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2415 }
2416 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2417 }
dccce9ea 2418
36acb880
VZ
2419 if (ms_wcNeedsSwap)
2420 {
2421 free(tmpbuf);
2422 }
dccce9ea 2423
e8769ed1 2424 if (ICONV_FAILED(cres, inbuflen))
36acb880 2425 {
ce6f8d6f 2426 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2427 return wxCONV_FAILED;
36acb880
VZ
2428 }
2429
2430 return res;
2431}
2432
7ef3ab50 2433size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2434{
c1464d9d 2435 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2436 {
2437 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2438
2439#if wxUSE_THREADS
2440 // NB: explained in MB2WC
2441 wxMutexLocker lock(self->m_iconvMutex);
2442#endif
2443
999020e1 2444 const wchar_t *wnul = L"";
c1464d9d 2445 char buf[8]; // should be enough for NUL in any encoding
356410fc 2446 size_t inLen = sizeof(wchar_t),
c1464d9d 2447 outLen = WXSIZEOF(buf);
ef199164
DS
2448 char *inBuff = (char *)wnul;
2449 char *outBuff = buf;
2450 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2451 {
c1464d9d 2452 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2453 }
2454 else // ok
2455 {
ef199164 2456 self->m_minMBCharWidth = outBuff - buf;
356410fc 2457 }
eec47cc6
VZ
2458 }
2459
c1464d9d 2460 return m_minMBCharWidth;
eec47cc6
VZ
2461}
2462
#if wxUSE_UNICODE_UTF8
// check if the charset name is one of the two spellings of UTF-8 we know
// about, the comparison is case-insensitive
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2470
b040e242 2471#endif // HAVE_ICONV
36acb880 2472
e95354ec 2473
36acb880
VZ
2474// ============================================================================
2475// Win32 conversion classes
2476// ============================================================================
1cd52418 2477
e95354ec 2478#ifdef wxHAVE_WIN32_MB2WC
373658eb 2479
8b04d4c4 2480// from utils.cpp
d775fa82 2481#if wxUSE_FONTMAP
86501081 2482extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2483extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2484#endif
373658eb 2485
e95354ec 2486class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2487{
2488public:
bde4baac
VZ
2489 wxMBConv_win32()
2490 {
2491 m_CodePage = CP_ACP;
c1464d9d 2492 m_minMBCharWidth = 0;
bde4baac
VZ
2493 }
2494
d36c9347 2495 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2496 : wxMBConv()
d36c9347
VZ
2497 {
2498 m_CodePage = conv.m_CodePage;
2499 m_minMBCharWidth = conv.m_minMBCharWidth;
2500 }
2501
7608a683 2502#if wxUSE_FONTMAP
86501081 2503 wxMBConv_win32(const char* name)
bde4baac
VZ
2504 {
2505 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2506 m_minMBCharWidth = 0;
bde4baac 2507 }
dccce9ea 2508
e95354ec 2509 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2510 {
2511 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2512 m_minMBCharWidth = 0;
bde4baac 2513 }
eec47cc6 2514#endif // wxUSE_FONTMAP
8b04d4c4 2515
d36c9347 2516 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2517 {
02272c9c
VZ
2518 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2519 // the behaviour is not compatible with the Unix version (using iconv)
2520 // and break the library itself, e.g. wxTextInputStream::NextChar()
2521 // wouldn't work if reading an incomplete MB char didn't result in an
2522 // error
667e5b3e 2523 //
89028980 2524 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2525 // Win XP or newer and it is not supported for UTF-[78] so we always
2526 // use our own conversions in this case. See
89028980
VS
2527 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2528 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2529 if ( m_CodePage == CP_UTF8 )
89028980 2530 {
5487ff0f 2531 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2532 }
830f8f11
VZ
2533
2534 if ( m_CodePage == CP_UTF7 )
2535 {
5487ff0f 2536 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2537 }
2538
2539 int flags = 0;
2540 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2541 IsAtLeastWin2kSP4() )
89028980 2542 {
830f8f11 2543 flags = MB_ERR_INVALID_CHARS;
89028980 2544 }
667e5b3e 2545
2b5f62a0
VZ
2546 const size_t len = ::MultiByteToWideChar
2547 (
2548 m_CodePage, // code page
667e5b3e 2549 flags, // flags: fall on error
2b5f62a0
VZ
2550 psz, // input string
2551 -1, // its length (NUL-terminated)
b4da152e 2552 buf, // output string
2b5f62a0
VZ
2553 buf ? n : 0 // size of output buffer
2554 );
89028980
VS
2555 if ( !len )
2556 {
2557 // function totally failed
467e0479 2558 return wxCONV_FAILED;
89028980
VS
2559 }
2560
2561 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2562 // check if we succeeded, by doing a double trip:
2563 if ( !flags && buf )
2564 {
53c174fc
VZ
2565 const size_t mbLen = strlen(psz);
2566 wxCharBuffer mbBuf(mbLen);
89028980
VS
2567 if ( ::WideCharToMultiByte
2568 (
2569 m_CodePage,
2570 0,
2571 buf,
2572 -1,
2573 mbBuf.data(),
53c174fc 2574 mbLen + 1, // size in bytes, not length
89028980
VS
2575 NULL,
2576 NULL
2577 ) == 0 ||
2578 strcmp(mbBuf, psz) != 0 )
2579 {
2580 // we didn't obtain the same thing we started from, hence
2581 // the conversion was lossy and we consider that it failed
467e0479 2582 return wxCONV_FAILED;
89028980
VS
2583 }
2584 }
2b5f62a0 2585
03a991bc
VZ
2586 // note that it returns count of written chars for buf != NULL and size
2587 // of the needed buffer for buf == NULL so in either case the length of
2588 // the string (which never includes the terminating NUL) is one less
89028980 2589 return len - 1;
f1339c56 2590 }
dccce9ea 2591
d36c9347 2592 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2593 {
13dd924a
VZ
2594 /*
2595 we have a problem here: by default, WideCharToMultiByte() may
2596 replace characters unrepresentable in the target code page with bad
2597 quality approximations such as turning "1/2" symbol (U+00BD) into
2598 "1" for the code pages which don't have it and we, obviously, want
2599 to avoid this at any price
d775fa82 2600
13dd924a
VZ
2601 the trouble is that this function does it _silently_, i.e. it won't
2602 even tell us whether it did or not... Win98/2000 and higher provide
2603 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2604 we have to resort to a round trip, i.e. check that converting back
2605 results in the same string -- this is, of course, expensive but
2606 otherwise we simply can't be sure to not garble the data.
2607 */
2608
2609 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2610 // it doesn't work with CJK encodings (which we test for rather roughly
2611 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2612 // supporting it
907173e5
WS
2613 BOOL usedDef wxDUMMY_INITIALIZE(false);
2614 BOOL *pUsedDef;
13dd924a
VZ
2615 int flags;
2616 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2617 {
2618 // it's our lucky day
2619 flags = WC_NO_BEST_FIT_CHARS;
2620 pUsedDef = &usedDef;
2621 }
2622 else // old system or unsupported encoding
2623 {
2624 flags = 0;
2625 pUsedDef = NULL;
2626 }
2627
2b5f62a0
VZ
2628 const size_t len = ::WideCharToMultiByte
2629 (
2630 m_CodePage, // code page
13dd924a
VZ
2631 flags, // either none or no best fit
2632 pwz, // input string
2b5f62a0
VZ
2633 -1, // it is (wide) NUL-terminated
2634 buf, // output buffer
2635 buf ? n : 0, // and its size
2636 NULL, // default "replacement" char
13dd924a 2637 pUsedDef // [out] was it used?
2b5f62a0
VZ
2638 );
2639
13dd924a
VZ
2640 if ( !len )
2641 {
2642 // function totally failed
467e0479 2643 return wxCONV_FAILED;
13dd924a
VZ
2644 }
2645
765bdb4a
VZ
2646 // we did something, check if we really succeeded
2647 if ( flags )
13dd924a 2648 {
765bdb4a
VZ
2649 // check if the conversion failed, i.e. if any replacements
2650 // were done
2651 if ( usedDef )
2652 return wxCONV_FAILED;
2653 }
2654 else // we must resort to double tripping...
2655 {
2656 // first we need to ensure that we really have the MB data: this is
2657 // not the case if we're called with NULL buffer, in which case we
2658 // need to do the conversion yet again
2659 wxCharBuffer bufDef;
2660 if ( !buf )
13dd924a 2661 {
765bdb4a
VZ
2662 bufDef = wxCharBuffer(len);
2663 buf = bufDef.data();
2664 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2665 buf, len, NULL, NULL) )
467e0479 2666 return wxCONV_FAILED;
13dd924a 2667 }
765bdb4a 2668
564da6ff
VZ
2669 if ( !n )
2670 n = wcslen(pwz);
765bdb4a 2671 wxWCharBuffer wcBuf(n);
564da6ff 2672 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2673 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2674 {
765bdb4a
VZ
2675 // we didn't obtain the same thing we started from, hence
2676 // the conversion was lossy and we consider that it failed
2677 return wxCONV_FAILED;
13dd924a
VZ
2678 }
2679 }
2680
03a991bc 2681 // see the comment above for the reason of "len - 1"
13dd924a 2682 return len - 1;
f1339c56 2683 }
dccce9ea 2684
7ef3ab50
VZ
2685 virtual size_t GetMBNulLen() const
2686 {
2687 if ( m_minMBCharWidth == 0 )
2688 {
2689 int len = ::WideCharToMultiByte
2690 (
2691 m_CodePage, // code page
2692 0, // no flags
2693 L"", // input string
2694 1, // translate just the NUL
2695 NULL, // output buffer
2696 0, // and its size
2697 NULL, // no replacement char
2698 NULL // [out] don't care if it was used
2699 );
2700
2701 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2702 switch ( len )
2703 {
2704 default:
9a83f860 2705 wxLogDebug(wxT("Unexpected NUL length %d"), len);
ef199164
DS
2706 self->m_minMBCharWidth = (size_t)-1;
2707 break;
7ef3ab50
VZ
2708
2709 case 0:
2710 self->m_minMBCharWidth = (size_t)-1;
2711 break;
2712
2713 case 1:
2714 case 2:
2715 case 4:
2716 self->m_minMBCharWidth = len;
2717 break;
2718 }
2719 }
2720
2721 return m_minMBCharWidth;
2722 }
2723
d36c9347
VZ
2724 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2725
13dd924a
VZ
2726 bool IsOk() const { return m_CodePage != -1; }
2727
2728private:
2729 static bool CanUseNoBestFit()
2730 {
2731 static int s_isWin98Or2k = -1;
2732
2733 if ( s_isWin98Or2k == -1 )
2734 {
2735 int verMaj, verMin;
2736 switch ( wxGetOsVersion(&verMaj, &verMin) )
2737 {
406d283a 2738 case wxOS_WINDOWS_9X:
13dd924a
VZ
2739 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2740 break;
2741
406d283a 2742 case wxOS_WINDOWS_NT:
13dd924a
VZ
2743 s_isWin98Or2k = verMaj >= 5;
2744 break;
2745
2746 default:
ef199164 2747 // unknown: be conservative by default
13dd924a 2748 s_isWin98Or2k = 0;
ef199164 2749 break;
13dd924a
VZ
2750 }
2751
9a83f860 2752 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
13dd924a
VZ
2753 }
2754
2755 return s_isWin98Or2k == 1;
2756 }
f1339c56 2757
89028980
VS
2758 static bool IsAtLeastWin2kSP4()
2759 {
8942f83a
WS
2760#ifdef __WXWINCE__
2761 return false;
2762#else
89028980
VS
2763 static int s_isAtLeastWin2kSP4 = -1;
2764
2765 if ( s_isAtLeastWin2kSP4 == -1 )
2766 {
2767 OSVERSIONINFOEX ver;
2768
2769 memset(&ver, 0, sizeof(ver));
2770 ver.dwOSVersionInfoSize = sizeof(ver);
2771 GetVersionEx((OSVERSIONINFO*)&ver);
2772
2773 s_isAtLeastWin2kSP4 =
2774 ((ver.dwMajorVersion > 5) || // Vista+
2775 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2776 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2777 ver.wServicePackMajor >= 4)) // 2000 SP4+
2778 ? 1 : 0;
2779 }
2780
2781 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2782#endif
89028980
VS
2783 }
2784
eec47cc6 2785
c1464d9d 2786 // the code page we're working with
b1d66b54 2787 long m_CodePage;
c1464d9d 2788
7ef3ab50 2789 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2790 // "unknown"
2791 size_t m_minMBCharWidth;
1cd52418 2792};
e95354ec
VZ
2793
2794#endif // wxHAVE_WIN32_MB2WC
2795
f7e98dee 2796
36acb880
VZ
2797// ============================================================================
2798// wxEncodingConverter based conversion classes
2799// ============================================================================
2800
1e6feb95 2801#if wxUSE_FONTMAP
1cd52418 2802
e95354ec 2803class wxMBConv_wxwin : public wxMBConv
1cd52418 2804{
8b04d4c4
VZ
2805private:
2806 void Init()
2807 {
6ac84a78
DE
2808 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2809 // The wxMBConv_cf class does a better job.
2810 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2811 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2812 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2813 }
2814
6001e347 2815public:
f1339c56
RR
2816 // temporarily just use wxEncodingConverter stuff,
2817 // so that it works while a better implementation is built
86501081 2818 wxMBConv_wxwin(const char* name)
f1339c56
RR
2819 {
2820 if (name)
267e11c5 2821 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2822 else
2823 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2824
8b04d4c4
VZ
2825 Init();
2826 }
2827
e95354ec 2828 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2829 {
2830 m_enc = enc;
2831
2832 Init();
f1339c56 2833 }
dccce9ea 2834
bde4baac 2835 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2836 {
2837 size_t inbuf = strlen(psz);
dccce9ea 2838 if (buf)
c643a977 2839 {
ef199164 2840 if (!m2w.Convert(psz, buf))
467e0479 2841 return wxCONV_FAILED;
c643a977 2842 }
f1339c56
RR
2843 return inbuf;
2844 }
dccce9ea 2845
bde4baac 2846 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2847 {
f8d791e0 2848 const size_t inbuf = wxWcslen(psz);
f1339c56 2849 if (buf)
c643a977 2850 {
ef199164 2851 if (!w2m.Convert(psz, buf))
467e0479 2852 return wxCONV_FAILED;
c643a977 2853 }
dccce9ea 2854
f1339c56
RR
2855 return inbuf;
2856 }
dccce9ea 2857
7ef3ab50 2858 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2859 {
2860 switch ( m_enc )
2861 {
2862 case wxFONTENCODING_UTF16BE:
2863 case wxFONTENCODING_UTF16LE:
c1464d9d 2864 return 2;
eec47cc6
VZ
2865
2866 case wxFONTENCODING_UTF32BE:
2867 case wxFONTENCODING_UTF32LE:
c1464d9d 2868 return 4;
eec47cc6
VZ
2869
2870 default:
c1464d9d 2871 return 1;
eec47cc6
VZ
2872 }
2873 }
2874
d36c9347
VZ
2875 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2876
7ef3ab50
VZ
2877 bool IsOk() const { return m_ok; }
2878
2879public:
2880 wxFontEncoding m_enc;
2881 wxEncodingConverter m2w, w2m;
2882
2883private:
cafbf6fb
VZ
2884 // were we initialized successfully?
2885 bool m_ok;
fc7a2a60 2886
c0c133e1 2887 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2888};
6001e347 2889
8f115891 2890// make the constructors available for unit testing
86501081 2891WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2892{
2893 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2894 if ( !result->IsOk() )
2895 {
2896 delete result;
2897 return 0;
2898 }
ef199164 2899
8f115891
MW
2900 return result;
2901}
2902
1e6feb95
VZ
2903#endif // wxUSE_FONTMAP
2904
36acb880
VZ
2905// ============================================================================
2906// wxCSConv implementation
2907// ============================================================================
2908
8b04d4c4 2909void wxCSConv::Init()
6001e347 2910{
e95354ec
VZ
2911 m_name = NULL;
2912 m_convReal = NULL;
2913 m_deferred = true;
2914}
2915
86501081 2916wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2917{
2918 Init();
82713003 2919
86501081 2920 if ( !charset.empty() )
e95354ec 2921 {
86501081 2922 SetName(charset.ToAscii());
e95354ec 2923 }
bda3d86a 2924
e4277538
VZ
2925#if wxUSE_FONTMAP
2926 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
e3276230
VZ
2927 if ( m_encoding == wxFONTENCODING_MAX )
2928 {
2929 // set to unknown/invalid value
2930 m_encoding = wxFONTENCODING_SYSTEM;
2931 }
2932 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2933 {
2934 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2935 m_encoding = wxFONTENCODING_ISO8859_1;
2936 }
e4277538 2937#else
bda3d86a 2938 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2939#endif
6001e347
RR
2940}
2941
8b04d4c4
VZ
2942wxCSConv::wxCSConv(wxFontEncoding encoding)
2943{
bda3d86a 2944 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 2945 {
9a83f860 2946 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
2947
2948 encoding = wxFONTENCODING_SYSTEM;
2949 }
2950
8b04d4c4
VZ
2951 Init();
2952
bda3d86a 2953 m_encoding = encoding;
8b04d4c4
VZ
2954}
2955
6001e347
RR
2956wxCSConv::~wxCSConv()
2957{
65e50848
JS
2958 Clear();
2959}
2960
54380f29 2961wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2962 : wxMBConv()
54380f29 2963{
8b04d4c4
VZ
2964 Init();
2965
54380f29 2966 SetName(conv.m_name);
8b04d4c4 2967 m_encoding = conv.m_encoding;
54380f29
GD
2968}
2969
2970wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2971{
2972 Clear();
8b04d4c4 2973
54380f29 2974 SetName(conv.m_name);
8b04d4c4
VZ
2975 m_encoding = conv.m_encoding;
2976
54380f29
GD
2977 return *this;
2978}
2979
65e50848
JS
2980void wxCSConv::Clear()
2981{
8b04d4c4 2982 free(m_name);
e95354ec 2983 delete m_convReal;
8b04d4c4 2984
65e50848 2985 m_name = NULL;
e95354ec 2986 m_convReal = NULL;
6001e347
RR
2987}
2988
86501081 2989void wxCSConv::SetName(const char *charset)
6001e347 2990{
f1339c56
RR
2991 if (charset)
2992 {
d6f2a891 2993 m_name = wxStrdup(charset);
e95354ec 2994 m_deferred = true;
f1339c56 2995 }
6001e347
RR
2996}
2997
8b3eb85d 2998#if wxUSE_FONTMAP
8b3eb85d
VZ
2999
3000WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 3001 wxEncodingNameCache );
8b3eb85d
VZ
3002
3003static wxEncodingNameCache gs_nameCache;
3004#endif
3005
e95354ec
VZ
3006wxMBConv *wxCSConv::DoCreate() const
3007{
ce6f8d6f
VZ
3008#if wxUSE_FONTMAP
3009 wxLogTrace(TRACE_STRCONV,
3010 wxT("creating conversion for %s"),
3011 (m_name ? m_name
86501081 3012 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3013#endif // wxUSE_FONTMAP
3014
c547282d
VZ
3015 // check for the special case of ASCII or ISO8859-1 charset: as we have
3016 // special knowledge of it anyhow, we don't need to create a special
3017 // conversion object
e4277538
VZ
3018 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3019 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 3020 {
e95354ec
VZ
3021 // don't convert at all
3022 return NULL;
3023 }
dccce9ea 3024
e95354ec
VZ
3025 // we trust OS to do conversion better than we can so try external
3026 // conversion methods first
3027 //
3028 // the full order is:
3029 // 1. OS conversion (iconv() under Unix or Win32 API)
3030 // 2. hard coded conversions for UTF
3031 // 3. wxEncodingConverter as fall back
3032
3033 // step (1)
3034#ifdef HAVE_ICONV
c547282d 3035#if !wxUSE_FONTMAP
e95354ec 3036 if ( m_name )
c547282d 3037#endif // !wxUSE_FONTMAP
e95354ec 3038 {
3ef10cfc 3039#if wxUSE_FONTMAP
8b3eb85d 3040 wxFontEncoding encoding(m_encoding);
3ef10cfc 3041#endif
8b3eb85d 3042
86501081 3043 if ( m_name )
8b3eb85d 3044 {
86501081 3045 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3046 if ( conv->IsOk() )
3047 return conv;
3048
3049 delete conv;
c547282d
VZ
3050
3051#if wxUSE_FONTMAP
8b3eb85d 3052 encoding =
86501081 3053 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3054#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3055 }
3056#if wxUSE_FONTMAP
3057 {
3058 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3059 if ( it != gs_nameCache.end() )
3060 {
3061 if ( it->second.empty() )
3062 return NULL;
c547282d 3063
86501081 3064 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3065 if ( conv->IsOk() )
3066 return conv;
e95354ec 3067
8b3eb85d
VZ
3068 delete conv;
3069 }
3070
3071 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3072 // CS : in case this does not return valid names (eg for MacRoman)
3073 // encoding got a 'failure' entry in the cache all the same,
3074 // although it just has to be created using a different method, so
3075 // only store failed iconv creation attempts (or perhaps we
3076 // shoulnd't do this at all ?)
3c67ec06 3077 if ( names[0] != NULL )
8b3eb85d 3078 {
3c67ec06 3079 for ( ; *names; ++names )
8b3eb85d 3080 {
86501081
VS
3081 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3082 // will need changes that will obsolete this
3083 wxString name(*names);
3084 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3085 if ( conv->IsOk() )
3086 {
3087 gs_nameCache[encoding] = *names;
3088 return conv;
3089 }
3090
3091 delete conv;
8b3eb85d
VZ
3092 }
3093
9a83f860 3094 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3095 }
8b3eb85d
VZ
3096 }
3097#endif // wxUSE_FONTMAP
e95354ec
VZ
3098 }
3099#endif // HAVE_ICONV
3100
3101#ifdef wxHAVE_WIN32_MB2WC
3102 {
7608a683 3103#if wxUSE_FONTMAP
e95354ec
VZ
3104 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3105 : new wxMBConv_win32(m_encoding);
3106 if ( conv->IsOk() )
3107 return conv;
3108
3109 delete conv;
7608a683
WS
3110#else
3111 return NULL;
3112#endif
e95354ec
VZ
3113 }
3114#endif // wxHAVE_WIN32_MB2WC
ef199164 3115
5c4ed98d 3116#ifdef __DARWIN__
f7e98dee 3117 {
6ff49cbc
DE
3118 // leave UTF16 and UTF32 to the built-ins of wx
3119 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3120 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3121 {
a6900d10 3122#if wxUSE_FONTMAP
5c4ed98d
DE
3123 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3124 : new wxMBConv_cf(m_encoding);
a6900d10 3125#else
5c4ed98d 3126 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3127#endif
ef199164 3128
f7e98dee 3129 if ( conv->IsOk() )
d775fa82
WS
3130 return conv;
3131
3132 delete conv;
3133 }
335d31e0 3134 }
5c4ed98d
DE
3135#endif // __DARWIN__
3136
e95354ec
VZ
3137 // step (2)
3138 wxFontEncoding enc = m_encoding;
3139#if wxUSE_FONTMAP
c547282d
VZ
3140 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3141 {
3142 // use "false" to suppress interactive dialogs -- we can be called from
3143 // anywhere and popping up a dialog from here is the last thing we want to
3144 // do
267e11c5 3145 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3146 }
e95354ec
VZ
3147#endif // wxUSE_FONTMAP
3148
3149 switch ( enc )
3150 {
3151 case wxFONTENCODING_UTF7:
3152 return new wxMBConvUTF7;
3153
3154 case wxFONTENCODING_UTF8:
3155 return new wxMBConvUTF8;
3156
e95354ec
VZ
3157 case wxFONTENCODING_UTF16BE:
3158 return new wxMBConvUTF16BE;
3159
3160 case wxFONTENCODING_UTF16LE:
3161 return new wxMBConvUTF16LE;
3162
e95354ec
VZ
3163 case wxFONTENCODING_UTF32BE:
3164 return new wxMBConvUTF32BE;
3165
3166 case wxFONTENCODING_UTF32LE:
3167 return new wxMBConvUTF32LE;
3168
3169 default:
3170 // nothing to do but put here to suppress gcc warnings
ef199164 3171 break;
e95354ec
VZ
3172 }
3173
3174 // step (3)
3175#if wxUSE_FONTMAP
3176 {
3177 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3178 : new wxMBConv_wxwin(m_encoding);
3179 if ( conv->IsOk() )
3180 return conv;
3181
3182 delete conv;
3183 }
ef199164 3184
3df31b2d
VZ
3185 wxLogTrace(TRACE_STRCONV,
3186 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3187 (m_name ? wxString(m_name)
3df31b2d
VZ
3188 : wxFontMapperBase::GetEncodingName(m_encoding)));
3189#endif // wxUSE_FONTMAP
e95354ec
VZ
3190
3191 return NULL;
3192}
3193
3194void wxCSConv::CreateConvIfNeeded() const
3195{
3196 if ( m_deferred )
3197 {
3198 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 3199
bda3d86a
VZ
3200 // if we don't have neither the name nor the encoding, use the default
3201 // encoding for this system
3202 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3203 {
4c75209f 3204#if wxUSE_INTL
02c7347b 3205 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3206#else
3207 // fallback to some reasonable default:
3208 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3209#endif // wxUSE_INTL
4c75209f 3210 }
bda3d86a 3211
e95354ec
VZ
3212 self->m_convReal = DoCreate();
3213 self->m_deferred = false;
6001e347 3214 }
6001e347
RR
3215}
3216
0f0298b1
VZ
3217bool wxCSConv::IsOk() const
3218{
3219 CreateConvIfNeeded();
3220
3221 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3222 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3223 return true; // always ok as we do it ourselves
3224
3225 // m_convReal->IsOk() is called at its own creation, so we know it must
3226 // be ok if m_convReal is non-NULL
3227 return m_convReal != NULL;
3228}
3229
1c714a5d
VZ
3230size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3231 const char *src, size_t srcLen) const
3232{
3233 CreateConvIfNeeded();
3234
2c74c558
VS
3235 if (m_convReal)
3236 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3237
3238 // latin-1 (direct)
05392dc8
VZ
3239 if ( srcLen == wxNO_LEN )
3240 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3241
05392dc8
VZ
3242 if ( dst )
3243 {
3244 if ( dstLen < srcLen )
3245 return wxCONV_FAILED;
1c714a5d 3246
05392dc8
VZ
3247 for ( size_t n = 0; n < srcLen; n++ )
3248 dst[n] = (unsigned char)(src[n]);
3249 }
2c74c558 3250
05392dc8 3251 return srcLen;
1c714a5d
VZ
3252}
3253
05392dc8
VZ
3254size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3255 const wchar_t *src, size_t srcLen) const
6001e347 3256{
e95354ec 3257 CreateConvIfNeeded();
dccce9ea 3258
e95354ec 3259 if (m_convReal)
05392dc8 3260 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3261
3262 // latin-1 (direct)
05392dc8
VZ
3263 if ( srcLen == wxNO_LEN )
3264 srcLen = wxWcslen(src) + 1;
dccce9ea 3265
05392dc8 3266 if ( dst )
f1339c56 3267 {
05392dc8
VZ
3268 if ( dstLen < srcLen )
3269 return wxCONV_FAILED;
1cd52418 3270
05392dc8 3271 for ( size_t n = 0; n < srcLen; n++ )
24642831 3272 {
05392dc8 3273 if ( src[n] > 0xFF )
467e0479 3274 return wxCONV_FAILED;
ef199164 3275
05392dc8 3276 dst[n] = (char)src[n];
24642831 3277 }
05392dc8 3278
24642831 3279 }
05392dc8 3280 else // still need to check the input validity
24642831 3281 {
05392dc8 3282 for ( size_t n = 0; n < srcLen; n++ )
24642831 3283 {
05392dc8 3284 if ( src[n] > 0xFF )
467e0479 3285 return wxCONV_FAILED;
24642831 3286 }
f1339c56 3287 }
dccce9ea 3288
05392dc8 3289 return srcLen;
6001e347
RR
3290}
3291
7ef3ab50 3292size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3293{
3294 CreateConvIfNeeded();
3295
3296 if ( m_convReal )
3297 {
7ef3ab50 3298 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3299 }
3300
ba98e032 3301 // otherwise, we are ISO-8859-1
c1464d9d 3302 return 1;
eec47cc6
VZ
3303}
3304
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// Return true if the real converter reports itself as being UTF-8.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without a real converter we are ISO-8859-1, i.e. definitely not UTF-8
    return m_convReal && m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3319
69c928ef
VZ
3320
3321#if wxUSE_UNICODE
3322
3323wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3324{
3325 if ( !s )
3326 return wxWCharBuffer();
3327
3328 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3329 if ( !wbuf )
5487ff0f 3330 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3331 if ( !wbuf )
3332 wbuf = wxConvISO8859_1.cMB2WX(s);
3333
3334 return wbuf;
3335}
3336
3337wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3338{
3339 if ( !ws )
3340 return wxCharBuffer();
3341
3342 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3343 if ( !buf )
3344 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3345
3346 return buf;
3347}
3348
3349#endif // wxUSE_UNICODE
f5a1953b 3350
1e50d914
VS
3351// ----------------------------------------------------------------------------
3352// globals
3353// ----------------------------------------------------------------------------
3354
3355// NB: The reason why we create converter objects in this convoluted way,
3356// using a factory function instead of global variable, is that they
3357// may be used at static initialization time (some of them are used by
3358// wxString ctors and there may be a global wxString object). In other
3359// words, possibly _before_ the converter global object would be
3360// initialized.
3361
3362#undef wxConvLibc
3363#undef wxConvUTF8
3364#undef wxConvUTF7
3365#undef wxConvLocal
3366#undef wxConvISO8859_1
3367
// Define the global converter called "name".
//
// This expands to:
//  - the exported pointer name##Ptr of type klass*, initialized to NULL
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args
//  - a file-static pointer initialized by calling the accessor, which forces
//    the creation of the object during static initialization
//
// The invocation must be followed by a semicolon, which terminates the last
// declaration.
3368#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3369 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3370 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3371 { \
3372 static impl_klass name##Obj ctor_args; \
3373 return &name##Obj; \
3374 } \
3375 /* this ensures that all global converter objects are created */ \
3376 /* by the time static initialization is done, i.e. before any */ \
3377 /* thread is launched: */ \
3378 static klass* gs_##name##instance = wxGet_##name##Ptr()
3379
// Shorthand for WX_DEFINE_GLOBAL_CONV2 for the case when the class used for
// the implementation is the same as the class of the exported pointer.
3380#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3381 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3382
5c69ef61
VZ
3383#ifdef __INTELC__
3384 // disable warning "variable 'xxx' was declared but never referenced"
3385 #pragma warning(disable: 177)
3386#endif // Intel C++
3387
1e50d914
VS
// Define wxConvLibc: it is always accessed as a wxMBConv but is implemented
// by wxMBConv_win32 under Windows and by wxMBConvLibc elsewhere. The
// CoreFoundation-based branch is disabled by "#elif 0".
//
// NOTE(review): wxEMPTY_PARAMETER_VALUE presumably expands to nothing so that
// the object is default-constructed -- confirm against its definition.
3388#ifdef __WINDOWS__
3389 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3390#elif 0 // defined(__WXOSX__)
3391 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3392#else
3393 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3394#endif
3395
e1079eda
VZ
// Define the global UTF-8 and UTF-7 converters, both constructed without any
// arguments.
//
3396// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3397// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3398// provokes an error message about "not enough macro parameters"; and we
3399// can't use "()" here as the name##Obj declaration would be parsed as a
3400// function declaration then, so use a semicolon and live with an extra
3401// empty statement (and hope that no compiler warns about this)
3402WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3403WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3404
// wxCSConv-based global converters: wxConvLocal is created with
// wxFONTENCODING_SYSTEM (which wxCSConv replaces with the default encoding
// of this system when no charset name is given) and wxConvISO8859_1 with
// wxFONTENCODING_ISO8859_1.
3405WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3406WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3407
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one.
3408WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3409WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3410
6ac84a78
DE
// wxConvFileName: under Darwin it points to a dedicated CoreFoundation-based
// UTF-8 converter object, on all the other platforms to the wxConvLibc one.
3411#ifdef __DARWIN__
3412// The xnu kernel always communicates file paths in decomposed UTF-8.
3413// WARNING: Are we sure that CFString's conversion will cause decomposition?
3414static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3415#endif
6ac84a78 3416
1e50d914 3417WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3418#ifdef __DARWIN__
1e50d914 3419 &wxConvMacUTF8DObj;
6ac84a78 3420#else // !__DARWIN__
1e50d914 3421 wxGet_wxConvLibcPtr();
6ac84a78 3422#endif // __DARWIN__/!__DARWIN__
1e50d914 3423
bde4baac
VZ
3424#else // !wxUSE_WCHAR_T
3425
1e50d914 3426// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3427// stand-ins in absence of wchar_t
// In the build without wchar_t support the global converters are defined as
// plain wxMBConv objects rather than being created by the accessor functions
// used in the wxUSE_WCHAR_T branch.
3428WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3429 wxConvISO8859_1,
3430 wxConvLocal,
3431 wxConvUTF8;
3432
3433#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T