]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
added find performance test (see #9870) and the possibility to set the number of...
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
b040e242 47#ifdef HAVE_ICONV
373658eb 48 #include <iconv.h>
b1d547eb 49 #include "wx/thread.h"
1cd52418 50#endif
1cd52418 51
373658eb
VZ
52#include "wx/encconv.h"
53#include "wx/fontmap.h"
54
5c4ed98d 55#ifdef __DARWIN__
c933e267 56#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
57#endif //def __DARWIN__
58
ef199164 59
ce6f8d6f
VZ
60#define TRACE_STRCONV _T("strconv")
61
467e0479
VZ
62// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63// be 4 bytes
4948c2b6 64#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
65 #define WC_UTF16
66#endif
67
ef199164 68
373658eb
VZ
69// ============================================================================
70// implementation
71// ============================================================================
72
69373110
VZ
// helper function of ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL, i.e. false if they are all NULs (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; ++i )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
81
373658eb 82// ----------------------------------------------------------------------------
467e0479 83// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 84// ----------------------------------------------------------------------------
6001e347 85
c91830cb 86static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 87{
ef199164 88 if (input <= 0xffff)
4def3b35 89 {
999836aa
VZ
90 if (output)
91 *output = (wxUint16) input;
ef199164 92
4def3b35 93 return 1;
dccce9ea 94 }
ef199164 95 else if (input >= 0x110000)
4def3b35 96 {
467e0479 97 return wxCONV_FAILED;
dccce9ea
VZ
98 }
99 else
4def3b35 100 {
dccce9ea 101 if (output)
4def3b35 102 {
ef199164
DS
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 105 }
ef199164 106
4def3b35 107 return 2;
1cd52418 108 }
1cd52418
OK
109}
110
c91830cb 111static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 112{
ef199164 113 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
114 {
115 output = *input;
116 return 1;
dccce9ea 117 }
ef199164 118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
119 {
120 output = *input;
467e0479 121 return wxCONV_FAILED;
dccce9ea
VZ
122 }
123 else
4def3b35
VS
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
1cd52418
OK
128}
129
467e0479 130#ifdef WC_UTF16
35d11700
VZ
131 typedef wchar_t wxDecodeSurrogate_t;
132#else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
135
136// returns the next UTF-32 character from the wchar_t buffer and advances the
137// pointer to the character after this one
138//
139// if an invalid character is found, *pSrc is set to NULL, the caller must
140// check for this
35d11700 141static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
142{
143 wxUint32 out;
8d3dd069 144 const size_t
5c33522f 145 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152}
153
f6bcfd97 154// ----------------------------------------------------------------------------
6001e347 155// wxMBConv
f6bcfd97 156// ----------------------------------------------------------------------------
2c53a80a 157
483b0434
VZ
158size_t
159wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
6001e347 161{
483b0434 162 // although new conversion classes are supposed to implement this function
36f93678 163 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
36f93678
VZ
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
36f93678
VZ
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complication come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
483b0434 227 for ( ;; )
eec47cc6 228 {
c1464d9d 229 // try to convert the current chunk
483b0434 230 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
e4e3bbb4 233
483b0434 234 dstWritten += lenChunk;
f6a02087
VZ
235 if ( !srcEnd )
236 dstWritten++;
f5fb6871 237
f6a02087 238 if ( !lenChunk )
467e0479
VZ
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
483b0434
VZ
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
f6a02087
VZ
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
f6a02087
VZ
254 if ( !srcEnd )
255 dst++;
483b0434 256 }
c1464d9d 257
483b0434 258 if ( !srcEnd )
c1464d9d 259 {
467e0479
VZ
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow
c1464d9d
VZ
262 break;
263 }
eec47cc6
VZ
264
265 // advance the input pointer past the end of this chunk
483b0434 266 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
483b0434 272 src += nulLen;
c1464d9d 273 }
e4e3bbb4 274
483b0434 275 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
276
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
36f93678 279 // delimited by srcEnd
483b0434 280 if ( src >= srcEnd )
c1464d9d 281 break;
36f93678
VZ
282
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
286 if ( srcEnd )
287 {
288 dstWritten++;
289 if ( dst )
290 dst++;
291 }
c1464d9d
VZ
292 }
293
483b0434 294 return dstWritten;
e4e3bbb4
RN
295}
296
483b0434
VZ
297size_t
298wxMBConv::FromWChar(char *dst, size_t dstLen,
299 const wchar_t *src, size_t srcLen) const
e4e3bbb4 300{
483b0434
VZ
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten = 0;
e4e3bbb4 303
f6a02087
VZ
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated = srcLen == wxNO_LEN;
308
eec47cc6
VZ
309 // make a copy of the input string unless it is already properly
310 // NUL-terminated
eec47cc6 311 wxWCharBuffer bufTmp;
f6a02087 312 if ( isNulTerminated )
e4e3bbb4 313 {
483b0434 314 srcLen = wxWcslen(src) + 1;
eec47cc6 315 }
483b0434 316 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
317 {
318 // make a copy in order to properly NUL-terminate the string
483b0434 319 bufTmp = wxWCharBuffer(srcLen);
ef199164 320 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
321 src = bufTmp;
322 }
323
324 const size_t lenNul = GetMBNulLen();
325 for ( const wchar_t * const srcEnd = src + srcLen;
326 src < srcEnd;
327 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
328 {
329 // try to convert the current chunk
330 size_t lenChunk = WC2MB(NULL, src, 0);
331
332 if ( lenChunk == wxCONV_FAILED )
333 return wxCONV_FAILED;
334
483b0434 335 dstWritten += lenChunk;
c45fad9a 336 if ( src+lenChunk < srcEnd || isNulTerminated )
f6a02087 337 dstWritten += lenNul;
483b0434
VZ
338
339 if ( dst )
340 {
341 if ( dstWritten > dstLen )
342 return wxCONV_FAILED;
343
f6a02087 344 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
345 return wxCONV_FAILED;
346
347 dst += lenChunk;
c45fad9a 348 if ( src+lenChunk < srcEnd || isNulTerminated )
f6a02087 349 dst += lenNul;
483b0434 350 }
eec47cc6 351 }
e4e3bbb4 352
483b0434
VZ
353 return dstWritten;
354}
355
ef199164 356size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 357{
51725fc0 358 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 359 if ( rc != wxCONV_FAILED )
509da451
VZ
360 {
361 // ToWChar() returns the buffer length, i.e. including the trailing
362 // NUL, while this method doesn't take it into account
363 rc--;
364 }
365
366 return rc;
367}
368
ef199164 369size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 370{
51725fc0 371 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 372 if ( rc != wxCONV_FAILED )
509da451 373 {
51725fc0 374 rc -= GetMBNulLen();
509da451
VZ
375 }
376
377 return rc;
378}
379
483b0434
VZ
380wxMBConv::~wxMBConv()
381{
382 // nothing to do here (necessary for Darwin linking probably)
383}
e4e3bbb4 384
483b0434
VZ
385const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
386{
387 if ( psz )
eec47cc6 388 {
483b0434 389 // calculate the length of the buffer needed first
a2db25a1 390 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 391 if ( nLen != wxCONV_FAILED )
f5fb6871 392 {
483b0434 393 // now do the actual conversion
a2db25a1 394 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 395
483b0434 396 // +1 for the trailing NULL
a2db25a1 397 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 398 return buf;
f5fb6871 399 }
483b0434 400 }
e4e3bbb4 401
483b0434
VZ
402 return wxWCharBuffer();
403}
3698ae71 404
483b0434
VZ
405const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
406{
407 if ( pwz )
408 {
a2db25a1 409 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 410 if ( nLen != wxCONV_FAILED )
483b0434 411 {
a2db25a1
VZ
412 wxCharBuffer buf(nLen - 1);
413 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
414 return buf;
415 }
416 }
417
418 return wxCharBuffer();
419}
e4e3bbb4 420
483b0434 421const wxWCharBuffer
ef199164 422wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 423{
ef199164 424 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 425 if ( dstLen != wxCONV_FAILED )
483b0434 426 {
0dd13d21
VZ
427 // notice that we allocate space for dstLen+1 wide characters here
428 // because we want the buffer to always be NUL-terminated, even if the
429 // input isn't (as otherwise the caller has no way to know its length)
430 wxWCharBuffer wbuf(dstLen);
f6a02087 431 wbuf.data()[dstLen] = L'\0';
ef199164 432 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
433 {
434 if ( outLen )
467e0479
VZ
435 {
436 *outLen = dstLen;
f6a02087
VZ
437
438 // we also need to handle NUL-terminated input strings
439 // specially: for them the output is the length of the string
440 // excluding the trailing NUL, however if we're asked to
441 // convert a specific number of characters we return the length
442 // of the resulting output even if it's NUL-terminated
443 if ( inLen == wxNO_LEN )
467e0479
VZ
444 (*outLen)--;
445 }
446
483b0434
VZ
447 return wbuf;
448 }
449 }
450
451 if ( outLen )
452 *outLen = 0;
453
454 return wxWCharBuffer();
455}
456
457const wxCharBuffer
ef199164 458wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 459{
13d92ad6 460 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 461 if ( dstLen != wxCONV_FAILED )
483b0434 462 {
0dd13d21
VZ
463 const size_t nulLen = GetMBNulLen();
464
465 // as above, ensure that the buffer is always NUL-terminated, even if
466 // the input is not
467 wxCharBuffer buf(dstLen + nulLen - 1);
468 memset(buf.data() + dstLen, 0, nulLen);
ef199164 469 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
470 {
471 if ( outLen )
467e0479
VZ
472 {
473 *outLen = dstLen;
474
f6a02087 475 if ( inLen == wxNO_LEN )
467e0479 476 {
f6a02087
VZ
477 // in this case both input and output are NUL-terminated
478 // and we're not supposed to count NUL
13d92ad6 479 *outLen -= nulLen;
467e0479
VZ
480 }
481 }
d32a507d 482
483b0434
VZ
483 return buf;
484 }
e4e3bbb4
RN
485 }
486
eec47cc6
VZ
487 if ( outLen )
488 *outLen = 0;
489
490 return wxCharBuffer();
e4e3bbb4
RN
491}
492
6001e347 493// ----------------------------------------------------------------------------
bde4baac 494// wxMBConvLibc
6001e347
RR
495// ----------------------------------------------------------------------------
496
bde4baac
VZ
497size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
498{
499 return wxMB2WC(buf, psz, n);
500}
501
502size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
503{
504 return wxWC2MB(buf, psz, n);
505}
e1bfe89e
RR
506
507// ----------------------------------------------------------------------------
532d575b 508// wxConvBrokenFileNames
e1bfe89e
RR
509// ----------------------------------------------------------------------------
510
eec47cc6
VZ
511#ifdef __UNIX__
512
86501081 513wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 514{
86501081
VS
515 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
516 wxStricmp(charset, _T("UTF8")) == 0 )
5deedd6e 517 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
518 else
519 m_conv = new wxCSConv(charset);
ea8ce907
RR
520}
521
eec47cc6 522#endif // __UNIX__
c12b7f79 523
bde4baac 524// ----------------------------------------------------------------------------
3698ae71 525// UTF-7
bde4baac 526// ----------------------------------------------------------------------------
6001e347 527
15f2ee32 528// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
529//
530// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 531
15f2ee32
RN
532//
533// BASE64 decoding table
534//
// maps a byte to the 6 bit value it encodes in (modified) BASE64 or to 0xff
// if it is not a BASE64 character: 'A'-'Z' map to 0x00-0x19, 'a'-'z' to
// 0x1a-0x33, '0'-'9' to 0x34-0x3d, '+' to 0x3e and '/' to 0x3f
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
570
9d653e81
VZ
571size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
572 const char *src, size_t srcLen) const
15f2ee32 573{
9d653e81 574 DecoderState stateOrig,
852dcba5 575 *statePtr;
9d653e81
VZ
576 if ( srcLen == wxNO_LEN )
577 {
578 // convert the entire string, up to and including the trailing NUL
579 srcLen = strlen(src) + 1;
580
581 // when working on the entire strings we don't update nor use the shift
582 // state from the previous call
583 statePtr = &stateOrig;
584 }
585 else // when working with partial strings we do use the shift state
586 {
5c33522f 587 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
588
589 // also save the old state to be able to rollback to it on error
590 stateOrig = m_stateDecoder;
591 }
592
593 // but to simplify the code below we use this variable in both cases
594 DecoderState& state = *statePtr;
595
596
597 // number of characters [which would have been] written to dst [if it were
598 // not NULL]
15f2ee32
RN
599 size_t len = 0;
600
9d653e81
VZ
601 const char * const srcEnd = src + srcLen;
602
603 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 604 {
9d653e81
VZ
605 const unsigned char cc = *src++;
606
607 if ( state.IsShifted() )
15f2ee32 608 {
9d653e81
VZ
609 const unsigned char dc = utf7unb64[cc];
610 if ( dc == 0xff )
15f2ee32 611 {
ccaa848d
VZ
612 // end of encoded part, check that nothing was left: there can
613 // be up to 4 bits of 0 padding but nothing else (we also need
614 // to check isLSB as we count bits modulo 8 while a valid UTF-7
615 // encoded sequence must contain an integral number of UTF-16
616 // characters)
617 if ( state.isLSB || state.bit > 4 ||
618 (state.accum & ((1 << state.bit) - 1)) )
619 {
620 if ( !len )
621 state = stateOrig;
622
852dcba5 623 return wxCONV_FAILED;
ccaa848d 624 }
852dcba5 625
9d653e81
VZ
626 state.ToDirect();
627
628 // re-parse this character normally below unless it's '-' which
629 // is consumed by the decoder
630 if ( cc == '-' )
631 continue;
632 }
633 else // valid encoded character
634 {
635 // mini base64 decoder: each character is 6 bits
636 state.bit += 6;
637 state.accum <<= 6;
638 state.accum += dc;
639
640 if ( state.bit >= 8 )
15f2ee32 641 {
9d653e81
VZ
642 // got the full byte, consume it
643 state.bit -= 8;
644 unsigned char b = (state.accum >> state.bit) & 0x00ff;
645
646 if ( state.isLSB )
15f2ee32 647 {
9d653e81
VZ
648 // we've got the full word, output it
649 if ( dst )
650 *dst++ = (state.msb << 8) | b;
651 len++;
652 state.isLSB = false;
15f2ee32 653 }
9d653e81 654 else // MSB
04a37834 655 {
9d653e81
VZ
656 // just store it while we wait for LSB
657 state.msb = b;
658 state.isLSB = true;
04a37834 659 }
15f2ee32
RN
660 }
661 }
9d653e81 662 }
04a37834 663
9d653e81
VZ
664 if ( state.IsDirect() )
665 {
666 // start of an encoded segment?
667 if ( cc == '+' )
04a37834 668 {
9d653e81
VZ
669 if ( *src == '-' )
670 {
671 // just the encoded plus sign, don't switch to shifted mode
672 if ( dst )
673 *dst++ = '+';
674 len++;
675 src++;
676 }
ccaa848d
VZ
677 else if ( utf7unb64[(unsigned)*src] == 0xff )
678 {
679 // empty encoded chunks are not allowed
680 if ( !len )
681 state = stateOrig;
682
683 return wxCONV_FAILED;
684 }
685 else // base-64 encoded chunk follows
9d653e81
VZ
686 {
687 state.ToShifted();
688 }
689 }
690 else // not '+'
691 {
692 // only printable 7 bit ASCII characters (with the exception of
693 // NUL, TAB, CR and LF) can be used directly
694 if ( cc >= 0x7f || (cc < ' ' &&
695 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
696 return wxCONV_FAILED;
697
698 if ( dst )
699 *dst++ = cc;
700 len++;
701 }
15f2ee32
RN
702 }
703 }
04a37834 704
9d653e81
VZ
705 if ( !len )
706 {
707 // as we didn't read any characters we should be called with the same
708 // data (followed by some more new data) again later so don't save our
709 // state
710 state = stateOrig;
711
712 return wxCONV_FAILED;
713 }
04a37834 714
15f2ee32 715 return len;
6001e347
RR
716}
717
15f2ee32
RN
718//
719// BASE64 encoding table
720//
// maps a 6 bit value to the BASE64 character encoding it
static const unsigned char utf7enb64[] =
{
    // 0..25: upper case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51: lower case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61: digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
732
733//
734// UTF-7 encoding table
735//
736// 0 - Set D (directly encoded characters)
737// 1 - Set O (optional direct characters)
738// 2 - whitespace characters (optional)
739// 3 - special characters
740//
// classification of the 7 bit ASCII characters for UTF-7 encoding purposes:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// returns true if the character is in the set of the characters which are
// always encoded directly, i.e. if its entry in the table above is 0
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t may be a signed type, so convert it to an unsigned one before
    // comparing: a negative value would otherwise pass the range check and
    // be used as a negative index into the table
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
757
758size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
759 const wchar_t *src, size_t srcLen) const
15f2ee32 760{
9d653e81
VZ
761 EncoderState stateOrig,
762 *statePtr;
763 if ( srcLen == wxNO_LEN )
764 {
765 // we don't apply the stored state when operating on entire strings at
766 // once
767 statePtr = &stateOrig;
768
769 srcLen = wxWcslen(src) + 1;
770 }
771 else // do use the mode we left the output in previously
772 {
773 stateOrig = m_stateEncoder;
5c33522f 774 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
775 }
776
777 EncoderState& state = *statePtr;
778
779
15f2ee32
RN
780 size_t len = 0;
781
9d653e81
VZ
782 const wchar_t * const srcEnd = src + srcLen;
783 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 784 {
9d653e81
VZ
785 wchar_t cc = *src++;
786 if ( wxIsUTF7Direct(cc) )
15f2ee32 787 {
9d653e81
VZ
788 if ( state.IsShifted() )
789 {
790 // pad with zeros the last encoded block if necessary
791 if ( state.bit )
792 {
793 if ( dst )
794 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
795 len++;
796 }
ef199164 797
9d653e81
VZ
798 state.ToDirect();
799
800 if ( dst )
801 *dst++ = '-';
802 len++;
803 }
804
805 if ( dst )
806 *dst++ = (char)cc;
15f2ee32
RN
807 len++;
808 }
9d653e81
VZ
809 else if ( cc == '+' && state.IsDirect() )
810 {
811 if ( dst )
812 {
813 *dst++ = '+';
814 *dst++ = '-';
815 }
816
817 len += 2;
818 }
15f2ee32 819#ifndef WC_UTF16
79c78d42 820 else if (((wxUint32)cc) > 0xffff)
b2c13097 821 {
15f2ee32 822 // no surrogate pair generation (yet?)
467e0479 823 return wxCONV_FAILED;
15f2ee32
RN
824 }
825#endif
826 else
827 {
9d653e81
VZ
828 if ( state.IsDirect() )
829 {
830 state.ToShifted();
ef199164 831
9d653e81
VZ
832 if ( dst )
833 *dst++ = '+';
834 len++;
835 }
836
837 // BASE64 encode string
838 for ( ;; )
15f2ee32 839 {
9d653e81 840 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 841 {
9d653e81
VZ
842 state.accum <<= 8;
843 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
844
845 for (state.bit += 8; state.bit >= 6; )
15f2ee32 846 {
9d653e81
VZ
847 state.bit -= 6;
848 if ( dst )
849 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
850 len++;
15f2ee32 851 }
15f2ee32 852 }
ef199164 853
9d653e81
VZ
854 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
855 break;
ef199164 856
9d653e81 857 src++;
15f2ee32 858 }
15f2ee32
RN
859 }
860 }
ef199164 861
9d653e81
VZ
862 // we need to restore the original encoder state if we were called just to
863 // calculate the amount of space needed as we will presumably be called
864 // again to really convert the data now
865 if ( !dst )
866 state = stateOrig;
ef199164 867
15f2ee32 868 return len;
6001e347
RR
869}
870
f6bcfd97 871// ----------------------------------------------------------------------------
6001e347 872// UTF-8
f6bcfd97 873// ----------------------------------------------------------------------------
6001e347 874
1774c3c5 875static const wxUint32 utf8_max[]=
4def3b35 876 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 877
3698ae71
VZ
878// boundaries of the private use area we use to (temporarily) remap invalid
879// characters invalid in a UTF-8 encoded string
ea8ce907
RR
880const wxUint32 wxUnicodePUA = 0x100000;
881const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
882
0286d08d 883// this table gives the length of the UTF-8 encoding from its first character:
// length of the UTF-8 sequence starting with the given byte or 0 if this byte
// can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded as a single byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // 80..C1: invalid as the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // C2..DF: lead bytes of two-byte sequences
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // E0..EF: lead bytes of three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // F0..F4: lead bytes of four-byte sequences
    4, 4, 4, 4, 4,                                   // F0..F4

    // F5..FF: invalid again (5- or 6-byte sequences and sequences for code
    // points above U+10FFFF, as restricted by RFC 3629)
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
917
918size_t
919wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
920 const char *src, size_t srcLen) const
921{
922 wchar_t *out = dstLen ? dst : NULL;
923 size_t written = 0;
924
925 if ( srcLen == wxNO_LEN )
926 srcLen = strlen(src) + 1;
927
928 for ( const char *p = src; ; p++ )
929 {
930 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
931 {
932 // all done successfully, just add the trailing NULL if we are not
933 // using explicit length
934 if ( srcLen == wxNO_LEN )
935 {
936 if ( out )
937 {
938 if ( !dstLen )
939 break;
940
941 *out = L'\0';
942 }
943
944 written++;
945 }
946
947 return written;
948 }
949
0286d08d
VZ
950 if ( out && !dstLen-- )
951 break;
952
5367a38a
VS
953 wxUint32 code;
954 unsigned char c = *p;
0286d08d 955
5367a38a
VS
956 if ( c < 0x80 )
957 {
958 if ( srcLen == 0 ) // the test works for wxNO_LEN too
959 break;
0286d08d 960
5367a38a
VS
961 if ( srcLen != wxNO_LEN )
962 srcLen--;
0286d08d 963
5367a38a
VS
964 code = c;
965 }
966 else
0286d08d 967 {
5367a38a
VS
968 unsigned len = tableUtf8Lengths[c];
969 if ( !len )
970 break;
971
972 if ( srcLen < len ) // the test works for wxNO_LEN too
973 break;
974
975 if ( srcLen != wxNO_LEN )
976 srcLen -= len;
977
978 // Char. number range | UTF-8 octet sequence
979 // (hexadecimal) | (binary)
980 // ----------------------+----------------------------------------
981 // 0000 0000 - 0000 007F | 0xxxxxxx
982 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
983 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
984 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
985 //
986 // Code point value is stored in bits marked with 'x',
987 // lowest-order bit of the value on the right side in the diagram
988 // above. (from RFC 3629)
989
990 // mask to extract lead byte's value ('x' bits above), by sequence
991 // length:
992 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
993
994 // mask and value of lead byte's most significant bits, by length:
995 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
996 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
997
998 len--; // it's more convenient to work with 0-based length here
999
1000 // extract the lead byte's value bits:
1001 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1002 break;
1003
1004 code = c & leadValueMask[len];
1005
1006 // all remaining bytes, if any, are handled in the same way
1007 // regardless of sequence's length:
1008 for ( ; len; --len )
1009 {
1010 c = *++p;
1011 if ( (c & 0xC0) != 0x80 )
1012 return wxCONV_FAILED;
0286d08d 1013
5367a38a
VS
1014 code <<= 6;
1015 code |= c & 0x3F;
1016 }
0286d08d
VZ
1017 }
1018
1019#ifdef WC_UTF16
1020 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1021 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1022 {
1023 if ( out )
1024 out++;
1025 written++;
1026 }
1027#else // !WC_UTF16
1028 if ( out )
1029 *out = code;
1030#endif // WC_UTF16/!WC_UTF16
1031
1032 if ( out )
1033 out++;
1034
1035 written++;
1036 }
1037
1038 return wxCONV_FAILED;
1039}
1040
1041size_t
1042wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1043 const wchar_t *src, size_t srcLen) const
1044{
1045 char *out = dstLen ? dst : NULL;
1046 size_t written = 0;
1047
1048 for ( const wchar_t *wp = src; ; wp++ )
1049 {
a964d3ed 1050 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
0286d08d
VZ
1051 {
1052 // all done successfully, just add the trailing NULL if we are not
1053 // using explicit length
1054 if ( srcLen == wxNO_LEN )
1055 {
1056 if ( out )
1057 {
1058 if ( !dstLen )
1059 break;
1060
1061 *out = '\0';
1062 }
1063
1064 written++;
1065 }
1066
1067 return written;
1068 }
1069
a964d3ed
VZ
1070 if ( srcLen != wxNO_LEN )
1071 srcLen--;
0286d08d
VZ
1072
1073 wxUint32 code;
1074#ifdef WC_UTF16
1075 // cast is ok for WC_UTF16
1076 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1077 {
1078 // skip the next char too as we decoded a surrogate
1079 wp++;
1080 }
1081#else // wchar_t is UTF-32
1082 code = *wp & 0x7fffffff;
1083#endif
1084
1085 unsigned len;
1086 if ( code <= 0x7F )
1087 {
1088 len = 1;
1089 if ( out )
1090 {
1091 if ( dstLen < len )
1092 break;
1093
1094 out[0] = (char)code;
1095 }
1096 }
1097 else if ( code <= 0x07FF )
1098 {
1099 len = 2;
1100 if ( out )
1101 {
1102 if ( dstLen < len )
1103 break;
1104
1105 // NB: this line takes 6 least significant bits, encodes them as
1106 // 10xxxxxx and discards them so that the next byte can be encoded:
1107 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1108 out[0] = 0xC0 | code;
1109 }
1110 }
1111 else if ( code < 0xFFFF )
1112 {
1113 len = 3;
1114 if ( out )
1115 {
1116 if ( dstLen < len )
1117 break;
1118
1119 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1120 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1121 out[0] = 0xE0 | code;
1122 }
1123 }
1124 else if ( code <= 0x10FFFF )
1125 {
1126 len = 4;
1127 if ( out )
1128 {
1129 if ( dstLen < len )
1130 break;
1131
1132 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1133 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1134 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1135 out[0] = 0xF0 | code;
1136 }
1137 }
1138 else
1139 {
1140 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1141 break;
1142 }
1143
1144 if ( out )
1145 {
1146 out += len;
1147 dstLen -= len;
1148 }
1149
1150 written += len;
1151 }
1152
1153 // we only get here if an error occurs during decoding
1154 return wxCONV_FAILED;
1155}
1156
d16d0917
VZ
1157size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1158 const char *psz, size_t srcLen) const
6001e347 1159{
0286d08d 1160 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1161 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1162
4def3b35
VS
1163 size_t len = 0;
1164
d16d0917 1165 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1166 {
ea8ce907
RR
1167 const char *opsz = psz;
1168 bool invalid = false;
4def3b35
VS
1169 unsigned char cc = *psz++, fc = cc;
1170 unsigned cnt;
dccce9ea 1171 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1172 fc <<= 1;
ef199164 1173
dccce9ea 1174 if (!cnt)
4def3b35
VS
1175 {
1176 // plain ASCII char
dccce9ea 1177 if (buf)
4def3b35
VS
1178 *buf++ = cc;
1179 len++;
561488ef
MW
1180
1181 // escape the escape character for octal escapes
1182 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1183 && cc == '\\' && (!buf || len < n))
1184 {
1185 if (buf)
1186 *buf++ = cc;
1187 len++;
1188 }
dccce9ea
VZ
1189 }
1190 else
4def3b35
VS
1191 {
1192 cnt--;
dccce9ea 1193 if (!cnt)
4def3b35
VS
1194 {
1195 // invalid UTF-8 sequence
ea8ce907 1196 invalid = true;
dccce9ea
VZ
1197 }
1198 else
4def3b35
VS
1199 {
1200 unsigned ocnt = cnt - 1;
1201 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1202 while (cnt--)
4def3b35 1203 {
ea8ce907 1204 cc = *psz;
dccce9ea 1205 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1206 {
1207 // invalid UTF-8 sequence
ea8ce907
RR
1208 invalid = true;
1209 break;
4def3b35 1210 }
ef199164 1211
ea8ce907 1212 psz++;
4def3b35
VS
1213 res = (res << 6) | (cc & 0x3f);
1214 }
ef199164 1215
ea8ce907 1216 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1217 {
1218 // illegal UTF-8 encoding
ea8ce907 1219 invalid = true;
4def3b35 1220 }
ea8ce907
RR
1221 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1222 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1223 {
1224 // if one of our PUA characters turns up externally
1225 // it must also be treated as an illegal sequence
1226 // (a bit like you have to escape an escape character)
1227 invalid = true;
1228 }
1229 else
1230 {
1cd52418 1231#ifdef WC_UTF16
0286d08d 1232 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1233 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1234 if (pa == wxCONV_FAILED)
ea8ce907
RR
1235 {
1236 invalid = true;
1237 }
1238 else
1239 {
1240 if (buf)
1241 buf += pa;
1242 len += pa;
1243 }
373658eb 1244#else // !WC_UTF16
ea8ce907 1245 if (buf)
38d4b1e4 1246 *buf++ = (wchar_t)res;
ea8ce907 1247 len++;
373658eb 1248#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1249 }
1250 }
ef199164 1251
ea8ce907
RR
1252 if (invalid)
1253 {
1254 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1255 {
1256 while (opsz < psz && (!buf || len < n))
1257 {
1258#ifdef WC_UTF16
1259 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1260 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1261 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1262 if (buf)
1263 buf += pa;
1264 opsz++;
1265 len += pa;
1266#else
1267 if (buf)
38d4b1e4 1268 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1269 opsz++;
1270 len++;
1271#endif
1272 }
1273 }
3698ae71 1274 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1275 {
1276 while (opsz < psz && (!buf || len < n))
1277 {
3698ae71
VZ
1278 if ( buf && len + 3 < n )
1279 {
17a1ebd1 1280 unsigned char on = *opsz;
3698ae71 1281 *buf++ = L'\\';
17a1ebd1
VZ
1282 *buf++ = (wchar_t)( L'0' + on / 0100 );
1283 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1284 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1285 }
ef199164 1286
ea8ce907
RR
1287 opsz++;
1288 len += 4;
1289 }
1290 }
3698ae71 1291 else // MAP_INVALID_UTF8_NOT
ea8ce907 1292 {
467e0479 1293 return wxCONV_FAILED;
ea8ce907 1294 }
4def3b35
VS
1295 }
1296 }
6001e347 1297 }
ef199164 1298
d16d0917 1299 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1300 *buf = 0;
ef199164 1301
d16d0917 1302 return len + 1;
6001e347
RR
1303}
1304
3698ae71
VZ
// Return true if the given wide character is one of the octal digits, i.e. is
// in the range from L'0' to L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1309
d16d0917
VZ
1310size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1311 const wchar_t *psz, size_t srcLen) const
6001e347 1312{
0286d08d 1313 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1314 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1315
4def3b35 1316 size_t len = 0;
6001e347 1317
d16d0917 1318 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1319 {
1320 wxUint32 cc;
ef199164 1321
1cd52418 1322#ifdef WC_UTF16
b5153fd8
VZ
1323 // cast is ok for WC_UTF16
1324 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1325 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1326#else
ef199164 1327 cc = (*psz++) & 0x7fffffff;
4def3b35 1328#endif
3698ae71
VZ
1329
1330 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1331 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1332 {
dccce9ea 1333 if (buf)
ea8ce907 1334 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1335 len++;
3698ae71 1336 }
561488ef
MW
1337 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1338 && cc == L'\\' && psz[0] == L'\\' )
1339 {
1340 if (buf)
1341 *buf++ = (char)cc;
1342 psz++;
1343 len++;
1344 }
3698ae71
VZ
1345 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1346 cc == L'\\' &&
1347 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1348 {
dccce9ea 1349 if (buf)
3698ae71 1350 {
ef199164
DS
1351 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1352 (psz[1] - L'0') * 010 +
b2c13097 1353 (psz[2] - L'0'));
3698ae71
VZ
1354 }
1355
1356 psz += 3;
ea8ce907
RR
1357 len++;
1358 }
1359 else
1360 {
1361 unsigned cnt;
ef199164
DS
1362 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1363 {
1364 }
1365
ea8ce907 1366 if (!cnt)
4def3b35 1367 {
ea8ce907
RR
1368 // plain ASCII char
1369 if (buf)
1370 *buf++ = (char) cc;
1371 len++;
1372 }
ea8ce907
RR
1373 else
1374 {
1375 len += cnt + 1;
1376 if (buf)
1377 {
1378 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1379 while (cnt--)
1380 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1381 }
4def3b35
VS
1382 }
1383 }
6001e347 1384 }
4def3b35 1385
d16d0917 1386 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1387 *buf = 0;
adb45366 1388
d16d0917 1389 return len + 1;
6001e347
RR
1390}
1391
467e0479 1392// ============================================================================
c91830cb 1393// UTF-16
467e0479 1394// ============================================================================
c91830cb
VZ
1395
1396#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1397 #define wxMBConvUTF16straight wxMBConvUTF16BE
1398 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1399#else
bde4baac
VZ
1400 #define wxMBConvUTF16swap wxMBConvUTF16BE
1401 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1402#endif
1403
467e0479
VZ
1404/* static */
1405size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1406{
1407 if ( srcLen == wxNO_LEN )
1408 {
1409 // count the number of bytes in input, including the trailing NULs
5c33522f 1410 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1411 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1412 ;
c91830cb 1413
467e0479
VZ
1414 srcLen *= BYTES_PER_CHAR;
1415 }
1416 else // we already have the length
1417 {
1418 // we can only convert an entire number of UTF-16 characters
1419 if ( srcLen % BYTES_PER_CHAR )
1420 return wxCONV_FAILED;
1421 }
1422
1423 return srcLen;
1424}
1425
1426// case when in-memory representation is UTF-16 too
c91830cb
VZ
1427#ifdef WC_UTF16
1428
467e0479
VZ
1429// ----------------------------------------------------------------------------
1430// conversions without endianness change
1431// ----------------------------------------------------------------------------
1432
1433size_t
1434wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1435 const char *src, size_t srcLen) const
c91830cb 1436{
467e0479
VZ
1437 // set up the scene for using memcpy() (which is presumably more efficient
1438 // than copying the bytes one by one)
1439 srcLen = GetLength(src, srcLen);
1440 if ( srcLen == wxNO_LEN )
1441 return wxCONV_FAILED;
c91830cb 1442
ef199164 1443 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1444 if ( dst )
c91830cb 1445 {
467e0479
VZ
1446 if ( dstLen < inLen )
1447 return wxCONV_FAILED;
c91830cb 1448
467e0479 1449 memcpy(dst, src, srcLen);
c91830cb 1450 }
d32a507d 1451
467e0479 1452 return inLen;
c91830cb
VZ
1453}
1454
467e0479
VZ
1455size_t
1456wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1457 const wchar_t *src, size_t srcLen) const
c91830cb 1458{
467e0479
VZ
1459 if ( srcLen == wxNO_LEN )
1460 srcLen = wxWcslen(src) + 1;
c91830cb 1461
467e0479
VZ
1462 srcLen *= BYTES_PER_CHAR;
1463
1464 if ( dst )
c91830cb 1465 {
467e0479
VZ
1466 if ( dstLen < srcLen )
1467 return wxCONV_FAILED;
d32a507d 1468
467e0479 1469 memcpy(dst, src, srcLen);
c91830cb 1470 }
d32a507d 1471
467e0479 1472 return srcLen;
c91830cb
VZ
1473}
1474
467e0479
VZ
1475// ----------------------------------------------------------------------------
1476// endian-reversing conversions
1477// ----------------------------------------------------------------------------
c91830cb 1478
467e0479
VZ
1479size_t
1480wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1481 const char *src, size_t srcLen) const
c91830cb 1482{
467e0479
VZ
1483 srcLen = GetLength(src, srcLen);
1484 if ( srcLen == wxNO_LEN )
1485 return wxCONV_FAILED;
c91830cb 1486
467e0479
VZ
1487 srcLen /= BYTES_PER_CHAR;
1488
1489 if ( dst )
c91830cb 1490 {
467e0479
VZ
1491 if ( dstLen < srcLen )
1492 return wxCONV_FAILED;
1493
5c33522f 1494 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1495 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1496 {
ef199164 1497 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1498 }
c91830cb 1499 }
bfab25d4 1500
467e0479 1501 return srcLen;
c91830cb
VZ
1502}
1503
467e0479
VZ
1504size_t
1505wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1506 const wchar_t *src, size_t srcLen) const
c91830cb 1507{
467e0479
VZ
1508 if ( srcLen == wxNO_LEN )
1509 srcLen = wxWcslen(src) + 1;
c91830cb 1510
467e0479
VZ
1511 srcLen *= BYTES_PER_CHAR;
1512
1513 if ( dst )
c91830cb 1514 {
467e0479
VZ
1515 if ( dstLen < srcLen )
1516 return wxCONV_FAILED;
1517
5c33522f 1518 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1519 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1520 {
ef199164 1521 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1522 }
c91830cb 1523 }
eec47cc6 1524
467e0479 1525 return srcLen;
c91830cb
VZ
1526}
1527
467e0479 1528#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1529
467e0479
VZ
1530// ----------------------------------------------------------------------------
1531// conversions without endianness change
1532// ----------------------------------------------------------------------------
c91830cb 1533
35d11700
VZ
1534size_t
1535wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1536 const char *src, size_t srcLen) const
c91830cb 1537{
35d11700
VZ
1538 srcLen = GetLength(src, srcLen);
1539 if ( srcLen == wxNO_LEN )
1540 return wxCONV_FAILED;
c91830cb 1541
ef199164 1542 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1543 if ( !dst )
c91830cb 1544 {
35d11700
VZ
1545 // optimization: return maximal space which could be needed for this
1546 // string even if the real size could be smaller if the buffer contains
1547 // any surrogates
1548 return inLen;
c91830cb 1549 }
c91830cb 1550
35d11700 1551 size_t outLen = 0;
5c33522f 1552 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1553 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1554 {
ef199164
DS
1555 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1556 if ( !inBuff )
35d11700
VZ
1557 return wxCONV_FAILED;
1558
1559 if ( ++outLen > dstLen )
1560 return wxCONV_FAILED;
c91830cb 1561
35d11700
VZ
1562 *dst++ = ch;
1563 }
1564
1565
1566 return outLen;
1567}
c91830cb 1568
35d11700
VZ
1569size_t
1570wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1571 const wchar_t *src, size_t srcLen) const
c91830cb 1572{
35d11700
VZ
1573 if ( srcLen == wxNO_LEN )
1574 srcLen = wxWcslen(src) + 1;
c91830cb 1575
35d11700 1576 size_t outLen = 0;
5c33522f 1577 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1578 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1579 {
1580 wxUint16 cc[2];
35d11700
VZ
1581 const size_t numChars = encode_utf16(*src++, cc);
1582 if ( numChars == wxCONV_FAILED )
1583 return wxCONV_FAILED;
c91830cb 1584
ef199164
DS
1585 outLen += numChars * BYTES_PER_CHAR;
1586 if ( outBuff )
c91830cb 1587 {
35d11700
VZ
1588 if ( outLen > dstLen )
1589 return wxCONV_FAILED;
1590
ef199164 1591 *outBuff++ = cc[0];
35d11700 1592 if ( numChars == 2 )
69b80d28 1593 {
35d11700 1594 // second character of a surrogate
ef199164 1595 *outBuff++ = cc[1];
69b80d28 1596 }
c91830cb 1597 }
c91830cb 1598 }
c91830cb 1599
35d11700 1600 return outLen;
c91830cb
VZ
1601}
1602
467e0479
VZ
1603// ----------------------------------------------------------------------------
1604// endian-reversing conversions
1605// ----------------------------------------------------------------------------
c91830cb 1606
35d11700
VZ
1607size_t
1608wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1609 const char *src, size_t srcLen) const
c91830cb 1610{
35d11700
VZ
1611 srcLen = GetLength(src, srcLen);
1612 if ( srcLen == wxNO_LEN )
1613 return wxCONV_FAILED;
1614
ef199164 1615 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1616 if ( !dst )
1617 {
1618 // optimization: return maximal space which could be needed for this
1619 // string even if the real size could be smaller if the buffer contains
1620 // any surrogates
1621 return inLen;
1622 }
c91830cb 1623
35d11700 1624 size_t outLen = 0;
5c33522f 1625 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1626 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1627 {
35d11700
VZ
1628 wxUint32 ch;
1629 wxUint16 tmp[2];
ef199164
DS
1630
1631 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1632 inBuff++;
1633 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1634
35d11700
VZ
1635 const size_t numChars = decode_utf16(tmp, ch);
1636 if ( numChars == wxCONV_FAILED )
1637 return wxCONV_FAILED;
c91830cb 1638
35d11700 1639 if ( numChars == 2 )
ef199164 1640 inBuff++;
35d11700
VZ
1641
1642 if ( ++outLen > dstLen )
1643 return wxCONV_FAILED;
c91830cb 1644
35d11700 1645 *dst++ = ch;
c91830cb 1646 }
c91830cb 1647
c91830cb 1648
35d11700
VZ
1649 return outLen;
1650}
c91830cb 1651
35d11700
VZ
1652size_t
1653wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1654 const wchar_t *src, size_t srcLen) const
c91830cb 1655{
35d11700
VZ
1656 if ( srcLen == wxNO_LEN )
1657 srcLen = wxWcslen(src) + 1;
c91830cb 1658
35d11700 1659 size_t outLen = 0;
5c33522f 1660 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1661 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1662 {
1663 wxUint16 cc[2];
35d11700
VZ
1664 const size_t numChars = encode_utf16(*src, cc);
1665 if ( numChars == wxCONV_FAILED )
1666 return wxCONV_FAILED;
c91830cb 1667
ef199164
DS
1668 outLen += numChars * BYTES_PER_CHAR;
1669 if ( outBuff )
c91830cb 1670 {
35d11700
VZ
1671 if ( outLen > dstLen )
1672 return wxCONV_FAILED;
1673
ef199164 1674 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1675 if ( numChars == 2 )
c91830cb 1676 {
35d11700 1677 // second character of a surrogate
ef199164 1678 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1679 }
1680 }
c91830cb 1681 }
c91830cb 1682
35d11700 1683 return outLen;
c91830cb
VZ
1684}
1685
467e0479 1686#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1687
1688
35d11700 1689// ============================================================================
c91830cb 1690// UTF-32
35d11700 1691// ============================================================================
c91830cb
VZ
1692
1693#ifdef WORDS_BIGENDIAN
467e0479
VZ
1694 #define wxMBConvUTF32straight wxMBConvUTF32BE
1695 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1696#else
467e0479
VZ
1697 #define wxMBConvUTF32swap wxMBConvUTF32BE
1698 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1699#endif
1700
1701
1702WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1703WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1704
467e0479
VZ
1705/* static */
1706size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1707{
1708 if ( srcLen == wxNO_LEN )
1709 {
1710 // count the number of bytes in input, including the trailing NULs
5c33522f 1711 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1712 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1713 ;
c91830cb 1714
467e0479
VZ
1715 srcLen *= BYTES_PER_CHAR;
1716 }
1717 else // we already have the length
1718 {
1719 // we can only convert an entire number of UTF-32 characters
1720 if ( srcLen % BYTES_PER_CHAR )
1721 return wxCONV_FAILED;
1722 }
1723
1724 return srcLen;
1725}
1726
1727// case when in-memory representation is UTF-16
c91830cb
VZ
1728#ifdef WC_UTF16
1729
467e0479
VZ
1730// ----------------------------------------------------------------------------
1731// conversions without endianness change
1732// ----------------------------------------------------------------------------
1733
1734size_t
1735wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1736 const char *src, size_t srcLen) const
c91830cb 1737{
467e0479
VZ
1738 srcLen = GetLength(src, srcLen);
1739 if ( srcLen == wxNO_LEN )
1740 return wxCONV_FAILED;
c91830cb 1741
5c33522f 1742 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1743 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1744 size_t outLen = 0;
1745 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1746 {
1747 wxUint16 cc[2];
ef199164 1748 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1749 if ( numChars == wxCONV_FAILED )
1750 return wxCONV_FAILED;
c91830cb 1751
467e0479
VZ
1752 outLen += numChars;
1753 if ( dst )
c91830cb 1754 {
467e0479
VZ
1755 if ( outLen > dstLen )
1756 return wxCONV_FAILED;
d32a507d 1757
467e0479
VZ
1758 *dst++ = cc[0];
1759 if ( numChars == 2 )
1760 {
1761 // second character of a surrogate
1762 *dst++ = cc[1];
1763 }
1764 }
c91830cb 1765 }
d32a507d 1766
467e0479 1767 return outLen;
c91830cb
VZ
1768}
1769
467e0479
VZ
1770size_t
1771wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1772 const wchar_t *src, size_t srcLen) const
c91830cb 1773{
467e0479
VZ
1774 if ( srcLen == wxNO_LEN )
1775 srcLen = wxWcslen(src) + 1;
c91830cb 1776
467e0479 1777 if ( !dst )
c91830cb 1778 {
467e0479
VZ
1779 // optimization: return maximal space which could be needed for this
1780 // string instead of the exact amount which could be less if there are
1781 // any surrogates in the input
1782 //
1783 // we consider that surrogates are rare enough to make it worthwhile to
1784 // avoid running the loop below at the cost of slightly extra memory
1785 // consumption
ef199164 1786 return srcLen * BYTES_PER_CHAR;
467e0479 1787 }
c91830cb 1788
5c33522f 1789 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1790 size_t outLen = 0;
1791 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1792 {
1793 const wxUint32 ch = wxDecodeSurrogate(&src);
1794 if ( !src )
1795 return wxCONV_FAILED;
c91830cb 1796
467e0479 1797 outLen += BYTES_PER_CHAR;
d32a507d 1798
467e0479
VZ
1799 if ( outLen > dstLen )
1800 return wxCONV_FAILED;
b5153fd8 1801
ef199164 1802 *outBuff++ = ch;
467e0479 1803 }
c91830cb 1804
467e0479 1805 return outLen;
c91830cb
VZ
1806}
1807
467e0479
VZ
1808// ----------------------------------------------------------------------------
1809// endian-reversing conversions
1810// ----------------------------------------------------------------------------
c91830cb 1811
467e0479
VZ
1812size_t
1813wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1814 const char *src, size_t srcLen) const
c91830cb 1815{
467e0479
VZ
1816 srcLen = GetLength(src, srcLen);
1817 if ( srcLen == wxNO_LEN )
1818 return wxCONV_FAILED;
c91830cb 1819
5c33522f 1820 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1821 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1822 size_t outLen = 0;
ef199164 1823 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1824 {
c91830cb 1825 wxUint16 cc[2];
ef199164 1826 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1827 if ( numChars == wxCONV_FAILED )
1828 return wxCONV_FAILED;
c91830cb 1829
467e0479
VZ
1830 outLen += numChars;
1831 if ( dst )
c91830cb 1832 {
467e0479
VZ
1833 if ( outLen > dstLen )
1834 return wxCONV_FAILED;
d32a507d 1835
467e0479
VZ
1836 *dst++ = cc[0];
1837 if ( numChars == 2 )
1838 {
1839 // second character of a surrogate
1840 *dst++ = cc[1];
1841 }
1842 }
c91830cb 1843 }
b5153fd8 1844
467e0479 1845 return outLen;
c91830cb
VZ
1846}
1847
467e0479
VZ
1848size_t
1849wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1850 const wchar_t *src, size_t srcLen) const
c91830cb 1851{
467e0479
VZ
1852 if ( srcLen == wxNO_LEN )
1853 srcLen = wxWcslen(src) + 1;
c91830cb 1854
467e0479 1855 if ( !dst )
c91830cb 1856 {
467e0479
VZ
1857 // optimization: return maximal space which could be needed for this
1858 // string instead of the exact amount which could be less if there are
1859 // any surrogates in the input
1860 //
1861 // we consider that surrogates are rare enough to make it worthwhile to
1862 // avoid running the loop below at the cost of slightly extra memory
1863 // consumption
1864 return srcLen*BYTES_PER_CHAR;
1865 }
c91830cb 1866
5c33522f 1867 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1868 size_t outLen = 0;
1869 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1870 {
1871 const wxUint32 ch = wxDecodeSurrogate(&src);
1872 if ( !src )
1873 return wxCONV_FAILED;
c91830cb 1874
467e0479 1875 outLen += BYTES_PER_CHAR;
d32a507d 1876
467e0479
VZ
1877 if ( outLen > dstLen )
1878 return wxCONV_FAILED;
b5153fd8 1879
ef199164 1880 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1881 }
c91830cb 1882
467e0479 1883 return outLen;
c91830cb
VZ
1884}
1885
467e0479 1886#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1887
35d11700
VZ
1888// ----------------------------------------------------------------------------
1889// conversions without endianness change
1890// ----------------------------------------------------------------------------
1891
1892size_t
1893wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1894 const char *src, size_t srcLen) const
c91830cb 1895{
35d11700
VZ
1896 // use memcpy() as it should be much faster than hand-written loop
1897 srcLen = GetLength(src, srcLen);
1898 if ( srcLen == wxNO_LEN )
1899 return wxCONV_FAILED;
c91830cb 1900
35d11700
VZ
1901 const size_t inLen = srcLen/BYTES_PER_CHAR;
1902 if ( dst )
c91830cb 1903 {
35d11700
VZ
1904 if ( dstLen < inLen )
1905 return wxCONV_FAILED;
b5153fd8 1906
35d11700
VZ
1907 memcpy(dst, src, srcLen);
1908 }
c91830cb 1909
35d11700 1910 return inLen;
c91830cb
VZ
1911}
1912
35d11700
VZ
1913size_t
1914wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1915 const wchar_t *src, size_t srcLen) const
c91830cb 1916{
35d11700
VZ
1917 if ( srcLen == wxNO_LEN )
1918 srcLen = wxWcslen(src) + 1;
1919
1920 srcLen *= BYTES_PER_CHAR;
c91830cb 1921
35d11700 1922 if ( dst )
c91830cb 1923 {
35d11700
VZ
1924 if ( dstLen < srcLen )
1925 return wxCONV_FAILED;
c91830cb 1926
35d11700 1927 memcpy(dst, src, srcLen);
c91830cb
VZ
1928 }
1929
35d11700 1930 return srcLen;
c91830cb
VZ
1931}
1932
35d11700
VZ
1933// ----------------------------------------------------------------------------
1934// endian-reversing conversions
1935// ----------------------------------------------------------------------------
c91830cb 1936
35d11700
VZ
1937size_t
1938wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1939 const char *src, size_t srcLen) const
c91830cb 1940{
35d11700
VZ
1941 srcLen = GetLength(src, srcLen);
1942 if ( srcLen == wxNO_LEN )
1943 return wxCONV_FAILED;
1944
1945 srcLen /= BYTES_PER_CHAR;
c91830cb 1946
35d11700 1947 if ( dst )
c91830cb 1948 {
35d11700
VZ
1949 if ( dstLen < srcLen )
1950 return wxCONV_FAILED;
1951
5c33522f 1952 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1953 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1954 {
ef199164 1955 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1956 }
c91830cb 1957 }
b5153fd8 1958
35d11700 1959 return srcLen;
c91830cb
VZ
1960}
1961
35d11700
VZ
1962size_t
1963wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1964 const wchar_t *src, size_t srcLen) const
c91830cb 1965{
35d11700
VZ
1966 if ( srcLen == wxNO_LEN )
1967 srcLen = wxWcslen(src) + 1;
1968
1969 srcLen *= BYTES_PER_CHAR;
c91830cb 1970
35d11700 1971 if ( dst )
c91830cb 1972 {
35d11700
VZ
1973 if ( dstLen < srcLen )
1974 return wxCONV_FAILED;
1975
5c33522f 1976 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 1977 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1978 {
ef199164 1979 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1980 }
c91830cb 1981 }
b5153fd8 1982
35d11700 1983 return srcLen;
c91830cb
VZ
1984}
1985
467e0479 1986#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1987
1988
36acb880
VZ
1989// ============================================================================
1990// The classes doing conversion using the iconv_xxx() functions
1991// ============================================================================
3caec1bb 1992
b040e242 1993#ifdef HAVE_ICONV
3a0d76bc 1994
b1d547eb
VS
1995// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1996// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1997// (unless there's yet another bug in glibc) the only case when iconv()
1998// returns with (size_t)-1 (which means error) and says there are 0 bytes
1999// left in the input buffer -- when _real_ error occurs,
2000// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2001// iconv() failure.
3caec1bb
VS
2002// [This bug does not appear in glibc 2.2.]
2003#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2004#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2005 (errno != E2BIG || bufLeft != 0))
2006#else
2007#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2008#endif
2009
ab217dba 2010#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 2011
74a7eb0b
VZ
2012#define ICONV_T_INVALID ((iconv_t)-1)
2013
2014#if SIZEOF_WCHAR_T == 4
2015 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2016 #define WC_ENC wxFONTENCODING_UTF32
2017#elif SIZEOF_WCHAR_T == 2
2018 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2019 #define WC_ENC wxFONTENCODING_UTF16
2020#else // sizeof(wchar_t) != 2 nor 4
2021 // does this ever happen?
2022 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2023#endif
2024
36acb880 2025// ----------------------------------------------------------------------------
e95354ec 2026// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2027// ----------------------------------------------------------------------------
2028
e95354ec 2029class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2030{
2031public:
86501081 2032 wxMBConv_iconv(const char *name);
e95354ec 2033 virtual ~wxMBConv_iconv();
36acb880 2034
8f4b0f43
VZ
2035 // implement base class virtual methods
2036 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2037 const char *src, size_t srcLen = wxNO_LEN) const;
2038 virtual size_t FromWChar(char *dst, size_t dstLen,
2039 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2040 virtual size_t GetMBNulLen() const;
2041
ba98e032
VS
2042#if wxUSE_UNICODE_UTF8
2043 virtual bool IsUTF8() const;
2044#endif
2045
d36c9347
VZ
2046 virtual wxMBConv *Clone() const
2047 {
86501081 2048 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
2049 p->m_minMBCharWidth = m_minMBCharWidth;
2050 return p;
2051 }
2052
e95354ec 2053 bool IsOk() const
74a7eb0b 2054 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2055
2056protected:
ef199164
DS
2057 // the iconv handlers used to translate from multibyte
2058 // to wide char and in the other direction
36acb880
VZ
2059 iconv_t m2w,
2060 w2m;
ef199164 2061
b1d547eb
VS
2062#if wxUSE_THREADS
2063 // guards access to m2w and w2m objects
2064 wxMutex m_iconvMutex;
2065#endif
36acb880
VZ
2066
2067private:
e95354ec 2068 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2069 // available on this machine, it will remain NULL
74a7eb0b 2070 static wxString ms_wcCharsetName;
36acb880
VZ
2071
2072 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2073 // different endian-ness than the native one
405d8f46 2074 static bool ms_wcNeedsSwap;
eec47cc6 2075
d36c9347
VZ
2076
2077 // name of the encoding handled by this conversion
2078 wxString m_name;
2079
7ef3ab50 2080 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2081 // initially
2082 size_t m_minMBCharWidth;
36acb880
VZ
2083};
2084
8f115891 2085// make the constructor available for unit testing
86501081 2086WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2087{
2088 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2089 if ( !result->IsOk() )
2090 {
2091 delete result;
2092 return 0;
2093 }
ef199164 2094
8f115891
MW
2095 return result;
2096}
2097
422e411e 2098wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2099bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2100
86501081 2101wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 2102 : m_name(name)
36acb880 2103{
c1464d9d 2104 m_minMBCharWidth = 0;
eec47cc6 2105
36acb880 2106 // check for charset that represents wchar_t:
74a7eb0b 2107 if ( ms_wcCharsetName.empty() )
f1339c56 2108 {
c2b83fdd
VZ
2109 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2110
74a7eb0b
VZ
2111#if wxUSE_FONTMAP
2112 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2113#else // !wxUSE_FONTMAP
91cb7f52 2114 static const wxChar *names_static[] =
36acb880 2115 {
74a7eb0b
VZ
2116#if SIZEOF_WCHAR_T == 4
2117 _T("UCS-4"),
2118#elif SIZEOF_WCHAR_T = 2
2119 _T("UCS-2"),
2120#endif
2121 NULL
2122 };
91cb7f52 2123 const wxChar **names = names_static;
74a7eb0b 2124#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2125
d1f024a8 2126 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2127 {
17a1ebd1 2128 const wxString nameCS(*names);
74a7eb0b
VZ
2129
2130 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2131 wxString nameXE(nameCS);
ef199164
DS
2132
2133#ifdef WORDS_BIGENDIAN
74a7eb0b 2134 nameXE += _T("BE");
ef199164 2135#else // little endian
74a7eb0b 2136 nameXE += _T("LE");
ef199164 2137#endif
74a7eb0b 2138
c2b83fdd
VZ
2139 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2140 nameXE.c_str());
2141
86501081 2142 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2143 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2144 {
74a7eb0b 2145 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
2146 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2147 nameCS.c_str());
86501081 2148 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2149
74a7eb0b
VZ
2150 // and check for bytesex ourselves:
2151 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2152 {
74a7eb0b 2153 char buf[2], *bufPtr;
e8769ed1 2154 wchar_t wbuf[2];
74a7eb0b
VZ
2155 size_t insz, outsz;
2156 size_t res;
2157
2158 buf[0] = 'A';
2159 buf[1] = 0;
2160 wbuf[0] = 0;
2161 insz = 2;
2162 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2163 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2164 bufPtr = buf;
2165
ef199164
DS
2166 res = iconv(
2167 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2168 &wbufPtr, &outsz);
74a7eb0b
VZ
2169
2170 if (ICONV_FAILED(res, insz))
2171 {
2172 wxLogLastError(wxT("iconv"));
422e411e 2173 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2174 nameCS.c_str());
74a7eb0b
VZ
2175 }
2176 else // ok, can convert to this encoding, remember it
2177 {
17a1ebd1 2178 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2179 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2180 }
3a0d76bc
VS
2181 }
2182 }
74a7eb0b 2183 else // use charset not requiring byte swapping
36acb880 2184 {
74a7eb0b 2185 ms_wcCharsetName = nameXE;
36acb880 2186 }
3a0d76bc 2187 }
74a7eb0b 2188
0944fceb 2189 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2190 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2191 ms_wcCharsetName.empty() ? wxString("<none>")
2192 : ms_wcCharsetName,
74a7eb0b
VZ
2193 ms_wcNeedsSwap ? _T(" (needs swap)")
2194 : _T(""));
3a0d76bc 2195 }
36acb880 2196 else // we already have ms_wcCharsetName
3caec1bb 2197 {
86501081 2198 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2199 }
dccce9ea 2200
74a7eb0b 2201 if ( ms_wcCharsetName.empty() )
f1339c56 2202 {
74a7eb0b 2203 w2m = ICONV_T_INVALID;
36acb880 2204 }
405d8f46
VZ
2205 else
2206 {
86501081 2207 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2208 if ( w2m == ICONV_T_INVALID )
2209 {
2210 wxLogTrace(TRACE_STRCONV,
2211 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2212 ms_wcCharsetName.c_str(), name);
74a7eb0b 2213 }
405d8f46 2214 }
36acb880 2215}
3caec1bb 2216
e95354ec 2217wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2218{
74a7eb0b 2219 if ( m2w != ICONV_T_INVALID )
36acb880 2220 iconv_close(m2w);
74a7eb0b 2221 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2222 iconv_close(w2m);
2223}
3a0d76bc 2224
8f4b0f43
VZ
2225size_t
2226wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2227 const char *src, size_t srcLen) const
36acb880 2228{
8f4b0f43 2229 if ( srcLen == wxNO_LEN )
69373110 2230 {
8f4b0f43
VZ
2231 // find the string length: notice that must be done differently for
2232 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2233 // consecutive NULs
2234 const size_t nulLen = GetMBNulLen();
2235 switch ( nulLen )
2236 {
2237 default:
2238 return wxCONV_FAILED;
69373110 2239
8f4b0f43
VZ
2240 case 1:
2241 srcLen = strlen(src); // arguably more optimized than our version
2242 break;
69373110 2243
8f4b0f43
VZ
2244 case 2:
2245 case 4:
2246 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2247 // but they also have to start at character boundary and not
2248 // span two adjacent characters
2249 const char *p;
2250 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2251 ;
2252 srcLen = p - src;
2253 break;
2254 }
d50c0831
VZ
2255
2256 // when we're determining the length of the string ourselves we count
2257 // the terminating NUL(s) as part of it and always NUL-terminate the
2258 // output
2259 srcLen += nulLen;
69373110
VZ
2260 }
2261
8f4b0f43
VZ
2262 // we express length in the number of (wide) characters but iconv always
2263 // counts buffer sizes it in bytes
2264 dstLen *= SIZEOF_WCHAR_T;
2265
b1d547eb 2266#if wxUSE_THREADS
6a17b868
SN
2267 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2268 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2269 // wxConvLocal that are used all over wx code, so we have to make sure
2270 // the handle is used by at most one thread at the time. Otherwise
2271 // only a few wx classes would be safe to use from non-main threads
2272 // as MB<->WC conversion would fail "randomly".
2273 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2274#endif // wxUSE_THREADS
2275
36acb880 2276 size_t res, cres;
8f4b0f43 2277 const char *pszPtr = src;
36acb880 2278
8f4b0f43 2279 if ( dst )
36acb880 2280 {
8f4b0f43 2281 char* bufPtr = (char*)dst;
e8769ed1 2282
36acb880 2283 // have destination buffer, convert there
1752fda6 2284 size_t dstLenOrig = dstLen;
36acb880 2285 cres = iconv(m2w,
8f4b0f43
VZ
2286 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2287 &bufPtr, &dstLen);
1752fda6
VZ
2288
2289 // convert the number of bytes converted as returned by iconv to the
2290 // number of (wide) characters converted that we need
2291 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2292
36acb880 2293 if (ms_wcNeedsSwap)
3a0d76bc 2294 {
36acb880 2295 // convert to native endianness
17a1ebd1 2296 for ( unsigned i = 0; i < res; i++ )
467a2982 2297 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2298 }
36acb880 2299 }
8f4b0f43 2300 else // no destination buffer
36acb880 2301 {
8f4b0f43 2302 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2303 wchar_t tbuf[256];
36acb880 2304 res = 0;
ef199164
DS
2305
2306 do
2307 {
e8769ed1 2308 char* bufPtr = (char*)tbuf;
8f4b0f43 2309 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2310
2311 cres = iconv(m2w,
8f4b0f43
VZ
2312 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2313 &bufPtr, &dstLen );
36acb880 2314
8f4b0f43 2315 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2316 }
2317 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2318 }
dccce9ea 2319
8f4b0f43 2320 if (ICONV_FAILED(cres, srcLen))
f1339c56 2321 {
36acb880 2322 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2323 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2324 return wxCONV_FAILED;
36acb880
VZ
2325 }
2326
2327 return res;
2328}
2329
8f4b0f43
VZ
2330size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2331 const wchar_t *src, size_t srcLen) const
36acb880 2332{
b1d547eb
VS
2333#if wxUSE_THREADS
2334 // NB: explained in MB2WC
2335 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2336#endif
3698ae71 2337
8f4b0f43 2338 if ( srcLen == wxNO_LEN )
2588ee86 2339 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2340
2341 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2342 size_t outbuflen = dstLen;
36acb880 2343 size_t res, cres;
3a0d76bc 2344
36acb880 2345 wchar_t *tmpbuf = 0;
3caec1bb 2346
36acb880
VZ
2347 if (ms_wcNeedsSwap)
2348 {
2349 // need to copy to temp buffer to switch endianness
51725fc0 2350 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2351 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2352 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2353 for ( size_t i = 0; i < srcLen; i++ )
2354 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2355
8f4b0f43 2356 src = tmpbuf;
36acb880 2357 }
3a0d76bc 2358
8f4b0f43
VZ
2359 char* inbuf = (char*)src;
2360 if ( dst )
36acb880
VZ
2361 {
2362 // have destination buffer, convert there
8f4b0f43 2363 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2364
8f4b0f43 2365 res = dstLen - outbuflen;
36acb880 2366 }
8f4b0f43 2367 else // no destination buffer
36acb880 2368 {
8f4b0f43 2369 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2370 char tbuf[256];
36acb880 2371 res = 0;
ef199164
DS
2372 do
2373 {
8f4b0f43 2374 dst = tbuf;
51725fc0 2375 outbuflen = WXSIZEOF(tbuf);
36acb880 2376
8f4b0f43 2377 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2378
51725fc0 2379 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2380 }
2381 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2382 }
dccce9ea 2383
36acb880
VZ
2384 if (ms_wcNeedsSwap)
2385 {
2386 free(tmpbuf);
2387 }
dccce9ea 2388
e8769ed1 2389 if (ICONV_FAILED(cres, inbuflen))
36acb880 2390 {
ce6f8d6f 2391 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2392 return wxCONV_FAILED;
36acb880
VZ
2393 }
2394
2395 return res;
2396}
2397
7ef3ab50 2398size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2399{
c1464d9d 2400 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2401 {
2402 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2403
2404#if wxUSE_THREADS
2405 // NB: explained in MB2WC
2406 wxMutexLocker lock(self->m_iconvMutex);
2407#endif
2408
999020e1 2409 const wchar_t *wnul = L"";
c1464d9d 2410 char buf[8]; // should be enough for NUL in any encoding
356410fc 2411 size_t inLen = sizeof(wchar_t),
c1464d9d 2412 outLen = WXSIZEOF(buf);
ef199164
DS
2413 char *inBuff = (char *)wnul;
2414 char *outBuff = buf;
2415 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2416 {
c1464d9d 2417 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2418 }
2419 else // ok
2420 {
ef199164 2421 self->m_minMBCharWidth = outBuff - buf;
356410fc 2422 }
eec47cc6
VZ
2423 }
2424
c1464d9d 2425 return m_minMBCharWidth;
eec47cc6
VZ
2426}
2427
ba98e032
VS
2428#if wxUSE_UNICODE_UTF8
2429bool wxMBConv_iconv::IsUTF8() const
2430{
86501081
VS
2431 return wxStricmp(m_name, "UTF-8") == 0 ||
2432 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2433}
2434#endif
2435
b040e242 2436#endif // HAVE_ICONV
36acb880 2437
e95354ec 2438
36acb880
VZ
2439// ============================================================================
2440// Win32 conversion classes
2441// ============================================================================
1cd52418 2442
e95354ec 2443#ifdef wxHAVE_WIN32_MB2WC
373658eb 2444
8b04d4c4 2445// from utils.cpp
d775fa82 2446#if wxUSE_FONTMAP
86501081 2447extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2448extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2449#endif
373658eb 2450
e95354ec 2451class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2452{
2453public:
bde4baac
VZ
2454 wxMBConv_win32()
2455 {
2456 m_CodePage = CP_ACP;
c1464d9d 2457 m_minMBCharWidth = 0;
bde4baac
VZ
2458 }
2459
d36c9347 2460 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2461 : wxMBConv()
d36c9347
VZ
2462 {
2463 m_CodePage = conv.m_CodePage;
2464 m_minMBCharWidth = conv.m_minMBCharWidth;
2465 }
2466
7608a683 2467#if wxUSE_FONTMAP
86501081 2468 wxMBConv_win32(const char* name)
bde4baac
VZ
2469 {
2470 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2471 m_minMBCharWidth = 0;
bde4baac 2472 }
dccce9ea 2473
e95354ec 2474 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2475 {
2476 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2477 m_minMBCharWidth = 0;
bde4baac 2478 }
eec47cc6 2479#endif // wxUSE_FONTMAP
8b04d4c4 2480
d36c9347 2481 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2482 {
02272c9c
VZ
2483 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2484 // the behaviour is not compatible with the Unix version (using iconv)
2485 // and break the library itself, e.g. wxTextInputStream::NextChar()
2486 // wouldn't work if reading an incomplete MB char didn't result in an
2487 // error
667e5b3e 2488 //
89028980 2489 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2490 // Win XP or newer and it is not supported for UTF-[78] so we always
2491 // use our own conversions in this case. See
89028980
VS
2492 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2493 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2494 if ( m_CodePage == CP_UTF8 )
89028980 2495 {
5487ff0f 2496 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2497 }
830f8f11
VZ
2498
2499 if ( m_CodePage == CP_UTF7 )
2500 {
5487ff0f 2501 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2502 }
2503
2504 int flags = 0;
2505 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2506 IsAtLeastWin2kSP4() )
89028980 2507 {
830f8f11 2508 flags = MB_ERR_INVALID_CHARS;
89028980 2509 }
667e5b3e 2510
2b5f62a0
VZ
2511 const size_t len = ::MultiByteToWideChar
2512 (
2513 m_CodePage, // code page
667e5b3e 2514 flags, // flags: fall on error
2b5f62a0
VZ
2515 psz, // input string
2516 -1, // its length (NUL-terminated)
b4da152e 2517 buf, // output string
2b5f62a0
VZ
2518 buf ? n : 0 // size of output buffer
2519 );
89028980
VS
2520 if ( !len )
2521 {
2522 // function totally failed
467e0479 2523 return wxCONV_FAILED;
89028980
VS
2524 }
2525
2526 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2527 // check if we succeeded, by doing a double trip:
2528 if ( !flags && buf )
2529 {
53c174fc
VZ
2530 const size_t mbLen = strlen(psz);
2531 wxCharBuffer mbBuf(mbLen);
89028980
VS
2532 if ( ::WideCharToMultiByte
2533 (
2534 m_CodePage,
2535 0,
2536 buf,
2537 -1,
2538 mbBuf.data(),
53c174fc 2539 mbLen + 1, // size in bytes, not length
89028980
VS
2540 NULL,
2541 NULL
2542 ) == 0 ||
2543 strcmp(mbBuf, psz) != 0 )
2544 {
2545 // we didn't obtain the same thing we started from, hence
2546 // the conversion was lossy and we consider that it failed
467e0479 2547 return wxCONV_FAILED;
89028980
VS
2548 }
2549 }
2b5f62a0 2550
03a991bc
VZ
2551 // note that it returns count of written chars for buf != NULL and size
2552 // of the needed buffer for buf == NULL so in either case the length of
2553 // the string (which never includes the terminating NUL) is one less
89028980 2554 return len - 1;
f1339c56 2555 }
dccce9ea 2556
d36c9347 2557 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2558 {
13dd924a
VZ
2559 /*
2560 we have a problem here: by default, WideCharToMultiByte() may
2561 replace characters unrepresentable in the target code page with bad
2562 quality approximations such as turning "1/2" symbol (U+00BD) into
2563 "1" for the code pages which don't have it and we, obviously, want
2564 to avoid this at any price
d775fa82 2565
13dd924a
VZ
2566 the trouble is that this function does it _silently_, i.e. it won't
2567 even tell us whether it did or not... Win98/2000 and higher provide
2568 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2569 we have to resort to a round trip, i.e. check that converting back
2570 results in the same string -- this is, of course, expensive but
2571 otherwise we simply can't be sure to not garble the data.
2572 */
2573
2574 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2575 // it doesn't work with CJK encodings (which we test for rather roughly
2576 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2577 // supporting it
907173e5
WS
2578 BOOL usedDef wxDUMMY_INITIALIZE(false);
2579 BOOL *pUsedDef;
13dd924a
VZ
2580 int flags;
2581 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2582 {
2583 // it's our lucky day
2584 flags = WC_NO_BEST_FIT_CHARS;
2585 pUsedDef = &usedDef;
2586 }
2587 else // old system or unsupported encoding
2588 {
2589 flags = 0;
2590 pUsedDef = NULL;
2591 }
2592
2b5f62a0
VZ
2593 const size_t len = ::WideCharToMultiByte
2594 (
2595 m_CodePage, // code page
13dd924a
VZ
2596 flags, // either none or no best fit
2597 pwz, // input string
2b5f62a0
VZ
2598 -1, // it is (wide) NUL-terminated
2599 buf, // output buffer
2600 buf ? n : 0, // and its size
2601 NULL, // default "replacement" char
13dd924a 2602 pUsedDef // [out] was it used?
2b5f62a0
VZ
2603 );
2604
13dd924a
VZ
2605 if ( !len )
2606 {
2607 // function totally failed
467e0479 2608 return wxCONV_FAILED;
13dd924a
VZ
2609 }
2610
765bdb4a
VZ
2611 // we did something, check if we really succeeded
2612 if ( flags )
13dd924a 2613 {
765bdb4a
VZ
2614 // check if the conversion failed, i.e. if any replacements
2615 // were done
2616 if ( usedDef )
2617 return wxCONV_FAILED;
2618 }
2619 else // we must resort to double tripping...
2620 {
2621 // first we need to ensure that we really have the MB data: this is
2622 // not the case if we're called with NULL buffer, in which case we
2623 // need to do the conversion yet again
2624 wxCharBuffer bufDef;
2625 if ( !buf )
13dd924a 2626 {
765bdb4a
VZ
2627 bufDef = wxCharBuffer(len);
2628 buf = bufDef.data();
2629 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2630 buf, len, NULL, NULL) )
467e0479 2631 return wxCONV_FAILED;
13dd924a 2632 }
765bdb4a 2633
564da6ff
VZ
2634 if ( !n )
2635 n = wcslen(pwz);
765bdb4a 2636 wxWCharBuffer wcBuf(n);
564da6ff 2637 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2638 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2639 {
765bdb4a
VZ
2640 // we didn't obtain the same thing we started from, hence
2641 // the conversion was lossy and we consider that it failed
2642 return wxCONV_FAILED;
13dd924a
VZ
2643 }
2644 }
2645
03a991bc 2646 // see the comment above for the reason of "len - 1"
13dd924a 2647 return len - 1;
f1339c56 2648 }
dccce9ea 2649
7ef3ab50
VZ
2650 virtual size_t GetMBNulLen() const
2651 {
2652 if ( m_minMBCharWidth == 0 )
2653 {
2654 int len = ::WideCharToMultiByte
2655 (
2656 m_CodePage, // code page
2657 0, // no flags
2658 L"", // input string
2659 1, // translate just the NUL
2660 NULL, // output buffer
2661 0, // and its size
2662 NULL, // no replacement char
2663 NULL // [out] don't care if it was used
2664 );
2665
2666 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2667 switch ( len )
2668 {
2669 default:
2670 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2671 self->m_minMBCharWidth = (size_t)-1;
2672 break;
7ef3ab50
VZ
2673
2674 case 0:
2675 self->m_minMBCharWidth = (size_t)-1;
2676 break;
2677
2678 case 1:
2679 case 2:
2680 case 4:
2681 self->m_minMBCharWidth = len;
2682 break;
2683 }
2684 }
2685
2686 return m_minMBCharWidth;
2687 }
2688
d36c9347
VZ
2689 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2690
13dd924a
VZ
2691 bool IsOk() const { return m_CodePage != -1; }
2692
2693private:
2694 static bool CanUseNoBestFit()
2695 {
2696 static int s_isWin98Or2k = -1;
2697
2698 if ( s_isWin98Or2k == -1 )
2699 {
2700 int verMaj, verMin;
2701 switch ( wxGetOsVersion(&verMaj, &verMin) )
2702 {
406d283a 2703 case wxOS_WINDOWS_9X:
13dd924a
VZ
2704 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2705 break;
2706
406d283a 2707 case wxOS_WINDOWS_NT:
13dd924a
VZ
2708 s_isWin98Or2k = verMaj >= 5;
2709 break;
2710
2711 default:
ef199164 2712 // unknown: be conservative by default
13dd924a 2713 s_isWin98Or2k = 0;
ef199164 2714 break;
13dd924a
VZ
2715 }
2716
2717 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2718 }
2719
2720 return s_isWin98Or2k == 1;
2721 }
f1339c56 2722
89028980
VS
2723 static bool IsAtLeastWin2kSP4()
2724 {
8942f83a
WS
2725#ifdef __WXWINCE__
2726 return false;
2727#else
89028980
VS
2728 static int s_isAtLeastWin2kSP4 = -1;
2729
2730 if ( s_isAtLeastWin2kSP4 == -1 )
2731 {
2732 OSVERSIONINFOEX ver;
2733
2734 memset(&ver, 0, sizeof(ver));
2735 ver.dwOSVersionInfoSize = sizeof(ver);
2736 GetVersionEx((OSVERSIONINFO*)&ver);
2737
2738 s_isAtLeastWin2kSP4 =
2739 ((ver.dwMajorVersion > 5) || // Vista+
2740 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2741 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2742 ver.wServicePackMajor >= 4)) // 2000 SP4+
2743 ? 1 : 0;
2744 }
2745
2746 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2747#endif
89028980
VS
2748 }
2749
eec47cc6 2750
c1464d9d 2751 // the code page we're working with
b1d66b54 2752 long m_CodePage;
c1464d9d 2753
7ef3ab50 2754 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2755 // "unknown"
2756 size_t m_minMBCharWidth;
1cd52418 2757};
e95354ec
VZ
2758
2759#endif // wxHAVE_WIN32_MB2WC
2760
f7e98dee 2761
36acb880
VZ
2762// ============================================================================
2763// wxEncodingConverter based conversion classes
2764// ============================================================================
2765
1e6feb95 2766#if wxUSE_FONTMAP
1cd52418 2767
e95354ec 2768class wxMBConv_wxwin : public wxMBConv
1cd52418 2769{
8b04d4c4
VZ
2770private:
2771 void Init()
2772 {
6ac84a78
DE
2773 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2774 // The wxMBConv_cf class does a better job.
2775 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2776 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2777 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2778 }
2779
6001e347 2780public:
f1339c56
RR
2781 // temporarily just use wxEncodingConverter stuff,
2782 // so that it works while a better implementation is built
86501081 2783 wxMBConv_wxwin(const char* name)
f1339c56
RR
2784 {
2785 if (name)
267e11c5 2786 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2787 else
2788 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2789
8b04d4c4
VZ
2790 Init();
2791 }
2792
e95354ec 2793 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2794 {
2795 m_enc = enc;
2796
2797 Init();
f1339c56 2798 }
dccce9ea 2799
bde4baac 2800 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2801 {
2802 size_t inbuf = strlen(psz);
dccce9ea 2803 if (buf)
c643a977 2804 {
ef199164 2805 if (!m2w.Convert(psz, buf))
467e0479 2806 return wxCONV_FAILED;
c643a977 2807 }
f1339c56
RR
2808 return inbuf;
2809 }
dccce9ea 2810
bde4baac 2811 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2812 {
f8d791e0 2813 const size_t inbuf = wxWcslen(psz);
f1339c56 2814 if (buf)
c643a977 2815 {
ef199164 2816 if (!w2m.Convert(psz, buf))
467e0479 2817 return wxCONV_FAILED;
c643a977 2818 }
dccce9ea 2819
f1339c56
RR
2820 return inbuf;
2821 }
dccce9ea 2822
7ef3ab50 2823 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2824 {
2825 switch ( m_enc )
2826 {
2827 case wxFONTENCODING_UTF16BE:
2828 case wxFONTENCODING_UTF16LE:
c1464d9d 2829 return 2;
eec47cc6
VZ
2830
2831 case wxFONTENCODING_UTF32BE:
2832 case wxFONTENCODING_UTF32LE:
c1464d9d 2833 return 4;
eec47cc6
VZ
2834
2835 default:
c1464d9d 2836 return 1;
eec47cc6
VZ
2837 }
2838 }
2839
d36c9347
VZ
2840 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2841
7ef3ab50
VZ
2842 bool IsOk() const { return m_ok; }
2843
2844public:
2845 wxFontEncoding m_enc;
2846 wxEncodingConverter m2w, w2m;
2847
2848private:
cafbf6fb
VZ
2849 // were we initialized successfully?
2850 bool m_ok;
fc7a2a60 2851
c0c133e1 2852 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2853};
6001e347 2854
8f115891 2855// make the constructors available for unit testing
86501081 2856WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2857{
2858 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2859 if ( !result->IsOk() )
2860 {
2861 delete result;
2862 return 0;
2863 }
ef199164 2864
8f115891
MW
2865 return result;
2866}
2867
1e6feb95
VZ
2868#endif // wxUSE_FONTMAP
2869
36acb880
VZ
2870// ============================================================================
2871// wxCSConv implementation
2872// ============================================================================
2873
8b04d4c4 2874void wxCSConv::Init()
6001e347 2875{
e95354ec
VZ
2876 m_name = NULL;
2877 m_convReal = NULL;
2878 m_deferred = true;
2879}
2880
86501081 2881wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2882{
2883 Init();
82713003 2884
86501081 2885 if ( !charset.empty() )
e95354ec 2886 {
86501081 2887 SetName(charset.ToAscii());
e95354ec 2888 }
bda3d86a 2889
e4277538
VZ
2890#if wxUSE_FONTMAP
2891 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
e3276230
VZ
2892 if ( m_encoding == wxFONTENCODING_MAX )
2893 {
2894 // set to unknown/invalid value
2895 m_encoding = wxFONTENCODING_SYSTEM;
2896 }
2897 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2898 {
2899 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2900 m_encoding = wxFONTENCODING_ISO8859_1;
2901 }
e4277538 2902#else
bda3d86a 2903 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2904#endif
6001e347
RR
2905}
2906
8b04d4c4
VZ
2907wxCSConv::wxCSConv(wxFontEncoding encoding)
2908{
bda3d86a 2909 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2910 {
2911 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2912
2913 encoding = wxFONTENCODING_SYSTEM;
2914 }
2915
8b04d4c4
VZ
2916 Init();
2917
bda3d86a 2918 m_encoding = encoding;
8b04d4c4
VZ
2919}
2920
6001e347
RR
2921wxCSConv::~wxCSConv()
2922{
65e50848
JS
2923 Clear();
2924}
2925
54380f29 2926wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2927 : wxMBConv()
54380f29 2928{
8b04d4c4
VZ
2929 Init();
2930
54380f29 2931 SetName(conv.m_name);
8b04d4c4 2932 m_encoding = conv.m_encoding;
54380f29
GD
2933}
2934
2935wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2936{
2937 Clear();
8b04d4c4 2938
54380f29 2939 SetName(conv.m_name);
8b04d4c4
VZ
2940 m_encoding = conv.m_encoding;
2941
54380f29
GD
2942 return *this;
2943}
2944
65e50848
JS
2945void wxCSConv::Clear()
2946{
8b04d4c4 2947 free(m_name);
e95354ec 2948 delete m_convReal;
8b04d4c4 2949
65e50848 2950 m_name = NULL;
e95354ec 2951 m_convReal = NULL;
6001e347
RR
2952}
2953
86501081 2954void wxCSConv::SetName(const char *charset)
6001e347 2955{
f1339c56
RR
2956 if (charset)
2957 {
d6f2a891 2958 m_name = wxStrdup(charset);
e95354ec 2959 m_deferred = true;
f1339c56 2960 }
6001e347
RR
2961}
2962
8b3eb85d 2963#if wxUSE_FONTMAP
8b3eb85d
VZ
2964
2965WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2966 wxEncodingNameCache );
8b3eb85d
VZ
2967
2968static wxEncodingNameCache gs_nameCache;
2969#endif
2970
e95354ec
VZ
2971wxMBConv *wxCSConv::DoCreate() const
2972{
ce6f8d6f
VZ
2973#if wxUSE_FONTMAP
2974 wxLogTrace(TRACE_STRCONV,
2975 wxT("creating conversion for %s"),
2976 (m_name ? m_name
86501081 2977 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2978#endif // wxUSE_FONTMAP
2979
c547282d
VZ
2980 // check for the special case of ASCII or ISO8859-1 charset: as we have
2981 // special knowledge of it anyhow, we don't need to create a special
2982 // conversion object
e4277538
VZ
2983 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2984 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2985 {
e95354ec
VZ
2986 // don't convert at all
2987 return NULL;
2988 }
dccce9ea 2989
e95354ec
VZ
2990 // we trust OS to do conversion better than we can so try external
2991 // conversion methods first
2992 //
2993 // the full order is:
2994 // 1. OS conversion (iconv() under Unix or Win32 API)
2995 // 2. hard coded conversions for UTF
2996 // 3. wxEncodingConverter as fall back
2997
2998 // step (1)
2999#ifdef HAVE_ICONV
c547282d 3000#if !wxUSE_FONTMAP
e95354ec 3001 if ( m_name )
c547282d 3002#endif // !wxUSE_FONTMAP
e95354ec 3003 {
3ef10cfc 3004#if wxUSE_FONTMAP
8b3eb85d 3005 wxFontEncoding encoding(m_encoding);
3ef10cfc 3006#endif
8b3eb85d 3007
86501081 3008 if ( m_name )
8b3eb85d 3009 {
86501081 3010 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3011 if ( conv->IsOk() )
3012 return conv;
3013
3014 delete conv;
c547282d
VZ
3015
3016#if wxUSE_FONTMAP
8b3eb85d 3017 encoding =
86501081 3018 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3019#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3020 }
3021#if wxUSE_FONTMAP
3022 {
3023 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3024 if ( it != gs_nameCache.end() )
3025 {
3026 if ( it->second.empty() )
3027 return NULL;
c547282d 3028
86501081 3029 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3030 if ( conv->IsOk() )
3031 return conv;
e95354ec 3032
8b3eb85d
VZ
3033 delete conv;
3034 }
3035
3036 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3037 // CS : in case this does not return valid names (eg for MacRoman)
3038 // encoding got a 'failure' entry in the cache all the same,
3039 // although it just has to be created using a different method, so
3040 // only store failed iconv creation attempts (or perhaps we
3041 // shoulnd't do this at all ?)
3c67ec06 3042 if ( names[0] != NULL )
8b3eb85d 3043 {
3c67ec06 3044 for ( ; *names; ++names )
8b3eb85d 3045 {
86501081
VS
3046 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3047 // will need changes that will obsolete this
3048 wxString name(*names);
3049 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3050 if ( conv->IsOk() )
3051 {
3052 gs_nameCache[encoding] = *names;
3053 return conv;
3054 }
3055
3056 delete conv;
8b3eb85d
VZ
3057 }
3058
3c67ec06 3059 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 3060 }
8b3eb85d
VZ
3061 }
3062#endif // wxUSE_FONTMAP
e95354ec
VZ
3063 }
3064#endif // HAVE_ICONV
3065
3066#ifdef wxHAVE_WIN32_MB2WC
3067 {
7608a683 3068#if wxUSE_FONTMAP
e95354ec
VZ
3069 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3070 : new wxMBConv_win32(m_encoding);
3071 if ( conv->IsOk() )
3072 return conv;
3073
3074 delete conv;
7608a683
WS
3075#else
3076 return NULL;
3077#endif
e95354ec
VZ
3078 }
3079#endif // wxHAVE_WIN32_MB2WC
ef199164 3080
5c4ed98d 3081#ifdef __DARWIN__
f7e98dee 3082 {
6ff49cbc
DE
3083 // leave UTF16 and UTF32 to the built-ins of wx
3084 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3085 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3086 {
a6900d10 3087#if wxUSE_FONTMAP
5c4ed98d
DE
3088 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3089 : new wxMBConv_cf(m_encoding);
a6900d10 3090#else
5c4ed98d 3091 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3092#endif
ef199164 3093
f7e98dee 3094 if ( conv->IsOk() )
d775fa82
WS
3095 return conv;
3096
3097 delete conv;
3098 }
335d31e0 3099 }
5c4ed98d
DE
3100#endif // __DARWIN__
3101
e95354ec
VZ
3102 // step (2)
3103 wxFontEncoding enc = m_encoding;
3104#if wxUSE_FONTMAP
c547282d
VZ
3105 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3106 {
3107 // use "false" to suppress interactive dialogs -- we can be called from
3108 // anywhere and popping up a dialog from here is the last thing we want to
3109 // do
267e11c5 3110 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3111 }
e95354ec
VZ
3112#endif // wxUSE_FONTMAP
3113
3114 switch ( enc )
3115 {
3116 case wxFONTENCODING_UTF7:
3117 return new wxMBConvUTF7;
3118
3119 case wxFONTENCODING_UTF8:
3120 return new wxMBConvUTF8;
3121
e95354ec
VZ
3122 case wxFONTENCODING_UTF16BE:
3123 return new wxMBConvUTF16BE;
3124
3125 case wxFONTENCODING_UTF16LE:
3126 return new wxMBConvUTF16LE;
3127
e95354ec
VZ
3128 case wxFONTENCODING_UTF32BE:
3129 return new wxMBConvUTF32BE;
3130
3131 case wxFONTENCODING_UTF32LE:
3132 return new wxMBConvUTF32LE;
3133
3134 default:
3135 // nothing to do but put here to suppress gcc warnings
ef199164 3136 break;
e95354ec
VZ
3137 }
3138
3139 // step (3)
3140#if wxUSE_FONTMAP
3141 {
3142 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3143 : new wxMBConv_wxwin(m_encoding);
3144 if ( conv->IsOk() )
3145 return conv;
3146
3147 delete conv;
3148 }
3149#endif // wxUSE_FONTMAP
3150
a58d4f4d
VS
3151 // NB: This is a hack to prevent deadlock. What could otherwise happen
3152 // in Unicode build: wxConvLocal creation ends up being here
3153 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
3154 // attach a timestamp, for which it will need wxConvLocal (to convert
3155 // time to char* and then wchar_t*), but that fails, tries to log the
3156 // error, but wxLog has an (already locked) critical section that
3157 // guards the static buffer.
a58d4f4d
VS
3158 static bool alreadyLoggingError = false;
3159 if (!alreadyLoggingError)
3160 {
3161 alreadyLoggingError = true;
3162 wxLogError(_("Cannot convert from the charset '%s'!"),
3163 m_name ? m_name
e95354ec
VZ
3164 :
3165#if wxUSE_FONTMAP
86501081 3166 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 3167#else // !wxUSE_FONTMAP
86501081 3168 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
3169#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3170 );
ef199164 3171
a58d4f4d
VS
3172 alreadyLoggingError = false;
3173 }
e95354ec
VZ
3174
3175 return NULL;
3176}
3177
3178void wxCSConv::CreateConvIfNeeded() const
3179{
3180 if ( m_deferred )
3181 {
3182 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 3183
bda3d86a
VZ
3184 // if we don't have neither the name nor the encoding, use the default
3185 // encoding for this system
3186 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3187 {
4c75209f 3188#if wxUSE_INTL
02c7347b 3189 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3190#else
3191 // fallback to some reasonable default:
3192 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3193#endif // wxUSE_INTL
4c75209f 3194 }
bda3d86a 3195
e95354ec
VZ
3196 self->m_convReal = DoCreate();
3197 self->m_deferred = false;
6001e347 3198 }
6001e347
RR
3199}
3200
0f0298b1
VZ
3201bool wxCSConv::IsOk() const
3202{
3203 CreateConvIfNeeded();
3204
3205 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3206 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3207 return true; // always ok as we do it ourselves
3208
3209 // m_convReal->IsOk() is called at its own creation, so we know it must
3210 // be ok if m_convReal is non-NULL
3211 return m_convReal != NULL;
3212}
3213
1c714a5d
VZ
3214size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3215 const char *src, size_t srcLen) const
3216{
3217 CreateConvIfNeeded();
3218
2c74c558
VS
3219 if (m_convReal)
3220 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3221
3222 // latin-1 (direct)
05392dc8
VZ
3223 if ( srcLen == wxNO_LEN )
3224 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3225
05392dc8
VZ
3226 if ( dst )
3227 {
3228 if ( dstLen < srcLen )
3229 return wxCONV_FAILED;
1c714a5d 3230
05392dc8
VZ
3231 for ( size_t n = 0; n < srcLen; n++ )
3232 dst[n] = (unsigned char)(src[n]);
3233 }
2c74c558 3234
05392dc8 3235 return srcLen;
1c714a5d
VZ
3236}
3237
05392dc8
VZ
3238size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3239 const wchar_t *src, size_t srcLen) const
6001e347 3240{
e95354ec 3241 CreateConvIfNeeded();
dccce9ea 3242
e95354ec 3243 if (m_convReal)
05392dc8 3244 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3245
3246 // latin-1 (direct)
05392dc8
VZ
3247 if ( srcLen == wxNO_LEN )
3248 srcLen = wxWcslen(src) + 1;
dccce9ea 3249
05392dc8 3250 if ( dst )
f1339c56 3251 {
05392dc8
VZ
3252 if ( dstLen < srcLen )
3253 return wxCONV_FAILED;
1cd52418 3254
05392dc8 3255 for ( size_t n = 0; n < srcLen; n++ )
24642831 3256 {
05392dc8 3257 if ( src[n] > 0xFF )
467e0479 3258 return wxCONV_FAILED;
ef199164 3259
05392dc8 3260 dst[n] = (char)src[n];
24642831 3261 }
05392dc8 3262
24642831 3263 }
05392dc8 3264 else // still need to check the input validity
24642831 3265 {
05392dc8 3266 for ( size_t n = 0; n < srcLen; n++ )
24642831 3267 {
05392dc8 3268 if ( src[n] > 0xFF )
467e0479 3269 return wxCONV_FAILED;
24642831 3270 }
f1339c56 3271 }
dccce9ea 3272
05392dc8 3273 return srcLen;
6001e347
RR
3274}
3275
7ef3ab50 3276size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3277{
3278 CreateConvIfNeeded();
3279
3280 if ( m_convReal )
3281 {
7ef3ab50 3282 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3283 }
3284
ba98e032 3285 // otherwise, we are ISO-8859-1
c1464d9d 3286 return 1;
eec47cc6
VZ
3287}
3288
ba98e032
VS
3289#if wxUSE_UNICODE_UTF8
3290bool wxCSConv::IsUTF8() const
3291{
3292 CreateConvIfNeeded();
3293
3294 if ( m_convReal )
3295 {
3296 return m_convReal->IsUTF8();
3297 }
3298
3299 // otherwise, we are ISO-8859-1
3300 return false;
3301}
3302#endif
3303
69c928ef
VZ
3304
3305#if wxUSE_UNICODE
3306
3307wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3308{
3309 if ( !s )
3310 return wxWCharBuffer();
3311
3312 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3313 if ( !wbuf )
5487ff0f 3314 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3315 if ( !wbuf )
3316 wbuf = wxConvISO8859_1.cMB2WX(s);
3317
3318 return wbuf;
3319}
3320
3321wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3322{
3323 if ( !ws )
3324 return wxCharBuffer();
3325
3326 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3327 if ( !buf )
3328 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3329
3330 return buf;
3331}
3332
3333#endif // wxUSE_UNICODE
f5a1953b 3334
1e50d914
VS
3335// ----------------------------------------------------------------------------
3336// globals
3337// ----------------------------------------------------------------------------
3338
3339// NB: The reason why we create converter objects in this convoluted way,
3340// using a factory function instead of global variable, is that they
3341// may be used at static initialization time (some of them are used by
3342// wxString ctors and there may be a global wxString object). In other
3343// words, possibly _before_ the converter global object would be
3344// initialized.
3345
3346#undef wxConvLibc
3347#undef wxConvUTF8
3348#undef wxConvUTF7
3349#undef wxConvLocal
3350#undef wxConvISO8859_1
3351
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines, for the
// converter called "name":
//  - a pointer name##Ptr of type klass*, declared with
//    WXDLLIMPEXP_DATA_BASE and initialized to NULL
//  - an accessor wxGet_##name##Ptr(), declared with WXDLLIMPEXP_BASE, which
//    returns the address of a function-local static object of type
//    impl_klass constructed with ctor_args
//  - a file-static pointer gs_##name##instance initialized by calling this
//    accessor
// The last declaration is left unterminated, so the macro invocation must be
// followed by a semicolon.
3352#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3353 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3354 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3355 { \
3356 static impl_klass name##Obj ctor_args; \
3357 return &name##Obj; \
3358 } \
3359 /* this ensures that all global converter objects are created */ \
3360 /* by the time static initialization is done, i.e. before any */ \
3361 /* thread is launched: */ \
3362 static klass* gs_##name##instance = wxGet_##name##Ptr()
3363
// Shorthand for WX_DEFINE_GLOBAL_CONV2 when the type of the pointer and the
// type of the static object are the same class.
3364#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3365 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3366
5c69ef61
VZ
3367#ifdef __INTELC__
3368 // disable warning "variable 'xxx' was declared but never referenced"
3369 #pragma warning(disable: 177)
3370#endif // Intel C++
3371
1e50d914
VS
// Define wxConvLibc: its static object is a wxMBConv_win32 under
// __WINDOWS__ and a wxMBConvLibc otherwise. The wxMBConv_cf variant is
// currently disabled by the "#elif 0".
3372#ifdef __WINDOWS__
3373 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3374#elif 0 // defined(__WXOSX__)
3375 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3376#else
3377 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3378#endif
3379
e1079eda
VZ
3380// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3381// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3382// provokes an error message about "not enough macro parameters"; and we
3383// can't use "()" here as the name##Obj declaration would be parsed as a
3384// function declaration then, so use a semicolon and live with an extra
3385// empty statement (and hope that no compiler warns about this)
3386WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3387WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3388
// wxConvLocal and wxConvISO8859_1 are both wxCSConv objects, constructed
// with wxFONTENCODING_SYSTEM and wxFONTENCODING_ISO8859_1 respectively
3389WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3390WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3391
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one
3392WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3393WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3394
6ac84a78
DE
3395#ifdef __DARWIN__
3396// The xnu kernel always communicates file paths in decomposed UTF-8.
3397// WARNING: Are we sure that CFString's conversion will cause decomposition?
3398static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3399#endif
6ac84a78 3400
// wxConvFileName points to the wxMBConv_cf UTF-8 object defined above under
// __DARWIN__ and to the object returned by wxGet_wxConvLibcPtr() everywhere
// else
1e50d914 3401WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3402#ifdef __DARWIN__
1e50d914 3403 &wxConvMacUTF8DObj;
6ac84a78 3404#else // !__DARWIN__
1e50d914 3405 wxGet_wxConvLibcPtr();
6ac84a78 3406#endif // __DARWIN__/!__DARWIN__
1e50d914 3407
bde4baac
VZ
3408#else // !wxUSE_WCHAR_T
3409
1e50d914 3410// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3411// stand-ins in absence of wchar_t
3412WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3413 wxConvISO8859_1,
3414 wxConvLocal,
3415 wxConvUTF8;
3416
3417#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T