]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
add very simple (but already exposing many problems) wxIPC benchmark
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
b040e242 47#ifdef HAVE_ICONV
373658eb 48 #include <iconv.h>
b1d547eb 49 #include "wx/thread.h"
1cd52418 50#endif
1cd52418 51
373658eb
VZ
52#include "wx/encconv.h"
53#include "wx/fontmap.h"
54
5c4ed98d 55#ifdef __DARWIN__
c933e267 56#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
57#endif //def __DARWIN__
58
ef199164 59
ce6f8d6f
VZ
60#define TRACE_STRCONV _T("strconv")
61
467e0479
VZ
62// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63// be 4 bytes
4948c2b6 64#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
65 #define WC_UTF16
66#endif
67
ef199164 68
373658eb
VZ
69// ============================================================================
70// implementation
71// ============================================================================
72
69373110
VZ
// Helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL, and false if all of them are
// NUL (which is trivially the case for n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; i++ )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
81
373658eb 82// ----------------------------------------------------------------------------
467e0479 83// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 84// ----------------------------------------------------------------------------
6001e347 85
c91830cb 86static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 87{
ef199164 88 if (input <= 0xffff)
4def3b35 89 {
999836aa
VZ
90 if (output)
91 *output = (wxUint16) input;
ef199164 92
4def3b35 93 return 1;
dccce9ea 94 }
ef199164 95 else if (input >= 0x110000)
4def3b35 96 {
467e0479 97 return wxCONV_FAILED;
dccce9ea
VZ
98 }
99 else
4def3b35 100 {
dccce9ea 101 if (output)
4def3b35 102 {
ef199164
DS
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 105 }
ef199164 106
4def3b35 107 return 2;
1cd52418 108 }
1cd52418
OK
109}
110
c91830cb 111static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 112{
ef199164 113 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
114 {
115 output = *input;
116 return 1;
dccce9ea 117 }
ef199164 118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
119 {
120 output = *input;
467e0479 121 return wxCONV_FAILED;
dccce9ea
VZ
122 }
123 else
4def3b35
VS
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
1cd52418
OK
128}
129
467e0479 130#ifdef WC_UTF16
35d11700
VZ
131 typedef wchar_t wxDecodeSurrogate_t;
132#else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
135
136// returns the next UTF-32 character from the wchar_t buffer and advances the
137// pointer to the character after this one
138//
139// if an invalid character is found, *pSrc is set to NULL, the caller must
140// check for this
35d11700 141static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
142{
143 wxUint32 out;
8d3dd069
VZ
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152}
153
f6bcfd97 154// ----------------------------------------------------------------------------
6001e347 155// wxMBConv
f6bcfd97 156// ----------------------------------------------------------------------------
2c53a80a 157
483b0434
VZ
158size_t
159wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
6001e347 161{
483b0434 162 // although new conversion classes are supposed to implement this function
36f93678 163 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
36f93678
VZ
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
36f93678
VZ
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complication come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
483b0434 227 for ( ;; )
eec47cc6 228 {
c1464d9d 229 // try to convert the current chunk
483b0434 230 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
e4e3bbb4 233
483b0434 234 dstWritten += lenChunk;
f6a02087
VZ
235 if ( !srcEnd )
236 dstWritten++;
f5fb6871 237
f6a02087 238 if ( !lenChunk )
467e0479
VZ
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
483b0434
VZ
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
f6a02087
VZ
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
f6a02087
VZ
254 if ( !srcEnd )
255 dst++;
483b0434 256 }
c1464d9d 257
483b0434 258 if ( !srcEnd )
c1464d9d 259 {
467e0479
VZ
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow
c1464d9d
VZ
262 break;
263 }
eec47cc6
VZ
264
265 // advance the input pointer past the end of this chunk
483b0434 266 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
483b0434 272 src += nulLen;
c1464d9d 273 }
e4e3bbb4 274
483b0434 275 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
276
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
36f93678 279 // delimited by srcEnd
483b0434 280 if ( src >= srcEnd )
c1464d9d 281 break;
36f93678
VZ
282
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
286 if ( srcEnd )
287 {
288 dstWritten++;
289 if ( dst )
290 dst++;
291 }
c1464d9d
VZ
292 }
293
483b0434 294 return dstWritten;
e4e3bbb4
RN
295}
296
483b0434
VZ
297size_t
298wxMBConv::FromWChar(char *dst, size_t dstLen,
299 const wchar_t *src, size_t srcLen) const
e4e3bbb4 300{
483b0434
VZ
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten = 0;
e4e3bbb4 303
f6a02087
VZ
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated = srcLen == wxNO_LEN;
308
eec47cc6
VZ
309 // make a copy of the input string unless it is already properly
310 // NUL-terminated
eec47cc6 311 wxWCharBuffer bufTmp;
f6a02087 312 if ( isNulTerminated )
e4e3bbb4 313 {
483b0434 314 srcLen = wxWcslen(src) + 1;
eec47cc6 315 }
483b0434 316 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
317 {
318 // make a copy in order to properly NUL-terminate the string
483b0434 319 bufTmp = wxWCharBuffer(srcLen);
ef199164 320 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
321 src = bufTmp;
322 }
323
324 const size_t lenNul = GetMBNulLen();
325 for ( const wchar_t * const srcEnd = src + srcLen;
326 src < srcEnd;
327 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
328 {
329 // try to convert the current chunk
330 size_t lenChunk = WC2MB(NULL, src, 0);
331
332 if ( lenChunk == wxCONV_FAILED )
333 return wxCONV_FAILED;
334
483b0434 335 dstWritten += lenChunk;
f6a02087
VZ
336 if ( isNulTerminated )
337 dstWritten += lenNul;
483b0434
VZ
338
339 if ( dst )
340 {
341 if ( dstWritten > dstLen )
342 return wxCONV_FAILED;
343
f6a02087 344 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
345 return wxCONV_FAILED;
346
347 dst += lenChunk;
f6a02087
VZ
348 if ( isNulTerminated )
349 dst += lenNul;
483b0434 350 }
eec47cc6 351 }
e4e3bbb4 352
483b0434
VZ
353 return dstWritten;
354}
355
ef199164 356size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 357{
8a493b67
VZ
358 // add 1 to available buffer length because MB2WC() parameter counts the
359 // number of non-NUL characters while ToWChar() counts everything
360 size_t rc = ToWChar(outBuff, outLen + 1, inBuff);
467e0479 361 if ( rc != wxCONV_FAILED )
509da451
VZ
362 {
363 // ToWChar() returns the buffer length, i.e. including the trailing
364 // NUL, while this method doesn't take it into account
365 rc--;
366 }
367
368 return rc;
369}
370
ef199164 371size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 372{
8a493b67
VZ
373 const size_t nulLen = GetMBNulLen();
374
375 size_t rc = FromWChar(outBuff, outLen + nulLen, inBuff);
467e0479 376 if ( rc != wxCONV_FAILED )
509da451 377 {
8a493b67 378 rc -= nulLen;
509da451
VZ
379 }
380
381 return rc;
382}
383
483b0434
VZ
384wxMBConv::~wxMBConv()
385{
386 // nothing to do here (necessary for Darwin linking probably)
387}
e4e3bbb4 388
483b0434
VZ
389const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
390{
391 if ( psz )
eec47cc6 392 {
483b0434 393 // calculate the length of the buffer needed first
a2db25a1 394 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 395 if ( nLen != wxCONV_FAILED )
f5fb6871 396 {
483b0434 397 // now do the actual conversion
a2db25a1 398 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 399
483b0434 400 // +1 for the trailing NULL
a2db25a1 401 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 402 return buf;
f5fb6871 403 }
483b0434 404 }
e4e3bbb4 405
483b0434
VZ
406 return wxWCharBuffer();
407}
3698ae71 408
483b0434
VZ
409const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
410{
411 if ( pwz )
412 {
a2db25a1 413 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 414 if ( nLen != wxCONV_FAILED )
483b0434 415 {
a2db25a1
VZ
416 wxCharBuffer buf(nLen - 1);
417 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
418 return buf;
419 }
420 }
421
422 return wxCharBuffer();
423}
e4e3bbb4 424
483b0434 425const wxWCharBuffer
ef199164 426wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 427{
ef199164 428 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 429 if ( dstLen != wxCONV_FAILED )
483b0434 430 {
0dd13d21
VZ
431 // notice that we allocate space for dstLen+1 wide characters here
432 // because we want the buffer to always be NUL-terminated, even if the
433 // input isn't (as otherwise the caller has no way to know its length)
434 wxWCharBuffer wbuf(dstLen);
f6a02087 435 wbuf.data()[dstLen] = L'\0';
ef199164 436 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
437 {
438 if ( outLen )
467e0479
VZ
439 {
440 *outLen = dstLen;
f6a02087
VZ
441
442 // we also need to handle NUL-terminated input strings
443 // specially: for them the output is the length of the string
444 // excluding the trailing NUL, however if we're asked to
445 // convert a specific number of characters we return the length
446 // of the resulting output even if it's NUL-terminated
447 if ( inLen == wxNO_LEN )
467e0479
VZ
448 (*outLen)--;
449 }
450
483b0434
VZ
451 return wbuf;
452 }
453 }
454
455 if ( outLen )
456 *outLen = 0;
457
458 return wxWCharBuffer();
459}
460
461const wxCharBuffer
ef199164 462wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 463{
13d92ad6 464 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 465 if ( dstLen != wxCONV_FAILED )
483b0434 466 {
0dd13d21
VZ
467 const size_t nulLen = GetMBNulLen();
468
469 // as above, ensure that the buffer is always NUL-terminated, even if
470 // the input is not
471 wxCharBuffer buf(dstLen + nulLen - 1);
472 memset(buf.data() + dstLen, 0, nulLen);
ef199164 473 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
474 {
475 if ( outLen )
467e0479
VZ
476 {
477 *outLen = dstLen;
478
f6a02087 479 if ( inLen == wxNO_LEN )
467e0479 480 {
f6a02087
VZ
481 // in this case both input and output are NUL-terminated
482 // and we're not supposed to count NUL
13d92ad6 483 *outLen -= nulLen;
467e0479
VZ
484 }
485 }
d32a507d 486
483b0434
VZ
487 return buf;
488 }
e4e3bbb4
RN
489 }
490
eec47cc6
VZ
491 if ( outLen )
492 *outLen = 0;
493
494 return wxCharBuffer();
e4e3bbb4
RN
495}
496
6001e347 497// ----------------------------------------------------------------------------
bde4baac 498// wxMBConvLibc
6001e347
RR
499// ----------------------------------------------------------------------------
500
bde4baac
VZ
501size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
502{
503 return wxMB2WC(buf, psz, n);
504}
505
506size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
507{
508 return wxWC2MB(buf, psz, n);
509}
e1bfe89e
RR
510
511// ----------------------------------------------------------------------------
532d575b 512// wxConvBrokenFileNames
e1bfe89e
RR
513// ----------------------------------------------------------------------------
514
eec47cc6
VZ
#ifdef __UNIX__

// Create the conversion used internally: for UTF-8 (the charset name is
// compared case-insensitively and may be spelt with or without the hyphen)
// a wxMBConvUTF8 in MAP_INVALID_UTF8_TO_PUA mode is used, for any other
// charset a wxCSConv for it.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    const bool isUTF8 = wxStricmp(charset, _T("UTF-8")) == 0 ||
                        wxStricmp(charset, _T("UTF8")) == 0;

    if ( isUTF8 )
    {
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
    }
    else
    {
        m_conv = new wxCSConv(charset);
    }
}

#endif // __UNIX__
c12b7f79 527
bde4baac 528// ----------------------------------------------------------------------------
3698ae71 529// UTF-7
bde4baac 530// ----------------------------------------------------------------------------
6001e347 531
15f2ee32 532// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
533//
534// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 535
15f2ee32
RN
536//
537// BASE64 decoding table
538//
// Maps each byte value to the 6 bit value it stands for in base64 or to 0xff
// if the byte is not one of the base64 characters ('A'-'Z', 'a'-'z', '0'-'9',
// '+' and '/').
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f: control characters
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: only '+' (0x2b) and '/' (0x2f) are valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: non-ASCII bytes are never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
574
9d653e81
VZ
575size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
576 const char *src, size_t srcLen) const
15f2ee32 577{
9d653e81 578 DecoderState stateOrig,
852dcba5 579 *statePtr;
9d653e81
VZ
580 if ( srcLen == wxNO_LEN )
581 {
582 // convert the entire string, up to and including the trailing NUL
583 srcLen = strlen(src) + 1;
584
585 // when working on the entire strings we don't update nor use the shift
586 // state from the previous call
587 statePtr = &stateOrig;
588 }
589 else // when working with partial strings we do use the shift state
590 {
591 statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
592
593 // also save the old state to be able to rollback to it on error
594 stateOrig = m_stateDecoder;
595 }
596
597 // but to simplify the code below we use this variable in both cases
598 DecoderState& state = *statePtr;
599
600
601 // number of characters [which would have been] written to dst [if it were
602 // not NULL]
15f2ee32
RN
603 size_t len = 0;
604
9d653e81
VZ
605 const char * const srcEnd = src + srcLen;
606
607 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 608 {
9d653e81
VZ
609 const unsigned char cc = *src++;
610
611 if ( state.IsShifted() )
15f2ee32 612 {
9d653e81
VZ
613 const unsigned char dc = utf7unb64[cc];
614 if ( dc == 0xff )
15f2ee32 615 {
ccaa848d
VZ
616 // end of encoded part, check that nothing was left: there can
617 // be up to 4 bits of 0 padding but nothing else (we also need
618 // to check isLSB as we count bits modulo 8 while a valid UTF-7
619 // encoded sequence must contain an integral number of UTF-16
620 // characters)
621 if ( state.isLSB || state.bit > 4 ||
622 (state.accum & ((1 << state.bit) - 1)) )
623 {
624 if ( !len )
625 state = stateOrig;
626
852dcba5 627 return wxCONV_FAILED;
ccaa848d 628 }
852dcba5 629
9d653e81
VZ
630 state.ToDirect();
631
632 // re-parse this character normally below unless it's '-' which
633 // is consumed by the decoder
634 if ( cc == '-' )
635 continue;
636 }
637 else // valid encoded character
638 {
639 // mini base64 decoder: each character is 6 bits
640 state.bit += 6;
641 state.accum <<= 6;
642 state.accum += dc;
643
644 if ( state.bit >= 8 )
15f2ee32 645 {
9d653e81
VZ
646 // got the full byte, consume it
647 state.bit -= 8;
648 unsigned char b = (state.accum >> state.bit) & 0x00ff;
649
650 if ( state.isLSB )
15f2ee32 651 {
9d653e81
VZ
652 // we've got the full word, output it
653 if ( dst )
654 *dst++ = (state.msb << 8) | b;
655 len++;
656 state.isLSB = false;
15f2ee32 657 }
9d653e81 658 else // MSB
04a37834 659 {
9d653e81
VZ
660 // just store it while we wait for LSB
661 state.msb = b;
662 state.isLSB = true;
04a37834 663 }
15f2ee32
RN
664 }
665 }
9d653e81 666 }
04a37834 667
9d653e81
VZ
668 if ( state.IsDirect() )
669 {
670 // start of an encoded segment?
671 if ( cc == '+' )
04a37834 672 {
9d653e81
VZ
673 if ( *src == '-' )
674 {
675 // just the encoded plus sign, don't switch to shifted mode
676 if ( dst )
677 *dst++ = '+';
678 len++;
679 src++;
680 }
ccaa848d
VZ
681 else if ( utf7unb64[(unsigned)*src] == 0xff )
682 {
683 // empty encoded chunks are not allowed
684 if ( !len )
685 state = stateOrig;
686
687 return wxCONV_FAILED;
688 }
689 else // base-64 encoded chunk follows
9d653e81
VZ
690 {
691 state.ToShifted();
692 }
693 }
694 else // not '+'
695 {
696 // only printable 7 bit ASCII characters (with the exception of
697 // NUL, TAB, CR and LF) can be used directly
698 if ( cc >= 0x7f || (cc < ' ' &&
699 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
700 return wxCONV_FAILED;
701
702 if ( dst )
703 *dst++ = cc;
704 len++;
705 }
15f2ee32
RN
706 }
707 }
04a37834 708
9d653e81
VZ
709 if ( !len )
710 {
711 // as we didn't read any characters we should be called with the same
712 // data (followed by some more new data) again later so don't save our
713 // state
714 state = stateOrig;
715
716 return wxCONV_FAILED;
717 }
04a37834 718
15f2ee32 719 return len;
6001e347
RR
720}
721
15f2ee32
RN
722//
723// BASE64 encoding table
724//
// Maps a 6 bit value (0..63) to the base64 character representing it.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
736
737//
738// UTF-7 encoding table
739//
740// 0 - Set D (directly encoded characters)
741// 1 - Set O (optional direct characters)
742// 2 - whitespace characters (optional)
743// 3 - special characters
744//
// Classification of the 7 bit ASCII characters for UTF-7 encoding purposes,
// indexed by the character code (0..127), see the legend above.
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if the given character can be output directly, i.e. without
// base64 encoding it: this is the case only for the 7 bit characters whose
// utf7encode[] entry is 0.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms, so compare it as unsigned
    // to ensure that a negative value can never be used as the table index
    const unsigned long code = (unsigned long)wc;

    return code < 0x80 && utf7encode[code] < 1;
}
761
762size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
763 const wchar_t *src, size_t srcLen) const
15f2ee32 764{
9d653e81
VZ
765 EncoderState stateOrig,
766 *statePtr;
767 if ( srcLen == wxNO_LEN )
768 {
769 // we don't apply the stored state when operating on entire strings at
770 // once
771 statePtr = &stateOrig;
772
773 srcLen = wxWcslen(src) + 1;
774 }
775 else // do use the mode we left the output in previously
776 {
777 stateOrig = m_stateEncoder;
778 statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
779 }
780
781 EncoderState& state = *statePtr;
782
783
15f2ee32
RN
784 size_t len = 0;
785
9d653e81
VZ
786 const wchar_t * const srcEnd = src + srcLen;
787 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 788 {
9d653e81
VZ
789 wchar_t cc = *src++;
790 if ( wxIsUTF7Direct(cc) )
15f2ee32 791 {
9d653e81
VZ
792 if ( state.IsShifted() )
793 {
794 // pad with zeros the last encoded block if necessary
795 if ( state.bit )
796 {
797 if ( dst )
798 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
799 len++;
800 }
ef199164 801
9d653e81
VZ
802 state.ToDirect();
803
804 if ( dst )
805 *dst++ = '-';
806 len++;
807 }
808
809 if ( dst )
810 *dst++ = (char)cc;
15f2ee32
RN
811 len++;
812 }
9d653e81
VZ
813 else if ( cc == '+' && state.IsDirect() )
814 {
815 if ( dst )
816 {
817 *dst++ = '+';
818 *dst++ = '-';
819 }
820
821 len += 2;
822 }
15f2ee32 823#ifndef WC_UTF16
79c78d42 824 else if (((wxUint32)cc) > 0xffff)
b2c13097 825 {
15f2ee32 826 // no surrogate pair generation (yet?)
467e0479 827 return wxCONV_FAILED;
15f2ee32
RN
828 }
829#endif
830 else
831 {
9d653e81
VZ
832 if ( state.IsDirect() )
833 {
834 state.ToShifted();
ef199164 835
9d653e81
VZ
836 if ( dst )
837 *dst++ = '+';
838 len++;
839 }
840
841 // BASE64 encode string
842 for ( ;; )
15f2ee32 843 {
9d653e81 844 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 845 {
9d653e81
VZ
846 state.accum <<= 8;
847 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
848
849 for (state.bit += 8; state.bit >= 6; )
15f2ee32 850 {
9d653e81
VZ
851 state.bit -= 6;
852 if ( dst )
853 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
854 len++;
15f2ee32 855 }
15f2ee32 856 }
ef199164 857
9d653e81
VZ
858 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
859 break;
ef199164 860
9d653e81 861 src++;
15f2ee32 862 }
15f2ee32
RN
863 }
864 }
ef199164 865
9d653e81
VZ
866 // we need to restore the original encoder state if we were called just to
867 // calculate the amount of space needed as we will presumably be called
868 // again to really convert the data now
869 if ( !dst )
870 state = stateOrig;
ef199164 871
15f2ee32 872 return len;
6001e347
RR
873}
874
f6bcfd97 875// ----------------------------------------------------------------------------
6001e347 876// UTF-8
f6bcfd97 877// ----------------------------------------------------------------------------
6001e347 878
1774c3c5 879static const wxUint32 utf8_max[]=
4def3b35 880 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 881
3698ae71
VZ
882// boundaries of the private use area we use to (temporarily) remap invalid
883// characters invalid in a UTF-8 encoded string
ea8ce907
RR
884const wxUint32 wxUnicodePUA = 0x100000;
885const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
886
// this table gives the length of the UTF-8 encoding from its first byte, 0
// means that the byte can't start a valid UTF-8 sequence
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: these are invalid
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..DF: two-byte sequences, except for C0 and C1 which are invalid
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4: four-byte sequences; F5..FF are invalid again (5- or 6-byte
    // sequences and sequences for code points above U+10FFFF, as restricted
    // by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
921
922size_t
923wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
924 const char *src, size_t srcLen) const
925{
926 wchar_t *out = dstLen ? dst : NULL;
927 size_t written = 0;
928
929 if ( srcLen == wxNO_LEN )
930 srcLen = strlen(src) + 1;
931
932 for ( const char *p = src; ; p++ )
933 {
934 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
935 {
936 // all done successfully, just add the trailing NULL if we are not
937 // using explicit length
938 if ( srcLen == wxNO_LEN )
939 {
940 if ( out )
941 {
942 if ( !dstLen )
943 break;
944
945 *out = L'\0';
946 }
947
948 written++;
949 }
950
951 return written;
952 }
953
0286d08d
VZ
954 if ( out && !dstLen-- )
955 break;
956
5367a38a
VS
957 wxUint32 code;
958 unsigned char c = *p;
0286d08d 959
5367a38a
VS
960 if ( c < 0x80 )
961 {
962 if ( srcLen == 0 ) // the test works for wxNO_LEN too
963 break;
0286d08d 964
5367a38a
VS
965 if ( srcLen != wxNO_LEN )
966 srcLen--;
0286d08d 967
5367a38a
VS
968 code = c;
969 }
970 else
0286d08d 971 {
5367a38a
VS
972 unsigned len = tableUtf8Lengths[c];
973 if ( !len )
974 break;
975
976 if ( srcLen < len ) // the test works for wxNO_LEN too
977 break;
978
979 if ( srcLen != wxNO_LEN )
980 srcLen -= len;
981
982 // Char. number range | UTF-8 octet sequence
983 // (hexadecimal) | (binary)
984 // ----------------------+----------------------------------------
985 // 0000 0000 - 0000 007F | 0xxxxxxx
986 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
987 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
988 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
989 //
990 // Code point value is stored in bits marked with 'x',
991 // lowest-order bit of the value on the right side in the diagram
992 // above. (from RFC 3629)
993
994 // mask to extract lead byte's value ('x' bits above), by sequence
995 // length:
996 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
997
998 // mask and value of lead byte's most significant bits, by length:
999 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1000 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1001
1002 len--; // it's more convenient to work with 0-based length here
1003
1004 // extract the lead byte's value bits:
1005 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1006 break;
1007
1008 code = c & leadValueMask[len];
1009
1010 // all remaining bytes, if any, are handled in the same way
1011 // regardless of sequence's length:
1012 for ( ; len; --len )
1013 {
1014 c = *++p;
1015 if ( (c & 0xC0) != 0x80 )
1016 return wxCONV_FAILED;
0286d08d 1017
5367a38a
VS
1018 code <<= 6;
1019 code |= c & 0x3F;
1020 }
0286d08d
VZ
1021 }
1022
1023#ifdef WC_UTF16
1024 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1025 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1026 {
1027 if ( out )
1028 out++;
1029 written++;
1030 }
1031#else // !WC_UTF16
1032 if ( out )
1033 *out = code;
1034#endif // WC_UTF16/!WC_UTF16
1035
1036 if ( out )
1037 out++;
1038
1039 written++;
1040 }
1041
1042 return wxCONV_FAILED;
1043}
1044
1045size_t
1046wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1047 const wchar_t *src, size_t srcLen) const
1048{
1049 char *out = dstLen ? dst : NULL;
1050 size_t written = 0;
1051
1052 for ( const wchar_t *wp = src; ; wp++ )
1053 {
a964d3ed 1054 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
0286d08d
VZ
1055 {
1056 // all done successfully, just add the trailing NULL if we are not
1057 // using explicit length
1058 if ( srcLen == wxNO_LEN )
1059 {
1060 if ( out )
1061 {
1062 if ( !dstLen )
1063 break;
1064
1065 *out = '\0';
1066 }
1067
1068 written++;
1069 }
1070
1071 return written;
1072 }
1073
a964d3ed
VZ
1074 if ( srcLen != wxNO_LEN )
1075 srcLen--;
0286d08d
VZ
1076
1077 wxUint32 code;
1078#ifdef WC_UTF16
1079 // cast is ok for WC_UTF16
1080 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1081 {
1082 // skip the next char too as we decoded a surrogate
1083 wp++;
1084 }
1085#else // wchar_t is UTF-32
1086 code = *wp & 0x7fffffff;
1087#endif
1088
1089 unsigned len;
1090 if ( code <= 0x7F )
1091 {
1092 len = 1;
1093 if ( out )
1094 {
1095 if ( dstLen < len )
1096 break;
1097
1098 out[0] = (char)code;
1099 }
1100 }
1101 else if ( code <= 0x07FF )
1102 {
1103 len = 2;
1104 if ( out )
1105 {
1106 if ( dstLen < len )
1107 break;
1108
1109 // NB: this line takes 6 least significant bits, encodes them as
1110 // 10xxxxxx and discards them so that the next byte can be encoded:
1111 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1112 out[0] = 0xC0 | code;
1113 }
1114 }
1115 else if ( code < 0xFFFF )
1116 {
1117 len = 3;
1118 if ( out )
1119 {
1120 if ( dstLen < len )
1121 break;
1122
1123 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1124 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1125 out[0] = 0xE0 | code;
1126 }
1127 }
1128 else if ( code <= 0x10FFFF )
1129 {
1130 len = 4;
1131 if ( out )
1132 {
1133 if ( dstLen < len )
1134 break;
1135
1136 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1137 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1138 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1139 out[0] = 0xF0 | code;
1140 }
1141 }
1142 else
1143 {
1144 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1145 break;
1146 }
1147
1148 if ( out )
1149 {
1150 out += len;
1151 dstLen -= len;
1152 }
1153
1154 written += len;
1155 }
1156
1157 // we only get here if an error occurs during decoding
1158 return wxCONV_FAILED;
1159}
1160
d16d0917
VZ
1161size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1162 const char *psz, size_t srcLen) const
6001e347 1163{
0286d08d 1164 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1165 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1166
4def3b35
VS
1167 size_t len = 0;
1168
d16d0917 1169 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1170 {
ea8ce907
RR
1171 const char *opsz = psz;
1172 bool invalid = false;
4def3b35
VS
1173 unsigned char cc = *psz++, fc = cc;
1174 unsigned cnt;
dccce9ea 1175 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1176 fc <<= 1;
ef199164 1177
dccce9ea 1178 if (!cnt)
4def3b35
VS
1179 {
1180 // plain ASCII char
dccce9ea 1181 if (buf)
4def3b35
VS
1182 *buf++ = cc;
1183 len++;
561488ef
MW
1184
1185 // escape the escape character for octal escapes
1186 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1187 && cc == '\\' && (!buf || len < n))
1188 {
1189 if (buf)
1190 *buf++ = cc;
1191 len++;
1192 }
dccce9ea
VZ
1193 }
1194 else
4def3b35
VS
1195 {
1196 cnt--;
dccce9ea 1197 if (!cnt)
4def3b35
VS
1198 {
1199 // invalid UTF-8 sequence
ea8ce907 1200 invalid = true;
dccce9ea
VZ
1201 }
1202 else
4def3b35
VS
1203 {
1204 unsigned ocnt = cnt - 1;
1205 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1206 while (cnt--)
4def3b35 1207 {
ea8ce907 1208 cc = *psz;
dccce9ea 1209 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1210 {
1211 // invalid UTF-8 sequence
ea8ce907
RR
1212 invalid = true;
1213 break;
4def3b35 1214 }
ef199164 1215
ea8ce907 1216 psz++;
4def3b35
VS
1217 res = (res << 6) | (cc & 0x3f);
1218 }
ef199164 1219
ea8ce907 1220 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1221 {
1222 // illegal UTF-8 encoding
ea8ce907 1223 invalid = true;
4def3b35 1224 }
ea8ce907
RR
1225 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1226 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1227 {
1228 // if one of our PUA characters turns up externally
1229 // it must also be treated as an illegal sequence
1230 // (a bit like you have to escape an escape character)
1231 invalid = true;
1232 }
1233 else
1234 {
1cd52418 1235#ifdef WC_UTF16
0286d08d 1236 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1237 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1238 if (pa == wxCONV_FAILED)
ea8ce907
RR
1239 {
1240 invalid = true;
1241 }
1242 else
1243 {
1244 if (buf)
1245 buf += pa;
1246 len += pa;
1247 }
373658eb 1248#else // !WC_UTF16
ea8ce907 1249 if (buf)
38d4b1e4 1250 *buf++ = (wchar_t)res;
ea8ce907 1251 len++;
373658eb 1252#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1253 }
1254 }
ef199164 1255
ea8ce907
RR
1256 if (invalid)
1257 {
1258 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1259 {
1260 while (opsz < psz && (!buf || len < n))
1261 {
1262#ifdef WC_UTF16
1263 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1264 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1265 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1266 if (buf)
1267 buf += pa;
1268 opsz++;
1269 len += pa;
1270#else
1271 if (buf)
38d4b1e4 1272 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1273 opsz++;
1274 len++;
1275#endif
1276 }
1277 }
3698ae71 1278 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1279 {
1280 while (opsz < psz && (!buf || len < n))
1281 {
3698ae71
VZ
1282 if ( buf && len + 3 < n )
1283 {
17a1ebd1 1284 unsigned char on = *opsz;
3698ae71 1285 *buf++ = L'\\';
17a1ebd1
VZ
1286 *buf++ = (wchar_t)( L'0' + on / 0100 );
1287 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1288 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1289 }
ef199164 1290
ea8ce907
RR
1291 opsz++;
1292 len += 4;
1293 }
1294 }
3698ae71 1295 else // MAP_INVALID_UTF8_NOT
ea8ce907 1296 {
467e0479 1297 return wxCONV_FAILED;
ea8ce907 1298 }
4def3b35
VS
1299 }
1300 }
6001e347 1301 }
ef199164 1302
d16d0917 1303 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1304 *buf = 0;
ef199164 1305
d16d0917 1306 return len + 1;
6001e347
RR
1307}
1308
3698ae71
VZ
// Return true if the given wide character is an octal digit ('0' to '7').
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1313
d16d0917
VZ
1314size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1315 const wchar_t *psz, size_t srcLen) const
6001e347 1316{
0286d08d 1317 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1318 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1319
4def3b35 1320 size_t len = 0;
6001e347 1321
d16d0917 1322 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1323 {
1324 wxUint32 cc;
ef199164 1325
1cd52418 1326#ifdef WC_UTF16
b5153fd8
VZ
1327 // cast is ok for WC_UTF16
1328 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1329 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1330#else
ef199164 1331 cc = (*psz++) & 0x7fffffff;
4def3b35 1332#endif
3698ae71
VZ
1333
1334 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1335 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1336 {
dccce9ea 1337 if (buf)
ea8ce907 1338 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1339 len++;
3698ae71 1340 }
561488ef
MW
1341 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1342 && cc == L'\\' && psz[0] == L'\\' )
1343 {
1344 if (buf)
1345 *buf++ = (char)cc;
1346 psz++;
1347 len++;
1348 }
3698ae71
VZ
1349 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1350 cc == L'\\' &&
1351 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1352 {
dccce9ea 1353 if (buf)
3698ae71 1354 {
ef199164
DS
1355 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1356 (psz[1] - L'0') * 010 +
b2c13097 1357 (psz[2] - L'0'));
3698ae71
VZ
1358 }
1359
1360 psz += 3;
ea8ce907
RR
1361 len++;
1362 }
1363 else
1364 {
1365 unsigned cnt;
ef199164
DS
1366 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1367 {
1368 }
1369
ea8ce907 1370 if (!cnt)
4def3b35 1371 {
ea8ce907
RR
1372 // plain ASCII char
1373 if (buf)
1374 *buf++ = (char) cc;
1375 len++;
1376 }
ea8ce907
RR
1377 else
1378 {
1379 len += cnt + 1;
1380 if (buf)
1381 {
1382 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1383 while (cnt--)
1384 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1385 }
4def3b35
VS
1386 }
1387 }
6001e347 1388 }
4def3b35 1389
d16d0917 1390 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1391 *buf = 0;
adb45366 1392
d16d0917 1393 return len + 1;
6001e347
RR
1394}
1395
467e0479 1396// ============================================================================
c91830cb 1397// UTF-16
467e0479 1398// ============================================================================
c91830cb
VZ
1399
1400#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1401 #define wxMBConvUTF16straight wxMBConvUTF16BE
1402 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1403#else
bde4baac
VZ
1404 #define wxMBConvUTF16swap wxMBConvUTF16BE
1405 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1406#endif
1407
467e0479
VZ
1408/* static */
1409size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1410{
1411 if ( srcLen == wxNO_LEN )
1412 {
1413 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1414 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1415 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1416 ;
c91830cb 1417
467e0479
VZ
1418 srcLen *= BYTES_PER_CHAR;
1419 }
1420 else // we already have the length
1421 {
1422 // we can only convert an entire number of UTF-16 characters
1423 if ( srcLen % BYTES_PER_CHAR )
1424 return wxCONV_FAILED;
1425 }
1426
1427 return srcLen;
1428}
1429
1430// case when in-memory representation is UTF-16 too
c91830cb
VZ
1431#ifdef WC_UTF16
1432
467e0479
VZ
1433// ----------------------------------------------------------------------------
1434// conversions without endianness change
1435// ----------------------------------------------------------------------------
1436
1437size_t
1438wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1439 const char *src, size_t srcLen) const
c91830cb 1440{
467e0479
VZ
1441 // set up the scene for using memcpy() (which is presumably more efficient
1442 // than copying the bytes one by one)
1443 srcLen = GetLength(src, srcLen);
1444 if ( srcLen == wxNO_LEN )
1445 return wxCONV_FAILED;
c91830cb 1446
ef199164 1447 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1448 if ( dst )
c91830cb 1449 {
467e0479
VZ
1450 if ( dstLen < inLen )
1451 return wxCONV_FAILED;
c91830cb 1452
467e0479 1453 memcpy(dst, src, srcLen);
c91830cb 1454 }
d32a507d 1455
467e0479 1456 return inLen;
c91830cb
VZ
1457}
1458
467e0479
VZ
1459size_t
1460wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1461 const wchar_t *src, size_t srcLen) const
c91830cb 1462{
467e0479
VZ
1463 if ( srcLen == wxNO_LEN )
1464 srcLen = wxWcslen(src) + 1;
c91830cb 1465
467e0479
VZ
1466 srcLen *= BYTES_PER_CHAR;
1467
1468 if ( dst )
c91830cb 1469 {
467e0479
VZ
1470 if ( dstLen < srcLen )
1471 return wxCONV_FAILED;
d32a507d 1472
467e0479 1473 memcpy(dst, src, srcLen);
c91830cb 1474 }
d32a507d 1475
467e0479 1476 return srcLen;
c91830cb
VZ
1477}
1478
467e0479
VZ
1479// ----------------------------------------------------------------------------
1480// endian-reversing conversions
1481// ----------------------------------------------------------------------------
c91830cb 1482
467e0479
VZ
1483size_t
1484wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1485 const char *src, size_t srcLen) const
c91830cb 1486{
467e0479
VZ
1487 srcLen = GetLength(src, srcLen);
1488 if ( srcLen == wxNO_LEN )
1489 return wxCONV_FAILED;
c91830cb 1490
467e0479
VZ
1491 srcLen /= BYTES_PER_CHAR;
1492
1493 if ( dst )
c91830cb 1494 {
467e0479
VZ
1495 if ( dstLen < srcLen )
1496 return wxCONV_FAILED;
1497
ef199164
DS
1498 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1499 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1500 {
ef199164 1501 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1502 }
c91830cb 1503 }
bfab25d4 1504
467e0479 1505 return srcLen;
c91830cb
VZ
1506}
1507
467e0479
VZ
1508size_t
1509wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1510 const wchar_t *src, size_t srcLen) const
c91830cb 1511{
467e0479
VZ
1512 if ( srcLen == wxNO_LEN )
1513 srcLen = wxWcslen(src) + 1;
c91830cb 1514
467e0479
VZ
1515 srcLen *= BYTES_PER_CHAR;
1516
1517 if ( dst )
c91830cb 1518 {
467e0479
VZ
1519 if ( dstLen < srcLen )
1520 return wxCONV_FAILED;
1521
ef199164 1522 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1523 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1524 {
ef199164 1525 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1526 }
c91830cb 1527 }
eec47cc6 1528
467e0479 1529 return srcLen;
c91830cb
VZ
1530}
1531
467e0479 1532#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1533
467e0479
VZ
1534// ----------------------------------------------------------------------------
1535// conversions without endianness change
1536// ----------------------------------------------------------------------------
c91830cb 1537
35d11700
VZ
1538size_t
1539wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1540 const char *src, size_t srcLen) const
c91830cb 1541{
35d11700
VZ
1542 srcLen = GetLength(src, srcLen);
1543 if ( srcLen == wxNO_LEN )
1544 return wxCONV_FAILED;
c91830cb 1545
ef199164 1546 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1547 if ( !dst )
c91830cb 1548 {
35d11700
VZ
1549 // optimization: return maximal space which could be needed for this
1550 // string even if the real size could be smaller if the buffer contains
1551 // any surrogates
1552 return inLen;
c91830cb 1553 }
c91830cb 1554
35d11700 1555 size_t outLen = 0;
ef199164
DS
1556 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1557 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1558 {
ef199164
DS
1559 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1560 if ( !inBuff )
35d11700
VZ
1561 return wxCONV_FAILED;
1562
1563 if ( ++outLen > dstLen )
1564 return wxCONV_FAILED;
c91830cb 1565
35d11700
VZ
1566 *dst++ = ch;
1567 }
1568
1569
1570 return outLen;
1571}
c91830cb 1572
35d11700
VZ
1573size_t
1574wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1575 const wchar_t *src, size_t srcLen) const
c91830cb 1576{
35d11700
VZ
1577 if ( srcLen == wxNO_LEN )
1578 srcLen = wxWcslen(src) + 1;
c91830cb 1579
35d11700 1580 size_t outLen = 0;
ef199164 1581 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1582 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1583 {
1584 wxUint16 cc[2];
35d11700
VZ
1585 const size_t numChars = encode_utf16(*src++, cc);
1586 if ( numChars == wxCONV_FAILED )
1587 return wxCONV_FAILED;
c91830cb 1588
ef199164
DS
1589 outLen += numChars * BYTES_PER_CHAR;
1590 if ( outBuff )
c91830cb 1591 {
35d11700
VZ
1592 if ( outLen > dstLen )
1593 return wxCONV_FAILED;
1594
ef199164 1595 *outBuff++ = cc[0];
35d11700 1596 if ( numChars == 2 )
69b80d28 1597 {
35d11700 1598 // second character of a surrogate
ef199164 1599 *outBuff++ = cc[1];
69b80d28 1600 }
c91830cb 1601 }
c91830cb 1602 }
c91830cb 1603
35d11700 1604 return outLen;
c91830cb
VZ
1605}
1606
467e0479
VZ
1607// ----------------------------------------------------------------------------
1608// endian-reversing conversions
1609// ----------------------------------------------------------------------------
c91830cb 1610
35d11700
VZ
1611size_t
1612wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1613 const char *src, size_t srcLen) const
c91830cb 1614{
35d11700
VZ
1615 srcLen = GetLength(src, srcLen);
1616 if ( srcLen == wxNO_LEN )
1617 return wxCONV_FAILED;
1618
ef199164 1619 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1620 if ( !dst )
1621 {
1622 // optimization: return maximal space which could be needed for this
1623 // string even if the real size could be smaller if the buffer contains
1624 // any surrogates
1625 return inLen;
1626 }
c91830cb 1627
35d11700 1628 size_t outLen = 0;
ef199164
DS
1629 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1630 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1631 {
35d11700
VZ
1632 wxUint32 ch;
1633 wxUint16 tmp[2];
ef199164
DS
1634
1635 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1636 inBuff++;
1637 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1638
35d11700
VZ
1639 const size_t numChars = decode_utf16(tmp, ch);
1640 if ( numChars == wxCONV_FAILED )
1641 return wxCONV_FAILED;
c91830cb 1642
35d11700 1643 if ( numChars == 2 )
ef199164 1644 inBuff++;
35d11700
VZ
1645
1646 if ( ++outLen > dstLen )
1647 return wxCONV_FAILED;
c91830cb 1648
35d11700 1649 *dst++ = ch;
c91830cb 1650 }
c91830cb 1651
c91830cb 1652
35d11700
VZ
1653 return outLen;
1654}
c91830cb 1655
35d11700
VZ
1656size_t
1657wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1658 const wchar_t *src, size_t srcLen) const
c91830cb 1659{
35d11700
VZ
1660 if ( srcLen == wxNO_LEN )
1661 srcLen = wxWcslen(src) + 1;
c91830cb 1662
35d11700 1663 size_t outLen = 0;
ef199164 1664 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1665 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1666 {
1667 wxUint16 cc[2];
35d11700
VZ
1668 const size_t numChars = encode_utf16(*src, cc);
1669 if ( numChars == wxCONV_FAILED )
1670 return wxCONV_FAILED;
c91830cb 1671
ef199164
DS
1672 outLen += numChars * BYTES_PER_CHAR;
1673 if ( outBuff )
c91830cb 1674 {
35d11700
VZ
1675 if ( outLen > dstLen )
1676 return wxCONV_FAILED;
1677
ef199164 1678 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1679 if ( numChars == 2 )
c91830cb 1680 {
35d11700 1681 // second character of a surrogate
ef199164 1682 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1683 }
1684 }
c91830cb 1685 }
c91830cb 1686
35d11700 1687 return outLen;
c91830cb
VZ
1688}
1689
467e0479 1690#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1691
1692
35d11700 1693// ============================================================================
c91830cb 1694// UTF-32
35d11700 1695// ============================================================================
c91830cb
VZ
1696
1697#ifdef WORDS_BIGENDIAN
467e0479
VZ
1698 #define wxMBConvUTF32straight wxMBConvUTF32BE
1699 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1700#else
467e0479
VZ
1701 #define wxMBConvUTF32swap wxMBConvUTF32BE
1702 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1703#endif
1704
1705
1706WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1707WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1708
467e0479
VZ
1709/* static */
1710size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1711{
1712 if ( srcLen == wxNO_LEN )
1713 {
1714 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1715 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1716 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1717 ;
c91830cb 1718
467e0479
VZ
1719 srcLen *= BYTES_PER_CHAR;
1720 }
1721 else // we already have the length
1722 {
1723 // we can only convert an entire number of UTF-32 characters
1724 if ( srcLen % BYTES_PER_CHAR )
1725 return wxCONV_FAILED;
1726 }
1727
1728 return srcLen;
1729}
1730
1731// case when in-memory representation is UTF-16
c91830cb
VZ
1732#ifdef WC_UTF16
1733
467e0479
VZ
1734// ----------------------------------------------------------------------------
1735// conversions without endianness change
1736// ----------------------------------------------------------------------------
1737
1738size_t
1739wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1740 const char *src, size_t srcLen) const
c91830cb 1741{
467e0479
VZ
1742 srcLen = GetLength(src, srcLen);
1743 if ( srcLen == wxNO_LEN )
1744 return wxCONV_FAILED;
c91830cb 1745
ef199164
DS
1746 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1747 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1748 size_t outLen = 0;
1749 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1750 {
1751 wxUint16 cc[2];
ef199164 1752 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1753 if ( numChars == wxCONV_FAILED )
1754 return wxCONV_FAILED;
c91830cb 1755
467e0479
VZ
1756 outLen += numChars;
1757 if ( dst )
c91830cb 1758 {
467e0479
VZ
1759 if ( outLen > dstLen )
1760 return wxCONV_FAILED;
d32a507d 1761
467e0479
VZ
1762 *dst++ = cc[0];
1763 if ( numChars == 2 )
1764 {
1765 // second character of a surrogate
1766 *dst++ = cc[1];
1767 }
1768 }
c91830cb 1769 }
d32a507d 1770
467e0479 1771 return outLen;
c91830cb
VZ
1772}
1773
467e0479
VZ
1774size_t
1775wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1776 const wchar_t *src, size_t srcLen) const
c91830cb 1777{
467e0479
VZ
1778 if ( srcLen == wxNO_LEN )
1779 srcLen = wxWcslen(src) + 1;
c91830cb 1780
467e0479 1781 if ( !dst )
c91830cb 1782 {
467e0479
VZ
1783 // optimization: return maximal space which could be needed for this
1784 // string instead of the exact amount which could be less if there are
1785 // any surrogates in the input
1786 //
1787 // we consider that surrogates are rare enough to make it worthwhile to
1788 // avoid running the loop below at the cost of slightly extra memory
1789 // consumption
ef199164 1790 return srcLen * BYTES_PER_CHAR;
467e0479 1791 }
c91830cb 1792
ef199164 1793 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1794 size_t outLen = 0;
1795 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1796 {
1797 const wxUint32 ch = wxDecodeSurrogate(&src);
1798 if ( !src )
1799 return wxCONV_FAILED;
c91830cb 1800
467e0479 1801 outLen += BYTES_PER_CHAR;
d32a507d 1802
467e0479
VZ
1803 if ( outLen > dstLen )
1804 return wxCONV_FAILED;
b5153fd8 1805
ef199164 1806 *outBuff++ = ch;
467e0479 1807 }
c91830cb 1808
467e0479 1809 return outLen;
c91830cb
VZ
1810}
1811
467e0479
VZ
1812// ----------------------------------------------------------------------------
1813// endian-reversing conversions
1814// ----------------------------------------------------------------------------
c91830cb 1815
467e0479
VZ
1816size_t
1817wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1818 const char *src, size_t srcLen) const
c91830cb 1819{
467e0479
VZ
1820 srcLen = GetLength(src, srcLen);
1821 if ( srcLen == wxNO_LEN )
1822 return wxCONV_FAILED;
c91830cb 1823
ef199164
DS
1824 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1825 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1826 size_t outLen = 0;
ef199164 1827 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1828 {
c91830cb 1829 wxUint16 cc[2];
ef199164 1830 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1831 if ( numChars == wxCONV_FAILED )
1832 return wxCONV_FAILED;
c91830cb 1833
467e0479
VZ
1834 outLen += numChars;
1835 if ( dst )
c91830cb 1836 {
467e0479
VZ
1837 if ( outLen > dstLen )
1838 return wxCONV_FAILED;
d32a507d 1839
467e0479
VZ
1840 *dst++ = cc[0];
1841 if ( numChars == 2 )
1842 {
1843 // second character of a surrogate
1844 *dst++ = cc[1];
1845 }
1846 }
c91830cb 1847 }
b5153fd8 1848
467e0479 1849 return outLen;
c91830cb
VZ
1850}
1851
467e0479
VZ
1852size_t
1853wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1854 const wchar_t *src, size_t srcLen) const
c91830cb 1855{
467e0479
VZ
1856 if ( srcLen == wxNO_LEN )
1857 srcLen = wxWcslen(src) + 1;
c91830cb 1858
467e0479 1859 if ( !dst )
c91830cb 1860 {
467e0479
VZ
1861 // optimization: return maximal space which could be needed for this
1862 // string instead of the exact amount which could be less if there are
1863 // any surrogates in the input
1864 //
1865 // we consider that surrogates are rare enough to make it worthwhile to
1866 // avoid running the loop below at the cost of slightly extra memory
1867 // consumption
1868 return srcLen*BYTES_PER_CHAR;
1869 }
c91830cb 1870
ef199164 1871 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1872 size_t outLen = 0;
1873 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1874 {
1875 const wxUint32 ch = wxDecodeSurrogate(&src);
1876 if ( !src )
1877 return wxCONV_FAILED;
c91830cb 1878
467e0479 1879 outLen += BYTES_PER_CHAR;
d32a507d 1880
467e0479
VZ
1881 if ( outLen > dstLen )
1882 return wxCONV_FAILED;
b5153fd8 1883
ef199164 1884 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1885 }
c91830cb 1886
467e0479 1887 return outLen;
c91830cb
VZ
1888}
1889
467e0479 1890#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1891
35d11700
VZ
1892// ----------------------------------------------------------------------------
1893// conversions without endianness change
1894// ----------------------------------------------------------------------------
1895
1896size_t
1897wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1898 const char *src, size_t srcLen) const
c91830cb 1899{
35d11700
VZ
1900 // use memcpy() as it should be much faster than hand-written loop
1901 srcLen = GetLength(src, srcLen);
1902 if ( srcLen == wxNO_LEN )
1903 return wxCONV_FAILED;
c91830cb 1904
35d11700
VZ
1905 const size_t inLen = srcLen/BYTES_PER_CHAR;
1906 if ( dst )
c91830cb 1907 {
35d11700
VZ
1908 if ( dstLen < inLen )
1909 return wxCONV_FAILED;
b5153fd8 1910
35d11700
VZ
1911 memcpy(dst, src, srcLen);
1912 }
c91830cb 1913
35d11700 1914 return inLen;
c91830cb
VZ
1915}
1916
35d11700
VZ
1917size_t
1918wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1919 const wchar_t *src, size_t srcLen) const
c91830cb 1920{
35d11700
VZ
1921 if ( srcLen == wxNO_LEN )
1922 srcLen = wxWcslen(src) + 1;
1923
1924 srcLen *= BYTES_PER_CHAR;
c91830cb 1925
35d11700 1926 if ( dst )
c91830cb 1927 {
35d11700
VZ
1928 if ( dstLen < srcLen )
1929 return wxCONV_FAILED;
c91830cb 1930
35d11700 1931 memcpy(dst, src, srcLen);
c91830cb
VZ
1932 }
1933
35d11700 1934 return srcLen;
c91830cb
VZ
1935}
1936
35d11700
VZ
1937// ----------------------------------------------------------------------------
1938// endian-reversing conversions
1939// ----------------------------------------------------------------------------
c91830cb 1940
35d11700
VZ
1941size_t
1942wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1943 const char *src, size_t srcLen) const
c91830cb 1944{
35d11700
VZ
1945 srcLen = GetLength(src, srcLen);
1946 if ( srcLen == wxNO_LEN )
1947 return wxCONV_FAILED;
1948
1949 srcLen /= BYTES_PER_CHAR;
c91830cb 1950
35d11700 1951 if ( dst )
c91830cb 1952 {
35d11700
VZ
1953 if ( dstLen < srcLen )
1954 return wxCONV_FAILED;
1955
ef199164
DS
1956 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1957 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1958 {
ef199164 1959 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1960 }
c91830cb 1961 }
b5153fd8 1962
35d11700 1963 return srcLen;
c91830cb
VZ
1964}
1965
35d11700
VZ
1966size_t
1967wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1968 const wchar_t *src, size_t srcLen) const
c91830cb 1969{
35d11700
VZ
1970 if ( srcLen == wxNO_LEN )
1971 srcLen = wxWcslen(src) + 1;
1972
1973 srcLen *= BYTES_PER_CHAR;
c91830cb 1974
35d11700 1975 if ( dst )
c91830cb 1976 {
35d11700
VZ
1977 if ( dstLen < srcLen )
1978 return wxCONV_FAILED;
1979
ef199164 1980 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1981 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1982 {
ef199164 1983 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1984 }
c91830cb 1985 }
b5153fd8 1986
35d11700 1987 return srcLen;
c91830cb
VZ
1988}
1989
467e0479 1990#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1991
1992
36acb880
VZ
1993// ============================================================================
1994// The classes doing conversion using the iconv_xxx() functions
1995// ============================================================================
3caec1bb 1996
b040e242 1997#ifdef HAVE_ICONV
3a0d76bc 1998
b1d547eb
VS
1999// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2000// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2001// (unless there's yet another bug in glibc) the only case when iconv()
2002// returns with (size_t)-1 (which means error) and says there are 0 bytes
2003// left in the input buffer -- when _real_ error occurs,
2004// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2005// iconv() failure.
3caec1bb
VS
2006// [This bug does not appear in glibc 2.2.]
2007#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2008#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2009 (errno != E2BIG || bufLeft != 0))
2010#else
2011#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2012#endif
2013
ab217dba 2014#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 2015
74a7eb0b
VZ
2016#define ICONV_T_INVALID ((iconv_t)-1)
2017
2018#if SIZEOF_WCHAR_T == 4
2019 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2020 #define WC_ENC wxFONTENCODING_UTF32
2021#elif SIZEOF_WCHAR_T == 2
2022 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2023 #define WC_ENC wxFONTENCODING_UTF16
2024#else // sizeof(wchar_t) != 2 nor 4
2025 // does this ever happen?
2026 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2027#endif
2028
36acb880 2029// ----------------------------------------------------------------------------
e95354ec 2030// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2031// ----------------------------------------------------------------------------
2032
e95354ec 2033class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2034{
2035public:
86501081 2036 wxMBConv_iconv(const char *name);
e95354ec 2037 virtual ~wxMBConv_iconv();
36acb880 2038
8f4b0f43
VZ
2039 // implement base class virtual methods
2040 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2041 const char *src, size_t srcLen = wxNO_LEN) const;
2042 virtual size_t FromWChar(char *dst, size_t dstLen,
2043 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2044 virtual size_t GetMBNulLen() const;
2045
ba98e032
VS
2046#if wxUSE_UNICODE_UTF8
2047 virtual bool IsUTF8() const;
2048#endif
2049
d36c9347
VZ
2050 virtual wxMBConv *Clone() const
2051 {
86501081 2052 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
2053 p->m_minMBCharWidth = m_minMBCharWidth;
2054 return p;
2055 }
2056
e95354ec 2057 bool IsOk() const
74a7eb0b 2058 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2059
2060protected:
ef199164
DS
2061 // the iconv handlers used to translate from multibyte
2062 // to wide char and in the other direction
36acb880
VZ
2063 iconv_t m2w,
2064 w2m;
ef199164 2065
b1d547eb
VS
2066#if wxUSE_THREADS
2067 // guards access to m2w and w2m objects
2068 wxMutex m_iconvMutex;
2069#endif
36acb880
VZ
2070
2071private:
e95354ec 2072 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2073 // available on this machine, it will remain NULL
74a7eb0b 2074 static wxString ms_wcCharsetName;
36acb880
VZ
2075
2076 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2077 // different endian-ness than the native one
405d8f46 2078 static bool ms_wcNeedsSwap;
eec47cc6 2079
d36c9347
VZ
2080
2081 // name of the encoding handled by this conversion
2082 wxString m_name;
2083
7ef3ab50 2084 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2085 // initially
2086 size_t m_minMBCharWidth;
36acb880
VZ
2087};
2088
8f115891 2089// make the constructor available for unit testing
86501081 2090WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2091{
2092 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2093 if ( !result->IsOk() )
2094 {
2095 delete result;
2096 return 0;
2097 }
ef199164 2098
8f115891
MW
2099 return result;
2100}
2101
422e411e 2102wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2103bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2104
86501081 2105wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 2106 : m_name(name)
36acb880 2107{
c1464d9d 2108 m_minMBCharWidth = 0;
eec47cc6 2109
36acb880 2110 // check for charset that represents wchar_t:
74a7eb0b 2111 if ( ms_wcCharsetName.empty() )
f1339c56 2112 {
c2b83fdd
VZ
2113 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2114
74a7eb0b
VZ
2115#if wxUSE_FONTMAP
2116 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2117#else // !wxUSE_FONTMAP
91cb7f52 2118 static const wxChar *names_static[] =
36acb880 2119 {
74a7eb0b
VZ
2120#if SIZEOF_WCHAR_T == 4
2121 _T("UCS-4"),
2122#elif SIZEOF_WCHAR_T = 2
2123 _T("UCS-2"),
2124#endif
2125 NULL
2126 };
91cb7f52 2127 const wxChar **names = names_static;
74a7eb0b 2128#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2129
d1f024a8 2130 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2131 {
17a1ebd1 2132 const wxString nameCS(*names);
74a7eb0b
VZ
2133
2134 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2135 wxString nameXE(nameCS);
ef199164
DS
2136
2137#ifdef WORDS_BIGENDIAN
74a7eb0b 2138 nameXE += _T("BE");
ef199164 2139#else // little endian
74a7eb0b 2140 nameXE += _T("LE");
ef199164 2141#endif
74a7eb0b 2142
c2b83fdd
VZ
2143 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2144 nameXE.c_str());
2145
86501081 2146 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2147 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2148 {
74a7eb0b 2149 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
2150 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2151 nameCS.c_str());
86501081 2152 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2153
74a7eb0b
VZ
2154 // and check for bytesex ourselves:
2155 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2156 {
74a7eb0b 2157 char buf[2], *bufPtr;
e8769ed1 2158 wchar_t wbuf[2];
74a7eb0b
VZ
2159 size_t insz, outsz;
2160 size_t res;
2161
2162 buf[0] = 'A';
2163 buf[1] = 0;
2164 wbuf[0] = 0;
2165 insz = 2;
2166 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2167 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2168 bufPtr = buf;
2169
ef199164
DS
2170 res = iconv(
2171 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2172 &wbufPtr, &outsz);
74a7eb0b
VZ
2173
2174 if (ICONV_FAILED(res, insz))
2175 {
2176 wxLogLastError(wxT("iconv"));
422e411e 2177 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2178 nameCS.c_str());
74a7eb0b
VZ
2179 }
2180 else // ok, can convert to this encoding, remember it
2181 {
17a1ebd1 2182 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2183 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2184 }
3a0d76bc
VS
2185 }
2186 }
74a7eb0b 2187 else // use charset not requiring byte swapping
36acb880 2188 {
74a7eb0b 2189 ms_wcCharsetName = nameXE;
36acb880 2190 }
3a0d76bc 2191 }
74a7eb0b 2192
0944fceb 2193 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2194 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2195 ms_wcCharsetName.empty() ? wxString("<none>")
2196 : ms_wcCharsetName,
74a7eb0b
VZ
2197 ms_wcNeedsSwap ? _T(" (needs swap)")
2198 : _T(""));
3a0d76bc 2199 }
36acb880 2200 else // we already have ms_wcCharsetName
3caec1bb 2201 {
86501081 2202 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2203 }
dccce9ea 2204
74a7eb0b 2205 if ( ms_wcCharsetName.empty() )
f1339c56 2206 {
74a7eb0b 2207 w2m = ICONV_T_INVALID;
36acb880 2208 }
405d8f46
VZ
2209 else
2210 {
86501081 2211 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2212 if ( w2m == ICONV_T_INVALID )
2213 {
2214 wxLogTrace(TRACE_STRCONV,
2215 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2216 ms_wcCharsetName.c_str(), name);
74a7eb0b 2217 }
405d8f46 2218 }
36acb880 2219}
3caec1bb 2220
e95354ec 2221wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2222{
74a7eb0b 2223 if ( m2w != ICONV_T_INVALID )
36acb880 2224 iconv_close(m2w);
74a7eb0b 2225 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2226 iconv_close(w2m);
2227}
3a0d76bc 2228
8f4b0f43
VZ
2229size_t
2230wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2231 const char *src, size_t srcLen) const
36acb880 2232{
8f4b0f43 2233 if ( srcLen == wxNO_LEN )
69373110 2234 {
8f4b0f43
VZ
2235 // find the string length: notice that must be done differently for
2236 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2237 // consecutive NULs
2238 const size_t nulLen = GetMBNulLen();
2239 switch ( nulLen )
2240 {
2241 default:
2242 return wxCONV_FAILED;
69373110 2243
8f4b0f43
VZ
2244 case 1:
2245 srcLen = strlen(src); // arguably more optimized than our version
2246 break;
69373110 2247
8f4b0f43
VZ
2248 case 2:
2249 case 4:
2250 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2251 // but they also have to start at character boundary and not
2252 // span two adjacent characters
2253 const char *p;
2254 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2255 ;
2256 srcLen = p - src;
2257 break;
2258 }
d50c0831
VZ
2259
2260 // when we're determining the length of the string ourselves we count
2261 // the terminating NUL(s) as part of it and always NUL-terminate the
2262 // output
2263 srcLen += nulLen;
69373110
VZ
2264 }
2265
8f4b0f43
VZ
2266 // we express length in the number of (wide) characters but iconv always
2267 // counts buffer sizes it in bytes
2268 dstLen *= SIZEOF_WCHAR_T;
2269
b1d547eb 2270#if wxUSE_THREADS
6a17b868
SN
2271 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2272 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2273 // wxConvLocal that are used all over wx code, so we have to make sure
2274 // the handle is used by at most one thread at the time. Otherwise
2275 // only a few wx classes would be safe to use from non-main threads
2276 // as MB<->WC conversion would fail "randomly".
2277 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2278#endif // wxUSE_THREADS
2279
36acb880 2280 size_t res, cres;
8f4b0f43 2281 const char *pszPtr = src;
36acb880 2282
8f4b0f43 2283 if ( dst )
36acb880 2284 {
8f4b0f43 2285 char* bufPtr = (char*)dst;
e8769ed1 2286
36acb880 2287 // have destination buffer, convert there
1752fda6 2288 size_t dstLenOrig = dstLen;
36acb880 2289 cres = iconv(m2w,
8f4b0f43
VZ
2290 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2291 &bufPtr, &dstLen);
1752fda6
VZ
2292
2293 // convert the number of bytes converted as returned by iconv to the
2294 // number of (wide) characters converted that we need
2295 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2296
36acb880 2297 if (ms_wcNeedsSwap)
3a0d76bc 2298 {
36acb880 2299 // convert to native endianness
17a1ebd1 2300 for ( unsigned i = 0; i < res; i++ )
467a2982 2301 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2302 }
36acb880 2303 }
8f4b0f43 2304 else // no destination buffer
36acb880 2305 {
8f4b0f43 2306 // convert using temp buffer to calculate the size of the buffer needed
36acb880
VZ
2307 wchar_t tbuf[8];
2308 res = 0;
ef199164
DS
2309
2310 do
2311 {
e8769ed1 2312 char* bufPtr = (char*)tbuf;
8f4b0f43 2313 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2314
2315 cres = iconv(m2w,
8f4b0f43
VZ
2316 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2317 &bufPtr, &dstLen );
36acb880 2318
8f4b0f43 2319 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2320 }
2321 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2322 }
dccce9ea 2323
8f4b0f43 2324 if (ICONV_FAILED(cres, srcLen))
f1339c56 2325 {
36acb880 2326 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2327 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2328 return wxCONV_FAILED;
36acb880
VZ
2329 }
2330
2331 return res;
2332}
2333
8f4b0f43
VZ
2334size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2335 const wchar_t *src, size_t srcLen) const
36acb880 2336{
b1d547eb
VS
2337#if wxUSE_THREADS
2338 // NB: explained in MB2WC
2339 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2340#endif
3698ae71 2341
8f4b0f43 2342 if ( srcLen == wxNO_LEN )
2588ee86 2343 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2344
2345 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2346 size_t outbuflen = dstLen;
36acb880 2347 size_t res, cres;
3a0d76bc 2348
36acb880 2349 wchar_t *tmpbuf = 0;
3caec1bb 2350
36acb880
VZ
2351 if (ms_wcNeedsSwap)
2352 {
2353 // need to copy to temp buffer to switch endianness
74a7eb0b 2354 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 2355 // could be in read-only memory, or be accessed in some other thread)
e8769ed1 2356 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
8f4b0f43
VZ
2357 for ( size_t i = 0; i < srcLen; i++ )
2358 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2359
8f4b0f43
VZ
2360 tmpbuf[srcLen] = L'\0';
2361 src = tmpbuf;
36acb880 2362 }
3a0d76bc 2363
8f4b0f43
VZ
2364 char* inbuf = (char*)src;
2365 if ( dst )
36acb880
VZ
2366 {
2367 // have destination buffer, convert there
8f4b0f43 2368 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2369
8f4b0f43 2370 res = dstLen - outbuflen;
36acb880 2371 }
8f4b0f43 2372 else // no destination buffer
36acb880 2373 {
8f4b0f43 2374 // convert using temp buffer to calculate the size of the buffer needed
36acb880
VZ
2375 char tbuf[16];
2376 res = 0;
ef199164
DS
2377 do
2378 {
8f4b0f43 2379 dst = tbuf;
e8769ed1 2380 outbuflen = 16;
36acb880 2381
8f4b0f43 2382 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2383
e8769ed1 2384 res += 16 - outbuflen;
ef199164
DS
2385 }
2386 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2387 }
dccce9ea 2388
36acb880
VZ
2389 if (ms_wcNeedsSwap)
2390 {
2391 free(tmpbuf);
2392 }
dccce9ea 2393
e8769ed1 2394 if (ICONV_FAILED(cres, inbuflen))
36acb880 2395 {
ce6f8d6f 2396 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2397 return wxCONV_FAILED;
36acb880
VZ
2398 }
2399
2400 return res;
2401}
2402
7ef3ab50 2403size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2404{
c1464d9d 2405 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2406 {
2407 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2408
2409#if wxUSE_THREADS
2410 // NB: explained in MB2WC
2411 wxMutexLocker lock(self->m_iconvMutex);
2412#endif
2413
999020e1 2414 const wchar_t *wnul = L"";
c1464d9d 2415 char buf[8]; // should be enough for NUL in any encoding
356410fc 2416 size_t inLen = sizeof(wchar_t),
c1464d9d 2417 outLen = WXSIZEOF(buf);
ef199164
DS
2418 char *inBuff = (char *)wnul;
2419 char *outBuff = buf;
2420 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2421 {
c1464d9d 2422 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2423 }
2424 else // ok
2425 {
ef199164 2426 self->m_minMBCharWidth = outBuff - buf;
356410fc 2427 }
eec47cc6
VZ
2428 }
2429
c1464d9d 2430 return m_minMBCharWidth;
eec47cc6
VZ
2431}
2432
ba98e032
VS
2433#if wxUSE_UNICODE_UTF8
2434bool wxMBConv_iconv::IsUTF8() const
2435{
86501081
VS
2436 return wxStricmp(m_name, "UTF-8") == 0 ||
2437 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2438}
2439#endif
2440
b040e242 2441#endif // HAVE_ICONV
36acb880 2442
e95354ec 2443
36acb880
VZ
2444// ============================================================================
2445// Win32 conversion classes
2446// ============================================================================
1cd52418 2447
e95354ec 2448#ifdef wxHAVE_WIN32_MB2WC
373658eb 2449
8b04d4c4 2450// from utils.cpp
d775fa82 2451#if wxUSE_FONTMAP
86501081 2452extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2453extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2454#endif
373658eb 2455
e95354ec 2456class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2457{
2458public:
bde4baac
VZ
2459 wxMBConv_win32()
2460 {
2461 m_CodePage = CP_ACP;
c1464d9d 2462 m_minMBCharWidth = 0;
bde4baac
VZ
2463 }
2464
d36c9347 2465 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2466 : wxMBConv()
d36c9347
VZ
2467 {
2468 m_CodePage = conv.m_CodePage;
2469 m_minMBCharWidth = conv.m_minMBCharWidth;
2470 }
2471
7608a683 2472#if wxUSE_FONTMAP
86501081 2473 wxMBConv_win32(const char* name)
bde4baac
VZ
2474 {
2475 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2476 m_minMBCharWidth = 0;
bde4baac 2477 }
dccce9ea 2478
e95354ec 2479 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2480 {
2481 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2482 m_minMBCharWidth = 0;
bde4baac 2483 }
eec47cc6 2484#endif // wxUSE_FONTMAP
8b04d4c4 2485
d36c9347 2486 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2487 {
02272c9c
VZ
2488 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2489 // the behaviour is not compatible with the Unix version (using iconv)
2490 // and break the library itself, e.g. wxTextInputStream::NextChar()
2491 // wouldn't work if reading an incomplete MB char didn't result in an
2492 // error
667e5b3e 2493 //
89028980 2494 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2495 // Win XP or newer and it is not supported for UTF-[78] so we always
2496 // use our own conversions in this case. See
89028980
VS
2497 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2498 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2499 if ( m_CodePage == CP_UTF8 )
89028980 2500 {
5487ff0f 2501 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2502 }
830f8f11
VZ
2503
2504 if ( m_CodePage == CP_UTF7 )
2505 {
5487ff0f 2506 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2507 }
2508
2509 int flags = 0;
2510 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2511 IsAtLeastWin2kSP4() )
89028980 2512 {
830f8f11 2513 flags = MB_ERR_INVALID_CHARS;
89028980 2514 }
667e5b3e 2515
2b5f62a0
VZ
2516 const size_t len = ::MultiByteToWideChar
2517 (
2518 m_CodePage, // code page
667e5b3e 2519 flags, // flags: fall on error
2b5f62a0
VZ
2520 psz, // input string
2521 -1, // its length (NUL-terminated)
b4da152e 2522 buf, // output string
2b5f62a0
VZ
2523 buf ? n : 0 // size of output buffer
2524 );
89028980
VS
2525 if ( !len )
2526 {
2527 // function totally failed
467e0479 2528 return wxCONV_FAILED;
89028980
VS
2529 }
2530
2531 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2532 // check if we succeeded, by doing a double trip:
2533 if ( !flags && buf )
2534 {
53c174fc
VZ
2535 const size_t mbLen = strlen(psz);
2536 wxCharBuffer mbBuf(mbLen);
89028980
VS
2537 if ( ::WideCharToMultiByte
2538 (
2539 m_CodePage,
2540 0,
2541 buf,
2542 -1,
2543 mbBuf.data(),
53c174fc 2544 mbLen + 1, // size in bytes, not length
89028980
VS
2545 NULL,
2546 NULL
2547 ) == 0 ||
2548 strcmp(mbBuf, psz) != 0 )
2549 {
2550 // we didn't obtain the same thing we started from, hence
2551 // the conversion was lossy and we consider that it failed
467e0479 2552 return wxCONV_FAILED;
89028980
VS
2553 }
2554 }
2b5f62a0 2555
03a991bc
VZ
2556 // note that it returns count of written chars for buf != NULL and size
2557 // of the needed buffer for buf == NULL so in either case the length of
2558 // the string (which never includes the terminating NUL) is one less
89028980 2559 return len - 1;
f1339c56 2560 }
dccce9ea 2561
d36c9347 2562 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2563 {
13dd924a
VZ
2564 /*
2565 we have a problem here: by default, WideCharToMultiByte() may
2566 replace characters unrepresentable in the target code page with bad
2567 quality approximations such as turning "1/2" symbol (U+00BD) into
2568 "1" for the code pages which don't have it and we, obviously, want
2569 to avoid this at any price
d775fa82 2570
13dd924a
VZ
2571 the trouble is that this function does it _silently_, i.e. it won't
2572 even tell us whether it did or not... Win98/2000 and higher provide
2573 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2574 we have to resort to a round trip, i.e. check that converting back
2575 results in the same string -- this is, of course, expensive but
2576 otherwise we simply can't be sure to not garble the data.
2577 */
2578
2579 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2580 // it doesn't work with CJK encodings (which we test for rather roughly
2581 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2582 // supporting it
907173e5
WS
2583 BOOL usedDef wxDUMMY_INITIALIZE(false);
2584 BOOL *pUsedDef;
13dd924a
VZ
2585 int flags;
2586 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2587 {
2588 // it's our lucky day
2589 flags = WC_NO_BEST_FIT_CHARS;
2590 pUsedDef = &usedDef;
2591 }
2592 else // old system or unsupported encoding
2593 {
2594 flags = 0;
2595 pUsedDef = NULL;
2596 }
2597
2b5f62a0
VZ
2598 const size_t len = ::WideCharToMultiByte
2599 (
2600 m_CodePage, // code page
13dd924a
VZ
2601 flags, // either none or no best fit
2602 pwz, // input string
2b5f62a0
VZ
2603 -1, // it is (wide) NUL-terminated
2604 buf, // output buffer
2605 buf ? n : 0, // and its size
2606 NULL, // default "replacement" char
13dd924a 2607 pUsedDef // [out] was it used?
2b5f62a0
VZ
2608 );
2609
13dd924a
VZ
2610 if ( !len )
2611 {
2612 // function totally failed
467e0479 2613 return wxCONV_FAILED;
13dd924a
VZ
2614 }
2615
765bdb4a
VZ
2616 // we did something, check if we really succeeded
2617 if ( flags )
13dd924a 2618 {
765bdb4a
VZ
2619 // check if the conversion failed, i.e. if any replacements
2620 // were done
2621 if ( usedDef )
2622 return wxCONV_FAILED;
2623 }
2624 else // we must resort to double tripping...
2625 {
2626 // first we need to ensure that we really have the MB data: this is
2627 // not the case if we're called with NULL buffer, in which case we
2628 // need to do the conversion yet again
2629 wxCharBuffer bufDef;
2630 if ( !buf )
13dd924a 2631 {
765bdb4a
VZ
2632 bufDef = wxCharBuffer(len);
2633 buf = bufDef.data();
2634 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2635 buf, len, NULL, NULL) )
467e0479 2636 return wxCONV_FAILED;
13dd924a 2637 }
765bdb4a 2638
564da6ff
VZ
2639 if ( !n )
2640 n = wcslen(pwz);
765bdb4a 2641 wxWCharBuffer wcBuf(n);
564da6ff 2642 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2643 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2644 {
765bdb4a
VZ
2645 // we didn't obtain the same thing we started from, hence
2646 // the conversion was lossy and we consider that it failed
2647 return wxCONV_FAILED;
13dd924a
VZ
2648 }
2649 }
2650
03a991bc 2651 // see the comment above for the reason of "len - 1"
13dd924a 2652 return len - 1;
f1339c56 2653 }
dccce9ea 2654
7ef3ab50
VZ
2655 virtual size_t GetMBNulLen() const
2656 {
2657 if ( m_minMBCharWidth == 0 )
2658 {
2659 int len = ::WideCharToMultiByte
2660 (
2661 m_CodePage, // code page
2662 0, // no flags
2663 L"", // input string
2664 1, // translate just the NUL
2665 NULL, // output buffer
2666 0, // and its size
2667 NULL, // no replacement char
2668 NULL // [out] don't care if it was used
2669 );
2670
2671 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2672 switch ( len )
2673 {
2674 default:
2675 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2676 self->m_minMBCharWidth = (size_t)-1;
2677 break;
7ef3ab50
VZ
2678
2679 case 0:
2680 self->m_minMBCharWidth = (size_t)-1;
2681 break;
2682
2683 case 1:
2684 case 2:
2685 case 4:
2686 self->m_minMBCharWidth = len;
2687 break;
2688 }
2689 }
2690
2691 return m_minMBCharWidth;
2692 }
2693
d36c9347
VZ
2694 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2695
13dd924a
VZ
2696 bool IsOk() const { return m_CodePage != -1; }
2697
2698private:
2699 static bool CanUseNoBestFit()
2700 {
2701 static int s_isWin98Or2k = -1;
2702
2703 if ( s_isWin98Or2k == -1 )
2704 {
2705 int verMaj, verMin;
2706 switch ( wxGetOsVersion(&verMaj, &verMin) )
2707 {
406d283a 2708 case wxOS_WINDOWS_9X:
13dd924a
VZ
2709 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2710 break;
2711
406d283a 2712 case wxOS_WINDOWS_NT:
13dd924a
VZ
2713 s_isWin98Or2k = verMaj >= 5;
2714 break;
2715
2716 default:
ef199164 2717 // unknown: be conservative by default
13dd924a 2718 s_isWin98Or2k = 0;
ef199164 2719 break;
13dd924a
VZ
2720 }
2721
2722 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2723 }
2724
2725 return s_isWin98Or2k == 1;
2726 }
f1339c56 2727
89028980
VS
2728 static bool IsAtLeastWin2kSP4()
2729 {
8942f83a
WS
2730#ifdef __WXWINCE__
2731 return false;
2732#else
89028980
VS
2733 static int s_isAtLeastWin2kSP4 = -1;
2734
2735 if ( s_isAtLeastWin2kSP4 == -1 )
2736 {
2737 OSVERSIONINFOEX ver;
2738
2739 memset(&ver, 0, sizeof(ver));
2740 ver.dwOSVersionInfoSize = sizeof(ver);
2741 GetVersionEx((OSVERSIONINFO*)&ver);
2742
2743 s_isAtLeastWin2kSP4 =
2744 ((ver.dwMajorVersion > 5) || // Vista+
2745 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2746 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2747 ver.wServicePackMajor >= 4)) // 2000 SP4+
2748 ? 1 : 0;
2749 }
2750
2751 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2752#endif
89028980
VS
2753 }
2754
eec47cc6 2755
c1464d9d 2756 // the code page we're working with
b1d66b54 2757 long m_CodePage;
c1464d9d 2758
7ef3ab50 2759 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2760 // "unknown"
2761 size_t m_minMBCharWidth;
1cd52418 2762};
e95354ec
VZ
2763
2764#endif // wxHAVE_WIN32_MB2WC
2765
f7e98dee 2766
36acb880
VZ
2767// ============================================================================
2768// wxEncodingConverter based conversion classes
2769// ============================================================================
2770
1e6feb95 2771#if wxUSE_FONTMAP
1cd52418 2772
e95354ec 2773class wxMBConv_wxwin : public wxMBConv
1cd52418 2774{
8b04d4c4
VZ
2775private:
2776 void Init()
2777 {
6ac84a78
DE
2778 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2779 // The wxMBConv_cf class does a better job.
2780 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2781 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2782 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2783 }
2784
6001e347 2785public:
f1339c56
RR
2786 // temporarily just use wxEncodingConverter stuff,
2787 // so that it works while a better implementation is built
86501081 2788 wxMBConv_wxwin(const char* name)
f1339c56
RR
2789 {
2790 if (name)
267e11c5 2791 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2792 else
2793 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2794
8b04d4c4
VZ
2795 Init();
2796 }
2797
e95354ec 2798 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2799 {
2800 m_enc = enc;
2801
2802 Init();
f1339c56 2803 }
dccce9ea 2804
bde4baac 2805 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2806 {
2807 size_t inbuf = strlen(psz);
dccce9ea 2808 if (buf)
c643a977 2809 {
ef199164 2810 if (!m2w.Convert(psz, buf))
467e0479 2811 return wxCONV_FAILED;
c643a977 2812 }
f1339c56
RR
2813 return inbuf;
2814 }
dccce9ea 2815
bde4baac 2816 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2817 {
f8d791e0 2818 const size_t inbuf = wxWcslen(psz);
f1339c56 2819 if (buf)
c643a977 2820 {
ef199164 2821 if (!w2m.Convert(psz, buf))
467e0479 2822 return wxCONV_FAILED;
c643a977 2823 }
dccce9ea 2824
f1339c56
RR
2825 return inbuf;
2826 }
dccce9ea 2827
7ef3ab50 2828 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2829 {
2830 switch ( m_enc )
2831 {
2832 case wxFONTENCODING_UTF16BE:
2833 case wxFONTENCODING_UTF16LE:
c1464d9d 2834 return 2;
eec47cc6
VZ
2835
2836 case wxFONTENCODING_UTF32BE:
2837 case wxFONTENCODING_UTF32LE:
c1464d9d 2838 return 4;
eec47cc6
VZ
2839
2840 default:
c1464d9d 2841 return 1;
eec47cc6
VZ
2842 }
2843 }
2844
d36c9347
VZ
2845 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2846
7ef3ab50
VZ
2847 bool IsOk() const { return m_ok; }
2848
2849public:
2850 wxFontEncoding m_enc;
2851 wxEncodingConverter m2w, w2m;
2852
2853private:
cafbf6fb
VZ
2854 // were we initialized successfully?
2855 bool m_ok;
fc7a2a60 2856
e95354ec 2857 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2858};
6001e347 2859
8f115891 2860// make the constructors available for unit testing
86501081 2861WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2862{
2863 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2864 if ( !result->IsOk() )
2865 {
2866 delete result;
2867 return 0;
2868 }
ef199164 2869
8f115891
MW
2870 return result;
2871}
2872
1e6feb95
VZ
2873#endif // wxUSE_FONTMAP
2874
36acb880
VZ
2875// ============================================================================
2876// wxCSConv implementation
2877// ============================================================================
2878
8b04d4c4 2879void wxCSConv::Init()
6001e347 2880{
e95354ec
VZ
2881 m_name = NULL;
2882 m_convReal = NULL;
2883 m_deferred = true;
2884}
2885
86501081 2886wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2887{
2888 Init();
82713003 2889
86501081 2890 if ( !charset.empty() )
e95354ec 2891 {
86501081 2892 SetName(charset.ToAscii());
e95354ec 2893 }
bda3d86a 2894
e4277538
VZ
2895#if wxUSE_FONTMAP
2896 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
e3276230
VZ
2897 if ( m_encoding == wxFONTENCODING_MAX )
2898 {
2899 // set to unknown/invalid value
2900 m_encoding = wxFONTENCODING_SYSTEM;
2901 }
2902 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2903 {
2904 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2905 m_encoding = wxFONTENCODING_ISO8859_1;
2906 }
e4277538 2907#else
bda3d86a 2908 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2909#endif
6001e347
RR
2910}
2911
8b04d4c4
VZ
2912wxCSConv::wxCSConv(wxFontEncoding encoding)
2913{
bda3d86a 2914 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2915 {
2916 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2917
2918 encoding = wxFONTENCODING_SYSTEM;
2919 }
2920
8b04d4c4
VZ
2921 Init();
2922
bda3d86a 2923 m_encoding = encoding;
8b04d4c4
VZ
2924}
2925
6001e347
RR
2926wxCSConv::~wxCSConv()
2927{
65e50848
JS
2928 Clear();
2929}
2930
54380f29 2931wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2932 : wxMBConv()
54380f29 2933{
8b04d4c4
VZ
2934 Init();
2935
54380f29 2936 SetName(conv.m_name);
8b04d4c4 2937 m_encoding = conv.m_encoding;
54380f29
GD
2938}
2939
2940wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2941{
2942 Clear();
8b04d4c4 2943
54380f29 2944 SetName(conv.m_name);
8b04d4c4
VZ
2945 m_encoding = conv.m_encoding;
2946
54380f29
GD
2947 return *this;
2948}
2949
65e50848
JS
2950void wxCSConv::Clear()
2951{
8b04d4c4 2952 free(m_name);
e95354ec 2953 delete m_convReal;
8b04d4c4 2954
65e50848 2955 m_name = NULL;
e95354ec 2956 m_convReal = NULL;
6001e347
RR
2957}
2958
86501081 2959void wxCSConv::SetName(const char *charset)
6001e347 2960{
f1339c56
RR
2961 if (charset)
2962 {
d6f2a891 2963 m_name = wxStrdup(charset);
e95354ec 2964 m_deferred = true;
f1339c56 2965 }
6001e347
RR
2966}
2967
8b3eb85d 2968#if wxUSE_FONTMAP
8b3eb85d
VZ
2969
2970WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2971 wxEncodingNameCache );
8b3eb85d
VZ
2972
2973static wxEncodingNameCache gs_nameCache;
2974#endif
2975
e95354ec
VZ
2976wxMBConv *wxCSConv::DoCreate() const
2977{
ce6f8d6f
VZ
2978#if wxUSE_FONTMAP
2979 wxLogTrace(TRACE_STRCONV,
2980 wxT("creating conversion for %s"),
2981 (m_name ? m_name
86501081 2982 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2983#endif // wxUSE_FONTMAP
2984
c547282d
VZ
2985 // check for the special case of ASCII or ISO8859-1 charset: as we have
2986 // special knowledge of it anyhow, we don't need to create a special
2987 // conversion object
e4277538
VZ
2988 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2989 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2990 {
e95354ec
VZ
2991 // don't convert at all
2992 return NULL;
2993 }
dccce9ea 2994
e95354ec
VZ
2995 // we trust OS to do conversion better than we can so try external
2996 // conversion methods first
2997 //
2998 // the full order is:
2999 // 1. OS conversion (iconv() under Unix or Win32 API)
3000 // 2. hard coded conversions for UTF
3001 // 3. wxEncodingConverter as fall back
3002
3003 // step (1)
3004#ifdef HAVE_ICONV
c547282d 3005#if !wxUSE_FONTMAP
e95354ec 3006 if ( m_name )
c547282d 3007#endif // !wxUSE_FONTMAP
e95354ec 3008 {
3ef10cfc 3009#if wxUSE_FONTMAP
8b3eb85d 3010 wxFontEncoding encoding(m_encoding);
3ef10cfc 3011#endif
8b3eb85d 3012
86501081 3013 if ( m_name )
8b3eb85d 3014 {
86501081 3015 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3016 if ( conv->IsOk() )
3017 return conv;
3018
3019 delete conv;
c547282d
VZ
3020
3021#if wxUSE_FONTMAP
8b3eb85d 3022 encoding =
86501081 3023 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3024#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3025 }
3026#if wxUSE_FONTMAP
3027 {
3028 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3029 if ( it != gs_nameCache.end() )
3030 {
3031 if ( it->second.empty() )
3032 return NULL;
c547282d 3033
86501081 3034 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3035 if ( conv->IsOk() )
3036 return conv;
e95354ec 3037
8b3eb85d
VZ
3038 delete conv;
3039 }
3040
3041 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3042 // CS : in case this does not return valid names (eg for MacRoman)
3043 // encoding got a 'failure' entry in the cache all the same,
3044 // although it just has to be created using a different method, so
3045 // only store failed iconv creation attempts (or perhaps we
3046 // shoulnd't do this at all ?)
3c67ec06 3047 if ( names[0] != NULL )
8b3eb85d 3048 {
3c67ec06 3049 for ( ; *names; ++names )
8b3eb85d 3050 {
86501081
VS
3051 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3052 // will need changes that will obsolete this
3053 wxString name(*names);
3054 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3055 if ( conv->IsOk() )
3056 {
3057 gs_nameCache[encoding] = *names;
3058 return conv;
3059 }
3060
3061 delete conv;
8b3eb85d
VZ
3062 }
3063
3c67ec06 3064 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 3065 }
8b3eb85d
VZ
3066 }
3067#endif // wxUSE_FONTMAP
e95354ec
VZ
3068 }
3069#endif // HAVE_ICONV
3070
3071#ifdef wxHAVE_WIN32_MB2WC
3072 {
7608a683 3073#if wxUSE_FONTMAP
e95354ec
VZ
3074 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3075 : new wxMBConv_win32(m_encoding);
3076 if ( conv->IsOk() )
3077 return conv;
3078
3079 delete conv;
7608a683
WS
3080#else
3081 return NULL;
3082#endif
e95354ec
VZ
3083 }
3084#endif // wxHAVE_WIN32_MB2WC
ef199164 3085
5c4ed98d 3086#ifdef __DARWIN__
f7e98dee 3087 {
6ff49cbc
DE
3088 // leave UTF16 and UTF32 to the built-ins of wx
3089 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3090 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3091 {
a6900d10 3092#if wxUSE_FONTMAP
5c4ed98d
DE
3093 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3094 : new wxMBConv_cf(m_encoding);
a6900d10 3095#else
5c4ed98d 3096 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3097#endif
ef199164 3098
f7e98dee 3099 if ( conv->IsOk() )
d775fa82
WS
3100 return conv;
3101
3102 delete conv;
3103 }
335d31e0 3104 }
5c4ed98d
DE
3105#endif // __DARWIN__
3106
e95354ec
VZ
3107 // step (2)
3108 wxFontEncoding enc = m_encoding;
3109#if wxUSE_FONTMAP
c547282d
VZ
3110 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3111 {
3112 // use "false" to suppress interactive dialogs -- we can be called from
3113 // anywhere and popping up a dialog from here is the last thing we want to
3114 // do
267e11c5 3115 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3116 }
e95354ec
VZ
3117#endif // wxUSE_FONTMAP
3118
3119 switch ( enc )
3120 {
3121 case wxFONTENCODING_UTF7:
3122 return new wxMBConvUTF7;
3123
3124 case wxFONTENCODING_UTF8:
3125 return new wxMBConvUTF8;
3126
e95354ec
VZ
3127 case wxFONTENCODING_UTF16BE:
3128 return new wxMBConvUTF16BE;
3129
3130 case wxFONTENCODING_UTF16LE:
3131 return new wxMBConvUTF16LE;
3132
e95354ec
VZ
3133 case wxFONTENCODING_UTF32BE:
3134 return new wxMBConvUTF32BE;
3135
3136 case wxFONTENCODING_UTF32LE:
3137 return new wxMBConvUTF32LE;
3138
3139 default:
3140 // nothing to do but put here to suppress gcc warnings
ef199164 3141 break;
e95354ec
VZ
3142 }
3143
3144 // step (3)
3145#if wxUSE_FONTMAP
3146 {
3147 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3148 : new wxMBConv_wxwin(m_encoding);
3149 if ( conv->IsOk() )
3150 return conv;
3151
3152 delete conv;
3153 }
3154#endif // wxUSE_FONTMAP
3155
a58d4f4d
VS
3156 // NB: This is a hack to prevent deadlock. What could otherwise happen
3157 // in Unicode build: wxConvLocal creation ends up being here
3158 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
3159 // attach a timestamp, for which it will need wxConvLocal (to convert
3160 // time to char* and then wchar_t*), but that fails, tries to log the
3161 // error, but wxLog has an (already locked) critical section that
3162 // guards the static buffer.
a58d4f4d
VS
3163 static bool alreadyLoggingError = false;
3164 if (!alreadyLoggingError)
3165 {
3166 alreadyLoggingError = true;
3167 wxLogError(_("Cannot convert from the charset '%s'!"),
3168 m_name ? m_name
e95354ec
VZ
3169 :
3170#if wxUSE_FONTMAP
86501081 3171 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 3172#else // !wxUSE_FONTMAP
86501081 3173 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
3174#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3175 );
ef199164 3176
a58d4f4d
VS
3177 alreadyLoggingError = false;
3178 }
e95354ec
VZ
3179
3180 return NULL;
3181}
3182
3183void wxCSConv::CreateConvIfNeeded() const
3184{
3185 if ( m_deferred )
3186 {
3187 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 3188
bda3d86a
VZ
3189 // if we don't have neither the name nor the encoding, use the default
3190 // encoding for this system
3191 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3192 {
4c75209f 3193#if wxUSE_INTL
02c7347b 3194 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3195#else
3196 // fallback to some reasonable default:
3197 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3198#endif // wxUSE_INTL
4c75209f 3199 }
bda3d86a 3200
e95354ec
VZ
3201 self->m_convReal = DoCreate();
3202 self->m_deferred = false;
6001e347 3203 }
6001e347
RR
3204}
3205
0f0298b1
VZ
3206bool wxCSConv::IsOk() const
3207{
3208 CreateConvIfNeeded();
3209
3210 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3211 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3212 return true; // always ok as we do it ourselves
3213
3214 // m_convReal->IsOk() is called at its own creation, so we know it must
3215 // be ok if m_convReal is non-NULL
3216 return m_convReal != NULL;
3217}
3218
1c714a5d
VZ
3219size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3220 const char *src, size_t srcLen) const
3221{
3222 CreateConvIfNeeded();
3223
2c74c558
VS
3224 if (m_convReal)
3225 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3226
3227 // latin-1 (direct)
05392dc8
VZ
3228 if ( srcLen == wxNO_LEN )
3229 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3230
05392dc8
VZ
3231 if ( dst )
3232 {
3233 if ( dstLen < srcLen )
3234 return wxCONV_FAILED;
1c714a5d 3235
05392dc8
VZ
3236 for ( size_t n = 0; n < srcLen; n++ )
3237 dst[n] = (unsigned char)(src[n]);
3238 }
2c74c558 3239
05392dc8 3240 return srcLen;
1c714a5d
VZ
3241}
3242
05392dc8
VZ
3243size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3244 const wchar_t *src, size_t srcLen) const
6001e347 3245{
e95354ec 3246 CreateConvIfNeeded();
dccce9ea 3247
e95354ec 3248 if (m_convReal)
05392dc8 3249 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3250
3251 // latin-1 (direct)
05392dc8
VZ
3252 if ( srcLen == wxNO_LEN )
3253 srcLen = wxWcslen(src) + 1;
dccce9ea 3254
05392dc8 3255 if ( dst )
f1339c56 3256 {
05392dc8
VZ
3257 if ( dstLen < srcLen )
3258 return wxCONV_FAILED;
1cd52418 3259
05392dc8 3260 for ( size_t n = 0; n < srcLen; n++ )
24642831 3261 {
05392dc8 3262 if ( src[n] > 0xFF )
467e0479 3263 return wxCONV_FAILED;
ef199164 3264
05392dc8 3265 dst[n] = (char)src[n];
24642831 3266 }
05392dc8 3267
24642831 3268 }
05392dc8 3269 else // still need to check the input validity
24642831 3270 {
05392dc8 3271 for ( size_t n = 0; n < srcLen; n++ )
24642831 3272 {
05392dc8 3273 if ( src[n] > 0xFF )
467e0479 3274 return wxCONV_FAILED;
24642831 3275 }
f1339c56 3276 }
dccce9ea 3277
05392dc8 3278 return srcLen;
6001e347
RR
3279}
3280
7ef3ab50 3281size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3282{
3283 CreateConvIfNeeded();
3284
3285 if ( m_convReal )
3286 {
7ef3ab50 3287 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3288 }
3289
ba98e032 3290 // otherwise, we are ISO-8859-1
c1464d9d 3291 return 1;
eec47cc6
VZ
3292}
3293
ba98e032
VS
3294#if wxUSE_UNICODE_UTF8
3295bool wxCSConv::IsUTF8() const
3296{
3297 CreateConvIfNeeded();
3298
3299 if ( m_convReal )
3300 {
3301 return m_convReal->IsUTF8();
3302 }
3303
3304 // otherwise, we are ISO-8859-1
3305 return false;
3306}
3307#endif
3308
69c928ef
VZ
3309
3310#if wxUSE_UNICODE
3311
3312wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3313{
3314 if ( !s )
3315 return wxWCharBuffer();
3316
3317 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3318 if ( !wbuf )
5487ff0f 3319 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3320 if ( !wbuf )
3321 wbuf = wxConvISO8859_1.cMB2WX(s);
3322
3323 return wbuf;
3324}
3325
3326wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3327{
3328 if ( !ws )
3329 return wxCharBuffer();
3330
3331 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3332 if ( !buf )
3333 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3334
3335 return buf;
3336}
3337
3338#endif // wxUSE_UNICODE
f5a1953b 3339
1e50d914
VS
3340// ----------------------------------------------------------------------------
3341// globals
3342// ----------------------------------------------------------------------------
3343
3344// NB: The reason why we create converter objects in this convoluted way,
3345// using a factory function instead of global variable, is that they
3346// may be used at static initialization time (some of them are used by
3347// wxString ctors and there may be a global wxString object). In other
3348// words, possibly _before_ the converter global object would be
3349// initialized.
3350
3351#undef wxConvLibc
3352#undef wxConvUTF8
3353#undef wxConvUTF7
3354#undef wxConvLocal
3355#undef wxConvISO8859_1
3356
// Define everything needed for one global converter called "name":
//  - the exported pointer name##Ptr, initially NULL;
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type implClass constructed with
//    ctorArgs;
//  - a file-static pointer initialized by calling the accessor.
//
// convClass is the type used in the public declarations and implClass is the
// type of the object actually created.
#define WX_DEFINE_GLOBAL_CONV2(convClass, implClass, name, ctorArgs)        \
    WXDLLIMPEXP_DATA_BASE(convClass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE convClass* wxGet_##name##Ptr()                         \
    {                                                                       \
        static implClass name##Obj ctorArgs;                                \
        return &name##Obj;                                                  \
    }                                                                       \
    /* this ensures that all global converter objects are created */        \
    /* by the time static initialization is done, i.e. before any */        \
    /* thread is launched: */                                               \
    static convClass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() when the public and the implementation
// classes are the same.
#define WX_DEFINE_GLOBAL_CONV(convClass, name, ctorArgs)                    \
    WX_DEFINE_GLOBAL_CONV2(convClass, convClass, name, ctorArgs)
3371
// Instantiate the global converter objects. wxConvLibc is declared as a
// wxMBConv but is implemented by wxMBConv_win32 when __WINDOWS__ is defined
// and by wxMBConvLibc otherwise.
3372#ifdef __WINDOWS__
3373 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3374#else
3375 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3376#endif
3377
e1079eda
VZ
3378// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3379// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3380// provokes an error message about "not enough macro parameters"; and we
3381// can't use "()" here as the name##Obj declaration would be parsed as a
3382// function declaration then, so use a semicolon and live with an extra
3383// empty statement (and hope that no compiler warns about this)
3384WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3385WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3386
// wxConvLocal is a wxCSConv for wxFONTENCODING_SYSTEM and wxConvISO8859_1 a
// wxCSConv for wxFONTENCODING_ISO8859_1
3387WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3388WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3389
// wxConvCurrent initially points to the libc converter and wxConvUI to the
// locale one
3390WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3391WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3392
6ac84a78
DE
3393#ifdef __DARWIN__
3394// The xnu kernel always communicates file paths in decomposed UTF-8.
3395// WARNING: Are we sure that CFString's conversion will cause decomposition?
3396static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3397#endif
6ac84a78 3398
// wxConvFileName points to the wxMBConv_cf UTF-8 object above under
// __DARWIN__ and to the libc converter on the other platforms
1e50d914 3399WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3400#ifdef __DARWIN__
1e50d914 3401 &wxConvMacUTF8DObj;
6ac84a78 3402#else // !__DARWIN__
1e50d914 3403 wxGet_wxConvLibcPtr();
6ac84a78 3404#endif // __DARWIN__/!__DARWIN__
1e50d914 3405
bde4baac
VZ
3406#else // !wxUSE_WCHAR_T
3407
1e50d914 3408// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3409// stand-ins in absence of wchar_t: in this branch the global converters are
// declared as plain wxMBConv objects
3410WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3411 wxConvISO8859_1,
3412 wxConvLocal,
3413 wxConvUTF8;
3414
3415#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T