]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Fix wxHtmlHelpData::SetTempDir() to behave correctly without trailing slash.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
e95354ec
VZ
8// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9// (c) 2000-2003 Vadim Zeitlin
15f2ee32 10// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 11// Licence: wxWindows licence
6001e347
RR
12/////////////////////////////////////////////////////////////////////////////
13
6001e347
RR
14// For compilers that support precompilation, includes "wx.h".
15#include "wx/wxprec.h"
16
480f42ec
VS
17#ifdef __BORLANDC__
18 #pragma hdrstop
19#endif //__BORLANDC__
20
373658eb
VZ
21#ifndef WX_PRECOMP
22 #include "wx/intl.h"
23 #include "wx/log.h"
de6185e2 24 #include "wx/utils.h"
df69528b 25 #include "wx/hashmap.h"
ef199164 26#endif
373658eb 27
bde4baac
VZ
28#include "wx/strconv.h"
29
1c193821 30#ifndef __WXWINCE__
1cd52418 31#include <errno.h>
1c193821
JS
32#endif
33
6001e347
RR
34#include <ctype.h>
35#include <string.h>
36#include <stdlib.h>
37
e95354ec 38#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
39 #include "wx/msw/private.h"
40 #include "wx/msw/missing.h"
e95354ec 41 #define wxHAVE_WIN32_MB2WC
ef199164 42#endif
e95354ec 43
b040e242 44#ifdef HAVE_ICONV
373658eb 45 #include <iconv.h>
b1d547eb 46 #include "wx/thread.h"
1cd52418 47#endif
1cd52418 48
373658eb
VZ
49#include "wx/encconv.h"
50#include "wx/fontmap.h"
51
5c4ed98d 52#ifdef __DARWIN__
c933e267 53#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
54#endif //def __DARWIN__
55
ef199164 56
9a83f860 57#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 58
467e0479
VZ
59// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
60// be 4 bytes
4948c2b6 61#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
62 #define WC_UTF16
63#endif
64
ef199164 65
373658eb
VZ
66// ============================================================================
67// implementation
68// ============================================================================
69
69373110
VZ
// Helper used by wxMBConv::ToWChar(): returns true if at least one of the n
// bytes starting at p is not NUL and false if all of them are NUL (which is
// trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        // stop at the first non-NUL byte, the rest is never examined
        if ( *p != '\0' )
            return true;
    }

    return false;
}
78
373658eb 79// ----------------------------------------------------------------------------
467e0479 80// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 81// ----------------------------------------------------------------------------
6001e347 82
c91830cb 83static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 84{
ef199164 85 if (input <= 0xffff)
4def3b35 86 {
999836aa
VZ
87 if (output)
88 *output = (wxUint16) input;
ef199164 89
4def3b35 90 return 1;
dccce9ea 91 }
ef199164 92 else if (input >= 0x110000)
4def3b35 93 {
467e0479 94 return wxCONV_FAILED;
dccce9ea
VZ
95 }
96 else
4def3b35 97 {
dccce9ea 98 if (output)
4def3b35 99 {
ef199164
DS
100 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
101 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 102 }
ef199164 103
4def3b35 104 return 2;
1cd52418 105 }
1cd52418
OK
106}
107
c91830cb 108static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 109{
ef199164 110 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
111 {
112 output = *input;
113 return 1;
dccce9ea 114 }
ef199164 115 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
116 {
117 output = *input;
467e0479 118 return wxCONV_FAILED;
dccce9ea
VZ
119 }
120 else
4def3b35
VS
121 {
122 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
123 return 2;
124 }
1cd52418
OK
125}
126
467e0479 127#ifdef WC_UTF16
35d11700
VZ
128 typedef wchar_t wxDecodeSurrogate_t;
129#else // !WC_UTF16
130 typedef wxUint16 wxDecodeSurrogate_t;
131#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
132
133// returns the next UTF-32 character from the wchar_t buffer and advances the
134// pointer to the character after this one
135//
136// if an invalid character is found, *pSrc is set to NULL, the caller must
137// check for this
35d11700 138static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
139{
140 wxUint32 out;
8d3dd069 141 const size_t
5c33522f 142 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
143 if ( n == wxCONV_FAILED )
144 *pSrc = NULL;
145 else
146 *pSrc += n;
147
148 return out;
149}
150
f6bcfd97 151// ----------------------------------------------------------------------------
6001e347 152// wxMBConv
f6bcfd97 153// ----------------------------------------------------------------------------
2c53a80a 154
483b0434
VZ
155size_t
156wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
157 const char *src, size_t srcLen) const
6001e347 158{
483b0434 159 // although new conversion classes are supposed to implement this function
36f93678 160 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
161 // avoid to have to rewrite all conversion classes at once, we provide a
162 // default (but not efficient) implementation of this one in terms of the
163 // old function by copying the input to ensure that it's NUL-terminated and
164 // then using MB2WC() to convert it
36f93678
VZ
165 //
166 // moreover, some conversion classes simply can't implement ToWChar()
167 // directly, the primary example is wxConvLibc: mbstowcs() only handles
168 // NUL-terminated strings
6001e347 169
483b0434
VZ
170 // the number of chars [which would be] written to dst [if it were not NULL]
171 size_t dstWritten = 0;
eec47cc6 172
c1464d9d 173 // the number of NULs terminating this string
a78c43f1 174 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 175
c1464d9d
VZ
176 // if we were not given the input size we just have to assume that the
177 // string is properly terminated as we have no way of knowing how long it
178 // is anyhow, but if we do have the size check whether there are enough
179 // NULs at the end
483b0434
VZ
180 wxCharBuffer bufTmp;
181 const char *srcEnd;
467e0479 182 if ( srcLen != wxNO_LEN )
eec47cc6 183 {
c1464d9d 184 // we need to know how to find the end of this string
7ef3ab50 185 nulLen = GetMBNulLen();
483b0434
VZ
186 if ( nulLen == wxCONV_FAILED )
187 return wxCONV_FAILED;
e4e3bbb4 188
c1464d9d 189 // if there are enough NULs we can avoid the copy
483b0434 190 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
191 {
192 // make a copy in order to properly NUL-terminate the string
483b0434 193 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 194 char * const p = bufTmp.data();
483b0434
VZ
195 memcpy(p, src, srcLen);
196 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 197 *s = '\0';
483b0434
VZ
198
199 src = bufTmp;
eec47cc6 200 }
e4e3bbb4 201
483b0434
VZ
202 srcEnd = src + srcLen;
203 }
204 else // quit after the first loop iteration
205 {
206 srcEnd = NULL;
207 }
e4e3bbb4 208
36f93678
VZ
209 // the idea of this code is straightforward: it converts a NUL-terminated
210 // chunk of the string during each iteration and updates the output buffer
211 // with the result
212 //
213 // all the complication come from the fact that this function, for
214 // historical reasons, must behave in 2 subtly different ways when it's
215 // called with a fixed number of characters and when it's called for the
bbb0ff36 216 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
36f93678
VZ
217 // must count all characters we convert, NUL or not; but in the latter we
218 // do not count the trailing NUL -- but still count all the NULs inside the
219 // string
220 //
221 // so for the (simple) former case we just always count the trailing NUL,
222 // but for the latter we need to wait until we see if there is going to be
223 // another loop iteration and only count it then
483b0434 224 for ( ;; )
eec47cc6 225 {
c1464d9d 226 // try to convert the current chunk
483b0434 227 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
228 if ( lenChunk == wxCONV_FAILED )
229 return wxCONV_FAILED;
e4e3bbb4 230
483b0434 231 dstWritten += lenChunk;
f6a02087
VZ
232 if ( !srcEnd )
233 dstWritten++;
f5fb6871 234
f6a02087 235 if ( !lenChunk )
467e0479
VZ
236 {
237 // nothing left in the input string, conversion succeeded
238 break;
239 }
240
483b0434
VZ
241 if ( dst )
242 {
243 if ( dstWritten > dstLen )
244 return wxCONV_FAILED;
245
f6a02087
VZ
246 // +1 is for trailing NUL
247 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
248 return wxCONV_FAILED;
249
250 dst += lenChunk;
f6a02087
VZ
251 if ( !srcEnd )
252 dst++;
483b0434 253 }
c1464d9d 254
483b0434 255 if ( !srcEnd )
c1464d9d 256 {
467e0479 257 // we convert just one chunk in this case as this is the entire
bbb0ff36 258 // string anyhow (and we don't count the trailing NUL in this case)
c1464d9d
VZ
259 break;
260 }
eec47cc6 261
bbb0ff36
VZ
262 // advance the input pointer past the end of this chunk: notice that we
263 // will always stop before srcEnd because we know that the chunk is
264 // always properly NUL-terminated
483b0434 265 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
266 {
267 // notice that we must skip over multiple bytes here as we suppose
268 // that if NUL takes 2 or 4 bytes, then all the other characters do
269 // too and so if advanced by a single byte we might erroneously
270 // detect sequences of NUL bytes in the middle of the input
483b0434 271 src += nulLen;
c1464d9d 272 }
e4e3bbb4 273
bbb0ff36
VZ
274 // if the buffer ends before this NUL, we shouldn't count it in our
275 // output so skip the code below
276 if ( src == srcEnd )
277 break;
278
279 // do count this terminator as it's inside the buffer we convert
280 dstWritten++;
281 if ( dst )
282 dst++;
283
284 src += nulLen; // skip the terminator itself
c1464d9d 285
483b0434 286 if ( src >= srcEnd )
c1464d9d
VZ
287 break;
288 }
289
483b0434 290 return dstWritten;
e4e3bbb4
RN
291}
292
483b0434
VZ
293size_t
294wxMBConv::FromWChar(char *dst, size_t dstLen,
295 const wchar_t *src, size_t srcLen) const
e4e3bbb4 296{
483b0434
VZ
297 // the number of chars [which would be] written to dst [if it were not NULL]
298 size_t dstWritten = 0;
e4e3bbb4 299
f6a02087
VZ
300 // if we don't know its length we have no choice but to assume that it is
301 // NUL-terminated (notice that it can still be NUL-terminated even if
302 // explicit length is given but it doesn't change our return value)
303 const bool isNulTerminated = srcLen == wxNO_LEN;
304
eec47cc6
VZ
305 // make a copy of the input string unless it is already properly
306 // NUL-terminated
eec47cc6 307 wxWCharBuffer bufTmp;
f6a02087 308 if ( isNulTerminated )
e4e3bbb4 309 {
483b0434 310 srcLen = wxWcslen(src) + 1;
eec47cc6 311 }
483b0434 312 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
313 {
314 // make a copy in order to properly NUL-terminate the string
483b0434 315 bufTmp = wxWCharBuffer(srcLen);
ef199164 316 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
317 src = bufTmp;
318 }
319
320 const size_t lenNul = GetMBNulLen();
321 for ( const wchar_t * const srcEnd = src + srcLen;
322 src < srcEnd;
27307233 323 src++ /* skip L'\0' too */ )
483b0434
VZ
324 {
325 // try to convert the current chunk
326 size_t lenChunk = WC2MB(NULL, src, 0);
483b0434
VZ
327 if ( lenChunk == wxCONV_FAILED )
328 return wxCONV_FAILED;
329
483b0434 330 dstWritten += lenChunk;
27307233
VZ
331
332 const wchar_t * const
333 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
334
335 // our return value accounts for the trailing NUL(s), unlike that of
336 // WC2MB(), however don't do it for the last NUL we artificially added
337 // ourselves above
338 if ( chunkEnd < srcEnd )
f6a02087 339 dstWritten += lenNul;
483b0434
VZ
340
341 if ( dst )
342 {
343 if ( dstWritten > dstLen )
344 return wxCONV_FAILED;
345
27307233
VZ
346 // if we know that there is enough space in the destination buffer
347 // (because we accounted for lenNul in dstWritten above), we can
348 // convert directly in place -- but otherwise we need another
349 // temporary buffer to ensure that we don't overwrite the output
350 wxCharBuffer dstBuf;
351 char *dstTmp;
352 if ( chunkEnd == srcEnd )
353 {
354 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
355 dstTmp = dstBuf.data();
356 }
357 else
358 {
359 dstTmp = dst;
360 }
361
362 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
363 return wxCONV_FAILED;
364
27307233
VZ
365 if ( dstTmp != dst )
366 {
367 // copy everything up to but excluding the terminating NUL(s)
368 // into the real output buffer
369 memcpy(dst, dstTmp, lenChunk);
370
371 // micro-optimization: if dstTmp != dst it means that chunkEnd
372 // == srcEnd and so we're done, no need to update anything below
373 break;
374 }
375
483b0434 376 dst += lenChunk;
27307233 377 if ( chunkEnd < srcEnd )
f6a02087 378 dst += lenNul;
483b0434 379 }
27307233
VZ
380
381 src = chunkEnd;
eec47cc6 382 }
e4e3bbb4 383
483b0434
VZ
384 return dstWritten;
385}
386
ef199164 387size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 388{
51725fc0 389 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 390 if ( rc != wxCONV_FAILED )
509da451
VZ
391 {
392 // ToWChar() returns the buffer length, i.e. including the trailing
393 // NUL, while this method doesn't take it into account
394 rc--;
395 }
396
397 return rc;
398}
399
ef199164 400size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 401{
51725fc0 402 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 403 if ( rc != wxCONV_FAILED )
509da451 404 {
51725fc0 405 rc -= GetMBNulLen();
509da451
VZ
406 }
407
408 return rc;
409}
410
483b0434
VZ
411wxMBConv::~wxMBConv()
412{
413 // nothing to do here (necessary for Darwin linking probably)
414}
e4e3bbb4 415
483b0434
VZ
416const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
417{
418 if ( psz )
eec47cc6 419 {
483b0434 420 // calculate the length of the buffer needed first
a2db25a1 421 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 422 if ( nLen != wxCONV_FAILED )
f5fb6871 423 {
483b0434 424 // now do the actual conversion
a2db25a1 425 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 426
483b0434 427 // +1 for the trailing NULL
a2db25a1 428 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 429 return buf;
f5fb6871 430 }
483b0434 431 }
e4e3bbb4 432
483b0434
VZ
433 return wxWCharBuffer();
434}
3698ae71 435
483b0434
VZ
436const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
437{
438 if ( pwz )
439 {
a2db25a1 440 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 441 if ( nLen != wxCONV_FAILED )
483b0434 442 {
a2db25a1
VZ
443 wxCharBuffer buf(nLen - 1);
444 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
445 return buf;
446 }
447 }
448
449 return wxCharBuffer();
450}
e4e3bbb4 451
483b0434 452const wxWCharBuffer
ef199164 453wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 454{
ef199164 455 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 456 if ( dstLen != wxCONV_FAILED )
483b0434 457 {
0dd13d21
VZ
458 // notice that we allocate space for dstLen+1 wide characters here
459 // because we want the buffer to always be NUL-terminated, even if the
460 // input isn't (as otherwise the caller has no way to know its length)
461 wxWCharBuffer wbuf(dstLen);
ef199164 462 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
463 {
464 if ( outLen )
467e0479
VZ
465 {
466 *outLen = dstLen;
f6a02087
VZ
467
468 // we also need to handle NUL-terminated input strings
469 // specially: for them the output is the length of the string
470 // excluding the trailing NUL, however if we're asked to
471 // convert a specific number of characters we return the length
472 // of the resulting output even if it's NUL-terminated
473 if ( inLen == wxNO_LEN )
467e0479
VZ
474 (*outLen)--;
475 }
476
483b0434
VZ
477 return wbuf;
478 }
479 }
480
481 if ( outLen )
482 *outLen = 0;
483
484 return wxWCharBuffer();
485}
486
487const wxCharBuffer
ef199164 488wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 489{
13d92ad6 490 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 491 if ( dstLen != wxCONV_FAILED )
483b0434 492 {
0dd13d21
VZ
493 const size_t nulLen = GetMBNulLen();
494
495 // as above, ensure that the buffer is always NUL-terminated, even if
496 // the input is not
497 wxCharBuffer buf(dstLen + nulLen - 1);
498 memset(buf.data() + dstLen, 0, nulLen);
ef199164 499 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
500 {
501 if ( outLen )
467e0479
VZ
502 {
503 *outLen = dstLen;
504
f6a02087 505 if ( inLen == wxNO_LEN )
467e0479 506 {
f6a02087
VZ
507 // in this case both input and output are NUL-terminated
508 // and we're not supposed to count NUL
13d92ad6 509 *outLen -= nulLen;
467e0479
VZ
510 }
511 }
d32a507d 512
483b0434
VZ
513 return buf;
514 }
e4e3bbb4
RN
515 }
516
eec47cc6
VZ
517 if ( outLen )
518 *outLen = 0;
519
520 return wxCharBuffer();
e4e3bbb4
RN
521}
522
40ac5040
VZ
523const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
524{
525 const size_t srcLen = buf.length();
526 if ( srcLen )
527 {
528 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
529 if ( dstLen != wxCONV_FAILED )
530 {
531 wxWCharBuffer wbuf(dstLen);
532 wbuf.data()[dstLen] = L'\0';
533 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
534 return wbuf;
535 }
536 }
537
cfcfada9 538 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
40ac5040
VZ
539}
540
541const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
542{
543 const size_t srcLen = wbuf.length();
544 if ( srcLen )
545 {
546 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
547 if ( dstLen != wxCONV_FAILED )
548 {
549 wxCharBuffer buf(dstLen);
550 buf.data()[dstLen] = '\0';
551 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
552 return buf;
553 }
554 }
555
cfcfada9 556 return wxScopedCharBuffer::CreateNonOwned("", 0);
40ac5040
VZ
557}
558
6001e347 559// ----------------------------------------------------------------------------
bde4baac 560// wxMBConvLibc
6001e347
RR
561// ----------------------------------------------------------------------------
562
bde4baac
VZ
563size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
564{
565 return wxMB2WC(buf, psz, n);
566}
567
568size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
569{
570 return wxWC2MB(buf, psz, n);
571}
e1bfe89e
RR
572
573// ----------------------------------------------------------------------------
532d575b 574// wxConvBrokenFileNames
e1bfe89e
RR
575// ----------------------------------------------------------------------------
576
eec47cc6
VZ
577#ifdef __UNIX__
578
86501081 579wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 580{
9a83f860
VZ
581 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
582 wxStricmp(charset, wxT("UTF8")) == 0 )
5deedd6e 583 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
584 else
585 m_conv = new wxCSConv(charset);
ea8ce907
RR
586}
587
eec47cc6 588#endif // __UNIX__
c12b7f79 589
bde4baac 590// ----------------------------------------------------------------------------
3698ae71 591// UTF-7
bde4baac 592// ----------------------------------------------------------------------------
6001e347 593
15f2ee32 594// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
595//
596// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 597
15f2ee32
RN
//
// BASE64 decoding table: maps each byte to the 6 bit value it encodes or to
// 0xff if the byte is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x20..0x2f: only '+' (0x2b) and '/' (0x2f) are valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: nothing valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
636
9d653e81
VZ
637size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
638 const char *src, size_t srcLen) const
15f2ee32 639{
9d653e81 640 DecoderState stateOrig,
852dcba5 641 *statePtr;
9d653e81
VZ
642 if ( srcLen == wxNO_LEN )
643 {
644 // convert the entire string, up to and including the trailing NUL
645 srcLen = strlen(src) + 1;
646
647 // when working on the entire strings we don't update nor use the shift
648 // state from the previous call
649 statePtr = &stateOrig;
650 }
651 else // when working with partial strings we do use the shift state
652 {
5c33522f 653 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
654
655 // also save the old state to be able to rollback to it on error
656 stateOrig = m_stateDecoder;
657 }
658
659 // but to simplify the code below we use this variable in both cases
660 DecoderState& state = *statePtr;
661
662
663 // number of characters [which would have been] written to dst [if it were
664 // not NULL]
15f2ee32
RN
665 size_t len = 0;
666
9d653e81
VZ
667 const char * const srcEnd = src + srcLen;
668
669 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 670 {
9d653e81
VZ
671 const unsigned char cc = *src++;
672
673 if ( state.IsShifted() )
15f2ee32 674 {
9d653e81
VZ
675 const unsigned char dc = utf7unb64[cc];
676 if ( dc == 0xff )
15f2ee32 677 {
ccaa848d
VZ
678 // end of encoded part, check that nothing was left: there can
679 // be up to 4 bits of 0 padding but nothing else (we also need
680 // to check isLSB as we count bits modulo 8 while a valid UTF-7
681 // encoded sequence must contain an integral number of UTF-16
682 // characters)
683 if ( state.isLSB || state.bit > 4 ||
684 (state.accum & ((1 << state.bit) - 1)) )
685 {
686 if ( !len )
687 state = stateOrig;
688
852dcba5 689 return wxCONV_FAILED;
ccaa848d 690 }
852dcba5 691
9d653e81
VZ
692 state.ToDirect();
693
694 // re-parse this character normally below unless it's '-' which
695 // is consumed by the decoder
696 if ( cc == '-' )
697 continue;
698 }
699 else // valid encoded character
700 {
701 // mini base64 decoder: each character is 6 bits
702 state.bit += 6;
703 state.accum <<= 6;
704 state.accum += dc;
705
706 if ( state.bit >= 8 )
15f2ee32 707 {
9d653e81
VZ
708 // got the full byte, consume it
709 state.bit -= 8;
710 unsigned char b = (state.accum >> state.bit) & 0x00ff;
711
712 if ( state.isLSB )
15f2ee32 713 {
9d653e81
VZ
714 // we've got the full word, output it
715 if ( dst )
716 *dst++ = (state.msb << 8) | b;
717 len++;
718 state.isLSB = false;
15f2ee32 719 }
9d653e81 720 else // MSB
04a37834 721 {
9d653e81
VZ
722 // just store it while we wait for LSB
723 state.msb = b;
724 state.isLSB = true;
04a37834 725 }
15f2ee32
RN
726 }
727 }
9d653e81 728 }
04a37834 729
9d653e81
VZ
730 if ( state.IsDirect() )
731 {
732 // start of an encoded segment?
733 if ( cc == '+' )
04a37834 734 {
9d653e81
VZ
735 if ( *src == '-' )
736 {
737 // just the encoded plus sign, don't switch to shifted mode
738 if ( dst )
739 *dst++ = '+';
740 len++;
741 src++;
742 }
ccaa848d
VZ
743 else if ( utf7unb64[(unsigned)*src] == 0xff )
744 {
745 // empty encoded chunks are not allowed
746 if ( !len )
747 state = stateOrig;
748
749 return wxCONV_FAILED;
750 }
751 else // base-64 encoded chunk follows
9d653e81
VZ
752 {
753 state.ToShifted();
754 }
755 }
756 else // not '+'
757 {
758 // only printable 7 bit ASCII characters (with the exception of
759 // NUL, TAB, CR and LF) can be used directly
760 if ( cc >= 0x7f || (cc < ' ' &&
761 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
762 return wxCONV_FAILED;
763
764 if ( dst )
765 *dst++ = cc;
766 len++;
767 }
15f2ee32
RN
768 }
769 }
04a37834 770
9d653e81
VZ
771 if ( !len )
772 {
773 // as we didn't read any characters we should be called with the same
774 // data (followed by some more new data) again later so don't save our
775 // state
776 state = stateOrig;
777
778 return wxCONV_FAILED;
779 }
04a37834 780
15f2ee32 781 return len;
6001e347
RR
782}
783
15f2ee32
RN
//
// BASE64 encoding table: the character used for each of 64 possible values
//
static const unsigned char utf7enb64[] =
{
    // 0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
798
//
// UTF-7 encoding table, indexed by the (7 bit) character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if the character is in the class 0 of the table above, i.e.
// is always written as is in UTF-7 output, and false for all the other
// characters, including all those outside of the 7 bit range.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms, so do the range check on
    // the unsigned value: this rejects the negative values which would
    // otherwise pass the "< 0x80" test and be used as out of bounds indices
    return static_cast<unsigned long>(wc) < 0x80 && utf7encode[wc] < 1;
}
823
824size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
825 const wchar_t *src, size_t srcLen) const
15f2ee32 826{
9d653e81
VZ
827 EncoderState stateOrig,
828 *statePtr;
829 if ( srcLen == wxNO_LEN )
830 {
831 // we don't apply the stored state when operating on entire strings at
832 // once
833 statePtr = &stateOrig;
834
835 srcLen = wxWcslen(src) + 1;
836 }
837 else // do use the mode we left the output in previously
838 {
839 stateOrig = m_stateEncoder;
5c33522f 840 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
841 }
842
843 EncoderState& state = *statePtr;
844
845
15f2ee32
RN
846 size_t len = 0;
847
9d653e81
VZ
848 const wchar_t * const srcEnd = src + srcLen;
849 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 850 {
9d653e81
VZ
851 wchar_t cc = *src++;
852 if ( wxIsUTF7Direct(cc) )
15f2ee32 853 {
9d653e81
VZ
854 if ( state.IsShifted() )
855 {
856 // pad with zeros the last encoded block if necessary
857 if ( state.bit )
858 {
859 if ( dst )
860 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
861 len++;
862 }
ef199164 863
9d653e81
VZ
864 state.ToDirect();
865
866 if ( dst )
867 *dst++ = '-';
868 len++;
869 }
870
871 if ( dst )
872 *dst++ = (char)cc;
15f2ee32
RN
873 len++;
874 }
9d653e81
VZ
875 else if ( cc == '+' && state.IsDirect() )
876 {
877 if ( dst )
878 {
879 *dst++ = '+';
880 *dst++ = '-';
881 }
882
883 len += 2;
884 }
15f2ee32 885#ifndef WC_UTF16
79c78d42 886 else if (((wxUint32)cc) > 0xffff)
b2c13097 887 {
15f2ee32 888 // no surrogate pair generation (yet?)
467e0479 889 return wxCONV_FAILED;
15f2ee32
RN
890 }
891#endif
892 else
893 {
9d653e81
VZ
894 if ( state.IsDirect() )
895 {
896 state.ToShifted();
ef199164 897
9d653e81
VZ
898 if ( dst )
899 *dst++ = '+';
900 len++;
901 }
902
903 // BASE64 encode string
904 for ( ;; )
15f2ee32 905 {
9d653e81 906 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 907 {
9d653e81
VZ
908 state.accum <<= 8;
909 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
910
911 for (state.bit += 8; state.bit >= 6; )
15f2ee32 912 {
9d653e81
VZ
913 state.bit -= 6;
914 if ( dst )
915 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
916 len++;
15f2ee32 917 }
15f2ee32 918 }
ef199164 919
9d653e81
VZ
920 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
921 break;
ef199164 922
9d653e81 923 src++;
15f2ee32 924 }
15f2ee32
RN
925 }
926 }
ef199164 927
9d653e81
VZ
928 // we need to restore the original encoder state if we were called just to
929 // calculate the amount of space needed as we will presumably be called
930 // again to really convert the data now
931 if ( !dst )
932 state = stateOrig;
ef199164 933
15f2ee32 934 return len;
6001e347
RR
935}
936
f6bcfd97 937// ----------------------------------------------------------------------------
6001e347 938// UTF-8
f6bcfd97 939// ----------------------------------------------------------------------------
6001e347 940
1774c3c5 941static const wxUint32 utf8_max[]=
4def3b35 942 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 943
3698ae71
VZ
944// boundaries of the private use area we use to (temporarily) remap invalid
945// characters invalid in a UTF-8 encoded string
ea8ce907
RR
946const wxUint32 wxUnicodePUA = 0x100000;
947const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
948
// this table maps the first byte of a UTF-8 sequence to the total length of
// this sequence in bytes, 0 is used for the bytes which can't start one
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded as a single byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0 and C1 are invalid too, C2..DF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4 start four-byte sequences while F5..FF are invalid again: they
    // would start 5- or 6-byte sequences or sequences for code points above
    // U+10FFFF, which are not allowed by RFC 3629
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
983
984size_t
985wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
986 const char *src, size_t srcLen) const
987{
988 wchar_t *out = dstLen ? dst : NULL;
989 size_t written = 0;
990
991 if ( srcLen == wxNO_LEN )
992 srcLen = strlen(src) + 1;
993
994 for ( const char *p = src; ; p++ )
995 {
0dcbb107 996 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
0286d08d
VZ
997 {
998 // all done successfully, just add the trailing NULL if we are not
999 // using explicit length
1000 if ( srcLen == wxNO_LEN )
1001 {
1002 if ( out )
1003 {
1004 if ( !dstLen )
1005 break;
1006
1007 *out = L'\0';
1008 }
1009
1010 written++;
1011 }
1012
1013 return written;
1014 }
1015
0286d08d
VZ
1016 if ( out && !dstLen-- )
1017 break;
1018
5367a38a
VS
1019 wxUint32 code;
1020 unsigned char c = *p;
0286d08d 1021
5367a38a
VS
1022 if ( c < 0x80 )
1023 {
1024 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1025 break;
0286d08d 1026
5367a38a
VS
1027 if ( srcLen != wxNO_LEN )
1028 srcLen--;
0286d08d 1029
5367a38a
VS
1030 code = c;
1031 }
1032 else
0286d08d 1033 {
5367a38a
VS
1034 unsigned len = tableUtf8Lengths[c];
1035 if ( !len )
1036 break;
1037
1038 if ( srcLen < len ) // the test works for wxNO_LEN too
1039 break;
1040
1041 if ( srcLen != wxNO_LEN )
1042 srcLen -= len;
1043
1044 // Char. number range | UTF-8 octet sequence
1045 // (hexadecimal) | (binary)
1046 // ----------------------+----------------------------------------
1047 // 0000 0000 - 0000 007F | 0xxxxxxx
1048 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1049 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1050 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1051 //
1052 // Code point value is stored in bits marked with 'x',
1053 // lowest-order bit of the value on the right side in the diagram
1054 // above. (from RFC 3629)
1055
1056 // mask to extract lead byte's value ('x' bits above), by sequence
1057 // length:
1058 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1059
1060 // mask and value of lead byte's most significant bits, by length:
1061 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1062 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1063
1064 len--; // it's more convenient to work with 0-based length here
1065
1066 // extract the lead byte's value bits:
1067 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1068 break;
1069
1070 code = c & leadValueMask[len];
1071
1072 // all remaining bytes, if any, are handled in the same way
1073 // regardless of sequence's length:
1074 for ( ; len; --len )
1075 {
1076 c = *++p;
1077 if ( (c & 0xC0) != 0x80 )
1078 return wxCONV_FAILED;
0286d08d 1079
5367a38a
VS
1080 code <<= 6;
1081 code |= c & 0x3F;
1082 }
0286d08d
VZ
1083 }
1084
1085#ifdef WC_UTF16
1086 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1087 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1088 {
1089 if ( out )
1090 out++;
1091 written++;
1092 }
1093#else // !WC_UTF16
1094 if ( out )
1095 *out = code;
1096#endif // WC_UTF16/!WC_UTF16
1097
1098 if ( out )
1099 out++;
1100
1101 written++;
1102 }
1103
1104 return wxCONV_FAILED;
1105}
1106
1107size_t
1108wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1109 const wchar_t *src, size_t srcLen) const
1110{
1111 char *out = dstLen ? dst : NULL;
1112 size_t written = 0;
1113
1114 for ( const wchar_t *wp = src; ; wp++ )
1115 {
0dcbb107 1116 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
0286d08d
VZ
1117 {
1118 // all done successfully, just add the trailing NULL if we are not
1119 // using explicit length
1120 if ( srcLen == wxNO_LEN )
1121 {
1122 if ( out )
1123 {
1124 if ( !dstLen )
1125 break;
1126
1127 *out = '\0';
1128 }
1129
1130 written++;
1131 }
1132
1133 return written;
1134 }
1135
a964d3ed
VZ
1136 if ( srcLen != wxNO_LEN )
1137 srcLen--;
0286d08d
VZ
1138
1139 wxUint32 code;
1140#ifdef WC_UTF16
1141 // cast is ok for WC_UTF16
1142 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1143 {
1144 // skip the next char too as we decoded a surrogate
1145 wp++;
041e6050
VZ
1146 if ( srcLen != wxNO_LEN )
1147 srcLen--;
0286d08d
VZ
1148 }
1149#else // wchar_t is UTF-32
1150 code = *wp & 0x7fffffff;
1151#endif
1152
1153 unsigned len;
1154 if ( code <= 0x7F )
1155 {
1156 len = 1;
1157 if ( out )
1158 {
1159 if ( dstLen < len )
1160 break;
1161
1162 out[0] = (char)code;
1163 }
1164 }
1165 else if ( code <= 0x07FF )
1166 {
1167 len = 2;
1168 if ( out )
1169 {
1170 if ( dstLen < len )
1171 break;
1172
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1176 out[0] = 0xC0 | code;
1177 }
1178 }
1179 else if ( code < 0xFFFF )
1180 {
1181 len = 3;
1182 if ( out )
1183 {
1184 if ( dstLen < len )
1185 break;
1186
1187 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1188 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[0] = 0xE0 | code;
1190 }
1191 }
1192 else if ( code <= 0x10FFFF )
1193 {
1194 len = 4;
1195 if ( out )
1196 {
1197 if ( dstLen < len )
1198 break;
1199
1200 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1201 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[0] = 0xF0 | code;
1204 }
1205 }
1206 else
1207 {
9a83f860 1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1209 break;
1210 }
1211
1212 if ( out )
1213 {
1214 out += len;
1215 dstLen -= len;
1216 }
1217
1218 written += len;
1219 }
1220
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED;
1223}
1224
d16d0917
VZ
1225size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1226 const char *psz, size_t srcLen) const
6001e347 1227{
0286d08d 1228 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1229 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1230
4def3b35
VS
1231 size_t len = 0;
1232
f4cb7c58
VZ
1233 // The length can be either given explicitly or computed implicitly for the
1234 // NUL-terminated strings.
1235 const bool isNulTerminated = srcLen == wxNO_LEN;
1236 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1237 {
ea8ce907
RR
1238 const char *opsz = psz;
1239 bool invalid = false;
4def3b35
VS
1240 unsigned char cc = *psz++, fc = cc;
1241 unsigned cnt;
dccce9ea 1242 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1243 fc <<= 1;
ef199164 1244
dccce9ea 1245 if (!cnt)
4def3b35
VS
1246 {
1247 // plain ASCII char
dccce9ea 1248 if (buf)
4def3b35
VS
1249 *buf++ = cc;
1250 len++;
561488ef
MW
1251
1252 // escape the escape character for octal escapes
1253 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1254 && cc == '\\' && (!buf || len < n))
1255 {
1256 if (buf)
1257 *buf++ = cc;
1258 len++;
1259 }
dccce9ea
VZ
1260 }
1261 else
4def3b35
VS
1262 {
1263 cnt--;
dccce9ea 1264 if (!cnt)
4def3b35
VS
1265 {
1266 // invalid UTF-8 sequence
ea8ce907 1267 invalid = true;
dccce9ea
VZ
1268 }
1269 else
4def3b35
VS
1270 {
1271 unsigned ocnt = cnt - 1;
1272 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1273 while (cnt--)
4def3b35 1274 {
ea8ce907 1275 cc = *psz;
dccce9ea 1276 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1277 {
1278 // invalid UTF-8 sequence
ea8ce907
RR
1279 invalid = true;
1280 break;
4def3b35 1281 }
ef199164 1282
ea8ce907 1283 psz++;
4def3b35
VS
1284 res = (res << 6) | (cc & 0x3f);
1285 }
ef199164 1286
ea8ce907 1287 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1288 {
1289 // illegal UTF-8 encoding
ea8ce907 1290 invalid = true;
4def3b35 1291 }
ea8ce907
RR
1292 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1293 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1294 {
1295 // if one of our PUA characters turns up externally
1296 // it must also be treated as an illegal sequence
1297 // (a bit like you have to escape an escape character)
1298 invalid = true;
1299 }
1300 else
1301 {
1cd52418 1302#ifdef WC_UTF16
0286d08d 1303 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1304 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1305 if (pa == wxCONV_FAILED)
ea8ce907
RR
1306 {
1307 invalid = true;
1308 }
1309 else
1310 {
1311 if (buf)
1312 buf += pa;
1313 len += pa;
1314 }
373658eb 1315#else // !WC_UTF16
ea8ce907 1316 if (buf)
38d4b1e4 1317 *buf++ = (wchar_t)res;
ea8ce907 1318 len++;
373658eb 1319#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1320 }
1321 }
ef199164 1322
ea8ce907
RR
1323 if (invalid)
1324 {
1325 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1326 {
1327 while (opsz < psz && (!buf || len < n))
1328 {
1329#ifdef WC_UTF16
1330 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1331 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1332 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1333 if (buf)
1334 buf += pa;
1335 opsz++;
1336 len += pa;
1337#else
1338 if (buf)
38d4b1e4 1339 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1340 opsz++;
1341 len++;
1342#endif
1343 }
1344 }
3698ae71 1345 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1346 {
1347 while (opsz < psz && (!buf || len < n))
1348 {
3698ae71
VZ
1349 if ( buf && len + 3 < n )
1350 {
17a1ebd1 1351 unsigned char on = *opsz;
3698ae71 1352 *buf++ = L'\\';
17a1ebd1
VZ
1353 *buf++ = (wchar_t)( L'0' + on / 0100 );
1354 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1355 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1356 }
ef199164 1357
ea8ce907
RR
1358 opsz++;
1359 len += 4;
1360 }
1361 }
3698ae71 1362 else // MAP_INVALID_UTF8_NOT
ea8ce907 1363 {
467e0479 1364 return wxCONV_FAILED;
ea8ce907 1365 }
4def3b35
VS
1366 }
1367 }
6001e347 1368 }
ef199164 1369
f4cb7c58
VZ
1370 if ( isNulTerminated )
1371 {
1372 // Add the trailing NUL in this case if we have a large enough buffer.
1373 if ( buf && (len < n) )
1374 *buf = 0;
ef199164 1375
f4cb7c58
VZ
1376 // And count it in any case.
1377 len++;
1378 }
1379
1380 return len;
6001e347
RR
1381}
1382
3698ae71
VZ
// Check whether the given wide character is one of the octal digits 0..7.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1387
d16d0917
VZ
1388size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1389 const wchar_t *psz, size_t srcLen) const
6001e347 1390{
0286d08d 1391 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1392 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1393
4def3b35 1394 size_t len = 0;
6001e347 1395
2ba61518
VZ
1396 // The length can be either given explicitly or computed implicitly for the
1397 // NUL-terminated strings.
1398 const bool isNulTerminated = srcLen == wxNO_LEN;
1399 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1400 {
1401 wxUint32 cc;
ef199164 1402
1cd52418 1403#ifdef WC_UTF16
b5153fd8
VZ
1404 // cast is ok for WC_UTF16
1405 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1406 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1407#else
ef199164 1408 cc = (*psz++) & 0x7fffffff;
4def3b35 1409#endif
3698ae71
VZ
1410
1411 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1412 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1413 {
dccce9ea 1414 if (buf)
ea8ce907 1415 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1416 len++;
3698ae71 1417 }
561488ef
MW
1418 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1419 && cc == L'\\' && psz[0] == L'\\' )
1420 {
1421 if (buf)
1422 *buf++ = (char)cc;
1423 psz++;
1424 len++;
1425 }
3698ae71
VZ
1426 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1427 cc == L'\\' &&
1428 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1429 {
dccce9ea 1430 if (buf)
3698ae71 1431 {
ef199164
DS
1432 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1433 (psz[1] - L'0') * 010 +
b2c13097 1434 (psz[2] - L'0'));
3698ae71
VZ
1435 }
1436
1437 psz += 3;
ea8ce907
RR
1438 len++;
1439 }
1440 else
1441 {
1442 unsigned cnt;
ef199164
DS
1443 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1444 {
1445 }
1446
ea8ce907 1447 if (!cnt)
4def3b35 1448 {
ea8ce907
RR
1449 // plain ASCII char
1450 if (buf)
1451 *buf++ = (char) cc;
1452 len++;
1453 }
ea8ce907
RR
1454 else
1455 {
1456 len += cnt + 1;
1457 if (buf)
1458 {
1459 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1460 while (cnt--)
1461 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1462 }
4def3b35
VS
1463 }
1464 }
6001e347 1465 }
4def3b35 1466
2ba61518
VZ
1467 if ( isNulTerminated )
1468 {
1469 // Add the trailing NUL in this case if we have a large enough buffer.
1470 if ( buf && (len < n) )
1471 *buf = 0;
1472
1473 // And count it in any case.
1474 len++;
1475 }
adb45366 1476
2ba61518 1477 return len;
6001e347
RR
1478}
1479
467e0479 1480// ============================================================================
c91830cb 1481// UTF-16
467e0479 1482// ============================================================================
c91830cb
VZ
1483
1484#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1485 #define wxMBConvUTF16straight wxMBConvUTF16BE
1486 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1487#else
bde4baac
VZ
1488 #define wxMBConvUTF16swap wxMBConvUTF16BE
1489 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1490#endif
1491
467e0479
VZ
1492/* static */
1493size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1494{
1495 if ( srcLen == wxNO_LEN )
1496 {
1497 // count the number of bytes in input, including the trailing NULs
5c33522f 1498 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1499 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1500 ;
c91830cb 1501
467e0479
VZ
1502 srcLen *= BYTES_PER_CHAR;
1503 }
1504 else // we already have the length
1505 {
1506 // we can only convert an entire number of UTF-16 characters
1507 if ( srcLen % BYTES_PER_CHAR )
1508 return wxCONV_FAILED;
1509 }
1510
1511 return srcLen;
1512}
1513
1514// case when in-memory representation is UTF-16 too
c91830cb
VZ
1515#ifdef WC_UTF16
1516
467e0479
VZ
1517// ----------------------------------------------------------------------------
1518// conversions without endianness change
1519// ----------------------------------------------------------------------------
1520
1521size_t
1522wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1523 const char *src, size_t srcLen) const
c91830cb 1524{
467e0479
VZ
1525 // set up the scene for using memcpy() (which is presumably more efficient
1526 // than copying the bytes one by one)
1527 srcLen = GetLength(src, srcLen);
1528 if ( srcLen == wxNO_LEN )
1529 return wxCONV_FAILED;
c91830cb 1530
ef199164 1531 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1532 if ( dst )
c91830cb 1533 {
467e0479
VZ
1534 if ( dstLen < inLen )
1535 return wxCONV_FAILED;
c91830cb 1536
467e0479 1537 memcpy(dst, src, srcLen);
c91830cb 1538 }
d32a507d 1539
467e0479 1540 return inLen;
c91830cb
VZ
1541}
1542
467e0479
VZ
1543size_t
1544wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1545 const wchar_t *src, size_t srcLen) const
c91830cb 1546{
467e0479
VZ
1547 if ( srcLen == wxNO_LEN )
1548 srcLen = wxWcslen(src) + 1;
c91830cb 1549
467e0479
VZ
1550 srcLen *= BYTES_PER_CHAR;
1551
1552 if ( dst )
c91830cb 1553 {
467e0479
VZ
1554 if ( dstLen < srcLen )
1555 return wxCONV_FAILED;
d32a507d 1556
467e0479 1557 memcpy(dst, src, srcLen);
c91830cb 1558 }
d32a507d 1559
467e0479 1560 return srcLen;
c91830cb
VZ
1561}
1562
467e0479
VZ
1563// ----------------------------------------------------------------------------
1564// endian-reversing conversions
1565// ----------------------------------------------------------------------------
c91830cb 1566
467e0479
VZ
1567size_t
1568wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1569 const char *src, size_t srcLen) const
c91830cb 1570{
467e0479
VZ
1571 srcLen = GetLength(src, srcLen);
1572 if ( srcLen == wxNO_LEN )
1573 return wxCONV_FAILED;
c91830cb 1574
467e0479
VZ
1575 srcLen /= BYTES_PER_CHAR;
1576
1577 if ( dst )
c91830cb 1578 {
467e0479
VZ
1579 if ( dstLen < srcLen )
1580 return wxCONV_FAILED;
1581
5c33522f 1582 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1583 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1584 {
ef199164 1585 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1586 }
c91830cb 1587 }
bfab25d4 1588
467e0479 1589 return srcLen;
c91830cb
VZ
1590}
1591
467e0479
VZ
1592size_t
1593wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1594 const wchar_t *src, size_t srcLen) const
c91830cb 1595{
467e0479
VZ
1596 if ( srcLen == wxNO_LEN )
1597 srcLen = wxWcslen(src) + 1;
c91830cb 1598
467e0479
VZ
1599 srcLen *= BYTES_PER_CHAR;
1600
1601 if ( dst )
c91830cb 1602 {
467e0479
VZ
1603 if ( dstLen < srcLen )
1604 return wxCONV_FAILED;
1605
5c33522f 1606 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1607 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1608 {
ef199164 1609 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1610 }
c91830cb 1611 }
eec47cc6 1612
467e0479 1613 return srcLen;
c91830cb
VZ
1614}
1615
467e0479 1616#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1617
467e0479
VZ
1618// ----------------------------------------------------------------------------
1619// conversions without endianness change
1620// ----------------------------------------------------------------------------
c91830cb 1621
35d11700
VZ
1622size_t
1623wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1624 const char *src, size_t srcLen) const
c91830cb 1625{
35d11700
VZ
1626 srcLen = GetLength(src, srcLen);
1627 if ( srcLen == wxNO_LEN )
1628 return wxCONV_FAILED;
c91830cb 1629
ef199164 1630 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1631 if ( !dst )
c91830cb 1632 {
35d11700
VZ
1633 // optimization: return maximal space which could be needed for this
1634 // string even if the real size could be smaller if the buffer contains
1635 // any surrogates
1636 return inLen;
c91830cb 1637 }
c91830cb 1638
35d11700 1639 size_t outLen = 0;
5c33522f 1640 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1641 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1642 {
ef199164
DS
1643 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1644 if ( !inBuff )
35d11700
VZ
1645 return wxCONV_FAILED;
1646
1647 if ( ++outLen > dstLen )
1648 return wxCONV_FAILED;
c91830cb 1649
35d11700
VZ
1650 *dst++ = ch;
1651 }
1652
1653
1654 return outLen;
1655}
c91830cb 1656
35d11700
VZ
1657size_t
1658wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1659 const wchar_t *src, size_t srcLen) const
c91830cb 1660{
35d11700
VZ
1661 if ( srcLen == wxNO_LEN )
1662 srcLen = wxWcslen(src) + 1;
c91830cb 1663
35d11700 1664 size_t outLen = 0;
5c33522f 1665 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1666 for ( size_t n = 0; n < srcLen; n++ )
c91830cb 1667 {
d883acaa 1668 wxUint16 cc[2] = { 0 };
35d11700
VZ
1669 const size_t numChars = encode_utf16(*src++, cc);
1670 if ( numChars == wxCONV_FAILED )
1671 return wxCONV_FAILED;
c91830cb 1672
ef199164
DS
1673 outLen += numChars * BYTES_PER_CHAR;
1674 if ( outBuff )
c91830cb 1675 {
35d11700
VZ
1676 if ( outLen > dstLen )
1677 return wxCONV_FAILED;
1678
ef199164 1679 *outBuff++ = cc[0];
35d11700 1680 if ( numChars == 2 )
69b80d28 1681 {
35d11700 1682 // second character of a surrogate
ef199164 1683 *outBuff++ = cc[1];
69b80d28 1684 }
c91830cb 1685 }
c91830cb 1686 }
c91830cb 1687
35d11700 1688 return outLen;
c91830cb
VZ
1689}
1690
467e0479
VZ
1691// ----------------------------------------------------------------------------
1692// endian-reversing conversions
1693// ----------------------------------------------------------------------------
c91830cb 1694
35d11700
VZ
1695size_t
1696wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1697 const char *src, size_t srcLen) const
c91830cb 1698{
35d11700
VZ
1699 srcLen = GetLength(src, srcLen);
1700 if ( srcLen == wxNO_LEN )
1701 return wxCONV_FAILED;
1702
ef199164 1703 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1704 if ( !dst )
1705 {
1706 // optimization: return maximal space which could be needed for this
1707 // string even if the real size could be smaller if the buffer contains
1708 // any surrogates
1709 return inLen;
1710 }
c91830cb 1711
35d11700 1712 size_t outLen = 0;
5c33522f 1713 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1714 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1715 {
35d11700
VZ
1716 wxUint32 ch;
1717 wxUint16 tmp[2];
ef199164
DS
1718
1719 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1720 inBuff++;
1721 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1722
35d11700
VZ
1723 const size_t numChars = decode_utf16(tmp, ch);
1724 if ( numChars == wxCONV_FAILED )
1725 return wxCONV_FAILED;
c91830cb 1726
35d11700 1727 if ( numChars == 2 )
ef199164 1728 inBuff++;
35d11700
VZ
1729
1730 if ( ++outLen > dstLen )
1731 return wxCONV_FAILED;
c91830cb 1732
35d11700 1733 *dst++ = ch;
c91830cb 1734 }
c91830cb 1735
c91830cb 1736
35d11700
VZ
1737 return outLen;
1738}
c91830cb 1739
35d11700
VZ
1740size_t
1741wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1742 const wchar_t *src, size_t srcLen) const
c91830cb 1743{
35d11700
VZ
1744 if ( srcLen == wxNO_LEN )
1745 srcLen = wxWcslen(src) + 1;
c91830cb 1746
35d11700 1747 size_t outLen = 0;
5c33522f 1748 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1749 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb 1750 {
d883acaa 1751 wxUint16 cc[2] = { 0 };
35d11700
VZ
1752 const size_t numChars = encode_utf16(*src, cc);
1753 if ( numChars == wxCONV_FAILED )
1754 return wxCONV_FAILED;
c91830cb 1755
ef199164
DS
1756 outLen += numChars * BYTES_PER_CHAR;
1757 if ( outBuff )
c91830cb 1758 {
35d11700
VZ
1759 if ( outLen > dstLen )
1760 return wxCONV_FAILED;
1761
ef199164 1762 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1763 if ( numChars == 2 )
c91830cb 1764 {
35d11700 1765 // second character of a surrogate
ef199164 1766 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1767 }
1768 }
c91830cb 1769 }
c91830cb 1770
35d11700 1771 return outLen;
c91830cb
VZ
1772}
1773
467e0479 1774#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1775
1776
35d11700 1777// ============================================================================
c91830cb 1778// UTF-32
35d11700 1779// ============================================================================
c91830cb
VZ
1780
1781#ifdef WORDS_BIGENDIAN
467e0479
VZ
1782 #define wxMBConvUTF32straight wxMBConvUTF32BE
1783 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1784#else
467e0479
VZ
1785 #define wxMBConvUTF32swap wxMBConvUTF32BE
1786 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1787#endif
1788
1789
1790WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1791WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1792
467e0479
VZ
1793/* static */
1794size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1795{
1796 if ( srcLen == wxNO_LEN )
1797 {
1798 // count the number of bytes in input, including the trailing NULs
5c33522f 1799 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1800 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1801 ;
c91830cb 1802
467e0479
VZ
1803 srcLen *= BYTES_PER_CHAR;
1804 }
1805 else // we already have the length
1806 {
1807 // we can only convert an entire number of UTF-32 characters
1808 if ( srcLen % BYTES_PER_CHAR )
1809 return wxCONV_FAILED;
1810 }
1811
1812 return srcLen;
1813}
1814
1815// case when in-memory representation is UTF-16
c91830cb
VZ
1816#ifdef WC_UTF16
1817
467e0479
VZ
1818// ----------------------------------------------------------------------------
1819// conversions without endianness change
1820// ----------------------------------------------------------------------------
1821
1822size_t
1823wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1824 const char *src, size_t srcLen) const
c91830cb 1825{
467e0479
VZ
1826 srcLen = GetLength(src, srcLen);
1827 if ( srcLen == wxNO_LEN )
1828 return wxCONV_FAILED;
c91830cb 1829
5c33522f 1830 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1831 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1832 size_t outLen = 0;
1833 for ( size_t n = 0; n < inLen; n++ )
c91830cb 1834 {
d883acaa 1835 wxUint16 cc[2] = { 0 };
ef199164 1836 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1837 if ( numChars == wxCONV_FAILED )
1838 return wxCONV_FAILED;
c91830cb 1839
467e0479
VZ
1840 outLen += numChars;
1841 if ( dst )
c91830cb 1842 {
467e0479
VZ
1843 if ( outLen > dstLen )
1844 return wxCONV_FAILED;
d32a507d 1845
467e0479
VZ
1846 *dst++ = cc[0];
1847 if ( numChars == 2 )
1848 {
1849 // second character of a surrogate
1850 *dst++ = cc[1];
1851 }
1852 }
c91830cb 1853 }
d32a507d 1854
467e0479 1855 return outLen;
c91830cb
VZ
1856}
1857
467e0479
VZ
1858size_t
1859wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1860 const wchar_t *src, size_t srcLen) const
c91830cb 1861{
467e0479
VZ
1862 if ( srcLen == wxNO_LEN )
1863 srcLen = wxWcslen(src) + 1;
c91830cb 1864
467e0479 1865 if ( !dst )
c91830cb 1866 {
467e0479
VZ
1867 // optimization: return maximal space which could be needed for this
1868 // string instead of the exact amount which could be less if there are
1869 // any surrogates in the input
1870 //
1871 // we consider that surrogates are rare enough to make it worthwhile to
1872 // avoid running the loop below at the cost of slightly extra memory
1873 // consumption
ef199164 1874 return srcLen * BYTES_PER_CHAR;
467e0479 1875 }
c91830cb 1876
5c33522f 1877 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1878 size_t outLen = 0;
1879 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1880 {
1881 const wxUint32 ch = wxDecodeSurrogate(&src);
1882 if ( !src )
1883 return wxCONV_FAILED;
c91830cb 1884
467e0479 1885 outLen += BYTES_PER_CHAR;
d32a507d 1886
467e0479
VZ
1887 if ( outLen > dstLen )
1888 return wxCONV_FAILED;
b5153fd8 1889
ef199164 1890 *outBuff++ = ch;
467e0479 1891 }
c91830cb 1892
467e0479 1893 return outLen;
c91830cb
VZ
1894}
1895
467e0479
VZ
1896// ----------------------------------------------------------------------------
1897// endian-reversing conversions
1898// ----------------------------------------------------------------------------
c91830cb 1899
467e0479
VZ
1900size_t
1901wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1902 const char *src, size_t srcLen) const
c91830cb 1903{
467e0479
VZ
1904 srcLen = GetLength(src, srcLen);
1905 if ( srcLen == wxNO_LEN )
1906 return wxCONV_FAILED;
c91830cb 1907
5c33522f 1908 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1909 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1910 size_t outLen = 0;
ef199164 1911 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1912 {
d883acaa 1913 wxUint16 cc[2] = { 0 };
ef199164 1914 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1915 if ( numChars == wxCONV_FAILED )
1916 return wxCONV_FAILED;
c91830cb 1917
467e0479
VZ
1918 outLen += numChars;
1919 if ( dst )
c91830cb 1920 {
467e0479
VZ
1921 if ( outLen > dstLen )
1922 return wxCONV_FAILED;
d32a507d 1923
467e0479
VZ
1924 *dst++ = cc[0];
1925 if ( numChars == 2 )
1926 {
1927 // second character of a surrogate
1928 *dst++ = cc[1];
1929 }
1930 }
c91830cb 1931 }
b5153fd8 1932
467e0479 1933 return outLen;
c91830cb
VZ
1934}
1935
467e0479
VZ
1936size_t
1937wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1938 const wchar_t *src, size_t srcLen) const
c91830cb 1939{
467e0479
VZ
1940 if ( srcLen == wxNO_LEN )
1941 srcLen = wxWcslen(src) + 1;
c91830cb 1942
467e0479 1943 if ( !dst )
c91830cb 1944 {
467e0479
VZ
1945 // optimization: return maximal space which could be needed for this
1946 // string instead of the exact amount which could be less if there are
1947 // any surrogates in the input
1948 //
1949 // we consider that surrogates are rare enough to make it worthwhile to
1950 // avoid running the loop below at the cost of slightly extra memory
1951 // consumption
1952 return srcLen*BYTES_PER_CHAR;
1953 }
c91830cb 1954
5c33522f 1955 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1956 size_t outLen = 0;
1957 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1958 {
1959 const wxUint32 ch = wxDecodeSurrogate(&src);
1960 if ( !src )
1961 return wxCONV_FAILED;
c91830cb 1962
467e0479 1963 outLen += BYTES_PER_CHAR;
d32a507d 1964
467e0479
VZ
1965 if ( outLen > dstLen )
1966 return wxCONV_FAILED;
b5153fd8 1967
ef199164 1968 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1969 }
c91830cb 1970
467e0479 1971 return outLen;
c91830cb
VZ
1972}
1973
467e0479 1974#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1975
35d11700
VZ
1976// ----------------------------------------------------------------------------
1977// conversions without endianness change
1978// ----------------------------------------------------------------------------
1979
1980size_t
1981wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1982 const char *src, size_t srcLen) const
c91830cb 1983{
35d11700
VZ
1984 // use memcpy() as it should be much faster than hand-written loop
1985 srcLen = GetLength(src, srcLen);
1986 if ( srcLen == wxNO_LEN )
1987 return wxCONV_FAILED;
c91830cb 1988
35d11700
VZ
1989 const size_t inLen = srcLen/BYTES_PER_CHAR;
1990 if ( dst )
c91830cb 1991 {
35d11700
VZ
1992 if ( dstLen < inLen )
1993 return wxCONV_FAILED;
b5153fd8 1994
35d11700
VZ
1995 memcpy(dst, src, srcLen);
1996 }
c91830cb 1997
35d11700 1998 return inLen;
c91830cb
VZ
1999}
2000
35d11700
VZ
2001size_t
2002wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
2003 const wchar_t *src, size_t srcLen) const
c91830cb 2004{
35d11700
VZ
2005 if ( srcLen == wxNO_LEN )
2006 srcLen = wxWcslen(src) + 1;
2007
2008 srcLen *= BYTES_PER_CHAR;
c91830cb 2009
35d11700 2010 if ( dst )
c91830cb 2011 {
35d11700
VZ
2012 if ( dstLen < srcLen )
2013 return wxCONV_FAILED;
c91830cb 2014
35d11700 2015 memcpy(dst, src, srcLen);
c91830cb
VZ
2016 }
2017
35d11700 2018 return srcLen;
c91830cb
VZ
2019}
2020
35d11700
VZ
2021// ----------------------------------------------------------------------------
2022// endian-reversing conversions
2023// ----------------------------------------------------------------------------
c91830cb 2024
35d11700
VZ
2025size_t
2026wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2027 const char *src, size_t srcLen) const
c91830cb 2028{
35d11700
VZ
2029 srcLen = GetLength(src, srcLen);
2030 if ( srcLen == wxNO_LEN )
2031 return wxCONV_FAILED;
2032
2033 srcLen /= BYTES_PER_CHAR;
c91830cb 2034
35d11700 2035 if ( dst )
c91830cb 2036 {
35d11700
VZ
2037 if ( dstLen < srcLen )
2038 return wxCONV_FAILED;
2039
5c33522f 2040 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 2041 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 2042 {
ef199164 2043 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 2044 }
c91830cb 2045 }
b5153fd8 2046
35d11700 2047 return srcLen;
c91830cb
VZ
2048}
2049
35d11700
VZ
2050size_t
2051wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2052 const wchar_t *src, size_t srcLen) const
c91830cb 2053{
35d11700
VZ
2054 if ( srcLen == wxNO_LEN )
2055 srcLen = wxWcslen(src) + 1;
2056
2057 srcLen *= BYTES_PER_CHAR;
c91830cb 2058
35d11700 2059 if ( dst )
c91830cb 2060 {
35d11700
VZ
2061 if ( dstLen < srcLen )
2062 return wxCONV_FAILED;
2063
5c33522f 2064 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2065 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2066 {
ef199164 2067 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2068 }
c91830cb 2069 }
b5153fd8 2070
35d11700 2071 return srcLen;
c91830cb
VZ
2072}
2073
467e0479 2074#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2075
2076
36acb880
VZ
2077// ============================================================================
2078// The classes doing conversion using the iconv_xxx() functions
2079// ============================================================================
3caec1bb 2080
b040e242 2081#ifdef HAVE_ICONV
3a0d76bc 2082
b1d547eb
VS
2083// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2084// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2085// (unless there's yet another bug in glibc) the only case when iconv()
2086// returns with (size_t)-1 (which means error) and says there are 0 bytes
2087// left in the input buffer -- when _real_ error occurs,
2088// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2089// iconv() failure.
3caec1bb
VS
2090// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call that returned cres and
// left bufLeft bytes unconsumed in the input should be treated as a failure.
//
// For glibc 2.0/2.1 a (size_t)-1 result with errno == E2BIG and nothing left
// in the input is not counted as an error; everywhere else any (size_t)-1
// result is a failure and bufLeft is ignored.
//
// The macro arguments are parenthesized so that arbitrary expressions (e.g. a
// conditional expression) can be passed safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the input buffer pointer to the type expected by this platform's
// iconv() prototype (with or without const, as selected by ICONV_CONST)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
36acb880 2099
74a7eb0b
VZ
2100#define ICONV_T_INVALID ((iconv_t)-1)
2101
2102#if SIZEOF_WCHAR_T == 4
2103 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2104 #define WC_ENC wxFONTENCODING_UTF32
2105#elif SIZEOF_WCHAR_T == 2
2106 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2107 #define WC_ENC wxFONTENCODING_UTF16
2108#else // sizeof(wchar_t) != 2 nor 4
2109 // does this ever happen?
2110 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2111#endif
2112
36acb880 2113// ----------------------------------------------------------------------------
e95354ec 2114// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2115// ----------------------------------------------------------------------------
2116
e95354ec 2117class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2118{
2119public:
86501081 2120 wxMBConv_iconv(const char *name);
e95354ec 2121 virtual ~wxMBConv_iconv();
36acb880 2122
8f4b0f43
VZ
2123 // implement base class virtual methods
2124 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2125 const char *src, size_t srcLen = wxNO_LEN) const;
2126 virtual size_t FromWChar(char *dst, size_t dstLen,
2127 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2128 virtual size_t GetMBNulLen() const;
2129
ba98e032
VS
2130#if wxUSE_UNICODE_UTF8
2131 virtual bool IsUTF8() const;
2132#endif
2133
d36c9347
VZ
2134 virtual wxMBConv *Clone() const
2135 {
b64f93b6 2136 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
d36c9347
VZ
2137 p->m_minMBCharWidth = m_minMBCharWidth;
2138 return p;
2139 }
2140
e95354ec 2141 bool IsOk() const
74a7eb0b 2142 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2143
2144protected:
ef199164
DS
2145 // the iconv handlers used to translate from multibyte
2146 // to wide char and in the other direction
36acb880
VZ
2147 iconv_t m2w,
2148 w2m;
ef199164 2149
b1d547eb
VS
2150#if wxUSE_THREADS
2151 // guards access to m2w and w2m objects
2152 wxMutex m_iconvMutex;
2153#endif
36acb880
VZ
2154
2155private:
e95354ec 2156 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2157 // available on this machine, it will remain NULL
74a7eb0b 2158 static wxString ms_wcCharsetName;
36acb880
VZ
2159
2160 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2161 // different endian-ness than the native one
405d8f46 2162 static bool ms_wcNeedsSwap;
eec47cc6 2163
d36c9347
VZ
2164
2165 // name of the encoding handled by this conversion
b64f93b6 2166 const char *m_name;
d36c9347 2167
7ef3ab50 2168 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2169 // initially
2170 size_t m_minMBCharWidth;
36acb880
VZ
2171};
2172
8f115891 2173// make the constructor available for unit testing
86501081 2174WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2175{
2176 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2177 if ( !result->IsOk() )
2178 {
2179 delete result;
2180 return 0;
2181 }
ef199164 2182
8f115891
MW
2183 return result;
2184}
2185
422e411e 2186wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2187bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2188
86501081 2189wxMBConv_iconv::wxMBConv_iconv(const char *name)
b64f93b6 2190 : m_name(wxStrdup(name))
36acb880 2191{
c1464d9d 2192 m_minMBCharWidth = 0;
eec47cc6 2193
36acb880 2194 // check for charset that represents wchar_t:
74a7eb0b 2195 if ( ms_wcCharsetName.empty() )
f1339c56 2196 {
9a83f860 2197 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2198
74a7eb0b 2199#if wxUSE_FONTMAP
a243da29 2200 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
74a7eb0b 2201#else // !wxUSE_FONTMAP
a243da29 2202 static const wxChar *const names_static[] =
36acb880 2203 {
74a7eb0b 2204#if SIZEOF_WCHAR_T == 4
9a83f860 2205 wxT("UCS-4"),
da2f1172 2206#elif SIZEOF_WCHAR_T == 2
9a83f860 2207 wxT("UCS-2"),
74a7eb0b
VZ
2208#endif
2209 NULL
2210 };
a243da29 2211 const wxChar *const *names = names_static;
74a7eb0b 2212#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2213
d1f024a8 2214 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2215 {
17a1ebd1 2216 const wxString nameCS(*names);
74a7eb0b
VZ
2217
2218 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2219 wxString nameXE(nameCS);
ef199164
DS
2220
2221#ifdef WORDS_BIGENDIAN
9a83f860 2222 nameXE += wxT("BE");
ef199164 2223#else // little endian
9a83f860 2224 nameXE += wxT("LE");
ef199164 2225#endif
74a7eb0b 2226
9a83f860 2227 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2228 nameXE.c_str());
2229
86501081 2230 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2231 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2232 {
74a7eb0b 2233 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2234 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2235 nameCS.c_str());
86501081 2236 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2237
74a7eb0b
VZ
2238 // and check for bytesex ourselves:
2239 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2240 {
74a7eb0b 2241 char buf[2], *bufPtr;
e8769ed1 2242 wchar_t wbuf[2];
74a7eb0b
VZ
2243 size_t insz, outsz;
2244 size_t res;
2245
2246 buf[0] = 'A';
2247 buf[1] = 0;
2248 wbuf[0] = 0;
2249 insz = 2;
2250 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2251 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2252 bufPtr = buf;
2253
ef199164
DS
2254 res = iconv(
2255 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2256 &wbufPtr, &outsz);
74a7eb0b
VZ
2257
2258 if (ICONV_FAILED(res, insz))
2259 {
2260 wxLogLastError(wxT("iconv"));
422e411e 2261 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2262 nameCS.c_str());
74a7eb0b
VZ
2263 }
2264 else // ok, can convert to this encoding, remember it
2265 {
17a1ebd1 2266 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2267 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2268 }
3a0d76bc
VS
2269 }
2270 }
74a7eb0b 2271 else // use charset not requiring byte swapping
36acb880 2272 {
74a7eb0b 2273 ms_wcCharsetName = nameXE;
36acb880 2274 }
3a0d76bc 2275 }
74a7eb0b 2276
0944fceb 2277 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2278 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2279 ms_wcCharsetName.empty() ? wxString("<none>")
2280 : ms_wcCharsetName,
9a83f860
VZ
2281 ms_wcNeedsSwap ? wxT(" (needs swap)")
2282 : wxT(""));
3a0d76bc 2283 }
36acb880 2284 else // we already have ms_wcCharsetName
3caec1bb 2285 {
86501081 2286 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2287 }
dccce9ea 2288
74a7eb0b 2289 if ( ms_wcCharsetName.empty() )
f1339c56 2290 {
74a7eb0b 2291 w2m = ICONV_T_INVALID;
36acb880 2292 }
405d8f46
VZ
2293 else
2294 {
86501081 2295 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2296 if ( w2m == ICONV_T_INVALID )
2297 {
2298 wxLogTrace(TRACE_STRCONV,
2299 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2300 ms_wcCharsetName.c_str(), name);
74a7eb0b 2301 }
405d8f46 2302 }
36acb880 2303}
3caec1bb 2304
e95354ec 2305wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2306{
b64f93b6
VZ
2307 free(const_cast<char *>(m_name));
2308
74a7eb0b 2309 if ( m2w != ICONV_T_INVALID )
36acb880 2310 iconv_close(m2w);
74a7eb0b 2311 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2312 iconv_close(w2m);
2313}
3a0d76bc 2314
8f4b0f43
VZ
2315size_t
2316wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2317 const char *src, size_t srcLen) const
36acb880 2318{
8f4b0f43 2319 if ( srcLen == wxNO_LEN )
69373110 2320 {
8f4b0f43
VZ
2321 // find the string length: notice that must be done differently for
2322 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2323 // consecutive NULs
2324 const size_t nulLen = GetMBNulLen();
2325 switch ( nulLen )
2326 {
2327 default:
2328 return wxCONV_FAILED;
69373110 2329
8f4b0f43
VZ
2330 case 1:
2331 srcLen = strlen(src); // arguably more optimized than our version
2332 break;
69373110 2333
8f4b0f43
VZ
2334 case 2:
2335 case 4:
2336 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2337 // but they also have to start at character boundary and not
2338 // span two adjacent characters
2339 const char *p;
2340 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2341 ;
2342 srcLen = p - src;
2343 break;
2344 }
d50c0831
VZ
2345
2346 // when we're determining the length of the string ourselves we count
2347 // the terminating NUL(s) as part of it and always NUL-terminate the
2348 // output
2349 srcLen += nulLen;
69373110
VZ
2350 }
2351
8f4b0f43
VZ
2352 // we express length in the number of (wide) characters but iconv always
2353 // counts buffer sizes it in bytes
2354 dstLen *= SIZEOF_WCHAR_T;
2355
b1d547eb 2356#if wxUSE_THREADS
6a17b868
SN
2357 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2358 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2359 // wxConvLocal that are used all over wx code, so we have to make sure
2360 // the handle is used by at most one thread at the time. Otherwise
2361 // only a few wx classes would be safe to use from non-main threads
2362 // as MB<->WC conversion would fail "randomly".
2363 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2364#endif // wxUSE_THREADS
2365
36acb880 2366 size_t res, cres;
8f4b0f43 2367 const char *pszPtr = src;
36acb880 2368
8f4b0f43 2369 if ( dst )
36acb880 2370 {
8f4b0f43 2371 char* bufPtr = (char*)dst;
e8769ed1 2372
36acb880 2373 // have destination buffer, convert there
1752fda6 2374 size_t dstLenOrig = dstLen;
36acb880 2375 cres = iconv(m2w,
8f4b0f43
VZ
2376 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2377 &bufPtr, &dstLen);
1752fda6
VZ
2378
2379 // convert the number of bytes converted as returned by iconv to the
2380 // number of (wide) characters converted that we need
2381 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2382
36acb880 2383 if (ms_wcNeedsSwap)
3a0d76bc 2384 {
36acb880 2385 // convert to native endianness
17a1ebd1 2386 for ( unsigned i = 0; i < res; i++ )
467a2982 2387 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2388 }
36acb880 2389 }
8f4b0f43 2390 else // no destination buffer
36acb880 2391 {
8f4b0f43 2392 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2393 wchar_t tbuf[256];
36acb880 2394 res = 0;
ef199164
DS
2395
2396 do
2397 {
e8769ed1 2398 char* bufPtr = (char*)tbuf;
8f4b0f43 2399 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2400
2401 cres = iconv(m2w,
8f4b0f43
VZ
2402 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2403 &bufPtr, &dstLen );
36acb880 2404
8f4b0f43 2405 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2406 }
2407 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2408 }
dccce9ea 2409
8f4b0f43 2410 if (ICONV_FAILED(cres, srcLen))
f1339c56 2411 {
36acb880 2412 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2413 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2414 return wxCONV_FAILED;
36acb880
VZ
2415 }
2416
2417 return res;
2418}
2419
8f4b0f43
VZ
2420size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2421 const wchar_t *src, size_t srcLen) const
36acb880 2422{
b1d547eb
VS
2423#if wxUSE_THREADS
2424 // NB: explained in MB2WC
2425 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2426#endif
3698ae71 2427
8f4b0f43 2428 if ( srcLen == wxNO_LEN )
2588ee86 2429 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2430
2431 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2432 size_t outbuflen = dstLen;
36acb880 2433 size_t res, cres;
3a0d76bc 2434
36acb880 2435 wchar_t *tmpbuf = 0;
3caec1bb 2436
36acb880
VZ
2437 if (ms_wcNeedsSwap)
2438 {
2439 // need to copy to temp buffer to switch endianness
51725fc0 2440 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2441 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2442 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2443 for ( size_t i = 0; i < srcLen; i++ )
2444 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2445
8f4b0f43 2446 src = tmpbuf;
36acb880 2447 }
3a0d76bc 2448
8f4b0f43
VZ
2449 char* inbuf = (char*)src;
2450 if ( dst )
36acb880
VZ
2451 {
2452 // have destination buffer, convert there
8f4b0f43 2453 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2454
8f4b0f43 2455 res = dstLen - outbuflen;
36acb880 2456 }
8f4b0f43 2457 else // no destination buffer
36acb880 2458 {
8f4b0f43 2459 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2460 char tbuf[256];
36acb880 2461 res = 0;
ef199164
DS
2462 do
2463 {
8f4b0f43 2464 dst = tbuf;
51725fc0 2465 outbuflen = WXSIZEOF(tbuf);
36acb880 2466
8f4b0f43 2467 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2468
51725fc0 2469 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2470 }
2471 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2472 }
dccce9ea 2473
36acb880
VZ
2474 if (ms_wcNeedsSwap)
2475 {
2476 free(tmpbuf);
2477 }
dccce9ea 2478
e8769ed1 2479 if (ICONV_FAILED(cres, inbuflen))
36acb880 2480 {
ce6f8d6f 2481 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2482 return wxCONV_FAILED;
36acb880
VZ
2483 }
2484
2485 return res;
2486}
2487
7ef3ab50 2488size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2489{
c1464d9d 2490 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2491 {
2492 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2493
2494#if wxUSE_THREADS
2495 // NB: explained in MB2WC
2496 wxMutexLocker lock(self->m_iconvMutex);
2497#endif
2498
999020e1 2499 const wchar_t *wnul = L"";
c1464d9d 2500 char buf[8]; // should be enough for NUL in any encoding
356410fc 2501 size_t inLen = sizeof(wchar_t),
c1464d9d 2502 outLen = WXSIZEOF(buf);
ef199164
DS
2503 char *inBuff = (char *)wnul;
2504 char *outBuff = buf;
2505 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2506 {
c1464d9d 2507 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2508 }
2509 else // ok
2510 {
ef199164 2511 self->m_minMBCharWidth = outBuff - buf;
356410fc 2512 }
eec47cc6
VZ
2513 }
2514
c1464d9d 2515 return m_minMBCharWidth;
eec47cc6
VZ
2516}
2517
ba98e032
VS
2518#if wxUSE_UNICODE_UTF8
2519bool wxMBConv_iconv::IsUTF8() const
2520{
86501081
VS
2521 return wxStricmp(m_name, "UTF-8") == 0 ||
2522 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2523}
2524#endif
2525
b040e242 2526#endif // HAVE_ICONV
36acb880 2527
e95354ec 2528
36acb880
VZ
2529// ============================================================================
2530// Win32 conversion classes
2531// ============================================================================
1cd52418 2532
e95354ec 2533#ifdef wxHAVE_WIN32_MB2WC
373658eb 2534
8b04d4c4 2535// from utils.cpp
d775fa82 2536#if wxUSE_FONTMAP
86501081 2537extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2538extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2539#endif
373658eb 2540
e95354ec 2541class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2542{
2543public:
bde4baac
VZ
2544 wxMBConv_win32()
2545 {
2546 m_CodePage = CP_ACP;
c1464d9d 2547 m_minMBCharWidth = 0;
bde4baac
VZ
2548 }
2549
d36c9347 2550 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2551 : wxMBConv()
d36c9347
VZ
2552 {
2553 m_CodePage = conv.m_CodePage;
2554 m_minMBCharWidth = conv.m_minMBCharWidth;
2555 }
2556
7608a683 2557#if wxUSE_FONTMAP
86501081 2558 wxMBConv_win32(const char* name)
bde4baac
VZ
2559 {
2560 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2561 m_minMBCharWidth = 0;
bde4baac 2562 }
dccce9ea 2563
e95354ec 2564 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2565 {
2566 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2567 m_minMBCharWidth = 0;
bde4baac 2568 }
eec47cc6 2569#endif // wxUSE_FONTMAP
8b04d4c4 2570
d36c9347 2571 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2572 {
02272c9c
VZ
2573 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2574 // the behaviour is not compatible with the Unix version (using iconv)
2575 // and break the library itself, e.g. wxTextInputStream::NextChar()
2576 // wouldn't work if reading an incomplete MB char didn't result in an
2577 // error
667e5b3e 2578 //
89028980 2579 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2580 // Win XP or newer and it is not supported for UTF-[78] so we always
2581 // use our own conversions in this case. See
89028980
VS
2582 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2583 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2584 if ( m_CodePage == CP_UTF8 )
89028980 2585 {
5487ff0f 2586 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2587 }
830f8f11
VZ
2588
2589 if ( m_CodePage == CP_UTF7 )
2590 {
5487ff0f 2591 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2592 }
2593
2594 int flags = 0;
2595 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2596 IsAtLeastWin2kSP4() )
89028980 2597 {
830f8f11 2598 flags = MB_ERR_INVALID_CHARS;
89028980 2599 }
667e5b3e 2600
2b5f62a0
VZ
2601 const size_t len = ::MultiByteToWideChar
2602 (
2603 m_CodePage, // code page
667e5b3e 2604 flags, // flags: fall on error
2b5f62a0
VZ
2605 psz, // input string
2606 -1, // its length (NUL-terminated)
b4da152e 2607 buf, // output string
2b5f62a0
VZ
2608 buf ? n : 0 // size of output buffer
2609 );
89028980
VS
2610 if ( !len )
2611 {
2612 // function totally failed
467e0479 2613 return wxCONV_FAILED;
89028980
VS
2614 }
2615
2616 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2617 // check if we succeeded, by doing a double trip:
2618 if ( !flags && buf )
2619 {
53c174fc
VZ
2620 const size_t mbLen = strlen(psz);
2621 wxCharBuffer mbBuf(mbLen);
89028980
VS
2622 if ( ::WideCharToMultiByte
2623 (
2624 m_CodePage,
2625 0,
2626 buf,
2627 -1,
2628 mbBuf.data(),
53c174fc 2629 mbLen + 1, // size in bytes, not length
89028980
VS
2630 NULL,
2631 NULL
2632 ) == 0 ||
2633 strcmp(mbBuf, psz) != 0 )
2634 {
2635 // we didn't obtain the same thing we started from, hence
2636 // the conversion was lossy and we consider that it failed
467e0479 2637 return wxCONV_FAILED;
89028980
VS
2638 }
2639 }
2b5f62a0 2640
03a991bc
VZ
2641 // note that it returns count of written chars for buf != NULL and size
2642 // of the needed buffer for buf == NULL so in either case the length of
2643 // the string (which never includes the terminating NUL) is one less
89028980 2644 return len - 1;
f1339c56 2645 }
dccce9ea 2646
d36c9347 2647 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2648 {
13dd924a
VZ
2649 /*
2650 we have a problem here: by default, WideCharToMultiByte() may
2651 replace characters unrepresentable in the target code page with bad
2652 quality approximations such as turning "1/2" symbol (U+00BD) into
2653 "1" for the code pages which don't have it and we, obviously, want
2654 to avoid this at any price
d775fa82 2655
13dd924a
VZ
2656 the trouble is that this function does it _silently_, i.e. it won't
2657 even tell us whether it did or not... Win98/2000 and higher provide
2658 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2659 we have to resort to a round trip, i.e. check that converting back
2660 results in the same string -- this is, of course, expensive but
2661 otherwise we simply can't be sure to not garble the data.
2662 */
2663
2664 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2665 // it doesn't work with CJK encodings (which we test for rather roughly
2666 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2667 // supporting it
907173e5
WS
2668 BOOL usedDef wxDUMMY_INITIALIZE(false);
2669 BOOL *pUsedDef;
13dd924a
VZ
2670 int flags;
2671 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2672 {
2673 // it's our lucky day
2674 flags = WC_NO_BEST_FIT_CHARS;
2675 pUsedDef = &usedDef;
2676 }
2677 else // old system or unsupported encoding
2678 {
2679 flags = 0;
2680 pUsedDef = NULL;
2681 }
2682
2b5f62a0
VZ
2683 const size_t len = ::WideCharToMultiByte
2684 (
2685 m_CodePage, // code page
13dd924a
VZ
2686 flags, // either none or no best fit
2687 pwz, // input string
2b5f62a0
VZ
2688 -1, // it is (wide) NUL-terminated
2689 buf, // output buffer
2690 buf ? n : 0, // and its size
2691 NULL, // default "replacement" char
13dd924a 2692 pUsedDef // [out] was it used?
2b5f62a0
VZ
2693 );
2694
13dd924a
VZ
2695 if ( !len )
2696 {
2697 // function totally failed
467e0479 2698 return wxCONV_FAILED;
13dd924a
VZ
2699 }
2700
765bdb4a
VZ
2701 // we did something, check if we really succeeded
2702 if ( flags )
13dd924a 2703 {
765bdb4a
VZ
2704 // check if the conversion failed, i.e. if any replacements
2705 // were done
2706 if ( usedDef )
2707 return wxCONV_FAILED;
2708 }
2709 else // we must resort to double tripping...
2710 {
2711 // first we need to ensure that we really have the MB data: this is
2712 // not the case if we're called with NULL buffer, in which case we
2713 // need to do the conversion yet again
2714 wxCharBuffer bufDef;
2715 if ( !buf )
13dd924a 2716 {
765bdb4a
VZ
2717 bufDef = wxCharBuffer(len);
2718 buf = bufDef.data();
2719 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2720 buf, len, NULL, NULL) )
467e0479 2721 return wxCONV_FAILED;
13dd924a 2722 }
765bdb4a 2723
564da6ff
VZ
2724 if ( !n )
2725 n = wcslen(pwz);
765bdb4a 2726 wxWCharBuffer wcBuf(n);
564da6ff 2727 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2728 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2729 {
765bdb4a
VZ
2730 // we didn't obtain the same thing we started from, hence
2731 // the conversion was lossy and we consider that it failed
2732 return wxCONV_FAILED;
13dd924a
VZ
2733 }
2734 }
2735
03a991bc 2736 // see the comment above for the reason of "len - 1"
13dd924a 2737 return len - 1;
f1339c56 2738 }
dccce9ea 2739
7ef3ab50
VZ
2740 virtual size_t GetMBNulLen() const
2741 {
2742 if ( m_minMBCharWidth == 0 )
2743 {
2744 int len = ::WideCharToMultiByte
2745 (
2746 m_CodePage, // code page
2747 0, // no flags
2748 L"", // input string
2749 1, // translate just the NUL
2750 NULL, // output buffer
2751 0, // and its size
2752 NULL, // no replacement char
2753 NULL // [out] don't care if it was used
2754 );
2755
2756 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2757 switch ( len )
2758 {
2759 default:
9a83f860 2760 wxLogDebug(wxT("Unexpected NUL length %d"), len);
ef199164
DS
2761 self->m_minMBCharWidth = (size_t)-1;
2762 break;
7ef3ab50
VZ
2763
2764 case 0:
2765 self->m_minMBCharWidth = (size_t)-1;
2766 break;
2767
2768 case 1:
2769 case 2:
2770 case 4:
2771 self->m_minMBCharWidth = len;
2772 break;
2773 }
2774 }
2775
2776 return m_minMBCharWidth;
2777 }
2778
d36c9347
VZ
2779 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2780
13dd924a
VZ
2781 bool IsOk() const { return m_CodePage != -1; }
2782
2783private:
2784 static bool CanUseNoBestFit()
2785 {
2786 static int s_isWin98Or2k = -1;
2787
2788 if ( s_isWin98Or2k == -1 )
2789 {
2790 int verMaj, verMin;
2791 switch ( wxGetOsVersion(&verMaj, &verMin) )
2792 {
406d283a 2793 case wxOS_WINDOWS_9X:
13dd924a
VZ
2794 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2795 break;
2796
406d283a 2797 case wxOS_WINDOWS_NT:
13dd924a
VZ
2798 s_isWin98Or2k = verMaj >= 5;
2799 break;
2800
2801 default:
ef199164 2802 // unknown: be conservative by default
13dd924a 2803 s_isWin98Or2k = 0;
ef199164 2804 break;
13dd924a
VZ
2805 }
2806
9a83f860 2807 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
13dd924a
VZ
2808 }
2809
2810 return s_isWin98Or2k == 1;
2811 }
f1339c56 2812
89028980
VS
2813 static bool IsAtLeastWin2kSP4()
2814 {
8942f83a
WS
2815#ifdef __WXWINCE__
2816 return false;
2817#else
89028980
VS
2818 static int s_isAtLeastWin2kSP4 = -1;
2819
2820 if ( s_isAtLeastWin2kSP4 == -1 )
2821 {
2822 OSVERSIONINFOEX ver;
2823
2824 memset(&ver, 0, sizeof(ver));
2825 ver.dwOSVersionInfoSize = sizeof(ver);
2826 GetVersionEx((OSVERSIONINFO*)&ver);
2827
2828 s_isAtLeastWin2kSP4 =
2829 ((ver.dwMajorVersion > 5) || // Vista+
2830 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2831 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2832 ver.wServicePackMajor >= 4)) // 2000 SP4+
2833 ? 1 : 0;
2834 }
2835
2836 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2837#endif
89028980
VS
2838 }
2839
eec47cc6 2840
c1464d9d 2841 // the code page we're working with
b1d66b54 2842 long m_CodePage;
c1464d9d 2843
7ef3ab50 2844 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2845 // "unknown"
2846 size_t m_minMBCharWidth;
1cd52418 2847};
e95354ec
VZ
2848
2849#endif // wxHAVE_WIN32_MB2WC
2850
f7e98dee 2851
36acb880
VZ
2852// ============================================================================
2853// wxEncodingConverter based conversion classes
2854// ============================================================================
2855
1e6feb95 2856#if wxUSE_FONTMAP
1cd52418 2857
e95354ec 2858class wxMBConv_wxwin : public wxMBConv
1cd52418 2859{
8b04d4c4
VZ
2860private:
2861 void Init()
2862 {
6ac84a78
DE
2863 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2864 // The wxMBConv_cf class does a better job.
2865 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2866 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2867 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2868 }
2869
6001e347 2870public:
f1339c56
RR
2871 // temporarily just use wxEncodingConverter stuff,
2872 // so that it works while a better implementation is built
86501081 2873 wxMBConv_wxwin(const char* name)
f1339c56
RR
2874 {
2875 if (name)
267e11c5 2876 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2877 else
2878 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2879
8b04d4c4
VZ
2880 Init();
2881 }
2882
e95354ec 2883 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2884 {
2885 m_enc = enc;
2886
2887 Init();
f1339c56 2888 }
dccce9ea 2889
bde4baac 2890 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2891 {
2892 size_t inbuf = strlen(psz);
dccce9ea 2893 if (buf)
c643a977 2894 {
ef199164 2895 if (!m2w.Convert(psz, buf))
467e0479 2896 return wxCONV_FAILED;
c643a977 2897 }
f1339c56
RR
2898 return inbuf;
2899 }
dccce9ea 2900
bde4baac 2901 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2902 {
f8d791e0 2903 const size_t inbuf = wxWcslen(psz);
f1339c56 2904 if (buf)
c643a977 2905 {
ef199164 2906 if (!w2m.Convert(psz, buf))
467e0479 2907 return wxCONV_FAILED;
c643a977 2908 }
dccce9ea 2909
f1339c56
RR
2910 return inbuf;
2911 }
dccce9ea 2912
7ef3ab50 2913 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2914 {
2915 switch ( m_enc )
2916 {
2917 case wxFONTENCODING_UTF16BE:
2918 case wxFONTENCODING_UTF16LE:
c1464d9d 2919 return 2;
eec47cc6
VZ
2920
2921 case wxFONTENCODING_UTF32BE:
2922 case wxFONTENCODING_UTF32LE:
c1464d9d 2923 return 4;
eec47cc6
VZ
2924
2925 default:
c1464d9d 2926 return 1;
eec47cc6
VZ
2927 }
2928 }
2929
d36c9347
VZ
2930 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2931
7ef3ab50
VZ
2932 bool IsOk() const { return m_ok; }
2933
2934public:
2935 wxFontEncoding m_enc;
2936 wxEncodingConverter m2w, w2m;
2937
2938private:
cafbf6fb
VZ
2939 // were we initialized successfully?
2940 bool m_ok;
fc7a2a60 2941
c0c133e1 2942 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2943};
6001e347 2944
8f115891 2945// make the constructors available for unit testing
86501081 2946WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2947{
2948 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2949 if ( !result->IsOk() )
2950 {
2951 delete result;
2952 return 0;
2953 }
ef199164 2954
8f115891
MW
2955 return result;
2956}
2957
1e6feb95
VZ
2958#endif // wxUSE_FONTMAP
2959
36acb880
VZ
2960// ============================================================================
2961// wxCSConv implementation
2962// ============================================================================
2963
8b04d4c4 2964void wxCSConv::Init()
6001e347 2965{
e95354ec
VZ
2966 m_name = NULL;
2967 m_convReal = NULL;
6c4d607e
VZ
2968}
2969
2970void wxCSConv::SetEncoding(wxFontEncoding encoding)
2971{
2972 switch ( encoding )
2973 {
2974 case wxFONTENCODING_MAX:
2975 case wxFONTENCODING_SYSTEM:
2976 if ( m_name )
2977 {
2978 // It's ok to not have encoding value if we have a name for it.
2979 m_encoding = wxFONTENCODING_SYSTEM;
2980 }
2981 else // No name neither.
2982 {
2983 // Fall back to the system default encoding in this case (not
2984 // sure how much sense does this make but this is how the old
2985 // code used to behave).
2986#if wxUSE_INTL
2987 m_encoding = wxLocale::GetSystemEncoding();
2988 if ( m_encoding == wxFONTENCODING_SYSTEM )
2989#endif // wxUSE_INTL
2990 m_encoding = wxFONTENCODING_ISO8859_1;
2991 }
2992 break;
2993
2994 case wxFONTENCODING_DEFAULT:
2995 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2996 m_encoding = wxFONTENCODING_ISO8859_1;
2997 break;
2998
2999 default:
3000 // Just use the provided encoding.
3001 m_encoding = encoding;
3002 }
e95354ec
VZ
3003}
3004
86501081 3005wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
3006{
3007 Init();
82713003 3008
86501081 3009 if ( !charset.empty() )
e95354ec 3010 {
86501081 3011 SetName(charset.ToAscii());
e95354ec 3012 }
bda3d86a 3013
e4277538 3014#if wxUSE_FONTMAP
6c4d607e 3015 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
e4277538 3016#else
6c4d607e 3017 SetEncoding(wxFONTENCODING_SYSTEM);
e4277538 3018#endif
6c4d607e
VZ
3019
3020 m_convReal = DoCreate();
6001e347
RR
3021}
3022
8b04d4c4
VZ
3023wxCSConv::wxCSConv(wxFontEncoding encoding)
3024{
bda3d86a 3025 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 3026 {
9a83f860 3027 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
3028
3029 encoding = wxFONTENCODING_SYSTEM;
3030 }
3031
8b04d4c4
VZ
3032 Init();
3033
6c4d607e
VZ
3034 SetEncoding(encoding);
3035
3036 m_convReal = DoCreate();
8b04d4c4
VZ
3037}
3038
6001e347
RR
3039wxCSConv::~wxCSConv()
3040{
65e50848
JS
3041 Clear();
3042}
3043
54380f29 3044wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 3045 : wxMBConv()
54380f29 3046{
8b04d4c4
VZ
3047 Init();
3048
54380f29 3049 SetName(conv.m_name);
6c4d607e
VZ
3050 SetEncoding(conv.m_encoding);
3051
3052 m_convReal = DoCreate();
54380f29
GD
3053}
3054
3055wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3056{
3057 Clear();
8b04d4c4 3058
54380f29 3059 SetName(conv.m_name);
6c4d607e
VZ
3060 SetEncoding(conv.m_encoding);
3061
3062 m_convReal = DoCreate();
8b04d4c4 3063
54380f29
GD
3064 return *this;
3065}
3066
65e50848
JS
3067void wxCSConv::Clear()
3068{
8b04d4c4 3069 free(m_name);
65e50848 3070 m_name = NULL;
6c4d607e
VZ
3071
3072 wxDELETE(m_convReal);
6001e347
RR
3073}
3074
86501081 3075void wxCSConv::SetName(const char *charset)
6001e347 3076{
6c4d607e 3077 if ( charset )
d6f2a891 3078 m_name = wxStrdup(charset);
6001e347
RR
3079}
3080
#if wxUSE_FONTMAP

// Cache used by wxCSConv::DoCreate(): for each encoding it stores the name
// with which an iconv-based converter could be created or an empty string if
// none of the known names of this encoding worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString,
                     wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
3088
e95354ec
VZ
3089wxMBConv *wxCSConv::DoCreate() const
3090{
ce6f8d6f
VZ
3091#if wxUSE_FONTMAP
3092 wxLogTrace(TRACE_STRCONV,
3093 wxT("creating conversion for %s"),
3094 (m_name ? m_name
86501081 3095 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3096#endif // wxUSE_FONTMAP
3097
c547282d
VZ
3098 // check for the special case of ASCII or ISO8859-1 charset: as we have
3099 // special knowledge of it anyhow, we don't need to create a special
3100 // conversion object
6c4d607e 3101 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
f1339c56 3102 {
e95354ec
VZ
3103 // don't convert at all
3104 return NULL;
3105 }
dccce9ea 3106
e95354ec
VZ
3107 // we trust OS to do conversion better than we can so try external
3108 // conversion methods first
3109 //
3110 // the full order is:
3111 // 1. OS conversion (iconv() under Unix or Win32 API)
3112 // 2. hard coded conversions for UTF
3113 // 3. wxEncodingConverter as fall back
3114
3115 // step (1)
3116#ifdef HAVE_ICONV
c547282d 3117#if !wxUSE_FONTMAP
e95354ec 3118 if ( m_name )
c547282d 3119#endif // !wxUSE_FONTMAP
e95354ec 3120 {
3ef10cfc 3121#if wxUSE_FONTMAP
8b3eb85d 3122 wxFontEncoding encoding(m_encoding);
3ef10cfc 3123#endif
8b3eb85d 3124
86501081 3125 if ( m_name )
8b3eb85d 3126 {
86501081 3127 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3128 if ( conv->IsOk() )
3129 return conv;
3130
3131 delete conv;
c547282d
VZ
3132
3133#if wxUSE_FONTMAP
8b3eb85d 3134 encoding =
86501081 3135 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3136#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3137 }
3138#if wxUSE_FONTMAP
3139 {
3140 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3141 if ( it != gs_nameCache.end() )
3142 {
3143 if ( it->second.empty() )
3144 return NULL;
c547282d 3145
86501081 3146 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3147 if ( conv->IsOk() )
3148 return conv;
e95354ec 3149
8b3eb85d
VZ
3150 delete conv;
3151 }
3152
a243da29 3153 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3154 // CS : in case this does not return valid names (eg for MacRoman)
3155 // encoding got a 'failure' entry in the cache all the same,
3156 // although it just has to be created using a different method, so
3157 // only store failed iconv creation attempts (or perhaps we
3158 // shoulnd't do this at all ?)
3c67ec06 3159 if ( names[0] != NULL )
8b3eb85d 3160 {
3c67ec06 3161 for ( ; *names; ++names )
8b3eb85d 3162 {
86501081
VS
3163 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3164 // will need changes that will obsolete this
3165 wxString name(*names);
3166 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3167 if ( conv->IsOk() )
3168 {
3169 gs_nameCache[encoding] = *names;
3170 return conv;
3171 }
3172
3173 delete conv;
8b3eb85d
VZ
3174 }
3175
9a83f860 3176 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3177 }
8b3eb85d
VZ
3178 }
3179#endif // wxUSE_FONTMAP
e95354ec
VZ
3180 }
3181#endif // HAVE_ICONV
3182
3183#ifdef wxHAVE_WIN32_MB2WC
3184 {
7608a683 3185#if wxUSE_FONTMAP
e95354ec
VZ
3186 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3187 : new wxMBConv_win32(m_encoding);
3188 if ( conv->IsOk() )
3189 return conv;
3190
3191 delete conv;
7608a683
WS
3192#else
3193 return NULL;
3194#endif
e95354ec
VZ
3195 }
3196#endif // wxHAVE_WIN32_MB2WC
ef199164 3197
5c4ed98d 3198#ifdef __DARWIN__
f7e98dee 3199 {
6ff49cbc
DE
3200 // leave UTF16 and UTF32 to the built-ins of wx
3201 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3202 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3203 {
a6900d10 3204#if wxUSE_FONTMAP
5c4ed98d
DE
3205 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3206 : new wxMBConv_cf(m_encoding);
a6900d10 3207#else
5c4ed98d 3208 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3209#endif
ef199164 3210
f7e98dee 3211 if ( conv->IsOk() )
d775fa82
WS
3212 return conv;
3213
3214 delete conv;
3215 }
335d31e0 3216 }
5c4ed98d
DE
3217#endif // __DARWIN__
3218
e95354ec
VZ
3219 // step (2)
3220 wxFontEncoding enc = m_encoding;
3221#if wxUSE_FONTMAP
c547282d
VZ
3222 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3223 {
3224 // use "false" to suppress interactive dialogs -- we can be called from
3225 // anywhere and popping up a dialog from here is the last thing we want to
3226 // do
267e11c5 3227 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3228 }
e95354ec
VZ
3229#endif // wxUSE_FONTMAP
3230
3231 switch ( enc )
3232 {
3233 case wxFONTENCODING_UTF7:
3234 return new wxMBConvUTF7;
3235
3236 case wxFONTENCODING_UTF8:
3237 return new wxMBConvUTF8;
3238
e95354ec
VZ
3239 case wxFONTENCODING_UTF16BE:
3240 return new wxMBConvUTF16BE;
3241
3242 case wxFONTENCODING_UTF16LE:
3243 return new wxMBConvUTF16LE;
3244
e95354ec
VZ
3245 case wxFONTENCODING_UTF32BE:
3246 return new wxMBConvUTF32BE;
3247
3248 case wxFONTENCODING_UTF32LE:
3249 return new wxMBConvUTF32LE;
3250
3251 default:
3252 // nothing to do but put here to suppress gcc warnings
ef199164 3253 break;
e95354ec
VZ
3254 }
3255
3256 // step (3)
3257#if wxUSE_FONTMAP
3258 {
3259 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3260 : new wxMBConv_wxwin(m_encoding);
3261 if ( conv->IsOk() )
3262 return conv;
3263
3264 delete conv;
3265 }
ef199164 3266
3df31b2d
VZ
3267 wxLogTrace(TRACE_STRCONV,
3268 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3269 (m_name ? wxString(m_name)
3df31b2d
VZ
3270 : wxFontMapperBase::GetEncodingName(m_encoding)));
3271#endif // wxUSE_FONTMAP
e95354ec
VZ
3272
3273 return NULL;
3274}
3275
0f0298b1
VZ
3276bool wxCSConv::IsOk() const
3277{
0f0298b1
VZ
3278 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3279 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3280 return true; // always ok as we do it ourselves
3281
3282 // m_convReal->IsOk() is called at its own creation, so we know it must
3283 // be ok if m_convReal is non-NULL
3284 return m_convReal != NULL;
3285}
3286
1c714a5d
VZ
3287size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3288 const char *src, size_t srcLen) const
3289{
2c74c558
VS
3290 if (m_convReal)
3291 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3292
3293 // latin-1 (direct)
05392dc8
VZ
3294 if ( srcLen == wxNO_LEN )
3295 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3296
05392dc8
VZ
3297 if ( dst )
3298 {
3299 if ( dstLen < srcLen )
3300 return wxCONV_FAILED;
1c714a5d 3301
05392dc8
VZ
3302 for ( size_t n = 0; n < srcLen; n++ )
3303 dst[n] = (unsigned char)(src[n]);
3304 }
2c74c558 3305
05392dc8 3306 return srcLen;
1c714a5d
VZ
3307}
3308
05392dc8
VZ
3309size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3310 const wchar_t *src, size_t srcLen) const
6001e347 3311{
e95354ec 3312 if (m_convReal)
05392dc8 3313 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3314
3315 // latin-1 (direct)
05392dc8
VZ
3316 if ( srcLen == wxNO_LEN )
3317 srcLen = wxWcslen(src) + 1;
dccce9ea 3318
05392dc8 3319 if ( dst )
f1339c56 3320 {
05392dc8
VZ
3321 if ( dstLen < srcLen )
3322 return wxCONV_FAILED;
1cd52418 3323
05392dc8 3324 for ( size_t n = 0; n < srcLen; n++ )
24642831 3325 {
05392dc8 3326 if ( src[n] > 0xFF )
467e0479 3327 return wxCONV_FAILED;
ef199164 3328
05392dc8 3329 dst[n] = (char)src[n];
24642831 3330 }
05392dc8 3331
24642831 3332 }
05392dc8 3333 else // still need to check the input validity
24642831 3334 {
05392dc8 3335 for ( size_t n = 0; n < srcLen; n++ )
24642831 3336 {
05392dc8 3337 if ( src[n] > 0xFF )
467e0479 3338 return wxCONV_FAILED;
24642831 3339 }
f1339c56 3340 }
dccce9ea 3341
05392dc8 3342 return srcLen;
6001e347
RR
3343}
3344
7ef3ab50 3345size_t wxCSConv::GetMBNulLen() const
eec47cc6 3346{
eec47cc6 3347 if ( m_convReal )
7ef3ab50 3348 return m_convReal->GetMBNulLen();
eec47cc6 3349
ba98e032 3350 // otherwise, we are ISO-8859-1
c1464d9d 3351 return 1;
eec47cc6
VZ
3352}
3353
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// Return true if the real converter says that it is a UTF-8 one; without a
// real converter we are ISO-8859-1, i.e. definitely not UTF-8.
bool wxCSConv::IsUTF8() const
{
    return m_convReal && m_convReal->IsUTF8();
}
#endif
3364
69c928ef
VZ
3365
3366#if wxUSE_UNICODE
3367
3368wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3369{
3370 if ( !s )
3371 return wxWCharBuffer();
3372
3373 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3374 if ( !wbuf )
5487ff0f 3375 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3376 if ( !wbuf )
3377 wbuf = wxConvISO8859_1.cMB2WX(s);
3378
3379 return wbuf;
3380}
3381
3382wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3383{
3384 if ( !ws )
3385 return wxCharBuffer();
3386
3387 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3388 if ( !buf )
3389 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3390
3391 return buf;
3392}
3393
3394#endif // wxUSE_UNICODE
f5a1953b 3395
1e50d914
VS
3396// ----------------------------------------------------------------------------
3397// globals
3398// ----------------------------------------------------------------------------
3399
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3406
3407#undef wxConvLibc
3408#undef wxConvUTF8
3409#undef wxConvUTF7
3410#undef wxConvLocal
3411#undef wxConvISO8859_1
3412
// Define the global pointer name##Ptr (initially NULL) and the accessor
// wxGet_##name##Ptr() returning the address of a function-local static
// object of type impl_klass constructed with ctor_args, as a klass pointer.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)          \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                         \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                             \
    {                                                                       \
        static impl_klass s_##name##Impl ctor_args;                         \
        return &s_##name##Impl;                                             \
    }                                                                       \
    /* calling the accessor from a static initializer ensures that all */  \
    /* global converter objects are created by the time static         */  \
    /* initialization is done, i.e. before any thread is launched:     */  \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the implementation class is the
// same as the class of the pointer.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3427
5c69ef61
VZ
3428#ifdef __INTELC__
3429 // disable warning "variable 'xxx' was declared but never referenced"
3430 #pragma warning(disable: 177)
3431#endif // Intel C++
3432
1e50d914
VS
3433#ifdef __WINDOWS__
3434 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3435#elif 0 // defined(__WXOSX__)
3436 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3437#else
3438 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3439#endif
3440
e1079eda
VZ
// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
3447WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3448WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3449
3450WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3451WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3452
3453WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3454WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3455
6ac84a78 3456#ifdef __DARWIN__
8244507f
VZ
3457// It is important to use this conversion object under Darwin as it ensures
3458// that Unicode strings are (re)composed correctly even though xnu kernel uses
3459// decomposed form internally (at least for the file names).
6ac84a78 3460static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3461#endif
6ac84a78 3462
1e50d914 3463WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3464#ifdef __DARWIN__
1e50d914 3465 &wxConvMacUTF8DObj;
6ac84a78 3466#else // !__DARWIN__
1e50d914 3467 wxGet_wxConvLibcPtr();
6ac84a78 3468#endif // __DARWIN__/!__DARWIN__