]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Update version to 2.9.4 in version.bkl too and rebake everything.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
1c193821 31#ifndef __WXWINCE__
1cd52418 32#include <errno.h>
1c193821
JS
33#endif
34
6001e347
RR
35#include <ctype.h>
36#include <string.h>
37#include <stdlib.h>
38
e95354ec 39#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
e95354ec 42 #define wxHAVE_WIN32_MB2WC
ef199164 43#endif
e95354ec 44
b040e242 45#ifdef HAVE_ICONV
373658eb 46 #include <iconv.h>
b1d547eb 47 #include "wx/thread.h"
1cd52418 48#endif
1cd52418 49
373658eb
VZ
50#include "wx/encconv.h"
51#include "wx/fontmap.h"
52
5c4ed98d 53#ifdef __DARWIN__
c933e267 54#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
55#endif //def __DARWIN__
56
ef199164 57
9a83f860 58#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 59
467e0479
VZ
60// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61// be 4 bytes
4948c2b6 62#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
63 #define WC_UTF16
64#endif
65
ef199164 66
373658eb
VZ
67// ============================================================================
68// implementation
69// ============================================================================
70
// Helper of wxMBConv::ToWChar(): checks whether the n bytes starting at p
// contain anything other than NULs.
//
// Returns true if at least one of these n bytes is not NUL and false if all
// of them are NUL (which is trivially the case for n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    // consume leading NUL bytes, stopping at the first non-NUL one
    while ( n && *p++ == '\0' )
        n--;

    // if anything is left, we stopped at a non-NUL byte
    return n != 0;
}
79
373658eb 80// ----------------------------------------------------------------------------
467e0479 81// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 82// ----------------------------------------------------------------------------
6001e347 83
c91830cb 84static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 85{
ef199164 86 if (input <= 0xffff)
4def3b35 87 {
999836aa
VZ
88 if (output)
89 *output = (wxUint16) input;
ef199164 90
4def3b35 91 return 1;
dccce9ea 92 }
ef199164 93 else if (input >= 0x110000)
4def3b35 94 {
467e0479 95 return wxCONV_FAILED;
dccce9ea
VZ
96 }
97 else
4def3b35 98 {
dccce9ea 99 if (output)
4def3b35 100 {
ef199164
DS
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 103 }
ef199164 104
4def3b35 105 return 2;
1cd52418 106 }
1cd52418
OK
107}
108
c91830cb 109static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 110{
ef199164 111 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
112 {
113 output = *input;
114 return 1;
dccce9ea 115 }
ef199164 116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
117 {
118 output = *input;
467e0479 119 return wxCONV_FAILED;
dccce9ea
VZ
120 }
121 else
4def3b35
VS
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
1cd52418
OK
126}
127
467e0479 128#ifdef WC_UTF16
35d11700
VZ
129 typedef wchar_t wxDecodeSurrogate_t;
130#else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
133
134// returns the next UTF-32 character from the wchar_t buffer and advances the
135// pointer to the character after this one
136//
137// if an invalid character is found, *pSrc is set to NULL, the caller must
138// check for this
35d11700 139static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
140{
141 wxUint32 out;
8d3dd069 142 const size_t
5c33522f 143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150}
151
f6bcfd97 152// ----------------------------------------------------------------------------
6001e347 153// wxMBConv
f6bcfd97 154// ----------------------------------------------------------------------------
2c53a80a 155
483b0434
VZ
156size_t
157wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
158 const char *src, size_t srcLen) const
6001e347 159{
483b0434 160 // although new conversion classes are supposed to implement this function
36f93678 161 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
162 // avoid to have to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
36f93678
VZ
166 //
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
6001e347 170
483b0434
VZ
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten = 0;
eec47cc6 173
c1464d9d 174 // the number of NULs terminating this string
a78c43f1 175 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 176
c1464d9d
VZ
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
180 // NULs at the end
483b0434
VZ
181 wxCharBuffer bufTmp;
182 const char *srcEnd;
467e0479 183 if ( srcLen != wxNO_LEN )
eec47cc6 184 {
c1464d9d 185 // we need to know how to find the end of this string
7ef3ab50 186 nulLen = GetMBNulLen();
483b0434
VZ
187 if ( nulLen == wxCONV_FAILED )
188 return wxCONV_FAILED;
e4e3bbb4 189
c1464d9d 190 // if there are enough NULs we can avoid the copy
483b0434 191 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
192 {
193 // make a copy in order to properly NUL-terminate the string
483b0434 194 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 195 char * const p = bufTmp.data();
483b0434
VZ
196 memcpy(p, src, srcLen);
197 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 198 *s = '\0';
483b0434
VZ
199
200 src = bufTmp;
eec47cc6 201 }
e4e3bbb4 202
483b0434
VZ
203 srcEnd = src + srcLen;
204 }
205 else // quit after the first loop iteration
206 {
207 srcEnd = NULL;
208 }
e4e3bbb4 209
36f93678
VZ
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
212 // with the result
213 //
214 // all the complication come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
bbb0ff36 217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
36f93678
VZ
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
220 // string
221 //
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
483b0434 225 for ( ;; )
eec47cc6 226 {
c1464d9d 227 // try to convert the current chunk
483b0434 228 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
229 if ( lenChunk == wxCONV_FAILED )
230 return wxCONV_FAILED;
e4e3bbb4 231
483b0434 232 dstWritten += lenChunk;
f6a02087
VZ
233 if ( !srcEnd )
234 dstWritten++;
f5fb6871 235
f6a02087 236 if ( !lenChunk )
467e0479
VZ
237 {
238 // nothing left in the input string, conversion succeeded
239 break;
240 }
241
483b0434
VZ
242 if ( dst )
243 {
244 if ( dstWritten > dstLen )
245 return wxCONV_FAILED;
246
f6a02087
VZ
247 // +1 is for trailing NUL
248 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
f6a02087
VZ
252 if ( !srcEnd )
253 dst++;
483b0434 254 }
c1464d9d 255
483b0434 256 if ( !srcEnd )
c1464d9d 257 {
467e0479 258 // we convert just one chunk in this case as this is the entire
bbb0ff36 259 // string anyhow (and we don't count the trailing NUL in this case)
c1464d9d
VZ
260 break;
261 }
eec47cc6 262
bbb0ff36
VZ
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
483b0434 266 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
483b0434 272 src += nulLen;
c1464d9d 273 }
e4e3bbb4 274
bbb0ff36
VZ
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
277 if ( src == srcEnd )
278 break;
279
280 // do count this terminator as it's inside the buffer we convert
281 dstWritten++;
282 if ( dst )
283 dst++;
284
285 src += nulLen; // skip the terminator itself
c1464d9d 286
483b0434 287 if ( src >= srcEnd )
c1464d9d
VZ
288 break;
289 }
290
483b0434 291 return dstWritten;
e4e3bbb4
RN
292}
293
483b0434
VZ
294size_t
295wxMBConv::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
e4e3bbb4 297{
483b0434
VZ
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten = 0;
e4e3bbb4 300
f6a02087
VZ
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated = srcLen == wxNO_LEN;
305
eec47cc6
VZ
306 // make a copy of the input string unless it is already properly
307 // NUL-terminated
eec47cc6 308 wxWCharBuffer bufTmp;
f6a02087 309 if ( isNulTerminated )
e4e3bbb4 310 {
483b0434 311 srcLen = wxWcslen(src) + 1;
eec47cc6 312 }
483b0434 313 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
314 {
315 // make a copy in order to properly NUL-terminate the string
483b0434 316 bufTmp = wxWCharBuffer(srcLen);
ef199164 317 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
318 src = bufTmp;
319 }
320
321 const size_t lenNul = GetMBNulLen();
322 for ( const wchar_t * const srcEnd = src + srcLen;
323 src < srcEnd;
27307233 324 src++ /* skip L'\0' too */ )
483b0434
VZ
325 {
326 // try to convert the current chunk
327 size_t lenChunk = WC2MB(NULL, src, 0);
483b0434
VZ
328 if ( lenChunk == wxCONV_FAILED )
329 return wxCONV_FAILED;
330
483b0434 331 dstWritten += lenChunk;
27307233
VZ
332
333 const wchar_t * const
334 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
335
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
338 // ourselves above
339 if ( chunkEnd < srcEnd )
f6a02087 340 dstWritten += lenNul;
483b0434
VZ
341
342 if ( dst )
343 {
344 if ( dstWritten > dstLen )
345 return wxCONV_FAILED;
346
27307233
VZ
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
351 wxCharBuffer dstBuf;
352 char *dstTmp;
353 if ( chunkEnd == srcEnd )
354 {
355 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
356 dstTmp = dstBuf.data();
357 }
358 else
359 {
360 dstTmp = dst;
361 }
362
363 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
364 return wxCONV_FAILED;
365
27307233
VZ
366 if ( dstTmp != dst )
367 {
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst, dstTmp, lenChunk);
371
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
374 break;
375 }
376
483b0434 377 dst += lenChunk;
27307233 378 if ( chunkEnd < srcEnd )
f6a02087 379 dst += lenNul;
483b0434 380 }
27307233
VZ
381
382 src = chunkEnd;
eec47cc6 383 }
e4e3bbb4 384
483b0434
VZ
385 return dstWritten;
386}
387
ef199164 388size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 389{
51725fc0 390 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 391 if ( rc != wxCONV_FAILED )
509da451
VZ
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399}
400
ef199164 401size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 402{
51725fc0 403 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 404 if ( rc != wxCONV_FAILED )
509da451 405 {
51725fc0 406 rc -= GetMBNulLen();
509da451
VZ
407 }
408
409 return rc;
410}
411
483b0434
VZ
412wxMBConv::~wxMBConv()
413{
414 // nothing to do here (necessary for Darwin linking probably)
415}
e4e3bbb4 416
483b0434
VZ
417const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418{
419 if ( psz )
eec47cc6 420 {
483b0434 421 // calculate the length of the buffer needed first
a2db25a1 422 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 423 if ( nLen != wxCONV_FAILED )
f5fb6871 424 {
483b0434 425 // now do the actual conversion
a2db25a1 426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 427
483b0434 428 // +1 for the trailing NULL
a2db25a1 429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 430 return buf;
f5fb6871 431 }
483b0434 432 }
e4e3bbb4 433
483b0434
VZ
434 return wxWCharBuffer();
435}
3698ae71 436
483b0434
VZ
437const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438{
439 if ( pwz )
440 {
a2db25a1 441 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 442 if ( nLen != wxCONV_FAILED )
483b0434 443 {
a2db25a1
VZ
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451}
e4e3bbb4 452
483b0434 453const wxWCharBuffer
ef199164 454wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 455{
ef199164 456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 457 if ( dstLen != wxCONV_FAILED )
483b0434 458 {
0dd13d21
VZ
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
f6a02087 463 wbuf.data()[dstLen] = L'\0';
ef199164 464 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
465 {
466 if ( outLen )
467e0479
VZ
467 {
468 *outLen = dstLen;
f6a02087
VZ
469
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen == wxNO_LEN )
467e0479
VZ
476 (*outLen)--;
477 }
478
483b0434
VZ
479 return wbuf;
480 }
481 }
482
483 if ( outLen )
484 *outLen = 0;
485
486 return wxWCharBuffer();
487}
488
489const wxCharBuffer
ef199164 490wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 491{
13d92ad6 492 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 493 if ( dstLen != wxCONV_FAILED )
483b0434 494 {
0dd13d21
VZ
495 const size_t nulLen = GetMBNulLen();
496
497 // as above, ensure that the buffer is always NUL-terminated, even if
498 // the input is not
499 wxCharBuffer buf(dstLen + nulLen - 1);
500 memset(buf.data() + dstLen, 0, nulLen);
ef199164 501 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
502 {
503 if ( outLen )
467e0479
VZ
504 {
505 *outLen = dstLen;
506
f6a02087 507 if ( inLen == wxNO_LEN )
467e0479 508 {
f6a02087
VZ
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
13d92ad6 511 *outLen -= nulLen;
467e0479
VZ
512 }
513 }
d32a507d 514
483b0434
VZ
515 return buf;
516 }
e4e3bbb4
RN
517 }
518
eec47cc6
VZ
519 if ( outLen )
520 *outLen = 0;
521
522 return wxCharBuffer();
e4e3bbb4
RN
523}
524
40ac5040
VZ
525const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
526{
527 const size_t srcLen = buf.length();
528 if ( srcLen )
529 {
530 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
531 if ( dstLen != wxCONV_FAILED )
532 {
533 wxWCharBuffer wbuf(dstLen);
534 wbuf.data()[dstLen] = L'\0';
535 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
536 return wbuf;
537 }
538 }
539
cfcfada9 540 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
40ac5040
VZ
541}
542
543const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
544{
545 const size_t srcLen = wbuf.length();
546 if ( srcLen )
547 {
548 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
549 if ( dstLen != wxCONV_FAILED )
550 {
551 wxCharBuffer buf(dstLen);
552 buf.data()[dstLen] = '\0';
553 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
554 return buf;
555 }
556 }
557
cfcfada9 558 return wxScopedCharBuffer::CreateNonOwned("", 0);
40ac5040
VZ
559}
560
6001e347 561// ----------------------------------------------------------------------------
bde4baac 562// wxMBConvLibc
6001e347
RR
563// ----------------------------------------------------------------------------
564
bde4baac
VZ
565size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
566{
567 return wxMB2WC(buf, psz, n);
568}
569
570size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
571{
572 return wxWC2MB(buf, psz, n);
573}
e1bfe89e
RR
574
575// ----------------------------------------------------------------------------
532d575b 576// wxConvBrokenFileNames
e1bfe89e
RR
577// ----------------------------------------------------------------------------
578
#ifdef __UNIX__

// Creates the conversion object actually used for the given charset and
// stores it in m_conv: for UTF-8 (the name is compared case-insensitively
// and may be spelt with or without the hyphen) this is a wxMBConvUTF8 created
// with the MAP_INVALID_UTF8_TO_PUA option, for anything else a wxCSConv for
// this charset.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
         wxStricmp(charset, wxT("UTF8")) == 0 )
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
    else
        m_conv = new wxCSConv(charset);
}

#endif // __UNIX__
c12b7f79 591
bde4baac 592// ----------------------------------------------------------------------------
3698ae71 593// UTF-7
bde4baac 594// ----------------------------------------------------------------------------
6001e347 595
15f2ee32 596// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
597//
598// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 599
//
// BASE64 decoding table
//
// It is indexed by the (unsigned) byte value and gives the 6 bit value
// encoded by this character or 0xff if it is not a valid BASE64 character.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
638
9d653e81
VZ
639size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
640 const char *src, size_t srcLen) const
15f2ee32 641{
9d653e81 642 DecoderState stateOrig,
852dcba5 643 *statePtr;
9d653e81
VZ
644 if ( srcLen == wxNO_LEN )
645 {
646 // convert the entire string, up to and including the trailing NUL
647 srcLen = strlen(src) + 1;
648
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr = &stateOrig;
652 }
653 else // when working with partial strings we do use the shift state
654 {
5c33522f 655 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
656
657 // also save the old state to be able to rollback to it on error
658 stateOrig = m_stateDecoder;
659 }
660
661 // but to simplify the code below we use this variable in both cases
662 DecoderState& state = *statePtr;
663
664
665 // number of characters [which would have been] written to dst [if it were
666 // not NULL]
15f2ee32
RN
667 size_t len = 0;
668
9d653e81
VZ
669 const char * const srcEnd = src + srcLen;
670
671 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 672 {
9d653e81
VZ
673 const unsigned char cc = *src++;
674
675 if ( state.IsShifted() )
15f2ee32 676 {
9d653e81
VZ
677 const unsigned char dc = utf7unb64[cc];
678 if ( dc == 0xff )
15f2ee32 679 {
ccaa848d
VZ
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
684 // characters)
685 if ( state.isLSB || state.bit > 4 ||
686 (state.accum & ((1 << state.bit) - 1)) )
687 {
688 if ( !len )
689 state = stateOrig;
690
852dcba5 691 return wxCONV_FAILED;
ccaa848d 692 }
852dcba5 693
9d653e81
VZ
694 state.ToDirect();
695
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
698 if ( cc == '-' )
699 continue;
700 }
701 else // valid encoded character
702 {
703 // mini base64 decoder: each character is 6 bits
704 state.bit += 6;
705 state.accum <<= 6;
706 state.accum += dc;
707
708 if ( state.bit >= 8 )
15f2ee32 709 {
9d653e81
VZ
710 // got the full byte, consume it
711 state.bit -= 8;
712 unsigned char b = (state.accum >> state.bit) & 0x00ff;
713
714 if ( state.isLSB )
15f2ee32 715 {
9d653e81
VZ
716 // we've got the full word, output it
717 if ( dst )
718 *dst++ = (state.msb << 8) | b;
719 len++;
720 state.isLSB = false;
15f2ee32 721 }
9d653e81 722 else // MSB
04a37834 723 {
9d653e81
VZ
724 // just store it while we wait for LSB
725 state.msb = b;
726 state.isLSB = true;
04a37834 727 }
15f2ee32
RN
728 }
729 }
9d653e81 730 }
04a37834 731
9d653e81
VZ
732 if ( state.IsDirect() )
733 {
734 // start of an encoded segment?
735 if ( cc == '+' )
04a37834 736 {
9d653e81
VZ
737 if ( *src == '-' )
738 {
739 // just the encoded plus sign, don't switch to shifted mode
740 if ( dst )
741 *dst++ = '+';
742 len++;
743 src++;
744 }
ccaa848d
VZ
745 else if ( utf7unb64[(unsigned)*src] == 0xff )
746 {
747 // empty encoded chunks are not allowed
748 if ( !len )
749 state = stateOrig;
750
751 return wxCONV_FAILED;
752 }
753 else // base-64 encoded chunk follows
9d653e81
VZ
754 {
755 state.ToShifted();
756 }
757 }
758 else // not '+'
759 {
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc >= 0x7f || (cc < ' ' &&
763 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
764 return wxCONV_FAILED;
765
766 if ( dst )
767 *dst++ = cc;
768 len++;
769 }
15f2ee32
RN
770 }
771 }
04a37834 772
9d653e81
VZ
773 if ( !len )
774 {
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
777 // state
778 state = stateOrig;
779
780 return wxCONV_FAILED;
781 }
04a37834 782
15f2ee32 783 return len;
6001e347
RR
784}
785
//
// BASE64 encoding table
//
// It is indexed by a 6 bit value and gives the character encoding it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
800
//
// UTF-7 encoding table
//
// It is indexed by the 7 bit ASCII code of the character and gives its class:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if wc is encoded as itself by the UTF-7 encoder, i.e. if it
// is a 7 bit character of class 0 in the table above.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // the range check must be done on an unsigned value: wchar_t is a signed
    // type on some platforms and a negative value would otherwise pass the
    // check and be used as a negative index into the table
    return static_cast<unsigned long>(wc) < 0x80 && utf7encode[wc] < 1;
}
825
826size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
827 const wchar_t *src, size_t srcLen) const
15f2ee32 828{
9d653e81
VZ
829 EncoderState stateOrig,
830 *statePtr;
831 if ( srcLen == wxNO_LEN )
832 {
833 // we don't apply the stored state when operating on entire strings at
834 // once
835 statePtr = &stateOrig;
836
837 srcLen = wxWcslen(src) + 1;
838 }
839 else // do use the mode we left the output in previously
840 {
841 stateOrig = m_stateEncoder;
5c33522f 842 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
843 }
844
845 EncoderState& state = *statePtr;
846
847
15f2ee32
RN
848 size_t len = 0;
849
9d653e81
VZ
850 const wchar_t * const srcEnd = src + srcLen;
851 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 852 {
9d653e81
VZ
853 wchar_t cc = *src++;
854 if ( wxIsUTF7Direct(cc) )
15f2ee32 855 {
9d653e81
VZ
856 if ( state.IsShifted() )
857 {
858 // pad with zeros the last encoded block if necessary
859 if ( state.bit )
860 {
861 if ( dst )
862 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
863 len++;
864 }
ef199164 865
9d653e81
VZ
866 state.ToDirect();
867
868 if ( dst )
869 *dst++ = '-';
870 len++;
871 }
872
873 if ( dst )
874 *dst++ = (char)cc;
15f2ee32
RN
875 len++;
876 }
9d653e81
VZ
877 else if ( cc == '+' && state.IsDirect() )
878 {
879 if ( dst )
880 {
881 *dst++ = '+';
882 *dst++ = '-';
883 }
884
885 len += 2;
886 }
15f2ee32 887#ifndef WC_UTF16
79c78d42 888 else if (((wxUint32)cc) > 0xffff)
b2c13097 889 {
15f2ee32 890 // no surrogate pair generation (yet?)
467e0479 891 return wxCONV_FAILED;
15f2ee32
RN
892 }
893#endif
894 else
895 {
9d653e81
VZ
896 if ( state.IsDirect() )
897 {
898 state.ToShifted();
ef199164 899
9d653e81
VZ
900 if ( dst )
901 *dst++ = '+';
902 len++;
903 }
904
905 // BASE64 encode string
906 for ( ;; )
15f2ee32 907 {
9d653e81 908 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 909 {
9d653e81
VZ
910 state.accum <<= 8;
911 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
912
913 for (state.bit += 8; state.bit >= 6; )
15f2ee32 914 {
9d653e81
VZ
915 state.bit -= 6;
916 if ( dst )
917 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
918 len++;
15f2ee32 919 }
15f2ee32 920 }
ef199164 921
9d653e81
VZ
922 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
923 break;
ef199164 924
9d653e81 925 src++;
15f2ee32 926 }
15f2ee32
RN
927 }
928 }
ef199164 929
9d653e81
VZ
930 // we need to restore the original encoder state if we were called just to
931 // calculate the amount of space needed as we will presumably be called
932 // again to really convert the data now
933 if ( !dst )
934 state = stateOrig;
ef199164 935
15f2ee32 936 return len;
6001e347
RR
937}
938
f6bcfd97 939// ----------------------------------------------------------------------------
6001e347 940// UTF-8
f6bcfd97 941// ----------------------------------------------------------------------------
6001e347 942
1774c3c5 943static const wxUint32 utf8_max[]=
4def3b35 944 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 945
3698ae71
VZ
946// boundaries of the private use area we use to (temporarily) remap invalid
947// characters invalid in a UTF-8 encoded string
ea8ce907
RR
948const wxUint32 wxUnicodePUA = 0x100000;
949const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
950
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte and contains the total number
// of bytes in the sequence or 0 if this byte can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid (continuation bytes and overlong encodings):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
985
986size_t
987wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989{
990 wchar_t *out = dstLen ? dst : NULL;
991 size_t written = 0;
992
993 if ( srcLen == wxNO_LEN )
994 srcLen = strlen(src) + 1;
995
996 for ( const char *p = src; ; p++ )
997 {
0dcbb107 998 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
0286d08d
VZ
999 {
1000 // all done successfully, just add the trailing NULL if we are not
1001 // using explicit length
1002 if ( srcLen == wxNO_LEN )
1003 {
1004 if ( out )
1005 {
1006 if ( !dstLen )
1007 break;
1008
1009 *out = L'\0';
1010 }
1011
1012 written++;
1013 }
1014
1015 return written;
1016 }
1017
0286d08d
VZ
1018 if ( out && !dstLen-- )
1019 break;
1020
5367a38a
VS
1021 wxUint32 code;
1022 unsigned char c = *p;
0286d08d 1023
5367a38a
VS
1024 if ( c < 0x80 )
1025 {
1026 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1027 break;
0286d08d 1028
5367a38a
VS
1029 if ( srcLen != wxNO_LEN )
1030 srcLen--;
0286d08d 1031
5367a38a
VS
1032 code = c;
1033 }
1034 else
0286d08d 1035 {
5367a38a
VS
1036 unsigned len = tableUtf8Lengths[c];
1037 if ( !len )
1038 break;
1039
1040 if ( srcLen < len ) // the test works for wxNO_LEN too
1041 break;
1042
1043 if ( srcLen != wxNO_LEN )
1044 srcLen -= len;
1045
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1053 //
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1057
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1059 // length:
1060 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1061
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1065
1066 len--; // it's more convenient to work with 0-based length here
1067
1068 // extract the lead byte's value bits:
1069 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1070 break;
1071
1072 code = c & leadValueMask[len];
1073
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len; --len )
1077 {
1078 c = *++p;
1079 if ( (c & 0xC0) != 0x80 )
1080 return wxCONV_FAILED;
0286d08d 1081
5367a38a
VS
1082 code <<= 6;
1083 code |= c & 0x3F;
1084 }
0286d08d
VZ
1085 }
1086
1087#ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1090 {
1091 if ( out )
1092 out++;
1093 written++;
1094 }
1095#else // !WC_UTF16
1096 if ( out )
1097 *out = code;
1098#endif // WC_UTF16/!WC_UTF16
1099
1100 if ( out )
1101 out++;
1102
1103 written++;
1104 }
1105
1106 return wxCONV_FAILED;
1107}
1108
1109size_t
1110wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111 const wchar_t *src, size_t srcLen) const
1112{
1113 char *out = dstLen ? dst : NULL;
1114 size_t written = 0;
1115
1116 for ( const wchar_t *wp = src; ; wp++ )
1117 {
0dcbb107 1118 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
0286d08d
VZ
1119 {
1120 // all done successfully, just add the trailing NULL if we are not
1121 // using explicit length
1122 if ( srcLen == wxNO_LEN )
1123 {
1124 if ( out )
1125 {
1126 if ( !dstLen )
1127 break;
1128
1129 *out = '\0';
1130 }
1131
1132 written++;
1133 }
1134
1135 return written;
1136 }
1137
a964d3ed
VZ
1138 if ( srcLen != wxNO_LEN )
1139 srcLen--;
0286d08d
VZ
1140
1141 wxUint32 code;
1142#ifdef WC_UTF16
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145 {
1146 // skip the next char too as we decoded a surrogate
1147 wp++;
041e6050
VZ
1148 if ( srcLen != wxNO_LEN )
1149 srcLen--;
0286d08d
VZ
1150 }
1151#else // wchar_t is UTF-32
1152 code = *wp & 0x7fffffff;
1153#endif
1154
1155 unsigned len;
1156 if ( code <= 0x7F )
1157 {
1158 len = 1;
1159 if ( out )
1160 {
1161 if ( dstLen < len )
1162 break;
1163
1164 out[0] = (char)code;
1165 }
1166 }
1167 else if ( code <= 0x07FF )
1168 {
1169 len = 2;
1170 if ( out )
1171 {
1172 if ( dstLen < len )
1173 break;
1174
1175 // NB: this line takes 6 least significant bits, encodes them as
1176 // 10xxxxxx and discards them so that the next byte can be encoded:
1177 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1178 out[0] = 0xC0 | code;
1179 }
1180 }
1181 else if ( code < 0xFFFF )
1182 {
1183 len = 3;
1184 if ( out )
1185 {
1186 if ( dstLen < len )
1187 break;
1188
1189 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1190 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1191 out[0] = 0xE0 | code;
1192 }
1193 }
1194 else if ( code <= 0x10FFFF )
1195 {
1196 len = 4;
1197 if ( out )
1198 {
1199 if ( dstLen < len )
1200 break;
1201
1202 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1204 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1205 out[0] = 0xF0 | code;
1206 }
1207 }
1208 else
1209 {
9a83f860 1210 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1211 break;
1212 }
1213
1214 if ( out )
1215 {
1216 out += len;
1217 dstLen -= len;
1218 }
1219
1220 written += len;
1221 }
1222
1223 // we only get here if an error occurs during decoding
1224 return wxCONV_FAILED;
1225}
1226
d16d0917
VZ
1227size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1228 const char *psz, size_t srcLen) const
6001e347 1229{
0286d08d 1230 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1231 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1232
4def3b35
VS
1233 size_t len = 0;
1234
f4cb7c58
VZ
1235 // The length can be either given explicitly or computed implicitly for the
1236 // NUL-terminated strings.
1237 const bool isNulTerminated = srcLen == wxNO_LEN;
1238 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1239 {
ea8ce907
RR
1240 const char *opsz = psz;
1241 bool invalid = false;
4def3b35
VS
1242 unsigned char cc = *psz++, fc = cc;
1243 unsigned cnt;
dccce9ea 1244 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1245 fc <<= 1;
ef199164 1246
dccce9ea 1247 if (!cnt)
4def3b35
VS
1248 {
1249 // plain ASCII char
dccce9ea 1250 if (buf)
4def3b35
VS
1251 *buf++ = cc;
1252 len++;
561488ef
MW
1253
1254 // escape the escape character for octal escapes
1255 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1256 && cc == '\\' && (!buf || len < n))
1257 {
1258 if (buf)
1259 *buf++ = cc;
1260 len++;
1261 }
dccce9ea
VZ
1262 }
1263 else
4def3b35
VS
1264 {
1265 cnt--;
dccce9ea 1266 if (!cnt)
4def3b35
VS
1267 {
1268 // invalid UTF-8 sequence
ea8ce907 1269 invalid = true;
dccce9ea
VZ
1270 }
1271 else
4def3b35
VS
1272 {
1273 unsigned ocnt = cnt - 1;
1274 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1275 while (cnt--)
4def3b35 1276 {
ea8ce907 1277 cc = *psz;
dccce9ea 1278 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1279 {
1280 // invalid UTF-8 sequence
ea8ce907
RR
1281 invalid = true;
1282 break;
4def3b35 1283 }
ef199164 1284
ea8ce907 1285 psz++;
4def3b35
VS
1286 res = (res << 6) | (cc & 0x3f);
1287 }
ef199164 1288
ea8ce907 1289 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1290 {
1291 // illegal UTF-8 encoding
ea8ce907 1292 invalid = true;
4def3b35 1293 }
ea8ce907
RR
1294 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1295 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1296 {
1297 // if one of our PUA characters turns up externally
1298 // it must also be treated as an illegal sequence
1299 // (a bit like you have to escape an escape character)
1300 invalid = true;
1301 }
1302 else
1303 {
1cd52418 1304#ifdef WC_UTF16
0286d08d 1305 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1306 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1307 if (pa == wxCONV_FAILED)
ea8ce907
RR
1308 {
1309 invalid = true;
1310 }
1311 else
1312 {
1313 if (buf)
1314 buf += pa;
1315 len += pa;
1316 }
373658eb 1317#else // !WC_UTF16
ea8ce907 1318 if (buf)
38d4b1e4 1319 *buf++ = (wchar_t)res;
ea8ce907 1320 len++;
373658eb 1321#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1322 }
1323 }
ef199164 1324
ea8ce907
RR
1325 if (invalid)
1326 {
1327 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1328 {
1329 while (opsz < psz && (!buf || len < n))
1330 {
1331#ifdef WC_UTF16
1332 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1333 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1334 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1335 if (buf)
1336 buf += pa;
1337 opsz++;
1338 len += pa;
1339#else
1340 if (buf)
38d4b1e4 1341 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1342 opsz++;
1343 len++;
1344#endif
1345 }
1346 }
3698ae71 1347 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1348 {
1349 while (opsz < psz && (!buf || len < n))
1350 {
3698ae71
VZ
1351 if ( buf && len + 3 < n )
1352 {
17a1ebd1 1353 unsigned char on = *opsz;
3698ae71 1354 *buf++ = L'\\';
17a1ebd1
VZ
1355 *buf++ = (wchar_t)( L'0' + on / 0100 );
1356 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1357 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1358 }
ef199164 1359
ea8ce907
RR
1360 opsz++;
1361 len += 4;
1362 }
1363 }
3698ae71 1364 else // MAP_INVALID_UTF8_NOT
ea8ce907 1365 {
467e0479 1366 return wxCONV_FAILED;
ea8ce907 1367 }
4def3b35
VS
1368 }
1369 }
6001e347 1370 }
ef199164 1371
f4cb7c58
VZ
1372 if ( isNulTerminated )
1373 {
1374 // Add the trailing NUL in this case if we have a large enough buffer.
1375 if ( buf && (len < n) )
1376 *buf = 0;
ef199164 1377
f4cb7c58
VZ
1378 // And count it in any case.
1379 len++;
1380 }
1381
1382 return len;
6001e347
RR
1383}
1384
3698ae71
VZ
// Return true if the given wide character is an octal digit, i.e. one of the
// characters from '0' to '7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    switch ( wch )
    {
        case L'0':
        case L'1':
        case L'2':
        case L'3':
        case L'4':
        case L'5':
        case L'6':
        case L'7':
            return true;

        default:
            return false;
    }
}
1389
d16d0917
VZ
1390size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1391 const wchar_t *psz, size_t srcLen) const
6001e347 1392{
0286d08d 1393 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1394 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1395
4def3b35 1396 size_t len = 0;
6001e347 1397
d16d0917 1398 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1399 {
1400 wxUint32 cc;
ef199164 1401
1cd52418 1402#ifdef WC_UTF16
b5153fd8
VZ
1403 // cast is ok for WC_UTF16
1404 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1405 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1406#else
ef199164 1407 cc = (*psz++) & 0x7fffffff;
4def3b35 1408#endif
3698ae71
VZ
1409
1410 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1411 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1412 {
dccce9ea 1413 if (buf)
ea8ce907 1414 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1415 len++;
3698ae71 1416 }
561488ef
MW
1417 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1418 && cc == L'\\' && psz[0] == L'\\' )
1419 {
1420 if (buf)
1421 *buf++ = (char)cc;
1422 psz++;
1423 len++;
1424 }
3698ae71
VZ
1425 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1426 cc == L'\\' &&
1427 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1428 {
dccce9ea 1429 if (buf)
3698ae71 1430 {
ef199164
DS
1431 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1432 (psz[1] - L'0') * 010 +
b2c13097 1433 (psz[2] - L'0'));
3698ae71
VZ
1434 }
1435
1436 psz += 3;
ea8ce907
RR
1437 len++;
1438 }
1439 else
1440 {
1441 unsigned cnt;
ef199164
DS
1442 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1443 {
1444 }
1445
ea8ce907 1446 if (!cnt)
4def3b35 1447 {
ea8ce907
RR
1448 // plain ASCII char
1449 if (buf)
1450 *buf++ = (char) cc;
1451 len++;
1452 }
ea8ce907
RR
1453 else
1454 {
1455 len += cnt + 1;
1456 if (buf)
1457 {
1458 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1459 while (cnt--)
1460 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1461 }
4def3b35
VS
1462 }
1463 }
6001e347 1464 }
4def3b35 1465
d16d0917 1466 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1467 *buf = 0;
adb45366 1468
d16d0917 1469 return len + 1;
6001e347
RR
1470}
1471
467e0479 1472// ============================================================================
c91830cb 1473// UTF-16
467e0479 1474// ============================================================================
c91830cb
VZ
1475
// "straight" is the converter for UTF-16 in the byte order selected by
// WORDS_BIGENDIAN and "swap" the one for the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1483
467e0479
VZ
1484/* static */
1485size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1486{
1487 if ( srcLen == wxNO_LEN )
1488 {
1489 // count the number of bytes in input, including the trailing NULs
5c33522f 1490 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1491 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1492 ;
c91830cb 1493
467e0479
VZ
1494 srcLen *= BYTES_PER_CHAR;
1495 }
1496 else // we already have the length
1497 {
1498 // we can only convert an entire number of UTF-16 characters
1499 if ( srcLen % BYTES_PER_CHAR )
1500 return wxCONV_FAILED;
1501 }
1502
1503 return srcLen;
1504}
1505
1506// case when in-memory representation is UTF-16 too
c91830cb
VZ
1507#ifdef WC_UTF16
1508
467e0479
VZ
1509// ----------------------------------------------------------------------------
1510// conversions without endianness change
1511// ----------------------------------------------------------------------------
1512
1513size_t
1514wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1515 const char *src, size_t srcLen) const
c91830cb 1516{
467e0479
VZ
1517 // set up the scene for using memcpy() (which is presumably more efficient
1518 // than copying the bytes one by one)
1519 srcLen = GetLength(src, srcLen);
1520 if ( srcLen == wxNO_LEN )
1521 return wxCONV_FAILED;
c91830cb 1522
ef199164 1523 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1524 if ( dst )
c91830cb 1525 {
467e0479
VZ
1526 if ( dstLen < inLen )
1527 return wxCONV_FAILED;
c91830cb 1528
467e0479 1529 memcpy(dst, src, srcLen);
c91830cb 1530 }
d32a507d 1531
467e0479 1532 return inLen;
c91830cb
VZ
1533}
1534
467e0479
VZ
1535size_t
1536wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1537 const wchar_t *src, size_t srcLen) const
c91830cb 1538{
467e0479
VZ
1539 if ( srcLen == wxNO_LEN )
1540 srcLen = wxWcslen(src) + 1;
c91830cb 1541
467e0479
VZ
1542 srcLen *= BYTES_PER_CHAR;
1543
1544 if ( dst )
c91830cb 1545 {
467e0479
VZ
1546 if ( dstLen < srcLen )
1547 return wxCONV_FAILED;
d32a507d 1548
467e0479 1549 memcpy(dst, src, srcLen);
c91830cb 1550 }
d32a507d 1551
467e0479 1552 return srcLen;
c91830cb
VZ
1553}
1554
467e0479
VZ
1555// ----------------------------------------------------------------------------
1556// endian-reversing conversions
1557// ----------------------------------------------------------------------------
c91830cb 1558
467e0479
VZ
1559size_t
1560wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1561 const char *src, size_t srcLen) const
c91830cb 1562{
467e0479
VZ
1563 srcLen = GetLength(src, srcLen);
1564 if ( srcLen == wxNO_LEN )
1565 return wxCONV_FAILED;
c91830cb 1566
467e0479
VZ
1567 srcLen /= BYTES_PER_CHAR;
1568
1569 if ( dst )
c91830cb 1570 {
467e0479
VZ
1571 if ( dstLen < srcLen )
1572 return wxCONV_FAILED;
1573
5c33522f 1574 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1575 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1576 {
ef199164 1577 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1578 }
c91830cb 1579 }
bfab25d4 1580
467e0479 1581 return srcLen;
c91830cb
VZ
1582}
1583
467e0479
VZ
1584size_t
1585wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1586 const wchar_t *src, size_t srcLen) const
c91830cb 1587{
467e0479
VZ
1588 if ( srcLen == wxNO_LEN )
1589 srcLen = wxWcslen(src) + 1;
c91830cb 1590
467e0479
VZ
1591 srcLen *= BYTES_PER_CHAR;
1592
1593 if ( dst )
c91830cb 1594 {
467e0479
VZ
1595 if ( dstLen < srcLen )
1596 return wxCONV_FAILED;
1597
5c33522f 1598 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1599 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1600 {
ef199164 1601 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1602 }
c91830cb 1603 }
eec47cc6 1604
467e0479 1605 return srcLen;
c91830cb
VZ
1606}
1607
467e0479 1608#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1609
467e0479
VZ
1610// ----------------------------------------------------------------------------
1611// conversions without endianness change
1612// ----------------------------------------------------------------------------
c91830cb 1613
35d11700
VZ
1614size_t
1615wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1616 const char *src, size_t srcLen) const
c91830cb 1617{
35d11700
VZ
1618 srcLen = GetLength(src, srcLen);
1619 if ( srcLen == wxNO_LEN )
1620 return wxCONV_FAILED;
c91830cb 1621
ef199164 1622 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1623 if ( !dst )
c91830cb 1624 {
35d11700
VZ
1625 // optimization: return maximal space which could be needed for this
1626 // string even if the real size could be smaller if the buffer contains
1627 // any surrogates
1628 return inLen;
c91830cb 1629 }
c91830cb 1630
35d11700 1631 size_t outLen = 0;
5c33522f 1632 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1633 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1634 {
ef199164
DS
1635 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1636 if ( !inBuff )
35d11700
VZ
1637 return wxCONV_FAILED;
1638
1639 if ( ++outLen > dstLen )
1640 return wxCONV_FAILED;
c91830cb 1641
35d11700
VZ
1642 *dst++ = ch;
1643 }
1644
1645
1646 return outLen;
1647}
c91830cb 1648
35d11700
VZ
1649size_t
1650wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1651 const wchar_t *src, size_t srcLen) const
c91830cb 1652{
35d11700
VZ
1653 if ( srcLen == wxNO_LEN )
1654 srcLen = wxWcslen(src) + 1;
c91830cb 1655
35d11700 1656 size_t outLen = 0;
5c33522f 1657 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1658 for ( size_t n = 0; n < srcLen; n++ )
c91830cb 1659 {
d883acaa 1660 wxUint16 cc[2] = { 0 };
35d11700
VZ
1661 const size_t numChars = encode_utf16(*src++, cc);
1662 if ( numChars == wxCONV_FAILED )
1663 return wxCONV_FAILED;
c91830cb 1664
ef199164
DS
1665 outLen += numChars * BYTES_PER_CHAR;
1666 if ( outBuff )
c91830cb 1667 {
35d11700
VZ
1668 if ( outLen > dstLen )
1669 return wxCONV_FAILED;
1670
ef199164 1671 *outBuff++ = cc[0];
35d11700 1672 if ( numChars == 2 )
69b80d28 1673 {
35d11700 1674 // second character of a surrogate
ef199164 1675 *outBuff++ = cc[1];
69b80d28 1676 }
c91830cb 1677 }
c91830cb 1678 }
c91830cb 1679
35d11700 1680 return outLen;
c91830cb
VZ
1681}
1682
467e0479
VZ
1683// ----------------------------------------------------------------------------
1684// endian-reversing conversions
1685// ----------------------------------------------------------------------------
c91830cb 1686
35d11700
VZ
1687size_t
1688wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1689 const char *src, size_t srcLen) const
c91830cb 1690{
35d11700
VZ
1691 srcLen = GetLength(src, srcLen);
1692 if ( srcLen == wxNO_LEN )
1693 return wxCONV_FAILED;
1694
ef199164 1695 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1696 if ( !dst )
1697 {
1698 // optimization: return maximal space which could be needed for this
1699 // string even if the real size could be smaller if the buffer contains
1700 // any surrogates
1701 return inLen;
1702 }
c91830cb 1703
35d11700 1704 size_t outLen = 0;
5c33522f 1705 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1706 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1707 {
35d11700
VZ
1708 wxUint32 ch;
1709 wxUint16 tmp[2];
ef199164
DS
1710
1711 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1712 inBuff++;
1713 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1714
35d11700
VZ
1715 const size_t numChars = decode_utf16(tmp, ch);
1716 if ( numChars == wxCONV_FAILED )
1717 return wxCONV_FAILED;
c91830cb 1718
35d11700 1719 if ( numChars == 2 )
ef199164 1720 inBuff++;
35d11700
VZ
1721
1722 if ( ++outLen > dstLen )
1723 return wxCONV_FAILED;
c91830cb 1724
35d11700 1725 *dst++ = ch;
c91830cb 1726 }
c91830cb 1727
c91830cb 1728
35d11700
VZ
1729 return outLen;
1730}
c91830cb 1731
35d11700
VZ
1732size_t
1733wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1734 const wchar_t *src, size_t srcLen) const
c91830cb 1735{
35d11700
VZ
1736 if ( srcLen == wxNO_LEN )
1737 srcLen = wxWcslen(src) + 1;
c91830cb 1738
35d11700 1739 size_t outLen = 0;
5c33522f 1740 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1741 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb 1742 {
d883acaa 1743 wxUint16 cc[2] = { 0 };
35d11700
VZ
1744 const size_t numChars = encode_utf16(*src, cc);
1745 if ( numChars == wxCONV_FAILED )
1746 return wxCONV_FAILED;
c91830cb 1747
ef199164
DS
1748 outLen += numChars * BYTES_PER_CHAR;
1749 if ( outBuff )
c91830cb 1750 {
35d11700
VZ
1751 if ( outLen > dstLen )
1752 return wxCONV_FAILED;
1753
ef199164 1754 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1755 if ( numChars == 2 )
c91830cb 1756 {
35d11700 1757 // second character of a surrogate
ef199164 1758 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1759 }
1760 }
c91830cb 1761 }
c91830cb 1762
35d11700 1763 return outLen;
c91830cb
VZ
1764}
1765
467e0479 1766#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1767
1768
35d11700 1769// ============================================================================
c91830cb 1770// UTF-32
35d11700 1771// ============================================================================
c91830cb
VZ
1772
// "straight" is the converter for UTF-32 in the byte order selected by
// WORDS_BIGENDIAN and "swap" the one for the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32LE
    #define wxMBConvUTF32swap     wxMBConvUTF32BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap     wxMBConvUTF32LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1780
1781
1782WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1783WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1784
467e0479
VZ
1785/* static */
1786size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1787{
1788 if ( srcLen == wxNO_LEN )
1789 {
1790 // count the number of bytes in input, including the trailing NULs
5c33522f 1791 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1792 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1793 ;
c91830cb 1794
467e0479
VZ
1795 srcLen *= BYTES_PER_CHAR;
1796 }
1797 else // we already have the length
1798 {
1799 // we can only convert an entire number of UTF-32 characters
1800 if ( srcLen % BYTES_PER_CHAR )
1801 return wxCONV_FAILED;
1802 }
1803
1804 return srcLen;
1805}
1806
1807// case when in-memory representation is UTF-16
c91830cb
VZ
1808#ifdef WC_UTF16
1809
467e0479
VZ
1810// ----------------------------------------------------------------------------
1811// conversions without endianness change
1812// ----------------------------------------------------------------------------
1813
1814size_t
1815wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1816 const char *src, size_t srcLen) const
c91830cb 1817{
467e0479
VZ
1818 srcLen = GetLength(src, srcLen);
1819 if ( srcLen == wxNO_LEN )
1820 return wxCONV_FAILED;
c91830cb 1821
5c33522f 1822 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1823 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1824 size_t outLen = 0;
1825 for ( size_t n = 0; n < inLen; n++ )
c91830cb 1826 {
d883acaa 1827 wxUint16 cc[2] = { 0 };
ef199164 1828 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1829 if ( numChars == wxCONV_FAILED )
1830 return wxCONV_FAILED;
c91830cb 1831
467e0479
VZ
1832 outLen += numChars;
1833 if ( dst )
c91830cb 1834 {
467e0479
VZ
1835 if ( outLen > dstLen )
1836 return wxCONV_FAILED;
d32a507d 1837
467e0479
VZ
1838 *dst++ = cc[0];
1839 if ( numChars == 2 )
1840 {
1841 // second character of a surrogate
1842 *dst++ = cc[1];
1843 }
1844 }
c91830cb 1845 }
d32a507d 1846
467e0479 1847 return outLen;
c91830cb
VZ
1848}
1849
467e0479
VZ
1850size_t
1851wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1852 const wchar_t *src, size_t srcLen) const
c91830cb 1853{
467e0479
VZ
1854 if ( srcLen == wxNO_LEN )
1855 srcLen = wxWcslen(src) + 1;
c91830cb 1856
467e0479 1857 if ( !dst )
c91830cb 1858 {
467e0479
VZ
1859 // optimization: return maximal space which could be needed for this
1860 // string instead of the exact amount which could be less if there are
1861 // any surrogates in the input
1862 //
1863 // we consider that surrogates are rare enough to make it worthwhile to
1864 // avoid running the loop below at the cost of slightly extra memory
1865 // consumption
ef199164 1866 return srcLen * BYTES_PER_CHAR;
467e0479 1867 }
c91830cb 1868
5c33522f 1869 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1870 size_t outLen = 0;
1871 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1872 {
1873 const wxUint32 ch = wxDecodeSurrogate(&src);
1874 if ( !src )
1875 return wxCONV_FAILED;
c91830cb 1876
467e0479 1877 outLen += BYTES_PER_CHAR;
d32a507d 1878
467e0479
VZ
1879 if ( outLen > dstLen )
1880 return wxCONV_FAILED;
b5153fd8 1881
ef199164 1882 *outBuff++ = ch;
467e0479 1883 }
c91830cb 1884
467e0479 1885 return outLen;
c91830cb
VZ
1886}
1887
467e0479
VZ
1888// ----------------------------------------------------------------------------
1889// endian-reversing conversions
1890// ----------------------------------------------------------------------------
c91830cb 1891
467e0479
VZ
1892size_t
1893wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1894 const char *src, size_t srcLen) const
c91830cb 1895{
467e0479
VZ
1896 srcLen = GetLength(src, srcLen);
1897 if ( srcLen == wxNO_LEN )
1898 return wxCONV_FAILED;
c91830cb 1899
5c33522f 1900 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1901 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1902 size_t outLen = 0;
ef199164 1903 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1904 {
d883acaa 1905 wxUint16 cc[2] = { 0 };
ef199164 1906 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1907 if ( numChars == wxCONV_FAILED )
1908 return wxCONV_FAILED;
c91830cb 1909
467e0479
VZ
1910 outLen += numChars;
1911 if ( dst )
c91830cb 1912 {
467e0479
VZ
1913 if ( outLen > dstLen )
1914 return wxCONV_FAILED;
d32a507d 1915
467e0479
VZ
1916 *dst++ = cc[0];
1917 if ( numChars == 2 )
1918 {
1919 // second character of a surrogate
1920 *dst++ = cc[1];
1921 }
1922 }
c91830cb 1923 }
b5153fd8 1924
467e0479 1925 return outLen;
c91830cb
VZ
1926}
1927
467e0479
VZ
1928size_t
1929wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1930 const wchar_t *src, size_t srcLen) const
c91830cb 1931{
467e0479
VZ
1932 if ( srcLen == wxNO_LEN )
1933 srcLen = wxWcslen(src) + 1;
c91830cb 1934
467e0479 1935 if ( !dst )
c91830cb 1936 {
467e0479
VZ
1937 // optimization: return maximal space which could be needed for this
1938 // string instead of the exact amount which could be less if there are
1939 // any surrogates in the input
1940 //
1941 // we consider that surrogates are rare enough to make it worthwhile to
1942 // avoid running the loop below at the cost of slightly extra memory
1943 // consumption
1944 return srcLen*BYTES_PER_CHAR;
1945 }
c91830cb 1946
5c33522f 1947 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1948 size_t outLen = 0;
1949 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1950 {
1951 const wxUint32 ch = wxDecodeSurrogate(&src);
1952 if ( !src )
1953 return wxCONV_FAILED;
c91830cb 1954
467e0479 1955 outLen += BYTES_PER_CHAR;
d32a507d 1956
467e0479
VZ
1957 if ( outLen > dstLen )
1958 return wxCONV_FAILED;
b5153fd8 1959
ef199164 1960 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1961 }
c91830cb 1962
467e0479 1963 return outLen;
c91830cb
VZ
1964}
1965
467e0479 1966#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1967
35d11700
VZ
1968// ----------------------------------------------------------------------------
1969// conversions without endianness change
1970// ----------------------------------------------------------------------------
1971
1972size_t
1973wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1974 const char *src, size_t srcLen) const
c91830cb 1975{
35d11700
VZ
1976 // use memcpy() as it should be much faster than hand-written loop
1977 srcLen = GetLength(src, srcLen);
1978 if ( srcLen == wxNO_LEN )
1979 return wxCONV_FAILED;
c91830cb 1980
35d11700
VZ
1981 const size_t inLen = srcLen/BYTES_PER_CHAR;
1982 if ( dst )
c91830cb 1983 {
35d11700
VZ
1984 if ( dstLen < inLen )
1985 return wxCONV_FAILED;
b5153fd8 1986
35d11700
VZ
1987 memcpy(dst, src, srcLen);
1988 }
c91830cb 1989
35d11700 1990 return inLen;
c91830cb
VZ
1991}
1992
35d11700
VZ
1993size_t
1994wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1995 const wchar_t *src, size_t srcLen) const
c91830cb 1996{
35d11700
VZ
1997 if ( srcLen == wxNO_LEN )
1998 srcLen = wxWcslen(src) + 1;
1999
2000 srcLen *= BYTES_PER_CHAR;
c91830cb 2001
35d11700 2002 if ( dst )
c91830cb 2003 {
35d11700
VZ
2004 if ( dstLen < srcLen )
2005 return wxCONV_FAILED;
c91830cb 2006
35d11700 2007 memcpy(dst, src, srcLen);
c91830cb
VZ
2008 }
2009
35d11700 2010 return srcLen;
c91830cb
VZ
2011}
2012
35d11700
VZ
2013// ----------------------------------------------------------------------------
2014// endian-reversing conversions
2015// ----------------------------------------------------------------------------
c91830cb 2016
35d11700
VZ
2017size_t
2018wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2019 const char *src, size_t srcLen) const
c91830cb 2020{
35d11700
VZ
2021 srcLen = GetLength(src, srcLen);
2022 if ( srcLen == wxNO_LEN )
2023 return wxCONV_FAILED;
2024
2025 srcLen /= BYTES_PER_CHAR;
c91830cb 2026
35d11700 2027 if ( dst )
c91830cb 2028 {
35d11700
VZ
2029 if ( dstLen < srcLen )
2030 return wxCONV_FAILED;
2031
5c33522f 2032 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 2033 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 2034 {
ef199164 2035 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 2036 }
c91830cb 2037 }
b5153fd8 2038
35d11700 2039 return srcLen;
c91830cb
VZ
2040}
2041
35d11700
VZ
2042size_t
2043wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2044 const wchar_t *src, size_t srcLen) const
c91830cb 2045{
35d11700
VZ
2046 if ( srcLen == wxNO_LEN )
2047 srcLen = wxWcslen(src) + 1;
2048
2049 srcLen *= BYTES_PER_CHAR;
c91830cb 2050
35d11700 2051 if ( dst )
c91830cb 2052 {
35d11700
VZ
2053 if ( dstLen < srcLen )
2054 return wxCONV_FAILED;
2055
5c33522f 2056 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2057 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2058 {
ef199164 2059 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2060 }
c91830cb 2061 }
b5153fd8 2062
35d11700 2063 return srcLen;
c91830cb
VZ
2064}
2065
467e0479 2066#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2067
2068
36acb880
VZ
2069// ============================================================================
2070// The classes doing conversion using the iconv_xxx() functions
2071// ============================================================================
3caec1bb 2072
b040e242 2073#ifdef HAVE_ICONV
3a0d76bc 2074
b1d547eb
VS
2075// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2076// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2077// (unless there's yet another bug in glibc) the only case when iconv()
2078// returns with (size_t)-1 (which means error) and says there are 0 bytes
2079// left in the input buffer -- when _real_ error occurs,
2080// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2081// iconv() failure.
3caec1bb
VS
2082// [This bug does not appear in glibc 2.2.]
2083#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2084#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2085 (errno != E2BIG || bufLeft != 0))
2086#else
2087#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2088#endif
2089
ab217dba 2090#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 2091
74a7eb0b
VZ
2092#define ICONV_T_INVALID ((iconv_t)-1)
2093
2094#if SIZEOF_WCHAR_T == 4
2095 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2096 #define WC_ENC wxFONTENCODING_UTF32
2097#elif SIZEOF_WCHAR_T == 2
2098 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2099 #define WC_ENC wxFONTENCODING_UTF16
2100#else // sizeof(wchar_t) != 2 nor 4
2101 // does this ever happen?
2102 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2103#endif
2104
36acb880 2105// ----------------------------------------------------------------------------
e95354ec 2106// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2107// ----------------------------------------------------------------------------
2108
e95354ec 2109class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2110{
2111public:
86501081 2112 wxMBConv_iconv(const char *name);
e95354ec 2113 virtual ~wxMBConv_iconv();
36acb880 2114
8f4b0f43
VZ
2115 // implement base class virtual methods
2116 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2117 const char *src, size_t srcLen = wxNO_LEN) const;
2118 virtual size_t FromWChar(char *dst, size_t dstLen,
2119 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2120 virtual size_t GetMBNulLen() const;
2121
ba98e032
VS
2122#if wxUSE_UNICODE_UTF8
2123 virtual bool IsUTF8() const;
2124#endif
2125
d36c9347
VZ
2126 virtual wxMBConv *Clone() const
2127 {
b64f93b6 2128 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
d36c9347
VZ
2129 p->m_minMBCharWidth = m_minMBCharWidth;
2130 return p;
2131 }
2132
e95354ec 2133 bool IsOk() const
74a7eb0b 2134 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2135
2136protected:
ef199164
DS
2137 // the iconv handlers used to translate from multibyte
2138 // to wide char and in the other direction
36acb880
VZ
2139 iconv_t m2w,
2140 w2m;
ef199164 2141
b1d547eb
VS
2142#if wxUSE_THREADS
2143 // guards access to m2w and w2m objects
2144 wxMutex m_iconvMutex;
2145#endif
36acb880
VZ
2146
2147private:
e95354ec 2148 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2149 // available on this machine, it will remain NULL
74a7eb0b 2150 static wxString ms_wcCharsetName;
36acb880
VZ
2151
2152 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2153 // different endian-ness than the native one
405d8f46 2154 static bool ms_wcNeedsSwap;
eec47cc6 2155
d36c9347
VZ
2156
2157 // name of the encoding handled by this conversion
b64f93b6 2158 const char *m_name;
d36c9347 2159
7ef3ab50 2160 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2161 // initially
2162 size_t m_minMBCharWidth;
36acb880
VZ
2163};
2164
8f115891 2165// make the constructor available for unit testing
86501081 2166WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2167{
2168 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2169 if ( !result->IsOk() )
2170 {
2171 delete result;
2172 return 0;
2173 }
ef199164 2174
8f115891
MW
2175 return result;
2176}
2177
422e411e 2178wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2179bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2180
86501081 2181wxMBConv_iconv::wxMBConv_iconv(const char *name)
b64f93b6 2182 : m_name(wxStrdup(name))
36acb880 2183{
c1464d9d 2184 m_minMBCharWidth = 0;
eec47cc6 2185
36acb880 2186 // check for charset that represents wchar_t:
74a7eb0b 2187 if ( ms_wcCharsetName.empty() )
f1339c56 2188 {
9a83f860 2189 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2190
74a7eb0b 2191#if wxUSE_FONTMAP
a243da29 2192 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
74a7eb0b 2193#else // !wxUSE_FONTMAP
a243da29 2194 static const wxChar *const names_static[] =
36acb880 2195 {
74a7eb0b 2196#if SIZEOF_WCHAR_T == 4
9a83f860 2197 wxT("UCS-4"),
da2f1172 2198#elif SIZEOF_WCHAR_T == 2
9a83f860 2199 wxT("UCS-2"),
74a7eb0b
VZ
2200#endif
2201 NULL
2202 };
a243da29 2203 const wxChar *const *names = names_static;
74a7eb0b 2204#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2205
d1f024a8 2206 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2207 {
17a1ebd1 2208 const wxString nameCS(*names);
74a7eb0b
VZ
2209
2210 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2211 wxString nameXE(nameCS);
ef199164
DS
2212
2213#ifdef WORDS_BIGENDIAN
9a83f860 2214 nameXE += wxT("BE");
ef199164 2215#else // little endian
9a83f860 2216 nameXE += wxT("LE");
ef199164 2217#endif
74a7eb0b 2218
9a83f860 2219 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2220 nameXE.c_str());
2221
86501081 2222 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2223 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2224 {
74a7eb0b 2225 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2226 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2227 nameCS.c_str());
86501081 2228 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2229
74a7eb0b
VZ
2230 // and check for bytesex ourselves:
2231 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2232 {
74a7eb0b 2233 char buf[2], *bufPtr;
e8769ed1 2234 wchar_t wbuf[2];
74a7eb0b
VZ
2235 size_t insz, outsz;
2236 size_t res;
2237
2238 buf[0] = 'A';
2239 buf[1] = 0;
2240 wbuf[0] = 0;
2241 insz = 2;
2242 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2243 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2244 bufPtr = buf;
2245
ef199164
DS
2246 res = iconv(
2247 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2248 &wbufPtr, &outsz);
74a7eb0b
VZ
2249
2250 if (ICONV_FAILED(res, insz))
2251 {
2252 wxLogLastError(wxT("iconv"));
422e411e 2253 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2254 nameCS.c_str());
74a7eb0b
VZ
2255 }
2256 else // ok, can convert to this encoding, remember it
2257 {
17a1ebd1 2258 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2259 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2260 }
3a0d76bc
VS
2261 }
2262 }
74a7eb0b 2263 else // use charset not requiring byte swapping
36acb880 2264 {
74a7eb0b 2265 ms_wcCharsetName = nameXE;
36acb880 2266 }
3a0d76bc 2267 }
74a7eb0b 2268
0944fceb 2269 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2270 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2271 ms_wcCharsetName.empty() ? wxString("<none>")
2272 : ms_wcCharsetName,
9a83f860
VZ
2273 ms_wcNeedsSwap ? wxT(" (needs swap)")
2274 : wxT(""));
3a0d76bc 2275 }
36acb880 2276 else // we already have ms_wcCharsetName
3caec1bb 2277 {
86501081 2278 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2279 }
dccce9ea 2280
74a7eb0b 2281 if ( ms_wcCharsetName.empty() )
f1339c56 2282 {
74a7eb0b 2283 w2m = ICONV_T_INVALID;
36acb880 2284 }
405d8f46
VZ
2285 else
2286 {
86501081 2287 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2288 if ( w2m == ICONV_T_INVALID )
2289 {
2290 wxLogTrace(TRACE_STRCONV,
2291 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2292 ms_wcCharsetName.c_str(), name);
74a7eb0b 2293 }
405d8f46 2294 }
36acb880 2295}
3caec1bb 2296
e95354ec 2297wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2298{
b64f93b6
VZ
2299 free(const_cast<char *>(m_name));
2300
74a7eb0b 2301 if ( m2w != ICONV_T_INVALID )
36acb880 2302 iconv_close(m2w);
74a7eb0b 2303 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2304 iconv_close(w2m);
2305}
3a0d76bc 2306
8f4b0f43
VZ
2307size_t
2308wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2309 const char *src, size_t srcLen) const
36acb880 2310{
8f4b0f43 2311 if ( srcLen == wxNO_LEN )
69373110 2312 {
8f4b0f43
VZ
2313 // find the string length: notice that must be done differently for
2314 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2315 // consecutive NULs
2316 const size_t nulLen = GetMBNulLen();
2317 switch ( nulLen )
2318 {
2319 default:
2320 return wxCONV_FAILED;
69373110 2321
8f4b0f43
VZ
2322 case 1:
2323 srcLen = strlen(src); // arguably more optimized than our version
2324 break;
69373110 2325
8f4b0f43
VZ
2326 case 2:
2327 case 4:
2328 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2329 // but they also have to start at character boundary and not
2330 // span two adjacent characters
2331 const char *p;
2332 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2333 ;
2334 srcLen = p - src;
2335 break;
2336 }
d50c0831
VZ
2337
2338 // when we're determining the length of the string ourselves we count
2339 // the terminating NUL(s) as part of it and always NUL-terminate the
2340 // output
2341 srcLen += nulLen;
69373110
VZ
2342 }
2343
8f4b0f43
VZ
2344 // we express length in the number of (wide) characters but iconv always
2345 // counts buffer sizes it in bytes
2346 dstLen *= SIZEOF_WCHAR_T;
2347
b1d547eb 2348#if wxUSE_THREADS
6a17b868
SN
2349 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2350 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2351 // wxConvLocal that are used all over wx code, so we have to make sure
2352 // the handle is used by at most one thread at the time. Otherwise
2353 // only a few wx classes would be safe to use from non-main threads
2354 // as MB<->WC conversion would fail "randomly".
2355 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2356#endif // wxUSE_THREADS
2357
36acb880 2358 size_t res, cres;
8f4b0f43 2359 const char *pszPtr = src;
36acb880 2360
8f4b0f43 2361 if ( dst )
36acb880 2362 {
8f4b0f43 2363 char* bufPtr = (char*)dst;
e8769ed1 2364
36acb880 2365 // have destination buffer, convert there
1752fda6 2366 size_t dstLenOrig = dstLen;
36acb880 2367 cres = iconv(m2w,
8f4b0f43
VZ
2368 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2369 &bufPtr, &dstLen);
1752fda6
VZ
2370
2371 // convert the number of bytes converted as returned by iconv to the
2372 // number of (wide) characters converted that we need
2373 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2374
36acb880 2375 if (ms_wcNeedsSwap)
3a0d76bc 2376 {
36acb880 2377 // convert to native endianness
17a1ebd1 2378 for ( unsigned i = 0; i < res; i++ )
467a2982 2379 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2380 }
36acb880 2381 }
8f4b0f43 2382 else // no destination buffer
36acb880 2383 {
8f4b0f43 2384 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2385 wchar_t tbuf[256];
36acb880 2386 res = 0;
ef199164
DS
2387
2388 do
2389 {
e8769ed1 2390 char* bufPtr = (char*)tbuf;
8f4b0f43 2391 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2392
2393 cres = iconv(m2w,
8f4b0f43
VZ
2394 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2395 &bufPtr, &dstLen );
36acb880 2396
8f4b0f43 2397 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2398 }
2399 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2400 }
dccce9ea 2401
8f4b0f43 2402 if (ICONV_FAILED(cres, srcLen))
f1339c56 2403 {
36acb880 2404 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2405 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2406 return wxCONV_FAILED;
36acb880
VZ
2407 }
2408
2409 return res;
2410}
2411
8f4b0f43
VZ
2412size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2413 const wchar_t *src, size_t srcLen) const
36acb880 2414{
b1d547eb
VS
2415#if wxUSE_THREADS
2416 // NB: explained in MB2WC
2417 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2418#endif
3698ae71 2419
8f4b0f43 2420 if ( srcLen == wxNO_LEN )
2588ee86 2421 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2422
2423 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2424 size_t outbuflen = dstLen;
36acb880 2425 size_t res, cres;
3a0d76bc 2426
36acb880 2427 wchar_t *tmpbuf = 0;
3caec1bb 2428
36acb880
VZ
2429 if (ms_wcNeedsSwap)
2430 {
2431 // need to copy to temp buffer to switch endianness
51725fc0 2432 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2433 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2434 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2435 for ( size_t i = 0; i < srcLen; i++ )
2436 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2437
8f4b0f43 2438 src = tmpbuf;
36acb880 2439 }
3a0d76bc 2440
8f4b0f43
VZ
2441 char* inbuf = (char*)src;
2442 if ( dst )
36acb880
VZ
2443 {
2444 // have destination buffer, convert there
8f4b0f43 2445 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2446
8f4b0f43 2447 res = dstLen - outbuflen;
36acb880 2448 }
8f4b0f43 2449 else // no destination buffer
36acb880 2450 {
8f4b0f43 2451 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2452 char tbuf[256];
36acb880 2453 res = 0;
ef199164
DS
2454 do
2455 {
8f4b0f43 2456 dst = tbuf;
51725fc0 2457 outbuflen = WXSIZEOF(tbuf);
36acb880 2458
8f4b0f43 2459 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2460
51725fc0 2461 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2462 }
2463 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2464 }
dccce9ea 2465
36acb880
VZ
2466 if (ms_wcNeedsSwap)
2467 {
2468 free(tmpbuf);
2469 }
dccce9ea 2470
e8769ed1 2471 if (ICONV_FAILED(cres, inbuflen))
36acb880 2472 {
ce6f8d6f 2473 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2474 return wxCONV_FAILED;
36acb880
VZ
2475 }
2476
2477 return res;
2478}
2479
7ef3ab50 2480size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2481{
c1464d9d 2482 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2483 {
2484 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2485
2486#if wxUSE_THREADS
2487 // NB: explained in MB2WC
2488 wxMutexLocker lock(self->m_iconvMutex);
2489#endif
2490
999020e1 2491 const wchar_t *wnul = L"";
c1464d9d 2492 char buf[8]; // should be enough for NUL in any encoding
356410fc 2493 size_t inLen = sizeof(wchar_t),
c1464d9d 2494 outLen = WXSIZEOF(buf);
ef199164
DS
2495 char *inBuff = (char *)wnul;
2496 char *outBuff = buf;
2497 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2498 {
c1464d9d 2499 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2500 }
2501 else // ok
2502 {
ef199164 2503 self->m_minMBCharWidth = outBuff - buf;
356410fc 2504 }
eec47cc6
VZ
2505 }
2506
c1464d9d 2507 return m_minMBCharWidth;
eec47cc6
VZ
2508}
2509
ba98e032
VS
2510#if wxUSE_UNICODE_UTF8
2511bool wxMBConv_iconv::IsUTF8() const
2512{
86501081
VS
2513 return wxStricmp(m_name, "UTF-8") == 0 ||
2514 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2515}
2516#endif
2517
b040e242 2518#endif // HAVE_ICONV
36acb880 2519
e95354ec 2520
36acb880
VZ
2521// ============================================================================
2522// Win32 conversion classes
2523// ============================================================================
1cd52418 2524
e95354ec 2525#ifdef wxHAVE_WIN32_MB2WC
373658eb 2526
8b04d4c4 2527// from utils.cpp
d775fa82 2528#if wxUSE_FONTMAP
86501081 2529extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2530extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2531#endif
373658eb 2532
e95354ec 2533class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2534{
2535public:
bde4baac
VZ
2536 wxMBConv_win32()
2537 {
2538 m_CodePage = CP_ACP;
c1464d9d 2539 m_minMBCharWidth = 0;
bde4baac
VZ
2540 }
2541
d36c9347 2542 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2543 : wxMBConv()
d36c9347
VZ
2544 {
2545 m_CodePage = conv.m_CodePage;
2546 m_minMBCharWidth = conv.m_minMBCharWidth;
2547 }
2548
7608a683 2549#if wxUSE_FONTMAP
86501081 2550 wxMBConv_win32(const char* name)
bde4baac
VZ
2551 {
2552 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2553 m_minMBCharWidth = 0;
bde4baac 2554 }
dccce9ea 2555
e95354ec 2556 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2557 {
2558 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2559 m_minMBCharWidth = 0;
bde4baac 2560 }
eec47cc6 2561#endif // wxUSE_FONTMAP
8b04d4c4 2562
d36c9347 2563 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2564 {
02272c9c
VZ
2565 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2566 // the behaviour is not compatible with the Unix version (using iconv)
2567 // and break the library itself, e.g. wxTextInputStream::NextChar()
2568 // wouldn't work if reading an incomplete MB char didn't result in an
2569 // error
667e5b3e 2570 //
89028980 2571 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2572 // Win XP or newer and it is not supported for UTF-[78] so we always
2573 // use our own conversions in this case. See
89028980
VS
2574 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2575 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2576 if ( m_CodePage == CP_UTF8 )
89028980 2577 {
5487ff0f 2578 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2579 }
830f8f11
VZ
2580
2581 if ( m_CodePage == CP_UTF7 )
2582 {
5487ff0f 2583 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2584 }
2585
2586 int flags = 0;
2587 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2588 IsAtLeastWin2kSP4() )
89028980 2589 {
830f8f11 2590 flags = MB_ERR_INVALID_CHARS;
89028980 2591 }
667e5b3e 2592
2b5f62a0
VZ
2593 const size_t len = ::MultiByteToWideChar
2594 (
2595 m_CodePage, // code page
667e5b3e 2596 flags, // flags: fall on error
2b5f62a0
VZ
2597 psz, // input string
2598 -1, // its length (NUL-terminated)
b4da152e 2599 buf, // output string
2b5f62a0
VZ
2600 buf ? n : 0 // size of output buffer
2601 );
89028980
VS
2602 if ( !len )
2603 {
2604 // function totally failed
467e0479 2605 return wxCONV_FAILED;
89028980
VS
2606 }
2607
2608 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2609 // check if we succeeded, by doing a double trip:
2610 if ( !flags && buf )
2611 {
53c174fc
VZ
2612 const size_t mbLen = strlen(psz);
2613 wxCharBuffer mbBuf(mbLen);
89028980
VS
2614 if ( ::WideCharToMultiByte
2615 (
2616 m_CodePage,
2617 0,
2618 buf,
2619 -1,
2620 mbBuf.data(),
53c174fc 2621 mbLen + 1, // size in bytes, not length
89028980
VS
2622 NULL,
2623 NULL
2624 ) == 0 ||
2625 strcmp(mbBuf, psz) != 0 )
2626 {
2627 // we didn't obtain the same thing we started from, hence
2628 // the conversion was lossy and we consider that it failed
467e0479 2629 return wxCONV_FAILED;
89028980
VS
2630 }
2631 }
2b5f62a0 2632
03a991bc
VZ
2633 // note that it returns count of written chars for buf != NULL and size
2634 // of the needed buffer for buf == NULL so in either case the length of
2635 // the string (which never includes the terminating NUL) is one less
89028980 2636 return len - 1;
f1339c56 2637 }
dccce9ea 2638
d36c9347 2639 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2640 {
13dd924a
VZ
2641 /*
2642 we have a problem here: by default, WideCharToMultiByte() may
2643 replace characters unrepresentable in the target code page with bad
2644 quality approximations such as turning "1/2" symbol (U+00BD) into
2645 "1" for the code pages which don't have it and we, obviously, want
2646 to avoid this at any price
d775fa82 2647
13dd924a
VZ
2648 the trouble is that this function does it _silently_, i.e. it won't
2649 even tell us whether it did or not... Win98/2000 and higher provide
2650 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2651 we have to resort to a round trip, i.e. check that converting back
2652 results in the same string -- this is, of course, expensive but
2653 otherwise we simply can't be sure to not garble the data.
2654 */
2655
2656 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2657 // it doesn't work with CJK encodings (which we test for rather roughly
2658 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2659 // supporting it
907173e5
WS
2660 BOOL usedDef wxDUMMY_INITIALIZE(false);
2661 BOOL *pUsedDef;
13dd924a
VZ
2662 int flags;
2663 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2664 {
2665 // it's our lucky day
2666 flags = WC_NO_BEST_FIT_CHARS;
2667 pUsedDef = &usedDef;
2668 }
2669 else // old system or unsupported encoding
2670 {
2671 flags = 0;
2672 pUsedDef = NULL;
2673 }
2674
2b5f62a0
VZ
2675 const size_t len = ::WideCharToMultiByte
2676 (
2677 m_CodePage, // code page
13dd924a
VZ
2678 flags, // either none or no best fit
2679 pwz, // input string
2b5f62a0
VZ
2680 -1, // it is (wide) NUL-terminated
2681 buf, // output buffer
2682 buf ? n : 0, // and its size
2683 NULL, // default "replacement" char
13dd924a 2684 pUsedDef // [out] was it used?
2b5f62a0
VZ
2685 );
2686
13dd924a
VZ
2687 if ( !len )
2688 {
2689 // function totally failed
467e0479 2690 return wxCONV_FAILED;
13dd924a
VZ
2691 }
2692
765bdb4a
VZ
2693 // we did something, check if we really succeeded
2694 if ( flags )
13dd924a 2695 {
765bdb4a
VZ
2696 // check if the conversion failed, i.e. if any replacements
2697 // were done
2698 if ( usedDef )
2699 return wxCONV_FAILED;
2700 }
2701 else // we must resort to double tripping...
2702 {
2703 // first we need to ensure that we really have the MB data: this is
2704 // not the case if we're called with NULL buffer, in which case we
2705 // need to do the conversion yet again
2706 wxCharBuffer bufDef;
2707 if ( !buf )
13dd924a 2708 {
765bdb4a
VZ
2709 bufDef = wxCharBuffer(len);
2710 buf = bufDef.data();
2711 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2712 buf, len, NULL, NULL) )
467e0479 2713 return wxCONV_FAILED;
13dd924a 2714 }
765bdb4a 2715
564da6ff
VZ
2716 if ( !n )
2717 n = wcslen(pwz);
765bdb4a 2718 wxWCharBuffer wcBuf(n);
564da6ff 2719 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2720 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2721 {
765bdb4a
VZ
2722 // we didn't obtain the same thing we started from, hence
2723 // the conversion was lossy and we consider that it failed
2724 return wxCONV_FAILED;
13dd924a
VZ
2725 }
2726 }
2727
03a991bc 2728 // see the comment above for the reason of "len - 1"
13dd924a 2729 return len - 1;
f1339c56 2730 }
dccce9ea 2731
7ef3ab50
VZ
2732 virtual size_t GetMBNulLen() const
2733 {
2734 if ( m_minMBCharWidth == 0 )
2735 {
2736 int len = ::WideCharToMultiByte
2737 (
2738 m_CodePage, // code page
2739 0, // no flags
2740 L"", // input string
2741 1, // translate just the NUL
2742 NULL, // output buffer
2743 0, // and its size
2744 NULL, // no replacement char
2745 NULL // [out] don't care if it was used
2746 );
2747
2748 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2749 switch ( len )
2750 {
2751 default:
9a83f860 2752 wxLogDebug(wxT("Unexpected NUL length %d"), len);
ef199164
DS
2753 self->m_minMBCharWidth = (size_t)-1;
2754 break;
7ef3ab50
VZ
2755
2756 case 0:
2757 self->m_minMBCharWidth = (size_t)-1;
2758 break;
2759
2760 case 1:
2761 case 2:
2762 case 4:
2763 self->m_minMBCharWidth = len;
2764 break;
2765 }
2766 }
2767
2768 return m_minMBCharWidth;
2769 }
2770
d36c9347
VZ
2771 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2772
13dd924a
VZ
2773 bool IsOk() const { return m_CodePage != -1; }
2774
2775private:
2776 static bool CanUseNoBestFit()
2777 {
2778 static int s_isWin98Or2k = -1;
2779
2780 if ( s_isWin98Or2k == -1 )
2781 {
2782 int verMaj, verMin;
2783 switch ( wxGetOsVersion(&verMaj, &verMin) )
2784 {
406d283a 2785 case wxOS_WINDOWS_9X:
13dd924a
VZ
2786 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2787 break;
2788
406d283a 2789 case wxOS_WINDOWS_NT:
13dd924a
VZ
2790 s_isWin98Or2k = verMaj >= 5;
2791 break;
2792
2793 default:
ef199164 2794 // unknown: be conservative by default
13dd924a 2795 s_isWin98Or2k = 0;
ef199164 2796 break;
13dd924a
VZ
2797 }
2798
9a83f860 2799 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
13dd924a
VZ
2800 }
2801
2802 return s_isWin98Or2k == 1;
2803 }
f1339c56 2804
89028980
VS
2805 static bool IsAtLeastWin2kSP4()
2806 {
8942f83a
WS
2807#ifdef __WXWINCE__
2808 return false;
2809#else
89028980
VS
2810 static int s_isAtLeastWin2kSP4 = -1;
2811
2812 if ( s_isAtLeastWin2kSP4 == -1 )
2813 {
2814 OSVERSIONINFOEX ver;
2815
2816 memset(&ver, 0, sizeof(ver));
2817 ver.dwOSVersionInfoSize = sizeof(ver);
2818 GetVersionEx((OSVERSIONINFO*)&ver);
2819
2820 s_isAtLeastWin2kSP4 =
2821 ((ver.dwMajorVersion > 5) || // Vista+
2822 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2823 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2824 ver.wServicePackMajor >= 4)) // 2000 SP4+
2825 ? 1 : 0;
2826 }
2827
2828 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2829#endif
89028980
VS
2830 }
2831
eec47cc6 2832
c1464d9d 2833 // the code page we're working with
b1d66b54 2834 long m_CodePage;
c1464d9d 2835
7ef3ab50 2836 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2837 // "unknown"
2838 size_t m_minMBCharWidth;
1cd52418 2839};
e95354ec
VZ
2840
2841#endif // wxHAVE_WIN32_MB2WC
2842
f7e98dee 2843
36acb880
VZ
2844// ============================================================================
2845// wxEncodingConverter based conversion classes
2846// ============================================================================
2847
1e6feb95 2848#if wxUSE_FONTMAP
1cd52418 2849
e95354ec 2850class wxMBConv_wxwin : public wxMBConv
1cd52418 2851{
8b04d4c4
VZ
2852private:
2853 void Init()
2854 {
6ac84a78
DE
2855 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2856 // The wxMBConv_cf class does a better job.
2857 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2858 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2859 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2860 }
2861
6001e347 2862public:
f1339c56
RR
2863 // temporarily just use wxEncodingConverter stuff,
2864 // so that it works while a better implementation is built
86501081 2865 wxMBConv_wxwin(const char* name)
f1339c56
RR
2866 {
2867 if (name)
267e11c5 2868 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2869 else
2870 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2871
8b04d4c4
VZ
2872 Init();
2873 }
2874
e95354ec 2875 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2876 {
2877 m_enc = enc;
2878
2879 Init();
f1339c56 2880 }
dccce9ea 2881
bde4baac 2882 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2883 {
2884 size_t inbuf = strlen(psz);
dccce9ea 2885 if (buf)
c643a977 2886 {
ef199164 2887 if (!m2w.Convert(psz, buf))
467e0479 2888 return wxCONV_FAILED;
c643a977 2889 }
f1339c56
RR
2890 return inbuf;
2891 }
dccce9ea 2892
bde4baac 2893 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2894 {
f8d791e0 2895 const size_t inbuf = wxWcslen(psz);
f1339c56 2896 if (buf)
c643a977 2897 {
ef199164 2898 if (!w2m.Convert(psz, buf))
467e0479 2899 return wxCONV_FAILED;
c643a977 2900 }
dccce9ea 2901
f1339c56
RR
2902 return inbuf;
2903 }
dccce9ea 2904
7ef3ab50 2905 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2906 {
2907 switch ( m_enc )
2908 {
2909 case wxFONTENCODING_UTF16BE:
2910 case wxFONTENCODING_UTF16LE:
c1464d9d 2911 return 2;
eec47cc6
VZ
2912
2913 case wxFONTENCODING_UTF32BE:
2914 case wxFONTENCODING_UTF32LE:
c1464d9d 2915 return 4;
eec47cc6
VZ
2916
2917 default:
c1464d9d 2918 return 1;
eec47cc6
VZ
2919 }
2920 }
2921
d36c9347
VZ
2922 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2923
7ef3ab50
VZ
2924 bool IsOk() const { return m_ok; }
2925
2926public:
2927 wxFontEncoding m_enc;
2928 wxEncodingConverter m2w, w2m;
2929
2930private:
cafbf6fb
VZ
2931 // were we initialized successfully?
2932 bool m_ok;
fc7a2a60 2933
c0c133e1 2934 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2935};
6001e347 2936
8f115891 2937// make the constructors available for unit testing
86501081 2938WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2939{
2940 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2941 if ( !result->IsOk() )
2942 {
2943 delete result;
2944 return 0;
2945 }
ef199164 2946
8f115891
MW
2947 return result;
2948}
2949
1e6feb95
VZ
2950#endif // wxUSE_FONTMAP
2951
36acb880
VZ
2952// ============================================================================
2953// wxCSConv implementation
2954// ============================================================================
2955
8b04d4c4 2956void wxCSConv::Init()
6001e347 2957{
e95354ec
VZ
2958 m_name = NULL;
2959 m_convReal = NULL;
6c4d607e
VZ
2960}
2961
2962void wxCSConv::SetEncoding(wxFontEncoding encoding)
2963{
2964 switch ( encoding )
2965 {
2966 case wxFONTENCODING_MAX:
2967 case wxFONTENCODING_SYSTEM:
2968 if ( m_name )
2969 {
2970 // It's ok to not have encoding value if we have a name for it.
2971 m_encoding = wxFONTENCODING_SYSTEM;
2972 }
2973 else // No name neither.
2974 {
2975 // Fall back to the system default encoding in this case (not
2976 // sure how much sense does this make but this is how the old
2977 // code used to behave).
2978#if wxUSE_INTL
2979 m_encoding = wxLocale::GetSystemEncoding();
2980 if ( m_encoding == wxFONTENCODING_SYSTEM )
2981#endif // wxUSE_INTL
2982 m_encoding = wxFONTENCODING_ISO8859_1;
2983 }
2984 break;
2985
2986 case wxFONTENCODING_DEFAULT:
2987 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2988 m_encoding = wxFONTENCODING_ISO8859_1;
2989 break;
2990
2991 default:
2992 // Just use the provided encoding.
2993 m_encoding = encoding;
2994 }
e95354ec
VZ
2995}
2996
86501081 2997wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2998{
2999 Init();
82713003 3000
86501081 3001 if ( !charset.empty() )
e95354ec 3002 {
86501081 3003 SetName(charset.ToAscii());
e95354ec 3004 }
bda3d86a 3005
e4277538 3006#if wxUSE_FONTMAP
6c4d607e 3007 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
e4277538 3008#else
6c4d607e 3009 SetEncoding(wxFONTENCODING_SYSTEM);
e4277538 3010#endif
6c4d607e
VZ
3011
3012 m_convReal = DoCreate();
6001e347
RR
3013}
3014
8b04d4c4
VZ
3015wxCSConv::wxCSConv(wxFontEncoding encoding)
3016{
bda3d86a 3017 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 3018 {
9a83f860 3019 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
3020
3021 encoding = wxFONTENCODING_SYSTEM;
3022 }
3023
8b04d4c4
VZ
3024 Init();
3025
6c4d607e
VZ
3026 SetEncoding(encoding);
3027
3028 m_convReal = DoCreate();
8b04d4c4
VZ
3029}
3030
6001e347
RR
3031wxCSConv::~wxCSConv()
3032{
65e50848
JS
3033 Clear();
3034}
3035
54380f29 3036wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 3037 : wxMBConv()
54380f29 3038{
8b04d4c4
VZ
3039 Init();
3040
54380f29 3041 SetName(conv.m_name);
6c4d607e
VZ
3042 SetEncoding(conv.m_encoding);
3043
3044 m_convReal = DoCreate();
54380f29
GD
3045}
3046
3047wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3048{
3049 Clear();
8b04d4c4 3050
54380f29 3051 SetName(conv.m_name);
6c4d607e
VZ
3052 SetEncoding(conv.m_encoding);
3053
3054 m_convReal = DoCreate();
8b04d4c4 3055
54380f29
GD
3056 return *this;
3057}
3058
65e50848
JS
3059void wxCSConv::Clear()
3060{
8b04d4c4 3061 free(m_name);
65e50848 3062 m_name = NULL;
6c4d607e
VZ
3063
3064 wxDELETE(m_convReal);
6001e347
RR
3065}
3066
86501081 3067void wxCSConv::SetName(const char *charset)
6001e347 3068{
6c4d607e 3069 if ( charset )
d6f2a891 3070 m_name = wxStrdup(charset);
6001e347
RR
3071}
3072
8b3eb85d 3073#if wxUSE_FONTMAP
8b3eb85d
VZ
3074
3075WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 3076 wxEncodingNameCache );
8b3eb85d
VZ
3077
3078static wxEncodingNameCache gs_nameCache;
3079#endif
3080
e95354ec
VZ
3081wxMBConv *wxCSConv::DoCreate() const
3082{
ce6f8d6f
VZ
3083#if wxUSE_FONTMAP
3084 wxLogTrace(TRACE_STRCONV,
3085 wxT("creating conversion for %s"),
3086 (m_name ? m_name
86501081 3087 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3088#endif // wxUSE_FONTMAP
3089
c547282d
VZ
3090 // check for the special case of ASCII or ISO8859-1 charset: as we have
3091 // special knowledge of it anyhow, we don't need to create a special
3092 // conversion object
6c4d607e 3093 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
f1339c56 3094 {
e95354ec
VZ
3095 // don't convert at all
3096 return NULL;
3097 }
dccce9ea 3098
e95354ec
VZ
3099 // we trust OS to do conversion better than we can so try external
3100 // conversion methods first
3101 //
3102 // the full order is:
3103 // 1. OS conversion (iconv() under Unix or Win32 API)
3104 // 2. hard coded conversions for UTF
3105 // 3. wxEncodingConverter as fall back
3106
3107 // step (1)
3108#ifdef HAVE_ICONV
c547282d 3109#if !wxUSE_FONTMAP
e95354ec 3110 if ( m_name )
c547282d 3111#endif // !wxUSE_FONTMAP
e95354ec 3112 {
3ef10cfc 3113#if wxUSE_FONTMAP
8b3eb85d 3114 wxFontEncoding encoding(m_encoding);
3ef10cfc 3115#endif
8b3eb85d 3116
86501081 3117 if ( m_name )
8b3eb85d 3118 {
86501081 3119 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3120 if ( conv->IsOk() )
3121 return conv;
3122
3123 delete conv;
c547282d
VZ
3124
3125#if wxUSE_FONTMAP
8b3eb85d 3126 encoding =
86501081 3127 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3128#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3129 }
3130#if wxUSE_FONTMAP
3131 {
3132 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3133 if ( it != gs_nameCache.end() )
3134 {
3135 if ( it->second.empty() )
3136 return NULL;
c547282d 3137
86501081 3138 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3139 if ( conv->IsOk() )
3140 return conv;
e95354ec 3141
8b3eb85d
VZ
3142 delete conv;
3143 }
3144
a243da29 3145 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3146 // CS : in case this does not return valid names (eg for MacRoman)
3147 // encoding got a 'failure' entry in the cache all the same,
3148 // although it just has to be created using a different method, so
3149 // only store failed iconv creation attempts (or perhaps we
3150 // shoulnd't do this at all ?)
3c67ec06 3151 if ( names[0] != NULL )
8b3eb85d 3152 {
3c67ec06 3153 for ( ; *names; ++names )
8b3eb85d 3154 {
86501081
VS
3155 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3156 // will need changes that will obsolete this
3157 wxString name(*names);
3158 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3159 if ( conv->IsOk() )
3160 {
3161 gs_nameCache[encoding] = *names;
3162 return conv;
3163 }
3164
3165 delete conv;
8b3eb85d
VZ
3166 }
3167
9a83f860 3168 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3169 }
8b3eb85d
VZ
3170 }
3171#endif // wxUSE_FONTMAP
e95354ec
VZ
3172 }
3173#endif // HAVE_ICONV
3174
3175#ifdef wxHAVE_WIN32_MB2WC
3176 {
7608a683 3177#if wxUSE_FONTMAP
e95354ec
VZ
3178 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3179 : new wxMBConv_win32(m_encoding);
3180 if ( conv->IsOk() )
3181 return conv;
3182
3183 delete conv;
7608a683
WS
3184#else
3185 return NULL;
3186#endif
e95354ec
VZ
3187 }
3188#endif // wxHAVE_WIN32_MB2WC
ef199164 3189
5c4ed98d 3190#ifdef __DARWIN__
f7e98dee 3191 {
6ff49cbc
DE
3192 // leave UTF16 and UTF32 to the built-ins of wx
3193 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3194 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3195 {
a6900d10 3196#if wxUSE_FONTMAP
5c4ed98d
DE
3197 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3198 : new wxMBConv_cf(m_encoding);
a6900d10 3199#else
5c4ed98d 3200 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3201#endif
ef199164 3202
f7e98dee 3203 if ( conv->IsOk() )
d775fa82
WS
3204 return conv;
3205
3206 delete conv;
3207 }
335d31e0 3208 }
5c4ed98d
DE
3209#endif // __DARWIN__
3210
e95354ec
VZ
3211 // step (2)
3212 wxFontEncoding enc = m_encoding;
3213#if wxUSE_FONTMAP
c547282d
VZ
3214 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3215 {
3216 // use "false" to suppress interactive dialogs -- we can be called from
3217 // anywhere and popping up a dialog from here is the last thing we want to
3218 // do
267e11c5 3219 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3220 }
e95354ec
VZ
3221#endif // wxUSE_FONTMAP
3222
3223 switch ( enc )
3224 {
3225 case wxFONTENCODING_UTF7:
3226 return new wxMBConvUTF7;
3227
3228 case wxFONTENCODING_UTF8:
3229 return new wxMBConvUTF8;
3230
e95354ec
VZ
3231 case wxFONTENCODING_UTF16BE:
3232 return new wxMBConvUTF16BE;
3233
3234 case wxFONTENCODING_UTF16LE:
3235 return new wxMBConvUTF16LE;
3236
e95354ec
VZ
3237 case wxFONTENCODING_UTF32BE:
3238 return new wxMBConvUTF32BE;
3239
3240 case wxFONTENCODING_UTF32LE:
3241 return new wxMBConvUTF32LE;
3242
3243 default:
3244 // nothing to do but put here to suppress gcc warnings
ef199164 3245 break;
e95354ec
VZ
3246 }
3247
3248 // step (3)
3249#if wxUSE_FONTMAP
3250 {
3251 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3252 : new wxMBConv_wxwin(m_encoding);
3253 if ( conv->IsOk() )
3254 return conv;
3255
3256 delete conv;
3257 }
ef199164 3258
3df31b2d
VZ
3259 wxLogTrace(TRACE_STRCONV,
3260 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3261 (m_name ? wxString(m_name)
3df31b2d
VZ
3262 : wxFontMapperBase::GetEncodingName(m_encoding)));
3263#endif // wxUSE_FONTMAP
e95354ec
VZ
3264
3265 return NULL;
3266}
3267
0f0298b1
VZ
3268bool wxCSConv::IsOk() const
3269{
0f0298b1
VZ
3270 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3271 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3272 return true; // always ok as we do it ourselves
3273
3274 // m_convReal->IsOk() is called at its own creation, so we know it must
3275 // be ok if m_convReal is non-NULL
3276 return m_convReal != NULL;
3277}
3278
1c714a5d
VZ
3279size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3280 const char *src, size_t srcLen) const
3281{
2c74c558
VS
3282 if (m_convReal)
3283 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3284
3285 // latin-1 (direct)
05392dc8
VZ
3286 if ( srcLen == wxNO_LEN )
3287 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3288
05392dc8
VZ
3289 if ( dst )
3290 {
3291 if ( dstLen < srcLen )
3292 return wxCONV_FAILED;
1c714a5d 3293
05392dc8
VZ
3294 for ( size_t n = 0; n < srcLen; n++ )
3295 dst[n] = (unsigned char)(src[n]);
3296 }
2c74c558 3297
05392dc8 3298 return srcLen;
1c714a5d
VZ
3299}
3300
05392dc8
VZ
3301size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3302 const wchar_t *src, size_t srcLen) const
6001e347 3303{
e95354ec 3304 if (m_convReal)
05392dc8 3305 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3306
3307 // latin-1 (direct)
05392dc8
VZ
3308 if ( srcLen == wxNO_LEN )
3309 srcLen = wxWcslen(src) + 1;
dccce9ea 3310
05392dc8 3311 if ( dst )
f1339c56 3312 {
05392dc8
VZ
3313 if ( dstLen < srcLen )
3314 return wxCONV_FAILED;
1cd52418 3315
05392dc8 3316 for ( size_t n = 0; n < srcLen; n++ )
24642831 3317 {
05392dc8 3318 if ( src[n] > 0xFF )
467e0479 3319 return wxCONV_FAILED;
ef199164 3320
05392dc8 3321 dst[n] = (char)src[n];
24642831 3322 }
05392dc8 3323
24642831 3324 }
05392dc8 3325 else // still need to check the input validity
24642831 3326 {
05392dc8 3327 for ( size_t n = 0; n < srcLen; n++ )
24642831 3328 {
05392dc8 3329 if ( src[n] > 0xFF )
467e0479 3330 return wxCONV_FAILED;
24642831 3331 }
f1339c56 3332 }
dccce9ea 3333
05392dc8 3334 return srcLen;
6001e347
RR
3335}
3336
7ef3ab50 3337size_t wxCSConv::GetMBNulLen() const
eec47cc6 3338{
eec47cc6 3339 if ( m_convReal )
7ef3ab50 3340 return m_convReal->GetMBNulLen();
eec47cc6 3341
ba98e032 3342 // otherwise, we are ISO-8859-1
c1464d9d 3343 return 1;
eec47cc6
VZ
3344}
3345
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// Return whatever the real converter returns from its IsUTF8() or false if
// there is no real converter, as we are ISO-8859-1 then.
bool wxCSConv::IsUTF8() const
{
    return m_convReal && m_convReal->IsUTF8();
}
#endif
3356
69c928ef
VZ
3357
3358#if wxUSE_UNICODE
3359
3360wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3361{
3362 if ( !s )
3363 return wxWCharBuffer();
3364
3365 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3366 if ( !wbuf )
5487ff0f 3367 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3368 if ( !wbuf )
3369 wbuf = wxConvISO8859_1.cMB2WX(s);
3370
3371 return wbuf;
3372}
3373
3374wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3375{
3376 if ( !ws )
3377 return wxCharBuffer();
3378
3379 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3380 if ( !buf )
3381 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3382
3383 return buf;
3384}
3385
3386#endif // wxUSE_UNICODE
f5a1953b 3387
1e50d914
VS
3388// ----------------------------------------------------------------------------
3389// globals
3390// ----------------------------------------------------------------------------
3391
3392// NB: The reason why we create converter objects in this convoluted way,
3393// using a factory function instead of global variable, is that they
3394// may be used at static initialization time (some of them are used by
3395// wxString ctors and there may be a global wxString object). In other
3396// words, possibly _before_ the converter global object would be
3397// initialized.
3398
3399#undef wxConvLibc
3400#undef wxConvUTF8
3401#undef wxConvUTF7
3402#undef wxConvLocal
3403#undef wxConvISO8859_1
3404
// Define the global converter called "name": this expands into the definition
// of the name##Ptr pointer variable initialized to NULL, of the accessor
// function wxGet_##name##Ptr() returning the address of a function-local
// static object of class impl_klass constructed with ctor_args and of a
// static variable initialized by calling this accessor.
3405#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3406 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3407 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3408 { \
3409 static impl_klass name##Obj ctor_args; \
3410 return &name##Obj; \
3411 } \
3412 /* this ensures that all global converter objects are created */ \
3413 /* by the time static initialization is done, i.e. before any */ \
3414 /* thread is launched: */ \
3415 static klass* gs_##name##instance = wxGet_##name##Ptr()
3416
// Simplified version of WX_DEFINE_GLOBAL_CONV2() for the case when the same
// class is used for both the declaration and the implementation.
3417#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3418 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3419
5c69ef61
VZ
3420#ifdef __INTELC__
3421 // disable warning "variable 'xxx' was declared but never referenced"
3422 #pragma warning(disable: 177)
3423#endif // Intel C++
3424
1e50d914
VS
3425#ifdef __WINDOWS__
3426 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3427#elif 0 // defined(__WXOSX__)
3428 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3429#else
3430 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3431#endif
3432
e1079eda
VZ
3433// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3434// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3435// provokes an error message about "not enough macro parameters"; and we
3436// can't use "()" here as the name##Obj declaration would be parsed as a
3437// function declaration then, so use a semicolon and live with an extra
3438// empty statement (and hope that no compiler warns about this)
3439WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3440WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3441
// these two converters are wxCSConv objects created for the system encoding
// and for ISO8859-1 respectively
3442WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3443WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3444
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to the
// wxConvLocal one
3445WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3446WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3447
6ac84a78 3448#ifdef __DARWIN__
8244507f
VZ
3449// It is important to use this conversion object under Darwin as it ensures
3450// that Unicode strings are (re)composed correctly even though xnu kernel uses
3451// decomposed form internally (at least for the file names).
6ac84a78 3452static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3453#endif
6ac84a78 3454
// wxConvFileName initially points to the wxMBConv_cf UTF-8 object defined
// above under Darwin and to the wxConvLibc object on the other platforms.
1e50d914 3455WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3456#ifdef __DARWIN__
1e50d914 3457 &wxConvMacUTF8DObj;
6ac84a78 3458#else // !__DARWIN__
1e50d914 3459 wxGet_wxConvLibcPtr();
6ac84a78 3460#endif // __DARWIN__/!__DARWIN__