]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Set focus to generic wxDataViewCtrl when clicking with any mouse button, not just...
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
1c193821 31#ifndef __WXWINCE__
1cd52418 32#include <errno.h>
1c193821
JS
33#endif
34
6001e347
RR
35#include <ctype.h>
36#include <string.h>
37#include <stdlib.h>
38
e95354ec 39#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
e95354ec 42 #define wxHAVE_WIN32_MB2WC
ef199164 43#endif
e95354ec 44
b040e242 45#ifdef HAVE_ICONV
373658eb 46 #include <iconv.h>
b1d547eb 47 #include "wx/thread.h"
1cd52418 48#endif
1cd52418 49
373658eb
VZ
50#include "wx/encconv.h"
51#include "wx/fontmap.h"
52
5c4ed98d 53#ifdef __DARWIN__
c933e267 54#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
55#endif //def __DARWIN__
56
ef199164 57
9a83f860 58#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 59
467e0479
VZ
60// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61// be 4 bytes
4948c2b6 62#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
63 #define WC_UTF16
64#endif
65
ef199164 66
373658eb
VZ
67// ============================================================================
68// implementation
69// ============================================================================
70
// helper function of ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL, i.e. false only if all n bytes are NUL (which is
// trivially the case for n == 0)
static bool NotAllNULs(const char *p, size_t n)
{
    // skip over the leading NUL bytes, stopping at the first non-NUL one
    while ( n && *p++ == '\0' )
        n--;

    // if any bytes remain unchecked, we stopped at a non-NUL byte
    return n != 0;
}
79
373658eb 80// ----------------------------------------------------------------------------
467e0479 81// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 82// ----------------------------------------------------------------------------
6001e347 83
c91830cb 84static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 85{
ef199164 86 if (input <= 0xffff)
4def3b35 87 {
999836aa
VZ
88 if (output)
89 *output = (wxUint16) input;
ef199164 90
4def3b35 91 return 1;
dccce9ea 92 }
ef199164 93 else if (input >= 0x110000)
4def3b35 94 {
467e0479 95 return wxCONV_FAILED;
dccce9ea
VZ
96 }
97 else
4def3b35 98 {
dccce9ea 99 if (output)
4def3b35 100 {
ef199164
DS
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 103 }
ef199164 104
4def3b35 105 return 2;
1cd52418 106 }
1cd52418
OK
107}
108
c91830cb 109static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 110{
ef199164 111 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
112 {
113 output = *input;
114 return 1;
dccce9ea 115 }
ef199164 116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
117 {
118 output = *input;
467e0479 119 return wxCONV_FAILED;
dccce9ea
VZ
120 }
121 else
4def3b35
VS
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
1cd52418
OK
126}
127
467e0479 128#ifdef WC_UTF16
35d11700
VZ
129 typedef wchar_t wxDecodeSurrogate_t;
130#else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
133
134// returns the next UTF-32 character from the wchar_t buffer and advances the
135// pointer to the character after this one
136//
137// if an invalid character is found, *pSrc is set to NULL, the caller must
138// check for this
35d11700 139static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
140{
141 wxUint32 out;
8d3dd069 142 const size_t
5c33522f 143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150}
151
f6bcfd97 152// ----------------------------------------------------------------------------
6001e347 153// wxMBConv
f6bcfd97 154// ----------------------------------------------------------------------------
2c53a80a 155
483b0434
VZ
156size_t
157wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
158 const char *src, size_t srcLen) const
6001e347 159{
483b0434 160 // although new conversion classes are supposed to implement this function
36f93678 161 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
162 // avoid to have to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
36f93678
VZ
166 //
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
6001e347 170
483b0434
VZ
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten = 0;
eec47cc6 173
c1464d9d 174 // the number of NULs terminating this string
a78c43f1 175 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 176
c1464d9d
VZ
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
180 // NULs at the end
483b0434
VZ
181 wxCharBuffer bufTmp;
182 const char *srcEnd;
467e0479 183 if ( srcLen != wxNO_LEN )
eec47cc6 184 {
c1464d9d 185 // we need to know how to find the end of this string
7ef3ab50 186 nulLen = GetMBNulLen();
483b0434
VZ
187 if ( nulLen == wxCONV_FAILED )
188 return wxCONV_FAILED;
e4e3bbb4 189
c1464d9d 190 // if there are enough NULs we can avoid the copy
483b0434 191 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
192 {
193 // make a copy in order to properly NUL-terminate the string
483b0434 194 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 195 char * const p = bufTmp.data();
483b0434
VZ
196 memcpy(p, src, srcLen);
197 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 198 *s = '\0';
483b0434
VZ
199
200 src = bufTmp;
eec47cc6 201 }
e4e3bbb4 202
483b0434
VZ
203 srcEnd = src + srcLen;
204 }
205 else // quit after the first loop iteration
206 {
207 srcEnd = NULL;
208 }
e4e3bbb4 209
36f93678
VZ
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
212 // with the result
213 //
214 // all the complication come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
bbb0ff36 217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
36f93678
VZ
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
220 // string
221 //
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
483b0434 225 for ( ;; )
eec47cc6 226 {
c1464d9d 227 // try to convert the current chunk
483b0434 228 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
229 if ( lenChunk == wxCONV_FAILED )
230 return wxCONV_FAILED;
e4e3bbb4 231
483b0434 232 dstWritten += lenChunk;
f6a02087
VZ
233 if ( !srcEnd )
234 dstWritten++;
f5fb6871 235
f6a02087 236 if ( !lenChunk )
467e0479
VZ
237 {
238 // nothing left in the input string, conversion succeeded
239 break;
240 }
241
483b0434
VZ
242 if ( dst )
243 {
244 if ( dstWritten > dstLen )
245 return wxCONV_FAILED;
246
f6a02087
VZ
247 // +1 is for trailing NUL
248 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
f6a02087
VZ
252 if ( !srcEnd )
253 dst++;
483b0434 254 }
c1464d9d 255
483b0434 256 if ( !srcEnd )
c1464d9d 257 {
467e0479 258 // we convert just one chunk in this case as this is the entire
bbb0ff36 259 // string anyhow (and we don't count the trailing NUL in this case)
c1464d9d
VZ
260 break;
261 }
eec47cc6 262
bbb0ff36
VZ
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
483b0434 266 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
483b0434 272 src += nulLen;
c1464d9d 273 }
e4e3bbb4 274
bbb0ff36
VZ
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
277 if ( src == srcEnd )
278 break;
279
280 // do count this terminator as it's inside the buffer we convert
281 dstWritten++;
282 if ( dst )
283 dst++;
284
285 src += nulLen; // skip the terminator itself
c1464d9d 286
483b0434 287 if ( src >= srcEnd )
c1464d9d
VZ
288 break;
289 }
290
483b0434 291 return dstWritten;
e4e3bbb4
RN
292}
293
483b0434
VZ
294size_t
295wxMBConv::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
e4e3bbb4 297{
483b0434
VZ
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten = 0;
e4e3bbb4 300
f6a02087
VZ
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated = srcLen == wxNO_LEN;
305
eec47cc6
VZ
306 // make a copy of the input string unless it is already properly
307 // NUL-terminated
eec47cc6 308 wxWCharBuffer bufTmp;
f6a02087 309 if ( isNulTerminated )
e4e3bbb4 310 {
483b0434 311 srcLen = wxWcslen(src) + 1;
eec47cc6 312 }
483b0434 313 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
314 {
315 // make a copy in order to properly NUL-terminate the string
483b0434 316 bufTmp = wxWCharBuffer(srcLen);
ef199164 317 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
318 src = bufTmp;
319 }
320
321 const size_t lenNul = GetMBNulLen();
322 for ( const wchar_t * const srcEnd = src + srcLen;
323 src < srcEnd;
27307233 324 src++ /* skip L'\0' too */ )
483b0434
VZ
325 {
326 // try to convert the current chunk
327 size_t lenChunk = WC2MB(NULL, src, 0);
483b0434
VZ
328 if ( lenChunk == wxCONV_FAILED )
329 return wxCONV_FAILED;
330
483b0434 331 dstWritten += lenChunk;
27307233
VZ
332
333 const wchar_t * const
334 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
335
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
338 // ourselves above
339 if ( chunkEnd < srcEnd )
f6a02087 340 dstWritten += lenNul;
483b0434
VZ
341
342 if ( dst )
343 {
344 if ( dstWritten > dstLen )
345 return wxCONV_FAILED;
346
27307233
VZ
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
351 wxCharBuffer dstBuf;
352 char *dstTmp;
353 if ( chunkEnd == srcEnd )
354 {
355 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
356 dstTmp = dstBuf.data();
357 }
358 else
359 {
360 dstTmp = dst;
361 }
362
363 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
364 return wxCONV_FAILED;
365
27307233
VZ
366 if ( dstTmp != dst )
367 {
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst, dstTmp, lenChunk);
371
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
374 break;
375 }
376
483b0434 377 dst += lenChunk;
27307233 378 if ( chunkEnd < srcEnd )
f6a02087 379 dst += lenNul;
483b0434 380 }
27307233
VZ
381
382 src = chunkEnd;
eec47cc6 383 }
e4e3bbb4 384
483b0434
VZ
385 return dstWritten;
386}
387
ef199164 388size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 389{
51725fc0 390 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 391 if ( rc != wxCONV_FAILED )
509da451
VZ
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399}
400
ef199164 401size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 402{
51725fc0 403 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 404 if ( rc != wxCONV_FAILED )
509da451 405 {
51725fc0 406 rc -= GetMBNulLen();
509da451
VZ
407 }
408
409 return rc;
410}
411
483b0434
VZ
412wxMBConv::~wxMBConv()
413{
414 // nothing to do here (necessary for Darwin linking probably)
415}
e4e3bbb4 416
483b0434
VZ
417const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418{
419 if ( psz )
eec47cc6 420 {
483b0434 421 // calculate the length of the buffer needed first
a2db25a1 422 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 423 if ( nLen != wxCONV_FAILED )
f5fb6871 424 {
483b0434 425 // now do the actual conversion
a2db25a1 426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 427
483b0434 428 // +1 for the trailing NULL
a2db25a1 429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 430 return buf;
f5fb6871 431 }
483b0434 432 }
e4e3bbb4 433
483b0434
VZ
434 return wxWCharBuffer();
435}
3698ae71 436
483b0434
VZ
437const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438{
439 if ( pwz )
440 {
a2db25a1 441 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 442 if ( nLen != wxCONV_FAILED )
483b0434 443 {
a2db25a1
VZ
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451}
e4e3bbb4 452
483b0434 453const wxWCharBuffer
ef199164 454wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 455{
ef199164 456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 457 if ( dstLen != wxCONV_FAILED )
483b0434 458 {
0dd13d21
VZ
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
f6a02087 463 wbuf.data()[dstLen] = L'\0';
ef199164 464 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
465 {
466 if ( outLen )
467e0479
VZ
467 {
468 *outLen = dstLen;
f6a02087
VZ
469
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen == wxNO_LEN )
467e0479
VZ
476 (*outLen)--;
477 }
478
483b0434
VZ
479 return wbuf;
480 }
481 }
482
483 if ( outLen )
484 *outLen = 0;
485
486 return wxWCharBuffer();
487}
488
489const wxCharBuffer
ef199164 490wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 491{
13d92ad6 492 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 493 if ( dstLen != wxCONV_FAILED )
483b0434 494 {
0dd13d21
VZ
495 const size_t nulLen = GetMBNulLen();
496
497 // as above, ensure that the buffer is always NUL-terminated, even if
498 // the input is not
499 wxCharBuffer buf(dstLen + nulLen - 1);
500 memset(buf.data() + dstLen, 0, nulLen);
ef199164 501 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
502 {
503 if ( outLen )
467e0479
VZ
504 {
505 *outLen = dstLen;
506
f6a02087 507 if ( inLen == wxNO_LEN )
467e0479 508 {
f6a02087
VZ
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
13d92ad6 511 *outLen -= nulLen;
467e0479
VZ
512 }
513 }
d32a507d 514
483b0434
VZ
515 return buf;
516 }
e4e3bbb4
RN
517 }
518
eec47cc6
VZ
519 if ( outLen )
520 *outLen = 0;
521
522 return wxCharBuffer();
e4e3bbb4
RN
523}
524
40ac5040
VZ
525const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
526{
527 const size_t srcLen = buf.length();
528 if ( srcLen )
529 {
530 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
531 if ( dstLen != wxCONV_FAILED )
532 {
533 wxWCharBuffer wbuf(dstLen);
534 wbuf.data()[dstLen] = L'\0';
535 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
536 return wbuf;
537 }
538 }
539
cfcfada9 540 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
40ac5040
VZ
541}
542
543const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
544{
545 const size_t srcLen = wbuf.length();
546 if ( srcLen )
547 {
548 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
549 if ( dstLen != wxCONV_FAILED )
550 {
551 wxCharBuffer buf(dstLen);
552 buf.data()[dstLen] = '\0';
553 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
554 return buf;
555 }
556 }
557
cfcfada9 558 return wxScopedCharBuffer::CreateNonOwned("", 0);
40ac5040
VZ
559}
560
6001e347 561// ----------------------------------------------------------------------------
bde4baac 562// wxMBConvLibc
6001e347
RR
563// ----------------------------------------------------------------------------
564
bde4baac
VZ
565size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
566{
567 return wxMB2WC(buf, psz, n);
568}
569
570size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
571{
572 return wxWC2MB(buf, psz, n);
573}
e1bfe89e
RR
574
575// ----------------------------------------------------------------------------
532d575b 576// wxConvBrokenFileNames
e1bfe89e
RR
577// ----------------------------------------------------------------------------
578
eec47cc6
VZ
579#ifdef __UNIX__
580
86501081 581wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 582{
9a83f860
VZ
583 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
584 wxStricmp(charset, wxT("UTF8")) == 0 )
5deedd6e 585 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
586 else
587 m_conv = new wxCSConv(charset);
ea8ce907
RR
588}
589
eec47cc6 590#endif // __UNIX__
c12b7f79 591
bde4baac 592// ----------------------------------------------------------------------------
3698ae71 593// UTF-7
bde4baac 594// ----------------------------------------------------------------------------
6001e347 595
15f2ee32 596// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
597//
598// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 599
//
// BASE64 decoding table
//
// maps each byte value to the 6 bit value it encodes ('A'-'Z' -> 0x00-0x19,
// 'a'-'z' -> 0x1a-0x33, '0'-'9' -> 0x34-0x3d, '+' -> 0x3e, '/' -> 0x3f) or to
// 0xff if the byte is not a valid base64 character
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
638
9d653e81
VZ
639size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
640 const char *src, size_t srcLen) const
15f2ee32 641{
9d653e81 642 DecoderState stateOrig,
852dcba5 643 *statePtr;
9d653e81
VZ
644 if ( srcLen == wxNO_LEN )
645 {
646 // convert the entire string, up to and including the trailing NUL
647 srcLen = strlen(src) + 1;
648
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr = &stateOrig;
652 }
653 else // when working with partial strings we do use the shift state
654 {
5c33522f 655 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
656
657 // also save the old state to be able to rollback to it on error
658 stateOrig = m_stateDecoder;
659 }
660
661 // but to simplify the code below we use this variable in both cases
662 DecoderState& state = *statePtr;
663
664
665 // number of characters [which would have been] written to dst [if it were
666 // not NULL]
15f2ee32
RN
667 size_t len = 0;
668
9d653e81
VZ
669 const char * const srcEnd = src + srcLen;
670
671 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 672 {
9d653e81
VZ
673 const unsigned char cc = *src++;
674
675 if ( state.IsShifted() )
15f2ee32 676 {
9d653e81
VZ
677 const unsigned char dc = utf7unb64[cc];
678 if ( dc == 0xff )
15f2ee32 679 {
ccaa848d
VZ
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
684 // characters)
685 if ( state.isLSB || state.bit > 4 ||
686 (state.accum & ((1 << state.bit) - 1)) )
687 {
688 if ( !len )
689 state = stateOrig;
690
852dcba5 691 return wxCONV_FAILED;
ccaa848d 692 }
852dcba5 693
9d653e81
VZ
694 state.ToDirect();
695
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
698 if ( cc == '-' )
699 continue;
700 }
701 else // valid encoded character
702 {
703 // mini base64 decoder: each character is 6 bits
704 state.bit += 6;
705 state.accum <<= 6;
706 state.accum += dc;
707
708 if ( state.bit >= 8 )
15f2ee32 709 {
9d653e81
VZ
710 // got the full byte, consume it
711 state.bit -= 8;
712 unsigned char b = (state.accum >> state.bit) & 0x00ff;
713
714 if ( state.isLSB )
15f2ee32 715 {
9d653e81
VZ
716 // we've got the full word, output it
717 if ( dst )
718 *dst++ = (state.msb << 8) | b;
719 len++;
720 state.isLSB = false;
15f2ee32 721 }
9d653e81 722 else // MSB
04a37834 723 {
9d653e81
VZ
724 // just store it while we wait for LSB
725 state.msb = b;
726 state.isLSB = true;
04a37834 727 }
15f2ee32
RN
728 }
729 }
9d653e81 730 }
04a37834 731
9d653e81
VZ
732 if ( state.IsDirect() )
733 {
734 // start of an encoded segment?
735 if ( cc == '+' )
04a37834 736 {
9d653e81
VZ
737 if ( *src == '-' )
738 {
739 // just the encoded plus sign, don't switch to shifted mode
740 if ( dst )
741 *dst++ = '+';
742 len++;
743 src++;
744 }
ccaa848d
VZ
745 else if ( utf7unb64[(unsigned)*src] == 0xff )
746 {
747 // empty encoded chunks are not allowed
748 if ( !len )
749 state = stateOrig;
750
751 return wxCONV_FAILED;
752 }
753 else // base-64 encoded chunk follows
9d653e81
VZ
754 {
755 state.ToShifted();
756 }
757 }
758 else // not '+'
759 {
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc >= 0x7f || (cc < ' ' &&
763 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
764 return wxCONV_FAILED;
765
766 if ( dst )
767 *dst++ = cc;
768 len++;
769 }
15f2ee32
RN
770 }
771 }
04a37834 772
9d653e81
VZ
773 if ( !len )
774 {
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
777 // state
778 state = stateOrig;
779
780 return wxCONV_FAILED;
781 }
04a37834 782
15f2ee32 783 return len;
6001e347
RR
784}
785
//
// BASE64 encoding table
//
// maps each 6 bit value to the character representing it
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
800
//
// UTF-7 encoding table
//
// gives the class of each 7 bit ASCII character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
820
9d653e81
VZ
821static inline bool wxIsUTF7Direct(wchar_t wc)
822{
823 return wc < 0x80 && utf7encode[wc] < 1;
824}
825
826size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
827 const wchar_t *src, size_t srcLen) const
15f2ee32 828{
9d653e81
VZ
829 EncoderState stateOrig,
830 *statePtr;
831 if ( srcLen == wxNO_LEN )
832 {
833 // we don't apply the stored state when operating on entire strings at
834 // once
835 statePtr = &stateOrig;
836
837 srcLen = wxWcslen(src) + 1;
838 }
839 else // do use the mode we left the output in previously
840 {
841 stateOrig = m_stateEncoder;
5c33522f 842 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
843 }
844
845 EncoderState& state = *statePtr;
846
847
15f2ee32
RN
848 size_t len = 0;
849
9d653e81
VZ
850 const wchar_t * const srcEnd = src + srcLen;
851 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 852 {
9d653e81
VZ
853 wchar_t cc = *src++;
854 if ( wxIsUTF7Direct(cc) )
15f2ee32 855 {
9d653e81
VZ
856 if ( state.IsShifted() )
857 {
858 // pad with zeros the last encoded block if necessary
859 if ( state.bit )
860 {
861 if ( dst )
862 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
863 len++;
864 }
ef199164 865
9d653e81
VZ
866 state.ToDirect();
867
868 if ( dst )
869 *dst++ = '-';
870 len++;
871 }
872
873 if ( dst )
874 *dst++ = (char)cc;
15f2ee32
RN
875 len++;
876 }
9d653e81
VZ
877 else if ( cc == '+' && state.IsDirect() )
878 {
879 if ( dst )
880 {
881 *dst++ = '+';
882 *dst++ = '-';
883 }
884
885 len += 2;
886 }
15f2ee32 887#ifndef WC_UTF16
79c78d42 888 else if (((wxUint32)cc) > 0xffff)
b2c13097 889 {
15f2ee32 890 // no surrogate pair generation (yet?)
467e0479 891 return wxCONV_FAILED;
15f2ee32
RN
892 }
893#endif
894 else
895 {
9d653e81
VZ
896 if ( state.IsDirect() )
897 {
898 state.ToShifted();
ef199164 899
9d653e81
VZ
900 if ( dst )
901 *dst++ = '+';
902 len++;
903 }
904
905 // BASE64 encode string
906 for ( ;; )
15f2ee32 907 {
9d653e81 908 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 909 {
9d653e81
VZ
910 state.accum <<= 8;
911 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
912
913 for (state.bit += 8; state.bit >= 6; )
15f2ee32 914 {
9d653e81
VZ
915 state.bit -= 6;
916 if ( dst )
917 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
918 len++;
15f2ee32 919 }
15f2ee32 920 }
ef199164 921
9d653e81
VZ
922 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
923 break;
ef199164 924
9d653e81 925 src++;
15f2ee32 926 }
15f2ee32
RN
927 }
928 }
ef199164 929
9d653e81
VZ
930 // we need to restore the original encoder state if we were called just to
931 // calculate the amount of space needed as we will presumably be called
932 // again to really convert the data now
933 if ( !dst )
934 state = stateOrig;
ef199164 935
15f2ee32 936 return len;
6001e347
RR
937}
938
f6bcfd97 939// ----------------------------------------------------------------------------
6001e347 940// UTF-8
f6bcfd97 941// ----------------------------------------------------------------------------
6001e347 942
1774c3c5 943static const wxUint32 utf8_max[]=
4def3b35 944 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 945
3698ae71
VZ
946// boundaries of the private use area we use to (temporarily) remap invalid
947// characters invalid in a UTF-8 encoded string
ea8ce907
RR
948const wxUint32 wxUnicodePUA = 0x100000;
949const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
950
// this table gives the length of the UTF-8 encoding from its first character
// or 0 if the byte can't start a valid UTF-8 sequence:
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
985
986size_t
987wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989{
990 wchar_t *out = dstLen ? dst : NULL;
991 size_t written = 0;
992
993 if ( srcLen == wxNO_LEN )
994 srcLen = strlen(src) + 1;
995
996 for ( const char *p = src; ; p++ )
997 {
998 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
999 {
1000 // all done successfully, just add the trailing NULL if we are not
1001 // using explicit length
1002 if ( srcLen == wxNO_LEN )
1003 {
1004 if ( out )
1005 {
1006 if ( !dstLen )
1007 break;
1008
1009 *out = L'\0';
1010 }
1011
1012 written++;
1013 }
1014
1015 return written;
1016 }
1017
0286d08d
VZ
1018 if ( out && !dstLen-- )
1019 break;
1020
5367a38a
VS
1021 wxUint32 code;
1022 unsigned char c = *p;
0286d08d 1023
5367a38a
VS
1024 if ( c < 0x80 )
1025 {
1026 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1027 break;
0286d08d 1028
5367a38a
VS
1029 if ( srcLen != wxNO_LEN )
1030 srcLen--;
0286d08d 1031
5367a38a
VS
1032 code = c;
1033 }
1034 else
0286d08d 1035 {
5367a38a
VS
1036 unsigned len = tableUtf8Lengths[c];
1037 if ( !len )
1038 break;
1039
1040 if ( srcLen < len ) // the test works for wxNO_LEN too
1041 break;
1042
1043 if ( srcLen != wxNO_LEN )
1044 srcLen -= len;
1045
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1053 //
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1057
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1059 // length:
1060 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1061
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1065
1066 len--; // it's more convenient to work with 0-based length here
1067
1068 // extract the lead byte's value bits:
1069 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1070 break;
1071
1072 code = c & leadValueMask[len];
1073
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len; --len )
1077 {
1078 c = *++p;
1079 if ( (c & 0xC0) != 0x80 )
1080 return wxCONV_FAILED;
0286d08d 1081
5367a38a
VS
1082 code <<= 6;
1083 code |= c & 0x3F;
1084 }
0286d08d
VZ
1085 }
1086
1087#ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1090 {
1091 if ( out )
1092 out++;
1093 written++;
1094 }
1095#else // !WC_UTF16
1096 if ( out )
1097 *out = code;
1098#endif // WC_UTF16/!WC_UTF16
1099
1100 if ( out )
1101 out++;
1102
1103 written++;
1104 }
1105
1106 return wxCONV_FAILED;
1107}
1108
1109size_t
1110wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111 const wchar_t *src, size_t srcLen) const
1112{
1113 char *out = dstLen ? dst : NULL;
1114 size_t written = 0;
1115
1116 for ( const wchar_t *wp = src; ; wp++ )
1117 {
a964d3ed 1118 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
0286d08d
VZ
1119 {
1120 // all done successfully, just add the trailing NULL if we are not
1121 // using explicit length
1122 if ( srcLen == wxNO_LEN )
1123 {
1124 if ( out )
1125 {
1126 if ( !dstLen )
1127 break;
1128
1129 *out = '\0';
1130 }
1131
1132 written++;
1133 }
1134
1135 return written;
1136 }
1137
a964d3ed
VZ
1138 if ( srcLen != wxNO_LEN )
1139 srcLen--;
0286d08d
VZ
1140
1141 wxUint32 code;
1142#ifdef WC_UTF16
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145 {
1146 // skip the next char too as we decoded a surrogate
1147 wp++;
1148 }
1149#else // wchar_t is UTF-32
1150 code = *wp & 0x7fffffff;
1151#endif
1152
1153 unsigned len;
1154 if ( code <= 0x7F )
1155 {
1156 len = 1;
1157 if ( out )
1158 {
1159 if ( dstLen < len )
1160 break;
1161
1162 out[0] = (char)code;
1163 }
1164 }
1165 else if ( code <= 0x07FF )
1166 {
1167 len = 2;
1168 if ( out )
1169 {
1170 if ( dstLen < len )
1171 break;
1172
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1176 out[0] = 0xC0 | code;
1177 }
1178 }
1179 else if ( code < 0xFFFF )
1180 {
1181 len = 3;
1182 if ( out )
1183 {
1184 if ( dstLen < len )
1185 break;
1186
1187 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1188 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[0] = 0xE0 | code;
1190 }
1191 }
1192 else if ( code <= 0x10FFFF )
1193 {
1194 len = 4;
1195 if ( out )
1196 {
1197 if ( dstLen < len )
1198 break;
1199
1200 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1201 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[0] = 0xF0 | code;
1204 }
1205 }
1206 else
1207 {
9a83f860 1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1209 break;
1210 }
1211
1212 if ( out )
1213 {
1214 out += len;
1215 dstLen -= len;
1216 }
1217
1218 written += len;
1219 }
1220
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED;
1223}
1224
d16d0917
VZ
1225size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1226 const char *psz, size_t srcLen) const
6001e347 1227{
0286d08d 1228 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1229 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1230
4def3b35
VS
1231 size_t len = 0;
1232
d16d0917 1233 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1234 {
ea8ce907
RR
1235 const char *opsz = psz;
1236 bool invalid = false;
4def3b35
VS
1237 unsigned char cc = *psz++, fc = cc;
1238 unsigned cnt;
dccce9ea 1239 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1240 fc <<= 1;
ef199164 1241
dccce9ea 1242 if (!cnt)
4def3b35
VS
1243 {
1244 // plain ASCII char
dccce9ea 1245 if (buf)
4def3b35
VS
1246 *buf++ = cc;
1247 len++;
561488ef
MW
1248
1249 // escape the escape character for octal escapes
1250 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1251 && cc == '\\' && (!buf || len < n))
1252 {
1253 if (buf)
1254 *buf++ = cc;
1255 len++;
1256 }
dccce9ea
VZ
1257 }
1258 else
4def3b35
VS
1259 {
1260 cnt--;
dccce9ea 1261 if (!cnt)
4def3b35
VS
1262 {
1263 // invalid UTF-8 sequence
ea8ce907 1264 invalid = true;
dccce9ea
VZ
1265 }
1266 else
4def3b35
VS
1267 {
1268 unsigned ocnt = cnt - 1;
1269 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1270 while (cnt--)
4def3b35 1271 {
ea8ce907 1272 cc = *psz;
dccce9ea 1273 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1274 {
1275 // invalid UTF-8 sequence
ea8ce907
RR
1276 invalid = true;
1277 break;
4def3b35 1278 }
ef199164 1279
ea8ce907 1280 psz++;
4def3b35
VS
1281 res = (res << 6) | (cc & 0x3f);
1282 }
ef199164 1283
ea8ce907 1284 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1285 {
1286 // illegal UTF-8 encoding
ea8ce907 1287 invalid = true;
4def3b35 1288 }
ea8ce907
RR
1289 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1290 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1291 {
1292 // if one of our PUA characters turns up externally
1293 // it must also be treated as an illegal sequence
1294 // (a bit like you have to escape an escape character)
1295 invalid = true;
1296 }
1297 else
1298 {
1cd52418 1299#ifdef WC_UTF16
0286d08d 1300 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1301 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1302 if (pa == wxCONV_FAILED)
ea8ce907
RR
1303 {
1304 invalid = true;
1305 }
1306 else
1307 {
1308 if (buf)
1309 buf += pa;
1310 len += pa;
1311 }
373658eb 1312#else // !WC_UTF16
ea8ce907 1313 if (buf)
38d4b1e4 1314 *buf++ = (wchar_t)res;
ea8ce907 1315 len++;
373658eb 1316#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1317 }
1318 }
ef199164 1319
ea8ce907
RR
1320 if (invalid)
1321 {
1322 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1323 {
1324 while (opsz < psz && (!buf || len < n))
1325 {
1326#ifdef WC_UTF16
1327 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1328 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1329 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1330 if (buf)
1331 buf += pa;
1332 opsz++;
1333 len += pa;
1334#else
1335 if (buf)
38d4b1e4 1336 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1337 opsz++;
1338 len++;
1339#endif
1340 }
1341 }
3698ae71 1342 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1343 {
1344 while (opsz < psz && (!buf || len < n))
1345 {
3698ae71
VZ
1346 if ( buf && len + 3 < n )
1347 {
17a1ebd1 1348 unsigned char on = *opsz;
3698ae71 1349 *buf++ = L'\\';
17a1ebd1
VZ
1350 *buf++ = (wchar_t)( L'0' + on / 0100 );
1351 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1352 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1353 }
ef199164 1354
ea8ce907
RR
1355 opsz++;
1356 len += 4;
1357 }
1358 }
3698ae71 1359 else // MAP_INVALID_UTF8_NOT
ea8ce907 1360 {
467e0479 1361 return wxCONV_FAILED;
ea8ce907 1362 }
4def3b35
VS
1363 }
1364 }
6001e347 1365 }
ef199164 1366
d16d0917 1367 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1368 *buf = 0;
ef199164 1369
d16d0917 1370 return len + 1;
6001e347
RR
1371}
1372
3698ae71
VZ
// Return true if the given wide character is an octal digit (L'0'..L'7').
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1377
d16d0917
VZ
1378size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1379 const wchar_t *psz, size_t srcLen) const
6001e347 1380{
0286d08d 1381 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1382 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1383
4def3b35 1384 size_t len = 0;
6001e347 1385
d16d0917 1386 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1387 {
1388 wxUint32 cc;
ef199164 1389
1cd52418 1390#ifdef WC_UTF16
b5153fd8
VZ
1391 // cast is ok for WC_UTF16
1392 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1393 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1394#else
ef199164 1395 cc = (*psz++) & 0x7fffffff;
4def3b35 1396#endif
3698ae71
VZ
1397
1398 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1399 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1400 {
dccce9ea 1401 if (buf)
ea8ce907 1402 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1403 len++;
3698ae71 1404 }
561488ef
MW
1405 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1406 && cc == L'\\' && psz[0] == L'\\' )
1407 {
1408 if (buf)
1409 *buf++ = (char)cc;
1410 psz++;
1411 len++;
1412 }
3698ae71
VZ
1413 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1414 cc == L'\\' &&
1415 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1416 {
dccce9ea 1417 if (buf)
3698ae71 1418 {
ef199164
DS
1419 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1420 (psz[1] - L'0') * 010 +
b2c13097 1421 (psz[2] - L'0'));
3698ae71
VZ
1422 }
1423
1424 psz += 3;
ea8ce907
RR
1425 len++;
1426 }
1427 else
1428 {
1429 unsigned cnt;
ef199164
DS
1430 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1431 {
1432 }
1433
ea8ce907 1434 if (!cnt)
4def3b35 1435 {
ea8ce907
RR
1436 // plain ASCII char
1437 if (buf)
1438 *buf++ = (char) cc;
1439 len++;
1440 }
ea8ce907
RR
1441 else
1442 {
1443 len += cnt + 1;
1444 if (buf)
1445 {
1446 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1447 while (cnt--)
1448 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1449 }
4def3b35
VS
1450 }
1451 }
6001e347 1452 }
4def3b35 1453
d16d0917 1454 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1455 *buf = 0;
adb45366 1456
d16d0917 1457 return len + 1;
6001e347
RR
1458}
1459
467e0479 1460// ============================================================================
c91830cb 1461// UTF-16
467e0479 1462// ============================================================================
c91830cb
VZ
1463
// the "straight" converter is the one copying the 16 bit units unchanged and
// the "swap" one is the one swapping their bytes: which of the LE and BE
// classes plays which role depends on WORDS_BIGENDIAN
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1471
467e0479
VZ
1472/* static */
1473size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1474{
1475 if ( srcLen == wxNO_LEN )
1476 {
1477 // count the number of bytes in input, including the trailing NULs
5c33522f 1478 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1479 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1480 ;
c91830cb 1481
467e0479
VZ
1482 srcLen *= BYTES_PER_CHAR;
1483 }
1484 else // we already have the length
1485 {
1486 // we can only convert an entire number of UTF-16 characters
1487 if ( srcLen % BYTES_PER_CHAR )
1488 return wxCONV_FAILED;
1489 }
1490
1491 return srcLen;
1492}
1493
1494// case when in-memory representation is UTF-16 too
c91830cb
VZ
1495#ifdef WC_UTF16
1496
467e0479
VZ
1497// ----------------------------------------------------------------------------
1498// conversions without endianness change
1499// ----------------------------------------------------------------------------
1500
1501size_t
1502wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1503 const char *src, size_t srcLen) const
c91830cb 1504{
467e0479
VZ
1505 // set up the scene for using memcpy() (which is presumably more efficient
1506 // than copying the bytes one by one)
1507 srcLen = GetLength(src, srcLen);
1508 if ( srcLen == wxNO_LEN )
1509 return wxCONV_FAILED;
c91830cb 1510
ef199164 1511 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1512 if ( dst )
c91830cb 1513 {
467e0479
VZ
1514 if ( dstLen < inLen )
1515 return wxCONV_FAILED;
c91830cb 1516
467e0479 1517 memcpy(dst, src, srcLen);
c91830cb 1518 }
d32a507d 1519
467e0479 1520 return inLen;
c91830cb
VZ
1521}
1522
467e0479
VZ
1523size_t
1524wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1525 const wchar_t *src, size_t srcLen) const
c91830cb 1526{
467e0479
VZ
1527 if ( srcLen == wxNO_LEN )
1528 srcLen = wxWcslen(src) + 1;
c91830cb 1529
467e0479
VZ
1530 srcLen *= BYTES_PER_CHAR;
1531
1532 if ( dst )
c91830cb 1533 {
467e0479
VZ
1534 if ( dstLen < srcLen )
1535 return wxCONV_FAILED;
d32a507d 1536
467e0479 1537 memcpy(dst, src, srcLen);
c91830cb 1538 }
d32a507d 1539
467e0479 1540 return srcLen;
c91830cb
VZ
1541}
1542
467e0479
VZ
1543// ----------------------------------------------------------------------------
1544// endian-reversing conversions
1545// ----------------------------------------------------------------------------
c91830cb 1546
467e0479
VZ
1547size_t
1548wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1549 const char *src, size_t srcLen) const
c91830cb 1550{
467e0479
VZ
1551 srcLen = GetLength(src, srcLen);
1552 if ( srcLen == wxNO_LEN )
1553 return wxCONV_FAILED;
c91830cb 1554
467e0479
VZ
1555 srcLen /= BYTES_PER_CHAR;
1556
1557 if ( dst )
c91830cb 1558 {
467e0479
VZ
1559 if ( dstLen < srcLen )
1560 return wxCONV_FAILED;
1561
5c33522f 1562 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1563 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1564 {
ef199164 1565 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1566 }
c91830cb 1567 }
bfab25d4 1568
467e0479 1569 return srcLen;
c91830cb
VZ
1570}
1571
467e0479
VZ
1572size_t
1573wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1574 const wchar_t *src, size_t srcLen) const
c91830cb 1575{
467e0479
VZ
1576 if ( srcLen == wxNO_LEN )
1577 srcLen = wxWcslen(src) + 1;
c91830cb 1578
467e0479
VZ
1579 srcLen *= BYTES_PER_CHAR;
1580
1581 if ( dst )
c91830cb 1582 {
467e0479
VZ
1583 if ( dstLen < srcLen )
1584 return wxCONV_FAILED;
1585
5c33522f 1586 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1587 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1588 {
ef199164 1589 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1590 }
c91830cb 1591 }
eec47cc6 1592
467e0479 1593 return srcLen;
c91830cb
VZ
1594}
1595
467e0479 1596#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1597
467e0479
VZ
1598// ----------------------------------------------------------------------------
1599// conversions without endianness change
1600// ----------------------------------------------------------------------------
c91830cb 1601
35d11700
VZ
1602size_t
1603wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1604 const char *src, size_t srcLen) const
c91830cb 1605{
35d11700
VZ
1606 srcLen = GetLength(src, srcLen);
1607 if ( srcLen == wxNO_LEN )
1608 return wxCONV_FAILED;
c91830cb 1609
ef199164 1610 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1611 if ( !dst )
c91830cb 1612 {
35d11700
VZ
1613 // optimization: return maximal space which could be needed for this
1614 // string even if the real size could be smaller if the buffer contains
1615 // any surrogates
1616 return inLen;
c91830cb 1617 }
c91830cb 1618
35d11700 1619 size_t outLen = 0;
5c33522f 1620 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1621 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1622 {
ef199164
DS
1623 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1624 if ( !inBuff )
35d11700
VZ
1625 return wxCONV_FAILED;
1626
1627 if ( ++outLen > dstLen )
1628 return wxCONV_FAILED;
c91830cb 1629
35d11700
VZ
1630 *dst++ = ch;
1631 }
1632
1633
1634 return outLen;
1635}
c91830cb 1636
35d11700
VZ
1637size_t
1638wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1639 const wchar_t *src, size_t srcLen) const
c91830cb 1640{
35d11700
VZ
1641 if ( srcLen == wxNO_LEN )
1642 srcLen = wxWcslen(src) + 1;
c91830cb 1643
35d11700 1644 size_t outLen = 0;
5c33522f 1645 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1646 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1647 {
1648 wxUint16 cc[2];
35d11700
VZ
1649 const size_t numChars = encode_utf16(*src++, cc);
1650 if ( numChars == wxCONV_FAILED )
1651 return wxCONV_FAILED;
c91830cb 1652
ef199164
DS
1653 outLen += numChars * BYTES_PER_CHAR;
1654 if ( outBuff )
c91830cb 1655 {
35d11700
VZ
1656 if ( outLen > dstLen )
1657 return wxCONV_FAILED;
1658
ef199164 1659 *outBuff++ = cc[0];
35d11700 1660 if ( numChars == 2 )
69b80d28 1661 {
35d11700 1662 // second character of a surrogate
ef199164 1663 *outBuff++ = cc[1];
69b80d28 1664 }
c91830cb 1665 }
c91830cb 1666 }
c91830cb 1667
35d11700 1668 return outLen;
c91830cb
VZ
1669}
1670
467e0479
VZ
1671// ----------------------------------------------------------------------------
1672// endian-reversing conversions
1673// ----------------------------------------------------------------------------
c91830cb 1674
35d11700
VZ
1675size_t
1676wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1677 const char *src, size_t srcLen) const
c91830cb 1678{
35d11700
VZ
1679 srcLen = GetLength(src, srcLen);
1680 if ( srcLen == wxNO_LEN )
1681 return wxCONV_FAILED;
1682
ef199164 1683 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1684 if ( !dst )
1685 {
1686 // optimization: return maximal space which could be needed for this
1687 // string even if the real size could be smaller if the buffer contains
1688 // any surrogates
1689 return inLen;
1690 }
c91830cb 1691
35d11700 1692 size_t outLen = 0;
5c33522f 1693 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1694 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1695 {
35d11700
VZ
1696 wxUint32 ch;
1697 wxUint16 tmp[2];
ef199164
DS
1698
1699 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1700 inBuff++;
1701 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1702
35d11700
VZ
1703 const size_t numChars = decode_utf16(tmp, ch);
1704 if ( numChars == wxCONV_FAILED )
1705 return wxCONV_FAILED;
c91830cb 1706
35d11700 1707 if ( numChars == 2 )
ef199164 1708 inBuff++;
35d11700
VZ
1709
1710 if ( ++outLen > dstLen )
1711 return wxCONV_FAILED;
c91830cb 1712
35d11700 1713 *dst++ = ch;
c91830cb 1714 }
c91830cb 1715
c91830cb 1716
35d11700
VZ
1717 return outLen;
1718}
c91830cb 1719
35d11700
VZ
1720size_t
1721wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1722 const wchar_t *src, size_t srcLen) const
c91830cb 1723{
35d11700
VZ
1724 if ( srcLen == wxNO_LEN )
1725 srcLen = wxWcslen(src) + 1;
c91830cb 1726
35d11700 1727 size_t outLen = 0;
5c33522f 1728 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1729 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1730 {
1731 wxUint16 cc[2];
35d11700
VZ
1732 const size_t numChars = encode_utf16(*src, cc);
1733 if ( numChars == wxCONV_FAILED )
1734 return wxCONV_FAILED;
c91830cb 1735
ef199164
DS
1736 outLen += numChars * BYTES_PER_CHAR;
1737 if ( outBuff )
c91830cb 1738 {
35d11700
VZ
1739 if ( outLen > dstLen )
1740 return wxCONV_FAILED;
1741
ef199164 1742 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1743 if ( numChars == 2 )
c91830cb 1744 {
35d11700 1745 // second character of a surrogate
ef199164 1746 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1747 }
1748 }
c91830cb 1749 }
c91830cb 1750
35d11700 1751 return outLen;
c91830cb
VZ
1752}
1753
467e0479 1754#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1755
1756
35d11700 1757// ============================================================================
c91830cb 1758// UTF-32
35d11700 1759// ============================================================================
c91830cb
VZ
1760
1761#ifdef WORDS_BIGENDIAN
467e0479
VZ
1762 #define wxMBConvUTF32straight wxMBConvUTF32BE
1763 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1764#else
467e0479
VZ
1765 #define wxMBConvUTF32swap wxMBConvUTF32BE
1766 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1767#endif
1768
1769
1770WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1771WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1772
467e0479
VZ
1773/* static */
1774size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1775{
1776 if ( srcLen == wxNO_LEN )
1777 {
1778 // count the number of bytes in input, including the trailing NULs
5c33522f 1779 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1780 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1781 ;
c91830cb 1782
467e0479
VZ
1783 srcLen *= BYTES_PER_CHAR;
1784 }
1785 else // we already have the length
1786 {
1787 // we can only convert an entire number of UTF-32 characters
1788 if ( srcLen % BYTES_PER_CHAR )
1789 return wxCONV_FAILED;
1790 }
1791
1792 return srcLen;
1793}
1794
1795// case when in-memory representation is UTF-16
c91830cb
VZ
1796#ifdef WC_UTF16
1797
467e0479
VZ
1798// ----------------------------------------------------------------------------
1799// conversions without endianness change
1800// ----------------------------------------------------------------------------
1801
1802size_t
1803wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1804 const char *src, size_t srcLen) const
c91830cb 1805{
467e0479
VZ
1806 srcLen = GetLength(src, srcLen);
1807 if ( srcLen == wxNO_LEN )
1808 return wxCONV_FAILED;
c91830cb 1809
5c33522f 1810 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1811 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1812 size_t outLen = 0;
1813 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1814 {
1815 wxUint16 cc[2];
ef199164 1816 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1817 if ( numChars == wxCONV_FAILED )
1818 return wxCONV_FAILED;
c91830cb 1819
467e0479
VZ
1820 outLen += numChars;
1821 if ( dst )
c91830cb 1822 {
467e0479
VZ
1823 if ( outLen > dstLen )
1824 return wxCONV_FAILED;
d32a507d 1825
467e0479
VZ
1826 *dst++ = cc[0];
1827 if ( numChars == 2 )
1828 {
1829 // second character of a surrogate
1830 *dst++ = cc[1];
1831 }
1832 }
c91830cb 1833 }
d32a507d 1834
467e0479 1835 return outLen;
c91830cb
VZ
1836}
1837
467e0479
VZ
1838size_t
1839wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1840 const wchar_t *src, size_t srcLen) const
c91830cb 1841{
467e0479
VZ
1842 if ( srcLen == wxNO_LEN )
1843 srcLen = wxWcslen(src) + 1;
c91830cb 1844
467e0479 1845 if ( !dst )
c91830cb 1846 {
467e0479
VZ
1847 // optimization: return maximal space which could be needed for this
1848 // string instead of the exact amount which could be less if there are
1849 // any surrogates in the input
1850 //
1851 // we consider that surrogates are rare enough to make it worthwhile to
1852 // avoid running the loop below at the cost of slightly extra memory
1853 // consumption
ef199164 1854 return srcLen * BYTES_PER_CHAR;
467e0479 1855 }
c91830cb 1856
5c33522f 1857 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1858 size_t outLen = 0;
1859 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1860 {
1861 const wxUint32 ch = wxDecodeSurrogate(&src);
1862 if ( !src )
1863 return wxCONV_FAILED;
c91830cb 1864
467e0479 1865 outLen += BYTES_PER_CHAR;
d32a507d 1866
467e0479
VZ
1867 if ( outLen > dstLen )
1868 return wxCONV_FAILED;
b5153fd8 1869
ef199164 1870 *outBuff++ = ch;
467e0479 1871 }
c91830cb 1872
467e0479 1873 return outLen;
c91830cb
VZ
1874}
1875
467e0479
VZ
1876// ----------------------------------------------------------------------------
1877// endian-reversing conversions
1878// ----------------------------------------------------------------------------
c91830cb 1879
467e0479
VZ
1880size_t
1881wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1882 const char *src, size_t srcLen) const
c91830cb 1883{
467e0479
VZ
1884 srcLen = GetLength(src, srcLen);
1885 if ( srcLen == wxNO_LEN )
1886 return wxCONV_FAILED;
c91830cb 1887
5c33522f 1888 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1889 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1890 size_t outLen = 0;
ef199164 1891 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1892 {
c91830cb 1893 wxUint16 cc[2];
ef199164 1894 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1895 if ( numChars == wxCONV_FAILED )
1896 return wxCONV_FAILED;
c91830cb 1897
467e0479
VZ
1898 outLen += numChars;
1899 if ( dst )
c91830cb 1900 {
467e0479
VZ
1901 if ( outLen > dstLen )
1902 return wxCONV_FAILED;
d32a507d 1903
467e0479
VZ
1904 *dst++ = cc[0];
1905 if ( numChars == 2 )
1906 {
1907 // second character of a surrogate
1908 *dst++ = cc[1];
1909 }
1910 }
c91830cb 1911 }
b5153fd8 1912
467e0479 1913 return outLen;
c91830cb
VZ
1914}
1915
467e0479
VZ
1916size_t
1917wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1918 const wchar_t *src, size_t srcLen) const
c91830cb 1919{
467e0479
VZ
1920 if ( srcLen == wxNO_LEN )
1921 srcLen = wxWcslen(src) + 1;
c91830cb 1922
467e0479 1923 if ( !dst )
c91830cb 1924 {
467e0479
VZ
1925 // optimization: return maximal space which could be needed for this
1926 // string instead of the exact amount which could be less if there are
1927 // any surrogates in the input
1928 //
1929 // we consider that surrogates are rare enough to make it worthwhile to
1930 // avoid running the loop below at the cost of slightly extra memory
1931 // consumption
1932 return srcLen*BYTES_PER_CHAR;
1933 }
c91830cb 1934
5c33522f 1935 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1936 size_t outLen = 0;
1937 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1938 {
1939 const wxUint32 ch = wxDecodeSurrogate(&src);
1940 if ( !src )
1941 return wxCONV_FAILED;
c91830cb 1942
467e0479 1943 outLen += BYTES_PER_CHAR;
d32a507d 1944
467e0479
VZ
1945 if ( outLen > dstLen )
1946 return wxCONV_FAILED;
b5153fd8 1947
ef199164 1948 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1949 }
c91830cb 1950
467e0479 1951 return outLen;
c91830cb
VZ
1952}
1953
467e0479 1954#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1955
35d11700
VZ
1956// ----------------------------------------------------------------------------
1957// conversions without endianness change
1958// ----------------------------------------------------------------------------
1959
1960size_t
1961wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1962 const char *src, size_t srcLen) const
c91830cb 1963{
35d11700
VZ
1964 // use memcpy() as it should be much faster than hand-written loop
1965 srcLen = GetLength(src, srcLen);
1966 if ( srcLen == wxNO_LEN )
1967 return wxCONV_FAILED;
c91830cb 1968
35d11700
VZ
1969 const size_t inLen = srcLen/BYTES_PER_CHAR;
1970 if ( dst )
c91830cb 1971 {
35d11700
VZ
1972 if ( dstLen < inLen )
1973 return wxCONV_FAILED;
b5153fd8 1974
35d11700
VZ
1975 memcpy(dst, src, srcLen);
1976 }
c91830cb 1977
35d11700 1978 return inLen;
c91830cb
VZ
1979}
1980
35d11700
VZ
1981size_t
1982wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1983 const wchar_t *src, size_t srcLen) const
c91830cb 1984{
35d11700
VZ
1985 if ( srcLen == wxNO_LEN )
1986 srcLen = wxWcslen(src) + 1;
1987
1988 srcLen *= BYTES_PER_CHAR;
c91830cb 1989
35d11700 1990 if ( dst )
c91830cb 1991 {
35d11700
VZ
1992 if ( dstLen < srcLen )
1993 return wxCONV_FAILED;
c91830cb 1994
35d11700 1995 memcpy(dst, src, srcLen);
c91830cb
VZ
1996 }
1997
35d11700 1998 return srcLen;
c91830cb
VZ
1999}
2000
35d11700
VZ
2001// ----------------------------------------------------------------------------
2002// endian-reversing conversions
2003// ----------------------------------------------------------------------------
c91830cb 2004
35d11700
VZ
2005size_t
2006wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2007 const char *src, size_t srcLen) const
c91830cb 2008{
35d11700
VZ
2009 srcLen = GetLength(src, srcLen);
2010 if ( srcLen == wxNO_LEN )
2011 return wxCONV_FAILED;
2012
2013 srcLen /= BYTES_PER_CHAR;
c91830cb 2014
35d11700 2015 if ( dst )
c91830cb 2016 {
35d11700
VZ
2017 if ( dstLen < srcLen )
2018 return wxCONV_FAILED;
2019
5c33522f 2020 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 2021 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 2022 {
ef199164 2023 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 2024 }
c91830cb 2025 }
b5153fd8 2026
35d11700 2027 return srcLen;
c91830cb
VZ
2028}
2029
35d11700
VZ
2030size_t
2031wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2032 const wchar_t *src, size_t srcLen) const
c91830cb 2033{
35d11700
VZ
2034 if ( srcLen == wxNO_LEN )
2035 srcLen = wxWcslen(src) + 1;
2036
2037 srcLen *= BYTES_PER_CHAR;
c91830cb 2038
35d11700 2039 if ( dst )
c91830cb 2040 {
35d11700
VZ
2041 if ( dstLen < srcLen )
2042 return wxCONV_FAILED;
2043
5c33522f 2044 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2045 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2046 {
ef199164 2047 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2048 }
c91830cb 2049 }
b5153fd8 2050
35d11700 2051 return srcLen;
c91830cb
VZ
2052}
2053
467e0479 2054#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2055
2056
36acb880
VZ
2057// ============================================================================
2058// The classes doing conversion using the iconv_xxx() functions
2059// ============================================================================
3caec1bb 2060
b040e242 2061#ifdef HAVE_ICONV
3a0d76bc 2062
b1d547eb
VS
2063// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2064// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2065// (unless there's yet another bug in glibc) the only case when iconv()
2066// returns with (size_t)-1 (which means error) and says there are 0 bytes
2067// left in the input buffer -- when _real_ error occurs,
2068// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2069// iconv() failure.
3caec1bb
VS
2070// [This bug does not appear in glibc 2.2.]
2071#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2072#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2073 (errno != E2BIG || bufLeft != 0))
2074#else
2075#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2076#endif
2077
// cast the address of the input pointer to the type used for the second
// argument of iconv() in this file (ICONV_CONST is defined elsewhere)
ab217dba 2078#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 2079
74a7eb0b
VZ
// value of a conversion descriptor which couldn't be opened
2080#define ICONV_T_INVALID ((iconv_t)-1)
2081
// WC_BSWAP swaps the bytes of a single wchar_t and WC_ENC is the font
// encoding corresponding to wchar_t, both selected by the size of wchar_t
2082#if SIZEOF_WCHAR_T == 4
2083 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2084 #define WC_ENC wxFONTENCODING_UTF32
2085#elif SIZEOF_WCHAR_T == 2
2086 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2087 #define WC_ENC wxFONTENCODING_UTF16
2088#else // sizeof(wchar_t) != 2 nor 4
2089 // does this ever happen?
2090 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2091#endif
2092
36acb880 2093// ----------------------------------------------------------------------------
e95354ec 2094// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2095// ----------------------------------------------------------------------------
2096
e95354ec 2097class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2098{
2099public:
86501081 2100 wxMBConv_iconv(const char *name);
e95354ec 2101 virtual ~wxMBConv_iconv();
36acb880 2102
8f4b0f43
VZ
2103 // implement base class virtual methods
2104 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2105 const char *src, size_t srcLen = wxNO_LEN) const;
2106 virtual size_t FromWChar(char *dst, size_t dstLen,
2107 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2108 virtual size_t GetMBNulLen() const;
2109
ba98e032
VS
2110#if wxUSE_UNICODE_UTF8
2111 virtual bool IsUTF8() const;
2112#endif
2113
d36c9347
VZ
2114 virtual wxMBConv *Clone() const
2115 {
86501081 2116 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
2117 p->m_minMBCharWidth = m_minMBCharWidth;
2118 return p;
2119 }
2120
e95354ec 2121 bool IsOk() const
74a7eb0b 2122 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2123
2124protected:
ef199164
DS
2125 // the iconv handlers used to translate from multibyte
2126 // to wide char and in the other direction
36acb880
VZ
2127 iconv_t m2w,
2128 w2m;
ef199164 2129
b1d547eb
VS
2130#if wxUSE_THREADS
2131 // guards access to m2w and w2m objects
2132 wxMutex m_iconvMutex;
2133#endif
36acb880
VZ
2134
2135private:
e95354ec 2136 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2137 // available on this machine, it will remain NULL
74a7eb0b 2138 static wxString ms_wcCharsetName;
36acb880
VZ
2139
2140 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2141 // different endian-ness than the native one
405d8f46 2142 static bool ms_wcNeedsSwap;
eec47cc6 2143
d36c9347
VZ
2144
2145 // name of the encoding handled by this conversion
2146 wxString m_name;
2147
7ef3ab50 2148 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2149 // initially
2150 size_t m_minMBCharWidth;
36acb880
VZ
2151};
2152
8f115891 2153// make the constructor available for unit testing
86501081 2154WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2155{
2156 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2157 if ( !result->IsOk() )
2158 {
2159 delete result;
2160 return 0;
2161 }
ef199164 2162
8f115891
MW
2163 return result;
2164}
2165
// both are filled in by the wxMBConv_iconv constructor the first time it
// finds an iconv charset usable for wchar_t
422e411e 2166wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2167bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2168
86501081 2169wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 2170 : m_name(name)
36acb880 2171{
c1464d9d 2172 m_minMBCharWidth = 0;
eec47cc6 2173
36acb880 2174 // check for charset that represents wchar_t:
74a7eb0b 2175 if ( ms_wcCharsetName.empty() )
f1339c56 2176 {
9a83f860 2177 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2178
74a7eb0b 2179#if wxUSE_FONTMAP
a243da29 2180 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
74a7eb0b 2181#else // !wxUSE_FONTMAP
a243da29 2182 static const wxChar *const names_static[] =
36acb880 2183 {
74a7eb0b 2184#if SIZEOF_WCHAR_T == 4
9a83f860 2185 wxT("UCS-4"),
74a7eb0b 2186#elif SIZEOF_WCHAR_T = 2
9a83f860 2187 wxT("UCS-2"),
74a7eb0b
VZ
2188#endif
2189 NULL
2190 };
a243da29 2191 const wxChar *const *names = names_static;
74a7eb0b 2192#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2193
d1f024a8 2194 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2195 {
17a1ebd1 2196 const wxString nameCS(*names);
74a7eb0b
VZ
2197
2198 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2199 wxString nameXE(nameCS);
ef199164
DS
2200
2201#ifdef WORDS_BIGENDIAN
9a83f860 2202 nameXE += wxT("BE");
ef199164 2203#else // little endian
9a83f860 2204 nameXE += wxT("LE");
ef199164 2205#endif
74a7eb0b 2206
9a83f860 2207 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2208 nameXE.c_str());
2209
86501081 2210 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2211 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2212 {
74a7eb0b 2213 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2214 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2215 nameCS.c_str());
86501081 2216 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2217
74a7eb0b
VZ
2218 // and check for bytesex ourselves:
2219 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2220 {
74a7eb0b 2221 char buf[2], *bufPtr;
e8769ed1 2222 wchar_t wbuf[2];
74a7eb0b
VZ
2223 size_t insz, outsz;
2224 size_t res;
2225
2226 buf[0] = 'A';
2227 buf[1] = 0;
2228 wbuf[0] = 0;
2229 insz = 2;
2230 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2231 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2232 bufPtr = buf;
2233
ef199164
DS
2234 res = iconv(
2235 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2236 &wbufPtr, &outsz);
74a7eb0b
VZ
2237
2238 if (ICONV_FAILED(res, insz))
2239 {
2240 wxLogLastError(wxT("iconv"));
422e411e 2241 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2242 nameCS.c_str());
74a7eb0b
VZ
2243 }
2244 else // ok, can convert to this encoding, remember it
2245 {
17a1ebd1 2246 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2247 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2248 }
3a0d76bc
VS
2249 }
2250 }
74a7eb0b 2251 else // use charset not requiring byte swapping
36acb880 2252 {
74a7eb0b 2253 ms_wcCharsetName = nameXE;
36acb880 2254 }
3a0d76bc 2255 }
74a7eb0b 2256
0944fceb 2257 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2258 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2259 ms_wcCharsetName.empty() ? wxString("<none>")
2260 : ms_wcCharsetName,
9a83f860
VZ
2261 ms_wcNeedsSwap ? wxT(" (needs swap)")
2262 : wxT(""));
3a0d76bc 2263 }
36acb880 2264 else // we already have ms_wcCharsetName
3caec1bb 2265 {
86501081 2266 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2267 }
dccce9ea 2268
74a7eb0b 2269 if ( ms_wcCharsetName.empty() )
f1339c56 2270 {
74a7eb0b 2271 w2m = ICONV_T_INVALID;
36acb880 2272 }
405d8f46
VZ
2273 else
2274 {
86501081 2275 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2276 if ( w2m == ICONV_T_INVALID )
2277 {
2278 wxLogTrace(TRACE_STRCONV,
2279 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2280 ms_wcCharsetName.c_str(), name);
74a7eb0b 2281 }
405d8f46 2282 }
36acb880 2283}
3caec1bb 2284
e95354ec 2285wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2286{
74a7eb0b 2287 if ( m2w != ICONV_T_INVALID )
36acb880 2288 iconv_close(m2w);
74a7eb0b 2289 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2290 iconv_close(w2m);
2291}
3a0d76bc 2292
8f4b0f43
VZ
2293size_t
2294wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2295 const char *src, size_t srcLen) const
36acb880 2296{
8f4b0f43 2297 if ( srcLen == wxNO_LEN )
69373110 2298 {
8f4b0f43
VZ
2299 // find the string length: notice that must be done differently for
2300 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2301 // consecutive NULs
2302 const size_t nulLen = GetMBNulLen();
2303 switch ( nulLen )
2304 {
2305 default:
2306 return wxCONV_FAILED;
69373110 2307
8f4b0f43
VZ
2308 case 1:
2309 srcLen = strlen(src); // arguably more optimized than our version
2310 break;
69373110 2311
8f4b0f43
VZ
2312 case 2:
2313 case 4:
2314 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2315 // but they also have to start at character boundary and not
2316 // span two adjacent characters
2317 const char *p;
2318 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2319 ;
2320 srcLen = p - src;
2321 break;
2322 }
d50c0831
VZ
2323
2324 // when we're determining the length of the string ourselves we count
2325 // the terminating NUL(s) as part of it and always NUL-terminate the
2326 // output
2327 srcLen += nulLen;
69373110
VZ
2328 }
2329
8f4b0f43
VZ
2330 // we express length in the number of (wide) characters but iconv always
2331 // counts buffer sizes it in bytes
2332 dstLen *= SIZEOF_WCHAR_T;
2333
b1d547eb 2334#if wxUSE_THREADS
6a17b868
SN
2335 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2336 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2337 // wxConvLocal that are used all over wx code, so we have to make sure
2338 // the handle is used by at most one thread at the time. Otherwise
2339 // only a few wx classes would be safe to use from non-main threads
2340 // as MB<->WC conversion would fail "randomly".
2341 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2342#endif // wxUSE_THREADS
2343
36acb880 2344 size_t res, cres;
8f4b0f43 2345 const char *pszPtr = src;
36acb880 2346
8f4b0f43 2347 if ( dst )
36acb880 2348 {
8f4b0f43 2349 char* bufPtr = (char*)dst;
e8769ed1 2350
36acb880 2351 // have destination buffer, convert there
1752fda6 2352 size_t dstLenOrig = dstLen;
36acb880 2353 cres = iconv(m2w,
8f4b0f43
VZ
2354 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2355 &bufPtr, &dstLen);
1752fda6
VZ
2356
2357 // convert the number of bytes converted as returned by iconv to the
2358 // number of (wide) characters converted that we need
2359 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2360
36acb880 2361 if (ms_wcNeedsSwap)
3a0d76bc 2362 {
36acb880 2363 // convert to native endianness
17a1ebd1 2364 for ( unsigned i = 0; i < res; i++ )
467a2982 2365 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2366 }
36acb880 2367 }
8f4b0f43 2368 else // no destination buffer
36acb880 2369 {
8f4b0f43 2370 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2371 wchar_t tbuf[256];
36acb880 2372 res = 0;
ef199164
DS
2373
2374 do
2375 {
e8769ed1 2376 char* bufPtr = (char*)tbuf;
8f4b0f43 2377 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2378
2379 cres = iconv(m2w,
8f4b0f43
VZ
2380 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2381 &bufPtr, &dstLen );
36acb880 2382
8f4b0f43 2383 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2384 }
2385 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2386 }
dccce9ea 2387
8f4b0f43 2388 if (ICONV_FAILED(cres, srcLen))
f1339c56 2389 {
36acb880 2390 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2391 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2392 return wxCONV_FAILED;
36acb880
VZ
2393 }
2394
2395 return res;
2396}
2397
8f4b0f43
VZ
2398size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2399 const wchar_t *src, size_t srcLen) const
36acb880 2400{
b1d547eb
VS
2401#if wxUSE_THREADS
2402 // NB: explained in MB2WC
2403 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2404#endif
3698ae71 2405
8f4b0f43 2406 if ( srcLen == wxNO_LEN )
2588ee86 2407 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2408
2409 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2410 size_t outbuflen = dstLen;
36acb880 2411 size_t res, cres;
3a0d76bc 2412
36acb880 2413 wchar_t *tmpbuf = 0;
3caec1bb 2414
36acb880
VZ
2415 if (ms_wcNeedsSwap)
2416 {
2417 // need to copy to temp buffer to switch endianness
51725fc0 2418 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2419 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2420 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2421 for ( size_t i = 0; i < srcLen; i++ )
2422 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2423
8f4b0f43 2424 src = tmpbuf;
36acb880 2425 }
3a0d76bc 2426
8f4b0f43
VZ
2427 char* inbuf = (char*)src;
2428 if ( dst )
36acb880
VZ
2429 {
2430 // have destination buffer, convert there
8f4b0f43 2431 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2432
8f4b0f43 2433 res = dstLen - outbuflen;
36acb880 2434 }
8f4b0f43 2435 else // no destination buffer
36acb880 2436 {
8f4b0f43 2437 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2438 char tbuf[256];
36acb880 2439 res = 0;
ef199164
DS
2440 do
2441 {
8f4b0f43 2442 dst = tbuf;
51725fc0 2443 outbuflen = WXSIZEOF(tbuf);
36acb880 2444
8f4b0f43 2445 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2446
51725fc0 2447 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2448 }
2449 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2450 }
dccce9ea 2451
36acb880
VZ
2452 if (ms_wcNeedsSwap)
2453 {
2454 free(tmpbuf);
2455 }
dccce9ea 2456
e8769ed1 2457 if (ICONV_FAILED(cres, inbuflen))
36acb880 2458 {
ce6f8d6f 2459 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2460 return wxCONV_FAILED;
36acb880
VZ
2461 }
2462
2463 return res;
2464}
2465
7ef3ab50 2466size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2467{
c1464d9d 2468 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2469 {
2470 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2471
2472#if wxUSE_THREADS
2473 // NB: explained in MB2WC
2474 wxMutexLocker lock(self->m_iconvMutex);
2475#endif
2476
999020e1 2477 const wchar_t *wnul = L"";
c1464d9d 2478 char buf[8]; // should be enough for NUL in any encoding
356410fc 2479 size_t inLen = sizeof(wchar_t),
c1464d9d 2480 outLen = WXSIZEOF(buf);
ef199164
DS
2481 char *inBuff = (char *)wnul;
2482 char *outBuff = buf;
2483 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2484 {
c1464d9d 2485 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2486 }
2487 else // ok
2488 {
ef199164 2489 self->m_minMBCharWidth = outBuff - buf;
356410fc 2490 }
eec47cc6
VZ
2491 }
2492
c1464d9d 2493 return m_minMBCharWidth;
eec47cc6
VZ
2494}
2495
ba98e032
VS
2496#if wxUSE_UNICODE_UTF8
2497bool wxMBConv_iconv::IsUTF8() const
2498{
86501081
VS
2499 return wxStricmp(m_name, "UTF-8") == 0 ||
2500 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2501}
2502#endif
2503
b040e242 2504#endif // HAVE_ICONV
36acb880 2505
e95354ec 2506
36acb880
VZ
2507// ============================================================================
2508// Win32 conversion classes
2509// ============================================================================
1cd52418 2510
e95354ec 2511#ifdef wxHAVE_WIN32_MB2WC
373658eb 2512
8b04d4c4 2513// from utils.cpp
d775fa82 2514#if wxUSE_FONTMAP
86501081 2515extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2516extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2517#endif
373658eb 2518
// wxMBConv implementation using the Win32 functions MultiByteToWideChar()
// and WideCharToMultiByte() with the code page stored in m_CodePage.
e95354ec 2519class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2520{
2521public:
bde4baac
VZ
// default constructor uses the CP_ACP code page
2522 wxMBConv_win32()
2523 {
2524 m_CodePage = CP_ACP;
c1464d9d 2525 m_minMBCharWidth = 0;
bde4baac
VZ
2526 }
2527
// copy the code page and the cached NUL length, this is used by Clone()
d36c9347 2528 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2529 : wxMBConv()
d36c9347
VZ
2530 {
2531 m_CodePage = conv.m_CodePage;
2532 m_minMBCharWidth = conv.m_minMBCharWidth;
2533 }
2534
7608a683 2535#if wxUSE_FONTMAP
// the code page is looked up from the charset name or the font encoding;
// IsOk() must be used to check if the lookup succeeded
86501081 2536 wxMBConv_win32(const char* name)
bde4baac
VZ
2537 {
2538 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2539 m_minMBCharWidth = 0;
bde4baac 2540 }
dccce9ea 2541
e95354ec 2542 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2543 {
2544 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2545 m_minMBCharWidth = 0;
bde4baac 2546 }
eec47cc6 2547#endif // wxUSE_FONTMAP
8b04d4c4 2548
// convert the NUL-terminated string psz to wide characters; buf may be NULL
// to only compute the length. Returns the number of wide characters not
// counting the terminating NUL or wxCONV_FAILED.
d36c9347 2549 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2550 {
02272c9c
VZ
2551 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2552 // the behaviour is not compatible with the Unix version (using iconv)
2553 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2554 // wouldn't work if reading an incomplete MB char didn't result in an
2555 // error
667e5b3e 2556 //
89028980 2557 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2558 // Win XP or newer and it is not supported for UTF-[78] so we always
2559 // use our own conversions in this case. See
89028980
VS
2560 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2561 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2562 if ( m_CodePage == CP_UTF8 )
89028980 2563 {
5487ff0f 2564 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2565 }
830f8f11
VZ
2566
2567 if ( m_CodePage == CP_UTF7 )
2568 {
5487ff0f 2569 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2570 }
2571
2572 int flags = 0;
2573 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2574 IsAtLeastWin2kSP4() )
89028980 2575 {
830f8f11 2576 flags = MB_ERR_INVALID_CHARS;
89028980 2577 }
667e5b3e 2578
2b5f62a0
VZ
2579 const size_t len = ::MultiByteToWideChar
2580 (
2581 m_CodePage, // code page
667e5b3e 2582 flags, // flags: fail on error
2b5f62a0
VZ
2583 psz, // input string
2584 -1, // its length (NUL-terminated)
b4da152e 2585 buf, // output string
2b5f62a0
VZ
2586 buf ? n : 0 // size of output buffer
2587 );
89028980
VS
2588 if ( !len )
2589 {
2590 // function totally failed
467e0479 2591 return wxCONV_FAILED;
89028980
VS
2592 }
2593
2594 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2595 // check if we succeeded, by doing a double trip:
2596 if ( !flags && buf )
2597 {
53c174fc
VZ
2598 const size_t mbLen = strlen(psz);
2599 wxCharBuffer mbBuf(mbLen);
89028980
VS
2600 if ( ::WideCharToMultiByte
2601 (
2602 m_CodePage,
2603 0,
2604 buf,
2605 -1,
2606 mbBuf.data(),
53c174fc 2607 mbLen + 1, // size in bytes, not length
89028980
VS
2608 NULL,
2609 NULL
2610 ) == 0 ||
2611 strcmp(mbBuf, psz) != 0 )
2612 {
2613 // we didn't obtain the same thing we started from, hence
2614 // the conversion was lossy and we consider that it failed
467e0479 2615 return wxCONV_FAILED;
89028980
VS
2616 }
2617 }
2b5f62a0 2618
03a991bc
VZ
2619 // note that it returns count of written chars for buf != NULL and size
2620 // of the needed buffer for buf == NULL so in either case the length of
2621 // the string (which never includes the terminating NUL) is one less
89028980 2622 return len - 1;
f1339c56 2623 }
dccce9ea 2624
// convert the NUL-terminated wide string pwz to this code page; buf may be
// NULL to only compute the length. Returns the number of bytes not counting
// the terminating NUL or wxCONV_FAILED, in particular if the conversion
// is detected to be lossy.
d36c9347 2625 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2626 {
13dd924a
VZ
2627 /*
2628 we have a problem here: by default, WideCharToMultiByte() may
2629 replace characters unrepresentable in the target code page with bad
2630 quality approximations such as turning "1/2" symbol (U+00BD) into
2631 "1" for the code pages which don't have it and we, obviously, want
2632 to avoid this at any price
d775fa82 2633
13dd924a
VZ
2634 the trouble is that this function does it _silently_, i.e. it won't
2635 even tell us whether it did or not... Win98/2000 and higher provide
2636 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2637 we have to resort to a round trip, i.e. check that converting back
2638 results in the same string -- this is, of course, expensive but
2639 otherwise we simply can't be sure to not garble the data.
2640 */
2641
2642 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2643 // it doesn't work with CJK encodings (which we test for rather roughly
2644 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2645 // supporting it
907173e5
WS
2646 BOOL usedDef wxDUMMY_INITIALIZE(false);
2647 BOOL *pUsedDef;
13dd924a
VZ
2648 int flags;
2649 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2650 {
2651 // it's our lucky day
2652 flags = WC_NO_BEST_FIT_CHARS;
2653 pUsedDef = &usedDef;
2654 }
2655 else // old system or unsupported encoding
2656 {
2657 flags = 0;
2658 pUsedDef = NULL;
2659 }
2660
2b5f62a0
VZ
2661 const size_t len = ::WideCharToMultiByte
2662 (
2663 m_CodePage, // code page
13dd924a
VZ
2664 flags, // either none or no best fit
2665 pwz, // input string
2b5f62a0
VZ
2666 -1, // it is (wide) NUL-terminated
2667 buf, // output buffer
2668 buf ? n : 0, // and its size
2669 NULL, // default "replacement" char
13dd924a 2670 pUsedDef // [out] was it used?
2b5f62a0
VZ
2671 );
2672
13dd924a
VZ
2673 if ( !len )
2674 {
2675 // function totally failed
467e0479 2676 return wxCONV_FAILED;
13dd924a
VZ
2677 }
2678
765bdb4a
VZ
2679 // we did something, check if we really succeeded
2680 if ( flags )
13dd924a 2681 {
765bdb4a
VZ
2682 // check if the conversion failed, i.e. if any replacements
2683 // were done
2684 if ( usedDef )
2685 return wxCONV_FAILED;
2686 }
2687 else // we must resort to double tripping...
2688 {
2689 // first we need to ensure that we really have the MB data: this is
2690 // not the case if we're called with NULL buffer, in which case we
2691 // need to do the conversion yet again
2692 wxCharBuffer bufDef;
2693 if ( !buf )
13dd924a 2694 {
765bdb4a
VZ
2695 bufDef = wxCharBuffer(len);
2696 buf = bufDef.data();
2697 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2698 buf, len, NULL, NULL) )
467e0479 2699 return wxCONV_FAILED;
13dd924a 2700 }
765bdb4a 2701
564da6ff
VZ
2702 if ( !n )
2703 n = wcslen(pwz);
765bdb4a 2704 wxWCharBuffer wcBuf(n);
564da6ff 2705 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2706 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2707 {
765bdb4a
VZ
2708 // we didn't obtain the same thing we started from, hence
2709 // the conversion was lossy and we consider that it failed
2710 return wxCONV_FAILED;
13dd924a
VZ
2711 }
2712 }
2713
03a991bc 2714 // see the comment above for the reason of "len - 1"
13dd924a 2715 return len - 1;
f1339c56 2716 }
dccce9ea 2717
// return the size in bytes of the NUL in this code page: it is computed by
// converting an empty wide string the first time and cached afterwards,
// (size_t)-1 is stored and returned if this fails
7ef3ab50
VZ
2718 virtual size_t GetMBNulLen() const
2719 {
2720 if ( m_minMBCharWidth == 0 )
2721 {
2722 int len = ::WideCharToMultiByte
2723 (
2724 m_CodePage, // code page
2725 0, // no flags
2726 L"", // input string
2727 1, // translate just the NUL
2728 NULL, // output buffer
2729 0, // and its size
2730 NULL, // no replacement char
2731 NULL // [out] don't care if it was used
2732 );
2733
2734 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2735 switch ( len )
2736 {
2737 default:
9a83f860 2738 wxLogDebug(wxT("Unexpected NUL length %d"), len);
ef199164
DS
2739 self->m_minMBCharWidth = (size_t)-1;
2740 break;
7ef3ab50
VZ
2741
2742 case 0:
2743 self->m_minMBCharWidth = (size_t)-1;
2744 break;
2745
2746 case 1:
2747 case 2:
2748 case 4:
2749 self->m_minMBCharWidth = len;
2750 break;
2751 }
2752 }
2753
2754 return m_minMBCharWidth;
2755 }
2756
d36c9347
VZ
2757 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2758
// the code page is -1 if it couldn't be determined
13dd924a
VZ
2759 bool IsOk() const { return m_CodePage != -1; }
2760
2761private:
// check, caching the result in a static variable, whether the OS version
// is recent enough for WC_NO_BEST_FIT_CHARS to be used
2762 static bool CanUseNoBestFit()
2763 {
2764 static int s_isWin98Or2k = -1;
2765
2766 if ( s_isWin98Or2k == -1 )
2767 {
2768 int verMaj, verMin;
2769 switch ( wxGetOsVersion(&verMaj, &verMin) )
2770 {
406d283a 2771 case wxOS_WINDOWS_9X:
13dd924a
VZ
2772 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2773 break;
2774
406d283a 2775 case wxOS_WINDOWS_NT:
13dd924a
VZ
2776 s_isWin98Or2k = verMaj >= 5;
2777 break;
2778
2779 default:
ef199164 2780 // unknown: be conservative by default
13dd924a 2781 s_isWin98Or2k = 0;
ef199164 2782 break;
13dd924a
VZ
2783 }
2784
9a83f860 2785 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
13dd924a
VZ
2786 }
2787
2788 return s_isWin98Or2k == 1;
2789 }
f1339c56 2790
// check, caching the result in a static variable, whether we run under
// Windows 2000 SP4 or later; this is always false under Windows CE
89028980
VS
2791 static bool IsAtLeastWin2kSP4()
2792 {
8942f83a
WS
2793#ifdef __WXWINCE__
2794 return false;
2795#else
89028980
VS
2796 static int s_isAtLeastWin2kSP4 = -1;
2797
2798 if ( s_isAtLeastWin2kSP4 == -1 )
2799 {
2800 OSVERSIONINFOEX ver;
2801
2802 memset(&ver, 0, sizeof(ver));
2803 ver.dwOSVersionInfoSize = sizeof(ver);
2804 GetVersionEx((OSVERSIONINFO*)&ver);
2805
2806 s_isAtLeastWin2kSP4 =
2807 ((ver.dwMajorVersion > 5) || // Vista+
2808 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2809 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2810 ver.wServicePackMajor >= 4)) // 2000 SP4+
2811 ? 1 : 0;
2812 }
2813
2814 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2815#endif
89028980
VS
2816 }
2817
eec47cc6 2818
c1464d9d 2819 // the code page we're working with
b1d66b54 2820 long m_CodePage;
c1464d9d 2821
7ef3ab50 2822 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2823 // "unknown"
2824 size_t m_minMBCharWidth;
1cd52418 2825};
e95354ec
VZ
2826
2827#endif // wxHAVE_WIN32_MB2WC
2828
f7e98dee 2829
36acb880
VZ
2830// ============================================================================
2831// wxEncodingConverter based conversion classes
2832// ============================================================================
2833
1e6feb95 2834#if wxUSE_FONTMAP
1cd52418 2835
e95354ec 2836class wxMBConv_wxwin : public wxMBConv
1cd52418 2837{
8b04d4c4
VZ
2838private:
2839 void Init()
2840 {
6ac84a78
DE
2841 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2842 // The wxMBConv_cf class does a better job.
2843 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2844 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2845 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2846 }
2847
6001e347 2848public:
f1339c56
RR
2849 // temporarily just use wxEncodingConverter stuff,
2850 // so that it works while a better implementation is built
86501081 2851 wxMBConv_wxwin(const char* name)
f1339c56
RR
2852 {
2853 if (name)
267e11c5 2854 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2855 else
2856 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2857
8b04d4c4
VZ
2858 Init();
2859 }
2860
e95354ec 2861 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2862 {
2863 m_enc = enc;
2864
2865 Init();
f1339c56 2866 }
dccce9ea 2867
bde4baac 2868 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2869 {
2870 size_t inbuf = strlen(psz);
dccce9ea 2871 if (buf)
c643a977 2872 {
ef199164 2873 if (!m2w.Convert(psz, buf))
467e0479 2874 return wxCONV_FAILED;
c643a977 2875 }
f1339c56
RR
2876 return inbuf;
2877 }
dccce9ea 2878
bde4baac 2879 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2880 {
f8d791e0 2881 const size_t inbuf = wxWcslen(psz);
f1339c56 2882 if (buf)
c643a977 2883 {
ef199164 2884 if (!w2m.Convert(psz, buf))
467e0479 2885 return wxCONV_FAILED;
c643a977 2886 }
dccce9ea 2887
f1339c56
RR
2888 return inbuf;
2889 }
dccce9ea 2890
7ef3ab50 2891 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2892 {
2893 switch ( m_enc )
2894 {
2895 case wxFONTENCODING_UTF16BE:
2896 case wxFONTENCODING_UTF16LE:
c1464d9d 2897 return 2;
eec47cc6
VZ
2898
2899 case wxFONTENCODING_UTF32BE:
2900 case wxFONTENCODING_UTF32LE:
c1464d9d 2901 return 4;
eec47cc6
VZ
2902
2903 default:
c1464d9d 2904 return 1;
eec47cc6
VZ
2905 }
2906 }
2907
d36c9347
VZ
2908 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2909
7ef3ab50
VZ
2910 bool IsOk() const { return m_ok; }
2911
2912public:
2913 wxFontEncoding m_enc;
2914 wxEncodingConverter m2w, w2m;
2915
2916private:
cafbf6fb
VZ
2917 // were we initialized successfully?
2918 bool m_ok;
fc7a2a60 2919
c0c133e1 2920 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2921};
6001e347 2922
8f115891 2923// make the constructors available for unit testing
86501081 2924WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2925{
2926 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2927 if ( !result->IsOk() )
2928 {
2929 delete result;
2930 return 0;
2931 }
ef199164 2932
8f115891
MW
2933 return result;
2934}
2935
1e6feb95
VZ
2936#endif // wxUSE_FONTMAP
2937
36acb880
VZ
2938// ============================================================================
2939// wxCSConv implementation
2940// ============================================================================
2941
8b04d4c4 2942void wxCSConv::Init()
6001e347 2943{
e95354ec
VZ
2944 m_name = NULL;
2945 m_convReal = NULL;
2946 m_deferred = true;
2947}
2948
86501081 2949wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2950{
2951 Init();
82713003 2952
86501081 2953 if ( !charset.empty() )
e95354ec 2954 {
86501081 2955 SetName(charset.ToAscii());
e95354ec 2956 }
bda3d86a 2957
e4277538
VZ
2958#if wxUSE_FONTMAP
2959 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
e3276230
VZ
2960 if ( m_encoding == wxFONTENCODING_MAX )
2961 {
2962 // set to unknown/invalid value
2963 m_encoding = wxFONTENCODING_SYSTEM;
2964 }
2965 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2966 {
2967 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2968 m_encoding = wxFONTENCODING_ISO8859_1;
2969 }
e4277538 2970#else
bda3d86a 2971 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2972#endif
6001e347
RR
2973}
2974
8b04d4c4
VZ
2975wxCSConv::wxCSConv(wxFontEncoding encoding)
2976{
bda3d86a 2977 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 2978 {
9a83f860 2979 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
2980
2981 encoding = wxFONTENCODING_SYSTEM;
2982 }
2983
8b04d4c4
VZ
2984 Init();
2985
bda3d86a 2986 m_encoding = encoding;
8b04d4c4
VZ
2987}
2988
6001e347
RR
// free the charset name and the conversion object, if any
2989wxCSConv::~wxCSConv()
2990{
65e50848
JS
2991 Clear();
2992}
2993
54380f29 2994wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2995 : wxMBConv()
54380f29 2996{
8b04d4c4
VZ
2997 Init();
2998
54380f29 2999 SetName(conv.m_name);
8b04d4c4 3000 m_encoding = conv.m_encoding;
54380f29
GD
3001}
3002
3003wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3004{
3005 Clear();
8b04d4c4 3006
54380f29 3007 SetName(conv.m_name);
8b04d4c4
VZ
3008 m_encoding = conv.m_encoding;
3009
54380f29
GD
3010 return *this;
3011}
3012
65e50848
JS
3013void wxCSConv::Clear()
3014{
8b04d4c4 3015 free(m_name);
5276b0a5 3016 wxDELETE(m_convReal);
8b04d4c4 3017
65e50848 3018 m_name = NULL;
6001e347
RR
3019}
3020
86501081 3021void wxCSConv::SetName(const char *charset)
6001e347 3022{
f1339c56
RR
3023 if (charset)
3024 {
d6f2a891 3025 m_name = wxStrdup(charset);
e95354ec 3026 m_deferred = true;
f1339c56 3027 }
6001e347
RR
3028}
3029
8b3eb85d 3030#if wxUSE_FONTMAP
8b3eb85d
VZ
3031
// hash map associating a charset name with a font encoding
3032WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 3033 wxEncodingNameCache );
8b3eb85d
VZ
3034
// cache filled by wxCSConv::DoCreate(): holds the name with which an iconv
// converter could be created for the encoding, or an empty string if none
// of the names tried worked
3035static wxEncodingNameCache gs_nameCache;
3036#endif
3037
e95354ec
VZ
3038wxMBConv *wxCSConv::DoCreate() const
3039{
ce6f8d6f
VZ
3040#if wxUSE_FONTMAP
3041 wxLogTrace(TRACE_STRCONV,
3042 wxT("creating conversion for %s"),
3043 (m_name ? m_name
86501081 3044 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3045#endif // wxUSE_FONTMAP
3046
c547282d
VZ
3047 // check for the special case of ASCII or ISO8859-1 charset: as we have
3048 // special knowledge of it anyhow, we don't need to create a special
3049 // conversion object
e4277538
VZ
3050 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3051 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 3052 {
e95354ec
VZ
3053 // don't convert at all
3054 return NULL;
3055 }
dccce9ea 3056
e95354ec
VZ
3057 // we trust OS to do conversion better than we can so try external
3058 // conversion methods first
3059 //
3060 // the full order is:
3061 // 1. OS conversion (iconv() under Unix or Win32 API)
3062 // 2. hard coded conversions for UTF
3063 // 3. wxEncodingConverter as fall back
3064
3065 // step (1)
3066#ifdef HAVE_ICONV
c547282d 3067#if !wxUSE_FONTMAP
e95354ec 3068 if ( m_name )
c547282d 3069#endif // !wxUSE_FONTMAP
e95354ec 3070 {
3ef10cfc 3071#if wxUSE_FONTMAP
8b3eb85d 3072 wxFontEncoding encoding(m_encoding);
3ef10cfc 3073#endif
8b3eb85d 3074
86501081 3075 if ( m_name )
8b3eb85d 3076 {
86501081 3077 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3078 if ( conv->IsOk() )
3079 return conv;
3080
3081 delete conv;
c547282d
VZ
3082
3083#if wxUSE_FONTMAP
8b3eb85d 3084 encoding =
86501081 3085 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3086#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3087 }
3088#if wxUSE_FONTMAP
3089 {
3090 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3091 if ( it != gs_nameCache.end() )
3092 {
3093 if ( it->second.empty() )
3094 return NULL;
c547282d 3095
86501081 3096 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3097 if ( conv->IsOk() )
3098 return conv;
e95354ec 3099
8b3eb85d
VZ
3100 delete conv;
3101 }
3102
a243da29 3103 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3104 // CS : in case this does not return valid names (eg for MacRoman)
3105 // encoding got a 'failure' entry in the cache all the same,
3106 // although it just has to be created using a different method, so
3107 // only store failed iconv creation attempts (or perhaps we
3108 // shoulnd't do this at all ?)
3c67ec06 3109 if ( names[0] != NULL )
8b3eb85d 3110 {
3c67ec06 3111 for ( ; *names; ++names )
8b3eb85d 3112 {
86501081
VS
3113 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3114 // will need changes that will obsolete this
3115 wxString name(*names);
3116 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3117 if ( conv->IsOk() )
3118 {
3119 gs_nameCache[encoding] = *names;
3120 return conv;
3121 }
3122
3123 delete conv;
8b3eb85d
VZ
3124 }
3125
9a83f860 3126 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3127 }
8b3eb85d
VZ
3128 }
3129#endif // wxUSE_FONTMAP
e95354ec
VZ
3130 }
3131#endif // HAVE_ICONV
3132
3133#ifdef wxHAVE_WIN32_MB2WC
3134 {
7608a683 3135#if wxUSE_FONTMAP
e95354ec
VZ
3136 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3137 : new wxMBConv_win32(m_encoding);
3138 if ( conv->IsOk() )
3139 return conv;
3140
3141 delete conv;
7608a683
WS
3142#else
3143 return NULL;
3144#endif
e95354ec
VZ
3145 }
3146#endif // wxHAVE_WIN32_MB2WC
ef199164 3147
5c4ed98d 3148#ifdef __DARWIN__
f7e98dee 3149 {
6ff49cbc
DE
3150 // leave UTF16 and UTF32 to the built-ins of wx
3151 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3152 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3153 {
a6900d10 3154#if wxUSE_FONTMAP
5c4ed98d
DE
3155 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3156 : new wxMBConv_cf(m_encoding);
a6900d10 3157#else
5c4ed98d 3158 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3159#endif
ef199164 3160
f7e98dee 3161 if ( conv->IsOk() )
d775fa82
WS
3162 return conv;
3163
3164 delete conv;
3165 }
335d31e0 3166 }
5c4ed98d
DE
3167#endif // __DARWIN__
3168
e95354ec
VZ
3169 // step (2)
3170 wxFontEncoding enc = m_encoding;
3171#if wxUSE_FONTMAP
c547282d
VZ
3172 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3173 {
3174 // use "false" to suppress interactive dialogs -- we can be called from
3175 // anywhere and popping up a dialog from here is the last thing we want to
3176 // do
267e11c5 3177 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3178 }
e95354ec
VZ
3179#endif // wxUSE_FONTMAP
3180
3181 switch ( enc )
3182 {
3183 case wxFONTENCODING_UTF7:
3184 return new wxMBConvUTF7;
3185
3186 case wxFONTENCODING_UTF8:
3187 return new wxMBConvUTF8;
3188
e95354ec
VZ
3189 case wxFONTENCODING_UTF16BE:
3190 return new wxMBConvUTF16BE;
3191
3192 case wxFONTENCODING_UTF16LE:
3193 return new wxMBConvUTF16LE;
3194
e95354ec
VZ
3195 case wxFONTENCODING_UTF32BE:
3196 return new wxMBConvUTF32BE;
3197
3198 case wxFONTENCODING_UTF32LE:
3199 return new wxMBConvUTF32LE;
3200
3201 default:
3202 // nothing to do but put here to suppress gcc warnings
ef199164 3203 break;
e95354ec
VZ
3204 }
3205
3206 // step (3)
3207#if wxUSE_FONTMAP
3208 {
3209 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3210 : new wxMBConv_wxwin(m_encoding);
3211 if ( conv->IsOk() )
3212 return conv;
3213
3214 delete conv;
3215 }
ef199164 3216
3df31b2d
VZ
3217 wxLogTrace(TRACE_STRCONV,
3218 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3219 (m_name ? wxString(m_name)
3df31b2d
VZ
3220 : wxFontMapperBase::GetEncodingName(m_encoding)));
3221#endif // wxUSE_FONTMAP
e95354ec
VZ
3222
3223 return NULL;
3224}
3225
3226void wxCSConv::CreateConvIfNeeded() const
3227{
3228 if ( m_deferred )
3229 {
f48a1159 3230 wxCSConv *self = const_cast<wxCSConv *>(this);
bda3d86a 3231
bda3d86a
VZ
3232 // if we don't have neither the name nor the encoding, use the default
3233 // encoding for this system
3234 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3235 {
4c75209f 3236#if wxUSE_INTL
02c7347b 3237 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3238#else
3239 // fallback to some reasonable default:
3240 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3241#endif // wxUSE_INTL
4c75209f 3242 }
bda3d86a 3243
e95354ec
VZ
3244 self->m_convReal = DoCreate();
3245 self->m_deferred = false;
6001e347 3246 }
6001e347
RR
3247}
3248
0f0298b1
VZ
3249bool wxCSConv::IsOk() const
3250{
3251 CreateConvIfNeeded();
3252
3253 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3254 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3255 return true; // always ok as we do it ourselves
3256
3257 // m_convReal->IsOk() is called at its own creation, so we know it must
3258 // be ok if m_convReal is non-NULL
3259 return m_convReal != NULL;
3260}
3261
1c714a5d
VZ
3262size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3263 const char *src, size_t srcLen) const
3264{
3265 CreateConvIfNeeded();
3266
2c74c558
VS
3267 if (m_convReal)
3268 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3269
3270 // latin-1 (direct)
05392dc8
VZ
3271 if ( srcLen == wxNO_LEN )
3272 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3273
05392dc8
VZ
3274 if ( dst )
3275 {
3276 if ( dstLen < srcLen )
3277 return wxCONV_FAILED;
1c714a5d 3278
05392dc8
VZ
3279 for ( size_t n = 0; n < srcLen; n++ )
3280 dst[n] = (unsigned char)(src[n]);
3281 }
2c74c558 3282
05392dc8 3283 return srcLen;
1c714a5d
VZ
3284}
3285
05392dc8
VZ
3286size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3287 const wchar_t *src, size_t srcLen) const
6001e347 3288{
e95354ec 3289 CreateConvIfNeeded();
dccce9ea 3290
e95354ec 3291 if (m_convReal)
05392dc8 3292 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3293
3294 // latin-1 (direct)
05392dc8
VZ
3295 if ( srcLen == wxNO_LEN )
3296 srcLen = wxWcslen(src) + 1;
dccce9ea 3297
05392dc8 3298 if ( dst )
f1339c56 3299 {
05392dc8
VZ
3300 if ( dstLen < srcLen )
3301 return wxCONV_FAILED;
1cd52418 3302
05392dc8 3303 for ( size_t n = 0; n < srcLen; n++ )
24642831 3304 {
05392dc8 3305 if ( src[n] > 0xFF )
467e0479 3306 return wxCONV_FAILED;
ef199164 3307
05392dc8 3308 dst[n] = (char)src[n];
24642831 3309 }
05392dc8 3310
24642831 3311 }
05392dc8 3312 else // still need to check the input validity
24642831 3313 {
05392dc8 3314 for ( size_t n = 0; n < srcLen; n++ )
24642831 3315 {
05392dc8 3316 if ( src[n] > 0xFF )
467e0479 3317 return wxCONV_FAILED;
24642831 3318 }
f1339c56 3319 }
dccce9ea 3320
05392dc8 3321 return srcLen;
6001e347
RR
3322}
3323
7ef3ab50 3324size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3325{
3326 CreateConvIfNeeded();
3327
3328 if ( m_convReal )
3329 {
7ef3ab50 3330 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3331 }
3332
ba98e032 3333 // otherwise, we are ISO-8859-1
c1464d9d 3334 return 1;
eec47cc6
VZ
3335}
3336
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// Return the answer of the real converter if there is one; without it we
// are ISO-8859-1, i.e. not UTF-8.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    return m_convReal ? m_convReal->IsUTF8() : false;
}
#endif // wxUSE_UNICODE_UTF8
3351
69c928ef
VZ
3352
3353#if wxUSE_UNICODE
3354
3355wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3356{
3357 if ( !s )
3358 return wxWCharBuffer();
3359
3360 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3361 if ( !wbuf )
5487ff0f 3362 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3363 if ( !wbuf )
3364 wbuf = wxConvISO8859_1.cMB2WX(s);
3365
3366 return wbuf;
3367}
3368
3369wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3370{
3371 if ( !ws )
3372 return wxCharBuffer();
3373
3374 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3375 if ( !buf )
3376 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3377
3378 return buf;
3379}
3380
3381#endif // wxUSE_UNICODE
f5a1953b 3382
1e50d914
VS
3383// ----------------------------------------------------------------------------
3384// globals
3385// ----------------------------------------------------------------------------
3386
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3393
// These names are used as plain identifiers below (they are pasted into
// name##Ptr, wxGet_##name##Ptr etc.), so remove any macro definitions of
// them first.
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define the global converter "name":
//  - klass is the type of the pointers through which it is accessed
//  - impl_klass is the type of the object which is really created
//  - ctor_args is appended verbatim after the object name in its
//    declaration, i.e. it is either empty or parenthesized ctor arguments
//
// The object itself is a function-local static of wxGet_<name>Ptr() and so
// is created when this function is called for the first time.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)          \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                         \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                             \
    {                                                                       \
        static impl_klass name##Obj ctor_args;                              \
        return &name##Obj;                                                  \
    }                                                                       \
    /* this ensures that all global converter objects are created */       \
    /* by the time static initialization is done, i.e. before any */       \
    /* thread is launched: */                                               \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object is accessed through
// pointers to its own class.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3414
5c69ef61
VZ
3415#ifdef __INTELC__
3416 // disable warning "variable 'xxx' was declared but never referenced"
3417 #pragma warning(disable: 177)
3418#endif // Intel C++
3419
1e50d914
VS
3420#ifdef __WINDOWS__
3421 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3422#elif 0 // defined(__WXOSX__)
3423 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3424#else
3425 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3426#endif
3427
e1079eda
VZ
3428// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3429// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3430// provokes an error message about "not enough macro parameters"; and we
3431// can't use "()" here as the name##Obj declaration would be parsed as a
3432// function declaration then, so use a semicolon and live with an extra
3433// empty statement (and hope that no compilers warns about this)
3434WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3435WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3436
3437WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3438WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3439
3440WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3441WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3442
6ac84a78
DE
3443#ifdef __DARWIN__
3444// The xnu kernel always communicates file paths in decomposed UTF-8.
3445// WARNING: Are we sure that CFString's conversion will cause decomposition?
3446static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3447#endif
6ac84a78 3448
1e50d914 3449WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3450#ifdef __DARWIN__
1e50d914 3451 &wxConvMacUTF8DObj;
6ac84a78 3452#else // !__DARWIN__
1e50d914 3453 wxGet_wxConvLibcPtr();
6ac84a78 3454#endif // __DARWIN__/!__DARWIN__