]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Fix wxRichToolTip compilation under MSW without PCH and recent SDK headers.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
1c193821 31#ifndef __WXWINCE__
1cd52418 32#include <errno.h>
1c193821
JS
33#endif
34
6001e347
RR
35#include <ctype.h>
36#include <string.h>
37#include <stdlib.h>
38
e95354ec 39#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
e95354ec 42 #define wxHAVE_WIN32_MB2WC
ef199164 43#endif
e95354ec 44
b040e242 45#ifdef HAVE_ICONV
373658eb 46 #include <iconv.h>
b1d547eb 47 #include "wx/thread.h"
1cd52418 48#endif
1cd52418 49
373658eb
VZ
50#include "wx/encconv.h"
51#include "wx/fontmap.h"
52
5c4ed98d 53#ifdef __DARWIN__
c933e267 54#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
55#endif //def __DARWIN__
56
ef199164 57
9a83f860 58#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 59
467e0479
VZ
60// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61// be 4 bytes
4948c2b6 62#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
63 #define WC_UTF16
64#endif
65
ef199164 66
373658eb
VZ
67// ============================================================================
68// implementation
69// ============================================================================
70
69373110
VZ
// helper function of cMB2WC(): check whether the n bytes starting at p
// contain at least one non-NUL byte
//
// returns true if some byte among the n is not NUL and false if all of them
// are NUL (this includes the case n == 0)
static bool NotAllNULs(const char *p, size_t n)
{
    // skip over the NUL bytes, stopping at the first non-NUL one
    while ( n && *p++ == '\0' )
        n--;

    // if we stopped before exhausting all n bytes, we found a non-NUL byte
    return n != 0;
}
79
373658eb 80// ----------------------------------------------------------------------------
467e0479 81// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 82// ----------------------------------------------------------------------------
6001e347 83
c91830cb 84static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 85{
ef199164 86 if (input <= 0xffff)
4def3b35 87 {
999836aa
VZ
88 if (output)
89 *output = (wxUint16) input;
ef199164 90
4def3b35 91 return 1;
dccce9ea 92 }
ef199164 93 else if (input >= 0x110000)
4def3b35 94 {
467e0479 95 return wxCONV_FAILED;
dccce9ea
VZ
96 }
97 else
4def3b35 98 {
dccce9ea 99 if (output)
4def3b35 100 {
ef199164
DS
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 103 }
ef199164 104
4def3b35 105 return 2;
1cd52418 106 }
1cd52418
OK
107}
108
c91830cb 109static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 110{
ef199164 111 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
112 {
113 output = *input;
114 return 1;
dccce9ea 115 }
ef199164 116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
117 {
118 output = *input;
467e0479 119 return wxCONV_FAILED;
dccce9ea
VZ
120 }
121 else
4def3b35
VS
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
1cd52418
OK
126}
127
467e0479 128#ifdef WC_UTF16
35d11700
VZ
129 typedef wchar_t wxDecodeSurrogate_t;
130#else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
133
134// returns the next UTF-32 character from the wchar_t buffer and advances the
135// pointer to the character after this one
136//
137// if an invalid character is found, *pSrc is set to NULL, the caller must
138// check for this
35d11700 139static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
140{
141 wxUint32 out;
8d3dd069 142 const size_t
5c33522f 143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150}
151
f6bcfd97 152// ----------------------------------------------------------------------------
6001e347 153// wxMBConv
f6bcfd97 154// ----------------------------------------------------------------------------
2c53a80a 155
483b0434
VZ
156size_t
157wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
158 const char *src, size_t srcLen) const
6001e347 159{
483b0434 160 // although new conversion classes are supposed to implement this function
36f93678 161 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
162 // avoid to have to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
36f93678
VZ
166 //
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
6001e347 170
483b0434
VZ
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten = 0;
eec47cc6 173
c1464d9d 174 // the number of NULs terminating this string
a78c43f1 175 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 176
c1464d9d
VZ
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
180 // NULs at the end
483b0434
VZ
181 wxCharBuffer bufTmp;
182 const char *srcEnd;
467e0479 183 if ( srcLen != wxNO_LEN )
eec47cc6 184 {
c1464d9d 185 // we need to know how to find the end of this string
7ef3ab50 186 nulLen = GetMBNulLen();
483b0434
VZ
187 if ( nulLen == wxCONV_FAILED )
188 return wxCONV_FAILED;
e4e3bbb4 189
c1464d9d 190 // if there are enough NULs we can avoid the copy
483b0434 191 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
192 {
193 // make a copy in order to properly NUL-terminate the string
483b0434 194 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 195 char * const p = bufTmp.data();
483b0434
VZ
196 memcpy(p, src, srcLen);
197 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 198 *s = '\0';
483b0434
VZ
199
200 src = bufTmp;
eec47cc6 201 }
e4e3bbb4 202
483b0434
VZ
203 srcEnd = src + srcLen;
204 }
205 else // quit after the first loop iteration
206 {
207 srcEnd = NULL;
208 }
e4e3bbb4 209
36f93678
VZ
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
212 // with the result
213 //
214 // all the complication come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
bbb0ff36 217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
36f93678
VZ
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
220 // string
221 //
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
483b0434 225 for ( ;; )
eec47cc6 226 {
c1464d9d 227 // try to convert the current chunk
483b0434 228 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
229 if ( lenChunk == wxCONV_FAILED )
230 return wxCONV_FAILED;
e4e3bbb4 231
483b0434 232 dstWritten += lenChunk;
f6a02087
VZ
233 if ( !srcEnd )
234 dstWritten++;
f5fb6871 235
f6a02087 236 if ( !lenChunk )
467e0479
VZ
237 {
238 // nothing left in the input string, conversion succeeded
239 break;
240 }
241
483b0434
VZ
242 if ( dst )
243 {
244 if ( dstWritten > dstLen )
245 return wxCONV_FAILED;
246
f6a02087
VZ
247 // +1 is for trailing NUL
248 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
f6a02087
VZ
252 if ( !srcEnd )
253 dst++;
483b0434 254 }
c1464d9d 255
483b0434 256 if ( !srcEnd )
c1464d9d 257 {
467e0479 258 // we convert just one chunk in this case as this is the entire
bbb0ff36 259 // string anyhow (and we don't count the trailing NUL in this case)
c1464d9d
VZ
260 break;
261 }
eec47cc6 262
bbb0ff36
VZ
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
483b0434 266 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
483b0434 272 src += nulLen;
c1464d9d 273 }
e4e3bbb4 274
bbb0ff36
VZ
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
277 if ( src == srcEnd )
278 break;
279
280 // do count this terminator as it's inside the buffer we convert
281 dstWritten++;
282 if ( dst )
283 dst++;
284
285 src += nulLen; // skip the terminator itself
c1464d9d 286
483b0434 287 if ( src >= srcEnd )
c1464d9d
VZ
288 break;
289 }
290
483b0434 291 return dstWritten;
e4e3bbb4
RN
292}
293
483b0434
VZ
294size_t
295wxMBConv::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
e4e3bbb4 297{
483b0434
VZ
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten = 0;
e4e3bbb4 300
f6a02087
VZ
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated = srcLen == wxNO_LEN;
305
eec47cc6
VZ
306 // make a copy of the input string unless it is already properly
307 // NUL-terminated
eec47cc6 308 wxWCharBuffer bufTmp;
f6a02087 309 if ( isNulTerminated )
e4e3bbb4 310 {
483b0434 311 srcLen = wxWcslen(src) + 1;
eec47cc6 312 }
483b0434 313 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
314 {
315 // make a copy in order to properly NUL-terminate the string
483b0434 316 bufTmp = wxWCharBuffer(srcLen);
ef199164 317 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
318 src = bufTmp;
319 }
320
321 const size_t lenNul = GetMBNulLen();
322 for ( const wchar_t * const srcEnd = src + srcLen;
323 src < srcEnd;
27307233 324 src++ /* skip L'\0' too */ )
483b0434
VZ
325 {
326 // try to convert the current chunk
327 size_t lenChunk = WC2MB(NULL, src, 0);
483b0434
VZ
328 if ( lenChunk == wxCONV_FAILED )
329 return wxCONV_FAILED;
330
483b0434 331 dstWritten += lenChunk;
27307233
VZ
332
333 const wchar_t * const
334 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
335
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
338 // ourselves above
339 if ( chunkEnd < srcEnd )
f6a02087 340 dstWritten += lenNul;
483b0434
VZ
341
342 if ( dst )
343 {
344 if ( dstWritten > dstLen )
345 return wxCONV_FAILED;
346
27307233
VZ
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
351 wxCharBuffer dstBuf;
352 char *dstTmp;
353 if ( chunkEnd == srcEnd )
354 {
355 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
356 dstTmp = dstBuf.data();
357 }
358 else
359 {
360 dstTmp = dst;
361 }
362
363 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
364 return wxCONV_FAILED;
365
27307233
VZ
366 if ( dstTmp != dst )
367 {
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst, dstTmp, lenChunk);
371
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
374 break;
375 }
376
483b0434 377 dst += lenChunk;
27307233 378 if ( chunkEnd < srcEnd )
f6a02087 379 dst += lenNul;
483b0434 380 }
27307233
VZ
381
382 src = chunkEnd;
eec47cc6 383 }
e4e3bbb4 384
483b0434
VZ
385 return dstWritten;
386}
387
ef199164 388size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 389{
51725fc0 390 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 391 if ( rc != wxCONV_FAILED )
509da451
VZ
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399}
400
ef199164 401size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 402{
51725fc0 403 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 404 if ( rc != wxCONV_FAILED )
509da451 405 {
51725fc0 406 rc -= GetMBNulLen();
509da451
VZ
407 }
408
409 return rc;
410}
411
483b0434
VZ
412wxMBConv::~wxMBConv()
413{
414 // nothing to do here (necessary for Darwin linking probably)
415}
e4e3bbb4 416
483b0434
VZ
417const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418{
419 if ( psz )
eec47cc6 420 {
483b0434 421 // calculate the length of the buffer needed first
a2db25a1 422 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 423 if ( nLen != wxCONV_FAILED )
f5fb6871 424 {
483b0434 425 // now do the actual conversion
a2db25a1 426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 427
483b0434 428 // +1 for the trailing NULL
a2db25a1 429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 430 return buf;
f5fb6871 431 }
483b0434 432 }
e4e3bbb4 433
483b0434
VZ
434 return wxWCharBuffer();
435}
3698ae71 436
483b0434
VZ
437const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438{
439 if ( pwz )
440 {
a2db25a1 441 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 442 if ( nLen != wxCONV_FAILED )
483b0434 443 {
a2db25a1
VZ
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451}
e4e3bbb4 452
483b0434 453const wxWCharBuffer
ef199164 454wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 455{
ef199164 456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 457 if ( dstLen != wxCONV_FAILED )
483b0434 458 {
0dd13d21
VZ
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
f6a02087 463 wbuf.data()[dstLen] = L'\0';
ef199164 464 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
465 {
466 if ( outLen )
467e0479
VZ
467 {
468 *outLen = dstLen;
f6a02087
VZ
469
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen == wxNO_LEN )
467e0479
VZ
476 (*outLen)--;
477 }
478
483b0434
VZ
479 return wbuf;
480 }
481 }
482
483 if ( outLen )
484 *outLen = 0;
485
486 return wxWCharBuffer();
487}
488
489const wxCharBuffer
ef199164 490wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 491{
13d92ad6 492 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 493 if ( dstLen != wxCONV_FAILED )
483b0434 494 {
0dd13d21
VZ
495 const size_t nulLen = GetMBNulLen();
496
497 // as above, ensure that the buffer is always NUL-terminated, even if
498 // the input is not
499 wxCharBuffer buf(dstLen + nulLen - 1);
500 memset(buf.data() + dstLen, 0, nulLen);
ef199164 501 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
502 {
503 if ( outLen )
467e0479
VZ
504 {
505 *outLen = dstLen;
506
f6a02087 507 if ( inLen == wxNO_LEN )
467e0479 508 {
f6a02087
VZ
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
13d92ad6 511 *outLen -= nulLen;
467e0479
VZ
512 }
513 }
d32a507d 514
483b0434
VZ
515 return buf;
516 }
e4e3bbb4
RN
517 }
518
eec47cc6
VZ
519 if ( outLen )
520 *outLen = 0;
521
522 return wxCharBuffer();
e4e3bbb4
RN
523}
524
40ac5040
VZ
525const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
526{
527 const size_t srcLen = buf.length();
528 if ( srcLen )
529 {
530 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
531 if ( dstLen != wxCONV_FAILED )
532 {
533 wxWCharBuffer wbuf(dstLen);
534 wbuf.data()[dstLen] = L'\0';
535 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
536 return wbuf;
537 }
538 }
539
cfcfada9 540 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
40ac5040
VZ
541}
542
543const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
544{
545 const size_t srcLen = wbuf.length();
546 if ( srcLen )
547 {
548 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
549 if ( dstLen != wxCONV_FAILED )
550 {
551 wxCharBuffer buf(dstLen);
552 buf.data()[dstLen] = '\0';
553 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
554 return buf;
555 }
556 }
557
cfcfada9 558 return wxScopedCharBuffer::CreateNonOwned("", 0);
40ac5040
VZ
559}
560
6001e347 561// ----------------------------------------------------------------------------
bde4baac 562// wxMBConvLibc
6001e347
RR
563// ----------------------------------------------------------------------------
564
bde4baac
VZ
565size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
566{
567 return wxMB2WC(buf, psz, n);
568}
569
570size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
571{
572 return wxWC2MB(buf, psz, n);
573}
e1bfe89e
RR
574
575// ----------------------------------------------------------------------------
532d575b 576// wxConvBrokenFileNames
e1bfe89e
RR
577// ----------------------------------------------------------------------------
578
eec47cc6
VZ
#ifdef __UNIX__

// create the converter actually used for the given charset: for UTF-8 (the
// name is compared case-insensitively, with or without the dash) it is a
// wxMBConvUTF8 mapping invalid sequences to the PUA, for anything else it is
// a wxCSConv for this charset
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
         wxStricmp(charset, wxT("UTF8")) == 0 )
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
    else
        m_conv = new wxCSConv(charset);
}

#endif // __UNIX__
c12b7f79 591
bde4baac 592// ----------------------------------------------------------------------------
3698ae71 593// UTF-7
bde4baac 594// ----------------------------------------------------------------------------
6001e347 595
15f2ee32 596// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
597//
598// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 599
15f2ee32
RN
//
// BASE64 decoding table
//
// maps each byte value to the 6 bit value of the corresponding base64 digit
// or to 0xff if the byte is not a base64 digit
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
638
9d653e81
VZ
639size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
640 const char *src, size_t srcLen) const
15f2ee32 641{
9d653e81 642 DecoderState stateOrig,
852dcba5 643 *statePtr;
9d653e81
VZ
644 if ( srcLen == wxNO_LEN )
645 {
646 // convert the entire string, up to and including the trailing NUL
647 srcLen = strlen(src) + 1;
648
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr = &stateOrig;
652 }
653 else // when working with partial strings we do use the shift state
654 {
5c33522f 655 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
656
657 // also save the old state to be able to rollback to it on error
658 stateOrig = m_stateDecoder;
659 }
660
661 // but to simplify the code below we use this variable in both cases
662 DecoderState& state = *statePtr;
663
664
665 // number of characters [which would have been] written to dst [if it were
666 // not NULL]
15f2ee32
RN
667 size_t len = 0;
668
9d653e81
VZ
669 const char * const srcEnd = src + srcLen;
670
671 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 672 {
9d653e81
VZ
673 const unsigned char cc = *src++;
674
675 if ( state.IsShifted() )
15f2ee32 676 {
9d653e81
VZ
677 const unsigned char dc = utf7unb64[cc];
678 if ( dc == 0xff )
15f2ee32 679 {
ccaa848d
VZ
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
684 // characters)
685 if ( state.isLSB || state.bit > 4 ||
686 (state.accum & ((1 << state.bit) - 1)) )
687 {
688 if ( !len )
689 state = stateOrig;
690
852dcba5 691 return wxCONV_FAILED;
ccaa848d 692 }
852dcba5 693
9d653e81
VZ
694 state.ToDirect();
695
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
698 if ( cc == '-' )
699 continue;
700 }
701 else // valid encoded character
702 {
703 // mini base64 decoder: each character is 6 bits
704 state.bit += 6;
705 state.accum <<= 6;
706 state.accum += dc;
707
708 if ( state.bit >= 8 )
15f2ee32 709 {
9d653e81
VZ
710 // got the full byte, consume it
711 state.bit -= 8;
712 unsigned char b = (state.accum >> state.bit) & 0x00ff;
713
714 if ( state.isLSB )
15f2ee32 715 {
9d653e81
VZ
716 // we've got the full word, output it
717 if ( dst )
718 *dst++ = (state.msb << 8) | b;
719 len++;
720 state.isLSB = false;
15f2ee32 721 }
9d653e81 722 else // MSB
04a37834 723 {
9d653e81
VZ
724 // just store it while we wait for LSB
725 state.msb = b;
726 state.isLSB = true;
04a37834 727 }
15f2ee32
RN
728 }
729 }
9d653e81 730 }
04a37834 731
9d653e81
VZ
732 if ( state.IsDirect() )
733 {
734 // start of an encoded segment?
735 if ( cc == '+' )
04a37834 736 {
9d653e81
VZ
737 if ( *src == '-' )
738 {
739 // just the encoded plus sign, don't switch to shifted mode
740 if ( dst )
741 *dst++ = '+';
742 len++;
743 src++;
744 }
ccaa848d
VZ
745 else if ( utf7unb64[(unsigned)*src] == 0xff )
746 {
747 // empty encoded chunks are not allowed
748 if ( !len )
749 state = stateOrig;
750
751 return wxCONV_FAILED;
752 }
753 else // base-64 encoded chunk follows
9d653e81
VZ
754 {
755 state.ToShifted();
756 }
757 }
758 else // not '+'
759 {
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc >= 0x7f || (cc < ' ' &&
763 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
764 return wxCONV_FAILED;
765
766 if ( dst )
767 *dst++ = cc;
768 len++;
769 }
15f2ee32
RN
770 }
771 }
04a37834 772
9d653e81
VZ
773 if ( !len )
774 {
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
777 // state
778 state = stateOrig;
779
780 return wxCONV_FAILED;
781 }
04a37834 782
15f2ee32 783 return len;
6001e347
RR
784}
785
15f2ee32
RN
//
// BASE64 encoding table
//
// maps a 6 bit value to the corresponding base64 digit
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
800
//
// UTF-7 encoding table
//
// gives the class of each of the 128 ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
820
9d653e81
VZ
821static inline bool wxIsUTF7Direct(wchar_t wc)
822{
823 return wc < 0x80 && utf7encode[wc] < 1;
824}
825
826size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
827 const wchar_t *src, size_t srcLen) const
15f2ee32 828{
9d653e81
VZ
829 EncoderState stateOrig,
830 *statePtr;
831 if ( srcLen == wxNO_LEN )
832 {
833 // we don't apply the stored state when operating on entire strings at
834 // once
835 statePtr = &stateOrig;
836
837 srcLen = wxWcslen(src) + 1;
838 }
839 else // do use the mode we left the output in previously
840 {
841 stateOrig = m_stateEncoder;
5c33522f 842 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
843 }
844
845 EncoderState& state = *statePtr;
846
847
15f2ee32
RN
848 size_t len = 0;
849
9d653e81
VZ
850 const wchar_t * const srcEnd = src + srcLen;
851 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 852 {
9d653e81
VZ
853 wchar_t cc = *src++;
854 if ( wxIsUTF7Direct(cc) )
15f2ee32 855 {
9d653e81
VZ
856 if ( state.IsShifted() )
857 {
858 // pad with zeros the last encoded block if necessary
859 if ( state.bit )
860 {
861 if ( dst )
862 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
863 len++;
864 }
ef199164 865
9d653e81
VZ
866 state.ToDirect();
867
868 if ( dst )
869 *dst++ = '-';
870 len++;
871 }
872
873 if ( dst )
874 *dst++ = (char)cc;
15f2ee32
RN
875 len++;
876 }
9d653e81
VZ
877 else if ( cc == '+' && state.IsDirect() )
878 {
879 if ( dst )
880 {
881 *dst++ = '+';
882 *dst++ = '-';
883 }
884
885 len += 2;
886 }
15f2ee32 887#ifndef WC_UTF16
79c78d42 888 else if (((wxUint32)cc) > 0xffff)
b2c13097 889 {
15f2ee32 890 // no surrogate pair generation (yet?)
467e0479 891 return wxCONV_FAILED;
15f2ee32
RN
892 }
893#endif
894 else
895 {
9d653e81
VZ
896 if ( state.IsDirect() )
897 {
898 state.ToShifted();
ef199164 899
9d653e81
VZ
900 if ( dst )
901 *dst++ = '+';
902 len++;
903 }
904
905 // BASE64 encode string
906 for ( ;; )
15f2ee32 907 {
9d653e81 908 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 909 {
9d653e81
VZ
910 state.accum <<= 8;
911 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
912
913 for (state.bit += 8; state.bit >= 6; )
15f2ee32 914 {
9d653e81
VZ
915 state.bit -= 6;
916 if ( dst )
917 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
918 len++;
15f2ee32 919 }
15f2ee32 920 }
ef199164 921
9d653e81
VZ
922 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
923 break;
ef199164 924
9d653e81 925 src++;
15f2ee32 926 }
15f2ee32
RN
927 }
928 }
ef199164 929
9d653e81
VZ
930 // we need to restore the original encoder state if we were called just to
931 // calculate the amount of space needed as we will presumably be called
932 // again to really convert the data now
933 if ( !dst )
934 state = stateOrig;
ef199164 935
15f2ee32 936 return len;
6001e347
RR
937}
938
f6bcfd97 939// ----------------------------------------------------------------------------
6001e347 940// UTF-8
f6bcfd97 941// ----------------------------------------------------------------------------
6001e347 942
1774c3c5 943static const wxUint32 utf8_max[]=
4def3b35 944 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 945
3698ae71
VZ
946// boundaries of the private use area we use to (temporarily) remap invalid
947// characters invalid in a UTF-8 encoded string
ea8ce907
RR
948const wxUint32 wxUnicodePUA = 0x100000;
949const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
950
// this table gives the length of the UTF-8 encoding from its first character:
// 0 means that the byte can't start a valid UTF-8 sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
985
986size_t
987wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989{
990 wchar_t *out = dstLen ? dst : NULL;
991 size_t written = 0;
992
993 if ( srcLen == wxNO_LEN )
994 srcLen = strlen(src) + 1;
995
996 for ( const char *p = src; ; p++ )
997 {
0dcbb107 998 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
0286d08d
VZ
999 {
1000 // all done successfully, just add the trailing NULL if we are not
1001 // using explicit length
1002 if ( srcLen == wxNO_LEN )
1003 {
1004 if ( out )
1005 {
1006 if ( !dstLen )
1007 break;
1008
1009 *out = L'\0';
1010 }
1011
1012 written++;
1013 }
1014
1015 return written;
1016 }
1017
0286d08d
VZ
1018 if ( out && !dstLen-- )
1019 break;
1020
5367a38a
VS
1021 wxUint32 code;
1022 unsigned char c = *p;
0286d08d 1023
5367a38a
VS
1024 if ( c < 0x80 )
1025 {
1026 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1027 break;
0286d08d 1028
5367a38a
VS
1029 if ( srcLen != wxNO_LEN )
1030 srcLen--;
0286d08d 1031
5367a38a
VS
1032 code = c;
1033 }
1034 else
0286d08d 1035 {
5367a38a
VS
1036 unsigned len = tableUtf8Lengths[c];
1037 if ( !len )
1038 break;
1039
1040 if ( srcLen < len ) // the test works for wxNO_LEN too
1041 break;
1042
1043 if ( srcLen != wxNO_LEN )
1044 srcLen -= len;
1045
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1053 //
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1057
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1059 // length:
1060 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1061
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1065
1066 len--; // it's more convenient to work with 0-based length here
1067
1068 // extract the lead byte's value bits:
1069 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1070 break;
1071
1072 code = c & leadValueMask[len];
1073
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len; --len )
1077 {
1078 c = *++p;
1079 if ( (c & 0xC0) != 0x80 )
1080 return wxCONV_FAILED;
0286d08d 1081
5367a38a
VS
1082 code <<= 6;
1083 code |= c & 0x3F;
1084 }
0286d08d
VZ
1085 }
1086
1087#ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1090 {
1091 if ( out )
1092 out++;
1093 written++;
1094 }
1095#else // !WC_UTF16
1096 if ( out )
1097 *out = code;
1098#endif // WC_UTF16/!WC_UTF16
1099
1100 if ( out )
1101 out++;
1102
1103 written++;
1104 }
1105
1106 return wxCONV_FAILED;
1107}
1108
1109size_t
1110wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111 const wchar_t *src, size_t srcLen) const
1112{
1113 char *out = dstLen ? dst : NULL;
1114 size_t written = 0;
1115
1116 for ( const wchar_t *wp = src; ; wp++ )
1117 {
0dcbb107 1118 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
0286d08d
VZ
1119 {
1120 // all done successfully, just add the trailing NULL if we are not
1121 // using explicit length
1122 if ( srcLen == wxNO_LEN )
1123 {
1124 if ( out )
1125 {
1126 if ( !dstLen )
1127 break;
1128
1129 *out = '\0';
1130 }
1131
1132 written++;
1133 }
1134
1135 return written;
1136 }
1137
a964d3ed
VZ
1138 if ( srcLen != wxNO_LEN )
1139 srcLen--;
0286d08d
VZ
1140
1141 wxUint32 code;
1142#ifdef WC_UTF16
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145 {
1146 // skip the next char too as we decoded a surrogate
1147 wp++;
1148 }
1149#else // wchar_t is UTF-32
1150 code = *wp & 0x7fffffff;
1151#endif
1152
1153 unsigned len;
1154 if ( code <= 0x7F )
1155 {
1156 len = 1;
1157 if ( out )
1158 {
1159 if ( dstLen < len )
1160 break;
1161
1162 out[0] = (char)code;
1163 }
1164 }
1165 else if ( code <= 0x07FF )
1166 {
1167 len = 2;
1168 if ( out )
1169 {
1170 if ( dstLen < len )
1171 break;
1172
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1176 out[0] = 0xC0 | code;
1177 }
1178 }
1179 else if ( code < 0xFFFF )
1180 {
1181 len = 3;
1182 if ( out )
1183 {
1184 if ( dstLen < len )
1185 break;
1186
1187 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1188 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[0] = 0xE0 | code;
1190 }
1191 }
1192 else if ( code <= 0x10FFFF )
1193 {
1194 len = 4;
1195 if ( out )
1196 {
1197 if ( dstLen < len )
1198 break;
1199
1200 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1201 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[0] = 0xF0 | code;
1204 }
1205 }
1206 else
1207 {
9a83f860 1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1209 break;
1210 }
1211
1212 if ( out )
1213 {
1214 out += len;
1215 dstLen -= len;
1216 }
1217
1218 written += len;
1219 }
1220
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED;
1223}
1224
d16d0917
VZ
1225size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1226 const char *psz, size_t srcLen) const
6001e347 1227{
0286d08d 1228 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1229 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1230
4def3b35
VS
1231 size_t len = 0;
1232
f4cb7c58
VZ
1233 // The length can be either given explicitly or computed implicitly for the
1234 // NUL-terminated strings.
1235 const bool isNulTerminated = srcLen == wxNO_LEN;
1236 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1237 {
ea8ce907
RR
1238 const char *opsz = psz;
1239 bool invalid = false;
4def3b35
VS
1240 unsigned char cc = *psz++, fc = cc;
1241 unsigned cnt;
dccce9ea 1242 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1243 fc <<= 1;
ef199164 1244
dccce9ea 1245 if (!cnt)
4def3b35
VS
1246 {
1247 // plain ASCII char
dccce9ea 1248 if (buf)
4def3b35
VS
1249 *buf++ = cc;
1250 len++;
561488ef
MW
1251
1252 // escape the escape character for octal escapes
1253 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1254 && cc == '\\' && (!buf || len < n))
1255 {
1256 if (buf)
1257 *buf++ = cc;
1258 len++;
1259 }
dccce9ea
VZ
1260 }
1261 else
4def3b35
VS
1262 {
1263 cnt--;
dccce9ea 1264 if (!cnt)
4def3b35
VS
1265 {
1266 // invalid UTF-8 sequence
ea8ce907 1267 invalid = true;
dccce9ea
VZ
1268 }
1269 else
4def3b35
VS
1270 {
1271 unsigned ocnt = cnt - 1;
1272 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1273 while (cnt--)
4def3b35 1274 {
ea8ce907 1275 cc = *psz;
dccce9ea 1276 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1277 {
1278 // invalid UTF-8 sequence
ea8ce907
RR
1279 invalid = true;
1280 break;
4def3b35 1281 }
ef199164 1282
ea8ce907 1283 psz++;
4def3b35
VS
1284 res = (res << 6) | (cc & 0x3f);
1285 }
ef199164 1286
ea8ce907 1287 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1288 {
1289 // illegal UTF-8 encoding
ea8ce907 1290 invalid = true;
4def3b35 1291 }
ea8ce907
RR
1292 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1293 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1294 {
1295 // if one of our PUA characters turns up externally
1296 // it must also be treated as an illegal sequence
1297 // (a bit like you have to escape an escape character)
1298 invalid = true;
1299 }
1300 else
1301 {
1cd52418 1302#ifdef WC_UTF16
0286d08d 1303 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1304 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1305 if (pa == wxCONV_FAILED)
ea8ce907
RR
1306 {
1307 invalid = true;
1308 }
1309 else
1310 {
1311 if (buf)
1312 buf += pa;
1313 len += pa;
1314 }
373658eb 1315#else // !WC_UTF16
ea8ce907 1316 if (buf)
38d4b1e4 1317 *buf++ = (wchar_t)res;
ea8ce907 1318 len++;
373658eb 1319#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1320 }
1321 }
ef199164 1322
ea8ce907
RR
1323 if (invalid)
1324 {
1325 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1326 {
1327 while (opsz < psz && (!buf || len < n))
1328 {
1329#ifdef WC_UTF16
1330 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1331 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1332 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1333 if (buf)
1334 buf += pa;
1335 opsz++;
1336 len += pa;
1337#else
1338 if (buf)
38d4b1e4 1339 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1340 opsz++;
1341 len++;
1342#endif
1343 }
1344 }
3698ae71 1345 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1346 {
1347 while (opsz < psz && (!buf || len < n))
1348 {
3698ae71
VZ
1349 if ( buf && len + 3 < n )
1350 {
17a1ebd1 1351 unsigned char on = *opsz;
3698ae71 1352 *buf++ = L'\\';
17a1ebd1
VZ
1353 *buf++ = (wchar_t)( L'0' + on / 0100 );
1354 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1355 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1356 }
ef199164 1357
ea8ce907
RR
1358 opsz++;
1359 len += 4;
1360 }
1361 }
3698ae71 1362 else // MAP_INVALID_UTF8_NOT
ea8ce907 1363 {
467e0479 1364 return wxCONV_FAILED;
ea8ce907 1365 }
4def3b35
VS
1366 }
1367 }
6001e347 1368 }
ef199164 1369
f4cb7c58
VZ
1370 if ( isNulTerminated )
1371 {
1372 // Add the trailing NUL in this case if we have a large enough buffer.
1373 if ( buf && (len < n) )
1374 *buf = 0;
ef199164 1375
f4cb7c58
VZ
1376 // And count it in any case.
1377 len++;
1378 }
1379
1380 return len;
6001e347
RR
1381}
1382
// check if the given wide character is an octal digit, i.e. L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1387
d16d0917
VZ
1388size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1389 const wchar_t *psz, size_t srcLen) const
6001e347 1390{
0286d08d 1391 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1392 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1393
4def3b35 1394 size_t len = 0;
6001e347 1395
d16d0917 1396 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1397 {
1398 wxUint32 cc;
ef199164 1399
1cd52418 1400#ifdef WC_UTF16
b5153fd8
VZ
1401 // cast is ok for WC_UTF16
1402 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1403 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1404#else
ef199164 1405 cc = (*psz++) & 0x7fffffff;
4def3b35 1406#endif
3698ae71
VZ
1407
1408 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1409 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1410 {
dccce9ea 1411 if (buf)
ea8ce907 1412 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1413 len++;
3698ae71 1414 }
561488ef
MW
1415 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1416 && cc == L'\\' && psz[0] == L'\\' )
1417 {
1418 if (buf)
1419 *buf++ = (char)cc;
1420 psz++;
1421 len++;
1422 }
3698ae71
VZ
1423 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1424 cc == L'\\' &&
1425 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1426 {
dccce9ea 1427 if (buf)
3698ae71 1428 {
ef199164
DS
1429 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1430 (psz[1] - L'0') * 010 +
b2c13097 1431 (psz[2] - L'0'));
3698ae71
VZ
1432 }
1433
1434 psz += 3;
ea8ce907
RR
1435 len++;
1436 }
1437 else
1438 {
1439 unsigned cnt;
ef199164
DS
1440 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1441 {
1442 }
1443
ea8ce907 1444 if (!cnt)
4def3b35 1445 {
ea8ce907
RR
1446 // plain ASCII char
1447 if (buf)
1448 *buf++ = (char) cc;
1449 len++;
1450 }
ea8ce907
RR
1451 else
1452 {
1453 len += cnt + 1;
1454 if (buf)
1455 {
1456 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1457 while (cnt--)
1458 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1459 }
4def3b35
VS
1460 }
1461 }
6001e347 1462 }
4def3b35 1463
d16d0917 1464 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1465 *buf = 0;
adb45366 1466
d16d0917 1467 return len + 1;
6001e347
RR
1468}
1469
467e0479 1470// ============================================================================
c91830cb 1471// UTF-16
467e0479 1472// ============================================================================
c91830cb
VZ
1473
1474#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1475 #define wxMBConvUTF16straight wxMBConvUTF16BE
1476 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1477#else
bde4baac
VZ
1478 #define wxMBConvUTF16swap wxMBConvUTF16BE
1479 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1480#endif
1481
467e0479
VZ
1482/* static */
1483size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1484{
1485 if ( srcLen == wxNO_LEN )
1486 {
1487 // count the number of bytes in input, including the trailing NULs
5c33522f 1488 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1489 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1490 ;
c91830cb 1491
467e0479
VZ
1492 srcLen *= BYTES_PER_CHAR;
1493 }
1494 else // we already have the length
1495 {
1496 // we can only convert an entire number of UTF-16 characters
1497 if ( srcLen % BYTES_PER_CHAR )
1498 return wxCONV_FAILED;
1499 }
1500
1501 return srcLen;
1502}
1503
1504// case when in-memory representation is UTF-16 too
c91830cb
VZ
1505#ifdef WC_UTF16
1506
467e0479
VZ
1507// ----------------------------------------------------------------------------
1508// conversions without endianness change
1509// ----------------------------------------------------------------------------
1510
1511size_t
1512wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1513 const char *src, size_t srcLen) const
c91830cb 1514{
467e0479
VZ
1515 // set up the scene for using memcpy() (which is presumably more efficient
1516 // than copying the bytes one by one)
1517 srcLen = GetLength(src, srcLen);
1518 if ( srcLen == wxNO_LEN )
1519 return wxCONV_FAILED;
c91830cb 1520
ef199164 1521 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1522 if ( dst )
c91830cb 1523 {
467e0479
VZ
1524 if ( dstLen < inLen )
1525 return wxCONV_FAILED;
c91830cb 1526
467e0479 1527 memcpy(dst, src, srcLen);
c91830cb 1528 }
d32a507d 1529
467e0479 1530 return inLen;
c91830cb
VZ
1531}
1532
467e0479
VZ
1533size_t
1534wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1535 const wchar_t *src, size_t srcLen) const
c91830cb 1536{
467e0479
VZ
1537 if ( srcLen == wxNO_LEN )
1538 srcLen = wxWcslen(src) + 1;
c91830cb 1539
467e0479
VZ
1540 srcLen *= BYTES_PER_CHAR;
1541
1542 if ( dst )
c91830cb 1543 {
467e0479
VZ
1544 if ( dstLen < srcLen )
1545 return wxCONV_FAILED;
d32a507d 1546
467e0479 1547 memcpy(dst, src, srcLen);
c91830cb 1548 }
d32a507d 1549
467e0479 1550 return srcLen;
c91830cb
VZ
1551}
1552
467e0479
VZ
1553// ----------------------------------------------------------------------------
1554// endian-reversing conversions
1555// ----------------------------------------------------------------------------
c91830cb 1556
467e0479
VZ
1557size_t
1558wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1559 const char *src, size_t srcLen) const
c91830cb 1560{
467e0479
VZ
1561 srcLen = GetLength(src, srcLen);
1562 if ( srcLen == wxNO_LEN )
1563 return wxCONV_FAILED;
c91830cb 1564
467e0479
VZ
1565 srcLen /= BYTES_PER_CHAR;
1566
1567 if ( dst )
c91830cb 1568 {
467e0479
VZ
1569 if ( dstLen < srcLen )
1570 return wxCONV_FAILED;
1571
5c33522f 1572 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1573 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1574 {
ef199164 1575 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1576 }
c91830cb 1577 }
bfab25d4 1578
467e0479 1579 return srcLen;
c91830cb
VZ
1580}
1581
467e0479
VZ
1582size_t
1583wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1584 const wchar_t *src, size_t srcLen) const
c91830cb 1585{
467e0479
VZ
1586 if ( srcLen == wxNO_LEN )
1587 srcLen = wxWcslen(src) + 1;
c91830cb 1588
467e0479
VZ
1589 srcLen *= BYTES_PER_CHAR;
1590
1591 if ( dst )
c91830cb 1592 {
467e0479
VZ
1593 if ( dstLen < srcLen )
1594 return wxCONV_FAILED;
1595
5c33522f 1596 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1597 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1598 {
ef199164 1599 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1600 }
c91830cb 1601 }
eec47cc6 1602
467e0479 1603 return srcLen;
c91830cb
VZ
1604}
1605
467e0479 1606#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1607
467e0479
VZ
1608// ----------------------------------------------------------------------------
1609// conversions without endianness change
1610// ----------------------------------------------------------------------------
c91830cb 1611
35d11700
VZ
1612size_t
1613wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1614 const char *src, size_t srcLen) const
c91830cb 1615{
35d11700
VZ
1616 srcLen = GetLength(src, srcLen);
1617 if ( srcLen == wxNO_LEN )
1618 return wxCONV_FAILED;
c91830cb 1619
ef199164 1620 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1621 if ( !dst )
c91830cb 1622 {
35d11700
VZ
1623 // optimization: return maximal space which could be needed for this
1624 // string even if the real size could be smaller if the buffer contains
1625 // any surrogates
1626 return inLen;
c91830cb 1627 }
c91830cb 1628
35d11700 1629 size_t outLen = 0;
5c33522f 1630 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1631 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1632 {
ef199164
DS
1633 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1634 if ( !inBuff )
35d11700
VZ
1635 return wxCONV_FAILED;
1636
1637 if ( ++outLen > dstLen )
1638 return wxCONV_FAILED;
c91830cb 1639
35d11700
VZ
1640 *dst++ = ch;
1641 }
1642
1643
1644 return outLen;
1645}
c91830cb 1646
35d11700
VZ
1647size_t
1648wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1649 const wchar_t *src, size_t srcLen) const
c91830cb 1650{
35d11700
VZ
1651 if ( srcLen == wxNO_LEN )
1652 srcLen = wxWcslen(src) + 1;
c91830cb 1653
35d11700 1654 size_t outLen = 0;
5c33522f 1655 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1656 for ( size_t n = 0; n < srcLen; n++ )
c91830cb 1657 {
d883acaa 1658 wxUint16 cc[2] = { 0 };
35d11700
VZ
1659 const size_t numChars = encode_utf16(*src++, cc);
1660 if ( numChars == wxCONV_FAILED )
1661 return wxCONV_FAILED;
c91830cb 1662
ef199164
DS
1663 outLen += numChars * BYTES_PER_CHAR;
1664 if ( outBuff )
c91830cb 1665 {
35d11700
VZ
1666 if ( outLen > dstLen )
1667 return wxCONV_FAILED;
1668
ef199164 1669 *outBuff++ = cc[0];
35d11700 1670 if ( numChars == 2 )
69b80d28 1671 {
35d11700 1672 // second character of a surrogate
ef199164 1673 *outBuff++ = cc[1];
69b80d28 1674 }
c91830cb 1675 }
c91830cb 1676 }
c91830cb 1677
35d11700 1678 return outLen;
c91830cb
VZ
1679}
1680
467e0479
VZ
1681// ----------------------------------------------------------------------------
1682// endian-reversing conversions
1683// ----------------------------------------------------------------------------
c91830cb 1684
35d11700
VZ
1685size_t
1686wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1687 const char *src, size_t srcLen) const
c91830cb 1688{
35d11700
VZ
1689 srcLen = GetLength(src, srcLen);
1690 if ( srcLen == wxNO_LEN )
1691 return wxCONV_FAILED;
1692
ef199164 1693 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1694 if ( !dst )
1695 {
1696 // optimization: return maximal space which could be needed for this
1697 // string even if the real size could be smaller if the buffer contains
1698 // any surrogates
1699 return inLen;
1700 }
c91830cb 1701
35d11700 1702 size_t outLen = 0;
5c33522f 1703 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1704 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1705 {
35d11700
VZ
1706 wxUint32 ch;
1707 wxUint16 tmp[2];
ef199164
DS
1708
1709 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1710 inBuff++;
1711 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1712
35d11700
VZ
1713 const size_t numChars = decode_utf16(tmp, ch);
1714 if ( numChars == wxCONV_FAILED )
1715 return wxCONV_FAILED;
c91830cb 1716
35d11700 1717 if ( numChars == 2 )
ef199164 1718 inBuff++;
35d11700
VZ
1719
1720 if ( ++outLen > dstLen )
1721 return wxCONV_FAILED;
c91830cb 1722
35d11700 1723 *dst++ = ch;
c91830cb 1724 }
c91830cb 1725
c91830cb 1726
35d11700
VZ
1727 return outLen;
1728}
c91830cb 1729
35d11700
VZ
1730size_t
1731wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1732 const wchar_t *src, size_t srcLen) const
c91830cb 1733{
35d11700
VZ
1734 if ( srcLen == wxNO_LEN )
1735 srcLen = wxWcslen(src) + 1;
c91830cb 1736
35d11700 1737 size_t outLen = 0;
5c33522f 1738 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1739 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb 1740 {
d883acaa 1741 wxUint16 cc[2] = { 0 };
35d11700
VZ
1742 const size_t numChars = encode_utf16(*src, cc);
1743 if ( numChars == wxCONV_FAILED )
1744 return wxCONV_FAILED;
c91830cb 1745
ef199164
DS
1746 outLen += numChars * BYTES_PER_CHAR;
1747 if ( outBuff )
c91830cb 1748 {
35d11700
VZ
1749 if ( outLen > dstLen )
1750 return wxCONV_FAILED;
1751
ef199164 1752 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1753 if ( numChars == 2 )
c91830cb 1754 {
35d11700 1755 // second character of a surrogate
ef199164 1756 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1757 }
1758 }
c91830cb 1759 }
c91830cb 1760
35d11700 1761 return outLen;
c91830cb
VZ
1762}
1763
467e0479 1764#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1765
1766
35d11700 1767// ============================================================================
c91830cb 1768// UTF-32
35d11700 1769// ============================================================================
c91830cb
VZ
1770
1771#ifdef WORDS_BIGENDIAN
467e0479
VZ
1772 #define wxMBConvUTF32straight wxMBConvUTF32BE
1773 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1774#else
467e0479
VZ
1775 #define wxMBConvUTF32swap wxMBConvUTF32BE
1776 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1777#endif
1778
1779
1780WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1781WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1782
467e0479
VZ
1783/* static */
1784size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1785{
1786 if ( srcLen == wxNO_LEN )
1787 {
1788 // count the number of bytes in input, including the trailing NULs
5c33522f 1789 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1790 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1791 ;
c91830cb 1792
467e0479
VZ
1793 srcLen *= BYTES_PER_CHAR;
1794 }
1795 else // we already have the length
1796 {
1797 // we can only convert an entire number of UTF-32 characters
1798 if ( srcLen % BYTES_PER_CHAR )
1799 return wxCONV_FAILED;
1800 }
1801
1802 return srcLen;
1803}
1804
1805// case when in-memory representation is UTF-16
c91830cb
VZ
1806#ifdef WC_UTF16
1807
467e0479
VZ
1808// ----------------------------------------------------------------------------
1809// conversions without endianness change
1810// ----------------------------------------------------------------------------
1811
1812size_t
1813wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1814 const char *src, size_t srcLen) const
c91830cb 1815{
467e0479
VZ
1816 srcLen = GetLength(src, srcLen);
1817 if ( srcLen == wxNO_LEN )
1818 return wxCONV_FAILED;
c91830cb 1819
5c33522f 1820 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1821 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1822 size_t outLen = 0;
1823 for ( size_t n = 0; n < inLen; n++ )
c91830cb 1824 {
d883acaa 1825 wxUint16 cc[2] = { 0 };
ef199164 1826 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1827 if ( numChars == wxCONV_FAILED )
1828 return wxCONV_FAILED;
c91830cb 1829
467e0479
VZ
1830 outLen += numChars;
1831 if ( dst )
c91830cb 1832 {
467e0479
VZ
1833 if ( outLen > dstLen )
1834 return wxCONV_FAILED;
d32a507d 1835
467e0479
VZ
1836 *dst++ = cc[0];
1837 if ( numChars == 2 )
1838 {
1839 // second character of a surrogate
1840 *dst++ = cc[1];
1841 }
1842 }
c91830cb 1843 }
d32a507d 1844
467e0479 1845 return outLen;
c91830cb
VZ
1846}
1847
467e0479
VZ
1848size_t
1849wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1850 const wchar_t *src, size_t srcLen) const
c91830cb 1851{
467e0479
VZ
1852 if ( srcLen == wxNO_LEN )
1853 srcLen = wxWcslen(src) + 1;
c91830cb 1854
467e0479 1855 if ( !dst )
c91830cb 1856 {
467e0479
VZ
1857 // optimization: return maximal space which could be needed for this
1858 // string instead of the exact amount which could be less if there are
1859 // any surrogates in the input
1860 //
1861 // we consider that surrogates are rare enough to make it worthwhile to
1862 // avoid running the loop below at the cost of slightly extra memory
1863 // consumption
ef199164 1864 return srcLen * BYTES_PER_CHAR;
467e0479 1865 }
c91830cb 1866
5c33522f 1867 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1868 size_t outLen = 0;
1869 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1870 {
1871 const wxUint32 ch = wxDecodeSurrogate(&src);
1872 if ( !src )
1873 return wxCONV_FAILED;
c91830cb 1874
467e0479 1875 outLen += BYTES_PER_CHAR;
d32a507d 1876
467e0479
VZ
1877 if ( outLen > dstLen )
1878 return wxCONV_FAILED;
b5153fd8 1879
ef199164 1880 *outBuff++ = ch;
467e0479 1881 }
c91830cb 1882
467e0479 1883 return outLen;
c91830cb
VZ
1884}
1885
467e0479
VZ
1886// ----------------------------------------------------------------------------
1887// endian-reversing conversions
1888// ----------------------------------------------------------------------------
c91830cb 1889
467e0479
VZ
1890size_t
1891wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1892 const char *src, size_t srcLen) const
c91830cb 1893{
467e0479
VZ
1894 srcLen = GetLength(src, srcLen);
1895 if ( srcLen == wxNO_LEN )
1896 return wxCONV_FAILED;
c91830cb 1897
5c33522f 1898 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1899 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1900 size_t outLen = 0;
ef199164 1901 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1902 {
d883acaa 1903 wxUint16 cc[2] = { 0 };
ef199164 1904 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1905 if ( numChars == wxCONV_FAILED )
1906 return wxCONV_FAILED;
c91830cb 1907
467e0479
VZ
1908 outLen += numChars;
1909 if ( dst )
c91830cb 1910 {
467e0479
VZ
1911 if ( outLen > dstLen )
1912 return wxCONV_FAILED;
d32a507d 1913
467e0479
VZ
1914 *dst++ = cc[0];
1915 if ( numChars == 2 )
1916 {
1917 // second character of a surrogate
1918 *dst++ = cc[1];
1919 }
1920 }
c91830cb 1921 }
b5153fd8 1922
467e0479 1923 return outLen;
c91830cb
VZ
1924}
1925
467e0479
VZ
1926size_t
1927wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1928 const wchar_t *src, size_t srcLen) const
c91830cb 1929{
467e0479
VZ
1930 if ( srcLen == wxNO_LEN )
1931 srcLen = wxWcslen(src) + 1;
c91830cb 1932
467e0479 1933 if ( !dst )
c91830cb 1934 {
467e0479
VZ
1935 // optimization: return maximal space which could be needed for this
1936 // string instead of the exact amount which could be less if there are
1937 // any surrogates in the input
1938 //
1939 // we consider that surrogates are rare enough to make it worthwhile to
1940 // avoid running the loop below at the cost of slightly extra memory
1941 // consumption
1942 return srcLen*BYTES_PER_CHAR;
1943 }
c91830cb 1944
5c33522f 1945 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1946 size_t outLen = 0;
1947 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1948 {
1949 const wxUint32 ch = wxDecodeSurrogate(&src);
1950 if ( !src )
1951 return wxCONV_FAILED;
c91830cb 1952
467e0479 1953 outLen += BYTES_PER_CHAR;
d32a507d 1954
467e0479
VZ
1955 if ( outLen > dstLen )
1956 return wxCONV_FAILED;
b5153fd8 1957
ef199164 1958 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1959 }
c91830cb 1960
467e0479 1961 return outLen;
c91830cb
VZ
1962}
1963
467e0479 1964#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1965
35d11700
VZ
1966// ----------------------------------------------------------------------------
1967// conversions without endianness change
1968// ----------------------------------------------------------------------------
1969
1970size_t
1971wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1972 const char *src, size_t srcLen) const
c91830cb 1973{
35d11700
VZ
1974 // use memcpy() as it should be much faster than hand-written loop
1975 srcLen = GetLength(src, srcLen);
1976 if ( srcLen == wxNO_LEN )
1977 return wxCONV_FAILED;
c91830cb 1978
35d11700
VZ
1979 const size_t inLen = srcLen/BYTES_PER_CHAR;
1980 if ( dst )
c91830cb 1981 {
35d11700
VZ
1982 if ( dstLen < inLen )
1983 return wxCONV_FAILED;
b5153fd8 1984
35d11700
VZ
1985 memcpy(dst, src, srcLen);
1986 }
c91830cb 1987
35d11700 1988 return inLen;
c91830cb
VZ
1989}
1990
35d11700
VZ
1991size_t
1992wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1993 const wchar_t *src, size_t srcLen) const
c91830cb 1994{
35d11700
VZ
1995 if ( srcLen == wxNO_LEN )
1996 srcLen = wxWcslen(src) + 1;
1997
1998 srcLen *= BYTES_PER_CHAR;
c91830cb 1999
35d11700 2000 if ( dst )
c91830cb 2001 {
35d11700
VZ
2002 if ( dstLen < srcLen )
2003 return wxCONV_FAILED;
c91830cb 2004
35d11700 2005 memcpy(dst, src, srcLen);
c91830cb
VZ
2006 }
2007
35d11700 2008 return srcLen;
c91830cb
VZ
2009}
2010
35d11700
VZ
2011// ----------------------------------------------------------------------------
2012// endian-reversing conversions
2013// ----------------------------------------------------------------------------
c91830cb 2014
35d11700
VZ
2015size_t
2016wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2017 const char *src, size_t srcLen) const
c91830cb 2018{
35d11700
VZ
2019 srcLen = GetLength(src, srcLen);
2020 if ( srcLen == wxNO_LEN )
2021 return wxCONV_FAILED;
2022
2023 srcLen /= BYTES_PER_CHAR;
c91830cb 2024
35d11700 2025 if ( dst )
c91830cb 2026 {
35d11700
VZ
2027 if ( dstLen < srcLen )
2028 return wxCONV_FAILED;
2029
5c33522f 2030 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 2031 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 2032 {
ef199164 2033 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 2034 }
c91830cb 2035 }
b5153fd8 2036
35d11700 2037 return srcLen;
c91830cb
VZ
2038}
2039
35d11700
VZ
2040size_t
2041wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2042 const wchar_t *src, size_t srcLen) const
c91830cb 2043{
35d11700
VZ
2044 if ( srcLen == wxNO_LEN )
2045 srcLen = wxWcslen(src) + 1;
2046
2047 srcLen *= BYTES_PER_CHAR;
c91830cb 2048
35d11700 2049 if ( dst )
c91830cb 2050 {
35d11700
VZ
2051 if ( dstLen < srcLen )
2052 return wxCONV_FAILED;
2053
5c33522f 2054 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2055 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2056 {
ef199164 2057 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2058 }
c91830cb 2059 }
b5153fd8 2060
35d11700 2061 return srcLen;
c91830cb
VZ
2062}
2063
467e0479 2064#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2065
2066
36acb880
VZ
2067// ============================================================================
2068// The classes doing conversion using the iconv_xxx() functions
2069// ============================================================================
3caec1bb 2070
b040e242 2071#ifdef HAVE_ICONV
3a0d76bc 2072
b1d547eb
VS
2073// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2074// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2075// (unless there's yet another bug in glibc) the only case when iconv()
2076// returns with (size_t)-1 (which means error) and says there are 0 bytes
2077// left in the input buffer -- when _real_ error occurs,
2078// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2079// iconv() failure.
3caec1bb
VS
2080// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tells whether an iconv() call which returned
// cres and left bufLeft bytes in its input buffer must be treated as failed.
//
// Both macro arguments are parenthesized in the expansion so that passing an
// expression (and not just a plain variable) can't change the meaning of the
// comparison because of operator precedence.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif
2087
// Cast applied to the address of the input pointer in every iconv() call of
// this file (ICONV_CONST is presumably provided by the build configuration
// to match the prototype of the platform iconv() -- TODO confirm).
ab217dba 2088#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 2089
74a7eb0b
VZ
// Value the iconv_t handles are compared with to detect a descriptor which
// couldn't be opened.
2090#define ICONV_T_INVALID ((iconv_t)-1)
2091
// WC_BSWAP is the byte swapping macro and WC_ENC the wx encoding constant
// selected according to the size of wchar_t; other sizes are rejected at
// compile time.
2092#if SIZEOF_WCHAR_T == 4
2093 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2094 #define WC_ENC wxFONTENCODING_UTF32
2095#elif SIZEOF_WCHAR_T == 2
2096 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2097 #define WC_ENC wxFONTENCODING_UTF16
2098#else // sizeof(wchar_t) != 2 nor 4
2099 // does this ever happen?
2100 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2101#endif
2102
36acb880 2103// ----------------------------------------------------------------------------
e95354ec 2104// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2105// ----------------------------------------------------------------------------
2106
e95354ec 2107class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2108{
2109public:
86501081 2110 wxMBConv_iconv(const char *name);
e95354ec 2111 virtual ~wxMBConv_iconv();
36acb880 2112
8f4b0f43
VZ
2113 // implement base class virtual methods
2114 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2115 const char *src, size_t srcLen = wxNO_LEN) const;
2116 virtual size_t FromWChar(char *dst, size_t dstLen,
2117 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2118 virtual size_t GetMBNulLen() const;
2119
ba98e032
VS
2120#if wxUSE_UNICODE_UTF8
2121 virtual bool IsUTF8() const;
2122#endif
2123
d36c9347
VZ
2124 virtual wxMBConv *Clone() const
2125 {
b64f93b6 2126 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
d36c9347
VZ
2127 p->m_minMBCharWidth = m_minMBCharWidth;
2128 return p;
2129 }
2130
e95354ec 2131 bool IsOk() const
74a7eb0b 2132 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2133
2134protected:
ef199164
DS
2135 // the iconv handlers used to translate from multibyte
2136 // to wide char and in the other direction
36acb880
VZ
2137 iconv_t m2w,
2138 w2m;
ef199164 2139
b1d547eb
VS
2140#if wxUSE_THREADS
2141 // guards access to m2w and w2m objects
2142 wxMutex m_iconvMutex;
2143#endif
36acb880
VZ
2144
2145private:
e95354ec 2146 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2147 // available on this machine, it will remain NULL
74a7eb0b 2148 static wxString ms_wcCharsetName;
36acb880
VZ
2149
2150 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2151 // different endian-ness than the native one
405d8f46 2152 static bool ms_wcNeedsSwap;
eec47cc6 2153
d36c9347
VZ
2154
2155 // name of the encoding handled by this conversion
b64f93b6 2156 const char *m_name;
d36c9347 2157
7ef3ab50 2158 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2159 // initially
2160 size_t m_minMBCharWidth;
36acb880
VZ
2161};
2162
8f115891 2163// make the constructor available for unit testing
86501081 2164WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2165{
2166 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2167 if ( !result->IsOk() )
2168 {
2169 delete result;
2170 return 0;
2171 }
ef199164 2172
8f115891
MW
2173 return result;
2174}
2175
// Definitions of the static members of wxMBConv_iconv: the wide char charset
// name is empty and byte swapping is off until the constructor finds a
// working charset and determines its byte order.
422e411e 2176wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2177bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2178
86501081 2179wxMBConv_iconv::wxMBConv_iconv(const char *name)
b64f93b6 2180 : m_name(wxStrdup(name))
36acb880 2181{
c1464d9d 2182 m_minMBCharWidth = 0;
eec47cc6 2183
36acb880 2184 // check for charset that represents wchar_t:
74a7eb0b 2185 if ( ms_wcCharsetName.empty() )
f1339c56 2186 {
9a83f860 2187 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2188
74a7eb0b 2189#if wxUSE_FONTMAP
a243da29 2190 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
74a7eb0b 2191#else // !wxUSE_FONTMAP
a243da29 2192 static const wxChar *const names_static[] =
36acb880 2193 {
74a7eb0b 2194#if SIZEOF_WCHAR_T == 4
9a83f860 2195 wxT("UCS-4"),
da2f1172 2196#elif SIZEOF_WCHAR_T == 2
9a83f860 2197 wxT("UCS-2"),
74a7eb0b
VZ
2198#endif
2199 NULL
2200 };
a243da29 2201 const wxChar *const *names = names_static;
74a7eb0b 2202#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2203
d1f024a8 2204 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2205 {
17a1ebd1 2206 const wxString nameCS(*names);
74a7eb0b
VZ
2207
2208 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2209 wxString nameXE(nameCS);
ef199164
DS
2210
2211#ifdef WORDS_BIGENDIAN
9a83f860 2212 nameXE += wxT("BE");
ef199164 2213#else // little endian
9a83f860 2214 nameXE += wxT("LE");
ef199164 2215#endif
74a7eb0b 2216
9a83f860 2217 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2218 nameXE.c_str());
2219
86501081 2220 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2221 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2222 {
74a7eb0b 2223 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2224 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2225 nameCS.c_str());
86501081 2226 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2227
74a7eb0b
VZ
2228 // and check for bytesex ourselves:
2229 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2230 {
74a7eb0b 2231 char buf[2], *bufPtr;
e8769ed1 2232 wchar_t wbuf[2];
74a7eb0b
VZ
2233 size_t insz, outsz;
2234 size_t res;
2235
2236 buf[0] = 'A';
2237 buf[1] = 0;
2238 wbuf[0] = 0;
2239 insz = 2;
2240 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2241 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2242 bufPtr = buf;
2243
ef199164
DS
2244 res = iconv(
2245 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2246 &wbufPtr, &outsz);
74a7eb0b
VZ
2247
2248 if (ICONV_FAILED(res, insz))
2249 {
2250 wxLogLastError(wxT("iconv"));
422e411e 2251 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2252 nameCS.c_str());
74a7eb0b
VZ
2253 }
2254 else // ok, can convert to this encoding, remember it
2255 {
17a1ebd1 2256 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2257 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2258 }
3a0d76bc
VS
2259 }
2260 }
74a7eb0b 2261 else // use charset not requiring byte swapping
36acb880 2262 {
74a7eb0b 2263 ms_wcCharsetName = nameXE;
36acb880 2264 }
3a0d76bc 2265 }
74a7eb0b 2266
0944fceb 2267 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2268 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2269 ms_wcCharsetName.empty() ? wxString("<none>")
2270 : ms_wcCharsetName,
9a83f860
VZ
2271 ms_wcNeedsSwap ? wxT(" (needs swap)")
2272 : wxT(""));
3a0d76bc 2273 }
36acb880 2274 else // we already have ms_wcCharsetName
3caec1bb 2275 {
86501081 2276 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2277 }
dccce9ea 2278
74a7eb0b 2279 if ( ms_wcCharsetName.empty() )
f1339c56 2280 {
74a7eb0b 2281 w2m = ICONV_T_INVALID;
36acb880 2282 }
405d8f46
VZ
2283 else
2284 {
86501081 2285 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2286 if ( w2m == ICONV_T_INVALID )
2287 {
2288 wxLogTrace(TRACE_STRCONV,
2289 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2290 ms_wcCharsetName.c_str(), name);
74a7eb0b 2291 }
405d8f46 2292 }
36acb880 2293}
3caec1bb 2294
e95354ec 2295wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2296{
b64f93b6
VZ
2297 free(const_cast<char *>(m_name));
2298
74a7eb0b 2299 if ( m2w != ICONV_T_INVALID )
36acb880 2300 iconv_close(m2w);
74a7eb0b 2301 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2302 iconv_close(w2m);
2303}
3a0d76bc 2304
8f4b0f43
VZ
2305size_t
2306wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2307 const char *src, size_t srcLen) const
36acb880 2308{
8f4b0f43 2309 if ( srcLen == wxNO_LEN )
69373110 2310 {
8f4b0f43
VZ
2311 // find the string length: notice that must be done differently for
2312 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2313 // consecutive NULs
2314 const size_t nulLen = GetMBNulLen();
2315 switch ( nulLen )
2316 {
2317 default:
2318 return wxCONV_FAILED;
69373110 2319
8f4b0f43
VZ
2320 case 1:
2321 srcLen = strlen(src); // arguably more optimized than our version
2322 break;
69373110 2323
8f4b0f43
VZ
2324 case 2:
2325 case 4:
2326 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2327 // but they also have to start at character boundary and not
2328 // span two adjacent characters
2329 const char *p;
2330 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2331 ;
2332 srcLen = p - src;
2333 break;
2334 }
d50c0831
VZ
2335
2336 // when we're determining the length of the string ourselves we count
2337 // the terminating NUL(s) as part of it and always NUL-terminate the
2338 // output
2339 srcLen += nulLen;
69373110
VZ
2340 }
2341
8f4b0f43
VZ
2342 // we express length in the number of (wide) characters but iconv always
2343 // counts buffer sizes it in bytes
2344 dstLen *= SIZEOF_WCHAR_T;
2345
b1d547eb 2346#if wxUSE_THREADS
6a17b868
SN
2347 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2348 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2349 // wxConvLocal that are used all over wx code, so we have to make sure
2350 // the handle is used by at most one thread at the time. Otherwise
2351 // only a few wx classes would be safe to use from non-main threads
2352 // as MB<->WC conversion would fail "randomly".
2353 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2354#endif // wxUSE_THREADS
2355
36acb880 2356 size_t res, cres;
8f4b0f43 2357 const char *pszPtr = src;
36acb880 2358
8f4b0f43 2359 if ( dst )
36acb880 2360 {
8f4b0f43 2361 char* bufPtr = (char*)dst;
e8769ed1 2362
36acb880 2363 // have destination buffer, convert there
1752fda6 2364 size_t dstLenOrig = dstLen;
36acb880 2365 cres = iconv(m2w,
8f4b0f43
VZ
2366 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2367 &bufPtr, &dstLen);
1752fda6
VZ
2368
2369 // convert the number of bytes converted as returned by iconv to the
2370 // number of (wide) characters converted that we need
2371 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2372
36acb880 2373 if (ms_wcNeedsSwap)
3a0d76bc 2374 {
36acb880 2375 // convert to native endianness
17a1ebd1 2376 for ( unsigned i = 0; i < res; i++ )
467a2982 2377 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2378 }
36acb880 2379 }
8f4b0f43 2380 else // no destination buffer
36acb880 2381 {
8f4b0f43 2382 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2383 wchar_t tbuf[256];
36acb880 2384 res = 0;
ef199164
DS
2385
2386 do
2387 {
e8769ed1 2388 char* bufPtr = (char*)tbuf;
8f4b0f43 2389 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2390
2391 cres = iconv(m2w,
8f4b0f43
VZ
2392 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2393 &bufPtr, &dstLen );
36acb880 2394
8f4b0f43 2395 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2396 }
2397 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2398 }
dccce9ea 2399
8f4b0f43 2400 if (ICONV_FAILED(cres, srcLen))
f1339c56 2401 {
36acb880 2402 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2403 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2404 return wxCONV_FAILED;
36acb880
VZ
2405 }
2406
2407 return res;
2408}
2409
8f4b0f43
VZ
2410size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2411 const wchar_t *src, size_t srcLen) const
36acb880 2412{
b1d547eb
VS
2413#if wxUSE_THREADS
2414 // NB: explained in MB2WC
2415 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2416#endif
3698ae71 2417
8f4b0f43 2418 if ( srcLen == wxNO_LEN )
2588ee86 2419 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2420
2421 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2422 size_t outbuflen = dstLen;
36acb880 2423 size_t res, cres;
3a0d76bc 2424
36acb880 2425 wchar_t *tmpbuf = 0;
3caec1bb 2426
36acb880
VZ
2427 if (ms_wcNeedsSwap)
2428 {
2429 // need to copy to temp buffer to switch endianness
51725fc0 2430 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2431 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2432 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2433 for ( size_t i = 0; i < srcLen; i++ )
2434 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2435
8f4b0f43 2436 src = tmpbuf;
36acb880 2437 }
3a0d76bc 2438
8f4b0f43
VZ
2439 char* inbuf = (char*)src;
2440 if ( dst )
36acb880
VZ
2441 {
2442 // have destination buffer, convert there
8f4b0f43 2443 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2444
8f4b0f43 2445 res = dstLen - outbuflen;
36acb880 2446 }
8f4b0f43 2447 else // no destination buffer
36acb880 2448 {
8f4b0f43 2449 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2450 char tbuf[256];
36acb880 2451 res = 0;
ef199164
DS
2452 do
2453 {
8f4b0f43 2454 dst = tbuf;
51725fc0 2455 outbuflen = WXSIZEOF(tbuf);
36acb880 2456
8f4b0f43 2457 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2458
51725fc0 2459 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2460 }
2461 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2462 }
dccce9ea 2463
36acb880
VZ
2464 if (ms_wcNeedsSwap)
2465 {
2466 free(tmpbuf);
2467 }
dccce9ea 2468
e8769ed1 2469 if (ICONV_FAILED(cres, inbuflen))
36acb880 2470 {
ce6f8d6f 2471 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2472 return wxCONV_FAILED;
36acb880
VZ
2473 }
2474
2475 return res;
2476}
2477
7ef3ab50 2478size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2479{
c1464d9d 2480 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2481 {
2482 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2483
2484#if wxUSE_THREADS
2485 // NB: explained in MB2WC
2486 wxMutexLocker lock(self->m_iconvMutex);
2487#endif
2488
999020e1 2489 const wchar_t *wnul = L"";
c1464d9d 2490 char buf[8]; // should be enough for NUL in any encoding
356410fc 2491 size_t inLen = sizeof(wchar_t),
c1464d9d 2492 outLen = WXSIZEOF(buf);
ef199164
DS
2493 char *inBuff = (char *)wnul;
2494 char *outBuff = buf;
2495 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2496 {
c1464d9d 2497 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2498 }
2499 else // ok
2500 {
ef199164 2501 self->m_minMBCharWidth = outBuff - buf;
356410fc 2502 }
eec47cc6
VZ
2503 }
2504
c1464d9d 2505 return m_minMBCharWidth;
eec47cc6
VZ
2506}
2507
ba98e032
VS
2508#if wxUSE_UNICODE_UTF8
2509bool wxMBConv_iconv::IsUTF8() const
2510{
86501081
VS
2511 return wxStricmp(m_name, "UTF-8") == 0 ||
2512 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2513}
2514#endif
2515
b040e242 2516#endif // HAVE_ICONV
36acb880 2517
e95354ec 2518
36acb880
VZ
2519// ============================================================================
2520// Win32 conversion classes
2521// ============================================================================
1cd52418 2522
e95354ec 2523#ifdef wxHAVE_WIN32_MB2WC
373658eb 2524
8b04d4c4 2525// from utils.cpp
d775fa82 2526#if wxUSE_FONTMAP
86501081 2527extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2528extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2529#endif
373658eb 2530
e95354ec 2531class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2532{
2533public:
bde4baac
VZ
2534 wxMBConv_win32()
2535 {
2536 m_CodePage = CP_ACP;
c1464d9d 2537 m_minMBCharWidth = 0;
bde4baac
VZ
2538 }
2539
d36c9347 2540 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2541 : wxMBConv()
d36c9347
VZ
2542 {
2543 m_CodePage = conv.m_CodePage;
2544 m_minMBCharWidth = conv.m_minMBCharWidth;
2545 }
2546
7608a683 2547#if wxUSE_FONTMAP
86501081 2548 wxMBConv_win32(const char* name)
bde4baac
VZ
2549 {
2550 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2551 m_minMBCharWidth = 0;
bde4baac 2552 }
dccce9ea 2553
e95354ec 2554 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2555 {
2556 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2557 m_minMBCharWidth = 0;
bde4baac 2558 }
eec47cc6 2559#endif // wxUSE_FONTMAP
8b04d4c4 2560
d36c9347 2561 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2562 {
02272c9c
VZ
2563 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2564 // the behaviour is not compatible with the Unix version (using iconv)
2565 // and break the library itself, e.g. wxTextInputStream::NextChar()
2566 // wouldn't work if reading an incomplete MB char didn't result in an
2567 // error
667e5b3e 2568 //
89028980 2569 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2570 // Win XP or newer and it is not supported for UTF-[78] so we always
2571 // use our own conversions in this case. See
89028980
VS
2572 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2573 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2574 if ( m_CodePage == CP_UTF8 )
89028980 2575 {
5487ff0f 2576 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2577 }
830f8f11
VZ
2578
2579 if ( m_CodePage == CP_UTF7 )
2580 {
5487ff0f 2581 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2582 }
2583
2584 int flags = 0;
2585 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2586 IsAtLeastWin2kSP4() )
89028980 2587 {
830f8f11 2588 flags = MB_ERR_INVALID_CHARS;
89028980 2589 }
667e5b3e 2590
2b5f62a0
VZ
2591 const size_t len = ::MultiByteToWideChar
2592 (
2593 m_CodePage, // code page
667e5b3e 2594 flags, // flags: fall on error
2b5f62a0
VZ
2595 psz, // input string
2596 -1, // its length (NUL-terminated)
b4da152e 2597 buf, // output string
2b5f62a0
VZ
2598 buf ? n : 0 // size of output buffer
2599 );
89028980
VS
2600 if ( !len )
2601 {
2602 // function totally failed
467e0479 2603 return wxCONV_FAILED;
89028980
VS
2604 }
2605
2606 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2607 // check if we succeeded, by doing a double trip:
2608 if ( !flags && buf )
2609 {
53c174fc
VZ
2610 const size_t mbLen = strlen(psz);
2611 wxCharBuffer mbBuf(mbLen);
89028980
VS
2612 if ( ::WideCharToMultiByte
2613 (
2614 m_CodePage,
2615 0,
2616 buf,
2617 -1,
2618 mbBuf.data(),
53c174fc 2619 mbLen + 1, // size in bytes, not length
89028980
VS
2620 NULL,
2621 NULL
2622 ) == 0 ||
2623 strcmp(mbBuf, psz) != 0 )
2624 {
2625 // we didn't obtain the same thing we started from, hence
2626 // the conversion was lossy and we consider that it failed
467e0479 2627 return wxCONV_FAILED;
89028980
VS
2628 }
2629 }
2b5f62a0 2630
03a991bc
VZ
2631 // note that it returns count of written chars for buf != NULL and size
2632 // of the needed buffer for buf == NULL so in either case the length of
2633 // the string (which never includes the terminating NUL) is one less
89028980 2634 return len - 1;
f1339c56 2635 }
dccce9ea 2636
d36c9347 2637 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2638 {
13dd924a
VZ
2639 /*
2640 we have a problem here: by default, WideCharToMultiByte() may
2641 replace characters unrepresentable in the target code page with bad
2642 quality approximations such as turning "1/2" symbol (U+00BD) into
2643 "1" for the code pages which don't have it and we, obviously, want
2644 to avoid this at any price
d775fa82 2645
13dd924a
VZ
2646 the trouble is that this function does it _silently_, i.e. it won't
2647 even tell us whether it did or not... Win98/2000 and higher provide
2648 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2649 we have to resort to a round trip, i.e. check that converting back
2650 results in the same string -- this is, of course, expensive but
2651 otherwise we simply can't be sure to not garble the data.
2652 */
2653
2654 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2655 // it doesn't work with CJK encodings (which we test for rather roughly
2656 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2657 // supporting it
907173e5
WS
2658 BOOL usedDef wxDUMMY_INITIALIZE(false);
2659 BOOL *pUsedDef;
13dd924a
VZ
2660 int flags;
2661 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2662 {
2663 // it's our lucky day
2664 flags = WC_NO_BEST_FIT_CHARS;
2665 pUsedDef = &usedDef;
2666 }
2667 else // old system or unsupported encoding
2668 {
2669 flags = 0;
2670 pUsedDef = NULL;
2671 }
2672
2b5f62a0
VZ
2673 const size_t len = ::WideCharToMultiByte
2674 (
2675 m_CodePage, // code page
13dd924a
VZ
2676 flags, // either none or no best fit
2677 pwz, // input string
2b5f62a0
VZ
2678 -1, // it is (wide) NUL-terminated
2679 buf, // output buffer
2680 buf ? n : 0, // and its size
2681 NULL, // default "replacement" char
13dd924a 2682 pUsedDef // [out] was it used?
2b5f62a0
VZ
2683 );
2684
13dd924a
VZ
2685 if ( !len )
2686 {
2687 // function totally failed
467e0479 2688 return wxCONV_FAILED;
13dd924a
VZ
2689 }
2690
765bdb4a
VZ
2691 // we did something, check if we really succeeded
2692 if ( flags )
13dd924a 2693 {
765bdb4a
VZ
2694 // check if the conversion failed, i.e. if any replacements
2695 // were done
2696 if ( usedDef )
2697 return wxCONV_FAILED;
2698 }
2699 else // we must resort to double tripping...
2700 {
2701 // first we need to ensure that we really have the MB data: this is
2702 // not the case if we're called with NULL buffer, in which case we
2703 // need to do the conversion yet again
2704 wxCharBuffer bufDef;
2705 if ( !buf )
13dd924a 2706 {
765bdb4a
VZ
2707 bufDef = wxCharBuffer(len);
2708 buf = bufDef.data();
2709 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2710 buf, len, NULL, NULL) )
467e0479 2711 return wxCONV_FAILED;
13dd924a 2712 }
765bdb4a 2713
564da6ff
VZ
2714 if ( !n )
2715 n = wcslen(pwz);
765bdb4a 2716 wxWCharBuffer wcBuf(n);
564da6ff 2717 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2718 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2719 {
765bdb4a
VZ
2720 // we didn't obtain the same thing we started from, hence
2721 // the conversion was lossy and we consider that it failed
2722 return wxCONV_FAILED;
13dd924a
VZ
2723 }
2724 }
2725
03a991bc 2726 // see the comment above for the reason of "len - 1"
13dd924a 2727 return len - 1;
f1339c56 2728 }
dccce9ea 2729
7ef3ab50
VZ
2730 virtual size_t GetMBNulLen() const
2731 {
2732 if ( m_minMBCharWidth == 0 )
2733 {
2734 int len = ::WideCharToMultiByte
2735 (
2736 m_CodePage, // code page
2737 0, // no flags
2738 L"", // input string
2739 1, // translate just the NUL
2740 NULL, // output buffer
2741 0, // and its size
2742 NULL, // no replacement char
2743 NULL // [out] don't care if it was used
2744 );
2745
2746 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2747 switch ( len )
2748 {
2749 default:
9a83f860 2750 wxLogDebug(wxT("Unexpected NUL length %d"), len);
ef199164
DS
2751 self->m_minMBCharWidth = (size_t)-1;
2752 break;
7ef3ab50
VZ
2753
2754 case 0:
2755 self->m_minMBCharWidth = (size_t)-1;
2756 break;
2757
2758 case 1:
2759 case 2:
2760 case 4:
2761 self->m_minMBCharWidth = len;
2762 break;
2763 }
2764 }
2765
2766 return m_minMBCharWidth;
2767 }
2768
d36c9347
VZ
2769 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2770
13dd924a
VZ
2771 bool IsOk() const { return m_CodePage != -1; }
2772
2773private:
2774 static bool CanUseNoBestFit()
2775 {
2776 static int s_isWin98Or2k = -1;
2777
2778 if ( s_isWin98Or2k == -1 )
2779 {
2780 int verMaj, verMin;
2781 switch ( wxGetOsVersion(&verMaj, &verMin) )
2782 {
406d283a 2783 case wxOS_WINDOWS_9X:
13dd924a
VZ
2784 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2785 break;
2786
406d283a 2787 case wxOS_WINDOWS_NT:
13dd924a
VZ
2788 s_isWin98Or2k = verMaj >= 5;
2789 break;
2790
2791 default:
ef199164 2792 // unknown: be conservative by default
13dd924a 2793 s_isWin98Or2k = 0;
ef199164 2794 break;
13dd924a
VZ
2795 }
2796
9a83f860 2797 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
13dd924a
VZ
2798 }
2799
2800 return s_isWin98Or2k == 1;
2801 }
f1339c56 2802
89028980
VS
2803 static bool IsAtLeastWin2kSP4()
2804 {
8942f83a
WS
2805#ifdef __WXWINCE__
2806 return false;
2807#else
89028980
VS
2808 static int s_isAtLeastWin2kSP4 = -1;
2809
2810 if ( s_isAtLeastWin2kSP4 == -1 )
2811 {
2812 OSVERSIONINFOEX ver;
2813
2814 memset(&ver, 0, sizeof(ver));
2815 ver.dwOSVersionInfoSize = sizeof(ver);
2816 GetVersionEx((OSVERSIONINFO*)&ver);
2817
2818 s_isAtLeastWin2kSP4 =
2819 ((ver.dwMajorVersion > 5) || // Vista+
2820 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2821 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2822 ver.wServicePackMajor >= 4)) // 2000 SP4+
2823 ? 1 : 0;
2824 }
2825
2826 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2827#endif
89028980
VS
2828 }
2829
eec47cc6 2830
c1464d9d 2831 // the code page we're working with
b1d66b54 2832 long m_CodePage;
c1464d9d 2833
7ef3ab50 2834 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2835 // "unknown"
2836 size_t m_minMBCharWidth;
1cd52418 2837};
e95354ec
VZ
2838
2839#endif // wxHAVE_WIN32_MB2WC
2840
f7e98dee 2841
36acb880
VZ
2842// ============================================================================
2843// wxEncodingConverter based conversion classes
2844// ============================================================================
2845
1e6feb95 2846#if wxUSE_FONTMAP
1cd52418 2847
e95354ec 2848class wxMBConv_wxwin : public wxMBConv
1cd52418 2849{
8b04d4c4
VZ
2850private:
2851 void Init()
2852 {
6ac84a78
DE
2853 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2854 // The wxMBConv_cf class does a better job.
2855 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2856 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2857 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2858 }
2859
6001e347 2860public:
f1339c56
RR
2861 // temporarily just use wxEncodingConverter stuff,
2862 // so that it works while a better implementation is built
86501081 2863 wxMBConv_wxwin(const char* name)
f1339c56
RR
2864 {
2865 if (name)
267e11c5 2866 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2867 else
2868 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2869
8b04d4c4
VZ
2870 Init();
2871 }
2872
e95354ec 2873 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2874 {
2875 m_enc = enc;
2876
2877 Init();
f1339c56 2878 }
dccce9ea 2879
bde4baac 2880 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2881 {
2882 size_t inbuf = strlen(psz);
dccce9ea 2883 if (buf)
c643a977 2884 {
ef199164 2885 if (!m2w.Convert(psz, buf))
467e0479 2886 return wxCONV_FAILED;
c643a977 2887 }
f1339c56
RR
2888 return inbuf;
2889 }
dccce9ea 2890
bde4baac 2891 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2892 {
f8d791e0 2893 const size_t inbuf = wxWcslen(psz);
f1339c56 2894 if (buf)
c643a977 2895 {
ef199164 2896 if (!w2m.Convert(psz, buf))
467e0479 2897 return wxCONV_FAILED;
c643a977 2898 }
dccce9ea 2899
f1339c56
RR
2900 return inbuf;
2901 }
dccce9ea 2902
7ef3ab50 2903 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2904 {
2905 switch ( m_enc )
2906 {
2907 case wxFONTENCODING_UTF16BE:
2908 case wxFONTENCODING_UTF16LE:
c1464d9d 2909 return 2;
eec47cc6
VZ
2910
2911 case wxFONTENCODING_UTF32BE:
2912 case wxFONTENCODING_UTF32LE:
c1464d9d 2913 return 4;
eec47cc6
VZ
2914
2915 default:
c1464d9d 2916 return 1;
eec47cc6
VZ
2917 }
2918 }
2919
d36c9347
VZ
2920 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2921
7ef3ab50
VZ
2922 bool IsOk() const { return m_ok; }
2923
2924public:
2925 wxFontEncoding m_enc;
2926 wxEncodingConverter m2w, w2m;
2927
2928private:
cafbf6fb
VZ
2929 // were we initialized successfully?
2930 bool m_ok;
fc7a2a60 2931
c0c133e1 2932 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2933};
6001e347 2934
8f115891 2935// make the constructors available for unit testing
86501081 2936WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2937{
2938 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2939 if ( !result->IsOk() )
2940 {
2941 delete result;
2942 return 0;
2943 }
ef199164 2944
8f115891
MW
2945 return result;
2946}
2947
1e6feb95
VZ
2948#endif // wxUSE_FONTMAP
2949
36acb880
VZ
2950// ============================================================================
2951// wxCSConv implementation
2952// ============================================================================
2953
8b04d4c4 2954void wxCSConv::Init()
6001e347 2955{
e95354ec
VZ
2956 m_name = NULL;
2957 m_convReal = NULL;
6c4d607e
VZ
2958}
2959
2960void wxCSConv::SetEncoding(wxFontEncoding encoding)
2961{
2962 switch ( encoding )
2963 {
2964 case wxFONTENCODING_MAX:
2965 case wxFONTENCODING_SYSTEM:
2966 if ( m_name )
2967 {
2968 // It's ok to not have encoding value if we have a name for it.
2969 m_encoding = wxFONTENCODING_SYSTEM;
2970 }
2971 else // No name neither.
2972 {
2973 // Fall back to the system default encoding in this case (not
2974 // sure how much sense does this make but this is how the old
2975 // code used to behave).
2976#if wxUSE_INTL
2977 m_encoding = wxLocale::GetSystemEncoding();
2978 if ( m_encoding == wxFONTENCODING_SYSTEM )
2979#endif // wxUSE_INTL
2980 m_encoding = wxFONTENCODING_ISO8859_1;
2981 }
2982 break;
2983
2984 case wxFONTENCODING_DEFAULT:
2985 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2986 m_encoding = wxFONTENCODING_ISO8859_1;
2987 break;
2988
2989 default:
2990 // Just use the provided encoding.
2991 m_encoding = encoding;
2992 }
e95354ec
VZ
2993}
2994
86501081 2995wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2996{
2997 Init();
82713003 2998
86501081 2999 if ( !charset.empty() )
e95354ec 3000 {
86501081 3001 SetName(charset.ToAscii());
e95354ec 3002 }
bda3d86a 3003
e4277538 3004#if wxUSE_FONTMAP
6c4d607e 3005 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
e4277538 3006#else
6c4d607e 3007 SetEncoding(wxFONTENCODING_SYSTEM);
e4277538 3008#endif
6c4d607e
VZ
3009
3010 m_convReal = DoCreate();
6001e347
RR
3011}
3012
8b04d4c4
VZ
3013wxCSConv::wxCSConv(wxFontEncoding encoding)
3014{
bda3d86a 3015 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 3016 {
9a83f860 3017 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
3018
3019 encoding = wxFONTENCODING_SYSTEM;
3020 }
3021
8b04d4c4
VZ
3022 Init();
3023
6c4d607e
VZ
3024 SetEncoding(encoding);
3025
3026 m_convReal = DoCreate();
8b04d4c4
VZ
3027}
3028
6001e347
RR
// Destructor: everything owned by this object is released by Clear().
3029wxCSConv::~wxCSConv()
3030{
65e50848
JS
3031 Clear();
3032}
3033
// Copy constructor: takes over the charset name and the encoding of conv
// and creates its own conversion object for them with DoCreate() instead of
// sharing the one used by conv.
54380f29 3034wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 3035 : wxMBConv()
54380f29 3036{
8b04d4c4
VZ
// the members must be reset first, nothing has initialized them yet
3037 Init();
3038
// SetName() only copies the name if it is not NULL
54380f29 3039 SetName(conv.m_name);
6c4d607e
VZ
3040 SetEncoding(conv.m_encoding);
3041
3042 m_convReal = DoCreate();
54380f29
GD
3043}
3044
3045wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3046{
3047 Clear();
8b04d4c4 3048
54380f29 3049 SetName(conv.m_name);
6c4d607e
VZ
3050 SetEncoding(conv.m_encoding);
3051
3052 m_convReal = DoCreate();
8b04d4c4 3053
54380f29
GD
3054 return *this;
3055}
3056
65e50848
JS
3057void wxCSConv::Clear()
3058{
8b04d4c4 3059 free(m_name);
65e50848 3060 m_name = NULL;
6c4d607e
VZ
3061
3062 wxDELETE(m_convReal);
6001e347
RR
3063}
3064
86501081 3065void wxCSConv::SetName(const char *charset)
6001e347 3066{
6c4d607e 3067 if ( charset )
d6f2a891 3068 m_name = wxStrdup(charset);
6001e347
RR
3069}
3070
// Only available in the builds using the font mapper.
8b3eb85d 3071#if wxUSE_FONTMAP
8b3eb85d
VZ
3072
// Hash map type associating a charset name with a wxFontEncoding value.
3073WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 3074 wxEncodingNameCache );
8b3eb85d
VZ
3075
// File-level cache consulted by wxCSConv::DoCreate() before it tries the
// names of an encoding with iconv: a non-empty entry is the name passed to
// wxMBConv_iconv, an empty one makes DoCreate() return NULL.
3076static wxEncodingNameCache gs_nameCache;
3077#endif
3078
e95354ec
VZ
// Create the converter doing the real work for the current charset name
// (m_name) and/or encoding (m_encoding).
//
// Returns NULL for wxFONTENCODING_ISO8859_1, which is handled without any
// converter object, and also when none of the methods tried below yields a
// usable converter. Otherwise returns a new heap-allocated converter; the
// callers in this file store it in m_convReal and Clear() deletes it.
3079wxMBConv *wxCSConv::DoCreate() const
3080{
ce6f8d6f
VZ
3081#if wxUSE_FONTMAP
3082 wxLogTrace(TRACE_STRCONV,
3083 wxT("creating conversion for %s"),
3084 (m_name ? m_name
86501081 3085 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3086#endif // wxUSE_FONTMAP
3087
c547282d
VZ
3088 // check for the special case of ASCII or ISO8859-1 charset: as we have
3089 // special knowledge of it anyhow, we don't need to create a special
3090 // conversion object
6c4d607e 3091 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
f1339c56 3092 {
e95354ec
VZ
3093 // don't convert at all
3094 return NULL;
3095 }
dccce9ea 3096
e95354ec
VZ
3097 // we trust OS to do conversion better than we can so try external
3098 // conversion methods first
3099 //
3100 // the full order is:
3101 // 1. OS conversion (iconv() under Unix or Win32 API)
3102 // 2. hard coded conversions for UTF
3103 // 3. wxEncodingConverter as fall back
3104
3105 // step (1)
3106#ifdef HAVE_ICONV
c547282d 3107#if !wxUSE_FONTMAP
e95354ec 3108 if ( m_name )
c547282d 3109#endif // !wxUSE_FONTMAP
e95354ec 3110 {
3ef10cfc 3111#if wxUSE_FONTMAP
8b3eb85d 3112 wxFontEncoding encoding(m_encoding);
3ef10cfc 3113#endif
8b3eb85d 3114
    // if we have an explicit charset name, give it to iconv as is first
86501081 3115 if ( m_name )
8b3eb85d 3116 {
86501081 3117 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3118 if ( conv->IsOk() )
3119 return conv;
3120
3121 delete conv;
c547282d
VZ
3122
3123#if wxUSE_FONTMAP
    // iconv didn't accept the name itself, map it to an encoding to be
    // able to try the other names of this encoding below
8b3eb85d 3124 encoding =
86501081 3125 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3126#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3127 }
3128#if wxUSE_FONTMAP
3129 {
    // check if we already know which name, if any, works for this encoding
3130 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3131 if ( it != gs_nameCache.end() )
3132 {
    // NOTE(review): a cached failure makes us return NULL immediately
    // while the call which recorded the failure went on to the steps
    // below -- confirm that this difference is intended
3133 if ( it->second.empty() )
3134 return NULL;
c547282d 3135
86501081 3136 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3137 if ( conv->IsOk() )
3138 return conv;
e95354ec 3139
8b3eb85d
VZ
3140 delete conv;
3141 }
3142
a243da29 3143 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3144 // CS : in case this does not return valid names (eg for MacRoman)
3145 // encoding got a 'failure' entry in the cache all the same,
3146 // although it just has to be created using a different method, so
3147 // only store failed iconv creation attempts (or perhaps we
3148 // shouldn't do this at all ?)
3c67ec06 3149 if ( names[0] != NULL )
8b3eb85d 3150 {
3c67ec06 3151 for ( ; *names; ++names )
8b3eb85d 3152 {
86501081
VS
3153 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3154 // will need changes that will obsolete this
3155 wxString name(*names);
3156 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3157 if ( conv->IsOk() )
3158 {
    // remember the name which worked for the next time
3159 gs_nameCache[encoding] = *names;
3160 return conv;
3161 }
3162
3163 delete conv;
8b3eb85d
VZ
3164 }
3165
9a83f860 3166 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3167 }
8b3eb85d
VZ
3168 }
3169#endif // wxUSE_FONTMAP
e95354ec
VZ
3170 }
3171#endif // HAVE_ICONV
3172
3173#ifdef wxHAVE_WIN32_MB2WC
3174 {
7608a683 3175#if wxUSE_FONTMAP
e95354ec
VZ
3176 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3177 : new wxMBConv_win32(m_encoding);
3178 if ( conv->IsOk() )
3179 return conv;
3180
3181 delete conv;
7608a683
WS
3182#else
    // NOTE(review): in this configuration nothing below this line is ever
    // reached, including the built-in UTF conversions of step (2) --
    // confirm that this is intended
3183 return NULL;
3184#endif
e95354ec
VZ
3185 }
3186#endif // wxHAVE_WIN32_MB2WC
ef199164 3187
5c4ed98d 3188#ifdef __DARWIN__
f7e98dee 3189 {
6ff49cbc
DE
3190 // leave UTF16 and UTF32 to the built-ins of wx
3191 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3192 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3193 {
a6900d10 3194#if wxUSE_FONTMAP
5c4ed98d
DE
3195 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3196 : new wxMBConv_cf(m_encoding);
a6900d10 3197#else
5c4ed98d 3198 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3199#endif
ef199164 3200
f7e98dee 3201 if ( conv->IsOk() )
d775fa82
WS
3202 return conv;
3203
3204 delete conv;
3205 }
335d31e0 3206 }
5c4ed98d
DE
3207#endif // __DARWIN__
3208
e95354ec
VZ
3209 // step (2)
3210 wxFontEncoding enc = m_encoding;
3211#if wxUSE_FONTMAP
c547282d
VZ
3212 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3213 {
3214 // use "false" to suppress interactive dialogs -- we can be called from
3215 // anywhere and popping up a dialog from here is the last thing we want to
3216 // do
267e11c5 3217 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3218 }
e95354ec
VZ
3219#endif // wxUSE_FONTMAP
3220
3221 switch ( enc )
3222 {
3223 case wxFONTENCODING_UTF7:
3224 return new wxMBConvUTF7;
3225
3226 case wxFONTENCODING_UTF8:
3227 return new wxMBConvUTF8;
3228
e95354ec
VZ
3229 case wxFONTENCODING_UTF16BE:
3230 return new wxMBConvUTF16BE;
3231
3232 case wxFONTENCODING_UTF16LE:
3233 return new wxMBConvUTF16LE;
3234
e95354ec
VZ
3235 case wxFONTENCODING_UTF32BE:
3236 return new wxMBConvUTF32BE;
3237
3238 case wxFONTENCODING_UTF32LE:
3239 return new wxMBConvUTF32LE;
3240
3241 default:
3242 // nothing to do but put here to suppress gcc warnings
ef199164 3243 break;
e95354ec
VZ
3244 }
3245
3246 // step (3)
3247#if wxUSE_FONTMAP
3248 {
3249 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3250 : new wxMBConv_wxwin(m_encoding);
3251 if ( conv->IsOk() )
3252 return conv;
3253
3254 delete conv;
3255 }
ef199164 3256
3df31b2d
VZ
3257 wxLogTrace(TRACE_STRCONV,
3258 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3259 (m_name ? wxString(m_name)
3df31b2d
VZ
3260 : wxFontMapperBase::GetEncodingName(m_encoding)));
3261#endif // wxUSE_FONTMAP
e95354ec
VZ
3262
    // all the methods failed
3263 return NULL;
3264}
3265
0f0298b1
VZ
3266bool wxCSConv::IsOk() const
3267{
0f0298b1
VZ
3268 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3269 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3270 return true; // always ok as we do it ourselves
3271
3272 // m_convReal->IsOk() is called at its own creation, so we know it must
3273 // be ok if m_convReal is non-NULL
3274 return m_convReal != NULL;
3275}
3276
1c714a5d
VZ
3277size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3278 const char *src, size_t srcLen) const
3279{
2c74c558
VS
3280 if (m_convReal)
3281 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3282
3283 // latin-1 (direct)
05392dc8
VZ
3284 if ( srcLen == wxNO_LEN )
3285 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3286
05392dc8
VZ
3287 if ( dst )
3288 {
3289 if ( dstLen < srcLen )
3290 return wxCONV_FAILED;
1c714a5d 3291
05392dc8
VZ
3292 for ( size_t n = 0; n < srcLen; n++ )
3293 dst[n] = (unsigned char)(src[n]);
3294 }
2c74c558 3295
05392dc8 3296 return srcLen;
1c714a5d
VZ
3297}
3298
05392dc8
VZ
3299size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3300 const wchar_t *src, size_t srcLen) const
6001e347 3301{
e95354ec 3302 if (m_convReal)
05392dc8 3303 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3304
3305 // latin-1 (direct)
05392dc8
VZ
3306 if ( srcLen == wxNO_LEN )
3307 srcLen = wxWcslen(src) + 1;
dccce9ea 3308
05392dc8 3309 if ( dst )
f1339c56 3310 {
05392dc8
VZ
3311 if ( dstLen < srcLen )
3312 return wxCONV_FAILED;
1cd52418 3313
05392dc8 3314 for ( size_t n = 0; n < srcLen; n++ )
24642831 3315 {
05392dc8 3316 if ( src[n] > 0xFF )
467e0479 3317 return wxCONV_FAILED;
ef199164 3318
05392dc8 3319 dst[n] = (char)src[n];
24642831 3320 }
05392dc8 3321
24642831 3322 }
05392dc8 3323 else // still need to check the input validity
24642831 3324 {
05392dc8 3325 for ( size_t n = 0; n < srcLen; n++ )
24642831 3326 {
05392dc8 3327 if ( src[n] > 0xFF )
467e0479 3328 return wxCONV_FAILED;
24642831 3329 }
f1339c56 3330 }
dccce9ea 3331
05392dc8 3332 return srcLen;
6001e347
RR
3333}
3334
7ef3ab50 3335size_t wxCSConv::GetMBNulLen() const
eec47cc6 3336{
eec47cc6 3337 if ( m_convReal )
7ef3ab50 3338 return m_convReal->GetMBNulLen();
eec47cc6 3339
ba98e032 3340 // otherwise, we are ISO-8859-1
c1464d9d 3341 return 1;
eec47cc6
VZ
3342}
3343
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// Tell whether this converter is an UTF-8 one: this is up to the real
// converter if there is one, without it we are ISO-8859-1 and so not UTF-8.
bool wxCSConv::IsUTF8() const
{
    return m_convReal && m_convReal->IsUTF8();
}
#endif
3354
69c928ef
VZ
3355
3356#if wxUSE_UNICODE
3357
3358wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3359{
3360 if ( !s )
3361 return wxWCharBuffer();
3362
3363 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3364 if ( !wbuf )
5487ff0f 3365 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3366 if ( !wbuf )
3367 wbuf = wxConvISO8859_1.cMB2WX(s);
3368
3369 return wbuf;
3370}
3371
3372wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3373{
3374 if ( !ws )
3375 return wxCharBuffer();
3376
3377 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3378 if ( !buf )
3379 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3380
3381 return buf;
3382}
3383
3384#endif // wxUSE_UNICODE
f5a1953b 3385
1e50d914
VS
3386// ----------------------------------------------------------------------------
3387// globals
3388// ----------------------------------------------------------------------------
3389
3390// NB: The reason why we create converter objects in this convoluted way,
3391// using a factory function instead of global variable, is that they
3392// may be used at static initialization time (some of them are used by
3393// wxString ctors and there may be a global wxString object). In other
3394// words, possibly _before_ the converter global object would be
3395// initialized.
3396
3397#undef wxConvLibc
3398#undef wxConvUTF8
3399#undef wxConvUTF7
3400#undef wxConvLocal
3401#undef wxConvISO8859_1
3402
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines:
//  - the exported pointer name##Ptr of type klass*, initially NULL
//  - the exported function wxGet_##name##Ptr() returning the address of its
//    function-local static object of type impl_klass constructed with
//    ctor_args
//  - the file-static pointer gs_##name##instance initialized by calling this
//    function
//
// The macro ends without semicolon, it must be provided when using it.
3403#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3404 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3405 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3406 { \
3407 static impl_klass name##Obj ctor_args; \
3408 return &name##Obj; \
3409 } \
3410 /* this ensures that all global converter objects are created */ \
3411 /* by the time static initialization is done, i.e. before any */ \
3412 /* thread is launched: */ \
3413 static klass* gs_##name##instance = wxGet_##name##Ptr()
3414
// shorter form for the case when the object has the same type as the pointer
3415#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3416 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3417
5c69ef61
VZ
3418#ifdef __INTELC__
3419 // disable warning "variable 'xxx' was declared but never referenced"
3420 #pragma warning(disable: 177)
3421#endif // Intel C++
3422
1e50d914
VS
// wxConvLibc is a wxMBConv_win32 under __WINDOWS__ and a wxMBConvLibc
// elsewhere; the wxMBConv_cf branch is currently disabled by "#elif 0"
3423#ifdef __WINDOWS__
3424 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3425#elif 0 // defined(__WXOSX__)
3426 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3427#else
3428 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3429#endif
3430
e1079eda
VZ
3431// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3432// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3433// provokes an error message about "not enough macro parameters"; and we
3434// can't use "()" here as the name##Obj declaration would be parsed as a
3435// function declaration then, so use a semicolon and live with an extra
3436// empty statement (and hope that no compiler warns about this)
3437WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3438WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3439
// both of these are wxCSConv objects differing only by their encoding
3440WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3441WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3442
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one
3443WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3444WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3445
6ac84a78 3446#ifdef __DARWIN__
8244507f
VZ
3447// It is important to use this conversion object under Darwin as it ensures
3448// that Unicode strings are (re)composed correctly even though xnu kernel uses
3449// decomposed form internally (at least for the file names).
6ac84a78 3450static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3451#endif
6ac84a78 3452
// wxConvFileName points to the wxMBConv_cf UTF-8 object defined above under
// Darwin and to the wxConvLibc object on all the other platforms
1e50d914 3453WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3454#ifdef __DARWIN__
1e50d914 3455 &wxConvMacUTF8DObj;
6ac84a78 3456#else // !__DARWIN__
1e50d914 3457 wxGet_wxConvLibcPtr();
6ac84a78 3458#endif // __DARWIN__/!__DARWIN__