]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Implement undo and redo for the ie and gtk webkit backends. Extend the sample to...
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
1c193821 31#ifndef __WXWINCE__
1cd52418 32#include <errno.h>
1c193821
JS
33#endif
34
6001e347
RR
35#include <ctype.h>
36#include <string.h>
37#include <stdlib.h>
38
e95354ec 39#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
e95354ec 42 #define wxHAVE_WIN32_MB2WC
ef199164 43#endif
e95354ec 44
b040e242 45#ifdef HAVE_ICONV
373658eb 46 #include <iconv.h>
b1d547eb 47 #include "wx/thread.h"
1cd52418 48#endif
1cd52418 49
373658eb
VZ
50#include "wx/encconv.h"
51#include "wx/fontmap.h"
52
5c4ed98d 53#ifdef __DARWIN__
c933e267 54#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
55#endif //def __DARWIN__
56
ef199164 57
9a83f860 58#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 59
467e0479
VZ
60// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61// be 4 bytes
4948c2b6 62#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
63 #define WC_UTF16
64#endif
65
ef199164 66
373658eb
VZ
67// ============================================================================
68// implementation
69// ============================================================================
70
69373110
VZ
// Helper for the NUL-termination checks below: returns true if at least one
// of the n bytes starting at p is non-zero and false if all of them are NUL
// (which includes the degenerate n == 0 case).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
79
373658eb 80// ----------------------------------------------------------------------------
467e0479 81// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 82// ----------------------------------------------------------------------------
6001e347 83
c91830cb 84static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 85{
ef199164 86 if (input <= 0xffff)
4def3b35 87 {
999836aa
VZ
88 if (output)
89 *output = (wxUint16) input;
ef199164 90
4def3b35 91 return 1;
dccce9ea 92 }
ef199164 93 else if (input >= 0x110000)
4def3b35 94 {
467e0479 95 return wxCONV_FAILED;
dccce9ea
VZ
96 }
97 else
4def3b35 98 {
dccce9ea 99 if (output)
4def3b35 100 {
ef199164
DS
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 103 }
ef199164 104
4def3b35 105 return 2;
1cd52418 106 }
1cd52418
OK
107}
108
c91830cb 109static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 110{
ef199164 111 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
112 {
113 output = *input;
114 return 1;
dccce9ea 115 }
ef199164 116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
117 {
118 output = *input;
467e0479 119 return wxCONV_FAILED;
dccce9ea
VZ
120 }
121 else
4def3b35
VS
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
1cd52418
OK
126}
127
467e0479 128#ifdef WC_UTF16
35d11700
VZ
129 typedef wchar_t wxDecodeSurrogate_t;
130#else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
133
134// returns the next UTF-32 character from the wchar_t buffer and advances the
135// pointer to the character after this one
136//
137// if an invalid character is found, *pSrc is set to NULL, the caller must
138// check for this
35d11700 139static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
140{
141 wxUint32 out;
8d3dd069 142 const size_t
5c33522f 143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150}
151
f6bcfd97 152// ----------------------------------------------------------------------------
6001e347 153// wxMBConv
f6bcfd97 154// ----------------------------------------------------------------------------
2c53a80a 155
483b0434
VZ
156size_t
157wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
158 const char *src, size_t srcLen) const
6001e347 159{
483b0434 160 // although new conversion classes are supposed to implement this function
36f93678 161 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
162 // avoid to have to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
36f93678
VZ
166 //
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
6001e347 170
483b0434
VZ
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten = 0;
eec47cc6 173
c1464d9d 174 // the number of NULs terminating this string
a78c43f1 175 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 176
c1464d9d
VZ
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
180 // NULs at the end
483b0434
VZ
181 wxCharBuffer bufTmp;
182 const char *srcEnd;
467e0479 183 if ( srcLen != wxNO_LEN )
eec47cc6 184 {
c1464d9d 185 // we need to know how to find the end of this string
7ef3ab50 186 nulLen = GetMBNulLen();
483b0434
VZ
187 if ( nulLen == wxCONV_FAILED )
188 return wxCONV_FAILED;
e4e3bbb4 189
c1464d9d 190 // if there are enough NULs we can avoid the copy
483b0434 191 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
192 {
193 // make a copy in order to properly NUL-terminate the string
483b0434 194 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 195 char * const p = bufTmp.data();
483b0434
VZ
196 memcpy(p, src, srcLen);
197 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 198 *s = '\0';
483b0434
VZ
199
200 src = bufTmp;
eec47cc6 201 }
e4e3bbb4 202
483b0434
VZ
203 srcEnd = src + srcLen;
204 }
205 else // quit after the first loop iteration
206 {
207 srcEnd = NULL;
208 }
e4e3bbb4 209
36f93678
VZ
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
212 // with the result
213 //
214 // all the complication come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
bbb0ff36 217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
36f93678
VZ
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
220 // string
221 //
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
483b0434 225 for ( ;; )
eec47cc6 226 {
c1464d9d 227 // try to convert the current chunk
483b0434 228 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
229 if ( lenChunk == wxCONV_FAILED )
230 return wxCONV_FAILED;
e4e3bbb4 231
483b0434 232 dstWritten += lenChunk;
f6a02087
VZ
233 if ( !srcEnd )
234 dstWritten++;
f5fb6871 235
f6a02087 236 if ( !lenChunk )
467e0479
VZ
237 {
238 // nothing left in the input string, conversion succeeded
239 break;
240 }
241
483b0434
VZ
242 if ( dst )
243 {
244 if ( dstWritten > dstLen )
245 return wxCONV_FAILED;
246
f6a02087
VZ
247 // +1 is for trailing NUL
248 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
f6a02087
VZ
252 if ( !srcEnd )
253 dst++;
483b0434 254 }
c1464d9d 255
483b0434 256 if ( !srcEnd )
c1464d9d 257 {
467e0479 258 // we convert just one chunk in this case as this is the entire
bbb0ff36 259 // string anyhow (and we don't count the trailing NUL in this case)
c1464d9d
VZ
260 break;
261 }
eec47cc6 262
bbb0ff36
VZ
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
483b0434 266 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
483b0434 272 src += nulLen;
c1464d9d 273 }
e4e3bbb4 274
bbb0ff36
VZ
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
277 if ( src == srcEnd )
278 break;
279
280 // do count this terminator as it's inside the buffer we convert
281 dstWritten++;
282 if ( dst )
283 dst++;
284
285 src += nulLen; // skip the terminator itself
c1464d9d 286
483b0434 287 if ( src >= srcEnd )
c1464d9d
VZ
288 break;
289 }
290
483b0434 291 return dstWritten;
e4e3bbb4
RN
292}
293
483b0434
VZ
294size_t
295wxMBConv::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
e4e3bbb4 297{
483b0434
VZ
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten = 0;
e4e3bbb4 300
f6a02087
VZ
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated = srcLen == wxNO_LEN;
305
eec47cc6
VZ
306 // make a copy of the input string unless it is already properly
307 // NUL-terminated
eec47cc6 308 wxWCharBuffer bufTmp;
f6a02087 309 if ( isNulTerminated )
e4e3bbb4 310 {
483b0434 311 srcLen = wxWcslen(src) + 1;
eec47cc6 312 }
483b0434 313 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
314 {
315 // make a copy in order to properly NUL-terminate the string
483b0434 316 bufTmp = wxWCharBuffer(srcLen);
ef199164 317 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
318 src = bufTmp;
319 }
320
321 const size_t lenNul = GetMBNulLen();
322 for ( const wchar_t * const srcEnd = src + srcLen;
323 src < srcEnd;
27307233 324 src++ /* skip L'\0' too */ )
483b0434
VZ
325 {
326 // try to convert the current chunk
327 size_t lenChunk = WC2MB(NULL, src, 0);
483b0434
VZ
328 if ( lenChunk == wxCONV_FAILED )
329 return wxCONV_FAILED;
330
483b0434 331 dstWritten += lenChunk;
27307233
VZ
332
333 const wchar_t * const
334 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
335
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
338 // ourselves above
339 if ( chunkEnd < srcEnd )
f6a02087 340 dstWritten += lenNul;
483b0434
VZ
341
342 if ( dst )
343 {
344 if ( dstWritten > dstLen )
345 return wxCONV_FAILED;
346
27307233
VZ
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
351 wxCharBuffer dstBuf;
352 char *dstTmp;
353 if ( chunkEnd == srcEnd )
354 {
355 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
356 dstTmp = dstBuf.data();
357 }
358 else
359 {
360 dstTmp = dst;
361 }
362
363 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
364 return wxCONV_FAILED;
365
27307233
VZ
366 if ( dstTmp != dst )
367 {
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst, dstTmp, lenChunk);
371
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
374 break;
375 }
376
483b0434 377 dst += lenChunk;
27307233 378 if ( chunkEnd < srcEnd )
f6a02087 379 dst += lenNul;
483b0434 380 }
27307233
VZ
381
382 src = chunkEnd;
eec47cc6 383 }
e4e3bbb4 384
483b0434
VZ
385 return dstWritten;
386}
387
ef199164 388size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 389{
51725fc0 390 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 391 if ( rc != wxCONV_FAILED )
509da451
VZ
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399}
400
ef199164 401size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 402{
51725fc0 403 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 404 if ( rc != wxCONV_FAILED )
509da451 405 {
51725fc0 406 rc -= GetMBNulLen();
509da451
VZ
407 }
408
409 return rc;
410}
411
483b0434
VZ
412wxMBConv::~wxMBConv()
413{
414 // nothing to do here (necessary for Darwin linking probably)
415}
e4e3bbb4 416
483b0434
VZ
417const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418{
419 if ( psz )
eec47cc6 420 {
483b0434 421 // calculate the length of the buffer needed first
a2db25a1 422 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 423 if ( nLen != wxCONV_FAILED )
f5fb6871 424 {
483b0434 425 // now do the actual conversion
a2db25a1 426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 427
483b0434 428 // +1 for the trailing NULL
a2db25a1 429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 430 return buf;
f5fb6871 431 }
483b0434 432 }
e4e3bbb4 433
483b0434
VZ
434 return wxWCharBuffer();
435}
3698ae71 436
483b0434
VZ
437const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438{
439 if ( pwz )
440 {
a2db25a1 441 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 442 if ( nLen != wxCONV_FAILED )
483b0434 443 {
a2db25a1
VZ
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451}
e4e3bbb4 452
483b0434 453const wxWCharBuffer
ef199164 454wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 455{
ef199164 456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 457 if ( dstLen != wxCONV_FAILED )
483b0434 458 {
0dd13d21
VZ
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
f6a02087 463 wbuf.data()[dstLen] = L'\0';
ef199164 464 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
465 {
466 if ( outLen )
467e0479
VZ
467 {
468 *outLen = dstLen;
f6a02087
VZ
469
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen == wxNO_LEN )
467e0479
VZ
476 (*outLen)--;
477 }
478
483b0434
VZ
479 return wbuf;
480 }
481 }
482
483 if ( outLen )
484 *outLen = 0;
485
486 return wxWCharBuffer();
487}
488
489const wxCharBuffer
ef199164 490wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 491{
13d92ad6 492 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 493 if ( dstLen != wxCONV_FAILED )
483b0434 494 {
0dd13d21
VZ
495 const size_t nulLen = GetMBNulLen();
496
497 // as above, ensure that the buffer is always NUL-terminated, even if
498 // the input is not
499 wxCharBuffer buf(dstLen + nulLen - 1);
500 memset(buf.data() + dstLen, 0, nulLen);
ef199164 501 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
502 {
503 if ( outLen )
467e0479
VZ
504 {
505 *outLen = dstLen;
506
f6a02087 507 if ( inLen == wxNO_LEN )
467e0479 508 {
f6a02087
VZ
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
13d92ad6 511 *outLen -= nulLen;
467e0479
VZ
512 }
513 }
d32a507d 514
483b0434
VZ
515 return buf;
516 }
e4e3bbb4
RN
517 }
518
eec47cc6
VZ
519 if ( outLen )
520 *outLen = 0;
521
522 return wxCharBuffer();
e4e3bbb4
RN
523}
524
40ac5040
VZ
525const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
526{
527 const size_t srcLen = buf.length();
528 if ( srcLen )
529 {
530 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
531 if ( dstLen != wxCONV_FAILED )
532 {
533 wxWCharBuffer wbuf(dstLen);
534 wbuf.data()[dstLen] = L'\0';
535 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
536 return wbuf;
537 }
538 }
539
cfcfada9 540 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
40ac5040
VZ
541}
542
543const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
544{
545 const size_t srcLen = wbuf.length();
546 if ( srcLen )
547 {
548 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
549 if ( dstLen != wxCONV_FAILED )
550 {
551 wxCharBuffer buf(dstLen);
552 buf.data()[dstLen] = '\0';
553 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
554 return buf;
555 }
556 }
557
cfcfada9 558 return wxScopedCharBuffer::CreateNonOwned("", 0);
40ac5040
VZ
559}
560
6001e347 561// ----------------------------------------------------------------------------
bde4baac 562// wxMBConvLibc
6001e347
RR
563// ----------------------------------------------------------------------------
564
bde4baac
VZ
565size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
566{
567 return wxMB2WC(buf, psz, n);
568}
569
570size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
571{
572 return wxWC2MB(buf, psz, n);
573}
e1bfe89e
RR
574
575// ----------------------------------------------------------------------------
532d575b 576// wxConvBrokenFileNames
e1bfe89e
RR
577// ----------------------------------------------------------------------------
578
eec47cc6
VZ
579#ifdef __UNIX__
580
86501081 581wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 582{
9a83f860
VZ
583 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
584 wxStricmp(charset, wxT("UTF8")) == 0 )
5deedd6e 585 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
586 else
587 m_conv = new wxCSConv(charset);
ea8ce907
RR
588}
589
eec47cc6 590#endif // __UNIX__
c12b7f79 591
bde4baac 592// ----------------------------------------------------------------------------
3698ae71 593// UTF-7
bde4baac 594// ----------------------------------------------------------------------------
6001e347 595
15f2ee32 596// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
597//
598// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 599
15f2ee32
RN
//
// BASE64 decoding table: maps each byte value to the 6 bit value of the
// corresponding BASE64 digit or to 0xff if the byte is not a BASE64 digit
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f: not BASE64 digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x20..0x2f: only '+' (0x2b) and '/' (0x2f) are digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: 'A'..'Z'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: 'a'..'z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: not BASE64 digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
638
9d653e81
VZ
639size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
640 const char *src, size_t srcLen) const
15f2ee32 641{
9d653e81 642 DecoderState stateOrig,
852dcba5 643 *statePtr;
9d653e81
VZ
644 if ( srcLen == wxNO_LEN )
645 {
646 // convert the entire string, up to and including the trailing NUL
647 srcLen = strlen(src) + 1;
648
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr = &stateOrig;
652 }
653 else // when working with partial strings we do use the shift state
654 {
5c33522f 655 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
656
657 // also save the old state to be able to rollback to it on error
658 stateOrig = m_stateDecoder;
659 }
660
661 // but to simplify the code below we use this variable in both cases
662 DecoderState& state = *statePtr;
663
664
665 // number of characters [which would have been] written to dst [if it were
666 // not NULL]
15f2ee32
RN
667 size_t len = 0;
668
9d653e81
VZ
669 const char * const srcEnd = src + srcLen;
670
671 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 672 {
9d653e81
VZ
673 const unsigned char cc = *src++;
674
675 if ( state.IsShifted() )
15f2ee32 676 {
9d653e81
VZ
677 const unsigned char dc = utf7unb64[cc];
678 if ( dc == 0xff )
15f2ee32 679 {
ccaa848d
VZ
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
684 // characters)
685 if ( state.isLSB || state.bit > 4 ||
686 (state.accum & ((1 << state.bit) - 1)) )
687 {
688 if ( !len )
689 state = stateOrig;
690
852dcba5 691 return wxCONV_FAILED;
ccaa848d 692 }
852dcba5 693
9d653e81
VZ
694 state.ToDirect();
695
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
698 if ( cc == '-' )
699 continue;
700 }
701 else // valid encoded character
702 {
703 // mini base64 decoder: each character is 6 bits
704 state.bit += 6;
705 state.accum <<= 6;
706 state.accum += dc;
707
708 if ( state.bit >= 8 )
15f2ee32 709 {
9d653e81
VZ
710 // got the full byte, consume it
711 state.bit -= 8;
712 unsigned char b = (state.accum >> state.bit) & 0x00ff;
713
714 if ( state.isLSB )
15f2ee32 715 {
9d653e81
VZ
716 // we've got the full word, output it
717 if ( dst )
718 *dst++ = (state.msb << 8) | b;
719 len++;
720 state.isLSB = false;
15f2ee32 721 }
9d653e81 722 else // MSB
04a37834 723 {
9d653e81
VZ
724 // just store it while we wait for LSB
725 state.msb = b;
726 state.isLSB = true;
04a37834 727 }
15f2ee32
RN
728 }
729 }
9d653e81 730 }
04a37834 731
9d653e81
VZ
732 if ( state.IsDirect() )
733 {
734 // start of an encoded segment?
735 if ( cc == '+' )
04a37834 736 {
9d653e81
VZ
737 if ( *src == '-' )
738 {
739 // just the encoded plus sign, don't switch to shifted mode
740 if ( dst )
741 *dst++ = '+';
742 len++;
743 src++;
744 }
ccaa848d
VZ
745 else if ( utf7unb64[(unsigned)*src] == 0xff )
746 {
747 // empty encoded chunks are not allowed
748 if ( !len )
749 state = stateOrig;
750
751 return wxCONV_FAILED;
752 }
753 else // base-64 encoded chunk follows
9d653e81
VZ
754 {
755 state.ToShifted();
756 }
757 }
758 else // not '+'
759 {
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc >= 0x7f || (cc < ' ' &&
763 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
764 return wxCONV_FAILED;
765
766 if ( dst )
767 *dst++ = cc;
768 len++;
769 }
15f2ee32
RN
770 }
771 }
04a37834 772
9d653e81
VZ
773 if ( !len )
774 {
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
777 // state
778 state = stateOrig;
779
780 return wxCONV_FAILED;
781 }
04a37834 782
15f2ee32 783 return len;
6001e347
RR
784}
785
15f2ee32
RN
//
// BASE64 encoding table: maps a 6 bit value to the BASE64 digit encoding it
//
static const unsigned char utf7enb64[] =
{
    // 0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
800
//
// UTF-7 encoding table, indexed by the 7 bit character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 70..7F
};

// Returns true if the character is in the class 0 of the table above, i.e.
// can be represented by itself in UTF-7 output, and false for all the other
// characters, including all those outside of the 7 bit range.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms and a negative value would
    // pass the range check below and be used as a negative table index if it
    // were not converted to an unsigned type first
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
825
826size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
827 const wchar_t *src, size_t srcLen) const
15f2ee32 828{
9d653e81
VZ
829 EncoderState stateOrig,
830 *statePtr;
831 if ( srcLen == wxNO_LEN )
832 {
833 // we don't apply the stored state when operating on entire strings at
834 // once
835 statePtr = &stateOrig;
836
837 srcLen = wxWcslen(src) + 1;
838 }
839 else // do use the mode we left the output in previously
840 {
841 stateOrig = m_stateEncoder;
5c33522f 842 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
843 }
844
845 EncoderState& state = *statePtr;
846
847
15f2ee32
RN
848 size_t len = 0;
849
9d653e81
VZ
850 const wchar_t * const srcEnd = src + srcLen;
851 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 852 {
9d653e81
VZ
853 wchar_t cc = *src++;
854 if ( wxIsUTF7Direct(cc) )
15f2ee32 855 {
9d653e81
VZ
856 if ( state.IsShifted() )
857 {
858 // pad with zeros the last encoded block if necessary
859 if ( state.bit )
860 {
861 if ( dst )
862 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
863 len++;
864 }
ef199164 865
9d653e81
VZ
866 state.ToDirect();
867
868 if ( dst )
869 *dst++ = '-';
870 len++;
871 }
872
873 if ( dst )
874 *dst++ = (char)cc;
15f2ee32
RN
875 len++;
876 }
9d653e81
VZ
877 else if ( cc == '+' && state.IsDirect() )
878 {
879 if ( dst )
880 {
881 *dst++ = '+';
882 *dst++ = '-';
883 }
884
885 len += 2;
886 }
15f2ee32 887#ifndef WC_UTF16
79c78d42 888 else if (((wxUint32)cc) > 0xffff)
b2c13097 889 {
15f2ee32 890 // no surrogate pair generation (yet?)
467e0479 891 return wxCONV_FAILED;
15f2ee32
RN
892 }
893#endif
894 else
895 {
9d653e81
VZ
896 if ( state.IsDirect() )
897 {
898 state.ToShifted();
ef199164 899
9d653e81
VZ
900 if ( dst )
901 *dst++ = '+';
902 len++;
903 }
904
905 // BASE64 encode string
906 for ( ;; )
15f2ee32 907 {
9d653e81 908 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 909 {
9d653e81
VZ
910 state.accum <<= 8;
911 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
912
913 for (state.bit += 8; state.bit >= 6; )
15f2ee32 914 {
9d653e81
VZ
915 state.bit -= 6;
916 if ( dst )
917 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
918 len++;
15f2ee32 919 }
15f2ee32 920 }
ef199164 921
9d653e81
VZ
922 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
923 break;
ef199164 924
9d653e81 925 src++;
15f2ee32 926 }
15f2ee32
RN
927 }
928 }
ef199164 929
9d653e81
VZ
930 // we need to restore the original encoder state if we were called just to
931 // calculate the amount of space needed as we will presumably be called
932 // again to really convert the data now
933 if ( !dst )
934 state = stateOrig;
ef199164 935
15f2ee32 936 return len;
6001e347
RR
937}
938
f6bcfd97 939// ----------------------------------------------------------------------------
6001e347 940// UTF-8
f6bcfd97 941// ----------------------------------------------------------------------------
6001e347 942
1774c3c5 943static const wxUint32 utf8_max[]=
4def3b35 944 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 945
3698ae71
VZ
946// boundaries of the private use area we use to (temporarily) remap invalid
947// characters invalid in a UTF-8 encoded string
ea8ce907
RR
948const wxUint32 wxUnicodePUA = 0x100000;
949const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
950
// length of the UTF-8 sequence starting with the given byte or 0 if this byte
// can't start a sequence
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded by themselves
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0 and C1 are invalid too, C2..DF start the two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4 start the four-byte sequences, F5..FF are invalid again (5- or
    // 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
985
986size_t
987wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989{
990 wchar_t *out = dstLen ? dst : NULL;
991 size_t written = 0;
992
993 if ( srcLen == wxNO_LEN )
994 srcLen = strlen(src) + 1;
995
996 for ( const char *p = src; ; p++ )
997 {
0dcbb107 998 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
0286d08d
VZ
999 {
1000 // all done successfully, just add the trailing NULL if we are not
1001 // using explicit length
1002 if ( srcLen == wxNO_LEN )
1003 {
1004 if ( out )
1005 {
1006 if ( !dstLen )
1007 break;
1008
1009 *out = L'\0';
1010 }
1011
1012 written++;
1013 }
1014
1015 return written;
1016 }
1017
0286d08d
VZ
1018 if ( out && !dstLen-- )
1019 break;
1020
5367a38a
VS
1021 wxUint32 code;
1022 unsigned char c = *p;
0286d08d 1023
5367a38a
VS
1024 if ( c < 0x80 )
1025 {
1026 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1027 break;
0286d08d 1028
5367a38a
VS
1029 if ( srcLen != wxNO_LEN )
1030 srcLen--;
0286d08d 1031
5367a38a
VS
1032 code = c;
1033 }
1034 else
0286d08d 1035 {
5367a38a
VS
1036 unsigned len = tableUtf8Lengths[c];
1037 if ( !len )
1038 break;
1039
1040 if ( srcLen < len ) // the test works for wxNO_LEN too
1041 break;
1042
1043 if ( srcLen != wxNO_LEN )
1044 srcLen -= len;
1045
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1053 //
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1057
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1059 // length:
1060 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1061
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1065
1066 len--; // it's more convenient to work with 0-based length here
1067
1068 // extract the lead byte's value bits:
1069 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1070 break;
1071
1072 code = c & leadValueMask[len];
1073
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len; --len )
1077 {
1078 c = *++p;
1079 if ( (c & 0xC0) != 0x80 )
1080 return wxCONV_FAILED;
0286d08d 1081
5367a38a
VS
1082 code <<= 6;
1083 code |= c & 0x3F;
1084 }
0286d08d
VZ
1085 }
1086
1087#ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1090 {
1091 if ( out )
1092 out++;
1093 written++;
1094 }
1095#else // !WC_UTF16
1096 if ( out )
1097 *out = code;
1098#endif // WC_UTF16/!WC_UTF16
1099
1100 if ( out )
1101 out++;
1102
1103 written++;
1104 }
1105
1106 return wxCONV_FAILED;
1107}
1108
1109size_t
1110wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111 const wchar_t *src, size_t srcLen) const
1112{
1113 char *out = dstLen ? dst : NULL;
1114 size_t written = 0;
1115
1116 for ( const wchar_t *wp = src; ; wp++ )
1117 {
0dcbb107 1118 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
0286d08d
VZ
1119 {
1120 // all done successfully, just add the trailing NULL if we are not
1121 // using explicit length
1122 if ( srcLen == wxNO_LEN )
1123 {
1124 if ( out )
1125 {
1126 if ( !dstLen )
1127 break;
1128
1129 *out = '\0';
1130 }
1131
1132 written++;
1133 }
1134
1135 return written;
1136 }
1137
a964d3ed
VZ
1138 if ( srcLen != wxNO_LEN )
1139 srcLen--;
0286d08d
VZ
1140
1141 wxUint32 code;
1142#ifdef WC_UTF16
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145 {
1146 // skip the next char too as we decoded a surrogate
1147 wp++;
1148 }
1149#else // wchar_t is UTF-32
1150 code = *wp & 0x7fffffff;
1151#endif
1152
1153 unsigned len;
1154 if ( code <= 0x7F )
1155 {
1156 len = 1;
1157 if ( out )
1158 {
1159 if ( dstLen < len )
1160 break;
1161
1162 out[0] = (char)code;
1163 }
1164 }
1165 else if ( code <= 0x07FF )
1166 {
1167 len = 2;
1168 if ( out )
1169 {
1170 if ( dstLen < len )
1171 break;
1172
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1176 out[0] = 0xC0 | code;
1177 }
1178 }
1179 else if ( code < 0xFFFF )
1180 {
1181 len = 3;
1182 if ( out )
1183 {
1184 if ( dstLen < len )
1185 break;
1186
1187 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1188 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[0] = 0xE0 | code;
1190 }
1191 }
1192 else if ( code <= 0x10FFFF )
1193 {
1194 len = 4;
1195 if ( out )
1196 {
1197 if ( dstLen < len )
1198 break;
1199
1200 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1201 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[0] = 0xF0 | code;
1204 }
1205 }
1206 else
1207 {
9a83f860 1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1209 break;
1210 }
1211
1212 if ( out )
1213 {
1214 out += len;
1215 dstLen -= len;
1216 }
1217
1218 written += len;
1219 }
1220
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED;
1223}
1224
d16d0917
VZ
1225size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1226 const char *psz, size_t srcLen) const
6001e347 1227{
0286d08d 1228 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1229 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1230
4def3b35
VS
1231 size_t len = 0;
1232
d16d0917 1233 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1234 {
ea8ce907
RR
1235 const char *opsz = psz;
1236 bool invalid = false;
4def3b35
VS
1237 unsigned char cc = *psz++, fc = cc;
1238 unsigned cnt;
dccce9ea 1239 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1240 fc <<= 1;
ef199164 1241
dccce9ea 1242 if (!cnt)
4def3b35
VS
1243 {
1244 // plain ASCII char
dccce9ea 1245 if (buf)
4def3b35
VS
1246 *buf++ = cc;
1247 len++;
561488ef
MW
1248
1249 // escape the escape character for octal escapes
1250 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1251 && cc == '\\' && (!buf || len < n))
1252 {
1253 if (buf)
1254 *buf++ = cc;
1255 len++;
1256 }
dccce9ea
VZ
1257 }
1258 else
4def3b35
VS
1259 {
1260 cnt--;
dccce9ea 1261 if (!cnt)
4def3b35
VS
1262 {
1263 // invalid UTF-8 sequence
ea8ce907 1264 invalid = true;
dccce9ea
VZ
1265 }
1266 else
4def3b35
VS
1267 {
1268 unsigned ocnt = cnt - 1;
1269 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1270 while (cnt--)
4def3b35 1271 {
ea8ce907 1272 cc = *psz;
dccce9ea 1273 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1274 {
1275 // invalid UTF-8 sequence
ea8ce907
RR
1276 invalid = true;
1277 break;
4def3b35 1278 }
ef199164 1279
ea8ce907 1280 psz++;
4def3b35
VS
1281 res = (res << 6) | (cc & 0x3f);
1282 }
ef199164 1283
ea8ce907 1284 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1285 {
1286 // illegal UTF-8 encoding
ea8ce907 1287 invalid = true;
4def3b35 1288 }
ea8ce907
RR
1289 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1290 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1291 {
1292 // if one of our PUA characters turns up externally
1293 // it must also be treated as an illegal sequence
1294 // (a bit like you have to escape an escape character)
1295 invalid = true;
1296 }
1297 else
1298 {
1cd52418 1299#ifdef WC_UTF16
0286d08d 1300 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1301 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1302 if (pa == wxCONV_FAILED)
ea8ce907
RR
1303 {
1304 invalid = true;
1305 }
1306 else
1307 {
1308 if (buf)
1309 buf += pa;
1310 len += pa;
1311 }
373658eb 1312#else // !WC_UTF16
ea8ce907 1313 if (buf)
38d4b1e4 1314 *buf++ = (wchar_t)res;
ea8ce907 1315 len++;
373658eb 1316#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1317 }
1318 }
ef199164 1319
ea8ce907
RR
1320 if (invalid)
1321 {
1322 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1323 {
1324 while (opsz < psz && (!buf || len < n))
1325 {
1326#ifdef WC_UTF16
1327 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1328 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1329 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1330 if (buf)
1331 buf += pa;
1332 opsz++;
1333 len += pa;
1334#else
1335 if (buf)
38d4b1e4 1336 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1337 opsz++;
1338 len++;
1339#endif
1340 }
1341 }
3698ae71 1342 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1343 {
1344 while (opsz < psz && (!buf || len < n))
1345 {
3698ae71
VZ
1346 if ( buf && len + 3 < n )
1347 {
17a1ebd1 1348 unsigned char on = *opsz;
3698ae71 1349 *buf++ = L'\\';
17a1ebd1
VZ
1350 *buf++ = (wchar_t)( L'0' + on / 0100 );
1351 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1352 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1353 }
ef199164 1354
ea8ce907
RR
1355 opsz++;
1356 len += 4;
1357 }
1358 }
3698ae71 1359 else // MAP_INVALID_UTF8_NOT
ea8ce907 1360 {
467e0479 1361 return wxCONV_FAILED;
ea8ce907 1362 }
4def3b35
VS
1363 }
1364 }
6001e347 1365 }
ef199164 1366
d16d0917 1367 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1368 *buf = 0;
ef199164 1369
d16d0917 1370 return len + 1;
6001e347
RR
1371}
1372
3698ae71
VZ
// Return true if the given wide character is an octal digit, i.e. is in the
// '0'..'7' range.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1377
d16d0917
VZ
1378size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1379 const wchar_t *psz, size_t srcLen) const
6001e347 1380{
0286d08d 1381 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1382 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1383
4def3b35 1384 size_t len = 0;
6001e347 1385
d16d0917 1386 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1387 {
1388 wxUint32 cc;
ef199164 1389
1cd52418 1390#ifdef WC_UTF16
b5153fd8
VZ
1391 // cast is ok for WC_UTF16
1392 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1393 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1394#else
ef199164 1395 cc = (*psz++) & 0x7fffffff;
4def3b35 1396#endif
3698ae71
VZ
1397
1398 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1399 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1400 {
dccce9ea 1401 if (buf)
ea8ce907 1402 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1403 len++;
3698ae71 1404 }
561488ef
MW
1405 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1406 && cc == L'\\' && psz[0] == L'\\' )
1407 {
1408 if (buf)
1409 *buf++ = (char)cc;
1410 psz++;
1411 len++;
1412 }
3698ae71
VZ
1413 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1414 cc == L'\\' &&
1415 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1416 {
dccce9ea 1417 if (buf)
3698ae71 1418 {
ef199164
DS
1419 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1420 (psz[1] - L'0') * 010 +
b2c13097 1421 (psz[2] - L'0'));
3698ae71
VZ
1422 }
1423
1424 psz += 3;
ea8ce907
RR
1425 len++;
1426 }
1427 else
1428 {
1429 unsigned cnt;
ef199164
DS
1430 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1431 {
1432 }
1433
ea8ce907 1434 if (!cnt)
4def3b35 1435 {
ea8ce907
RR
1436 // plain ASCII char
1437 if (buf)
1438 *buf++ = (char) cc;
1439 len++;
1440 }
ea8ce907
RR
1441 else
1442 {
1443 len += cnt + 1;
1444 if (buf)
1445 {
1446 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1447 while (cnt--)
1448 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1449 }
4def3b35
VS
1450 }
1451 }
6001e347 1452 }
4def3b35 1453
d16d0917 1454 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1455 *buf = 0;
adb45366 1456
d16d0917 1457 return len + 1;
6001e347
RR
1458}
1459
467e0479 1460// ============================================================================
c91830cb 1461// UTF-16
467e0479 1462// ============================================================================
c91830cb
VZ
1463
// The conversions below are implemented for the "straight" and "swap"
// pseudo-classes which are mapped to the LE and BE converters here: if
// WORDS_BIGENDIAN is defined the BE class is the straight one and the LE
// class is the swapping one, otherwise it is the other way round.
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1471
467e0479
VZ
1472/* static */
1473size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1474{
1475 if ( srcLen == wxNO_LEN )
1476 {
1477 // count the number of bytes in input, including the trailing NULs
5c33522f 1478 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1479 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1480 ;
c91830cb 1481
467e0479
VZ
1482 srcLen *= BYTES_PER_CHAR;
1483 }
1484 else // we already have the length
1485 {
1486 // we can only convert an entire number of UTF-16 characters
1487 if ( srcLen % BYTES_PER_CHAR )
1488 return wxCONV_FAILED;
1489 }
1490
1491 return srcLen;
1492}
1493
1494// case when in-memory representation is UTF-16 too
c91830cb
VZ
1495#ifdef WC_UTF16
1496
467e0479
VZ
1497// ----------------------------------------------------------------------------
1498// conversions without endianness change
1499// ----------------------------------------------------------------------------
1500
1501size_t
1502wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1503 const char *src, size_t srcLen) const
c91830cb 1504{
467e0479
VZ
1505 // set up the scene for using memcpy() (which is presumably more efficient
1506 // than copying the bytes one by one)
1507 srcLen = GetLength(src, srcLen);
1508 if ( srcLen == wxNO_LEN )
1509 return wxCONV_FAILED;
c91830cb 1510
ef199164 1511 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1512 if ( dst )
c91830cb 1513 {
467e0479
VZ
1514 if ( dstLen < inLen )
1515 return wxCONV_FAILED;
c91830cb 1516
467e0479 1517 memcpy(dst, src, srcLen);
c91830cb 1518 }
d32a507d 1519
467e0479 1520 return inLen;
c91830cb
VZ
1521}
1522
467e0479
VZ
1523size_t
1524wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1525 const wchar_t *src, size_t srcLen) const
c91830cb 1526{
467e0479
VZ
1527 if ( srcLen == wxNO_LEN )
1528 srcLen = wxWcslen(src) + 1;
c91830cb 1529
467e0479
VZ
1530 srcLen *= BYTES_PER_CHAR;
1531
1532 if ( dst )
c91830cb 1533 {
467e0479
VZ
1534 if ( dstLen < srcLen )
1535 return wxCONV_FAILED;
d32a507d 1536
467e0479 1537 memcpy(dst, src, srcLen);
c91830cb 1538 }
d32a507d 1539
467e0479 1540 return srcLen;
c91830cb
VZ
1541}
1542
467e0479
VZ
1543// ----------------------------------------------------------------------------
1544// endian-reversing conversions
1545// ----------------------------------------------------------------------------
c91830cb 1546
467e0479
VZ
1547size_t
1548wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1549 const char *src, size_t srcLen) const
c91830cb 1550{
467e0479
VZ
1551 srcLen = GetLength(src, srcLen);
1552 if ( srcLen == wxNO_LEN )
1553 return wxCONV_FAILED;
c91830cb 1554
467e0479
VZ
1555 srcLen /= BYTES_PER_CHAR;
1556
1557 if ( dst )
c91830cb 1558 {
467e0479
VZ
1559 if ( dstLen < srcLen )
1560 return wxCONV_FAILED;
1561
5c33522f 1562 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1563 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1564 {
ef199164 1565 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1566 }
c91830cb 1567 }
bfab25d4 1568
467e0479 1569 return srcLen;
c91830cb
VZ
1570}
1571
467e0479
VZ
1572size_t
1573wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1574 const wchar_t *src, size_t srcLen) const
c91830cb 1575{
467e0479
VZ
1576 if ( srcLen == wxNO_LEN )
1577 srcLen = wxWcslen(src) + 1;
c91830cb 1578
467e0479
VZ
1579 srcLen *= BYTES_PER_CHAR;
1580
1581 if ( dst )
c91830cb 1582 {
467e0479
VZ
1583 if ( dstLen < srcLen )
1584 return wxCONV_FAILED;
1585
5c33522f 1586 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1587 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1588 {
ef199164 1589 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1590 }
c91830cb 1591 }
eec47cc6 1592
467e0479 1593 return srcLen;
c91830cb
VZ
1594}
1595
467e0479 1596#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1597
467e0479
VZ
1598// ----------------------------------------------------------------------------
1599// conversions without endianness change
1600// ----------------------------------------------------------------------------
c91830cb 1601
35d11700
VZ
1602size_t
1603wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1604 const char *src, size_t srcLen) const
c91830cb 1605{
35d11700
VZ
1606 srcLen = GetLength(src, srcLen);
1607 if ( srcLen == wxNO_LEN )
1608 return wxCONV_FAILED;
c91830cb 1609
ef199164 1610 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1611 if ( !dst )
c91830cb 1612 {
35d11700
VZ
1613 // optimization: return maximal space which could be needed for this
1614 // string even if the real size could be smaller if the buffer contains
1615 // any surrogates
1616 return inLen;
c91830cb 1617 }
c91830cb 1618
35d11700 1619 size_t outLen = 0;
5c33522f 1620 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1621 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1622 {
ef199164
DS
1623 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1624 if ( !inBuff )
35d11700
VZ
1625 return wxCONV_FAILED;
1626
1627 if ( ++outLen > dstLen )
1628 return wxCONV_FAILED;
c91830cb 1629
35d11700
VZ
1630 *dst++ = ch;
1631 }
1632
1633
1634 return outLen;
1635}
c91830cb 1636
35d11700
VZ
1637size_t
1638wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1639 const wchar_t *src, size_t srcLen) const
c91830cb 1640{
35d11700
VZ
1641 if ( srcLen == wxNO_LEN )
1642 srcLen = wxWcslen(src) + 1;
c91830cb 1643
35d11700 1644 size_t outLen = 0;
5c33522f 1645 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1646 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1647 {
1648 wxUint16 cc[2];
35d11700
VZ
1649 const size_t numChars = encode_utf16(*src++, cc);
1650 if ( numChars == wxCONV_FAILED )
1651 return wxCONV_FAILED;
c91830cb 1652
ef199164
DS
1653 outLen += numChars * BYTES_PER_CHAR;
1654 if ( outBuff )
c91830cb 1655 {
35d11700
VZ
1656 if ( outLen > dstLen )
1657 return wxCONV_FAILED;
1658
ef199164 1659 *outBuff++ = cc[0];
35d11700 1660 if ( numChars == 2 )
69b80d28 1661 {
35d11700 1662 // second character of a surrogate
ef199164 1663 *outBuff++ = cc[1];
69b80d28 1664 }
c91830cb 1665 }
c91830cb 1666 }
c91830cb 1667
35d11700 1668 return outLen;
c91830cb
VZ
1669}
1670
467e0479
VZ
1671// ----------------------------------------------------------------------------
1672// endian-reversing conversions
1673// ----------------------------------------------------------------------------
c91830cb 1674
35d11700
VZ
1675size_t
1676wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1677 const char *src, size_t srcLen) const
c91830cb 1678{
35d11700
VZ
1679 srcLen = GetLength(src, srcLen);
1680 if ( srcLen == wxNO_LEN )
1681 return wxCONV_FAILED;
1682
ef199164 1683 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1684 if ( !dst )
1685 {
1686 // optimization: return maximal space which could be needed for this
1687 // string even if the real size could be smaller if the buffer contains
1688 // any surrogates
1689 return inLen;
1690 }
c91830cb 1691
35d11700 1692 size_t outLen = 0;
5c33522f 1693 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1694 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1695 {
35d11700
VZ
1696 wxUint32 ch;
1697 wxUint16 tmp[2];
ef199164
DS
1698
1699 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1700 inBuff++;
1701 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1702
35d11700
VZ
1703 const size_t numChars = decode_utf16(tmp, ch);
1704 if ( numChars == wxCONV_FAILED )
1705 return wxCONV_FAILED;
c91830cb 1706
35d11700 1707 if ( numChars == 2 )
ef199164 1708 inBuff++;
35d11700
VZ
1709
1710 if ( ++outLen > dstLen )
1711 return wxCONV_FAILED;
c91830cb 1712
35d11700 1713 *dst++ = ch;
c91830cb 1714 }
c91830cb 1715
c91830cb 1716
35d11700
VZ
1717 return outLen;
1718}
c91830cb 1719
35d11700
VZ
1720size_t
1721wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1722 const wchar_t *src, size_t srcLen) const
c91830cb 1723{
35d11700
VZ
1724 if ( srcLen == wxNO_LEN )
1725 srcLen = wxWcslen(src) + 1;
c91830cb 1726
35d11700 1727 size_t outLen = 0;
5c33522f 1728 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1729 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1730 {
1731 wxUint16 cc[2];
35d11700
VZ
1732 const size_t numChars = encode_utf16(*src, cc);
1733 if ( numChars == wxCONV_FAILED )
1734 return wxCONV_FAILED;
c91830cb 1735
ef199164
DS
1736 outLen += numChars * BYTES_PER_CHAR;
1737 if ( outBuff )
c91830cb 1738 {
35d11700
VZ
1739 if ( outLen > dstLen )
1740 return wxCONV_FAILED;
1741
ef199164 1742 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1743 if ( numChars == 2 )
c91830cb 1744 {
35d11700 1745 // second character of a surrogate
ef199164 1746 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1747 }
1748 }
c91830cb 1749 }
c91830cb 1750
35d11700 1751 return outLen;
c91830cb
VZ
1752}
1753
467e0479 1754#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1755
1756
35d11700 1757// ============================================================================
c91830cb 1758// UTF-32
35d11700 1759// ============================================================================
c91830cb
VZ
1760
1761#ifdef WORDS_BIGENDIAN
467e0479
VZ
1762 #define wxMBConvUTF32straight wxMBConvUTF32BE
1763 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1764#else
467e0479
VZ
1765 #define wxMBConvUTF32swap wxMBConvUTF32BE
1766 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1767#endif
1768
1769
1770WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1771WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1772
467e0479
VZ
1773/* static */
1774size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1775{
1776 if ( srcLen == wxNO_LEN )
1777 {
1778 // count the number of bytes in input, including the trailing NULs
5c33522f 1779 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1780 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1781 ;
c91830cb 1782
467e0479
VZ
1783 srcLen *= BYTES_PER_CHAR;
1784 }
1785 else // we already have the length
1786 {
1787 // we can only convert an entire number of UTF-32 characters
1788 if ( srcLen % BYTES_PER_CHAR )
1789 return wxCONV_FAILED;
1790 }
1791
1792 return srcLen;
1793}
1794
1795// case when in-memory representation is UTF-16
c91830cb
VZ
1796#ifdef WC_UTF16
1797
467e0479
VZ
1798// ----------------------------------------------------------------------------
1799// conversions without endianness change
1800// ----------------------------------------------------------------------------
1801
1802size_t
1803wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1804 const char *src, size_t srcLen) const
c91830cb 1805{
467e0479
VZ
1806 srcLen = GetLength(src, srcLen);
1807 if ( srcLen == wxNO_LEN )
1808 return wxCONV_FAILED;
c91830cb 1809
5c33522f 1810 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1811 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1812 size_t outLen = 0;
1813 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1814 {
1815 wxUint16 cc[2];
ef199164 1816 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1817 if ( numChars == wxCONV_FAILED )
1818 return wxCONV_FAILED;
c91830cb 1819
467e0479
VZ
1820 outLen += numChars;
1821 if ( dst )
c91830cb 1822 {
467e0479
VZ
1823 if ( outLen > dstLen )
1824 return wxCONV_FAILED;
d32a507d 1825
467e0479
VZ
1826 *dst++ = cc[0];
1827 if ( numChars == 2 )
1828 {
1829 // second character of a surrogate
1830 *dst++ = cc[1];
1831 }
1832 }
c91830cb 1833 }
d32a507d 1834
467e0479 1835 return outLen;
c91830cb
VZ
1836}
1837
467e0479
VZ
1838size_t
1839wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1840 const wchar_t *src, size_t srcLen) const
c91830cb 1841{
467e0479
VZ
1842 if ( srcLen == wxNO_LEN )
1843 srcLen = wxWcslen(src) + 1;
c91830cb 1844
467e0479 1845 if ( !dst )
c91830cb 1846 {
467e0479
VZ
1847 // optimization: return maximal space which could be needed for this
1848 // string instead of the exact amount which could be less if there are
1849 // any surrogates in the input
1850 //
1851 // we consider that surrogates are rare enough to make it worthwhile to
1852 // avoid running the loop below at the cost of slightly extra memory
1853 // consumption
ef199164 1854 return srcLen * BYTES_PER_CHAR;
467e0479 1855 }
c91830cb 1856
5c33522f 1857 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1858 size_t outLen = 0;
1859 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1860 {
1861 const wxUint32 ch = wxDecodeSurrogate(&src);
1862 if ( !src )
1863 return wxCONV_FAILED;
c91830cb 1864
467e0479 1865 outLen += BYTES_PER_CHAR;
d32a507d 1866
467e0479
VZ
1867 if ( outLen > dstLen )
1868 return wxCONV_FAILED;
b5153fd8 1869
ef199164 1870 *outBuff++ = ch;
467e0479 1871 }
c91830cb 1872
467e0479 1873 return outLen;
c91830cb
VZ
1874}
1875
467e0479
VZ
1876// ----------------------------------------------------------------------------
1877// endian-reversing conversions
1878// ----------------------------------------------------------------------------
c91830cb 1879
467e0479
VZ
1880size_t
1881wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1882 const char *src, size_t srcLen) const
c91830cb 1883{
467e0479
VZ
1884 srcLen = GetLength(src, srcLen);
1885 if ( srcLen == wxNO_LEN )
1886 return wxCONV_FAILED;
c91830cb 1887
5c33522f 1888 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1889 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1890 size_t outLen = 0;
ef199164 1891 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1892 {
c91830cb 1893 wxUint16 cc[2];
ef199164 1894 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1895 if ( numChars == wxCONV_FAILED )
1896 return wxCONV_FAILED;
c91830cb 1897
467e0479
VZ
1898 outLen += numChars;
1899 if ( dst )
c91830cb 1900 {
467e0479
VZ
1901 if ( outLen > dstLen )
1902 return wxCONV_FAILED;
d32a507d 1903
467e0479
VZ
1904 *dst++ = cc[0];
1905 if ( numChars == 2 )
1906 {
1907 // second character of a surrogate
1908 *dst++ = cc[1];
1909 }
1910 }
c91830cb 1911 }
b5153fd8 1912
467e0479 1913 return outLen;
c91830cb
VZ
1914}
1915
467e0479
VZ
1916size_t
1917wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1918 const wchar_t *src, size_t srcLen) const
c91830cb 1919{
467e0479
VZ
1920 if ( srcLen == wxNO_LEN )
1921 srcLen = wxWcslen(src) + 1;
c91830cb 1922
467e0479 1923 if ( !dst )
c91830cb 1924 {
467e0479
VZ
1925 // optimization: return maximal space which could be needed for this
1926 // string instead of the exact amount which could be less if there are
1927 // any surrogates in the input
1928 //
1929 // we consider that surrogates are rare enough to make it worthwhile to
1930 // avoid running the loop below at the cost of slightly extra memory
1931 // consumption
1932 return srcLen*BYTES_PER_CHAR;
1933 }
c91830cb 1934
5c33522f 1935 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1936 size_t outLen = 0;
1937 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1938 {
1939 const wxUint32 ch = wxDecodeSurrogate(&src);
1940 if ( !src )
1941 return wxCONV_FAILED;
c91830cb 1942
467e0479 1943 outLen += BYTES_PER_CHAR;
d32a507d 1944
467e0479
VZ
1945 if ( outLen > dstLen )
1946 return wxCONV_FAILED;
b5153fd8 1947
ef199164 1948 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1949 }
c91830cb 1950
467e0479 1951 return outLen;
c91830cb
VZ
1952}
1953
467e0479 1954#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1955
35d11700
VZ
1956// ----------------------------------------------------------------------------
1957// conversions without endianness change
1958// ----------------------------------------------------------------------------
1959
1960size_t
1961wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1962 const char *src, size_t srcLen) const
c91830cb 1963{
35d11700
VZ
1964 // use memcpy() as it should be much faster than hand-written loop
1965 srcLen = GetLength(src, srcLen);
1966 if ( srcLen == wxNO_LEN )
1967 return wxCONV_FAILED;
c91830cb 1968
35d11700
VZ
1969 const size_t inLen = srcLen/BYTES_PER_CHAR;
1970 if ( dst )
c91830cb 1971 {
35d11700
VZ
1972 if ( dstLen < inLen )
1973 return wxCONV_FAILED;
b5153fd8 1974
35d11700
VZ
1975 memcpy(dst, src, srcLen);
1976 }
c91830cb 1977
35d11700 1978 return inLen;
c91830cb
VZ
1979}
1980
35d11700
VZ
1981size_t
1982wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1983 const wchar_t *src, size_t srcLen) const
c91830cb 1984{
35d11700
VZ
1985 if ( srcLen == wxNO_LEN )
1986 srcLen = wxWcslen(src) + 1;
1987
1988 srcLen *= BYTES_PER_CHAR;
c91830cb 1989
35d11700 1990 if ( dst )
c91830cb 1991 {
35d11700
VZ
1992 if ( dstLen < srcLen )
1993 return wxCONV_FAILED;
c91830cb 1994
35d11700 1995 memcpy(dst, src, srcLen);
c91830cb
VZ
1996 }
1997
35d11700 1998 return srcLen;
c91830cb
VZ
1999}
2000
35d11700
VZ
2001// ----------------------------------------------------------------------------
2002// endian-reversing conversions
2003// ----------------------------------------------------------------------------
c91830cb 2004
35d11700
VZ
2005size_t
2006wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2007 const char *src, size_t srcLen) const
c91830cb 2008{
35d11700
VZ
2009 srcLen = GetLength(src, srcLen);
2010 if ( srcLen == wxNO_LEN )
2011 return wxCONV_FAILED;
2012
2013 srcLen /= BYTES_PER_CHAR;
c91830cb 2014
35d11700 2015 if ( dst )
c91830cb 2016 {
35d11700
VZ
2017 if ( dstLen < srcLen )
2018 return wxCONV_FAILED;
2019
5c33522f 2020 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 2021 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 2022 {
ef199164 2023 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 2024 }
c91830cb 2025 }
b5153fd8 2026
35d11700 2027 return srcLen;
c91830cb
VZ
2028}
2029
35d11700
VZ
2030size_t
2031wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2032 const wchar_t *src, size_t srcLen) const
c91830cb 2033{
35d11700
VZ
2034 if ( srcLen == wxNO_LEN )
2035 srcLen = wxWcslen(src) + 1;
2036
2037 srcLen *= BYTES_PER_CHAR;
c91830cb 2038
35d11700 2039 if ( dst )
c91830cb 2040 {
35d11700
VZ
2041 if ( dstLen < srcLen )
2042 return wxCONV_FAILED;
2043
5c33522f 2044 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2045 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2046 {
ef199164 2047 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2048 }
c91830cb 2049 }
b5153fd8 2050
35d11700 2051 return srcLen;
c91830cb
VZ
2052}
2053
467e0479 2054#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2055
2056
36acb880
VZ
2057// ============================================================================
2058// The classes doing conversion using the iconv_xxx() functions
2059// ============================================================================
3caec1bb 2060
b040e242 2061#ifdef HAVE_ICONV
3a0d76bc 2062
b1d547eb
VS
2063// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2064// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2065// (unless there's yet another bug in glibc) the only case when iconv()
2066// returns with (size_t)-1 (which means error) and says there are 0 bytes
2067// left in the input buffer -- when _real_ error occurs,
2068// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2069// iconv() failure.
3caec1bb
VS
2070// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) checks whether the iconv() call which returned
// cres and left bufLeft bytes in the input buffer has failed.
//
// NB: the macro arguments are parenthesized so that any expression, e.g. an
//     assignment of the iconv() return value, can be safely passed to it.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the input buffer pointer to the type expected by iconv()
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value which is used for the conversion descriptors which are not valid
#define ICONV_T_INVALID ((iconv_t)-1)
2081
2082#if SIZEOF_WCHAR_T == 4
2083 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2084 #define WC_ENC wxFONTENCODING_UTF32
2085#elif SIZEOF_WCHAR_T == 2
2086 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2087 #define WC_ENC wxFONTENCODING_UTF16
2088#else // sizeof(wchar_t) != 2 nor 4
2089 // does this ever happen?
2090 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2091#endif
2092
36acb880 2093// ----------------------------------------------------------------------------
e95354ec 2094// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2095// ----------------------------------------------------------------------------
2096
e95354ec 2097class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2098{
2099public:
86501081 2100 wxMBConv_iconv(const char *name);
e95354ec 2101 virtual ~wxMBConv_iconv();
36acb880 2102
8f4b0f43
VZ
2103 // implement base class virtual methods
2104 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2105 const char *src, size_t srcLen = wxNO_LEN) const;
2106 virtual size_t FromWChar(char *dst, size_t dstLen,
2107 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2108 virtual size_t GetMBNulLen() const;
2109
ba98e032
VS
2110#if wxUSE_UNICODE_UTF8
2111 virtual bool IsUTF8() const;
2112#endif
2113
d36c9347
VZ
2114 virtual wxMBConv *Clone() const
2115 {
b64f93b6 2116 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
d36c9347
VZ
2117 p->m_minMBCharWidth = m_minMBCharWidth;
2118 return p;
2119 }
2120
e95354ec 2121 bool IsOk() const
74a7eb0b 2122 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2123
2124protected:
ef199164
DS
2125 // the iconv handlers used to translate from multibyte
2126 // to wide char and in the other direction
36acb880
VZ
2127 iconv_t m2w,
2128 w2m;
ef199164 2129
b1d547eb
VS
2130#if wxUSE_THREADS
2131 // guards access to m2w and w2m objects
2132 wxMutex m_iconvMutex;
2133#endif
36acb880
VZ
2134
2135private:
e95354ec 2136 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2137 // available on this machine, it will remain NULL
74a7eb0b 2138 static wxString ms_wcCharsetName;
36acb880
VZ
2139
2140 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2141 // different endian-ness than the native one
405d8f46 2142 static bool ms_wcNeedsSwap;
eec47cc6 2143
d36c9347
VZ
2144
2145 // name of the encoding handled by this conversion
b64f93b6 2146 const char *m_name;
d36c9347 2147
7ef3ab50 2148 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2149 // initially
2150 size_t m_minMBCharWidth;
36acb880
VZ
2151};
2152
8f115891 2153// make the constructor available for unit testing
86501081 2154WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2155{
2156 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2157 if ( !result->IsOk() )
2158 {
2159 delete result;
2160 return 0;
2161 }
ef199164 2162
8f115891
MW
2163 return result;
2164}
2165
422e411e 2166wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2167bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2168
86501081 2169wxMBConv_iconv::wxMBConv_iconv(const char *name)
b64f93b6 2170 : m_name(wxStrdup(name))
36acb880 2171{
c1464d9d 2172 m_minMBCharWidth = 0;
eec47cc6 2173
36acb880 2174 // check for charset that represents wchar_t:
74a7eb0b 2175 if ( ms_wcCharsetName.empty() )
f1339c56 2176 {
9a83f860 2177 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2178
74a7eb0b 2179#if wxUSE_FONTMAP
a243da29 2180 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
74a7eb0b 2181#else // !wxUSE_FONTMAP
a243da29 2182 static const wxChar *const names_static[] =
36acb880 2183 {
74a7eb0b 2184#if SIZEOF_WCHAR_T == 4
9a83f860 2185 wxT("UCS-4"),
da2f1172 2186#elif SIZEOF_WCHAR_T == 2
9a83f860 2187 wxT("UCS-2"),
74a7eb0b
VZ
2188#endif
2189 NULL
2190 };
a243da29 2191 const wxChar *const *names = names_static;
74a7eb0b 2192#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2193
d1f024a8 2194 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2195 {
17a1ebd1 2196 const wxString nameCS(*names);
74a7eb0b
VZ
2197
2198 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2199 wxString nameXE(nameCS);
ef199164
DS
2200
2201#ifdef WORDS_BIGENDIAN
9a83f860 2202 nameXE += wxT("BE");
ef199164 2203#else // little endian
9a83f860 2204 nameXE += wxT("LE");
ef199164 2205#endif
74a7eb0b 2206
9a83f860 2207 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2208 nameXE.c_str());
2209
86501081 2210 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2211 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2212 {
74a7eb0b 2213 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2214 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2215 nameCS.c_str());
86501081 2216 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2217
74a7eb0b
VZ
2218 // and check for bytesex ourselves:
2219 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2220 {
74a7eb0b 2221 char buf[2], *bufPtr;
e8769ed1 2222 wchar_t wbuf[2];
74a7eb0b
VZ
2223 size_t insz, outsz;
2224 size_t res;
2225
2226 buf[0] = 'A';
2227 buf[1] = 0;
2228 wbuf[0] = 0;
2229 insz = 2;
2230 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2231 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2232 bufPtr = buf;
2233
ef199164
DS
2234 res = iconv(
2235 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2236 &wbufPtr, &outsz);
74a7eb0b
VZ
2237
2238 if (ICONV_FAILED(res, insz))
2239 {
2240 wxLogLastError(wxT("iconv"));
422e411e 2241 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2242 nameCS.c_str());
74a7eb0b
VZ
2243 }
2244 else // ok, can convert to this encoding, remember it
2245 {
17a1ebd1 2246 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2247 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2248 }
3a0d76bc
VS
2249 }
2250 }
74a7eb0b 2251 else // use charset not requiring byte swapping
36acb880 2252 {
74a7eb0b 2253 ms_wcCharsetName = nameXE;
36acb880 2254 }
3a0d76bc 2255 }
74a7eb0b 2256
0944fceb 2257 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2258 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2259 ms_wcCharsetName.empty() ? wxString("<none>")
2260 : ms_wcCharsetName,
9a83f860
VZ
2261 ms_wcNeedsSwap ? wxT(" (needs swap)")
2262 : wxT(""));
3a0d76bc 2263 }
36acb880 2264 else // we already have ms_wcCharsetName
3caec1bb 2265 {
86501081 2266 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2267 }
dccce9ea 2268
74a7eb0b 2269 if ( ms_wcCharsetName.empty() )
f1339c56 2270 {
74a7eb0b 2271 w2m = ICONV_T_INVALID;
36acb880 2272 }
405d8f46
VZ
2273 else
2274 {
86501081 2275 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2276 if ( w2m == ICONV_T_INVALID )
2277 {
2278 wxLogTrace(TRACE_STRCONV,
2279 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2280 ms_wcCharsetName.c_str(), name);
74a7eb0b 2281 }
405d8f46 2282 }
36acb880 2283}
3caec1bb 2284
e95354ec 2285wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2286{
b64f93b6
VZ
2287 free(const_cast<char *>(m_name));
2288
74a7eb0b 2289 if ( m2w != ICONV_T_INVALID )
36acb880 2290 iconv_close(m2w);
74a7eb0b 2291 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2292 iconv_close(w2m);
2293}
3a0d76bc 2294
8f4b0f43
VZ
2295size_t
2296wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2297 const char *src, size_t srcLen) const
36acb880 2298{
8f4b0f43 2299 if ( srcLen == wxNO_LEN )
69373110 2300 {
8f4b0f43
VZ
2301 // find the string length: notice that must be done differently for
2302 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2303 // consecutive NULs
2304 const size_t nulLen = GetMBNulLen();
2305 switch ( nulLen )
2306 {
2307 default:
2308 return wxCONV_FAILED;
69373110 2309
8f4b0f43
VZ
2310 case 1:
2311 srcLen = strlen(src); // arguably more optimized than our version
2312 break;
69373110 2313
8f4b0f43
VZ
2314 case 2:
2315 case 4:
2316 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2317 // but they also have to start at character boundary and not
2318 // span two adjacent characters
2319 const char *p;
2320 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2321 ;
2322 srcLen = p - src;
2323 break;
2324 }
d50c0831
VZ
2325
2326 // when we're determining the length of the string ourselves we count
2327 // the terminating NUL(s) as part of it and always NUL-terminate the
2328 // output
2329 srcLen += nulLen;
69373110
VZ
2330 }
2331
8f4b0f43
VZ
2332 // we express length in the number of (wide) characters but iconv always
2333 // counts buffer sizes it in bytes
2334 dstLen *= SIZEOF_WCHAR_T;
2335
b1d547eb 2336#if wxUSE_THREADS
6a17b868
SN
2337 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2338 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2339 // wxConvLocal that are used all over wx code, so we have to make sure
2340 // the handle is used by at most one thread at the time. Otherwise
2341 // only a few wx classes would be safe to use from non-main threads
2342 // as MB<->WC conversion would fail "randomly".
2343 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2344#endif // wxUSE_THREADS
2345
36acb880 2346 size_t res, cres;
8f4b0f43 2347 const char *pszPtr = src;
36acb880 2348
8f4b0f43 2349 if ( dst )
36acb880 2350 {
8f4b0f43 2351 char* bufPtr = (char*)dst;
e8769ed1 2352
36acb880 2353 // have destination buffer, convert there
1752fda6 2354 size_t dstLenOrig = dstLen;
36acb880 2355 cres = iconv(m2w,
8f4b0f43
VZ
2356 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2357 &bufPtr, &dstLen);
1752fda6
VZ
2358
2359 // convert the number of bytes converted as returned by iconv to the
2360 // number of (wide) characters converted that we need
2361 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2362
36acb880 2363 if (ms_wcNeedsSwap)
3a0d76bc 2364 {
36acb880 2365 // convert to native endianness
17a1ebd1 2366 for ( unsigned i = 0; i < res; i++ )
467a2982 2367 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2368 }
36acb880 2369 }
8f4b0f43 2370 else // no destination buffer
36acb880 2371 {
8f4b0f43 2372 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2373 wchar_t tbuf[256];
36acb880 2374 res = 0;
ef199164
DS
2375
2376 do
2377 {
e8769ed1 2378 char* bufPtr = (char*)tbuf;
8f4b0f43 2379 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2380
2381 cres = iconv(m2w,
8f4b0f43
VZ
2382 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2383 &bufPtr, &dstLen );
36acb880 2384
8f4b0f43 2385 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2386 }
2387 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2388 }
dccce9ea 2389
8f4b0f43 2390 if (ICONV_FAILED(cres, srcLen))
f1339c56 2391 {
36acb880 2392 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2393 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2394 return wxCONV_FAILED;
36acb880
VZ
2395 }
2396
2397 return res;
2398}
2399
8f4b0f43
VZ
2400size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2401 const wchar_t *src, size_t srcLen) const
36acb880 2402{
b1d547eb
VS
2403#if wxUSE_THREADS
2404 // NB: explained in MB2WC
2405 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2406#endif
3698ae71 2407
8f4b0f43 2408 if ( srcLen == wxNO_LEN )
2588ee86 2409 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2410
2411 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2412 size_t outbuflen = dstLen;
36acb880 2413 size_t res, cres;
3a0d76bc 2414
36acb880 2415 wchar_t *tmpbuf = 0;
3caec1bb 2416
36acb880
VZ
2417 if (ms_wcNeedsSwap)
2418 {
2419 // need to copy to temp buffer to switch endianness
51725fc0 2420 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2421 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2422 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2423 for ( size_t i = 0; i < srcLen; i++ )
2424 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2425
8f4b0f43 2426 src = tmpbuf;
36acb880 2427 }
3a0d76bc 2428
8f4b0f43
VZ
2429 char* inbuf = (char*)src;
2430 if ( dst )
36acb880
VZ
2431 {
2432 // have destination buffer, convert there
8f4b0f43 2433 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2434
8f4b0f43 2435 res = dstLen - outbuflen;
36acb880 2436 }
8f4b0f43 2437 else // no destination buffer
36acb880 2438 {
8f4b0f43 2439 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2440 char tbuf[256];
36acb880 2441 res = 0;
ef199164
DS
2442 do
2443 {
8f4b0f43 2444 dst = tbuf;
51725fc0 2445 outbuflen = WXSIZEOF(tbuf);
36acb880 2446
8f4b0f43 2447 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2448
51725fc0 2449 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2450 }
2451 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2452 }
dccce9ea 2453
36acb880
VZ
2454 if (ms_wcNeedsSwap)
2455 {
2456 free(tmpbuf);
2457 }
dccce9ea 2458
e8769ed1 2459 if (ICONV_FAILED(cres, inbuflen))
36acb880 2460 {
ce6f8d6f 2461 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2462 return wxCONV_FAILED;
36acb880
VZ
2463 }
2464
2465 return res;
2466}
2467
7ef3ab50 2468size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2469{
c1464d9d 2470 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2471 {
2472 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2473
2474#if wxUSE_THREADS
2475 // NB: explained in MB2WC
2476 wxMutexLocker lock(self->m_iconvMutex);
2477#endif
2478
999020e1 2479 const wchar_t *wnul = L"";
c1464d9d 2480 char buf[8]; // should be enough for NUL in any encoding
356410fc 2481 size_t inLen = sizeof(wchar_t),
c1464d9d 2482 outLen = WXSIZEOF(buf);
ef199164
DS
2483 char *inBuff = (char *)wnul;
2484 char *outBuff = buf;
2485 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2486 {
c1464d9d 2487 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2488 }
2489 else // ok
2490 {
ef199164 2491 self->m_minMBCharWidth = outBuff - buf;
356410fc 2492 }
eec47cc6
VZ
2493 }
2494
c1464d9d 2495 return m_minMBCharWidth;
eec47cc6
VZ
2496}
2497
ba98e032
VS
2498#if wxUSE_UNICODE_UTF8
2499bool wxMBConv_iconv::IsUTF8() const
2500{
86501081
VS
2501 return wxStricmp(m_name, "UTF-8") == 0 ||
2502 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2503}
2504#endif
2505
b040e242 2506#endif // HAVE_ICONV
36acb880 2507
e95354ec 2508
36acb880
VZ
2509// ============================================================================
2510// Win32 conversion classes
2511// ============================================================================
1cd52418 2512
e95354ec 2513#ifdef wxHAVE_WIN32_MB2WC
373658eb 2514
8b04d4c4 2515// from utils.cpp
d775fa82 2516#if wxUSE_FONTMAP
86501081 2517extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2518extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2519#endif
373658eb 2520
e95354ec 2521class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2522{
2523public:
bde4baac
VZ
2524 wxMBConv_win32()
2525 {
2526 m_CodePage = CP_ACP;
c1464d9d 2527 m_minMBCharWidth = 0;
bde4baac
VZ
2528 }
2529
d36c9347 2530 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2531 : wxMBConv()
d36c9347
VZ
2532 {
2533 m_CodePage = conv.m_CodePage;
2534 m_minMBCharWidth = conv.m_minMBCharWidth;
2535 }
2536
7608a683 2537#if wxUSE_FONTMAP
86501081 2538 wxMBConv_win32(const char* name)
bde4baac
VZ
2539 {
2540 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2541 m_minMBCharWidth = 0;
bde4baac 2542 }
dccce9ea 2543
e95354ec 2544 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2545 {
2546 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2547 m_minMBCharWidth = 0;
bde4baac 2548 }
eec47cc6 2549#endif // wxUSE_FONTMAP
8b04d4c4 2550
d36c9347 2551 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2552 {
02272c9c
VZ
2553 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2554 // the behaviour is not compatible with the Unix version (using iconv)
2555 // and break the library itself, e.g. wxTextInputStream::NextChar()
2556 // wouldn't work if reading an incomplete MB char didn't result in an
2557 // error
667e5b3e 2558 //
89028980 2559 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2560 // Win XP or newer and it is not supported for UTF-[78] so we always
2561 // use our own conversions in this case. See
89028980
VS
2562 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2563 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2564 if ( m_CodePage == CP_UTF8 )
89028980 2565 {
5487ff0f 2566 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2567 }
830f8f11
VZ
2568
2569 if ( m_CodePage == CP_UTF7 )
2570 {
5487ff0f 2571 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2572 }
2573
2574 int flags = 0;
2575 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2576 IsAtLeastWin2kSP4() )
89028980 2577 {
830f8f11 2578 flags = MB_ERR_INVALID_CHARS;
89028980 2579 }
667e5b3e 2580
2b5f62a0
VZ
2581 const size_t len = ::MultiByteToWideChar
2582 (
2583 m_CodePage, // code page
667e5b3e 2584 flags, // flags: fall on error
2b5f62a0
VZ
2585 psz, // input string
2586 -1, // its length (NUL-terminated)
b4da152e 2587 buf, // output string
2b5f62a0
VZ
2588 buf ? n : 0 // size of output buffer
2589 );
89028980
VS
2590 if ( !len )
2591 {
2592 // function totally failed
467e0479 2593 return wxCONV_FAILED;
89028980
VS
2594 }
2595
2596 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2597 // check if we succeeded, by doing a double trip:
2598 if ( !flags && buf )
2599 {
53c174fc
VZ
2600 const size_t mbLen = strlen(psz);
2601 wxCharBuffer mbBuf(mbLen);
89028980
VS
2602 if ( ::WideCharToMultiByte
2603 (
2604 m_CodePage,
2605 0,
2606 buf,
2607 -1,
2608 mbBuf.data(),
53c174fc 2609 mbLen + 1, // size in bytes, not length
89028980
VS
2610 NULL,
2611 NULL
2612 ) == 0 ||
2613 strcmp(mbBuf, psz) != 0 )
2614 {
2615 // we didn't obtain the same thing we started from, hence
2616 // the conversion was lossy and we consider that it failed
467e0479 2617 return wxCONV_FAILED;
89028980
VS
2618 }
2619 }
2b5f62a0 2620
03a991bc
VZ
2621 // note that it returns count of written chars for buf != NULL and size
2622 // of the needed buffer for buf == NULL so in either case the length of
2623 // the string (which never includes the terminating NUL) is one less
89028980 2624 return len - 1;
f1339c56 2625 }
dccce9ea 2626
d36c9347 2627 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2628 {
13dd924a
VZ
2629 /*
2630 we have a problem here: by default, WideCharToMultiByte() may
2631 replace characters unrepresentable in the target code page with bad
2632 quality approximations such as turning "1/2" symbol (U+00BD) into
2633 "1" for the code pages which don't have it and we, obviously, want
2634 to avoid this at any price
d775fa82 2635
13dd924a
VZ
2636 the trouble is that this function does it _silently_, i.e. it won't
2637 even tell us whether it did or not... Win98/2000 and higher provide
2638 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2639 we have to resort to a round trip, i.e. check that converting back
2640 results in the same string -- this is, of course, expensive but
2641 otherwise we simply can't be sure to not garble the data.
2642 */
2643
2644 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2645 // it doesn't work with CJK encodings (which we test for rather roughly
2646 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2647 // supporting it
907173e5
WS
2648 BOOL usedDef wxDUMMY_INITIALIZE(false);
2649 BOOL *pUsedDef;
13dd924a
VZ
2650 int flags;
2651 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2652 {
2653 // it's our lucky day
2654 flags = WC_NO_BEST_FIT_CHARS;
2655 pUsedDef = &usedDef;
2656 }
2657 else // old system or unsupported encoding
2658 {
2659 flags = 0;
2660 pUsedDef = NULL;
2661 }
2662
2b5f62a0
VZ
2663 const size_t len = ::WideCharToMultiByte
2664 (
2665 m_CodePage, // code page
13dd924a
VZ
2666 flags, // either none or no best fit
2667 pwz, // input string
2b5f62a0
VZ
2668 -1, // it is (wide) NUL-terminated
2669 buf, // output buffer
2670 buf ? n : 0, // and its size
2671 NULL, // default "replacement" char
13dd924a 2672 pUsedDef // [out] was it used?
2b5f62a0
VZ
2673 );
2674
13dd924a
VZ
2675 if ( !len )
2676 {
2677 // function totally failed
467e0479 2678 return wxCONV_FAILED;
13dd924a
VZ
2679 }
2680
765bdb4a
VZ
2681 // we did something, check if we really succeeded
2682 if ( flags )
13dd924a 2683 {
765bdb4a
VZ
2684 // check if the conversion failed, i.e. if any replacements
2685 // were done
2686 if ( usedDef )
2687 return wxCONV_FAILED;
2688 }
2689 else // we must resort to double tripping...
2690 {
2691 // first we need to ensure that we really have the MB data: this is
2692 // not the case if we're called with NULL buffer, in which case we
2693 // need to do the conversion yet again
2694 wxCharBuffer bufDef;
2695 if ( !buf )
13dd924a 2696 {
765bdb4a
VZ
2697 bufDef = wxCharBuffer(len);
2698 buf = bufDef.data();
2699 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2700 buf, len, NULL, NULL) )
467e0479 2701 return wxCONV_FAILED;
13dd924a 2702 }
765bdb4a 2703
564da6ff
VZ
2704 if ( !n )
2705 n = wcslen(pwz);
765bdb4a 2706 wxWCharBuffer wcBuf(n);
564da6ff 2707 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2708 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2709 {
765bdb4a
VZ
2710 // we didn't obtain the same thing we started from, hence
2711 // the conversion was lossy and we consider that it failed
2712 return wxCONV_FAILED;
13dd924a
VZ
2713 }
2714 }
2715
03a991bc 2716 // see the comment above for the reason of "len - 1"
13dd924a 2717 return len - 1;
f1339c56 2718 }
dccce9ea 2719
7ef3ab50
VZ
2720 virtual size_t GetMBNulLen() const
2721 {
2722 if ( m_minMBCharWidth == 0 )
2723 {
2724 int len = ::WideCharToMultiByte
2725 (
2726 m_CodePage, // code page
2727 0, // no flags
2728 L"", // input string
2729 1, // translate just the NUL
2730 NULL, // output buffer
2731 0, // and its size
2732 NULL, // no replacement char
2733 NULL // [out] don't care if it was used
2734 );
2735
2736 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2737 switch ( len )
2738 {
2739 default:
9a83f860 2740 wxLogDebug(wxT("Unexpected NUL length %d"), len);
ef199164
DS
2741 self->m_minMBCharWidth = (size_t)-1;
2742 break;
7ef3ab50
VZ
2743
2744 case 0:
2745 self->m_minMBCharWidth = (size_t)-1;
2746 break;
2747
2748 case 1:
2749 case 2:
2750 case 4:
2751 self->m_minMBCharWidth = len;
2752 break;
2753 }
2754 }
2755
2756 return m_minMBCharWidth;
2757 }
2758
d36c9347
VZ
2759 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2760
13dd924a
VZ
2761 bool IsOk() const { return m_CodePage != -1; }
2762
2763private:
2764 static bool CanUseNoBestFit()
2765 {
2766 static int s_isWin98Or2k = -1;
2767
2768 if ( s_isWin98Or2k == -1 )
2769 {
2770 int verMaj, verMin;
2771 switch ( wxGetOsVersion(&verMaj, &verMin) )
2772 {
406d283a 2773 case wxOS_WINDOWS_9X:
13dd924a
VZ
2774 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2775 break;
2776
406d283a 2777 case wxOS_WINDOWS_NT:
13dd924a
VZ
2778 s_isWin98Or2k = verMaj >= 5;
2779 break;
2780
2781 default:
ef199164 2782 // unknown: be conservative by default
13dd924a 2783 s_isWin98Or2k = 0;
ef199164 2784 break;
13dd924a
VZ
2785 }
2786
9a83f860 2787 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
13dd924a
VZ
2788 }
2789
2790 return s_isWin98Or2k == 1;
2791 }
f1339c56 2792
89028980
VS
2793 static bool IsAtLeastWin2kSP4()
2794 {
8942f83a
WS
2795#ifdef __WXWINCE__
2796 return false;
2797#else
89028980
VS
2798 static int s_isAtLeastWin2kSP4 = -1;
2799
2800 if ( s_isAtLeastWin2kSP4 == -1 )
2801 {
2802 OSVERSIONINFOEX ver;
2803
2804 memset(&ver, 0, sizeof(ver));
2805 ver.dwOSVersionInfoSize = sizeof(ver);
2806 GetVersionEx((OSVERSIONINFO*)&ver);
2807
2808 s_isAtLeastWin2kSP4 =
2809 ((ver.dwMajorVersion > 5) || // Vista+
2810 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2811 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2812 ver.wServicePackMajor >= 4)) // 2000 SP4+
2813 ? 1 : 0;
2814 }
2815
2816 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2817#endif
89028980
VS
2818 }
2819
eec47cc6 2820
c1464d9d 2821 // the code page we're working with
b1d66b54 2822 long m_CodePage;
c1464d9d 2823
7ef3ab50 2824 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2825 // "unknown"
2826 size_t m_minMBCharWidth;
1cd52418 2827};
e95354ec
VZ
2828
2829#endif // wxHAVE_WIN32_MB2WC
2830
f7e98dee 2831
36acb880
VZ
2832// ============================================================================
2833// wxEncodingConverter based conversion classes
2834// ============================================================================
2835
1e6feb95 2836#if wxUSE_FONTMAP
1cd52418 2837
e95354ec 2838class wxMBConv_wxwin : public wxMBConv
1cd52418 2839{
8b04d4c4
VZ
2840private:
2841 void Init()
2842 {
6ac84a78
DE
2843 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2844 // The wxMBConv_cf class does a better job.
2845 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2846 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2847 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2848 }
2849
6001e347 2850public:
f1339c56
RR
2851 // temporarily just use wxEncodingConverter stuff,
2852 // so that it works while a better implementation is built
86501081 2853 wxMBConv_wxwin(const char* name)
f1339c56
RR
2854 {
2855 if (name)
267e11c5 2856 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2857 else
2858 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2859
8b04d4c4
VZ
2860 Init();
2861 }
2862
e95354ec 2863 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2864 {
2865 m_enc = enc;
2866
2867 Init();
f1339c56 2868 }
dccce9ea 2869
bde4baac 2870 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2871 {
2872 size_t inbuf = strlen(psz);
dccce9ea 2873 if (buf)
c643a977 2874 {
ef199164 2875 if (!m2w.Convert(psz, buf))
467e0479 2876 return wxCONV_FAILED;
c643a977 2877 }
f1339c56
RR
2878 return inbuf;
2879 }
dccce9ea 2880
bde4baac 2881 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2882 {
f8d791e0 2883 const size_t inbuf = wxWcslen(psz);
f1339c56 2884 if (buf)
c643a977 2885 {
ef199164 2886 if (!w2m.Convert(psz, buf))
467e0479 2887 return wxCONV_FAILED;
c643a977 2888 }
dccce9ea 2889
f1339c56
RR
2890 return inbuf;
2891 }
dccce9ea 2892
7ef3ab50 2893 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2894 {
2895 switch ( m_enc )
2896 {
2897 case wxFONTENCODING_UTF16BE:
2898 case wxFONTENCODING_UTF16LE:
c1464d9d 2899 return 2;
eec47cc6
VZ
2900
2901 case wxFONTENCODING_UTF32BE:
2902 case wxFONTENCODING_UTF32LE:
c1464d9d 2903 return 4;
eec47cc6
VZ
2904
2905 default:
c1464d9d 2906 return 1;
eec47cc6
VZ
2907 }
2908 }
2909
d36c9347
VZ
2910 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2911
7ef3ab50
VZ
2912 bool IsOk() const { return m_ok; }
2913
2914public:
2915 wxFontEncoding m_enc;
2916 wxEncodingConverter m2w, w2m;
2917
2918private:
cafbf6fb
VZ
2919 // were we initialized successfully?
2920 bool m_ok;
fc7a2a60 2921
c0c133e1 2922 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2923};
6001e347 2924
8f115891 2925// make the constructors available for unit testing
86501081 2926WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2927{
2928 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2929 if ( !result->IsOk() )
2930 {
2931 delete result;
2932 return 0;
2933 }
ef199164 2934
8f115891
MW
2935 return result;
2936}
2937
1e6feb95
VZ
2938#endif // wxUSE_FONTMAP
2939
36acb880
VZ
2940// ============================================================================
2941// wxCSConv implementation
2942// ============================================================================
2943
8b04d4c4 2944void wxCSConv::Init()
6001e347 2945{
e95354ec
VZ
2946 m_name = NULL;
2947 m_convReal = NULL;
6c4d607e
VZ
2948}
2949
2950void wxCSConv::SetEncoding(wxFontEncoding encoding)
2951{
2952 switch ( encoding )
2953 {
2954 case wxFONTENCODING_MAX:
2955 case wxFONTENCODING_SYSTEM:
2956 if ( m_name )
2957 {
2958 // It's ok to not have encoding value if we have a name for it.
2959 m_encoding = wxFONTENCODING_SYSTEM;
2960 }
2961 else // No name neither.
2962 {
2963 // Fall back to the system default encoding in this case (not
2964 // sure how much sense does this make but this is how the old
2965 // code used to behave).
2966#if wxUSE_INTL
2967 m_encoding = wxLocale::GetSystemEncoding();
2968 if ( m_encoding == wxFONTENCODING_SYSTEM )
2969#endif // wxUSE_INTL
2970 m_encoding = wxFONTENCODING_ISO8859_1;
2971 }
2972 break;
2973
2974 case wxFONTENCODING_DEFAULT:
2975 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2976 m_encoding = wxFONTENCODING_ISO8859_1;
2977 break;
2978
2979 default:
2980 // Just use the provided encoding.
2981 m_encoding = encoding;
2982 }
e95354ec
VZ
2983}
2984
86501081 2985wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2986{
2987 Init();
82713003 2988
86501081 2989 if ( !charset.empty() )
e95354ec 2990 {
86501081 2991 SetName(charset.ToAscii());
e95354ec 2992 }
bda3d86a 2993
e4277538 2994#if wxUSE_FONTMAP
6c4d607e 2995 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
e4277538 2996#else
6c4d607e 2997 SetEncoding(wxFONTENCODING_SYSTEM);
e4277538 2998#endif
6c4d607e
VZ
2999
3000 m_convReal = DoCreate();
6001e347
RR
3001}
3002
8b04d4c4
VZ
3003wxCSConv::wxCSConv(wxFontEncoding encoding)
3004{
bda3d86a 3005 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 3006 {
9a83f860 3007 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
3008
3009 encoding = wxFONTENCODING_SYSTEM;
3010 }
3011
8b04d4c4
VZ
3012 Init();
3013
6c4d607e
VZ
3014 SetEncoding(encoding);
3015
3016 m_convReal = DoCreate();
8b04d4c4
VZ
3017}
3018
6001e347
RR
3019wxCSConv::~wxCSConv()
3020{
65e50848
JS
3021 Clear();
3022}
3023
54380f29 3024wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 3025 : wxMBConv()
54380f29 3026{
8b04d4c4
VZ
3027 Init();
3028
54380f29 3029 SetName(conv.m_name);
6c4d607e
VZ
3030 SetEncoding(conv.m_encoding);
3031
3032 m_convReal = DoCreate();
54380f29
GD
3033}
3034
3035wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3036{
3037 Clear();
8b04d4c4 3038
54380f29 3039 SetName(conv.m_name);
6c4d607e
VZ
3040 SetEncoding(conv.m_encoding);
3041
3042 m_convReal = DoCreate();
8b04d4c4 3043
54380f29
GD
3044 return *this;
3045}
3046
65e50848
JS
3047void wxCSConv::Clear()
3048{
8b04d4c4 3049 free(m_name);
65e50848 3050 m_name = NULL;
6c4d607e
VZ
3051
3052 wxDELETE(m_convReal);
6001e347
RR
3053}
3054
86501081 3055void wxCSConv::SetName(const char *charset)
6001e347 3056{
6c4d607e 3057 if ( charset )
d6f2a891 3058 m_name = wxStrdup(charset);
6001e347
RR
3059}
3060
8b3eb85d 3061#if wxUSE_FONTMAP
8b3eb85d
VZ
3062
3063WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 3064 wxEncodingNameCache );
8b3eb85d
VZ
3065
3066static wxEncodingNameCache gs_nameCache;
3067#endif
3068
e95354ec
VZ
3069wxMBConv *wxCSConv::DoCreate() const
3070{
ce6f8d6f
VZ
3071#if wxUSE_FONTMAP
3072 wxLogTrace(TRACE_STRCONV,
3073 wxT("creating conversion for %s"),
3074 (m_name ? m_name
86501081 3075 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3076#endif // wxUSE_FONTMAP
3077
c547282d
VZ
3078 // check for the special case of ASCII or ISO8859-1 charset: as we have
3079 // special knowledge of it anyhow, we don't need to create a special
3080 // conversion object
6c4d607e 3081 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
f1339c56 3082 {
e95354ec
VZ
3083 // don't convert at all
3084 return NULL;
3085 }
dccce9ea 3086
e95354ec
VZ
3087 // we trust OS to do conversion better than we can so try external
3088 // conversion methods first
3089 //
3090 // the full order is:
3091 // 1. OS conversion (iconv() under Unix or Win32 API)
3092 // 2. hard coded conversions for UTF
3093 // 3. wxEncodingConverter as fall back
3094
3095 // step (1)
3096#ifdef HAVE_ICONV
c547282d 3097#if !wxUSE_FONTMAP
e95354ec 3098 if ( m_name )
c547282d 3099#endif // !wxUSE_FONTMAP
e95354ec 3100 {
3ef10cfc 3101#if wxUSE_FONTMAP
8b3eb85d 3102 wxFontEncoding encoding(m_encoding);
3ef10cfc 3103#endif
8b3eb85d 3104
86501081 3105 if ( m_name )
8b3eb85d 3106 {
86501081 3107 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3108 if ( conv->IsOk() )
3109 return conv;
3110
3111 delete conv;
c547282d
VZ
3112
3113#if wxUSE_FONTMAP
8b3eb85d 3114 encoding =
86501081 3115 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3116#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3117 }
3118#if wxUSE_FONTMAP
3119 {
3120 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3121 if ( it != gs_nameCache.end() )
3122 {
3123 if ( it->second.empty() )
3124 return NULL;
c547282d 3125
86501081 3126 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3127 if ( conv->IsOk() )
3128 return conv;
e95354ec 3129
8b3eb85d
VZ
3130 delete conv;
3131 }
3132
a243da29 3133 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3134 // CS : in case this does not return valid names (eg for MacRoman)
3135 // encoding got a 'failure' entry in the cache all the same,
3136 // although it just has to be created using a different method, so
3137 // only store failed iconv creation attempts (or perhaps we
3138 // shoulnd't do this at all ?)
3c67ec06 3139 if ( names[0] != NULL )
8b3eb85d 3140 {
3c67ec06 3141 for ( ; *names; ++names )
8b3eb85d 3142 {
86501081
VS
3143 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3144 // will need changes that will obsolete this
3145 wxString name(*names);
3146 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3147 if ( conv->IsOk() )
3148 {
3149 gs_nameCache[encoding] = *names;
3150 return conv;
3151 }
3152
3153 delete conv;
8b3eb85d
VZ
3154 }
3155
9a83f860 3156 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3157 }
8b3eb85d
VZ
3158 }
3159#endif // wxUSE_FONTMAP
e95354ec
VZ
3160 }
3161#endif // HAVE_ICONV
3162
3163#ifdef wxHAVE_WIN32_MB2WC
3164 {
7608a683 3165#if wxUSE_FONTMAP
e95354ec
VZ
3166 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3167 : new wxMBConv_win32(m_encoding);
3168 if ( conv->IsOk() )
3169 return conv;
3170
3171 delete conv;
7608a683
WS
3172#else
3173 return NULL;
3174#endif
e95354ec
VZ
3175 }
3176#endif // wxHAVE_WIN32_MB2WC
ef199164 3177
5c4ed98d 3178#ifdef __DARWIN__
f7e98dee 3179 {
6ff49cbc
DE
3180 // leave UTF16 and UTF32 to the built-ins of wx
3181 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3182 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3183 {
a6900d10 3184#if wxUSE_FONTMAP
5c4ed98d
DE
3185 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3186 : new wxMBConv_cf(m_encoding);
a6900d10 3187#else
5c4ed98d 3188 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3189#endif
ef199164 3190
f7e98dee 3191 if ( conv->IsOk() )
d775fa82
WS
3192 return conv;
3193
3194 delete conv;
3195 }
335d31e0 3196 }
5c4ed98d
DE
3197#endif // __DARWIN__
3198
e95354ec
VZ
3199 // step (2)
3200 wxFontEncoding enc = m_encoding;
3201#if wxUSE_FONTMAP
c547282d
VZ
3202 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3203 {
3204 // use "false" to suppress interactive dialogs -- we can be called from
3205 // anywhere and popping up a dialog from here is the last thing we want to
3206 // do
267e11c5 3207 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3208 }
e95354ec
VZ
3209#endif // wxUSE_FONTMAP
3210
3211 switch ( enc )
3212 {
3213 case wxFONTENCODING_UTF7:
3214 return new wxMBConvUTF7;
3215
3216 case wxFONTENCODING_UTF8:
3217 return new wxMBConvUTF8;
3218
e95354ec
VZ
3219 case wxFONTENCODING_UTF16BE:
3220 return new wxMBConvUTF16BE;
3221
3222 case wxFONTENCODING_UTF16LE:
3223 return new wxMBConvUTF16LE;
3224
e95354ec
VZ
3225 case wxFONTENCODING_UTF32BE:
3226 return new wxMBConvUTF32BE;
3227
3228 case wxFONTENCODING_UTF32LE:
3229 return new wxMBConvUTF32LE;
3230
3231 default:
3232 // nothing to do but put here to suppress gcc warnings
ef199164 3233 break;
e95354ec
VZ
3234 }
3235
3236 // step (3)
3237#if wxUSE_FONTMAP
3238 {
3239 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3240 : new wxMBConv_wxwin(m_encoding);
3241 if ( conv->IsOk() )
3242 return conv;
3243
3244 delete conv;
3245 }
ef199164 3246
3df31b2d
VZ
3247 wxLogTrace(TRACE_STRCONV,
3248 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3249 (m_name ? wxString(m_name)
3df31b2d
VZ
3250 : wxFontMapperBase::GetEncodingName(m_encoding)));
3251#endif // wxUSE_FONTMAP
e95354ec
VZ
3252
3253 return NULL;
3254}
3255
0f0298b1
VZ
3256bool wxCSConv::IsOk() const
3257{
0f0298b1
VZ
3258 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3259 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3260 return true; // always ok as we do it ourselves
3261
3262 // m_convReal->IsOk() is called at its own creation, so we know it must
3263 // be ok if m_convReal is non-NULL
3264 return m_convReal != NULL;
3265}
3266
1c714a5d
VZ
3267size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3268 const char *src, size_t srcLen) const
3269{
2c74c558
VS
3270 if (m_convReal)
3271 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3272
3273 // latin-1 (direct)
05392dc8
VZ
3274 if ( srcLen == wxNO_LEN )
3275 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3276
05392dc8
VZ
3277 if ( dst )
3278 {
3279 if ( dstLen < srcLen )
3280 return wxCONV_FAILED;
1c714a5d 3281
05392dc8
VZ
3282 for ( size_t n = 0; n < srcLen; n++ )
3283 dst[n] = (unsigned char)(src[n]);
3284 }
2c74c558 3285
05392dc8 3286 return srcLen;
1c714a5d
VZ
3287}
3288
05392dc8
VZ
3289size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3290 const wchar_t *src, size_t srcLen) const
6001e347 3291{
e95354ec 3292 if (m_convReal)
05392dc8 3293 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3294
3295 // latin-1 (direct)
05392dc8
VZ
3296 if ( srcLen == wxNO_LEN )
3297 srcLen = wxWcslen(src) + 1;
dccce9ea 3298
05392dc8 3299 if ( dst )
f1339c56 3300 {
05392dc8
VZ
3301 if ( dstLen < srcLen )
3302 return wxCONV_FAILED;
1cd52418 3303
05392dc8 3304 for ( size_t n = 0; n < srcLen; n++ )
24642831 3305 {
05392dc8 3306 if ( src[n] > 0xFF )
467e0479 3307 return wxCONV_FAILED;
ef199164 3308
05392dc8 3309 dst[n] = (char)src[n];
24642831 3310 }
05392dc8 3311
24642831 3312 }
05392dc8 3313 else // still need to check the input validity
24642831 3314 {
05392dc8 3315 for ( size_t n = 0; n < srcLen; n++ )
24642831 3316 {
05392dc8 3317 if ( src[n] > 0xFF )
467e0479 3318 return wxCONV_FAILED;
24642831 3319 }
f1339c56 3320 }
dccce9ea 3321
05392dc8 3322 return srcLen;
6001e347
RR
3323}
3324
7ef3ab50 3325size_t wxCSConv::GetMBNulLen() const
eec47cc6 3326{
eec47cc6 3327 if ( m_convReal )
7ef3ab50 3328 return m_convReal->GetMBNulLen();
eec47cc6 3329
ba98e032 3330 // otherwise, we are ISO-8859-1
c1464d9d 3331 return 1;
eec47cc6
VZ
3332}
3333
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// Return true if the underlying converter says that it is an UTF-8 one.
// Without a real converter we are ISO-8859-1, hence never UTF-8.
bool wxCSConv::IsUTF8() const
{
    return m_convReal && m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3344
69c928ef
VZ
3345
3346#if wxUSE_UNICODE
3347
3348wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3349{
3350 if ( !s )
3351 return wxWCharBuffer();
3352
3353 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3354 if ( !wbuf )
5487ff0f 3355 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3356 if ( !wbuf )
3357 wbuf = wxConvISO8859_1.cMB2WX(s);
3358
3359 return wbuf;
3360}
3361
3362wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3363{
3364 if ( !ws )
3365 return wxCharBuffer();
3366
3367 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3368 if ( !buf )
3369 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3370
3371 return buf;
3372}
3373
3374#endif // wxUSE_UNICODE
f5a1953b 3375
1e50d914
VS
3376// ----------------------------------------------------------------------------
3377// globals
3378// ----------------------------------------------------------------------------
3379
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3386
// these names are undefined here so that they can be used as plain
// identifiers in the definitions below
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define the global converter "name": this expands into the exported pointer
// name##Ptr (initially NULL), the exported accessor wxGet_##name##Ptr()
// returning the address of a function-local static object of type impl_klass
// constructed with ctor_args, and a file-static variable initialized by
// calling this accessor.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the implementation class is the
// same as the class of the exported pointer.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3407
5c69ef61
VZ
3408#ifdef __INTELC__
3409 // disable warning "variable 'xxx' was declared but never referenced"
3410 #pragma warning(disable: 177)
3411#endif // Intel C++
3412
1e50d914
VS
3413#ifdef __WINDOWS__
3414 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3415#elif 0 // defined(__WXOSX__)
3416 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3417#else
3418 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3419#endif
3420
e1079eda
VZ
3421// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3422// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3423// provokes an error message about "not enough macro parameters"; and we
3424// can't use "()" here as the name##Obj declaration would be parsed as a
3425// function declaration then, so use a semicolon and live with an extra
3426// empty statement (and hope that no compilers warns about this)
3427WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3428WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3429
3430WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3431WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3432
3433WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3434WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3435
6ac84a78 3436#ifdef __DARWIN__
8244507f
VZ
3437// It is important to use this conversion object under Darwin as it ensures
3438// that Unicode strings are (re)composed correctly even though xnu kernel uses
3439// decomposed form internally (at least for the file names).
6ac84a78 3440static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3441#endif
6ac84a78 3442
1e50d914 3443WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3444#ifdef __DARWIN__
1e50d914 3445 &wxConvMacUTF8DObj;
6ac84a78 3446#else // !__DARWIN__
1e50d914 3447 wxGet_wxConvLibcPtr();
6ac84a78 3448#endif // __DARWIN__/!__DARWIN__