]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
fixing build with wxUSE_PROTOCOL = 1 and wxUSE_SOCKETS = 0
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
1c193821 31#ifndef __WXWINCE__
1cd52418 32#include <errno.h>
1c193821
JS
33#endif
34
6001e347
RR
35#include <ctype.h>
36#include <string.h>
37#include <stdlib.h>
38
e95354ec 39#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
e95354ec 42 #define wxHAVE_WIN32_MB2WC
ef199164 43#endif
e95354ec 44
b040e242 45#ifdef HAVE_ICONV
373658eb 46 #include <iconv.h>
b1d547eb 47 #include "wx/thread.h"
1cd52418 48#endif
1cd52418 49
373658eb
VZ
50#include "wx/encconv.h"
51#include "wx/fontmap.h"
52
5c4ed98d 53#ifdef __DARWIN__
c933e267 54#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
55#endif //def __DARWIN__
56
ef199164 57
9a83f860 58#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 59
467e0479
VZ
60// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61// be 4 bytes
4948c2b6 62#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
63 #define WC_UTF16
64#endif
65
ef199164 66
373658eb
VZ
67// ============================================================================
68// implementation
69// ============================================================================
70
69373110
VZ
// Helper for the conversion functions below: returns true if at least one of
// the n bytes starting at p is not NUL and false if all of them are NUL (which
// is trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
79
373658eb 80// ----------------------------------------------------------------------------
467e0479 81// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 82// ----------------------------------------------------------------------------
6001e347 83
c91830cb 84static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 85{
ef199164 86 if (input <= 0xffff)
4def3b35 87 {
999836aa
VZ
88 if (output)
89 *output = (wxUint16) input;
ef199164 90
4def3b35 91 return 1;
dccce9ea 92 }
ef199164 93 else if (input >= 0x110000)
4def3b35 94 {
467e0479 95 return wxCONV_FAILED;
dccce9ea
VZ
96 }
97 else
4def3b35 98 {
dccce9ea 99 if (output)
4def3b35 100 {
ef199164
DS
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 103 }
ef199164 104
4def3b35 105 return 2;
1cd52418 106 }
1cd52418
OK
107}
108
c91830cb 109static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 110{
ef199164 111 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
112 {
113 output = *input;
114 return 1;
dccce9ea 115 }
ef199164 116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
117 {
118 output = *input;
467e0479 119 return wxCONV_FAILED;
dccce9ea
VZ
120 }
121 else
4def3b35
VS
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
1cd52418
OK
126}
127
467e0479 128#ifdef WC_UTF16
35d11700
VZ
129 typedef wchar_t wxDecodeSurrogate_t;
130#else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
133
134// returns the next UTF-32 character from the wchar_t buffer and advances the
135// pointer to the character after this one
136//
137// if an invalid character is found, *pSrc is set to NULL, the caller must
138// check for this
35d11700 139static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
140{
141 wxUint32 out;
8d3dd069 142 const size_t
5c33522f 143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150}
151
f6bcfd97 152// ----------------------------------------------------------------------------
6001e347 153// wxMBConv
f6bcfd97 154// ----------------------------------------------------------------------------
2c53a80a 155
483b0434
VZ
156size_t
157wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
158 const char *src, size_t srcLen) const
6001e347 159{
483b0434 160 // although new conversion classes are supposed to implement this function
36f93678 161 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
162 // avoid to have to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
36f93678
VZ
166 //
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
6001e347 170
483b0434
VZ
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten = 0;
eec47cc6 173
c1464d9d 174 // the number of NULs terminating this string
a78c43f1 175 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 176
c1464d9d
VZ
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
180 // NULs at the end
483b0434
VZ
181 wxCharBuffer bufTmp;
182 const char *srcEnd;
467e0479 183 if ( srcLen != wxNO_LEN )
eec47cc6 184 {
c1464d9d 185 // we need to know how to find the end of this string
7ef3ab50 186 nulLen = GetMBNulLen();
483b0434
VZ
187 if ( nulLen == wxCONV_FAILED )
188 return wxCONV_FAILED;
e4e3bbb4 189
c1464d9d 190 // if there are enough NULs we can avoid the copy
483b0434 191 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
192 {
193 // make a copy in order to properly NUL-terminate the string
483b0434 194 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 195 char * const p = bufTmp.data();
483b0434
VZ
196 memcpy(p, src, srcLen);
197 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 198 *s = '\0';
483b0434
VZ
199
200 src = bufTmp;
eec47cc6 201 }
e4e3bbb4 202
483b0434
VZ
203 srcEnd = src + srcLen;
204 }
205 else // quit after the first loop iteration
206 {
207 srcEnd = NULL;
208 }
e4e3bbb4 209
36f93678
VZ
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
212 // with the result
213 //
214 // all the complication come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
bbb0ff36 217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
36f93678
VZ
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
220 // string
221 //
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
483b0434 225 for ( ;; )
eec47cc6 226 {
c1464d9d 227 // try to convert the current chunk
483b0434 228 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
229 if ( lenChunk == wxCONV_FAILED )
230 return wxCONV_FAILED;
e4e3bbb4 231
483b0434 232 dstWritten += lenChunk;
f6a02087
VZ
233 if ( !srcEnd )
234 dstWritten++;
f5fb6871 235
f6a02087 236 if ( !lenChunk )
467e0479
VZ
237 {
238 // nothing left in the input string, conversion succeeded
239 break;
240 }
241
483b0434
VZ
242 if ( dst )
243 {
244 if ( dstWritten > dstLen )
245 return wxCONV_FAILED;
246
f6a02087
VZ
247 // +1 is for trailing NUL
248 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
f6a02087
VZ
252 if ( !srcEnd )
253 dst++;
483b0434 254 }
c1464d9d 255
483b0434 256 if ( !srcEnd )
c1464d9d 257 {
467e0479 258 // we convert just one chunk in this case as this is the entire
bbb0ff36 259 // string anyhow (and we don't count the trailing NUL in this case)
c1464d9d
VZ
260 break;
261 }
eec47cc6 262
bbb0ff36
VZ
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
483b0434 266 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
483b0434 272 src += nulLen;
c1464d9d 273 }
e4e3bbb4 274
bbb0ff36
VZ
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
277 if ( src == srcEnd )
278 break;
279
280 // do count this terminator as it's inside the buffer we convert
281 dstWritten++;
282 if ( dst )
283 dst++;
284
285 src += nulLen; // skip the terminator itself
c1464d9d 286
483b0434 287 if ( src >= srcEnd )
c1464d9d
VZ
288 break;
289 }
290
483b0434 291 return dstWritten;
e4e3bbb4
RN
292}
293
483b0434
VZ
294size_t
295wxMBConv::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
e4e3bbb4 297{
483b0434
VZ
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten = 0;
e4e3bbb4 300
f6a02087
VZ
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated = srcLen == wxNO_LEN;
305
eec47cc6
VZ
306 // make a copy of the input string unless it is already properly
307 // NUL-terminated
eec47cc6 308 wxWCharBuffer bufTmp;
f6a02087 309 if ( isNulTerminated )
e4e3bbb4 310 {
483b0434 311 srcLen = wxWcslen(src) + 1;
eec47cc6 312 }
483b0434 313 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
314 {
315 // make a copy in order to properly NUL-terminate the string
483b0434 316 bufTmp = wxWCharBuffer(srcLen);
ef199164 317 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
318 src = bufTmp;
319 }
320
321 const size_t lenNul = GetMBNulLen();
322 for ( const wchar_t * const srcEnd = src + srcLen;
323 src < srcEnd;
27307233 324 src++ /* skip L'\0' too */ )
483b0434
VZ
325 {
326 // try to convert the current chunk
327 size_t lenChunk = WC2MB(NULL, src, 0);
483b0434
VZ
328 if ( lenChunk == wxCONV_FAILED )
329 return wxCONV_FAILED;
330
483b0434 331 dstWritten += lenChunk;
27307233
VZ
332
333 const wchar_t * const
334 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
335
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
338 // ourselves above
339 if ( chunkEnd < srcEnd )
f6a02087 340 dstWritten += lenNul;
483b0434
VZ
341
342 if ( dst )
343 {
344 if ( dstWritten > dstLen )
345 return wxCONV_FAILED;
346
27307233
VZ
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
351 wxCharBuffer dstBuf;
352 char *dstTmp;
353 if ( chunkEnd == srcEnd )
354 {
355 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
356 dstTmp = dstBuf.data();
357 }
358 else
359 {
360 dstTmp = dst;
361 }
362
363 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
364 return wxCONV_FAILED;
365
27307233
VZ
366 if ( dstTmp != dst )
367 {
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst, dstTmp, lenChunk);
371
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
374 break;
375 }
376
483b0434 377 dst += lenChunk;
27307233 378 if ( chunkEnd < srcEnd )
f6a02087 379 dst += lenNul;
483b0434 380 }
27307233
VZ
381
382 src = chunkEnd;
eec47cc6 383 }
e4e3bbb4 384
483b0434
VZ
385 return dstWritten;
386}
387
ef199164 388size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 389{
51725fc0 390 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 391 if ( rc != wxCONV_FAILED )
509da451
VZ
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399}
400
ef199164 401size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 402{
51725fc0 403 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 404 if ( rc != wxCONV_FAILED )
509da451 405 {
51725fc0 406 rc -= GetMBNulLen();
509da451
VZ
407 }
408
409 return rc;
410}
411
483b0434
VZ
412wxMBConv::~wxMBConv()
413{
414 // nothing to do here (necessary for Darwin linking probably)
415}
e4e3bbb4 416
483b0434
VZ
417const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418{
419 if ( psz )
eec47cc6 420 {
483b0434 421 // calculate the length of the buffer needed first
a2db25a1 422 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 423 if ( nLen != wxCONV_FAILED )
f5fb6871 424 {
483b0434 425 // now do the actual conversion
a2db25a1 426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 427
483b0434 428 // +1 for the trailing NULL
a2db25a1 429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 430 return buf;
f5fb6871 431 }
483b0434 432 }
e4e3bbb4 433
483b0434
VZ
434 return wxWCharBuffer();
435}
3698ae71 436
483b0434
VZ
437const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438{
439 if ( pwz )
440 {
a2db25a1 441 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 442 if ( nLen != wxCONV_FAILED )
483b0434 443 {
a2db25a1
VZ
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451}
e4e3bbb4 452
483b0434 453const wxWCharBuffer
ef199164 454wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 455{
ef199164 456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 457 if ( dstLen != wxCONV_FAILED )
483b0434 458 {
0dd13d21
VZ
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
f6a02087 463 wbuf.data()[dstLen] = L'\0';
ef199164 464 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
465 {
466 if ( outLen )
467e0479
VZ
467 {
468 *outLen = dstLen;
f6a02087
VZ
469
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen == wxNO_LEN )
467e0479
VZ
476 (*outLen)--;
477 }
478
483b0434
VZ
479 return wbuf;
480 }
481 }
482
483 if ( outLen )
484 *outLen = 0;
485
486 return wxWCharBuffer();
487}
488
489const wxCharBuffer
ef199164 490wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 491{
13d92ad6 492 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 493 if ( dstLen != wxCONV_FAILED )
483b0434 494 {
0dd13d21
VZ
495 const size_t nulLen = GetMBNulLen();
496
497 // as above, ensure that the buffer is always NUL-terminated, even if
498 // the input is not
499 wxCharBuffer buf(dstLen + nulLen - 1);
500 memset(buf.data() + dstLen, 0, nulLen);
ef199164 501 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
502 {
503 if ( outLen )
467e0479
VZ
504 {
505 *outLen = dstLen;
506
f6a02087 507 if ( inLen == wxNO_LEN )
467e0479 508 {
f6a02087
VZ
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
13d92ad6 511 *outLen -= nulLen;
467e0479
VZ
512 }
513 }
d32a507d 514
483b0434
VZ
515 return buf;
516 }
e4e3bbb4
RN
517 }
518
eec47cc6
VZ
519 if ( outLen )
520 *outLen = 0;
521
522 return wxCharBuffer();
e4e3bbb4
RN
523}
524
40ac5040
VZ
525const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
526{
527 const size_t srcLen = buf.length();
528 if ( srcLen )
529 {
530 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
531 if ( dstLen != wxCONV_FAILED )
532 {
533 wxWCharBuffer wbuf(dstLen);
534 wbuf.data()[dstLen] = L'\0';
535 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
536 return wbuf;
537 }
538 }
539
cfcfada9 540 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
40ac5040
VZ
541}
542
543const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
544{
545 const size_t srcLen = wbuf.length();
546 if ( srcLen )
547 {
548 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
549 if ( dstLen != wxCONV_FAILED )
550 {
551 wxCharBuffer buf(dstLen);
552 buf.data()[dstLen] = '\0';
553 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
554 return buf;
555 }
556 }
557
cfcfada9 558 return wxScopedCharBuffer::CreateNonOwned("", 0);
40ac5040
VZ
559}
560
6001e347 561// ----------------------------------------------------------------------------
bde4baac 562// wxMBConvLibc
6001e347
RR
563// ----------------------------------------------------------------------------
564
bde4baac
VZ
565size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
566{
567 return wxMB2WC(buf, psz, n);
568}
569
570size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
571{
572 return wxWC2MB(buf, psz, n);
573}
e1bfe89e
RR
574
575// ----------------------------------------------------------------------------
532d575b 576// wxConvBrokenFileNames
e1bfe89e
RR
577// ----------------------------------------------------------------------------
578
eec47cc6
VZ
579#ifdef __UNIX__
580
86501081 581wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 582{
9a83f860
VZ
583 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
584 wxStricmp(charset, wxT("UTF8")) == 0 )
5deedd6e 585 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
586 else
587 m_conv = new wxCSConv(charset);
ea8ce907
RR
588}
589
eec47cc6 590#endif // __UNIX__
c12b7f79 591
bde4baac 592// ----------------------------------------------------------------------------
3698ae71 593// UTF-7
bde4baac 594// ----------------------------------------------------------------------------
6001e347 595
15f2ee32 596// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
597//
598// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 599
15f2ee32
RN
//
// BASE64 decoding table: maps each byte value to the 6 bit value of the
// corresponding base64 digit or to 0xff if the byte is not a base64 digit
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x27: not base64 digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x28..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: 'A'..'Z'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: 'a'..'z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: not base64 digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
638
9d653e81
VZ
639size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
640 const char *src, size_t srcLen) const
15f2ee32 641{
9d653e81 642 DecoderState stateOrig,
852dcba5 643 *statePtr;
9d653e81
VZ
644 if ( srcLen == wxNO_LEN )
645 {
646 // convert the entire string, up to and including the trailing NUL
647 srcLen = strlen(src) + 1;
648
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr = &stateOrig;
652 }
653 else // when working with partial strings we do use the shift state
654 {
5c33522f 655 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
656
657 // also save the old state to be able to rollback to it on error
658 stateOrig = m_stateDecoder;
659 }
660
661 // but to simplify the code below we use this variable in both cases
662 DecoderState& state = *statePtr;
663
664
665 // number of characters [which would have been] written to dst [if it were
666 // not NULL]
15f2ee32
RN
667 size_t len = 0;
668
9d653e81
VZ
669 const char * const srcEnd = src + srcLen;
670
671 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 672 {
9d653e81
VZ
673 const unsigned char cc = *src++;
674
675 if ( state.IsShifted() )
15f2ee32 676 {
9d653e81
VZ
677 const unsigned char dc = utf7unb64[cc];
678 if ( dc == 0xff )
15f2ee32 679 {
ccaa848d
VZ
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
684 // characters)
685 if ( state.isLSB || state.bit > 4 ||
686 (state.accum & ((1 << state.bit) - 1)) )
687 {
688 if ( !len )
689 state = stateOrig;
690
852dcba5 691 return wxCONV_FAILED;
ccaa848d 692 }
852dcba5 693
9d653e81
VZ
694 state.ToDirect();
695
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
698 if ( cc == '-' )
699 continue;
700 }
701 else // valid encoded character
702 {
703 // mini base64 decoder: each character is 6 bits
704 state.bit += 6;
705 state.accum <<= 6;
706 state.accum += dc;
707
708 if ( state.bit >= 8 )
15f2ee32 709 {
9d653e81
VZ
710 // got the full byte, consume it
711 state.bit -= 8;
712 unsigned char b = (state.accum >> state.bit) & 0x00ff;
713
714 if ( state.isLSB )
15f2ee32 715 {
9d653e81
VZ
716 // we've got the full word, output it
717 if ( dst )
718 *dst++ = (state.msb << 8) | b;
719 len++;
720 state.isLSB = false;
15f2ee32 721 }
9d653e81 722 else // MSB
04a37834 723 {
9d653e81
VZ
724 // just store it while we wait for LSB
725 state.msb = b;
726 state.isLSB = true;
04a37834 727 }
15f2ee32
RN
728 }
729 }
9d653e81 730 }
04a37834 731
9d653e81
VZ
732 if ( state.IsDirect() )
733 {
734 // start of an encoded segment?
735 if ( cc == '+' )
04a37834 736 {
9d653e81
VZ
737 if ( *src == '-' )
738 {
739 // just the encoded plus sign, don't switch to shifted mode
740 if ( dst )
741 *dst++ = '+';
742 len++;
743 src++;
744 }
ccaa848d
VZ
745 else if ( utf7unb64[(unsigned)*src] == 0xff )
746 {
747 // empty encoded chunks are not allowed
748 if ( !len )
749 state = stateOrig;
750
751 return wxCONV_FAILED;
752 }
753 else // base-64 encoded chunk follows
9d653e81
VZ
754 {
755 state.ToShifted();
756 }
757 }
758 else // not '+'
759 {
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc >= 0x7f || (cc < ' ' &&
763 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
764 return wxCONV_FAILED;
765
766 if ( dst )
767 *dst++ = cc;
768 len++;
769 }
15f2ee32
RN
770 }
771 }
04a37834 772
9d653e81
VZ
773 if ( !len )
774 {
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
777 // state
778 state = stateOrig;
779
780 return wxCONV_FAILED;
781 }
04a37834 782
15f2ee32 783 return len;
6001e347
RR
784}
785
15f2ee32
RN
//
// BASE64 encoding table: maps a 6 bit value to the corresponding base64 digit
//
static const unsigned char utf7enb64[] =
{
    // 0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
800
//
// UTF-7 encoding table, indexed by the 7 bit ASCII character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if the character is in the class 0 of the table above, i.e.
// is written to the output directly by the encoder.
//
// The comparison is done on the unsigned value because wchar_t is a signed
// type on some platforms and a negative character would otherwise pass the
// range check and be used as a negative index into utf7encode.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
825
826size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
827 const wchar_t *src, size_t srcLen) const
15f2ee32 828{
9d653e81
VZ
829 EncoderState stateOrig,
830 *statePtr;
831 if ( srcLen == wxNO_LEN )
832 {
833 // we don't apply the stored state when operating on entire strings at
834 // once
835 statePtr = &stateOrig;
836
837 srcLen = wxWcslen(src) + 1;
838 }
839 else // do use the mode we left the output in previously
840 {
841 stateOrig = m_stateEncoder;
5c33522f 842 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
843 }
844
845 EncoderState& state = *statePtr;
846
847
15f2ee32
RN
848 size_t len = 0;
849
9d653e81
VZ
850 const wchar_t * const srcEnd = src + srcLen;
851 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 852 {
9d653e81
VZ
853 wchar_t cc = *src++;
854 if ( wxIsUTF7Direct(cc) )
15f2ee32 855 {
9d653e81
VZ
856 if ( state.IsShifted() )
857 {
858 // pad with zeros the last encoded block if necessary
859 if ( state.bit )
860 {
861 if ( dst )
862 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
863 len++;
864 }
ef199164 865
9d653e81
VZ
866 state.ToDirect();
867
868 if ( dst )
869 *dst++ = '-';
870 len++;
871 }
872
873 if ( dst )
874 *dst++ = (char)cc;
15f2ee32
RN
875 len++;
876 }
9d653e81
VZ
877 else if ( cc == '+' && state.IsDirect() )
878 {
879 if ( dst )
880 {
881 *dst++ = '+';
882 *dst++ = '-';
883 }
884
885 len += 2;
886 }
15f2ee32 887#ifndef WC_UTF16
79c78d42 888 else if (((wxUint32)cc) > 0xffff)
b2c13097 889 {
15f2ee32 890 // no surrogate pair generation (yet?)
467e0479 891 return wxCONV_FAILED;
15f2ee32
RN
892 }
893#endif
894 else
895 {
9d653e81
VZ
896 if ( state.IsDirect() )
897 {
898 state.ToShifted();
ef199164 899
9d653e81
VZ
900 if ( dst )
901 *dst++ = '+';
902 len++;
903 }
904
905 // BASE64 encode string
906 for ( ;; )
15f2ee32 907 {
9d653e81 908 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 909 {
9d653e81
VZ
910 state.accum <<= 8;
911 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
912
913 for (state.bit += 8; state.bit >= 6; )
15f2ee32 914 {
9d653e81
VZ
915 state.bit -= 6;
916 if ( dst )
917 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
918 len++;
15f2ee32 919 }
15f2ee32 920 }
ef199164 921
9d653e81
VZ
922 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
923 break;
ef199164 924
9d653e81 925 src++;
15f2ee32 926 }
15f2ee32
RN
927 }
928 }
ef199164 929
9d653e81
VZ
930 // we need to restore the original encoder state if we were called just to
931 // calculate the amount of space needed as we will presumably be called
932 // again to really convert the data now
933 if ( !dst )
934 state = stateOrig;
ef199164 935
15f2ee32 936 return len;
6001e347
RR
937}
938
f6bcfd97 939// ----------------------------------------------------------------------------
6001e347 940// UTF-8
f6bcfd97 941// ----------------------------------------------------------------------------
6001e347 942
1774c3c5 943static const wxUint32 utf8_max[]=
4def3b35 944 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 945
3698ae71
VZ
946// boundaries of the private use area we use to (temporarily) remap invalid
947// characters invalid in a UTF-8 encoded string
ea8ce907
RR
948const wxUint32 wxUnicodePUA = 0x100000;
949const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
950
// This table maps the first byte of a UTF-8 sequence to the total length of
// this sequence in bytes, with 0 indicating that the byte can't start a valid
// sequence.
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded as single bytes
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // 80..C1: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // C2..DF: start of two-byte sequences
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // E0..EF: start of three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // F0..F4: start of four-byte sequences
    4, 4, 4, 4, 4,                                   // F0..F4

    // F5..FF: invalid again (5- or 6-byte sequences and sequences for code
    // points above U+10FFFF, as restricted by RFC 3629)
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
985
986size_t
987wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989{
990 wchar_t *out = dstLen ? dst : NULL;
991 size_t written = 0;
992
993 if ( srcLen == wxNO_LEN )
994 srcLen = strlen(src) + 1;
995
996 for ( const char *p = src; ; p++ )
997 {
0dcbb107 998 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
0286d08d
VZ
999 {
1000 // all done successfully, just add the trailing NULL if we are not
1001 // using explicit length
1002 if ( srcLen == wxNO_LEN )
1003 {
1004 if ( out )
1005 {
1006 if ( !dstLen )
1007 break;
1008
1009 *out = L'\0';
1010 }
1011
1012 written++;
1013 }
1014
1015 return written;
1016 }
1017
0286d08d
VZ
1018 if ( out && !dstLen-- )
1019 break;
1020
5367a38a
VS
1021 wxUint32 code;
1022 unsigned char c = *p;
0286d08d 1023
5367a38a
VS
1024 if ( c < 0x80 )
1025 {
1026 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1027 break;
0286d08d 1028
5367a38a
VS
1029 if ( srcLen != wxNO_LEN )
1030 srcLen--;
0286d08d 1031
5367a38a
VS
1032 code = c;
1033 }
1034 else
0286d08d 1035 {
5367a38a
VS
1036 unsigned len = tableUtf8Lengths[c];
1037 if ( !len )
1038 break;
1039
1040 if ( srcLen < len ) // the test works for wxNO_LEN too
1041 break;
1042
1043 if ( srcLen != wxNO_LEN )
1044 srcLen -= len;
1045
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1053 //
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1057
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1059 // length:
1060 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1061
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1065
1066 len--; // it's more convenient to work with 0-based length here
1067
1068 // extract the lead byte's value bits:
1069 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1070 break;
1071
1072 code = c & leadValueMask[len];
1073
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len; --len )
1077 {
1078 c = *++p;
1079 if ( (c & 0xC0) != 0x80 )
1080 return wxCONV_FAILED;
0286d08d 1081
5367a38a
VS
1082 code <<= 6;
1083 code |= c & 0x3F;
1084 }
0286d08d
VZ
1085 }
1086
1087#ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1090 {
1091 if ( out )
1092 out++;
1093 written++;
1094 }
1095#else // !WC_UTF16
1096 if ( out )
1097 *out = code;
1098#endif // WC_UTF16/!WC_UTF16
1099
1100 if ( out )
1101 out++;
1102
1103 written++;
1104 }
1105
1106 return wxCONV_FAILED;
1107}
1108
1109size_t
1110wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111 const wchar_t *src, size_t srcLen) const
1112{
1113 char *out = dstLen ? dst : NULL;
1114 size_t written = 0;
1115
1116 for ( const wchar_t *wp = src; ; wp++ )
1117 {
0dcbb107 1118 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
0286d08d
VZ
1119 {
1120 // all done successfully, just add the trailing NULL if we are not
1121 // using explicit length
1122 if ( srcLen == wxNO_LEN )
1123 {
1124 if ( out )
1125 {
1126 if ( !dstLen )
1127 break;
1128
1129 *out = '\0';
1130 }
1131
1132 written++;
1133 }
1134
1135 return written;
1136 }
1137
a964d3ed
VZ
1138 if ( srcLen != wxNO_LEN )
1139 srcLen--;
0286d08d
VZ
1140
1141 wxUint32 code;
1142#ifdef WC_UTF16
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145 {
1146 // skip the next char too as we decoded a surrogate
1147 wp++;
041e6050
VZ
1148 if ( srcLen != wxNO_LEN )
1149 srcLen--;
0286d08d
VZ
1150 }
1151#else // wchar_t is UTF-32
1152 code = *wp & 0x7fffffff;
1153#endif
1154
1155 unsigned len;
1156 if ( code <= 0x7F )
1157 {
1158 len = 1;
1159 if ( out )
1160 {
1161 if ( dstLen < len )
1162 break;
1163
1164 out[0] = (char)code;
1165 }
1166 }
1167 else if ( code <= 0x07FF )
1168 {
1169 len = 2;
1170 if ( out )
1171 {
1172 if ( dstLen < len )
1173 break;
1174
1175 // NB: this line takes 6 least significant bits, encodes them as
1176 // 10xxxxxx and discards them so that the next byte can be encoded:
1177 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1178 out[0] = 0xC0 | code;
1179 }
1180 }
1181 else if ( code < 0xFFFF )
1182 {
1183 len = 3;
1184 if ( out )
1185 {
1186 if ( dstLen < len )
1187 break;
1188
1189 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1190 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1191 out[0] = 0xE0 | code;
1192 }
1193 }
1194 else if ( code <= 0x10FFFF )
1195 {
1196 len = 4;
1197 if ( out )
1198 {
1199 if ( dstLen < len )
1200 break;
1201
1202 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1204 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1205 out[0] = 0xF0 | code;
1206 }
1207 }
1208 else
1209 {
9a83f860 1210 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1211 break;
1212 }
1213
1214 if ( out )
1215 {
1216 out += len;
1217 dstLen -= len;
1218 }
1219
1220 written += len;
1221 }
1222
1223 // we only get here if an error occurs during decoding
1224 return wxCONV_FAILED;
1225}
1226
d16d0917
VZ
1227size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1228 const char *psz, size_t srcLen) const
6001e347 1229{
0286d08d 1230 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1231 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1232
4def3b35
VS
1233 size_t len = 0;
1234
f4cb7c58
VZ
1235 // The length can be either given explicitly or computed implicitly for the
1236 // NUL-terminated strings.
1237 const bool isNulTerminated = srcLen == wxNO_LEN;
1238 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1239 {
ea8ce907
RR
1240 const char *opsz = psz;
1241 bool invalid = false;
4def3b35
VS
1242 unsigned char cc = *psz++, fc = cc;
1243 unsigned cnt;
dccce9ea 1244 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1245 fc <<= 1;
ef199164 1246
dccce9ea 1247 if (!cnt)
4def3b35
VS
1248 {
1249 // plain ASCII char
dccce9ea 1250 if (buf)
4def3b35
VS
1251 *buf++ = cc;
1252 len++;
561488ef
MW
1253
1254 // escape the escape character for octal escapes
1255 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1256 && cc == '\\' && (!buf || len < n))
1257 {
1258 if (buf)
1259 *buf++ = cc;
1260 len++;
1261 }
dccce9ea
VZ
1262 }
1263 else
4def3b35
VS
1264 {
1265 cnt--;
dccce9ea 1266 if (!cnt)
4def3b35
VS
1267 {
1268 // invalid UTF-8 sequence
ea8ce907 1269 invalid = true;
dccce9ea
VZ
1270 }
1271 else
4def3b35
VS
1272 {
1273 unsigned ocnt = cnt - 1;
1274 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1275 while (cnt--)
4def3b35 1276 {
ea8ce907 1277 cc = *psz;
dccce9ea 1278 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1279 {
1280 // invalid UTF-8 sequence
ea8ce907
RR
1281 invalid = true;
1282 break;
4def3b35 1283 }
ef199164 1284
ea8ce907 1285 psz++;
4def3b35
VS
1286 res = (res << 6) | (cc & 0x3f);
1287 }
ef199164 1288
ea8ce907 1289 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1290 {
1291 // illegal UTF-8 encoding
ea8ce907 1292 invalid = true;
4def3b35 1293 }
ea8ce907
RR
1294 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1295 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1296 {
1297 // if one of our PUA characters turns up externally
1298 // it must also be treated as an illegal sequence
1299 // (a bit like you have to escape an escape character)
1300 invalid = true;
1301 }
1302 else
1303 {
1cd52418 1304#ifdef WC_UTF16
0286d08d 1305 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1306 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1307 if (pa == wxCONV_FAILED)
ea8ce907
RR
1308 {
1309 invalid = true;
1310 }
1311 else
1312 {
1313 if (buf)
1314 buf += pa;
1315 len += pa;
1316 }
373658eb 1317#else // !WC_UTF16
ea8ce907 1318 if (buf)
38d4b1e4 1319 *buf++ = (wchar_t)res;
ea8ce907 1320 len++;
373658eb 1321#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1322 }
1323 }
ef199164 1324
ea8ce907
RR
1325 if (invalid)
1326 {
1327 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1328 {
1329 while (opsz < psz && (!buf || len < n))
1330 {
1331#ifdef WC_UTF16
1332 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1333 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1334 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1335 if (buf)
1336 buf += pa;
1337 opsz++;
1338 len += pa;
1339#else
1340 if (buf)
38d4b1e4 1341 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1342 opsz++;
1343 len++;
1344#endif
1345 }
1346 }
3698ae71 1347 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1348 {
1349 while (opsz < psz && (!buf || len < n))
1350 {
3698ae71
VZ
1351 if ( buf && len + 3 < n )
1352 {
17a1ebd1 1353 unsigned char on = *opsz;
3698ae71 1354 *buf++ = L'\\';
17a1ebd1
VZ
1355 *buf++ = (wchar_t)( L'0' + on / 0100 );
1356 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1357 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1358 }
ef199164 1359
ea8ce907
RR
1360 opsz++;
1361 len += 4;
1362 }
1363 }
3698ae71 1364 else // MAP_INVALID_UTF8_NOT
ea8ce907 1365 {
467e0479 1366 return wxCONV_FAILED;
ea8ce907 1367 }
4def3b35
VS
1368 }
1369 }
6001e347 1370 }
ef199164 1371
f4cb7c58
VZ
1372 if ( isNulTerminated )
1373 {
1374 // Add the trailing NUL in this case if we have a large enough buffer.
1375 if ( buf && (len < n) )
1376 *buf = 0;
ef199164 1377
f4cb7c58
VZ
1378 // And count it in any case.
1379 len++;
1380 }
1381
1382 return len;
6001e347
RR
1383}
1384
3698ae71
VZ
// return true if the given wide character is an octal digit ('0'..'7')
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1389
d16d0917
VZ
1390size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1391 const wchar_t *psz, size_t srcLen) const
6001e347 1392{
0286d08d 1393 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1394 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1395
4def3b35 1396 size_t len = 0;
6001e347 1397
2ba61518
VZ
1398 // The length can be either given explicitly or computed implicitly for the
1399 // NUL-terminated strings.
1400 const bool isNulTerminated = srcLen == wxNO_LEN;
1401 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1402 {
1403 wxUint32 cc;
ef199164 1404
1cd52418 1405#ifdef WC_UTF16
b5153fd8
VZ
1406 // cast is ok for WC_UTF16
1407 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1408 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1409#else
ef199164 1410 cc = (*psz++) & 0x7fffffff;
4def3b35 1411#endif
3698ae71
VZ
1412
1413 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1414 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1415 {
dccce9ea 1416 if (buf)
ea8ce907 1417 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1418 len++;
3698ae71 1419 }
561488ef
MW
1420 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1421 && cc == L'\\' && psz[0] == L'\\' )
1422 {
1423 if (buf)
1424 *buf++ = (char)cc;
1425 psz++;
1426 len++;
1427 }
3698ae71
VZ
1428 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1429 cc == L'\\' &&
1430 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1431 {
dccce9ea 1432 if (buf)
3698ae71 1433 {
ef199164
DS
1434 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1435 (psz[1] - L'0') * 010 +
b2c13097 1436 (psz[2] - L'0'));
3698ae71
VZ
1437 }
1438
1439 psz += 3;
ea8ce907
RR
1440 len++;
1441 }
1442 else
1443 {
1444 unsigned cnt;
ef199164
DS
1445 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1446 {
1447 }
1448
ea8ce907 1449 if (!cnt)
4def3b35 1450 {
ea8ce907
RR
1451 // plain ASCII char
1452 if (buf)
1453 *buf++ = (char) cc;
1454 len++;
1455 }
ea8ce907
RR
1456 else
1457 {
1458 len += cnt + 1;
1459 if (buf)
1460 {
1461 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1462 while (cnt--)
1463 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1464 }
4def3b35
VS
1465 }
1466 }
6001e347 1467 }
4def3b35 1468
2ba61518
VZ
1469 if ( isNulTerminated )
1470 {
1471 // Add the trailing NUL in this case if we have a large enough buffer.
1472 if ( buf && (len < n) )
1473 *buf = 0;
1474
1475 // And count it in any case.
1476 len++;
1477 }
adb45366 1478
2ba61518 1479 return len;
6001e347
RR
1480}
1481
467e0479 1482// ============================================================================
c91830cb 1483// UTF-16
467e0479 1484// ============================================================================
c91830cb
VZ
1485
// "straight" is the UTF-16 converter using the byte order selected by
// WORDS_BIGENDIAN and "swap" is the one using the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1493
467e0479
VZ
1494/* static */
1495size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1496{
1497 if ( srcLen == wxNO_LEN )
1498 {
1499 // count the number of bytes in input, including the trailing NULs
5c33522f 1500 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1501 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1502 ;
c91830cb 1503
467e0479
VZ
1504 srcLen *= BYTES_PER_CHAR;
1505 }
1506 else // we already have the length
1507 {
1508 // we can only convert an entire number of UTF-16 characters
1509 if ( srcLen % BYTES_PER_CHAR )
1510 return wxCONV_FAILED;
1511 }
1512
1513 return srcLen;
1514}
1515
1516// case when in-memory representation is UTF-16 too
c91830cb
VZ
1517#ifdef WC_UTF16
1518
467e0479
VZ
1519// ----------------------------------------------------------------------------
1520// conversions without endianness change
1521// ----------------------------------------------------------------------------
1522
1523size_t
1524wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1525 const char *src, size_t srcLen) const
c91830cb 1526{
467e0479
VZ
1527 // set up the scene for using memcpy() (which is presumably more efficient
1528 // than copying the bytes one by one)
1529 srcLen = GetLength(src, srcLen);
1530 if ( srcLen == wxNO_LEN )
1531 return wxCONV_FAILED;
c91830cb 1532
ef199164 1533 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1534 if ( dst )
c91830cb 1535 {
467e0479
VZ
1536 if ( dstLen < inLen )
1537 return wxCONV_FAILED;
c91830cb 1538
467e0479 1539 memcpy(dst, src, srcLen);
c91830cb 1540 }
d32a507d 1541
467e0479 1542 return inLen;
c91830cb
VZ
1543}
1544
467e0479
VZ
1545size_t
1546wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1547 const wchar_t *src, size_t srcLen) const
c91830cb 1548{
467e0479
VZ
1549 if ( srcLen == wxNO_LEN )
1550 srcLen = wxWcslen(src) + 1;
c91830cb 1551
467e0479
VZ
1552 srcLen *= BYTES_PER_CHAR;
1553
1554 if ( dst )
c91830cb 1555 {
467e0479
VZ
1556 if ( dstLen < srcLen )
1557 return wxCONV_FAILED;
d32a507d 1558
467e0479 1559 memcpy(dst, src, srcLen);
c91830cb 1560 }
d32a507d 1561
467e0479 1562 return srcLen;
c91830cb
VZ
1563}
1564
467e0479
VZ
1565// ----------------------------------------------------------------------------
1566// endian-reversing conversions
1567// ----------------------------------------------------------------------------
c91830cb 1568
467e0479
VZ
1569size_t
1570wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1571 const char *src, size_t srcLen) const
c91830cb 1572{
467e0479
VZ
1573 srcLen = GetLength(src, srcLen);
1574 if ( srcLen == wxNO_LEN )
1575 return wxCONV_FAILED;
c91830cb 1576
467e0479
VZ
1577 srcLen /= BYTES_PER_CHAR;
1578
1579 if ( dst )
c91830cb 1580 {
467e0479
VZ
1581 if ( dstLen < srcLen )
1582 return wxCONV_FAILED;
1583
5c33522f 1584 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1585 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1586 {
ef199164 1587 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1588 }
c91830cb 1589 }
bfab25d4 1590
467e0479 1591 return srcLen;
c91830cb
VZ
1592}
1593
467e0479
VZ
1594size_t
1595wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1596 const wchar_t *src, size_t srcLen) const
c91830cb 1597{
467e0479
VZ
1598 if ( srcLen == wxNO_LEN )
1599 srcLen = wxWcslen(src) + 1;
c91830cb 1600
467e0479
VZ
1601 srcLen *= BYTES_PER_CHAR;
1602
1603 if ( dst )
c91830cb 1604 {
467e0479
VZ
1605 if ( dstLen < srcLen )
1606 return wxCONV_FAILED;
1607
5c33522f 1608 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1609 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1610 {
ef199164 1611 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1612 }
c91830cb 1613 }
eec47cc6 1614
467e0479 1615 return srcLen;
c91830cb
VZ
1616}
1617
467e0479 1618#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1619
467e0479
VZ
1620// ----------------------------------------------------------------------------
1621// conversions without endianness change
1622// ----------------------------------------------------------------------------
c91830cb 1623
35d11700
VZ
1624size_t
1625wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1626 const char *src, size_t srcLen) const
c91830cb 1627{
35d11700
VZ
1628 srcLen = GetLength(src, srcLen);
1629 if ( srcLen == wxNO_LEN )
1630 return wxCONV_FAILED;
c91830cb 1631
ef199164 1632 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1633 if ( !dst )
c91830cb 1634 {
35d11700
VZ
1635 // optimization: return maximal space which could be needed for this
1636 // string even if the real size could be smaller if the buffer contains
1637 // any surrogates
1638 return inLen;
c91830cb 1639 }
c91830cb 1640
35d11700 1641 size_t outLen = 0;
5c33522f 1642 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1643 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1644 {
ef199164
DS
1645 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1646 if ( !inBuff )
35d11700
VZ
1647 return wxCONV_FAILED;
1648
1649 if ( ++outLen > dstLen )
1650 return wxCONV_FAILED;
c91830cb 1651
35d11700
VZ
1652 *dst++ = ch;
1653 }
1654
1655
1656 return outLen;
1657}
c91830cb 1658
35d11700
VZ
1659size_t
1660wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1661 const wchar_t *src, size_t srcLen) const
c91830cb 1662{
35d11700
VZ
1663 if ( srcLen == wxNO_LEN )
1664 srcLen = wxWcslen(src) + 1;
c91830cb 1665
35d11700 1666 size_t outLen = 0;
5c33522f 1667 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1668 for ( size_t n = 0; n < srcLen; n++ )
c91830cb 1669 {
d883acaa 1670 wxUint16 cc[2] = { 0 };
35d11700
VZ
1671 const size_t numChars = encode_utf16(*src++, cc);
1672 if ( numChars == wxCONV_FAILED )
1673 return wxCONV_FAILED;
c91830cb 1674
ef199164
DS
1675 outLen += numChars * BYTES_PER_CHAR;
1676 if ( outBuff )
c91830cb 1677 {
35d11700
VZ
1678 if ( outLen > dstLen )
1679 return wxCONV_FAILED;
1680
ef199164 1681 *outBuff++ = cc[0];
35d11700 1682 if ( numChars == 2 )
69b80d28 1683 {
35d11700 1684 // second character of a surrogate
ef199164 1685 *outBuff++ = cc[1];
69b80d28 1686 }
c91830cb 1687 }
c91830cb 1688 }
c91830cb 1689
35d11700 1690 return outLen;
c91830cb
VZ
1691}
1692
467e0479
VZ
1693// ----------------------------------------------------------------------------
1694// endian-reversing conversions
1695// ----------------------------------------------------------------------------
c91830cb 1696
35d11700
VZ
1697size_t
1698wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1699 const char *src, size_t srcLen) const
c91830cb 1700{
35d11700
VZ
1701 srcLen = GetLength(src, srcLen);
1702 if ( srcLen == wxNO_LEN )
1703 return wxCONV_FAILED;
1704
ef199164 1705 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1706 if ( !dst )
1707 {
1708 // optimization: return maximal space which could be needed for this
1709 // string even if the real size could be smaller if the buffer contains
1710 // any surrogates
1711 return inLen;
1712 }
c91830cb 1713
35d11700 1714 size_t outLen = 0;
5c33522f 1715 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1716 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1717 {
35d11700
VZ
1718 wxUint32 ch;
1719 wxUint16 tmp[2];
ef199164
DS
1720
1721 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1722 inBuff++;
1723 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1724
35d11700
VZ
1725 const size_t numChars = decode_utf16(tmp, ch);
1726 if ( numChars == wxCONV_FAILED )
1727 return wxCONV_FAILED;
c91830cb 1728
35d11700 1729 if ( numChars == 2 )
ef199164 1730 inBuff++;
35d11700
VZ
1731
1732 if ( ++outLen > dstLen )
1733 return wxCONV_FAILED;
c91830cb 1734
35d11700 1735 *dst++ = ch;
c91830cb 1736 }
c91830cb 1737
c91830cb 1738
35d11700
VZ
1739 return outLen;
1740}
c91830cb 1741
35d11700
VZ
1742size_t
1743wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1744 const wchar_t *src, size_t srcLen) const
c91830cb 1745{
35d11700
VZ
1746 if ( srcLen == wxNO_LEN )
1747 srcLen = wxWcslen(src) + 1;
c91830cb 1748
35d11700 1749 size_t outLen = 0;
5c33522f 1750 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1751 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb 1752 {
d883acaa 1753 wxUint16 cc[2] = { 0 };
35d11700
VZ
1754 const size_t numChars = encode_utf16(*src, cc);
1755 if ( numChars == wxCONV_FAILED )
1756 return wxCONV_FAILED;
c91830cb 1757
ef199164
DS
1758 outLen += numChars * BYTES_PER_CHAR;
1759 if ( outBuff )
c91830cb 1760 {
35d11700
VZ
1761 if ( outLen > dstLen )
1762 return wxCONV_FAILED;
1763
ef199164 1764 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1765 if ( numChars == 2 )
c91830cb 1766 {
35d11700 1767 // second character of a surrogate
ef199164 1768 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1769 }
1770 }
c91830cb 1771 }
c91830cb 1772
35d11700 1773 return outLen;
c91830cb
VZ
1774}
1775
467e0479 1776#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1777
1778
35d11700 1779// ============================================================================
c91830cb 1780// UTF-32
35d11700 1781// ============================================================================
c91830cb
VZ
1782
1783#ifdef WORDS_BIGENDIAN
467e0479
VZ
1784 #define wxMBConvUTF32straight wxMBConvUTF32BE
1785 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1786#else
467e0479
VZ
1787 #define wxMBConvUTF32swap wxMBConvUTF32BE
1788 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1789#endif
1790
1791
1792WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1793WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1794
467e0479
VZ
1795/* static */
1796size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1797{
1798 if ( srcLen == wxNO_LEN )
1799 {
1800 // count the number of bytes in input, including the trailing NULs
5c33522f 1801 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1802 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1803 ;
c91830cb 1804
467e0479
VZ
1805 srcLen *= BYTES_PER_CHAR;
1806 }
1807 else // we already have the length
1808 {
1809 // we can only convert an entire number of UTF-32 characters
1810 if ( srcLen % BYTES_PER_CHAR )
1811 return wxCONV_FAILED;
1812 }
1813
1814 return srcLen;
1815}
1816
1817// case when in-memory representation is UTF-16
c91830cb
VZ
1818#ifdef WC_UTF16
1819
467e0479
VZ
1820// ----------------------------------------------------------------------------
1821// conversions without endianness change
1822// ----------------------------------------------------------------------------
1823
1824size_t
1825wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1826 const char *src, size_t srcLen) const
c91830cb 1827{
467e0479
VZ
1828 srcLen = GetLength(src, srcLen);
1829 if ( srcLen == wxNO_LEN )
1830 return wxCONV_FAILED;
c91830cb 1831
5c33522f 1832 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1833 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1834 size_t outLen = 0;
1835 for ( size_t n = 0; n < inLen; n++ )
c91830cb 1836 {
d883acaa 1837 wxUint16 cc[2] = { 0 };
ef199164 1838 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1839 if ( numChars == wxCONV_FAILED )
1840 return wxCONV_FAILED;
c91830cb 1841
467e0479
VZ
1842 outLen += numChars;
1843 if ( dst )
c91830cb 1844 {
467e0479
VZ
1845 if ( outLen > dstLen )
1846 return wxCONV_FAILED;
d32a507d 1847
467e0479
VZ
1848 *dst++ = cc[0];
1849 if ( numChars == 2 )
1850 {
1851 // second character of a surrogate
1852 *dst++ = cc[1];
1853 }
1854 }
c91830cb 1855 }
d32a507d 1856
467e0479 1857 return outLen;
c91830cb
VZ
1858}
1859
467e0479
VZ
1860size_t
1861wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1862 const wchar_t *src, size_t srcLen) const
c91830cb 1863{
467e0479
VZ
1864 if ( srcLen == wxNO_LEN )
1865 srcLen = wxWcslen(src) + 1;
c91830cb 1866
467e0479 1867 if ( !dst )
c91830cb 1868 {
467e0479
VZ
1869 // optimization: return maximal space which could be needed for this
1870 // string instead of the exact amount which could be less if there are
1871 // any surrogates in the input
1872 //
1873 // we consider that surrogates are rare enough to make it worthwhile to
1874 // avoid running the loop below at the cost of slightly extra memory
1875 // consumption
ef199164 1876 return srcLen * BYTES_PER_CHAR;
467e0479 1877 }
c91830cb 1878
5c33522f 1879 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1880 size_t outLen = 0;
1881 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1882 {
1883 const wxUint32 ch = wxDecodeSurrogate(&src);
1884 if ( !src )
1885 return wxCONV_FAILED;
c91830cb 1886
467e0479 1887 outLen += BYTES_PER_CHAR;
d32a507d 1888
467e0479
VZ
1889 if ( outLen > dstLen )
1890 return wxCONV_FAILED;
b5153fd8 1891
ef199164 1892 *outBuff++ = ch;
467e0479 1893 }
c91830cb 1894
467e0479 1895 return outLen;
c91830cb
VZ
1896}
1897
467e0479
VZ
1898// ----------------------------------------------------------------------------
1899// endian-reversing conversions
1900// ----------------------------------------------------------------------------
c91830cb 1901
467e0479
VZ
1902size_t
1903wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1904 const char *src, size_t srcLen) const
c91830cb 1905{
467e0479
VZ
1906 srcLen = GetLength(src, srcLen);
1907 if ( srcLen == wxNO_LEN )
1908 return wxCONV_FAILED;
c91830cb 1909
5c33522f 1910 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1911 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1912 size_t outLen = 0;
ef199164 1913 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1914 {
d883acaa 1915 wxUint16 cc[2] = { 0 };
ef199164 1916 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1917 if ( numChars == wxCONV_FAILED )
1918 return wxCONV_FAILED;
c91830cb 1919
467e0479
VZ
1920 outLen += numChars;
1921 if ( dst )
c91830cb 1922 {
467e0479
VZ
1923 if ( outLen > dstLen )
1924 return wxCONV_FAILED;
d32a507d 1925
467e0479
VZ
1926 *dst++ = cc[0];
1927 if ( numChars == 2 )
1928 {
1929 // second character of a surrogate
1930 *dst++ = cc[1];
1931 }
1932 }
c91830cb 1933 }
b5153fd8 1934
467e0479 1935 return outLen;
c91830cb
VZ
1936}
1937
467e0479
VZ
1938size_t
1939wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1940 const wchar_t *src, size_t srcLen) const
c91830cb 1941{
467e0479
VZ
1942 if ( srcLen == wxNO_LEN )
1943 srcLen = wxWcslen(src) + 1;
c91830cb 1944
467e0479 1945 if ( !dst )
c91830cb 1946 {
467e0479
VZ
1947 // optimization: return maximal space which could be needed for this
1948 // string instead of the exact amount which could be less if there are
1949 // any surrogates in the input
1950 //
1951 // we consider that surrogates are rare enough to make it worthwhile to
1952 // avoid running the loop below at the cost of slightly extra memory
1953 // consumption
1954 return srcLen*BYTES_PER_CHAR;
1955 }
c91830cb 1956
5c33522f 1957 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1958 size_t outLen = 0;
1959 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1960 {
1961 const wxUint32 ch = wxDecodeSurrogate(&src);
1962 if ( !src )
1963 return wxCONV_FAILED;
c91830cb 1964
467e0479 1965 outLen += BYTES_PER_CHAR;
d32a507d 1966
467e0479
VZ
1967 if ( outLen > dstLen )
1968 return wxCONV_FAILED;
b5153fd8 1969
ef199164 1970 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1971 }
c91830cb 1972
467e0479 1973 return outLen;
c91830cb
VZ
1974}
1975
467e0479 1976#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1977
35d11700
VZ
1978// ----------------------------------------------------------------------------
1979// conversions without endianness change
1980// ----------------------------------------------------------------------------
1981
1982size_t
1983wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1984 const char *src, size_t srcLen) const
c91830cb 1985{
35d11700
VZ
1986 // use memcpy() as it should be much faster than hand-written loop
1987 srcLen = GetLength(src, srcLen);
1988 if ( srcLen == wxNO_LEN )
1989 return wxCONV_FAILED;
c91830cb 1990
35d11700
VZ
1991 const size_t inLen = srcLen/BYTES_PER_CHAR;
1992 if ( dst )
c91830cb 1993 {
35d11700
VZ
1994 if ( dstLen < inLen )
1995 return wxCONV_FAILED;
b5153fd8 1996
35d11700
VZ
1997 memcpy(dst, src, srcLen);
1998 }
c91830cb 1999
35d11700 2000 return inLen;
c91830cb
VZ
2001}
2002
35d11700
VZ
2003size_t
2004wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
2005 const wchar_t *src, size_t srcLen) const
c91830cb 2006{
35d11700
VZ
2007 if ( srcLen == wxNO_LEN )
2008 srcLen = wxWcslen(src) + 1;
2009
2010 srcLen *= BYTES_PER_CHAR;
c91830cb 2011
35d11700 2012 if ( dst )
c91830cb 2013 {
35d11700
VZ
2014 if ( dstLen < srcLen )
2015 return wxCONV_FAILED;
c91830cb 2016
35d11700 2017 memcpy(dst, src, srcLen);
c91830cb
VZ
2018 }
2019
35d11700 2020 return srcLen;
c91830cb
VZ
2021}
2022
35d11700
VZ
2023// ----------------------------------------------------------------------------
2024// endian-reversing conversions
2025// ----------------------------------------------------------------------------
c91830cb 2026
35d11700
VZ
2027size_t
2028wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2029 const char *src, size_t srcLen) const
c91830cb 2030{
35d11700
VZ
2031 srcLen = GetLength(src, srcLen);
2032 if ( srcLen == wxNO_LEN )
2033 return wxCONV_FAILED;
2034
2035 srcLen /= BYTES_PER_CHAR;
c91830cb 2036
35d11700 2037 if ( dst )
c91830cb 2038 {
35d11700
VZ
2039 if ( dstLen < srcLen )
2040 return wxCONV_FAILED;
2041
5c33522f 2042 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 2043 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 2044 {
ef199164 2045 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 2046 }
c91830cb 2047 }
b5153fd8 2048
35d11700 2049 return srcLen;
c91830cb
VZ
2050}
2051
35d11700
VZ
2052size_t
2053wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2054 const wchar_t *src, size_t srcLen) const
c91830cb 2055{
35d11700
VZ
2056 if ( srcLen == wxNO_LEN )
2057 srcLen = wxWcslen(src) + 1;
2058
2059 srcLen *= BYTES_PER_CHAR;
c91830cb 2060
35d11700 2061 if ( dst )
c91830cb 2062 {
35d11700
VZ
2063 if ( dstLen < srcLen )
2064 return wxCONV_FAILED;
2065
5c33522f 2066 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2067 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2068 {
ef199164 2069 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2070 }
c91830cb 2071 }
b5153fd8 2072
35d11700 2073 return srcLen;
c91830cb
VZ
2074}
2075
467e0479 2076#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2077
2078
36acb880
VZ
2079// ============================================================================
2080// The classes doing conversion using the iconv_xxx() functions
2081// ============================================================================
3caec1bb 2082
b040e242 2083#ifdef HAVE_ICONV
3a0d76bc 2084
b1d547eb
VS
2085// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2086// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2087// (unless there's yet another bug in glibc) the only case when iconv()
2088// returns with (size_t)-1 (which means error) and says there are 0 bytes
2089// left in the input buffer -- when _real_ error occurs,
2090// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2091// iconv() failure.
3caec1bb
VS
2092// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call that returned cres,
// leaving bufLeft unconverted input bytes, must be treated as a failure.
//
// With glibc <= 2.1 a (size_t)-1 result with errno == E2BIG and no input
// left is not considered an error; everywhere else any (size_t)-1 result is.
//
// The macro arguments are parenthesized so that arbitrary expressions
// (e.g. a conditional expression) can be passed safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type iconv() expects
// for its second argument (ICONV_CONST is defined outside of this file)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value used to mark an iconv_t handle which couldn't be opened
#define ICONV_T_INVALID ((iconv_t)-1)
2103
2104#if SIZEOF_WCHAR_T == 4
2105 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2106 #define WC_ENC wxFONTENCODING_UTF32
2107#elif SIZEOF_WCHAR_T == 2
2108 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2109 #define WC_ENC wxFONTENCODING_UTF16
2110#else // sizeof(wchar_t) != 2 nor 4
2111 // does this ever happen?
2112 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2113#endif
2114
36acb880 2115// ----------------------------------------------------------------------------
e95354ec 2116// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2117// ----------------------------------------------------------------------------
2118
e95354ec 2119class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2120{
2121public:
86501081 2122 wxMBConv_iconv(const char *name);
e95354ec 2123 virtual ~wxMBConv_iconv();
36acb880 2124
8f4b0f43
VZ
2125 // implement base class virtual methods
2126 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2127 const char *src, size_t srcLen = wxNO_LEN) const;
2128 virtual size_t FromWChar(char *dst, size_t dstLen,
2129 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2130 virtual size_t GetMBNulLen() const;
2131
ba98e032
VS
2132#if wxUSE_UNICODE_UTF8
2133 virtual bool IsUTF8() const;
2134#endif
2135
d36c9347
VZ
2136 virtual wxMBConv *Clone() const
2137 {
b64f93b6 2138 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
d36c9347
VZ
2139 p->m_minMBCharWidth = m_minMBCharWidth;
2140 return p;
2141 }
2142
e95354ec 2143 bool IsOk() const
74a7eb0b 2144 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2145
2146protected:
ef199164
DS
2147 // the iconv handlers used to translate from multibyte
2148 // to wide char and in the other direction
36acb880
VZ
2149 iconv_t m2w,
2150 w2m;
ef199164 2151
b1d547eb
VS
2152#if wxUSE_THREADS
2153 // guards access to m2w and w2m objects
2154 wxMutex m_iconvMutex;
2155#endif
36acb880
VZ
2156
2157private:
e95354ec 2158 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2159 // available on this machine, it will remain NULL
74a7eb0b 2160 static wxString ms_wcCharsetName;
36acb880
VZ
2161
2162 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2163 // different endian-ness than the native one
405d8f46 2164 static bool ms_wcNeedsSwap;
eec47cc6 2165
d36c9347
VZ
2166
2167 // name of the encoding handled by this conversion
b64f93b6 2168 const char *m_name;
d36c9347 2169
7ef3ab50 2170 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2171 // initially
2172 size_t m_minMBCharWidth;
36acb880
VZ
2173};
2174
8f115891 2175// make the constructor available for unit testing
86501081 2176WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2177{
2178 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2179 if ( !result->IsOk() )
2180 {
2181 delete result;
2182 return 0;
2183 }
ef199164 2184
8f115891
MW
2185 return result;
2186}
2187
422e411e 2188wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2189bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2190
86501081 2191wxMBConv_iconv::wxMBConv_iconv(const char *name)
b64f93b6 2192 : m_name(wxStrdup(name))
36acb880 2193{
c1464d9d 2194 m_minMBCharWidth = 0;
eec47cc6 2195
36acb880 2196 // check for charset that represents wchar_t:
74a7eb0b 2197 if ( ms_wcCharsetName.empty() )
f1339c56 2198 {
9a83f860 2199 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2200
74a7eb0b 2201#if wxUSE_FONTMAP
a243da29 2202 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
74a7eb0b 2203#else // !wxUSE_FONTMAP
a243da29 2204 static const wxChar *const names_static[] =
36acb880 2205 {
74a7eb0b 2206#if SIZEOF_WCHAR_T == 4
9a83f860 2207 wxT("UCS-4"),
da2f1172 2208#elif SIZEOF_WCHAR_T == 2
9a83f860 2209 wxT("UCS-2"),
74a7eb0b
VZ
2210#endif
2211 NULL
2212 };
a243da29 2213 const wxChar *const *names = names_static;
74a7eb0b 2214#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2215
d1f024a8 2216 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2217 {
17a1ebd1 2218 const wxString nameCS(*names);
74a7eb0b
VZ
2219
2220 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2221 wxString nameXE(nameCS);
ef199164
DS
2222
2223#ifdef WORDS_BIGENDIAN
9a83f860 2224 nameXE += wxT("BE");
ef199164 2225#else // little endian
9a83f860 2226 nameXE += wxT("LE");
ef199164 2227#endif
74a7eb0b 2228
9a83f860 2229 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2230 nameXE.c_str());
2231
86501081 2232 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2233 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2234 {
74a7eb0b 2235 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2236 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2237 nameCS.c_str());
86501081 2238 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2239
74a7eb0b
VZ
2240 // and check for bytesex ourselves:
2241 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2242 {
74a7eb0b 2243 char buf[2], *bufPtr;
e8769ed1 2244 wchar_t wbuf[2];
74a7eb0b
VZ
2245 size_t insz, outsz;
2246 size_t res;
2247
2248 buf[0] = 'A';
2249 buf[1] = 0;
2250 wbuf[0] = 0;
2251 insz = 2;
2252 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2253 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2254 bufPtr = buf;
2255
ef199164
DS
2256 res = iconv(
2257 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2258 &wbufPtr, &outsz);
74a7eb0b
VZ
2259
2260 if (ICONV_FAILED(res, insz))
2261 {
2262 wxLogLastError(wxT("iconv"));
422e411e 2263 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2264 nameCS.c_str());
74a7eb0b
VZ
2265 }
2266 else // ok, can convert to this encoding, remember it
2267 {
17a1ebd1 2268 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2269 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2270 }
3a0d76bc
VS
2271 }
2272 }
74a7eb0b 2273 else // use charset not requiring byte swapping
36acb880 2274 {
74a7eb0b 2275 ms_wcCharsetName = nameXE;
36acb880 2276 }
3a0d76bc 2277 }
74a7eb0b 2278
0944fceb 2279 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2280 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2281 ms_wcCharsetName.empty() ? wxString("<none>")
2282 : ms_wcCharsetName,
9a83f860
VZ
2283 ms_wcNeedsSwap ? wxT(" (needs swap)")
2284 : wxT(""));
3a0d76bc 2285 }
36acb880 2286 else // we already have ms_wcCharsetName
3caec1bb 2287 {
86501081 2288 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2289 }
dccce9ea 2290
74a7eb0b 2291 if ( ms_wcCharsetName.empty() )
f1339c56 2292 {
74a7eb0b 2293 w2m = ICONV_T_INVALID;
36acb880 2294 }
405d8f46
VZ
2295 else
2296 {
86501081 2297 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2298 if ( w2m == ICONV_T_INVALID )
2299 {
2300 wxLogTrace(TRACE_STRCONV,
2301 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2302 ms_wcCharsetName.c_str(), name);
74a7eb0b 2303 }
405d8f46 2304 }
36acb880 2305}
3caec1bb 2306
e95354ec 2307wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2308{
b64f93b6
VZ
2309 free(const_cast<char *>(m_name));
2310
74a7eb0b 2311 if ( m2w != ICONV_T_INVALID )
36acb880 2312 iconv_close(m2w);
74a7eb0b 2313 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2314 iconv_close(w2m);
2315}
3a0d76bc 2316
8f4b0f43
VZ
2317size_t
2318wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2319 const char *src, size_t srcLen) const
36acb880 2320{
8f4b0f43 2321 if ( srcLen == wxNO_LEN )
69373110 2322 {
8f4b0f43
VZ
2323 // find the string length: notice that must be done differently for
2324 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2325 // consecutive NULs
2326 const size_t nulLen = GetMBNulLen();
2327 switch ( nulLen )
2328 {
2329 default:
2330 return wxCONV_FAILED;
69373110 2331
8f4b0f43
VZ
2332 case 1:
2333 srcLen = strlen(src); // arguably more optimized than our version
2334 break;
69373110 2335
8f4b0f43
VZ
2336 case 2:
2337 case 4:
2338 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2339 // but they also have to start at character boundary and not
2340 // span two adjacent characters
2341 const char *p;
2342 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2343 ;
2344 srcLen = p - src;
2345 break;
2346 }
d50c0831
VZ
2347
2348 // when we're determining the length of the string ourselves we count
2349 // the terminating NUL(s) as part of it and always NUL-terminate the
2350 // output
2351 srcLen += nulLen;
69373110
VZ
2352 }
2353
8f4b0f43
VZ
2354 // we express length in the number of (wide) characters but iconv always
2355 // counts buffer sizes it in bytes
2356 dstLen *= SIZEOF_WCHAR_T;
2357
b1d547eb 2358#if wxUSE_THREADS
6a17b868
SN
2359 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2360 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2361 // wxConvLocal that are used all over wx code, so we have to make sure
2362 // the handle is used by at most one thread at the time. Otherwise
2363 // only a few wx classes would be safe to use from non-main threads
2364 // as MB<->WC conversion would fail "randomly".
2365 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2366#endif // wxUSE_THREADS
2367
36acb880 2368 size_t res, cres;
8f4b0f43 2369 const char *pszPtr = src;
36acb880 2370
8f4b0f43 2371 if ( dst )
36acb880 2372 {
8f4b0f43 2373 char* bufPtr = (char*)dst;
e8769ed1 2374
36acb880 2375 // have destination buffer, convert there
1752fda6 2376 size_t dstLenOrig = dstLen;
36acb880 2377 cres = iconv(m2w,
8f4b0f43
VZ
2378 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2379 &bufPtr, &dstLen);
1752fda6
VZ
2380
2381 // convert the number of bytes converted as returned by iconv to the
2382 // number of (wide) characters converted that we need
2383 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2384
36acb880 2385 if (ms_wcNeedsSwap)
3a0d76bc 2386 {
36acb880 2387 // convert to native endianness
17a1ebd1 2388 for ( unsigned i = 0; i < res; i++ )
467a2982 2389 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2390 }
36acb880 2391 }
8f4b0f43 2392 else // no destination buffer
36acb880 2393 {
8f4b0f43 2394 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2395 wchar_t tbuf[256];
36acb880 2396 res = 0;
ef199164
DS
2397
2398 do
2399 {
e8769ed1 2400 char* bufPtr = (char*)tbuf;
8f4b0f43 2401 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2402
2403 cres = iconv(m2w,
8f4b0f43
VZ
2404 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2405 &bufPtr, &dstLen );
36acb880 2406
8f4b0f43 2407 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2408 }
2409 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2410 }
dccce9ea 2411
8f4b0f43 2412 if (ICONV_FAILED(cres, srcLen))
f1339c56 2413 {
36acb880 2414 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2415 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2416 return wxCONV_FAILED;
36acb880
VZ
2417 }
2418
2419 return res;
2420}
2421
8f4b0f43
VZ
2422size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2423 const wchar_t *src, size_t srcLen) const
36acb880 2424{
b1d547eb
VS
2425#if wxUSE_THREADS
2426 // NB: explained in MB2WC
2427 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2428#endif
3698ae71 2429
8f4b0f43 2430 if ( srcLen == wxNO_LEN )
2588ee86 2431 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2432
2433 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2434 size_t outbuflen = dstLen;
36acb880 2435 size_t res, cres;
3a0d76bc 2436
36acb880 2437 wchar_t *tmpbuf = 0;
3caec1bb 2438
36acb880
VZ
2439 if (ms_wcNeedsSwap)
2440 {
2441 // need to copy to temp buffer to switch endianness
51725fc0 2442 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2443 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2444 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2445 for ( size_t i = 0; i < srcLen; i++ )
2446 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2447
8f4b0f43 2448 src = tmpbuf;
36acb880 2449 }
3a0d76bc 2450
8f4b0f43
VZ
2451 char* inbuf = (char*)src;
2452 if ( dst )
36acb880
VZ
2453 {
2454 // have destination buffer, convert there
8f4b0f43 2455 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2456
8f4b0f43 2457 res = dstLen - outbuflen;
36acb880 2458 }
8f4b0f43 2459 else // no destination buffer
36acb880 2460 {
8f4b0f43 2461 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2462 char tbuf[256];
36acb880 2463 res = 0;
ef199164
DS
2464 do
2465 {
8f4b0f43 2466 dst = tbuf;
51725fc0 2467 outbuflen = WXSIZEOF(tbuf);
36acb880 2468
8f4b0f43 2469 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2470
51725fc0 2471 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2472 }
2473 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2474 }
dccce9ea 2475
36acb880
VZ
2476 if (ms_wcNeedsSwap)
2477 {
2478 free(tmpbuf);
2479 }
dccce9ea 2480
e8769ed1 2481 if (ICONV_FAILED(cres, inbuflen))
36acb880 2482 {
ce6f8d6f 2483 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2484 return wxCONV_FAILED;
36acb880
VZ
2485 }
2486
2487 return res;
2488}
2489
7ef3ab50 2490size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2491{
c1464d9d 2492 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2493 {
2494 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2495
2496#if wxUSE_THREADS
2497 // NB: explained in MB2WC
2498 wxMutexLocker lock(self->m_iconvMutex);
2499#endif
2500
999020e1 2501 const wchar_t *wnul = L"";
c1464d9d 2502 char buf[8]; // should be enough for NUL in any encoding
356410fc 2503 size_t inLen = sizeof(wchar_t),
c1464d9d 2504 outLen = WXSIZEOF(buf);
ef199164
DS
2505 char *inBuff = (char *)wnul;
2506 char *outBuff = buf;
2507 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2508 {
c1464d9d 2509 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2510 }
2511 else // ok
2512 {
ef199164 2513 self->m_minMBCharWidth = outBuff - buf;
356410fc 2514 }
eec47cc6
VZ
2515 }
2516
c1464d9d 2517 return m_minMBCharWidth;
eec47cc6
VZ
2518}
2519
ba98e032
VS
2520#if wxUSE_UNICODE_UTF8
2521bool wxMBConv_iconv::IsUTF8() const
2522{
86501081
VS
2523 return wxStricmp(m_name, "UTF-8") == 0 ||
2524 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2525}
2526#endif
2527
b040e242 2528#endif // HAVE_ICONV
36acb880 2529
e95354ec 2530
36acb880
VZ
2531// ============================================================================
2532// Win32 conversion classes
2533// ============================================================================
1cd52418 2534
e95354ec 2535#ifdef wxHAVE_WIN32_MB2WC
373658eb 2536
8b04d4c4 2537// from utils.cpp
d775fa82 2538#if wxUSE_FONTMAP
86501081 2539extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2540extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2541#endif
373658eb 2542
e95354ec 2543class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2544{
2545public:
bde4baac
VZ
2546 wxMBConv_win32()
2547 {
2548 m_CodePage = CP_ACP;
c1464d9d 2549 m_minMBCharWidth = 0;
bde4baac
VZ
2550 }
2551
d36c9347 2552 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2553 : wxMBConv()
d36c9347
VZ
2554 {
2555 m_CodePage = conv.m_CodePage;
2556 m_minMBCharWidth = conv.m_minMBCharWidth;
2557 }
2558
7608a683 2559#if wxUSE_FONTMAP
86501081 2560 wxMBConv_win32(const char* name)
bde4baac
VZ
2561 {
2562 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2563 m_minMBCharWidth = 0;
bde4baac 2564 }
dccce9ea 2565
e95354ec 2566 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2567 {
2568 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2569 m_minMBCharWidth = 0;
bde4baac 2570 }
eec47cc6 2571#endif // wxUSE_FONTMAP
8b04d4c4 2572
d36c9347 2573 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2574 {
02272c9c
VZ
2575 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2576 // the behaviour is not compatible with the Unix version (using iconv)
2577 // and break the library itself, e.g. wxTextInputStream::NextChar()
2578 // wouldn't work if reading an incomplete MB char didn't result in an
2579 // error
667e5b3e 2580 //
89028980 2581 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2582 // Win XP or newer and it is not supported for UTF-[78] so we always
2583 // use our own conversions in this case. See
89028980
VS
2584 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2585 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2586 if ( m_CodePage == CP_UTF8 )
89028980 2587 {
5487ff0f 2588 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2589 }
830f8f11
VZ
2590
2591 if ( m_CodePage == CP_UTF7 )
2592 {
5487ff0f 2593 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2594 }
2595
2596 int flags = 0;
2597 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2598 IsAtLeastWin2kSP4() )
89028980 2599 {
830f8f11 2600 flags = MB_ERR_INVALID_CHARS;
89028980 2601 }
667e5b3e 2602
2b5f62a0
VZ
2603 const size_t len = ::MultiByteToWideChar
2604 (
2605 m_CodePage, // code page
667e5b3e 2606 flags, // flags: fall on error
2b5f62a0
VZ
2607 psz, // input string
2608 -1, // its length (NUL-terminated)
b4da152e 2609 buf, // output string
2b5f62a0
VZ
2610 buf ? n : 0 // size of output buffer
2611 );
89028980
VS
2612 if ( !len )
2613 {
2614 // function totally failed
467e0479 2615 return wxCONV_FAILED;
89028980
VS
2616 }
2617
2618 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2619 // check if we succeeded, by doing a double trip:
2620 if ( !flags && buf )
2621 {
53c174fc
VZ
2622 const size_t mbLen = strlen(psz);
2623 wxCharBuffer mbBuf(mbLen);
89028980
VS
2624 if ( ::WideCharToMultiByte
2625 (
2626 m_CodePage,
2627 0,
2628 buf,
2629 -1,
2630 mbBuf.data(),
53c174fc 2631 mbLen + 1, // size in bytes, not length
89028980
VS
2632 NULL,
2633 NULL
2634 ) == 0 ||
2635 strcmp(mbBuf, psz) != 0 )
2636 {
2637 // we didn't obtain the same thing we started from, hence
2638 // the conversion was lossy and we consider that it failed
467e0479 2639 return wxCONV_FAILED;
89028980
VS
2640 }
2641 }
2b5f62a0 2642
03a991bc
VZ
2643 // note that it returns count of written chars for buf != NULL and size
2644 // of the needed buffer for buf == NULL so in either case the length of
2645 // the string (which never includes the terminating NUL) is one less
89028980 2646 return len - 1;
f1339c56 2647 }
dccce9ea 2648
d36c9347 2649 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2650 {
13dd924a
VZ
2651 /*
2652 we have a problem here: by default, WideCharToMultiByte() may
2653 replace characters unrepresentable in the target code page with bad
2654 quality approximations such as turning "1/2" symbol (U+00BD) into
2655 "1" for the code pages which don't have it and we, obviously, want
2656 to avoid this at any price
d775fa82 2657
13dd924a
VZ
2658 the trouble is that this function does it _silently_, i.e. it won't
2659 even tell us whether it did or not... Win98/2000 and higher provide
2660 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2661 we have to resort to a round trip, i.e. check that converting back
2662 results in the same string -- this is, of course, expensive but
2663 otherwise we simply can't be sure to not garble the data.
2664 */
2665
2666 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2667 // it doesn't work with CJK encodings (which we test for rather roughly
2668 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2669 // supporting it
907173e5
WS
2670 BOOL usedDef wxDUMMY_INITIALIZE(false);
2671 BOOL *pUsedDef;
13dd924a
VZ
2672 int flags;
2673 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2674 {
2675 // it's our lucky day
2676 flags = WC_NO_BEST_FIT_CHARS;
2677 pUsedDef = &usedDef;
2678 }
2679 else // old system or unsupported encoding
2680 {
2681 flags = 0;
2682 pUsedDef = NULL;
2683 }
2684
2b5f62a0
VZ
2685 const size_t len = ::WideCharToMultiByte
2686 (
2687 m_CodePage, // code page
13dd924a
VZ
2688 flags, // either none or no best fit
2689 pwz, // input string
2b5f62a0
VZ
2690 -1, // it is (wide) NUL-terminated
2691 buf, // output buffer
2692 buf ? n : 0, // and its size
2693 NULL, // default "replacement" char
13dd924a 2694 pUsedDef // [out] was it used?
2b5f62a0
VZ
2695 );
2696
13dd924a
VZ
2697 if ( !len )
2698 {
2699 // function totally failed
467e0479 2700 return wxCONV_FAILED;
13dd924a
VZ
2701 }
2702
765bdb4a
VZ
2703 // we did something, check if we really succeeded
2704 if ( flags )
13dd924a 2705 {
765bdb4a
VZ
2706 // check if the conversion failed, i.e. if any replacements
2707 // were done
2708 if ( usedDef )
2709 return wxCONV_FAILED;
2710 }
2711 else // we must resort to double tripping...
2712 {
2713 // first we need to ensure that we really have the MB data: this is
2714 // not the case if we're called with NULL buffer, in which case we
2715 // need to do the conversion yet again
2716 wxCharBuffer bufDef;
2717 if ( !buf )
13dd924a 2718 {
765bdb4a
VZ
2719 bufDef = wxCharBuffer(len);
2720 buf = bufDef.data();
2721 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2722 buf, len, NULL, NULL) )
467e0479 2723 return wxCONV_FAILED;
13dd924a 2724 }
765bdb4a 2725
564da6ff
VZ
2726 if ( !n )
2727 n = wcslen(pwz);
765bdb4a 2728 wxWCharBuffer wcBuf(n);
564da6ff 2729 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2730 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2731 {
765bdb4a
VZ
2732 // we didn't obtain the same thing we started from, hence
2733 // the conversion was lossy and we consider that it failed
2734 return wxCONV_FAILED;
13dd924a
VZ
2735 }
2736 }
2737
03a991bc 2738 // see the comment above for the reason of "len - 1"
13dd924a 2739 return len - 1;
f1339c56 2740 }
dccce9ea 2741
7ef3ab50
VZ
2742 virtual size_t GetMBNulLen() const
2743 {
2744 if ( m_minMBCharWidth == 0 )
2745 {
2746 int len = ::WideCharToMultiByte
2747 (
2748 m_CodePage, // code page
2749 0, // no flags
2750 L"", // input string
2751 1, // translate just the NUL
2752 NULL, // output buffer
2753 0, // and its size
2754 NULL, // no replacement char
2755 NULL // [out] don't care if it was used
2756 );
2757
2758 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2759 switch ( len )
2760 {
2761 default:
9a83f860 2762 wxLogDebug(wxT("Unexpected NUL length %d"), len);
ef199164
DS
2763 self->m_minMBCharWidth = (size_t)-1;
2764 break;
7ef3ab50
VZ
2765
2766 case 0:
2767 self->m_minMBCharWidth = (size_t)-1;
2768 break;
2769
2770 case 1:
2771 case 2:
2772 case 4:
2773 self->m_minMBCharWidth = len;
2774 break;
2775 }
2776 }
2777
2778 return m_minMBCharWidth;
2779 }
2780
d36c9347
VZ
2781 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2782
13dd924a
VZ
2783 bool IsOk() const { return m_CodePage != -1; }
2784
2785private:
2786 static bool CanUseNoBestFit()
2787 {
2788 static int s_isWin98Or2k = -1;
2789
2790 if ( s_isWin98Or2k == -1 )
2791 {
2792 int verMaj, verMin;
2793 switch ( wxGetOsVersion(&verMaj, &verMin) )
2794 {
406d283a 2795 case wxOS_WINDOWS_9X:
13dd924a
VZ
2796 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2797 break;
2798
406d283a 2799 case wxOS_WINDOWS_NT:
13dd924a
VZ
2800 s_isWin98Or2k = verMaj >= 5;
2801 break;
2802
2803 default:
ef199164 2804 // unknown: be conservative by default
13dd924a 2805 s_isWin98Or2k = 0;
ef199164 2806 break;
13dd924a
VZ
2807 }
2808
9a83f860 2809 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
13dd924a
VZ
2810 }
2811
2812 return s_isWin98Or2k == 1;
2813 }
f1339c56 2814
89028980
VS
2815 static bool IsAtLeastWin2kSP4()
2816 {
8942f83a
WS
2817#ifdef __WXWINCE__
2818 return false;
2819#else
89028980
VS
2820 static int s_isAtLeastWin2kSP4 = -1;
2821
2822 if ( s_isAtLeastWin2kSP4 == -1 )
2823 {
2824 OSVERSIONINFOEX ver;
2825
2826 memset(&ver, 0, sizeof(ver));
2827 ver.dwOSVersionInfoSize = sizeof(ver);
2828 GetVersionEx((OSVERSIONINFO*)&ver);
2829
2830 s_isAtLeastWin2kSP4 =
2831 ((ver.dwMajorVersion > 5) || // Vista+
2832 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2833 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2834 ver.wServicePackMajor >= 4)) // 2000 SP4+
2835 ? 1 : 0;
2836 }
2837
2838 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2839#endif
89028980
VS
2840 }
2841
eec47cc6 2842
c1464d9d 2843 // the code page we're working with
b1d66b54 2844 long m_CodePage;
c1464d9d 2845
7ef3ab50 2846 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2847 // "unknown"
2848 size_t m_minMBCharWidth;
1cd52418 2849};
e95354ec
VZ
2850
2851#endif // wxHAVE_WIN32_MB2WC
2852
f7e98dee 2853
36acb880
VZ
2854// ============================================================================
2855// wxEncodingConverter based conversion classes
2856// ============================================================================
2857
1e6feb95 2858#if wxUSE_FONTMAP
1cd52418 2859
e95354ec 2860class wxMBConv_wxwin : public wxMBConv
1cd52418 2861{
8b04d4c4
VZ
2862private:
2863 void Init()
2864 {
6ac84a78
DE
2865 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2866 // The wxMBConv_cf class does a better job.
2867 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2868 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2869 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2870 }
2871
6001e347 2872public:
f1339c56
RR
2873 // temporarily just use wxEncodingConverter stuff,
2874 // so that it works while a better implementation is built
86501081 2875 wxMBConv_wxwin(const char* name)
f1339c56
RR
2876 {
2877 if (name)
267e11c5 2878 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2879 else
2880 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2881
8b04d4c4
VZ
2882 Init();
2883 }
2884
e95354ec 2885 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2886 {
2887 m_enc = enc;
2888
2889 Init();
f1339c56 2890 }
dccce9ea 2891
bde4baac 2892 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2893 {
2894 size_t inbuf = strlen(psz);
dccce9ea 2895 if (buf)
c643a977 2896 {
ef199164 2897 if (!m2w.Convert(psz, buf))
467e0479 2898 return wxCONV_FAILED;
c643a977 2899 }
f1339c56
RR
2900 return inbuf;
2901 }
dccce9ea 2902
bde4baac 2903 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2904 {
f8d791e0 2905 const size_t inbuf = wxWcslen(psz);
f1339c56 2906 if (buf)
c643a977 2907 {
ef199164 2908 if (!w2m.Convert(psz, buf))
467e0479 2909 return wxCONV_FAILED;
c643a977 2910 }
dccce9ea 2911
f1339c56
RR
2912 return inbuf;
2913 }
dccce9ea 2914
7ef3ab50 2915 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2916 {
2917 switch ( m_enc )
2918 {
2919 case wxFONTENCODING_UTF16BE:
2920 case wxFONTENCODING_UTF16LE:
c1464d9d 2921 return 2;
eec47cc6
VZ
2922
2923 case wxFONTENCODING_UTF32BE:
2924 case wxFONTENCODING_UTF32LE:
c1464d9d 2925 return 4;
eec47cc6
VZ
2926
2927 default:
c1464d9d 2928 return 1;
eec47cc6
VZ
2929 }
2930 }
2931
d36c9347
VZ
2932 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2933
7ef3ab50
VZ
2934 bool IsOk() const { return m_ok; }
2935
2936public:
2937 wxFontEncoding m_enc;
2938 wxEncodingConverter m2w, w2m;
2939
2940private:
cafbf6fb
VZ
2941 // were we initialized successfully?
2942 bool m_ok;
fc7a2a60 2943
c0c133e1 2944 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2945};
6001e347 2946
8f115891 2947// make the constructors available for unit testing
86501081 2948WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2949{
2950 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2951 if ( !result->IsOk() )
2952 {
2953 delete result;
2954 return 0;
2955 }
ef199164 2956
8f115891
MW
2957 return result;
2958}
2959
1e6feb95
VZ
2960#endif // wxUSE_FONTMAP
2961
36acb880
VZ
2962// ============================================================================
2963// wxCSConv implementation
2964// ============================================================================
2965
8b04d4c4 2966void wxCSConv::Init()
6001e347 2967{
e95354ec
VZ
2968 m_name = NULL;
2969 m_convReal = NULL;
6c4d607e
VZ
2970}
2971
2972void wxCSConv::SetEncoding(wxFontEncoding encoding)
2973{
2974 switch ( encoding )
2975 {
2976 case wxFONTENCODING_MAX:
2977 case wxFONTENCODING_SYSTEM:
2978 if ( m_name )
2979 {
2980 // It's ok to not have encoding value if we have a name for it.
2981 m_encoding = wxFONTENCODING_SYSTEM;
2982 }
2983 else // No name neither.
2984 {
2985 // Fall back to the system default encoding in this case (not
2986 // sure how much sense does this make but this is how the old
2987 // code used to behave).
2988#if wxUSE_INTL
2989 m_encoding = wxLocale::GetSystemEncoding();
2990 if ( m_encoding == wxFONTENCODING_SYSTEM )
2991#endif // wxUSE_INTL
2992 m_encoding = wxFONTENCODING_ISO8859_1;
2993 }
2994 break;
2995
2996 case wxFONTENCODING_DEFAULT:
2997 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2998 m_encoding = wxFONTENCODING_ISO8859_1;
2999 break;
3000
3001 default:
3002 // Just use the provided encoding.
3003 m_encoding = encoding;
3004 }
e95354ec
VZ
3005}
3006
86501081 3007wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
3008{
3009 Init();
82713003 3010
86501081 3011 if ( !charset.empty() )
e95354ec 3012 {
86501081 3013 SetName(charset.ToAscii());
e95354ec 3014 }
bda3d86a 3015
e4277538 3016#if wxUSE_FONTMAP
6c4d607e 3017 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
e4277538 3018#else
6c4d607e 3019 SetEncoding(wxFONTENCODING_SYSTEM);
e4277538 3020#endif
6c4d607e
VZ
3021
3022 m_convReal = DoCreate();
6001e347
RR
3023}
3024
8b04d4c4
VZ
3025wxCSConv::wxCSConv(wxFontEncoding encoding)
3026{
bda3d86a 3027 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 3028 {
9a83f860 3029 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
3030
3031 encoding = wxFONTENCODING_SYSTEM;
3032 }
3033
8b04d4c4
VZ
3034 Init();
3035
6c4d607e
VZ
3036 SetEncoding(encoding);
3037
3038 m_convReal = DoCreate();
8b04d4c4
VZ
3039}
3040
6001e347
RR
3041wxCSConv::~wxCSConv()
3042{
65e50848
JS
3043 Clear();
3044}
3045
54380f29 3046wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 3047 : wxMBConv()
54380f29 3048{
8b04d4c4
VZ
3049 Init();
3050
54380f29 3051 SetName(conv.m_name);
6c4d607e
VZ
3052 SetEncoding(conv.m_encoding);
3053
3054 m_convReal = DoCreate();
54380f29
GD
3055}
3056
3057wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3058{
3059 Clear();
8b04d4c4 3060
54380f29 3061 SetName(conv.m_name);
6c4d607e
VZ
3062 SetEncoding(conv.m_encoding);
3063
3064 m_convReal = DoCreate();
8b04d4c4 3065
54380f29
GD
3066 return *this;
3067}
3068
65e50848
JS
3069void wxCSConv::Clear()
3070{
8b04d4c4 3071 free(m_name);
65e50848 3072 m_name = NULL;
6c4d607e
VZ
3073
3074 wxDELETE(m_convReal);
6001e347
RR
3075}
3076
86501081 3077void wxCSConv::SetName(const char *charset)
6001e347 3078{
6c4d607e 3079 if ( charset )
d6f2a891 3080 m_name = wxStrdup(charset);
6001e347
RR
3081}
3082
8b3eb85d 3083#if wxUSE_FONTMAP
8b3eb85d
VZ
3084
// Map type from a font encoding to a charset name string.
3085WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 3086 wxEncodingNameCache );
8b3eb85d
VZ
3087
// Cache used by wxCSConv::DoCreate(): for each encoding it remembers the name
// which iconv accepted, or an empty string if none of its names worked.
3088static wxEncodingNameCache gs_nameCache;
3089#endif
3090
e95354ec
VZ
3091wxMBConv *wxCSConv::DoCreate() const
3092{
ce6f8d6f
VZ
3093#if wxUSE_FONTMAP
3094 wxLogTrace(TRACE_STRCONV,
3095 wxT("creating conversion for %s"),
3096 (m_name ? m_name
86501081 3097 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3098#endif // wxUSE_FONTMAP
3099
c547282d
VZ
3100 // check for the special case of ASCII or ISO8859-1 charset: as we have
3101 // special knowledge of it anyhow, we don't need to create a special
3102 // conversion object
6c4d607e 3103 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
f1339c56 3104 {
e95354ec
VZ
3105 // don't convert at all
3106 return NULL;
3107 }
dccce9ea 3108
e95354ec
VZ
3109 // we trust OS to do conversion better than we can so try external
3110 // conversion methods first
3111 //
3112 // the full order is:
3113 // 1. OS conversion (iconv() under Unix or Win32 API)
3114 // 2. hard coded conversions for UTF
3115 // 3. wxEncodingConverter as fall back
3116
3117 // step (1)
3118#ifdef HAVE_ICONV
c547282d 3119#if !wxUSE_FONTMAP
e95354ec 3120 if ( m_name )
c547282d 3121#endif // !wxUSE_FONTMAP
e95354ec 3122 {
3ef10cfc 3123#if wxUSE_FONTMAP
8b3eb85d 3124 wxFontEncoding encoding(m_encoding);
3ef10cfc 3125#endif
8b3eb85d 3126
86501081 3127 if ( m_name )
8b3eb85d 3128 {
86501081 3129 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3130 if ( conv->IsOk() )
3131 return conv;
3132
3133 delete conv;
c547282d
VZ
3134
3135#if wxUSE_FONTMAP
8b3eb85d 3136 encoding =
86501081 3137 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3138#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3139 }
3140#if wxUSE_FONTMAP
3141 {
3142 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3143 if ( it != gs_nameCache.end() )
3144 {
3145 if ( it->second.empty() )
3146 return NULL;
c547282d 3147
86501081 3148 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3149 if ( conv->IsOk() )
3150 return conv;
e95354ec 3151
8b3eb85d
VZ
3152 delete conv;
3153 }
3154
a243da29 3155 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3156 // CS : in case this does not return valid names (eg for MacRoman)
3157 // encoding got a 'failure' entry in the cache all the same,
3158 // although it just has to be created using a different method, so
3159 // only store failed iconv creation attempts (or perhaps we
3160 // shoulnd't do this at all ?)
3c67ec06 3161 if ( names[0] != NULL )
8b3eb85d 3162 {
3c67ec06 3163 for ( ; *names; ++names )
8b3eb85d 3164 {
86501081
VS
3165 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3166 // will need changes that will obsolete this
3167 wxString name(*names);
3168 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3169 if ( conv->IsOk() )
3170 {
3171 gs_nameCache[encoding] = *names;
3172 return conv;
3173 }
3174
3175 delete conv;
8b3eb85d
VZ
3176 }
3177
9a83f860 3178 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3179 }
8b3eb85d
VZ
3180 }
3181#endif // wxUSE_FONTMAP
e95354ec
VZ
3182 }
3183#endif // HAVE_ICONV
3184
3185#ifdef wxHAVE_WIN32_MB2WC
3186 {
7608a683 3187#if wxUSE_FONTMAP
e95354ec
VZ
3188 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3189 : new wxMBConv_win32(m_encoding);
3190 if ( conv->IsOk() )
3191 return conv;
3192
3193 delete conv;
7608a683
WS
3194#else
3195 return NULL;
3196#endif
e95354ec
VZ
3197 }
3198#endif // wxHAVE_WIN32_MB2WC
ef199164 3199
5c4ed98d 3200#ifdef __DARWIN__
f7e98dee 3201 {
6ff49cbc
DE
3202 // leave UTF16 and UTF32 to the built-ins of wx
3203 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3204 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3205 {
a6900d10 3206#if wxUSE_FONTMAP
5c4ed98d
DE
3207 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3208 : new wxMBConv_cf(m_encoding);
a6900d10 3209#else
5c4ed98d 3210 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3211#endif
ef199164 3212
f7e98dee 3213 if ( conv->IsOk() )
d775fa82
WS
3214 return conv;
3215
3216 delete conv;
3217 }
335d31e0 3218 }
5c4ed98d
DE
3219#endif // __DARWIN__
3220
e95354ec
VZ
3221 // step (2)
3222 wxFontEncoding enc = m_encoding;
3223#if wxUSE_FONTMAP
c547282d
VZ
3224 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3225 {
3226 // use "false" to suppress interactive dialogs -- we can be called from
3227 // anywhere and popping up a dialog from here is the last thing we want to
3228 // do
267e11c5 3229 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3230 }
e95354ec
VZ
3231#endif // wxUSE_FONTMAP
3232
3233 switch ( enc )
3234 {
3235 case wxFONTENCODING_UTF7:
3236 return new wxMBConvUTF7;
3237
3238 case wxFONTENCODING_UTF8:
3239 return new wxMBConvUTF8;
3240
e95354ec
VZ
3241 case wxFONTENCODING_UTF16BE:
3242 return new wxMBConvUTF16BE;
3243
3244 case wxFONTENCODING_UTF16LE:
3245 return new wxMBConvUTF16LE;
3246
e95354ec
VZ
3247 case wxFONTENCODING_UTF32BE:
3248 return new wxMBConvUTF32BE;
3249
3250 case wxFONTENCODING_UTF32LE:
3251 return new wxMBConvUTF32LE;
3252
3253 default:
3254 // nothing to do but put here to suppress gcc warnings
ef199164 3255 break;
e95354ec
VZ
3256 }
3257
3258 // step (3)
3259#if wxUSE_FONTMAP
3260 {
3261 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3262 : new wxMBConv_wxwin(m_encoding);
3263 if ( conv->IsOk() )
3264 return conv;
3265
3266 delete conv;
3267 }
ef199164 3268
3df31b2d
VZ
3269 wxLogTrace(TRACE_STRCONV,
3270 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3271 (m_name ? wxString(m_name)
3df31b2d
VZ
3272 : wxFontMapperBase::GetEncodingName(m_encoding)));
3273#endif // wxUSE_FONTMAP
e95354ec
VZ
3274
3275 return NULL;
3276}
3277
0f0298b1
VZ
3278bool wxCSConv::IsOk() const
3279{
0f0298b1
VZ
3280 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3281 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3282 return true; // always ok as we do it ourselves
3283
3284 // m_convReal->IsOk() is called at its own creation, so we know it must
3285 // be ok if m_convReal is non-NULL
3286 return m_convReal != NULL;
3287}
3288
1c714a5d
VZ
3289size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3290 const char *src, size_t srcLen) const
3291{
2c74c558
VS
3292 if (m_convReal)
3293 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3294
3295 // latin-1 (direct)
05392dc8
VZ
3296 if ( srcLen == wxNO_LEN )
3297 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3298
05392dc8
VZ
3299 if ( dst )
3300 {
3301 if ( dstLen < srcLen )
3302 return wxCONV_FAILED;
1c714a5d 3303
05392dc8
VZ
3304 for ( size_t n = 0; n < srcLen; n++ )
3305 dst[n] = (unsigned char)(src[n]);
3306 }
2c74c558 3307
05392dc8 3308 return srcLen;
1c714a5d
VZ
3309}
3310
05392dc8
VZ
3311size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3312 const wchar_t *src, size_t srcLen) const
6001e347 3313{
e95354ec 3314 if (m_convReal)
05392dc8 3315 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3316
3317 // latin-1 (direct)
05392dc8
VZ
3318 if ( srcLen == wxNO_LEN )
3319 srcLen = wxWcslen(src) + 1;
dccce9ea 3320
05392dc8 3321 if ( dst )
f1339c56 3322 {
05392dc8
VZ
3323 if ( dstLen < srcLen )
3324 return wxCONV_FAILED;
1cd52418 3325
05392dc8 3326 for ( size_t n = 0; n < srcLen; n++ )
24642831 3327 {
05392dc8 3328 if ( src[n] > 0xFF )
467e0479 3329 return wxCONV_FAILED;
ef199164 3330
05392dc8 3331 dst[n] = (char)src[n];
24642831 3332 }
05392dc8 3333
24642831 3334 }
05392dc8 3335 else // still need to check the input validity
24642831 3336 {
05392dc8 3337 for ( size_t n = 0; n < srcLen; n++ )
24642831 3338 {
05392dc8 3339 if ( src[n] > 0xFF )
467e0479 3340 return wxCONV_FAILED;
24642831 3341 }
f1339c56 3342 }
dccce9ea 3343
05392dc8 3344 return srcLen;
6001e347
RR
3345}
3346
7ef3ab50 3347size_t wxCSConv::GetMBNulLen() const
eec47cc6 3348{
eec47cc6 3349 if ( m_convReal )
7ef3ab50 3350 return m_convReal->GetMBNulLen();
eec47cc6 3351
ba98e032 3352 // otherwise, we are ISO-8859-1
c1464d9d 3353 return 1;
eec47cc6
VZ
3354}
3355
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// Return whether the real conversion object reports itself as UTF-8; without
// one we are ISO-8859-1 and so the answer is always false.
bool wxCSConv::IsUTF8() const
{
    return m_convReal && m_convReal->IsUTF8();
}
#endif
3366
69c928ef
VZ
3367
3368#if wxUSE_UNICODE
3369
3370wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3371{
3372 if ( !s )
3373 return wxWCharBuffer();
3374
3375 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3376 if ( !wbuf )
5487ff0f 3377 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3378 if ( !wbuf )
3379 wbuf = wxConvISO8859_1.cMB2WX(s);
3380
3381 return wbuf;
3382}
3383
3384wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3385{
3386 if ( !ws )
3387 return wxCharBuffer();
3388
3389 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3390 if ( !buf )
3391 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3392
3393 return buf;
3394}
3395
3396#endif // wxUSE_UNICODE
f5a1953b 3397
1e50d914
VS
3398// ----------------------------------------------------------------------------
3399// globals
3400// ----------------------------------------------------------------------------
3401
3402// NB: The reason why we create converter objects in this convoluted way,
3403// using a factory function instead of global variable, is that they
3404// may be used at static initialization time (some of them are used by
3405// wxString ctors and there may be a global wxString object). In other
3406// words, possibly _before_ the converter global object would be
3407// initialized.
3408
// The names below are used as macro arguments in the definitions which
// follow, so any macros with these names are removed first.
// NOTE(review): presumably these are defined as macros in wx/strconv.h --
// confirm.
3409#undef wxConvLibc
3410#undef wxConvUTF8
3411#undef wxConvUTF7
3412#undef wxConvLocal
3413#undef wxConvISO8859_1
3414
// Define the global converter called "name":
//  - a pointer name##Ptr of type klass* initialized to NULL
//  - an accessor wxGet_##name##Ptr() returning the address of a function
//    local static object of type impl_klass constructed with ctor_args
//  - a file static pointer initialized by calling this accessor
3415#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3416 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3417 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3418 { \
3419 static impl_klass name##Obj ctor_args; \
3420 return &name##Obj; \
3421 } \
3422 /* this ensures that all global converter objects are created */ \
3423 /* by the time static initialization is done, i.e. before any */ \
3424 /* thread is launched: */ \
3425 static klass* gs_##name##instance = wxGet_##name##Ptr()
3426
// Same as WX_DEFINE_GLOBAL_CONV2 when the object is of the same class as the
// one used for the pointer type.
3427#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3428 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3429
5c69ef61
VZ
3430#ifdef __INTELC__
3431 // disable warning "variable 'xxx' was declared but never referenced"
3432 #pragma warning(disable: 177)
3433#endif // Intel C++
3434
1e50d914
VS
// Define wxConvLibc: a wxMBConv_win32 object is used under Windows and a
// wxMBConvLibc one elsewhere (the wxMBConv_cf branch is disabled by "#elif 0").
3435#ifdef __WINDOWS__
3436 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3437#elif 0 // defined(__WXOSX__)
3438 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3439#else
3440 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3441#endif
3442
e1079eda
VZ
3443// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3444// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3445// provokes an error message about "not enough macro parameters"; and we
3446// can't use "()" here as the name##Obj declaration would be parsed as a
3447// function declaration then, so use a semicolon and live with an extra
3448// empty statement (and hope that no compiler warns about this)
// Define the UTF-8 and UTF-7 global converters; ";" is passed as ctor_args so
// that the objects are default-constructed.
3449WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3450WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3451
// Define the wxCSConv-based global converters for the system encoding and
// for ISO8859-1.
3452WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3453WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3454
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one.
3455WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3456WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3457
6ac84a78 3458#ifdef __DARWIN__
8244507f
VZ
3459// It is important to use this conversion object under Darwin as it ensures
3460// that Unicode strings are (re)composed correctly even though xnu kernel uses
3461// decomposed form internally (at least for the file names).
6ac84a78 3462static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3463#endif
6ac84a78 3464
// wxConvFileName points to the UTF-8 wxMBConv_cf object defined just above
// under Darwin and to the wxConvLibc object on the other platforms.
1e50d914 3465WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3466#ifdef __DARWIN__
1e50d914 3467 &wxConvMacUTF8DObj;
6ac84a78 3468#else // !__DARWIN__
1e50d914 3469 wxGet_wxConvLibcPtr();
6ac84a78 3470#endif // __DARWIN__/!__DARWIN__
6ac84a78 3470#endif // __DARWIN__/!__DARWIN__