]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Make wxPORTRAIT and wxLANDSCAPE elements of wxPrintOrientation enum.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
b040e242 47#ifdef HAVE_ICONV
373658eb 48 #include <iconv.h>
b1d547eb 49 #include "wx/thread.h"
1cd52418 50#endif
1cd52418 51
373658eb
VZ
52#include "wx/encconv.h"
53#include "wx/fontmap.h"
54
5c4ed98d 55#ifdef __DARWIN__
c933e267 56#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
57#endif //def __DARWIN__
58
ef199164 59
9a83f860 60#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 61
467e0479
VZ
62// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63// be 4 bytes
4948c2b6 64#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
65 #define WC_UTF16
66#endif
67
ef199164 68
373658eb
VZ
69// ============================================================================
70// implementation
71// ============================================================================
72
69373110
VZ
// Helper used when looking for the (possibly multi-byte) NUL terminator.
//
// Returns true if at least one of the n bytes starting at p is not NUL and
// false if all of them are NUL (which includes the case n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
373658eb 82// ----------------------------------------------------------------------------
467e0479 83// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 84// ----------------------------------------------------------------------------
6001e347 85
c91830cb 86static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 87{
ef199164 88 if (input <= 0xffff)
4def3b35 89 {
999836aa
VZ
90 if (output)
91 *output = (wxUint16) input;
ef199164 92
4def3b35 93 return 1;
dccce9ea 94 }
ef199164 95 else if (input >= 0x110000)
4def3b35 96 {
467e0479 97 return wxCONV_FAILED;
dccce9ea
VZ
98 }
99 else
4def3b35 100 {
dccce9ea 101 if (output)
4def3b35 102 {
ef199164
DS
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 105 }
ef199164 106
4def3b35 107 return 2;
1cd52418 108 }
1cd52418
OK
109}
110
c91830cb 111static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 112{
ef199164 113 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
114 {
115 output = *input;
116 return 1;
dccce9ea 117 }
ef199164 118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
119 {
120 output = *input;
467e0479 121 return wxCONV_FAILED;
dccce9ea
VZ
122 }
123 else
4def3b35
VS
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
1cd52418
OK
128}
129
467e0479 130#ifdef WC_UTF16
35d11700
VZ
131 typedef wchar_t wxDecodeSurrogate_t;
132#else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
135
136// returns the next UTF-32 character from the wchar_t buffer and advances the
137// pointer to the character after this one
138//
139// if an invalid character is found, *pSrc is set to NULL, the caller must
140// check for this
35d11700 141static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
142{
143 wxUint32 out;
8d3dd069 144 const size_t
5c33522f 145 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152}
153
f6bcfd97 154// ----------------------------------------------------------------------------
6001e347 155// wxMBConv
f6bcfd97 156// ----------------------------------------------------------------------------
2c53a80a 157
483b0434
VZ
158size_t
159wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
6001e347 161{
483b0434 162 // although new conversion classes are supposed to implement this function
36f93678 163 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
36f93678
VZ
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
36f93678
VZ
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complication come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
bbb0ff36 219 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
36f93678
VZ
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
483b0434 227 for ( ;; )
eec47cc6 228 {
c1464d9d 229 // try to convert the current chunk
483b0434 230 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
e4e3bbb4 233
483b0434 234 dstWritten += lenChunk;
f6a02087
VZ
235 if ( !srcEnd )
236 dstWritten++;
f5fb6871 237
f6a02087 238 if ( !lenChunk )
467e0479
VZ
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
483b0434
VZ
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
f6a02087
VZ
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
f6a02087
VZ
254 if ( !srcEnd )
255 dst++;
483b0434 256 }
c1464d9d 257
483b0434 258 if ( !srcEnd )
c1464d9d 259 {
467e0479 260 // we convert just one chunk in this case as this is the entire
bbb0ff36 261 // string anyhow (and we don't count the trailing NUL in this case)
c1464d9d
VZ
262 break;
263 }
eec47cc6 264
bbb0ff36
VZ
265 // advance the input pointer past the end of this chunk: notice that we
266 // will always stop before srcEnd because we know that the chunk is
267 // always properly NUL-terminated
483b0434 268 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
269 {
270 // notice that we must skip over multiple bytes here as we suppose
271 // that if NUL takes 2 or 4 bytes, then all the other characters do
272 // too and so if advanced by a single byte we might erroneously
273 // detect sequences of NUL bytes in the middle of the input
483b0434 274 src += nulLen;
c1464d9d 275 }
e4e3bbb4 276
bbb0ff36
VZ
277 // if the buffer ends before this NUL, we shouldn't count it in our
278 // output so skip the code below
279 if ( src == srcEnd )
280 break;
281
282 // do count this terminator as it's inside the buffer we convert
283 dstWritten++;
284 if ( dst )
285 dst++;
286
287 src += nulLen; // skip the terminator itself
c1464d9d 288
483b0434 289 if ( src >= srcEnd )
c1464d9d
VZ
290 break;
291 }
292
483b0434 293 return dstWritten;
e4e3bbb4
RN
294}
295
483b0434
VZ
296size_t
297wxMBConv::FromWChar(char *dst, size_t dstLen,
298 const wchar_t *src, size_t srcLen) const
e4e3bbb4 299{
483b0434
VZ
300 // the number of chars [which would be] written to dst [if it were not NULL]
301 size_t dstWritten = 0;
e4e3bbb4 302
f6a02087
VZ
303 // if we don't know its length we have no choice but to assume that it is
304 // NUL-terminated (notice that it can still be NUL-terminated even if
305 // explicit length is given but it doesn't change our return value)
306 const bool isNulTerminated = srcLen == wxNO_LEN;
307
eec47cc6
VZ
308 // make a copy of the input string unless it is already properly
309 // NUL-terminated
eec47cc6 310 wxWCharBuffer bufTmp;
f6a02087 311 if ( isNulTerminated )
e4e3bbb4 312 {
483b0434 313 srcLen = wxWcslen(src) + 1;
eec47cc6 314 }
483b0434 315 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
316 {
317 // make a copy in order to properly NUL-terminate the string
483b0434 318 bufTmp = wxWCharBuffer(srcLen);
ef199164 319 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
320 src = bufTmp;
321 }
322
323 const size_t lenNul = GetMBNulLen();
324 for ( const wchar_t * const srcEnd = src + srcLen;
325 src < srcEnd;
27307233 326 src++ /* skip L'\0' too */ )
483b0434
VZ
327 {
328 // try to convert the current chunk
329 size_t lenChunk = WC2MB(NULL, src, 0);
483b0434
VZ
330 if ( lenChunk == wxCONV_FAILED )
331 return wxCONV_FAILED;
332
483b0434 333 dstWritten += lenChunk;
27307233
VZ
334
335 const wchar_t * const
336 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
337
338 // our return value accounts for the trailing NUL(s), unlike that of
339 // WC2MB(), however don't do it for the last NUL we artificially added
340 // ourselves above
341 if ( chunkEnd < srcEnd )
f6a02087 342 dstWritten += lenNul;
483b0434
VZ
343
344 if ( dst )
345 {
346 if ( dstWritten > dstLen )
347 return wxCONV_FAILED;
348
27307233
VZ
349 // if we know that there is enough space in the destination buffer
350 // (because we accounted for lenNul in dstWritten above), we can
351 // convert directly in place -- but otherwise we need another
352 // temporary buffer to ensure that we don't overwrite the output
353 wxCharBuffer dstBuf;
354 char *dstTmp;
355 if ( chunkEnd == srcEnd )
356 {
357 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
358 dstTmp = dstBuf.data();
359 }
360 else
361 {
362 dstTmp = dst;
363 }
364
365 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
366 return wxCONV_FAILED;
367
27307233
VZ
368 if ( dstTmp != dst )
369 {
370 // copy everything up to but excluding the terminating NUL(s)
371 // into the real output buffer
372 memcpy(dst, dstTmp, lenChunk);
373
374 // micro-optimization: if dstTmp != dst it means that chunkEnd
375 // == srcEnd and so we're done, no need to update anything below
376 break;
377 }
378
483b0434 379 dst += lenChunk;
27307233 380 if ( chunkEnd < srcEnd )
f6a02087 381 dst += lenNul;
483b0434 382 }
27307233
VZ
383
384 src = chunkEnd;
eec47cc6 385 }
e4e3bbb4 386
483b0434
VZ
387 return dstWritten;
388}
389
ef199164 390size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 391{
51725fc0 392 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 393 if ( rc != wxCONV_FAILED )
509da451
VZ
394 {
395 // ToWChar() returns the buffer length, i.e. including the trailing
396 // NUL, while this method doesn't take it into account
397 rc--;
398 }
399
400 return rc;
401}
402
ef199164 403size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 404{
51725fc0 405 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 406 if ( rc != wxCONV_FAILED )
509da451 407 {
51725fc0 408 rc -= GetMBNulLen();
509da451
VZ
409 }
410
411 return rc;
412}
413
483b0434
VZ
414wxMBConv::~wxMBConv()
415{
416 // nothing to do here (necessary for Darwin linking probably)
417}
e4e3bbb4 418
483b0434
VZ
419const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
420{
421 if ( psz )
eec47cc6 422 {
483b0434 423 // calculate the length of the buffer needed first
a2db25a1 424 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 425 if ( nLen != wxCONV_FAILED )
f5fb6871 426 {
483b0434 427 // now do the actual conversion
a2db25a1 428 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 429
483b0434 430 // +1 for the trailing NULL
a2db25a1 431 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 432 return buf;
f5fb6871 433 }
483b0434 434 }
e4e3bbb4 435
483b0434
VZ
436 return wxWCharBuffer();
437}
3698ae71 438
483b0434
VZ
439const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
440{
441 if ( pwz )
442 {
a2db25a1 443 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 444 if ( nLen != wxCONV_FAILED )
483b0434 445 {
a2db25a1
VZ
446 wxCharBuffer buf(nLen - 1);
447 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
448 return buf;
449 }
450 }
451
452 return wxCharBuffer();
453}
e4e3bbb4 454
483b0434 455const wxWCharBuffer
ef199164 456wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 457{
ef199164 458 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 459 if ( dstLen != wxCONV_FAILED )
483b0434 460 {
0dd13d21
VZ
461 // notice that we allocate space for dstLen+1 wide characters here
462 // because we want the buffer to always be NUL-terminated, even if the
463 // input isn't (as otherwise the caller has no way to know its length)
464 wxWCharBuffer wbuf(dstLen);
f6a02087 465 wbuf.data()[dstLen] = L'\0';
ef199164 466 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
467 {
468 if ( outLen )
467e0479
VZ
469 {
470 *outLen = dstLen;
f6a02087
VZ
471
472 // we also need to handle NUL-terminated input strings
473 // specially: for them the output is the length of the string
474 // excluding the trailing NUL, however if we're asked to
475 // convert a specific number of characters we return the length
476 // of the resulting output even if it's NUL-terminated
477 if ( inLen == wxNO_LEN )
467e0479
VZ
478 (*outLen)--;
479 }
480
483b0434
VZ
481 return wbuf;
482 }
483 }
484
485 if ( outLen )
486 *outLen = 0;
487
488 return wxWCharBuffer();
489}
490
491const wxCharBuffer
ef199164 492wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 493{
13d92ad6 494 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 495 if ( dstLen != wxCONV_FAILED )
483b0434 496 {
0dd13d21
VZ
497 const size_t nulLen = GetMBNulLen();
498
499 // as above, ensure that the buffer is always NUL-terminated, even if
500 // the input is not
501 wxCharBuffer buf(dstLen + nulLen - 1);
502 memset(buf.data() + dstLen, 0, nulLen);
ef199164 503 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
504 {
505 if ( outLen )
467e0479
VZ
506 {
507 *outLen = dstLen;
508
f6a02087 509 if ( inLen == wxNO_LEN )
467e0479 510 {
f6a02087
VZ
511 // in this case both input and output are NUL-terminated
512 // and we're not supposed to count NUL
13d92ad6 513 *outLen -= nulLen;
467e0479
VZ
514 }
515 }
d32a507d 516
483b0434
VZ
517 return buf;
518 }
e4e3bbb4
RN
519 }
520
eec47cc6
VZ
521 if ( outLen )
522 *outLen = 0;
523
524 return wxCharBuffer();
e4e3bbb4
RN
525}
526
40ac5040
VZ
527const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
528{
529 const size_t srcLen = buf.length();
530 if ( srcLen )
531 {
532 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
533 if ( dstLen != wxCONV_FAILED )
534 {
535 wxWCharBuffer wbuf(dstLen);
536 wbuf.data()[dstLen] = L'\0';
537 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
538 return wbuf;
539 }
540 }
541
542 return wxWCharBuffer();
543}
544
545const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
546{
547 const size_t srcLen = wbuf.length();
548 if ( srcLen )
549 {
550 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
551 if ( dstLen != wxCONV_FAILED )
552 {
553 wxCharBuffer buf(dstLen);
554 buf.data()[dstLen] = '\0';
555 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
556 return buf;
557 }
558 }
559
560 return wxCharBuffer();
561}
562
6001e347 563// ----------------------------------------------------------------------------
bde4baac 564// wxMBConvLibc
6001e347
RR
565// ----------------------------------------------------------------------------
566
bde4baac
VZ
567size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
568{
569 return wxMB2WC(buf, psz, n);
570}
571
572size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
573{
574 return wxWC2MB(buf, psz, n);
575}
e1bfe89e
RR
576
577// ----------------------------------------------------------------------------
532d575b 578// wxConvBrokenFileNames
e1bfe89e
RR
579// ----------------------------------------------------------------------------
580
eec47cc6
VZ
581#ifdef __UNIX__
582
86501081 583wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 584{
9a83f860
VZ
585 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
586 wxStricmp(charset, wxT("UTF8")) == 0 )
5deedd6e 587 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
588 else
589 m_conv = new wxCSConv(charset);
ea8ce907
RR
590}
591
eec47cc6 592#endif // __UNIX__
c12b7f79 593
bde4baac 594// ----------------------------------------------------------------------------
3698ae71 595// UTF-7
bde4baac 596// ----------------------------------------------------------------------------
6001e347 597
15f2ee32 598// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
599//
600// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 601
15f2ee32
RN
602//
603// BASE64 decoding table
604//
// maps each byte value to the 6 bit number it encodes in BASE64 or to 0xff
// if this byte is not part of the BASE64 alphabet
static const unsigned char utf7unb64[] =
{
    // 0x00..0x27: not BASE64 characters
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x28..0x2f: '+' is 62 and '/' is 63
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: digits '0'..'9' are 52..61
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: upper case letters 'A'..'Z' are 0..25
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: lower case letters 'a'..'z' are 26..51
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: not BASE64 characters
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
640
9d653e81
VZ
641size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
642 const char *src, size_t srcLen) const
15f2ee32 643{
9d653e81 644 DecoderState stateOrig,
852dcba5 645 *statePtr;
9d653e81
VZ
646 if ( srcLen == wxNO_LEN )
647 {
648 // convert the entire string, up to and including the trailing NUL
649 srcLen = strlen(src) + 1;
650
651 // when working on the entire strings we don't update nor use the shift
652 // state from the previous call
653 statePtr = &stateOrig;
654 }
655 else // when working with partial strings we do use the shift state
656 {
5c33522f 657 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
658
659 // also save the old state to be able to rollback to it on error
660 stateOrig = m_stateDecoder;
661 }
662
663 // but to simplify the code below we use this variable in both cases
664 DecoderState& state = *statePtr;
665
666
667 // number of characters [which would have been] written to dst [if it were
668 // not NULL]
15f2ee32
RN
669 size_t len = 0;
670
9d653e81
VZ
671 const char * const srcEnd = src + srcLen;
672
673 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 674 {
9d653e81
VZ
675 const unsigned char cc = *src++;
676
677 if ( state.IsShifted() )
15f2ee32 678 {
9d653e81
VZ
679 const unsigned char dc = utf7unb64[cc];
680 if ( dc == 0xff )
15f2ee32 681 {
ccaa848d
VZ
682 // end of encoded part, check that nothing was left: there can
683 // be up to 4 bits of 0 padding but nothing else (we also need
684 // to check isLSB as we count bits modulo 8 while a valid UTF-7
685 // encoded sequence must contain an integral number of UTF-16
686 // characters)
687 if ( state.isLSB || state.bit > 4 ||
688 (state.accum & ((1 << state.bit) - 1)) )
689 {
690 if ( !len )
691 state = stateOrig;
692
852dcba5 693 return wxCONV_FAILED;
ccaa848d 694 }
852dcba5 695
9d653e81
VZ
696 state.ToDirect();
697
698 // re-parse this character normally below unless it's '-' which
699 // is consumed by the decoder
700 if ( cc == '-' )
701 continue;
702 }
703 else // valid encoded character
704 {
705 // mini base64 decoder: each character is 6 bits
706 state.bit += 6;
707 state.accum <<= 6;
708 state.accum += dc;
709
710 if ( state.bit >= 8 )
15f2ee32 711 {
9d653e81
VZ
712 // got the full byte, consume it
713 state.bit -= 8;
714 unsigned char b = (state.accum >> state.bit) & 0x00ff;
715
716 if ( state.isLSB )
15f2ee32 717 {
9d653e81
VZ
718 // we've got the full word, output it
719 if ( dst )
720 *dst++ = (state.msb << 8) | b;
721 len++;
722 state.isLSB = false;
15f2ee32 723 }
9d653e81 724 else // MSB
04a37834 725 {
9d653e81
VZ
726 // just store it while we wait for LSB
727 state.msb = b;
728 state.isLSB = true;
04a37834 729 }
15f2ee32
RN
730 }
731 }
9d653e81 732 }
04a37834 733
9d653e81
VZ
734 if ( state.IsDirect() )
735 {
736 // start of an encoded segment?
737 if ( cc == '+' )
04a37834 738 {
9d653e81
VZ
739 if ( *src == '-' )
740 {
741 // just the encoded plus sign, don't switch to shifted mode
742 if ( dst )
743 *dst++ = '+';
744 len++;
745 src++;
746 }
ccaa848d
VZ
747 else if ( utf7unb64[(unsigned)*src] == 0xff )
748 {
749 // empty encoded chunks are not allowed
750 if ( !len )
751 state = stateOrig;
752
753 return wxCONV_FAILED;
754 }
755 else // base-64 encoded chunk follows
9d653e81
VZ
756 {
757 state.ToShifted();
758 }
759 }
760 else // not '+'
761 {
762 // only printable 7 bit ASCII characters (with the exception of
763 // NUL, TAB, CR and LF) can be used directly
764 if ( cc >= 0x7f || (cc < ' ' &&
765 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
766 return wxCONV_FAILED;
767
768 if ( dst )
769 *dst++ = cc;
770 len++;
771 }
15f2ee32
RN
772 }
773 }
04a37834 774
9d653e81
VZ
775 if ( !len )
776 {
777 // as we didn't read any characters we should be called with the same
778 // data (followed by some more new data) again later so don't save our
779 // state
780 state = stateOrig;
781
782 return wxCONV_FAILED;
783 }
04a37834 784
15f2ee32 785 return len;
6001e347
RR
786}
787
15f2ee32
RN
788//
789// BASE64 encoding table
790//
// maps 6 bit values (0..63) to the corresponding BASE64 characters
static const unsigned char utf7enb64[] =
{
    // 0..25: upper case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51: lower case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61: digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
802
803//
804// UTF-7 encoding table
805//
806// 0 - Set D (directly encoded characters)
807// 1 - Set O (optional direct characters)
808// 2 - whitespace characters (optional)
809// 3 - special characters
810//
// class of each of the 128 ASCII characters for the purposes of UTF-7
// encoding: 0 for the characters which can be encoded directly, other
// values for all the rest
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if the given character can be output as is in UTF-7, i.e. if
// it is an ASCII character of class 0 in the table above.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms, so convert the value to
    // unsigned before the range check: otherwise a negative value would pass
    // it and be used as a (negative) index into the table
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
827
828size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
829 const wchar_t *src, size_t srcLen) const
15f2ee32 830{
9d653e81
VZ
831 EncoderState stateOrig,
832 *statePtr;
833 if ( srcLen == wxNO_LEN )
834 {
835 // we don't apply the stored state when operating on entire strings at
836 // once
837 statePtr = &stateOrig;
838
839 srcLen = wxWcslen(src) + 1;
840 }
841 else // do use the mode we left the output in previously
842 {
843 stateOrig = m_stateEncoder;
5c33522f 844 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
845 }
846
847 EncoderState& state = *statePtr;
848
849
15f2ee32
RN
850 size_t len = 0;
851
9d653e81
VZ
852 const wchar_t * const srcEnd = src + srcLen;
853 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 854 {
9d653e81
VZ
855 wchar_t cc = *src++;
856 if ( wxIsUTF7Direct(cc) )
15f2ee32 857 {
9d653e81
VZ
858 if ( state.IsShifted() )
859 {
860 // pad with zeros the last encoded block if necessary
861 if ( state.bit )
862 {
863 if ( dst )
864 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
865 len++;
866 }
ef199164 867
9d653e81
VZ
868 state.ToDirect();
869
870 if ( dst )
871 *dst++ = '-';
872 len++;
873 }
874
875 if ( dst )
876 *dst++ = (char)cc;
15f2ee32
RN
877 len++;
878 }
9d653e81
VZ
879 else if ( cc == '+' && state.IsDirect() )
880 {
881 if ( dst )
882 {
883 *dst++ = '+';
884 *dst++ = '-';
885 }
886
887 len += 2;
888 }
15f2ee32 889#ifndef WC_UTF16
79c78d42 890 else if (((wxUint32)cc) > 0xffff)
b2c13097 891 {
15f2ee32 892 // no surrogate pair generation (yet?)
467e0479 893 return wxCONV_FAILED;
15f2ee32
RN
894 }
895#endif
896 else
897 {
9d653e81
VZ
898 if ( state.IsDirect() )
899 {
900 state.ToShifted();
ef199164 901
9d653e81
VZ
902 if ( dst )
903 *dst++ = '+';
904 len++;
905 }
906
907 // BASE64 encode string
908 for ( ;; )
15f2ee32 909 {
9d653e81 910 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 911 {
9d653e81
VZ
912 state.accum <<= 8;
913 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
914
915 for (state.bit += 8; state.bit >= 6; )
15f2ee32 916 {
9d653e81
VZ
917 state.bit -= 6;
918 if ( dst )
919 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
920 len++;
15f2ee32 921 }
15f2ee32 922 }
ef199164 923
9d653e81
VZ
924 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
925 break;
ef199164 926
9d653e81 927 src++;
15f2ee32 928 }
15f2ee32
RN
929 }
930 }
ef199164 931
9d653e81
VZ
932 // we need to restore the original encoder state if we were called just to
933 // calculate the amount of space needed as we will presumably be called
934 // again to really convert the data now
935 if ( !dst )
936 state = stateOrig;
ef199164 937
15f2ee32 938 return len;
6001e347
RR
939}
940
f6bcfd97 941// ----------------------------------------------------------------------------
6001e347 942// UTF-8
f6bcfd97 943// ----------------------------------------------------------------------------
6001e347 944
1774c3c5 945static const wxUint32 utf8_max[]=
4def3b35 946 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 947
3698ae71
VZ
948// boundaries of the private use area we use to (temporarily) remap invalid
949// characters invalid in a UTF-8 encoded string
ea8ce907
RR
950const wxUint32 wxUnicodePUA = 0x100000;
951const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
952
0286d08d 953// this table gives the length of the UTF-8 encoding from its first character:
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded using a single byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // 80..C1: 0 means that these bytes can't start a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // C2..DF: lead bytes of two-byte sequences
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // E0..EF: lead bytes of three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // F0..F4: lead bytes of four-byte sequences
    4, 4, 4, 4, 4,                                   // F0..F4

    // F5..FF: invalid again (5- or 6-byte sequences and sequences for code
    // points above U+10FFFF, as restricted by RFC 3629)
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
987
988size_t
989wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
990 const char *src, size_t srcLen) const
991{
992 wchar_t *out = dstLen ? dst : NULL;
993 size_t written = 0;
994
995 if ( srcLen == wxNO_LEN )
996 srcLen = strlen(src) + 1;
997
998 for ( const char *p = src; ; p++ )
999 {
1000 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
1001 {
1002 // all done successfully, just add the trailing NULL if we are not
1003 // using explicit length
1004 if ( srcLen == wxNO_LEN )
1005 {
1006 if ( out )
1007 {
1008 if ( !dstLen )
1009 break;
1010
1011 *out = L'\0';
1012 }
1013
1014 written++;
1015 }
1016
1017 return written;
1018 }
1019
0286d08d
VZ
1020 if ( out && !dstLen-- )
1021 break;
1022
5367a38a
VS
1023 wxUint32 code;
1024 unsigned char c = *p;
0286d08d 1025
5367a38a
VS
1026 if ( c < 0x80 )
1027 {
1028 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1029 break;
0286d08d 1030
5367a38a
VS
1031 if ( srcLen != wxNO_LEN )
1032 srcLen--;
0286d08d 1033
5367a38a
VS
1034 code = c;
1035 }
1036 else
0286d08d 1037 {
5367a38a
VS
1038 unsigned len = tableUtf8Lengths[c];
1039 if ( !len )
1040 break;
1041
1042 if ( srcLen < len ) // the test works for wxNO_LEN too
1043 break;
1044
1045 if ( srcLen != wxNO_LEN )
1046 srcLen -= len;
1047
1048 // Char. number range | UTF-8 octet sequence
1049 // (hexadecimal) | (binary)
1050 // ----------------------+----------------------------------------
1051 // 0000 0000 - 0000 007F | 0xxxxxxx
1052 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1053 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1054 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1055 //
1056 // Code point value is stored in bits marked with 'x',
1057 // lowest-order bit of the value on the right side in the diagram
1058 // above. (from RFC 3629)
1059
1060 // mask to extract lead byte's value ('x' bits above), by sequence
1061 // length:
1062 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1063
1064 // mask and value of lead byte's most significant bits, by length:
1065 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1066 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1067
1068 len--; // it's more convenient to work with 0-based length here
1069
1070 // extract the lead byte's value bits:
1071 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1072 break;
1073
1074 code = c & leadValueMask[len];
1075
1076 // all remaining bytes, if any, are handled in the same way
1077 // regardless of sequence's length:
1078 for ( ; len; --len )
1079 {
1080 c = *++p;
1081 if ( (c & 0xC0) != 0x80 )
1082 return wxCONV_FAILED;
0286d08d 1083
5367a38a
VS
1084 code <<= 6;
1085 code |= c & 0x3F;
1086 }
0286d08d
VZ
1087 }
1088
1089#ifdef WC_UTF16
1090 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1091 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1092 {
1093 if ( out )
1094 out++;
1095 written++;
1096 }
1097#else // !WC_UTF16
1098 if ( out )
1099 *out = code;
1100#endif // WC_UTF16/!WC_UTF16
1101
1102 if ( out )
1103 out++;
1104
1105 written++;
1106 }
1107
1108 return wxCONV_FAILED;
1109}
1110
1111size_t
1112wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1113 const wchar_t *src, size_t srcLen) const
1114{
1115 char *out = dstLen ? dst : NULL;
1116 size_t written = 0;
1117
1118 for ( const wchar_t *wp = src; ; wp++ )
1119 {
a964d3ed 1120 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
0286d08d
VZ
1121 {
1122 // all done successfully, just add the trailing NULL if we are not
1123 // using explicit length
1124 if ( srcLen == wxNO_LEN )
1125 {
1126 if ( out )
1127 {
1128 if ( !dstLen )
1129 break;
1130
1131 *out = '\0';
1132 }
1133
1134 written++;
1135 }
1136
1137 return written;
1138 }
1139
a964d3ed
VZ
1140 if ( srcLen != wxNO_LEN )
1141 srcLen--;
0286d08d
VZ
1142
1143 wxUint32 code;
1144#ifdef WC_UTF16
1145 // cast is ok for WC_UTF16
1146 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1147 {
1148 // skip the next char too as we decoded a surrogate
1149 wp++;
1150 }
1151#else // wchar_t is UTF-32
1152 code = *wp & 0x7fffffff;
1153#endif
1154
1155 unsigned len;
1156 if ( code <= 0x7F )
1157 {
1158 len = 1;
1159 if ( out )
1160 {
1161 if ( dstLen < len )
1162 break;
1163
1164 out[0] = (char)code;
1165 }
1166 }
1167 else if ( code <= 0x07FF )
1168 {
1169 len = 2;
1170 if ( out )
1171 {
1172 if ( dstLen < len )
1173 break;
1174
1175 // NB: this line takes 6 least significant bits, encodes them as
1176 // 10xxxxxx and discards them so that the next byte can be encoded:
1177 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1178 out[0] = 0xC0 | code;
1179 }
1180 }
1181 else if ( code < 0xFFFF )
1182 {
1183 len = 3;
1184 if ( out )
1185 {
1186 if ( dstLen < len )
1187 break;
1188
1189 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1190 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1191 out[0] = 0xE0 | code;
1192 }
1193 }
1194 else if ( code <= 0x10FFFF )
1195 {
1196 len = 4;
1197 if ( out )
1198 {
1199 if ( dstLen < len )
1200 break;
1201
1202 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1204 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1205 out[0] = 0xF0 | code;
1206 }
1207 }
1208 else
1209 {
9a83f860 1210 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1211 break;
1212 }
1213
1214 if ( out )
1215 {
1216 out += len;
1217 dstLen -= len;
1218 }
1219
1220 written += len;
1221 }
1222
1223 // we only get here if an error occurs during decoding
1224 return wxCONV_FAILED;
1225}
1226
d16d0917
VZ
1227size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1228 const char *psz, size_t srcLen) const
6001e347 1229{
0286d08d 1230 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1231 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1232
4def3b35
VS
1233 size_t len = 0;
1234
d16d0917 1235 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1236 {
ea8ce907
RR
1237 const char *opsz = psz;
1238 bool invalid = false;
4def3b35
VS
1239 unsigned char cc = *psz++, fc = cc;
1240 unsigned cnt;
dccce9ea 1241 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1242 fc <<= 1;
ef199164 1243
dccce9ea 1244 if (!cnt)
4def3b35
VS
1245 {
1246 // plain ASCII char
dccce9ea 1247 if (buf)
4def3b35
VS
1248 *buf++ = cc;
1249 len++;
561488ef
MW
1250
1251 // escape the escape character for octal escapes
1252 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1253 && cc == '\\' && (!buf || len < n))
1254 {
1255 if (buf)
1256 *buf++ = cc;
1257 len++;
1258 }
dccce9ea
VZ
1259 }
1260 else
4def3b35
VS
1261 {
1262 cnt--;
dccce9ea 1263 if (!cnt)
4def3b35
VS
1264 {
1265 // invalid UTF-8 sequence
ea8ce907 1266 invalid = true;
dccce9ea
VZ
1267 }
1268 else
4def3b35
VS
1269 {
1270 unsigned ocnt = cnt - 1;
1271 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1272 while (cnt--)
4def3b35 1273 {
ea8ce907 1274 cc = *psz;
dccce9ea 1275 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1276 {
1277 // invalid UTF-8 sequence
ea8ce907
RR
1278 invalid = true;
1279 break;
4def3b35 1280 }
ef199164 1281
ea8ce907 1282 psz++;
4def3b35
VS
1283 res = (res << 6) | (cc & 0x3f);
1284 }
ef199164 1285
ea8ce907 1286 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1287 {
1288 // illegal UTF-8 encoding
ea8ce907 1289 invalid = true;
4def3b35 1290 }
ea8ce907
RR
1291 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1292 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1293 {
1294 // if one of our PUA characters turns up externally
1295 // it must also be treated as an illegal sequence
1296 // (a bit like you have to escape an escape character)
1297 invalid = true;
1298 }
1299 else
1300 {
1cd52418 1301#ifdef WC_UTF16
0286d08d 1302 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1303 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1304 if (pa == wxCONV_FAILED)
ea8ce907
RR
1305 {
1306 invalid = true;
1307 }
1308 else
1309 {
1310 if (buf)
1311 buf += pa;
1312 len += pa;
1313 }
373658eb 1314#else // !WC_UTF16
ea8ce907 1315 if (buf)
38d4b1e4 1316 *buf++ = (wchar_t)res;
ea8ce907 1317 len++;
373658eb 1318#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1319 }
1320 }
ef199164 1321
ea8ce907
RR
1322 if (invalid)
1323 {
1324 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1325 {
1326 while (opsz < psz && (!buf || len < n))
1327 {
1328#ifdef WC_UTF16
1329 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1330 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1331 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1332 if (buf)
1333 buf += pa;
1334 opsz++;
1335 len += pa;
1336#else
1337 if (buf)
38d4b1e4 1338 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1339 opsz++;
1340 len++;
1341#endif
1342 }
1343 }
3698ae71 1344 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1345 {
1346 while (opsz < psz && (!buf || len < n))
1347 {
3698ae71
VZ
1348 if ( buf && len + 3 < n )
1349 {
17a1ebd1 1350 unsigned char on = *opsz;
3698ae71 1351 *buf++ = L'\\';
17a1ebd1
VZ
1352 *buf++ = (wchar_t)( L'0' + on / 0100 );
1353 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1354 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1355 }
ef199164 1356
ea8ce907
RR
1357 opsz++;
1358 len += 4;
1359 }
1360 }
3698ae71 1361 else // MAP_INVALID_UTF8_NOT
ea8ce907 1362 {
467e0479 1363 return wxCONV_FAILED;
ea8ce907 1364 }
4def3b35
VS
1365 }
1366 }
6001e347 1367 }
ef199164 1368
d16d0917 1369 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1370 *buf = 0;
ef199164 1371
d16d0917 1372 return len + 1;
6001e347
RR
1373}
1374
3698ae71
VZ
// Return true if the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1379
d16d0917
VZ
1380size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1381 const wchar_t *psz, size_t srcLen) const
6001e347 1382{
0286d08d 1383 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1384 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1385
4def3b35 1386 size_t len = 0;
6001e347 1387
d16d0917 1388 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1389 {
1390 wxUint32 cc;
ef199164 1391
1cd52418 1392#ifdef WC_UTF16
b5153fd8
VZ
1393 // cast is ok for WC_UTF16
1394 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1395 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1396#else
ef199164 1397 cc = (*psz++) & 0x7fffffff;
4def3b35 1398#endif
3698ae71
VZ
1399
1400 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1401 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1402 {
dccce9ea 1403 if (buf)
ea8ce907 1404 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1405 len++;
3698ae71 1406 }
561488ef
MW
1407 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1408 && cc == L'\\' && psz[0] == L'\\' )
1409 {
1410 if (buf)
1411 *buf++ = (char)cc;
1412 psz++;
1413 len++;
1414 }
3698ae71
VZ
1415 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1416 cc == L'\\' &&
1417 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1418 {
dccce9ea 1419 if (buf)
3698ae71 1420 {
ef199164
DS
1421 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1422 (psz[1] - L'0') * 010 +
b2c13097 1423 (psz[2] - L'0'));
3698ae71
VZ
1424 }
1425
1426 psz += 3;
ea8ce907
RR
1427 len++;
1428 }
1429 else
1430 {
1431 unsigned cnt;
ef199164
DS
1432 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1433 {
1434 }
1435
ea8ce907 1436 if (!cnt)
4def3b35 1437 {
ea8ce907
RR
1438 // plain ASCII char
1439 if (buf)
1440 *buf++ = (char) cc;
1441 len++;
1442 }
ea8ce907
RR
1443 else
1444 {
1445 len += cnt + 1;
1446 if (buf)
1447 {
1448 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1449 while (cnt--)
1450 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1451 }
4def3b35
VS
1452 }
1453 }
6001e347 1454 }
4def3b35 1455
d16d0917 1456 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1457 *buf = 0;
adb45366 1458
d16d0917 1459 return len + 1;
6001e347
RR
1460}
1461
467e0479 1462// ============================================================================
c91830cb 1463// UTF-16
467e0479 1464// ============================================================================
c91830cb
VZ
1465
// "straight" is the UTF-16 converter using the native byte order of this
// platform and "swap" the one using the opposite byte order
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#endif
1473
467e0479
VZ
1474/* static */
1475size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1476{
1477 if ( srcLen == wxNO_LEN )
1478 {
1479 // count the number of bytes in input, including the trailing NULs
5c33522f 1480 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1481 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1482 ;
c91830cb 1483
467e0479
VZ
1484 srcLen *= BYTES_PER_CHAR;
1485 }
1486 else // we already have the length
1487 {
1488 // we can only convert an entire number of UTF-16 characters
1489 if ( srcLen % BYTES_PER_CHAR )
1490 return wxCONV_FAILED;
1491 }
1492
1493 return srcLen;
1494}
1495
1496// case when in-memory representation is UTF-16 too
c91830cb
VZ
1497#ifdef WC_UTF16
1498
467e0479
VZ
1499// ----------------------------------------------------------------------------
1500// conversions without endianness change
1501// ----------------------------------------------------------------------------
1502
1503size_t
1504wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1505 const char *src, size_t srcLen) const
c91830cb 1506{
467e0479
VZ
1507 // set up the scene for using memcpy() (which is presumably more efficient
1508 // than copying the bytes one by one)
1509 srcLen = GetLength(src, srcLen);
1510 if ( srcLen == wxNO_LEN )
1511 return wxCONV_FAILED;
c91830cb 1512
ef199164 1513 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1514 if ( dst )
c91830cb 1515 {
467e0479
VZ
1516 if ( dstLen < inLen )
1517 return wxCONV_FAILED;
c91830cb 1518
467e0479 1519 memcpy(dst, src, srcLen);
c91830cb 1520 }
d32a507d 1521
467e0479 1522 return inLen;
c91830cb
VZ
1523}
1524
467e0479
VZ
1525size_t
1526wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1527 const wchar_t *src, size_t srcLen) const
c91830cb 1528{
467e0479
VZ
1529 if ( srcLen == wxNO_LEN )
1530 srcLen = wxWcslen(src) + 1;
c91830cb 1531
467e0479
VZ
1532 srcLen *= BYTES_PER_CHAR;
1533
1534 if ( dst )
c91830cb 1535 {
467e0479
VZ
1536 if ( dstLen < srcLen )
1537 return wxCONV_FAILED;
d32a507d 1538
467e0479 1539 memcpy(dst, src, srcLen);
c91830cb 1540 }
d32a507d 1541
467e0479 1542 return srcLen;
c91830cb
VZ
1543}
1544
467e0479
VZ
1545// ----------------------------------------------------------------------------
1546// endian-reversing conversions
1547// ----------------------------------------------------------------------------
c91830cb 1548
467e0479
VZ
1549size_t
1550wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1551 const char *src, size_t srcLen) const
c91830cb 1552{
467e0479
VZ
1553 srcLen = GetLength(src, srcLen);
1554 if ( srcLen == wxNO_LEN )
1555 return wxCONV_FAILED;
c91830cb 1556
467e0479
VZ
1557 srcLen /= BYTES_PER_CHAR;
1558
1559 if ( dst )
c91830cb 1560 {
467e0479
VZ
1561 if ( dstLen < srcLen )
1562 return wxCONV_FAILED;
1563
5c33522f 1564 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1565 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1566 {
ef199164 1567 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1568 }
c91830cb 1569 }
bfab25d4 1570
467e0479 1571 return srcLen;
c91830cb
VZ
1572}
1573
467e0479
VZ
1574size_t
1575wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1576 const wchar_t *src, size_t srcLen) const
c91830cb 1577{
467e0479
VZ
1578 if ( srcLen == wxNO_LEN )
1579 srcLen = wxWcslen(src) + 1;
c91830cb 1580
467e0479
VZ
1581 srcLen *= BYTES_PER_CHAR;
1582
1583 if ( dst )
c91830cb 1584 {
467e0479
VZ
1585 if ( dstLen < srcLen )
1586 return wxCONV_FAILED;
1587
5c33522f 1588 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1589 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1590 {
ef199164 1591 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1592 }
c91830cb 1593 }
eec47cc6 1594
467e0479 1595 return srcLen;
c91830cb
VZ
1596}
1597
467e0479 1598#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1599
467e0479
VZ
1600// ----------------------------------------------------------------------------
1601// conversions without endianness change
1602// ----------------------------------------------------------------------------
c91830cb 1603
35d11700
VZ
1604size_t
1605wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1606 const char *src, size_t srcLen) const
c91830cb 1607{
35d11700
VZ
1608 srcLen = GetLength(src, srcLen);
1609 if ( srcLen == wxNO_LEN )
1610 return wxCONV_FAILED;
c91830cb 1611
ef199164 1612 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1613 if ( !dst )
c91830cb 1614 {
35d11700
VZ
1615 // optimization: return maximal space which could be needed for this
1616 // string even if the real size could be smaller if the buffer contains
1617 // any surrogates
1618 return inLen;
c91830cb 1619 }
c91830cb 1620
35d11700 1621 size_t outLen = 0;
5c33522f 1622 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1623 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1624 {
ef199164
DS
1625 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1626 if ( !inBuff )
35d11700
VZ
1627 return wxCONV_FAILED;
1628
1629 if ( ++outLen > dstLen )
1630 return wxCONV_FAILED;
c91830cb 1631
35d11700
VZ
1632 *dst++ = ch;
1633 }
1634
1635
1636 return outLen;
1637}
c91830cb 1638
35d11700
VZ
1639size_t
1640wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1641 const wchar_t *src, size_t srcLen) const
c91830cb 1642{
35d11700
VZ
1643 if ( srcLen == wxNO_LEN )
1644 srcLen = wxWcslen(src) + 1;
c91830cb 1645
35d11700 1646 size_t outLen = 0;
5c33522f 1647 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1648 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1649 {
1650 wxUint16 cc[2];
35d11700
VZ
1651 const size_t numChars = encode_utf16(*src++, cc);
1652 if ( numChars == wxCONV_FAILED )
1653 return wxCONV_FAILED;
c91830cb 1654
ef199164
DS
1655 outLen += numChars * BYTES_PER_CHAR;
1656 if ( outBuff )
c91830cb 1657 {
35d11700
VZ
1658 if ( outLen > dstLen )
1659 return wxCONV_FAILED;
1660
ef199164 1661 *outBuff++ = cc[0];
35d11700 1662 if ( numChars == 2 )
69b80d28 1663 {
35d11700 1664 // second character of a surrogate
ef199164 1665 *outBuff++ = cc[1];
69b80d28 1666 }
c91830cb 1667 }
c91830cb 1668 }
c91830cb 1669
35d11700 1670 return outLen;
c91830cb
VZ
1671}
1672
467e0479
VZ
1673// ----------------------------------------------------------------------------
1674// endian-reversing conversions
1675// ----------------------------------------------------------------------------
c91830cb 1676
35d11700
VZ
1677size_t
1678wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1679 const char *src, size_t srcLen) const
c91830cb 1680{
35d11700
VZ
1681 srcLen = GetLength(src, srcLen);
1682 if ( srcLen == wxNO_LEN )
1683 return wxCONV_FAILED;
1684
ef199164 1685 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1686 if ( !dst )
1687 {
1688 // optimization: return maximal space which could be needed for this
1689 // string even if the real size could be smaller if the buffer contains
1690 // any surrogates
1691 return inLen;
1692 }
c91830cb 1693
35d11700 1694 size_t outLen = 0;
5c33522f 1695 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1696 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1697 {
35d11700
VZ
1698 wxUint32 ch;
1699 wxUint16 tmp[2];
ef199164
DS
1700
1701 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1702 inBuff++;
1703 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1704
35d11700
VZ
1705 const size_t numChars = decode_utf16(tmp, ch);
1706 if ( numChars == wxCONV_FAILED )
1707 return wxCONV_FAILED;
c91830cb 1708
35d11700 1709 if ( numChars == 2 )
ef199164 1710 inBuff++;
35d11700
VZ
1711
1712 if ( ++outLen > dstLen )
1713 return wxCONV_FAILED;
c91830cb 1714
35d11700 1715 *dst++ = ch;
c91830cb 1716 }
c91830cb 1717
c91830cb 1718
35d11700
VZ
1719 return outLen;
1720}
c91830cb 1721
35d11700
VZ
1722size_t
1723wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1724 const wchar_t *src, size_t srcLen) const
c91830cb 1725{
35d11700
VZ
1726 if ( srcLen == wxNO_LEN )
1727 srcLen = wxWcslen(src) + 1;
c91830cb 1728
35d11700 1729 size_t outLen = 0;
5c33522f 1730 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1731 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1732 {
1733 wxUint16 cc[2];
35d11700
VZ
1734 const size_t numChars = encode_utf16(*src, cc);
1735 if ( numChars == wxCONV_FAILED )
1736 return wxCONV_FAILED;
c91830cb 1737
ef199164
DS
1738 outLen += numChars * BYTES_PER_CHAR;
1739 if ( outBuff )
c91830cb 1740 {
35d11700
VZ
1741 if ( outLen > dstLen )
1742 return wxCONV_FAILED;
1743
ef199164 1744 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1745 if ( numChars == 2 )
c91830cb 1746 {
35d11700 1747 // second character of a surrogate
ef199164 1748 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1749 }
1750 }
c91830cb 1751 }
c91830cb 1752
35d11700 1753 return outLen;
c91830cb
VZ
1754}
1755
467e0479 1756#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1757
1758
35d11700 1759// ============================================================================
c91830cb 1760// UTF-32
35d11700 1761// ============================================================================
c91830cb
VZ
1762
1763#ifdef WORDS_BIGENDIAN
467e0479
VZ
1764 #define wxMBConvUTF32straight wxMBConvUTF32BE
1765 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1766#else
467e0479
VZ
1767 #define wxMBConvUTF32swap wxMBConvUTF32BE
1768 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1769#endif
1770
1771
1772WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1773WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1774
467e0479
VZ
1775/* static */
1776size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1777{
1778 if ( srcLen == wxNO_LEN )
1779 {
1780 // count the number of bytes in input, including the trailing NULs
5c33522f 1781 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1782 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1783 ;
c91830cb 1784
467e0479
VZ
1785 srcLen *= BYTES_PER_CHAR;
1786 }
1787 else // we already have the length
1788 {
1789 // we can only convert an entire number of UTF-32 characters
1790 if ( srcLen % BYTES_PER_CHAR )
1791 return wxCONV_FAILED;
1792 }
1793
1794 return srcLen;
1795}
1796
1797// case when in-memory representation is UTF-16
c91830cb
VZ
1798#ifdef WC_UTF16
1799
467e0479
VZ
1800// ----------------------------------------------------------------------------
1801// conversions without endianness change
1802// ----------------------------------------------------------------------------
1803
1804size_t
1805wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1806 const char *src, size_t srcLen) const
c91830cb 1807{
467e0479
VZ
1808 srcLen = GetLength(src, srcLen);
1809 if ( srcLen == wxNO_LEN )
1810 return wxCONV_FAILED;
c91830cb 1811
5c33522f 1812 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1813 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1814 size_t outLen = 0;
1815 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1816 {
1817 wxUint16 cc[2];
ef199164 1818 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1819 if ( numChars == wxCONV_FAILED )
1820 return wxCONV_FAILED;
c91830cb 1821
467e0479
VZ
1822 outLen += numChars;
1823 if ( dst )
c91830cb 1824 {
467e0479
VZ
1825 if ( outLen > dstLen )
1826 return wxCONV_FAILED;
d32a507d 1827
467e0479
VZ
1828 *dst++ = cc[0];
1829 if ( numChars == 2 )
1830 {
1831 // second character of a surrogate
1832 *dst++ = cc[1];
1833 }
1834 }
c91830cb 1835 }
d32a507d 1836
467e0479 1837 return outLen;
c91830cb
VZ
1838}
1839
467e0479
VZ
1840size_t
1841wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1842 const wchar_t *src, size_t srcLen) const
c91830cb 1843{
467e0479
VZ
1844 if ( srcLen == wxNO_LEN )
1845 srcLen = wxWcslen(src) + 1;
c91830cb 1846
467e0479 1847 if ( !dst )
c91830cb 1848 {
467e0479
VZ
1849 // optimization: return maximal space which could be needed for this
1850 // string instead of the exact amount which could be less if there are
1851 // any surrogates in the input
1852 //
1853 // we consider that surrogates are rare enough to make it worthwhile to
1854 // avoid running the loop below at the cost of slightly extra memory
1855 // consumption
ef199164 1856 return srcLen * BYTES_PER_CHAR;
467e0479 1857 }
c91830cb 1858
5c33522f 1859 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1860 size_t outLen = 0;
1861 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1862 {
1863 const wxUint32 ch = wxDecodeSurrogate(&src);
1864 if ( !src )
1865 return wxCONV_FAILED;
c91830cb 1866
467e0479 1867 outLen += BYTES_PER_CHAR;
d32a507d 1868
467e0479
VZ
1869 if ( outLen > dstLen )
1870 return wxCONV_FAILED;
b5153fd8 1871
ef199164 1872 *outBuff++ = ch;
467e0479 1873 }
c91830cb 1874
467e0479 1875 return outLen;
c91830cb
VZ
1876}
1877
467e0479
VZ
1878// ----------------------------------------------------------------------------
1879// endian-reversing conversions
1880// ----------------------------------------------------------------------------
c91830cb 1881
467e0479
VZ
1882size_t
1883wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1884 const char *src, size_t srcLen) const
c91830cb 1885{
467e0479
VZ
1886 srcLen = GetLength(src, srcLen);
1887 if ( srcLen == wxNO_LEN )
1888 return wxCONV_FAILED;
c91830cb 1889
5c33522f 1890 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1891 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1892 size_t outLen = 0;
ef199164 1893 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1894 {
c91830cb 1895 wxUint16 cc[2];
ef199164 1896 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1897 if ( numChars == wxCONV_FAILED )
1898 return wxCONV_FAILED;
c91830cb 1899
467e0479
VZ
1900 outLen += numChars;
1901 if ( dst )
c91830cb 1902 {
467e0479
VZ
1903 if ( outLen > dstLen )
1904 return wxCONV_FAILED;
d32a507d 1905
467e0479
VZ
1906 *dst++ = cc[0];
1907 if ( numChars == 2 )
1908 {
1909 // second character of a surrogate
1910 *dst++ = cc[1];
1911 }
1912 }
c91830cb 1913 }
b5153fd8 1914
467e0479 1915 return outLen;
c91830cb
VZ
1916}
1917
467e0479
VZ
1918size_t
1919wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1920 const wchar_t *src, size_t srcLen) const
c91830cb 1921{
467e0479
VZ
1922 if ( srcLen == wxNO_LEN )
1923 srcLen = wxWcslen(src) + 1;
c91830cb 1924
467e0479 1925 if ( !dst )
c91830cb 1926 {
467e0479
VZ
1927 // optimization: return maximal space which could be needed for this
1928 // string instead of the exact amount which could be less if there are
1929 // any surrogates in the input
1930 //
1931 // we consider that surrogates are rare enough to make it worthwhile to
1932 // avoid running the loop below at the cost of slightly extra memory
1933 // consumption
1934 return srcLen*BYTES_PER_CHAR;
1935 }
c91830cb 1936
5c33522f 1937 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1938 size_t outLen = 0;
1939 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1940 {
1941 const wxUint32 ch = wxDecodeSurrogate(&src);
1942 if ( !src )
1943 return wxCONV_FAILED;
c91830cb 1944
467e0479 1945 outLen += BYTES_PER_CHAR;
d32a507d 1946
467e0479
VZ
1947 if ( outLen > dstLen )
1948 return wxCONV_FAILED;
b5153fd8 1949
ef199164 1950 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1951 }
c91830cb 1952
467e0479 1953 return outLen;
c91830cb
VZ
1954}
1955
467e0479 1956#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1957
35d11700
VZ
1958// ----------------------------------------------------------------------------
1959// conversions without endianness change
1960// ----------------------------------------------------------------------------
1961
1962size_t
1963wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1964 const char *src, size_t srcLen) const
c91830cb 1965{
35d11700
VZ
1966 // use memcpy() as it should be much faster than hand-written loop
1967 srcLen = GetLength(src, srcLen);
1968 if ( srcLen == wxNO_LEN )
1969 return wxCONV_FAILED;
c91830cb 1970
35d11700
VZ
1971 const size_t inLen = srcLen/BYTES_PER_CHAR;
1972 if ( dst )
c91830cb 1973 {
35d11700
VZ
1974 if ( dstLen < inLen )
1975 return wxCONV_FAILED;
b5153fd8 1976
35d11700
VZ
1977 memcpy(dst, src, srcLen);
1978 }
c91830cb 1979
35d11700 1980 return inLen;
c91830cb
VZ
1981}
1982
35d11700
VZ
1983size_t
1984wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1985 const wchar_t *src, size_t srcLen) const
c91830cb 1986{
35d11700
VZ
1987 if ( srcLen == wxNO_LEN )
1988 srcLen = wxWcslen(src) + 1;
1989
1990 srcLen *= BYTES_PER_CHAR;
c91830cb 1991
35d11700 1992 if ( dst )
c91830cb 1993 {
35d11700
VZ
1994 if ( dstLen < srcLen )
1995 return wxCONV_FAILED;
c91830cb 1996
35d11700 1997 memcpy(dst, src, srcLen);
c91830cb
VZ
1998 }
1999
35d11700 2000 return srcLen;
c91830cb
VZ
2001}
2002
35d11700
VZ
2003// ----------------------------------------------------------------------------
2004// endian-reversing conversions
2005// ----------------------------------------------------------------------------
c91830cb 2006
35d11700
VZ
2007size_t
2008wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2009 const char *src, size_t srcLen) const
c91830cb 2010{
35d11700
VZ
2011 srcLen = GetLength(src, srcLen);
2012 if ( srcLen == wxNO_LEN )
2013 return wxCONV_FAILED;
2014
2015 srcLen /= BYTES_PER_CHAR;
c91830cb 2016
35d11700 2017 if ( dst )
c91830cb 2018 {
35d11700
VZ
2019 if ( dstLen < srcLen )
2020 return wxCONV_FAILED;
2021
5c33522f 2022 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 2023 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 2024 {
ef199164 2025 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 2026 }
c91830cb 2027 }
b5153fd8 2028
35d11700 2029 return srcLen;
c91830cb
VZ
2030}
2031
35d11700
VZ
2032size_t
2033wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2034 const wchar_t *src, size_t srcLen) const
c91830cb 2035{
35d11700
VZ
2036 if ( srcLen == wxNO_LEN )
2037 srcLen = wxWcslen(src) + 1;
2038
2039 srcLen *= BYTES_PER_CHAR;
c91830cb 2040
35d11700 2041 if ( dst )
c91830cb 2042 {
35d11700
VZ
2043 if ( dstLen < srcLen )
2044 return wxCONV_FAILED;
2045
5c33522f 2046 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2047 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2048 {
ef199164 2049 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2050 }
c91830cb 2051 }
b5153fd8 2052
35d11700 2053 return srcLen;
c91830cb
VZ
2054}
2055
467e0479 2056#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2057
2058
36acb880
VZ
2059// ============================================================================
2060// The classes doing conversion using the iconv_xxx() functions
2061// ============================================================================
3caec1bb 2062
b040e242 2063#ifdef HAVE_ICONV
3a0d76bc 2064
b1d547eb
VS
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// NB: the macro arguments are parenthesized so that arbitrary expressions can
//     be passed to it safely
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast to the type expected for the input buffer argument of iconv(), which
// may or not be const depending on the iconv() declaration in use
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value of an iconv_t descriptor which couldn't be opened
#define ICONV_T_INVALID ((iconv_t)-1)
2083
2084#if SIZEOF_WCHAR_T == 4
2085 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2086 #define WC_ENC wxFONTENCODING_UTF32
2087#elif SIZEOF_WCHAR_T == 2
2088 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2089 #define WC_ENC wxFONTENCODING_UTF16
2090#else // sizeof(wchar_t) != 2 nor 4
2091 // does this ever happen?
2092 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2093#endif
2094
36acb880 2095// ----------------------------------------------------------------------------
e95354ec 2096// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2097// ----------------------------------------------------------------------------
2098
e95354ec 2099class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2100{
2101public:
86501081 2102 wxMBConv_iconv(const char *name);
e95354ec 2103 virtual ~wxMBConv_iconv();
36acb880 2104
8f4b0f43
VZ
2105 // implement base class virtual methods
2106 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2107 const char *src, size_t srcLen = wxNO_LEN) const;
2108 virtual size_t FromWChar(char *dst, size_t dstLen,
2109 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2110 virtual size_t GetMBNulLen() const;
2111
ba98e032
VS
2112#if wxUSE_UNICODE_UTF8
2113 virtual bool IsUTF8() const;
2114#endif
2115
d36c9347
VZ
2116 virtual wxMBConv *Clone() const
2117 {
86501081 2118 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
2119 p->m_minMBCharWidth = m_minMBCharWidth;
2120 return p;
2121 }
2122
e95354ec 2123 bool IsOk() const
74a7eb0b 2124 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2125
2126protected:
ef199164
DS
2127 // the iconv handlers used to translate from multibyte
2128 // to wide char and in the other direction
36acb880
VZ
2129 iconv_t m2w,
2130 w2m;
ef199164 2131
b1d547eb
VS
2132#if wxUSE_THREADS
2133 // guards access to m2w and w2m objects
2134 wxMutex m_iconvMutex;
2135#endif
36acb880
VZ
2136
2137private:
e95354ec 2138 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2139 // available on this machine, it will remain NULL
74a7eb0b 2140 static wxString ms_wcCharsetName;
36acb880
VZ
2141
2142 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2143 // different endian-ness than the native one
405d8f46 2144 static bool ms_wcNeedsSwap;
eec47cc6 2145
d36c9347
VZ
2146
2147 // name of the encoding handled by this conversion
2148 wxString m_name;
2149
7ef3ab50 2150 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2151 // initially
2152 size_t m_minMBCharWidth;
36acb880
VZ
2153};
2154
8f115891 2155// make the constructor available for unit testing
86501081 2156WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2157{
2158 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2159 if ( !result->IsOk() )
2160 {
2161 delete result;
2162 return 0;
2163 }
ef199164 2164
8f115891
MW
2165 return result;
2166}
2167
422e411e 2168wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2169bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2170
86501081 2171wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 2172 : m_name(name)
36acb880 2173{
c1464d9d 2174 m_minMBCharWidth = 0;
eec47cc6 2175
36acb880 2176 // check for charset that represents wchar_t:
74a7eb0b 2177 if ( ms_wcCharsetName.empty() )
f1339c56 2178 {
9a83f860 2179 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2180
74a7eb0b 2181#if wxUSE_FONTMAP
a243da29 2182 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
74a7eb0b 2183#else // !wxUSE_FONTMAP
a243da29 2184 static const wxChar *const names_static[] =
36acb880 2185 {
74a7eb0b 2186#if SIZEOF_WCHAR_T == 4
9a83f860 2187 wxT("UCS-4"),
74a7eb0b 2188#elif SIZEOF_WCHAR_T = 2
9a83f860 2189 wxT("UCS-2"),
74a7eb0b
VZ
2190#endif
2191 NULL
2192 };
a243da29 2193 const wxChar *const *names = names_static;
74a7eb0b 2194#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2195
d1f024a8 2196 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2197 {
17a1ebd1 2198 const wxString nameCS(*names);
74a7eb0b
VZ
2199
2200 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2201 wxString nameXE(nameCS);
ef199164
DS
2202
2203#ifdef WORDS_BIGENDIAN
9a83f860 2204 nameXE += wxT("BE");
ef199164 2205#else // little endian
9a83f860 2206 nameXE += wxT("LE");
ef199164 2207#endif
74a7eb0b 2208
9a83f860 2209 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2210 nameXE.c_str());
2211
86501081 2212 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2213 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2214 {
74a7eb0b 2215 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2216 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2217 nameCS.c_str());
86501081 2218 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2219
74a7eb0b
VZ
2220 // and check for bytesex ourselves:
2221 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2222 {
74a7eb0b 2223 char buf[2], *bufPtr;
e8769ed1 2224 wchar_t wbuf[2];
74a7eb0b
VZ
2225 size_t insz, outsz;
2226 size_t res;
2227
2228 buf[0] = 'A';
2229 buf[1] = 0;
2230 wbuf[0] = 0;
2231 insz = 2;
2232 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2233 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2234 bufPtr = buf;
2235
ef199164
DS
2236 res = iconv(
2237 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2238 &wbufPtr, &outsz);
74a7eb0b
VZ
2239
2240 if (ICONV_FAILED(res, insz))
2241 {
2242 wxLogLastError(wxT("iconv"));
422e411e 2243 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2244 nameCS.c_str());
74a7eb0b
VZ
2245 }
2246 else // ok, can convert to this encoding, remember it
2247 {
17a1ebd1 2248 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2249 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2250 }
3a0d76bc
VS
2251 }
2252 }
74a7eb0b 2253 else // use charset not requiring byte swapping
36acb880 2254 {
74a7eb0b 2255 ms_wcCharsetName = nameXE;
36acb880 2256 }
3a0d76bc 2257 }
74a7eb0b 2258
0944fceb 2259 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2260 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2261 ms_wcCharsetName.empty() ? wxString("<none>")
2262 : ms_wcCharsetName,
9a83f860
VZ
2263 ms_wcNeedsSwap ? wxT(" (needs swap)")
2264 : wxT(""));
3a0d76bc 2265 }
36acb880 2266 else // we already have ms_wcCharsetName
3caec1bb 2267 {
86501081 2268 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2269 }
dccce9ea 2270
74a7eb0b 2271 if ( ms_wcCharsetName.empty() )
f1339c56 2272 {
74a7eb0b 2273 w2m = ICONV_T_INVALID;
36acb880 2274 }
405d8f46
VZ
2275 else
2276 {
86501081 2277 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2278 if ( w2m == ICONV_T_INVALID )
2279 {
2280 wxLogTrace(TRACE_STRCONV,
2281 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2282 ms_wcCharsetName.c_str(), name);
74a7eb0b 2283 }
405d8f46 2284 }
36acb880 2285}
3caec1bb 2286
e95354ec 2287wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2288{
74a7eb0b 2289 if ( m2w != ICONV_T_INVALID )
36acb880 2290 iconv_close(m2w);
74a7eb0b 2291 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2292 iconv_close(w2m);
2293}
3a0d76bc 2294
8f4b0f43
VZ
2295size_t
2296wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2297 const char *src, size_t srcLen) const
36acb880 2298{
8f4b0f43 2299 if ( srcLen == wxNO_LEN )
69373110 2300 {
8f4b0f43
VZ
2301 // find the string length: notice that must be done differently for
2302 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2303 // consecutive NULs
2304 const size_t nulLen = GetMBNulLen();
2305 switch ( nulLen )
2306 {
2307 default:
2308 return wxCONV_FAILED;
69373110 2309
8f4b0f43
VZ
2310 case 1:
2311 srcLen = strlen(src); // arguably more optimized than our version
2312 break;
69373110 2313
8f4b0f43
VZ
2314 case 2:
2315 case 4:
2316 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2317 // but they also have to start at character boundary and not
2318 // span two adjacent characters
2319 const char *p;
2320 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2321 ;
2322 srcLen = p - src;
2323 break;
2324 }
d50c0831
VZ
2325
2326 // when we're determining the length of the string ourselves we count
2327 // the terminating NUL(s) as part of it and always NUL-terminate the
2328 // output
2329 srcLen += nulLen;
69373110
VZ
2330 }
2331
8f4b0f43
VZ
2332 // we express length in the number of (wide) characters but iconv always
2333 // counts buffer sizes it in bytes
2334 dstLen *= SIZEOF_WCHAR_T;
2335
b1d547eb 2336#if wxUSE_THREADS
6a17b868
SN
2337 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2338 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2339 // wxConvLocal that are used all over wx code, so we have to make sure
2340 // the handle is used by at most one thread at the time. Otherwise
2341 // only a few wx classes would be safe to use from non-main threads
2342 // as MB<->WC conversion would fail "randomly".
2343 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2344#endif // wxUSE_THREADS
2345
36acb880 2346 size_t res, cres;
8f4b0f43 2347 const char *pszPtr = src;
36acb880 2348
8f4b0f43 2349 if ( dst )
36acb880 2350 {
8f4b0f43 2351 char* bufPtr = (char*)dst;
e8769ed1 2352
36acb880 2353 // have destination buffer, convert there
1752fda6 2354 size_t dstLenOrig = dstLen;
36acb880 2355 cres = iconv(m2w,
8f4b0f43
VZ
2356 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2357 &bufPtr, &dstLen);
1752fda6
VZ
2358
2359 // convert the number of bytes converted as returned by iconv to the
2360 // number of (wide) characters converted that we need
2361 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2362
36acb880 2363 if (ms_wcNeedsSwap)
3a0d76bc 2364 {
36acb880 2365 // convert to native endianness
17a1ebd1 2366 for ( unsigned i = 0; i < res; i++ )
467a2982 2367 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2368 }
36acb880 2369 }
8f4b0f43 2370 else // no destination buffer
36acb880 2371 {
8f4b0f43 2372 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2373 wchar_t tbuf[256];
36acb880 2374 res = 0;
ef199164
DS
2375
2376 do
2377 {
e8769ed1 2378 char* bufPtr = (char*)tbuf;
8f4b0f43 2379 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2380
2381 cres = iconv(m2w,
8f4b0f43
VZ
2382 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2383 &bufPtr, &dstLen );
36acb880 2384
8f4b0f43 2385 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2386 }
2387 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2388 }
dccce9ea 2389
8f4b0f43 2390 if (ICONV_FAILED(cres, srcLen))
f1339c56 2391 {
36acb880 2392 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2393 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2394 return wxCONV_FAILED;
36acb880
VZ
2395 }
2396
2397 return res;
2398}
2399
8f4b0f43
VZ
2400size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2401 const wchar_t *src, size_t srcLen) const
36acb880 2402{
b1d547eb
VS
2403#if wxUSE_THREADS
2404 // NB: explained in MB2WC
2405 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2406#endif
3698ae71 2407
8f4b0f43 2408 if ( srcLen == wxNO_LEN )
2588ee86 2409 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2410
2411 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2412 size_t outbuflen = dstLen;
36acb880 2413 size_t res, cres;
3a0d76bc 2414
36acb880 2415 wchar_t *tmpbuf = 0;
3caec1bb 2416
36acb880
VZ
2417 if (ms_wcNeedsSwap)
2418 {
2419 // need to copy to temp buffer to switch endianness
51725fc0 2420 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2421 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2422 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2423 for ( size_t i = 0; i < srcLen; i++ )
2424 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2425
8f4b0f43 2426 src = tmpbuf;
36acb880 2427 }
3a0d76bc 2428
8f4b0f43
VZ
2429 char* inbuf = (char*)src;
2430 if ( dst )
36acb880
VZ
2431 {
2432 // have destination buffer, convert there
8f4b0f43 2433 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2434
8f4b0f43 2435 res = dstLen - outbuflen;
36acb880 2436 }
8f4b0f43 2437 else // no destination buffer
36acb880 2438 {
8f4b0f43 2439 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2440 char tbuf[256];
36acb880 2441 res = 0;
ef199164
DS
2442 do
2443 {
8f4b0f43 2444 dst = tbuf;
51725fc0 2445 outbuflen = WXSIZEOF(tbuf);
36acb880 2446
8f4b0f43 2447 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2448
51725fc0 2449 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2450 }
2451 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2452 }
dccce9ea 2453
36acb880
VZ
2454 if (ms_wcNeedsSwap)
2455 {
2456 free(tmpbuf);
2457 }
dccce9ea 2458
e8769ed1 2459 if (ICONV_FAILED(cres, inbuflen))
36acb880 2460 {
ce6f8d6f 2461 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2462 return wxCONV_FAILED;
36acb880
VZ
2463 }
2464
2465 return res;
2466}
2467
7ef3ab50 2468size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2469{
c1464d9d 2470 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2471 {
2472 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2473
2474#if wxUSE_THREADS
2475 // NB: explained in MB2WC
2476 wxMutexLocker lock(self->m_iconvMutex);
2477#endif
2478
999020e1 2479 const wchar_t *wnul = L"";
c1464d9d 2480 char buf[8]; // should be enough for NUL in any encoding
356410fc 2481 size_t inLen = sizeof(wchar_t),
c1464d9d 2482 outLen = WXSIZEOF(buf);
ef199164
DS
2483 char *inBuff = (char *)wnul;
2484 char *outBuff = buf;
2485 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2486 {
c1464d9d 2487 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2488 }
2489 else // ok
2490 {
ef199164 2491 self->m_minMBCharWidth = outBuff - buf;
356410fc 2492 }
eec47cc6
VZ
2493 }
2494
c1464d9d 2495 return m_minMBCharWidth;
eec47cc6
VZ
2496}
2497
ba98e032
VS
2498#if wxUSE_UNICODE_UTF8
2499bool wxMBConv_iconv::IsUTF8() const
2500{
86501081
VS
2501 return wxStricmp(m_name, "UTF-8") == 0 ||
2502 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2503}
2504#endif
2505
b040e242 2506#endif // HAVE_ICONV
36acb880 2507
e95354ec 2508
36acb880
VZ
2509// ============================================================================
2510// Win32 conversion classes
2511// ============================================================================
1cd52418 2512
e95354ec 2513#ifdef wxHAVE_WIN32_MB2WC
373658eb 2514
8b04d4c4 2515// from utils.cpp
d775fa82 2516#if wxUSE_FONTMAP
86501081 2517extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2518extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2519#endif
373658eb 2520
e95354ec 2521class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2522{
2523public:
bde4baac
VZ
2524 wxMBConv_win32()
2525 {
2526 m_CodePage = CP_ACP;
c1464d9d 2527 m_minMBCharWidth = 0;
bde4baac
VZ
2528 }
2529
d36c9347 2530 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2531 : wxMBConv()
d36c9347
VZ
2532 {
2533 m_CodePage = conv.m_CodePage;
2534 m_minMBCharWidth = conv.m_minMBCharWidth;
2535 }
2536
7608a683 2537#if wxUSE_FONTMAP
86501081 2538 wxMBConv_win32(const char* name)
bde4baac
VZ
2539 {
2540 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2541 m_minMBCharWidth = 0;
bde4baac 2542 }
dccce9ea 2543
e95354ec 2544 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2545 {
2546 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2547 m_minMBCharWidth = 0;
bde4baac 2548 }
eec47cc6 2549#endif // wxUSE_FONTMAP
8b04d4c4 2550
d36c9347 2551 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2552 {
02272c9c
VZ
2553 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2554 // the behaviour is not compatible with the Unix version (using iconv)
2555 // and break the library itself, e.g. wxTextInputStream::NextChar()
2556 // wouldn't work if reading an incomplete MB char didn't result in an
2557 // error
667e5b3e 2558 //
89028980 2559 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2560 // Win XP or newer and it is not supported for UTF-[78] so we always
2561 // use our own conversions in this case. See
89028980
VS
2562 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2563 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2564 if ( m_CodePage == CP_UTF8 )
89028980 2565 {
5487ff0f 2566 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2567 }
830f8f11
VZ
2568
2569 if ( m_CodePage == CP_UTF7 )
2570 {
5487ff0f 2571 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2572 }
2573
2574 int flags = 0;
2575 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2576 IsAtLeastWin2kSP4() )
89028980 2577 {
830f8f11 2578 flags = MB_ERR_INVALID_CHARS;
89028980 2579 }
667e5b3e 2580
2b5f62a0
VZ
2581 const size_t len = ::MultiByteToWideChar
2582 (
2583 m_CodePage, // code page
667e5b3e 2584 flags, // flags: fall on error
2b5f62a0
VZ
2585 psz, // input string
2586 -1, // its length (NUL-terminated)
b4da152e 2587 buf, // output string
2b5f62a0
VZ
2588 buf ? n : 0 // size of output buffer
2589 );
89028980
VS
2590 if ( !len )
2591 {
2592 // function totally failed
467e0479 2593 return wxCONV_FAILED;
89028980
VS
2594 }
2595
2596 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2597 // check if we succeeded, by doing a double trip:
2598 if ( !flags && buf )
2599 {
53c174fc
VZ
2600 const size_t mbLen = strlen(psz);
2601 wxCharBuffer mbBuf(mbLen);
89028980
VS
2602 if ( ::WideCharToMultiByte
2603 (
2604 m_CodePage,
2605 0,
2606 buf,
2607 -1,
2608 mbBuf.data(),
53c174fc 2609 mbLen + 1, // size in bytes, not length
89028980
VS
2610 NULL,
2611 NULL
2612 ) == 0 ||
2613 strcmp(mbBuf, psz) != 0 )
2614 {
2615 // we didn't obtain the same thing we started from, hence
2616 // the conversion was lossy and we consider that it failed
467e0479 2617 return wxCONV_FAILED;
89028980
VS
2618 }
2619 }
2b5f62a0 2620
03a991bc
VZ
2621 // note that it returns count of written chars for buf != NULL and size
2622 // of the needed buffer for buf == NULL so in either case the length of
2623 // the string (which never includes the terminating NUL) is one less
89028980 2624 return len - 1;
f1339c56 2625 }
dccce9ea 2626
d36c9347 2627 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2628 {
13dd924a
VZ
2629 /*
2630 we have a problem here: by default, WideCharToMultiByte() may
2631 replace characters unrepresentable in the target code page with bad
2632 quality approximations such as turning "1/2" symbol (U+00BD) into
2633 "1" for the code pages which don't have it and we, obviously, want
2634 to avoid this at any price
d775fa82 2635
13dd924a
VZ
2636 the trouble is that this function does it _silently_, i.e. it won't
2637 even tell us whether it did or not... Win98/2000 and higher provide
2638 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2639 we have to resort to a round trip, i.e. check that converting back
2640 results in the same string -- this is, of course, expensive but
2641 otherwise we simply can't be sure to not garble the data.
2642 */
2643
2644 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2645 // it doesn't work with CJK encodings (which we test for rather roughly
2646 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2647 // supporting it
907173e5
WS
2648 BOOL usedDef wxDUMMY_INITIALIZE(false);
2649 BOOL *pUsedDef;
13dd924a
VZ
2650 int flags;
2651 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2652 {
2653 // it's our lucky day
2654 flags = WC_NO_BEST_FIT_CHARS;
2655 pUsedDef = &usedDef;
2656 }
2657 else // old system or unsupported encoding
2658 {
2659 flags = 0;
2660 pUsedDef = NULL;
2661 }
2662
2b5f62a0
VZ
2663 const size_t len = ::WideCharToMultiByte
2664 (
2665 m_CodePage, // code page
13dd924a
VZ
2666 flags, // either none or no best fit
2667 pwz, // input string
2b5f62a0
VZ
2668 -1, // it is (wide) NUL-terminated
2669 buf, // output buffer
2670 buf ? n : 0, // and its size
2671 NULL, // default "replacement" char
13dd924a 2672 pUsedDef // [out] was it used?
2b5f62a0
VZ
2673 );
2674
13dd924a
VZ
2675 if ( !len )
2676 {
2677 // function totally failed
467e0479 2678 return wxCONV_FAILED;
13dd924a
VZ
2679 }
2680
765bdb4a
VZ
2681 // we did something, check if we really succeeded
2682 if ( flags )
13dd924a 2683 {
765bdb4a
VZ
2684 // check if the conversion failed, i.e. if any replacements
2685 // were done
2686 if ( usedDef )
2687 return wxCONV_FAILED;
2688 }
2689 else // we must resort to double tripping...
2690 {
2691 // first we need to ensure that we really have the MB data: this is
2692 // not the case if we're called with NULL buffer, in which case we
2693 // need to do the conversion yet again
2694 wxCharBuffer bufDef;
2695 if ( !buf )
13dd924a 2696 {
765bdb4a
VZ
2697 bufDef = wxCharBuffer(len);
2698 buf = bufDef.data();
2699 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2700 buf, len, NULL, NULL) )
467e0479 2701 return wxCONV_FAILED;
13dd924a 2702 }
765bdb4a 2703
564da6ff
VZ
2704 if ( !n )
2705 n = wcslen(pwz);
765bdb4a 2706 wxWCharBuffer wcBuf(n);
564da6ff 2707 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2708 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2709 {
765bdb4a
VZ
2710 // we didn't obtain the same thing we started from, hence
2711 // the conversion was lossy and we consider that it failed
2712 return wxCONV_FAILED;
13dd924a
VZ
2713 }
2714 }
2715
03a991bc 2716 // see the comment above for the reason of "len - 1"
13dd924a 2717 return len - 1;
f1339c56 2718 }
dccce9ea 2719
7ef3ab50
VZ
2720 virtual size_t GetMBNulLen() const
2721 {
2722 if ( m_minMBCharWidth == 0 )
2723 {
2724 int len = ::WideCharToMultiByte
2725 (
2726 m_CodePage, // code page
2727 0, // no flags
2728 L"", // input string
2729 1, // translate just the NUL
2730 NULL, // output buffer
2731 0, // and its size
2732 NULL, // no replacement char
2733 NULL // [out] don't care if it was used
2734 );
2735
2736 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2737 switch ( len )
2738 {
2739 default:
9a83f860 2740 wxLogDebug(wxT("Unexpected NUL length %d"), len);
ef199164
DS
2741 self->m_minMBCharWidth = (size_t)-1;
2742 break;
7ef3ab50
VZ
2743
2744 case 0:
2745 self->m_minMBCharWidth = (size_t)-1;
2746 break;
2747
2748 case 1:
2749 case 2:
2750 case 4:
2751 self->m_minMBCharWidth = len;
2752 break;
2753 }
2754 }
2755
2756 return m_minMBCharWidth;
2757 }
2758
d36c9347
VZ
2759 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2760
13dd924a
VZ
2761 bool IsOk() const { return m_CodePage != -1; }
2762
2763private:
2764 static bool CanUseNoBestFit()
2765 {
2766 static int s_isWin98Or2k = -1;
2767
2768 if ( s_isWin98Or2k == -1 )
2769 {
2770 int verMaj, verMin;
2771 switch ( wxGetOsVersion(&verMaj, &verMin) )
2772 {
406d283a 2773 case wxOS_WINDOWS_9X:
13dd924a
VZ
2774 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2775 break;
2776
406d283a 2777 case wxOS_WINDOWS_NT:
13dd924a
VZ
2778 s_isWin98Or2k = verMaj >= 5;
2779 break;
2780
2781 default:
ef199164 2782 // unknown: be conservative by default
13dd924a 2783 s_isWin98Or2k = 0;
ef199164 2784 break;
13dd924a
VZ
2785 }
2786
9a83f860 2787 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
13dd924a
VZ
2788 }
2789
2790 return s_isWin98Or2k == 1;
2791 }
f1339c56 2792
89028980
VS
2793 static bool IsAtLeastWin2kSP4()
2794 {
8942f83a
WS
2795#ifdef __WXWINCE__
2796 return false;
2797#else
89028980
VS
2798 static int s_isAtLeastWin2kSP4 = -1;
2799
2800 if ( s_isAtLeastWin2kSP4 == -1 )
2801 {
2802 OSVERSIONINFOEX ver;
2803
2804 memset(&ver, 0, sizeof(ver));
2805 ver.dwOSVersionInfoSize = sizeof(ver);
2806 GetVersionEx((OSVERSIONINFO*)&ver);
2807
2808 s_isAtLeastWin2kSP4 =
2809 ((ver.dwMajorVersion > 5) || // Vista+
2810 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2811 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2812 ver.wServicePackMajor >= 4)) // 2000 SP4+
2813 ? 1 : 0;
2814 }
2815
2816 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2817#endif
89028980
VS
2818 }
2819
eec47cc6 2820
c1464d9d 2821 // the code page we're working with
b1d66b54 2822 long m_CodePage;
c1464d9d 2823
7ef3ab50 2824 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2825 // "unknown"
2826 size_t m_minMBCharWidth;
1cd52418 2827};
e95354ec
VZ
2828
2829#endif // wxHAVE_WIN32_MB2WC
2830
f7e98dee 2831
36acb880
VZ
2832// ============================================================================
2833// wxEncodingConverter based conversion classes
2834// ============================================================================
2835
1e6feb95 2836#if wxUSE_FONTMAP
1cd52418 2837
e95354ec 2838class wxMBConv_wxwin : public wxMBConv
1cd52418 2839{
8b04d4c4
VZ
2840private:
2841 void Init()
2842 {
6ac84a78
DE
2843 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2844 // The wxMBConv_cf class does a better job.
2845 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2846 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2847 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2848 }
2849
6001e347 2850public:
f1339c56
RR
2851 // temporarily just use wxEncodingConverter stuff,
2852 // so that it works while a better implementation is built
86501081 2853 wxMBConv_wxwin(const char* name)
f1339c56
RR
2854 {
2855 if (name)
267e11c5 2856 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2857 else
2858 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2859
8b04d4c4
VZ
2860 Init();
2861 }
2862
e95354ec 2863 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2864 {
2865 m_enc = enc;
2866
2867 Init();
f1339c56 2868 }
dccce9ea 2869
bde4baac 2870 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2871 {
2872 size_t inbuf = strlen(psz);
dccce9ea 2873 if (buf)
c643a977 2874 {
ef199164 2875 if (!m2w.Convert(psz, buf))
467e0479 2876 return wxCONV_FAILED;
c643a977 2877 }
f1339c56
RR
2878 return inbuf;
2879 }
dccce9ea 2880
bde4baac 2881 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2882 {
f8d791e0 2883 const size_t inbuf = wxWcslen(psz);
f1339c56 2884 if (buf)
c643a977 2885 {
ef199164 2886 if (!w2m.Convert(psz, buf))
467e0479 2887 return wxCONV_FAILED;
c643a977 2888 }
dccce9ea 2889
f1339c56
RR
2890 return inbuf;
2891 }
dccce9ea 2892
7ef3ab50 2893 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2894 {
2895 switch ( m_enc )
2896 {
2897 case wxFONTENCODING_UTF16BE:
2898 case wxFONTENCODING_UTF16LE:
c1464d9d 2899 return 2;
eec47cc6
VZ
2900
2901 case wxFONTENCODING_UTF32BE:
2902 case wxFONTENCODING_UTF32LE:
c1464d9d 2903 return 4;
eec47cc6
VZ
2904
2905 default:
c1464d9d 2906 return 1;
eec47cc6
VZ
2907 }
2908 }
2909
d36c9347
VZ
2910 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2911
7ef3ab50
VZ
2912 bool IsOk() const { return m_ok; }
2913
2914public:
2915 wxFontEncoding m_enc;
2916 wxEncodingConverter m2w, w2m;
2917
2918private:
cafbf6fb
VZ
2919 // were we initialized successfully?
2920 bool m_ok;
fc7a2a60 2921
c0c133e1 2922 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2923};
6001e347 2924
8f115891 2925// make the constructors available for unit testing
86501081 2926WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2927{
2928 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2929 if ( !result->IsOk() )
2930 {
2931 delete result;
2932 return 0;
2933 }
ef199164 2934
8f115891
MW
2935 return result;
2936}
2937
1e6feb95
VZ
2938#endif // wxUSE_FONTMAP
2939
36acb880
VZ
2940// ============================================================================
2941// wxCSConv implementation
2942// ============================================================================
2943
8b04d4c4 2944void wxCSConv::Init()
6001e347 2945{
e95354ec
VZ
2946 m_name = NULL;
2947 m_convReal = NULL;
2948 m_deferred = true;
2949}
2950
86501081 2951wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2952{
2953 Init();
82713003 2954
86501081 2955 if ( !charset.empty() )
e95354ec 2956 {
86501081 2957 SetName(charset.ToAscii());
e95354ec 2958 }
bda3d86a 2959
e4277538
VZ
2960#if wxUSE_FONTMAP
2961 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
e3276230
VZ
2962 if ( m_encoding == wxFONTENCODING_MAX )
2963 {
2964 // set to unknown/invalid value
2965 m_encoding = wxFONTENCODING_SYSTEM;
2966 }
2967 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2968 {
2969 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2970 m_encoding = wxFONTENCODING_ISO8859_1;
2971 }
e4277538 2972#else
bda3d86a 2973 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2974#endif
6001e347
RR
2975}
2976
8b04d4c4
VZ
2977wxCSConv::wxCSConv(wxFontEncoding encoding)
2978{
bda3d86a 2979 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 2980 {
9a83f860 2981 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
2982
2983 encoding = wxFONTENCODING_SYSTEM;
2984 }
2985
8b04d4c4
VZ
2986 Init();
2987
bda3d86a 2988 m_encoding = encoding;
8b04d4c4
VZ
2989}
2990
6001e347
RR
2991wxCSConv::~wxCSConv()
2992{
65e50848
JS
2993 Clear();
2994}
2995
54380f29 2996wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2997 : wxMBConv()
54380f29 2998{
8b04d4c4
VZ
2999 Init();
3000
54380f29 3001 SetName(conv.m_name);
8b04d4c4 3002 m_encoding = conv.m_encoding;
54380f29
GD
3003}
3004
3005wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3006{
3007 Clear();
8b04d4c4 3008
54380f29 3009 SetName(conv.m_name);
8b04d4c4
VZ
3010 m_encoding = conv.m_encoding;
3011
54380f29
GD
3012 return *this;
3013}
3014
65e50848
JS
3015void wxCSConv::Clear()
3016{
8b04d4c4 3017 free(m_name);
e95354ec 3018 delete m_convReal;
8b04d4c4 3019
65e50848 3020 m_name = NULL;
e95354ec 3021 m_convReal = NULL;
6001e347
RR
3022}
3023
86501081 3024void wxCSConv::SetName(const char *charset)
6001e347 3025{
f1339c56
RR
3026 if (charset)
3027 {
d6f2a891 3028 m_name = wxStrdup(charset);
e95354ec 3029 m_deferred = true;
f1339c56 3030 }
6001e347
RR
3031}
3032
8b3eb85d 3033#if wxUSE_FONTMAP
8b3eb85d
VZ
3034
3035WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 3036 wxEncodingNameCache );
8b3eb85d
VZ
3037
3038static wxEncodingNameCache gs_nameCache;
3039#endif
3040
e95354ec
VZ
3041wxMBConv *wxCSConv::DoCreate() const
3042{
ce6f8d6f
VZ
3043#if wxUSE_FONTMAP
3044 wxLogTrace(TRACE_STRCONV,
3045 wxT("creating conversion for %s"),
3046 (m_name ? m_name
86501081 3047 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3048#endif // wxUSE_FONTMAP
3049
c547282d
VZ
3050 // check for the special case of ASCII or ISO8859-1 charset: as we have
3051 // special knowledge of it anyhow, we don't need to create a special
3052 // conversion object
e4277538
VZ
3053 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3054 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 3055 {
e95354ec
VZ
3056 // don't convert at all
3057 return NULL;
3058 }
dccce9ea 3059
e95354ec
VZ
3060 // we trust OS to do conversion better than we can so try external
3061 // conversion methods first
3062 //
3063 // the full order is:
3064 // 1. OS conversion (iconv() under Unix or Win32 API)
3065 // 2. hard coded conversions for UTF
3066 // 3. wxEncodingConverter as fall back
3067
3068 // step (1)
3069#ifdef HAVE_ICONV
c547282d 3070#if !wxUSE_FONTMAP
e95354ec 3071 if ( m_name )
c547282d 3072#endif // !wxUSE_FONTMAP
e95354ec 3073 {
3ef10cfc 3074#if wxUSE_FONTMAP
8b3eb85d 3075 wxFontEncoding encoding(m_encoding);
3ef10cfc 3076#endif
8b3eb85d 3077
86501081 3078 if ( m_name )
8b3eb85d 3079 {
86501081 3080 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3081 if ( conv->IsOk() )
3082 return conv;
3083
3084 delete conv;
c547282d
VZ
3085
3086#if wxUSE_FONTMAP
8b3eb85d 3087 encoding =
86501081 3088 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3089#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3090 }
3091#if wxUSE_FONTMAP
3092 {
3093 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3094 if ( it != gs_nameCache.end() )
3095 {
3096 if ( it->second.empty() )
3097 return NULL;
c547282d 3098
86501081 3099 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3100 if ( conv->IsOk() )
3101 return conv;
e95354ec 3102
8b3eb85d
VZ
3103 delete conv;
3104 }
3105
a243da29 3106 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3107 // CS : in case this does not return valid names (eg for MacRoman)
3108 // encoding got a 'failure' entry in the cache all the same,
3109 // although it just has to be created using a different method, so
3110 // only store failed iconv creation attempts (or perhaps we
3111 // shoulnd't do this at all ?)
3c67ec06 3112 if ( names[0] != NULL )
8b3eb85d 3113 {
3c67ec06 3114 for ( ; *names; ++names )
8b3eb85d 3115 {
86501081
VS
3116 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3117 // will need changes that will obsolete this
3118 wxString name(*names);
3119 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3120 if ( conv->IsOk() )
3121 {
3122 gs_nameCache[encoding] = *names;
3123 return conv;
3124 }
3125
3126 delete conv;
8b3eb85d
VZ
3127 }
3128
9a83f860 3129 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3130 }
8b3eb85d
VZ
3131 }
3132#endif // wxUSE_FONTMAP
e95354ec
VZ
3133 }
3134#endif // HAVE_ICONV
3135
3136#ifdef wxHAVE_WIN32_MB2WC
3137 {
7608a683 3138#if wxUSE_FONTMAP
e95354ec
VZ
3139 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3140 : new wxMBConv_win32(m_encoding);
3141 if ( conv->IsOk() )
3142 return conv;
3143
3144 delete conv;
7608a683
WS
3145#else
3146 return NULL;
3147#endif
e95354ec
VZ
3148 }
3149#endif // wxHAVE_WIN32_MB2WC
ef199164 3150
5c4ed98d 3151#ifdef __DARWIN__
f7e98dee 3152 {
6ff49cbc
DE
3153 // leave UTF16 and UTF32 to the built-ins of wx
3154 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3155 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3156 {
a6900d10 3157#if wxUSE_FONTMAP
5c4ed98d
DE
3158 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3159 : new wxMBConv_cf(m_encoding);
a6900d10 3160#else
5c4ed98d 3161 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3162#endif
ef199164 3163
f7e98dee 3164 if ( conv->IsOk() )
d775fa82
WS
3165 return conv;
3166
3167 delete conv;
3168 }
335d31e0 3169 }
5c4ed98d
DE
3170#endif // __DARWIN__
3171
e95354ec
VZ
3172 // step (2)
3173 wxFontEncoding enc = m_encoding;
3174#if wxUSE_FONTMAP
c547282d
VZ
3175 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3176 {
3177 // use "false" to suppress interactive dialogs -- we can be called from
3178 // anywhere and popping up a dialog from here is the last thing we want to
3179 // do
267e11c5 3180 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3181 }
e95354ec
VZ
3182#endif // wxUSE_FONTMAP
3183
3184 switch ( enc )
3185 {
3186 case wxFONTENCODING_UTF7:
3187 return new wxMBConvUTF7;
3188
3189 case wxFONTENCODING_UTF8:
3190 return new wxMBConvUTF8;
3191
e95354ec
VZ
3192 case wxFONTENCODING_UTF16BE:
3193 return new wxMBConvUTF16BE;
3194
3195 case wxFONTENCODING_UTF16LE:
3196 return new wxMBConvUTF16LE;
3197
e95354ec
VZ
3198 case wxFONTENCODING_UTF32BE:
3199 return new wxMBConvUTF32BE;
3200
3201 case wxFONTENCODING_UTF32LE:
3202 return new wxMBConvUTF32LE;
3203
3204 default:
3205 // nothing to do but put here to suppress gcc warnings
ef199164 3206 break;
e95354ec
VZ
3207 }
3208
3209 // step (3)
3210#if wxUSE_FONTMAP
3211 {
3212 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3213 : new wxMBConv_wxwin(m_encoding);
3214 if ( conv->IsOk() )
3215 return conv;
3216
3217 delete conv;
3218 }
ef199164 3219
3df31b2d
VZ
3220 wxLogTrace(TRACE_STRCONV,
3221 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3222 (m_name ? wxString(m_name)
3df31b2d
VZ
3223 : wxFontMapperBase::GetEncodingName(m_encoding)));
3224#endif // wxUSE_FONTMAP
e95354ec
VZ
3225
3226 return NULL;
3227}
3228
3229void wxCSConv::CreateConvIfNeeded() const
3230{
3231 if ( m_deferred )
3232 {
3233 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 3234
bda3d86a
VZ
3235 // if we don't have neither the name nor the encoding, use the default
3236 // encoding for this system
3237 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3238 {
4c75209f 3239#if wxUSE_INTL
02c7347b 3240 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3241#else
3242 // fallback to some reasonable default:
3243 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3244#endif // wxUSE_INTL
4c75209f 3245 }
bda3d86a 3246
e95354ec
VZ
3247 self->m_convReal = DoCreate();
3248 self->m_deferred = false;
6001e347 3249 }
6001e347
RR
3250}
3251
0f0298b1
VZ
3252bool wxCSConv::IsOk() const
3253{
3254 CreateConvIfNeeded();
3255
3256 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3257 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3258 return true; // always ok as we do it ourselves
3259
3260 // m_convReal->IsOk() is called at its own creation, so we know it must
3261 // be ok if m_convReal is non-NULL
3262 return m_convReal != NULL;
3263}
3264
1c714a5d
VZ
3265size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3266 const char *src, size_t srcLen) const
3267{
3268 CreateConvIfNeeded();
3269
2c74c558
VS
3270 if (m_convReal)
3271 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3272
3273 // latin-1 (direct)
05392dc8
VZ
3274 if ( srcLen == wxNO_LEN )
3275 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3276
05392dc8
VZ
3277 if ( dst )
3278 {
3279 if ( dstLen < srcLen )
3280 return wxCONV_FAILED;
1c714a5d 3281
05392dc8
VZ
3282 for ( size_t n = 0; n < srcLen; n++ )
3283 dst[n] = (unsigned char)(src[n]);
3284 }
2c74c558 3285
05392dc8 3286 return srcLen;
1c714a5d
VZ
3287}
3288
05392dc8
VZ
3289size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3290 const wchar_t *src, size_t srcLen) const
6001e347 3291{
e95354ec 3292 CreateConvIfNeeded();
dccce9ea 3293
e95354ec 3294 if (m_convReal)
05392dc8 3295 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3296
3297 // latin-1 (direct)
05392dc8
VZ
3298 if ( srcLen == wxNO_LEN )
3299 srcLen = wxWcslen(src) + 1;
dccce9ea 3300
05392dc8 3301 if ( dst )
f1339c56 3302 {
05392dc8
VZ
3303 if ( dstLen < srcLen )
3304 return wxCONV_FAILED;
1cd52418 3305
05392dc8 3306 for ( size_t n = 0; n < srcLen; n++ )
24642831 3307 {
05392dc8 3308 if ( src[n] > 0xFF )
467e0479 3309 return wxCONV_FAILED;
ef199164 3310
05392dc8 3311 dst[n] = (char)src[n];
24642831 3312 }
05392dc8 3313
24642831 3314 }
05392dc8 3315 else // still need to check the input validity
24642831 3316 {
05392dc8 3317 for ( size_t n = 0; n < srcLen; n++ )
24642831 3318 {
05392dc8 3319 if ( src[n] > 0xFF )
467e0479 3320 return wxCONV_FAILED;
24642831 3321 }
f1339c56 3322 }
dccce9ea 3323
05392dc8 3324 return srcLen;
6001e347
RR
3325}
3326
7ef3ab50 3327size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3328{
3329 CreateConvIfNeeded();
3330
3331 if ( m_convReal )
3332 {
7ef3ab50 3333 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3334 }
3335
ba98e032 3336 // otherwise, we are ISO-8859-1
c1464d9d 3337 return 1;
eec47cc6
VZ
3338}
3339
#if wxUSE_UNICODE_UTF8
// Return whether the underlying converter reports itself as UTF-8.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without a real converter we are ISO-8859-1, which is not UTF-8
    return m_convReal != NULL && m_convReal->IsUTF8();
}
#endif
3354
69c928ef
VZ
3355
3356#if wxUSE_UNICODE
3357
3358wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3359{
3360 if ( !s )
3361 return wxWCharBuffer();
3362
3363 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3364 if ( !wbuf )
5487ff0f 3365 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3366 if ( !wbuf )
3367 wbuf = wxConvISO8859_1.cMB2WX(s);
3368
3369 return wbuf;
3370}
3371
3372wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3373{
3374 if ( !ws )
3375 return wxCharBuffer();
3376
3377 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3378 if ( !buf )
3379 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3380
3381 return buf;
3382}
3383
3384#endif // wxUSE_UNICODE
f5a1953b 3385
1e50d914
VS
3386// ----------------------------------------------------------------------------
3387// globals
3388// ----------------------------------------------------------------------------
3389
3390// NB: The reason why we create converted objects in this convoluted way,
3391// using a factory function instead of global variable, is that they
3392// may be used at static initialization time (some of them are used by
3393// wxString ctors and there may be a global wxString object). In other
3394// words, possibly _before_ the converter global object would be
3395// initialized.
3396
3397#undef wxConvLibc
3398#undef wxConvUTF8
3399#undef wxConvUTF7
3400#undef wxConvLocal
3401#undef wxConvISO8859_1
3402
3403#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3404 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3405 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3406 { \
3407 static impl_klass name##Obj ctor_args; \
3408 return &name##Obj; \
3409 } \
3410 /* this ensures that all global converter objects are created */ \
3411 /* by the time static initialization is done, i.e. before any */ \
3412 /* thread is launched: */ \
3413 static klass* gs_##name##instance = wxGet_##name##Ptr()
3414
3415#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3416 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3417
5c69ef61
VZ
3418#ifdef __INTELC__
3419 // disable warning "variable 'xxx' was declared but never referenced"
3420 #pragma warning(disable: 177)
3421#endif // Intel C++
3422
1e50d914
VS
3423#ifdef __WINDOWS__
3424 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3425#elif 0 // defined(__WXOSX__)
3426 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3427#else
3428 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3429#endif
3430
e1079eda
VZ
3431// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3432// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3433// provokes an error message about "not enough macro parameters"; and we
3434// can't use "()" here as the name##Obj declaration would be parsed as a
3435// function declaration then, so use a semicolon and live with an extra
3436// empty statement (and hope that no compilers warns about this)
3437WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3438WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3439
3440WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3441WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3442
3443WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3444WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3445
6ac84a78
DE
3446#ifdef __DARWIN__
3447// The xnu kernel always communicates file paths in decomposed UTF-8.
3448// WARNING: Are we sure that CFString's conversion will cause decomposition?
3449static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3450#endif
6ac84a78 3451
1e50d914 3452WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3453#ifdef __DARWIN__
1e50d914 3454 &wxConvMacUTF8DObj;
6ac84a78 3455#else // !__DARWIN__
1e50d914 3456 wxGet_wxConvLibcPtr();
6ac84a78 3457#endif // __DARWIN__/!__DARWIN__
1e50d914 3458
bde4baac
VZ
3459#else // !wxUSE_WCHAR_T
3460
1e50d914 3461// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3462// stand-ins in absence of wchar_t
3463WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3464 wxConvISO8859_1,
3465 wxConvLocal,
3466 wxConvUTF8;
3467
3468#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T