]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Correctly restore the originally used C locale in wxLocale dtor.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
1c193821 31#ifndef __WXWINCE__
1cd52418 32#include <errno.h>
1c193821
JS
33#endif
34
6001e347
RR
35#include <ctype.h>
36#include <string.h>
37#include <stdlib.h>
38
e95354ec 39#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
e95354ec 42 #define wxHAVE_WIN32_MB2WC
ef199164 43#endif
e95354ec 44
b040e242 45#ifdef HAVE_ICONV
373658eb 46 #include <iconv.h>
b1d547eb 47 #include "wx/thread.h"
1cd52418 48#endif
1cd52418 49
373658eb
VZ
50#include "wx/encconv.h"
51#include "wx/fontmap.h"
52
5c4ed98d 53#ifdef __DARWIN__
c933e267 54#include "wx/osx/core/private/strconv_cf.h"
5c4ed98d
DE
55#endif //def __DARWIN__
56
ef199164 57
9a83f860 58#define TRACE_STRCONV wxT("strconv")
ce6f8d6f 59
467e0479
VZ
60// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61// be 4 bytes
4948c2b6 62#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
63 #define WC_UTF16
64#endif
65
ef199164 66
373658eb
VZ
67// ============================================================================
68// implementation
69// ============================================================================
70
69373110
VZ
// Helper of wxMBConv::ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are NUL (which is
// trivially the case when n is 0).
//
// Note the polarity: a true result means "this is NOT a terminator".
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
79
373658eb 80// ----------------------------------------------------------------------------
467e0479 81// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 82// ----------------------------------------------------------------------------
6001e347 83
c91830cb 84static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 85{
ef199164 86 if (input <= 0xffff)
4def3b35 87 {
999836aa
VZ
88 if (output)
89 *output = (wxUint16) input;
ef199164 90
4def3b35 91 return 1;
dccce9ea 92 }
ef199164 93 else if (input >= 0x110000)
4def3b35 94 {
467e0479 95 return wxCONV_FAILED;
dccce9ea
VZ
96 }
97 else
4def3b35 98 {
dccce9ea 99 if (output)
4def3b35 100 {
ef199164
DS
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 103 }
ef199164 104
4def3b35 105 return 2;
1cd52418 106 }
1cd52418
OK
107}
108
c91830cb 109static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 110{
ef199164 111 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
112 {
113 output = *input;
114 return 1;
dccce9ea 115 }
ef199164 116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
117 {
118 output = *input;
467e0479 119 return wxCONV_FAILED;
dccce9ea
VZ
120 }
121 else
4def3b35
VS
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
1cd52418
OK
126}
127
467e0479 128#ifdef WC_UTF16
35d11700
VZ
129 typedef wchar_t wxDecodeSurrogate_t;
130#else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
133
134// returns the next UTF-32 character from the wchar_t buffer and advances the
135// pointer to the character after this one
136//
137// if an invalid character is found, *pSrc is set to NULL, the caller must
138// check for this
35d11700 139static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
140{
141 wxUint32 out;
8d3dd069 142 const size_t
5c33522f 143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
467e0479
VZ
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150}
151
f6bcfd97 152// ----------------------------------------------------------------------------
6001e347 153// wxMBConv
f6bcfd97 154// ----------------------------------------------------------------------------
2c53a80a 155
483b0434
VZ
156size_t
157wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
158 const char *src, size_t srcLen) const
6001e347 159{
483b0434 160 // although new conversion classes are supposed to implement this function
36f93678 161 // directly, the existing ones only implement the old MB2WC() and so, to
483b0434
VZ
162 // avoid to have to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
36f93678
VZ
166 //
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
6001e347 170
483b0434
VZ
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten = 0;
eec47cc6 173
c1464d9d 174 // the number of NULs terminating this string
a78c43f1 175 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 176
c1464d9d
VZ
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
180 // NULs at the end
483b0434
VZ
181 wxCharBuffer bufTmp;
182 const char *srcEnd;
467e0479 183 if ( srcLen != wxNO_LEN )
eec47cc6 184 {
c1464d9d 185 // we need to know how to find the end of this string
7ef3ab50 186 nulLen = GetMBNulLen();
483b0434
VZ
187 if ( nulLen == wxCONV_FAILED )
188 return wxCONV_FAILED;
e4e3bbb4 189
c1464d9d 190 // if there are enough NULs we can avoid the copy
483b0434 191 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
192 {
193 // make a copy in order to properly NUL-terminate the string
483b0434 194 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 195 char * const p = bufTmp.data();
483b0434
VZ
196 memcpy(p, src, srcLen);
197 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 198 *s = '\0';
483b0434
VZ
199
200 src = bufTmp;
eec47cc6 201 }
e4e3bbb4 202
483b0434
VZ
203 srcEnd = src + srcLen;
204 }
205 else // quit after the first loop iteration
206 {
207 srcEnd = NULL;
208 }
e4e3bbb4 209
36f93678
VZ
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
212 // with the result
213 //
214 // all the complication come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
bbb0ff36 217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
36f93678
VZ
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
220 // string
221 //
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
483b0434 225 for ( ;; )
eec47cc6 226 {
c1464d9d 227 // try to convert the current chunk
483b0434 228 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
229 if ( lenChunk == wxCONV_FAILED )
230 return wxCONV_FAILED;
e4e3bbb4 231
483b0434 232 dstWritten += lenChunk;
f6a02087
VZ
233 if ( !srcEnd )
234 dstWritten++;
f5fb6871 235
f6a02087 236 if ( !lenChunk )
467e0479
VZ
237 {
238 // nothing left in the input string, conversion succeeded
239 break;
240 }
241
483b0434
VZ
242 if ( dst )
243 {
244 if ( dstWritten > dstLen )
245 return wxCONV_FAILED;
246
f6a02087
VZ
247 // +1 is for trailing NUL
248 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
483b0434
VZ
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
f6a02087
VZ
252 if ( !srcEnd )
253 dst++;
483b0434 254 }
c1464d9d 255
483b0434 256 if ( !srcEnd )
c1464d9d 257 {
467e0479 258 // we convert just one chunk in this case as this is the entire
bbb0ff36 259 // string anyhow (and we don't count the trailing NUL in this case)
c1464d9d
VZ
260 break;
261 }
eec47cc6 262
bbb0ff36
VZ
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
483b0434 266 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
483b0434 272 src += nulLen;
c1464d9d 273 }
e4e3bbb4 274
bbb0ff36
VZ
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
277 if ( src == srcEnd )
278 break;
279
280 // do count this terminator as it's inside the buffer we convert
281 dstWritten++;
282 if ( dst )
283 dst++;
284
285 src += nulLen; // skip the terminator itself
c1464d9d 286
483b0434 287 if ( src >= srcEnd )
c1464d9d
VZ
288 break;
289 }
290
483b0434 291 return dstWritten;
e4e3bbb4
RN
292}
293
483b0434
VZ
294size_t
295wxMBConv::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
e4e3bbb4 297{
483b0434
VZ
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten = 0;
e4e3bbb4 300
f6a02087
VZ
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated = srcLen == wxNO_LEN;
305
eec47cc6
VZ
306 // make a copy of the input string unless it is already properly
307 // NUL-terminated
eec47cc6 308 wxWCharBuffer bufTmp;
f6a02087 309 if ( isNulTerminated )
e4e3bbb4 310 {
483b0434 311 srcLen = wxWcslen(src) + 1;
eec47cc6 312 }
483b0434 313 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
314 {
315 // make a copy in order to properly NUL-terminate the string
483b0434 316 bufTmp = wxWCharBuffer(srcLen);
ef199164 317 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
318 src = bufTmp;
319 }
320
321 const size_t lenNul = GetMBNulLen();
322 for ( const wchar_t * const srcEnd = src + srcLen;
323 src < srcEnd;
27307233 324 src++ /* skip L'\0' too */ )
483b0434
VZ
325 {
326 // try to convert the current chunk
327 size_t lenChunk = WC2MB(NULL, src, 0);
483b0434
VZ
328 if ( lenChunk == wxCONV_FAILED )
329 return wxCONV_FAILED;
330
483b0434 331 dstWritten += lenChunk;
27307233
VZ
332
333 const wchar_t * const
334 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
335
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
338 // ourselves above
339 if ( chunkEnd < srcEnd )
f6a02087 340 dstWritten += lenNul;
483b0434
VZ
341
342 if ( dst )
343 {
344 if ( dstWritten > dstLen )
345 return wxCONV_FAILED;
346
27307233
VZ
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
351 wxCharBuffer dstBuf;
352 char *dstTmp;
353 if ( chunkEnd == srcEnd )
354 {
355 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
356 dstTmp = dstBuf.data();
357 }
358 else
359 {
360 dstTmp = dst;
361 }
362
363 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
483b0434
VZ
364 return wxCONV_FAILED;
365
27307233
VZ
366 if ( dstTmp != dst )
367 {
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst, dstTmp, lenChunk);
371
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
374 break;
375 }
376
483b0434 377 dst += lenChunk;
27307233 378 if ( chunkEnd < srcEnd )
f6a02087 379 dst += lenNul;
483b0434 380 }
27307233
VZ
381
382 src = chunkEnd;
eec47cc6 383 }
e4e3bbb4 384
483b0434
VZ
385 return dstWritten;
386}
387
ef199164 388size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 389{
51725fc0 390 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 391 if ( rc != wxCONV_FAILED )
509da451
VZ
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399}
400
ef199164 401size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 402{
51725fc0 403 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 404 if ( rc != wxCONV_FAILED )
509da451 405 {
51725fc0 406 rc -= GetMBNulLen();
509da451
VZ
407 }
408
409 return rc;
410}
411
483b0434
VZ
412wxMBConv::~wxMBConv()
413{
414 // nothing to do here (necessary for Darwin linking probably)
415}
e4e3bbb4 416
483b0434
VZ
417const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418{
419 if ( psz )
eec47cc6 420 {
483b0434 421 // calculate the length of the buffer needed first
a2db25a1 422 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 423 if ( nLen != wxCONV_FAILED )
f5fb6871 424 {
483b0434 425 // now do the actual conversion
a2db25a1 426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 427
483b0434 428 // +1 for the trailing NULL
a2db25a1 429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 430 return buf;
f5fb6871 431 }
483b0434 432 }
e4e3bbb4 433
483b0434
VZ
434 return wxWCharBuffer();
435}
3698ae71 436
483b0434
VZ
437const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438{
439 if ( pwz )
440 {
a2db25a1 441 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 442 if ( nLen != wxCONV_FAILED )
483b0434 443 {
a2db25a1
VZ
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451}
e4e3bbb4 452
483b0434 453const wxWCharBuffer
ef199164 454wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 455{
ef199164 456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 457 if ( dstLen != wxCONV_FAILED )
483b0434 458 {
0dd13d21
VZ
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
ef199164 463 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
464 {
465 if ( outLen )
467e0479
VZ
466 {
467 *outLen = dstLen;
f6a02087
VZ
468
469 // we also need to handle NUL-terminated input strings
470 // specially: for them the output is the length of the string
471 // excluding the trailing NUL, however if we're asked to
472 // convert a specific number of characters we return the length
473 // of the resulting output even if it's NUL-terminated
474 if ( inLen == wxNO_LEN )
467e0479
VZ
475 (*outLen)--;
476 }
477
483b0434
VZ
478 return wbuf;
479 }
480 }
481
482 if ( outLen )
483 *outLen = 0;
484
485 return wxWCharBuffer();
486}
487
488const wxCharBuffer
ef199164 489wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 490{
13d92ad6 491 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 492 if ( dstLen != wxCONV_FAILED )
483b0434 493 {
0dd13d21
VZ
494 const size_t nulLen = GetMBNulLen();
495
496 // as above, ensure that the buffer is always NUL-terminated, even if
497 // the input is not
498 wxCharBuffer buf(dstLen + nulLen - 1);
499 memset(buf.data() + dstLen, 0, nulLen);
ef199164 500 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
501 {
502 if ( outLen )
467e0479
VZ
503 {
504 *outLen = dstLen;
505
f6a02087 506 if ( inLen == wxNO_LEN )
467e0479 507 {
f6a02087
VZ
508 // in this case both input and output are NUL-terminated
509 // and we're not supposed to count NUL
13d92ad6 510 *outLen -= nulLen;
467e0479
VZ
511 }
512 }
d32a507d 513
483b0434
VZ
514 return buf;
515 }
e4e3bbb4
RN
516 }
517
eec47cc6
VZ
518 if ( outLen )
519 *outLen = 0;
520
521 return wxCharBuffer();
e4e3bbb4
RN
522}
523
40ac5040
VZ
524const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
525{
526 const size_t srcLen = buf.length();
527 if ( srcLen )
528 {
529 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
530 if ( dstLen != wxCONV_FAILED )
531 {
532 wxWCharBuffer wbuf(dstLen);
533 wbuf.data()[dstLen] = L'\0';
534 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
535 return wbuf;
536 }
537 }
538
cfcfada9 539 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
40ac5040
VZ
540}
541
542const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
543{
544 const size_t srcLen = wbuf.length();
545 if ( srcLen )
546 {
547 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
548 if ( dstLen != wxCONV_FAILED )
549 {
550 wxCharBuffer buf(dstLen);
551 buf.data()[dstLen] = '\0';
552 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
553 return buf;
554 }
555 }
556
cfcfada9 557 return wxScopedCharBuffer::CreateNonOwned("", 0);
40ac5040
VZ
558}
559
6001e347 560// ----------------------------------------------------------------------------
bde4baac 561// wxMBConvLibc
6001e347
RR
562// ----------------------------------------------------------------------------
563
bde4baac
VZ
564size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
565{
566 return wxMB2WC(buf, psz, n);
567}
568
569size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
570{
571 return wxWC2MB(buf, psz, n);
572}
e1bfe89e
RR
573
574// ----------------------------------------------------------------------------
532d575b 575// wxConvBrokenFileNames
e1bfe89e
RR
576// ----------------------------------------------------------------------------
577
eec47cc6
VZ
578#ifdef __UNIX__
579
86501081 580wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 581{
9a83f860
VZ
582 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
583 wxStricmp(charset, wxT("UTF8")) == 0 )
5deedd6e 584 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
585 else
586 m_conv = new wxCSConv(charset);
ea8ce907
RR
587}
588
eec47cc6 589#endif // __UNIX__
c12b7f79 590
bde4baac 591// ----------------------------------------------------------------------------
3698ae71 592// UTF-7
bde4baac 593// ----------------------------------------------------------------------------
6001e347 594
15f2ee32 595// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
596//
597// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 598
15f2ee32
RN
599//
600// BASE64 decoding table
601//
// Maps a byte to its 6 bit value in the (modified) BASE64 alphabet used by
// UTF-7 or to 0xff if the byte is not part of this alphabet.
//
// The table is indexed by unsigned char values and so must have exactly 256
// elements: the size is given explicitly to let the compiler catch an extra
// initializer.
static const unsigned char utf7unb64[256] =
{
    // 0x00..0x27: not in the alphabet
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x28..0x2f: '+' is 62 and '/' is 63
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: '0'..'9' are 52..61
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: 'A'..'Z' are 0..25
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: 'a'..'z' are 26..51
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: not in the alphabet
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
637
9d653e81
VZ
638size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
639 const char *src, size_t srcLen) const
15f2ee32 640{
9d653e81 641 DecoderState stateOrig,
852dcba5 642 *statePtr;
9d653e81
VZ
643 if ( srcLen == wxNO_LEN )
644 {
645 // convert the entire string, up to and including the trailing NUL
646 srcLen = strlen(src) + 1;
647
648 // when working on the entire strings we don't update nor use the shift
649 // state from the previous call
650 statePtr = &stateOrig;
651 }
652 else // when working with partial strings we do use the shift state
653 {
5c33522f 654 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
9d653e81
VZ
655
656 // also save the old state to be able to rollback to it on error
657 stateOrig = m_stateDecoder;
658 }
659
660 // but to simplify the code below we use this variable in both cases
661 DecoderState& state = *statePtr;
662
663
664 // number of characters [which would have been] written to dst [if it were
665 // not NULL]
15f2ee32
RN
666 size_t len = 0;
667
9d653e81
VZ
668 const char * const srcEnd = src + srcLen;
669
670 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 671 {
9d653e81
VZ
672 const unsigned char cc = *src++;
673
674 if ( state.IsShifted() )
15f2ee32 675 {
9d653e81
VZ
676 const unsigned char dc = utf7unb64[cc];
677 if ( dc == 0xff )
15f2ee32 678 {
ccaa848d
VZ
679 // end of encoded part, check that nothing was left: there can
680 // be up to 4 bits of 0 padding but nothing else (we also need
681 // to check isLSB as we count bits modulo 8 while a valid UTF-7
682 // encoded sequence must contain an integral number of UTF-16
683 // characters)
684 if ( state.isLSB || state.bit > 4 ||
685 (state.accum & ((1 << state.bit) - 1)) )
686 {
687 if ( !len )
688 state = stateOrig;
689
852dcba5 690 return wxCONV_FAILED;
ccaa848d 691 }
852dcba5 692
9d653e81
VZ
693 state.ToDirect();
694
695 // re-parse this character normally below unless it's '-' which
696 // is consumed by the decoder
697 if ( cc == '-' )
698 continue;
699 }
700 else // valid encoded character
701 {
702 // mini base64 decoder: each character is 6 bits
703 state.bit += 6;
704 state.accum <<= 6;
705 state.accum += dc;
706
707 if ( state.bit >= 8 )
15f2ee32 708 {
9d653e81
VZ
709 // got the full byte, consume it
710 state.bit -= 8;
711 unsigned char b = (state.accum >> state.bit) & 0x00ff;
712
713 if ( state.isLSB )
15f2ee32 714 {
9d653e81
VZ
715 // we've got the full word, output it
716 if ( dst )
717 *dst++ = (state.msb << 8) | b;
718 len++;
719 state.isLSB = false;
15f2ee32 720 }
9d653e81 721 else // MSB
04a37834 722 {
9d653e81
VZ
723 // just store it while we wait for LSB
724 state.msb = b;
725 state.isLSB = true;
04a37834 726 }
15f2ee32
RN
727 }
728 }
9d653e81 729 }
04a37834 730
9d653e81
VZ
731 if ( state.IsDirect() )
732 {
733 // start of an encoded segment?
734 if ( cc == '+' )
04a37834 735 {
9d653e81
VZ
736 if ( *src == '-' )
737 {
738 // just the encoded plus sign, don't switch to shifted mode
739 if ( dst )
740 *dst++ = '+';
741 len++;
742 src++;
743 }
ccaa848d
VZ
744 else if ( utf7unb64[(unsigned)*src] == 0xff )
745 {
746 // empty encoded chunks are not allowed
747 if ( !len )
748 state = stateOrig;
749
750 return wxCONV_FAILED;
751 }
752 else // base-64 encoded chunk follows
9d653e81
VZ
753 {
754 state.ToShifted();
755 }
756 }
757 else // not '+'
758 {
759 // only printable 7 bit ASCII characters (with the exception of
760 // NUL, TAB, CR and LF) can be used directly
761 if ( cc >= 0x7f || (cc < ' ' &&
762 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
763 return wxCONV_FAILED;
764
765 if ( dst )
766 *dst++ = cc;
767 len++;
768 }
15f2ee32
RN
769 }
770 }
04a37834 771
9d653e81
VZ
772 if ( !len )
773 {
774 // as we didn't read any characters we should be called with the same
775 // data (followed by some more new data) again later so don't save our
776 // state
777 state = stateOrig;
778
779 return wxCONV_FAILED;
780 }
04a37834 781
15f2ee32 782 return len;
6001e347
RR
783}
784
15f2ee32
RN
785//
786// BASE64 encoding table
787//
// Maps a 6 bit value to the character representing it in the BASE64 alphabet.
//
// The size is given explicitly, as for the other tables in this file, to let
// the compiler catch an extra initializer.
static const unsigned char utf7enb64[64] =
{
    //  0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z',
    // 26..51
              'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z',
    // 52..63
                        '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
799
800//
801// UTF-7 encoding table
802//
803// 0 - Set D (directly encoded characters)
804// 1 - Set O (optional direct characters)
805// 2 - whitespace characters (optional)
806// 3 - special characters
807//
// class of each 7 bit character, see the list of the classes above the table
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 70..7F
};

// Returns true if wc can be output as is, i.e. if it is a 7 bit character of
// class 0 in the table above.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms, so the value must be
    // converted to an unsigned type before the range check: a negative value
    // would otherwise pass it and be used as an index before the start of the
    // table
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
824
825size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
826 const wchar_t *src, size_t srcLen) const
15f2ee32 827{
9d653e81
VZ
828 EncoderState stateOrig,
829 *statePtr;
830 if ( srcLen == wxNO_LEN )
831 {
832 // we don't apply the stored state when operating on entire strings at
833 // once
834 statePtr = &stateOrig;
835
836 srcLen = wxWcslen(src) + 1;
837 }
838 else // do use the mode we left the output in previously
839 {
840 stateOrig = m_stateEncoder;
5c33522f 841 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
9d653e81
VZ
842 }
843
844 EncoderState& state = *statePtr;
845
846
15f2ee32
RN
847 size_t len = 0;
848
9d653e81
VZ
849 const wchar_t * const srcEnd = src + srcLen;
850 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 851 {
9d653e81
VZ
852 wchar_t cc = *src++;
853 if ( wxIsUTF7Direct(cc) )
15f2ee32 854 {
9d653e81
VZ
855 if ( state.IsShifted() )
856 {
857 // pad with zeros the last encoded block if necessary
858 if ( state.bit )
859 {
860 if ( dst )
861 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
862 len++;
863 }
ef199164 864
9d653e81
VZ
865 state.ToDirect();
866
867 if ( dst )
868 *dst++ = '-';
869 len++;
870 }
871
872 if ( dst )
873 *dst++ = (char)cc;
15f2ee32
RN
874 len++;
875 }
9d653e81
VZ
876 else if ( cc == '+' && state.IsDirect() )
877 {
878 if ( dst )
879 {
880 *dst++ = '+';
881 *dst++ = '-';
882 }
883
884 len += 2;
885 }
15f2ee32 886#ifndef WC_UTF16
79c78d42 887 else if (((wxUint32)cc) > 0xffff)
b2c13097 888 {
15f2ee32 889 // no surrogate pair generation (yet?)
467e0479 890 return wxCONV_FAILED;
15f2ee32
RN
891 }
892#endif
893 else
894 {
9d653e81
VZ
895 if ( state.IsDirect() )
896 {
897 state.ToShifted();
ef199164 898
9d653e81
VZ
899 if ( dst )
900 *dst++ = '+';
901 len++;
902 }
903
904 // BASE64 encode string
905 for ( ;; )
15f2ee32 906 {
9d653e81 907 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 908 {
9d653e81
VZ
909 state.accum <<= 8;
910 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
911
912 for (state.bit += 8; state.bit >= 6; )
15f2ee32 913 {
9d653e81
VZ
914 state.bit -= 6;
915 if ( dst )
916 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
917 len++;
15f2ee32 918 }
15f2ee32 919 }
ef199164 920
9d653e81
VZ
921 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
922 break;
ef199164 923
9d653e81 924 src++;
15f2ee32 925 }
15f2ee32
RN
926 }
927 }
ef199164 928
9d653e81
VZ
929 // we need to restore the original encoder state if we were called just to
930 // calculate the amount of space needed as we will presumably be called
931 // again to really convert the data now
932 if ( !dst )
933 state = stateOrig;
ef199164 934
15f2ee32 935 return len;
6001e347
RR
936}
937
f6bcfd97 938// ----------------------------------------------------------------------------
6001e347 939// UTF-8
f6bcfd97 940// ----------------------------------------------------------------------------
6001e347 941
1774c3c5 942static const wxUint32 utf8_max[]=
4def3b35 943 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 944
3698ae71
VZ
945// boundaries of the private use area we use to (temporarily) remap invalid
946// characters invalid in a UTF-8 encoded string
ea8ce907
RR
947const wxUint32 wxUnicodePUA = 0x100000;
948const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
949
0286d08d 950// this table gives the length of the UTF-8 encoding from its first character:
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded as themselves
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0, C1: invalid too
    0, 0,

    // C2..DF: first byte of a 2 byte sequence
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: first byte of a 3 byte sequence
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4: first byte of a 4 byte sequence
    4, 4, 4, 4, 4,

    // F5..FF: invalid again, these bytes would start either 5 or 6 byte
    // sequences or sequences encoding code points above U+10FFFF, neither of
    // which is allowed by RFC 3629
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
984
985size_t
986wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
987 const char *src, size_t srcLen) const
988{
989 wchar_t *out = dstLen ? dst : NULL;
990 size_t written = 0;
991
992 if ( srcLen == wxNO_LEN )
993 srcLen = strlen(src) + 1;
994
995 for ( const char *p = src; ; p++ )
996 {
0dcbb107 997 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
0286d08d
VZ
998 {
999 // all done successfully, just add the trailing NULL if we are not
1000 // using explicit length
1001 if ( srcLen == wxNO_LEN )
1002 {
1003 if ( out )
1004 {
1005 if ( !dstLen )
1006 break;
1007
1008 *out = L'\0';
1009 }
1010
1011 written++;
1012 }
1013
1014 return written;
1015 }
1016
0286d08d
VZ
1017 if ( out && !dstLen-- )
1018 break;
1019
5367a38a
VS
1020 wxUint32 code;
1021 unsigned char c = *p;
0286d08d 1022
5367a38a
VS
1023 if ( c < 0x80 )
1024 {
1025 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1026 break;
0286d08d 1027
5367a38a
VS
1028 if ( srcLen != wxNO_LEN )
1029 srcLen--;
0286d08d 1030
5367a38a
VS
1031 code = c;
1032 }
1033 else
0286d08d 1034 {
5367a38a
VS
1035 unsigned len = tableUtf8Lengths[c];
1036 if ( !len )
1037 break;
1038
1039 if ( srcLen < len ) // the test works for wxNO_LEN too
1040 break;
1041
1042 if ( srcLen != wxNO_LEN )
1043 srcLen -= len;
1044
1045 // Char. number range | UTF-8 octet sequence
1046 // (hexadecimal) | (binary)
1047 // ----------------------+----------------------------------------
1048 // 0000 0000 - 0000 007F | 0xxxxxxx
1049 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1050 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1051 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1052 //
1053 // Code point value is stored in bits marked with 'x',
1054 // lowest-order bit of the value on the right side in the diagram
1055 // above. (from RFC 3629)
1056
1057 // mask to extract lead byte's value ('x' bits above), by sequence
1058 // length:
1059 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1060
1061 // mask and value of lead byte's most significant bits, by length:
1062 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1063 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1064
1065 len--; // it's more convenient to work with 0-based length here
1066
1067 // extract the lead byte's value bits:
1068 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1069 break;
1070
1071 code = c & leadValueMask[len];
1072
1073 // all remaining bytes, if any, are handled in the same way
1074 // regardless of sequence's length:
1075 for ( ; len; --len )
1076 {
1077 c = *++p;
1078 if ( (c & 0xC0) != 0x80 )
1079 return wxCONV_FAILED;
0286d08d 1080
5367a38a
VS
1081 code <<= 6;
1082 code |= c & 0x3F;
1083 }
0286d08d
VZ
1084 }
1085
1086#ifdef WC_UTF16
1087 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1088 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1089 {
1090 if ( out )
1091 out++;
1092 written++;
1093 }
1094#else // !WC_UTF16
1095 if ( out )
1096 *out = code;
1097#endif // WC_UTF16/!WC_UTF16
1098
1099 if ( out )
1100 out++;
1101
1102 written++;
1103 }
1104
1105 return wxCONV_FAILED;
1106}
1107
1108size_t
1109wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1110 const wchar_t *src, size_t srcLen) const
1111{
1112 char *out = dstLen ? dst : NULL;
1113 size_t written = 0;
1114
1115 for ( const wchar_t *wp = src; ; wp++ )
1116 {
0dcbb107 1117 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
0286d08d
VZ
1118 {
1119 // all done successfully, just add the trailing NULL if we are not
1120 // using explicit length
1121 if ( srcLen == wxNO_LEN )
1122 {
1123 if ( out )
1124 {
1125 if ( !dstLen )
1126 break;
1127
1128 *out = '\0';
1129 }
1130
1131 written++;
1132 }
1133
1134 return written;
1135 }
1136
a964d3ed
VZ
1137 if ( srcLen != wxNO_LEN )
1138 srcLen--;
0286d08d
VZ
1139
1140 wxUint32 code;
1141#ifdef WC_UTF16
1142 // cast is ok for WC_UTF16
1143 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1144 {
1145 // skip the next char too as we decoded a surrogate
1146 wp++;
041e6050
VZ
1147 if ( srcLen != wxNO_LEN )
1148 srcLen--;
0286d08d
VZ
1149 }
1150#else // wchar_t is UTF-32
1151 code = *wp & 0x7fffffff;
1152#endif
1153
1154 unsigned len;
1155 if ( code <= 0x7F )
1156 {
1157 len = 1;
1158 if ( out )
1159 {
1160 if ( dstLen < len )
1161 break;
1162
1163 out[0] = (char)code;
1164 }
1165 }
1166 else if ( code <= 0x07FF )
1167 {
1168 len = 2;
1169 if ( out )
1170 {
1171 if ( dstLen < len )
1172 break;
1173
1174 // NB: this line takes 6 least significant bits, encodes them as
1175 // 10xxxxxx and discards them so that the next byte can be encoded:
1176 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1177 out[0] = 0xC0 | code;
1178 }
1179 }
1180 else if ( code < 0xFFFF )
1181 {
1182 len = 3;
1183 if ( out )
1184 {
1185 if ( dstLen < len )
1186 break;
1187
1188 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1190 out[0] = 0xE0 | code;
1191 }
1192 }
1193 else if ( code <= 0x10FFFF )
1194 {
1195 len = 4;
1196 if ( out )
1197 {
1198 if ( dstLen < len )
1199 break;
1200
1201 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1204 out[0] = 0xF0 | code;
1205 }
1206 }
1207 else
1208 {
9a83f860 1209 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
0286d08d
VZ
1210 break;
1211 }
1212
1213 if ( out )
1214 {
1215 out += len;
1216 dstLen -= len;
1217 }
1218
1219 written += len;
1220 }
1221
1222 // we only get here if an error occurs during decoding
1223 return wxCONV_FAILED;
1224}
1225
d16d0917
VZ
1226size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1227 const char *psz, size_t srcLen) const
6001e347 1228{
0286d08d 1229 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1230 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1231
4def3b35
VS
1232 size_t len = 0;
1233
f4cb7c58
VZ
1234 // The length can be either given explicitly or computed implicitly for the
1235 // NUL-terminated strings.
1236 const bool isNulTerminated = srcLen == wxNO_LEN;
1237 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1238 {
ea8ce907
RR
1239 const char *opsz = psz;
1240 bool invalid = false;
4def3b35
VS
1241 unsigned char cc = *psz++, fc = cc;
1242 unsigned cnt;
dccce9ea 1243 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1244 fc <<= 1;
ef199164 1245
dccce9ea 1246 if (!cnt)
4def3b35
VS
1247 {
1248 // plain ASCII char
dccce9ea 1249 if (buf)
4def3b35
VS
1250 *buf++ = cc;
1251 len++;
561488ef
MW
1252
1253 // escape the escape character for octal escapes
1254 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1255 && cc == '\\' && (!buf || len < n))
1256 {
1257 if (buf)
1258 *buf++ = cc;
1259 len++;
1260 }
dccce9ea
VZ
1261 }
1262 else
4def3b35
VS
1263 {
1264 cnt--;
dccce9ea 1265 if (!cnt)
4def3b35
VS
1266 {
1267 // invalid UTF-8 sequence
ea8ce907 1268 invalid = true;
dccce9ea
VZ
1269 }
1270 else
4def3b35
VS
1271 {
1272 unsigned ocnt = cnt - 1;
1273 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1274 while (cnt--)
4def3b35 1275 {
ea8ce907 1276 cc = *psz;
dccce9ea 1277 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1278 {
1279 // invalid UTF-8 sequence
ea8ce907
RR
1280 invalid = true;
1281 break;
4def3b35 1282 }
ef199164 1283
ea8ce907 1284 psz++;
4def3b35
VS
1285 res = (res << 6) | (cc & 0x3f);
1286 }
ef199164 1287
ea8ce907 1288 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1289 {
1290 // illegal UTF-8 encoding
ea8ce907 1291 invalid = true;
4def3b35 1292 }
ea8ce907
RR
1293 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1294 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1295 {
1296 // if one of our PUA characters turns up externally
1297 // it must also be treated as an illegal sequence
1298 // (a bit like you have to escape an escape character)
1299 invalid = true;
1300 }
1301 else
1302 {
1cd52418 1303#ifdef WC_UTF16
0286d08d 1304 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1305 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1306 if (pa == wxCONV_FAILED)
ea8ce907
RR
1307 {
1308 invalid = true;
1309 }
1310 else
1311 {
1312 if (buf)
1313 buf += pa;
1314 len += pa;
1315 }
373658eb 1316#else // !WC_UTF16
ea8ce907 1317 if (buf)
38d4b1e4 1318 *buf++ = (wchar_t)res;
ea8ce907 1319 len++;
373658eb 1320#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1321 }
1322 }
ef199164 1323
ea8ce907
RR
1324 if (invalid)
1325 {
1326 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1327 {
1328 while (opsz < psz && (!buf || len < n))
1329 {
1330#ifdef WC_UTF16
1331 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1332 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1333 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1334 if (buf)
1335 buf += pa;
1336 opsz++;
1337 len += pa;
1338#else
1339 if (buf)
38d4b1e4 1340 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1341 opsz++;
1342 len++;
1343#endif
1344 }
1345 }
3698ae71 1346 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1347 {
1348 while (opsz < psz && (!buf || len < n))
1349 {
3698ae71
VZ
1350 if ( buf && len + 3 < n )
1351 {
17a1ebd1 1352 unsigned char on = *opsz;
3698ae71 1353 *buf++ = L'\\';
17a1ebd1
VZ
1354 *buf++ = (wchar_t)( L'0' + on / 0100 );
1355 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1356 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1357 }
ef199164 1358
ea8ce907
RR
1359 opsz++;
1360 len += 4;
1361 }
1362 }
3698ae71 1363 else // MAP_INVALID_UTF8_NOT
ea8ce907 1364 {
467e0479 1365 return wxCONV_FAILED;
ea8ce907 1366 }
4def3b35
VS
1367 }
1368 }
6001e347 1369 }
ef199164 1370
f4cb7c58
VZ
1371 if ( isNulTerminated )
1372 {
1373 // Add the trailing NUL in this case if we have a large enough buffer.
1374 if ( buf && (len < n) )
1375 *buf = 0;
ef199164 1376
f4cb7c58
VZ
1377 // And count it in any case.
1378 len++;
1379 }
1380
1381 return len;
6001e347
RR
1382}
1383
3698ae71
VZ
// Return true if the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1388
d16d0917
VZ
1389size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1390 const wchar_t *psz, size_t srcLen) const
6001e347 1391{
0286d08d 1392 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1393 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1394
4def3b35 1395 size_t len = 0;
6001e347 1396
2ba61518
VZ
1397 // The length can be either given explicitly or computed implicitly for the
1398 // NUL-terminated strings.
1399 const bool isNulTerminated = srcLen == wxNO_LEN;
1400 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1401 {
1402 wxUint32 cc;
ef199164 1403
1cd52418 1404#ifdef WC_UTF16
b5153fd8
VZ
1405 // cast is ok for WC_UTF16
1406 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1407 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1408#else
ef199164 1409 cc = (*psz++) & 0x7fffffff;
4def3b35 1410#endif
3698ae71
VZ
1411
1412 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1413 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1414 {
dccce9ea 1415 if (buf)
ea8ce907 1416 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1417 len++;
3698ae71 1418 }
561488ef
MW
1419 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1420 && cc == L'\\' && psz[0] == L'\\' )
1421 {
1422 if (buf)
1423 *buf++ = (char)cc;
1424 psz++;
1425 len++;
1426 }
3698ae71
VZ
1427 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1428 cc == L'\\' &&
1429 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1430 {
dccce9ea 1431 if (buf)
3698ae71 1432 {
ef199164
DS
1433 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1434 (psz[1] - L'0') * 010 +
b2c13097 1435 (psz[2] - L'0'));
3698ae71
VZ
1436 }
1437
1438 psz += 3;
ea8ce907
RR
1439 len++;
1440 }
1441 else
1442 {
1443 unsigned cnt;
ef199164
DS
1444 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1445 {
1446 }
1447
ea8ce907 1448 if (!cnt)
4def3b35 1449 {
ea8ce907
RR
1450 // plain ASCII char
1451 if (buf)
1452 *buf++ = (char) cc;
1453 len++;
1454 }
ea8ce907
RR
1455 else
1456 {
1457 len += cnt + 1;
1458 if (buf)
1459 {
1460 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1461 while (cnt--)
1462 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1463 }
4def3b35
VS
1464 }
1465 }
6001e347 1466 }
4def3b35 1467
2ba61518
VZ
1468 if ( isNulTerminated )
1469 {
1470 // Add the trailing NUL in this case if we have a large enough buffer.
1471 if ( buf && (len < n) )
1472 *buf = 0;
1473
1474 // And count it in any case.
1475 len++;
1476 }
adb45366 1477
2ba61518 1478 return len;
6001e347
RR
1479}
1480
467e0479 1481// ============================================================================
c91830cb 1482// UTF-16
467e0479 1483// ============================================================================
c91830cb
VZ
1484
// The conversions below are written in terms of "straight" and "swap"
// converters: on big endian hosts the former is the BE class and the latter
// the LE one and it is the other way round on all the other hosts.
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1492
467e0479
VZ
1493/* static */
1494size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1495{
1496 if ( srcLen == wxNO_LEN )
1497 {
1498 // count the number of bytes in input, including the trailing NULs
5c33522f 1499 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1500 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1501 ;
c91830cb 1502
467e0479
VZ
1503 srcLen *= BYTES_PER_CHAR;
1504 }
1505 else // we already have the length
1506 {
1507 // we can only convert an entire number of UTF-16 characters
1508 if ( srcLen % BYTES_PER_CHAR )
1509 return wxCONV_FAILED;
1510 }
1511
1512 return srcLen;
1513}
1514
1515// case when in-memory representation is UTF-16 too
c91830cb
VZ
1516#ifdef WC_UTF16
1517
467e0479
VZ
1518// ----------------------------------------------------------------------------
1519// conversions without endianness change
1520// ----------------------------------------------------------------------------
1521
1522size_t
1523wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1524 const char *src, size_t srcLen) const
c91830cb 1525{
467e0479
VZ
1526 // set up the scene for using memcpy() (which is presumably more efficient
1527 // than copying the bytes one by one)
1528 srcLen = GetLength(src, srcLen);
1529 if ( srcLen == wxNO_LEN )
1530 return wxCONV_FAILED;
c91830cb 1531
ef199164 1532 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1533 if ( dst )
c91830cb 1534 {
467e0479
VZ
1535 if ( dstLen < inLen )
1536 return wxCONV_FAILED;
c91830cb 1537
467e0479 1538 memcpy(dst, src, srcLen);
c91830cb 1539 }
d32a507d 1540
467e0479 1541 return inLen;
c91830cb
VZ
1542}
1543
467e0479
VZ
1544size_t
1545wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1546 const wchar_t *src, size_t srcLen) const
c91830cb 1547{
467e0479
VZ
1548 if ( srcLen == wxNO_LEN )
1549 srcLen = wxWcslen(src) + 1;
c91830cb 1550
467e0479
VZ
1551 srcLen *= BYTES_PER_CHAR;
1552
1553 if ( dst )
c91830cb 1554 {
467e0479
VZ
1555 if ( dstLen < srcLen )
1556 return wxCONV_FAILED;
d32a507d 1557
467e0479 1558 memcpy(dst, src, srcLen);
c91830cb 1559 }
d32a507d 1560
467e0479 1561 return srcLen;
c91830cb
VZ
1562}
1563
467e0479
VZ
1564// ----------------------------------------------------------------------------
1565// endian-reversing conversions
1566// ----------------------------------------------------------------------------
c91830cb 1567
467e0479
VZ
1568size_t
1569wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1570 const char *src, size_t srcLen) const
c91830cb 1571{
467e0479
VZ
1572 srcLen = GetLength(src, srcLen);
1573 if ( srcLen == wxNO_LEN )
1574 return wxCONV_FAILED;
c91830cb 1575
467e0479
VZ
1576 srcLen /= BYTES_PER_CHAR;
1577
1578 if ( dst )
c91830cb 1579 {
467e0479
VZ
1580 if ( dstLen < srcLen )
1581 return wxCONV_FAILED;
1582
5c33522f 1583 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1584 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1585 {
ef199164 1586 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1587 }
c91830cb 1588 }
bfab25d4 1589
467e0479 1590 return srcLen;
c91830cb
VZ
1591}
1592
467e0479
VZ
1593size_t
1594wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1595 const wchar_t *src, size_t srcLen) const
c91830cb 1596{
467e0479
VZ
1597 if ( srcLen == wxNO_LEN )
1598 srcLen = wxWcslen(src) + 1;
c91830cb 1599
467e0479
VZ
1600 srcLen *= BYTES_PER_CHAR;
1601
1602 if ( dst )
c91830cb 1603 {
467e0479
VZ
1604 if ( dstLen < srcLen )
1605 return wxCONV_FAILED;
1606
5c33522f 1607 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
467e0479 1608 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1609 {
ef199164 1610 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1611 }
c91830cb 1612 }
eec47cc6 1613
467e0479 1614 return srcLen;
c91830cb
VZ
1615}
1616
467e0479 1617#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1618
467e0479
VZ
1619// ----------------------------------------------------------------------------
1620// conversions without endianness change
1621// ----------------------------------------------------------------------------
c91830cb 1622
35d11700
VZ
1623size_t
1624wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1625 const char *src, size_t srcLen) const
c91830cb 1626{
35d11700
VZ
1627 srcLen = GetLength(src, srcLen);
1628 if ( srcLen == wxNO_LEN )
1629 return wxCONV_FAILED;
c91830cb 1630
ef199164 1631 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1632 if ( !dst )
c91830cb 1633 {
35d11700
VZ
1634 // optimization: return maximal space which could be needed for this
1635 // string even if the real size could be smaller if the buffer contains
1636 // any surrogates
1637 return inLen;
c91830cb 1638 }
c91830cb 1639
35d11700 1640 size_t outLen = 0;
5c33522f 1641 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1642 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1643 {
ef199164
DS
1644 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1645 if ( !inBuff )
35d11700
VZ
1646 return wxCONV_FAILED;
1647
1648 if ( ++outLen > dstLen )
1649 return wxCONV_FAILED;
c91830cb 1650
35d11700
VZ
1651 *dst++ = ch;
1652 }
1653
1654
1655 return outLen;
1656}
c91830cb 1657
35d11700
VZ
1658size_t
1659wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1660 const wchar_t *src, size_t srcLen) const
c91830cb 1661{
35d11700
VZ
1662 if ( srcLen == wxNO_LEN )
1663 srcLen = wxWcslen(src) + 1;
c91830cb 1664
35d11700 1665 size_t outLen = 0;
5c33522f 1666 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1667 for ( size_t n = 0; n < srcLen; n++ )
c91830cb 1668 {
d883acaa 1669 wxUint16 cc[2] = { 0 };
35d11700
VZ
1670 const size_t numChars = encode_utf16(*src++, cc);
1671 if ( numChars == wxCONV_FAILED )
1672 return wxCONV_FAILED;
c91830cb 1673
ef199164
DS
1674 outLen += numChars * BYTES_PER_CHAR;
1675 if ( outBuff )
c91830cb 1676 {
35d11700
VZ
1677 if ( outLen > dstLen )
1678 return wxCONV_FAILED;
1679
ef199164 1680 *outBuff++ = cc[0];
35d11700 1681 if ( numChars == 2 )
69b80d28 1682 {
35d11700 1683 // second character of a surrogate
ef199164 1684 *outBuff++ = cc[1];
69b80d28 1685 }
c91830cb 1686 }
c91830cb 1687 }
c91830cb 1688
35d11700 1689 return outLen;
c91830cb
VZ
1690}
1691
467e0479
VZ
1692// ----------------------------------------------------------------------------
1693// endian-reversing conversions
1694// ----------------------------------------------------------------------------
c91830cb 1695
35d11700
VZ
1696size_t
1697wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1698 const char *src, size_t srcLen) const
c91830cb 1699{
35d11700
VZ
1700 srcLen = GetLength(src, srcLen);
1701 if ( srcLen == wxNO_LEN )
1702 return wxCONV_FAILED;
1703
ef199164 1704 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1705 if ( !dst )
1706 {
1707 // optimization: return maximal space which could be needed for this
1708 // string even if the real size could be smaller if the buffer contains
1709 // any surrogates
1710 return inLen;
1711 }
c91830cb 1712
35d11700 1713 size_t outLen = 0;
5c33522f 1714 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
ef199164 1715 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1716 {
35d11700
VZ
1717 wxUint32 ch;
1718 wxUint16 tmp[2];
ef199164
DS
1719
1720 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1721 inBuff++;
1722 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1723
35d11700
VZ
1724 const size_t numChars = decode_utf16(tmp, ch);
1725 if ( numChars == wxCONV_FAILED )
1726 return wxCONV_FAILED;
c91830cb 1727
35d11700 1728 if ( numChars == 2 )
ef199164 1729 inBuff++;
35d11700
VZ
1730
1731 if ( ++outLen > dstLen )
1732 return wxCONV_FAILED;
c91830cb 1733
35d11700 1734 *dst++ = ch;
c91830cb 1735 }
c91830cb 1736
c91830cb 1737
35d11700
VZ
1738 return outLen;
1739}
c91830cb 1740
35d11700
VZ
1741size_t
1742wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1743 const wchar_t *src, size_t srcLen) const
c91830cb 1744{
35d11700
VZ
1745 if ( srcLen == wxNO_LEN )
1746 srcLen = wxWcslen(src) + 1;
c91830cb 1747
35d11700 1748 size_t outLen = 0;
5c33522f 1749 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
35d11700 1750 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb 1751 {
d883acaa 1752 wxUint16 cc[2] = { 0 };
35d11700
VZ
1753 const size_t numChars = encode_utf16(*src, cc);
1754 if ( numChars == wxCONV_FAILED )
1755 return wxCONV_FAILED;
c91830cb 1756
ef199164
DS
1757 outLen += numChars * BYTES_PER_CHAR;
1758 if ( outBuff )
c91830cb 1759 {
35d11700
VZ
1760 if ( outLen > dstLen )
1761 return wxCONV_FAILED;
1762
ef199164 1763 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1764 if ( numChars == 2 )
c91830cb 1765 {
35d11700 1766 // second character of a surrogate
ef199164 1767 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1768 }
1769 }
c91830cb 1770 }
c91830cb 1771
35d11700 1772 return outLen;
c91830cb
VZ
1773}
1774
467e0479 1775#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1776
1777
35d11700 1778// ============================================================================
c91830cb 1779// UTF-32
35d11700 1780// ============================================================================
c91830cb
VZ
1781
1782#ifdef WORDS_BIGENDIAN
467e0479
VZ
1783 #define wxMBConvUTF32straight wxMBConvUTF32BE
1784 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1785#else
467e0479
VZ
1786 #define wxMBConvUTF32swap wxMBConvUTF32BE
1787 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1788#endif
1789
1790
1791WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1792WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1793
467e0479
VZ
1794/* static */
1795size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1796{
1797 if ( srcLen == wxNO_LEN )
1798 {
1799 // count the number of bytes in input, including the trailing NULs
5c33522f 1800 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1801 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1802 ;
c91830cb 1803
467e0479
VZ
1804 srcLen *= BYTES_PER_CHAR;
1805 }
1806 else // we already have the length
1807 {
1808 // we can only convert an entire number of UTF-32 characters
1809 if ( srcLen % BYTES_PER_CHAR )
1810 return wxCONV_FAILED;
1811 }
1812
1813 return srcLen;
1814}
1815
1816// case when in-memory representation is UTF-16
c91830cb
VZ
1817#ifdef WC_UTF16
1818
467e0479
VZ
1819// ----------------------------------------------------------------------------
1820// conversions without endianness change
1821// ----------------------------------------------------------------------------
1822
1823size_t
1824wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1825 const char *src, size_t srcLen) const
c91830cb 1826{
467e0479
VZ
1827 srcLen = GetLength(src, srcLen);
1828 if ( srcLen == wxNO_LEN )
1829 return wxCONV_FAILED;
c91830cb 1830
5c33522f 1831 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1832 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1833 size_t outLen = 0;
1834 for ( size_t n = 0; n < inLen; n++ )
c91830cb 1835 {
d883acaa 1836 wxUint16 cc[2] = { 0 };
ef199164 1837 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1838 if ( numChars == wxCONV_FAILED )
1839 return wxCONV_FAILED;
c91830cb 1840
467e0479
VZ
1841 outLen += numChars;
1842 if ( dst )
c91830cb 1843 {
467e0479
VZ
1844 if ( outLen > dstLen )
1845 return wxCONV_FAILED;
d32a507d 1846
467e0479
VZ
1847 *dst++ = cc[0];
1848 if ( numChars == 2 )
1849 {
1850 // second character of a surrogate
1851 *dst++ = cc[1];
1852 }
1853 }
c91830cb 1854 }
d32a507d 1855
467e0479 1856 return outLen;
c91830cb
VZ
1857}
1858
467e0479
VZ
1859size_t
1860wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1861 const wchar_t *src, size_t srcLen) const
c91830cb 1862{
467e0479
VZ
1863 if ( srcLen == wxNO_LEN )
1864 srcLen = wxWcslen(src) + 1;
c91830cb 1865
467e0479 1866 if ( !dst )
c91830cb 1867 {
467e0479
VZ
1868 // optimization: return maximal space which could be needed for this
1869 // string instead of the exact amount which could be less if there are
1870 // any surrogates in the input
1871 //
1872 // we consider that surrogates are rare enough to make it worthwhile to
1873 // avoid running the loop below at the cost of slightly extra memory
1874 // consumption
ef199164 1875 return srcLen * BYTES_PER_CHAR;
467e0479 1876 }
c91830cb 1877
5c33522f 1878 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1879 size_t outLen = 0;
1880 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1881 {
1882 const wxUint32 ch = wxDecodeSurrogate(&src);
1883 if ( !src )
1884 return wxCONV_FAILED;
c91830cb 1885
467e0479 1886 outLen += BYTES_PER_CHAR;
d32a507d 1887
467e0479
VZ
1888 if ( outLen > dstLen )
1889 return wxCONV_FAILED;
b5153fd8 1890
ef199164 1891 *outBuff++ = ch;
467e0479 1892 }
c91830cb 1893
467e0479 1894 return outLen;
c91830cb
VZ
1895}
1896
467e0479
VZ
1897// ----------------------------------------------------------------------------
1898// endian-reversing conversions
1899// ----------------------------------------------------------------------------
c91830cb 1900
467e0479
VZ
1901size_t
1902wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1903 const char *src, size_t srcLen) const
c91830cb 1904{
467e0479
VZ
1905 srcLen = GetLength(src, srcLen);
1906 if ( srcLen == wxNO_LEN )
1907 return wxCONV_FAILED;
c91830cb 1908
5c33522f 1909 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 1910 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1911 size_t outLen = 0;
ef199164 1912 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1913 {
d883acaa 1914 wxUint16 cc[2] = { 0 };
ef199164 1915 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1916 if ( numChars == wxCONV_FAILED )
1917 return wxCONV_FAILED;
c91830cb 1918
467e0479
VZ
1919 outLen += numChars;
1920 if ( dst )
c91830cb 1921 {
467e0479
VZ
1922 if ( outLen > dstLen )
1923 return wxCONV_FAILED;
d32a507d 1924
467e0479
VZ
1925 *dst++ = cc[0];
1926 if ( numChars == 2 )
1927 {
1928 // second character of a surrogate
1929 *dst++ = cc[1];
1930 }
1931 }
c91830cb 1932 }
b5153fd8 1933
467e0479 1934 return outLen;
c91830cb
VZ
1935}
1936
467e0479
VZ
1937size_t
1938wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1939 const wchar_t *src, size_t srcLen) const
c91830cb 1940{
467e0479
VZ
1941 if ( srcLen == wxNO_LEN )
1942 srcLen = wxWcslen(src) + 1;
c91830cb 1943
467e0479 1944 if ( !dst )
c91830cb 1945 {
467e0479
VZ
1946 // optimization: return maximal space which could be needed for this
1947 // string instead of the exact amount which could be less if there are
1948 // any surrogates in the input
1949 //
1950 // we consider that surrogates are rare enough to make it worthwhile to
1951 // avoid running the loop below at the cost of slightly extra memory
1952 // consumption
1953 return srcLen*BYTES_PER_CHAR;
1954 }
c91830cb 1955
5c33522f 1956 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
467e0479
VZ
1957 size_t outLen = 0;
1958 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1959 {
1960 const wxUint32 ch = wxDecodeSurrogate(&src);
1961 if ( !src )
1962 return wxCONV_FAILED;
c91830cb 1963
467e0479 1964 outLen += BYTES_PER_CHAR;
d32a507d 1965
467e0479
VZ
1966 if ( outLen > dstLen )
1967 return wxCONV_FAILED;
b5153fd8 1968
ef199164 1969 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1970 }
c91830cb 1971
467e0479 1972 return outLen;
c91830cb
VZ
1973}
1974
467e0479 1975#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1976
35d11700
VZ
1977// ----------------------------------------------------------------------------
1978// conversions without endianness change
1979// ----------------------------------------------------------------------------
1980
1981size_t
1982wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1983 const char *src, size_t srcLen) const
c91830cb 1984{
35d11700
VZ
1985 // use memcpy() as it should be much faster than hand-written loop
1986 srcLen = GetLength(src, srcLen);
1987 if ( srcLen == wxNO_LEN )
1988 return wxCONV_FAILED;
c91830cb 1989
35d11700
VZ
1990 const size_t inLen = srcLen/BYTES_PER_CHAR;
1991 if ( dst )
c91830cb 1992 {
35d11700
VZ
1993 if ( dstLen < inLen )
1994 return wxCONV_FAILED;
b5153fd8 1995
35d11700
VZ
1996 memcpy(dst, src, srcLen);
1997 }
c91830cb 1998
35d11700 1999 return inLen;
c91830cb
VZ
2000}
2001
35d11700
VZ
2002size_t
2003wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
2004 const wchar_t *src, size_t srcLen) const
c91830cb 2005{
35d11700
VZ
2006 if ( srcLen == wxNO_LEN )
2007 srcLen = wxWcslen(src) + 1;
2008
2009 srcLen *= BYTES_PER_CHAR;
c91830cb 2010
35d11700 2011 if ( dst )
c91830cb 2012 {
35d11700
VZ
2013 if ( dstLen < srcLen )
2014 return wxCONV_FAILED;
c91830cb 2015
35d11700 2016 memcpy(dst, src, srcLen);
c91830cb
VZ
2017 }
2018
35d11700 2019 return srcLen;
c91830cb
VZ
2020}
2021
35d11700
VZ
2022// ----------------------------------------------------------------------------
2023// endian-reversing conversions
2024// ----------------------------------------------------------------------------
c91830cb 2025
35d11700
VZ
2026size_t
2027wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2028 const char *src, size_t srcLen) const
c91830cb 2029{
35d11700
VZ
2030 srcLen = GetLength(src, srcLen);
2031 if ( srcLen == wxNO_LEN )
2032 return wxCONV_FAILED;
2033
2034 srcLen /= BYTES_PER_CHAR;
c91830cb 2035
35d11700 2036 if ( dst )
c91830cb 2037 {
35d11700
VZ
2038 if ( dstLen < srcLen )
2039 return wxCONV_FAILED;
2040
5c33522f 2041 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
ef199164 2042 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 2043 {
ef199164 2044 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 2045 }
c91830cb 2046 }
b5153fd8 2047
35d11700 2048 return srcLen;
c91830cb
VZ
2049}
2050
35d11700
VZ
2051size_t
2052wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2053 const wchar_t *src, size_t srcLen) const
c91830cb 2054{
35d11700
VZ
2055 if ( srcLen == wxNO_LEN )
2056 srcLen = wxWcslen(src) + 1;
2057
2058 srcLen *= BYTES_PER_CHAR;
c91830cb 2059
35d11700 2060 if ( dst )
c91830cb 2061 {
35d11700
VZ
2062 if ( dstLen < srcLen )
2063 return wxCONV_FAILED;
2064
5c33522f 2065 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
35d11700 2066 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 2067 {
ef199164 2068 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 2069 }
c91830cb 2070 }
b5153fd8 2071
35d11700 2072 return srcLen;
c91830cb
VZ
2073}
2074
467e0479 2075#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
2076
2077
36acb880
VZ
2078// ============================================================================
2079// The classes doing conversion using the iconv_xxx() functions
2080// ============================================================================
3caec1bb 2081
b040e242 2082#ifdef HAVE_ICONV
3a0d76bc 2083
b1d547eb
VS
2084// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2085// E2BIG if output buffer is _exactly_ as big as needed. Such case is
2086// (unless there's yet another bug in glibc) the only case when iconv()
2087// returns with (size_t)-1 (which means error) and says there are 0 bytes
2088// left in the input buffer -- when _real_ error occurs,
2089// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2090// iconv() failure.
3caec1bb
VS
2091// [This bug does not appear in glibc 2.2.]
2092#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2093#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2094 (errno != E2BIG || bufLeft != 0))
2095#else
2096#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2097#endif
2098
ab217dba 2099#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 2100
74a7eb0b
VZ
2101#define ICONV_T_INVALID ((iconv_t)-1)
2102
2103#if SIZEOF_WCHAR_T == 4
2104 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2105 #define WC_ENC wxFONTENCODING_UTF32
2106#elif SIZEOF_WCHAR_T == 2
2107 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2108 #define WC_ENC wxFONTENCODING_UTF16
2109#else // sizeof(wchar_t) != 2 nor 4
2110 // does this ever happen?
2111 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2112#endif
2113
36acb880 2114// ----------------------------------------------------------------------------
e95354ec 2115// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
2116// ----------------------------------------------------------------------------
2117
e95354ec 2118class wxMBConv_iconv : public wxMBConv
1cd52418
OK
2119{
2120public:
86501081 2121 wxMBConv_iconv(const char *name);
e95354ec 2122 virtual ~wxMBConv_iconv();
36acb880 2123
8f4b0f43
VZ
2124 // implement base class virtual methods
2125 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2126 const char *src, size_t srcLen = wxNO_LEN) const;
2127 virtual size_t FromWChar(char *dst, size_t dstLen,
2128 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
2129 virtual size_t GetMBNulLen() const;
2130
ba98e032
VS
2131#if wxUSE_UNICODE_UTF8
2132 virtual bool IsUTF8() const;
2133#endif
2134
d36c9347
VZ
2135 virtual wxMBConv *Clone() const
2136 {
b64f93b6 2137 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
d36c9347
VZ
2138 p->m_minMBCharWidth = m_minMBCharWidth;
2139 return p;
2140 }
2141
e95354ec 2142 bool IsOk() const
74a7eb0b 2143 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
2144
2145protected:
ef199164
DS
2146 // the iconv handlers used to translate from multibyte
2147 // to wide char and in the other direction
36acb880
VZ
2148 iconv_t m2w,
2149 w2m;
ef199164 2150
b1d547eb
VS
2151#if wxUSE_THREADS
2152 // guards access to m2w and w2m objects
2153 wxMutex m_iconvMutex;
2154#endif
36acb880
VZ
2155
2156private:
e95354ec 2157 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2158 // available on this machine, it will remain NULL
74a7eb0b 2159 static wxString ms_wcCharsetName;
36acb880
VZ
2160
2161 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2162 // different endian-ness than the native one
405d8f46 2163 static bool ms_wcNeedsSwap;
eec47cc6 2164
d36c9347
VZ
2165
2166 // name of the encoding handled by this conversion
b64f93b6 2167 const char *m_name;
d36c9347 2168
7ef3ab50 2169 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2170 // initially
2171 size_t m_minMBCharWidth;
36acb880
VZ
2172};
2173
8f115891 2174// make the constructor available for unit testing
86501081 2175WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2176{
2177 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2178 if ( !result->IsOk() )
2179 {
2180 delete result;
2181 return 0;
2182 }
ef199164 2183
8f115891
MW
2184 return result;
2185}
2186
422e411e 2187wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2188bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2189
86501081 2190wxMBConv_iconv::wxMBConv_iconv(const char *name)
b64f93b6 2191 : m_name(wxStrdup(name))
36acb880 2192{
c1464d9d 2193 m_minMBCharWidth = 0;
eec47cc6 2194
36acb880 2195 // check for charset that represents wchar_t:
74a7eb0b 2196 if ( ms_wcCharsetName.empty() )
f1339c56 2197 {
9a83f860 2198 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
c2b83fdd 2199
74a7eb0b 2200#if wxUSE_FONTMAP
a243da29 2201 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
74a7eb0b 2202#else // !wxUSE_FONTMAP
a243da29 2203 static const wxChar *const names_static[] =
36acb880 2204 {
74a7eb0b 2205#if SIZEOF_WCHAR_T == 4
9a83f860 2206 wxT("UCS-4"),
da2f1172 2207#elif SIZEOF_WCHAR_T == 2
9a83f860 2208 wxT("UCS-2"),
74a7eb0b
VZ
2209#endif
2210 NULL
2211 };
a243da29 2212 const wxChar *const *names = names_static;
74a7eb0b 2213#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2214
d1f024a8 2215 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2216 {
17a1ebd1 2217 const wxString nameCS(*names);
74a7eb0b
VZ
2218
2219 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2220 wxString nameXE(nameCS);
ef199164
DS
2221
2222#ifdef WORDS_BIGENDIAN
9a83f860 2223 nameXE += wxT("BE");
ef199164 2224#else // little endian
9a83f860 2225 nameXE += wxT("LE");
ef199164 2226#endif
74a7eb0b 2227
9a83f860 2228 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd
VZ
2229 nameXE.c_str());
2230
86501081 2231 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2232 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2233 {
74a7eb0b 2234 // try charset w/o bytesex info (e.g. "UCS4")
9a83f860 2235 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
c2b83fdd 2236 nameCS.c_str());
86501081 2237 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2238
74a7eb0b
VZ
2239 // and check for bytesex ourselves:
2240 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2241 {
74a7eb0b 2242 char buf[2], *bufPtr;
e8769ed1 2243 wchar_t wbuf[2];
74a7eb0b
VZ
2244 size_t insz, outsz;
2245 size_t res;
2246
2247 buf[0] = 'A';
2248 buf[1] = 0;
2249 wbuf[0] = 0;
2250 insz = 2;
2251 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2252 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2253 bufPtr = buf;
2254
ef199164
DS
2255 res = iconv(
2256 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2257 &wbufPtr, &outsz);
74a7eb0b
VZ
2258
2259 if (ICONV_FAILED(res, insz))
2260 {
2261 wxLogLastError(wxT("iconv"));
422e411e 2262 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2263 nameCS.c_str());
74a7eb0b
VZ
2264 }
2265 else // ok, can convert to this encoding, remember it
2266 {
17a1ebd1 2267 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2268 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2269 }
3a0d76bc
VS
2270 }
2271 }
74a7eb0b 2272 else // use charset not requiring byte swapping
36acb880 2273 {
74a7eb0b 2274 ms_wcCharsetName = nameXE;
36acb880 2275 }
3a0d76bc 2276 }
74a7eb0b 2277
0944fceb 2278 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2279 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2280 ms_wcCharsetName.empty() ? wxString("<none>")
2281 : ms_wcCharsetName,
9a83f860
VZ
2282 ms_wcNeedsSwap ? wxT(" (needs swap)")
2283 : wxT(""));
3a0d76bc 2284 }
36acb880 2285 else // we already have ms_wcCharsetName
3caec1bb 2286 {
86501081 2287 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2288 }
dccce9ea 2289
74a7eb0b 2290 if ( ms_wcCharsetName.empty() )
f1339c56 2291 {
74a7eb0b 2292 w2m = ICONV_T_INVALID;
36acb880 2293 }
405d8f46
VZ
2294 else
2295 {
86501081 2296 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2297 if ( w2m == ICONV_T_INVALID )
2298 {
2299 wxLogTrace(TRACE_STRCONV,
2300 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2301 ms_wcCharsetName.c_str(), name);
74a7eb0b 2302 }
405d8f46 2303 }
36acb880 2304}
3caec1bb 2305
e95354ec 2306wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2307{
b64f93b6
VZ
2308 free(const_cast<char *>(m_name));
2309
74a7eb0b 2310 if ( m2w != ICONV_T_INVALID )
36acb880 2311 iconv_close(m2w);
74a7eb0b 2312 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2313 iconv_close(w2m);
2314}
3a0d76bc 2315
8f4b0f43
VZ
2316size_t
2317wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2318 const char *src, size_t srcLen) const
36acb880 2319{
8f4b0f43 2320 if ( srcLen == wxNO_LEN )
69373110 2321 {
8f4b0f43
VZ
2322 // find the string length: notice that must be done differently for
2323 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2324 // consecutive NULs
2325 const size_t nulLen = GetMBNulLen();
2326 switch ( nulLen )
2327 {
2328 default:
2329 return wxCONV_FAILED;
69373110 2330
8f4b0f43
VZ
2331 case 1:
2332 srcLen = strlen(src); // arguably more optimized than our version
2333 break;
69373110 2334
8f4b0f43
VZ
2335 case 2:
2336 case 4:
2337 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2338 // but they also have to start at character boundary and not
2339 // span two adjacent characters
2340 const char *p;
2341 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2342 ;
2343 srcLen = p - src;
2344 break;
2345 }
d50c0831
VZ
2346
2347 // when we're determining the length of the string ourselves we count
2348 // the terminating NUL(s) as part of it and always NUL-terminate the
2349 // output
2350 srcLen += nulLen;
69373110
VZ
2351 }
2352
8f4b0f43
VZ
2353 // we express length in the number of (wide) characters but iconv always
2354 // counts buffer sizes it in bytes
2355 dstLen *= SIZEOF_WCHAR_T;
2356
b1d547eb 2357#if wxUSE_THREADS
6a17b868
SN
2358 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2359 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2360 // wxConvLocal that are used all over wx code, so we have to make sure
2361 // the handle is used by at most one thread at the time. Otherwise
2362 // only a few wx classes would be safe to use from non-main threads
2363 // as MB<->WC conversion would fail "randomly".
2364 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2365#endif // wxUSE_THREADS
2366
36acb880 2367 size_t res, cres;
8f4b0f43 2368 const char *pszPtr = src;
36acb880 2369
8f4b0f43 2370 if ( dst )
36acb880 2371 {
8f4b0f43 2372 char* bufPtr = (char*)dst;
e8769ed1 2373
36acb880 2374 // have destination buffer, convert there
1752fda6 2375 size_t dstLenOrig = dstLen;
36acb880 2376 cres = iconv(m2w,
8f4b0f43
VZ
2377 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2378 &bufPtr, &dstLen);
1752fda6
VZ
2379
2380 // convert the number of bytes converted as returned by iconv to the
2381 // number of (wide) characters converted that we need
2382 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2383
36acb880 2384 if (ms_wcNeedsSwap)
3a0d76bc 2385 {
36acb880 2386 // convert to native endianness
17a1ebd1 2387 for ( unsigned i = 0; i < res; i++ )
467a2982 2388 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2389 }
36acb880 2390 }
8f4b0f43 2391 else // no destination buffer
36acb880 2392 {
8f4b0f43 2393 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2394 wchar_t tbuf[256];
36acb880 2395 res = 0;
ef199164
DS
2396
2397 do
2398 {
e8769ed1 2399 char* bufPtr = (char*)tbuf;
8f4b0f43 2400 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2401
2402 cres = iconv(m2w,
8f4b0f43
VZ
2403 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2404 &bufPtr, &dstLen );
36acb880 2405
8f4b0f43 2406 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2407 }
2408 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2409 }
dccce9ea 2410
8f4b0f43 2411 if (ICONV_FAILED(cres, srcLen))
f1339c56 2412 {
36acb880 2413 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2414 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2415 return wxCONV_FAILED;
36acb880
VZ
2416 }
2417
2418 return res;
2419}
2420
8f4b0f43
VZ
2421size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2422 const wchar_t *src, size_t srcLen) const
36acb880 2423{
b1d547eb
VS
2424#if wxUSE_THREADS
2425 // NB: explained in MB2WC
2426 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2427#endif
3698ae71 2428
8f4b0f43 2429 if ( srcLen == wxNO_LEN )
2588ee86 2430 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2431
2432 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2433 size_t outbuflen = dstLen;
36acb880 2434 size_t res, cres;
3a0d76bc 2435
36acb880 2436 wchar_t *tmpbuf = 0;
3caec1bb 2437
36acb880
VZ
2438 if (ms_wcNeedsSwap)
2439 {
2440 // need to copy to temp buffer to switch endianness
51725fc0 2441 // (doing WC_BSWAP twice on the original buffer won't work, as it
36acb880 2442 // could be in read-only memory, or be accessed in some other thread)
51725fc0 2443 tmpbuf = (wchar_t *)malloc(inbuflen);
8f4b0f43
VZ
2444 for ( size_t i = 0; i < srcLen; i++ )
2445 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2446
8f4b0f43 2447 src = tmpbuf;
36acb880 2448 }
3a0d76bc 2449
8f4b0f43
VZ
2450 char* inbuf = (char*)src;
2451 if ( dst )
36acb880
VZ
2452 {
2453 // have destination buffer, convert there
8f4b0f43 2454 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2455
8f4b0f43 2456 res = dstLen - outbuflen;
36acb880 2457 }
8f4b0f43 2458 else // no destination buffer
36acb880 2459 {
8f4b0f43 2460 // convert using temp buffer to calculate the size of the buffer needed
878c265b 2461 char tbuf[256];
36acb880 2462 res = 0;
ef199164
DS
2463 do
2464 {
8f4b0f43 2465 dst = tbuf;
51725fc0 2466 outbuflen = WXSIZEOF(tbuf);
36acb880 2467
8f4b0f43 2468 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2469
51725fc0 2470 res += WXSIZEOF(tbuf) - outbuflen;
ef199164
DS
2471 }
2472 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2473 }
dccce9ea 2474
36acb880
VZ
2475 if (ms_wcNeedsSwap)
2476 {
2477 free(tmpbuf);
2478 }
dccce9ea 2479
e8769ed1 2480 if (ICONV_FAILED(cres, inbuflen))
36acb880 2481 {
ce6f8d6f 2482 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2483 return wxCONV_FAILED;
36acb880
VZ
2484 }
2485
2486 return res;
2487}
2488
7ef3ab50 2489size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2490{
c1464d9d 2491 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2492 {
2493 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2494
2495#if wxUSE_THREADS
2496 // NB: explained in MB2WC
2497 wxMutexLocker lock(self->m_iconvMutex);
2498#endif
2499
999020e1 2500 const wchar_t *wnul = L"";
c1464d9d 2501 char buf[8]; // should be enough for NUL in any encoding
356410fc 2502 size_t inLen = sizeof(wchar_t),
c1464d9d 2503 outLen = WXSIZEOF(buf);
ef199164
DS
2504 char *inBuff = (char *)wnul;
2505 char *outBuff = buf;
2506 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2507 {
c1464d9d 2508 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2509 }
2510 else // ok
2511 {
ef199164 2512 self->m_minMBCharWidth = outBuff - buf;
356410fc 2513 }
eec47cc6
VZ
2514 }
2515
c1464d9d 2516 return m_minMBCharWidth;
eec47cc6
VZ
2517}
2518
ba98e032
VS
2519#if wxUSE_UNICODE_UTF8
2520bool wxMBConv_iconv::IsUTF8() const
2521{
86501081
VS
2522 return wxStricmp(m_name, "UTF-8") == 0 ||
2523 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2524}
2525#endif
2526
b040e242 2527#endif // HAVE_ICONV
36acb880 2528
e95354ec 2529
36acb880
VZ
2530// ============================================================================
2531// Win32 conversion classes
2532// ============================================================================
1cd52418 2533
e95354ec 2534#ifdef wxHAVE_WIN32_MB2WC
373658eb 2535
8b04d4c4 2536// from utils.cpp
d775fa82 2537#if wxUSE_FONTMAP
86501081 2538extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2539extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2540#endif
373658eb 2541
// Conversion between a Windows code page and wchar_t implemented using
// ::MultiByteToWideChar() and ::WideCharToMultiByte().
//
// Both conversion functions treat lossy conversions as failures: either the
// system is asked to detect them or, when this isn't possible, the result is
// converted back and compared with the input.
class wxMBConv_win32 : public wxMBConv
{
public:
    // use the ANSI code page (CP_ACP)
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
        m_minMBCharWidth = 0;
    }

    // copy the code page and the cached NUL length of another converter
    wxMBConv_win32(const wxMBConv_win32& conv)
        : wxMBConv()
    {
        m_CodePage = conv.m_CodePage;
        m_minMBCharWidth = conv.m_minMBCharWidth;
    }

#if wxUSE_FONTMAP
    // use the code page corresponding to the given charset name; check
    // IsOk() to see if one was found
    wxMBConv_win32(const char* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
        m_minMBCharWidth = 0;
    }

    // use the code page corresponding to the given encoding; check IsOk()
    // to see if one was found
    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
        m_minMBCharWidth = 0;
    }
#endif // wxUSE_FONTMAP

    // Convert the NUL-terminated multibyte string psz to wide characters.
    //
    // buf may be NULL, in which case only the length is computed; otherwise
    // n is passed to the system as the size of buf. Returns the length of
    // the result not counting the terminating NUL or wxCONV_FAILED.
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and breaks the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
        // Win XP or newer and it is not supported for UTF-[78] so we always
        // use our own conversions in this case. See
        //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
        //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
        if ( m_CodePage == CP_UTF8 )
        {
            return wxMBConvUTF8().MB2WC(buf, psz, n);
        }

        if ( m_CodePage == CP_UTF7 )
        {
            return wxMBConvUTF7().MB2WC(buf, psz, n);
        }

        // only ask the system to report invalid input for the code pages
        // below 50000 other than CP_SYMBOL and only if the OS is recent
        // enough, otherwise fall back to the round trip check below
        int flags = 0;
        if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
                IsAtLeastWin2kSP4() )
        {
            flags = MB_ERR_INVALID_CHARS;
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                flags,          // flags: fail on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
        // check if we succeeded, by doing a double trip:
        if ( !flags && buf )
        {
            const size_t mbLen = strlen(psz);
            wxCharBuffer mbBuf(mbLen);
            if ( ::WideCharToMultiByte
                   (
                      m_CodePage,
                      0,
                      buf,
                      -1,
                      mbBuf.data(),
                      mbLen + 1,        // size in bytes, not length
                      NULL,
                      NULL
                   ) == 0 ||
                  strcmp(mbBuf, psz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }

    // Convert the NUL-terminated wide string pwz to this code page.
    //
    // buf may be NULL, in which case only the length is computed; otherwise
    // n is passed to the system as the size of buf. Returns the length of
    // the result not counting the terminating NUL or wxCONV_FAILED,
    // including when the string can't be converted without loss.
    virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // we did something, check if we really succeeded
        if ( flags )
        {
            // check if the conversion failed, i.e. if any replacements
            // were done
            if ( usedDef )
                return wxCONV_FAILED;
        }
        else // we must resort to double tripping...
        {
            // first we need to ensure that we really have the MB data: this is
            // not the case if we're called with NULL buffer, in which case we
            // need to do the conversion yet again
            wxCharBuffer bufDef;
            if ( !buf )
            {
                bufDef = wxCharBuffer(len);
                buf = bufDef.data();
                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
                                            buf, len, NULL, NULL) )
                    return wxCONV_FAILED;
            }

            // convert the result back and compare it with the original; if
            // no size was given, the length of the input is used for the
            // temporary wide buffer
            if ( !n )
                n = wcslen(pwz);
            wxWCharBuffer wcBuf(n);
            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
                    wcscmp(wcBuf, pwz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // see the comment above for the reason of "len - 1"
        return len - 1;
    }

    // Return the size in bytes of NUL in this code page (1, 2 or 4) or
    // (size_t)-1 if it couldn't be determined; the value is computed on
    // first use and cached in m_minMBCharWidth (0 meaning "not computed").
    virtual size_t GetMBNulLen() const
    {
        if ( m_minMBCharWidth == 0 )
        {
            // ask for the size needed to convert a single wide NUL
            int len = ::WideCharToMultiByte
                        (
                            m_CodePage,     // code page
                            0,              // no flags
                            L"",            // input string
                            1,              // translate just the NUL
                            NULL,           // output buffer
                            0,              // and its size
                            NULL,           // no replacement char
                            NULL            // [out] don't care if it was used
                        );

            // the cache is updated from this const method
            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
            switch ( len )
            {
                default:
                    wxLogDebug(wxT("Unexpected NUL length %d"), len);
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 0:
                    // the conversion failed
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 1:
                case 2:
                case 4:
                    self->m_minMBCharWidth = len;
                    break;
            }
        }

        return m_minMBCharWidth;
    }

    virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }

    // -1 is the value indicating an invalid code page
    // NOTE(review): presumably this is what wxCharsetToCodepage() and
    // wxEncodingToCodepage() return on failure -- confirm in utils.cpp
    bool IsOk() const { return m_CodePage != -1; }

private:
    // Check whether WC_NO_BEST_FIT_CHARS can be used on this system: this is
    // the case for Windows 9x versions 4.10 and later and for NT versions 5
    // and later. The result is computed once and cached in a static.
    static bool CanUseNoBestFit()
    {
        // -1 means "not determined yet", otherwise 0 or 1
        static int s_isWin98Or2k = -1;

        if ( s_isWin98Or2k == -1 )
        {
            int verMaj, verMin;
            switch ( wxGetOsVersion(&verMaj, &verMin) )
            {
                case wxOS_WINDOWS_9X:
                    s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
                    break;

                case wxOS_WINDOWS_NT:
                    s_isWin98Or2k = verMaj >= 5;
                    break;

                default:
                    // unknown: be conservative by default
                    s_isWin98Or2k = 0;
                    break;
            }

            wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
        }

        return s_isWin98Or2k == 1;
    }

    // Check whether we're running under Windows 2000 SP4 or later, which is
    // needed for MB_ERR_INVALID_CHARS; always false under Windows CE. The
    // result is computed once and cached in a static.
    static bool IsAtLeastWin2kSP4()
    {
#ifdef __WXWINCE__
        return false;
#else
        // -1 means "not determined yet", otherwise 0 or 1
        static int s_isAtLeastWin2kSP4 = -1;

        if ( s_isAtLeastWin2kSP4 == -1 )
        {
            OSVERSIONINFOEX ver;

            memset(&ver, 0, sizeof(ver));
            ver.dwOSVersionInfoSize = sizeof(ver);
            GetVersionEx((OSVERSIONINFO*)&ver);

            s_isAtLeastWin2kSP4 =
              ((ver.dwMajorVersion > 5) || // Vista+
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
               ver.wServicePackMajor >= 4)) // 2000 SP4+
              ? 1 : 0;
        }

        return s_isAtLeastWin2kSP4 == 1;
#endif
    }


    // the code page we're working with
    long m_CodePage;

    // cached result of GetMBNulLen(), set to 0 initially meaning
    // "unknown"
    size_t m_minMBCharWidth;
};
e95354ec
VZ
2849
2850#endif // wxHAVE_WIN32_MB2WC
2851
f7e98dee 2852
36acb880
VZ
2853// ============================================================================
2854// wxEncodingConverter based conversion classes
2855// ============================================================================
2856
1e6feb95 2857#if wxUSE_FONTMAP
1cd52418 2858
e95354ec 2859class wxMBConv_wxwin : public wxMBConv
1cd52418 2860{
8b04d4c4
VZ
2861private:
2862 void Init()
2863 {
6ac84a78
DE
2864 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2865 // The wxMBConv_cf class does a better job.
2866 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2867 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2868 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2869 }
2870
6001e347 2871public:
f1339c56
RR
2872 // temporarily just use wxEncodingConverter stuff,
2873 // so that it works while a better implementation is built
86501081 2874 wxMBConv_wxwin(const char* name)
f1339c56
RR
2875 {
2876 if (name)
267e11c5 2877 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2878 else
2879 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2880
8b04d4c4
VZ
2881 Init();
2882 }
2883
e95354ec 2884 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2885 {
2886 m_enc = enc;
2887
2888 Init();
f1339c56 2889 }
dccce9ea 2890
bde4baac 2891 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2892 {
2893 size_t inbuf = strlen(psz);
dccce9ea 2894 if (buf)
c643a977 2895 {
ef199164 2896 if (!m2w.Convert(psz, buf))
467e0479 2897 return wxCONV_FAILED;
c643a977 2898 }
f1339c56
RR
2899 return inbuf;
2900 }
dccce9ea 2901
bde4baac 2902 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2903 {
f8d791e0 2904 const size_t inbuf = wxWcslen(psz);
f1339c56 2905 if (buf)
c643a977 2906 {
ef199164 2907 if (!w2m.Convert(psz, buf))
467e0479 2908 return wxCONV_FAILED;
c643a977 2909 }
dccce9ea 2910
f1339c56
RR
2911 return inbuf;
2912 }
dccce9ea 2913
7ef3ab50 2914 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2915 {
2916 switch ( m_enc )
2917 {
2918 case wxFONTENCODING_UTF16BE:
2919 case wxFONTENCODING_UTF16LE:
c1464d9d 2920 return 2;
eec47cc6
VZ
2921
2922 case wxFONTENCODING_UTF32BE:
2923 case wxFONTENCODING_UTF32LE:
c1464d9d 2924 return 4;
eec47cc6
VZ
2925
2926 default:
c1464d9d 2927 return 1;
eec47cc6
VZ
2928 }
2929 }
2930
d36c9347
VZ
2931 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2932
7ef3ab50
VZ
2933 bool IsOk() const { return m_ok; }
2934
2935public:
2936 wxFontEncoding m_enc;
2937 wxEncodingConverter m2w, w2m;
2938
2939private:
cafbf6fb
VZ
2940 // were we initialized successfully?
2941 bool m_ok;
fc7a2a60 2942
c0c133e1 2943 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
f6bcfd97 2944};
6001e347 2945
8f115891 2946// make the constructors available for unit testing
86501081 2947WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2948{
2949 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2950 if ( !result->IsOk() )
2951 {
2952 delete result;
2953 return 0;
2954 }
ef199164 2955
8f115891
MW
2956 return result;
2957}
2958
1e6feb95
VZ
2959#endif // wxUSE_FONTMAP
2960
36acb880
VZ
2961// ============================================================================
2962// wxCSConv implementation
2963// ============================================================================
2964
8b04d4c4 2965void wxCSConv::Init()
6001e347 2966{
e95354ec
VZ
2967 m_name = NULL;
2968 m_convReal = NULL;
6c4d607e
VZ
2969}
2970
2971void wxCSConv::SetEncoding(wxFontEncoding encoding)
2972{
2973 switch ( encoding )
2974 {
2975 case wxFONTENCODING_MAX:
2976 case wxFONTENCODING_SYSTEM:
2977 if ( m_name )
2978 {
2979 // It's ok to not have encoding value if we have a name for it.
2980 m_encoding = wxFONTENCODING_SYSTEM;
2981 }
2982 else // No name neither.
2983 {
2984 // Fall back to the system default encoding in this case (not
2985 // sure how much sense does this make but this is how the old
2986 // code used to behave).
2987#if wxUSE_INTL
2988 m_encoding = wxLocale::GetSystemEncoding();
2989 if ( m_encoding == wxFONTENCODING_SYSTEM )
2990#endif // wxUSE_INTL
2991 m_encoding = wxFONTENCODING_ISO8859_1;
2992 }
2993 break;
2994
2995 case wxFONTENCODING_DEFAULT:
2996 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2997 m_encoding = wxFONTENCODING_ISO8859_1;
2998 break;
2999
3000 default:
3001 // Just use the provided encoding.
3002 m_encoding = encoding;
3003 }
e95354ec
VZ
3004}
3005
86501081 3006wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
3007{
3008 Init();
82713003 3009
86501081 3010 if ( !charset.empty() )
e95354ec 3011 {
86501081 3012 SetName(charset.ToAscii());
e95354ec 3013 }
bda3d86a 3014
e4277538 3015#if wxUSE_FONTMAP
6c4d607e 3016 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
e4277538 3017#else
6c4d607e 3018 SetEncoding(wxFONTENCODING_SYSTEM);
e4277538 3019#endif
6c4d607e
VZ
3020
3021 m_convReal = DoCreate();
6001e347
RR
3022}
3023
8b04d4c4
VZ
3024wxCSConv::wxCSConv(wxFontEncoding encoding)
3025{
bda3d86a 3026 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec 3027 {
9a83f860 3028 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
e95354ec
VZ
3029
3030 encoding = wxFONTENCODING_SYSTEM;
3031 }
3032
8b04d4c4
VZ
3033 Init();
3034
6c4d607e
VZ
3035 SetEncoding(encoding);
3036
3037 m_convReal = DoCreate();
8b04d4c4
VZ
3038}
3039
6001e347
RR
// Release the charset name and the real conversion object.
wxCSConv::~wxCSConv()
{
    Clear();
}
3044
// Copy constructor: rather than sharing anything with the other object, make
// our own copy of its name and create our own real conversion object.
wxCSConv::wxCSConv(const wxCSConv& conv)
        : wxMBConv()
{
    Init();

    // the order is important: SetEncoding() checks whether the name is set
    // and DoCreate() uses both the name and the encoding
    SetName(conv.m_name);
    SetEncoding(conv.m_encoding);

    m_convReal = DoCreate();
}
3055
3056wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3057{
3058 Clear();
8b04d4c4 3059
54380f29 3060 SetName(conv.m_name);
6c4d607e
VZ
3061 SetEncoding(conv.m_encoding);
3062
3063 m_convReal = DoCreate();
8b04d4c4 3064
54380f29
GD
3065 return *this;
3066}
3067
65e50848
JS
3068void wxCSConv::Clear()
3069{
8b04d4c4 3070 free(m_name);
65e50848 3071 m_name = NULL;
6c4d607e
VZ
3072
3073 wxDELETE(m_convReal);
6001e347
RR
3074}
3075
86501081 3076void wxCSConv::SetName(const char *charset)
6001e347 3077{
6c4d607e 3078 if ( charset )
d6f2a891 3079 m_name = wxStrdup(charset);
6001e347
RR
3080}
3081
#if wxUSE_FONTMAP

// Map from an encoding to a charset name, used by wxCSConv::DoCreate() to
// remember the name which iconv accepted for this encoding; an empty string
// is stored when none of the known names worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// the cache itself, local to this file
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
3089
e95354ec
VZ
3090wxMBConv *wxCSConv::DoCreate() const
3091{
ce6f8d6f
VZ
3092#if wxUSE_FONTMAP
3093 wxLogTrace(TRACE_STRCONV,
3094 wxT("creating conversion for %s"),
3095 (m_name ? m_name
86501081 3096 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
3097#endif // wxUSE_FONTMAP
3098
c547282d
VZ
3099 // check for the special case of ASCII or ISO8859-1 charset: as we have
3100 // special knowledge of it anyhow, we don't need to create a special
3101 // conversion object
6c4d607e 3102 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
f1339c56 3103 {
e95354ec
VZ
3104 // don't convert at all
3105 return NULL;
3106 }
dccce9ea 3107
e95354ec
VZ
3108 // we trust OS to do conversion better than we can so try external
3109 // conversion methods first
3110 //
3111 // the full order is:
3112 // 1. OS conversion (iconv() under Unix or Win32 API)
3113 // 2. hard coded conversions for UTF
3114 // 3. wxEncodingConverter as fall back
3115
3116 // step (1)
3117#ifdef HAVE_ICONV
c547282d 3118#if !wxUSE_FONTMAP
e95354ec 3119 if ( m_name )
c547282d 3120#endif // !wxUSE_FONTMAP
e95354ec 3121 {
3ef10cfc 3122#if wxUSE_FONTMAP
8b3eb85d 3123 wxFontEncoding encoding(m_encoding);
3ef10cfc 3124#endif
8b3eb85d 3125
86501081 3126 if ( m_name )
8b3eb85d 3127 {
86501081 3128 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
3129 if ( conv->IsOk() )
3130 return conv;
3131
3132 delete conv;
c547282d
VZ
3133
3134#if wxUSE_FONTMAP
8b3eb85d 3135 encoding =
86501081 3136 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3137#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3138 }
3139#if wxUSE_FONTMAP
3140 {
3141 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3142 if ( it != gs_nameCache.end() )
3143 {
3144 if ( it->second.empty() )
3145 return NULL;
c547282d 3146
86501081 3147 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
3148 if ( conv->IsOk() )
3149 return conv;
e95354ec 3150
8b3eb85d
VZ
3151 delete conv;
3152 }
3153
a243da29 3154 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
3155 // CS : in case this does not return valid names (eg for MacRoman)
3156 // encoding got a 'failure' entry in the cache all the same,
3157 // although it just has to be created using a different method, so
3158 // only store failed iconv creation attempts (or perhaps we
3159 // shoulnd't do this at all ?)
3c67ec06 3160 if ( names[0] != NULL )
8b3eb85d 3161 {
3c67ec06 3162 for ( ; *names; ++names )
8b3eb85d 3163 {
86501081
VS
3164 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3165 // will need changes that will obsolete this
3166 wxString name(*names);
3167 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
3168 if ( conv->IsOk() )
3169 {
3170 gs_nameCache[encoding] = *names;
3171 return conv;
3172 }
3173
3174 delete conv;
8b3eb85d
VZ
3175 }
3176
9a83f860 3177 gs_nameCache[encoding] = wxT(""); // cache the failure
8b3eb85d 3178 }
8b3eb85d
VZ
3179 }
3180#endif // wxUSE_FONTMAP
e95354ec
VZ
3181 }
3182#endif // HAVE_ICONV
3183
3184#ifdef wxHAVE_WIN32_MB2WC
3185 {
7608a683 3186#if wxUSE_FONTMAP
e95354ec
VZ
3187 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3188 : new wxMBConv_win32(m_encoding);
3189 if ( conv->IsOk() )
3190 return conv;
3191
3192 delete conv;
7608a683
WS
3193#else
3194 return NULL;
3195#endif
e95354ec
VZ
3196 }
3197#endif // wxHAVE_WIN32_MB2WC
ef199164 3198
5c4ed98d 3199#ifdef __DARWIN__
f7e98dee 3200 {
6ff49cbc
DE
3201 // leave UTF16 and UTF32 to the built-ins of wx
3202 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3203 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3204 {
a6900d10 3205#if wxUSE_FONTMAP
5c4ed98d
DE
3206 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3207 : new wxMBConv_cf(m_encoding);
a6900d10 3208#else
5c4ed98d 3209 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3210#endif
ef199164 3211
f7e98dee 3212 if ( conv->IsOk() )
d775fa82
WS
3213 return conv;
3214
3215 delete conv;
3216 }
335d31e0 3217 }
5c4ed98d
DE
3218#endif // __DARWIN__
3219
e95354ec
VZ
3220 // step (2)
3221 wxFontEncoding enc = m_encoding;
3222#if wxUSE_FONTMAP
c547282d
VZ
3223 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3224 {
3225 // use "false" to suppress interactive dialogs -- we can be called from
3226 // anywhere and popping up a dialog from here is the last thing we want to
3227 // do
267e11c5 3228 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3229 }
e95354ec
VZ
3230#endif // wxUSE_FONTMAP
3231
3232 switch ( enc )
3233 {
3234 case wxFONTENCODING_UTF7:
3235 return new wxMBConvUTF7;
3236
3237 case wxFONTENCODING_UTF8:
3238 return new wxMBConvUTF8;
3239
e95354ec
VZ
3240 case wxFONTENCODING_UTF16BE:
3241 return new wxMBConvUTF16BE;
3242
3243 case wxFONTENCODING_UTF16LE:
3244 return new wxMBConvUTF16LE;
3245
e95354ec
VZ
3246 case wxFONTENCODING_UTF32BE:
3247 return new wxMBConvUTF32BE;
3248
3249 case wxFONTENCODING_UTF32LE:
3250 return new wxMBConvUTF32LE;
3251
3252 default:
3253 // nothing to do but put here to suppress gcc warnings
ef199164 3254 break;
e95354ec
VZ
3255 }
3256
3257 // step (3)
3258#if wxUSE_FONTMAP
3259 {
3260 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3261 : new wxMBConv_wxwin(m_encoding);
3262 if ( conv->IsOk() )
3263 return conv;
3264
3265 delete conv;
3266 }
ef199164 3267
3df31b2d
VZ
3268 wxLogTrace(TRACE_STRCONV,
3269 wxT("encoding \"%s\" is not supported by this system"),
ef6cef09 3270 (m_name ? wxString(m_name)
3df31b2d
VZ
3271 : wxFontMapperBase::GetEncodingName(m_encoding)));
3272#endif // wxUSE_FONTMAP
e95354ec
VZ
3273
3274 return NULL;
3275}
3276
0f0298b1
VZ
3277bool wxCSConv::IsOk() const
3278{
0f0298b1
VZ
3279 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3280 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3281 return true; // always ok as we do it ourselves
3282
3283 // m_convReal->IsOk() is called at its own creation, so we know it must
3284 // be ok if m_convReal is non-NULL
3285 return m_convReal != NULL;
3286}
3287
1c714a5d
VZ
3288size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3289 const char *src, size_t srcLen) const
3290{
2c74c558
VS
3291 if (m_convReal)
3292 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3293
3294 // latin-1 (direct)
05392dc8
VZ
3295 if ( srcLen == wxNO_LEN )
3296 srcLen = strlen(src) + 1; // take trailing NUL too
1c714a5d 3297
05392dc8
VZ
3298 if ( dst )
3299 {
3300 if ( dstLen < srcLen )
3301 return wxCONV_FAILED;
1c714a5d 3302
05392dc8
VZ
3303 for ( size_t n = 0; n < srcLen; n++ )
3304 dst[n] = (unsigned char)(src[n]);
3305 }
2c74c558 3306
05392dc8 3307 return srcLen;
1c714a5d
VZ
3308}
3309
05392dc8
VZ
3310size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3311 const wchar_t *src, size_t srcLen) const
6001e347 3312{
e95354ec 3313 if (m_convReal)
05392dc8 3314 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
f1339c56
RR
3315
3316 // latin-1 (direct)
05392dc8
VZ
3317 if ( srcLen == wxNO_LEN )
3318 srcLen = wxWcslen(src) + 1;
dccce9ea 3319
05392dc8 3320 if ( dst )
f1339c56 3321 {
05392dc8
VZ
3322 if ( dstLen < srcLen )
3323 return wxCONV_FAILED;
1cd52418 3324
05392dc8 3325 for ( size_t n = 0; n < srcLen; n++ )
24642831 3326 {
05392dc8 3327 if ( src[n] > 0xFF )
467e0479 3328 return wxCONV_FAILED;
ef199164 3329
05392dc8 3330 dst[n] = (char)src[n];
24642831 3331 }
05392dc8 3332
24642831 3333 }
05392dc8 3334 else // still need to check the input validity
24642831 3335 {
05392dc8 3336 for ( size_t n = 0; n < srcLen; n++ )
24642831 3337 {
05392dc8 3338 if ( src[n] > 0xFF )
467e0479 3339 return wxCONV_FAILED;
24642831 3340 }
f1339c56 3341 }
dccce9ea 3342
05392dc8 3343 return srcLen;
6001e347
RR
3344}
3345
7ef3ab50 3346size_t wxCSConv::GetMBNulLen() const
eec47cc6 3347{
eec47cc6 3348 if ( m_convReal )
7ef3ab50 3349 return m_convReal->GetMBNulLen();
eec47cc6 3350
ba98e032 3351 // otherwise, we are ISO-8859-1
c1464d9d 3352 return 1;
eec47cc6
VZ
3353}
3354
ba98e032
VS
3355#if wxUSE_UNICODE_UTF8
3356bool wxCSConv::IsUTF8() const
3357{
ba98e032 3358 if ( m_convReal )
ba98e032 3359 return m_convReal->IsUTF8();
ba98e032
VS
3360
3361 // otherwise, we are ISO-8859-1
3362 return false;
3363}
3364#endif
3365
69c928ef
VZ
3366
3367#if wxUSE_UNICODE
3368
3369wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3370{
3371 if ( !s )
3372 return wxWCharBuffer();
3373
3374 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3375 if ( !wbuf )
5487ff0f 3376 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3377 if ( !wbuf )
3378 wbuf = wxConvISO8859_1.cMB2WX(s);
3379
3380 return wbuf;
3381}
3382
3383wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3384{
3385 if ( !ws )
3386 return wxCharBuffer();
3387
3388 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3389 if ( !buf )
3390 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3391
3392 return buf;
3393}
3394
3395#endif // wxUSE_UNICODE
f5a1953b 3396
1e50d914
VS
3397// ----------------------------------------------------------------------------
3398// globals
3399// ----------------------------------------------------------------------------
3400
3401// NB: The reason why we create converter objects in this convoluted way,
3402// using a factory function instead of global variable, is that they
3403// may be used at static initialization time (some of them are used by
3404// wxString ctors and there may be a global wxString object). In other
3405// words, possibly _before_ the converter global object would be
3406// initialized.
3407
3408#undef wxConvLibc
3409#undef wxConvUTF8
3410#undef wxConvUTF7
3411#undef wxConvLocal
3412#undef wxConvISO8859_1
3413
// Define the global converter "name": a pointer variable name##Ptr, the
// accessor function wxGet_##name##Ptr() returning the address of a function
// static object of type impl_klass constructed with ctor_args, and a file
// static variable initialized by calling this accessor.
//
// The last variable ensures that all global converter objects are created
// by the time static initialization is done, i.e. before any thread is
// launched.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object has the same type as
// the one exposed by the accessor.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3428
5c69ef61
VZ
3429#ifdef __INTELC__
3430 // disable warning "variable 'xxx' was declared but never referenced"
3431 #pragma warning(disable: 177)
3432#endif // Intel C++
3433
1e50d914
VS
3434#ifdef __WINDOWS__
3435 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
c45fad9a
SC
3436#elif 0 // defined(__WXOSX__)
3437 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
1e50d914
VS
3438#else
3439 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3440#endif
3441
e1079eda
VZ
3442// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3443// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3444// provokes an error message about "not enough macro parameters"; and we
3445// can't use "()" here as the name##Obj declaration would be parsed as a
3446// function declaration then, so use a semicolon and live with an extra
3447// empty statement (and hope that no compilers warns about this)
3448WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3449WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3450
3451WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3452WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3453
3454WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3455WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3456
6ac84a78 3457#ifdef __DARWIN__
8244507f
VZ
3458// It is important to use this conversion object under Darwin as it ensures
3459// that Unicode strings are (re)composed correctly even though xnu kernel uses
3460// decomposed form internally (at least for the file names).
6ac84a78 3461static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3462#endif
6ac84a78 3463
1e50d914 3464WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3465#ifdef __DARWIN__
1e50d914 3466 &wxConvMacUTF8DObj;
6ac84a78 3467#else // !__DARWIN__
1e50d914 3468 wxGet_wxConvLibcPtr();
6ac84a78 3469#endif // __DARWIN__/!__DARWIN__