]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
iphone additions
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
b040e242 47#ifdef HAVE_ICONV
373658eb 48 #include <iconv.h>
b1d547eb 49 #include "wx/thread.h"
1cd52418 50#endif
1cd52418 51
373658eb
VZ
52#include "wx/encconv.h"
53#include "wx/fontmap.h"
54
5c4ed98d 55#ifdef __DARWIN__
e4dd1e19 56#include "wx/mac/corefoundation/private/strconv_cf.h"
5c4ed98d
DE
57#endif //def __DARWIN__
58
ef199164 59
ce6f8d6f
VZ
60#define TRACE_STRCONV _T("strconv")
61
467e0479
VZ
62// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63// be 4 bytes
4948c2b6 64#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
65 #define WC_UTF16
66#endif
67
ef199164 68
373658eb
VZ
69// ============================================================================
70// implementation
71// ============================================================================
72
69373110
VZ
// helper function of cMB2WC(): returns true if at least one of the n bytes
// starting at p is not NUL, and false if all of them are NUL (which is
// trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        // the first non-NUL byte settles the question, no need to look further
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
373658eb 82// ----------------------------------------------------------------------------
467e0479 83// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 84// ----------------------------------------------------------------------------
6001e347 85
c91830cb 86static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 87{
ef199164 88 if (input <= 0xffff)
4def3b35 89 {
999836aa
VZ
90 if (output)
91 *output = (wxUint16) input;
ef199164 92
4def3b35 93 return 1;
dccce9ea 94 }
ef199164 95 else if (input >= 0x110000)
4def3b35 96 {
467e0479 97 return wxCONV_FAILED;
dccce9ea
VZ
98 }
99 else
4def3b35 100 {
dccce9ea 101 if (output)
4def3b35 102 {
ef199164
DS
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 105 }
ef199164 106
4def3b35 107 return 2;
1cd52418 108 }
1cd52418
OK
109}
110
c91830cb 111static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 112{
ef199164 113 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
114 {
115 output = *input;
116 return 1;
dccce9ea 117 }
ef199164 118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
119 {
120 output = *input;
467e0479 121 return wxCONV_FAILED;
dccce9ea
VZ
122 }
123 else
4def3b35
VS
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
1cd52418
OK
128}
129
467e0479 130#ifdef WC_UTF16
35d11700
VZ
131 typedef wchar_t wxDecodeSurrogate_t;
132#else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
135
136// returns the next UTF-32 character from the wchar_t buffer and advances the
137// pointer to the character after this one
138//
139// if an invalid character is found, *pSrc is set to NULL, the caller must
140// check for this
35d11700 141static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
142{
143 wxUint32 out;
8d3dd069
VZ
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152}
153
f6bcfd97 154// ----------------------------------------------------------------------------
6001e347 155// wxMBConv
f6bcfd97 156// ----------------------------------------------------------------------------
2c53a80a 157
483b0434
VZ
158size_t
159wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
6001e347 161{
483b0434
VZ
162 // although new conversion classes are supposed to implement this function
163 // directly, the existins ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
6001e347 168
483b0434
VZ
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten = 0;
eec47cc6 171
c1464d9d 172 // the number of NULs terminating this string
a78c43f1 173 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 174
c1464d9d
VZ
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
178 // NULs at the end
483b0434
VZ
179 wxCharBuffer bufTmp;
180 const char *srcEnd;
467e0479 181 if ( srcLen != wxNO_LEN )
eec47cc6 182 {
c1464d9d 183 // we need to know how to find the end of this string
7ef3ab50 184 nulLen = GetMBNulLen();
483b0434
VZ
185 if ( nulLen == wxCONV_FAILED )
186 return wxCONV_FAILED;
e4e3bbb4 187
c1464d9d 188 // if there are enough NULs we can avoid the copy
483b0434 189 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
190 {
191 // make a copy in order to properly NUL-terminate the string
483b0434 192 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 193 char * const p = bufTmp.data();
483b0434
VZ
194 memcpy(p, src, srcLen);
195 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 196 *s = '\0';
483b0434
VZ
197
198 src = bufTmp;
eec47cc6 199 }
e4e3bbb4 200
483b0434
VZ
201 srcEnd = src + srcLen;
202 }
203 else // quit after the first loop iteration
204 {
205 srcEnd = NULL;
206 }
e4e3bbb4 207
483b0434 208 for ( ;; )
eec47cc6 209 {
c1464d9d 210 // try to convert the current chunk
483b0434 211 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
e4e3bbb4 214
467e0479 215 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 216
483b0434 217 dstWritten += lenChunk;
f5fb6871 218
467e0479
VZ
219 if ( lenChunk == 1 )
220 {
221 // nothing left in the input string, conversion succeeded
222 break;
223 }
224
483b0434
VZ
225 if ( dst )
226 {
227 if ( dstWritten > dstLen )
228 return wxCONV_FAILED;
229
830f8f11 230 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
231 return wxCONV_FAILED;
232
233 dst += lenChunk;
234 }
c1464d9d 235
483b0434 236 if ( !srcEnd )
c1464d9d 237 {
467e0479
VZ
238 // we convert just one chunk in this case as this is the entire
239 // string anyhow
c1464d9d
VZ
240 break;
241 }
eec47cc6
VZ
242
243 // advance the input pointer past the end of this chunk
483b0434 244 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
245 {
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
483b0434 250 src += nulLen;
c1464d9d 251 }
e4e3bbb4 252
483b0434 253 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
254
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by inEnd
483b0434 258 if ( src >= srcEnd )
c1464d9d
VZ
259 break;
260 }
261
483b0434 262 return dstWritten;
e4e3bbb4
RN
263}
264
483b0434
VZ
265size_t
266wxMBConv::FromWChar(char *dst, size_t dstLen,
267 const wchar_t *src, size_t srcLen) const
e4e3bbb4 268{
483b0434
VZ
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten = 0;
e4e3bbb4 271
eec47cc6
VZ
272 // make a copy of the input string unless it is already properly
273 // NUL-terminated
274 //
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp;
467e0479 278 if ( srcLen == wxNO_LEN )
e4e3bbb4 279 {
483b0434 280 srcLen = wxWcslen(src) + 1;
eec47cc6 281 }
483b0434 282 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
283 {
284 // make a copy in order to properly NUL-terminate the string
483b0434 285 bufTmp = wxWCharBuffer(srcLen);
ef199164 286 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
287 src = bufTmp;
288 }
289
290 const size_t lenNul = GetMBNulLen();
291 for ( const wchar_t * const srcEnd = src + srcLen;
292 src < srcEnd;
293 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
294 {
295 // try to convert the current chunk
296 size_t lenChunk = WC2MB(NULL, src, 0);
297
298 if ( lenChunk == wxCONV_FAILED )
299 return wxCONV_FAILED;
300
301 lenChunk += lenNul;
302 dstWritten += lenChunk;
303
304 if ( dst )
305 {
306 if ( dstWritten > dstLen )
307 return wxCONV_FAILED;
308
309 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 dst += lenChunk;
313 }
eec47cc6 314 }
e4e3bbb4 315
483b0434
VZ
316 return dstWritten;
317}
318
ef199164 319size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 320{
ef199164 321 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 322 if ( rc != wxCONV_FAILED )
509da451
VZ
323 {
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
326 rc--;
327 }
328
329 return rc;
330}
331
ef199164 332size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 333{
ef199164 334 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 335 if ( rc != wxCONV_FAILED )
509da451
VZ
336 {
337 rc -= GetMBNulLen();
338 }
339
340 return rc;
341}
342
483b0434
VZ
343wxMBConv::~wxMBConv()
344{
345 // nothing to do here (necessary for Darwin linking probably)
346}
e4e3bbb4 347
483b0434
VZ
348const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
349{
350 if ( psz )
eec47cc6 351 {
483b0434 352 // calculate the length of the buffer needed first
a2db25a1 353 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 354 if ( nLen != wxCONV_FAILED )
f5fb6871 355 {
483b0434 356 // now do the actual conversion
a2db25a1 357 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 358
483b0434 359 // +1 for the trailing NULL
a2db25a1 360 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 361 return buf;
f5fb6871 362 }
483b0434 363 }
e4e3bbb4 364
483b0434
VZ
365 return wxWCharBuffer();
366}
3698ae71 367
483b0434
VZ
368const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
369{
370 if ( pwz )
371 {
a2db25a1 372 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 373 if ( nLen != wxCONV_FAILED )
483b0434 374 {
a2db25a1
VZ
375 wxCharBuffer buf(nLen - 1);
376 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
377 return buf;
378 }
379 }
380
381 return wxCharBuffer();
382}
e4e3bbb4 383
483b0434 384const wxWCharBuffer
ef199164 385wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 386{
ef199164 387 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 388 if ( dstLen != wxCONV_FAILED )
483b0434 389 {
0dd13d21
VZ
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer wbuf(dstLen);
00ceccee 394 wbuf.data()[dstLen - 1] = L'\0';
ef199164 395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
396 {
397 if ( outLen )
467e0479
VZ
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
483b0434
VZ
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412}
413
414const wxCharBuffer
ef199164 415wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 416{
13d92ad6 417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 418 if ( dstLen != wxCONV_FAILED )
483b0434 419 {
0dd13d21
VZ
420 const size_t nulLen = GetMBNulLen();
421
422 // as above, ensure that the buffer is always NUL-terminated, even if
423 // the input is not
424 wxCharBuffer buf(dstLen + nulLen - 1);
425 memset(buf.data() + dstLen, 0, nulLen);
ef199164 426 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
427 {
428 if ( outLen )
467e0479
VZ
429 {
430 *outLen = dstLen;
431
13d92ad6
VZ
432 if ( dstLen >= nulLen &&
433 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
467e0479
VZ
434 {
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
13d92ad6 437 *outLen -= nulLen;
467e0479
VZ
438 }
439 }
d32a507d 440
483b0434
VZ
441 return buf;
442 }
e4e3bbb4
RN
443 }
444
eec47cc6
VZ
445 if ( outLen )
446 *outLen = 0;
447
448 return wxCharBuffer();
e4e3bbb4
RN
449}
450
6001e347 451// ----------------------------------------------------------------------------
bde4baac 452// wxMBConvLibc
6001e347
RR
453// ----------------------------------------------------------------------------
454
bde4baac
VZ
455size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
456{
457 return wxMB2WC(buf, psz, n);
458}
459
460size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
461{
462 return wxWC2MB(buf, psz, n);
463}
e1bfe89e
RR
464
465// ----------------------------------------------------------------------------
532d575b 466// wxConvBrokenFileNames
e1bfe89e
RR
467// ----------------------------------------------------------------------------
468
eec47cc6
VZ
#ifdef __UNIX__

// Creates the conversion object actually used for the given charset: UTF-8
// (compared case-insensitively, with or without the dash) gets a UTF-8
// converter created with the MAP_INVALID_UTF8_TO_PUA option, any other
// charset a generic wxCSConv.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    const bool isUTF8 = wxStricmp(charset, _T("UTF-8")) == 0 ||
                        wxStricmp(charset, _T("UTF8")) == 0;

    if ( isUTF8 )
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
    else
        m_conv = new wxCSConv(charset);
}

#endif // __UNIX__
c12b7f79 481
bde4baac 482// ----------------------------------------------------------------------------
3698ae71 483// UTF-7
bde4baac 484// ----------------------------------------------------------------------------
6001e347 485
15f2ee32 486// Implementation (C) 2004 Fredrik Roubert
9d653e81
VZ
487//
488// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
6001e347 489
15f2ee32
RN
//
// BASE64 decoding table: maps each byte value to the 6 bit value of the
// corresponding BASE64 digit, or to 0xff if the byte is not a BASE64 digit
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 00..07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 08..0F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 10..17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 18..1F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 20..27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 28..2F: '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // 30..37: '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 38..3F: '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 40..47: 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 48..4F: 'H'..'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 50..57: 'P'..'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 58..5F: 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 60..67: 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 68..6F: 'h'..'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 70..77: 'p'..'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 78..7F: 'x'..'z'

    // there are no BASE64 digits among the non-ASCII bytes
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 80..87
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 88..8F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 90..97
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 98..9F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // A0..A7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // A8..AF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // B0..B7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // B8..BF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // C0..C7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // C8..CF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // D0..D7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // D8..DF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // E0..E7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // E8..EF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // F0..F7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // F8..FF
};
528
9d653e81
VZ
529size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
530 const char *src, size_t srcLen) const
15f2ee32 531{
9d653e81
VZ
532 DecoderState stateOrig,
533 *statePtr;
534 if ( srcLen == wxNO_LEN )
535 {
536 // convert the entire string, up to and including the trailing NUL
537 srcLen = strlen(src) + 1;
538
539 // when working on the entire strings we don't update nor use the shift
540 // state from the previous call
541 statePtr = &stateOrig;
542 }
543 else // when working with partial strings we do use the shift state
544 {
545 statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
546
547 // also save the old state to be able to rollback to it on error
548 stateOrig = m_stateDecoder;
549 }
550
551 // but to simplify the code below we use this variable in both cases
552 DecoderState& state = *statePtr;
553
554
555 // number of characters [which would have been] written to dst [if it were
556 // not NULL]
15f2ee32
RN
557 size_t len = 0;
558
9d653e81
VZ
559 const char * const srcEnd = src + srcLen;
560
561 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
15f2ee32 562 {
9d653e81
VZ
563 const unsigned char cc = *src++;
564
565 if ( state.IsShifted() )
15f2ee32 566 {
9d653e81
VZ
567 const unsigned char dc = utf7unb64[cc];
568 if ( dc == 0xff )
15f2ee32 569 {
9d653e81
VZ
570 // end of encoded part
571 state.ToDirect();
572
573 // re-parse this character normally below unless it's '-' which
574 // is consumed by the decoder
575 if ( cc == '-' )
576 continue;
577 }
578 else // valid encoded character
579 {
580 // mini base64 decoder: each character is 6 bits
581 state.bit += 6;
582 state.accum <<= 6;
583 state.accum += dc;
584
585 if ( state.bit >= 8 )
15f2ee32 586 {
9d653e81
VZ
587 // got the full byte, consume it
588 state.bit -= 8;
589 unsigned char b = (state.accum >> state.bit) & 0x00ff;
590
591 if ( state.isLSB )
15f2ee32 592 {
9d653e81
VZ
593 // we've got the full word, output it
594 if ( dst )
595 *dst++ = (state.msb << 8) | b;
596 len++;
597 state.isLSB = false;
15f2ee32 598 }
9d653e81 599 else // MSB
04a37834 600 {
9d653e81
VZ
601 // just store it while we wait for LSB
602 state.msb = b;
603 state.isLSB = true;
04a37834 604 }
15f2ee32
RN
605 }
606 }
9d653e81 607 }
04a37834 608
9d653e81
VZ
609 if ( state.IsDirect() )
610 {
611 // start of an encoded segment?
612 if ( cc == '+' )
04a37834 613 {
9d653e81
VZ
614 if ( src == srcEnd )
615 return wxCONV_FAILED; // can't have '+' at the end
04a37834 616
9d653e81
VZ
617 if ( *src == '-' )
618 {
619 // just the encoded plus sign, don't switch to shifted mode
620 if ( dst )
621 *dst++ = '+';
622 len++;
623 src++;
624 }
625 else
626 {
627 state.ToShifted();
628 }
629 }
630 else // not '+'
631 {
632 // only printable 7 bit ASCII characters (with the exception of
633 // NUL, TAB, CR and LF) can be used directly
634 if ( cc >= 0x7f || (cc < ' ' &&
635 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
636 return wxCONV_FAILED;
637
638 if ( dst )
639 *dst++ = cc;
640 len++;
641 }
15f2ee32
RN
642 }
643 }
04a37834 644
9d653e81
VZ
645 if ( !len )
646 {
647 // as we didn't read any characters we should be called with the same
648 // data (followed by some more new data) again later so don't save our
649 // state
650 state = stateOrig;
651
652 return wxCONV_FAILED;
653 }
04a37834 654
15f2ee32 655 return len;
6001e347
RR
656}
657
15f2ee32
RN
//
// BASE64 encoding table: maps each 6 bit value to the character used to
// represent it
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', //  0..7
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', //  8..15
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', // 16..23
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', // 24..31
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', // 32..39
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v', // 40..47
    'w', 'x', 'y', 'z', '0', '1', '2', '3', // 48..55
    '4', '5', '6', '7', '8', '9', '+', '/'  // 56..63
};
672
//
// UTF-7 encoding table giving the class of each of the 128 ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 70..7F
};
692
9d653e81
VZ
693static inline bool wxIsUTF7Direct(wchar_t wc)
694{
695 return wc < 0x80 && utf7encode[wc] < 1;
696}
697
698size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
699 const wchar_t *src, size_t srcLen) const
15f2ee32 700{
9d653e81
VZ
701 EncoderState stateOrig,
702 *statePtr;
703 if ( srcLen == wxNO_LEN )
704 {
705 // we don't apply the stored state when operating on entire strings at
706 // once
707 statePtr = &stateOrig;
708
709 srcLen = wxWcslen(src) + 1;
710 }
711 else // do use the mode we left the output in previously
712 {
713 stateOrig = m_stateEncoder;
714 statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
715 }
716
717 EncoderState& state = *statePtr;
718
719
15f2ee32
RN
720 size_t len = 0;
721
9d653e81
VZ
722 const wchar_t * const srcEnd = src + srcLen;
723 while ( src < srcEnd && (!dst || len < dstLen) )
15f2ee32 724 {
9d653e81
VZ
725 wchar_t cc = *src++;
726 if ( wxIsUTF7Direct(cc) )
15f2ee32 727 {
9d653e81
VZ
728 if ( state.IsShifted() )
729 {
730 // pad with zeros the last encoded block if necessary
731 if ( state.bit )
732 {
733 if ( dst )
734 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
735 len++;
736 }
ef199164 737
9d653e81
VZ
738 state.ToDirect();
739
740 if ( dst )
741 *dst++ = '-';
742 len++;
743 }
744
745 if ( dst )
746 *dst++ = (char)cc;
15f2ee32
RN
747 len++;
748 }
9d653e81
VZ
749 else if ( cc == '+' && state.IsDirect() )
750 {
751 if ( dst )
752 {
753 *dst++ = '+';
754 *dst++ = '-';
755 }
756
757 len += 2;
758 }
15f2ee32 759#ifndef WC_UTF16
79c78d42 760 else if (((wxUint32)cc) > 0xffff)
b2c13097 761 {
15f2ee32 762 // no surrogate pair generation (yet?)
467e0479 763 return wxCONV_FAILED;
15f2ee32
RN
764 }
765#endif
766 else
767 {
9d653e81
VZ
768 if ( state.IsDirect() )
769 {
770 state.ToShifted();
ef199164 771
9d653e81
VZ
772 if ( dst )
773 *dst++ = '+';
774 len++;
775 }
776
777 // BASE64 encode string
778 for ( ;; )
15f2ee32 779 {
9d653e81 780 for ( unsigned lsb = 0; lsb < 2; lsb++ )
15f2ee32 781 {
9d653e81
VZ
782 state.accum <<= 8;
783 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
784
785 for (state.bit += 8; state.bit >= 6; )
15f2ee32 786 {
9d653e81
VZ
787 state.bit -= 6;
788 if ( dst )
789 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
790 len++;
15f2ee32 791 }
15f2ee32 792 }
ef199164 793
9d653e81
VZ
794 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
795 break;
ef199164 796
9d653e81 797 src++;
15f2ee32 798 }
15f2ee32
RN
799 }
800 }
ef199164 801
9d653e81
VZ
802 // we need to restore the original encoder state if we were called just to
803 // calculate the amount of space needed as we will presumably be called
804 // again to really convert the data now
805 if ( !dst )
806 state = stateOrig;
ef199164 807
15f2ee32 808 return len;
6001e347
RR
809}
810
f6bcfd97 811// ----------------------------------------------------------------------------
6001e347 812// UTF-8
f6bcfd97 813// ----------------------------------------------------------------------------
6001e347 814
1774c3c5 815static const wxUint32 utf8_max[]=
4def3b35 816 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 817
3698ae71
VZ
818// boundaries of the private use area we use to (temporarily) remap invalid
819// characters invalid in a UTF-8 encoded string
ea8ce907
RR
820const wxUint32 wxUnicodePUA = 0x100000;
821const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
822
// this table gives the total length in bytes of a UTF-8 sequence from the
// value of its first byte, with 0 indicating that the byte can't start a
// valid sequence:
const unsigned char tableUtf8Lengths[256] = {
    // ASCII characters are encoded as a single byte:
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these bytes are invalid as the first byte of a sequence:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // lead bytes of the two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // lead bytes of the three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // lead bytes of the four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // the rest is invalid again (these would be 5- or 6-byte sequences and
    // sequences for code points above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
857
858size_t
859wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
860 const char *src, size_t srcLen) const
861{
862 wchar_t *out = dstLen ? dst : NULL;
863 size_t written = 0;
864
865 if ( srcLen == wxNO_LEN )
866 srcLen = strlen(src) + 1;
867
868 for ( const char *p = src; ; p++ )
869 {
870 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
871 {
872 // all done successfully, just add the trailing NULL if we are not
873 // using explicit length
874 if ( srcLen == wxNO_LEN )
875 {
876 if ( out )
877 {
878 if ( !dstLen )
879 break;
880
881 *out = L'\0';
882 }
883
884 written++;
885 }
886
887 return written;
888 }
889
0286d08d
VZ
890 if ( out && !dstLen-- )
891 break;
892
5367a38a
VS
893 wxUint32 code;
894 unsigned char c = *p;
0286d08d 895
5367a38a
VS
896 if ( c < 0x80 )
897 {
898 if ( srcLen == 0 ) // the test works for wxNO_LEN too
899 break;
0286d08d 900
5367a38a
VS
901 if ( srcLen != wxNO_LEN )
902 srcLen--;
0286d08d 903
5367a38a
VS
904 code = c;
905 }
906 else
0286d08d 907 {
5367a38a
VS
908 unsigned len = tableUtf8Lengths[c];
909 if ( !len )
910 break;
911
912 if ( srcLen < len ) // the test works for wxNO_LEN too
913 break;
914
915 if ( srcLen != wxNO_LEN )
916 srcLen -= len;
917
918 // Char. number range | UTF-8 octet sequence
919 // (hexadecimal) | (binary)
920 // ----------------------+----------------------------------------
921 // 0000 0000 - 0000 007F | 0xxxxxxx
922 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
923 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
924 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
925 //
926 // Code point value is stored in bits marked with 'x',
927 // lowest-order bit of the value on the right side in the diagram
928 // above. (from RFC 3629)
929
930 // mask to extract lead byte's value ('x' bits above), by sequence
931 // length:
932 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
933
934 // mask and value of lead byte's most significant bits, by length:
935 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
936 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
937
938 len--; // it's more convenient to work with 0-based length here
939
940 // extract the lead byte's value bits:
941 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
942 break;
943
944 code = c & leadValueMask[len];
945
946 // all remaining bytes, if any, are handled in the same way
947 // regardless of sequence's length:
948 for ( ; len; --len )
949 {
950 c = *++p;
951 if ( (c & 0xC0) != 0x80 )
952 return wxCONV_FAILED;
0286d08d 953
5367a38a
VS
954 code <<= 6;
955 code |= c & 0x3F;
956 }
0286d08d
VZ
957 }
958
959#ifdef WC_UTF16
960 // cast is ok because wchar_t == wxUint16 if WC_UTF16
961 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
962 {
963 if ( out )
964 out++;
965 written++;
966 }
967#else // !WC_UTF16
968 if ( out )
969 *out = code;
970#endif // WC_UTF16/!WC_UTF16
971
972 if ( out )
973 out++;
974
975 written++;
976 }
977
978 return wxCONV_FAILED;
979}
980
981size_t
982wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
983 const wchar_t *src, size_t srcLen) const
984{
985 char *out = dstLen ? dst : NULL;
986 size_t written = 0;
987
988 for ( const wchar_t *wp = src; ; wp++ )
989 {
a964d3ed 990 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
0286d08d
VZ
991 {
992 // all done successfully, just add the trailing NULL if we are not
993 // using explicit length
994 if ( srcLen == wxNO_LEN )
995 {
996 if ( out )
997 {
998 if ( !dstLen )
999 break;
1000
1001 *out = '\0';
1002 }
1003
1004 written++;
1005 }
1006
1007 return written;
1008 }
1009
a964d3ed
VZ
1010 if ( srcLen != wxNO_LEN )
1011 srcLen--;
0286d08d
VZ
1012
1013 wxUint32 code;
1014#ifdef WC_UTF16
1015 // cast is ok for WC_UTF16
1016 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1017 {
1018 // skip the next char too as we decoded a surrogate
1019 wp++;
1020 }
1021#else // wchar_t is UTF-32
1022 code = *wp & 0x7fffffff;
1023#endif
1024
1025 unsigned len;
1026 if ( code <= 0x7F )
1027 {
1028 len = 1;
1029 if ( out )
1030 {
1031 if ( dstLen < len )
1032 break;
1033
1034 out[0] = (char)code;
1035 }
1036 }
1037 else if ( code <= 0x07FF )
1038 {
1039 len = 2;
1040 if ( out )
1041 {
1042 if ( dstLen < len )
1043 break;
1044
1045 // NB: this line takes 6 least significant bits, encodes them as
1046 // 10xxxxxx and discards them so that the next byte can be encoded:
1047 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1048 out[0] = 0xC0 | code;
1049 }
1050 }
1051 else if ( code < 0xFFFF )
1052 {
1053 len = 3;
1054 if ( out )
1055 {
1056 if ( dstLen < len )
1057 break;
1058
1059 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1060 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1061 out[0] = 0xE0 | code;
1062 }
1063 }
1064 else if ( code <= 0x10FFFF )
1065 {
1066 len = 4;
1067 if ( out )
1068 {
1069 if ( dstLen < len )
1070 break;
1071
1072 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1073 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1074 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1075 out[0] = 0xF0 | code;
1076 }
1077 }
1078 else
1079 {
1080 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1081 break;
1082 }
1083
1084 if ( out )
1085 {
1086 out += len;
1087 dstLen -= len;
1088 }
1089
1090 written += len;
1091 }
1092
1093 // we only get here if an error occurs during decoding
1094 return wxCONV_FAILED;
1095}
1096
d16d0917
VZ
1097size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1098 const char *psz, size_t srcLen) const
6001e347 1099{
0286d08d 1100 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1101 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 1102
4def3b35
VS
1103 size_t len = 0;
1104
d16d0917 1105 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1106 {
ea8ce907
RR
1107 const char *opsz = psz;
1108 bool invalid = false;
4def3b35
VS
1109 unsigned char cc = *psz++, fc = cc;
1110 unsigned cnt;
dccce9ea 1111 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1112 fc <<= 1;
ef199164 1113
dccce9ea 1114 if (!cnt)
4def3b35
VS
1115 {
1116 // plain ASCII char
dccce9ea 1117 if (buf)
4def3b35
VS
1118 *buf++ = cc;
1119 len++;
561488ef
MW
1120
1121 // escape the escape character for octal escapes
1122 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1123 && cc == '\\' && (!buf || len < n))
1124 {
1125 if (buf)
1126 *buf++ = cc;
1127 len++;
1128 }
dccce9ea
VZ
1129 }
1130 else
4def3b35
VS
1131 {
1132 cnt--;
dccce9ea 1133 if (!cnt)
4def3b35
VS
1134 {
1135 // invalid UTF-8 sequence
ea8ce907 1136 invalid = true;
dccce9ea
VZ
1137 }
1138 else
4def3b35
VS
1139 {
1140 unsigned ocnt = cnt - 1;
1141 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1142 while (cnt--)
4def3b35 1143 {
ea8ce907 1144 cc = *psz;
dccce9ea 1145 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1146 {
1147 // invalid UTF-8 sequence
ea8ce907
RR
1148 invalid = true;
1149 break;
4def3b35 1150 }
ef199164 1151
ea8ce907 1152 psz++;
4def3b35
VS
1153 res = (res << 6) | (cc & 0x3f);
1154 }
ef199164 1155
ea8ce907 1156 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1157 {
1158 // illegal UTF-8 encoding
ea8ce907 1159 invalid = true;
4def3b35 1160 }
ea8ce907
RR
1161 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1162 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1163 {
1164 // if one of our PUA characters turns up externally
1165 // it must also be treated as an illegal sequence
1166 // (a bit like you have to escape an escape character)
1167 invalid = true;
1168 }
1169 else
1170 {
1cd52418 1171#ifdef WC_UTF16
0286d08d 1172 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1173 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1174 if (pa == wxCONV_FAILED)
ea8ce907
RR
1175 {
1176 invalid = true;
1177 }
1178 else
1179 {
1180 if (buf)
1181 buf += pa;
1182 len += pa;
1183 }
373658eb 1184#else // !WC_UTF16
ea8ce907 1185 if (buf)
38d4b1e4 1186 *buf++ = (wchar_t)res;
ea8ce907 1187 len++;
373658eb 1188#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1189 }
1190 }
ef199164 1191
ea8ce907
RR
1192 if (invalid)
1193 {
1194 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1195 {
1196 while (opsz < psz && (!buf || len < n))
1197 {
1198#ifdef WC_UTF16
1199 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1200 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1201 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1202 if (buf)
1203 buf += pa;
1204 opsz++;
1205 len += pa;
1206#else
1207 if (buf)
38d4b1e4 1208 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1209 opsz++;
1210 len++;
1211#endif
1212 }
1213 }
3698ae71 1214 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1215 {
1216 while (opsz < psz && (!buf || len < n))
1217 {
3698ae71
VZ
1218 if ( buf && len + 3 < n )
1219 {
17a1ebd1 1220 unsigned char on = *opsz;
3698ae71 1221 *buf++ = L'\\';
17a1ebd1
VZ
1222 *buf++ = (wchar_t)( L'0' + on / 0100 );
1223 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1224 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1225 }
ef199164 1226
ea8ce907
RR
1227 opsz++;
1228 len += 4;
1229 }
1230 }
3698ae71 1231 else // MAP_INVALID_UTF8_NOT
ea8ce907 1232 {
467e0479 1233 return wxCONV_FAILED;
ea8ce907 1234 }
4def3b35
VS
1235 }
1236 }
6001e347 1237 }
ef199164 1238
d16d0917 1239 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1240 *buf = 0;
ef199164 1241
d16d0917 1242 return len + 1;
6001e347
RR
1243}
1244
3698ae71
VZ
// Return true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1249
d16d0917
VZ
1250size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1251 const wchar_t *psz, size_t srcLen) const
6001e347 1252{
0286d08d 1253 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1254 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1255
4def3b35 1256 size_t len = 0;
6001e347 1257
d16d0917 1258 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1259 {
1260 wxUint32 cc;
ef199164 1261
1cd52418 1262#ifdef WC_UTF16
b5153fd8
VZ
1263 // cast is ok for WC_UTF16
1264 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1265 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1266#else
ef199164 1267 cc = (*psz++) & 0x7fffffff;
4def3b35 1268#endif
3698ae71
VZ
1269
1270 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1271 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1272 {
dccce9ea 1273 if (buf)
ea8ce907 1274 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1275 len++;
3698ae71 1276 }
561488ef
MW
1277 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1278 && cc == L'\\' && psz[0] == L'\\' )
1279 {
1280 if (buf)
1281 *buf++ = (char)cc;
1282 psz++;
1283 len++;
1284 }
3698ae71
VZ
1285 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1286 cc == L'\\' &&
1287 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1288 {
dccce9ea 1289 if (buf)
3698ae71 1290 {
ef199164
DS
1291 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1292 (psz[1] - L'0') * 010 +
b2c13097 1293 (psz[2] - L'0'));
3698ae71
VZ
1294 }
1295
1296 psz += 3;
ea8ce907
RR
1297 len++;
1298 }
1299 else
1300 {
1301 unsigned cnt;
ef199164
DS
1302 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1303 {
1304 }
1305
ea8ce907 1306 if (!cnt)
4def3b35 1307 {
ea8ce907
RR
1308 // plain ASCII char
1309 if (buf)
1310 *buf++ = (char) cc;
1311 len++;
1312 }
ea8ce907
RR
1313 else
1314 {
1315 len += cnt + 1;
1316 if (buf)
1317 {
1318 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1319 while (cnt--)
1320 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1321 }
4def3b35
VS
1322 }
1323 }
6001e347 1324 }
4def3b35 1325
d16d0917 1326 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1327 *buf = 0;
adb45366 1328
d16d0917 1329 return len + 1;
6001e347
RR
1330}
1331
467e0479 1332// ============================================================================
c91830cb 1333// UTF-16
467e0479 1334// ============================================================================
c91830cb
VZ
1335
1336#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1337 #define wxMBConvUTF16straight wxMBConvUTF16BE
1338 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1339#else
bde4baac
VZ
1340 #define wxMBConvUTF16swap wxMBConvUTF16BE
1341 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1342#endif
1343
467e0479
VZ
1344/* static */
1345size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1346{
1347 if ( srcLen == wxNO_LEN )
1348 {
1349 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1350 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1351 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1352 ;
c91830cb 1353
467e0479
VZ
1354 srcLen *= BYTES_PER_CHAR;
1355 }
1356 else // we already have the length
1357 {
1358 // we can only convert an entire number of UTF-16 characters
1359 if ( srcLen % BYTES_PER_CHAR )
1360 return wxCONV_FAILED;
1361 }
1362
1363 return srcLen;
1364}
1365
1366// case when in-memory representation is UTF-16 too
c91830cb
VZ
1367#ifdef WC_UTF16
1368
467e0479
VZ
1369// ----------------------------------------------------------------------------
1370// conversions without endianness change
1371// ----------------------------------------------------------------------------
1372
1373size_t
1374wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1375 const char *src, size_t srcLen) const
c91830cb 1376{
467e0479
VZ
1377 // set up the scene for using memcpy() (which is presumably more efficient
1378 // than copying the bytes one by one)
1379 srcLen = GetLength(src, srcLen);
1380 if ( srcLen == wxNO_LEN )
1381 return wxCONV_FAILED;
c91830cb 1382
ef199164 1383 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1384 if ( dst )
c91830cb 1385 {
467e0479
VZ
1386 if ( dstLen < inLen )
1387 return wxCONV_FAILED;
c91830cb 1388
467e0479 1389 memcpy(dst, src, srcLen);
c91830cb 1390 }
d32a507d 1391
467e0479 1392 return inLen;
c91830cb
VZ
1393}
1394
467e0479
VZ
1395size_t
1396wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1397 const wchar_t *src, size_t srcLen) const
c91830cb 1398{
467e0479
VZ
1399 if ( srcLen == wxNO_LEN )
1400 srcLen = wxWcslen(src) + 1;
c91830cb 1401
467e0479
VZ
1402 srcLen *= BYTES_PER_CHAR;
1403
1404 if ( dst )
c91830cb 1405 {
467e0479
VZ
1406 if ( dstLen < srcLen )
1407 return wxCONV_FAILED;
d32a507d 1408
467e0479 1409 memcpy(dst, src, srcLen);
c91830cb 1410 }
d32a507d 1411
467e0479 1412 return srcLen;
c91830cb
VZ
1413}
1414
467e0479
VZ
1415// ----------------------------------------------------------------------------
1416// endian-reversing conversions
1417// ----------------------------------------------------------------------------
c91830cb 1418
467e0479
VZ
1419size_t
1420wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1421 const char *src, size_t srcLen) const
c91830cb 1422{
467e0479
VZ
1423 srcLen = GetLength(src, srcLen);
1424 if ( srcLen == wxNO_LEN )
1425 return wxCONV_FAILED;
c91830cb 1426
467e0479
VZ
1427 srcLen /= BYTES_PER_CHAR;
1428
1429 if ( dst )
c91830cb 1430 {
467e0479
VZ
1431 if ( dstLen < srcLen )
1432 return wxCONV_FAILED;
1433
ef199164
DS
1434 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1435 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1436 {
ef199164 1437 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1438 }
c91830cb 1439 }
bfab25d4 1440
467e0479 1441 return srcLen;
c91830cb
VZ
1442}
1443
467e0479
VZ
1444size_t
1445wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1446 const wchar_t *src, size_t srcLen) const
c91830cb 1447{
467e0479
VZ
1448 if ( srcLen == wxNO_LEN )
1449 srcLen = wxWcslen(src) + 1;
c91830cb 1450
467e0479
VZ
1451 srcLen *= BYTES_PER_CHAR;
1452
1453 if ( dst )
c91830cb 1454 {
467e0479
VZ
1455 if ( dstLen < srcLen )
1456 return wxCONV_FAILED;
1457
ef199164 1458 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1459 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1460 {
ef199164 1461 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1462 }
c91830cb 1463 }
eec47cc6 1464
467e0479 1465 return srcLen;
c91830cb
VZ
1466}
1467
467e0479 1468#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1469
467e0479
VZ
1470// ----------------------------------------------------------------------------
1471// conversions without endianness change
1472// ----------------------------------------------------------------------------
c91830cb 1473
35d11700
VZ
1474size_t
1475wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1476 const char *src, size_t srcLen) const
c91830cb 1477{
35d11700
VZ
1478 srcLen = GetLength(src, srcLen);
1479 if ( srcLen == wxNO_LEN )
1480 return wxCONV_FAILED;
c91830cb 1481
ef199164 1482 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1483 if ( !dst )
c91830cb 1484 {
35d11700
VZ
1485 // optimization: return maximal space which could be needed for this
1486 // string even if the real size could be smaller if the buffer contains
1487 // any surrogates
1488 return inLen;
c91830cb 1489 }
c91830cb 1490
35d11700 1491 size_t outLen = 0;
ef199164
DS
1492 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1493 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1494 {
ef199164
DS
1495 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1496 if ( !inBuff )
35d11700
VZ
1497 return wxCONV_FAILED;
1498
1499 if ( ++outLen > dstLen )
1500 return wxCONV_FAILED;
c91830cb 1501
35d11700
VZ
1502 *dst++ = ch;
1503 }
1504
1505
1506 return outLen;
1507}
c91830cb 1508
35d11700
VZ
1509size_t
1510wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1511 const wchar_t *src, size_t srcLen) const
c91830cb 1512{
35d11700
VZ
1513 if ( srcLen == wxNO_LEN )
1514 srcLen = wxWcslen(src) + 1;
c91830cb 1515
35d11700 1516 size_t outLen = 0;
ef199164 1517 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1518 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1519 {
1520 wxUint16 cc[2];
35d11700
VZ
1521 const size_t numChars = encode_utf16(*src++, cc);
1522 if ( numChars == wxCONV_FAILED )
1523 return wxCONV_FAILED;
c91830cb 1524
ef199164
DS
1525 outLen += numChars * BYTES_PER_CHAR;
1526 if ( outBuff )
c91830cb 1527 {
35d11700
VZ
1528 if ( outLen > dstLen )
1529 return wxCONV_FAILED;
1530
ef199164 1531 *outBuff++ = cc[0];
35d11700 1532 if ( numChars == 2 )
69b80d28 1533 {
35d11700 1534 // second character of a surrogate
ef199164 1535 *outBuff++ = cc[1];
69b80d28 1536 }
c91830cb 1537 }
c91830cb 1538 }
c91830cb 1539
35d11700 1540 return outLen;
c91830cb
VZ
1541}
1542
467e0479
VZ
1543// ----------------------------------------------------------------------------
1544// endian-reversing conversions
1545// ----------------------------------------------------------------------------
c91830cb 1546
35d11700
VZ
1547size_t
1548wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1549 const char *src, size_t srcLen) const
c91830cb 1550{
35d11700
VZ
1551 srcLen = GetLength(src, srcLen);
1552 if ( srcLen == wxNO_LEN )
1553 return wxCONV_FAILED;
1554
ef199164 1555 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1556 if ( !dst )
1557 {
1558 // optimization: return maximal space which could be needed for this
1559 // string even if the real size could be smaller if the buffer contains
1560 // any surrogates
1561 return inLen;
1562 }
c91830cb 1563
35d11700 1564 size_t outLen = 0;
ef199164
DS
1565 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1566 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1567 {
35d11700
VZ
1568 wxUint32 ch;
1569 wxUint16 tmp[2];
ef199164
DS
1570
1571 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1572 inBuff++;
1573 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1574
35d11700
VZ
1575 const size_t numChars = decode_utf16(tmp, ch);
1576 if ( numChars == wxCONV_FAILED )
1577 return wxCONV_FAILED;
c91830cb 1578
35d11700 1579 if ( numChars == 2 )
ef199164 1580 inBuff++;
35d11700
VZ
1581
1582 if ( ++outLen > dstLen )
1583 return wxCONV_FAILED;
c91830cb 1584
35d11700 1585 *dst++ = ch;
c91830cb 1586 }
c91830cb 1587
c91830cb 1588
35d11700
VZ
1589 return outLen;
1590}
c91830cb 1591
35d11700
VZ
1592size_t
1593wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1594 const wchar_t *src, size_t srcLen) const
c91830cb 1595{
35d11700
VZ
1596 if ( srcLen == wxNO_LEN )
1597 srcLen = wxWcslen(src) + 1;
c91830cb 1598
35d11700 1599 size_t outLen = 0;
ef199164 1600 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1601 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1602 {
1603 wxUint16 cc[2];
35d11700
VZ
1604 const size_t numChars = encode_utf16(*src, cc);
1605 if ( numChars == wxCONV_FAILED )
1606 return wxCONV_FAILED;
c91830cb 1607
ef199164
DS
1608 outLen += numChars * BYTES_PER_CHAR;
1609 if ( outBuff )
c91830cb 1610 {
35d11700
VZ
1611 if ( outLen > dstLen )
1612 return wxCONV_FAILED;
1613
ef199164 1614 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1615 if ( numChars == 2 )
c91830cb 1616 {
35d11700 1617 // second character of a surrogate
ef199164 1618 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1619 }
1620 }
c91830cb 1621 }
c91830cb 1622
35d11700 1623 return outLen;
c91830cb
VZ
1624}
1625
467e0479 1626#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1627
1628
35d11700 1629// ============================================================================
c91830cb 1630// UTF-32
35d11700 1631// ============================================================================
c91830cb
VZ
1632
1633#ifdef WORDS_BIGENDIAN
467e0479
VZ
1634 #define wxMBConvUTF32straight wxMBConvUTF32BE
1635 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1636#else
467e0479
VZ
1637 #define wxMBConvUTF32swap wxMBConvUTF32BE
1638 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1639#endif
1640
1641
1642WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1643WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1644
467e0479
VZ
1645/* static */
1646size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1647{
1648 if ( srcLen == wxNO_LEN )
1649 {
1650 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1651 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1652 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1653 ;
c91830cb 1654
467e0479
VZ
1655 srcLen *= BYTES_PER_CHAR;
1656 }
1657 else // we already have the length
1658 {
1659 // we can only convert an entire number of UTF-32 characters
1660 if ( srcLen % BYTES_PER_CHAR )
1661 return wxCONV_FAILED;
1662 }
1663
1664 return srcLen;
1665}
1666
1667// case when in-memory representation is UTF-16
c91830cb
VZ
1668#ifdef WC_UTF16
1669
467e0479
VZ
1670// ----------------------------------------------------------------------------
1671// conversions without endianness change
1672// ----------------------------------------------------------------------------
1673
1674size_t
1675wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1676 const char *src, size_t srcLen) const
c91830cb 1677{
467e0479
VZ
1678 srcLen = GetLength(src, srcLen);
1679 if ( srcLen == wxNO_LEN )
1680 return wxCONV_FAILED;
c91830cb 1681
ef199164
DS
1682 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1683 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1684 size_t outLen = 0;
1685 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1686 {
1687 wxUint16 cc[2];
ef199164 1688 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1689 if ( numChars == wxCONV_FAILED )
1690 return wxCONV_FAILED;
c91830cb 1691
467e0479
VZ
1692 outLen += numChars;
1693 if ( dst )
c91830cb 1694 {
467e0479
VZ
1695 if ( outLen > dstLen )
1696 return wxCONV_FAILED;
d32a507d 1697
467e0479
VZ
1698 *dst++ = cc[0];
1699 if ( numChars == 2 )
1700 {
1701 // second character of a surrogate
1702 *dst++ = cc[1];
1703 }
1704 }
c91830cb 1705 }
d32a507d 1706
467e0479 1707 return outLen;
c91830cb
VZ
1708}
1709
467e0479
VZ
1710size_t
1711wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1712 const wchar_t *src, size_t srcLen) const
c91830cb 1713{
467e0479
VZ
1714 if ( srcLen == wxNO_LEN )
1715 srcLen = wxWcslen(src) + 1;
c91830cb 1716
467e0479 1717 if ( !dst )
c91830cb 1718 {
467e0479
VZ
1719 // optimization: return maximal space which could be needed for this
1720 // string instead of the exact amount which could be less if there are
1721 // any surrogates in the input
1722 //
1723 // we consider that surrogates are rare enough to make it worthwhile to
1724 // avoid running the loop below at the cost of slightly extra memory
1725 // consumption
ef199164 1726 return srcLen * BYTES_PER_CHAR;
467e0479 1727 }
c91830cb 1728
ef199164 1729 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1730 size_t outLen = 0;
1731 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1732 {
1733 const wxUint32 ch = wxDecodeSurrogate(&src);
1734 if ( !src )
1735 return wxCONV_FAILED;
c91830cb 1736
467e0479 1737 outLen += BYTES_PER_CHAR;
d32a507d 1738
467e0479
VZ
1739 if ( outLen > dstLen )
1740 return wxCONV_FAILED;
b5153fd8 1741
ef199164 1742 *outBuff++ = ch;
467e0479 1743 }
c91830cb 1744
467e0479 1745 return outLen;
c91830cb
VZ
1746}
1747
467e0479
VZ
1748// ----------------------------------------------------------------------------
1749// endian-reversing conversions
1750// ----------------------------------------------------------------------------
c91830cb 1751
467e0479
VZ
1752size_t
1753wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1754 const char *src, size_t srcLen) const
c91830cb 1755{
467e0479
VZ
1756 srcLen = GetLength(src, srcLen);
1757 if ( srcLen == wxNO_LEN )
1758 return wxCONV_FAILED;
c91830cb 1759
ef199164
DS
1760 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1761 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1762 size_t outLen = 0;
ef199164 1763 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1764 {
c91830cb 1765 wxUint16 cc[2];
ef199164 1766 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1767 if ( numChars == wxCONV_FAILED )
1768 return wxCONV_FAILED;
c91830cb 1769
467e0479
VZ
1770 outLen += numChars;
1771 if ( dst )
c91830cb 1772 {
467e0479
VZ
1773 if ( outLen > dstLen )
1774 return wxCONV_FAILED;
d32a507d 1775
467e0479
VZ
1776 *dst++ = cc[0];
1777 if ( numChars == 2 )
1778 {
1779 // second character of a surrogate
1780 *dst++ = cc[1];
1781 }
1782 }
c91830cb 1783 }
b5153fd8 1784
467e0479 1785 return outLen;
c91830cb
VZ
1786}
1787
467e0479
VZ
1788size_t
1789wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1790 const wchar_t *src, size_t srcLen) const
c91830cb 1791{
467e0479
VZ
1792 if ( srcLen == wxNO_LEN )
1793 srcLen = wxWcslen(src) + 1;
c91830cb 1794
467e0479 1795 if ( !dst )
c91830cb 1796 {
467e0479
VZ
1797 // optimization: return maximal space which could be needed for this
1798 // string instead of the exact amount which could be less if there are
1799 // any surrogates in the input
1800 //
1801 // we consider that surrogates are rare enough to make it worthwhile to
1802 // avoid running the loop below at the cost of slightly extra memory
1803 // consumption
1804 return srcLen*BYTES_PER_CHAR;
1805 }
c91830cb 1806
ef199164 1807 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1808 size_t outLen = 0;
1809 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1810 {
1811 const wxUint32 ch = wxDecodeSurrogate(&src);
1812 if ( !src )
1813 return wxCONV_FAILED;
c91830cb 1814
467e0479 1815 outLen += BYTES_PER_CHAR;
d32a507d 1816
467e0479
VZ
1817 if ( outLen > dstLen )
1818 return wxCONV_FAILED;
b5153fd8 1819
ef199164 1820 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1821 }
c91830cb 1822
467e0479 1823 return outLen;
c91830cb
VZ
1824}
1825
467e0479 1826#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1827
35d11700
VZ
1828// ----------------------------------------------------------------------------
1829// conversions without endianness change
1830// ----------------------------------------------------------------------------
1831
1832size_t
1833wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1834 const char *src, size_t srcLen) const
c91830cb 1835{
35d11700
VZ
1836 // use memcpy() as it should be much faster than hand-written loop
1837 srcLen = GetLength(src, srcLen);
1838 if ( srcLen == wxNO_LEN )
1839 return wxCONV_FAILED;
c91830cb 1840
35d11700
VZ
1841 const size_t inLen = srcLen/BYTES_PER_CHAR;
1842 if ( dst )
c91830cb 1843 {
35d11700
VZ
1844 if ( dstLen < inLen )
1845 return wxCONV_FAILED;
b5153fd8 1846
35d11700
VZ
1847 memcpy(dst, src, srcLen);
1848 }
c91830cb 1849
35d11700 1850 return inLen;
c91830cb
VZ
1851}
1852
35d11700
VZ
1853size_t
1854wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1855 const wchar_t *src, size_t srcLen) const
c91830cb 1856{
35d11700
VZ
1857 if ( srcLen == wxNO_LEN )
1858 srcLen = wxWcslen(src) + 1;
1859
1860 srcLen *= BYTES_PER_CHAR;
c91830cb 1861
35d11700 1862 if ( dst )
c91830cb 1863 {
35d11700
VZ
1864 if ( dstLen < srcLen )
1865 return wxCONV_FAILED;
c91830cb 1866
35d11700 1867 memcpy(dst, src, srcLen);
c91830cb
VZ
1868 }
1869
35d11700 1870 return srcLen;
c91830cb
VZ
1871}
1872
35d11700
VZ
1873// ----------------------------------------------------------------------------
1874// endian-reversing conversions
1875// ----------------------------------------------------------------------------
c91830cb 1876
35d11700
VZ
1877size_t
1878wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1879 const char *src, size_t srcLen) const
c91830cb 1880{
35d11700
VZ
1881 srcLen = GetLength(src, srcLen);
1882 if ( srcLen == wxNO_LEN )
1883 return wxCONV_FAILED;
1884
1885 srcLen /= BYTES_PER_CHAR;
c91830cb 1886
35d11700 1887 if ( dst )
c91830cb 1888 {
35d11700
VZ
1889 if ( dstLen < srcLen )
1890 return wxCONV_FAILED;
1891
ef199164
DS
1892 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1893 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1894 {
ef199164 1895 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1896 }
c91830cb 1897 }
b5153fd8 1898
35d11700 1899 return srcLen;
c91830cb
VZ
1900}
1901
35d11700
VZ
1902size_t
1903wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1904 const wchar_t *src, size_t srcLen) const
c91830cb 1905{
35d11700
VZ
1906 if ( srcLen == wxNO_LEN )
1907 srcLen = wxWcslen(src) + 1;
1908
1909 srcLen *= BYTES_PER_CHAR;
c91830cb 1910
35d11700 1911 if ( dst )
c91830cb 1912 {
35d11700
VZ
1913 if ( dstLen < srcLen )
1914 return wxCONV_FAILED;
1915
ef199164 1916 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1917 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1918 {
ef199164 1919 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1920 }
c91830cb 1921 }
b5153fd8 1922
35d11700 1923 return srcLen;
c91830cb
VZ
1924}
1925
467e0479 1926#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1927
1928
36acb880
VZ
1929// ============================================================================
1930// The classes doing conversion using the iconv_xxx() functions
1931// ============================================================================
3caec1bb 1932
b040e242 1933#ifdef HAVE_ICONV
3a0d76bc 1934
b1d547eb
VS
1935// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1936// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1937// (unless there's yet another bug in glibc) the only case when iconv()
1938// returns with (size_t)-1 (which means error) and says there are 0 bytes
1939// left in the input buffer -- when _real_ error occurs,
1940// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1941// iconv() failure.
3caec1bb
VS
1942// [This bug does not appear in glibc 2.2.]
1943#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1944#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1945 (errno != E2BIG || bufLeft != 0))
1946#else
1947#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1948#endif
1949
ab217dba 1950#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1951
74a7eb0b
VZ
1952#define ICONV_T_INVALID ((iconv_t)-1)
1953
1954#if SIZEOF_WCHAR_T == 4
1955 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1956 #define WC_ENC wxFONTENCODING_UTF32
1957#elif SIZEOF_WCHAR_T == 2
1958 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1959 #define WC_ENC wxFONTENCODING_UTF16
1960#else // sizeof(wchar_t) != 2 nor 4
1961 // does this ever happen?
1962 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1963#endif
1964
36acb880 1965// ----------------------------------------------------------------------------
e95354ec 1966// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1967// ----------------------------------------------------------------------------
1968
e95354ec 1969class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1970{
1971public:
86501081 1972 wxMBConv_iconv(const char *name);
e95354ec 1973 virtual ~wxMBConv_iconv();
36acb880 1974
8f4b0f43
VZ
1975 // implement base class virtual methods
1976 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1977 const char *src, size_t srcLen = wxNO_LEN) const;
1978 virtual size_t FromWChar(char *dst, size_t dstLen,
1979 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
7ef3ab50
VZ
1980 virtual size_t GetMBNulLen() const;
1981
ba98e032
VS
1982#if wxUSE_UNICODE_UTF8
1983 virtual bool IsUTF8() const;
1984#endif
1985
d36c9347
VZ
1986 virtual wxMBConv *Clone() const
1987 {
86501081 1988 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
1989 p->m_minMBCharWidth = m_minMBCharWidth;
1990 return p;
1991 }
1992
e95354ec 1993 bool IsOk() const
74a7eb0b 1994 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1995
1996protected:
ef199164
DS
1997 // the iconv handlers used to translate from multibyte
1998 // to wide char and in the other direction
36acb880
VZ
1999 iconv_t m2w,
2000 w2m;
ef199164 2001
b1d547eb
VS
2002#if wxUSE_THREADS
2003 // guards access to m2w and w2m objects
2004 wxMutex m_iconvMutex;
2005#endif
36acb880
VZ
2006
2007private:
e95354ec 2008 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 2009 // available on this machine, it will remain NULL
74a7eb0b 2010 static wxString ms_wcCharsetName;
36acb880
VZ
2011
2012 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2013 // different endian-ness than the native one
405d8f46 2014 static bool ms_wcNeedsSwap;
eec47cc6 2015
d36c9347
VZ
2016
2017 // name of the encoding handled by this conversion
2018 wxString m_name;
2019
7ef3ab50 2020 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
2021 // initially
2022 size_t m_minMBCharWidth;
36acb880
VZ
2023};
2024
8f115891 2025// make the constructor available for unit testing
86501081 2026WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
2027{
2028 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2029 if ( !result->IsOk() )
2030 {
2031 delete result;
2032 return 0;
2033 }
ef199164 2034
8f115891
MW
2035 return result;
2036}
2037
422e411e 2038wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 2039bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 2040
86501081 2041wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 2042 : m_name(name)
36acb880 2043{
c1464d9d 2044 m_minMBCharWidth = 0;
eec47cc6 2045
36acb880 2046 // check for charset that represents wchar_t:
74a7eb0b 2047 if ( ms_wcCharsetName.empty() )
f1339c56 2048 {
c2b83fdd
VZ
2049 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2050
74a7eb0b
VZ
2051#if wxUSE_FONTMAP
2052 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2053#else // !wxUSE_FONTMAP
91cb7f52 2054 static const wxChar *names_static[] =
36acb880 2055 {
74a7eb0b
VZ
2056#if SIZEOF_WCHAR_T == 4
2057 _T("UCS-4"),
2058#elif SIZEOF_WCHAR_T = 2
2059 _T("UCS-2"),
2060#endif
2061 NULL
2062 };
91cb7f52 2063 const wxChar **names = names_static;
74a7eb0b 2064#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 2065
d1f024a8 2066 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 2067 {
17a1ebd1 2068 const wxString nameCS(*names);
74a7eb0b
VZ
2069
2070 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 2071 wxString nameXE(nameCS);
ef199164
DS
2072
2073#ifdef WORDS_BIGENDIAN
74a7eb0b 2074 nameXE += _T("BE");
ef199164 2075#else // little endian
74a7eb0b 2076 nameXE += _T("LE");
ef199164 2077#endif
74a7eb0b 2078
c2b83fdd
VZ
2079 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2080 nameXE.c_str());
2081
86501081 2082 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 2083 if ( m2w == ICONV_T_INVALID )
3a0d76bc 2084 {
74a7eb0b 2085 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
2086 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2087 nameCS.c_str());
86501081 2088 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 2089
74a7eb0b
VZ
2090 // and check for bytesex ourselves:
2091 if ( m2w != ICONV_T_INVALID )
3a0d76bc 2092 {
74a7eb0b 2093 char buf[2], *bufPtr;
e8769ed1 2094 wchar_t wbuf[2];
74a7eb0b
VZ
2095 size_t insz, outsz;
2096 size_t res;
2097
2098 buf[0] = 'A';
2099 buf[1] = 0;
2100 wbuf[0] = 0;
2101 insz = 2;
2102 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 2103 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
2104 bufPtr = buf;
2105
ef199164
DS
2106 res = iconv(
2107 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 2108 &wbufPtr, &outsz);
74a7eb0b
VZ
2109
2110 if (ICONV_FAILED(res, insz))
2111 {
2112 wxLogLastError(wxT("iconv"));
422e411e 2113 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2114 nameCS.c_str());
74a7eb0b
VZ
2115 }
2116 else // ok, can convert to this encoding, remember it
2117 {
17a1ebd1 2118 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2119 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2120 }
3a0d76bc
VS
2121 }
2122 }
74a7eb0b 2123 else // use charset not requiring byte swapping
36acb880 2124 {
74a7eb0b 2125 ms_wcCharsetName = nameXE;
36acb880 2126 }
3a0d76bc 2127 }
74a7eb0b 2128
0944fceb 2129 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2130 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2131 ms_wcCharsetName.empty() ? wxString("<none>")
2132 : ms_wcCharsetName,
74a7eb0b
VZ
2133 ms_wcNeedsSwap ? _T(" (needs swap)")
2134 : _T(""));
3a0d76bc 2135 }
36acb880 2136 else // we already have ms_wcCharsetName
3caec1bb 2137 {
86501081 2138 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2139 }
dccce9ea 2140
74a7eb0b 2141 if ( ms_wcCharsetName.empty() )
f1339c56 2142 {
74a7eb0b 2143 w2m = ICONV_T_INVALID;
36acb880 2144 }
405d8f46
VZ
2145 else
2146 {
86501081 2147 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2148 if ( w2m == ICONV_T_INVALID )
2149 {
2150 wxLogTrace(TRACE_STRCONV,
2151 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2152 ms_wcCharsetName.c_str(), name);
74a7eb0b 2153 }
405d8f46 2154 }
36acb880 2155}
3caec1bb 2156
e95354ec 2157wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2158{
74a7eb0b 2159 if ( m2w != ICONV_T_INVALID )
36acb880 2160 iconv_close(m2w);
74a7eb0b 2161 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2162 iconv_close(w2m);
2163}
3a0d76bc 2164
8f4b0f43
VZ
2165size_t
2166wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2167 const char *src, size_t srcLen) const
36acb880 2168{
8f4b0f43 2169 if ( srcLen == wxNO_LEN )
69373110 2170 {
8f4b0f43
VZ
2171 // find the string length: notice that must be done differently for
2172 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2173 // consecutive NULs
2174 const size_t nulLen = GetMBNulLen();
2175 switch ( nulLen )
2176 {
2177 default:
2178 return wxCONV_FAILED;
69373110 2179
8f4b0f43
VZ
2180 case 1:
2181 srcLen = strlen(src); // arguably more optimized than our version
2182 break;
69373110 2183
8f4b0f43
VZ
2184 case 2:
2185 case 4:
2186 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2187 // but they also have to start at character boundary and not
2188 // span two adjacent characters
2189 const char *p;
2190 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2191 ;
2192 srcLen = p - src;
2193 break;
2194 }
d50c0831
VZ
2195
2196 // when we're determining the length of the string ourselves we count
2197 // the terminating NUL(s) as part of it and always NUL-terminate the
2198 // output
2199 srcLen += nulLen;
69373110
VZ
2200 }
2201
8f4b0f43
VZ
2202 // we express length in the number of (wide) characters but iconv always
2203 // counts buffer sizes it in bytes
2204 dstLen *= SIZEOF_WCHAR_T;
2205
b1d547eb 2206#if wxUSE_THREADS
6a17b868
SN
2207 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2208 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2209 // wxConvLocal that are used all over wx code, so we have to make sure
2210 // the handle is used by at most one thread at the time. Otherwise
2211 // only a few wx classes would be safe to use from non-main threads
2212 // as MB<->WC conversion would fail "randomly".
2213 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2214#endif // wxUSE_THREADS
2215
36acb880 2216 size_t res, cres;
8f4b0f43 2217 const char *pszPtr = src;
36acb880 2218
8f4b0f43 2219 if ( dst )
36acb880 2220 {
8f4b0f43 2221 char* bufPtr = (char*)dst;
e8769ed1 2222
36acb880 2223 // have destination buffer, convert there
1752fda6 2224 size_t dstLenOrig = dstLen;
36acb880 2225 cres = iconv(m2w,
8f4b0f43
VZ
2226 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2227 &bufPtr, &dstLen);
1752fda6
VZ
2228
2229 // convert the number of bytes converted as returned by iconv to the
2230 // number of (wide) characters converted that we need
2231 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
dccce9ea 2232
36acb880 2233 if (ms_wcNeedsSwap)
3a0d76bc 2234 {
36acb880 2235 // convert to native endianness
17a1ebd1 2236 for ( unsigned i = 0; i < res; i++ )
467a2982 2237 dst[i] = WC_BSWAP(dst[i]);
3a0d76bc 2238 }
36acb880 2239 }
8f4b0f43 2240 else // no destination buffer
36acb880 2241 {
8f4b0f43 2242 // convert using temp buffer to calculate the size of the buffer needed
36acb880
VZ
2243 wchar_t tbuf[8];
2244 res = 0;
ef199164
DS
2245
2246 do
2247 {
e8769ed1 2248 char* bufPtr = (char*)tbuf;
8f4b0f43 2249 dstLen = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2250
2251 cres = iconv(m2w,
8f4b0f43
VZ
2252 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2253 &bufPtr, &dstLen );
36acb880 2254
8f4b0f43 2255 res += 8 - (dstLen / SIZEOF_WCHAR_T);
ef199164
DS
2256 }
2257 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2258 }
dccce9ea 2259
8f4b0f43 2260 if (ICONV_FAILED(cres, srcLen))
f1339c56 2261 {
36acb880 2262 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2263 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2264 return wxCONV_FAILED;
36acb880
VZ
2265 }
2266
2267 return res;
2268}
2269
8f4b0f43
VZ
2270size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2271 const wchar_t *src, size_t srcLen) const
36acb880 2272{
b1d547eb
VS
2273#if wxUSE_THREADS
2274 // NB: explained in MB2WC
2275 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2276#endif
3698ae71 2277
8f4b0f43 2278 if ( srcLen == wxNO_LEN )
2588ee86 2279 srcLen = wxWcslen(src) + 1;
8f4b0f43
VZ
2280
2281 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2282 size_t outbuflen = dstLen;
36acb880 2283 size_t res, cres;
3a0d76bc 2284
36acb880 2285 wchar_t *tmpbuf = 0;
3caec1bb 2286
36acb880
VZ
2287 if (ms_wcNeedsSwap)
2288 {
2289 // need to copy to temp buffer to switch endianness
74a7eb0b 2290 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 2291 // could be in read-only memory, or be accessed in some other thread)
e8769ed1 2292 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
8f4b0f43
VZ
2293 for ( size_t i = 0; i < srcLen; i++ )
2294 tmpbuf[i] = WC_BSWAP(src[i]);
ef199164 2295
8f4b0f43
VZ
2296 tmpbuf[srcLen] = L'\0';
2297 src = tmpbuf;
36acb880 2298 }
3a0d76bc 2299
8f4b0f43
VZ
2300 char* inbuf = (char*)src;
2301 if ( dst )
36acb880
VZ
2302 {
2303 // have destination buffer, convert there
8f4b0f43 2304 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
3a0d76bc 2305
8f4b0f43 2306 res = dstLen - outbuflen;
36acb880 2307 }
8f4b0f43 2308 else // no destination buffer
36acb880 2309 {
8f4b0f43 2310 // convert using temp buffer to calculate the size of the buffer needed
36acb880
VZ
2311 char tbuf[16];
2312 res = 0;
ef199164
DS
2313 do
2314 {
8f4b0f43 2315 dst = tbuf;
e8769ed1 2316 outbuflen = 16;
36acb880 2317
8f4b0f43 2318 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
dccce9ea 2319
e8769ed1 2320 res += 16 - outbuflen;
ef199164
DS
2321 }
2322 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2323 }
dccce9ea 2324
36acb880
VZ
2325 if (ms_wcNeedsSwap)
2326 {
2327 free(tmpbuf);
2328 }
dccce9ea 2329
e8769ed1 2330 if (ICONV_FAILED(cres, inbuflen))
36acb880 2331 {
ce6f8d6f 2332 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2333 return wxCONV_FAILED;
36acb880
VZ
2334 }
2335
2336 return res;
2337}
2338
7ef3ab50 2339size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2340{
c1464d9d 2341 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2342 {
2343 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2344
2345#if wxUSE_THREADS
2346 // NB: explained in MB2WC
2347 wxMutexLocker lock(self->m_iconvMutex);
2348#endif
2349
999020e1 2350 const wchar_t *wnul = L"";
c1464d9d 2351 char buf[8]; // should be enough for NUL in any encoding
356410fc 2352 size_t inLen = sizeof(wchar_t),
c1464d9d 2353 outLen = WXSIZEOF(buf);
ef199164
DS
2354 char *inBuff = (char *)wnul;
2355 char *outBuff = buf;
2356 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2357 {
c1464d9d 2358 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2359 }
2360 else // ok
2361 {
ef199164 2362 self->m_minMBCharWidth = outBuff - buf;
356410fc 2363 }
eec47cc6
VZ
2364 }
2365
c1464d9d 2366 return m_minMBCharWidth;
eec47cc6
VZ
2367}
2368
ba98e032
VS
2369#if wxUSE_UNICODE_UTF8
2370bool wxMBConv_iconv::IsUTF8() const
2371{
86501081
VS
2372 return wxStricmp(m_name, "UTF-8") == 0 ||
2373 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2374}
2375#endif
2376
b040e242 2377#endif // HAVE_ICONV
36acb880 2378
e95354ec 2379
36acb880
VZ
2380// ============================================================================
2381// Win32 conversion classes
2382// ============================================================================
1cd52418 2383
e95354ec 2384#ifdef wxHAVE_WIN32_MB2WC
373658eb 2385
8b04d4c4 2386// from utils.cpp
d775fa82 2387#if wxUSE_FONTMAP
86501081 2388extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2389extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2390#endif
373658eb 2391
e95354ec 2392class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2393{
2394public:
bde4baac
VZ
2395 wxMBConv_win32()
2396 {
2397 m_CodePage = CP_ACP;
c1464d9d 2398 m_minMBCharWidth = 0;
bde4baac
VZ
2399 }
2400
d36c9347 2401 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2402 : wxMBConv()
d36c9347
VZ
2403 {
2404 m_CodePage = conv.m_CodePage;
2405 m_minMBCharWidth = conv.m_minMBCharWidth;
2406 }
2407
7608a683 2408#if wxUSE_FONTMAP
86501081 2409 wxMBConv_win32(const char* name)
bde4baac
VZ
2410 {
2411 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2412 m_minMBCharWidth = 0;
bde4baac 2413 }
dccce9ea 2414
e95354ec 2415 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2416 {
2417 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2418 m_minMBCharWidth = 0;
bde4baac 2419 }
eec47cc6 2420#endif // wxUSE_FONTMAP
8b04d4c4 2421
d36c9347 2422 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2423 {
02272c9c
VZ
2424 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2425 // the behaviour is not compatible with the Unix version (using iconv)
2426 // and break the library itself, e.g. wxTextInputStream::NextChar()
2427 // wouldn't work if reading an incomplete MB char didn't result in an
2428 // error
667e5b3e 2429 //
89028980 2430 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2431 // Win XP or newer and it is not supported for UTF-[78] so we always
2432 // use our own conversions in this case. See
89028980
VS
2433 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2434 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2435 if ( m_CodePage == CP_UTF8 )
89028980 2436 {
5487ff0f 2437 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2438 }
830f8f11
VZ
2439
2440 if ( m_CodePage == CP_UTF7 )
2441 {
5487ff0f 2442 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2443 }
2444
2445 int flags = 0;
2446 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2447 IsAtLeastWin2kSP4() )
89028980 2448 {
830f8f11 2449 flags = MB_ERR_INVALID_CHARS;
89028980 2450 }
667e5b3e 2451
2b5f62a0
VZ
2452 const size_t len = ::MultiByteToWideChar
2453 (
2454 m_CodePage, // code page
667e5b3e 2455 flags, // flags: fall on error
2b5f62a0
VZ
2456 psz, // input string
2457 -1, // its length (NUL-terminated)
b4da152e 2458 buf, // output string
2b5f62a0
VZ
2459 buf ? n : 0 // size of output buffer
2460 );
89028980
VS
2461 if ( !len )
2462 {
2463 // function totally failed
467e0479 2464 return wxCONV_FAILED;
89028980
VS
2465 }
2466
2467 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2468 // check if we succeeded, by doing a double trip:
2469 if ( !flags && buf )
2470 {
53c174fc
VZ
2471 const size_t mbLen = strlen(psz);
2472 wxCharBuffer mbBuf(mbLen);
89028980
VS
2473 if ( ::WideCharToMultiByte
2474 (
2475 m_CodePage,
2476 0,
2477 buf,
2478 -1,
2479 mbBuf.data(),
53c174fc 2480 mbLen + 1, // size in bytes, not length
89028980
VS
2481 NULL,
2482 NULL
2483 ) == 0 ||
2484 strcmp(mbBuf, psz) != 0 )
2485 {
2486 // we didn't obtain the same thing we started from, hence
2487 // the conversion was lossy and we consider that it failed
467e0479 2488 return wxCONV_FAILED;
89028980
VS
2489 }
2490 }
2b5f62a0 2491
03a991bc
VZ
2492 // note that it returns count of written chars for buf != NULL and size
2493 // of the needed buffer for buf == NULL so in either case the length of
2494 // the string (which never includes the terminating NUL) is one less
89028980 2495 return len - 1;
f1339c56 2496 }
dccce9ea 2497
d36c9347 2498 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2499 {
13dd924a
VZ
2500 /*
2501 we have a problem here: by default, WideCharToMultiByte() may
2502 replace characters unrepresentable in the target code page with bad
2503 quality approximations such as turning "1/2" symbol (U+00BD) into
2504 "1" for the code pages which don't have it and we, obviously, want
2505 to avoid this at any price
d775fa82 2506
13dd924a
VZ
2507 the trouble is that this function does it _silently_, i.e. it won't
2508 even tell us whether it did or not... Win98/2000 and higher provide
2509 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2510 we have to resort to a round trip, i.e. check that converting back
2511 results in the same string -- this is, of course, expensive but
2512 otherwise we simply can't be sure to not garble the data.
2513 */
2514
2515 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2516 // it doesn't work with CJK encodings (which we test for rather roughly
2517 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2518 // supporting it
907173e5
WS
2519 BOOL usedDef wxDUMMY_INITIALIZE(false);
2520 BOOL *pUsedDef;
13dd924a
VZ
2521 int flags;
2522 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2523 {
2524 // it's our lucky day
2525 flags = WC_NO_BEST_FIT_CHARS;
2526 pUsedDef = &usedDef;
2527 }
2528 else // old system or unsupported encoding
2529 {
2530 flags = 0;
2531 pUsedDef = NULL;
2532 }
2533
2b5f62a0
VZ
2534 const size_t len = ::WideCharToMultiByte
2535 (
2536 m_CodePage, // code page
13dd924a
VZ
2537 flags, // either none or no best fit
2538 pwz, // input string
2b5f62a0
VZ
2539 -1, // it is (wide) NUL-terminated
2540 buf, // output buffer
2541 buf ? n : 0, // and its size
2542 NULL, // default "replacement" char
13dd924a 2543 pUsedDef // [out] was it used?
2b5f62a0
VZ
2544 );
2545
13dd924a
VZ
2546 if ( !len )
2547 {
2548 // function totally failed
467e0479 2549 return wxCONV_FAILED;
13dd924a
VZ
2550 }
2551
765bdb4a
VZ
2552 // we did something, check if we really succeeded
2553 if ( flags )
13dd924a 2554 {
765bdb4a
VZ
2555 // check if the conversion failed, i.e. if any replacements
2556 // were done
2557 if ( usedDef )
2558 return wxCONV_FAILED;
2559 }
2560 else // we must resort to double tripping...
2561 {
2562 // first we need to ensure that we really have the MB data: this is
2563 // not the case if we're called with NULL buffer, in which case we
2564 // need to do the conversion yet again
2565 wxCharBuffer bufDef;
2566 if ( !buf )
13dd924a 2567 {
765bdb4a
VZ
2568 bufDef = wxCharBuffer(len);
2569 buf = bufDef.data();
2570 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2571 buf, len, NULL, NULL) )
467e0479 2572 return wxCONV_FAILED;
13dd924a 2573 }
765bdb4a 2574
564da6ff
VZ
2575 if ( !n )
2576 n = wcslen(pwz);
765bdb4a 2577 wxWCharBuffer wcBuf(n);
564da6ff 2578 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2579 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2580 {
765bdb4a
VZ
2581 // we didn't obtain the same thing we started from, hence
2582 // the conversion was lossy and we consider that it failed
2583 return wxCONV_FAILED;
13dd924a
VZ
2584 }
2585 }
2586
03a991bc 2587 // see the comment above for the reason of "len - 1"
13dd924a 2588 return len - 1;
f1339c56 2589 }
dccce9ea 2590
7ef3ab50
VZ
2591 virtual size_t GetMBNulLen() const
2592 {
2593 if ( m_minMBCharWidth == 0 )
2594 {
2595 int len = ::WideCharToMultiByte
2596 (
2597 m_CodePage, // code page
2598 0, // no flags
2599 L"", // input string
2600 1, // translate just the NUL
2601 NULL, // output buffer
2602 0, // and its size
2603 NULL, // no replacement char
2604 NULL // [out] don't care if it was used
2605 );
2606
2607 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2608 switch ( len )
2609 {
2610 default:
2611 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2612 self->m_minMBCharWidth = (size_t)-1;
2613 break;
7ef3ab50
VZ
2614
2615 case 0:
2616 self->m_minMBCharWidth = (size_t)-1;
2617 break;
2618
2619 case 1:
2620 case 2:
2621 case 4:
2622 self->m_minMBCharWidth = len;
2623 break;
2624 }
2625 }
2626
2627 return m_minMBCharWidth;
2628 }
2629
d36c9347
VZ
2630 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2631
13dd924a
VZ
2632 bool IsOk() const { return m_CodePage != -1; }
2633
2634private:
2635 static bool CanUseNoBestFit()
2636 {
2637 static int s_isWin98Or2k = -1;
2638
2639 if ( s_isWin98Or2k == -1 )
2640 {
2641 int verMaj, verMin;
2642 switch ( wxGetOsVersion(&verMaj, &verMin) )
2643 {
406d283a 2644 case wxOS_WINDOWS_9X:
13dd924a
VZ
2645 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2646 break;
2647
406d283a 2648 case wxOS_WINDOWS_NT:
13dd924a
VZ
2649 s_isWin98Or2k = verMaj >= 5;
2650 break;
2651
2652 default:
ef199164 2653 // unknown: be conservative by default
13dd924a 2654 s_isWin98Or2k = 0;
ef199164 2655 break;
13dd924a
VZ
2656 }
2657
2658 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2659 }
2660
2661 return s_isWin98Or2k == 1;
2662 }
f1339c56 2663
89028980
VS
2664 static bool IsAtLeastWin2kSP4()
2665 {
8942f83a
WS
2666#ifdef __WXWINCE__
2667 return false;
2668#else
89028980
VS
2669 static int s_isAtLeastWin2kSP4 = -1;
2670
2671 if ( s_isAtLeastWin2kSP4 == -1 )
2672 {
2673 OSVERSIONINFOEX ver;
2674
2675 memset(&ver, 0, sizeof(ver));
2676 ver.dwOSVersionInfoSize = sizeof(ver);
2677 GetVersionEx((OSVERSIONINFO*)&ver);
2678
2679 s_isAtLeastWin2kSP4 =
2680 ((ver.dwMajorVersion > 5) || // Vista+
2681 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2682 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2683 ver.wServicePackMajor >= 4)) // 2000 SP4+
2684 ? 1 : 0;
2685 }
2686
2687 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2688#endif
89028980
VS
2689 }
2690
eec47cc6 2691
c1464d9d 2692 // the code page we're working with
b1d66b54 2693 long m_CodePage;
c1464d9d 2694
7ef3ab50 2695 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2696 // "unknown"
2697 size_t m_minMBCharWidth;
1cd52418 2698};
e95354ec
VZ
2699
2700#endif // wxHAVE_WIN32_MB2WC
2701
f7e98dee 2702
36acb880
VZ
2703// ============================================================================
2704// wxEncodingConverter based conversion classes
2705// ============================================================================
2706
1e6feb95 2707#if wxUSE_FONTMAP
1cd52418 2708
e95354ec 2709class wxMBConv_wxwin : public wxMBConv
1cd52418 2710{
8b04d4c4
VZ
2711private:
2712 void Init()
2713 {
6ac84a78
DE
2714 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2715 // The wxMBConv_cf class does a better job.
2716 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2717 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2718 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2719 }
2720
6001e347 2721public:
f1339c56
RR
2722 // temporarily just use wxEncodingConverter stuff,
2723 // so that it works while a better implementation is built
86501081 2724 wxMBConv_wxwin(const char* name)
f1339c56
RR
2725 {
2726 if (name)
267e11c5 2727 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2728 else
2729 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2730
8b04d4c4
VZ
2731 Init();
2732 }
2733
e95354ec 2734 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2735 {
2736 m_enc = enc;
2737
2738 Init();
f1339c56 2739 }
dccce9ea 2740
bde4baac 2741 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2742 {
2743 size_t inbuf = strlen(psz);
dccce9ea 2744 if (buf)
c643a977 2745 {
ef199164 2746 if (!m2w.Convert(psz, buf))
467e0479 2747 return wxCONV_FAILED;
c643a977 2748 }
f1339c56
RR
2749 return inbuf;
2750 }
dccce9ea 2751
bde4baac 2752 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2753 {
f8d791e0 2754 const size_t inbuf = wxWcslen(psz);
f1339c56 2755 if (buf)
c643a977 2756 {
ef199164 2757 if (!w2m.Convert(psz, buf))
467e0479 2758 return wxCONV_FAILED;
c643a977 2759 }
dccce9ea 2760
f1339c56
RR
2761 return inbuf;
2762 }
dccce9ea 2763
7ef3ab50 2764 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2765 {
2766 switch ( m_enc )
2767 {
2768 case wxFONTENCODING_UTF16BE:
2769 case wxFONTENCODING_UTF16LE:
c1464d9d 2770 return 2;
eec47cc6
VZ
2771
2772 case wxFONTENCODING_UTF32BE:
2773 case wxFONTENCODING_UTF32LE:
c1464d9d 2774 return 4;
eec47cc6
VZ
2775
2776 default:
c1464d9d 2777 return 1;
eec47cc6
VZ
2778 }
2779 }
2780
d36c9347
VZ
2781 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2782
7ef3ab50
VZ
2783 bool IsOk() const { return m_ok; }
2784
2785public:
2786 wxFontEncoding m_enc;
2787 wxEncodingConverter m2w, w2m;
2788
2789private:
cafbf6fb
VZ
2790 // were we initialized successfully?
2791 bool m_ok;
fc7a2a60 2792
e95354ec 2793 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2794};
6001e347 2795
8f115891 2796// make the constructors available for unit testing
86501081 2797WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2798{
2799 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2800 if ( !result->IsOk() )
2801 {
2802 delete result;
2803 return 0;
2804 }
ef199164 2805
8f115891
MW
2806 return result;
2807}
2808
1e6feb95
VZ
2809#endif // wxUSE_FONTMAP
2810
36acb880
VZ
2811// ============================================================================
2812// wxCSConv implementation
2813// ============================================================================
2814
8b04d4c4 2815void wxCSConv::Init()
6001e347 2816{
e95354ec
VZ
2817 m_name = NULL;
2818 m_convReal = NULL;
2819 m_deferred = true;
2820}
2821
86501081 2822wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2823{
2824 Init();
82713003 2825
86501081 2826 if ( !charset.empty() )
e95354ec 2827 {
86501081 2828 SetName(charset.ToAscii());
e95354ec 2829 }
bda3d86a 2830
e4277538
VZ
2831#if wxUSE_FONTMAP
2832 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2833#else
bda3d86a 2834 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2835#endif
6001e347
RR
2836}
2837
8b04d4c4
VZ
2838wxCSConv::wxCSConv(wxFontEncoding encoding)
2839{
bda3d86a 2840 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2841 {
2842 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2843
2844 encoding = wxFONTENCODING_SYSTEM;
2845 }
2846
8b04d4c4
VZ
2847 Init();
2848
bda3d86a 2849 m_encoding = encoding;
8b04d4c4
VZ
2850}
2851
6001e347
RR
2852wxCSConv::~wxCSConv()
2853{
65e50848
JS
2854 Clear();
2855}
2856
54380f29 2857wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2858 : wxMBConv()
54380f29 2859{
8b04d4c4
VZ
2860 Init();
2861
54380f29 2862 SetName(conv.m_name);
8b04d4c4 2863 m_encoding = conv.m_encoding;
54380f29
GD
2864}
2865
2866wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2867{
2868 Clear();
8b04d4c4 2869
54380f29 2870 SetName(conv.m_name);
8b04d4c4
VZ
2871 m_encoding = conv.m_encoding;
2872
54380f29
GD
2873 return *this;
2874}
2875
65e50848
JS
2876void wxCSConv::Clear()
2877{
8b04d4c4 2878 free(m_name);
e95354ec 2879 delete m_convReal;
8b04d4c4 2880
65e50848 2881 m_name = NULL;
e95354ec 2882 m_convReal = NULL;
6001e347
RR
2883}
2884
86501081 2885void wxCSConv::SetName(const char *charset)
6001e347 2886{
f1339c56
RR
2887 if (charset)
2888 {
d6f2a891 2889 m_name = wxStrdup(charset);
e95354ec 2890 m_deferred = true;
f1339c56 2891 }
6001e347
RR
2892}
2893
8b3eb85d 2894#if wxUSE_FONTMAP
8b3eb85d
VZ
2895
// Hash map type with wxFontEncoding keys and wxString values.
2896WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2897 wxEncodingNameCache );
8b3eb85d
VZ
2898
// Cache used by wxCSConv::DoCreate(): for an encoding it stores the charset
// name with which a wxMBConv_iconv could be created, or an empty string if
// none of the names tried for this encoding worked.
2899static wxEncodingNameCache gs_nameCache;
2900#endif
2901
e95354ec
VZ
2902wxMBConv *wxCSConv::DoCreate() const
2903{
ce6f8d6f
VZ
2904#if wxUSE_FONTMAP
2905 wxLogTrace(TRACE_STRCONV,
2906 wxT("creating conversion for %s"),
2907 (m_name ? m_name
86501081 2908 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2909#endif // wxUSE_FONTMAP
2910
c547282d
VZ
2911 // check for the special case of ASCII or ISO8859-1 charset: as we have
2912 // special knowledge of it anyhow, we don't need to create a special
2913 // conversion object
e4277538
VZ
2914 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2915 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2916 {
e95354ec
VZ
2917 // don't convert at all
2918 return NULL;
2919 }
dccce9ea 2920
e95354ec
VZ
2921 // we trust OS to do conversion better than we can so try external
2922 // conversion methods first
2923 //
2924 // the full order is:
2925 // 1. OS conversion (iconv() under Unix or Win32 API)
2926 // 2. hard coded conversions for UTF
2927 // 3. wxEncodingConverter as fall back
2928
2929 // step (1)
2930#ifdef HAVE_ICONV
c547282d 2931#if !wxUSE_FONTMAP
e95354ec 2932 if ( m_name )
c547282d 2933#endif // !wxUSE_FONTMAP
e95354ec 2934 {
3ef10cfc 2935#if wxUSE_FONTMAP
8b3eb85d 2936 wxFontEncoding encoding(m_encoding);
3ef10cfc 2937#endif
8b3eb85d 2938
86501081 2939 if ( m_name )
8b3eb85d 2940 {
86501081 2941 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
2942 if ( conv->IsOk() )
2943 return conv;
2944
2945 delete conv;
c547282d
VZ
2946
2947#if wxUSE_FONTMAP
8b3eb85d 2948 encoding =
86501081 2949 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2950#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2951 }
2952#if wxUSE_FONTMAP
2953 {
2954 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2955 if ( it != gs_nameCache.end() )
2956 {
2957 if ( it->second.empty() )
2958 return NULL;
c547282d 2959
86501081 2960 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
2961 if ( conv->IsOk() )
2962 return conv;
e95354ec 2963
8b3eb85d
VZ
2964 delete conv;
2965 }
2966
2967 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
2968 // CS : in case this does not return valid names (eg for MacRoman)
2969 // encoding got a 'failure' entry in the cache all the same,
2970 // although it just has to be created using a different method, so
2971 // only store failed iconv creation attempts (or perhaps we
2972 // shoulnd't do this at all ?)
3c67ec06 2973 if ( names[0] != NULL )
8b3eb85d 2974 {
3c67ec06 2975 for ( ; *names; ++names )
8b3eb85d 2976 {
86501081
VS
2977 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2978 // will need changes that will obsolete this
2979 wxString name(*names);
2980 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
2981 if ( conv->IsOk() )
2982 {
2983 gs_nameCache[encoding] = *names;
2984 return conv;
2985 }
2986
2987 delete conv;
8b3eb85d
VZ
2988 }
2989
3c67ec06 2990 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 2991 }
8b3eb85d
VZ
2992 }
2993#endif // wxUSE_FONTMAP
e95354ec
VZ
2994 }
2995#endif // HAVE_ICONV
2996
2997#ifdef wxHAVE_WIN32_MB2WC
2998 {
7608a683 2999#if wxUSE_FONTMAP
e95354ec
VZ
3000 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3001 : new wxMBConv_win32(m_encoding);
3002 if ( conv->IsOk() )
3003 return conv;
3004
3005 delete conv;
7608a683
WS
3006#else
3007 return NULL;
3008#endif
e95354ec
VZ
3009 }
3010#endif // wxHAVE_WIN32_MB2WC
ef199164 3011
5c4ed98d 3012#ifdef __DARWIN__
f7e98dee 3013 {
6ff49cbc
DE
3014 // leave UTF16 and UTF32 to the built-ins of wx
3015 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3016 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 3017 {
a6900d10 3018#if wxUSE_FONTMAP
5c4ed98d
DE
3019 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3020 : new wxMBConv_cf(m_encoding);
a6900d10 3021#else
5c4ed98d 3022 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 3023#endif
ef199164 3024
f7e98dee 3025 if ( conv->IsOk() )
d775fa82
WS
3026 return conv;
3027
3028 delete conv;
3029 }
335d31e0 3030 }
5c4ed98d
DE
3031#endif // __DARWIN__
3032
e95354ec
VZ
3033 // step (2)
3034 wxFontEncoding enc = m_encoding;
3035#if wxUSE_FONTMAP
c547282d
VZ
3036 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3037 {
3038 // use "false" to suppress interactive dialogs -- we can be called from
3039 // anywhere and popping up a dialog from here is the last thing we want to
3040 // do
267e11c5 3041 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3042 }
e95354ec
VZ
3043#endif // wxUSE_FONTMAP
3044
3045 switch ( enc )
3046 {
3047 case wxFONTENCODING_UTF7:
3048 return new wxMBConvUTF7;
3049
3050 case wxFONTENCODING_UTF8:
3051 return new wxMBConvUTF8;
3052
e95354ec
VZ
3053 case wxFONTENCODING_UTF16BE:
3054 return new wxMBConvUTF16BE;
3055
3056 case wxFONTENCODING_UTF16LE:
3057 return new wxMBConvUTF16LE;
3058
e95354ec
VZ
3059 case wxFONTENCODING_UTF32BE:
3060 return new wxMBConvUTF32BE;
3061
3062 case wxFONTENCODING_UTF32LE:
3063 return new wxMBConvUTF32LE;
3064
3065 default:
3066 // nothing to do but put here to suppress gcc warnings
ef199164 3067 break;
e95354ec
VZ
3068 }
3069
3070 // step (3)
3071#if wxUSE_FONTMAP
3072 {
3073 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3074 : new wxMBConv_wxwin(m_encoding);
3075 if ( conv->IsOk() )
3076 return conv;
3077
3078 delete conv;
3079 }
3080#endif // wxUSE_FONTMAP
3081
a58d4f4d
VS
3082 // NB: This is a hack to prevent deadlock. What could otherwise happen
3083 // in Unicode build: wxConvLocal creation ends up being here
3084 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
3085 // attach a timestamp, for which it will need wxConvLocal (to convert
3086 // time to char* and then wchar_t*), but that fails, tries to log the
3087 // error, but wxLog has an (already locked) critical section that
3088 // guards the static buffer.
a58d4f4d
VS
3089 static bool alreadyLoggingError = false;
3090 if (!alreadyLoggingError)
3091 {
3092 alreadyLoggingError = true;
3093 wxLogError(_("Cannot convert from the charset '%s'!"),
3094 m_name ? m_name
e95354ec
VZ
3095 :
3096#if wxUSE_FONTMAP
86501081 3097 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 3098#else // !wxUSE_FONTMAP
86501081 3099 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
3100#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3101 );
ef199164 3102
a58d4f4d
VS
3103 alreadyLoggingError = false;
3104 }
e95354ec
VZ
3105
3106 return NULL;
3107}
3108
3109void wxCSConv::CreateConvIfNeeded() const
3110{
3111 if ( m_deferred )
3112 {
3113 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 3114
bda3d86a
VZ
3115 // if we don't have neither the name nor the encoding, use the default
3116 // encoding for this system
3117 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3118 {
4c75209f 3119#if wxUSE_INTL
02c7347b 3120 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3121#else
3122 // fallback to some reasonable default:
3123 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3124#endif // wxUSE_INTL
4c75209f 3125 }
bda3d86a 3126
e95354ec
VZ
3127 self->m_convReal = DoCreate();
3128 self->m_deferred = false;
6001e347 3129 }
6001e347
RR
3130}
3131
0f0298b1
VZ
3132bool wxCSConv::IsOk() const
3133{
3134 CreateConvIfNeeded();
3135
3136 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3137 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3138 return true; // always ok as we do it ourselves
3139
3140 // m_convReal->IsOk() is called at its own creation, so we know it must
3141 // be ok if m_convReal is non-NULL
3142 return m_convReal != NULL;
3143}
3144
1c714a5d
VZ
3145size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3146 const char *src, size_t srcLen) const
3147{
3148 CreateConvIfNeeded();
3149
2c74c558
VS
3150 if (m_convReal)
3151 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3152
3153 // latin-1 (direct)
3154 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3155}
3156
3157size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3158 const wchar_t *src, size_t srcLen) const
3159{
3160 CreateConvIfNeeded();
3161
2c74c558
VS
3162 if (m_convReal)
3163 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3164
3165 // latin-1 (direct)
3166 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3167}
3168
6001e347
RR
3169size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3170{
e95354ec 3171 CreateConvIfNeeded();
dccce9ea 3172
e95354ec
VZ
3173 if (m_convReal)
3174 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3175
3176 // latin-1 (direct)
4def3b35 3177 size_t len = strlen(psz);
dccce9ea 3178
f1339c56
RR
3179 if (buf)
3180 {
4def3b35 3181 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3182 buf[c] = (unsigned char)(psz[c]);
3183 }
dccce9ea 3184
f1339c56 3185 return len;
6001e347
RR
3186}
3187
3188size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3189{
e95354ec 3190 CreateConvIfNeeded();
dccce9ea 3191
e95354ec
VZ
3192 if (m_convReal)
3193 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3194
f1339c56 3195 // latin-1 (direct)
f8d791e0 3196 const size_t len = wxWcslen(psz);
f1339c56
RR
3197 if (buf)
3198 {
4def3b35 3199 for (size_t c = 0; c <= len; c++)
24642831
VS
3200 {
3201 if (psz[c] > 0xFF)
467e0479 3202 return wxCONV_FAILED;
ef199164 3203
907173e5 3204 buf[c] = (char)psz[c];
24642831
VS
3205 }
3206 }
3207 else
3208 {
3209 for (size_t c = 0; c <= len; c++)
3210 {
3211 if (psz[c] > 0xFF)
467e0479 3212 return wxCONV_FAILED;
24642831 3213 }
f1339c56 3214 }
dccce9ea 3215
f1339c56 3216 return len;
6001e347
RR
3217}
3218
7ef3ab50 3219size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3220{
3221 CreateConvIfNeeded();
3222
3223 if ( m_convReal )
3224 {
7ef3ab50 3225 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3226 }
3227
ba98e032 3228 // otherwise, we are ISO-8859-1
c1464d9d 3229 return 1;
eec47cc6
VZ
3230}
3231
ba98e032
VS
3232#if wxUSE_UNICODE_UTF8
3233bool wxCSConv::IsUTF8() const
3234{
3235 CreateConvIfNeeded();
3236
3237 if ( m_convReal )
3238 {
3239 return m_convReal->IsUTF8();
3240 }
3241
3242 // otherwise, we are ISO-8859-1
3243 return false;
3244}
3245#endif
3246
69c928ef
VZ
3247
3248#if wxUSE_UNICODE
3249
3250wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3251{
3252 if ( !s )
3253 return wxWCharBuffer();
3254
3255 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3256 if ( !wbuf )
5487ff0f 3257 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3258 if ( !wbuf )
3259 wbuf = wxConvISO8859_1.cMB2WX(s);
3260
3261 return wbuf;
3262}
3263
3264wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3265{
3266 if ( !ws )
3267 return wxCharBuffer();
3268
3269 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3270 if ( !buf )
3271 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3272
3273 return buf;
3274}
3275
3276#endif // wxUSE_UNICODE
f5a1953b 3277
1e50d914
VS
3278// ----------------------------------------------------------------------------
3279// globals
3280// ----------------------------------------------------------------------------
3281
3282// NB: The reason why we create converter objects in this convoluted way,
3283// using a factory function instead of a global variable, is that they
3284// may be used at static initialization time (some of them are used by
3285// wxString ctors and there may be a global wxString object). In other
3286// words, possibly _before_ the converter global object would be
3287// initialized.
3288
3289#undef wxConvLibc
3290#undef wxConvUTF8
3291#undef wxConvUTF7
3292#undef wxConvLocal
3293#undef wxConvISO8859_1
3294
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines, for the
// converter called "name":
//  - a pointer variable name##Ptr of type klass*, initialized to NULL
//  - an accessor wxGet_##name##Ptr() returning the address of a static
//    impl_klass object local to that function, constructed with ctor_args
//  - a file-static pointer gs_##name##instance initialized by calling the
//    accessor
// The expansion doesn't end with a semicolon, the user of the macro adds it.
3295#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3296 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3297 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3298 { \
3299 static impl_klass name##Obj ctor_args; \
3300 return &name##Obj; \
3301 } \
3302 /* this ensures that all global converter objects are created */ \
3303 /* by the time static initialization is done, i.e. before any */ \
3304 /* thread is launched: */ \
3305 static klass* gs_##name##instance = wxGet_##name##Ptr()
3306
// Shorthand for WX_DEFINE_GLOBAL_CONV2 when the same class is used both as
// the pointer type and as the type of the object itself.
3307#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3308 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3309
// wxConvLibc is exposed as a wxMBConv but the object behind it is a
// wxMBConv_win32 when __WINDOWS__ is defined and a wxMBConvLibc otherwise;
// in both cases it is constructed without any arguments.
3310#ifdef __WINDOWS__
3311 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3312#else
3313 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3314#endif
3315
e1079eda
VZ
3316// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3317// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3318// provokes an error message about "not enough macro parameters"; and we
3319// can't use "()" here as the name##Obj declaration would be parsed as a
3320// function declaration then, so use a semicolon and live with an extra
3321// empty statement (and hope that no compiler warns about this)
// UTF-8 and UTF-7 converters, constructed without arguments (the lone
// semicolon stands in for the empty constructor argument list)
3322WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3323WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3324
// wxCSConv-based converters for the system encoding and for ISO-8859-1
3325WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3326WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3327
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one
3328WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3329WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3330
6ac84a78
DE
3331#ifdef __DARWIN__
3332// The xnu kernel always communicates file paths in decomposed UTF-8.
3333// WARNING: Are we sure that CFString's conversion will cause decomposition?
3334static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3335#endif
6ac84a78 3336
// wxConvFileName initially points to the wxMBConv_cf object above when
// __DARWIN__ is defined and to the wxConvLibc object otherwise.
1e50d914 3337WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3338#ifdef __DARWIN__
1e50d914 3339 &wxConvMacUTF8DObj;
6ac84a78 3340#else // !__DARWIN__
1e50d914 3341 wxGet_wxConvLibcPtr();
6ac84a78 3342#endif // __DARWIN__/!__DARWIN__
1e50d914 3343
bde4baac
VZ
3344#else // !wxUSE_WCHAR_T
3345
1e50d914 3346// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3347// stand-ins in absence of wchar_t
// in the !wxUSE_WCHAR_T branch all four converters are plain wxMBConv
// objects rather than pointers to objects created by factory functions
3348WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3349 wxConvISO8859_1,
3350 wxConvLocal,
3351 wxConvUTF8;
3352
3353#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T