]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
minimize searching for tlw parent, remove useless code
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
b040e242 47#ifdef HAVE_ICONV
373658eb 48 #include <iconv.h>
b1d547eb 49 #include "wx/thread.h"
1cd52418 50#endif
1cd52418 51
373658eb
VZ
52#include "wx/encconv.h"
53#include "wx/fontmap.h"
54
5c4ed98d 55#ifdef __DARWIN__
e4dd1e19 56#include "wx/mac/corefoundation/private/strconv_cf.h"
5c4ed98d
DE
57#endif //def __DARWIN__
58
ef199164 59
ce6f8d6f
VZ
60#define TRACE_STRCONV _T("strconv")
61
467e0479
VZ
62// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63// be 4 bytes
4948c2b6 64#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
65 #define WC_UTF16
66#endif
67
ef199164 68
373658eb
VZ
69// ============================================================================
70// implementation
71// ============================================================================
72
69373110
VZ
// Helper of the conversion functions: returns true if at least one of the n
// bytes starting at p is not NUL and false if all of them are NUL (which is
// trivially the case for n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
373658eb 82// ----------------------------------------------------------------------------
467e0479 83// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 84// ----------------------------------------------------------------------------
6001e347 85
c91830cb 86static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 87{
ef199164 88 if (input <= 0xffff)
4def3b35 89 {
999836aa
VZ
90 if (output)
91 *output = (wxUint16) input;
ef199164 92
4def3b35 93 return 1;
dccce9ea 94 }
ef199164 95 else if (input >= 0x110000)
4def3b35 96 {
467e0479 97 return wxCONV_FAILED;
dccce9ea
VZ
98 }
99 else
4def3b35 100 {
dccce9ea 101 if (output)
4def3b35 102 {
ef199164
DS
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 105 }
ef199164 106
4def3b35 107 return 2;
1cd52418 108 }
1cd52418
OK
109}
110
c91830cb 111static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 112{
ef199164 113 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
114 {
115 output = *input;
116 return 1;
dccce9ea 117 }
ef199164 118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
119 {
120 output = *input;
467e0479 121 return wxCONV_FAILED;
dccce9ea
VZ
122 }
123 else
4def3b35
VS
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
1cd52418
OK
128}
129
467e0479 130#ifdef WC_UTF16
35d11700
VZ
131 typedef wchar_t wxDecodeSurrogate_t;
132#else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
135
136// returns the next UTF-32 character from the wchar_t buffer and advances the
137// pointer to the character after this one
138//
139// if an invalid character is found, *pSrc is set to NULL, the caller must
140// check for this
35d11700 141static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
142{
143 wxUint32 out;
8d3dd069
VZ
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152}
153
f6bcfd97 154// ----------------------------------------------------------------------------
6001e347 155// wxMBConv
f6bcfd97 156// ----------------------------------------------------------------------------
2c53a80a 157
483b0434
VZ
158size_t
159wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
6001e347 161{
483b0434
VZ
162 // although new conversion classes are supposed to implement this function
163 // directly, the existins ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
6001e347 168
483b0434
VZ
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten = 0;
eec47cc6 171
c1464d9d 172 // the number of NULs terminating this string
a78c43f1 173 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 174
c1464d9d
VZ
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
178 // NULs at the end
483b0434
VZ
179 wxCharBuffer bufTmp;
180 const char *srcEnd;
467e0479 181 if ( srcLen != wxNO_LEN )
eec47cc6 182 {
c1464d9d 183 // we need to know how to find the end of this string
7ef3ab50 184 nulLen = GetMBNulLen();
483b0434
VZ
185 if ( nulLen == wxCONV_FAILED )
186 return wxCONV_FAILED;
e4e3bbb4 187
c1464d9d 188 // if there are enough NULs we can avoid the copy
483b0434 189 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
190 {
191 // make a copy in order to properly NUL-terminate the string
483b0434 192 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 193 char * const p = bufTmp.data();
483b0434
VZ
194 memcpy(p, src, srcLen);
195 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 196 *s = '\0';
483b0434
VZ
197
198 src = bufTmp;
eec47cc6 199 }
e4e3bbb4 200
483b0434
VZ
201 srcEnd = src + srcLen;
202 }
203 else // quit after the first loop iteration
204 {
205 srcEnd = NULL;
206 }
e4e3bbb4 207
483b0434 208 for ( ;; )
eec47cc6 209 {
c1464d9d 210 // try to convert the current chunk
483b0434 211 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
e4e3bbb4 214
467e0479 215 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 216
483b0434 217 dstWritten += lenChunk;
f5fb6871 218
467e0479
VZ
219 if ( lenChunk == 1 )
220 {
221 // nothing left in the input string, conversion succeeded
222 break;
223 }
224
483b0434
VZ
225 if ( dst )
226 {
227 if ( dstWritten > dstLen )
228 return wxCONV_FAILED;
229
830f8f11 230 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
231 return wxCONV_FAILED;
232
233 dst += lenChunk;
234 }
c1464d9d 235
483b0434 236 if ( !srcEnd )
c1464d9d 237 {
467e0479
VZ
238 // we convert just one chunk in this case as this is the entire
239 // string anyhow
c1464d9d
VZ
240 break;
241 }
eec47cc6
VZ
242
243 // advance the input pointer past the end of this chunk
483b0434 244 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
245 {
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
483b0434 250 src += nulLen;
c1464d9d 251 }
e4e3bbb4 252
483b0434 253 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
254
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by inEnd
483b0434 258 if ( src >= srcEnd )
c1464d9d
VZ
259 break;
260 }
261
483b0434 262 return dstWritten;
e4e3bbb4
RN
263}
264
483b0434
VZ
265size_t
266wxMBConv::FromWChar(char *dst, size_t dstLen,
267 const wchar_t *src, size_t srcLen) const
e4e3bbb4 268{
483b0434
VZ
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten = 0;
e4e3bbb4 271
eec47cc6
VZ
272 // make a copy of the input string unless it is already properly
273 // NUL-terminated
274 //
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp;
467e0479 278 if ( srcLen == wxNO_LEN )
e4e3bbb4 279 {
483b0434 280 srcLen = wxWcslen(src) + 1;
eec47cc6 281 }
483b0434 282 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
283 {
284 // make a copy in order to properly NUL-terminate the string
483b0434 285 bufTmp = wxWCharBuffer(srcLen);
ef199164 286 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
287 src = bufTmp;
288 }
289
290 const size_t lenNul = GetMBNulLen();
291 for ( const wchar_t * const srcEnd = src + srcLen;
292 src < srcEnd;
293 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
294 {
295 // try to convert the current chunk
296 size_t lenChunk = WC2MB(NULL, src, 0);
297
298 if ( lenChunk == wxCONV_FAILED )
299 return wxCONV_FAILED;
300
301 lenChunk += lenNul;
302 dstWritten += lenChunk;
303
304 if ( dst )
305 {
306 if ( dstWritten > dstLen )
307 return wxCONV_FAILED;
308
309 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 dst += lenChunk;
313 }
eec47cc6 314 }
e4e3bbb4 315
483b0434
VZ
316 return dstWritten;
317}
318
ef199164 319size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 320{
ef199164 321 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 322 if ( rc != wxCONV_FAILED )
509da451
VZ
323 {
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
326 rc--;
327 }
328
329 return rc;
330}
331
ef199164 332size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 333{
ef199164 334 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 335 if ( rc != wxCONV_FAILED )
509da451
VZ
336 {
337 rc -= GetMBNulLen();
338 }
339
340 return rc;
341}
342
483b0434
VZ
343wxMBConv::~wxMBConv()
344{
345 // nothing to do here (necessary for Darwin linking probably)
346}
e4e3bbb4 347
483b0434
VZ
348const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
349{
350 if ( psz )
eec47cc6 351 {
483b0434 352 // calculate the length of the buffer needed first
a2db25a1 353 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 354 if ( nLen != wxCONV_FAILED )
f5fb6871 355 {
483b0434 356 // now do the actual conversion
a2db25a1 357 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 358
483b0434 359 // +1 for the trailing NULL
a2db25a1 360 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 361 return buf;
f5fb6871 362 }
483b0434 363 }
e4e3bbb4 364
483b0434
VZ
365 return wxWCharBuffer();
366}
3698ae71 367
483b0434
VZ
368const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
369{
370 if ( pwz )
371 {
a2db25a1 372 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 373 if ( nLen != wxCONV_FAILED )
483b0434 374 {
a2db25a1
VZ
375 wxCharBuffer buf(nLen - 1);
376 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
377 return buf;
378 }
379 }
380
381 return wxCharBuffer();
382}
e4e3bbb4 383
483b0434 384const wxWCharBuffer
ef199164 385wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 386{
ef199164 387 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 388 if ( dstLen != wxCONV_FAILED )
483b0434 389 {
0dd13d21
VZ
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer wbuf(dstLen);
00ceccee 394 wbuf.data()[dstLen - 1] = L'\0';
ef199164 395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
396 {
397 if ( outLen )
467e0479
VZ
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
483b0434
VZ
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412}
413
414const wxCharBuffer
ef199164 415wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 416{
13d92ad6 417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 418 if ( dstLen != wxCONV_FAILED )
483b0434 419 {
0dd13d21
VZ
420 const size_t nulLen = GetMBNulLen();
421
422 // as above, ensure that the buffer is always NUL-terminated, even if
423 // the input is not
424 wxCharBuffer buf(dstLen + nulLen - 1);
425 memset(buf.data() + dstLen, 0, nulLen);
ef199164 426 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
427 {
428 if ( outLen )
467e0479
VZ
429 {
430 *outLen = dstLen;
431
13d92ad6
VZ
432 if ( dstLen >= nulLen &&
433 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
467e0479
VZ
434 {
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
13d92ad6 437 *outLen -= nulLen;
467e0479
VZ
438 }
439 }
d32a507d 440
483b0434
VZ
441 return buf;
442 }
e4e3bbb4
RN
443 }
444
eec47cc6
VZ
445 if ( outLen )
446 *outLen = 0;
447
448 return wxCharBuffer();
e4e3bbb4
RN
449}
450
6001e347 451// ----------------------------------------------------------------------------
bde4baac 452// wxMBConvLibc
6001e347
RR
453// ----------------------------------------------------------------------------
454
bde4baac
VZ
455size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
456{
457 return wxMB2WC(buf, psz, n);
458}
459
460size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
461{
462 return wxWC2MB(buf, psz, n);
463}
e1bfe89e
RR
464
465// ----------------------------------------------------------------------------
532d575b 466// wxConvBrokenFileNames
e1bfe89e
RR
467// ----------------------------------------------------------------------------
468
eec47cc6
VZ
469#ifdef __UNIX__
470
86501081 471wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 472{
86501081
VS
473 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
474 wxStricmp(charset, _T("UTF8")) == 0 )
5deedd6e 475 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
476 else
477 m_conv = new wxCSConv(charset);
ea8ce907
RR
478}
479
eec47cc6 480#endif // __UNIX__
c12b7f79 481
bde4baac 482// ----------------------------------------------------------------------------
3698ae71 483// UTF-7
bde4baac 484// ----------------------------------------------------------------------------
6001e347 485
15f2ee32 486// Implementation (C) 2004 Fredrik Roubert
6001e347 487
15f2ee32
RN
//
// BASE64 decoding table: maps a byte to the 6 bit value it encodes or to 0xff
// if the byte is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
526
527size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
528{
15f2ee32
RN
529 size_t len = 0;
530
04a37834 531 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
532 {
533 unsigned char cc = *psz++;
534 if (cc != '+')
535 {
536 // plain ASCII char
537 if (buf)
538 *buf++ = cc;
539 len++;
540 }
541 else if (*psz == '-')
542 {
543 // encoded plus sign
544 if (buf)
545 *buf++ = cc;
546 len++;
547 psz++;
548 }
04a37834 549 else // start of BASE64 encoded string
15f2ee32 550 {
04a37834 551 bool lsb, ok;
15f2ee32 552 unsigned int d, l;
04a37834
VZ
553 for ( ok = lsb = false, d = 0, l = 0;
554 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
555 psz++ )
15f2ee32
RN
556 {
557 d <<= 6;
558 d += cc;
559 for (l += 6; l >= 8; lsb = !lsb)
560 {
04a37834 561 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
562 if (lsb)
563 {
564 if (buf)
565 *buf++ |= c;
566 len ++;
567 }
568 else
04a37834 569 {
15f2ee32 570 if (buf)
6356d52a 571 *buf = (wchar_t)(c << 8);
04a37834
VZ
572 }
573
574 ok = true;
15f2ee32
RN
575 }
576 }
04a37834
VZ
577
578 if ( !ok )
579 {
580 // in valid UTF7 we should have valid characters after '+'
467e0479 581 return wxCONV_FAILED;
04a37834
VZ
582 }
583
15f2ee32
RN
584 if (*psz == '-')
585 psz++;
586 }
587 }
04a37834
VZ
588
589 if ( buf && (len < n) )
590 *buf = '\0';
591
15f2ee32 592 return len;
6001e347
RR
593}
594
15f2ee32
RN
//
// BASE64 encoding table: maps a 6 bit value to the character encoding it
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
609
//
// UTF-7 encoding table giving the class of each ASCII character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00..0x07
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08..0x0f
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10..0x17
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18..0x1f
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20..0x27
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28..0x2f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30..0x37
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38..0x3f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40..0x47
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48..0x4f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50..0x57
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58..0x5f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60..0x67
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68..0x6f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70..0x77
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78..0x7f
};
629
667e5b3e 630size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 631{
15f2ee32
RN
632 size_t len = 0;
633
634 while (*psz && ((!buf) || (len < n)))
635 {
636 wchar_t cc = *psz++;
637 if (cc < 0x80 && utf7encode[cc] < 1)
638 {
639 // plain ASCII char
640 if (buf)
641 *buf++ = (char)cc;
ef199164 642
15f2ee32
RN
643 len++;
644 }
645#ifndef WC_UTF16
79c78d42 646 else if (((wxUint32)cc) > 0xffff)
b2c13097 647 {
15f2ee32 648 // no surrogate pair generation (yet?)
467e0479 649 return wxCONV_FAILED;
15f2ee32
RN
650 }
651#endif
652 else
653 {
654 if (buf)
655 *buf++ = '+';
ef199164 656
15f2ee32
RN
657 len++;
658 if (cc != '+')
659 {
660 // BASE64 encode string
661 unsigned int lsb, d, l;
73c902d6 662 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
663 {
664 for (lsb = 0; lsb < 2; lsb ++)
665 {
666 d <<= 8;
667 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
668
669 for (l += 8; l >= 6; )
670 {
671 l -= 6;
672 if (buf)
673 *buf++ = utf7enb64[(d >> l) % 64];
674 len++;
675 }
676 }
ef199164 677
15f2ee32
RN
678 cc = *psz;
679 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
680 break;
681 }
ef199164 682
15f2ee32
RN
683 if (l != 0)
684 {
685 if (buf)
686 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
ef199164 687
15f2ee32
RN
688 len++;
689 }
690 }
ef199164 691
15f2ee32
RN
692 if (buf)
693 *buf++ = '-';
694 len++;
695 }
696 }
ef199164 697
15f2ee32
RN
698 if (buf && (len < n))
699 *buf = 0;
ef199164 700
15f2ee32 701 return len;
6001e347
RR
702}
703
f6bcfd97 704// ----------------------------------------------------------------------------
6001e347 705// UTF-8
f6bcfd97 706// ----------------------------------------------------------------------------
6001e347 707
1774c3c5 708static const wxUint32 utf8_max[]=
4def3b35 709 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 710
3698ae71
VZ
711// boundaries of the private use area we use to (temporarily) remap invalid
712// characters invalid in a UTF-8 encoded string
ea8ce907
RR
713const wxUint32 wxUnicodePUA = 0x100000;
714const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
715
// this table maps the first byte of a UTF-8 sequence to the total length of
// this sequence in bytes, with 0 meaning that the byte can't start a sequence
const unsigned char tableUtf8Lengths[256] = {
    // 0x00..0x7F: ASCII characters are encoded as themselves
    /* 00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 0x80..0xBF: invalid as the first byte
    /* 80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0xC0 and 0xC1 are invalid too, 0xC2..0xDF start 2 byte sequences
    /* C0 */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // 0xE0..0xEF start 3 byte sequences
    /* E0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // 0xF0..0xF4 start 4 byte sequences while 0xF5..0xFF are invalid: they
    // would start 5 or 6 byte sequences or sequences for code points above
    // U+10FFFF, which are not allowed by RFC 3629
    /* F0 */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
750
751size_t
752wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
753 const char *src, size_t srcLen) const
754{
755 wchar_t *out = dstLen ? dst : NULL;
756 size_t written = 0;
757
758 if ( srcLen == wxNO_LEN )
759 srcLen = strlen(src) + 1;
760
761 for ( const char *p = src; ; p++ )
762 {
763 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
764 {
765 // all done successfully, just add the trailing NULL if we are not
766 // using explicit length
767 if ( srcLen == wxNO_LEN )
768 {
769 if ( out )
770 {
771 if ( !dstLen )
772 break;
773
774 *out = L'\0';
775 }
776
777 written++;
778 }
779
780 return written;
781 }
782
0286d08d
VZ
783 if ( out && !dstLen-- )
784 break;
785
5367a38a
VS
786 wxUint32 code;
787 unsigned char c = *p;
0286d08d 788
5367a38a
VS
789 if ( c < 0x80 )
790 {
791 if ( srcLen == 0 ) // the test works for wxNO_LEN too
792 break;
0286d08d 793
5367a38a
VS
794 if ( srcLen != wxNO_LEN )
795 srcLen--;
0286d08d 796
5367a38a
VS
797 code = c;
798 }
799 else
0286d08d 800 {
5367a38a
VS
801 unsigned len = tableUtf8Lengths[c];
802 if ( !len )
803 break;
804
805 if ( srcLen < len ) // the test works for wxNO_LEN too
806 break;
807
808 if ( srcLen != wxNO_LEN )
809 srcLen -= len;
810
811 // Char. number range | UTF-8 octet sequence
812 // (hexadecimal) | (binary)
813 // ----------------------+----------------------------------------
814 // 0000 0000 - 0000 007F | 0xxxxxxx
815 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
816 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
817 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
818 //
819 // Code point value is stored in bits marked with 'x',
820 // lowest-order bit of the value on the right side in the diagram
821 // above. (from RFC 3629)
822
823 // mask to extract lead byte's value ('x' bits above), by sequence
824 // length:
825 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
826
827 // mask and value of lead byte's most significant bits, by length:
828 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
829 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
830
831 len--; // it's more convenient to work with 0-based length here
832
833 // extract the lead byte's value bits:
834 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
835 break;
836
837 code = c & leadValueMask[len];
838
839 // all remaining bytes, if any, are handled in the same way
840 // regardless of sequence's length:
841 for ( ; len; --len )
842 {
843 c = *++p;
844 if ( (c & 0xC0) != 0x80 )
845 return wxCONV_FAILED;
0286d08d 846
5367a38a
VS
847 code <<= 6;
848 code |= c & 0x3F;
849 }
0286d08d
VZ
850 }
851
852#ifdef WC_UTF16
853 // cast is ok because wchar_t == wxUint16 if WC_UTF16
854 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
855 {
856 if ( out )
857 out++;
858 written++;
859 }
860#else // !WC_UTF16
861 if ( out )
862 *out = code;
863#endif // WC_UTF16/!WC_UTF16
864
865 if ( out )
866 out++;
867
868 written++;
869 }
870
871 return wxCONV_FAILED;
872}
873
874size_t
875wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
876 const wchar_t *src, size_t srcLen) const
877{
878 char *out = dstLen ? dst : NULL;
879 size_t written = 0;
880
881 for ( const wchar_t *wp = src; ; wp++ )
882 {
883 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
884 {
885 // all done successfully, just add the trailing NULL if we are not
886 // using explicit length
887 if ( srcLen == wxNO_LEN )
888 {
889 if ( out )
890 {
891 if ( !dstLen )
892 break;
893
894 *out = '\0';
895 }
896
897 written++;
898 }
899
900 return written;
901 }
902
903
904 wxUint32 code;
905#ifdef WC_UTF16
906 // cast is ok for WC_UTF16
907 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
908 {
909 // skip the next char too as we decoded a surrogate
910 wp++;
911 }
912#else // wchar_t is UTF-32
913 code = *wp & 0x7fffffff;
914#endif
915
916 unsigned len;
917 if ( code <= 0x7F )
918 {
919 len = 1;
920 if ( out )
921 {
922 if ( dstLen < len )
923 break;
924
925 out[0] = (char)code;
926 }
927 }
928 else if ( code <= 0x07FF )
929 {
930 len = 2;
931 if ( out )
932 {
933 if ( dstLen < len )
934 break;
935
936 // NB: this line takes 6 least significant bits, encodes them as
937 // 10xxxxxx and discards them so that the next byte can be encoded:
938 out[1] = 0x80 | (code & 0x3F); code >>= 6;
939 out[0] = 0xC0 | code;
940 }
941 }
942 else if ( code < 0xFFFF )
943 {
944 len = 3;
945 if ( out )
946 {
947 if ( dstLen < len )
948 break;
949
950 out[2] = 0x80 | (code & 0x3F); code >>= 6;
951 out[1] = 0x80 | (code & 0x3F); code >>= 6;
952 out[0] = 0xE0 | code;
953 }
954 }
955 else if ( code <= 0x10FFFF )
956 {
957 len = 4;
958 if ( out )
959 {
960 if ( dstLen < len )
961 break;
962
963 out[3] = 0x80 | (code & 0x3F); code >>= 6;
964 out[2] = 0x80 | (code & 0x3F); code >>= 6;
965 out[1] = 0x80 | (code & 0x3F); code >>= 6;
966 out[0] = 0xF0 | code;
967 }
968 }
969 else
970 {
971 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
972 break;
973 }
974
975 if ( out )
976 {
977 out += len;
978 dstLen -= len;
979 }
980
981 written += len;
982 }
983
984 // we only get here if an error occurs during decoding
985 return wxCONV_FAILED;
986}
987
d16d0917
VZ
988size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
989 const char *psz, size_t srcLen) const
6001e347 990{
0286d08d 991 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 992 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 993
4def3b35
VS
994 size_t len = 0;
995
d16d0917 996 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 997 {
ea8ce907
RR
998 const char *opsz = psz;
999 bool invalid = false;
4def3b35
VS
1000 unsigned char cc = *psz++, fc = cc;
1001 unsigned cnt;
dccce9ea 1002 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1003 fc <<= 1;
ef199164 1004
dccce9ea 1005 if (!cnt)
4def3b35
VS
1006 {
1007 // plain ASCII char
dccce9ea 1008 if (buf)
4def3b35
VS
1009 *buf++ = cc;
1010 len++;
561488ef
MW
1011
1012 // escape the escape character for octal escapes
1013 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1014 && cc == '\\' && (!buf || len < n))
1015 {
1016 if (buf)
1017 *buf++ = cc;
1018 len++;
1019 }
dccce9ea
VZ
1020 }
1021 else
4def3b35
VS
1022 {
1023 cnt--;
dccce9ea 1024 if (!cnt)
4def3b35
VS
1025 {
1026 // invalid UTF-8 sequence
ea8ce907 1027 invalid = true;
dccce9ea
VZ
1028 }
1029 else
4def3b35
VS
1030 {
1031 unsigned ocnt = cnt - 1;
1032 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1033 while (cnt--)
4def3b35 1034 {
ea8ce907 1035 cc = *psz;
dccce9ea 1036 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1037 {
1038 // invalid UTF-8 sequence
ea8ce907
RR
1039 invalid = true;
1040 break;
4def3b35 1041 }
ef199164 1042
ea8ce907 1043 psz++;
4def3b35
VS
1044 res = (res << 6) | (cc & 0x3f);
1045 }
ef199164 1046
ea8ce907 1047 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1048 {
1049 // illegal UTF-8 encoding
ea8ce907 1050 invalid = true;
4def3b35 1051 }
ea8ce907
RR
1052 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1053 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1054 {
1055 // if one of our PUA characters turns up externally
1056 // it must also be treated as an illegal sequence
1057 // (a bit like you have to escape an escape character)
1058 invalid = true;
1059 }
1060 else
1061 {
1cd52418 1062#ifdef WC_UTF16
0286d08d 1063 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1064 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1065 if (pa == wxCONV_FAILED)
ea8ce907
RR
1066 {
1067 invalid = true;
1068 }
1069 else
1070 {
1071 if (buf)
1072 buf += pa;
1073 len += pa;
1074 }
373658eb 1075#else // !WC_UTF16
ea8ce907 1076 if (buf)
38d4b1e4 1077 *buf++ = (wchar_t)res;
ea8ce907 1078 len++;
373658eb 1079#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1080 }
1081 }
ef199164 1082
ea8ce907
RR
1083 if (invalid)
1084 {
1085 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1086 {
1087 while (opsz < psz && (!buf || len < n))
1088 {
1089#ifdef WC_UTF16
1090 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1091 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1092 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1093 if (buf)
1094 buf += pa;
1095 opsz++;
1096 len += pa;
1097#else
1098 if (buf)
38d4b1e4 1099 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1100 opsz++;
1101 len++;
1102#endif
1103 }
1104 }
3698ae71 1105 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1106 {
1107 while (opsz < psz && (!buf || len < n))
1108 {
3698ae71
VZ
1109 if ( buf && len + 3 < n )
1110 {
17a1ebd1 1111 unsigned char on = *opsz;
3698ae71 1112 *buf++ = L'\\';
17a1ebd1
VZ
1113 *buf++ = (wchar_t)( L'0' + on / 0100 );
1114 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1115 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1116 }
ef199164 1117
ea8ce907
RR
1118 opsz++;
1119 len += 4;
1120 }
1121 }
3698ae71 1122 else // MAP_INVALID_UTF8_NOT
ea8ce907 1123 {
467e0479 1124 return wxCONV_FAILED;
ea8ce907 1125 }
4def3b35
VS
1126 }
1127 }
6001e347 1128 }
ef199164 1129
d16d0917 1130 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1131 *buf = 0;
ef199164 1132
d16d0917 1133 return len + 1;
6001e347
RR
1134}
1135
3698ae71
VZ
// Tell whether the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1140
d16d0917
VZ
1141size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1142 const wchar_t *psz, size_t srcLen) const
6001e347 1143{
0286d08d 1144 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1145 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1146
4def3b35 1147 size_t len = 0;
6001e347 1148
d16d0917 1149 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1150 {
1151 wxUint32 cc;
ef199164 1152
1cd52418 1153#ifdef WC_UTF16
b5153fd8
VZ
1154 // cast is ok for WC_UTF16
1155 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1156 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1157#else
ef199164 1158 cc = (*psz++) & 0x7fffffff;
4def3b35 1159#endif
3698ae71
VZ
1160
1161 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1162 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1163 {
dccce9ea 1164 if (buf)
ea8ce907 1165 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1166 len++;
3698ae71 1167 }
561488ef
MW
1168 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1169 && cc == L'\\' && psz[0] == L'\\' )
1170 {
1171 if (buf)
1172 *buf++ = (char)cc;
1173 psz++;
1174 len++;
1175 }
3698ae71
VZ
1176 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1177 cc == L'\\' &&
1178 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1179 {
dccce9ea 1180 if (buf)
3698ae71 1181 {
ef199164
DS
1182 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1183 (psz[1] - L'0') * 010 +
b2c13097 1184 (psz[2] - L'0'));
3698ae71
VZ
1185 }
1186
1187 psz += 3;
ea8ce907
RR
1188 len++;
1189 }
1190 else
1191 {
1192 unsigned cnt;
ef199164
DS
1193 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1194 {
1195 }
1196
ea8ce907 1197 if (!cnt)
4def3b35 1198 {
ea8ce907
RR
1199 // plain ASCII char
1200 if (buf)
1201 *buf++ = (char) cc;
1202 len++;
1203 }
ea8ce907
RR
1204 else
1205 {
1206 len += cnt + 1;
1207 if (buf)
1208 {
1209 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1210 while (cnt--)
1211 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1212 }
4def3b35
VS
1213 }
1214 }
6001e347 1215 }
4def3b35 1216
d16d0917 1217 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1218 *buf = 0;
adb45366 1219
d16d0917 1220 return len + 1;
6001e347
RR
1221}
1222
467e0479 1223// ============================================================================
c91830cb 1224// UTF-16
467e0479 1225// ============================================================================
c91830cb
VZ
1226
1227#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1228 #define wxMBConvUTF16straight wxMBConvUTF16BE
1229 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1230#else
bde4baac
VZ
1231 #define wxMBConvUTF16swap wxMBConvUTF16BE
1232 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1233#endif
1234
467e0479
VZ
1235/* static */
1236size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1237{
1238 if ( srcLen == wxNO_LEN )
1239 {
1240 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1241 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1242 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1243 ;
c91830cb 1244
467e0479
VZ
1245 srcLen *= BYTES_PER_CHAR;
1246 }
1247 else // we already have the length
1248 {
1249 // we can only convert an entire number of UTF-16 characters
1250 if ( srcLen % BYTES_PER_CHAR )
1251 return wxCONV_FAILED;
1252 }
1253
1254 return srcLen;
1255}
1256
1257// case when in-memory representation is UTF-16 too
c91830cb
VZ
1258#ifdef WC_UTF16
1259
467e0479
VZ
1260// ----------------------------------------------------------------------------
1261// conversions without endianness change
1262// ----------------------------------------------------------------------------
1263
1264size_t
1265wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1266 const char *src, size_t srcLen) const
c91830cb 1267{
467e0479
VZ
1268 // set up the scene for using memcpy() (which is presumably more efficient
1269 // than copying the bytes one by one)
1270 srcLen = GetLength(src, srcLen);
1271 if ( srcLen == wxNO_LEN )
1272 return wxCONV_FAILED;
c91830cb 1273
ef199164 1274 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1275 if ( dst )
c91830cb 1276 {
467e0479
VZ
1277 if ( dstLen < inLen )
1278 return wxCONV_FAILED;
c91830cb 1279
467e0479 1280 memcpy(dst, src, srcLen);
c91830cb 1281 }
d32a507d 1282
467e0479 1283 return inLen;
c91830cb
VZ
1284}
1285
467e0479
VZ
1286size_t
1287wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1288 const wchar_t *src, size_t srcLen) const
c91830cb 1289{
467e0479
VZ
1290 if ( srcLen == wxNO_LEN )
1291 srcLen = wxWcslen(src) + 1;
c91830cb 1292
467e0479
VZ
1293 srcLen *= BYTES_PER_CHAR;
1294
1295 if ( dst )
c91830cb 1296 {
467e0479
VZ
1297 if ( dstLen < srcLen )
1298 return wxCONV_FAILED;
d32a507d 1299
467e0479 1300 memcpy(dst, src, srcLen);
c91830cb 1301 }
d32a507d 1302
467e0479 1303 return srcLen;
c91830cb
VZ
1304}
1305
467e0479
VZ
1306// ----------------------------------------------------------------------------
1307// endian-reversing conversions
1308// ----------------------------------------------------------------------------
c91830cb 1309
467e0479
VZ
1310size_t
1311wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1312 const char *src, size_t srcLen) const
c91830cb 1313{
467e0479
VZ
1314 srcLen = GetLength(src, srcLen);
1315 if ( srcLen == wxNO_LEN )
1316 return wxCONV_FAILED;
c91830cb 1317
467e0479
VZ
1318 srcLen /= BYTES_PER_CHAR;
1319
1320 if ( dst )
c91830cb 1321 {
467e0479
VZ
1322 if ( dstLen < srcLen )
1323 return wxCONV_FAILED;
1324
ef199164
DS
1325 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1326 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1327 {
ef199164 1328 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1329 }
c91830cb 1330 }
bfab25d4 1331
467e0479 1332 return srcLen;
c91830cb
VZ
1333}
1334
467e0479
VZ
1335size_t
1336wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1337 const wchar_t *src, size_t srcLen) const
c91830cb 1338{
467e0479
VZ
1339 if ( srcLen == wxNO_LEN )
1340 srcLen = wxWcslen(src) + 1;
c91830cb 1341
467e0479
VZ
1342 srcLen *= BYTES_PER_CHAR;
1343
1344 if ( dst )
c91830cb 1345 {
467e0479
VZ
1346 if ( dstLen < srcLen )
1347 return wxCONV_FAILED;
1348
ef199164 1349 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1350 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1351 {
ef199164 1352 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1353 }
c91830cb 1354 }
eec47cc6 1355
467e0479 1356 return srcLen;
c91830cb
VZ
1357}
1358
467e0479 1359#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1360
467e0479
VZ
1361// ----------------------------------------------------------------------------
1362// conversions without endianness change
1363// ----------------------------------------------------------------------------
c91830cb 1364
35d11700
VZ
1365size_t
1366wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1367 const char *src, size_t srcLen) const
c91830cb 1368{
35d11700
VZ
1369 srcLen = GetLength(src, srcLen);
1370 if ( srcLen == wxNO_LEN )
1371 return wxCONV_FAILED;
c91830cb 1372
ef199164 1373 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1374 if ( !dst )
c91830cb 1375 {
35d11700
VZ
1376 // optimization: return maximal space which could be needed for this
1377 // string even if the real size could be smaller if the buffer contains
1378 // any surrogates
1379 return inLen;
c91830cb 1380 }
c91830cb 1381
35d11700 1382 size_t outLen = 0;
ef199164
DS
1383 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1384 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1385 {
ef199164
DS
1386 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1387 if ( !inBuff )
35d11700
VZ
1388 return wxCONV_FAILED;
1389
1390 if ( ++outLen > dstLen )
1391 return wxCONV_FAILED;
c91830cb 1392
35d11700
VZ
1393 *dst++ = ch;
1394 }
1395
1396
1397 return outLen;
1398}
c91830cb 1399
35d11700
VZ
1400size_t
1401wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1402 const wchar_t *src, size_t srcLen) const
c91830cb 1403{
35d11700
VZ
1404 if ( srcLen == wxNO_LEN )
1405 srcLen = wxWcslen(src) + 1;
c91830cb 1406
35d11700 1407 size_t outLen = 0;
ef199164 1408 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1409 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1410 {
1411 wxUint16 cc[2];
35d11700
VZ
1412 const size_t numChars = encode_utf16(*src++, cc);
1413 if ( numChars == wxCONV_FAILED )
1414 return wxCONV_FAILED;
c91830cb 1415
ef199164
DS
1416 outLen += numChars * BYTES_PER_CHAR;
1417 if ( outBuff )
c91830cb 1418 {
35d11700
VZ
1419 if ( outLen > dstLen )
1420 return wxCONV_FAILED;
1421
ef199164 1422 *outBuff++ = cc[0];
35d11700 1423 if ( numChars == 2 )
69b80d28 1424 {
35d11700 1425 // second character of a surrogate
ef199164 1426 *outBuff++ = cc[1];
69b80d28 1427 }
c91830cb 1428 }
c91830cb 1429 }
c91830cb 1430
35d11700 1431 return outLen;
c91830cb
VZ
1432}
1433
467e0479
VZ
1434// ----------------------------------------------------------------------------
1435// endian-reversing conversions
1436// ----------------------------------------------------------------------------
c91830cb 1437
35d11700
VZ
1438size_t
1439wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1440 const char *src, size_t srcLen) const
c91830cb 1441{
35d11700
VZ
1442 srcLen = GetLength(src, srcLen);
1443 if ( srcLen == wxNO_LEN )
1444 return wxCONV_FAILED;
1445
ef199164 1446 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1447 if ( !dst )
1448 {
1449 // optimization: return maximal space which could be needed for this
1450 // string even if the real size could be smaller if the buffer contains
1451 // any surrogates
1452 return inLen;
1453 }
c91830cb 1454
35d11700 1455 size_t outLen = 0;
ef199164
DS
1456 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1457 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1458 {
35d11700
VZ
1459 wxUint32 ch;
1460 wxUint16 tmp[2];
ef199164
DS
1461
1462 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1463 inBuff++;
1464 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1465
35d11700
VZ
1466 const size_t numChars = decode_utf16(tmp, ch);
1467 if ( numChars == wxCONV_FAILED )
1468 return wxCONV_FAILED;
c91830cb 1469
35d11700 1470 if ( numChars == 2 )
ef199164 1471 inBuff++;
35d11700
VZ
1472
1473 if ( ++outLen > dstLen )
1474 return wxCONV_FAILED;
c91830cb 1475
35d11700 1476 *dst++ = ch;
c91830cb 1477 }
c91830cb 1478
c91830cb 1479
35d11700
VZ
1480 return outLen;
1481}
c91830cb 1482
35d11700
VZ
1483size_t
1484wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1485 const wchar_t *src, size_t srcLen) const
c91830cb 1486{
35d11700
VZ
1487 if ( srcLen == wxNO_LEN )
1488 srcLen = wxWcslen(src) + 1;
c91830cb 1489
35d11700 1490 size_t outLen = 0;
ef199164 1491 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1492 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1493 {
1494 wxUint16 cc[2];
35d11700
VZ
1495 const size_t numChars = encode_utf16(*src, cc);
1496 if ( numChars == wxCONV_FAILED )
1497 return wxCONV_FAILED;
c91830cb 1498
ef199164
DS
1499 outLen += numChars * BYTES_PER_CHAR;
1500 if ( outBuff )
c91830cb 1501 {
35d11700
VZ
1502 if ( outLen > dstLen )
1503 return wxCONV_FAILED;
1504
ef199164 1505 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1506 if ( numChars == 2 )
c91830cb 1507 {
35d11700 1508 // second character of a surrogate
ef199164 1509 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1510 }
1511 }
c91830cb 1512 }
c91830cb 1513
35d11700 1514 return outLen;
c91830cb
VZ
1515}
1516
467e0479 1517#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1518
1519
35d11700 1520// ============================================================================
c91830cb 1521// UTF-32
35d11700 1522// ============================================================================
c91830cb
VZ
1523
1524#ifdef WORDS_BIGENDIAN
467e0479
VZ
1525 #define wxMBConvUTF32straight wxMBConvUTF32BE
1526 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1527#else
467e0479
VZ
1528 #define wxMBConvUTF32swap wxMBConvUTF32BE
1529 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1530#endif
1531
1532
1533WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1534WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1535
467e0479
VZ
1536/* static */
1537size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1538{
1539 if ( srcLen == wxNO_LEN )
1540 {
1541 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1542 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1543 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1544 ;
c91830cb 1545
467e0479
VZ
1546 srcLen *= BYTES_PER_CHAR;
1547 }
1548 else // we already have the length
1549 {
1550 // we can only convert an entire number of UTF-32 characters
1551 if ( srcLen % BYTES_PER_CHAR )
1552 return wxCONV_FAILED;
1553 }
1554
1555 return srcLen;
1556}
1557
1558// case when in-memory representation is UTF-16
c91830cb
VZ
1559#ifdef WC_UTF16
1560
467e0479
VZ
1561// ----------------------------------------------------------------------------
1562// conversions without endianness change
1563// ----------------------------------------------------------------------------
1564
1565size_t
1566wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1567 const char *src, size_t srcLen) const
c91830cb 1568{
467e0479
VZ
1569 srcLen = GetLength(src, srcLen);
1570 if ( srcLen == wxNO_LEN )
1571 return wxCONV_FAILED;
c91830cb 1572
ef199164
DS
1573 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1574 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1575 size_t outLen = 0;
1576 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1577 {
1578 wxUint16 cc[2];
ef199164 1579 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1580 if ( numChars == wxCONV_FAILED )
1581 return wxCONV_FAILED;
c91830cb 1582
467e0479
VZ
1583 outLen += numChars;
1584 if ( dst )
c91830cb 1585 {
467e0479
VZ
1586 if ( outLen > dstLen )
1587 return wxCONV_FAILED;
d32a507d 1588
467e0479
VZ
1589 *dst++ = cc[0];
1590 if ( numChars == 2 )
1591 {
1592 // second character of a surrogate
1593 *dst++ = cc[1];
1594 }
1595 }
c91830cb 1596 }
d32a507d 1597
467e0479 1598 return outLen;
c91830cb
VZ
1599}
1600
467e0479
VZ
1601size_t
1602wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1603 const wchar_t *src, size_t srcLen) const
c91830cb 1604{
467e0479
VZ
1605 if ( srcLen == wxNO_LEN )
1606 srcLen = wxWcslen(src) + 1;
c91830cb 1607
467e0479 1608 if ( !dst )
c91830cb 1609 {
467e0479
VZ
1610 // optimization: return maximal space which could be needed for this
1611 // string instead of the exact amount which could be less if there are
1612 // any surrogates in the input
1613 //
1614 // we consider that surrogates are rare enough to make it worthwhile to
1615 // avoid running the loop below at the cost of slightly extra memory
1616 // consumption
ef199164 1617 return srcLen * BYTES_PER_CHAR;
467e0479 1618 }
c91830cb 1619
ef199164 1620 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1621 size_t outLen = 0;
1622 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1623 {
1624 const wxUint32 ch = wxDecodeSurrogate(&src);
1625 if ( !src )
1626 return wxCONV_FAILED;
c91830cb 1627
467e0479 1628 outLen += BYTES_PER_CHAR;
d32a507d 1629
467e0479
VZ
1630 if ( outLen > dstLen )
1631 return wxCONV_FAILED;
b5153fd8 1632
ef199164 1633 *outBuff++ = ch;
467e0479 1634 }
c91830cb 1635
467e0479 1636 return outLen;
c91830cb
VZ
1637}
1638
467e0479
VZ
1639// ----------------------------------------------------------------------------
1640// endian-reversing conversions
1641// ----------------------------------------------------------------------------
c91830cb 1642
467e0479
VZ
1643size_t
1644wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1645 const char *src, size_t srcLen) const
c91830cb 1646{
467e0479
VZ
1647 srcLen = GetLength(src, srcLen);
1648 if ( srcLen == wxNO_LEN )
1649 return wxCONV_FAILED;
c91830cb 1650
ef199164
DS
1651 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1652 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1653 size_t outLen = 0;
ef199164 1654 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1655 {
c91830cb 1656 wxUint16 cc[2];
ef199164 1657 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1658 if ( numChars == wxCONV_FAILED )
1659 return wxCONV_FAILED;
c91830cb 1660
467e0479
VZ
1661 outLen += numChars;
1662 if ( dst )
c91830cb 1663 {
467e0479
VZ
1664 if ( outLen > dstLen )
1665 return wxCONV_FAILED;
d32a507d 1666
467e0479
VZ
1667 *dst++ = cc[0];
1668 if ( numChars == 2 )
1669 {
1670 // second character of a surrogate
1671 *dst++ = cc[1];
1672 }
1673 }
c91830cb 1674 }
b5153fd8 1675
467e0479 1676 return outLen;
c91830cb
VZ
1677}
1678
467e0479
VZ
1679size_t
1680wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1681 const wchar_t *src, size_t srcLen) const
c91830cb 1682{
467e0479
VZ
1683 if ( srcLen == wxNO_LEN )
1684 srcLen = wxWcslen(src) + 1;
c91830cb 1685
467e0479 1686 if ( !dst )
c91830cb 1687 {
467e0479
VZ
1688 // optimization: return maximal space which could be needed for this
1689 // string instead of the exact amount which could be less if there are
1690 // any surrogates in the input
1691 //
1692 // we consider that surrogates are rare enough to make it worthwhile to
1693 // avoid running the loop below at the cost of slightly extra memory
1694 // consumption
1695 return srcLen*BYTES_PER_CHAR;
1696 }
c91830cb 1697
ef199164 1698 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1699 size_t outLen = 0;
1700 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1701 {
1702 const wxUint32 ch = wxDecodeSurrogate(&src);
1703 if ( !src )
1704 return wxCONV_FAILED;
c91830cb 1705
467e0479 1706 outLen += BYTES_PER_CHAR;
d32a507d 1707
467e0479
VZ
1708 if ( outLen > dstLen )
1709 return wxCONV_FAILED;
b5153fd8 1710
ef199164 1711 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1712 }
c91830cb 1713
467e0479 1714 return outLen;
c91830cb
VZ
1715}
1716
467e0479 1717#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1718
35d11700
VZ
1719// ----------------------------------------------------------------------------
1720// conversions without endianness change
1721// ----------------------------------------------------------------------------
1722
1723size_t
1724wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1725 const char *src, size_t srcLen) const
c91830cb 1726{
35d11700
VZ
1727 // use memcpy() as it should be much faster than hand-written loop
1728 srcLen = GetLength(src, srcLen);
1729 if ( srcLen == wxNO_LEN )
1730 return wxCONV_FAILED;
c91830cb 1731
35d11700
VZ
1732 const size_t inLen = srcLen/BYTES_PER_CHAR;
1733 if ( dst )
c91830cb 1734 {
35d11700
VZ
1735 if ( dstLen < inLen )
1736 return wxCONV_FAILED;
b5153fd8 1737
35d11700
VZ
1738 memcpy(dst, src, srcLen);
1739 }
c91830cb 1740
35d11700 1741 return inLen;
c91830cb
VZ
1742}
1743
35d11700
VZ
1744size_t
1745wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1746 const wchar_t *src, size_t srcLen) const
c91830cb 1747{
35d11700
VZ
1748 if ( srcLen == wxNO_LEN )
1749 srcLen = wxWcslen(src) + 1;
1750
1751 srcLen *= BYTES_PER_CHAR;
c91830cb 1752
35d11700 1753 if ( dst )
c91830cb 1754 {
35d11700
VZ
1755 if ( dstLen < srcLen )
1756 return wxCONV_FAILED;
c91830cb 1757
35d11700 1758 memcpy(dst, src, srcLen);
c91830cb
VZ
1759 }
1760
35d11700 1761 return srcLen;
c91830cb
VZ
1762}
1763
35d11700
VZ
1764// ----------------------------------------------------------------------------
1765// endian-reversing conversions
1766// ----------------------------------------------------------------------------
c91830cb 1767
35d11700
VZ
1768size_t
1769wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1770 const char *src, size_t srcLen) const
c91830cb 1771{
35d11700
VZ
1772 srcLen = GetLength(src, srcLen);
1773 if ( srcLen == wxNO_LEN )
1774 return wxCONV_FAILED;
1775
1776 srcLen /= BYTES_PER_CHAR;
c91830cb 1777
35d11700 1778 if ( dst )
c91830cb 1779 {
35d11700
VZ
1780 if ( dstLen < srcLen )
1781 return wxCONV_FAILED;
1782
ef199164
DS
1783 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1784 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1785 {
ef199164 1786 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1787 }
c91830cb 1788 }
b5153fd8 1789
35d11700 1790 return srcLen;
c91830cb
VZ
1791}
1792
35d11700
VZ
1793size_t
1794wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1795 const wchar_t *src, size_t srcLen) const
c91830cb 1796{
35d11700
VZ
1797 if ( srcLen == wxNO_LEN )
1798 srcLen = wxWcslen(src) + 1;
1799
1800 srcLen *= BYTES_PER_CHAR;
c91830cb 1801
35d11700 1802 if ( dst )
c91830cb 1803 {
35d11700
VZ
1804 if ( dstLen < srcLen )
1805 return wxCONV_FAILED;
1806
ef199164 1807 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1808 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1809 {
ef199164 1810 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1811 }
c91830cb 1812 }
b5153fd8 1813
35d11700 1814 return srcLen;
c91830cb
VZ
1815}
1816
467e0479 1817#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1818
1819
36acb880
VZ
1820// ============================================================================
1821// The classes doing conversion using the iconv_xxx() functions
1822// ============================================================================
3caec1bb 1823
b040e242 1824#ifdef HAVE_ICONV
3a0d76bc 1825
b1d547eb
VS
1826// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1827// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1828// (unless there's yet another bug in glibc) the only case when iconv()
1829// returns with (size_t)-1 (which means error) and says there are 0 bytes
1830// left in the input buffer -- when _real_ error occurs,
1831// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1832// iconv() failure.
3caec1bb
VS
1833// [This bug does not appear in glibc 2.2.]
1834#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1835#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1836 (errno != E2BIG || bufLeft != 0))
1837#else
1838#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1839#endif
1840
ab217dba 1841#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1842
74a7eb0b
VZ
1843#define ICONV_T_INVALID ((iconv_t)-1)
1844
1845#if SIZEOF_WCHAR_T == 4
1846 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1847 #define WC_ENC wxFONTENCODING_UTF32
1848#elif SIZEOF_WCHAR_T == 2
1849 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1850 #define WC_ENC wxFONTENCODING_UTF16
1851#else // sizeof(wchar_t) != 2 nor 4
1852 // does this ever happen?
1853 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1854#endif
1855
36acb880 1856// ----------------------------------------------------------------------------
e95354ec 1857// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1858// ----------------------------------------------------------------------------
1859
e95354ec 1860class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1861{
1862public:
86501081 1863 wxMBConv_iconv(const char *name);
e95354ec 1864 virtual ~wxMBConv_iconv();
36acb880 1865
bde4baac
VZ
1866 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1867 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1868
d36c9347 1869 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
7ef3ab50
VZ
1870 virtual size_t GetMBNulLen() const;
1871
ba98e032
VS
1872#if wxUSE_UNICODE_UTF8
1873 virtual bool IsUTF8() const;
1874#endif
1875
d36c9347
VZ
1876 virtual wxMBConv *Clone() const
1877 {
86501081 1878 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
1879 p->m_minMBCharWidth = m_minMBCharWidth;
1880 return p;
1881 }
1882
e95354ec 1883 bool IsOk() const
74a7eb0b 1884 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1885
1886protected:
ef199164
DS
1887 // the iconv handlers used to translate from multibyte
1888 // to wide char and in the other direction
36acb880
VZ
1889 iconv_t m2w,
1890 w2m;
ef199164 1891
b1d547eb
VS
1892#if wxUSE_THREADS
1893 // guards access to m2w and w2m objects
1894 wxMutex m_iconvMutex;
1895#endif
36acb880
VZ
1896
1897private:
e95354ec 1898 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1899 // available on this machine, it will remain NULL
74a7eb0b 1900 static wxString ms_wcCharsetName;
36acb880
VZ
1901
1902 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1903 // different endian-ness than the native one
405d8f46 1904 static bool ms_wcNeedsSwap;
eec47cc6 1905
d36c9347
VZ
1906
1907 // name of the encoding handled by this conversion
1908 wxString m_name;
1909
7ef3ab50 1910 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
1911 // initially
1912 size_t m_minMBCharWidth;
36acb880
VZ
1913};
1914
8f115891 1915// make the constructor available for unit testing
86501081 1916WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
1917{
1918 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1919 if ( !result->IsOk() )
1920 {
1921 delete result;
1922 return 0;
1923 }
ef199164 1924
8f115891
MW
1925 return result;
1926}
1927
422e411e 1928wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1929bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1930
86501081 1931wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 1932 : m_name(name)
36acb880 1933{
c1464d9d 1934 m_minMBCharWidth = 0;
eec47cc6 1935
36acb880 1936 // check for charset that represents wchar_t:
74a7eb0b 1937 if ( ms_wcCharsetName.empty() )
f1339c56 1938 {
c2b83fdd
VZ
1939 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1940
74a7eb0b
VZ
1941#if wxUSE_FONTMAP
1942 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1943#else // !wxUSE_FONTMAP
91cb7f52 1944 static const wxChar *names_static[] =
36acb880 1945 {
74a7eb0b
VZ
1946#if SIZEOF_WCHAR_T == 4
1947 _T("UCS-4"),
1948#elif SIZEOF_WCHAR_T = 2
1949 _T("UCS-2"),
1950#endif
1951 NULL
1952 };
91cb7f52 1953 const wxChar **names = names_static;
74a7eb0b 1954#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1955
d1f024a8 1956 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1957 {
17a1ebd1 1958 const wxString nameCS(*names);
74a7eb0b
VZ
1959
1960 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1961 wxString nameXE(nameCS);
ef199164
DS
1962
1963#ifdef WORDS_BIGENDIAN
74a7eb0b 1964 nameXE += _T("BE");
ef199164 1965#else // little endian
74a7eb0b 1966 nameXE += _T("LE");
ef199164 1967#endif
74a7eb0b 1968
c2b83fdd
VZ
1969 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1970 nameXE.c_str());
1971
86501081 1972 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 1973 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1974 {
74a7eb0b 1975 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1976 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1977 nameCS.c_str());
86501081 1978 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 1979
74a7eb0b
VZ
1980 // and check for bytesex ourselves:
1981 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1982 {
74a7eb0b 1983 char buf[2], *bufPtr;
e8769ed1 1984 wchar_t wbuf[2];
74a7eb0b
VZ
1985 size_t insz, outsz;
1986 size_t res;
1987
1988 buf[0] = 'A';
1989 buf[1] = 0;
1990 wbuf[0] = 0;
1991 insz = 2;
1992 outsz = SIZEOF_WCHAR_T * 2;
e8769ed1 1993 char* wbufPtr = (char*)wbuf;
74a7eb0b
VZ
1994 bufPtr = buf;
1995
ef199164
DS
1996 res = iconv(
1997 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
e8769ed1 1998 &wbufPtr, &outsz);
74a7eb0b
VZ
1999
2000 if (ICONV_FAILED(res, insz))
2001 {
2002 wxLogLastError(wxT("iconv"));
422e411e 2003 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2004 nameCS.c_str());
74a7eb0b
VZ
2005 }
2006 else // ok, can convert to this encoding, remember it
2007 {
17a1ebd1 2008 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2009 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2010 }
3a0d76bc
VS
2011 }
2012 }
74a7eb0b 2013 else // use charset not requiring byte swapping
36acb880 2014 {
74a7eb0b 2015 ms_wcCharsetName = nameXE;
36acb880 2016 }
3a0d76bc 2017 }
74a7eb0b 2018
0944fceb 2019 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2020 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2021 ms_wcCharsetName.empty() ? wxString("<none>")
2022 : ms_wcCharsetName,
74a7eb0b
VZ
2023 ms_wcNeedsSwap ? _T(" (needs swap)")
2024 : _T(""));
3a0d76bc 2025 }
36acb880 2026 else // we already have ms_wcCharsetName
3caec1bb 2027 {
86501081 2028 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2029 }
dccce9ea 2030
74a7eb0b 2031 if ( ms_wcCharsetName.empty() )
f1339c56 2032 {
74a7eb0b 2033 w2m = ICONV_T_INVALID;
36acb880 2034 }
405d8f46
VZ
2035 else
2036 {
86501081 2037 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2038 if ( w2m == ICONV_T_INVALID )
2039 {
2040 wxLogTrace(TRACE_STRCONV,
2041 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2042 ms_wcCharsetName.c_str(), name);
74a7eb0b 2043 }
405d8f46 2044 }
36acb880 2045}
3caec1bb 2046
e95354ec 2047wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2048{
74a7eb0b 2049 if ( m2w != ICONV_T_INVALID )
36acb880 2050 iconv_close(m2w);
74a7eb0b 2051 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2052 iconv_close(w2m);
2053}
3a0d76bc 2054
bde4baac 2055size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 2056{
69373110
VZ
2057 // find the string length: notice that must be done differently for
2058 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2059 size_t inbuf;
7ef3ab50 2060 const size_t nulLen = GetMBNulLen();
69373110
VZ
2061 switch ( nulLen )
2062 {
2063 default:
467e0479 2064 return wxCONV_FAILED;
69373110
VZ
2065
2066 case 1:
2067 inbuf = strlen(psz); // arguably more optimized than our version
2068 break;
2069
2070 case 2:
2071 case 4:
2072 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2073 // they also have to start at character boundary and not span two
2074 // adjacent characters
2075 const char *p;
2076 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2077 ;
2078 inbuf = p - psz;
2079 break;
2080 }
2081
b1d547eb 2082#if wxUSE_THREADS
6a17b868
SN
2083 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2084 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2085 // wxConvLocal that are used all over wx code, so we have to make sure
2086 // the handle is used by at most one thread at the time. Otherwise
2087 // only a few wx classes would be safe to use from non-main threads
2088 // as MB<->WC conversion would fail "randomly".
2089 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2090#endif // wxUSE_THREADS
2091
36acb880
VZ
2092 size_t outbuf = n * SIZEOF_WCHAR_T;
2093 size_t res, cres;
36acb880
VZ
2094 const char *pszPtr = psz;
2095
2096 if (buf)
2097 {
e8769ed1
PC
2098 char* bufPtr = (char*)buf;
2099
36acb880
VZ
2100 // have destination buffer, convert there
2101 cres = iconv(m2w,
2102 ICONV_CHAR_CAST(&pszPtr), &inbuf,
e8769ed1 2103 &bufPtr, &outbuf);
36acb880 2104 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 2105
36acb880 2106 if (ms_wcNeedsSwap)
3a0d76bc 2107 {
36acb880 2108 // convert to native endianness
17a1ebd1
VZ
2109 for ( unsigned i = 0; i < res; i++ )
2110 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 2111 }
adb45366 2112
69373110 2113 // NUL-terminate the string if there is any space left
49dd9820
VS
2114 if (res < n)
2115 buf[res] = 0;
36acb880
VZ
2116 }
2117 else
2118 {
2119 // no destination buffer... convert using temp buffer
2120 // to calculate destination buffer requirement
2121 wchar_t tbuf[8];
2122 res = 0;
ef199164
DS
2123
2124 do
2125 {
e8769ed1 2126 char* bufPtr = (char*)tbuf;
ef199164 2127 outbuf = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2128
2129 cres = iconv(m2w,
2130 ICONV_CHAR_CAST(&pszPtr), &inbuf,
e8769ed1 2131 &bufPtr, &outbuf );
36acb880 2132
ef199164
DS
2133 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2134 }
2135 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2136 }
dccce9ea 2137
36acb880 2138 if (ICONV_FAILED(cres, inbuf))
f1339c56 2139 {
36acb880 2140 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2141 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2142 return wxCONV_FAILED;
36acb880
VZ
2143 }
2144
2145 return res;
2146}
2147
bde4baac 2148size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 2149{
b1d547eb
VS
2150#if wxUSE_THREADS
2151 // NB: explained in MB2WC
2152 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2153#endif
3698ae71 2154
156162ec 2155 size_t inlen = wxWcslen(psz);
e8769ed1
PC
2156 size_t inbuflen = inlen * SIZEOF_WCHAR_T;
2157 size_t outbuflen = n;
36acb880 2158 size_t res, cres;
3a0d76bc 2159
36acb880 2160 wchar_t *tmpbuf = 0;
3caec1bb 2161
36acb880
VZ
2162 if (ms_wcNeedsSwap)
2163 {
2164 // need to copy to temp buffer to switch endianness
74a7eb0b 2165 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 2166 // could be in read-only memory, or be accessed in some other thread)
e8769ed1 2167 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
17a1ebd1
VZ
2168 for ( size_t i = 0; i < inlen; i++ )
2169 tmpbuf[n] = WC_BSWAP(psz[i]);
ef199164 2170
156162ec 2171 tmpbuf[inlen] = L'\0';
74a7eb0b 2172 psz = tmpbuf;
36acb880 2173 }
3a0d76bc 2174
e8769ed1 2175 char* inbuf = (char*)psz;
36acb880
VZ
2176 if (buf)
2177 {
2178 // have destination buffer, convert there
e8769ed1 2179 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &buf, &outbuflen);
3a0d76bc 2180
e8769ed1 2181 res = n - outbuflen;
adb45366 2182
49dd9820
VS
2183 // NB: iconv was given only wcslen(psz) characters on input, and so
2184 // it couldn't convert the trailing zero. Let's do it ourselves
2185 // if there's some room left for it in the output buffer.
2186 if (res < n)
2187 buf[0] = 0;
36acb880
VZ
2188 }
2189 else
2190 {
ef199164 2191 // no destination buffer: convert using temp buffer
36acb880
VZ
2192 // to calculate destination buffer requirement
2193 char tbuf[16];
2194 res = 0;
ef199164
DS
2195 do
2196 {
2197 buf = tbuf;
e8769ed1 2198 outbuflen = 16;
36acb880 2199
e8769ed1 2200 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &buf, &outbuflen);
dccce9ea 2201
e8769ed1 2202 res += 16 - outbuflen;
ef199164
DS
2203 }
2204 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2205 }
dccce9ea 2206
36acb880
VZ
2207 if (ms_wcNeedsSwap)
2208 {
2209 free(tmpbuf);
2210 }
dccce9ea 2211
e8769ed1 2212 if (ICONV_FAILED(cres, inbuflen))
36acb880 2213 {
ce6f8d6f 2214 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2215 return wxCONV_FAILED;
36acb880
VZ
2216 }
2217
2218 return res;
2219}
2220
7ef3ab50 2221size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2222{
c1464d9d 2223 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2224 {
2225 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2226
2227#if wxUSE_THREADS
2228 // NB: explained in MB2WC
2229 wxMutexLocker lock(self->m_iconvMutex);
2230#endif
2231
999020e1 2232 const wchar_t *wnul = L"";
c1464d9d 2233 char buf[8]; // should be enough for NUL in any encoding
356410fc 2234 size_t inLen = sizeof(wchar_t),
c1464d9d 2235 outLen = WXSIZEOF(buf);
ef199164
DS
2236 char *inBuff = (char *)wnul;
2237 char *outBuff = buf;
2238 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2239 {
c1464d9d 2240 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2241 }
2242 else // ok
2243 {
ef199164 2244 self->m_minMBCharWidth = outBuff - buf;
356410fc 2245 }
eec47cc6
VZ
2246 }
2247
c1464d9d 2248 return m_minMBCharWidth;
eec47cc6
VZ
2249}
2250
ba98e032
VS
2251#if wxUSE_UNICODE_UTF8
2252bool wxMBConv_iconv::IsUTF8() const
2253{
86501081
VS
2254 return wxStricmp(m_name, "UTF-8") == 0 ||
2255 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2256}
2257#endif
2258
b040e242 2259#endif // HAVE_ICONV
36acb880 2260
e95354ec 2261
36acb880
VZ
2262// ============================================================================
2263// Win32 conversion classes
2264// ============================================================================
1cd52418 2265
e95354ec 2266#ifdef wxHAVE_WIN32_MB2WC
373658eb 2267
8b04d4c4 2268// from utils.cpp
d775fa82 2269#if wxUSE_FONTMAP
86501081 2270extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2271extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2272#endif
373658eb 2273
e95354ec 2274class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2275{
2276public:
bde4baac
VZ
2277 wxMBConv_win32()
2278 {
2279 m_CodePage = CP_ACP;
c1464d9d 2280 m_minMBCharWidth = 0;
bde4baac
VZ
2281 }
2282
d36c9347 2283 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2284 : wxMBConv()
d36c9347
VZ
2285 {
2286 m_CodePage = conv.m_CodePage;
2287 m_minMBCharWidth = conv.m_minMBCharWidth;
2288 }
2289
7608a683 2290#if wxUSE_FONTMAP
86501081 2291 wxMBConv_win32(const char* name)
bde4baac
VZ
2292 {
2293 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2294 m_minMBCharWidth = 0;
bde4baac 2295 }
dccce9ea 2296
e95354ec 2297 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2298 {
2299 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2300 m_minMBCharWidth = 0;
bde4baac 2301 }
eec47cc6 2302#endif // wxUSE_FONTMAP
8b04d4c4 2303
d36c9347 2304 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2305 {
02272c9c
VZ
2306 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2307 // the behaviour is not compatible with the Unix version (using iconv)
2308 // and break the library itself, e.g. wxTextInputStream::NextChar()
2309 // wouldn't work if reading an incomplete MB char didn't result in an
2310 // error
667e5b3e 2311 //
89028980 2312 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2313 // Win XP or newer and it is not supported for UTF-[78] so we always
2314 // use our own conversions in this case. See
89028980
VS
2315 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2316 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2317 if ( m_CodePage == CP_UTF8 )
89028980 2318 {
5487ff0f 2319 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2320 }
830f8f11
VZ
2321
2322 if ( m_CodePage == CP_UTF7 )
2323 {
5487ff0f 2324 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2325 }
2326
2327 int flags = 0;
2328 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2329 IsAtLeastWin2kSP4() )
89028980 2330 {
830f8f11 2331 flags = MB_ERR_INVALID_CHARS;
89028980 2332 }
667e5b3e 2333
2b5f62a0
VZ
2334 const size_t len = ::MultiByteToWideChar
2335 (
2336 m_CodePage, // code page
667e5b3e 2337 flags, // flags: fall on error
2b5f62a0
VZ
2338 psz, // input string
2339 -1, // its length (NUL-terminated)
b4da152e 2340 buf, // output string
2b5f62a0
VZ
2341 buf ? n : 0 // size of output buffer
2342 );
89028980
VS
2343 if ( !len )
2344 {
2345 // function totally failed
467e0479 2346 return wxCONV_FAILED;
89028980
VS
2347 }
2348
2349 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2350 // check if we succeeded, by doing a double trip:
2351 if ( !flags && buf )
2352 {
53c174fc
VZ
2353 const size_t mbLen = strlen(psz);
2354 wxCharBuffer mbBuf(mbLen);
89028980
VS
2355 if ( ::WideCharToMultiByte
2356 (
2357 m_CodePage,
2358 0,
2359 buf,
2360 -1,
2361 mbBuf.data(),
53c174fc 2362 mbLen + 1, // size in bytes, not length
89028980
VS
2363 NULL,
2364 NULL
2365 ) == 0 ||
2366 strcmp(mbBuf, psz) != 0 )
2367 {
2368 // we didn't obtain the same thing we started from, hence
2369 // the conversion was lossy and we consider that it failed
467e0479 2370 return wxCONV_FAILED;
89028980
VS
2371 }
2372 }
2b5f62a0 2373
03a991bc
VZ
2374 // note that it returns count of written chars for buf != NULL and size
2375 // of the needed buffer for buf == NULL so in either case the length of
2376 // the string (which never includes the terminating NUL) is one less
89028980 2377 return len - 1;
f1339c56 2378 }
dccce9ea 2379
d36c9347 2380 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2381 {
13dd924a
VZ
2382 /*
2383 we have a problem here: by default, WideCharToMultiByte() may
2384 replace characters unrepresentable in the target code page with bad
2385 quality approximations such as turning "1/2" symbol (U+00BD) into
2386 "1" for the code pages which don't have it and we, obviously, want
2387 to avoid this at any price
d775fa82 2388
13dd924a
VZ
2389 the trouble is that this function does it _silently_, i.e. it won't
2390 even tell us whether it did or not... Win98/2000 and higher provide
2391 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2392 we have to resort to a round trip, i.e. check that converting back
2393 results in the same string -- this is, of course, expensive but
2394 otherwise we simply can't be sure to not garble the data.
2395 */
2396
2397 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2398 // it doesn't work with CJK encodings (which we test for rather roughly
2399 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2400 // supporting it
907173e5
WS
2401 BOOL usedDef wxDUMMY_INITIALIZE(false);
2402 BOOL *pUsedDef;
13dd924a
VZ
2403 int flags;
2404 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2405 {
2406 // it's our lucky day
2407 flags = WC_NO_BEST_FIT_CHARS;
2408 pUsedDef = &usedDef;
2409 }
2410 else // old system or unsupported encoding
2411 {
2412 flags = 0;
2413 pUsedDef = NULL;
2414 }
2415
2b5f62a0
VZ
2416 const size_t len = ::WideCharToMultiByte
2417 (
2418 m_CodePage, // code page
13dd924a
VZ
2419 flags, // either none or no best fit
2420 pwz, // input string
2b5f62a0
VZ
2421 -1, // it is (wide) NUL-terminated
2422 buf, // output buffer
2423 buf ? n : 0, // and its size
2424 NULL, // default "replacement" char
13dd924a 2425 pUsedDef // [out] was it used?
2b5f62a0
VZ
2426 );
2427
13dd924a
VZ
2428 if ( !len )
2429 {
2430 // function totally failed
467e0479 2431 return wxCONV_FAILED;
13dd924a
VZ
2432 }
2433
765bdb4a
VZ
2434 // we did something, check if we really succeeded
2435 if ( flags )
13dd924a 2436 {
765bdb4a
VZ
2437 // check if the conversion failed, i.e. if any replacements
2438 // were done
2439 if ( usedDef )
2440 return wxCONV_FAILED;
2441 }
2442 else // we must resort to double tripping...
2443 {
2444 // first we need to ensure that we really have the MB data: this is
2445 // not the case if we're called with NULL buffer, in which case we
2446 // need to do the conversion yet again
2447 wxCharBuffer bufDef;
2448 if ( !buf )
13dd924a 2449 {
765bdb4a
VZ
2450 bufDef = wxCharBuffer(len);
2451 buf = bufDef.data();
2452 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2453 buf, len, NULL, NULL) )
467e0479 2454 return wxCONV_FAILED;
13dd924a 2455 }
765bdb4a 2456
564da6ff
VZ
2457 if ( !n )
2458 n = wcslen(pwz);
765bdb4a 2459 wxWCharBuffer wcBuf(n);
564da6ff 2460 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2461 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2462 {
765bdb4a
VZ
2463 // we didn't obtain the same thing we started from, hence
2464 // the conversion was lossy and we consider that it failed
2465 return wxCONV_FAILED;
13dd924a
VZ
2466 }
2467 }
2468
03a991bc 2469 // see the comment above for the reason of "len - 1"
13dd924a 2470 return len - 1;
f1339c56 2471 }
dccce9ea 2472
7ef3ab50
VZ
2473 virtual size_t GetMBNulLen() const
2474 {
2475 if ( m_minMBCharWidth == 0 )
2476 {
2477 int len = ::WideCharToMultiByte
2478 (
2479 m_CodePage, // code page
2480 0, // no flags
2481 L"", // input string
2482 1, // translate just the NUL
2483 NULL, // output buffer
2484 0, // and its size
2485 NULL, // no replacement char
2486 NULL // [out] don't care if it was used
2487 );
2488
2489 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2490 switch ( len )
2491 {
2492 default:
2493 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2494 self->m_minMBCharWidth = (size_t)-1;
2495 break;
7ef3ab50
VZ
2496
2497 case 0:
2498 self->m_minMBCharWidth = (size_t)-1;
2499 break;
2500
2501 case 1:
2502 case 2:
2503 case 4:
2504 self->m_minMBCharWidth = len;
2505 break;
2506 }
2507 }
2508
2509 return m_minMBCharWidth;
2510 }
2511
d36c9347
VZ
2512 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2513
13dd924a
VZ
2514 bool IsOk() const { return m_CodePage != -1; }
2515
2516private:
2517 static bool CanUseNoBestFit()
2518 {
2519 static int s_isWin98Or2k = -1;
2520
2521 if ( s_isWin98Or2k == -1 )
2522 {
2523 int verMaj, verMin;
2524 switch ( wxGetOsVersion(&verMaj, &verMin) )
2525 {
406d283a 2526 case wxOS_WINDOWS_9X:
13dd924a
VZ
2527 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2528 break;
2529
406d283a 2530 case wxOS_WINDOWS_NT:
13dd924a
VZ
2531 s_isWin98Or2k = verMaj >= 5;
2532 break;
2533
2534 default:
ef199164 2535 // unknown: be conservative by default
13dd924a 2536 s_isWin98Or2k = 0;
ef199164 2537 break;
13dd924a
VZ
2538 }
2539
2540 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2541 }
2542
2543 return s_isWin98Or2k == 1;
2544 }
f1339c56 2545
89028980
VS
2546 static bool IsAtLeastWin2kSP4()
2547 {
8942f83a
WS
2548#ifdef __WXWINCE__
2549 return false;
2550#else
89028980
VS
2551 static int s_isAtLeastWin2kSP4 = -1;
2552
2553 if ( s_isAtLeastWin2kSP4 == -1 )
2554 {
2555 OSVERSIONINFOEX ver;
2556
2557 memset(&ver, 0, sizeof(ver));
2558 ver.dwOSVersionInfoSize = sizeof(ver);
2559 GetVersionEx((OSVERSIONINFO*)&ver);
2560
2561 s_isAtLeastWin2kSP4 =
2562 ((ver.dwMajorVersion > 5) || // Vista+
2563 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2564 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2565 ver.wServicePackMajor >= 4)) // 2000 SP4+
2566 ? 1 : 0;
2567 }
2568
2569 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2570#endif
89028980
VS
2571 }
2572
eec47cc6 2573
c1464d9d 2574 // the code page we're working with
b1d66b54 2575 long m_CodePage;
c1464d9d 2576
7ef3ab50 2577 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2578 // "unknown"
2579 size_t m_minMBCharWidth;
1cd52418 2580};
e95354ec
VZ
2581
2582#endif // wxHAVE_WIN32_MB2WC
2583
f7e98dee 2584
36acb880
VZ
2585// ============================================================================
2586// wxEncodingConverter based conversion classes
2587// ============================================================================
2588
1e6feb95 2589#if wxUSE_FONTMAP
1cd52418 2590
e95354ec 2591class wxMBConv_wxwin : public wxMBConv
1cd52418 2592{
8b04d4c4
VZ
2593private:
2594 void Init()
2595 {
6ac84a78
DE
2596 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2597 // The wxMBConv_cf class does a better job.
2598 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2599 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2600 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2601 }
2602
6001e347 2603public:
f1339c56
RR
2604 // temporarily just use wxEncodingConverter stuff,
2605 // so that it works while a better implementation is built
86501081 2606 wxMBConv_wxwin(const char* name)
f1339c56
RR
2607 {
2608 if (name)
267e11c5 2609 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2610 else
2611 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2612
8b04d4c4
VZ
2613 Init();
2614 }
2615
e95354ec 2616 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2617 {
2618 m_enc = enc;
2619
2620 Init();
f1339c56 2621 }
dccce9ea 2622
bde4baac 2623 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2624 {
2625 size_t inbuf = strlen(psz);
dccce9ea 2626 if (buf)
c643a977 2627 {
ef199164 2628 if (!m2w.Convert(psz, buf))
467e0479 2629 return wxCONV_FAILED;
c643a977 2630 }
f1339c56
RR
2631 return inbuf;
2632 }
dccce9ea 2633
bde4baac 2634 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2635 {
f8d791e0 2636 const size_t inbuf = wxWcslen(psz);
f1339c56 2637 if (buf)
c643a977 2638 {
ef199164 2639 if (!w2m.Convert(psz, buf))
467e0479 2640 return wxCONV_FAILED;
c643a977 2641 }
dccce9ea 2642
f1339c56
RR
2643 return inbuf;
2644 }
dccce9ea 2645
7ef3ab50 2646 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2647 {
2648 switch ( m_enc )
2649 {
2650 case wxFONTENCODING_UTF16BE:
2651 case wxFONTENCODING_UTF16LE:
c1464d9d 2652 return 2;
eec47cc6
VZ
2653
2654 case wxFONTENCODING_UTF32BE:
2655 case wxFONTENCODING_UTF32LE:
c1464d9d 2656 return 4;
eec47cc6
VZ
2657
2658 default:
c1464d9d 2659 return 1;
eec47cc6
VZ
2660 }
2661 }
2662
d36c9347
VZ
2663 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2664
7ef3ab50
VZ
2665 bool IsOk() const { return m_ok; }
2666
2667public:
2668 wxFontEncoding m_enc;
2669 wxEncodingConverter m2w, w2m;
2670
2671private:
cafbf6fb
VZ
2672 // were we initialized successfully?
2673 bool m_ok;
fc7a2a60 2674
e95354ec 2675 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2676};
6001e347 2677
8f115891 2678// make the constructors available for unit testing
86501081 2679WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2680{
2681 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2682 if ( !result->IsOk() )
2683 {
2684 delete result;
2685 return 0;
2686 }
ef199164 2687
8f115891
MW
2688 return result;
2689}
2690
1e6feb95
VZ
2691#endif // wxUSE_FONTMAP
2692
36acb880
VZ
2693// ============================================================================
2694// wxCSConv implementation
2695// ============================================================================
2696
8b04d4c4 2697void wxCSConv::Init()
6001e347 2698{
e95354ec
VZ
2699 m_name = NULL;
2700 m_convReal = NULL;
2701 m_deferred = true;
2702}
2703
86501081 2704wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2705{
2706 Init();
82713003 2707
86501081 2708 if ( !charset.empty() )
e95354ec 2709 {
86501081 2710 SetName(charset.ToAscii());
e95354ec 2711 }
bda3d86a 2712
e4277538
VZ
2713#if wxUSE_FONTMAP
2714 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2715#else
bda3d86a 2716 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2717#endif
6001e347
RR
2718}
2719
8b04d4c4
VZ
2720wxCSConv::wxCSConv(wxFontEncoding encoding)
2721{
bda3d86a 2722 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2723 {
2724 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2725
2726 encoding = wxFONTENCODING_SYSTEM;
2727 }
2728
8b04d4c4
VZ
2729 Init();
2730
bda3d86a 2731 m_encoding = encoding;
8b04d4c4
VZ
2732}
2733
6001e347
RR
2734wxCSConv::~wxCSConv()
2735{
65e50848
JS
2736 Clear();
2737}
2738
54380f29 2739wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2740 : wxMBConv()
54380f29 2741{
8b04d4c4
VZ
2742 Init();
2743
54380f29 2744 SetName(conv.m_name);
8b04d4c4 2745 m_encoding = conv.m_encoding;
54380f29
GD
2746}
2747
2748wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2749{
2750 Clear();
8b04d4c4 2751
54380f29 2752 SetName(conv.m_name);
8b04d4c4
VZ
2753 m_encoding = conv.m_encoding;
2754
54380f29
GD
2755 return *this;
2756}
2757
65e50848
JS
2758void wxCSConv::Clear()
2759{
8b04d4c4 2760 free(m_name);
e95354ec 2761 delete m_convReal;
8b04d4c4 2762
65e50848 2763 m_name = NULL;
e95354ec 2764 m_convReal = NULL;
6001e347
RR
2765}
2766
86501081 2767void wxCSConv::SetName(const char *charset)
6001e347 2768{
f1339c56
RR
2769 if (charset)
2770 {
d6f2a891 2771 m_name = wxStrdup(charset);
e95354ec 2772 m_deferred = true;
f1339c56 2773 }
6001e347
RR
2774}
2775
8b3eb85d 2776#if wxUSE_FONTMAP
8b3eb85d
VZ
2777
2778WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2779 wxEncodingNameCache );
8b3eb85d
VZ
2780
2781static wxEncodingNameCache gs_nameCache;
2782#endif
2783
e95354ec
VZ
2784wxMBConv *wxCSConv::DoCreate() const
2785{
ce6f8d6f
VZ
2786#if wxUSE_FONTMAP
2787 wxLogTrace(TRACE_STRCONV,
2788 wxT("creating conversion for %s"),
2789 (m_name ? m_name
86501081 2790 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2791#endif // wxUSE_FONTMAP
2792
c547282d
VZ
2793 // check for the special case of ASCII or ISO8859-1 charset: as we have
2794 // special knowledge of it anyhow, we don't need to create a special
2795 // conversion object
e4277538
VZ
2796 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2797 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2798 {
e95354ec
VZ
2799 // don't convert at all
2800 return NULL;
2801 }
dccce9ea 2802
e95354ec
VZ
2803 // we trust OS to do conversion better than we can so try external
2804 // conversion methods first
2805 //
2806 // the full order is:
2807 // 1. OS conversion (iconv() under Unix or Win32 API)
2808 // 2. hard coded conversions for UTF
2809 // 3. wxEncodingConverter as fall back
2810
2811 // step (1)
2812#ifdef HAVE_ICONV
c547282d 2813#if !wxUSE_FONTMAP
e95354ec 2814 if ( m_name )
c547282d 2815#endif // !wxUSE_FONTMAP
e95354ec 2816 {
3ef10cfc 2817#if wxUSE_FONTMAP
8b3eb85d 2818 wxFontEncoding encoding(m_encoding);
3ef10cfc 2819#endif
8b3eb85d 2820
86501081 2821 if ( m_name )
8b3eb85d 2822 {
86501081 2823 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
2824 if ( conv->IsOk() )
2825 return conv;
2826
2827 delete conv;
c547282d
VZ
2828
2829#if wxUSE_FONTMAP
8b3eb85d 2830 encoding =
86501081 2831 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2832#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2833 }
2834#if wxUSE_FONTMAP
2835 {
2836 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2837 if ( it != gs_nameCache.end() )
2838 {
2839 if ( it->second.empty() )
2840 return NULL;
c547282d 2841
86501081 2842 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
2843 if ( conv->IsOk() )
2844 return conv;
e95354ec 2845
8b3eb85d
VZ
2846 delete conv;
2847 }
2848
2849 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
2850 // CS : in case this does not return valid names (eg for MacRoman)
2851 // encoding got a 'failure' entry in the cache all the same,
2852 // although it just has to be created using a different method, so
2853 // only store failed iconv creation attempts (or perhaps we
2854 // shoulnd't do this at all ?)
3c67ec06 2855 if ( names[0] != NULL )
8b3eb85d 2856 {
3c67ec06 2857 for ( ; *names; ++names )
8b3eb85d 2858 {
86501081
VS
2859 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2860 // will need changes that will obsolete this
2861 wxString name(*names);
2862 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
2863 if ( conv->IsOk() )
2864 {
2865 gs_nameCache[encoding] = *names;
2866 return conv;
2867 }
2868
2869 delete conv;
8b3eb85d
VZ
2870 }
2871
3c67ec06 2872 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 2873 }
8b3eb85d
VZ
2874 }
2875#endif // wxUSE_FONTMAP
e95354ec
VZ
2876 }
2877#endif // HAVE_ICONV
2878
2879#ifdef wxHAVE_WIN32_MB2WC
2880 {
7608a683 2881#if wxUSE_FONTMAP
e95354ec
VZ
2882 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2883 : new wxMBConv_win32(m_encoding);
2884 if ( conv->IsOk() )
2885 return conv;
2886
2887 delete conv;
7608a683
WS
2888#else
2889 return NULL;
2890#endif
e95354ec
VZ
2891 }
2892#endif // wxHAVE_WIN32_MB2WC
ef199164 2893
5c4ed98d 2894#ifdef __DARWIN__
f7e98dee 2895 {
6ff49cbc
DE
2896 // leave UTF16 and UTF32 to the built-ins of wx
2897 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2898 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 2899 {
a6900d10 2900#if wxUSE_FONTMAP
5c4ed98d
DE
2901 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2902 : new wxMBConv_cf(m_encoding);
a6900d10 2903#else
5c4ed98d 2904 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 2905#endif
ef199164 2906
f7e98dee 2907 if ( conv->IsOk() )
d775fa82
WS
2908 return conv;
2909
2910 delete conv;
2911 }
335d31e0 2912 }
5c4ed98d
DE
2913#endif // __DARWIN__
2914
e95354ec
VZ
2915 // step (2)
2916 wxFontEncoding enc = m_encoding;
2917#if wxUSE_FONTMAP
c547282d
VZ
2918 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2919 {
2920 // use "false" to suppress interactive dialogs -- we can be called from
2921 // anywhere and popping up a dialog from here is the last thing we want to
2922 // do
267e11c5 2923 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2924 }
e95354ec
VZ
2925#endif // wxUSE_FONTMAP
2926
2927 switch ( enc )
2928 {
2929 case wxFONTENCODING_UTF7:
2930 return new wxMBConvUTF7;
2931
2932 case wxFONTENCODING_UTF8:
2933 return new wxMBConvUTF8;
2934
e95354ec
VZ
2935 case wxFONTENCODING_UTF16BE:
2936 return new wxMBConvUTF16BE;
2937
2938 case wxFONTENCODING_UTF16LE:
2939 return new wxMBConvUTF16LE;
2940
e95354ec
VZ
2941 case wxFONTENCODING_UTF32BE:
2942 return new wxMBConvUTF32BE;
2943
2944 case wxFONTENCODING_UTF32LE:
2945 return new wxMBConvUTF32LE;
2946
2947 default:
2948 // nothing to do but put here to suppress gcc warnings
ef199164 2949 break;
e95354ec
VZ
2950 }
2951
2952 // step (3)
2953#if wxUSE_FONTMAP
2954 {
2955 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2956 : new wxMBConv_wxwin(m_encoding);
2957 if ( conv->IsOk() )
2958 return conv;
2959
2960 delete conv;
2961 }
2962#endif // wxUSE_FONTMAP
2963
a58d4f4d
VS
2964 // NB: This is a hack to prevent deadlock. What could otherwise happen
2965 // in Unicode build: wxConvLocal creation ends up being here
2966 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
2967 // attach a timestamp, for which it will need wxConvLocal (to convert
2968 // time to char* and then wchar_t*), but that fails, tries to log the
2969 // error, but wxLog has an (already locked) critical section that
2970 // guards the static buffer.
a58d4f4d
VS
2971 static bool alreadyLoggingError = false;
2972 if (!alreadyLoggingError)
2973 {
2974 alreadyLoggingError = true;
2975 wxLogError(_("Cannot convert from the charset '%s'!"),
2976 m_name ? m_name
e95354ec
VZ
2977 :
2978#if wxUSE_FONTMAP
86501081 2979 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 2980#else // !wxUSE_FONTMAP
86501081 2981 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
2982#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2983 );
ef199164 2984
a58d4f4d
VS
2985 alreadyLoggingError = false;
2986 }
e95354ec
VZ
2987
2988 return NULL;
2989}
2990
2991void wxCSConv::CreateConvIfNeeded() const
2992{
2993 if ( m_deferred )
2994 {
2995 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 2996
bda3d86a
VZ
2997 // if we don't have neither the name nor the encoding, use the default
2998 // encoding for this system
2999 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3000 {
4c75209f 3001#if wxUSE_INTL
02c7347b 3002 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3003#else
3004 // fallback to some reasonable default:
3005 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3006#endif // wxUSE_INTL
4c75209f 3007 }
bda3d86a 3008
e95354ec
VZ
3009 self->m_convReal = DoCreate();
3010 self->m_deferred = false;
6001e347 3011 }
6001e347
RR
3012}
3013
0f0298b1
VZ
3014bool wxCSConv::IsOk() const
3015{
3016 CreateConvIfNeeded();
3017
3018 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3019 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3020 return true; // always ok as we do it ourselves
3021
3022 // m_convReal->IsOk() is called at its own creation, so we know it must
3023 // be ok if m_convReal is non-NULL
3024 return m_convReal != NULL;
3025}
3026
1c714a5d
VZ
3027size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3028 const char *src, size_t srcLen) const
3029{
3030 CreateConvIfNeeded();
3031
2c74c558
VS
3032 if (m_convReal)
3033 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3034
3035 // latin-1 (direct)
3036 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3037}
3038
3039size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3040 const wchar_t *src, size_t srcLen) const
3041{
3042 CreateConvIfNeeded();
3043
2c74c558
VS
3044 if (m_convReal)
3045 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3046
3047 // latin-1 (direct)
3048 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3049}
3050
6001e347
RR
3051size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3052{
e95354ec 3053 CreateConvIfNeeded();
dccce9ea 3054
e95354ec
VZ
3055 if (m_convReal)
3056 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3057
3058 // latin-1 (direct)
4def3b35 3059 size_t len = strlen(psz);
dccce9ea 3060
f1339c56
RR
3061 if (buf)
3062 {
4def3b35 3063 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3064 buf[c] = (unsigned char)(psz[c]);
3065 }
dccce9ea 3066
f1339c56 3067 return len;
6001e347
RR
3068}
3069
3070size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3071{
e95354ec 3072 CreateConvIfNeeded();
dccce9ea 3073
e95354ec
VZ
3074 if (m_convReal)
3075 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3076
f1339c56 3077 // latin-1 (direct)
f8d791e0 3078 const size_t len = wxWcslen(psz);
f1339c56
RR
3079 if (buf)
3080 {
4def3b35 3081 for (size_t c = 0; c <= len; c++)
24642831
VS
3082 {
3083 if (psz[c] > 0xFF)
467e0479 3084 return wxCONV_FAILED;
ef199164 3085
907173e5 3086 buf[c] = (char)psz[c];
24642831
VS
3087 }
3088 }
3089 else
3090 {
3091 for (size_t c = 0; c <= len; c++)
3092 {
3093 if (psz[c] > 0xFF)
467e0479 3094 return wxCONV_FAILED;
24642831 3095 }
f1339c56 3096 }
dccce9ea 3097
f1339c56 3098 return len;
6001e347
RR
3099}
3100
7ef3ab50 3101size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3102{
3103 CreateConvIfNeeded();
3104
3105 if ( m_convReal )
3106 {
7ef3ab50 3107 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3108 }
3109
ba98e032 3110 // otherwise, we are ISO-8859-1
c1464d9d 3111 return 1;
eec47cc6
VZ
3112}
3113
ba98e032
VS
3114#if wxUSE_UNICODE_UTF8
3115bool wxCSConv::IsUTF8() const
3116{
3117 CreateConvIfNeeded();
3118
3119 if ( m_convReal )
3120 {
3121 return m_convReal->IsUTF8();
3122 }
3123
3124 // otherwise, we are ISO-8859-1
3125 return false;
3126}
3127#endif
3128
69c928ef
VZ
3129
3130#if wxUSE_UNICODE
3131
3132wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3133{
3134 if ( !s )
3135 return wxWCharBuffer();
3136
3137 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3138 if ( !wbuf )
5487ff0f 3139 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3140 if ( !wbuf )
3141 wbuf = wxConvISO8859_1.cMB2WX(s);
3142
3143 return wbuf;
3144}
3145
3146wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3147{
3148 if ( !ws )
3149 return wxCharBuffer();
3150
3151 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3152 if ( !buf )
3153 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3154
3155 return buf;
3156}
3157
3158#endif // wxUSE_UNICODE
f5a1953b 3159
1e50d914
VS
3160// ----------------------------------------------------------------------------
3161// globals
3162// ----------------------------------------------------------------------------
3163
3164// NB: The reason why we create converter objects in this convoluted way,
3165// using a factory function instead of a global variable, is that they
3166// may be used at static initialization time (some of them are used by
3167// wxString ctors and there may be a global wxString object). In other
3168// words, possibly _before_ the converter global object would be
3169// initialized.
3170
// NOTE(review): these names are presumably defined as macros by the header;
// they are undefined here so that the plain identifiers can be used for the
// definitions below -- confirm against wx/strconv.h.
3171#undef wxConvLibc
3172#undef wxConvUTF8
3173#undef wxConvUTF7
3174#undef wxConvLocal
3175#undef wxConvISO8859_1
3176
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) expands to:
//  - the definition of an exported pointer name##Ptr of type klass*,
//    initialized to NULL (nothing in this macro assigns it afterwards)
//  - an exported function wxGet_<name>Ptr() returning the address of a
//    function-local static object of type impl_klass which is constructed
//    with ctor_args when the function is first called
//  - a file-static pointer gs_<name>instance initialized by calling this
//    function, which forces the object to be created during static
//    initialization
// The expansion has no trailing semicolon, it is supplied at the point of use.
3177#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3178 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3179 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3180 { \
3181 static impl_klass name##Obj ctor_args; \
3182 return &name##Obj; \
3183 } \
3184 /* this ensures that all global converter objects are created */ \
3185 /* by the time static initialization is done, i.e. before any */ \
3186 /* thread is launched: */ \
3187 static klass* gs_##name##instance = wxGet_##name##Ptr()
3188
// Shorthand for the common case when the class of the object created is the
// same as the class used for the pointer and the accessor return type.
3189#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3190 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3191
// wxConvLibc is exposed as a wxMBConv but the object behind it is a
// wxMBConv_win32 under Windows and a wxMBConvLibc elsewhere; both are
// constructed without any arguments.
3192#ifdef __WINDOWS__
3193 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3194#else
3195 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3196#endif
3197
e1079eda
VZ
3198// NB: we can't use wxEMPTY_PARAMETER_VALUE as the final argument here because
3199// it's passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3200// provokes an error message about "not enough macro parameters"; and we
3201// can't use "()" here as the name##Obj declaration would be parsed as a
3202// function declaration then, so use a semicolon and live with an extra
3203// empty statement (and hope that no compiler warns about this)
3204WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3205WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3206
// the wxCSConv-based converters are created for a specific font encoding
3207WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3208WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3209
// these pointers are initialized from the accessor functions and not from
// the addresses of global objects: wxConvCurrent starts as the libc converter
// and wxConvUI as the wxConvLocal one
3210WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3211WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3212
6ac84a78
DE
3213#ifdef __DARWIN__
3214// The xnu kernel always communicates file paths in decomposed UTF-8.
3215// WARNING: Are we sure that CFString's conversion will cause decomposition?
3216static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3217#endif
6ac84a78 3218
// The converter used for file names: the file-static wxMBConv_cf object
// defined above under Darwin and the libc converter on all other platforms.
1e50d914 3219WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3220#ifdef __DARWIN__
1e50d914 3221 &wxConvMacUTF8DObj;
6ac84a78 3222#else // !__DARWIN__
1e50d914 3223 wxGet_wxConvLibcPtr();
6ac84a78 3224#endif // __DARWIN__/!__DARWIN__
1e50d914 3225
bde4baac
VZ
3226#else // !wxUSE_WCHAR_T
3227
1e50d914 3228// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3229// stand-ins in absence of wchar_t: in this branch the converters are plain
// wxMBConv objects (not pointers or accessor functions) which are all of the
// same base class type
3230WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3231 wxConvISO8859_1,
3232 wxConvLocal,
3233 wxConvUTF8;
3234
3235#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T