]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Always set the background colour for XP, for consistent toolbar backgrounds
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
6001e347 47#ifdef __SALFORDC__
373658eb 48 #include <clib.h>
6001e347
RR
49#endif
50
b040e242 51#ifdef HAVE_ICONV
373658eb 52 #include <iconv.h>
b1d547eb 53 #include "wx/thread.h"
1cd52418 54#endif
1cd52418 55
373658eb
VZ
56#include "wx/encconv.h"
57#include "wx/fontmap.h"
58
5c4ed98d 59#ifdef __DARWIN__
e4dd1e19 60#include "wx/mac/corefoundation/private/strconv_cf.h"
5c4ed98d
DE
61#endif //def __DARWIN__
62
ef199164 63
ce6f8d6f
VZ
64#define TRACE_STRCONV _T("strconv")
65
467e0479
VZ
66// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
67// be 4 bytes
4948c2b6 68#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
69 #define WC_UTF16
70#endif
71
ef199164 72
373658eb
VZ
73// ============================================================================
74// implementation
75// ============================================================================
76
69373110
VZ
// Helper of ToWChar() and cWC2MB(): tell whether the n bytes starting at p
// contain at least one byte which is not NUL.
//
// Returns false if all n bytes are NUL, which is also the case when n is 0.
static bool NotAllNULs(const char *p, size_t n)
{
    for ( ; n != 0; --n, ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
85
373658eb 86// ----------------------------------------------------------------------------
467e0479 87// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 88// ----------------------------------------------------------------------------
6001e347 89
c91830cb 90static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 91{
ef199164 92 if (input <= 0xffff)
4def3b35 93 {
999836aa
VZ
94 if (output)
95 *output = (wxUint16) input;
ef199164 96
4def3b35 97 return 1;
dccce9ea 98 }
ef199164 99 else if (input >= 0x110000)
4def3b35 100 {
467e0479 101 return wxCONV_FAILED;
dccce9ea
VZ
102 }
103 else
4def3b35 104 {
dccce9ea 105 if (output)
4def3b35 106 {
ef199164
DS
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 109 }
ef199164 110
4def3b35 111 return 2;
1cd52418 112 }
1cd52418
OK
113}
114
c91830cb 115static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 116{
ef199164 117 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
118 {
119 output = *input;
120 return 1;
dccce9ea 121 }
ef199164 122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
123 {
124 output = *input;
467e0479 125 return wxCONV_FAILED;
dccce9ea
VZ
126 }
127 else
4def3b35
VS
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
1cd52418
OK
132}
133
467e0479 134#ifdef WC_UTF16
35d11700
VZ
135 typedef wchar_t wxDecodeSurrogate_t;
136#else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
139
140// returns the next UTF-32 character from the wchar_t buffer and advances the
141// pointer to the character after this one
142//
143// if an invalid character is found, *pSrc is set to NULL, the caller must
144// check for this
35d11700 145static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
146{
147 wxUint32 out;
8d3dd069
VZ
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156}
157
f6bcfd97 158// ----------------------------------------------------------------------------
6001e347 159// wxMBConv
f6bcfd97 160// ----------------------------------------------------------------------------
2c53a80a 161
483b0434
VZ
162size_t
163wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
6001e347 165{
483b0434
VZ
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
483b0434 212 for ( ;; )
eec47cc6 213 {
c1464d9d 214 // try to convert the current chunk
483b0434 215 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
e4e3bbb4 218
467e0479 219 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 220
483b0434 221 dstWritten += lenChunk;
f5fb6871 222
467e0479
VZ
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
483b0434
VZ
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
830f8f11 234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
c1464d9d 239
483b0434 240 if ( !srcEnd )
c1464d9d 241 {
467e0479
VZ
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
c1464d9d
VZ
244 break;
245 }
eec47cc6
VZ
246
247 // advance the input pointer past the end of this chunk
483b0434 248 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
483b0434 254 src += nulLen;
c1464d9d 255 }
e4e3bbb4 256
483b0434 257 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
483b0434 262 if ( src >= srcEnd )
c1464d9d
VZ
263 break;
264 }
265
483b0434 266 return dstWritten;
e4e3bbb4
RN
267}
268
483b0434
VZ
269size_t
270wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
e4e3bbb4 272{
483b0434
VZ
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
e4e3bbb4 275
eec47cc6
VZ
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
467e0479 282 if ( srcLen == wxNO_LEN )
e4e3bbb4 283 {
483b0434 284 srcLen = wxWcslen(src) + 1;
eec47cc6 285 }
483b0434 286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
287 {
288 // make a copy in order to properly NUL-terminate the string
483b0434 289 bufTmp = wxWCharBuffer(srcLen);
ef199164 290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
eec47cc6 318 }
e4e3bbb4 319
483b0434
VZ
320 return dstWritten;
321}
322
ef199164 323size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 324{
ef199164 325 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 326 if ( rc != wxCONV_FAILED )
509da451
VZ
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334}
335
ef199164 336size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 337{
ef199164 338 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 339 if ( rc != wxCONV_FAILED )
509da451
VZ
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345}
346
483b0434
VZ
347wxMBConv::~wxMBConv()
348{
349 // nothing to do here (necessary for Darwin linking probably)
350}
e4e3bbb4 351
483b0434
VZ
352const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353{
354 if ( psz )
eec47cc6 355 {
483b0434 356 // calculate the length of the buffer needed first
a2db25a1 357 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 358 if ( nLen != wxCONV_FAILED )
f5fb6871 359 {
483b0434 360 // now do the actual conversion
a2db25a1 361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 362
483b0434 363 // +1 for the trailing NULL
a2db25a1 364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 365 return buf;
f5fb6871 366 }
483b0434 367 }
e4e3bbb4 368
483b0434
VZ
369 return wxWCharBuffer();
370}
3698ae71 371
483b0434
VZ
372const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373{
374 if ( pwz )
375 {
a2db25a1 376 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 377 if ( nLen != wxCONV_FAILED )
483b0434 378 {
a2db25a1
VZ
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386}
e4e3bbb4 387
483b0434 388const wxWCharBuffer
ef199164 389wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 390{
ef199164 391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 392 if ( dstLen != wxCONV_FAILED )
483b0434 393 {
830f8f11 394 wxWCharBuffer wbuf(dstLen - 1);
ef199164 395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
396 {
397 if ( outLen )
467e0479
VZ
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
483b0434
VZ
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412}
413
414const wxCharBuffer
ef199164 415wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 416{
13d92ad6 417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 418 if ( dstLen != wxCONV_FAILED )
483b0434 419 {
168a76fe
VZ
420 // special case of empty input: can't allocate 0 size buffer below as
421 // wxCharBuffer insists on NUL-terminating it
422 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
ef199164 423 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
424 {
425 if ( outLen )
467e0479
VZ
426 {
427 *outLen = dstLen;
428
429 const size_t nulLen = GetMBNulLen();
13d92ad6
VZ
430 if ( dstLen >= nulLen &&
431 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
467e0479
VZ
432 {
433 // in this case the output is NUL-terminated and we're not
434 // supposed to count NUL
13d92ad6 435 *outLen -= nulLen;
467e0479
VZ
436 }
437 }
d32a507d 438
483b0434
VZ
439 return buf;
440 }
e4e3bbb4
RN
441 }
442
eec47cc6
VZ
443 if ( outLen )
444 *outLen = 0;
445
446 return wxCharBuffer();
e4e3bbb4
RN
447}
448
6001e347 449// ----------------------------------------------------------------------------
bde4baac 450// wxMBConvLibc
6001e347
RR
451// ----------------------------------------------------------------------------
452
bde4baac
VZ
453size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
454{
455 return wxMB2WC(buf, psz, n);
456}
457
458size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
459{
460 return wxWC2MB(buf, psz, n);
461}
e1bfe89e
RR
462
463// ----------------------------------------------------------------------------
532d575b 464// wxConvBrokenFileNames
e1bfe89e
RR
465// ----------------------------------------------------------------------------
466
eec47cc6
VZ
467#ifdef __UNIX__
468
86501081 469wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 470{
86501081
VS
471 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
472 wxStricmp(charset, _T("UTF8")) == 0 )
5deedd6e 473 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
474 else
475 m_conv = new wxCSConv(charset);
ea8ce907
RR
476}
477
eec47cc6 478#endif // __UNIX__
c12b7f79 479
bde4baac 480// ----------------------------------------------------------------------------
3698ae71 481// UTF-7
bde4baac 482// ----------------------------------------------------------------------------
6001e347 483
15f2ee32 484// Implementation (C) 2004 Fredrik Roubert
6001e347 485
15f2ee32
RN
//
// BASE64 decoding table: indexed by the byte value, gives the 6 bit value of
// the corresponding BASE64 digit or 0xff if the byte is not a BASE64 digit
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f: no digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: 'A'..'Z'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: 'a'..'z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: no digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
524
525size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
526{
15f2ee32
RN
527 size_t len = 0;
528
04a37834 529 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
530 {
531 unsigned char cc = *psz++;
532 if (cc != '+')
533 {
534 // plain ASCII char
535 if (buf)
536 *buf++ = cc;
537 len++;
538 }
539 else if (*psz == '-')
540 {
541 // encoded plus sign
542 if (buf)
543 *buf++ = cc;
544 len++;
545 psz++;
546 }
04a37834 547 else // start of BASE64 encoded string
15f2ee32 548 {
04a37834 549 bool lsb, ok;
15f2ee32 550 unsigned int d, l;
04a37834
VZ
551 for ( ok = lsb = false, d = 0, l = 0;
552 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
553 psz++ )
15f2ee32
RN
554 {
555 d <<= 6;
556 d += cc;
557 for (l += 6; l >= 8; lsb = !lsb)
558 {
04a37834 559 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
560 if (lsb)
561 {
562 if (buf)
563 *buf++ |= c;
564 len ++;
565 }
566 else
04a37834 567 {
15f2ee32 568 if (buf)
6356d52a 569 *buf = (wchar_t)(c << 8);
04a37834
VZ
570 }
571
572 ok = true;
15f2ee32
RN
573 }
574 }
04a37834
VZ
575
576 if ( !ok )
577 {
578 // in valid UTF7 we should have valid characters after '+'
467e0479 579 return wxCONV_FAILED;
04a37834
VZ
580 }
581
15f2ee32
RN
582 if (*psz == '-')
583 psz++;
584 }
585 }
04a37834
VZ
586
587 if ( buf && (len < n) )
588 *buf = '\0';
589
15f2ee32 590 return len;
6001e347
RR
591}
592
15f2ee32
RN
//
// BASE64 encoding table: the digit used for each of the 64 possible 6 bit
// values
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
607
//
// UTF-7 encoding table giving the class of each of the 128 ASCII characters:
//
//      0 - Set D (directly encoded characters)
//      1 - Set O (optional direct characters)
//      2 - whitespace characters (optional)
//      3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 0x70..0x7f
};
627
667e5b3e 628size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 629{
15f2ee32
RN
630 size_t len = 0;
631
632 while (*psz && ((!buf) || (len < n)))
633 {
634 wchar_t cc = *psz++;
635 if (cc < 0x80 && utf7encode[cc] < 1)
636 {
637 // plain ASCII char
638 if (buf)
639 *buf++ = (char)cc;
ef199164 640
15f2ee32
RN
641 len++;
642 }
643#ifndef WC_UTF16
79c78d42 644 else if (((wxUint32)cc) > 0xffff)
b2c13097 645 {
15f2ee32 646 // no surrogate pair generation (yet?)
467e0479 647 return wxCONV_FAILED;
15f2ee32
RN
648 }
649#endif
650 else
651 {
652 if (buf)
653 *buf++ = '+';
ef199164 654
15f2ee32
RN
655 len++;
656 if (cc != '+')
657 {
658 // BASE64 encode string
659 unsigned int lsb, d, l;
73c902d6 660 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
661 {
662 for (lsb = 0; lsb < 2; lsb ++)
663 {
664 d <<= 8;
665 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
666
667 for (l += 8; l >= 6; )
668 {
669 l -= 6;
670 if (buf)
671 *buf++ = utf7enb64[(d >> l) % 64];
672 len++;
673 }
674 }
ef199164 675
15f2ee32
RN
676 cc = *psz;
677 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
678 break;
679 }
ef199164 680
15f2ee32
RN
681 if (l != 0)
682 {
683 if (buf)
684 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
ef199164 685
15f2ee32
RN
686 len++;
687 }
688 }
ef199164 689
15f2ee32
RN
690 if (buf)
691 *buf++ = '-';
692 len++;
693 }
694 }
ef199164 695
15f2ee32
RN
696 if (buf && (len < n))
697 *buf = 0;
ef199164 698
15f2ee32 699 return len;
6001e347
RR
700}
701
f6bcfd97 702// ----------------------------------------------------------------------------
6001e347 703// UTF-8
f6bcfd97 704// ----------------------------------------------------------------------------
6001e347 705
1774c3c5 706static const wxUint32 utf8_max[]=
4def3b35 707 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 708
3698ae71
VZ
709// boundaries of the private use area we use to (temporarily) remap invalid
710// characters invalid in a UTF-8 encoded string
ea8ce907
RR
711const wxUint32 wxUnicodePUA = 0x100000;
712const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
713
// Length of an UTF-8 sequence indexed by the value of its first byte, 0
// meaning that the byte can't start a sequence.
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters encoded as themselves
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid, the rest starts 2 byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: 2 byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: 3 byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: F0..F4 start 4 byte sequences, F5..FF are invalid again (5- or
    // 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
748
749size_t
750wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
751 const char *src, size_t srcLen) const
752{
753 wchar_t *out = dstLen ? dst : NULL;
754 size_t written = 0;
755
756 if ( srcLen == wxNO_LEN )
757 srcLen = strlen(src) + 1;
758
759 for ( const char *p = src; ; p++ )
760 {
761 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
762 {
763 // all done successfully, just add the trailing NULL if we are not
764 // using explicit length
765 if ( srcLen == wxNO_LEN )
766 {
767 if ( out )
768 {
769 if ( !dstLen )
770 break;
771
772 *out = L'\0';
773 }
774
775 written++;
776 }
777
778 return written;
779 }
780
0286d08d
VZ
781 if ( out && !dstLen-- )
782 break;
783
5367a38a
VS
784 wxUint32 code;
785 unsigned char c = *p;
0286d08d 786
5367a38a
VS
787 if ( c < 0x80 )
788 {
789 if ( srcLen == 0 ) // the test works for wxNO_LEN too
790 break;
0286d08d 791
5367a38a
VS
792 if ( srcLen != wxNO_LEN )
793 srcLen--;
0286d08d 794
5367a38a
VS
795 code = c;
796 }
797 else
0286d08d 798 {
5367a38a
VS
799 unsigned len = tableUtf8Lengths[c];
800 if ( !len )
801 break;
802
803 if ( srcLen < len ) // the test works for wxNO_LEN too
804 break;
805
806 if ( srcLen != wxNO_LEN )
807 srcLen -= len;
808
809 // Char. number range | UTF-8 octet sequence
810 // (hexadecimal) | (binary)
811 // ----------------------+----------------------------------------
812 // 0000 0000 - 0000 007F | 0xxxxxxx
813 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
814 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
815 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
816 //
817 // Code point value is stored in bits marked with 'x',
818 // lowest-order bit of the value on the right side in the diagram
819 // above. (from RFC 3629)
820
821 // mask to extract lead byte's value ('x' bits above), by sequence
822 // length:
823 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
824
825 // mask and value of lead byte's most significant bits, by length:
826 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
827 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
828
829 len--; // it's more convenient to work with 0-based length here
830
831 // extract the lead byte's value bits:
832 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
833 break;
834
835 code = c & leadValueMask[len];
836
837 // all remaining bytes, if any, are handled in the same way
838 // regardless of sequence's length:
839 for ( ; len; --len )
840 {
841 c = *++p;
842 if ( (c & 0xC0) != 0x80 )
843 return wxCONV_FAILED;
0286d08d 844
5367a38a
VS
845 code <<= 6;
846 code |= c & 0x3F;
847 }
0286d08d
VZ
848 }
849
850#ifdef WC_UTF16
851 // cast is ok because wchar_t == wxUint16 if WC_UTF16
852 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
853 {
854 if ( out )
855 out++;
856 written++;
857 }
858#else // !WC_UTF16
859 if ( out )
860 *out = code;
861#endif // WC_UTF16/!WC_UTF16
862
863 if ( out )
864 out++;
865
866 written++;
867 }
868
869 return wxCONV_FAILED;
870}
871
872size_t
873wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
874 const wchar_t *src, size_t srcLen) const
875{
876 char *out = dstLen ? dst : NULL;
877 size_t written = 0;
878
879 for ( const wchar_t *wp = src; ; wp++ )
880 {
881 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
882 {
883 // all done successfully, just add the trailing NULL if we are not
884 // using explicit length
885 if ( srcLen == wxNO_LEN )
886 {
887 if ( out )
888 {
889 if ( !dstLen )
890 break;
891
892 *out = '\0';
893 }
894
895 written++;
896 }
897
898 return written;
899 }
900
901
902 wxUint32 code;
903#ifdef WC_UTF16
904 // cast is ok for WC_UTF16
905 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
906 {
907 // skip the next char too as we decoded a surrogate
908 wp++;
909 }
910#else // wchar_t is UTF-32
911 code = *wp & 0x7fffffff;
912#endif
913
914 unsigned len;
915 if ( code <= 0x7F )
916 {
917 len = 1;
918 if ( out )
919 {
920 if ( dstLen < len )
921 break;
922
923 out[0] = (char)code;
924 }
925 }
926 else if ( code <= 0x07FF )
927 {
928 len = 2;
929 if ( out )
930 {
931 if ( dstLen < len )
932 break;
933
934 // NB: this line takes 6 least significant bits, encodes them as
935 // 10xxxxxx and discards them so that the next byte can be encoded:
936 out[1] = 0x80 | (code & 0x3F); code >>= 6;
937 out[0] = 0xC0 | code;
938 }
939 }
940 else if ( code < 0xFFFF )
941 {
942 len = 3;
943 if ( out )
944 {
945 if ( dstLen < len )
946 break;
947
948 out[2] = 0x80 | (code & 0x3F); code >>= 6;
949 out[1] = 0x80 | (code & 0x3F); code >>= 6;
950 out[0] = 0xE0 | code;
951 }
952 }
953 else if ( code <= 0x10FFFF )
954 {
955 len = 4;
956 if ( out )
957 {
958 if ( dstLen < len )
959 break;
960
961 out[3] = 0x80 | (code & 0x3F); code >>= 6;
962 out[2] = 0x80 | (code & 0x3F); code >>= 6;
963 out[1] = 0x80 | (code & 0x3F); code >>= 6;
964 out[0] = 0xF0 | code;
965 }
966 }
967 else
968 {
969 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
970 break;
971 }
972
973 if ( out )
974 {
975 out += len;
976 dstLen -= len;
977 }
978
979 written += len;
980 }
981
982 // we only get here if an error occurs during decoding
983 return wxCONV_FAILED;
984}
985
6001e347
RR
986size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
987{
0286d08d
VZ
988 if ( m_options == MAP_INVALID_UTF8_NOT )
989 return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
990
4def3b35
VS
991 size_t len = 0;
992
dccce9ea 993 while (*psz && ((!buf) || (len < n)))
4def3b35 994 {
ea8ce907
RR
995 const char *opsz = psz;
996 bool invalid = false;
4def3b35
VS
997 unsigned char cc = *psz++, fc = cc;
998 unsigned cnt;
dccce9ea 999 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1000 fc <<= 1;
ef199164 1001
dccce9ea 1002 if (!cnt)
4def3b35
VS
1003 {
1004 // plain ASCII char
dccce9ea 1005 if (buf)
4def3b35
VS
1006 *buf++ = cc;
1007 len++;
561488ef
MW
1008
1009 // escape the escape character for octal escapes
1010 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1011 && cc == '\\' && (!buf || len < n))
1012 {
1013 if (buf)
1014 *buf++ = cc;
1015 len++;
1016 }
dccce9ea
VZ
1017 }
1018 else
4def3b35
VS
1019 {
1020 cnt--;
dccce9ea 1021 if (!cnt)
4def3b35
VS
1022 {
1023 // invalid UTF-8 sequence
ea8ce907 1024 invalid = true;
dccce9ea
VZ
1025 }
1026 else
4def3b35
VS
1027 {
1028 unsigned ocnt = cnt - 1;
1029 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1030 while (cnt--)
4def3b35 1031 {
ea8ce907 1032 cc = *psz;
dccce9ea 1033 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1034 {
1035 // invalid UTF-8 sequence
ea8ce907
RR
1036 invalid = true;
1037 break;
4def3b35 1038 }
ef199164 1039
ea8ce907 1040 psz++;
4def3b35
VS
1041 res = (res << 6) | (cc & 0x3f);
1042 }
ef199164 1043
ea8ce907 1044 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1045 {
1046 // illegal UTF-8 encoding
ea8ce907 1047 invalid = true;
4def3b35 1048 }
ea8ce907
RR
1049 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1050 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1051 {
1052 // if one of our PUA characters turns up externally
1053 // it must also be treated as an illegal sequence
1054 // (a bit like you have to escape an escape character)
1055 invalid = true;
1056 }
1057 else
1058 {
1cd52418 1059#ifdef WC_UTF16
0286d08d 1060 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1061 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1062 if (pa == wxCONV_FAILED)
ea8ce907
RR
1063 {
1064 invalid = true;
1065 }
1066 else
1067 {
1068 if (buf)
1069 buf += pa;
1070 len += pa;
1071 }
373658eb 1072#else // !WC_UTF16
ea8ce907 1073 if (buf)
38d4b1e4 1074 *buf++ = (wchar_t)res;
ea8ce907 1075 len++;
373658eb 1076#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1077 }
1078 }
ef199164 1079
ea8ce907
RR
1080 if (invalid)
1081 {
1082 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1083 {
1084 while (opsz < psz && (!buf || len < n))
1085 {
1086#ifdef WC_UTF16
1087 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1088 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1089 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1090 if (buf)
1091 buf += pa;
1092 opsz++;
1093 len += pa;
1094#else
1095 if (buf)
38d4b1e4 1096 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1097 opsz++;
1098 len++;
1099#endif
1100 }
1101 }
3698ae71 1102 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1103 {
1104 while (opsz < psz && (!buf || len < n))
1105 {
3698ae71
VZ
1106 if ( buf && len + 3 < n )
1107 {
17a1ebd1 1108 unsigned char on = *opsz;
3698ae71 1109 *buf++ = L'\\';
17a1ebd1
VZ
1110 *buf++ = (wchar_t)( L'0' + on / 0100 );
1111 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1112 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1113 }
ef199164 1114
ea8ce907
RR
1115 opsz++;
1116 len += 4;
1117 }
1118 }
3698ae71 1119 else // MAP_INVALID_UTF8_NOT
ea8ce907 1120 {
467e0479 1121 return wxCONV_FAILED;
ea8ce907 1122 }
4def3b35
VS
1123 }
1124 }
6001e347 1125 }
ef199164 1126
dccce9ea 1127 if (buf && (len < n))
4def3b35 1128 *buf = 0;
ef199164 1129
4def3b35 1130 return len;
6001e347
RR
1131}
1132
3698ae71
VZ
// Return true if wch is one of the wide octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1137
6001e347
RR
1138size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1139{
0286d08d
VZ
1140 if ( m_options == MAP_INVALID_UTF8_NOT )
1141 return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
1142
4def3b35 1143 size_t len = 0;
6001e347 1144
dccce9ea 1145 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
1146 {
1147 wxUint32 cc;
ef199164 1148
1cd52418 1149#ifdef WC_UTF16
b5153fd8
VZ
1150 // cast is ok for WC_UTF16
1151 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1152 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1153#else
ef199164 1154 cc = (*psz++) & 0x7fffffff;
4def3b35 1155#endif
3698ae71
VZ
1156
1157 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1158 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1159 {
dccce9ea 1160 if (buf)
ea8ce907 1161 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1162 len++;
3698ae71 1163 }
561488ef
MW
1164 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1165 && cc == L'\\' && psz[0] == L'\\' )
1166 {
1167 if (buf)
1168 *buf++ = (char)cc;
1169 psz++;
1170 len++;
1171 }
3698ae71
VZ
1172 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1173 cc == L'\\' &&
1174 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1175 {
dccce9ea 1176 if (buf)
3698ae71 1177 {
ef199164
DS
1178 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1179 (psz[1] - L'0') * 010 +
b2c13097 1180 (psz[2] - L'0'));
3698ae71
VZ
1181 }
1182
1183 psz += 3;
ea8ce907
RR
1184 len++;
1185 }
1186 else
1187 {
1188 unsigned cnt;
ef199164
DS
1189 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1190 {
1191 }
1192
ea8ce907 1193 if (!cnt)
4def3b35 1194 {
ea8ce907
RR
1195 // plain ASCII char
1196 if (buf)
1197 *buf++ = (char) cc;
1198 len++;
1199 }
ea8ce907
RR
1200 else
1201 {
1202 len += cnt + 1;
1203 if (buf)
1204 {
1205 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1206 while (cnt--)
1207 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1208 }
4def3b35
VS
1209 }
1210 }
6001e347 1211 }
4def3b35 1212
ef199164 1213 if (buf && (len < n))
3698ae71 1214 *buf = 0;
adb45366 1215
4def3b35 1216 return len;
6001e347
RR
1217}
1218
467e0479 1219// ============================================================================
c91830cb 1220// UTF-16
467e0479 1221// ============================================================================
c91830cb
VZ
1222
// "straight" is the UTF-16 flavour using the byte order of this build and
// "swap" the one using the opposite byte order
#if defined(WORDS_BIGENDIAN)
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else // little endian
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1230
467e0479
VZ
1231/* static */
1232size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1233{
1234 if ( srcLen == wxNO_LEN )
1235 {
1236 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1237 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1238 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1239 ;
c91830cb 1240
467e0479
VZ
1241 srcLen *= BYTES_PER_CHAR;
1242 }
1243 else // we already have the length
1244 {
1245 // we can only convert an entire number of UTF-16 characters
1246 if ( srcLen % BYTES_PER_CHAR )
1247 return wxCONV_FAILED;
1248 }
1249
1250 return srcLen;
1251}
1252
1253// case when in-memory representation is UTF-16 too
c91830cb
VZ
1254#ifdef WC_UTF16
1255
467e0479
VZ
1256// ----------------------------------------------------------------------------
1257// conversions without endianness change
1258// ----------------------------------------------------------------------------
1259
1260size_t
1261wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1262 const char *src, size_t srcLen) const
c91830cb 1263{
467e0479
VZ
1264 // set up the scene for using memcpy() (which is presumably more efficient
1265 // than copying the bytes one by one)
1266 srcLen = GetLength(src, srcLen);
1267 if ( srcLen == wxNO_LEN )
1268 return wxCONV_FAILED;
c91830cb 1269
ef199164 1270 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1271 if ( dst )
c91830cb 1272 {
467e0479
VZ
1273 if ( dstLen < inLen )
1274 return wxCONV_FAILED;
c91830cb 1275
467e0479 1276 memcpy(dst, src, srcLen);
c91830cb 1277 }
d32a507d 1278
467e0479 1279 return inLen;
c91830cb
VZ
1280}
1281
467e0479
VZ
1282size_t
1283wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1284 const wchar_t *src, size_t srcLen) const
c91830cb 1285{
467e0479
VZ
1286 if ( srcLen == wxNO_LEN )
1287 srcLen = wxWcslen(src) + 1;
c91830cb 1288
467e0479
VZ
1289 srcLen *= BYTES_PER_CHAR;
1290
1291 if ( dst )
c91830cb 1292 {
467e0479
VZ
1293 if ( dstLen < srcLen )
1294 return wxCONV_FAILED;
d32a507d 1295
467e0479 1296 memcpy(dst, src, srcLen);
c91830cb 1297 }
d32a507d 1298
467e0479 1299 return srcLen;
c91830cb
VZ
1300}
1301
467e0479
VZ
1302// ----------------------------------------------------------------------------
1303// endian-reversing conversions
1304// ----------------------------------------------------------------------------
c91830cb 1305
467e0479
VZ
1306size_t
1307wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1308 const char *src, size_t srcLen) const
c91830cb 1309{
467e0479
VZ
1310 srcLen = GetLength(src, srcLen);
1311 if ( srcLen == wxNO_LEN )
1312 return wxCONV_FAILED;
c91830cb 1313
467e0479
VZ
1314 srcLen /= BYTES_PER_CHAR;
1315
1316 if ( dst )
c91830cb 1317 {
467e0479
VZ
1318 if ( dstLen < srcLen )
1319 return wxCONV_FAILED;
1320
ef199164
DS
1321 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1322 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1323 {
ef199164 1324 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1325 }
c91830cb 1326 }
bfab25d4 1327
467e0479 1328 return srcLen;
c91830cb
VZ
1329}
1330
467e0479
VZ
1331size_t
1332wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1333 const wchar_t *src, size_t srcLen) const
c91830cb 1334{
467e0479
VZ
1335 if ( srcLen == wxNO_LEN )
1336 srcLen = wxWcslen(src) + 1;
c91830cb 1337
467e0479
VZ
1338 srcLen *= BYTES_PER_CHAR;
1339
1340 if ( dst )
c91830cb 1341 {
467e0479
VZ
1342 if ( dstLen < srcLen )
1343 return wxCONV_FAILED;
1344
ef199164 1345 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1346 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1347 {
ef199164 1348 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1349 }
c91830cb 1350 }
eec47cc6 1351
467e0479 1352 return srcLen;
c91830cb
VZ
1353}
1354
467e0479 1355#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1356
467e0479
VZ
1357// ----------------------------------------------------------------------------
1358// conversions without endianness change
1359// ----------------------------------------------------------------------------
c91830cb 1360
35d11700
VZ
1361size_t
1362wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1363 const char *src, size_t srcLen) const
c91830cb 1364{
35d11700
VZ
1365 srcLen = GetLength(src, srcLen);
1366 if ( srcLen == wxNO_LEN )
1367 return wxCONV_FAILED;
c91830cb 1368
ef199164 1369 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1370 if ( !dst )
c91830cb 1371 {
35d11700
VZ
1372 // optimization: return maximal space which could be needed for this
1373 // string even if the real size could be smaller if the buffer contains
1374 // any surrogates
1375 return inLen;
c91830cb 1376 }
c91830cb 1377
35d11700 1378 size_t outLen = 0;
ef199164
DS
1379 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1380 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1381 {
ef199164
DS
1382 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1383 if ( !inBuff )
35d11700
VZ
1384 return wxCONV_FAILED;
1385
1386 if ( ++outLen > dstLen )
1387 return wxCONV_FAILED;
c91830cb 1388
35d11700
VZ
1389 *dst++ = ch;
1390 }
1391
1392
1393 return outLen;
1394}
c91830cb 1395
35d11700
VZ
1396size_t
1397wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1398 const wchar_t *src, size_t srcLen) const
c91830cb 1399{
35d11700
VZ
1400 if ( srcLen == wxNO_LEN )
1401 srcLen = wxWcslen(src) + 1;
c91830cb 1402
35d11700 1403 size_t outLen = 0;
ef199164 1404 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1405 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1406 {
1407 wxUint16 cc[2];
35d11700
VZ
1408 const size_t numChars = encode_utf16(*src++, cc);
1409 if ( numChars == wxCONV_FAILED )
1410 return wxCONV_FAILED;
c91830cb 1411
ef199164
DS
1412 outLen += numChars * BYTES_PER_CHAR;
1413 if ( outBuff )
c91830cb 1414 {
35d11700
VZ
1415 if ( outLen > dstLen )
1416 return wxCONV_FAILED;
1417
ef199164 1418 *outBuff++ = cc[0];
35d11700 1419 if ( numChars == 2 )
69b80d28 1420 {
35d11700 1421 // second character of a surrogate
ef199164 1422 *outBuff++ = cc[1];
69b80d28 1423 }
c91830cb 1424 }
c91830cb 1425 }
c91830cb 1426
35d11700 1427 return outLen;
c91830cb
VZ
1428}
1429
467e0479
VZ
1430// ----------------------------------------------------------------------------
1431// endian-reversing conversions
1432// ----------------------------------------------------------------------------
c91830cb 1433
35d11700
VZ
1434size_t
1435wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1436 const char *src, size_t srcLen) const
c91830cb 1437{
35d11700
VZ
1438 srcLen = GetLength(src, srcLen);
1439 if ( srcLen == wxNO_LEN )
1440 return wxCONV_FAILED;
1441
ef199164 1442 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1443 if ( !dst )
1444 {
1445 // optimization: return maximal space which could be needed for this
1446 // string even if the real size could be smaller if the buffer contains
1447 // any surrogates
1448 return inLen;
1449 }
c91830cb 1450
35d11700 1451 size_t outLen = 0;
ef199164
DS
1452 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1453 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1454 {
35d11700
VZ
1455 wxUint32 ch;
1456 wxUint16 tmp[2];
ef199164
DS
1457
1458 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1459 inBuff++;
1460 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1461
35d11700
VZ
1462 const size_t numChars = decode_utf16(tmp, ch);
1463 if ( numChars == wxCONV_FAILED )
1464 return wxCONV_FAILED;
c91830cb 1465
35d11700 1466 if ( numChars == 2 )
ef199164 1467 inBuff++;
35d11700
VZ
1468
1469 if ( ++outLen > dstLen )
1470 return wxCONV_FAILED;
c91830cb 1471
35d11700 1472 *dst++ = ch;
c91830cb 1473 }
c91830cb 1474
c91830cb 1475
35d11700
VZ
1476 return outLen;
1477}
c91830cb 1478
35d11700
VZ
1479size_t
1480wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1481 const wchar_t *src, size_t srcLen) const
c91830cb 1482{
35d11700
VZ
1483 if ( srcLen == wxNO_LEN )
1484 srcLen = wxWcslen(src) + 1;
c91830cb 1485
35d11700 1486 size_t outLen = 0;
ef199164 1487 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1488 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1489 {
1490 wxUint16 cc[2];
35d11700
VZ
1491 const size_t numChars = encode_utf16(*src, cc);
1492 if ( numChars == wxCONV_FAILED )
1493 return wxCONV_FAILED;
c91830cb 1494
ef199164
DS
1495 outLen += numChars * BYTES_PER_CHAR;
1496 if ( outBuff )
c91830cb 1497 {
35d11700
VZ
1498 if ( outLen > dstLen )
1499 return wxCONV_FAILED;
1500
ef199164 1501 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1502 if ( numChars == 2 )
c91830cb 1503 {
35d11700 1504 // second character of a surrogate
ef199164 1505 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1506 }
1507 }
c91830cb 1508 }
c91830cb 1509
35d11700 1510 return outLen;
c91830cb
VZ
1511}
1512
467e0479 1513#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1514
1515
35d11700 1516// ============================================================================
c91830cb 1517// UTF-32
35d11700 1518// ============================================================================
c91830cb
VZ
1519
1520#ifdef WORDS_BIGENDIAN
467e0479
VZ
1521 #define wxMBConvUTF32straight wxMBConvUTF32BE
1522 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1523#else
467e0479
VZ
1524 #define wxMBConvUTF32swap wxMBConvUTF32BE
1525 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1526#endif
1527
1528
1529WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1530WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1531
467e0479
VZ
1532/* static */
1533size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1534{
1535 if ( srcLen == wxNO_LEN )
1536 {
1537 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1538 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1539 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1540 ;
c91830cb 1541
467e0479
VZ
1542 srcLen *= BYTES_PER_CHAR;
1543 }
1544 else // we already have the length
1545 {
1546 // we can only convert an entire number of UTF-32 characters
1547 if ( srcLen % BYTES_PER_CHAR )
1548 return wxCONV_FAILED;
1549 }
1550
1551 return srcLen;
1552}
1553
1554// case when in-memory representation is UTF-16
c91830cb
VZ
1555#ifdef WC_UTF16
1556
467e0479
VZ
1557// ----------------------------------------------------------------------------
1558// conversions without endianness change
1559// ----------------------------------------------------------------------------
1560
1561size_t
1562wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1563 const char *src, size_t srcLen) const
c91830cb 1564{
467e0479
VZ
1565 srcLen = GetLength(src, srcLen);
1566 if ( srcLen == wxNO_LEN )
1567 return wxCONV_FAILED;
c91830cb 1568
ef199164
DS
1569 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1570 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1571 size_t outLen = 0;
1572 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1573 {
1574 wxUint16 cc[2];
ef199164 1575 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1576 if ( numChars == wxCONV_FAILED )
1577 return wxCONV_FAILED;
c91830cb 1578
467e0479
VZ
1579 outLen += numChars;
1580 if ( dst )
c91830cb 1581 {
467e0479
VZ
1582 if ( outLen > dstLen )
1583 return wxCONV_FAILED;
d32a507d 1584
467e0479
VZ
1585 *dst++ = cc[0];
1586 if ( numChars == 2 )
1587 {
1588 // second character of a surrogate
1589 *dst++ = cc[1];
1590 }
1591 }
c91830cb 1592 }
d32a507d 1593
467e0479 1594 return outLen;
c91830cb
VZ
1595}
1596
467e0479
VZ
1597size_t
1598wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1599 const wchar_t *src, size_t srcLen) const
c91830cb 1600{
467e0479
VZ
1601 if ( srcLen == wxNO_LEN )
1602 srcLen = wxWcslen(src) + 1;
c91830cb 1603
467e0479 1604 if ( !dst )
c91830cb 1605 {
467e0479
VZ
1606 // optimization: return maximal space which could be needed for this
1607 // string instead of the exact amount which could be less if there are
1608 // any surrogates in the input
1609 //
1610 // we consider that surrogates are rare enough to make it worthwhile to
1611 // avoid running the loop below at the cost of slightly extra memory
1612 // consumption
ef199164 1613 return srcLen * BYTES_PER_CHAR;
467e0479 1614 }
c91830cb 1615
ef199164 1616 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1617 size_t outLen = 0;
1618 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1619 {
1620 const wxUint32 ch = wxDecodeSurrogate(&src);
1621 if ( !src )
1622 return wxCONV_FAILED;
c91830cb 1623
467e0479 1624 outLen += BYTES_PER_CHAR;
d32a507d 1625
467e0479
VZ
1626 if ( outLen > dstLen )
1627 return wxCONV_FAILED;
b5153fd8 1628
ef199164 1629 *outBuff++ = ch;
467e0479 1630 }
c91830cb 1631
467e0479 1632 return outLen;
c91830cb
VZ
1633}
1634
467e0479
VZ
1635// ----------------------------------------------------------------------------
1636// endian-reversing conversions
1637// ----------------------------------------------------------------------------
c91830cb 1638
467e0479
VZ
1639size_t
1640wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1641 const char *src, size_t srcLen) const
c91830cb 1642{
467e0479
VZ
1643 srcLen = GetLength(src, srcLen);
1644 if ( srcLen == wxNO_LEN )
1645 return wxCONV_FAILED;
c91830cb 1646
ef199164
DS
1647 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1648 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1649 size_t outLen = 0;
ef199164 1650 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1651 {
c91830cb 1652 wxUint16 cc[2];
ef199164 1653 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1654 if ( numChars == wxCONV_FAILED )
1655 return wxCONV_FAILED;
c91830cb 1656
467e0479
VZ
1657 outLen += numChars;
1658 if ( dst )
c91830cb 1659 {
467e0479
VZ
1660 if ( outLen > dstLen )
1661 return wxCONV_FAILED;
d32a507d 1662
467e0479
VZ
1663 *dst++ = cc[0];
1664 if ( numChars == 2 )
1665 {
1666 // second character of a surrogate
1667 *dst++ = cc[1];
1668 }
1669 }
c91830cb 1670 }
b5153fd8 1671
467e0479 1672 return outLen;
c91830cb
VZ
1673}
1674
467e0479
VZ
1675size_t
1676wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1677 const wchar_t *src, size_t srcLen) const
c91830cb 1678{
467e0479
VZ
1679 if ( srcLen == wxNO_LEN )
1680 srcLen = wxWcslen(src) + 1;
c91830cb 1681
467e0479 1682 if ( !dst )
c91830cb 1683 {
467e0479
VZ
1684 // optimization: return maximal space which could be needed for this
1685 // string instead of the exact amount which could be less if there are
1686 // any surrogates in the input
1687 //
1688 // we consider that surrogates are rare enough to make it worthwhile to
1689 // avoid running the loop below at the cost of slightly extra memory
1690 // consumption
1691 return srcLen*BYTES_PER_CHAR;
1692 }
c91830cb 1693
ef199164 1694 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1695 size_t outLen = 0;
1696 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1697 {
1698 const wxUint32 ch = wxDecodeSurrogate(&src);
1699 if ( !src )
1700 return wxCONV_FAILED;
c91830cb 1701
467e0479 1702 outLen += BYTES_PER_CHAR;
d32a507d 1703
467e0479
VZ
1704 if ( outLen > dstLen )
1705 return wxCONV_FAILED;
b5153fd8 1706
ef199164 1707 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1708 }
c91830cb 1709
467e0479 1710 return outLen;
c91830cb
VZ
1711}
1712
467e0479 1713#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1714
35d11700
VZ
1715// ----------------------------------------------------------------------------
1716// conversions without endianness change
1717// ----------------------------------------------------------------------------
1718
1719size_t
1720wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1721 const char *src, size_t srcLen) const
c91830cb 1722{
35d11700
VZ
1723 // use memcpy() as it should be much faster than hand-written loop
1724 srcLen = GetLength(src, srcLen);
1725 if ( srcLen == wxNO_LEN )
1726 return wxCONV_FAILED;
c91830cb 1727
35d11700
VZ
1728 const size_t inLen = srcLen/BYTES_PER_CHAR;
1729 if ( dst )
c91830cb 1730 {
35d11700
VZ
1731 if ( dstLen < inLen )
1732 return wxCONV_FAILED;
b5153fd8 1733
35d11700
VZ
1734 memcpy(dst, src, srcLen);
1735 }
c91830cb 1736
35d11700 1737 return inLen;
c91830cb
VZ
1738}
1739
35d11700
VZ
1740size_t
1741wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1742 const wchar_t *src, size_t srcLen) const
c91830cb 1743{
35d11700
VZ
1744 if ( srcLen == wxNO_LEN )
1745 srcLen = wxWcslen(src) + 1;
1746
1747 srcLen *= BYTES_PER_CHAR;
c91830cb 1748
35d11700 1749 if ( dst )
c91830cb 1750 {
35d11700
VZ
1751 if ( dstLen < srcLen )
1752 return wxCONV_FAILED;
c91830cb 1753
35d11700 1754 memcpy(dst, src, srcLen);
c91830cb
VZ
1755 }
1756
35d11700 1757 return srcLen;
c91830cb
VZ
1758}
1759
35d11700
VZ
1760// ----------------------------------------------------------------------------
1761// endian-reversing conversions
1762// ----------------------------------------------------------------------------
c91830cb 1763
35d11700
VZ
1764size_t
1765wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1766 const char *src, size_t srcLen) const
c91830cb 1767{
35d11700
VZ
1768 srcLen = GetLength(src, srcLen);
1769 if ( srcLen == wxNO_LEN )
1770 return wxCONV_FAILED;
1771
1772 srcLen /= BYTES_PER_CHAR;
c91830cb 1773
35d11700 1774 if ( dst )
c91830cb 1775 {
35d11700
VZ
1776 if ( dstLen < srcLen )
1777 return wxCONV_FAILED;
1778
ef199164
DS
1779 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1780 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1781 {
ef199164 1782 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1783 }
c91830cb 1784 }
b5153fd8 1785
35d11700 1786 return srcLen;
c91830cb
VZ
1787}
1788
35d11700
VZ
1789size_t
1790wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1791 const wchar_t *src, size_t srcLen) const
c91830cb 1792{
35d11700
VZ
1793 if ( srcLen == wxNO_LEN )
1794 srcLen = wxWcslen(src) + 1;
1795
1796 srcLen *= BYTES_PER_CHAR;
c91830cb 1797
35d11700 1798 if ( dst )
c91830cb 1799 {
35d11700
VZ
1800 if ( dstLen < srcLen )
1801 return wxCONV_FAILED;
1802
ef199164 1803 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1804 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1805 {
ef199164 1806 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1807 }
c91830cb 1808 }
b5153fd8 1809
35d11700 1810 return srcLen;
c91830cb
VZ
1811}
1812
467e0479 1813#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1814
1815
36acb880
VZ
1816// ============================================================================
1817// The classes doing conversion using the iconv_xxx() functions
1818// ============================================================================
3caec1bb 1819
b040e242 1820#ifdef HAVE_ICONV
3a0d76bc 1821
b1d547eb
VS
1822// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1823// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1824// (unless there's yet another bug in glibc) the only case when iconv()
1825// returns with (size_t)-1 (which means error) and says there are 0 bytes
1826// left in the input buffer -- when _real_ error occurs,
1827// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1828// iconv() failure.
3caec1bb
VS
1829// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft unconverted input bytes really failed. With glibc <= 2.1 a
// spurious E2BIG error with no input left is not counted as a failure.
//
// The macro arguments are parenthesized so that expressions containing
// operators of lower precedence than == can be passed safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the address of an input pointer to the type used here for the second
// argument of iconv() (ICONV_CONST is defined outside of this file)
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// value used in this file to mark a conversion descriptor as not opened
#define ICONV_T_INVALID ((iconv_t)-1)
1840
1841#if SIZEOF_WCHAR_T == 4
1842 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1843 #define WC_ENC wxFONTENCODING_UTF32
1844#elif SIZEOF_WCHAR_T == 2
1845 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1846 #define WC_ENC wxFONTENCODING_UTF16
1847#else // sizeof(wchar_t) != 2 nor 4
1848 // does this ever happen?
1849 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1850#endif
1851
36acb880 1852// ----------------------------------------------------------------------------
e95354ec 1853// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1854// ----------------------------------------------------------------------------
1855
e95354ec 1856class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1857{
1858public:
86501081 1859 wxMBConv_iconv(const char *name);
e95354ec 1860 virtual ~wxMBConv_iconv();
36acb880 1861
bde4baac
VZ
1862 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1863 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1864
d36c9347 1865 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
7ef3ab50
VZ
1866 virtual size_t GetMBNulLen() const;
1867
ba98e032
VS
1868#if wxUSE_UNICODE_UTF8
1869 virtual bool IsUTF8() const;
1870#endif
1871
d36c9347
VZ
1872 virtual wxMBConv *Clone() const
1873 {
86501081 1874 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
1875 p->m_minMBCharWidth = m_minMBCharWidth;
1876 return p;
1877 }
1878
e95354ec 1879 bool IsOk() const
74a7eb0b 1880 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1881
1882protected:
ef199164
DS
1883 // the iconv handlers used to translate from multibyte
1884 // to wide char and in the other direction
36acb880
VZ
1885 iconv_t m2w,
1886 w2m;
ef199164 1887
b1d547eb
VS
1888#if wxUSE_THREADS
1889 // guards access to m2w and w2m objects
1890 wxMutex m_iconvMutex;
1891#endif
36acb880
VZ
1892
1893private:
e95354ec 1894 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1895 // available on this machine, it will remain NULL
74a7eb0b 1896 static wxString ms_wcCharsetName;
36acb880
VZ
1897
1898 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1899 // different endian-ness than the native one
405d8f46 1900 static bool ms_wcNeedsSwap;
eec47cc6 1901
d36c9347
VZ
1902
1903 // name of the encoding handled by this conversion
1904 wxString m_name;
1905
7ef3ab50 1906 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
1907 // initially
1908 size_t m_minMBCharWidth;
36acb880
VZ
1909};
1910
8f115891 1911// make the constructor available for unit testing
86501081 1912WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
1913{
1914 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1915 if ( !result->IsOk() )
1916 {
1917 delete result;
1918 return 0;
1919 }
ef199164 1920
8f115891
MW
1921 return result;
1922}
1923
422e411e 1924wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1925bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1926
86501081 1927wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 1928 : m_name(name)
36acb880 1929{
c1464d9d 1930 m_minMBCharWidth = 0;
eec47cc6 1931
36acb880 1932 // check for charset that represents wchar_t:
74a7eb0b 1933 if ( ms_wcCharsetName.empty() )
f1339c56 1934 {
c2b83fdd
VZ
1935 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1936
74a7eb0b
VZ
1937#if wxUSE_FONTMAP
1938 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1939#else // !wxUSE_FONTMAP
91cb7f52 1940 static const wxChar *names_static[] =
36acb880 1941 {
74a7eb0b
VZ
1942#if SIZEOF_WCHAR_T == 4
1943 _T("UCS-4"),
1944#elif SIZEOF_WCHAR_T = 2
1945 _T("UCS-2"),
1946#endif
1947 NULL
1948 };
91cb7f52 1949 const wxChar **names = names_static;
74a7eb0b 1950#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1951
d1f024a8 1952 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1953 {
17a1ebd1 1954 const wxString nameCS(*names);
74a7eb0b
VZ
1955
1956 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1957 wxString nameXE(nameCS);
ef199164
DS
1958
1959#ifdef WORDS_BIGENDIAN
74a7eb0b 1960 nameXE += _T("BE");
ef199164 1961#else // little endian
74a7eb0b 1962 nameXE += _T("LE");
ef199164 1963#endif
74a7eb0b 1964
c2b83fdd
VZ
1965 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1966 nameXE.c_str());
1967
86501081 1968 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 1969 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1970 {
74a7eb0b 1971 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1972 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1973 nameCS.c_str());
86501081 1974 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 1975
74a7eb0b
VZ
1976 // and check for bytesex ourselves:
1977 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1978 {
74a7eb0b
VZ
1979 char buf[2], *bufPtr;
1980 wchar_t wbuf[2], *wbufPtr;
1981 size_t insz, outsz;
1982 size_t res;
1983
1984 buf[0] = 'A';
1985 buf[1] = 0;
1986 wbuf[0] = 0;
1987 insz = 2;
1988 outsz = SIZEOF_WCHAR_T * 2;
1989 wbufPtr = wbuf;
1990 bufPtr = buf;
1991
ef199164
DS
1992 res = iconv(
1993 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1994 (char**)&wbufPtr, &outsz);
74a7eb0b
VZ
1995
1996 if (ICONV_FAILED(res, insz))
1997 {
1998 wxLogLastError(wxT("iconv"));
422e411e 1999 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2000 nameCS.c_str());
74a7eb0b
VZ
2001 }
2002 else // ok, can convert to this encoding, remember it
2003 {
17a1ebd1 2004 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2005 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2006 }
3a0d76bc
VS
2007 }
2008 }
74a7eb0b 2009 else // use charset not requiring byte swapping
36acb880 2010 {
74a7eb0b 2011 ms_wcCharsetName = nameXE;
36acb880 2012 }
3a0d76bc 2013 }
74a7eb0b 2014
0944fceb 2015 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2016 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2017 ms_wcCharsetName.empty() ? wxString("<none>")
2018 : ms_wcCharsetName,
74a7eb0b
VZ
2019 ms_wcNeedsSwap ? _T(" (needs swap)")
2020 : _T(""));
3a0d76bc 2021 }
36acb880 2022 else // we already have ms_wcCharsetName
3caec1bb 2023 {
86501081 2024 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2025 }
dccce9ea 2026
74a7eb0b 2027 if ( ms_wcCharsetName.empty() )
f1339c56 2028 {
74a7eb0b 2029 w2m = ICONV_T_INVALID;
36acb880 2030 }
405d8f46
VZ
2031 else
2032 {
86501081 2033 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2034 if ( w2m == ICONV_T_INVALID )
2035 {
2036 wxLogTrace(TRACE_STRCONV,
2037 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2038 ms_wcCharsetName.c_str(), name);
74a7eb0b 2039 }
405d8f46 2040 }
36acb880 2041}
3caec1bb 2042
e95354ec 2043wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2044{
74a7eb0b 2045 if ( m2w != ICONV_T_INVALID )
36acb880 2046 iconv_close(m2w);
74a7eb0b 2047 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2048 iconv_close(w2m);
2049}
3a0d76bc 2050
bde4baac 2051size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 2052{
69373110
VZ
2053 // find the string length: notice that must be done differently for
2054 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2055 size_t inbuf;
7ef3ab50 2056 const size_t nulLen = GetMBNulLen();
69373110
VZ
2057 switch ( nulLen )
2058 {
2059 default:
467e0479 2060 return wxCONV_FAILED;
69373110
VZ
2061
2062 case 1:
2063 inbuf = strlen(psz); // arguably more optimized than our version
2064 break;
2065
2066 case 2:
2067 case 4:
2068 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2069 // they also have to start at character boundary and not span two
2070 // adjacent characters
2071 const char *p;
2072 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2073 ;
2074 inbuf = p - psz;
2075 break;
2076 }
2077
b1d547eb 2078#if wxUSE_THREADS
6a17b868
SN
2079 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2080 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2081 // wxConvLocal that are used all over wx code, so we have to make sure
2082 // the handle is used by at most one thread at the time. Otherwise
2083 // only a few wx classes would be safe to use from non-main threads
2084 // as MB<->WC conversion would fail "randomly".
2085 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2086#endif // wxUSE_THREADS
2087
36acb880
VZ
2088 size_t outbuf = n * SIZEOF_WCHAR_T;
2089 size_t res, cres;
2090 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2091 wchar_t *bufPtr = buf;
2092 const char *pszPtr = psz;
2093
2094 if (buf)
2095 {
2096 // have destination buffer, convert there
2097 cres = iconv(m2w,
2098 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2099 (char**)&bufPtr, &outbuf);
2100 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 2101
36acb880 2102 if (ms_wcNeedsSwap)
3a0d76bc 2103 {
36acb880 2104 // convert to native endianness
17a1ebd1
VZ
2105 for ( unsigned i = 0; i < res; i++ )
2106 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 2107 }
adb45366 2108
69373110 2109 // NUL-terminate the string if there is any space left
49dd9820
VS
2110 if (res < n)
2111 buf[res] = 0;
36acb880
VZ
2112 }
2113 else
2114 {
2115 // no destination buffer... convert using temp buffer
2116 // to calculate destination buffer requirement
2117 wchar_t tbuf[8];
2118 res = 0;
ef199164
DS
2119
2120 do
2121 {
36acb880 2122 bufPtr = tbuf;
ef199164 2123 outbuf = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2124
2125 cres = iconv(m2w,
2126 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2127 (char**)&bufPtr, &outbuf );
2128
ef199164
DS
2129 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2130 }
2131 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2132 }
dccce9ea 2133
36acb880 2134 if (ICONV_FAILED(cres, inbuf))
f1339c56 2135 {
36acb880 2136 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2137 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2138 return wxCONV_FAILED;
36acb880
VZ
2139 }
2140
2141 return res;
2142}
2143
bde4baac 2144size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 2145{
b1d547eb
VS
2146#if wxUSE_THREADS
2147 // NB: explained in MB2WC
2148 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2149#endif
3698ae71 2150
156162ec
MW
2151 size_t inlen = wxWcslen(psz);
2152 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
2153 size_t outbuf = n;
2154 size_t res, cres;
3a0d76bc 2155
36acb880 2156 wchar_t *tmpbuf = 0;
3caec1bb 2157
36acb880
VZ
2158 if (ms_wcNeedsSwap)
2159 {
2160 // need to copy to temp buffer to switch endianness
74a7eb0b 2161 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 2162 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 2163 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
2164 for ( size_t i = 0; i < inlen; i++ )
2165 tmpbuf[n] = WC_BSWAP(psz[i]);
ef199164 2166
156162ec 2167 tmpbuf[inlen] = L'\0';
74a7eb0b 2168 psz = tmpbuf;
36acb880 2169 }
3a0d76bc 2170
36acb880
VZ
2171 if (buf)
2172 {
2173 // have destination buffer, convert there
2174 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 2175
ef199164 2176 res = n - outbuf;
adb45366 2177
49dd9820
VS
2178 // NB: iconv was given only wcslen(psz) characters on input, and so
2179 // it couldn't convert the trailing zero. Let's do it ourselves
2180 // if there's some room left for it in the output buffer.
2181 if (res < n)
2182 buf[0] = 0;
36acb880
VZ
2183 }
2184 else
2185 {
ef199164 2186 // no destination buffer: convert using temp buffer
36acb880
VZ
2187 // to calculate destination buffer requirement
2188 char tbuf[16];
2189 res = 0;
ef199164
DS
2190 do
2191 {
2192 buf = tbuf;
2193 outbuf = 16;
36acb880
VZ
2194
2195 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 2196
36acb880 2197 res += 16 - outbuf;
ef199164
DS
2198 }
2199 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2200 }
dccce9ea 2201
36acb880
VZ
2202 if (ms_wcNeedsSwap)
2203 {
2204 free(tmpbuf);
2205 }
dccce9ea 2206
36acb880
VZ
2207 if (ICONV_FAILED(cres, inbuf))
2208 {
ce6f8d6f 2209 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2210 return wxCONV_FAILED;
36acb880
VZ
2211 }
2212
2213 return res;
2214}
2215
7ef3ab50 2216size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2217{
c1464d9d 2218 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2219 {
2220 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2221
2222#if wxUSE_THREADS
2223 // NB: explained in MB2WC
2224 wxMutexLocker lock(self->m_iconvMutex);
2225#endif
2226
999020e1 2227 const wchar_t *wnul = L"";
c1464d9d 2228 char buf[8]; // should be enough for NUL in any encoding
356410fc 2229 size_t inLen = sizeof(wchar_t),
c1464d9d 2230 outLen = WXSIZEOF(buf);
ef199164
DS
2231 char *inBuff = (char *)wnul;
2232 char *outBuff = buf;
2233 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2234 {
c1464d9d 2235 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2236 }
2237 else // ok
2238 {
ef199164 2239 self->m_minMBCharWidth = outBuff - buf;
356410fc 2240 }
eec47cc6
VZ
2241 }
2242
c1464d9d 2243 return m_minMBCharWidth;
eec47cc6
VZ
2244}
2245
ba98e032
VS
2246#if wxUSE_UNICODE_UTF8
2247bool wxMBConv_iconv::IsUTF8() const
2248{
86501081
VS
2249 return wxStricmp(m_name, "UTF-8") == 0 ||
2250 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2251}
2252#endif
2253
b040e242 2254#endif // HAVE_ICONV
36acb880 2255
e95354ec 2256
36acb880
VZ
2257// ============================================================================
2258// Win32 conversion classes
2259// ============================================================================
1cd52418 2260
e95354ec 2261#ifdef wxHAVE_WIN32_MB2WC
373658eb 2262
8b04d4c4 2263// from utils.cpp
d775fa82 2264#if wxUSE_FONTMAP
86501081 2265extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2266extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2267#endif
373658eb 2268
e95354ec 2269class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2270{
2271public:
bde4baac
VZ
2272 wxMBConv_win32()
2273 {
2274 m_CodePage = CP_ACP;
c1464d9d 2275 m_minMBCharWidth = 0;
bde4baac
VZ
2276 }
2277
d36c9347 2278 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2279 : wxMBConv()
d36c9347
VZ
2280 {
2281 m_CodePage = conv.m_CodePage;
2282 m_minMBCharWidth = conv.m_minMBCharWidth;
2283 }
2284
7608a683 2285#if wxUSE_FONTMAP
86501081 2286 wxMBConv_win32(const char* name)
bde4baac
VZ
2287 {
2288 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2289 m_minMBCharWidth = 0;
bde4baac 2290 }
dccce9ea 2291
e95354ec 2292 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2293 {
2294 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2295 m_minMBCharWidth = 0;
bde4baac 2296 }
eec47cc6 2297#endif // wxUSE_FONTMAP
8b04d4c4 2298
d36c9347 2299 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2300 {
02272c9c
VZ
2301 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2302 // the behaviour is not compatible with the Unix version (using iconv)
2303 // and break the library itself, e.g. wxTextInputStream::NextChar()
2304 // wouldn't work if reading an incomplete MB char didn't result in an
2305 // error
667e5b3e 2306 //
89028980 2307 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2308 // Win XP or newer and it is not supported for UTF-[78] so we always
2309 // use our own conversions in this case. See
89028980
VS
2310 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2311 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2312 if ( m_CodePage == CP_UTF8 )
89028980 2313 {
5487ff0f 2314 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2315 }
830f8f11
VZ
2316
2317 if ( m_CodePage == CP_UTF7 )
2318 {
5487ff0f 2319 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2320 }
2321
2322 int flags = 0;
2323 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2324 IsAtLeastWin2kSP4() )
89028980 2325 {
830f8f11 2326 flags = MB_ERR_INVALID_CHARS;
89028980 2327 }
667e5b3e 2328
2b5f62a0
VZ
2329 const size_t len = ::MultiByteToWideChar
2330 (
2331 m_CodePage, // code page
667e5b3e 2332 flags, // flags: fall on error
2b5f62a0
VZ
2333 psz, // input string
2334 -1, // its length (NUL-terminated)
b4da152e 2335 buf, // output string
2b5f62a0
VZ
2336 buf ? n : 0 // size of output buffer
2337 );
89028980
VS
2338 if ( !len )
2339 {
2340 // function totally failed
467e0479 2341 return wxCONV_FAILED;
89028980
VS
2342 }
2343
2344 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2345 // check if we succeeded, by doing a double trip:
2346 if ( !flags && buf )
2347 {
53c174fc
VZ
2348 const size_t mbLen = strlen(psz);
2349 wxCharBuffer mbBuf(mbLen);
89028980
VS
2350 if ( ::WideCharToMultiByte
2351 (
2352 m_CodePage,
2353 0,
2354 buf,
2355 -1,
2356 mbBuf.data(),
53c174fc 2357 mbLen + 1, // size in bytes, not length
89028980
VS
2358 NULL,
2359 NULL
2360 ) == 0 ||
2361 strcmp(mbBuf, psz) != 0 )
2362 {
2363 // we didn't obtain the same thing we started from, hence
2364 // the conversion was lossy and we consider that it failed
467e0479 2365 return wxCONV_FAILED;
89028980
VS
2366 }
2367 }
2b5f62a0 2368
03a991bc
VZ
2369 // note that it returns count of written chars for buf != NULL and size
2370 // of the needed buffer for buf == NULL so in either case the length of
2371 // the string (which never includes the terminating NUL) is one less
89028980 2372 return len - 1;
f1339c56 2373 }
dccce9ea 2374
d36c9347 2375 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2376 {
13dd924a
VZ
2377 /*
2378 we have a problem here: by default, WideCharToMultiByte() may
2379 replace characters unrepresentable in the target code page with bad
2380 quality approximations such as turning "1/2" symbol (U+00BD) into
2381 "1" for the code pages which don't have it and we, obviously, want
2382 to avoid this at any price
d775fa82 2383
13dd924a
VZ
2384 the trouble is that this function does it _silently_, i.e. it won't
2385 even tell us whether it did or not... Win98/2000 and higher provide
2386 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2387 we have to resort to a round trip, i.e. check that converting back
2388 results in the same string -- this is, of course, expensive but
2389 otherwise we simply can't be sure to not garble the data.
2390 */
2391
2392 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2393 // it doesn't work with CJK encodings (which we test for rather roughly
2394 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2395 // supporting it
907173e5
WS
2396 BOOL usedDef wxDUMMY_INITIALIZE(false);
2397 BOOL *pUsedDef;
13dd924a
VZ
2398 int flags;
2399 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2400 {
2401 // it's our lucky day
2402 flags = WC_NO_BEST_FIT_CHARS;
2403 pUsedDef = &usedDef;
2404 }
2405 else // old system or unsupported encoding
2406 {
2407 flags = 0;
2408 pUsedDef = NULL;
2409 }
2410
2b5f62a0
VZ
2411 const size_t len = ::WideCharToMultiByte
2412 (
2413 m_CodePage, // code page
13dd924a
VZ
2414 flags, // either none or no best fit
2415 pwz, // input string
2b5f62a0
VZ
2416 -1, // it is (wide) NUL-terminated
2417 buf, // output buffer
2418 buf ? n : 0, // and its size
2419 NULL, // default "replacement" char
13dd924a 2420 pUsedDef // [out] was it used?
2b5f62a0
VZ
2421 );
2422
13dd924a
VZ
2423 if ( !len )
2424 {
2425 // function totally failed
467e0479 2426 return wxCONV_FAILED;
13dd924a
VZ
2427 }
2428
2429 // if we were really converting, check if we succeeded
2430 if ( buf )
2431 {
2432 if ( flags )
2433 {
2434 // check if the conversion failed, i.e. if any replacements
2435 // were done
2436 if ( usedDef )
467e0479 2437 return wxCONV_FAILED;
13dd924a
VZ
2438 }
2439 else // we must resort to double tripping...
2440 {
2441 wxWCharBuffer wcBuf(n);
467e0479 2442 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
13dd924a
VZ
2443 wcscmp(wcBuf, pwz) != 0 )
2444 {
2445 // we didn't obtain the same thing we started from, hence
2446 // the conversion was lossy and we consider that it failed
467e0479 2447 return wxCONV_FAILED;
13dd924a
VZ
2448 }
2449 }
2450 }
2451
03a991bc 2452 // see the comment above for the reason of "len - 1"
13dd924a 2453 return len - 1;
f1339c56 2454 }
dccce9ea 2455
7ef3ab50
VZ
2456 virtual size_t GetMBNulLen() const
2457 {
2458 if ( m_minMBCharWidth == 0 )
2459 {
2460 int len = ::WideCharToMultiByte
2461 (
2462 m_CodePage, // code page
2463 0, // no flags
2464 L"", // input string
2465 1, // translate just the NUL
2466 NULL, // output buffer
2467 0, // and its size
2468 NULL, // no replacement char
2469 NULL // [out] don't care if it was used
2470 );
2471
2472 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2473 switch ( len )
2474 {
2475 default:
2476 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2477 self->m_minMBCharWidth = (size_t)-1;
2478 break;
7ef3ab50
VZ
2479
2480 case 0:
2481 self->m_minMBCharWidth = (size_t)-1;
2482 break;
2483
2484 case 1:
2485 case 2:
2486 case 4:
2487 self->m_minMBCharWidth = len;
2488 break;
2489 }
2490 }
2491
2492 return m_minMBCharWidth;
2493 }
2494
d36c9347
VZ
2495 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2496
13dd924a
VZ
2497 bool IsOk() const { return m_CodePage != -1; }
2498
2499private:
2500 static bool CanUseNoBestFit()
2501 {
2502 static int s_isWin98Or2k = -1;
2503
2504 if ( s_isWin98Or2k == -1 )
2505 {
2506 int verMaj, verMin;
2507 switch ( wxGetOsVersion(&verMaj, &verMin) )
2508 {
406d283a 2509 case wxOS_WINDOWS_9X:
13dd924a
VZ
2510 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2511 break;
2512
406d283a 2513 case wxOS_WINDOWS_NT:
13dd924a
VZ
2514 s_isWin98Or2k = verMaj >= 5;
2515 break;
2516
2517 default:
ef199164 2518 // unknown: be conservative by default
13dd924a 2519 s_isWin98Or2k = 0;
ef199164 2520 break;
13dd924a
VZ
2521 }
2522
2523 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2524 }
2525
2526 return s_isWin98Or2k == 1;
2527 }
f1339c56 2528
89028980
VS
2529 static bool IsAtLeastWin2kSP4()
2530 {
8942f83a
WS
2531#ifdef __WXWINCE__
2532 return false;
2533#else
89028980
VS
2534 static int s_isAtLeastWin2kSP4 = -1;
2535
2536 if ( s_isAtLeastWin2kSP4 == -1 )
2537 {
2538 OSVERSIONINFOEX ver;
2539
2540 memset(&ver, 0, sizeof(ver));
2541 ver.dwOSVersionInfoSize = sizeof(ver);
2542 GetVersionEx((OSVERSIONINFO*)&ver);
2543
2544 s_isAtLeastWin2kSP4 =
2545 ((ver.dwMajorVersion > 5) || // Vista+
2546 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2547 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2548 ver.wServicePackMajor >= 4)) // 2000 SP4+
2549 ? 1 : 0;
2550 }
2551
2552 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2553#endif
89028980
VS
2554 }
2555
eec47cc6 2556
c1464d9d 2557 // the code page we're working with
b1d66b54 2558 long m_CodePage;
c1464d9d 2559
7ef3ab50 2560 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2561 // "unknown"
2562 size_t m_minMBCharWidth;
1cd52418 2563};
e95354ec
VZ
2564
2565#endif // wxHAVE_WIN32_MB2WC
2566
f7e98dee 2567
36acb880
VZ
2568// ============================================================================
2569// wxEncodingConverter based conversion classes
2570// ============================================================================
2571
1e6feb95 2572#if wxUSE_FONTMAP
1cd52418 2573
e95354ec 2574class wxMBConv_wxwin : public wxMBConv
1cd52418 2575{
8b04d4c4
VZ
2576private:
2577 void Init()
2578 {
6ac84a78
DE
2579 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2580 // The wxMBConv_cf class does a better job.
2581 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2582 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2583 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2584 }
2585
6001e347 2586public:
f1339c56
RR
2587 // temporarily just use wxEncodingConverter stuff,
2588 // so that it works while a better implementation is built
86501081 2589 wxMBConv_wxwin(const char* name)
f1339c56
RR
2590 {
2591 if (name)
267e11c5 2592 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2593 else
2594 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2595
8b04d4c4
VZ
2596 Init();
2597 }
2598
e95354ec 2599 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2600 {
2601 m_enc = enc;
2602
2603 Init();
f1339c56 2604 }
dccce9ea 2605
bde4baac 2606 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2607 {
2608 size_t inbuf = strlen(psz);
dccce9ea 2609 if (buf)
c643a977 2610 {
ef199164 2611 if (!m2w.Convert(psz, buf))
467e0479 2612 return wxCONV_FAILED;
c643a977 2613 }
f1339c56
RR
2614 return inbuf;
2615 }
dccce9ea 2616
bde4baac 2617 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2618 {
f8d791e0 2619 const size_t inbuf = wxWcslen(psz);
f1339c56 2620 if (buf)
c643a977 2621 {
ef199164 2622 if (!w2m.Convert(psz, buf))
467e0479 2623 return wxCONV_FAILED;
c643a977 2624 }
dccce9ea 2625
f1339c56
RR
2626 return inbuf;
2627 }
dccce9ea 2628
7ef3ab50 2629 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2630 {
2631 switch ( m_enc )
2632 {
2633 case wxFONTENCODING_UTF16BE:
2634 case wxFONTENCODING_UTF16LE:
c1464d9d 2635 return 2;
eec47cc6
VZ
2636
2637 case wxFONTENCODING_UTF32BE:
2638 case wxFONTENCODING_UTF32LE:
c1464d9d 2639 return 4;
eec47cc6
VZ
2640
2641 default:
c1464d9d 2642 return 1;
eec47cc6
VZ
2643 }
2644 }
2645
d36c9347
VZ
2646 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2647
7ef3ab50
VZ
2648 bool IsOk() const { return m_ok; }
2649
2650public:
2651 wxFontEncoding m_enc;
2652 wxEncodingConverter m2w, w2m;
2653
2654private:
cafbf6fb
VZ
2655 // were we initialized successfully?
2656 bool m_ok;
fc7a2a60 2657
e95354ec 2658 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2659};
6001e347 2660
8f115891 2661// make the constructors available for unit testing
86501081 2662WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2663{
2664 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2665 if ( !result->IsOk() )
2666 {
2667 delete result;
2668 return 0;
2669 }
ef199164 2670
8f115891
MW
2671 return result;
2672}
2673
1e6feb95
VZ
2674#endif // wxUSE_FONTMAP
2675
36acb880
VZ
2676// ============================================================================
2677// wxCSConv implementation
2678// ============================================================================
2679
8b04d4c4 2680void wxCSConv::Init()
6001e347 2681{
e95354ec
VZ
2682 m_name = NULL;
2683 m_convReal = NULL;
2684 m_deferred = true;
2685}
2686
86501081 2687wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2688{
2689 Init();
82713003 2690
86501081 2691 if ( !charset.empty() )
e95354ec 2692 {
86501081 2693 SetName(charset.ToAscii());
e95354ec 2694 }
bda3d86a 2695
e4277538
VZ
2696#if wxUSE_FONTMAP
2697 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2698#else
bda3d86a 2699 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2700#endif
6001e347
RR
2701}
2702
8b04d4c4
VZ
2703wxCSConv::wxCSConv(wxFontEncoding encoding)
2704{
bda3d86a 2705 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2706 {
2707 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2708
2709 encoding = wxFONTENCODING_SYSTEM;
2710 }
2711
8b04d4c4
VZ
2712 Init();
2713
bda3d86a 2714 m_encoding = encoding;
8b04d4c4
VZ
2715}
2716
6001e347
RR
2717wxCSConv::~wxCSConv()
2718{
65e50848
JS
2719 Clear();
2720}
2721
54380f29 2722wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2723 : wxMBConv()
54380f29 2724{
8b04d4c4
VZ
2725 Init();
2726
54380f29 2727 SetName(conv.m_name);
8b04d4c4 2728 m_encoding = conv.m_encoding;
54380f29
GD
2729}
2730
2731wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2732{
2733 Clear();
8b04d4c4 2734
54380f29 2735 SetName(conv.m_name);
8b04d4c4
VZ
2736 m_encoding = conv.m_encoding;
2737
54380f29
GD
2738 return *this;
2739}
2740
65e50848
JS
2741void wxCSConv::Clear()
2742{
8b04d4c4 2743 free(m_name);
e95354ec 2744 delete m_convReal;
8b04d4c4 2745
65e50848 2746 m_name = NULL;
e95354ec 2747 m_convReal = NULL;
6001e347
RR
2748}
2749
86501081 2750void wxCSConv::SetName(const char *charset)
6001e347 2751{
f1339c56
RR
2752 if (charset)
2753 {
d6f2a891 2754 m_name = wxStrdup(charset);
e95354ec 2755 m_deferred = true;
f1339c56 2756 }
6001e347
RR
2757}
2758
8b3eb85d 2759#if wxUSE_FONTMAP
8b3eb85d
VZ
2760
2761WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2762 wxEncodingNameCache );
8b3eb85d
VZ
2763
2764static wxEncodingNameCache gs_nameCache;
2765#endif
2766
e95354ec
VZ
2767wxMBConv *wxCSConv::DoCreate() const
2768{
ce6f8d6f
VZ
2769#if wxUSE_FONTMAP
2770 wxLogTrace(TRACE_STRCONV,
2771 wxT("creating conversion for %s"),
2772 (m_name ? m_name
86501081 2773 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2774#endif // wxUSE_FONTMAP
2775
c547282d
VZ
2776 // check for the special case of ASCII or ISO8859-1 charset: as we have
2777 // special knowledge of it anyhow, we don't need to create a special
2778 // conversion object
e4277538
VZ
2779 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2780 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2781 {
e95354ec
VZ
2782 // don't convert at all
2783 return NULL;
2784 }
dccce9ea 2785
e95354ec
VZ
2786 // we trust OS to do conversion better than we can so try external
2787 // conversion methods first
2788 //
2789 // the full order is:
2790 // 1. OS conversion (iconv() under Unix or Win32 API)
2791 // 2. hard coded conversions for UTF
2792 // 3. wxEncodingConverter as fall back
2793
2794 // step (1)
2795#ifdef HAVE_ICONV
c547282d 2796#if !wxUSE_FONTMAP
e95354ec 2797 if ( m_name )
c547282d 2798#endif // !wxUSE_FONTMAP
e95354ec 2799 {
3ef10cfc 2800#if wxUSE_FONTMAP
8b3eb85d 2801 wxFontEncoding encoding(m_encoding);
3ef10cfc 2802#endif
8b3eb85d 2803
86501081 2804 if ( m_name )
8b3eb85d 2805 {
86501081 2806 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
2807 if ( conv->IsOk() )
2808 return conv;
2809
2810 delete conv;
c547282d
VZ
2811
2812#if wxUSE_FONTMAP
8b3eb85d 2813 encoding =
86501081 2814 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2815#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2816 }
2817#if wxUSE_FONTMAP
2818 {
2819 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2820 if ( it != gs_nameCache.end() )
2821 {
2822 if ( it->second.empty() )
2823 return NULL;
c547282d 2824
86501081 2825 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
2826 if ( conv->IsOk() )
2827 return conv;
e95354ec 2828
8b3eb85d
VZ
2829 delete conv;
2830 }
2831
2832 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
2833 // CS : in case this does not return valid names (eg for MacRoman)
2834 // encoding got a 'failure' entry in the cache all the same,
2835 // although it just has to be created using a different method, so
2836 // only store failed iconv creation attempts (or perhaps we
2837 // shoulnd't do this at all ?)
3c67ec06 2838 if ( names[0] != NULL )
8b3eb85d 2839 {
3c67ec06 2840 for ( ; *names; ++names )
8b3eb85d 2841 {
86501081
VS
2842 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2843 // will need changes that will obsolete this
2844 wxString name(*names);
2845 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
2846 if ( conv->IsOk() )
2847 {
2848 gs_nameCache[encoding] = *names;
2849 return conv;
2850 }
2851
2852 delete conv;
8b3eb85d
VZ
2853 }
2854
3c67ec06 2855 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 2856 }
8b3eb85d
VZ
2857 }
2858#endif // wxUSE_FONTMAP
e95354ec
VZ
2859 }
2860#endif // HAVE_ICONV
2861
2862#ifdef wxHAVE_WIN32_MB2WC
2863 {
7608a683 2864#if wxUSE_FONTMAP
e95354ec
VZ
2865 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2866 : new wxMBConv_win32(m_encoding);
2867 if ( conv->IsOk() )
2868 return conv;
2869
2870 delete conv;
7608a683
WS
2871#else
2872 return NULL;
2873#endif
e95354ec
VZ
2874 }
2875#endif // wxHAVE_WIN32_MB2WC
ef199164 2876
5c4ed98d 2877#ifdef __DARWIN__
f7e98dee 2878 {
6ff49cbc
DE
2879 // leave UTF16 and UTF32 to the built-ins of wx
2880 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2881 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 2882 {
a6900d10 2883#if wxUSE_FONTMAP
5c4ed98d
DE
2884 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2885 : new wxMBConv_cf(m_encoding);
a6900d10 2886#else
5c4ed98d 2887 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 2888#endif
ef199164 2889
f7e98dee 2890 if ( conv->IsOk() )
d775fa82
WS
2891 return conv;
2892
2893 delete conv;
2894 }
335d31e0 2895 }
5c4ed98d
DE
2896#endif // __DARWIN__
2897
e95354ec
VZ
2898 // step (2)
2899 wxFontEncoding enc = m_encoding;
2900#if wxUSE_FONTMAP
c547282d
VZ
2901 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2902 {
2903 // use "false" to suppress interactive dialogs -- we can be called from
2904 // anywhere and popping up a dialog from here is the last thing we want to
2905 // do
267e11c5 2906 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2907 }
e95354ec
VZ
2908#endif // wxUSE_FONTMAP
2909
2910 switch ( enc )
2911 {
2912 case wxFONTENCODING_UTF7:
2913 return new wxMBConvUTF7;
2914
2915 case wxFONTENCODING_UTF8:
2916 return new wxMBConvUTF8;
2917
e95354ec
VZ
2918 case wxFONTENCODING_UTF16BE:
2919 return new wxMBConvUTF16BE;
2920
2921 case wxFONTENCODING_UTF16LE:
2922 return new wxMBConvUTF16LE;
2923
e95354ec
VZ
2924 case wxFONTENCODING_UTF32BE:
2925 return new wxMBConvUTF32BE;
2926
2927 case wxFONTENCODING_UTF32LE:
2928 return new wxMBConvUTF32LE;
2929
2930 default:
2931 // nothing to do but put here to suppress gcc warnings
ef199164 2932 break;
e95354ec
VZ
2933 }
2934
2935 // step (3)
2936#if wxUSE_FONTMAP
2937 {
2938 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2939 : new wxMBConv_wxwin(m_encoding);
2940 if ( conv->IsOk() )
2941 return conv;
2942
2943 delete conv;
2944 }
2945#endif // wxUSE_FONTMAP
2946
a58d4f4d
VS
2947 // NB: This is a hack to prevent deadlock. What could otherwise happen
2948 // in Unicode build: wxConvLocal creation ends up being here
2949 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
2950 // attach a timestamp, for which it will need wxConvLocal (to convert
2951 // time to char* and then wchar_t*), but that fails, tries to log the
2952 // error, but wxLog has an (already locked) critical section that
2953 // guards the static buffer.
a58d4f4d
VS
2954 static bool alreadyLoggingError = false;
2955 if (!alreadyLoggingError)
2956 {
2957 alreadyLoggingError = true;
2958 wxLogError(_("Cannot convert from the charset '%s'!"),
2959 m_name ? m_name
e95354ec
VZ
2960 :
2961#if wxUSE_FONTMAP
86501081 2962 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 2963#else // !wxUSE_FONTMAP
86501081 2964 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
2965#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2966 );
ef199164 2967
a58d4f4d
VS
2968 alreadyLoggingError = false;
2969 }
e95354ec
VZ
2970
2971 return NULL;
2972}
2973
2974void wxCSConv::CreateConvIfNeeded() const
2975{
2976 if ( m_deferred )
2977 {
2978 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 2979
bda3d86a
VZ
2980 // if we don't have neither the name nor the encoding, use the default
2981 // encoding for this system
2982 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2983 {
4c75209f 2984#if wxUSE_INTL
02c7347b 2985 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
2986#else
2987 // fallback to some reasonable default:
2988 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 2989#endif // wxUSE_INTL
4c75209f 2990 }
bda3d86a 2991
e95354ec
VZ
2992 self->m_convReal = DoCreate();
2993 self->m_deferred = false;
6001e347 2994 }
6001e347
RR
2995}
2996
0f0298b1
VZ
2997bool wxCSConv::IsOk() const
2998{
2999 CreateConvIfNeeded();
3000
3001 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3002 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3003 return true; // always ok as we do it ourselves
3004
3005 // m_convReal->IsOk() is called at its own creation, so we know it must
3006 // be ok if m_convReal is non-NULL
3007 return m_convReal != NULL;
3008}
3009
1c714a5d
VZ
3010size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3011 const char *src, size_t srcLen) const
3012{
3013 CreateConvIfNeeded();
3014
2c74c558
VS
3015 if (m_convReal)
3016 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3017
3018 // latin-1 (direct)
3019 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3020}
3021
3022size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3023 const wchar_t *src, size_t srcLen) const
3024{
3025 CreateConvIfNeeded();
3026
2c74c558
VS
3027 if (m_convReal)
3028 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3029
3030 // latin-1 (direct)
3031 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3032}
3033
6001e347
RR
3034size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3035{
e95354ec 3036 CreateConvIfNeeded();
dccce9ea 3037
e95354ec
VZ
3038 if (m_convReal)
3039 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3040
3041 // latin-1 (direct)
4def3b35 3042 size_t len = strlen(psz);
dccce9ea 3043
f1339c56
RR
3044 if (buf)
3045 {
4def3b35 3046 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3047 buf[c] = (unsigned char)(psz[c]);
3048 }
dccce9ea 3049
f1339c56 3050 return len;
6001e347
RR
3051}
3052
3053size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3054{
e95354ec 3055 CreateConvIfNeeded();
dccce9ea 3056
e95354ec
VZ
3057 if (m_convReal)
3058 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3059
f1339c56 3060 // latin-1 (direct)
f8d791e0 3061 const size_t len = wxWcslen(psz);
f1339c56
RR
3062 if (buf)
3063 {
4def3b35 3064 for (size_t c = 0; c <= len; c++)
24642831
VS
3065 {
3066 if (psz[c] > 0xFF)
467e0479 3067 return wxCONV_FAILED;
ef199164 3068
907173e5 3069 buf[c] = (char)psz[c];
24642831
VS
3070 }
3071 }
3072 else
3073 {
3074 for (size_t c = 0; c <= len; c++)
3075 {
3076 if (psz[c] > 0xFF)
467e0479 3077 return wxCONV_FAILED;
24642831 3078 }
f1339c56 3079 }
dccce9ea 3080
f1339c56 3081 return len;
6001e347
RR
3082}
3083
7ef3ab50 3084size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3085{
3086 CreateConvIfNeeded();
3087
3088 if ( m_convReal )
3089 {
7ef3ab50 3090 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3091 }
3092
ba98e032 3093 // otherwise, we are ISO-8859-1
c1464d9d 3094 return 1;
eec47cc6
VZ
3095}
3096
ba98e032
VS
3097#if wxUSE_UNICODE_UTF8
3098bool wxCSConv::IsUTF8() const
3099{
3100 CreateConvIfNeeded();
3101
3102 if ( m_convReal )
3103 {
3104 return m_convReal->IsUTF8();
3105 }
3106
3107 // otherwise, we are ISO-8859-1
3108 return false;
3109}
3110#endif
3111
69c928ef
VZ
3112
3113#if wxUSE_UNICODE
3114
3115wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3116{
3117 if ( !s )
3118 return wxWCharBuffer();
3119
3120 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3121 if ( !wbuf )
5487ff0f 3122 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3123 if ( !wbuf )
3124 wbuf = wxConvISO8859_1.cMB2WX(s);
3125
3126 return wbuf;
3127}
3128
3129wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3130{
3131 if ( !ws )
3132 return wxCharBuffer();
3133
3134 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3135 if ( !buf )
3136 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3137
3138 return buf;
3139}
3140
3141#endif // wxUSE_UNICODE
f5a1953b 3142
1e50d914
VS
3143// ----------------------------------------------------------------------------
3144// globals
3145// ----------------------------------------------------------------------------
3146
3147// NB: The reason why we create converted objects in this convoluted way,
3148// using a factory function instead of global variable, is that they
3149// may be used at static initialization time (some of them are used by
3150// wxString ctors and there may be a global wxString object). In other
3151// words, possibly _before_ the converter global object would be
3152// initialized.
3153
3154#undef wxConvLibc
3155#undef wxConvUTF8
3156#undef wxConvUTF7
3157#undef wxConvLocal
3158#undef wxConvISO8859_1
3159
// Define the machinery for one global converter called `name`:
//  - an exported pointer variable name##Ptr of type klass*, initialized to
//    NULL;
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args;
//  - a file-static pointer gs_##name##instance initialized by calling the
//    accessor, which forces the object to be created during static
//    initialization.
// The macro expansion deliberately lacks the trailing semicolon: the user
// supplies it at the point of use.
3160#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3161 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3162 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3163 { \
3164 static impl_klass name##Obj ctor_args; \
3165 return &name##Obj; \
3166 } \
3167 /* this ensures that all global converter objects are created */ \
3168 /* by the time static initialization is done, i.e. before any */ \
3169 /* thread is launched: */ \
3170 static klass* gs_##name##instance = wxGet_##name##Ptr()
3171
// Convenience form of WX_DEFINE_GLOBAL_CONV2 for the case where the exposed
// class and the implementation class are the same (klass is passed for both).
3172#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3173 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3174
// Instantiate the global converters using the macros above.
//
// wxConvLibc is exposed as a plain wxMBConv but implemented by a
// platform-dependent class: wxMBConv_win32 when __WINDOWS__ is defined,
// wxMBConvLibc otherwise.
3175#ifdef __WINDOWS__
3176 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3177#else
3178 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3179#endif
3180
// UTF-8 and UTF-7 converters are default-constructed (no ctor arguments).
0286d08d 3181WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3182WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3183
// wxCSConv-based converters, constructed from a font encoding constant.
3184WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3185WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3186
// Exported converter pointers: wxConvCurrent initially refers to the libc
// converter and wxConvUI to the wxConvLocal one. Both are obtained through
// the accessor functions rather than by naming the objects directly.
3187WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3188WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3189
6ac84a78
DE
// Converter used for file names: when __DARWIN__ is defined it points to a
// file-static wxMBConv_cf object constructed with wxFONTENCODING_UTF8,
// otherwise it points to the libc converter returned by
// wxGet_wxConvLibcPtr().
3190#ifdef __DARWIN__
3191// The xnu kernel always communicates file paths in decomposed UTF-8.
3192// WARNING: Are we sure that CFString's conversion will cause decomposition?
3193static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3194#endif
6ac84a78 3195
1e50d914 3196WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3197#ifdef __DARWIN__
1e50d914 3198 &wxConvMacUTF8DObj;
6ac84a78 3199#else // !__DARWIN__
1e50d914 3200 wxGet_wxConvLibcPtr();
6ac84a78 3201#endif // __DARWIN__/!__DARWIN__
1e50d914 3202
bde4baac
VZ
3203#else // !wxUSE_WCHAR_T
3204
1e50d914 3205// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3206// stand-ins in absence of wchar_t
// In this branch (wxUSE_WCHAR_T disabled) the four converters are defined as
// plain wxMBConv objects instead of going through the accessor-function
// machinery used in the wxUSE_WCHAR_T branch.
3207WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3208 wxConvISO8859_1,
3209 wxConvLocal,
3210 wxConvUTF8;
3211
3212#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T