]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
fix wxStringOutputStream to deal with NUL bytes correctly (incidentally fixes bug...
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
6001e347 47#ifdef __SALFORDC__
373658eb 48 #include <clib.h>
6001e347
RR
49#endif
50
b040e242 51#ifdef HAVE_ICONV
373658eb 52 #include <iconv.h>
b1d547eb 53 #include "wx/thread.h"
1cd52418 54#endif
1cd52418 55
373658eb
VZ
56#include "wx/encconv.h"
57#include "wx/fontmap.h"
58
5c4ed98d 59#ifdef __DARWIN__
e4dd1e19 60#include "wx/mac/corefoundation/private/strconv_cf.h"
5c4ed98d
DE
61#endif //def __DARWIN__
62
ef199164 63
ce6f8d6f
VZ
64#define TRACE_STRCONV _T("strconv")
65
467e0479
VZ
66// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
67// be 4 bytes
4948c2b6 68#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
69 #define WC_UTF16
70#endif
71
ef199164 72
373658eb
VZ
73// ============================================================================
74// implementation
75// ============================================================================
76
69373110
VZ
// helper of the conversion functions below: returns true if at least one of
// the n bytes starting at p is not NUL and false if they are all NUL (which
// includes the case of n == 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
85
373658eb 86// ----------------------------------------------------------------------------
467e0479 87// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 88// ----------------------------------------------------------------------------
6001e347 89
c91830cb 90static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 91{
ef199164 92 if (input <= 0xffff)
4def3b35 93 {
999836aa
VZ
94 if (output)
95 *output = (wxUint16) input;
ef199164 96
4def3b35 97 return 1;
dccce9ea 98 }
ef199164 99 else if (input >= 0x110000)
4def3b35 100 {
467e0479 101 return wxCONV_FAILED;
dccce9ea
VZ
102 }
103 else
4def3b35 104 {
dccce9ea 105 if (output)
4def3b35 106 {
ef199164
DS
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 109 }
ef199164 110
4def3b35 111 return 2;
1cd52418 112 }
1cd52418
OK
113}
114
c91830cb 115static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 116{
ef199164 117 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
118 {
119 output = *input;
120 return 1;
dccce9ea 121 }
ef199164 122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
123 {
124 output = *input;
467e0479 125 return wxCONV_FAILED;
dccce9ea
VZ
126 }
127 else
4def3b35
VS
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
1cd52418
OK
132}
133
467e0479 134#ifdef WC_UTF16
35d11700
VZ
135 typedef wchar_t wxDecodeSurrogate_t;
136#else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
139
140// returns the next UTF-32 character from the wchar_t buffer and advances the
141// pointer to the character after this one
142//
143// if an invalid character is found, *pSrc is set to NULL, the caller must
144// check for this
35d11700 145static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
146{
147 wxUint32 out;
8d3dd069
VZ
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156}
157
f6bcfd97 158// ----------------------------------------------------------------------------
6001e347 159// wxMBConv
f6bcfd97 160// ----------------------------------------------------------------------------
2c53a80a 161
483b0434
VZ
162size_t
163wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
6001e347 165{
483b0434
VZ
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
483b0434 212 for ( ;; )
eec47cc6 213 {
c1464d9d 214 // try to convert the current chunk
483b0434 215 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
e4e3bbb4 218
467e0479 219 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 220
483b0434 221 dstWritten += lenChunk;
f5fb6871 222
467e0479
VZ
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
483b0434
VZ
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
830f8f11 234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
c1464d9d 239
483b0434 240 if ( !srcEnd )
c1464d9d 241 {
467e0479
VZ
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
c1464d9d
VZ
244 break;
245 }
eec47cc6
VZ
246
247 // advance the input pointer past the end of this chunk
483b0434 248 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
483b0434 254 src += nulLen;
c1464d9d 255 }
e4e3bbb4 256
483b0434 257 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
483b0434 262 if ( src >= srcEnd )
c1464d9d
VZ
263 break;
264 }
265
483b0434 266 return dstWritten;
e4e3bbb4
RN
267}
268
483b0434
VZ
269size_t
270wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
e4e3bbb4 272{
483b0434
VZ
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
e4e3bbb4 275
eec47cc6
VZ
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
467e0479 282 if ( srcLen == wxNO_LEN )
e4e3bbb4 283 {
483b0434 284 srcLen = wxWcslen(src) + 1;
eec47cc6 285 }
483b0434 286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
287 {
288 // make a copy in order to properly NUL-terminate the string
483b0434 289 bufTmp = wxWCharBuffer(srcLen);
ef199164 290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
eec47cc6 318 }
e4e3bbb4 319
483b0434
VZ
320 return dstWritten;
321}
322
ef199164 323size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 324{
ef199164 325 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 326 if ( rc != wxCONV_FAILED )
509da451
VZ
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334}
335
ef199164 336size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 337{
ef199164 338 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 339 if ( rc != wxCONV_FAILED )
509da451
VZ
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345}
346
483b0434
VZ
347wxMBConv::~wxMBConv()
348{
349 // nothing to do here (necessary for Darwin linking probably)
350}
e4e3bbb4 351
483b0434
VZ
352const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353{
354 if ( psz )
eec47cc6 355 {
483b0434 356 // calculate the length of the buffer needed first
a2db25a1 357 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 358 if ( nLen != wxCONV_FAILED )
f5fb6871 359 {
483b0434 360 // now do the actual conversion
a2db25a1 361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 362
483b0434 363 // +1 for the trailing NULL
a2db25a1 364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 365 return buf;
f5fb6871 366 }
483b0434 367 }
e4e3bbb4 368
483b0434
VZ
369 return wxWCharBuffer();
370}
3698ae71 371
483b0434
VZ
372const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373{
374 if ( pwz )
375 {
a2db25a1 376 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 377 if ( nLen != wxCONV_FAILED )
483b0434 378 {
a2db25a1
VZ
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386}
e4e3bbb4 387
483b0434 388const wxWCharBuffer
ef199164 389wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 390{
ef199164 391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 392 if ( dstLen != wxCONV_FAILED )
483b0434 393 {
830f8f11 394 wxWCharBuffer wbuf(dstLen - 1);
ef199164 395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
396 {
397 if ( outLen )
467e0479
VZ
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
483b0434
VZ
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412}
413
414const wxCharBuffer
ef199164 415wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 416{
13d92ad6 417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 418 if ( dstLen != wxCONV_FAILED )
483b0434 419 {
168a76fe
VZ
420 // special case of empty input: can't allocate 0 size buffer below as
421 // wxCharBuffer insists on NUL-terminating it
422 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
ef199164 423 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
424 {
425 if ( outLen )
467e0479
VZ
426 {
427 *outLen = dstLen;
428
429 const size_t nulLen = GetMBNulLen();
13d92ad6
VZ
430 if ( dstLen >= nulLen &&
431 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
467e0479
VZ
432 {
433 // in this case the output is NUL-terminated and we're not
434 // supposed to count NUL
13d92ad6 435 *outLen -= nulLen;
467e0479
VZ
436 }
437 }
d32a507d 438
483b0434
VZ
439 return buf;
440 }
e4e3bbb4
RN
441 }
442
eec47cc6
VZ
443 if ( outLen )
444 *outLen = 0;
445
446 return wxCharBuffer();
e4e3bbb4
RN
447}
448
6001e347 449// ----------------------------------------------------------------------------
bde4baac 450// wxMBConvLibc
6001e347
RR
451// ----------------------------------------------------------------------------
452
bde4baac
VZ
453size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
454{
455 return wxMB2WC(buf, psz, n);
456}
457
458size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
459{
460 return wxWC2MB(buf, psz, n);
461}
e1bfe89e
RR
462
463// ----------------------------------------------------------------------------
532d575b 464// wxConvBrokenFileNames
e1bfe89e
RR
465// ----------------------------------------------------------------------------
466
eec47cc6
VZ
467#ifdef __UNIX__
468
86501081 469wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 470{
86501081
VS
471 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
472 wxStricmp(charset, _T("UTF8")) == 0 )
5deedd6e 473 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
474 else
475 m_conv = new wxCSConv(charset);
ea8ce907
RR
476}
477
eec47cc6 478#endif // __UNIX__
c12b7f79 479
bde4baac 480// ----------------------------------------------------------------------------
3698ae71 481// UTF-7
bde4baac 482// ----------------------------------------------------------------------------
6001e347 483
15f2ee32 484// Implementation (C) 2004 Fredrik Roubert
6001e347 485
15f2ee32
RN
//
// BASE64 decoding table: maps a byte to the 6 bit value it encodes or to 0xff
// if it is not one of the characters of the BASE64 alphabet
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: only '+' (0x2b) and '/' (0x2f) are valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: the digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: nothing is valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
524
525size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
526{
15f2ee32
RN
527 size_t len = 0;
528
04a37834 529 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
530 {
531 unsigned char cc = *psz++;
532 if (cc != '+')
533 {
534 // plain ASCII char
535 if (buf)
536 *buf++ = cc;
537 len++;
538 }
539 else if (*psz == '-')
540 {
541 // encoded plus sign
542 if (buf)
543 *buf++ = cc;
544 len++;
545 psz++;
546 }
04a37834 547 else // start of BASE64 encoded string
15f2ee32 548 {
04a37834 549 bool lsb, ok;
15f2ee32 550 unsigned int d, l;
04a37834
VZ
551 for ( ok = lsb = false, d = 0, l = 0;
552 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
553 psz++ )
15f2ee32
RN
554 {
555 d <<= 6;
556 d += cc;
557 for (l += 6; l >= 8; lsb = !lsb)
558 {
04a37834 559 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
560 if (lsb)
561 {
562 if (buf)
563 *buf++ |= c;
564 len ++;
565 }
566 else
04a37834 567 {
15f2ee32 568 if (buf)
6356d52a 569 *buf = (wchar_t)(c << 8);
04a37834
VZ
570 }
571
572 ok = true;
15f2ee32
RN
573 }
574 }
04a37834
VZ
575
576 if ( !ok )
577 {
578 // in valid UTF7 we should have valid characters after '+'
467e0479 579 return wxCONV_FAILED;
04a37834
VZ
580 }
581
15f2ee32
RN
582 if (*psz == '-')
583 psz++;
584 }
585 }
04a37834
VZ
586
587 if ( buf && (len < n) )
588 *buf = '\0';
589
15f2ee32 590 return len;
6001e347
RR
591}
592
15f2ee32
RN
//
// BASE64 encoding table: maps a 6 bit value to the character encoding it
//
static const unsigned char utf7enb64[] =
{
    // 0..15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16..31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32..47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48..63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
607
//
// UTF-7 encoding table giving the class of each of the ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,   3, 2, 2, 3, 3, 2, 3, 3,   // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,   // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0,   0, 0, 1, 3, 0, 0, 0, 3,   // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 1, 1, 1, 0,   // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 3, 1, 1, 1,   // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 1, 1, 3, 3    // 0x70..0x7f
};
627
667e5b3e 628size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 629{
15f2ee32
RN
630 size_t len = 0;
631
632 while (*psz && ((!buf) || (len < n)))
633 {
634 wchar_t cc = *psz++;
635 if (cc < 0x80 && utf7encode[cc] < 1)
636 {
637 // plain ASCII char
638 if (buf)
639 *buf++ = (char)cc;
ef199164 640
15f2ee32
RN
641 len++;
642 }
643#ifndef WC_UTF16
79c78d42 644 else if (((wxUint32)cc) > 0xffff)
b2c13097 645 {
15f2ee32 646 // no surrogate pair generation (yet?)
467e0479 647 return wxCONV_FAILED;
15f2ee32
RN
648 }
649#endif
650 else
651 {
652 if (buf)
653 *buf++ = '+';
ef199164 654
15f2ee32
RN
655 len++;
656 if (cc != '+')
657 {
658 // BASE64 encode string
659 unsigned int lsb, d, l;
73c902d6 660 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
661 {
662 for (lsb = 0; lsb < 2; lsb ++)
663 {
664 d <<= 8;
665 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
666
667 for (l += 8; l >= 6; )
668 {
669 l -= 6;
670 if (buf)
671 *buf++ = utf7enb64[(d >> l) % 64];
672 len++;
673 }
674 }
ef199164 675
15f2ee32
RN
676 cc = *psz;
677 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
678 break;
679 }
ef199164 680
15f2ee32
RN
681 if (l != 0)
682 {
683 if (buf)
684 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
ef199164 685
15f2ee32
RN
686 len++;
687 }
688 }
ef199164 689
15f2ee32
RN
690 if (buf)
691 *buf++ = '-';
692 len++;
693 }
694 }
ef199164 695
15f2ee32
RN
696 if (buf && (len < n))
697 *buf = 0;
ef199164 698
15f2ee32 699 return len;
6001e347
RR
700}
701
f6bcfd97 702// ----------------------------------------------------------------------------
6001e347 703// UTF-8
f6bcfd97 704// ----------------------------------------------------------------------------
6001e347 705
1774c3c5 706static const wxUint32 utf8_max[]=
4def3b35 707 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 708
3698ae71
VZ
709// boundaries of the private use area we use to (temporarily) remap invalid
710// characters invalid in a UTF-8 encoded string
ea8ce907
RR
711const wxUint32 wxUnicodePUA = 0x100000;
712const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
713
// this table gives the length of the UTF-8 sequence starting with the given
// byte or 0 if this byte can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: continuation bytes, invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    // C0..DF: C0 and C1 are invalid, the rest starts two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4: four-byte sequences
    4, 4, 4, 4, 4,

    // F5..FF: invalid again (5- or 6-byte sequences and sequences for code
    // points above U+10FFFF, as restricted by RFC 3629)
    0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
748
749size_t
750wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
751 const char *src, size_t srcLen) const
752{
753 wchar_t *out = dstLen ? dst : NULL;
754 size_t written = 0;
755
756 if ( srcLen == wxNO_LEN )
757 srcLen = strlen(src) + 1;
758
759 for ( const char *p = src; ; p++ )
760 {
761 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
762 {
763 // all done successfully, just add the trailing NULL if we are not
764 // using explicit length
765 if ( srcLen == wxNO_LEN )
766 {
767 if ( out )
768 {
769 if ( !dstLen )
770 break;
771
772 *out = L'\0';
773 }
774
775 written++;
776 }
777
778 return written;
779 }
780
0286d08d
VZ
781 if ( out && !dstLen-- )
782 break;
783
5367a38a
VS
784 wxUint32 code;
785 unsigned char c = *p;
0286d08d 786
5367a38a
VS
787 if ( c < 0x80 )
788 {
789 if ( srcLen == 0 ) // the test works for wxNO_LEN too
790 break;
0286d08d 791
5367a38a
VS
792 if ( srcLen != wxNO_LEN )
793 srcLen--;
0286d08d 794
5367a38a
VS
795 code = c;
796 }
797 else
0286d08d 798 {
5367a38a
VS
799 unsigned len = tableUtf8Lengths[c];
800 if ( !len )
801 break;
802
803 if ( srcLen < len ) // the test works for wxNO_LEN too
804 break;
805
806 if ( srcLen != wxNO_LEN )
807 srcLen -= len;
808
809 // Char. number range | UTF-8 octet sequence
810 // (hexadecimal) | (binary)
811 // ----------------------+----------------------------------------
812 // 0000 0000 - 0000 007F | 0xxxxxxx
813 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
814 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
815 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
816 //
817 // Code point value is stored in bits marked with 'x',
818 // lowest-order bit of the value on the right side in the diagram
819 // above. (from RFC 3629)
820
821 // mask to extract lead byte's value ('x' bits above), by sequence
822 // length:
823 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
824
825 // mask and value of lead byte's most significant bits, by length:
826 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
827 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
828
829 len--; // it's more convenient to work with 0-based length here
830
831 // extract the lead byte's value bits:
832 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
833 break;
834
835 code = c & leadValueMask[len];
836
837 // all remaining bytes, if any, are handled in the same way
838 // regardless of sequence's length:
839 for ( ; len; --len )
840 {
841 c = *++p;
842 if ( (c & 0xC0) != 0x80 )
843 return wxCONV_FAILED;
0286d08d 844
5367a38a
VS
845 code <<= 6;
846 code |= c & 0x3F;
847 }
0286d08d
VZ
848 }
849
850#ifdef WC_UTF16
851 // cast is ok because wchar_t == wxUint16 if WC_UTF16
852 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
853 {
854 if ( out )
855 out++;
856 written++;
857 }
858#else // !WC_UTF16
859 if ( out )
860 *out = code;
861#endif // WC_UTF16/!WC_UTF16
862
863 if ( out )
864 out++;
865
866 written++;
867 }
868
869 return wxCONV_FAILED;
870}
871
872size_t
873wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
874 const wchar_t *src, size_t srcLen) const
875{
876 char *out = dstLen ? dst : NULL;
877 size_t written = 0;
878
879 for ( const wchar_t *wp = src; ; wp++ )
880 {
881 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
882 {
883 // all done successfully, just add the trailing NULL if we are not
884 // using explicit length
885 if ( srcLen == wxNO_LEN )
886 {
887 if ( out )
888 {
889 if ( !dstLen )
890 break;
891
892 *out = '\0';
893 }
894
895 written++;
896 }
897
898 return written;
899 }
900
901
902 wxUint32 code;
903#ifdef WC_UTF16
904 // cast is ok for WC_UTF16
905 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
906 {
907 // skip the next char too as we decoded a surrogate
908 wp++;
909 }
910#else // wchar_t is UTF-32
911 code = *wp & 0x7fffffff;
912#endif
913
914 unsigned len;
915 if ( code <= 0x7F )
916 {
917 len = 1;
918 if ( out )
919 {
920 if ( dstLen < len )
921 break;
922
923 out[0] = (char)code;
924 }
925 }
926 else if ( code <= 0x07FF )
927 {
928 len = 2;
929 if ( out )
930 {
931 if ( dstLen < len )
932 break;
933
934 // NB: this line takes 6 least significant bits, encodes them as
935 // 10xxxxxx and discards them so that the next byte can be encoded:
936 out[1] = 0x80 | (code & 0x3F); code >>= 6;
937 out[0] = 0xC0 | code;
938 }
939 }
940 else if ( code < 0xFFFF )
941 {
942 len = 3;
943 if ( out )
944 {
945 if ( dstLen < len )
946 break;
947
948 out[2] = 0x80 | (code & 0x3F); code >>= 6;
949 out[1] = 0x80 | (code & 0x3F); code >>= 6;
950 out[0] = 0xE0 | code;
951 }
952 }
953 else if ( code <= 0x10FFFF )
954 {
955 len = 4;
956 if ( out )
957 {
958 if ( dstLen < len )
959 break;
960
961 out[3] = 0x80 | (code & 0x3F); code >>= 6;
962 out[2] = 0x80 | (code & 0x3F); code >>= 6;
963 out[1] = 0x80 | (code & 0x3F); code >>= 6;
964 out[0] = 0xF0 | code;
965 }
966 }
967 else
968 {
969 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
970 break;
971 }
972
973 if ( out )
974 {
975 out += len;
976 dstLen -= len;
977 }
978
979 written += len;
980 }
981
982 // we only get here if an error occurs during decoding
983 return wxCONV_FAILED;
984}
985
d16d0917
VZ
986size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
987 const char *psz, size_t srcLen) const
6001e347 988{
0286d08d 989 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 990 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 991
4def3b35
VS
992 size_t len = 0;
993
d16d0917 994 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 995 {
ea8ce907
RR
996 const char *opsz = psz;
997 bool invalid = false;
4def3b35
VS
998 unsigned char cc = *psz++, fc = cc;
999 unsigned cnt;
dccce9ea 1000 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1001 fc <<= 1;
ef199164 1002
dccce9ea 1003 if (!cnt)
4def3b35
VS
1004 {
1005 // plain ASCII char
dccce9ea 1006 if (buf)
4def3b35
VS
1007 *buf++ = cc;
1008 len++;
561488ef
MW
1009
1010 // escape the escape character for octal escapes
1011 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1012 && cc == '\\' && (!buf || len < n))
1013 {
1014 if (buf)
1015 *buf++ = cc;
1016 len++;
1017 }
dccce9ea
VZ
1018 }
1019 else
4def3b35
VS
1020 {
1021 cnt--;
dccce9ea 1022 if (!cnt)
4def3b35
VS
1023 {
1024 // invalid UTF-8 sequence
ea8ce907 1025 invalid = true;
dccce9ea
VZ
1026 }
1027 else
4def3b35
VS
1028 {
1029 unsigned ocnt = cnt - 1;
1030 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1031 while (cnt--)
4def3b35 1032 {
ea8ce907 1033 cc = *psz;
dccce9ea 1034 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1035 {
1036 // invalid UTF-8 sequence
ea8ce907
RR
1037 invalid = true;
1038 break;
4def3b35 1039 }
ef199164 1040
ea8ce907 1041 psz++;
4def3b35
VS
1042 res = (res << 6) | (cc & 0x3f);
1043 }
ef199164 1044
ea8ce907 1045 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1046 {
1047 // illegal UTF-8 encoding
ea8ce907 1048 invalid = true;
4def3b35 1049 }
ea8ce907
RR
1050 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1051 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1052 {
1053 // if one of our PUA characters turns up externally
1054 // it must also be treated as an illegal sequence
1055 // (a bit like you have to escape an escape character)
1056 invalid = true;
1057 }
1058 else
1059 {
1cd52418 1060#ifdef WC_UTF16
0286d08d 1061 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1062 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1063 if (pa == wxCONV_FAILED)
ea8ce907
RR
1064 {
1065 invalid = true;
1066 }
1067 else
1068 {
1069 if (buf)
1070 buf += pa;
1071 len += pa;
1072 }
373658eb 1073#else // !WC_UTF16
ea8ce907 1074 if (buf)
38d4b1e4 1075 *buf++ = (wchar_t)res;
ea8ce907 1076 len++;
373658eb 1077#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1078 }
1079 }
ef199164 1080
ea8ce907
RR
1081 if (invalid)
1082 {
1083 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1084 {
1085 while (opsz < psz && (!buf || len < n))
1086 {
1087#ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1089 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1090 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1091 if (buf)
1092 buf += pa;
1093 opsz++;
1094 len += pa;
1095#else
1096 if (buf)
38d4b1e4 1097 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1098 opsz++;
1099 len++;
1100#endif
1101 }
1102 }
3698ae71 1103 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1104 {
1105 while (opsz < psz && (!buf || len < n))
1106 {
3698ae71
VZ
1107 if ( buf && len + 3 < n )
1108 {
17a1ebd1 1109 unsigned char on = *opsz;
3698ae71 1110 *buf++ = L'\\';
17a1ebd1
VZ
1111 *buf++ = (wchar_t)( L'0' + on / 0100 );
1112 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1113 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1114 }
ef199164 1115
ea8ce907
RR
1116 opsz++;
1117 len += 4;
1118 }
1119 }
3698ae71 1120 else // MAP_INVALID_UTF8_NOT
ea8ce907 1121 {
467e0479 1122 return wxCONV_FAILED;
ea8ce907 1123 }
4def3b35
VS
1124 }
1125 }
6001e347 1126 }
ef199164 1127
d16d0917 1128 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1129 *buf = 0;
ef199164 1130
d16d0917 1131 return len + 1;
6001e347
RR
1132}
1133
3698ae71
VZ
// Tell whether the given wide character is an octal digit, i.e. '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1138
d16d0917
VZ
1139size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1140 const wchar_t *psz, size_t srcLen) const
6001e347 1141{
0286d08d 1142 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1143 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1144
4def3b35 1145 size_t len = 0;
6001e347 1146
d16d0917 1147 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1148 {
1149 wxUint32 cc;
ef199164 1150
1cd52418 1151#ifdef WC_UTF16
b5153fd8
VZ
1152 // cast is ok for WC_UTF16
1153 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1154 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1155#else
ef199164 1156 cc = (*psz++) & 0x7fffffff;
4def3b35 1157#endif
3698ae71
VZ
1158
1159 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1160 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1161 {
dccce9ea 1162 if (buf)
ea8ce907 1163 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1164 len++;
3698ae71 1165 }
561488ef
MW
1166 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1167 && cc == L'\\' && psz[0] == L'\\' )
1168 {
1169 if (buf)
1170 *buf++ = (char)cc;
1171 psz++;
1172 len++;
1173 }
3698ae71
VZ
1174 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1175 cc == L'\\' &&
1176 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1177 {
dccce9ea 1178 if (buf)
3698ae71 1179 {
ef199164
DS
1180 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1181 (psz[1] - L'0') * 010 +
b2c13097 1182 (psz[2] - L'0'));
3698ae71
VZ
1183 }
1184
1185 psz += 3;
ea8ce907
RR
1186 len++;
1187 }
1188 else
1189 {
1190 unsigned cnt;
ef199164
DS
1191 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1192 {
1193 }
1194
ea8ce907 1195 if (!cnt)
4def3b35 1196 {
ea8ce907
RR
1197 // plain ASCII char
1198 if (buf)
1199 *buf++ = (char) cc;
1200 len++;
1201 }
ea8ce907
RR
1202 else
1203 {
1204 len += cnt + 1;
1205 if (buf)
1206 {
1207 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1208 while (cnt--)
1209 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1210 }
4def3b35
VS
1211 }
1212 }
6001e347 1213 }
4def3b35 1214
d16d0917 1215 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1216 *buf = 0;
adb45366 1217
d16d0917 1218 return len + 1;
6001e347
RR
1219}
1220
467e0479 1221// ============================================================================
c91830cb 1222// UTF-16
467e0479 1223// ============================================================================
c91830cb
VZ
1224
1225#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1226 #define wxMBConvUTF16straight wxMBConvUTF16BE
1227 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1228#else
bde4baac
VZ
1229 #define wxMBConvUTF16swap wxMBConvUTF16BE
1230 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1231#endif
1232
467e0479
VZ
1233/* static */
1234size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1235{
1236 if ( srcLen == wxNO_LEN )
1237 {
1238 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1239 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1240 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1241 ;
c91830cb 1242
467e0479
VZ
1243 srcLen *= BYTES_PER_CHAR;
1244 }
1245 else // we already have the length
1246 {
1247 // we can only convert an entire number of UTF-16 characters
1248 if ( srcLen % BYTES_PER_CHAR )
1249 return wxCONV_FAILED;
1250 }
1251
1252 return srcLen;
1253}
1254
1255// case when in-memory representation is UTF-16 too
c91830cb
VZ
1256#ifdef WC_UTF16
1257
467e0479
VZ
1258// ----------------------------------------------------------------------------
1259// conversions without endianness change
1260// ----------------------------------------------------------------------------
1261
1262size_t
1263wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1264 const char *src, size_t srcLen) const
c91830cb 1265{
467e0479
VZ
1266 // set up the scene for using memcpy() (which is presumably more efficient
1267 // than copying the bytes one by one)
1268 srcLen = GetLength(src, srcLen);
1269 if ( srcLen == wxNO_LEN )
1270 return wxCONV_FAILED;
c91830cb 1271
ef199164 1272 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1273 if ( dst )
c91830cb 1274 {
467e0479
VZ
1275 if ( dstLen < inLen )
1276 return wxCONV_FAILED;
c91830cb 1277
467e0479 1278 memcpy(dst, src, srcLen);
c91830cb 1279 }
d32a507d 1280
467e0479 1281 return inLen;
c91830cb
VZ
1282}
1283
467e0479
VZ
1284size_t
1285wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1286 const wchar_t *src, size_t srcLen) const
c91830cb 1287{
467e0479
VZ
1288 if ( srcLen == wxNO_LEN )
1289 srcLen = wxWcslen(src) + 1;
c91830cb 1290
467e0479
VZ
1291 srcLen *= BYTES_PER_CHAR;
1292
1293 if ( dst )
c91830cb 1294 {
467e0479
VZ
1295 if ( dstLen < srcLen )
1296 return wxCONV_FAILED;
d32a507d 1297
467e0479 1298 memcpy(dst, src, srcLen);
c91830cb 1299 }
d32a507d 1300
467e0479 1301 return srcLen;
c91830cb
VZ
1302}
1303
467e0479
VZ
1304// ----------------------------------------------------------------------------
1305// endian-reversing conversions
1306// ----------------------------------------------------------------------------
c91830cb 1307
467e0479
VZ
1308size_t
1309wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1310 const char *src, size_t srcLen) const
c91830cb 1311{
467e0479
VZ
1312 srcLen = GetLength(src, srcLen);
1313 if ( srcLen == wxNO_LEN )
1314 return wxCONV_FAILED;
c91830cb 1315
467e0479
VZ
1316 srcLen /= BYTES_PER_CHAR;
1317
1318 if ( dst )
c91830cb 1319 {
467e0479
VZ
1320 if ( dstLen < srcLen )
1321 return wxCONV_FAILED;
1322
ef199164
DS
1323 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1324 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1325 {
ef199164 1326 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1327 }
c91830cb 1328 }
bfab25d4 1329
467e0479 1330 return srcLen;
c91830cb
VZ
1331}
1332
467e0479
VZ
1333size_t
1334wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1335 const wchar_t *src, size_t srcLen) const
c91830cb 1336{
467e0479
VZ
1337 if ( srcLen == wxNO_LEN )
1338 srcLen = wxWcslen(src) + 1;
c91830cb 1339
467e0479
VZ
1340 srcLen *= BYTES_PER_CHAR;
1341
1342 if ( dst )
c91830cb 1343 {
467e0479
VZ
1344 if ( dstLen < srcLen )
1345 return wxCONV_FAILED;
1346
ef199164 1347 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1348 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1349 {
ef199164 1350 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1351 }
c91830cb 1352 }
eec47cc6 1353
467e0479 1354 return srcLen;
c91830cb
VZ
1355}
1356
467e0479 1357#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1358
467e0479
VZ
1359// ----------------------------------------------------------------------------
1360// conversions without endianness change
1361// ----------------------------------------------------------------------------
c91830cb 1362
35d11700
VZ
1363size_t
1364wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1365 const char *src, size_t srcLen) const
c91830cb 1366{
35d11700
VZ
1367 srcLen = GetLength(src, srcLen);
1368 if ( srcLen == wxNO_LEN )
1369 return wxCONV_FAILED;
c91830cb 1370
ef199164 1371 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1372 if ( !dst )
c91830cb 1373 {
35d11700
VZ
1374 // optimization: return maximal space which could be needed for this
1375 // string even if the real size could be smaller if the buffer contains
1376 // any surrogates
1377 return inLen;
c91830cb 1378 }
c91830cb 1379
35d11700 1380 size_t outLen = 0;
ef199164
DS
1381 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1382 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1383 {
ef199164
DS
1384 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1385 if ( !inBuff )
35d11700
VZ
1386 return wxCONV_FAILED;
1387
1388 if ( ++outLen > dstLen )
1389 return wxCONV_FAILED;
c91830cb 1390
35d11700
VZ
1391 *dst++ = ch;
1392 }
1393
1394
1395 return outLen;
1396}
c91830cb 1397
35d11700
VZ
1398size_t
1399wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1400 const wchar_t *src, size_t srcLen) const
c91830cb 1401{
35d11700
VZ
1402 if ( srcLen == wxNO_LEN )
1403 srcLen = wxWcslen(src) + 1;
c91830cb 1404
35d11700 1405 size_t outLen = 0;
ef199164 1406 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1407 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1408 {
1409 wxUint16 cc[2];
35d11700
VZ
1410 const size_t numChars = encode_utf16(*src++, cc);
1411 if ( numChars == wxCONV_FAILED )
1412 return wxCONV_FAILED;
c91830cb 1413
ef199164
DS
1414 outLen += numChars * BYTES_PER_CHAR;
1415 if ( outBuff )
c91830cb 1416 {
35d11700
VZ
1417 if ( outLen > dstLen )
1418 return wxCONV_FAILED;
1419
ef199164 1420 *outBuff++ = cc[0];
35d11700 1421 if ( numChars == 2 )
69b80d28 1422 {
35d11700 1423 // second character of a surrogate
ef199164 1424 *outBuff++ = cc[1];
69b80d28 1425 }
c91830cb 1426 }
c91830cb 1427 }
c91830cb 1428
35d11700 1429 return outLen;
c91830cb
VZ
1430}
1431
467e0479
VZ
1432// ----------------------------------------------------------------------------
1433// endian-reversing conversions
1434// ----------------------------------------------------------------------------
c91830cb 1435
35d11700
VZ
1436size_t
1437wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1438 const char *src, size_t srcLen) const
c91830cb 1439{
35d11700
VZ
1440 srcLen = GetLength(src, srcLen);
1441 if ( srcLen == wxNO_LEN )
1442 return wxCONV_FAILED;
1443
ef199164 1444 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1445 if ( !dst )
1446 {
1447 // optimization: return maximal space which could be needed for this
1448 // string even if the real size could be smaller if the buffer contains
1449 // any surrogates
1450 return inLen;
1451 }
c91830cb 1452
35d11700 1453 size_t outLen = 0;
ef199164
DS
1454 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1455 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1456 {
35d11700
VZ
1457 wxUint32 ch;
1458 wxUint16 tmp[2];
ef199164
DS
1459
1460 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1461 inBuff++;
1462 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1463
35d11700
VZ
1464 const size_t numChars = decode_utf16(tmp, ch);
1465 if ( numChars == wxCONV_FAILED )
1466 return wxCONV_FAILED;
c91830cb 1467
35d11700 1468 if ( numChars == 2 )
ef199164 1469 inBuff++;
35d11700
VZ
1470
1471 if ( ++outLen > dstLen )
1472 return wxCONV_FAILED;
c91830cb 1473
35d11700 1474 *dst++ = ch;
c91830cb 1475 }
c91830cb 1476
c91830cb 1477
35d11700
VZ
1478 return outLen;
1479}
c91830cb 1480
35d11700
VZ
1481size_t
1482wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1483 const wchar_t *src, size_t srcLen) const
c91830cb 1484{
35d11700
VZ
1485 if ( srcLen == wxNO_LEN )
1486 srcLen = wxWcslen(src) + 1;
c91830cb 1487
35d11700 1488 size_t outLen = 0;
ef199164 1489 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1490 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1491 {
1492 wxUint16 cc[2];
35d11700
VZ
1493 const size_t numChars = encode_utf16(*src, cc);
1494 if ( numChars == wxCONV_FAILED )
1495 return wxCONV_FAILED;
c91830cb 1496
ef199164
DS
1497 outLen += numChars * BYTES_PER_CHAR;
1498 if ( outBuff )
c91830cb 1499 {
35d11700
VZ
1500 if ( outLen > dstLen )
1501 return wxCONV_FAILED;
1502
ef199164 1503 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1504 if ( numChars == 2 )
c91830cb 1505 {
35d11700 1506 // second character of a surrogate
ef199164 1507 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1508 }
1509 }
c91830cb 1510 }
c91830cb 1511
35d11700 1512 return outLen;
c91830cb
VZ
1513}
1514
467e0479 1515#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1516
1517
35d11700 1518// ============================================================================
c91830cb 1519// UTF-32
35d11700 1520// ============================================================================
c91830cb
VZ
1521
1522#ifdef WORDS_BIGENDIAN
467e0479
VZ
1523 #define wxMBConvUTF32straight wxMBConvUTF32BE
1524 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1525#else
467e0479
VZ
1526 #define wxMBConvUTF32swap wxMBConvUTF32BE
1527 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1528#endif
1529
1530
1531WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1532WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1533
467e0479
VZ
1534/* static */
1535size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1536{
1537 if ( srcLen == wxNO_LEN )
1538 {
1539 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1540 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1541 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1542 ;
c91830cb 1543
467e0479
VZ
1544 srcLen *= BYTES_PER_CHAR;
1545 }
1546 else // we already have the length
1547 {
1548 // we can only convert an entire number of UTF-32 characters
1549 if ( srcLen % BYTES_PER_CHAR )
1550 return wxCONV_FAILED;
1551 }
1552
1553 return srcLen;
1554}
1555
1556// case when in-memory representation is UTF-16
c91830cb
VZ
1557#ifdef WC_UTF16
1558
467e0479
VZ
1559// ----------------------------------------------------------------------------
1560// conversions without endianness change
1561// ----------------------------------------------------------------------------
1562
1563size_t
1564wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1565 const char *src, size_t srcLen) const
c91830cb 1566{
467e0479
VZ
1567 srcLen = GetLength(src, srcLen);
1568 if ( srcLen == wxNO_LEN )
1569 return wxCONV_FAILED;
c91830cb 1570
ef199164
DS
1571 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1572 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1573 size_t outLen = 0;
1574 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1575 {
1576 wxUint16 cc[2];
ef199164 1577 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1578 if ( numChars == wxCONV_FAILED )
1579 return wxCONV_FAILED;
c91830cb 1580
467e0479
VZ
1581 outLen += numChars;
1582 if ( dst )
c91830cb 1583 {
467e0479
VZ
1584 if ( outLen > dstLen )
1585 return wxCONV_FAILED;
d32a507d 1586
467e0479
VZ
1587 *dst++ = cc[0];
1588 if ( numChars == 2 )
1589 {
1590 // second character of a surrogate
1591 *dst++ = cc[1];
1592 }
1593 }
c91830cb 1594 }
d32a507d 1595
467e0479 1596 return outLen;
c91830cb
VZ
1597}
1598
467e0479
VZ
1599size_t
1600wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1601 const wchar_t *src, size_t srcLen) const
c91830cb 1602{
467e0479
VZ
1603 if ( srcLen == wxNO_LEN )
1604 srcLen = wxWcslen(src) + 1;
c91830cb 1605
467e0479 1606 if ( !dst )
c91830cb 1607 {
467e0479
VZ
1608 // optimization: return maximal space which could be needed for this
1609 // string instead of the exact amount which could be less if there are
1610 // any surrogates in the input
1611 //
1612 // we consider that surrogates are rare enough to make it worthwhile to
1613 // avoid running the loop below at the cost of slightly extra memory
1614 // consumption
ef199164 1615 return srcLen * BYTES_PER_CHAR;
467e0479 1616 }
c91830cb 1617
ef199164 1618 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1619 size_t outLen = 0;
1620 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1621 {
1622 const wxUint32 ch = wxDecodeSurrogate(&src);
1623 if ( !src )
1624 return wxCONV_FAILED;
c91830cb 1625
467e0479 1626 outLen += BYTES_PER_CHAR;
d32a507d 1627
467e0479
VZ
1628 if ( outLen > dstLen )
1629 return wxCONV_FAILED;
b5153fd8 1630
ef199164 1631 *outBuff++ = ch;
467e0479 1632 }
c91830cb 1633
467e0479 1634 return outLen;
c91830cb
VZ
1635}
1636
467e0479
VZ
1637// ----------------------------------------------------------------------------
1638// endian-reversing conversions
1639// ----------------------------------------------------------------------------
c91830cb 1640
467e0479
VZ
1641size_t
1642wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1643 const char *src, size_t srcLen) const
c91830cb 1644{
467e0479
VZ
1645 srcLen = GetLength(src, srcLen);
1646 if ( srcLen == wxNO_LEN )
1647 return wxCONV_FAILED;
c91830cb 1648
ef199164
DS
1649 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1650 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1651 size_t outLen = 0;
ef199164 1652 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1653 {
c91830cb 1654 wxUint16 cc[2];
ef199164 1655 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1656 if ( numChars == wxCONV_FAILED )
1657 return wxCONV_FAILED;
c91830cb 1658
467e0479
VZ
1659 outLen += numChars;
1660 if ( dst )
c91830cb 1661 {
467e0479
VZ
1662 if ( outLen > dstLen )
1663 return wxCONV_FAILED;
d32a507d 1664
467e0479
VZ
1665 *dst++ = cc[0];
1666 if ( numChars == 2 )
1667 {
1668 // second character of a surrogate
1669 *dst++ = cc[1];
1670 }
1671 }
c91830cb 1672 }
b5153fd8 1673
467e0479 1674 return outLen;
c91830cb
VZ
1675}
1676
467e0479
VZ
1677size_t
1678wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1679 const wchar_t *src, size_t srcLen) const
c91830cb 1680{
467e0479
VZ
1681 if ( srcLen == wxNO_LEN )
1682 srcLen = wxWcslen(src) + 1;
c91830cb 1683
467e0479 1684 if ( !dst )
c91830cb 1685 {
467e0479
VZ
1686 // optimization: return maximal space which could be needed for this
1687 // string instead of the exact amount which could be less if there are
1688 // any surrogates in the input
1689 //
1690 // we consider that surrogates are rare enough to make it worthwhile to
1691 // avoid running the loop below at the cost of slightly extra memory
1692 // consumption
1693 return srcLen*BYTES_PER_CHAR;
1694 }
c91830cb 1695
ef199164 1696 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1697 size_t outLen = 0;
1698 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1699 {
1700 const wxUint32 ch = wxDecodeSurrogate(&src);
1701 if ( !src )
1702 return wxCONV_FAILED;
c91830cb 1703
467e0479 1704 outLen += BYTES_PER_CHAR;
d32a507d 1705
467e0479
VZ
1706 if ( outLen > dstLen )
1707 return wxCONV_FAILED;
b5153fd8 1708
ef199164 1709 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1710 }
c91830cb 1711
467e0479 1712 return outLen;
c91830cb
VZ
1713}
1714
467e0479 1715#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1716
35d11700
VZ
1717// ----------------------------------------------------------------------------
1718// conversions without endianness change
1719// ----------------------------------------------------------------------------
1720
1721size_t
1722wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1723 const char *src, size_t srcLen) const
c91830cb 1724{
35d11700
VZ
1725 // use memcpy() as it should be much faster than hand-written loop
1726 srcLen = GetLength(src, srcLen);
1727 if ( srcLen == wxNO_LEN )
1728 return wxCONV_FAILED;
c91830cb 1729
35d11700
VZ
1730 const size_t inLen = srcLen/BYTES_PER_CHAR;
1731 if ( dst )
c91830cb 1732 {
35d11700
VZ
1733 if ( dstLen < inLen )
1734 return wxCONV_FAILED;
b5153fd8 1735
35d11700
VZ
1736 memcpy(dst, src, srcLen);
1737 }
c91830cb 1738
35d11700 1739 return inLen;
c91830cb
VZ
1740}
1741
35d11700
VZ
1742size_t
1743wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1744 const wchar_t *src, size_t srcLen) const
c91830cb 1745{
35d11700
VZ
1746 if ( srcLen == wxNO_LEN )
1747 srcLen = wxWcslen(src) + 1;
1748
1749 srcLen *= BYTES_PER_CHAR;
c91830cb 1750
35d11700 1751 if ( dst )
c91830cb 1752 {
35d11700
VZ
1753 if ( dstLen < srcLen )
1754 return wxCONV_FAILED;
c91830cb 1755
35d11700 1756 memcpy(dst, src, srcLen);
c91830cb
VZ
1757 }
1758
35d11700 1759 return srcLen;
c91830cb
VZ
1760}
1761
35d11700
VZ
1762// ----------------------------------------------------------------------------
1763// endian-reversing conversions
1764// ----------------------------------------------------------------------------
c91830cb 1765
35d11700
VZ
1766size_t
1767wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1768 const char *src, size_t srcLen) const
c91830cb 1769{
35d11700
VZ
1770 srcLen = GetLength(src, srcLen);
1771 if ( srcLen == wxNO_LEN )
1772 return wxCONV_FAILED;
1773
1774 srcLen /= BYTES_PER_CHAR;
c91830cb 1775
35d11700 1776 if ( dst )
c91830cb 1777 {
35d11700
VZ
1778 if ( dstLen < srcLen )
1779 return wxCONV_FAILED;
1780
ef199164
DS
1781 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1782 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1783 {
ef199164 1784 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1785 }
c91830cb 1786 }
b5153fd8 1787
35d11700 1788 return srcLen;
c91830cb
VZ
1789}
1790
35d11700
VZ
1791size_t
1792wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1793 const wchar_t *src, size_t srcLen) const
c91830cb 1794{
35d11700
VZ
1795 if ( srcLen == wxNO_LEN )
1796 srcLen = wxWcslen(src) + 1;
1797
1798 srcLen *= BYTES_PER_CHAR;
c91830cb 1799
35d11700 1800 if ( dst )
c91830cb 1801 {
35d11700
VZ
1802 if ( dstLen < srcLen )
1803 return wxCONV_FAILED;
1804
ef199164 1805 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1806 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1807 {
ef199164 1808 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1809 }
c91830cb 1810 }
b5153fd8 1811
35d11700 1812 return srcLen;
c91830cb
VZ
1813}
1814
467e0479 1815#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1816
1817
36acb880
VZ
1818// ============================================================================
1819// The classes doing conversion using the iconv_xxx() functions
1820// ============================================================================
3caec1bb 1821
b040e242 1822#ifdef HAVE_ICONV
3a0d76bc 1823
b1d547eb
VS
1824// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1825// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1826// (unless there's yet another bug in glibc) the only case when iconv()
1827// returns with (size_t)-1 (which means error) and says there are 0 bytes
1828// left in the input buffer -- when _real_ error occurs,
1829// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1830// iconv() failure.
3caec1bb
VS
1831// [This bug does not appear in glibc 2.2.]
1832#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1833#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1834 (errno != E2BIG || bufLeft != 0))
1835#else
1836#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1837#endif
1838
ab217dba 1839#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1840
74a7eb0b
VZ
1841#define ICONV_T_INVALID ((iconv_t)-1)
1842
1843#if SIZEOF_WCHAR_T == 4
1844 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1845 #define WC_ENC wxFONTENCODING_UTF32
1846#elif SIZEOF_WCHAR_T == 2
1847 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1848 #define WC_ENC wxFONTENCODING_UTF16
1849#else // sizeof(wchar_t) != 2 nor 4
1850 // does this ever happen?
1851 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1852#endif
1853
36acb880 1854// ----------------------------------------------------------------------------
e95354ec 1855// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1856// ----------------------------------------------------------------------------
1857
e95354ec 1858class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1859{
1860public:
86501081 1861 wxMBConv_iconv(const char *name);
e95354ec 1862 virtual ~wxMBConv_iconv();
36acb880 1863
bde4baac
VZ
1864 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1865 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1866
d36c9347 1867 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
7ef3ab50
VZ
1868 virtual size_t GetMBNulLen() const;
1869
ba98e032
VS
1870#if wxUSE_UNICODE_UTF8
1871 virtual bool IsUTF8() const;
1872#endif
1873
d36c9347
VZ
1874 virtual wxMBConv *Clone() const
1875 {
86501081 1876 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
1877 p->m_minMBCharWidth = m_minMBCharWidth;
1878 return p;
1879 }
1880
e95354ec 1881 bool IsOk() const
74a7eb0b 1882 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1883
1884protected:
ef199164
DS
1885 // the iconv handlers used to translate from multibyte
1886 // to wide char and in the other direction
36acb880
VZ
1887 iconv_t m2w,
1888 w2m;
ef199164 1889
b1d547eb
VS
1890#if wxUSE_THREADS
1891 // guards access to m2w and w2m objects
1892 wxMutex m_iconvMutex;
1893#endif
36acb880
VZ
1894
1895private:
e95354ec 1896 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1897 // available on this machine, it will remain NULL
74a7eb0b 1898 static wxString ms_wcCharsetName;
36acb880
VZ
1899
1900 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1901 // different endian-ness than the native one
405d8f46 1902 static bool ms_wcNeedsSwap;
eec47cc6 1903
d36c9347
VZ
1904
1905 // name of the encoding handled by this conversion
1906 wxString m_name;
1907
7ef3ab50 1908 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
1909 // initially
1910 size_t m_minMBCharWidth;
36acb880
VZ
1911};
1912
8f115891 1913// make the constructor available for unit testing
86501081 1914WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
1915{
1916 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1917 if ( !result->IsOk() )
1918 {
1919 delete result;
1920 return 0;
1921 }
ef199164 1922
8f115891
MW
1923 return result;
1924}
1925
422e411e 1926wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1927bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1928
86501081 1929wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 1930 : m_name(name)
36acb880 1931{
c1464d9d 1932 m_minMBCharWidth = 0;
eec47cc6 1933
36acb880 1934 // check for charset that represents wchar_t:
74a7eb0b 1935 if ( ms_wcCharsetName.empty() )
f1339c56 1936 {
c2b83fdd
VZ
1937 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1938
74a7eb0b
VZ
1939#if wxUSE_FONTMAP
1940 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1941#else // !wxUSE_FONTMAP
91cb7f52 1942 static const wxChar *names_static[] =
36acb880 1943 {
74a7eb0b
VZ
1944#if SIZEOF_WCHAR_T == 4
1945 _T("UCS-4"),
1946#elif SIZEOF_WCHAR_T = 2
1947 _T("UCS-2"),
1948#endif
1949 NULL
1950 };
91cb7f52 1951 const wxChar **names = names_static;
74a7eb0b 1952#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1953
d1f024a8 1954 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1955 {
17a1ebd1 1956 const wxString nameCS(*names);
74a7eb0b
VZ
1957
1958 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1959 wxString nameXE(nameCS);
ef199164
DS
1960
1961#ifdef WORDS_BIGENDIAN
74a7eb0b 1962 nameXE += _T("BE");
ef199164 1963#else // little endian
74a7eb0b 1964 nameXE += _T("LE");
ef199164 1965#endif
74a7eb0b 1966
c2b83fdd
VZ
1967 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1968 nameXE.c_str());
1969
86501081 1970 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 1971 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1972 {
74a7eb0b 1973 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1974 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1975 nameCS.c_str());
86501081 1976 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 1977
74a7eb0b
VZ
1978 // and check for bytesex ourselves:
1979 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1980 {
74a7eb0b
VZ
1981 char buf[2], *bufPtr;
1982 wchar_t wbuf[2], *wbufPtr;
1983 size_t insz, outsz;
1984 size_t res;
1985
1986 buf[0] = 'A';
1987 buf[1] = 0;
1988 wbuf[0] = 0;
1989 insz = 2;
1990 outsz = SIZEOF_WCHAR_T * 2;
1991 wbufPtr = wbuf;
1992 bufPtr = buf;
1993
ef199164
DS
1994 res = iconv(
1995 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1996 (char**)&wbufPtr, &outsz);
74a7eb0b
VZ
1997
1998 if (ICONV_FAILED(res, insz))
1999 {
2000 wxLogLastError(wxT("iconv"));
422e411e 2001 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2002 nameCS.c_str());
74a7eb0b
VZ
2003 }
2004 else // ok, can convert to this encoding, remember it
2005 {
17a1ebd1 2006 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2007 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2008 }
3a0d76bc
VS
2009 }
2010 }
74a7eb0b 2011 else // use charset not requiring byte swapping
36acb880 2012 {
74a7eb0b 2013 ms_wcCharsetName = nameXE;
36acb880 2014 }
3a0d76bc 2015 }
74a7eb0b 2016
0944fceb 2017 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2018 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2019 ms_wcCharsetName.empty() ? wxString("<none>")
2020 : ms_wcCharsetName,
74a7eb0b
VZ
2021 ms_wcNeedsSwap ? _T(" (needs swap)")
2022 : _T(""));
3a0d76bc 2023 }
36acb880 2024 else // we already have ms_wcCharsetName
3caec1bb 2025 {
86501081 2026 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2027 }
dccce9ea 2028
74a7eb0b 2029 if ( ms_wcCharsetName.empty() )
f1339c56 2030 {
74a7eb0b 2031 w2m = ICONV_T_INVALID;
36acb880 2032 }
405d8f46
VZ
2033 else
2034 {
86501081 2035 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2036 if ( w2m == ICONV_T_INVALID )
2037 {
2038 wxLogTrace(TRACE_STRCONV,
2039 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2040 ms_wcCharsetName.c_str(), name);
74a7eb0b 2041 }
405d8f46 2042 }
36acb880 2043}
3caec1bb 2044
e95354ec 2045wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2046{
74a7eb0b 2047 if ( m2w != ICONV_T_INVALID )
36acb880 2048 iconv_close(m2w);
74a7eb0b 2049 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2050 iconv_close(w2m);
2051}
3a0d76bc 2052
bde4baac 2053size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 2054{
69373110
VZ
2055 // find the string length: notice that must be done differently for
2056 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2057 size_t inbuf;
7ef3ab50 2058 const size_t nulLen = GetMBNulLen();
69373110
VZ
2059 switch ( nulLen )
2060 {
2061 default:
467e0479 2062 return wxCONV_FAILED;
69373110
VZ
2063
2064 case 1:
2065 inbuf = strlen(psz); // arguably more optimized than our version
2066 break;
2067
2068 case 2:
2069 case 4:
2070 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2071 // they also have to start at character boundary and not span two
2072 // adjacent characters
2073 const char *p;
2074 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2075 ;
2076 inbuf = p - psz;
2077 break;
2078 }
2079
b1d547eb 2080#if wxUSE_THREADS
6a17b868
SN
2081 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2082 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2083 // wxConvLocal that are used all over wx code, so we have to make sure
2084 // the handle is used by at most one thread at the time. Otherwise
2085 // only a few wx classes would be safe to use from non-main threads
2086 // as MB<->WC conversion would fail "randomly".
2087 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2088#endif // wxUSE_THREADS
2089
36acb880
VZ
2090 size_t outbuf = n * SIZEOF_WCHAR_T;
2091 size_t res, cres;
2092 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2093 wchar_t *bufPtr = buf;
2094 const char *pszPtr = psz;
2095
2096 if (buf)
2097 {
2098 // have destination buffer, convert there
2099 cres = iconv(m2w,
2100 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2101 (char**)&bufPtr, &outbuf);
2102 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 2103
36acb880 2104 if (ms_wcNeedsSwap)
3a0d76bc 2105 {
36acb880 2106 // convert to native endianness
17a1ebd1
VZ
2107 for ( unsigned i = 0; i < res; i++ )
2108 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 2109 }
adb45366 2110
69373110 2111 // NUL-terminate the string if there is any space left
49dd9820
VS
2112 if (res < n)
2113 buf[res] = 0;
36acb880
VZ
2114 }
2115 else
2116 {
2117 // no destination buffer... convert using temp buffer
2118 // to calculate destination buffer requirement
2119 wchar_t tbuf[8];
2120 res = 0;
ef199164
DS
2121
2122 do
2123 {
36acb880 2124 bufPtr = tbuf;
ef199164 2125 outbuf = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2126
2127 cres = iconv(m2w,
2128 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2129 (char**)&bufPtr, &outbuf );
2130
ef199164
DS
2131 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2132 }
2133 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2134 }
dccce9ea 2135
36acb880 2136 if (ICONV_FAILED(cres, inbuf))
f1339c56 2137 {
36acb880 2138 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2139 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2140 return wxCONV_FAILED;
36acb880
VZ
2141 }
2142
2143 return res;
2144}
2145
bde4baac 2146size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 2147{
b1d547eb
VS
2148#if wxUSE_THREADS
2149 // NB: explained in MB2WC
2150 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2151#endif
3698ae71 2152
156162ec
MW
2153 size_t inlen = wxWcslen(psz);
2154 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
2155 size_t outbuf = n;
2156 size_t res, cres;
3a0d76bc 2157
36acb880 2158 wchar_t *tmpbuf = 0;
3caec1bb 2159
36acb880
VZ
2160 if (ms_wcNeedsSwap)
2161 {
2162 // need to copy to temp buffer to switch endianness
74a7eb0b 2163 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 2164 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 2165 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
2166 for ( size_t i = 0; i < inlen; i++ )
2167 tmpbuf[n] = WC_BSWAP(psz[i]);
ef199164 2168
156162ec 2169 tmpbuf[inlen] = L'\0';
74a7eb0b 2170 psz = tmpbuf;
36acb880 2171 }
3a0d76bc 2172
36acb880
VZ
2173 if (buf)
2174 {
2175 // have destination buffer, convert there
2176 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 2177
ef199164 2178 res = n - outbuf;
adb45366 2179
49dd9820
VS
2180 // NB: iconv was given only wcslen(psz) characters on input, and so
2181 // it couldn't convert the trailing zero. Let's do it ourselves
2182 // if there's some room left for it in the output buffer.
2183 if (res < n)
2184 buf[0] = 0;
36acb880
VZ
2185 }
2186 else
2187 {
ef199164 2188 // no destination buffer: convert using temp buffer
36acb880
VZ
2189 // to calculate destination buffer requirement
2190 char tbuf[16];
2191 res = 0;
ef199164
DS
2192 do
2193 {
2194 buf = tbuf;
2195 outbuf = 16;
36acb880
VZ
2196
2197 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 2198
36acb880 2199 res += 16 - outbuf;
ef199164
DS
2200 }
2201 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2202 }
dccce9ea 2203
36acb880
VZ
2204 if (ms_wcNeedsSwap)
2205 {
2206 free(tmpbuf);
2207 }
dccce9ea 2208
36acb880
VZ
2209 if (ICONV_FAILED(cres, inbuf))
2210 {
ce6f8d6f 2211 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2212 return wxCONV_FAILED;
36acb880
VZ
2213 }
2214
2215 return res;
2216}
2217
7ef3ab50 2218size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2219{
c1464d9d 2220 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2221 {
2222 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2223
2224#if wxUSE_THREADS
2225 // NB: explained in MB2WC
2226 wxMutexLocker lock(self->m_iconvMutex);
2227#endif
2228
999020e1 2229 const wchar_t *wnul = L"";
c1464d9d 2230 char buf[8]; // should be enough for NUL in any encoding
356410fc 2231 size_t inLen = sizeof(wchar_t),
c1464d9d 2232 outLen = WXSIZEOF(buf);
ef199164
DS
2233 char *inBuff = (char *)wnul;
2234 char *outBuff = buf;
2235 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2236 {
c1464d9d 2237 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2238 }
2239 else // ok
2240 {
ef199164 2241 self->m_minMBCharWidth = outBuff - buf;
356410fc 2242 }
eec47cc6
VZ
2243 }
2244
c1464d9d 2245 return m_minMBCharWidth;
eec47cc6
VZ
2246}
2247
ba98e032
VS
2248#if wxUSE_UNICODE_UTF8
2249bool wxMBConv_iconv::IsUTF8() const
2250{
86501081
VS
2251 return wxStricmp(m_name, "UTF-8") == 0 ||
2252 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2253}
2254#endif
2255
b040e242 2256#endif // HAVE_ICONV
36acb880 2257
e95354ec 2258
36acb880
VZ
2259// ============================================================================
2260// Win32 conversion classes
2261// ============================================================================
1cd52418 2262
e95354ec 2263#ifdef wxHAVE_WIN32_MB2WC
373658eb 2264
8b04d4c4 2265// from utils.cpp
d775fa82 2266#if wxUSE_FONTMAP
86501081 2267extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2268extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2269#endif
373658eb 2270
e95354ec 2271class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2272{
2273public:
bde4baac
VZ
2274 wxMBConv_win32()
2275 {
2276 m_CodePage = CP_ACP;
c1464d9d 2277 m_minMBCharWidth = 0;
bde4baac
VZ
2278 }
2279
d36c9347 2280 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2281 : wxMBConv()
d36c9347
VZ
2282 {
2283 m_CodePage = conv.m_CodePage;
2284 m_minMBCharWidth = conv.m_minMBCharWidth;
2285 }
2286
7608a683 2287#if wxUSE_FONTMAP
86501081 2288 wxMBConv_win32(const char* name)
bde4baac
VZ
2289 {
2290 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2291 m_minMBCharWidth = 0;
bde4baac 2292 }
dccce9ea 2293
e95354ec 2294 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2295 {
2296 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2297 m_minMBCharWidth = 0;
bde4baac 2298 }
eec47cc6 2299#endif // wxUSE_FONTMAP
8b04d4c4 2300
d36c9347 2301 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2302 {
02272c9c
VZ
2303 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2304 // the behaviour is not compatible with the Unix version (using iconv)
2305 // and break the library itself, e.g. wxTextInputStream::NextChar()
2306 // wouldn't work if reading an incomplete MB char didn't result in an
2307 // error
667e5b3e 2308 //
89028980 2309 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2310 // Win XP or newer and it is not supported for UTF-[78] so we always
2311 // use our own conversions in this case. See
89028980
VS
2312 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2313 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2314 if ( m_CodePage == CP_UTF8 )
89028980 2315 {
5487ff0f 2316 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2317 }
830f8f11
VZ
2318
2319 if ( m_CodePage == CP_UTF7 )
2320 {
5487ff0f 2321 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2322 }
2323
2324 int flags = 0;
2325 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2326 IsAtLeastWin2kSP4() )
89028980 2327 {
830f8f11 2328 flags = MB_ERR_INVALID_CHARS;
89028980 2329 }
667e5b3e 2330
2b5f62a0
VZ
2331 const size_t len = ::MultiByteToWideChar
2332 (
2333 m_CodePage, // code page
667e5b3e 2334 flags, // flags: fall on error
2b5f62a0
VZ
2335 psz, // input string
2336 -1, // its length (NUL-terminated)
b4da152e 2337 buf, // output string
2b5f62a0
VZ
2338 buf ? n : 0 // size of output buffer
2339 );
89028980
VS
2340 if ( !len )
2341 {
2342 // function totally failed
467e0479 2343 return wxCONV_FAILED;
89028980
VS
2344 }
2345
2346 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2347 // check if we succeeded, by doing a double trip:
2348 if ( !flags && buf )
2349 {
53c174fc
VZ
2350 const size_t mbLen = strlen(psz);
2351 wxCharBuffer mbBuf(mbLen);
89028980
VS
2352 if ( ::WideCharToMultiByte
2353 (
2354 m_CodePage,
2355 0,
2356 buf,
2357 -1,
2358 mbBuf.data(),
53c174fc 2359 mbLen + 1, // size in bytes, not length
89028980
VS
2360 NULL,
2361 NULL
2362 ) == 0 ||
2363 strcmp(mbBuf, psz) != 0 )
2364 {
2365 // we didn't obtain the same thing we started from, hence
2366 // the conversion was lossy and we consider that it failed
467e0479 2367 return wxCONV_FAILED;
89028980
VS
2368 }
2369 }
2b5f62a0 2370
03a991bc
VZ
2371 // note that it returns count of written chars for buf != NULL and size
2372 // of the needed buffer for buf == NULL so in either case the length of
2373 // the string (which never includes the terminating NUL) is one less
89028980 2374 return len - 1;
f1339c56 2375 }
dccce9ea 2376
d36c9347 2377 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2378 {
13dd924a
VZ
2379 /*
2380 we have a problem here: by default, WideCharToMultiByte() may
2381 replace characters unrepresentable in the target code page with bad
2382 quality approximations such as turning "1/2" symbol (U+00BD) into
2383 "1" for the code pages which don't have it and we, obviously, want
2384 to avoid this at any price
d775fa82 2385
13dd924a
VZ
2386 the trouble is that this function does it _silently_, i.e. it won't
2387 even tell us whether it did or not... Win98/2000 and higher provide
2388 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2389 we have to resort to a round trip, i.e. check that converting back
2390 results in the same string -- this is, of course, expensive but
2391 otherwise we simply can't be sure to not garble the data.
2392 */
2393
2394 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2395 // it doesn't work with CJK encodings (which we test for rather roughly
2396 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2397 // supporting it
907173e5
WS
2398 BOOL usedDef wxDUMMY_INITIALIZE(false);
2399 BOOL *pUsedDef;
13dd924a
VZ
2400 int flags;
2401 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2402 {
2403 // it's our lucky day
2404 flags = WC_NO_BEST_FIT_CHARS;
2405 pUsedDef = &usedDef;
2406 }
2407 else // old system or unsupported encoding
2408 {
2409 flags = 0;
2410 pUsedDef = NULL;
2411 }
2412
2b5f62a0
VZ
2413 const size_t len = ::WideCharToMultiByte
2414 (
2415 m_CodePage, // code page
13dd924a
VZ
2416 flags, // either none or no best fit
2417 pwz, // input string
2b5f62a0
VZ
2418 -1, // it is (wide) NUL-terminated
2419 buf, // output buffer
2420 buf ? n : 0, // and its size
2421 NULL, // default "replacement" char
13dd924a 2422 pUsedDef // [out] was it used?
2b5f62a0
VZ
2423 );
2424
13dd924a
VZ
2425 if ( !len )
2426 {
2427 // function totally failed
467e0479 2428 return wxCONV_FAILED;
13dd924a
VZ
2429 }
2430
765bdb4a
VZ
2431 // we did something, check if we really succeeded
2432 if ( flags )
13dd924a 2433 {
765bdb4a
VZ
2434 // check if the conversion failed, i.e. if any replacements
2435 // were done
2436 if ( usedDef )
2437 return wxCONV_FAILED;
2438 }
2439 else // we must resort to double tripping...
2440 {
2441 // first we need to ensure that we really have the MB data: this is
2442 // not the case if we're called with NULL buffer, in which case we
2443 // need to do the conversion yet again
2444 wxCharBuffer bufDef;
2445 if ( !buf )
13dd924a 2446 {
765bdb4a
VZ
2447 bufDef = wxCharBuffer(len);
2448 buf = bufDef.data();
2449 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2450 buf, len, NULL, NULL) )
467e0479 2451 return wxCONV_FAILED;
13dd924a 2452 }
765bdb4a
VZ
2453
2454 wxWCharBuffer wcBuf(n);
2455 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2456 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2457 {
765bdb4a
VZ
2458 // we didn't obtain the same thing we started from, hence
2459 // the conversion was lossy and we consider that it failed
2460 return wxCONV_FAILED;
13dd924a
VZ
2461 }
2462 }
2463
03a991bc 2464 // see the comment above for the reason of "len - 1"
13dd924a 2465 return len - 1;
f1339c56 2466 }
dccce9ea 2467
7ef3ab50
VZ
2468 virtual size_t GetMBNulLen() const
2469 {
2470 if ( m_minMBCharWidth == 0 )
2471 {
2472 int len = ::WideCharToMultiByte
2473 (
2474 m_CodePage, // code page
2475 0, // no flags
2476 L"", // input string
2477 1, // translate just the NUL
2478 NULL, // output buffer
2479 0, // and its size
2480 NULL, // no replacement char
2481 NULL // [out] don't care if it was used
2482 );
2483
2484 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2485 switch ( len )
2486 {
2487 default:
2488 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2489 self->m_minMBCharWidth = (size_t)-1;
2490 break;
7ef3ab50
VZ
2491
2492 case 0:
2493 self->m_minMBCharWidth = (size_t)-1;
2494 break;
2495
2496 case 1:
2497 case 2:
2498 case 4:
2499 self->m_minMBCharWidth = len;
2500 break;
2501 }
2502 }
2503
2504 return m_minMBCharWidth;
2505 }
2506
d36c9347
VZ
2507 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2508
13dd924a
VZ
2509 bool IsOk() const { return m_CodePage != -1; }
2510
2511private:
2512 static bool CanUseNoBestFit()
2513 {
2514 static int s_isWin98Or2k = -1;
2515
2516 if ( s_isWin98Or2k == -1 )
2517 {
2518 int verMaj, verMin;
2519 switch ( wxGetOsVersion(&verMaj, &verMin) )
2520 {
406d283a 2521 case wxOS_WINDOWS_9X:
13dd924a
VZ
2522 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2523 break;
2524
406d283a 2525 case wxOS_WINDOWS_NT:
13dd924a
VZ
2526 s_isWin98Or2k = verMaj >= 5;
2527 break;
2528
2529 default:
ef199164 2530 // unknown: be conservative by default
13dd924a 2531 s_isWin98Or2k = 0;
ef199164 2532 break;
13dd924a
VZ
2533 }
2534
2535 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2536 }
2537
2538 return s_isWin98Or2k == 1;
2539 }
f1339c56 2540
89028980
VS
2541 static bool IsAtLeastWin2kSP4()
2542 {
8942f83a
WS
2543#ifdef __WXWINCE__
2544 return false;
2545#else
89028980
VS
2546 static int s_isAtLeastWin2kSP4 = -1;
2547
2548 if ( s_isAtLeastWin2kSP4 == -1 )
2549 {
2550 OSVERSIONINFOEX ver;
2551
2552 memset(&ver, 0, sizeof(ver));
2553 ver.dwOSVersionInfoSize = sizeof(ver);
2554 GetVersionEx((OSVERSIONINFO*)&ver);
2555
2556 s_isAtLeastWin2kSP4 =
2557 ((ver.dwMajorVersion > 5) || // Vista+
2558 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2559 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2560 ver.wServicePackMajor >= 4)) // 2000 SP4+
2561 ? 1 : 0;
2562 }
2563
2564 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2565#endif
89028980
VS
2566 }
2567
eec47cc6 2568
c1464d9d 2569 // the code page we're working with
b1d66b54 2570 long m_CodePage;
c1464d9d 2571
7ef3ab50 2572 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2573 // "unknown"
2574 size_t m_minMBCharWidth;
1cd52418 2575};
e95354ec
VZ
2576
2577#endif // wxHAVE_WIN32_MB2WC
2578
f7e98dee 2579
36acb880
VZ
2580// ============================================================================
2581// wxEncodingConverter based conversion classes
2582// ============================================================================
2583
1e6feb95 2584#if wxUSE_FONTMAP
1cd52418 2585
e95354ec 2586class wxMBConv_wxwin : public wxMBConv
1cd52418 2587{
8b04d4c4
VZ
2588private:
2589 void Init()
2590 {
6ac84a78
DE
2591 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2592 // The wxMBConv_cf class does a better job.
2593 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2594 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2595 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2596 }
2597
6001e347 2598public:
f1339c56
RR
2599 // temporarily just use wxEncodingConverter stuff,
2600 // so that it works while a better implementation is built
86501081 2601 wxMBConv_wxwin(const char* name)
f1339c56
RR
2602 {
2603 if (name)
267e11c5 2604 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2605 else
2606 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2607
8b04d4c4
VZ
2608 Init();
2609 }
2610
e95354ec 2611 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2612 {
2613 m_enc = enc;
2614
2615 Init();
f1339c56 2616 }
dccce9ea 2617
bde4baac 2618 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2619 {
2620 size_t inbuf = strlen(psz);
dccce9ea 2621 if (buf)
c643a977 2622 {
ef199164 2623 if (!m2w.Convert(psz, buf))
467e0479 2624 return wxCONV_FAILED;
c643a977 2625 }
f1339c56
RR
2626 return inbuf;
2627 }
dccce9ea 2628
bde4baac 2629 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2630 {
f8d791e0 2631 const size_t inbuf = wxWcslen(psz);
f1339c56 2632 if (buf)
c643a977 2633 {
ef199164 2634 if (!w2m.Convert(psz, buf))
467e0479 2635 return wxCONV_FAILED;
c643a977 2636 }
dccce9ea 2637
f1339c56
RR
2638 return inbuf;
2639 }
dccce9ea 2640
7ef3ab50 2641 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2642 {
2643 switch ( m_enc )
2644 {
2645 case wxFONTENCODING_UTF16BE:
2646 case wxFONTENCODING_UTF16LE:
c1464d9d 2647 return 2;
eec47cc6
VZ
2648
2649 case wxFONTENCODING_UTF32BE:
2650 case wxFONTENCODING_UTF32LE:
c1464d9d 2651 return 4;
eec47cc6
VZ
2652
2653 default:
c1464d9d 2654 return 1;
eec47cc6
VZ
2655 }
2656 }
2657
d36c9347
VZ
2658 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2659
7ef3ab50
VZ
2660 bool IsOk() const { return m_ok; }
2661
2662public:
2663 wxFontEncoding m_enc;
2664 wxEncodingConverter m2w, w2m;
2665
2666private:
cafbf6fb
VZ
2667 // were we initialized successfully?
2668 bool m_ok;
fc7a2a60 2669
e95354ec 2670 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2671};
6001e347 2672
8f115891 2673// make the constructors available for unit testing
86501081 2674WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2675{
2676 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2677 if ( !result->IsOk() )
2678 {
2679 delete result;
2680 return 0;
2681 }
ef199164 2682
8f115891
MW
2683 return result;
2684}
2685
1e6feb95
VZ
2686#endif // wxUSE_FONTMAP
2687
36acb880
VZ
2688// ============================================================================
2689// wxCSConv implementation
2690// ============================================================================
2691
8b04d4c4 2692void wxCSConv::Init()
6001e347 2693{
e95354ec
VZ
2694 m_name = NULL;
2695 m_convReal = NULL;
2696 m_deferred = true;
2697}
2698
86501081 2699wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2700{
2701 Init();
82713003 2702
86501081 2703 if ( !charset.empty() )
e95354ec 2704 {
86501081 2705 SetName(charset.ToAscii());
e95354ec 2706 }
bda3d86a 2707
e4277538
VZ
2708#if wxUSE_FONTMAP
2709 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2710#else
bda3d86a 2711 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2712#endif
6001e347
RR
2713}
2714
8b04d4c4
VZ
2715wxCSConv::wxCSConv(wxFontEncoding encoding)
2716{
bda3d86a 2717 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2718 {
2719 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2720
2721 encoding = wxFONTENCODING_SYSTEM;
2722 }
2723
8b04d4c4
VZ
2724 Init();
2725
bda3d86a 2726 m_encoding = encoding;
8b04d4c4
VZ
2727}
2728
6001e347
RR
2729wxCSConv::~wxCSConv()
2730{
65e50848
JS
2731 Clear();
2732}
2733
54380f29 2734wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2735 : wxMBConv()
54380f29 2736{
8b04d4c4
VZ
2737 Init();
2738
54380f29 2739 SetName(conv.m_name);
8b04d4c4 2740 m_encoding = conv.m_encoding;
54380f29
GD
2741}
2742
2743wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2744{
2745 Clear();
8b04d4c4 2746
54380f29 2747 SetName(conv.m_name);
8b04d4c4
VZ
2748 m_encoding = conv.m_encoding;
2749
54380f29
GD
2750 return *this;
2751}
2752
65e50848
JS
2753void wxCSConv::Clear()
2754{
8b04d4c4 2755 free(m_name);
e95354ec 2756 delete m_convReal;
8b04d4c4 2757
65e50848 2758 m_name = NULL;
e95354ec 2759 m_convReal = NULL;
6001e347
RR
2760}
2761
86501081 2762void wxCSConv::SetName(const char *charset)
6001e347 2763{
f1339c56
RR
2764 if (charset)
2765 {
d6f2a891 2766 m_name = wxStrdup(charset);
e95354ec 2767 m_deferred = true;
f1339c56 2768 }
6001e347
RR
2769}
2770
8b3eb85d 2771#if wxUSE_FONTMAP
8b3eb85d
VZ
2772
2773WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2774 wxEncodingNameCache );
8b3eb85d
VZ
2775
2776static wxEncodingNameCache gs_nameCache;
2777#endif
2778
e95354ec
VZ
2779wxMBConv *wxCSConv::DoCreate() const
2780{
ce6f8d6f
VZ
2781#if wxUSE_FONTMAP
2782 wxLogTrace(TRACE_STRCONV,
2783 wxT("creating conversion for %s"),
2784 (m_name ? m_name
86501081 2785 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2786#endif // wxUSE_FONTMAP
2787
c547282d
VZ
2788 // check for the special case of ASCII or ISO8859-1 charset: as we have
2789 // special knowledge of it anyhow, we don't need to create a special
2790 // conversion object
e4277538
VZ
2791 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2792 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2793 {
e95354ec
VZ
2794 // don't convert at all
2795 return NULL;
2796 }
dccce9ea 2797
e95354ec
VZ
2798 // we trust OS to do conversion better than we can so try external
2799 // conversion methods first
2800 //
2801 // the full order is:
2802 // 1. OS conversion (iconv() under Unix or Win32 API)
2803 // 2. hard coded conversions for UTF
2804 // 3. wxEncodingConverter as fall back
2805
2806 // step (1)
2807#ifdef HAVE_ICONV
c547282d 2808#if !wxUSE_FONTMAP
e95354ec 2809 if ( m_name )
c547282d 2810#endif // !wxUSE_FONTMAP
e95354ec 2811 {
3ef10cfc 2812#if wxUSE_FONTMAP
8b3eb85d 2813 wxFontEncoding encoding(m_encoding);
3ef10cfc 2814#endif
8b3eb85d 2815
86501081 2816 if ( m_name )
8b3eb85d 2817 {
86501081 2818 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
2819 if ( conv->IsOk() )
2820 return conv;
2821
2822 delete conv;
c547282d
VZ
2823
2824#if wxUSE_FONTMAP
8b3eb85d 2825 encoding =
86501081 2826 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2827#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2828 }
2829#if wxUSE_FONTMAP
2830 {
2831 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2832 if ( it != gs_nameCache.end() )
2833 {
2834 if ( it->second.empty() )
2835 return NULL;
c547282d 2836
86501081 2837 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
2838 if ( conv->IsOk() )
2839 return conv;
e95354ec 2840
8b3eb85d
VZ
2841 delete conv;
2842 }
2843
2844 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
2845 // CS : in case this does not return valid names (eg for MacRoman)
2846 // encoding got a 'failure' entry in the cache all the same,
2847 // although it just has to be created using a different method, so
2848 // only store failed iconv creation attempts (or perhaps we
2849 // shoulnd't do this at all ?)
3c67ec06 2850 if ( names[0] != NULL )
8b3eb85d 2851 {
3c67ec06 2852 for ( ; *names; ++names )
8b3eb85d 2853 {
86501081
VS
2854 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2855 // will need changes that will obsolete this
2856 wxString name(*names);
2857 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
2858 if ( conv->IsOk() )
2859 {
2860 gs_nameCache[encoding] = *names;
2861 return conv;
2862 }
2863
2864 delete conv;
8b3eb85d
VZ
2865 }
2866
3c67ec06 2867 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 2868 }
8b3eb85d
VZ
2869 }
2870#endif // wxUSE_FONTMAP
e95354ec
VZ
2871 }
2872#endif // HAVE_ICONV
2873
2874#ifdef wxHAVE_WIN32_MB2WC
2875 {
7608a683 2876#if wxUSE_FONTMAP
e95354ec
VZ
2877 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2878 : new wxMBConv_win32(m_encoding);
2879 if ( conv->IsOk() )
2880 return conv;
2881
2882 delete conv;
7608a683
WS
2883#else
2884 return NULL;
2885#endif
e95354ec
VZ
2886 }
2887#endif // wxHAVE_WIN32_MB2WC
ef199164 2888
5c4ed98d 2889#ifdef __DARWIN__
f7e98dee 2890 {
6ff49cbc
DE
2891 // leave UTF16 and UTF32 to the built-ins of wx
2892 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2893 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 2894 {
a6900d10 2895#if wxUSE_FONTMAP
5c4ed98d
DE
2896 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2897 : new wxMBConv_cf(m_encoding);
a6900d10 2898#else
5c4ed98d 2899 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 2900#endif
ef199164 2901
f7e98dee 2902 if ( conv->IsOk() )
d775fa82
WS
2903 return conv;
2904
2905 delete conv;
2906 }
335d31e0 2907 }
5c4ed98d
DE
2908#endif // __DARWIN__
2909
e95354ec
VZ
2910 // step (2)
2911 wxFontEncoding enc = m_encoding;
2912#if wxUSE_FONTMAP
c547282d
VZ
2913 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2914 {
2915 // use "false" to suppress interactive dialogs -- we can be called from
2916 // anywhere and popping up a dialog from here is the last thing we want to
2917 // do
267e11c5 2918 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2919 }
e95354ec
VZ
2920#endif // wxUSE_FONTMAP
2921
2922 switch ( enc )
2923 {
2924 case wxFONTENCODING_UTF7:
2925 return new wxMBConvUTF7;
2926
2927 case wxFONTENCODING_UTF8:
2928 return new wxMBConvUTF8;
2929
e95354ec
VZ
2930 case wxFONTENCODING_UTF16BE:
2931 return new wxMBConvUTF16BE;
2932
2933 case wxFONTENCODING_UTF16LE:
2934 return new wxMBConvUTF16LE;
2935
e95354ec
VZ
2936 case wxFONTENCODING_UTF32BE:
2937 return new wxMBConvUTF32BE;
2938
2939 case wxFONTENCODING_UTF32LE:
2940 return new wxMBConvUTF32LE;
2941
2942 default:
2943 // nothing to do but put here to suppress gcc warnings
ef199164 2944 break;
e95354ec
VZ
2945 }
2946
2947 // step (3)
2948#if wxUSE_FONTMAP
2949 {
2950 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2951 : new wxMBConv_wxwin(m_encoding);
2952 if ( conv->IsOk() )
2953 return conv;
2954
2955 delete conv;
2956 }
2957#endif // wxUSE_FONTMAP
2958
a58d4f4d
VS
2959 // NB: This is a hack to prevent deadlock. What could otherwise happen
2960 // in Unicode build: wxConvLocal creation ends up being here
2961 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
2962 // attach a timestamp, for which it will need wxConvLocal (to convert
2963 // time to char* and then wchar_t*), but that fails, tries to log the
2964 // error, but wxLog has an (already locked) critical section that
2965 // guards the static buffer.
a58d4f4d
VS
2966 static bool alreadyLoggingError = false;
2967 if (!alreadyLoggingError)
2968 {
2969 alreadyLoggingError = true;
2970 wxLogError(_("Cannot convert from the charset '%s'!"),
2971 m_name ? m_name
e95354ec
VZ
2972 :
2973#if wxUSE_FONTMAP
86501081 2974 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 2975#else // !wxUSE_FONTMAP
86501081 2976 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
2977#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2978 );
ef199164 2979
a58d4f4d
VS
2980 alreadyLoggingError = false;
2981 }
e95354ec
VZ
2982
2983 return NULL;
2984}
2985
2986void wxCSConv::CreateConvIfNeeded() const
2987{
2988 if ( m_deferred )
2989 {
2990 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 2991
bda3d86a
VZ
2992 // if we don't have neither the name nor the encoding, use the default
2993 // encoding for this system
2994 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2995 {
4c75209f 2996#if wxUSE_INTL
02c7347b 2997 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
2998#else
2999 // fallback to some reasonable default:
3000 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3001#endif // wxUSE_INTL
4c75209f 3002 }
bda3d86a 3003
e95354ec
VZ
3004 self->m_convReal = DoCreate();
3005 self->m_deferred = false;
6001e347 3006 }
6001e347
RR
3007}
3008
0f0298b1
VZ
3009bool wxCSConv::IsOk() const
3010{
3011 CreateConvIfNeeded();
3012
3013 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3014 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3015 return true; // always ok as we do it ourselves
3016
3017 // m_convReal->IsOk() is called at its own creation, so we know it must
3018 // be ok if m_convReal is non-NULL
3019 return m_convReal != NULL;
3020}
3021
1c714a5d
VZ
3022size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3023 const char *src, size_t srcLen) const
3024{
3025 CreateConvIfNeeded();
3026
2c74c558
VS
3027 if (m_convReal)
3028 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3029
3030 // latin-1 (direct)
3031 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3032}
3033
3034size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3035 const wchar_t *src, size_t srcLen) const
3036{
3037 CreateConvIfNeeded();
3038
2c74c558
VS
3039 if (m_convReal)
3040 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3041
3042 // latin-1 (direct)
3043 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3044}
3045
6001e347
RR
3046size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3047{
e95354ec 3048 CreateConvIfNeeded();
dccce9ea 3049
e95354ec
VZ
3050 if (m_convReal)
3051 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3052
3053 // latin-1 (direct)
4def3b35 3054 size_t len = strlen(psz);
dccce9ea 3055
f1339c56
RR
3056 if (buf)
3057 {
4def3b35 3058 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3059 buf[c] = (unsigned char)(psz[c]);
3060 }
dccce9ea 3061
f1339c56 3062 return len;
6001e347
RR
3063}
3064
3065size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3066{
e95354ec 3067 CreateConvIfNeeded();
dccce9ea 3068
e95354ec
VZ
3069 if (m_convReal)
3070 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3071
f1339c56 3072 // latin-1 (direct)
f8d791e0 3073 const size_t len = wxWcslen(psz);
f1339c56
RR
3074 if (buf)
3075 {
4def3b35 3076 for (size_t c = 0; c <= len; c++)
24642831
VS
3077 {
3078 if (psz[c] > 0xFF)
467e0479 3079 return wxCONV_FAILED;
ef199164 3080
907173e5 3081 buf[c] = (char)psz[c];
24642831
VS
3082 }
3083 }
3084 else
3085 {
3086 for (size_t c = 0; c <= len; c++)
3087 {
3088 if (psz[c] > 0xFF)
467e0479 3089 return wxCONV_FAILED;
24642831 3090 }
f1339c56 3091 }
dccce9ea 3092
f1339c56 3093 return len;
6001e347
RR
3094}
3095
7ef3ab50 3096size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3097{
3098 CreateConvIfNeeded();
3099
3100 if ( m_convReal )
3101 {
7ef3ab50 3102 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3103 }
3104
ba98e032 3105 // otherwise, we are ISO-8859-1
c1464d9d 3106 return 1;
eec47cc6
VZ
3107}
3108
ba98e032
VS
3109#if wxUSE_UNICODE_UTF8
3110bool wxCSConv::IsUTF8() const
3111{
3112 CreateConvIfNeeded();
3113
3114 if ( m_convReal )
3115 {
3116 return m_convReal->IsUTF8();
3117 }
3118
3119 // otherwise, we are ISO-8859-1
3120 return false;
3121}
3122#endif
3123
69c928ef
VZ
3124
3125#if wxUSE_UNICODE
3126
3127wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3128{
3129 if ( !s )
3130 return wxWCharBuffer();
3131
3132 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3133 if ( !wbuf )
5487ff0f 3134 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3135 if ( !wbuf )
3136 wbuf = wxConvISO8859_1.cMB2WX(s);
3137
3138 return wbuf;
3139}
3140
3141wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3142{
3143 if ( !ws )
3144 return wxCharBuffer();
3145
3146 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3147 if ( !buf )
3148 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3149
3150 return buf;
3151}
3152
3153#endif // wxUSE_UNICODE
f5a1953b 3154
1e50d914
VS
3155// ----------------------------------------------------------------------------
3156// globals
3157// ----------------------------------------------------------------------------
3158
3159// NB: The reason why we create converted objects in this convoluted way,
3160// using a factory function instead of global variable, is that they
3161// may be used at static initialization time (some of them are used by
3162// wxString ctors and there may be a global wxString object). In other
3163// words, possibly _before_ the converter global object would be
3164// initialized.
3165
3166#undef wxConvLibc
3167#undef wxConvUTF8
3168#undef wxConvUTF7
3169#undef wxConvLocal
3170#undef wxConvISO8859_1
3171
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) expands to:
//  - a global pointer name##Ptr of type klass*, initially NULL;
//  - an accessor wxGet_##name##Ptr() returning the address of a
//    function-local static impl_klass object declared with ctor_args;
//  - a file-static pointer gs_##name##instance initialized by calling that
//    accessor.
// The expansion does not end with a semicolon: the use site provides it.
3172#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3173 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3174 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3175 { \
3176 static impl_klass name##Obj ctor_args; \
3177 return &name##Obj; \
3178 } \
3179 /* this ensures that all global converter objects are created */ \
3180 /* by the time static initialization is done, i.e. before any */ \
3181 /* thread is launched: */ \
3182 static klass* gs_##name##instance = wxGet_##name##Ptr()
3183
// Shorthand for WX_DEFINE_GLOBAL_CONV2 when the implementation class is the
// same as the publicly visible class (klass is passed for both).
3184#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3185 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3186
// wxConvLibc is exposed as a wxMBConv; the object behind it is a
// wxMBConv_win32 when __WINDOWS__ is defined and a wxMBConvLibc otherwise.
// wxEMPTY_PARAMETER_VALUE is passed as ctor_args in both cases.
3187#ifdef __WINDOWS__
3188 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3189#else
3190 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3191#endif
3192
e1079eda
VZ
3193// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3194// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3195// provokes an error message about "not enough macro parameters"; and we
3196// can't use "()" here as the name##Obj declaration would be parsed as a
3197// function declaration then, so use a semicolon and live with an extra
3198// empty statement (and hope that no compiler warns about this)
3199WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3200WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3201
// wxCSConv-based converters, each constructed from a wxFONTENCODING_XXX
// constant: the system encoding for wxConvLocal and ISO-8859-1 for
// wxConvISO8859_1.
3202WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3203WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3204
// Initial values of the global converter pointers: wxConvCurrent starts out
// pointing to the wxConvLibc object and wxConvUI to the wxConvLocal one.
3205WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3206WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3207
6ac84a78
DE
3208#ifdef __DARWIN__
3209// The xnu kernel always communicates file paths in decomposed UTF-8.
3210// WARNING: Are we sure that CFString's conversion will cause decomposition?
3211static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3212#endif
6ac84a78 3213
// wxConvFileName points to the wxMBConv_cf object defined above when
// __DARWIN__ is defined and to the wxConvLibc object otherwise.
1e50d914 3214WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3215#ifdef __DARWIN__
1e50d914 3216 &wxConvMacUTF8DObj;
6ac84a78 3217#else // !__DARWIN__
1e50d914 3218 wxGet_wxConvLibcPtr();
6ac84a78 3219#endif // __DARWIN__/!__DARWIN__
1e50d914 3220
bde4baac
VZ
3221#else // !wxUSE_WCHAR_T
3222
1e50d914 3223// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3224// stand-ins in absence of wchar_t
// In this branch the four converters are declared directly as wxMBConv
// objects rather than through the WX_DEFINE_GLOBAL_CONV accessor machinery
// used in the wxUSE_WCHAR_T branch.
3225WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3226 wxConvISO8859_1,
3227 wxConvLocal,
3228 wxConvUTF8;
3229
3230#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T