]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Added logic, API and docs for wxDataViewModel::HasDefaultCompare indicating a compare...
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
6001e347 47#ifdef __SALFORDC__
373658eb 48 #include <clib.h>
6001e347
RR
49#endif
50
b040e242 51#ifdef HAVE_ICONV
373658eb 52 #include <iconv.h>
b1d547eb 53 #include "wx/thread.h"
1cd52418 54#endif
1cd52418 55
373658eb
VZ
56#include "wx/encconv.h"
57#include "wx/fontmap.h"
58
5c4ed98d 59#ifdef __DARWIN__
e4dd1e19 60#include "wx/mac/corefoundation/private/strconv_cf.h"
5c4ed98d
DE
61#endif //def __DARWIN__
62
ef199164 63
ce6f8d6f
VZ
64#define TRACE_STRCONV _T("strconv")
65
467e0479
VZ
66// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
67// be 4 bytes
4948c2b6 68#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
69 #define WC_UTF16
70#endif
71
ef199164 72
373658eb
VZ
73// ============================================================================
74// implementation
75// ============================================================================
76
69373110
VZ
// helper function of cMB2WC(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are (in particular, false
// is returned for n == 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
85
373658eb 86// ----------------------------------------------------------------------------
467e0479 87// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 88// ----------------------------------------------------------------------------
6001e347 89
c91830cb 90static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 91{
ef199164 92 if (input <= 0xffff)
4def3b35 93 {
999836aa
VZ
94 if (output)
95 *output = (wxUint16) input;
ef199164 96
4def3b35 97 return 1;
dccce9ea 98 }
ef199164 99 else if (input >= 0x110000)
4def3b35 100 {
467e0479 101 return wxCONV_FAILED;
dccce9ea
VZ
102 }
103 else
4def3b35 104 {
dccce9ea 105 if (output)
4def3b35 106 {
ef199164
DS
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 109 }
ef199164 110
4def3b35 111 return 2;
1cd52418 112 }
1cd52418
OK
113}
114
c91830cb 115static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 116{
ef199164 117 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
118 {
119 output = *input;
120 return 1;
dccce9ea 121 }
ef199164 122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
123 {
124 output = *input;
467e0479 125 return wxCONV_FAILED;
dccce9ea
VZ
126 }
127 else
4def3b35
VS
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
1cd52418
OK
132}
133
467e0479 134#ifdef WC_UTF16
35d11700
VZ
135 typedef wchar_t wxDecodeSurrogate_t;
136#else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
139
140// returns the next UTF-32 character from the wchar_t buffer and advances the
141// pointer to the character after this one
142//
143// if an invalid character is found, *pSrc is set to NULL, the caller must
144// check for this
35d11700 145static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
146{
147 wxUint32 out;
8d3dd069
VZ
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156}
157
f6bcfd97 158// ----------------------------------------------------------------------------
6001e347 159// wxMBConv
f6bcfd97 160// ----------------------------------------------------------------------------
2c53a80a 161
483b0434
VZ
162size_t
163wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
6001e347 165{
483b0434
VZ
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
483b0434 212 for ( ;; )
eec47cc6 213 {
c1464d9d 214 // try to convert the current chunk
483b0434 215 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
e4e3bbb4 218
467e0479 219 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 220
483b0434 221 dstWritten += lenChunk;
f5fb6871 222
467e0479
VZ
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
483b0434
VZ
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
830f8f11 234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
c1464d9d 239
483b0434 240 if ( !srcEnd )
c1464d9d 241 {
467e0479
VZ
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
c1464d9d
VZ
244 break;
245 }
eec47cc6
VZ
246
247 // advance the input pointer past the end of this chunk
483b0434 248 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
483b0434 254 src += nulLen;
c1464d9d 255 }
e4e3bbb4 256
483b0434 257 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
483b0434 262 if ( src >= srcEnd )
c1464d9d
VZ
263 break;
264 }
265
483b0434 266 return dstWritten;
e4e3bbb4
RN
267}
268
483b0434
VZ
269size_t
270wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
e4e3bbb4 272{
483b0434
VZ
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
e4e3bbb4 275
eec47cc6
VZ
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
467e0479 282 if ( srcLen == wxNO_LEN )
e4e3bbb4 283 {
483b0434 284 srcLen = wxWcslen(src) + 1;
eec47cc6 285 }
483b0434 286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
287 {
288 // make a copy in order to properly NUL-terminate the string
483b0434 289 bufTmp = wxWCharBuffer(srcLen);
ef199164 290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
eec47cc6 318 }
e4e3bbb4 319
483b0434
VZ
320 return dstWritten;
321}
322
ef199164 323size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 324{
ef199164 325 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 326 if ( rc != wxCONV_FAILED )
509da451
VZ
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334}
335
ef199164 336size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 337{
ef199164 338 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 339 if ( rc != wxCONV_FAILED )
509da451
VZ
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345}
346
483b0434
VZ
347wxMBConv::~wxMBConv()
348{
349 // nothing to do here (necessary for Darwin linking probably)
350}
e4e3bbb4 351
483b0434
VZ
352const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353{
354 if ( psz )
eec47cc6 355 {
483b0434 356 // calculate the length of the buffer needed first
a2db25a1 357 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 358 if ( nLen != wxCONV_FAILED )
f5fb6871 359 {
483b0434 360 // now do the actual conversion
a2db25a1 361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 362
483b0434 363 // +1 for the trailing NULL
a2db25a1 364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 365 return buf;
f5fb6871 366 }
483b0434 367 }
e4e3bbb4 368
483b0434
VZ
369 return wxWCharBuffer();
370}
3698ae71 371
483b0434
VZ
372const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373{
374 if ( pwz )
375 {
a2db25a1 376 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 377 if ( nLen != wxCONV_FAILED )
483b0434 378 {
a2db25a1
VZ
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386}
e4e3bbb4 387
483b0434 388const wxWCharBuffer
ef199164 389wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 390{
ef199164 391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 392 if ( dstLen != wxCONV_FAILED )
483b0434 393 {
830f8f11 394 wxWCharBuffer wbuf(dstLen - 1);
ef199164 395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
396 {
397 if ( outLen )
467e0479
VZ
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
483b0434
VZ
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412}
413
414const wxCharBuffer
ef199164 415wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 416{
13d92ad6 417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 418 if ( dstLen != wxCONV_FAILED )
483b0434 419 {
168a76fe
VZ
420 // special case of empty input: can't allocate 0 size buffer below as
421 // wxCharBuffer insists on NUL-terminating it
422 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
ef199164 423 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
424 {
425 if ( outLen )
467e0479
VZ
426 {
427 *outLen = dstLen;
428
429 const size_t nulLen = GetMBNulLen();
13d92ad6
VZ
430 if ( dstLen >= nulLen &&
431 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
467e0479
VZ
432 {
433 // in this case the output is NUL-terminated and we're not
434 // supposed to count NUL
13d92ad6 435 *outLen -= nulLen;
467e0479
VZ
436 }
437 }
d32a507d 438
483b0434
VZ
439 return buf;
440 }
e4e3bbb4
RN
441 }
442
eec47cc6
VZ
443 if ( outLen )
444 *outLen = 0;
445
446 return wxCharBuffer();
e4e3bbb4
RN
447}
448
6001e347 449// ----------------------------------------------------------------------------
bde4baac 450// wxMBConvLibc
6001e347
RR
451// ----------------------------------------------------------------------------
452
bde4baac
VZ
453size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
454{
455 return wxMB2WC(buf, psz, n);
456}
457
458size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
459{
460 return wxWC2MB(buf, psz, n);
461}
e1bfe89e
RR
462
463// ----------------------------------------------------------------------------
532d575b 464// wxConvBrokenFileNames
e1bfe89e
RR
465// ----------------------------------------------------------------------------
466
eec47cc6
VZ
467#ifdef __UNIX__
468
86501081 469wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 470{
86501081
VS
471 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
472 wxStricmp(charset, _T("UTF8")) == 0 )
5deedd6e 473 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
474 else
475 m_conv = new wxCSConv(charset);
ea8ce907
RR
476}
477
eec47cc6 478#endif // __UNIX__
c12b7f79 479
bde4baac 480// ----------------------------------------------------------------------------
3698ae71 481// UTF-7
bde4baac 482// ----------------------------------------------------------------------------
6001e347 483
15f2ee32 484// Implementation (C) 2004 Fredrik Roubert
6001e347 485
15f2ee32
RN
//
// BASE64 decoding table: maps a byte to the 6 bit value it stands for or to
// 0xff if the byte is not one of the BASE64 characters
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x20..0x2f: only '+' (0x2b) and '/' (0x2f) are valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: digits '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: upper case letters 'A'..'Z'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: lower case letters 'a'..'z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: nothing is valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
524
525size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
526{
15f2ee32
RN
527 size_t len = 0;
528
04a37834 529 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
530 {
531 unsigned char cc = *psz++;
532 if (cc != '+')
533 {
534 // plain ASCII char
535 if (buf)
536 *buf++ = cc;
537 len++;
538 }
539 else if (*psz == '-')
540 {
541 // encoded plus sign
542 if (buf)
543 *buf++ = cc;
544 len++;
545 psz++;
546 }
04a37834 547 else // start of BASE64 encoded string
15f2ee32 548 {
04a37834 549 bool lsb, ok;
15f2ee32 550 unsigned int d, l;
04a37834
VZ
551 for ( ok = lsb = false, d = 0, l = 0;
552 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
553 psz++ )
15f2ee32
RN
554 {
555 d <<= 6;
556 d += cc;
557 for (l += 6; l >= 8; lsb = !lsb)
558 {
04a37834 559 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
560 if (lsb)
561 {
562 if (buf)
563 *buf++ |= c;
564 len ++;
565 }
566 else
04a37834 567 {
15f2ee32 568 if (buf)
6356d52a 569 *buf = (wchar_t)(c << 8);
04a37834
VZ
570 }
571
572 ok = true;
15f2ee32
RN
573 }
574 }
04a37834
VZ
575
576 if ( !ok )
577 {
578 // in valid UTF7 we should have valid characters after '+'
467e0479 579 return wxCONV_FAILED;
04a37834
VZ
580 }
581
15f2ee32
RN
582 if (*psz == '-')
583 psz++;
584 }
585 }
04a37834
VZ
586
587 if ( buf && (len < n) )
588 *buf = '\0';
589
15f2ee32 590 return len;
6001e347
RR
591}
592
15f2ee32
RN
//
// BASE64 encoding table: maps a 6 bit value to the character encoding it
//
static const unsigned char utf7enb64[] =
{
    // 0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
607
//
// UTF-7 encoding table giving the class of each ASCII character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
627
667e5b3e 628size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 629{
15f2ee32
RN
630 size_t len = 0;
631
632 while (*psz && ((!buf) || (len < n)))
633 {
634 wchar_t cc = *psz++;
635 if (cc < 0x80 && utf7encode[cc] < 1)
636 {
637 // plain ASCII char
638 if (buf)
639 *buf++ = (char)cc;
ef199164 640
15f2ee32
RN
641 len++;
642 }
643#ifndef WC_UTF16
79c78d42 644 else if (((wxUint32)cc) > 0xffff)
b2c13097 645 {
15f2ee32 646 // no surrogate pair generation (yet?)
467e0479 647 return wxCONV_FAILED;
15f2ee32
RN
648 }
649#endif
650 else
651 {
652 if (buf)
653 *buf++ = '+';
ef199164 654
15f2ee32
RN
655 len++;
656 if (cc != '+')
657 {
658 // BASE64 encode string
659 unsigned int lsb, d, l;
73c902d6 660 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
661 {
662 for (lsb = 0; lsb < 2; lsb ++)
663 {
664 d <<= 8;
665 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
666
667 for (l += 8; l >= 6; )
668 {
669 l -= 6;
670 if (buf)
671 *buf++ = utf7enb64[(d >> l) % 64];
672 len++;
673 }
674 }
ef199164 675
15f2ee32
RN
676 cc = *psz;
677 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
678 break;
679 }
ef199164 680
15f2ee32
RN
681 if (l != 0)
682 {
683 if (buf)
684 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
ef199164 685
15f2ee32
RN
686 len++;
687 }
688 }
ef199164 689
15f2ee32
RN
690 if (buf)
691 *buf++ = '-';
692 len++;
693 }
694 }
ef199164 695
15f2ee32
RN
696 if (buf && (len < n))
697 *buf = 0;
ef199164 698
15f2ee32 699 return len;
6001e347
RR
700}
701
f6bcfd97 702// ----------------------------------------------------------------------------
6001e347 703// UTF-8
f6bcfd97 704// ----------------------------------------------------------------------------
6001e347 705
1774c3c5 706static const wxUint32 utf8_max[]=
4def3b35 707 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 708
3698ae71
VZ
709// boundaries of the private use area we use to (temporarily) remap invalid
710// characters invalid in a UTF-8 encoded string
ea8ce907
RR
711const wxUint32 wxUnicodePUA = 0x100000;
712const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
713
// this table gives the length of the UTF-8 encoding from its first character,
// 0 means that the byte can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0 and C1 are invalid too, C2..DF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4: four-byte sequences; F5..FF are invalid again (5- or 6-byte
    // sequences and sequences for code points above U+10FFFF, as restricted
    // by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
748
749size_t
750wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
751 const char *src, size_t srcLen) const
752{
753 wchar_t *out = dstLen ? dst : NULL;
754 size_t written = 0;
755
756 if ( srcLen == wxNO_LEN )
757 srcLen = strlen(src) + 1;
758
759 for ( const char *p = src; ; p++ )
760 {
761 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
762 {
763 // all done successfully, just add the trailing NULL if we are not
764 // using explicit length
765 if ( srcLen == wxNO_LEN )
766 {
767 if ( out )
768 {
769 if ( !dstLen )
770 break;
771
772 *out = L'\0';
773 }
774
775 written++;
776 }
777
778 return written;
779 }
780
781 unsigned char c = *p;
782 unsigned len = tableUtf8Lengths[c];
783 if ( !len )
784 break;
785
786 if ( srcLen < len ) // the test works for wxNO_LEN too
787 break;
788
789 if ( srcLen != wxNO_LEN )
790 srcLen -= len;
791
792 if ( out && !dstLen-- )
793 break;
794
795
796 // Char. number range | UTF-8 octet sequence
797 // (hexadecimal) | (binary)
798 // ----------------------+---------------------------------------------
799 // 0000 0000 - 0000 007F | 0xxxxxxx
800 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
801 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
802 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
803 //
804 // Code point value is stored in bits marked with 'x', lowest-order bit
805 // of the value on the right side in the diagram above.
806 // (from RFC 3629)
807
808 // mask to extract lead byte's value ('x' bits above), by sequence length:
809 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
810
811 // mask and value of lead byte's most significant bits, by length:
812 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
813 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
814
815 len--; // it's more convenient to work with 0-based length here
816
817 // extract the lead byte's value bits:
818 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
819 break;
820
821 wxUint32 code = c & leadValueMask[len];
822
823 // all remaining bytes, if any, are handled in the same way regardless of
824 // sequence's length:
825 for ( ; len; --len )
826 {
827 c = *++p;
828 if ( (c & 0xC0) != 0x80 )
829 return wxCONV_FAILED;
830
831 code <<= 6;
832 code |= c & 0x3F;
833 }
834
835#ifdef WC_UTF16
836 // cast is ok because wchar_t == wxUint16 if WC_UTF16
837 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
838 {
839 if ( out )
840 out++;
841 written++;
842 }
843#else // !WC_UTF16
844 if ( out )
845 *out = code;
846#endif // WC_UTF16/!WC_UTF16
847
848 if ( out )
849 out++;
850
851 written++;
852 }
853
854 return wxCONV_FAILED;
855}
856
857size_t
858wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
859 const wchar_t *src, size_t srcLen) const
860{
861 char *out = dstLen ? dst : NULL;
862 size_t written = 0;
863
864 for ( const wchar_t *wp = src; ; wp++ )
865 {
866 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
867 {
868 // all done successfully, just add the trailing NULL if we are not
869 // using explicit length
870 if ( srcLen == wxNO_LEN )
871 {
872 if ( out )
873 {
874 if ( !dstLen )
875 break;
876
877 *out = '\0';
878 }
879
880 written++;
881 }
882
883 return written;
884 }
885
886
887 wxUint32 code;
888#ifdef WC_UTF16
889 // cast is ok for WC_UTF16
890 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
891 {
892 // skip the next char too as we decoded a surrogate
893 wp++;
894 }
895#else // wchar_t is UTF-32
896 code = *wp & 0x7fffffff;
897#endif
898
899 unsigned len;
900 if ( code <= 0x7F )
901 {
902 len = 1;
903 if ( out )
904 {
905 if ( dstLen < len )
906 break;
907
908 out[0] = (char)code;
909 }
910 }
911 else if ( code <= 0x07FF )
912 {
913 len = 2;
914 if ( out )
915 {
916 if ( dstLen < len )
917 break;
918
919 // NB: this line takes 6 least significant bits, encodes them as
920 // 10xxxxxx and discards them so that the next byte can be encoded:
921 out[1] = 0x80 | (code & 0x3F); code >>= 6;
922 out[0] = 0xC0 | code;
923 }
924 }
925 else if ( code < 0xFFFF )
926 {
927 len = 3;
928 if ( out )
929 {
930 if ( dstLen < len )
931 break;
932
933 out[2] = 0x80 | (code & 0x3F); code >>= 6;
934 out[1] = 0x80 | (code & 0x3F); code >>= 6;
935 out[0] = 0xE0 | code;
936 }
937 }
938 else if ( code <= 0x10FFFF )
939 {
940 len = 4;
941 if ( out )
942 {
943 if ( dstLen < len )
944 break;
945
946 out[3] = 0x80 | (code & 0x3F); code >>= 6;
947 out[2] = 0x80 | (code & 0x3F); code >>= 6;
948 out[1] = 0x80 | (code & 0x3F); code >>= 6;
949 out[0] = 0xF0 | code;
950 }
951 }
952 else
953 {
954 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
955 break;
956 }
957
958 if ( out )
959 {
960 out += len;
961 dstLen -= len;
962 }
963
964 written += len;
965 }
966
967 // we only get here if an error occurs during decoding
968 return wxCONV_FAILED;
969}
970
6001e347
RR
971size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
972{
0286d08d
VZ
973 if ( m_options == MAP_INVALID_UTF8_NOT )
974 return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
975
4def3b35
VS
976 size_t len = 0;
977
dccce9ea 978 while (*psz && ((!buf) || (len < n)))
4def3b35 979 {
ea8ce907
RR
980 const char *opsz = psz;
981 bool invalid = false;
4def3b35
VS
982 unsigned char cc = *psz++, fc = cc;
983 unsigned cnt;
dccce9ea 984 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 985 fc <<= 1;
ef199164 986
dccce9ea 987 if (!cnt)
4def3b35
VS
988 {
989 // plain ASCII char
dccce9ea 990 if (buf)
4def3b35
VS
991 *buf++ = cc;
992 len++;
561488ef
MW
993
994 // escape the escape character for octal escapes
995 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
996 && cc == '\\' && (!buf || len < n))
997 {
998 if (buf)
999 *buf++ = cc;
1000 len++;
1001 }
dccce9ea
VZ
1002 }
1003 else
4def3b35
VS
1004 {
1005 cnt--;
dccce9ea 1006 if (!cnt)
4def3b35
VS
1007 {
1008 // invalid UTF-8 sequence
ea8ce907 1009 invalid = true;
dccce9ea
VZ
1010 }
1011 else
4def3b35
VS
1012 {
1013 unsigned ocnt = cnt - 1;
1014 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1015 while (cnt--)
4def3b35 1016 {
ea8ce907 1017 cc = *psz;
dccce9ea 1018 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1019 {
1020 // invalid UTF-8 sequence
ea8ce907
RR
1021 invalid = true;
1022 break;
4def3b35 1023 }
ef199164 1024
ea8ce907 1025 psz++;
4def3b35
VS
1026 res = (res << 6) | (cc & 0x3f);
1027 }
ef199164 1028
ea8ce907 1029 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1030 {
1031 // illegal UTF-8 encoding
ea8ce907 1032 invalid = true;
4def3b35 1033 }
ea8ce907
RR
1034 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1035 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1036 {
1037 // if one of our PUA characters turns up externally
1038 // it must also be treated as an illegal sequence
1039 // (a bit like you have to escape an escape character)
1040 invalid = true;
1041 }
1042 else
1043 {
1cd52418 1044#ifdef WC_UTF16
0286d08d 1045 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1046 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1047 if (pa == wxCONV_FAILED)
ea8ce907
RR
1048 {
1049 invalid = true;
1050 }
1051 else
1052 {
1053 if (buf)
1054 buf += pa;
1055 len += pa;
1056 }
373658eb 1057#else // !WC_UTF16
ea8ce907 1058 if (buf)
38d4b1e4 1059 *buf++ = (wchar_t)res;
ea8ce907 1060 len++;
373658eb 1061#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1062 }
1063 }
ef199164 1064
ea8ce907
RR
1065 if (invalid)
1066 {
1067 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1068 {
1069 while (opsz < psz && (!buf || len < n))
1070 {
1071#ifdef WC_UTF16
1072 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1073 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1074 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1075 if (buf)
1076 buf += pa;
1077 opsz++;
1078 len += pa;
1079#else
1080 if (buf)
38d4b1e4 1081 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1082 opsz++;
1083 len++;
1084#endif
1085 }
1086 }
3698ae71 1087 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1088 {
1089 while (opsz < psz && (!buf || len < n))
1090 {
3698ae71
VZ
1091 if ( buf && len + 3 < n )
1092 {
17a1ebd1 1093 unsigned char on = *opsz;
3698ae71 1094 *buf++ = L'\\';
17a1ebd1
VZ
1095 *buf++ = (wchar_t)( L'0' + on / 0100 );
1096 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1097 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1098 }
ef199164 1099
ea8ce907
RR
1100 opsz++;
1101 len += 4;
1102 }
1103 }
3698ae71 1104 else // MAP_INVALID_UTF8_NOT
ea8ce907 1105 {
467e0479 1106 return wxCONV_FAILED;
ea8ce907 1107 }
4def3b35
VS
1108 }
1109 }
6001e347 1110 }
ef199164 1111
dccce9ea 1112 if (buf && (len < n))
4def3b35 1113 *buf = 0;
ef199164 1114
4def3b35 1115 return len;
6001e347
RR
1116}
1117
3698ae71
VZ
// Tell whether the given wide character is an octal digit (L'0' to L'7').
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1122
6001e347
RR
1123size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1124{
0286d08d
VZ
1125 if ( m_options == MAP_INVALID_UTF8_NOT )
1126 return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
1127
4def3b35 1128 size_t len = 0;
6001e347 1129
dccce9ea 1130 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
1131 {
1132 wxUint32 cc;
ef199164 1133
1cd52418 1134#ifdef WC_UTF16
b5153fd8
VZ
1135 // cast is ok for WC_UTF16
1136 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1137 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1138#else
ef199164 1139 cc = (*psz++) & 0x7fffffff;
4def3b35 1140#endif
3698ae71
VZ
1141
1142 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1143 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1144 {
dccce9ea 1145 if (buf)
ea8ce907 1146 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1147 len++;
3698ae71 1148 }
561488ef
MW
1149 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1150 && cc == L'\\' && psz[0] == L'\\' )
1151 {
1152 if (buf)
1153 *buf++ = (char)cc;
1154 psz++;
1155 len++;
1156 }
3698ae71
VZ
1157 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1158 cc == L'\\' &&
1159 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1160 {
dccce9ea 1161 if (buf)
3698ae71 1162 {
ef199164
DS
1163 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1164 (psz[1] - L'0') * 010 +
b2c13097 1165 (psz[2] - L'0'));
3698ae71
VZ
1166 }
1167
1168 psz += 3;
ea8ce907
RR
1169 len++;
1170 }
1171 else
1172 {
1173 unsigned cnt;
ef199164
DS
1174 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1175 {
1176 }
1177
ea8ce907 1178 if (!cnt)
4def3b35 1179 {
ea8ce907
RR
1180 // plain ASCII char
1181 if (buf)
1182 *buf++ = (char) cc;
1183 len++;
1184 }
ea8ce907
RR
1185 else
1186 {
1187 len += cnt + 1;
1188 if (buf)
1189 {
1190 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1191 while (cnt--)
1192 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1193 }
4def3b35
VS
1194 }
1195 }
6001e347 1196 }
4def3b35 1197
ef199164 1198 if (buf && (len < n))
3698ae71 1199 *buf = 0;
adb45366 1200
4def3b35 1201 return len;
6001e347
RR
1202}
1203
467e0479 1204// ============================================================================
c91830cb 1205// UTF-16
467e0479 1206// ============================================================================
c91830cb
VZ
1207
1208#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1209 #define wxMBConvUTF16straight wxMBConvUTF16BE
1210 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1211#else
bde4baac
VZ
1212 #define wxMBConvUTF16swap wxMBConvUTF16BE
1213 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1214#endif
1215
467e0479
VZ
1216/* static */
1217size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1218{
1219 if ( srcLen == wxNO_LEN )
1220 {
1221 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1222 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1223 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1224 ;
c91830cb 1225
467e0479
VZ
1226 srcLen *= BYTES_PER_CHAR;
1227 }
1228 else // we already have the length
1229 {
1230 // we can only convert an entire number of UTF-16 characters
1231 if ( srcLen % BYTES_PER_CHAR )
1232 return wxCONV_FAILED;
1233 }
1234
1235 return srcLen;
1236}
1237
1238// case when in-memory representation is UTF-16 too
c91830cb
VZ
1239#ifdef WC_UTF16
1240
467e0479
VZ
1241// ----------------------------------------------------------------------------
1242// conversions without endianness change
1243// ----------------------------------------------------------------------------
1244
1245size_t
1246wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1247 const char *src, size_t srcLen) const
c91830cb 1248{
467e0479
VZ
1249 // set up the scene for using memcpy() (which is presumably more efficient
1250 // than copying the bytes one by one)
1251 srcLen = GetLength(src, srcLen);
1252 if ( srcLen == wxNO_LEN )
1253 return wxCONV_FAILED;
c91830cb 1254
ef199164 1255 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1256 if ( dst )
c91830cb 1257 {
467e0479
VZ
1258 if ( dstLen < inLen )
1259 return wxCONV_FAILED;
c91830cb 1260
467e0479 1261 memcpy(dst, src, srcLen);
c91830cb 1262 }
d32a507d 1263
467e0479 1264 return inLen;
c91830cb
VZ
1265}
1266
467e0479
VZ
1267size_t
1268wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1269 const wchar_t *src, size_t srcLen) const
c91830cb 1270{
467e0479
VZ
1271 if ( srcLen == wxNO_LEN )
1272 srcLen = wxWcslen(src) + 1;
c91830cb 1273
467e0479
VZ
1274 srcLen *= BYTES_PER_CHAR;
1275
1276 if ( dst )
c91830cb 1277 {
467e0479
VZ
1278 if ( dstLen < srcLen )
1279 return wxCONV_FAILED;
d32a507d 1280
467e0479 1281 memcpy(dst, src, srcLen);
c91830cb 1282 }
d32a507d 1283
467e0479 1284 return srcLen;
c91830cb
VZ
1285}
1286
467e0479
VZ
1287// ----------------------------------------------------------------------------
1288// endian-reversing conversions
1289// ----------------------------------------------------------------------------
c91830cb 1290
467e0479
VZ
1291size_t
1292wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1293 const char *src, size_t srcLen) const
c91830cb 1294{
467e0479
VZ
1295 srcLen = GetLength(src, srcLen);
1296 if ( srcLen == wxNO_LEN )
1297 return wxCONV_FAILED;
c91830cb 1298
467e0479
VZ
1299 srcLen /= BYTES_PER_CHAR;
1300
1301 if ( dst )
c91830cb 1302 {
467e0479
VZ
1303 if ( dstLen < srcLen )
1304 return wxCONV_FAILED;
1305
ef199164
DS
1306 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1307 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1308 {
ef199164 1309 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1310 }
c91830cb 1311 }
bfab25d4 1312
467e0479 1313 return srcLen;
c91830cb
VZ
1314}
1315
467e0479
VZ
1316size_t
1317wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1318 const wchar_t *src, size_t srcLen) const
c91830cb 1319{
467e0479
VZ
1320 if ( srcLen == wxNO_LEN )
1321 srcLen = wxWcslen(src) + 1;
c91830cb 1322
467e0479
VZ
1323 srcLen *= BYTES_PER_CHAR;
1324
1325 if ( dst )
c91830cb 1326 {
467e0479
VZ
1327 if ( dstLen < srcLen )
1328 return wxCONV_FAILED;
1329
ef199164 1330 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1331 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1332 {
ef199164 1333 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1334 }
c91830cb 1335 }
eec47cc6 1336
467e0479 1337 return srcLen;
c91830cb
VZ
1338}
1339
467e0479 1340#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1341
467e0479
VZ
1342// ----------------------------------------------------------------------------
1343// conversions without endianness change
1344// ----------------------------------------------------------------------------
c91830cb 1345
35d11700
VZ
1346size_t
1347wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1348 const char *src, size_t srcLen) const
c91830cb 1349{
35d11700
VZ
1350 srcLen = GetLength(src, srcLen);
1351 if ( srcLen == wxNO_LEN )
1352 return wxCONV_FAILED;
c91830cb 1353
ef199164 1354 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1355 if ( !dst )
c91830cb 1356 {
35d11700
VZ
1357 // optimization: return maximal space which could be needed for this
1358 // string even if the real size could be smaller if the buffer contains
1359 // any surrogates
1360 return inLen;
c91830cb 1361 }
c91830cb 1362
35d11700 1363 size_t outLen = 0;
ef199164
DS
1364 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1365 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1366 {
ef199164
DS
1367 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1368 if ( !inBuff )
35d11700
VZ
1369 return wxCONV_FAILED;
1370
1371 if ( ++outLen > dstLen )
1372 return wxCONV_FAILED;
c91830cb 1373
35d11700
VZ
1374 *dst++ = ch;
1375 }
1376
1377
1378 return outLen;
1379}
c91830cb 1380
35d11700
VZ
1381size_t
1382wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1383 const wchar_t *src, size_t srcLen) const
c91830cb 1384{
35d11700
VZ
1385 if ( srcLen == wxNO_LEN )
1386 srcLen = wxWcslen(src) + 1;
c91830cb 1387
35d11700 1388 size_t outLen = 0;
ef199164 1389 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1390 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1391 {
1392 wxUint16 cc[2];
35d11700
VZ
1393 const size_t numChars = encode_utf16(*src++, cc);
1394 if ( numChars == wxCONV_FAILED )
1395 return wxCONV_FAILED;
c91830cb 1396
ef199164
DS
1397 outLen += numChars * BYTES_PER_CHAR;
1398 if ( outBuff )
c91830cb 1399 {
35d11700
VZ
1400 if ( outLen > dstLen )
1401 return wxCONV_FAILED;
1402
ef199164 1403 *outBuff++ = cc[0];
35d11700 1404 if ( numChars == 2 )
69b80d28 1405 {
35d11700 1406 // second character of a surrogate
ef199164 1407 *outBuff++ = cc[1];
69b80d28 1408 }
c91830cb 1409 }
c91830cb 1410 }
c91830cb 1411
35d11700 1412 return outLen;
c91830cb
VZ
1413}
1414
467e0479
VZ
1415// ----------------------------------------------------------------------------
1416// endian-reversing conversions
1417// ----------------------------------------------------------------------------
c91830cb 1418
35d11700
VZ
1419size_t
1420wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1421 const char *src, size_t srcLen) const
c91830cb 1422{
35d11700
VZ
1423 srcLen = GetLength(src, srcLen);
1424 if ( srcLen == wxNO_LEN )
1425 return wxCONV_FAILED;
1426
ef199164 1427 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1428 if ( !dst )
1429 {
1430 // optimization: return maximal space which could be needed for this
1431 // string even if the real size could be smaller if the buffer contains
1432 // any surrogates
1433 return inLen;
1434 }
c91830cb 1435
35d11700 1436 size_t outLen = 0;
ef199164
DS
1437 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1438 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1439 {
35d11700
VZ
1440 wxUint32 ch;
1441 wxUint16 tmp[2];
ef199164
DS
1442
1443 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1444 inBuff++;
1445 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1446
35d11700
VZ
1447 const size_t numChars = decode_utf16(tmp, ch);
1448 if ( numChars == wxCONV_FAILED )
1449 return wxCONV_FAILED;
c91830cb 1450
35d11700 1451 if ( numChars == 2 )
ef199164 1452 inBuff++;
35d11700
VZ
1453
1454 if ( ++outLen > dstLen )
1455 return wxCONV_FAILED;
c91830cb 1456
35d11700 1457 *dst++ = ch;
c91830cb 1458 }
c91830cb 1459
c91830cb 1460
35d11700
VZ
1461 return outLen;
1462}
c91830cb 1463
35d11700
VZ
1464size_t
1465wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1466 const wchar_t *src, size_t srcLen) const
c91830cb 1467{
35d11700
VZ
1468 if ( srcLen == wxNO_LEN )
1469 srcLen = wxWcslen(src) + 1;
c91830cb 1470
35d11700 1471 size_t outLen = 0;
ef199164 1472 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1473 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1474 {
1475 wxUint16 cc[2];
35d11700
VZ
1476 const size_t numChars = encode_utf16(*src, cc);
1477 if ( numChars == wxCONV_FAILED )
1478 return wxCONV_FAILED;
c91830cb 1479
ef199164
DS
1480 outLen += numChars * BYTES_PER_CHAR;
1481 if ( outBuff )
c91830cb 1482 {
35d11700
VZ
1483 if ( outLen > dstLen )
1484 return wxCONV_FAILED;
1485
ef199164 1486 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1487 if ( numChars == 2 )
c91830cb 1488 {
35d11700 1489 // second character of a surrogate
ef199164 1490 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1491 }
1492 }
c91830cb 1493 }
c91830cb 1494
35d11700 1495 return outLen;
c91830cb
VZ
1496}
1497
467e0479 1498#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1499
1500
35d11700 1501// ============================================================================
c91830cb 1502// UTF-32
35d11700 1503// ============================================================================
c91830cb
VZ
1504
1505#ifdef WORDS_BIGENDIAN
467e0479
VZ
1506 #define wxMBConvUTF32straight wxMBConvUTF32BE
1507 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1508#else
467e0479
VZ
1509 #define wxMBConvUTF32swap wxMBConvUTF32BE
1510 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1511#endif
1512
1513
1514WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1515WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1516
467e0479
VZ
1517/* static */
1518size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1519{
1520 if ( srcLen == wxNO_LEN )
1521 {
1522 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1523 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1524 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1525 ;
c91830cb 1526
467e0479
VZ
1527 srcLen *= BYTES_PER_CHAR;
1528 }
1529 else // we already have the length
1530 {
1531 // we can only convert an entire number of UTF-32 characters
1532 if ( srcLen % BYTES_PER_CHAR )
1533 return wxCONV_FAILED;
1534 }
1535
1536 return srcLen;
1537}
1538
1539// case when in-memory representation is UTF-16
c91830cb
VZ
1540#ifdef WC_UTF16
1541
467e0479
VZ
1542// ----------------------------------------------------------------------------
1543// conversions without endianness change
1544// ----------------------------------------------------------------------------
1545
1546size_t
1547wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1548 const char *src, size_t srcLen) const
c91830cb 1549{
467e0479
VZ
1550 srcLen = GetLength(src, srcLen);
1551 if ( srcLen == wxNO_LEN )
1552 return wxCONV_FAILED;
c91830cb 1553
ef199164
DS
1554 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1555 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1556 size_t outLen = 0;
1557 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1558 {
1559 wxUint16 cc[2];
ef199164 1560 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1561 if ( numChars == wxCONV_FAILED )
1562 return wxCONV_FAILED;
c91830cb 1563
467e0479
VZ
1564 outLen += numChars;
1565 if ( dst )
c91830cb 1566 {
467e0479
VZ
1567 if ( outLen > dstLen )
1568 return wxCONV_FAILED;
d32a507d 1569
467e0479
VZ
1570 *dst++ = cc[0];
1571 if ( numChars == 2 )
1572 {
1573 // second character of a surrogate
1574 *dst++ = cc[1];
1575 }
1576 }
c91830cb 1577 }
d32a507d 1578
467e0479 1579 return outLen;
c91830cb
VZ
1580}
1581
467e0479
VZ
1582size_t
1583wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1584 const wchar_t *src, size_t srcLen) const
c91830cb 1585{
467e0479
VZ
1586 if ( srcLen == wxNO_LEN )
1587 srcLen = wxWcslen(src) + 1;
c91830cb 1588
467e0479 1589 if ( !dst )
c91830cb 1590 {
467e0479
VZ
1591 // optimization: return maximal space which could be needed for this
1592 // string instead of the exact amount which could be less if there are
1593 // any surrogates in the input
1594 //
1595 // we consider that surrogates are rare enough to make it worthwhile to
1596 // avoid running the loop below at the cost of slightly extra memory
1597 // consumption
ef199164 1598 return srcLen * BYTES_PER_CHAR;
467e0479 1599 }
c91830cb 1600
ef199164 1601 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1602 size_t outLen = 0;
1603 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1604 {
1605 const wxUint32 ch = wxDecodeSurrogate(&src);
1606 if ( !src )
1607 return wxCONV_FAILED;
c91830cb 1608
467e0479 1609 outLen += BYTES_PER_CHAR;
d32a507d 1610
467e0479
VZ
1611 if ( outLen > dstLen )
1612 return wxCONV_FAILED;
b5153fd8 1613
ef199164 1614 *outBuff++ = ch;
467e0479 1615 }
c91830cb 1616
467e0479 1617 return outLen;
c91830cb
VZ
1618}
1619
467e0479
VZ
1620// ----------------------------------------------------------------------------
1621// endian-reversing conversions
1622// ----------------------------------------------------------------------------
c91830cb 1623
467e0479
VZ
1624size_t
1625wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1626 const char *src, size_t srcLen) const
c91830cb 1627{
467e0479
VZ
1628 srcLen = GetLength(src, srcLen);
1629 if ( srcLen == wxNO_LEN )
1630 return wxCONV_FAILED;
c91830cb 1631
ef199164
DS
1632 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1633 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1634 size_t outLen = 0;
ef199164 1635 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1636 {
c91830cb 1637 wxUint16 cc[2];
ef199164 1638 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1639 if ( numChars == wxCONV_FAILED )
1640 return wxCONV_FAILED;
c91830cb 1641
467e0479
VZ
1642 outLen += numChars;
1643 if ( dst )
c91830cb 1644 {
467e0479
VZ
1645 if ( outLen > dstLen )
1646 return wxCONV_FAILED;
d32a507d 1647
467e0479
VZ
1648 *dst++ = cc[0];
1649 if ( numChars == 2 )
1650 {
1651 // second character of a surrogate
1652 *dst++ = cc[1];
1653 }
1654 }
c91830cb 1655 }
b5153fd8 1656
467e0479 1657 return outLen;
c91830cb
VZ
1658}
1659
467e0479
VZ
1660size_t
1661wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1662 const wchar_t *src, size_t srcLen) const
c91830cb 1663{
467e0479
VZ
1664 if ( srcLen == wxNO_LEN )
1665 srcLen = wxWcslen(src) + 1;
c91830cb 1666
467e0479 1667 if ( !dst )
c91830cb 1668 {
467e0479
VZ
1669 // optimization: return maximal space which could be needed for this
1670 // string instead of the exact amount which could be less if there are
1671 // any surrogates in the input
1672 //
1673 // we consider that surrogates are rare enough to make it worthwhile to
1674 // avoid running the loop below at the cost of slightly extra memory
1675 // consumption
1676 return srcLen*BYTES_PER_CHAR;
1677 }
c91830cb 1678
ef199164 1679 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1680 size_t outLen = 0;
1681 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1682 {
1683 const wxUint32 ch = wxDecodeSurrogate(&src);
1684 if ( !src )
1685 return wxCONV_FAILED;
c91830cb 1686
467e0479 1687 outLen += BYTES_PER_CHAR;
d32a507d 1688
467e0479
VZ
1689 if ( outLen > dstLen )
1690 return wxCONV_FAILED;
b5153fd8 1691
ef199164 1692 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1693 }
c91830cb 1694
467e0479 1695 return outLen;
c91830cb
VZ
1696}
1697
467e0479 1698#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1699
35d11700
VZ
1700// ----------------------------------------------------------------------------
1701// conversions without endianness change
1702// ----------------------------------------------------------------------------
1703
1704size_t
1705wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1706 const char *src, size_t srcLen) const
c91830cb 1707{
35d11700
VZ
1708 // use memcpy() as it should be much faster than hand-written loop
1709 srcLen = GetLength(src, srcLen);
1710 if ( srcLen == wxNO_LEN )
1711 return wxCONV_FAILED;
c91830cb 1712
35d11700
VZ
1713 const size_t inLen = srcLen/BYTES_PER_CHAR;
1714 if ( dst )
c91830cb 1715 {
35d11700
VZ
1716 if ( dstLen < inLen )
1717 return wxCONV_FAILED;
b5153fd8 1718
35d11700
VZ
1719 memcpy(dst, src, srcLen);
1720 }
c91830cb 1721
35d11700 1722 return inLen;
c91830cb
VZ
1723}
1724
35d11700
VZ
1725size_t
1726wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1727 const wchar_t *src, size_t srcLen) const
c91830cb 1728{
35d11700
VZ
1729 if ( srcLen == wxNO_LEN )
1730 srcLen = wxWcslen(src) + 1;
1731
1732 srcLen *= BYTES_PER_CHAR;
c91830cb 1733
35d11700 1734 if ( dst )
c91830cb 1735 {
35d11700
VZ
1736 if ( dstLen < srcLen )
1737 return wxCONV_FAILED;
c91830cb 1738
35d11700 1739 memcpy(dst, src, srcLen);
c91830cb
VZ
1740 }
1741
35d11700 1742 return srcLen;
c91830cb
VZ
1743}
1744
35d11700
VZ
1745// ----------------------------------------------------------------------------
1746// endian-reversing conversions
1747// ----------------------------------------------------------------------------
c91830cb 1748
35d11700
VZ
1749size_t
1750wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1751 const char *src, size_t srcLen) const
c91830cb 1752{
35d11700
VZ
1753 srcLen = GetLength(src, srcLen);
1754 if ( srcLen == wxNO_LEN )
1755 return wxCONV_FAILED;
1756
1757 srcLen /= BYTES_PER_CHAR;
c91830cb 1758
35d11700 1759 if ( dst )
c91830cb 1760 {
35d11700
VZ
1761 if ( dstLen < srcLen )
1762 return wxCONV_FAILED;
1763
ef199164
DS
1764 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1765 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1766 {
ef199164 1767 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1768 }
c91830cb 1769 }
b5153fd8 1770
35d11700 1771 return srcLen;
c91830cb
VZ
1772}
1773
35d11700
VZ
1774size_t
1775wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1776 const wchar_t *src, size_t srcLen) const
c91830cb 1777{
35d11700
VZ
1778 if ( srcLen == wxNO_LEN )
1779 srcLen = wxWcslen(src) + 1;
1780
1781 srcLen *= BYTES_PER_CHAR;
c91830cb 1782
35d11700 1783 if ( dst )
c91830cb 1784 {
35d11700
VZ
1785 if ( dstLen < srcLen )
1786 return wxCONV_FAILED;
1787
ef199164 1788 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1789 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1790 {
ef199164 1791 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1792 }
c91830cb 1793 }
b5153fd8 1794
35d11700 1795 return srcLen;
c91830cb
VZ
1796}
1797
467e0479 1798#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1799
1800
36acb880
VZ
1801// ============================================================================
1802// The classes doing conversion using the iconv_xxx() functions
1803// ============================================================================
3caec1bb 1804
b040e242 1805#ifdef HAVE_ICONV
3a0d76bc 1806
b1d547eb
VS
1807// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1808// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1809// (unless there's yet another bug in glibc) the only case when iconv()
1810// returns with (size_t)-1 (which means error) and says there are 0 bytes
1811// left in the input buffer -- when _real_ error occurs,
1812// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1813// iconv() failure.
3caec1bb
VS
1814// [This bug does not appear in glibc 2.2.]
1815#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1816#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1817 (errno != E2BIG || bufLeft != 0))
1818#else
1819#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1820#endif
1821
ab217dba 1822#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1823
74a7eb0b
VZ
1824#define ICONV_T_INVALID ((iconv_t)-1)
1825
1826#if SIZEOF_WCHAR_T == 4
1827 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1828 #define WC_ENC wxFONTENCODING_UTF32
1829#elif SIZEOF_WCHAR_T == 2
1830 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1831 #define WC_ENC wxFONTENCODING_UTF16
1832#else // sizeof(wchar_t) != 2 nor 4
1833 // does this ever happen?
1834 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1835#endif
1836
36acb880 1837// ----------------------------------------------------------------------------
e95354ec 1838// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1839// ----------------------------------------------------------------------------
1840
e95354ec 1841class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1842{
1843public:
86501081 1844 wxMBConv_iconv(const char *name);
e95354ec 1845 virtual ~wxMBConv_iconv();
36acb880 1846
bde4baac
VZ
1847 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1848 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1849
d36c9347 1850 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
7ef3ab50
VZ
1851 virtual size_t GetMBNulLen() const;
1852
ba98e032
VS
1853#if wxUSE_UNICODE_UTF8
1854 virtual bool IsUTF8() const;
1855#endif
1856
d36c9347
VZ
1857 virtual wxMBConv *Clone() const
1858 {
86501081 1859 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
1860 p->m_minMBCharWidth = m_minMBCharWidth;
1861 return p;
1862 }
1863
e95354ec 1864 bool IsOk() const
74a7eb0b 1865 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1866
1867protected:
ef199164
DS
1868 // the iconv handlers used to translate from multibyte
1869 // to wide char and in the other direction
36acb880
VZ
1870 iconv_t m2w,
1871 w2m;
ef199164 1872
b1d547eb
VS
1873#if wxUSE_THREADS
1874 // guards access to m2w and w2m objects
1875 wxMutex m_iconvMutex;
1876#endif
36acb880
VZ
1877
1878private:
e95354ec 1879 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1880 // available on this machine, it will remain NULL
74a7eb0b 1881 static wxString ms_wcCharsetName;
36acb880
VZ
1882
1883 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1884 // different endian-ness than the native one
405d8f46 1885 static bool ms_wcNeedsSwap;
eec47cc6 1886
d36c9347
VZ
1887
1888 // name of the encoding handled by this conversion
1889 wxString m_name;
1890
7ef3ab50 1891 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
1892 // initially
1893 size_t m_minMBCharWidth;
36acb880
VZ
1894};
1895
8f115891 1896// make the constructor available for unit testing
86501081 1897WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
1898{
1899 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1900 if ( !result->IsOk() )
1901 {
1902 delete result;
1903 return 0;
1904 }
ef199164 1905
8f115891
MW
1906 return result;
1907}
1908
422e411e 1909wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1910bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1911
86501081 1912wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 1913 : m_name(name)
36acb880 1914{
c1464d9d 1915 m_minMBCharWidth = 0;
eec47cc6 1916
36acb880 1917 // check for charset that represents wchar_t:
74a7eb0b 1918 if ( ms_wcCharsetName.empty() )
f1339c56 1919 {
c2b83fdd
VZ
1920 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1921
74a7eb0b
VZ
1922#if wxUSE_FONTMAP
1923 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1924#else // !wxUSE_FONTMAP
91cb7f52 1925 static const wxChar *names_static[] =
36acb880 1926 {
74a7eb0b
VZ
1927#if SIZEOF_WCHAR_T == 4
1928 _T("UCS-4"),
1929#elif SIZEOF_WCHAR_T = 2
1930 _T("UCS-2"),
1931#endif
1932 NULL
1933 };
91cb7f52 1934 const wxChar **names = names_static;
74a7eb0b 1935#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1936
d1f024a8 1937 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1938 {
17a1ebd1 1939 const wxString nameCS(*names);
74a7eb0b
VZ
1940
1941 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1942 wxString nameXE(nameCS);
ef199164
DS
1943
1944#ifdef WORDS_BIGENDIAN
74a7eb0b 1945 nameXE += _T("BE");
ef199164 1946#else // little endian
74a7eb0b 1947 nameXE += _T("LE");
ef199164 1948#endif
74a7eb0b 1949
c2b83fdd
VZ
1950 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1951 nameXE.c_str());
1952
86501081 1953 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 1954 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1955 {
74a7eb0b 1956 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1957 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1958 nameCS.c_str());
86501081 1959 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 1960
74a7eb0b
VZ
1961 // and check for bytesex ourselves:
1962 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1963 {
74a7eb0b
VZ
1964 char buf[2], *bufPtr;
1965 wchar_t wbuf[2], *wbufPtr;
1966 size_t insz, outsz;
1967 size_t res;
1968
1969 buf[0] = 'A';
1970 buf[1] = 0;
1971 wbuf[0] = 0;
1972 insz = 2;
1973 outsz = SIZEOF_WCHAR_T * 2;
1974 wbufPtr = wbuf;
1975 bufPtr = buf;
1976
ef199164
DS
1977 res = iconv(
1978 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1979 (char**)&wbufPtr, &outsz);
74a7eb0b
VZ
1980
1981 if (ICONV_FAILED(res, insz))
1982 {
1983 wxLogLastError(wxT("iconv"));
422e411e 1984 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1985 nameCS.c_str());
74a7eb0b
VZ
1986 }
1987 else // ok, can convert to this encoding, remember it
1988 {
17a1ebd1 1989 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1990 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1991 }
3a0d76bc
VS
1992 }
1993 }
74a7eb0b 1994 else // use charset not requiring byte swapping
36acb880 1995 {
74a7eb0b 1996 ms_wcCharsetName = nameXE;
36acb880 1997 }
3a0d76bc 1998 }
74a7eb0b 1999
0944fceb 2000 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2001 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2002 ms_wcCharsetName.empty() ? wxString("<none>")
2003 : ms_wcCharsetName,
74a7eb0b
VZ
2004 ms_wcNeedsSwap ? _T(" (needs swap)")
2005 : _T(""));
3a0d76bc 2006 }
36acb880 2007 else // we already have ms_wcCharsetName
3caec1bb 2008 {
86501081 2009 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2010 }
dccce9ea 2011
74a7eb0b 2012 if ( ms_wcCharsetName.empty() )
f1339c56 2013 {
74a7eb0b 2014 w2m = ICONV_T_INVALID;
36acb880 2015 }
405d8f46
VZ
2016 else
2017 {
86501081 2018 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2019 if ( w2m == ICONV_T_INVALID )
2020 {
2021 wxLogTrace(TRACE_STRCONV,
2022 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2023 ms_wcCharsetName.c_str(), name);
74a7eb0b 2024 }
405d8f46 2025 }
36acb880 2026}
3caec1bb 2027
e95354ec 2028wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2029{
74a7eb0b 2030 if ( m2w != ICONV_T_INVALID )
36acb880 2031 iconv_close(m2w);
74a7eb0b 2032 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2033 iconv_close(w2m);
2034}
3a0d76bc 2035
bde4baac 2036size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 2037{
69373110
VZ
2038 // find the string length: notice that must be done differently for
2039 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2040 size_t inbuf;
7ef3ab50 2041 const size_t nulLen = GetMBNulLen();
69373110
VZ
2042 switch ( nulLen )
2043 {
2044 default:
467e0479 2045 return wxCONV_FAILED;
69373110
VZ
2046
2047 case 1:
2048 inbuf = strlen(psz); // arguably more optimized than our version
2049 break;
2050
2051 case 2:
2052 case 4:
2053 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2054 // they also have to start at character boundary and not span two
2055 // adjacent characters
2056 const char *p;
2057 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2058 ;
2059 inbuf = p - psz;
2060 break;
2061 }
2062
b1d547eb 2063#if wxUSE_THREADS
6a17b868
SN
2064 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2065 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2066 // wxConvLocal that are used all over wx code, so we have to make sure
2067 // the handle is used by at most one thread at the time. Otherwise
2068 // only a few wx classes would be safe to use from non-main threads
2069 // as MB<->WC conversion would fail "randomly".
2070 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2071#endif // wxUSE_THREADS
2072
36acb880
VZ
2073 size_t outbuf = n * SIZEOF_WCHAR_T;
2074 size_t res, cres;
2075 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2076 wchar_t *bufPtr = buf;
2077 const char *pszPtr = psz;
2078
2079 if (buf)
2080 {
2081 // have destination buffer, convert there
2082 cres = iconv(m2w,
2083 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2084 (char**)&bufPtr, &outbuf);
2085 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 2086
36acb880 2087 if (ms_wcNeedsSwap)
3a0d76bc 2088 {
36acb880 2089 // convert to native endianness
17a1ebd1
VZ
2090 for ( unsigned i = 0; i < res; i++ )
2091 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 2092 }
adb45366 2093
69373110 2094 // NUL-terminate the string if there is any space left
49dd9820
VS
2095 if (res < n)
2096 buf[res] = 0;
36acb880
VZ
2097 }
2098 else
2099 {
2100 // no destination buffer... convert using temp buffer
2101 // to calculate destination buffer requirement
2102 wchar_t tbuf[8];
2103 res = 0;
ef199164
DS
2104
2105 do
2106 {
36acb880 2107 bufPtr = tbuf;
ef199164 2108 outbuf = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2109
2110 cres = iconv(m2w,
2111 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2112 (char**)&bufPtr, &outbuf );
2113
ef199164
DS
2114 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2115 }
2116 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2117 }
dccce9ea 2118
36acb880 2119 if (ICONV_FAILED(cres, inbuf))
f1339c56 2120 {
36acb880 2121 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2122 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2123 return wxCONV_FAILED;
36acb880
VZ
2124 }
2125
2126 return res;
2127}
2128
bde4baac 2129size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 2130{
b1d547eb
VS
2131#if wxUSE_THREADS
2132 // NB: explained in MB2WC
2133 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2134#endif
3698ae71 2135
156162ec
MW
2136 size_t inlen = wxWcslen(psz);
2137 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
2138 size_t outbuf = n;
2139 size_t res, cres;
3a0d76bc 2140
36acb880 2141 wchar_t *tmpbuf = 0;
3caec1bb 2142
36acb880
VZ
2143 if (ms_wcNeedsSwap)
2144 {
2145 // need to copy to temp buffer to switch endianness
74a7eb0b 2146 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 2147 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 2148 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
2149 for ( size_t i = 0; i < inlen; i++ )
2150 tmpbuf[n] = WC_BSWAP(psz[i]);
ef199164 2151
156162ec 2152 tmpbuf[inlen] = L'\0';
74a7eb0b 2153 psz = tmpbuf;
36acb880 2154 }
3a0d76bc 2155
36acb880
VZ
2156 if (buf)
2157 {
2158 // have destination buffer, convert there
2159 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 2160
ef199164 2161 res = n - outbuf;
adb45366 2162
49dd9820
VS
2163 // NB: iconv was given only wcslen(psz) characters on input, and so
2164 // it couldn't convert the trailing zero. Let's do it ourselves
2165 // if there's some room left for it in the output buffer.
2166 if (res < n)
2167 buf[0] = 0;
36acb880
VZ
2168 }
2169 else
2170 {
ef199164 2171 // no destination buffer: convert using temp buffer
36acb880
VZ
2172 // to calculate destination buffer requirement
2173 char tbuf[16];
2174 res = 0;
ef199164
DS
2175 do
2176 {
2177 buf = tbuf;
2178 outbuf = 16;
36acb880
VZ
2179
2180 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 2181
36acb880 2182 res += 16 - outbuf;
ef199164
DS
2183 }
2184 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2185 }
dccce9ea 2186
36acb880
VZ
2187 if (ms_wcNeedsSwap)
2188 {
2189 free(tmpbuf);
2190 }
dccce9ea 2191
36acb880
VZ
2192 if (ICONV_FAILED(cres, inbuf))
2193 {
ce6f8d6f 2194 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2195 return wxCONV_FAILED;
36acb880
VZ
2196 }
2197
2198 return res;
2199}
2200
7ef3ab50 2201size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2202{
c1464d9d 2203 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2204 {
2205 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2206
2207#if wxUSE_THREADS
2208 // NB: explained in MB2WC
2209 wxMutexLocker lock(self->m_iconvMutex);
2210#endif
2211
999020e1 2212 const wchar_t *wnul = L"";
c1464d9d 2213 char buf[8]; // should be enough for NUL in any encoding
356410fc 2214 size_t inLen = sizeof(wchar_t),
c1464d9d 2215 outLen = WXSIZEOF(buf);
ef199164
DS
2216 char *inBuff = (char *)wnul;
2217 char *outBuff = buf;
2218 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2219 {
c1464d9d 2220 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2221 }
2222 else // ok
2223 {
ef199164 2224 self->m_minMBCharWidth = outBuff - buf;
356410fc 2225 }
eec47cc6
VZ
2226 }
2227
c1464d9d 2228 return m_minMBCharWidth;
eec47cc6
VZ
2229}
2230
ba98e032
VS
2231#if wxUSE_UNICODE_UTF8
2232bool wxMBConv_iconv::IsUTF8() const
2233{
86501081
VS
2234 return wxStricmp(m_name, "UTF-8") == 0 ||
2235 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2236}
2237#endif
2238
b040e242 2239#endif // HAVE_ICONV
36acb880 2240
e95354ec 2241
36acb880
VZ
2242// ============================================================================
2243// Win32 conversion classes
2244// ============================================================================
1cd52418 2245
e95354ec 2246#ifdef wxHAVE_WIN32_MB2WC
373658eb 2247
8b04d4c4 2248// from utils.cpp
d775fa82 2249#if wxUSE_FONTMAP
86501081 2250extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2251extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2252#endif
373658eb 2253
e95354ec 2254class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2255{
2256public:
bde4baac
VZ
2257 wxMBConv_win32()
2258 {
2259 m_CodePage = CP_ACP;
c1464d9d 2260 m_minMBCharWidth = 0;
bde4baac
VZ
2261 }
2262
d36c9347 2263 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2264 : wxMBConv()
d36c9347
VZ
2265 {
2266 m_CodePage = conv.m_CodePage;
2267 m_minMBCharWidth = conv.m_minMBCharWidth;
2268 }
2269
7608a683 2270#if wxUSE_FONTMAP
86501081 2271 wxMBConv_win32(const char* name)
bde4baac
VZ
2272 {
2273 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2274 m_minMBCharWidth = 0;
bde4baac 2275 }
dccce9ea 2276
e95354ec 2277 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2278 {
2279 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2280 m_minMBCharWidth = 0;
bde4baac 2281 }
eec47cc6 2282#endif // wxUSE_FONTMAP
8b04d4c4 2283
d36c9347 2284 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2285 {
02272c9c
VZ
2286 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2287 // the behaviour is not compatible with the Unix version (using iconv)
2288 // and break the library itself, e.g. wxTextInputStream::NextChar()
2289 // wouldn't work if reading an incomplete MB char didn't result in an
2290 // error
667e5b3e 2291 //
89028980 2292 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2293 // Win XP or newer and it is not supported for UTF-[78] so we always
2294 // use our own conversions in this case. See
89028980
VS
2295 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2296 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2297 if ( m_CodePage == CP_UTF8 )
89028980 2298 {
5487ff0f 2299 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2300 }
830f8f11
VZ
2301
2302 if ( m_CodePage == CP_UTF7 )
2303 {
5487ff0f 2304 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2305 }
2306
2307 int flags = 0;
2308 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2309 IsAtLeastWin2kSP4() )
89028980 2310 {
830f8f11 2311 flags = MB_ERR_INVALID_CHARS;
89028980 2312 }
667e5b3e 2313
2b5f62a0
VZ
2314 const size_t len = ::MultiByteToWideChar
2315 (
2316 m_CodePage, // code page
667e5b3e 2317 flags, // flags: fall on error
2b5f62a0
VZ
2318 psz, // input string
2319 -1, // its length (NUL-terminated)
b4da152e 2320 buf, // output string
2b5f62a0
VZ
2321 buf ? n : 0 // size of output buffer
2322 );
89028980
VS
2323 if ( !len )
2324 {
2325 // function totally failed
467e0479 2326 return wxCONV_FAILED;
89028980
VS
2327 }
2328
2329 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2330 // check if we succeeded, by doing a double trip:
2331 if ( !flags && buf )
2332 {
53c174fc
VZ
2333 const size_t mbLen = strlen(psz);
2334 wxCharBuffer mbBuf(mbLen);
89028980
VS
2335 if ( ::WideCharToMultiByte
2336 (
2337 m_CodePage,
2338 0,
2339 buf,
2340 -1,
2341 mbBuf.data(),
53c174fc 2342 mbLen + 1, // size in bytes, not length
89028980
VS
2343 NULL,
2344 NULL
2345 ) == 0 ||
2346 strcmp(mbBuf, psz) != 0 )
2347 {
2348 // we didn't obtain the same thing we started from, hence
2349 // the conversion was lossy and we consider that it failed
467e0479 2350 return wxCONV_FAILED;
89028980
VS
2351 }
2352 }
2b5f62a0 2353
03a991bc
VZ
2354 // note that it returns count of written chars for buf != NULL and size
2355 // of the needed buffer for buf == NULL so in either case the length of
2356 // the string (which never includes the terminating NUL) is one less
89028980 2357 return len - 1;
f1339c56 2358 }
dccce9ea 2359
d36c9347 2360 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2361 {
13dd924a
VZ
2362 /*
2363 we have a problem here: by default, WideCharToMultiByte() may
2364 replace characters unrepresentable in the target code page with bad
2365 quality approximations such as turning "1/2" symbol (U+00BD) into
2366 "1" for the code pages which don't have it and we, obviously, want
2367 to avoid this at any price
d775fa82 2368
13dd924a
VZ
2369 the trouble is that this function does it _silently_, i.e. it won't
2370 even tell us whether it did or not... Win98/2000 and higher provide
2371 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2372 we have to resort to a round trip, i.e. check that converting back
2373 results in the same string -- this is, of course, expensive but
2374 otherwise we simply can't be sure to not garble the data.
2375 */
2376
2377 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2378 // it doesn't work with CJK encodings (which we test for rather roughly
2379 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2380 // supporting it
907173e5
WS
2381 BOOL usedDef wxDUMMY_INITIALIZE(false);
2382 BOOL *pUsedDef;
13dd924a
VZ
2383 int flags;
2384 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2385 {
2386 // it's our lucky day
2387 flags = WC_NO_BEST_FIT_CHARS;
2388 pUsedDef = &usedDef;
2389 }
2390 else // old system or unsupported encoding
2391 {
2392 flags = 0;
2393 pUsedDef = NULL;
2394 }
2395
2b5f62a0
VZ
2396 const size_t len = ::WideCharToMultiByte
2397 (
2398 m_CodePage, // code page
13dd924a
VZ
2399 flags, // either none or no best fit
2400 pwz, // input string
2b5f62a0
VZ
2401 -1, // it is (wide) NUL-terminated
2402 buf, // output buffer
2403 buf ? n : 0, // and its size
2404 NULL, // default "replacement" char
13dd924a 2405 pUsedDef // [out] was it used?
2b5f62a0
VZ
2406 );
2407
13dd924a
VZ
2408 if ( !len )
2409 {
2410 // function totally failed
467e0479 2411 return wxCONV_FAILED;
13dd924a
VZ
2412 }
2413
2414 // if we were really converting, check if we succeeded
2415 if ( buf )
2416 {
2417 if ( flags )
2418 {
2419 // check if the conversion failed, i.e. if any replacements
2420 // were done
2421 if ( usedDef )
467e0479 2422 return wxCONV_FAILED;
13dd924a
VZ
2423 }
2424 else // we must resort to double tripping...
2425 {
2426 wxWCharBuffer wcBuf(n);
467e0479 2427 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
13dd924a
VZ
2428 wcscmp(wcBuf, pwz) != 0 )
2429 {
2430 // we didn't obtain the same thing we started from, hence
2431 // the conversion was lossy and we consider that it failed
467e0479 2432 return wxCONV_FAILED;
13dd924a
VZ
2433 }
2434 }
2435 }
2436
03a991bc 2437 // see the comment above for the reason of "len - 1"
13dd924a 2438 return len - 1;
f1339c56 2439 }
dccce9ea 2440
7ef3ab50
VZ
2441 virtual size_t GetMBNulLen() const
2442 {
2443 if ( m_minMBCharWidth == 0 )
2444 {
2445 int len = ::WideCharToMultiByte
2446 (
2447 m_CodePage, // code page
2448 0, // no flags
2449 L"", // input string
2450 1, // translate just the NUL
2451 NULL, // output buffer
2452 0, // and its size
2453 NULL, // no replacement char
2454 NULL // [out] don't care if it was used
2455 );
2456
2457 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2458 switch ( len )
2459 {
2460 default:
2461 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2462 self->m_minMBCharWidth = (size_t)-1;
2463 break;
7ef3ab50
VZ
2464
2465 case 0:
2466 self->m_minMBCharWidth = (size_t)-1;
2467 break;
2468
2469 case 1:
2470 case 2:
2471 case 4:
2472 self->m_minMBCharWidth = len;
2473 break;
2474 }
2475 }
2476
2477 return m_minMBCharWidth;
2478 }
2479
d36c9347
VZ
2480 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2481
13dd924a
VZ
2482 bool IsOk() const { return m_CodePage != -1; }
2483
2484private:
2485 static bool CanUseNoBestFit()
2486 {
2487 static int s_isWin98Or2k = -1;
2488
2489 if ( s_isWin98Or2k == -1 )
2490 {
2491 int verMaj, verMin;
2492 switch ( wxGetOsVersion(&verMaj, &verMin) )
2493 {
406d283a 2494 case wxOS_WINDOWS_9X:
13dd924a
VZ
2495 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2496 break;
2497
406d283a 2498 case wxOS_WINDOWS_NT:
13dd924a
VZ
2499 s_isWin98Or2k = verMaj >= 5;
2500 break;
2501
2502 default:
ef199164 2503 // unknown: be conservative by default
13dd924a 2504 s_isWin98Or2k = 0;
ef199164 2505 break;
13dd924a
VZ
2506 }
2507
2508 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2509 }
2510
2511 return s_isWin98Or2k == 1;
2512 }
f1339c56 2513
89028980
VS
2514 static bool IsAtLeastWin2kSP4()
2515 {
8942f83a
WS
2516#ifdef __WXWINCE__
2517 return false;
2518#else
89028980
VS
2519 static int s_isAtLeastWin2kSP4 = -1;
2520
2521 if ( s_isAtLeastWin2kSP4 == -1 )
2522 {
2523 OSVERSIONINFOEX ver;
2524
2525 memset(&ver, 0, sizeof(ver));
2526 ver.dwOSVersionInfoSize = sizeof(ver);
2527 GetVersionEx((OSVERSIONINFO*)&ver);
2528
2529 s_isAtLeastWin2kSP4 =
2530 ((ver.dwMajorVersion > 5) || // Vista+
2531 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2532 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2533 ver.wServicePackMajor >= 4)) // 2000 SP4+
2534 ? 1 : 0;
2535 }
2536
2537 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2538#endif
89028980
VS
2539 }
2540
eec47cc6 2541
c1464d9d 2542 // the code page we're working with
b1d66b54 2543 long m_CodePage;
c1464d9d 2544
7ef3ab50 2545 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2546 // "unknown"
2547 size_t m_minMBCharWidth;
1cd52418 2548};
e95354ec
VZ
2549
2550#endif // wxHAVE_WIN32_MB2WC
2551
f7e98dee 2552
36acb880
VZ
2553// ============================================================================
2554// wxEncodingConverter based conversion classes
2555// ============================================================================
2556
1e6feb95 2557#if wxUSE_FONTMAP
1cd52418 2558
e95354ec 2559class wxMBConv_wxwin : public wxMBConv
1cd52418 2560{
8b04d4c4
VZ
2561private:
2562 void Init()
2563 {
6ac84a78
DE
2564 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2565 // The wxMBConv_cf class does a better job.
2566 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2567 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2568 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2569 }
2570
6001e347 2571public:
f1339c56
RR
2572 // temporarily just use wxEncodingConverter stuff,
2573 // so that it works while a better implementation is built
86501081 2574 wxMBConv_wxwin(const char* name)
f1339c56
RR
2575 {
2576 if (name)
267e11c5 2577 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2578 else
2579 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2580
8b04d4c4
VZ
2581 Init();
2582 }
2583
e95354ec 2584 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2585 {
2586 m_enc = enc;
2587
2588 Init();
f1339c56 2589 }
dccce9ea 2590
bde4baac 2591 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2592 {
2593 size_t inbuf = strlen(psz);
dccce9ea 2594 if (buf)
c643a977 2595 {
ef199164 2596 if (!m2w.Convert(psz, buf))
467e0479 2597 return wxCONV_FAILED;
c643a977 2598 }
f1339c56
RR
2599 return inbuf;
2600 }
dccce9ea 2601
bde4baac 2602 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2603 {
f8d791e0 2604 const size_t inbuf = wxWcslen(psz);
f1339c56 2605 if (buf)
c643a977 2606 {
ef199164 2607 if (!w2m.Convert(psz, buf))
467e0479 2608 return wxCONV_FAILED;
c643a977 2609 }
dccce9ea 2610
f1339c56
RR
2611 return inbuf;
2612 }
dccce9ea 2613
7ef3ab50 2614 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2615 {
2616 switch ( m_enc )
2617 {
2618 case wxFONTENCODING_UTF16BE:
2619 case wxFONTENCODING_UTF16LE:
c1464d9d 2620 return 2;
eec47cc6
VZ
2621
2622 case wxFONTENCODING_UTF32BE:
2623 case wxFONTENCODING_UTF32LE:
c1464d9d 2624 return 4;
eec47cc6
VZ
2625
2626 default:
c1464d9d 2627 return 1;
eec47cc6
VZ
2628 }
2629 }
2630
d36c9347
VZ
2631 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2632
7ef3ab50
VZ
2633 bool IsOk() const { return m_ok; }
2634
2635public:
2636 wxFontEncoding m_enc;
2637 wxEncodingConverter m2w, w2m;
2638
2639private:
cafbf6fb
VZ
2640 // were we initialized successfully?
2641 bool m_ok;
fc7a2a60 2642
e95354ec 2643 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2644};
6001e347 2645
8f115891 2646// make the constructors available for unit testing
86501081 2647WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2648{
2649 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2650 if ( !result->IsOk() )
2651 {
2652 delete result;
2653 return 0;
2654 }
ef199164 2655
8f115891
MW
2656 return result;
2657}
2658
1e6feb95
VZ
2659#endif // wxUSE_FONTMAP
2660
36acb880
VZ
2661// ============================================================================
2662// wxCSConv implementation
2663// ============================================================================
2664
8b04d4c4 2665void wxCSConv::Init()
6001e347 2666{
e95354ec
VZ
2667 m_name = NULL;
2668 m_convReal = NULL;
2669 m_deferred = true;
2670}
2671
86501081 2672wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2673{
2674 Init();
82713003 2675
86501081 2676 if ( !charset.empty() )
e95354ec 2677 {
86501081 2678 SetName(charset.ToAscii());
e95354ec 2679 }
bda3d86a 2680
e4277538
VZ
2681#if wxUSE_FONTMAP
2682 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2683#else
bda3d86a 2684 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2685#endif
6001e347
RR
2686}
2687
8b04d4c4
VZ
2688wxCSConv::wxCSConv(wxFontEncoding encoding)
2689{
bda3d86a 2690 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2691 {
2692 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2693
2694 encoding = wxFONTENCODING_SYSTEM;
2695 }
2696
8b04d4c4
VZ
2697 Init();
2698
bda3d86a 2699 m_encoding = encoding;
8b04d4c4
VZ
2700}
2701
6001e347
RR
2702wxCSConv::~wxCSConv()
2703{
65e50848
JS
2704 Clear();
2705}
2706
54380f29 2707wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2708 : wxMBConv()
54380f29 2709{
8b04d4c4
VZ
2710 Init();
2711
54380f29 2712 SetName(conv.m_name);
8b04d4c4 2713 m_encoding = conv.m_encoding;
54380f29
GD
2714}
2715
2716wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2717{
2718 Clear();
8b04d4c4 2719
54380f29 2720 SetName(conv.m_name);
8b04d4c4
VZ
2721 m_encoding = conv.m_encoding;
2722
54380f29
GD
2723 return *this;
2724}
2725
65e50848
JS
2726void wxCSConv::Clear()
2727{
8b04d4c4 2728 free(m_name);
e95354ec 2729 delete m_convReal;
8b04d4c4 2730
65e50848 2731 m_name = NULL;
e95354ec 2732 m_convReal = NULL;
6001e347
RR
2733}
2734
86501081 2735void wxCSConv::SetName(const char *charset)
6001e347 2736{
f1339c56
RR
2737 if (charset)
2738 {
d6f2a891 2739 m_name = wxStrdup(charset);
e95354ec 2740 m_deferred = true;
f1339c56 2741 }
6001e347
RR
2742}
2743
#if wxUSE_FONTMAP

// map from the encoding to a charset name stored for it, an empty name is
// stored to remember a failure
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2751
e95354ec
VZ
2752wxMBConv *wxCSConv::DoCreate() const
2753{
ce6f8d6f
VZ
2754#if wxUSE_FONTMAP
2755 wxLogTrace(TRACE_STRCONV,
2756 wxT("creating conversion for %s"),
2757 (m_name ? m_name
86501081 2758 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2759#endif // wxUSE_FONTMAP
2760
c547282d
VZ
2761 // check for the special case of ASCII or ISO8859-1 charset: as we have
2762 // special knowledge of it anyhow, we don't need to create a special
2763 // conversion object
e4277538
VZ
2764 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2765 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2766 {
e95354ec
VZ
2767 // don't convert at all
2768 return NULL;
2769 }
dccce9ea 2770
e95354ec
VZ
2771 // we trust OS to do conversion better than we can so try external
2772 // conversion methods first
2773 //
2774 // the full order is:
2775 // 1. OS conversion (iconv() under Unix or Win32 API)
2776 // 2. hard coded conversions for UTF
2777 // 3. wxEncodingConverter as fall back
2778
2779 // step (1)
2780#ifdef HAVE_ICONV
c547282d 2781#if !wxUSE_FONTMAP
e95354ec 2782 if ( m_name )
c547282d 2783#endif // !wxUSE_FONTMAP
e95354ec 2784 {
3ef10cfc 2785#if wxUSE_FONTMAP
8b3eb85d 2786 wxFontEncoding encoding(m_encoding);
3ef10cfc 2787#endif
8b3eb85d 2788
86501081 2789 if ( m_name )
8b3eb85d 2790 {
86501081 2791 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
2792 if ( conv->IsOk() )
2793 return conv;
2794
2795 delete conv;
c547282d
VZ
2796
2797#if wxUSE_FONTMAP
8b3eb85d 2798 encoding =
86501081 2799 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2800#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2801 }
2802#if wxUSE_FONTMAP
2803 {
2804 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2805 if ( it != gs_nameCache.end() )
2806 {
2807 if ( it->second.empty() )
2808 return NULL;
c547282d 2809
86501081 2810 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
2811 if ( conv->IsOk() )
2812 return conv;
e95354ec 2813
8b3eb85d
VZ
2814 delete conv;
2815 }
2816
2817 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
2818 // CS : in case this does not return valid names (eg for MacRoman)
2819 // encoding got a 'failure' entry in the cache all the same,
2820 // although it just has to be created using a different method, so
2821 // only store failed iconv creation attempts (or perhaps we
2822 // shoulnd't do this at all ?)
3c67ec06 2823 if ( names[0] != NULL )
8b3eb85d 2824 {
3c67ec06 2825 for ( ; *names; ++names )
8b3eb85d 2826 {
86501081
VS
2827 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2828 // will need changes that will obsolete this
2829 wxString name(*names);
2830 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
2831 if ( conv->IsOk() )
2832 {
2833 gs_nameCache[encoding] = *names;
2834 return conv;
2835 }
2836
2837 delete conv;
8b3eb85d
VZ
2838 }
2839
3c67ec06 2840 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 2841 }
8b3eb85d
VZ
2842 }
2843#endif // wxUSE_FONTMAP
e95354ec
VZ
2844 }
2845#endif // HAVE_ICONV
2846
2847#ifdef wxHAVE_WIN32_MB2WC
2848 {
7608a683 2849#if wxUSE_FONTMAP
e95354ec
VZ
2850 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2851 : new wxMBConv_win32(m_encoding);
2852 if ( conv->IsOk() )
2853 return conv;
2854
2855 delete conv;
7608a683
WS
2856#else
2857 return NULL;
2858#endif
e95354ec
VZ
2859 }
2860#endif // wxHAVE_WIN32_MB2WC
ef199164 2861
5c4ed98d 2862#ifdef __DARWIN__
f7e98dee 2863 {
6ff49cbc
DE
2864 // leave UTF16 and UTF32 to the built-ins of wx
2865 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2866 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 2867 {
a6900d10 2868#if wxUSE_FONTMAP
5c4ed98d
DE
2869 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2870 : new wxMBConv_cf(m_encoding);
a6900d10 2871#else
5c4ed98d 2872 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 2873#endif
ef199164 2874
f7e98dee 2875 if ( conv->IsOk() )
d775fa82
WS
2876 return conv;
2877
2878 delete conv;
2879 }
335d31e0 2880 }
5c4ed98d
DE
2881#endif // __DARWIN__
2882
e95354ec
VZ
2883 // step (2)
2884 wxFontEncoding enc = m_encoding;
2885#if wxUSE_FONTMAP
c547282d
VZ
2886 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2887 {
2888 // use "false" to suppress interactive dialogs -- we can be called from
2889 // anywhere and popping up a dialog from here is the last thing we want to
2890 // do
267e11c5 2891 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2892 }
e95354ec
VZ
2893#endif // wxUSE_FONTMAP
2894
2895 switch ( enc )
2896 {
2897 case wxFONTENCODING_UTF7:
2898 return new wxMBConvUTF7;
2899
2900 case wxFONTENCODING_UTF8:
2901 return new wxMBConvUTF8;
2902
e95354ec
VZ
2903 case wxFONTENCODING_UTF16BE:
2904 return new wxMBConvUTF16BE;
2905
2906 case wxFONTENCODING_UTF16LE:
2907 return new wxMBConvUTF16LE;
2908
e95354ec
VZ
2909 case wxFONTENCODING_UTF32BE:
2910 return new wxMBConvUTF32BE;
2911
2912 case wxFONTENCODING_UTF32LE:
2913 return new wxMBConvUTF32LE;
2914
2915 default:
2916 // nothing to do but put here to suppress gcc warnings
ef199164 2917 break;
e95354ec
VZ
2918 }
2919
2920 // step (3)
2921#if wxUSE_FONTMAP
2922 {
2923 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2924 : new wxMBConv_wxwin(m_encoding);
2925 if ( conv->IsOk() )
2926 return conv;
2927
2928 delete conv;
2929 }
2930#endif // wxUSE_FONTMAP
2931
a58d4f4d
VS
2932 // NB: This is a hack to prevent deadlock. What could otherwise happen
2933 // in Unicode build: wxConvLocal creation ends up being here
2934 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
2935 // attach a timestamp, for which it will need wxConvLocal (to convert
2936 // time to char* and then wchar_t*), but that fails, tries to log the
2937 // error, but wxLog has an (already locked) critical section that
2938 // guards the static buffer.
a58d4f4d
VS
2939 static bool alreadyLoggingError = false;
2940 if (!alreadyLoggingError)
2941 {
2942 alreadyLoggingError = true;
2943 wxLogError(_("Cannot convert from the charset '%s'!"),
2944 m_name ? m_name
e95354ec
VZ
2945 :
2946#if wxUSE_FONTMAP
86501081 2947 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 2948#else // !wxUSE_FONTMAP
86501081 2949 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
2950#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2951 );
ef199164 2952
a58d4f4d
VS
2953 alreadyLoggingError = false;
2954 }
e95354ec
VZ
2955
2956 return NULL;
2957}
2958
2959void wxCSConv::CreateConvIfNeeded() const
2960{
2961 if ( m_deferred )
2962 {
2963 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 2964
bda3d86a
VZ
2965 // if we don't have neither the name nor the encoding, use the default
2966 // encoding for this system
2967 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2968 {
4c75209f 2969#if wxUSE_INTL
02c7347b 2970 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
2971#else
2972 // fallback to some reasonable default:
2973 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 2974#endif // wxUSE_INTL
4c75209f 2975 }
bda3d86a 2976
e95354ec
VZ
2977 self->m_convReal = DoCreate();
2978 self->m_deferred = false;
6001e347 2979 }
6001e347
RR
2980}
2981
0f0298b1
VZ
2982bool wxCSConv::IsOk() const
2983{
2984 CreateConvIfNeeded();
2985
2986 // special case: no convReal created for wxFONTENCODING_ISO8859_1
2987 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2988 return true; // always ok as we do it ourselves
2989
2990 // m_convReal->IsOk() is called at its own creation, so we know it must
2991 // be ok if m_convReal is non-NULL
2992 return m_convReal != NULL;
2993}
2994
1c714a5d
VZ
2995size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
2996 const char *src, size_t srcLen) const
2997{
2998 CreateConvIfNeeded();
2999
2c74c558
VS
3000 if (m_convReal)
3001 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3002
3003 // latin-1 (direct)
3004 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3005}
3006
3007size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3008 const wchar_t *src, size_t srcLen) const
3009{
3010 CreateConvIfNeeded();
3011
2c74c558
VS
3012 if (m_convReal)
3013 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3014
3015 // latin-1 (direct)
3016 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3017}
3018
6001e347
RR
3019size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3020{
e95354ec 3021 CreateConvIfNeeded();
dccce9ea 3022
e95354ec
VZ
3023 if (m_convReal)
3024 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3025
3026 // latin-1 (direct)
4def3b35 3027 size_t len = strlen(psz);
dccce9ea 3028
f1339c56
RR
3029 if (buf)
3030 {
4def3b35 3031 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3032 buf[c] = (unsigned char)(psz[c]);
3033 }
dccce9ea 3034
f1339c56 3035 return len;
6001e347
RR
3036}
3037
3038size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3039{
e95354ec 3040 CreateConvIfNeeded();
dccce9ea 3041
e95354ec
VZ
3042 if (m_convReal)
3043 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3044
f1339c56 3045 // latin-1 (direct)
f8d791e0 3046 const size_t len = wxWcslen(psz);
f1339c56
RR
3047 if (buf)
3048 {
4def3b35 3049 for (size_t c = 0; c <= len; c++)
24642831
VS
3050 {
3051 if (psz[c] > 0xFF)
467e0479 3052 return wxCONV_FAILED;
ef199164 3053
907173e5 3054 buf[c] = (char)psz[c];
24642831
VS
3055 }
3056 }
3057 else
3058 {
3059 for (size_t c = 0; c <= len; c++)
3060 {
3061 if (psz[c] > 0xFF)
467e0479 3062 return wxCONV_FAILED;
24642831 3063 }
f1339c56 3064 }
dccce9ea 3065
f1339c56 3066 return len;
6001e347
RR
3067}
3068
7ef3ab50 3069size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3070{
3071 CreateConvIfNeeded();
3072
3073 if ( m_convReal )
3074 {
7ef3ab50 3075 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3076 }
3077
ba98e032 3078 // otherwise, we are ISO-8859-1
c1464d9d 3079 return 1;
eec47cc6
VZ
3080}
3081
ba98e032
VS
3082#if wxUSE_UNICODE_UTF8
3083bool wxCSConv::IsUTF8() const
3084{
3085 CreateConvIfNeeded();
3086
3087 if ( m_convReal )
3088 {
3089 return m_convReal->IsUTF8();
3090 }
3091
3092 // otherwise, we are ISO-8859-1
3093 return false;
3094}
3095#endif
3096
69c928ef
VZ
3097
3098#if wxUSE_UNICODE
3099
3100wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3101{
3102 if ( !s )
3103 return wxWCharBuffer();
3104
3105 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3106 if ( !wbuf )
5487ff0f 3107 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3108 if ( !wbuf )
3109 wbuf = wxConvISO8859_1.cMB2WX(s);
3110
3111 return wbuf;
3112}
3113
3114wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3115{
3116 if ( !ws )
3117 return wxCharBuffer();
3118
3119 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3120 if ( !buf )
3121 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3122
3123 return buf;
3124}
3125
3126#endif // wxUSE_UNICODE
f5a1953b 3127
1e50d914
VS
3128// ----------------------------------------------------------------------------
3129// globals
3130// ----------------------------------------------------------------------------
3131
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, they may be used _before_ a global converter object would be
//     initialized.
3138
3139#undef wxConvLibc
3140#undef wxConvUTF8
3141#undef wxConvUTF7
3142#undef wxConvLocal
3143#undef wxConvISO8859_1
3144
// Define the global converter "name": this defines the name##Ptr pointer
// (initially NULL), the wxGet_##name##Ptr() function returning the address
// of a function-local static object of type impl_klass constructed with
// ctor_args, and a static variable initialized by calling this function.
//
// The static variable ensures that all global converter objects are
// created by the time static initialization is done, i.e. before any
// thread is launched.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object is of the same class
// as the pointer to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args)                   \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3159
3160#ifdef __WINDOWS__
3161 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3162#else
3163 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3164#endif
3165
0286d08d 3166WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3167WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3168
3169WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3170WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3171
3172WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3173WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3174
6ac84a78
DE
3175#ifdef __DARWIN__
3176// The xnu kernel always communicates file paths in decomposed UTF-8.
3177// WARNING: Are we sure that CFString's conversion will cause decomposition?
3178static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3179#endif
6ac84a78 3180
1e50d914 3181WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3182#ifdef __DARWIN__
1e50d914 3183 &wxConvMacUTF8DObj;
6ac84a78 3184#else // !__DARWIN__
1e50d914 3185 wxGet_wxConvLibcPtr();
6ac84a78 3186#endif // __DARWIN__/!__DARWIN__
1e50d914 3187
bde4baac
VZ
3188#else // !wxUSE_WCHAR_T
3189
1e50d914 3190// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3191// stand-ins in absence of wchar_t
3192WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3193 wxConvISO8859_1,
3194 wxConvLocal,
3195 wxConvUTF8;
3196
3197#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T