]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
wxXmlNode::GetAttribute's pointer argument must not be NULL, check for it
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
6001e347 47#ifdef __SALFORDC__
373658eb 48 #include <clib.h>
6001e347
RR
49#endif
50
b040e242 51#ifdef HAVE_ICONV
373658eb 52 #include <iconv.h>
b1d547eb 53 #include "wx/thread.h"
1cd52418 54#endif
1cd52418 55
373658eb
VZ
56#include "wx/encconv.h"
57#include "wx/fontmap.h"
58
5c4ed98d 59#ifdef __DARWIN__
e4dd1e19 60#include "wx/mac/corefoundation/private/strconv_cf.h"
5c4ed98d
DE
61#endif //def __DARWIN__
62
ef199164 63
ce6f8d6f
VZ
64#define TRACE_STRCONV _T("strconv")
65
467e0479
VZ
66// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
67// be 4 bytes
4948c2b6 68#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
69 #define WC_UTF16
70#endif
71
ef199164 72
373658eb
VZ
73// ============================================================================
74// implementation
75// ============================================================================
76
69373110
VZ
// Helper of the conversion functions: returns true if at least one of the n
// bytes starting at p is not NUL and false if all of them are NUL (which is
// trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; i++ )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
85
373658eb 86// ----------------------------------------------------------------------------
467e0479 87// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 88// ----------------------------------------------------------------------------
6001e347 89
c91830cb 90static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 91{
ef199164 92 if (input <= 0xffff)
4def3b35 93 {
999836aa
VZ
94 if (output)
95 *output = (wxUint16) input;
ef199164 96
4def3b35 97 return 1;
dccce9ea 98 }
ef199164 99 else if (input >= 0x110000)
4def3b35 100 {
467e0479 101 return wxCONV_FAILED;
dccce9ea
VZ
102 }
103 else
4def3b35 104 {
dccce9ea 105 if (output)
4def3b35 106 {
ef199164
DS
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 109 }
ef199164 110
4def3b35 111 return 2;
1cd52418 112 }
1cd52418
OK
113}
114
c91830cb 115static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 116{
ef199164 117 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
118 {
119 output = *input;
120 return 1;
dccce9ea 121 }
ef199164 122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
123 {
124 output = *input;
467e0479 125 return wxCONV_FAILED;
dccce9ea
VZ
126 }
127 else
4def3b35
VS
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
1cd52418
OK
132}
133
467e0479 134#ifdef WC_UTF16
35d11700
VZ
135 typedef wchar_t wxDecodeSurrogate_t;
136#else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
139
140// returns the next UTF-32 character from the wchar_t buffer and advances the
141// pointer to the character after this one
142//
143// if an invalid character is found, *pSrc is set to NULL, the caller must
144// check for this
35d11700 145static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
146{
147 wxUint32 out;
8d3dd069
VZ
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156}
157
f6bcfd97 158// ----------------------------------------------------------------------------
6001e347 159// wxMBConv
f6bcfd97 160// ----------------------------------------------------------------------------
2c53a80a 161
483b0434
VZ
162size_t
163wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
6001e347 165{
483b0434
VZ
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
483b0434 212 for ( ;; )
eec47cc6 213 {
c1464d9d 214 // try to convert the current chunk
483b0434 215 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
e4e3bbb4 218
467e0479 219 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 220
483b0434 221 dstWritten += lenChunk;
f5fb6871 222
467e0479
VZ
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
483b0434
VZ
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
830f8f11 234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
c1464d9d 239
483b0434 240 if ( !srcEnd )
c1464d9d 241 {
467e0479
VZ
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
c1464d9d
VZ
244 break;
245 }
eec47cc6
VZ
246
247 // advance the input pointer past the end of this chunk
483b0434 248 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
483b0434 254 src += nulLen;
c1464d9d 255 }
e4e3bbb4 256
483b0434 257 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
483b0434 262 if ( src >= srcEnd )
c1464d9d
VZ
263 break;
264 }
265
483b0434 266 return dstWritten;
e4e3bbb4
RN
267}
268
483b0434
VZ
269size_t
270wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
e4e3bbb4 272{
483b0434
VZ
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
e4e3bbb4 275
eec47cc6
VZ
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
467e0479 282 if ( srcLen == wxNO_LEN )
e4e3bbb4 283 {
483b0434 284 srcLen = wxWcslen(src) + 1;
eec47cc6 285 }
483b0434 286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
287 {
288 // make a copy in order to properly NUL-terminate the string
483b0434 289 bufTmp = wxWCharBuffer(srcLen);
ef199164 290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
eec47cc6 318 }
e4e3bbb4 319
483b0434
VZ
320 return dstWritten;
321}
322
ef199164 323size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 324{
ef199164 325 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 326 if ( rc != wxCONV_FAILED )
509da451
VZ
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334}
335
ef199164 336size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 337{
ef199164 338 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 339 if ( rc != wxCONV_FAILED )
509da451
VZ
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345}
346
483b0434
VZ
347wxMBConv::~wxMBConv()
348{
349 // nothing to do here (necessary for Darwin linking probably)
350}
e4e3bbb4 351
483b0434
VZ
352const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353{
354 if ( psz )
eec47cc6 355 {
483b0434 356 // calculate the length of the buffer needed first
a2db25a1 357 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 358 if ( nLen != wxCONV_FAILED )
f5fb6871 359 {
483b0434 360 // now do the actual conversion
a2db25a1 361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 362
483b0434 363 // +1 for the trailing NULL
a2db25a1 364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 365 return buf;
f5fb6871 366 }
483b0434 367 }
e4e3bbb4 368
483b0434
VZ
369 return wxWCharBuffer();
370}
3698ae71 371
483b0434
VZ
372const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373{
374 if ( pwz )
375 {
a2db25a1 376 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 377 if ( nLen != wxCONV_FAILED )
483b0434 378 {
a2db25a1
VZ
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386}
e4e3bbb4 387
483b0434 388const wxWCharBuffer
ef199164 389wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 390{
ef199164 391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 392 if ( dstLen != wxCONV_FAILED )
483b0434 393 {
0dd13d21
VZ
394 // notice that we allocate space for dstLen+1 wide characters here
395 // because we want the buffer to always be NUL-terminated, even if the
396 // input isn't (as otherwise the caller has no way to know its length)
397 wxWCharBuffer wbuf(dstLen);
00ceccee 398 wbuf.data()[dstLen - 1] = L'\0';
ef199164 399 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
400 {
401 if ( outLen )
467e0479
VZ
402 {
403 *outLen = dstLen;
404 if ( wbuf[dstLen - 1] == L'\0' )
405 (*outLen)--;
406 }
407
483b0434
VZ
408 return wbuf;
409 }
410 }
411
412 if ( outLen )
413 *outLen = 0;
414
415 return wxWCharBuffer();
416}
417
418const wxCharBuffer
ef199164 419wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 420{
13d92ad6 421 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 422 if ( dstLen != wxCONV_FAILED )
483b0434 423 {
0dd13d21
VZ
424 const size_t nulLen = GetMBNulLen();
425
426 // as above, ensure that the buffer is always NUL-terminated, even if
427 // the input is not
428 wxCharBuffer buf(dstLen + nulLen - 1);
429 memset(buf.data() + dstLen, 0, nulLen);
ef199164 430 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
431 {
432 if ( outLen )
467e0479
VZ
433 {
434 *outLen = dstLen;
435
13d92ad6
VZ
436 if ( dstLen >= nulLen &&
437 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
467e0479
VZ
438 {
439 // in this case the output is NUL-terminated and we're not
440 // supposed to count NUL
13d92ad6 441 *outLen -= nulLen;
467e0479
VZ
442 }
443 }
d32a507d 444
483b0434
VZ
445 return buf;
446 }
e4e3bbb4
RN
447 }
448
eec47cc6
VZ
449 if ( outLen )
450 *outLen = 0;
451
452 return wxCharBuffer();
e4e3bbb4
RN
453}
454
6001e347 455// ----------------------------------------------------------------------------
bde4baac 456// wxMBConvLibc
6001e347
RR
457// ----------------------------------------------------------------------------
458
bde4baac
VZ
459size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
460{
461 return wxMB2WC(buf, psz, n);
462}
463
464size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
465{
466 return wxWC2MB(buf, psz, n);
467}
e1bfe89e
RR
468
469// ----------------------------------------------------------------------------
532d575b 470// wxConvBrokenFileNames
e1bfe89e
RR
471// ----------------------------------------------------------------------------
472
eec47cc6
VZ
#ifdef __UNIX__

// Creates the conversion object used for the given charset: for UTF-8
// (the name is compared case-insensitively, with or without the dash) it is
// a wxMBConvUTF8 using MAP_INVALID_UTF8_TO_PUA, for anything else a wxCSConv
// for this charset.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    const bool isUTF8 = wxStricmp(charset, _T("UTF-8")) == 0 ||
                        wxStricmp(charset, _T("UTF8")) == 0;

    if ( isUTF8 )
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
    else
        m_conv = new wxCSConv(charset);
}

#endif // __UNIX__
c12b7f79 485
bde4baac 486// ----------------------------------------------------------------------------
3698ae71 487// UTF-7
bde4baac 488// ----------------------------------------------------------------------------
6001e347 489
15f2ee32 490// Implementation (C) 2004 Fredrik Roubert
6001e347 491
15f2ee32
RN
//
// BASE64 decoding table: maps a byte to the 6 bit value it encodes or to
// 0xff if the byte is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x20..0x2f: only '+' (62) and '/' (63) are valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: '0'..'9' are 52..61
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: 'A'..'Z' are 0..25
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: 'a'..'z' are 26..51
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: nothing is valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
530
531size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
532{
15f2ee32
RN
533 size_t len = 0;
534
04a37834 535 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
536 {
537 unsigned char cc = *psz++;
538 if (cc != '+')
539 {
540 // plain ASCII char
541 if (buf)
542 *buf++ = cc;
543 len++;
544 }
545 else if (*psz == '-')
546 {
547 // encoded plus sign
548 if (buf)
549 *buf++ = cc;
550 len++;
551 psz++;
552 }
04a37834 553 else // start of BASE64 encoded string
15f2ee32 554 {
04a37834 555 bool lsb, ok;
15f2ee32 556 unsigned int d, l;
04a37834
VZ
557 for ( ok = lsb = false, d = 0, l = 0;
558 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
559 psz++ )
15f2ee32
RN
560 {
561 d <<= 6;
562 d += cc;
563 for (l += 6; l >= 8; lsb = !lsb)
564 {
04a37834 565 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
566 if (lsb)
567 {
568 if (buf)
569 *buf++ |= c;
570 len ++;
571 }
572 else
04a37834 573 {
15f2ee32 574 if (buf)
6356d52a 575 *buf = (wchar_t)(c << 8);
04a37834
VZ
576 }
577
578 ok = true;
15f2ee32
RN
579 }
580 }
04a37834
VZ
581
582 if ( !ok )
583 {
584 // in valid UTF7 we should have valid characters after '+'
467e0479 585 return wxCONV_FAILED;
04a37834
VZ
586 }
587
15f2ee32
RN
588 if (*psz == '-')
589 psz++;
590 }
591 }
04a37834
VZ
592
593 if ( buf && (len < n) )
594 *buf = '\0';
595
15f2ee32 596 return len;
6001e347
RR
597}
598
15f2ee32
RN
//
// BASE64 encoding table: maps a 6 bit value to the character encoding it
//
static const unsigned char utf7enb64[] =
{
    // 0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
613
//
// UTF-7 encoding table, indexed by the ASCII character code; the values are:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 70..7F
};
633
667e5b3e 634size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 635{
15f2ee32
RN
636 size_t len = 0;
637
638 while (*psz && ((!buf) || (len < n)))
639 {
640 wchar_t cc = *psz++;
641 if (cc < 0x80 && utf7encode[cc] < 1)
642 {
643 // plain ASCII char
644 if (buf)
645 *buf++ = (char)cc;
ef199164 646
15f2ee32
RN
647 len++;
648 }
649#ifndef WC_UTF16
79c78d42 650 else if (((wxUint32)cc) > 0xffff)
b2c13097 651 {
15f2ee32 652 // no surrogate pair generation (yet?)
467e0479 653 return wxCONV_FAILED;
15f2ee32
RN
654 }
655#endif
656 else
657 {
658 if (buf)
659 *buf++ = '+';
ef199164 660
15f2ee32
RN
661 len++;
662 if (cc != '+')
663 {
664 // BASE64 encode string
665 unsigned int lsb, d, l;
73c902d6 666 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
667 {
668 for (lsb = 0; lsb < 2; lsb ++)
669 {
670 d <<= 8;
671 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
672
673 for (l += 8; l >= 6; )
674 {
675 l -= 6;
676 if (buf)
677 *buf++ = utf7enb64[(d >> l) % 64];
678 len++;
679 }
680 }
ef199164 681
15f2ee32
RN
682 cc = *psz;
683 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
684 break;
685 }
ef199164 686
15f2ee32
RN
687 if (l != 0)
688 {
689 if (buf)
690 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
ef199164 691
15f2ee32
RN
692 len++;
693 }
694 }
ef199164 695
15f2ee32
RN
696 if (buf)
697 *buf++ = '-';
698 len++;
699 }
700 }
ef199164 701
15f2ee32
RN
702 if (buf && (len < n))
703 *buf = 0;
ef199164 704
15f2ee32 705 return len;
6001e347
RR
706}
707
f6bcfd97 708// ----------------------------------------------------------------------------
6001e347 709// UTF-8
f6bcfd97 710// ----------------------------------------------------------------------------
6001e347 711
1774c3c5 712static const wxUint32 utf8_max[]=
4def3b35 713 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 714
3698ae71
VZ
715// boundaries of the private use area we use to (temporarily) remap invalid
716// characters invalid in a UTF-8 encoded string
ea8ce907
RR
717const wxUint32 wxUnicodePUA = 0x100000;
718const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
719
// this table gives the length in bytes of a UTF-8 sequence from its first
// byte, or 0 if the byte can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0 and C1 are invalid too, C2..DF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4: four-byte sequences; F5..FF are invalid again (5- or 6-byte
    // sequences and sequences for code points above U+10FFFF, as restricted
    // by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
754
755size_t
756wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
757 const char *src, size_t srcLen) const
758{
759 wchar_t *out = dstLen ? dst : NULL;
760 size_t written = 0;
761
762 if ( srcLen == wxNO_LEN )
763 srcLen = strlen(src) + 1;
764
765 for ( const char *p = src; ; p++ )
766 {
767 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
768 {
769 // all done successfully, just add the trailing NULL if we are not
770 // using explicit length
771 if ( srcLen == wxNO_LEN )
772 {
773 if ( out )
774 {
775 if ( !dstLen )
776 break;
777
778 *out = L'\0';
779 }
780
781 written++;
782 }
783
784 return written;
785 }
786
0286d08d
VZ
787 if ( out && !dstLen-- )
788 break;
789
5367a38a
VS
790 wxUint32 code;
791 unsigned char c = *p;
0286d08d 792
5367a38a
VS
793 if ( c < 0x80 )
794 {
795 if ( srcLen == 0 ) // the test works for wxNO_LEN too
796 break;
0286d08d 797
5367a38a
VS
798 if ( srcLen != wxNO_LEN )
799 srcLen--;
0286d08d 800
5367a38a
VS
801 code = c;
802 }
803 else
0286d08d 804 {
5367a38a
VS
805 unsigned len = tableUtf8Lengths[c];
806 if ( !len )
807 break;
808
809 if ( srcLen < len ) // the test works for wxNO_LEN too
810 break;
811
812 if ( srcLen != wxNO_LEN )
813 srcLen -= len;
814
815 // Char. number range | UTF-8 octet sequence
816 // (hexadecimal) | (binary)
817 // ----------------------+----------------------------------------
818 // 0000 0000 - 0000 007F | 0xxxxxxx
819 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
820 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
821 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
822 //
823 // Code point value is stored in bits marked with 'x',
824 // lowest-order bit of the value on the right side in the diagram
825 // above. (from RFC 3629)
826
827 // mask to extract lead byte's value ('x' bits above), by sequence
828 // length:
829 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
830
831 // mask and value of lead byte's most significant bits, by length:
832 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
833 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
834
835 len--; // it's more convenient to work with 0-based length here
836
837 // extract the lead byte's value bits:
838 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
839 break;
840
841 code = c & leadValueMask[len];
842
843 // all remaining bytes, if any, are handled in the same way
844 // regardless of sequence's length:
845 for ( ; len; --len )
846 {
847 c = *++p;
848 if ( (c & 0xC0) != 0x80 )
849 return wxCONV_FAILED;
0286d08d 850
5367a38a
VS
851 code <<= 6;
852 code |= c & 0x3F;
853 }
0286d08d
VZ
854 }
855
856#ifdef WC_UTF16
857 // cast is ok because wchar_t == wxUint16 if WC_UTF16
858 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
859 {
860 if ( out )
861 out++;
862 written++;
863 }
864#else // !WC_UTF16
865 if ( out )
866 *out = code;
867#endif // WC_UTF16/!WC_UTF16
868
869 if ( out )
870 out++;
871
872 written++;
873 }
874
875 return wxCONV_FAILED;
876}
877
878size_t
879wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
880 const wchar_t *src, size_t srcLen) const
881{
882 char *out = dstLen ? dst : NULL;
883 size_t written = 0;
884
885 for ( const wchar_t *wp = src; ; wp++ )
886 {
887 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
888 {
889 // all done successfully, just add the trailing NULL if we are not
890 // using explicit length
891 if ( srcLen == wxNO_LEN )
892 {
893 if ( out )
894 {
895 if ( !dstLen )
896 break;
897
898 *out = '\0';
899 }
900
901 written++;
902 }
903
904 return written;
905 }
906
907
908 wxUint32 code;
909#ifdef WC_UTF16
910 // cast is ok for WC_UTF16
911 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
912 {
913 // skip the next char too as we decoded a surrogate
914 wp++;
915 }
916#else // wchar_t is UTF-32
917 code = *wp & 0x7fffffff;
918#endif
919
920 unsigned len;
921 if ( code <= 0x7F )
922 {
923 len = 1;
924 if ( out )
925 {
926 if ( dstLen < len )
927 break;
928
929 out[0] = (char)code;
930 }
931 }
932 else if ( code <= 0x07FF )
933 {
934 len = 2;
935 if ( out )
936 {
937 if ( dstLen < len )
938 break;
939
940 // NB: this line takes 6 least significant bits, encodes them as
941 // 10xxxxxx and discards them so that the next byte can be encoded:
942 out[1] = 0x80 | (code & 0x3F); code >>= 6;
943 out[0] = 0xC0 | code;
944 }
945 }
946 else if ( code < 0xFFFF )
947 {
948 len = 3;
949 if ( out )
950 {
951 if ( dstLen < len )
952 break;
953
954 out[2] = 0x80 | (code & 0x3F); code >>= 6;
955 out[1] = 0x80 | (code & 0x3F); code >>= 6;
956 out[0] = 0xE0 | code;
957 }
958 }
959 else if ( code <= 0x10FFFF )
960 {
961 len = 4;
962 if ( out )
963 {
964 if ( dstLen < len )
965 break;
966
967 out[3] = 0x80 | (code & 0x3F); code >>= 6;
968 out[2] = 0x80 | (code & 0x3F); code >>= 6;
969 out[1] = 0x80 | (code & 0x3F); code >>= 6;
970 out[0] = 0xF0 | code;
971 }
972 }
973 else
974 {
975 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
976 break;
977 }
978
979 if ( out )
980 {
981 out += len;
982 dstLen -= len;
983 }
984
985 written += len;
986 }
987
988 // we only get here if an error occurs during decoding
989 return wxCONV_FAILED;
990}
991
d16d0917
VZ
992size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
993 const char *psz, size_t srcLen) const
6001e347 994{
0286d08d 995 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 996 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 997
4def3b35
VS
998 size_t len = 0;
999
d16d0917 1000 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1001 {
ea8ce907
RR
1002 const char *opsz = psz;
1003 bool invalid = false;
4def3b35
VS
1004 unsigned char cc = *psz++, fc = cc;
1005 unsigned cnt;
dccce9ea 1006 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1007 fc <<= 1;
ef199164 1008
dccce9ea 1009 if (!cnt)
4def3b35
VS
1010 {
1011 // plain ASCII char
dccce9ea 1012 if (buf)
4def3b35
VS
1013 *buf++ = cc;
1014 len++;
561488ef
MW
1015
1016 // escape the escape character for octal escapes
1017 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1018 && cc == '\\' && (!buf || len < n))
1019 {
1020 if (buf)
1021 *buf++ = cc;
1022 len++;
1023 }
dccce9ea
VZ
1024 }
1025 else
4def3b35
VS
1026 {
1027 cnt--;
dccce9ea 1028 if (!cnt)
4def3b35
VS
1029 {
1030 // invalid UTF-8 sequence
ea8ce907 1031 invalid = true;
dccce9ea
VZ
1032 }
1033 else
4def3b35
VS
1034 {
1035 unsigned ocnt = cnt - 1;
1036 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1037 while (cnt--)
4def3b35 1038 {
ea8ce907 1039 cc = *psz;
dccce9ea 1040 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1041 {
1042 // invalid UTF-8 sequence
ea8ce907
RR
1043 invalid = true;
1044 break;
4def3b35 1045 }
ef199164 1046
ea8ce907 1047 psz++;
4def3b35
VS
1048 res = (res << 6) | (cc & 0x3f);
1049 }
ef199164 1050
ea8ce907 1051 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1052 {
1053 // illegal UTF-8 encoding
ea8ce907 1054 invalid = true;
4def3b35 1055 }
ea8ce907
RR
1056 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1057 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1058 {
1059 // if one of our PUA characters turns up externally
1060 // it must also be treated as an illegal sequence
1061 // (a bit like you have to escape an escape character)
1062 invalid = true;
1063 }
1064 else
1065 {
1cd52418 1066#ifdef WC_UTF16
0286d08d 1067 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1068 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1069 if (pa == wxCONV_FAILED)
ea8ce907
RR
1070 {
1071 invalid = true;
1072 }
1073 else
1074 {
1075 if (buf)
1076 buf += pa;
1077 len += pa;
1078 }
373658eb 1079#else // !WC_UTF16
ea8ce907 1080 if (buf)
38d4b1e4 1081 *buf++ = (wchar_t)res;
ea8ce907 1082 len++;
373658eb 1083#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1084 }
1085 }
ef199164 1086
ea8ce907
RR
1087 if (invalid)
1088 {
1089 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1090 {
1091 while (opsz < psz && (!buf || len < n))
1092 {
1093#ifdef WC_UTF16
1094 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1095 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1096 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1097 if (buf)
1098 buf += pa;
1099 opsz++;
1100 len += pa;
1101#else
1102 if (buf)
38d4b1e4 1103 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1104 opsz++;
1105 len++;
1106#endif
1107 }
1108 }
3698ae71 1109 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1110 {
1111 while (opsz < psz && (!buf || len < n))
1112 {
3698ae71
VZ
1113 if ( buf && len + 3 < n )
1114 {
17a1ebd1 1115 unsigned char on = *opsz;
3698ae71 1116 *buf++ = L'\\';
17a1ebd1
VZ
1117 *buf++ = (wchar_t)( L'0' + on / 0100 );
1118 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1119 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1120 }
ef199164 1121
ea8ce907
RR
1122 opsz++;
1123 len += 4;
1124 }
1125 }
3698ae71 1126 else // MAP_INVALID_UTF8_NOT
ea8ce907 1127 {
467e0479 1128 return wxCONV_FAILED;
ea8ce907 1129 }
4def3b35
VS
1130 }
1131 }
6001e347 1132 }
ef199164 1133
d16d0917 1134 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1135 *buf = 0;
ef199164 1136
d16d0917 1137 return len + 1;
6001e347
RR
1138}
1139
3698ae71
VZ
// Returns true if the wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1144
d16d0917
VZ
1145size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1146 const wchar_t *psz, size_t srcLen) const
6001e347 1147{
0286d08d 1148 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1149 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1150
4def3b35 1151 size_t len = 0;
6001e347 1152
d16d0917 1153 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1154 {
1155 wxUint32 cc;
ef199164 1156
1cd52418 1157#ifdef WC_UTF16
b5153fd8
VZ
1158 // cast is ok for WC_UTF16
1159 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1160 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1161#else
ef199164 1162 cc = (*psz++) & 0x7fffffff;
4def3b35 1163#endif
3698ae71
VZ
1164
1165 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1166 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1167 {
dccce9ea 1168 if (buf)
ea8ce907 1169 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1170 len++;
3698ae71 1171 }
561488ef
MW
1172 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1173 && cc == L'\\' && psz[0] == L'\\' )
1174 {
1175 if (buf)
1176 *buf++ = (char)cc;
1177 psz++;
1178 len++;
1179 }
3698ae71
VZ
1180 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1181 cc == L'\\' &&
1182 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1183 {
dccce9ea 1184 if (buf)
3698ae71 1185 {
ef199164
DS
1186 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1187 (psz[1] - L'0') * 010 +
b2c13097 1188 (psz[2] - L'0'));
3698ae71
VZ
1189 }
1190
1191 psz += 3;
ea8ce907
RR
1192 len++;
1193 }
1194 else
1195 {
1196 unsigned cnt;
ef199164
DS
1197 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1198 {
1199 }
1200
ea8ce907 1201 if (!cnt)
4def3b35 1202 {
ea8ce907
RR
1203 // plain ASCII char
1204 if (buf)
1205 *buf++ = (char) cc;
1206 len++;
1207 }
ea8ce907
RR
1208 else
1209 {
1210 len += cnt + 1;
1211 if (buf)
1212 {
1213 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1214 while (cnt--)
1215 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1216 }
4def3b35
VS
1217 }
1218 }
6001e347 1219 }
4def3b35 1220
d16d0917 1221 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1222 *buf = 0;
adb45366 1223
d16d0917 1224 return len + 1;
6001e347
RR
1225}
1226
467e0479 1227// ============================================================================
c91830cb 1228// UTF-16
467e0479 1229// ============================================================================
c91830cb
VZ
1230
1231#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1232 #define wxMBConvUTF16straight wxMBConvUTF16BE
1233 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1234#else
bde4baac
VZ
1235 #define wxMBConvUTF16swap wxMBConvUTF16BE
1236 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1237#endif
1238
467e0479
VZ
1239/* static */
1240size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1241{
1242 if ( srcLen == wxNO_LEN )
1243 {
1244 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1245 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1246 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1247 ;
c91830cb 1248
467e0479
VZ
1249 srcLen *= BYTES_PER_CHAR;
1250 }
1251 else // we already have the length
1252 {
1253 // we can only convert an entire number of UTF-16 characters
1254 if ( srcLen % BYTES_PER_CHAR )
1255 return wxCONV_FAILED;
1256 }
1257
1258 return srcLen;
1259}
1260
1261// case when in-memory representation is UTF-16 too
c91830cb
VZ
1262#ifdef WC_UTF16
1263
467e0479
VZ
1264// ----------------------------------------------------------------------------
1265// conversions without endianness change
1266// ----------------------------------------------------------------------------
1267
1268size_t
1269wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1270 const char *src, size_t srcLen) const
c91830cb 1271{
467e0479
VZ
1272 // set up the scene for using memcpy() (which is presumably more efficient
1273 // than copying the bytes one by one)
1274 srcLen = GetLength(src, srcLen);
1275 if ( srcLen == wxNO_LEN )
1276 return wxCONV_FAILED;
c91830cb 1277
ef199164 1278 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1279 if ( dst )
c91830cb 1280 {
467e0479
VZ
1281 if ( dstLen < inLen )
1282 return wxCONV_FAILED;
c91830cb 1283
467e0479 1284 memcpy(dst, src, srcLen);
c91830cb 1285 }
d32a507d 1286
467e0479 1287 return inLen;
c91830cb
VZ
1288}
1289
467e0479
VZ
1290size_t
1291wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1292 const wchar_t *src, size_t srcLen) const
c91830cb 1293{
467e0479
VZ
1294 if ( srcLen == wxNO_LEN )
1295 srcLen = wxWcslen(src) + 1;
c91830cb 1296
467e0479
VZ
1297 srcLen *= BYTES_PER_CHAR;
1298
1299 if ( dst )
c91830cb 1300 {
467e0479
VZ
1301 if ( dstLen < srcLen )
1302 return wxCONV_FAILED;
d32a507d 1303
467e0479 1304 memcpy(dst, src, srcLen);
c91830cb 1305 }
d32a507d 1306
467e0479 1307 return srcLen;
c91830cb
VZ
1308}
1309
467e0479
VZ
1310// ----------------------------------------------------------------------------
1311// endian-reversing conversions
1312// ----------------------------------------------------------------------------
c91830cb 1313
467e0479
VZ
1314size_t
1315wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1316 const char *src, size_t srcLen) const
c91830cb 1317{
467e0479
VZ
1318 srcLen = GetLength(src, srcLen);
1319 if ( srcLen == wxNO_LEN )
1320 return wxCONV_FAILED;
c91830cb 1321
467e0479
VZ
1322 srcLen /= BYTES_PER_CHAR;
1323
1324 if ( dst )
c91830cb 1325 {
467e0479
VZ
1326 if ( dstLen < srcLen )
1327 return wxCONV_FAILED;
1328
ef199164
DS
1329 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1330 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1331 {
ef199164 1332 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1333 }
c91830cb 1334 }
bfab25d4 1335
467e0479 1336 return srcLen;
c91830cb
VZ
1337}
1338
467e0479
VZ
1339size_t
1340wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1341 const wchar_t *src, size_t srcLen) const
c91830cb 1342{
467e0479
VZ
1343 if ( srcLen == wxNO_LEN )
1344 srcLen = wxWcslen(src) + 1;
c91830cb 1345
467e0479
VZ
1346 srcLen *= BYTES_PER_CHAR;
1347
1348 if ( dst )
c91830cb 1349 {
467e0479
VZ
1350 if ( dstLen < srcLen )
1351 return wxCONV_FAILED;
1352
ef199164 1353 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1354 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1355 {
ef199164 1356 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1357 }
c91830cb 1358 }
eec47cc6 1359
467e0479 1360 return srcLen;
c91830cb
VZ
1361}
1362
467e0479 1363#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1364
467e0479
VZ
1365// ----------------------------------------------------------------------------
1366// conversions without endianness change
1367// ----------------------------------------------------------------------------
c91830cb 1368
35d11700
VZ
1369size_t
1370wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1371 const char *src, size_t srcLen) const
c91830cb 1372{
35d11700
VZ
1373 srcLen = GetLength(src, srcLen);
1374 if ( srcLen == wxNO_LEN )
1375 return wxCONV_FAILED;
c91830cb 1376
ef199164 1377 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1378 if ( !dst )
c91830cb 1379 {
35d11700
VZ
1380 // optimization: return maximal space which could be needed for this
1381 // string even if the real size could be smaller if the buffer contains
1382 // any surrogates
1383 return inLen;
c91830cb 1384 }
c91830cb 1385
35d11700 1386 size_t outLen = 0;
ef199164
DS
1387 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1388 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1389 {
ef199164
DS
1390 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1391 if ( !inBuff )
35d11700
VZ
1392 return wxCONV_FAILED;
1393
1394 if ( ++outLen > dstLen )
1395 return wxCONV_FAILED;
c91830cb 1396
35d11700
VZ
1397 *dst++ = ch;
1398 }
1399
1400
1401 return outLen;
1402}
c91830cb 1403
35d11700
VZ
1404size_t
1405wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1406 const wchar_t *src, size_t srcLen) const
c91830cb 1407{
35d11700
VZ
1408 if ( srcLen == wxNO_LEN )
1409 srcLen = wxWcslen(src) + 1;
c91830cb 1410
35d11700 1411 size_t outLen = 0;
ef199164 1412 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1413 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1414 {
1415 wxUint16 cc[2];
35d11700
VZ
1416 const size_t numChars = encode_utf16(*src++, cc);
1417 if ( numChars == wxCONV_FAILED )
1418 return wxCONV_FAILED;
c91830cb 1419
ef199164
DS
1420 outLen += numChars * BYTES_PER_CHAR;
1421 if ( outBuff )
c91830cb 1422 {
35d11700
VZ
1423 if ( outLen > dstLen )
1424 return wxCONV_FAILED;
1425
ef199164 1426 *outBuff++ = cc[0];
35d11700 1427 if ( numChars == 2 )
69b80d28 1428 {
35d11700 1429 // second character of a surrogate
ef199164 1430 *outBuff++ = cc[1];
69b80d28 1431 }
c91830cb 1432 }
c91830cb 1433 }
c91830cb 1434
35d11700 1435 return outLen;
c91830cb
VZ
1436}
1437
467e0479
VZ
1438// ----------------------------------------------------------------------------
1439// endian-reversing conversions
1440// ----------------------------------------------------------------------------
c91830cb 1441
35d11700
VZ
1442size_t
1443wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1444 const char *src, size_t srcLen) const
c91830cb 1445{
35d11700
VZ
1446 srcLen = GetLength(src, srcLen);
1447 if ( srcLen == wxNO_LEN )
1448 return wxCONV_FAILED;
1449
ef199164 1450 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1451 if ( !dst )
1452 {
1453 // optimization: return maximal space which could be needed for this
1454 // string even if the real size could be smaller if the buffer contains
1455 // any surrogates
1456 return inLen;
1457 }
c91830cb 1458
35d11700 1459 size_t outLen = 0;
ef199164
DS
1460 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1461 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1462 {
35d11700
VZ
1463 wxUint32 ch;
1464 wxUint16 tmp[2];
ef199164
DS
1465
1466 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1467 inBuff++;
1468 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1469
35d11700
VZ
1470 const size_t numChars = decode_utf16(tmp, ch);
1471 if ( numChars == wxCONV_FAILED )
1472 return wxCONV_FAILED;
c91830cb 1473
35d11700 1474 if ( numChars == 2 )
ef199164 1475 inBuff++;
35d11700
VZ
1476
1477 if ( ++outLen > dstLen )
1478 return wxCONV_FAILED;
c91830cb 1479
35d11700 1480 *dst++ = ch;
c91830cb 1481 }
c91830cb 1482
c91830cb 1483
35d11700
VZ
1484 return outLen;
1485}
c91830cb 1486
35d11700
VZ
1487size_t
1488wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1489 const wchar_t *src, size_t srcLen) const
c91830cb 1490{
35d11700
VZ
1491 if ( srcLen == wxNO_LEN )
1492 srcLen = wxWcslen(src) + 1;
c91830cb 1493
35d11700 1494 size_t outLen = 0;
ef199164 1495 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1496 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1497 {
1498 wxUint16 cc[2];
35d11700
VZ
1499 const size_t numChars = encode_utf16(*src, cc);
1500 if ( numChars == wxCONV_FAILED )
1501 return wxCONV_FAILED;
c91830cb 1502
ef199164
DS
1503 outLen += numChars * BYTES_PER_CHAR;
1504 if ( outBuff )
c91830cb 1505 {
35d11700
VZ
1506 if ( outLen > dstLen )
1507 return wxCONV_FAILED;
1508
ef199164 1509 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1510 if ( numChars == 2 )
c91830cb 1511 {
35d11700 1512 // second character of a surrogate
ef199164 1513 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1514 }
1515 }
c91830cb 1516 }
c91830cb 1517
35d11700 1518 return outLen;
c91830cb
VZ
1519}
1520
467e0479 1521#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1522
1523
35d11700 1524// ============================================================================
c91830cb 1525// UTF-32
35d11700 1526// ============================================================================
c91830cb
VZ
1527
1528#ifdef WORDS_BIGENDIAN
467e0479
VZ
1529 #define wxMBConvUTF32straight wxMBConvUTF32BE
1530 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1531#else
467e0479
VZ
1532 #define wxMBConvUTF32swap wxMBConvUTF32BE
1533 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1534#endif
1535
1536
1537WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1538WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1539
467e0479
VZ
1540/* static */
1541size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1542{
1543 if ( srcLen == wxNO_LEN )
1544 {
1545 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1546 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1547 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1548 ;
c91830cb 1549
467e0479
VZ
1550 srcLen *= BYTES_PER_CHAR;
1551 }
1552 else // we already have the length
1553 {
1554 // we can only convert an entire number of UTF-32 characters
1555 if ( srcLen % BYTES_PER_CHAR )
1556 return wxCONV_FAILED;
1557 }
1558
1559 return srcLen;
1560}
1561
1562// case when in-memory representation is UTF-16
c91830cb
VZ
1563#ifdef WC_UTF16
1564
467e0479
VZ
1565// ----------------------------------------------------------------------------
1566// conversions without endianness change
1567// ----------------------------------------------------------------------------
1568
1569size_t
1570wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1571 const char *src, size_t srcLen) const
c91830cb 1572{
467e0479
VZ
1573 srcLen = GetLength(src, srcLen);
1574 if ( srcLen == wxNO_LEN )
1575 return wxCONV_FAILED;
c91830cb 1576
ef199164
DS
1577 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1578 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1579 size_t outLen = 0;
1580 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1581 {
1582 wxUint16 cc[2];
ef199164 1583 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1584 if ( numChars == wxCONV_FAILED )
1585 return wxCONV_FAILED;
c91830cb 1586
467e0479
VZ
1587 outLen += numChars;
1588 if ( dst )
c91830cb 1589 {
467e0479
VZ
1590 if ( outLen > dstLen )
1591 return wxCONV_FAILED;
d32a507d 1592
467e0479
VZ
1593 *dst++ = cc[0];
1594 if ( numChars == 2 )
1595 {
1596 // second character of a surrogate
1597 *dst++ = cc[1];
1598 }
1599 }
c91830cb 1600 }
d32a507d 1601
467e0479 1602 return outLen;
c91830cb
VZ
1603}
1604
467e0479
VZ
1605size_t
1606wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1607 const wchar_t *src, size_t srcLen) const
c91830cb 1608{
467e0479
VZ
1609 if ( srcLen == wxNO_LEN )
1610 srcLen = wxWcslen(src) + 1;
c91830cb 1611
467e0479 1612 if ( !dst )
c91830cb 1613 {
467e0479
VZ
1614 // optimization: return maximal space which could be needed for this
1615 // string instead of the exact amount which could be less if there are
1616 // any surrogates in the input
1617 //
1618 // we consider that surrogates are rare enough to make it worthwhile to
1619 // avoid running the loop below at the cost of slightly extra memory
1620 // consumption
ef199164 1621 return srcLen * BYTES_PER_CHAR;
467e0479 1622 }
c91830cb 1623
ef199164 1624 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1625 size_t outLen = 0;
1626 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1627 {
1628 const wxUint32 ch = wxDecodeSurrogate(&src);
1629 if ( !src )
1630 return wxCONV_FAILED;
c91830cb 1631
467e0479 1632 outLen += BYTES_PER_CHAR;
d32a507d 1633
467e0479
VZ
1634 if ( outLen > dstLen )
1635 return wxCONV_FAILED;
b5153fd8 1636
ef199164 1637 *outBuff++ = ch;
467e0479 1638 }
c91830cb 1639
467e0479 1640 return outLen;
c91830cb
VZ
1641}
1642
467e0479
VZ
1643// ----------------------------------------------------------------------------
1644// endian-reversing conversions
1645// ----------------------------------------------------------------------------
c91830cb 1646
467e0479
VZ
1647size_t
1648wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1649 const char *src, size_t srcLen) const
c91830cb 1650{
467e0479
VZ
1651 srcLen = GetLength(src, srcLen);
1652 if ( srcLen == wxNO_LEN )
1653 return wxCONV_FAILED;
c91830cb 1654
ef199164
DS
1655 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1656 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1657 size_t outLen = 0;
ef199164 1658 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1659 {
c91830cb 1660 wxUint16 cc[2];
ef199164 1661 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1662 if ( numChars == wxCONV_FAILED )
1663 return wxCONV_FAILED;
c91830cb 1664
467e0479
VZ
1665 outLen += numChars;
1666 if ( dst )
c91830cb 1667 {
467e0479
VZ
1668 if ( outLen > dstLen )
1669 return wxCONV_FAILED;
d32a507d 1670
467e0479
VZ
1671 *dst++ = cc[0];
1672 if ( numChars == 2 )
1673 {
1674 // second character of a surrogate
1675 *dst++ = cc[1];
1676 }
1677 }
c91830cb 1678 }
b5153fd8 1679
467e0479 1680 return outLen;
c91830cb
VZ
1681}
1682
467e0479
VZ
1683size_t
1684wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1685 const wchar_t *src, size_t srcLen) const
c91830cb 1686{
467e0479
VZ
1687 if ( srcLen == wxNO_LEN )
1688 srcLen = wxWcslen(src) + 1;
c91830cb 1689
467e0479 1690 if ( !dst )
c91830cb 1691 {
467e0479
VZ
1692 // optimization: return maximal space which could be needed for this
1693 // string instead of the exact amount which could be less if there are
1694 // any surrogates in the input
1695 //
1696 // we consider that surrogates are rare enough to make it worthwhile to
1697 // avoid running the loop below at the cost of slightly extra memory
1698 // consumption
1699 return srcLen*BYTES_PER_CHAR;
1700 }
c91830cb 1701
ef199164 1702 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1703 size_t outLen = 0;
1704 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1705 {
1706 const wxUint32 ch = wxDecodeSurrogate(&src);
1707 if ( !src )
1708 return wxCONV_FAILED;
c91830cb 1709
467e0479 1710 outLen += BYTES_PER_CHAR;
d32a507d 1711
467e0479
VZ
1712 if ( outLen > dstLen )
1713 return wxCONV_FAILED;
b5153fd8 1714
ef199164 1715 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1716 }
c91830cb 1717
467e0479 1718 return outLen;
c91830cb
VZ
1719}
1720
467e0479 1721#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1722
35d11700
VZ
1723// ----------------------------------------------------------------------------
1724// conversions without endianness change
1725// ----------------------------------------------------------------------------
1726
1727size_t
1728wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1729 const char *src, size_t srcLen) const
c91830cb 1730{
35d11700
VZ
1731 // use memcpy() as it should be much faster than hand-written loop
1732 srcLen = GetLength(src, srcLen);
1733 if ( srcLen == wxNO_LEN )
1734 return wxCONV_FAILED;
c91830cb 1735
35d11700
VZ
1736 const size_t inLen = srcLen/BYTES_PER_CHAR;
1737 if ( dst )
c91830cb 1738 {
35d11700
VZ
1739 if ( dstLen < inLen )
1740 return wxCONV_FAILED;
b5153fd8 1741
35d11700
VZ
1742 memcpy(dst, src, srcLen);
1743 }
c91830cb 1744
35d11700 1745 return inLen;
c91830cb
VZ
1746}
1747
35d11700
VZ
1748size_t
1749wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1750 const wchar_t *src, size_t srcLen) const
c91830cb 1751{
35d11700
VZ
1752 if ( srcLen == wxNO_LEN )
1753 srcLen = wxWcslen(src) + 1;
1754
1755 srcLen *= BYTES_PER_CHAR;
c91830cb 1756
35d11700 1757 if ( dst )
c91830cb 1758 {
35d11700
VZ
1759 if ( dstLen < srcLen )
1760 return wxCONV_FAILED;
c91830cb 1761
35d11700 1762 memcpy(dst, src, srcLen);
c91830cb
VZ
1763 }
1764
35d11700 1765 return srcLen;
c91830cb
VZ
1766}
1767
35d11700
VZ
1768// ----------------------------------------------------------------------------
1769// endian-reversing conversions
1770// ----------------------------------------------------------------------------
c91830cb 1771
35d11700
VZ
1772size_t
1773wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1774 const char *src, size_t srcLen) const
c91830cb 1775{
35d11700
VZ
1776 srcLen = GetLength(src, srcLen);
1777 if ( srcLen == wxNO_LEN )
1778 return wxCONV_FAILED;
1779
1780 srcLen /= BYTES_PER_CHAR;
c91830cb 1781
35d11700 1782 if ( dst )
c91830cb 1783 {
35d11700
VZ
1784 if ( dstLen < srcLen )
1785 return wxCONV_FAILED;
1786
ef199164
DS
1787 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1788 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1789 {
ef199164 1790 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1791 }
c91830cb 1792 }
b5153fd8 1793
35d11700 1794 return srcLen;
c91830cb
VZ
1795}
1796
35d11700
VZ
1797size_t
1798wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1799 const wchar_t *src, size_t srcLen) const
c91830cb 1800{
35d11700
VZ
1801 if ( srcLen == wxNO_LEN )
1802 srcLen = wxWcslen(src) + 1;
1803
1804 srcLen *= BYTES_PER_CHAR;
c91830cb 1805
35d11700 1806 if ( dst )
c91830cb 1807 {
35d11700
VZ
1808 if ( dstLen < srcLen )
1809 return wxCONV_FAILED;
1810
ef199164 1811 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1812 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1813 {
ef199164 1814 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1815 }
c91830cb 1816 }
b5153fd8 1817
35d11700 1818 return srcLen;
c91830cb
VZ
1819}
1820
467e0479 1821#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1822
1823
36acb880
VZ
1824// ============================================================================
1825// The classes doing conversion using the iconv_xxx() functions
1826// ============================================================================
3caec1bb 1827
b040e242 1828#ifdef HAVE_ICONV
3a0d76bc 1829
b1d547eb
VS
1830// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1831// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1832// (unless there's yet another bug in glibc) the only case when iconv()
1833// returns with (size_t)-1 (which means error) and says there are 0 bytes
1834// left in the input buffer -- when _real_ error occurs,
1835// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1836// iconv() failure.
3caec1bb
VS
1837// [This bug does not appear in glibc 2.2.]
1838#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1839#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1840 (errno != E2BIG || bufLeft != 0))
1841#else
1842#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1843#endif
1844
ab217dba 1845#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1846
74a7eb0b
VZ
1847#define ICONV_T_INVALID ((iconv_t)-1)
1848
1849#if SIZEOF_WCHAR_T == 4
1850 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1851 #define WC_ENC wxFONTENCODING_UTF32
1852#elif SIZEOF_WCHAR_T == 2
1853 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1854 #define WC_ENC wxFONTENCODING_UTF16
1855#else // sizeof(wchar_t) != 2 nor 4
1856 // does this ever happen?
1857 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1858#endif
1859
36acb880 1860// ----------------------------------------------------------------------------
e95354ec 1861// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1862// ----------------------------------------------------------------------------
1863
e95354ec 1864class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1865{
1866public:
86501081 1867 wxMBConv_iconv(const char *name);
e95354ec 1868 virtual ~wxMBConv_iconv();
36acb880 1869
bde4baac
VZ
1870 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1871 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1872
d36c9347 1873 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
7ef3ab50
VZ
1874 virtual size_t GetMBNulLen() const;
1875
ba98e032
VS
1876#if wxUSE_UNICODE_UTF8
1877 virtual bool IsUTF8() const;
1878#endif
1879
d36c9347
VZ
1880 virtual wxMBConv *Clone() const
1881 {
86501081 1882 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
1883 p->m_minMBCharWidth = m_minMBCharWidth;
1884 return p;
1885 }
1886
e95354ec 1887 bool IsOk() const
74a7eb0b 1888 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1889
1890protected:
ef199164
DS
1891 // the iconv handlers used to translate from multibyte
1892 // to wide char and in the other direction
36acb880
VZ
1893 iconv_t m2w,
1894 w2m;
ef199164 1895
b1d547eb
VS
1896#if wxUSE_THREADS
1897 // guards access to m2w and w2m objects
1898 wxMutex m_iconvMutex;
1899#endif
36acb880
VZ
1900
1901private:
e95354ec 1902 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1903 // available on this machine, it will remain NULL
74a7eb0b 1904 static wxString ms_wcCharsetName;
36acb880
VZ
1905
1906 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1907 // different endian-ness than the native one
405d8f46 1908 static bool ms_wcNeedsSwap;
eec47cc6 1909
d36c9347
VZ
1910
1911 // name of the encoding handled by this conversion
1912 wxString m_name;
1913
7ef3ab50 1914 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
1915 // initially
1916 size_t m_minMBCharWidth;
36acb880
VZ
1917};
1918
8f115891 1919// make the constructor available for unit testing
86501081 1920WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
1921{
1922 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1923 if ( !result->IsOk() )
1924 {
1925 delete result;
1926 return 0;
1927 }
ef199164 1928
8f115891
MW
1929 return result;
1930}
1931
422e411e 1932wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1933bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1934
86501081 1935wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 1936 : m_name(name)
36acb880 1937{
c1464d9d 1938 m_minMBCharWidth = 0;
eec47cc6 1939
36acb880 1940 // check for charset that represents wchar_t:
74a7eb0b 1941 if ( ms_wcCharsetName.empty() )
f1339c56 1942 {
c2b83fdd
VZ
1943 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1944
74a7eb0b
VZ
1945#if wxUSE_FONTMAP
1946 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1947#else // !wxUSE_FONTMAP
91cb7f52 1948 static const wxChar *names_static[] =
36acb880 1949 {
74a7eb0b
VZ
1950#if SIZEOF_WCHAR_T == 4
1951 _T("UCS-4"),
1952#elif SIZEOF_WCHAR_T = 2
1953 _T("UCS-2"),
1954#endif
1955 NULL
1956 };
91cb7f52 1957 const wxChar **names = names_static;
74a7eb0b 1958#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1959
d1f024a8 1960 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1961 {
17a1ebd1 1962 const wxString nameCS(*names);
74a7eb0b
VZ
1963
1964 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1965 wxString nameXE(nameCS);
ef199164
DS
1966
1967#ifdef WORDS_BIGENDIAN
74a7eb0b 1968 nameXE += _T("BE");
ef199164 1969#else // little endian
74a7eb0b 1970 nameXE += _T("LE");
ef199164 1971#endif
74a7eb0b 1972
c2b83fdd
VZ
1973 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1974 nameXE.c_str());
1975
86501081 1976 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 1977 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1978 {
74a7eb0b 1979 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1980 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1981 nameCS.c_str());
86501081 1982 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 1983
74a7eb0b
VZ
1984 // and check for bytesex ourselves:
1985 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1986 {
74a7eb0b
VZ
1987 char buf[2], *bufPtr;
1988 wchar_t wbuf[2], *wbufPtr;
1989 size_t insz, outsz;
1990 size_t res;
1991
1992 buf[0] = 'A';
1993 buf[1] = 0;
1994 wbuf[0] = 0;
1995 insz = 2;
1996 outsz = SIZEOF_WCHAR_T * 2;
1997 wbufPtr = wbuf;
1998 bufPtr = buf;
1999
ef199164
DS
2000 res = iconv(
2001 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2002 (char**)&wbufPtr, &outsz);
74a7eb0b
VZ
2003
2004 if (ICONV_FAILED(res, insz))
2005 {
2006 wxLogLastError(wxT("iconv"));
422e411e 2007 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2008 nameCS.c_str());
74a7eb0b
VZ
2009 }
2010 else // ok, can convert to this encoding, remember it
2011 {
17a1ebd1 2012 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2013 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2014 }
3a0d76bc
VS
2015 }
2016 }
74a7eb0b 2017 else // use charset not requiring byte swapping
36acb880 2018 {
74a7eb0b 2019 ms_wcCharsetName = nameXE;
36acb880 2020 }
3a0d76bc 2021 }
74a7eb0b 2022
0944fceb 2023 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2024 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2025 ms_wcCharsetName.empty() ? wxString("<none>")
2026 : ms_wcCharsetName,
74a7eb0b
VZ
2027 ms_wcNeedsSwap ? _T(" (needs swap)")
2028 : _T(""));
3a0d76bc 2029 }
36acb880 2030 else // we already have ms_wcCharsetName
3caec1bb 2031 {
86501081 2032 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2033 }
dccce9ea 2034
74a7eb0b 2035 if ( ms_wcCharsetName.empty() )
f1339c56 2036 {
74a7eb0b 2037 w2m = ICONV_T_INVALID;
36acb880 2038 }
405d8f46
VZ
2039 else
2040 {
86501081 2041 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2042 if ( w2m == ICONV_T_INVALID )
2043 {
2044 wxLogTrace(TRACE_STRCONV,
2045 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2046 ms_wcCharsetName.c_str(), name);
74a7eb0b 2047 }
405d8f46 2048 }
36acb880 2049}
3caec1bb 2050
e95354ec 2051wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2052{
74a7eb0b 2053 if ( m2w != ICONV_T_INVALID )
36acb880 2054 iconv_close(m2w);
74a7eb0b 2055 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2056 iconv_close(w2m);
2057}
3a0d76bc 2058
bde4baac 2059size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 2060{
69373110
VZ
2061 // find the string length: notice that must be done differently for
2062 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2063 size_t inbuf;
7ef3ab50 2064 const size_t nulLen = GetMBNulLen();
69373110
VZ
2065 switch ( nulLen )
2066 {
2067 default:
467e0479 2068 return wxCONV_FAILED;
69373110
VZ
2069
2070 case 1:
2071 inbuf = strlen(psz); // arguably more optimized than our version
2072 break;
2073
2074 case 2:
2075 case 4:
2076 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2077 // they also have to start at character boundary and not span two
2078 // adjacent characters
2079 const char *p;
2080 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2081 ;
2082 inbuf = p - psz;
2083 break;
2084 }
2085
b1d547eb 2086#if wxUSE_THREADS
6a17b868
SN
2087 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2088 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2089 // wxConvLocal that are used all over wx code, so we have to make sure
2090 // the handle is used by at most one thread at the time. Otherwise
2091 // only a few wx classes would be safe to use from non-main threads
2092 // as MB<->WC conversion would fail "randomly".
2093 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2094#endif // wxUSE_THREADS
2095
36acb880
VZ
2096 size_t outbuf = n * SIZEOF_WCHAR_T;
2097 size_t res, cres;
2098 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2099 wchar_t *bufPtr = buf;
2100 const char *pszPtr = psz;
2101
2102 if (buf)
2103 {
2104 // have destination buffer, convert there
2105 cres = iconv(m2w,
2106 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2107 (char**)&bufPtr, &outbuf);
2108 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 2109
36acb880 2110 if (ms_wcNeedsSwap)
3a0d76bc 2111 {
36acb880 2112 // convert to native endianness
17a1ebd1
VZ
2113 for ( unsigned i = 0; i < res; i++ )
2114 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 2115 }
adb45366 2116
69373110 2117 // NUL-terminate the string if there is any space left
49dd9820
VS
2118 if (res < n)
2119 buf[res] = 0;
36acb880
VZ
2120 }
2121 else
2122 {
2123 // no destination buffer... convert using temp buffer
2124 // to calculate destination buffer requirement
2125 wchar_t tbuf[8];
2126 res = 0;
ef199164
DS
2127
2128 do
2129 {
36acb880 2130 bufPtr = tbuf;
ef199164 2131 outbuf = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2132
2133 cres = iconv(m2w,
2134 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2135 (char**)&bufPtr, &outbuf );
2136
ef199164
DS
2137 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2138 }
2139 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2140 }
dccce9ea 2141
36acb880 2142 if (ICONV_FAILED(cres, inbuf))
f1339c56 2143 {
36acb880 2144 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2145 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2146 return wxCONV_FAILED;
36acb880
VZ
2147 }
2148
2149 return res;
2150}
2151
bde4baac 2152size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 2153{
b1d547eb
VS
2154#if wxUSE_THREADS
2155 // NB: explained in MB2WC
2156 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2157#endif
3698ae71 2158
156162ec
MW
2159 size_t inlen = wxWcslen(psz);
2160 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
2161 size_t outbuf = n;
2162 size_t res, cres;
3a0d76bc 2163
36acb880 2164 wchar_t *tmpbuf = 0;
3caec1bb 2165
36acb880
VZ
2166 if (ms_wcNeedsSwap)
2167 {
2168 // need to copy to temp buffer to switch endianness
74a7eb0b 2169 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 2170 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 2171 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
2172 for ( size_t i = 0; i < inlen; i++ )
2173 tmpbuf[n] = WC_BSWAP(psz[i]);
ef199164 2174
156162ec 2175 tmpbuf[inlen] = L'\0';
74a7eb0b 2176 psz = tmpbuf;
36acb880 2177 }
3a0d76bc 2178
36acb880
VZ
2179 if (buf)
2180 {
2181 // have destination buffer, convert there
2182 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 2183
ef199164 2184 res = n - outbuf;
adb45366 2185
49dd9820
VS
2186 // NB: iconv was given only wcslen(psz) characters on input, and so
2187 // it couldn't convert the trailing zero. Let's do it ourselves
2188 // if there's some room left for it in the output buffer.
2189 if (res < n)
2190 buf[0] = 0;
36acb880
VZ
2191 }
2192 else
2193 {
ef199164 2194 // no destination buffer: convert using temp buffer
36acb880
VZ
2195 // to calculate destination buffer requirement
2196 char tbuf[16];
2197 res = 0;
ef199164
DS
2198 do
2199 {
2200 buf = tbuf;
2201 outbuf = 16;
36acb880
VZ
2202
2203 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 2204
36acb880 2205 res += 16 - outbuf;
ef199164
DS
2206 }
2207 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2208 }
dccce9ea 2209
36acb880
VZ
2210 if (ms_wcNeedsSwap)
2211 {
2212 free(tmpbuf);
2213 }
dccce9ea 2214
36acb880
VZ
2215 if (ICONV_FAILED(cres, inbuf))
2216 {
ce6f8d6f 2217 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2218 return wxCONV_FAILED;
36acb880
VZ
2219 }
2220
2221 return res;
2222}
2223
7ef3ab50 2224size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2225{
c1464d9d 2226 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2227 {
2228 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2229
2230#if wxUSE_THREADS
2231 // NB: explained in MB2WC
2232 wxMutexLocker lock(self->m_iconvMutex);
2233#endif
2234
999020e1 2235 const wchar_t *wnul = L"";
c1464d9d 2236 char buf[8]; // should be enough for NUL in any encoding
356410fc 2237 size_t inLen = sizeof(wchar_t),
c1464d9d 2238 outLen = WXSIZEOF(buf);
ef199164
DS
2239 char *inBuff = (char *)wnul;
2240 char *outBuff = buf;
2241 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2242 {
c1464d9d 2243 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2244 }
2245 else // ok
2246 {
ef199164 2247 self->m_minMBCharWidth = outBuff - buf;
356410fc 2248 }
eec47cc6
VZ
2249 }
2250
c1464d9d 2251 return m_minMBCharWidth;
eec47cc6
VZ
2252}
2253
ba98e032
VS
2254#if wxUSE_UNICODE_UTF8
2255bool wxMBConv_iconv::IsUTF8() const
2256{
86501081
VS
2257 return wxStricmp(m_name, "UTF-8") == 0 ||
2258 wxStricmp(m_name, "UTF8") == 0;
ba98e032
VS
2259}
2260#endif
2261
b040e242 2262#endif // HAVE_ICONV
36acb880 2263
e95354ec 2264
36acb880
VZ
2265// ============================================================================
2266// Win32 conversion classes
2267// ============================================================================
1cd52418 2268
e95354ec 2269#ifdef wxHAVE_WIN32_MB2WC
373658eb 2270
8b04d4c4 2271// from utils.cpp
d775fa82 2272#if wxUSE_FONTMAP
86501081 2273extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2274extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2275#endif
373658eb 2276
e95354ec 2277class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2278{
2279public:
bde4baac
VZ
2280 wxMBConv_win32()
2281 {
2282 m_CodePage = CP_ACP;
c1464d9d 2283 m_minMBCharWidth = 0;
bde4baac
VZ
2284 }
2285
d36c9347 2286 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2287 : wxMBConv()
d36c9347
VZ
2288 {
2289 m_CodePage = conv.m_CodePage;
2290 m_minMBCharWidth = conv.m_minMBCharWidth;
2291 }
2292
7608a683 2293#if wxUSE_FONTMAP
86501081 2294 wxMBConv_win32(const char* name)
bde4baac
VZ
2295 {
2296 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2297 m_minMBCharWidth = 0;
bde4baac 2298 }
dccce9ea 2299
e95354ec 2300 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2301 {
2302 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2303 m_minMBCharWidth = 0;
bde4baac 2304 }
eec47cc6 2305#endif // wxUSE_FONTMAP
8b04d4c4 2306
d36c9347 2307 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2308 {
02272c9c
VZ
2309 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2310 // the behaviour is not compatible with the Unix version (using iconv)
2311 // and break the library itself, e.g. wxTextInputStream::NextChar()
2312 // wouldn't work if reading an incomplete MB char didn't result in an
2313 // error
667e5b3e 2314 //
89028980 2315 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2316 // Win XP or newer and it is not supported for UTF-[78] so we always
2317 // use our own conversions in this case. See
89028980
VS
2318 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2319 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2320 if ( m_CodePage == CP_UTF8 )
89028980 2321 {
5487ff0f 2322 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2323 }
830f8f11
VZ
2324
2325 if ( m_CodePage == CP_UTF7 )
2326 {
5487ff0f 2327 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2328 }
2329
2330 int flags = 0;
2331 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2332 IsAtLeastWin2kSP4() )
89028980 2333 {
830f8f11 2334 flags = MB_ERR_INVALID_CHARS;
89028980 2335 }
667e5b3e 2336
2b5f62a0
VZ
2337 const size_t len = ::MultiByteToWideChar
2338 (
2339 m_CodePage, // code page
667e5b3e 2340 flags, // flags: fall on error
2b5f62a0
VZ
2341 psz, // input string
2342 -1, // its length (NUL-terminated)
b4da152e 2343 buf, // output string
2b5f62a0
VZ
2344 buf ? n : 0 // size of output buffer
2345 );
89028980
VS
2346 if ( !len )
2347 {
2348 // function totally failed
467e0479 2349 return wxCONV_FAILED;
89028980
VS
2350 }
2351
2352 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2353 // check if we succeeded, by doing a double trip:
2354 if ( !flags && buf )
2355 {
53c174fc
VZ
2356 const size_t mbLen = strlen(psz);
2357 wxCharBuffer mbBuf(mbLen);
89028980
VS
2358 if ( ::WideCharToMultiByte
2359 (
2360 m_CodePage,
2361 0,
2362 buf,
2363 -1,
2364 mbBuf.data(),
53c174fc 2365 mbLen + 1, // size in bytes, not length
89028980
VS
2366 NULL,
2367 NULL
2368 ) == 0 ||
2369 strcmp(mbBuf, psz) != 0 )
2370 {
2371 // we didn't obtain the same thing we started from, hence
2372 // the conversion was lossy and we consider that it failed
467e0479 2373 return wxCONV_FAILED;
89028980
VS
2374 }
2375 }
2b5f62a0 2376
03a991bc
VZ
2377 // note that it returns count of written chars for buf != NULL and size
2378 // of the needed buffer for buf == NULL so in either case the length of
2379 // the string (which never includes the terminating NUL) is one less
89028980 2380 return len - 1;
f1339c56 2381 }
dccce9ea 2382
d36c9347 2383 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2384 {
13dd924a
VZ
2385 /*
2386 we have a problem here: by default, WideCharToMultiByte() may
2387 replace characters unrepresentable in the target code page with bad
2388 quality approximations such as turning "1/2" symbol (U+00BD) into
2389 "1" for the code pages which don't have it and we, obviously, want
2390 to avoid this at any price
d775fa82 2391
13dd924a
VZ
2392 the trouble is that this function does it _silently_, i.e. it won't
2393 even tell us whether it did or not... Win98/2000 and higher provide
2394 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2395 we have to resort to a round trip, i.e. check that converting back
2396 results in the same string -- this is, of course, expensive but
2397 otherwise we simply can't be sure to not garble the data.
2398 */
2399
2400 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2401 // it doesn't work with CJK encodings (which we test for rather roughly
2402 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2403 // supporting it
907173e5
WS
2404 BOOL usedDef wxDUMMY_INITIALIZE(false);
2405 BOOL *pUsedDef;
13dd924a
VZ
2406 int flags;
2407 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2408 {
2409 // it's our lucky day
2410 flags = WC_NO_BEST_FIT_CHARS;
2411 pUsedDef = &usedDef;
2412 }
2413 else // old system or unsupported encoding
2414 {
2415 flags = 0;
2416 pUsedDef = NULL;
2417 }
2418
2b5f62a0
VZ
2419 const size_t len = ::WideCharToMultiByte
2420 (
2421 m_CodePage, // code page
13dd924a
VZ
2422 flags, // either none or no best fit
2423 pwz, // input string
2b5f62a0
VZ
2424 -1, // it is (wide) NUL-terminated
2425 buf, // output buffer
2426 buf ? n : 0, // and its size
2427 NULL, // default "replacement" char
13dd924a 2428 pUsedDef // [out] was it used?
2b5f62a0
VZ
2429 );
2430
13dd924a
VZ
2431 if ( !len )
2432 {
2433 // function totally failed
467e0479 2434 return wxCONV_FAILED;
13dd924a
VZ
2435 }
2436
765bdb4a
VZ
2437 // we did something, check if we really succeeded
2438 if ( flags )
13dd924a 2439 {
765bdb4a
VZ
2440 // check if the conversion failed, i.e. if any replacements
2441 // were done
2442 if ( usedDef )
2443 return wxCONV_FAILED;
2444 }
2445 else // we must resort to double tripping...
2446 {
2447 // first we need to ensure that we really have the MB data: this is
2448 // not the case if we're called with NULL buffer, in which case we
2449 // need to do the conversion yet again
2450 wxCharBuffer bufDef;
2451 if ( !buf )
13dd924a 2452 {
765bdb4a
VZ
2453 bufDef = wxCharBuffer(len);
2454 buf = bufDef.data();
2455 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2456 buf, len, NULL, NULL) )
467e0479 2457 return wxCONV_FAILED;
13dd924a 2458 }
765bdb4a 2459
564da6ff
VZ
2460 if ( !n )
2461 n = wcslen(pwz);
765bdb4a 2462 wxWCharBuffer wcBuf(n);
564da6ff 2463 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
765bdb4a 2464 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2465 {
765bdb4a
VZ
2466 // we didn't obtain the same thing we started from, hence
2467 // the conversion was lossy and we consider that it failed
2468 return wxCONV_FAILED;
13dd924a
VZ
2469 }
2470 }
2471
03a991bc 2472 // see the comment above for the reason of "len - 1"
13dd924a 2473 return len - 1;
f1339c56 2474 }
dccce9ea 2475
7ef3ab50
VZ
2476 virtual size_t GetMBNulLen() const
2477 {
2478 if ( m_minMBCharWidth == 0 )
2479 {
2480 int len = ::WideCharToMultiByte
2481 (
2482 m_CodePage, // code page
2483 0, // no flags
2484 L"", // input string
2485 1, // translate just the NUL
2486 NULL, // output buffer
2487 0, // and its size
2488 NULL, // no replacement char
2489 NULL // [out] don't care if it was used
2490 );
2491
2492 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2493 switch ( len )
2494 {
2495 default:
2496 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2497 self->m_minMBCharWidth = (size_t)-1;
2498 break;
7ef3ab50
VZ
2499
2500 case 0:
2501 self->m_minMBCharWidth = (size_t)-1;
2502 break;
2503
2504 case 1:
2505 case 2:
2506 case 4:
2507 self->m_minMBCharWidth = len;
2508 break;
2509 }
2510 }
2511
2512 return m_minMBCharWidth;
2513 }
2514
d36c9347
VZ
2515 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2516
13dd924a
VZ
2517 bool IsOk() const { return m_CodePage != -1; }
2518
2519private:
2520 static bool CanUseNoBestFit()
2521 {
2522 static int s_isWin98Or2k = -1;
2523
2524 if ( s_isWin98Or2k == -1 )
2525 {
2526 int verMaj, verMin;
2527 switch ( wxGetOsVersion(&verMaj, &verMin) )
2528 {
406d283a 2529 case wxOS_WINDOWS_9X:
13dd924a
VZ
2530 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2531 break;
2532
406d283a 2533 case wxOS_WINDOWS_NT:
13dd924a
VZ
2534 s_isWin98Or2k = verMaj >= 5;
2535 break;
2536
2537 default:
ef199164 2538 // unknown: be conservative by default
13dd924a 2539 s_isWin98Or2k = 0;
ef199164 2540 break;
13dd924a
VZ
2541 }
2542
2543 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2544 }
2545
2546 return s_isWin98Or2k == 1;
2547 }
f1339c56 2548
89028980
VS
2549 static bool IsAtLeastWin2kSP4()
2550 {
8942f83a
WS
2551#ifdef __WXWINCE__
2552 return false;
2553#else
89028980
VS
2554 static int s_isAtLeastWin2kSP4 = -1;
2555
2556 if ( s_isAtLeastWin2kSP4 == -1 )
2557 {
2558 OSVERSIONINFOEX ver;
2559
2560 memset(&ver, 0, sizeof(ver));
2561 ver.dwOSVersionInfoSize = sizeof(ver);
2562 GetVersionEx((OSVERSIONINFO*)&ver);
2563
2564 s_isAtLeastWin2kSP4 =
2565 ((ver.dwMajorVersion > 5) || // Vista+
2566 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2567 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2568 ver.wServicePackMajor >= 4)) // 2000 SP4+
2569 ? 1 : 0;
2570 }
2571
2572 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2573#endif
89028980
VS
2574 }
2575
eec47cc6 2576
c1464d9d 2577 // the code page we're working with
b1d66b54 2578 long m_CodePage;
c1464d9d 2579
7ef3ab50 2580 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2581 // "unknown"
2582 size_t m_minMBCharWidth;
1cd52418 2583};
e95354ec
VZ
2584
2585#endif // wxHAVE_WIN32_MB2WC
2586
f7e98dee 2587
36acb880
VZ
2588// ============================================================================
2589// wxEncodingConverter based conversion classes
2590// ============================================================================
2591
1e6feb95 2592#if wxUSE_FONTMAP
1cd52418 2593
e95354ec 2594class wxMBConv_wxwin : public wxMBConv
1cd52418 2595{
8b04d4c4
VZ
2596private:
2597 void Init()
2598 {
6ac84a78
DE
2599 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2600 // The wxMBConv_cf class does a better job.
2601 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2602 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2603 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2604 }
2605
6001e347 2606public:
f1339c56
RR
2607 // temporarily just use wxEncodingConverter stuff,
2608 // so that it works while a better implementation is built
86501081 2609 wxMBConv_wxwin(const char* name)
f1339c56
RR
2610 {
2611 if (name)
267e11c5 2612 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2613 else
2614 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2615
8b04d4c4
VZ
2616 Init();
2617 }
2618
e95354ec 2619 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2620 {
2621 m_enc = enc;
2622
2623 Init();
f1339c56 2624 }
dccce9ea 2625
bde4baac 2626 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2627 {
2628 size_t inbuf = strlen(psz);
dccce9ea 2629 if (buf)
c643a977 2630 {
ef199164 2631 if (!m2w.Convert(psz, buf))
467e0479 2632 return wxCONV_FAILED;
c643a977 2633 }
f1339c56
RR
2634 return inbuf;
2635 }
dccce9ea 2636
bde4baac 2637 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2638 {
f8d791e0 2639 const size_t inbuf = wxWcslen(psz);
f1339c56 2640 if (buf)
c643a977 2641 {
ef199164 2642 if (!w2m.Convert(psz, buf))
467e0479 2643 return wxCONV_FAILED;
c643a977 2644 }
dccce9ea 2645
f1339c56
RR
2646 return inbuf;
2647 }
dccce9ea 2648
7ef3ab50 2649 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2650 {
2651 switch ( m_enc )
2652 {
2653 case wxFONTENCODING_UTF16BE:
2654 case wxFONTENCODING_UTF16LE:
c1464d9d 2655 return 2;
eec47cc6
VZ
2656
2657 case wxFONTENCODING_UTF32BE:
2658 case wxFONTENCODING_UTF32LE:
c1464d9d 2659 return 4;
eec47cc6
VZ
2660
2661 default:
c1464d9d 2662 return 1;
eec47cc6
VZ
2663 }
2664 }
2665
d36c9347
VZ
2666 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2667
7ef3ab50
VZ
2668 bool IsOk() const { return m_ok; }
2669
2670public:
2671 wxFontEncoding m_enc;
2672 wxEncodingConverter m2w, w2m;
2673
2674private:
cafbf6fb
VZ
2675 // were we initialized successfully?
2676 bool m_ok;
fc7a2a60 2677
e95354ec 2678 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2679};
6001e347 2680
8f115891 2681// make the constructors available for unit testing
86501081 2682WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2683{
2684 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2685 if ( !result->IsOk() )
2686 {
2687 delete result;
2688 return 0;
2689 }
ef199164 2690
8f115891
MW
2691 return result;
2692}
2693
1e6feb95
VZ
2694#endif // wxUSE_FONTMAP
2695
36acb880
VZ
2696// ============================================================================
2697// wxCSConv implementation
2698// ============================================================================
2699
8b04d4c4 2700void wxCSConv::Init()
6001e347 2701{
e95354ec
VZ
2702 m_name = NULL;
2703 m_convReal = NULL;
2704 m_deferred = true;
2705}
2706
86501081 2707wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2708{
2709 Init();
82713003 2710
86501081 2711 if ( !charset.empty() )
e95354ec 2712 {
86501081 2713 SetName(charset.ToAscii());
e95354ec 2714 }
bda3d86a 2715
e4277538
VZ
2716#if wxUSE_FONTMAP
2717 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2718#else
bda3d86a 2719 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2720#endif
6001e347
RR
2721}
2722
8b04d4c4
VZ
2723wxCSConv::wxCSConv(wxFontEncoding encoding)
2724{
bda3d86a 2725 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2726 {
2727 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2728
2729 encoding = wxFONTENCODING_SYSTEM;
2730 }
2731
8b04d4c4
VZ
2732 Init();
2733
bda3d86a 2734 m_encoding = encoding;
8b04d4c4
VZ
2735}
2736
6001e347
RR
2737wxCSConv::~wxCSConv()
2738{
65e50848
JS
2739 Clear();
2740}
2741
54380f29 2742wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2743 : wxMBConv()
54380f29 2744{
8b04d4c4
VZ
2745 Init();
2746
54380f29 2747 SetName(conv.m_name);
8b04d4c4 2748 m_encoding = conv.m_encoding;
54380f29
GD
2749}
2750
2751wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2752{
2753 Clear();
8b04d4c4 2754
54380f29 2755 SetName(conv.m_name);
8b04d4c4
VZ
2756 m_encoding = conv.m_encoding;
2757
54380f29
GD
2758 return *this;
2759}
2760
65e50848
JS
2761void wxCSConv::Clear()
2762{
8b04d4c4 2763 free(m_name);
e95354ec 2764 delete m_convReal;
8b04d4c4 2765
65e50848 2766 m_name = NULL;
e95354ec 2767 m_convReal = NULL;
6001e347
RR
2768}
2769
86501081 2770void wxCSConv::SetName(const char *charset)
6001e347 2771{
f1339c56
RR
2772 if (charset)
2773 {
d6f2a891 2774 m_name = wxStrdup(charset);
e95354ec 2775 m_deferred = true;
f1339c56 2776 }
6001e347
RR
2777}
2778
8b3eb85d 2779#if wxUSE_FONTMAP
8b3eb85d
VZ
2780
2781WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2782 wxEncodingNameCache );
8b3eb85d
VZ
2783
2784static wxEncodingNameCache gs_nameCache;
2785#endif
2786
e95354ec
VZ
2787wxMBConv *wxCSConv::DoCreate() const
2788{
ce6f8d6f
VZ
2789#if wxUSE_FONTMAP
2790 wxLogTrace(TRACE_STRCONV,
2791 wxT("creating conversion for %s"),
2792 (m_name ? m_name
86501081 2793 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2794#endif // wxUSE_FONTMAP
2795
c547282d
VZ
2796 // check for the special case of ASCII or ISO8859-1 charset: as we have
2797 // special knowledge of it anyhow, we don't need to create a special
2798 // conversion object
e4277538
VZ
2799 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2800 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2801 {
e95354ec
VZ
2802 // don't convert at all
2803 return NULL;
2804 }
dccce9ea 2805
e95354ec
VZ
2806 // we trust OS to do conversion better than we can so try external
2807 // conversion methods first
2808 //
2809 // the full order is:
2810 // 1. OS conversion (iconv() under Unix or Win32 API)
2811 // 2. hard coded conversions for UTF
2812 // 3. wxEncodingConverter as fall back
2813
2814 // step (1)
2815#ifdef HAVE_ICONV
c547282d 2816#if !wxUSE_FONTMAP
e95354ec 2817 if ( m_name )
c547282d 2818#endif // !wxUSE_FONTMAP
e95354ec 2819 {
3ef10cfc 2820#if wxUSE_FONTMAP
8b3eb85d 2821 wxFontEncoding encoding(m_encoding);
3ef10cfc 2822#endif
8b3eb85d 2823
86501081 2824 if ( m_name )
8b3eb85d 2825 {
86501081 2826 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
2827 if ( conv->IsOk() )
2828 return conv;
2829
2830 delete conv;
c547282d
VZ
2831
2832#if wxUSE_FONTMAP
8b3eb85d 2833 encoding =
86501081 2834 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2835#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2836 }
2837#if wxUSE_FONTMAP
2838 {
2839 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2840 if ( it != gs_nameCache.end() )
2841 {
2842 if ( it->second.empty() )
2843 return NULL;
c547282d 2844
86501081 2845 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
2846 if ( conv->IsOk() )
2847 return conv;
e95354ec 2848
8b3eb85d
VZ
2849 delete conv;
2850 }
2851
2852 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
2853 // CS : in case this does not return valid names (eg for MacRoman)
2854 // encoding got a 'failure' entry in the cache all the same,
2855 // although it just has to be created using a different method, so
2856 // only store failed iconv creation attempts (or perhaps we
2857 // shoulnd't do this at all ?)
3c67ec06 2858 if ( names[0] != NULL )
8b3eb85d 2859 {
3c67ec06 2860 for ( ; *names; ++names )
8b3eb85d 2861 {
86501081
VS
2862 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2863 // will need changes that will obsolete this
2864 wxString name(*names);
2865 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
2866 if ( conv->IsOk() )
2867 {
2868 gs_nameCache[encoding] = *names;
2869 return conv;
2870 }
2871
2872 delete conv;
8b3eb85d
VZ
2873 }
2874
3c67ec06 2875 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 2876 }
8b3eb85d
VZ
2877 }
2878#endif // wxUSE_FONTMAP
e95354ec
VZ
2879 }
2880#endif // HAVE_ICONV
2881
2882#ifdef wxHAVE_WIN32_MB2WC
2883 {
7608a683 2884#if wxUSE_FONTMAP
e95354ec
VZ
2885 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2886 : new wxMBConv_win32(m_encoding);
2887 if ( conv->IsOk() )
2888 return conv;
2889
2890 delete conv;
7608a683
WS
2891#else
2892 return NULL;
2893#endif
e95354ec
VZ
2894 }
2895#endif // wxHAVE_WIN32_MB2WC
ef199164 2896
5c4ed98d 2897#ifdef __DARWIN__
f7e98dee 2898 {
6ff49cbc
DE
2899 // leave UTF16 and UTF32 to the built-ins of wx
2900 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2901 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 2902 {
a6900d10 2903#if wxUSE_FONTMAP
5c4ed98d
DE
2904 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2905 : new wxMBConv_cf(m_encoding);
a6900d10 2906#else
5c4ed98d 2907 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 2908#endif
ef199164 2909
f7e98dee 2910 if ( conv->IsOk() )
d775fa82
WS
2911 return conv;
2912
2913 delete conv;
2914 }
335d31e0 2915 }
5c4ed98d
DE
2916#endif // __DARWIN__
2917
e95354ec
VZ
2918 // step (2)
2919 wxFontEncoding enc = m_encoding;
2920#if wxUSE_FONTMAP
c547282d
VZ
2921 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2922 {
2923 // use "false" to suppress interactive dialogs -- we can be called from
2924 // anywhere and popping up a dialog from here is the last thing we want to
2925 // do
267e11c5 2926 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2927 }
e95354ec
VZ
2928#endif // wxUSE_FONTMAP
2929
2930 switch ( enc )
2931 {
2932 case wxFONTENCODING_UTF7:
2933 return new wxMBConvUTF7;
2934
2935 case wxFONTENCODING_UTF8:
2936 return new wxMBConvUTF8;
2937
e95354ec
VZ
2938 case wxFONTENCODING_UTF16BE:
2939 return new wxMBConvUTF16BE;
2940
2941 case wxFONTENCODING_UTF16LE:
2942 return new wxMBConvUTF16LE;
2943
e95354ec
VZ
2944 case wxFONTENCODING_UTF32BE:
2945 return new wxMBConvUTF32BE;
2946
2947 case wxFONTENCODING_UTF32LE:
2948 return new wxMBConvUTF32LE;
2949
2950 default:
2951 // nothing to do but put here to suppress gcc warnings
ef199164 2952 break;
e95354ec
VZ
2953 }
2954
2955 // step (3)
2956#if wxUSE_FONTMAP
2957 {
2958 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2959 : new wxMBConv_wxwin(m_encoding);
2960 if ( conv->IsOk() )
2961 return conv;
2962
2963 delete conv;
2964 }
2965#endif // wxUSE_FONTMAP
2966
a58d4f4d
VS
2967 // NB: This is a hack to prevent deadlock. What could otherwise happen
2968 // in Unicode build: wxConvLocal creation ends up being here
2969 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
2970 // attach a timestamp, for which it will need wxConvLocal (to convert
2971 // time to char* and then wchar_t*), but that fails, tries to log the
2972 // error, but wxLog has an (already locked) critical section that
2973 // guards the static buffer.
a58d4f4d
VS
2974 static bool alreadyLoggingError = false;
2975 if (!alreadyLoggingError)
2976 {
2977 alreadyLoggingError = true;
2978 wxLogError(_("Cannot convert from the charset '%s'!"),
2979 m_name ? m_name
e95354ec
VZ
2980 :
2981#if wxUSE_FONTMAP
86501081 2982 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 2983#else // !wxUSE_FONTMAP
86501081 2984 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
2985#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2986 );
ef199164 2987
a58d4f4d
VS
2988 alreadyLoggingError = false;
2989 }
e95354ec
VZ
2990
2991 return NULL;
2992}
2993
2994void wxCSConv::CreateConvIfNeeded() const
2995{
2996 if ( m_deferred )
2997 {
2998 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 2999
bda3d86a
VZ
3000 // if we don't have neither the name nor the encoding, use the default
3001 // encoding for this system
3002 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3003 {
4c75209f 3004#if wxUSE_INTL
02c7347b 3005 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3006#else
3007 // fallback to some reasonable default:
3008 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3009#endif // wxUSE_INTL
4c75209f 3010 }
bda3d86a 3011
e95354ec
VZ
3012 self->m_convReal = DoCreate();
3013 self->m_deferred = false;
6001e347 3014 }
6001e347
RR
3015}
3016
0f0298b1
VZ
3017bool wxCSConv::IsOk() const
3018{
3019 CreateConvIfNeeded();
3020
3021 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3022 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3023 return true; // always ok as we do it ourselves
3024
3025 // m_convReal->IsOk() is called at its own creation, so we know it must
3026 // be ok if m_convReal is non-NULL
3027 return m_convReal != NULL;
3028}
3029
1c714a5d
VZ
3030size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3031 const char *src, size_t srcLen) const
3032{
3033 CreateConvIfNeeded();
3034
2c74c558
VS
3035 if (m_convReal)
3036 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3037
3038 // latin-1 (direct)
3039 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3040}
3041
3042size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3043 const wchar_t *src, size_t srcLen) const
3044{
3045 CreateConvIfNeeded();
3046
2c74c558
VS
3047 if (m_convReal)
3048 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3049
3050 // latin-1 (direct)
3051 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3052}
3053
6001e347
RR
3054size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3055{
e95354ec 3056 CreateConvIfNeeded();
dccce9ea 3057
e95354ec
VZ
3058 if (m_convReal)
3059 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3060
3061 // latin-1 (direct)
4def3b35 3062 size_t len = strlen(psz);
dccce9ea 3063
f1339c56
RR
3064 if (buf)
3065 {
4def3b35 3066 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3067 buf[c] = (unsigned char)(psz[c]);
3068 }
dccce9ea 3069
f1339c56 3070 return len;
6001e347
RR
3071}
3072
3073size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3074{
e95354ec 3075 CreateConvIfNeeded();
dccce9ea 3076
e95354ec
VZ
3077 if (m_convReal)
3078 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3079
f1339c56 3080 // latin-1 (direct)
f8d791e0 3081 const size_t len = wxWcslen(psz);
f1339c56
RR
3082 if (buf)
3083 {
4def3b35 3084 for (size_t c = 0; c <= len; c++)
24642831
VS
3085 {
3086 if (psz[c] > 0xFF)
467e0479 3087 return wxCONV_FAILED;
ef199164 3088
907173e5 3089 buf[c] = (char)psz[c];
24642831
VS
3090 }
3091 }
3092 else
3093 {
3094 for (size_t c = 0; c <= len; c++)
3095 {
3096 if (psz[c] > 0xFF)
467e0479 3097 return wxCONV_FAILED;
24642831 3098 }
f1339c56 3099 }
dccce9ea 3100
f1339c56 3101 return len;
6001e347
RR
3102}
3103
7ef3ab50 3104size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3105{
3106 CreateConvIfNeeded();
3107
3108 if ( m_convReal )
3109 {
7ef3ab50 3110 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3111 }
3112
ba98e032 3113 // otherwise, we are ISO-8859-1
c1464d9d 3114 return 1;
eec47cc6
VZ
3115}
3116
ba98e032
VS
3117#if wxUSE_UNICODE_UTF8
3118bool wxCSConv::IsUTF8() const
3119{
3120 CreateConvIfNeeded();
3121
3122 if ( m_convReal )
3123 {
3124 return m_convReal->IsUTF8();
3125 }
3126
3127 // otherwise, we are ISO-8859-1
3128 return false;
3129}
3130#endif
3131
69c928ef
VZ
3132
3133#if wxUSE_UNICODE
3134
3135wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3136{
3137 if ( !s )
3138 return wxWCharBuffer();
3139
3140 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3141 if ( !wbuf )
5487ff0f 3142 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3143 if ( !wbuf )
3144 wbuf = wxConvISO8859_1.cMB2WX(s);
3145
3146 return wbuf;
3147}
3148
3149wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3150{
3151 if ( !ws )
3152 return wxCharBuffer();
3153
3154 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3155 if ( !buf )
3156 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3157
3158 return buf;
3159}
3160
3161#endif // wxUSE_UNICODE
f5a1953b 3162
1e50d914
VS
3163// ----------------------------------------------------------------------------
3164// globals
3165// ----------------------------------------------------------------------------
3166
3167// NB: The reason why we create converter objects in this convoluted way,
3168// using a factory function instead of global variable, is that they
3169// may be used at static initialization time (some of them are used by
3170// wxString ctors and there may be a global wxString object). In other
3171// words, possibly _before_ the converter global object would be
3172// initialized.
3173
3174#undef wxConvLibc
3175#undef wxConvUTF8
3176#undef wxConvUTF7
3177#undef wxConvLocal
3178#undef wxConvISO8859_1
3179
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) expands to:
//  - a `klass*` variable called name##Ptr, initialised to NULL;
//  - an accessor wxGet_<name>Ptr() returning the address of a function-local
//    static impl_klass object constructed with ctor_args;
//  - a file-static pointer initialised by calling that accessor.
// The expansion has no trailing semicolon: the invocation must supply it.
3180#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3181 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3182 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3183 { \
3184 static impl_klass name##Obj ctor_args; \
3185 return &name##Obj; \
3186 } \
3187 /* this ensures that all global converter objects are created */ \
3188 /* by the time static initialization is done, i.e. before any */ \
3189 /* thread is launched: */ \
3190 static klass* gs_##name##instance = wxGet_##name##Ptr()
3191
// Shorthand for the case where the exposed class and the implementation class
// are the same.
3192#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3193 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3194
// wxConvLibc is exposed as a plain wxMBConv; the object behind it is a
// wxMBConv_win32 when __WINDOWS__ is defined and a wxMBConvLibc otherwise.
// wxEMPTY_PARAMETER_VALUE is passed as ctor_args in both cases.
3195#ifdef __WINDOWS__
3196 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3197#else
3198 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3199#endif
3200
e1079eda
VZ
3201// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3202// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3203// provokes an error message about "not enough macro parameters"; and we
3204// can't use "()" here as the name##Obj declaration would be parsed as a
3205// function declaration then, so use a semicolon and live with an extra
3206// empty statement (and hope that no compiler warns about this)
// UTF-8 and UTF-7 converters: ';' is passed as ctor_args, so the objects are
// declared without any constructor arguments.
3207WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3208WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3209
// wxCSConv objects constructed for wxFONTENCODING_SYSTEM and
// wxFONTENCODING_ISO8859_1 respectively.
3210WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3211WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3212
// Initial values of the wxConvCurrent and wxConvUI pointers: the libc and
// the local converter defined above.
3213WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3214WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3215
6ac84a78
DE
3216#ifdef __DARWIN__
3217// The xnu kernel always communicates file paths in decomposed UTF-8.
3218// WARNING: Are we sure that CFString's conversion will cause decomposition?
3219static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3220#endif
6ac84a78 3221
// wxConvFileName points to the wxMBConv_cf UTF-8 object above when
// __DARWIN__ is defined and to the libc converter on all other platforms.
1e50d914 3222WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3223#ifdef __DARWIN__
1e50d914 3224 &wxConvMacUTF8DObj;
6ac84a78 3225#else // !__DARWIN__
1e50d914 3226 wxGet_wxConvLibcPtr();
6ac84a78 3227#endif // __DARWIN__/!__DARWIN__
1e50d914 3228
bde4baac
VZ
3229#else // !wxUSE_WCHAR_T
3230
1e50d914 3231// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3232// stand-ins in absence of wchar_t: plain wxMBConv objects declared under
// the converter names used elsewhere in this file
3233WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3234 wxConvISO8859_1,
3235 wxConvLocal,
3236 wxConvUTF8;
3237
3238#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T