]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Always make wxWindow::SetFocus() focus the window, overriding SetCanFocus, some more...
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
480f42ec
VS
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
373658eb
VZ
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
de6185e2 25 #include "wx/utils.h"
df69528b 26 #include "wx/hashmap.h"
ef199164 27#endif
373658eb 28
bde4baac
VZ
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec 41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
a6c2e2c7
VZ
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
e95354ec 44 #define wxHAVE_WIN32_MB2WC
ef199164 45#endif
e95354ec 46
6001e347 47#ifdef __SALFORDC__
373658eb 48 #include <clib.h>
6001e347
RR
49#endif
50
b040e242 51#ifdef HAVE_ICONV
373658eb 52 #include <iconv.h>
b1d547eb 53 #include "wx/thread.h"
1cd52418 54#endif
1cd52418 55
373658eb
VZ
56#include "wx/encconv.h"
57#include "wx/fontmap.h"
58
5c4ed98d 59#ifdef __DARWIN__
e4dd1e19 60#include "wx/mac/corefoundation/private/strconv_cf.h"
5c4ed98d
DE
61#endif //def __DARWIN__
62
ef199164 63
ce6f8d6f
VZ
64#define TRACE_STRCONV _T("strconv")
65
467e0479
VZ
66// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
67// be 4 bytes
4948c2b6 68#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
69 #define WC_UTF16
70#endif
71
ef199164 72
373658eb
VZ
73// ============================================================================
74// implementation
75// ============================================================================
76
69373110
VZ
// Helper of wxMBConv::ToWChar() and cWC2MB(): returns true if at least one of
// the n bytes starting at p is NOT NUL, false if all of them are NUL (in
// particular, false when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    // consume NUL bytes for as long as there are any left to examine
    while ( n && *p++ == '\0' )
        n--;

    // anything left over means that we stopped at a non-NUL byte
    return n != 0;
}
85
373658eb 86// ----------------------------------------------------------------------------
467e0479 87// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 88// ----------------------------------------------------------------------------
6001e347 89
c91830cb 90static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 91{
ef199164 92 if (input <= 0xffff)
4def3b35 93 {
999836aa
VZ
94 if (output)
95 *output = (wxUint16) input;
ef199164 96
4def3b35 97 return 1;
dccce9ea 98 }
ef199164 99 else if (input >= 0x110000)
4def3b35 100 {
467e0479 101 return wxCONV_FAILED;
dccce9ea
VZ
102 }
103 else
4def3b35 104 {
dccce9ea 105 if (output)
4def3b35 106 {
ef199164
DS
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 109 }
ef199164 110
4def3b35 111 return 2;
1cd52418 112 }
1cd52418
OK
113}
114
c91830cb 115static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 116{
ef199164 117 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
118 {
119 output = *input;
120 return 1;
dccce9ea 121 }
ef199164 122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
123 {
124 output = *input;
467e0479 125 return wxCONV_FAILED;
dccce9ea
VZ
126 }
127 else
4def3b35
VS
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
1cd52418
OK
132}
133
467e0479 134#ifdef WC_UTF16
35d11700
VZ
135 typedef wchar_t wxDecodeSurrogate_t;
136#else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
139
140// returns the next UTF-32 character from the wchar_t buffer and advances the
141// pointer to the character after this one
142//
143// if an invalid character is found, *pSrc is set to NULL, the caller must
144// check for this
35d11700 145static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
146{
147 wxUint32 out;
8d3dd069
VZ
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156}
157
f6bcfd97 158// ----------------------------------------------------------------------------
6001e347 159// wxMBConv
f6bcfd97 160// ----------------------------------------------------------------------------
2c53a80a 161
483b0434
VZ
162size_t
163wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
6001e347 165{
483b0434
VZ
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
6001e347 172
483b0434
VZ
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
eec47cc6 175
c1464d9d 176 // the number of NULs terminating this string
a78c43f1 177 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 178
c1464d9d
VZ
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
483b0434
VZ
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
467e0479 185 if ( srcLen != wxNO_LEN )
eec47cc6 186 {
c1464d9d 187 // we need to know how to find the end of this string
7ef3ab50 188 nulLen = GetMBNulLen();
483b0434
VZ
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
e4e3bbb4 191
c1464d9d 192 // if there are enough NULs we can avoid the copy
483b0434 193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
194 {
195 // make a copy in order to properly NUL-terminate the string
483b0434 196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 197 char * const p = bufTmp.data();
483b0434
VZ
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 200 *s = '\0';
483b0434
VZ
201
202 src = bufTmp;
eec47cc6 203 }
e4e3bbb4 204
483b0434
VZ
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
e4e3bbb4 211
483b0434 212 for ( ;; )
eec47cc6 213 {
c1464d9d 214 // try to convert the current chunk
483b0434 215 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
e4e3bbb4 218
467e0479 219 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 220
483b0434 221 dstWritten += lenChunk;
f5fb6871 222
467e0479
VZ
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
483b0434
VZ
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
830f8f11 234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
c1464d9d 239
483b0434 240 if ( !srcEnd )
c1464d9d 241 {
467e0479
VZ
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
c1464d9d
VZ
244 break;
245 }
eec47cc6
VZ
246
247 // advance the input pointer past the end of this chunk
483b0434 248 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
483b0434 254 src += nulLen;
c1464d9d 255 }
e4e3bbb4 256
483b0434 257 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
483b0434 262 if ( src >= srcEnd )
c1464d9d
VZ
263 break;
264 }
265
483b0434 266 return dstWritten;
e4e3bbb4
RN
267}
268
483b0434
VZ
269size_t
270wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
e4e3bbb4 272{
483b0434
VZ
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
e4e3bbb4 275
eec47cc6
VZ
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
467e0479 282 if ( srcLen == wxNO_LEN )
e4e3bbb4 283 {
483b0434 284 srcLen = wxWcslen(src) + 1;
eec47cc6 285 }
483b0434 286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
287 {
288 // make a copy in order to properly NUL-terminate the string
483b0434 289 bufTmp = wxWCharBuffer(srcLen);
ef199164 290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
eec47cc6 318 }
e4e3bbb4 319
483b0434
VZ
320 return dstWritten;
321}
322
ef199164 323size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 324{
ef199164 325 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 326 if ( rc != wxCONV_FAILED )
509da451
VZ
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334}
335
ef199164 336size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 337{
ef199164 338 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 339 if ( rc != wxCONV_FAILED )
509da451
VZ
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345}
346
483b0434
VZ
347wxMBConv::~wxMBConv()
348{
349 // nothing to do here (necessary for Darwin linking probably)
350}
e4e3bbb4 351
483b0434
VZ
352const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353{
354 if ( psz )
eec47cc6 355 {
483b0434 356 // calculate the length of the buffer needed first
a2db25a1 357 const size_t nLen = ToWChar(NULL, 0, psz);
467e0479 358 if ( nLen != wxCONV_FAILED )
f5fb6871 359 {
483b0434 360 // now do the actual conversion
a2db25a1 361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
eec47cc6 362
483b0434 363 // +1 for the trailing NULL
a2db25a1 364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
483b0434 365 return buf;
f5fb6871 366 }
483b0434 367 }
e4e3bbb4 368
483b0434
VZ
369 return wxWCharBuffer();
370}
3698ae71 371
483b0434
VZ
372const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373{
374 if ( pwz )
375 {
a2db25a1 376 const size_t nLen = FromWChar(NULL, 0, pwz);
467e0479 377 if ( nLen != wxCONV_FAILED )
483b0434 378 {
a2db25a1
VZ
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
483b0434
VZ
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386}
e4e3bbb4 387
483b0434 388const wxWCharBuffer
ef199164 389wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 390{
ef199164 391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 392 if ( dstLen != wxCONV_FAILED )
483b0434 393 {
0dd13d21
VZ
394 // notice that we allocate space for dstLen+1 wide characters here
395 // because we want the buffer to always be NUL-terminated, even if the
396 // input isn't (as otherwise the caller has no way to know its length)
397 wxWCharBuffer wbuf(dstLen);
ef199164 398 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
399 {
400 if ( outLen )
467e0479
VZ
401 {
402 *outLen = dstLen;
403 if ( wbuf[dstLen - 1] == L'\0' )
404 (*outLen)--;
405 }
406
483b0434
VZ
407 return wbuf;
408 }
409 }
410
411 if ( outLen )
412 *outLen = 0;
413
414 return wxWCharBuffer();
415}
416
417const wxCharBuffer
ef199164 418wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 419{
13d92ad6 420 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 421 if ( dstLen != wxCONV_FAILED )
483b0434 422 {
0dd13d21
VZ
423 const size_t nulLen = GetMBNulLen();
424
425 // as above, ensure that the buffer is always NUL-terminated, even if
426 // the input is not
427 wxCharBuffer buf(dstLen + nulLen - 1);
428 memset(buf.data() + dstLen, 0, nulLen);
ef199164 429 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
430 {
431 if ( outLen )
467e0479
VZ
432 {
433 *outLen = dstLen;
434
13d92ad6
VZ
435 if ( dstLen >= nulLen &&
436 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
467e0479
VZ
437 {
438 // in this case the output is NUL-terminated and we're not
439 // supposed to count NUL
13d92ad6 440 *outLen -= nulLen;
467e0479
VZ
441 }
442 }
d32a507d 443
483b0434
VZ
444 return buf;
445 }
e4e3bbb4
RN
446 }
447
eec47cc6
VZ
448 if ( outLen )
449 *outLen = 0;
450
451 return wxCharBuffer();
e4e3bbb4
RN
452}
453
6001e347 454// ----------------------------------------------------------------------------
bde4baac 455// wxMBConvLibc
6001e347
RR
456// ----------------------------------------------------------------------------
457
bde4baac
VZ
458size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
459{
460 return wxMB2WC(buf, psz, n);
461}
462
463size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
464{
465 return wxWC2MB(buf, psz, n);
466}
e1bfe89e
RR
467
468// ----------------------------------------------------------------------------
532d575b 469// wxConvBrokenFileNames
e1bfe89e
RR
470// ----------------------------------------------------------------------------
471
eec47cc6
VZ
472#ifdef __UNIX__
473
86501081 474wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
ea8ce907 475{
86501081
VS
476 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
477 wxStricmp(charset, _T("UTF8")) == 0 )
5deedd6e 478 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
845905d5
MW
479 else
480 m_conv = new wxCSConv(charset);
ea8ce907
RR
481}
482
eec47cc6 483#endif // __UNIX__
c12b7f79 484
bde4baac 485// ----------------------------------------------------------------------------
3698ae71 486// UTF-7
bde4baac 487// ----------------------------------------------------------------------------
6001e347 488
15f2ee32 489// Implementation (C) 2004 Fredrik Roubert
6001e347 490
15f2ee32
RN
//
// BASE64 decoding table
//
// maps a byte to the 6 bit value it represents in BASE64 or to 0xff if the
// byte is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 00..07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 08..0F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 10..17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 18..1F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 20..27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 28..2F: '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // 30..37: '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 38..3F: '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 40..47: 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 48..4F
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 50..57
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 58..5F: 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 60..67: 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 68..6F
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 70..77
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 78..7F: 'x'..'z'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 80..87
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 88..8F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 90..97
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 98..9F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // A0..A7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // A8..AF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // B0..B7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // B8..BF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // C0..C7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // C8..CF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // D0..D7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // D8..DF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // E0..E7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // E8..EF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // F0..F7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // F8..FF
};
529
530size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
531{
15f2ee32
RN
532 size_t len = 0;
533
04a37834 534 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
535 {
536 unsigned char cc = *psz++;
537 if (cc != '+')
538 {
539 // plain ASCII char
540 if (buf)
541 *buf++ = cc;
542 len++;
543 }
544 else if (*psz == '-')
545 {
546 // encoded plus sign
547 if (buf)
548 *buf++ = cc;
549 len++;
550 psz++;
551 }
04a37834 552 else // start of BASE64 encoded string
15f2ee32 553 {
04a37834 554 bool lsb, ok;
15f2ee32 555 unsigned int d, l;
04a37834
VZ
556 for ( ok = lsb = false, d = 0, l = 0;
557 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
558 psz++ )
15f2ee32
RN
559 {
560 d <<= 6;
561 d += cc;
562 for (l += 6; l >= 8; lsb = !lsb)
563 {
04a37834 564 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
565 if (lsb)
566 {
567 if (buf)
568 *buf++ |= c;
569 len ++;
570 }
571 else
04a37834 572 {
15f2ee32 573 if (buf)
6356d52a 574 *buf = (wchar_t)(c << 8);
04a37834
VZ
575 }
576
577 ok = true;
15f2ee32
RN
578 }
579 }
04a37834
VZ
580
581 if ( !ok )
582 {
583 // in valid UTF7 we should have valid characters after '+'
467e0479 584 return wxCONV_FAILED;
04a37834
VZ
585 }
586
15f2ee32
RN
587 if (*psz == '-')
588 psz++;
589 }
590 }
04a37834
VZ
591
592 if ( buf && (len < n) )
593 *buf = '\0';
594
15f2ee32 595 return len;
6001e347
RR
596}
597
15f2ee32
RN
//
// BASE64 encoding table
//
// maps a 6 bit value to the character representing it in BASE64
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
612
//
// UTF-7 encoding table
//
// classifies each of the 128 ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 70..7F
};
632
667e5b3e 633size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 634{
15f2ee32
RN
635 size_t len = 0;
636
637 while (*psz && ((!buf) || (len < n)))
638 {
639 wchar_t cc = *psz++;
640 if (cc < 0x80 && utf7encode[cc] < 1)
641 {
642 // plain ASCII char
643 if (buf)
644 *buf++ = (char)cc;
ef199164 645
15f2ee32
RN
646 len++;
647 }
648#ifndef WC_UTF16
79c78d42 649 else if (((wxUint32)cc) > 0xffff)
b2c13097 650 {
15f2ee32 651 // no surrogate pair generation (yet?)
467e0479 652 return wxCONV_FAILED;
15f2ee32
RN
653 }
654#endif
655 else
656 {
657 if (buf)
658 *buf++ = '+';
ef199164 659
15f2ee32
RN
660 len++;
661 if (cc != '+')
662 {
663 // BASE64 encode string
664 unsigned int lsb, d, l;
73c902d6 665 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
666 {
667 for (lsb = 0; lsb < 2; lsb ++)
668 {
669 d <<= 8;
670 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
671
672 for (l += 8; l >= 6; )
673 {
674 l -= 6;
675 if (buf)
676 *buf++ = utf7enb64[(d >> l) % 64];
677 len++;
678 }
679 }
ef199164 680
15f2ee32
RN
681 cc = *psz;
682 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
683 break;
684 }
ef199164 685
15f2ee32
RN
686 if (l != 0)
687 {
688 if (buf)
689 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
ef199164 690
15f2ee32
RN
691 len++;
692 }
693 }
ef199164 694
15f2ee32
RN
695 if (buf)
696 *buf++ = '-';
697 len++;
698 }
699 }
ef199164 700
15f2ee32
RN
701 if (buf && (len < n))
702 *buf = 0;
ef199164 703
15f2ee32 704 return len;
6001e347
RR
705}
706
f6bcfd97 707// ----------------------------------------------------------------------------
6001e347 708// UTF-8
f6bcfd97 709// ----------------------------------------------------------------------------
6001e347 710
1774c3c5 711static const wxUint32 utf8_max[]=
4def3b35 712 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 713
3698ae71
VZ
714// boundaries of the private use area we use to (temporarily) remap invalid
715// characters invalid in a UTF-8 encoded string
ea8ce907
RR
716const wxUint32 wxUnicodePUA = 0x100000;
717const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
718
// this table gives the length of the UTF-8 encoding from its first character
// or 0 if the byte can't start a valid UTF-8 sequence:
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
753
754size_t
755wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
756 const char *src, size_t srcLen) const
757{
758 wchar_t *out = dstLen ? dst : NULL;
759 size_t written = 0;
760
761 if ( srcLen == wxNO_LEN )
762 srcLen = strlen(src) + 1;
763
764 for ( const char *p = src; ; p++ )
765 {
766 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
767 {
768 // all done successfully, just add the trailing NULL if we are not
769 // using explicit length
770 if ( srcLen == wxNO_LEN )
771 {
772 if ( out )
773 {
774 if ( !dstLen )
775 break;
776
777 *out = L'\0';
778 }
779
780 written++;
781 }
782
783 return written;
784 }
785
0286d08d
VZ
786 if ( out && !dstLen-- )
787 break;
788
5367a38a
VS
789 wxUint32 code;
790 unsigned char c = *p;
0286d08d 791
5367a38a
VS
792 if ( c < 0x80 )
793 {
794 if ( srcLen == 0 ) // the test works for wxNO_LEN too
795 break;
0286d08d 796
5367a38a
VS
797 if ( srcLen != wxNO_LEN )
798 srcLen--;
0286d08d 799
5367a38a
VS
800 code = c;
801 }
802 else
0286d08d 803 {
5367a38a
VS
804 unsigned len = tableUtf8Lengths[c];
805 if ( !len )
806 break;
807
808 if ( srcLen < len ) // the test works for wxNO_LEN too
809 break;
810
811 if ( srcLen != wxNO_LEN )
812 srcLen -= len;
813
814 // Char. number range | UTF-8 octet sequence
815 // (hexadecimal) | (binary)
816 // ----------------------+----------------------------------------
817 // 0000 0000 - 0000 007F | 0xxxxxxx
818 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
819 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
820 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
821 //
822 // Code point value is stored in bits marked with 'x',
823 // lowest-order bit of the value on the right side in the diagram
824 // above. (from RFC 3629)
825
826 // mask to extract lead byte's value ('x' bits above), by sequence
827 // length:
828 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
829
830 // mask and value of lead byte's most significant bits, by length:
831 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
832 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
833
834 len--; // it's more convenient to work with 0-based length here
835
836 // extract the lead byte's value bits:
837 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
838 break;
839
840 code = c & leadValueMask[len];
841
842 // all remaining bytes, if any, are handled in the same way
843 // regardless of sequence's length:
844 for ( ; len; --len )
845 {
846 c = *++p;
847 if ( (c & 0xC0) != 0x80 )
848 return wxCONV_FAILED;
0286d08d 849
5367a38a
VS
850 code <<= 6;
851 code |= c & 0x3F;
852 }
0286d08d
VZ
853 }
854
855#ifdef WC_UTF16
856 // cast is ok because wchar_t == wxUint16 if WC_UTF16
857 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
858 {
859 if ( out )
860 out++;
861 written++;
862 }
863#else // !WC_UTF16
864 if ( out )
865 *out = code;
866#endif // WC_UTF16/!WC_UTF16
867
868 if ( out )
869 out++;
870
871 written++;
872 }
873
874 return wxCONV_FAILED;
875}
876
877size_t
878wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
879 const wchar_t *src, size_t srcLen) const
880{
881 char *out = dstLen ? dst : NULL;
882 size_t written = 0;
883
884 for ( const wchar_t *wp = src; ; wp++ )
885 {
886 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
887 {
888 // all done successfully, just add the trailing NULL if we are not
889 // using explicit length
890 if ( srcLen == wxNO_LEN )
891 {
892 if ( out )
893 {
894 if ( !dstLen )
895 break;
896
897 *out = '\0';
898 }
899
900 written++;
901 }
902
903 return written;
904 }
905
906
907 wxUint32 code;
908#ifdef WC_UTF16
909 // cast is ok for WC_UTF16
910 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
911 {
912 // skip the next char too as we decoded a surrogate
913 wp++;
914 }
915#else // wchar_t is UTF-32
916 code = *wp & 0x7fffffff;
917#endif
918
919 unsigned len;
920 if ( code <= 0x7F )
921 {
922 len = 1;
923 if ( out )
924 {
925 if ( dstLen < len )
926 break;
927
928 out[0] = (char)code;
929 }
930 }
931 else if ( code <= 0x07FF )
932 {
933 len = 2;
934 if ( out )
935 {
936 if ( dstLen < len )
937 break;
938
939 // NB: this line takes 6 least significant bits, encodes them as
940 // 10xxxxxx and discards them so that the next byte can be encoded:
941 out[1] = 0x80 | (code & 0x3F); code >>= 6;
942 out[0] = 0xC0 | code;
943 }
944 }
945 else if ( code < 0xFFFF )
946 {
947 len = 3;
948 if ( out )
949 {
950 if ( dstLen < len )
951 break;
952
953 out[2] = 0x80 | (code & 0x3F); code >>= 6;
954 out[1] = 0x80 | (code & 0x3F); code >>= 6;
955 out[0] = 0xE0 | code;
956 }
957 }
958 else if ( code <= 0x10FFFF )
959 {
960 len = 4;
961 if ( out )
962 {
963 if ( dstLen < len )
964 break;
965
966 out[3] = 0x80 | (code & 0x3F); code >>= 6;
967 out[2] = 0x80 | (code & 0x3F); code >>= 6;
968 out[1] = 0x80 | (code & 0x3F); code >>= 6;
969 out[0] = 0xF0 | code;
970 }
971 }
972 else
973 {
974 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
975 break;
976 }
977
978 if ( out )
979 {
980 out += len;
981 dstLen -= len;
982 }
983
984 written += len;
985 }
986
987 // we only get here if an error occurs during decoding
988 return wxCONV_FAILED;
989}
990
d16d0917
VZ
991size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
992 const char *psz, size_t srcLen) const
6001e347 993{
0286d08d 994 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 995 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
0286d08d 996
4def3b35
VS
997 size_t len = 0;
998
d16d0917 999 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35 1000 {
ea8ce907
RR
1001 const char *opsz = psz;
1002 bool invalid = false;
4def3b35
VS
1003 unsigned char cc = *psz++, fc = cc;
1004 unsigned cnt;
dccce9ea 1005 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 1006 fc <<= 1;
ef199164 1007
dccce9ea 1008 if (!cnt)
4def3b35
VS
1009 {
1010 // plain ASCII char
dccce9ea 1011 if (buf)
4def3b35
VS
1012 *buf++ = cc;
1013 len++;
561488ef
MW
1014
1015 // escape the escape character for octal escapes
1016 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1017 && cc == '\\' && (!buf || len < n))
1018 {
1019 if (buf)
1020 *buf++ = cc;
1021 len++;
1022 }
dccce9ea
VZ
1023 }
1024 else
4def3b35
VS
1025 {
1026 cnt--;
dccce9ea 1027 if (!cnt)
4def3b35
VS
1028 {
1029 // invalid UTF-8 sequence
ea8ce907 1030 invalid = true;
dccce9ea
VZ
1031 }
1032 else
4def3b35
VS
1033 {
1034 unsigned ocnt = cnt - 1;
1035 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 1036 while (cnt--)
4def3b35 1037 {
ea8ce907 1038 cc = *psz;
dccce9ea 1039 if ((cc & 0xC0) != 0x80)
4def3b35
VS
1040 {
1041 // invalid UTF-8 sequence
ea8ce907
RR
1042 invalid = true;
1043 break;
4def3b35 1044 }
ef199164 1045
ea8ce907 1046 psz++;
4def3b35
VS
1047 res = (res << 6) | (cc & 0x3f);
1048 }
ef199164 1049
ea8ce907 1050 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
1051 {
1052 // illegal UTF-8 encoding
ea8ce907 1053 invalid = true;
4def3b35 1054 }
ea8ce907
RR
1055 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1056 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1057 {
1058 // if one of our PUA characters turns up externally
1059 // it must also be treated as an illegal sequence
1060 // (a bit like you have to escape an escape character)
1061 invalid = true;
1062 }
1063 else
1064 {
1cd52418 1065#ifdef WC_UTF16
0286d08d 1066 // cast is ok because wchar_t == wxUint16 if WC_UTF16
ea8ce907 1067 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 1068 if (pa == wxCONV_FAILED)
ea8ce907
RR
1069 {
1070 invalid = true;
1071 }
1072 else
1073 {
1074 if (buf)
1075 buf += pa;
1076 len += pa;
1077 }
373658eb 1078#else // !WC_UTF16
ea8ce907 1079 if (buf)
38d4b1e4 1080 *buf++ = (wchar_t)res;
ea8ce907 1081 len++;
373658eb 1082#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
1083 }
1084 }
ef199164 1085
ea8ce907
RR
1086 if (invalid)
1087 {
1088 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1089 {
1090 while (opsz < psz && (!buf || len < n))
1091 {
1092#ifdef WC_UTF16
1093 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1094 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 1095 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
1096 if (buf)
1097 buf += pa;
1098 opsz++;
1099 len += pa;
1100#else
1101 if (buf)
38d4b1e4 1102 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
1103 opsz++;
1104 len++;
1105#endif
1106 }
1107 }
3698ae71 1108 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
1109 {
1110 while (opsz < psz && (!buf || len < n))
1111 {
3698ae71
VZ
1112 if ( buf && len + 3 < n )
1113 {
17a1ebd1 1114 unsigned char on = *opsz;
3698ae71 1115 *buf++ = L'\\';
17a1ebd1
VZ
1116 *buf++ = (wchar_t)( L'0' + on / 0100 );
1117 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1118 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 1119 }
ef199164 1120
ea8ce907
RR
1121 opsz++;
1122 len += 4;
1123 }
1124 }
3698ae71 1125 else // MAP_INVALID_UTF8_NOT
ea8ce907 1126 {
467e0479 1127 return wxCONV_FAILED;
ea8ce907 1128 }
4def3b35
VS
1129 }
1130 }
6001e347 1131 }
ef199164 1132
d16d0917 1133 if (srcLen == wxNO_LEN && buf && (len < n))
4def3b35 1134 *buf = 0;
ef199164 1135
d16d0917 1136 return len + 1;
6001e347
RR
1137}
1138
3698ae71
VZ
// Tell whether the given wide character is an octal digit, i.e. lies in the
// range L'0'..L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1143
d16d0917
VZ
1144size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1145 const wchar_t *psz, size_t srcLen) const
6001e347 1146{
0286d08d 1147 if ( m_options == MAP_INVALID_UTF8_NOT )
d16d0917 1148 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
0286d08d 1149
4def3b35 1150 size_t len = 0;
6001e347 1151
d16d0917 1152 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
4def3b35
VS
1153 {
1154 wxUint32 cc;
ef199164 1155
1cd52418 1156#ifdef WC_UTF16
b5153fd8
VZ
1157 // cast is ok for WC_UTF16
1158 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 1159 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 1160#else
ef199164 1161 cc = (*psz++) & 0x7fffffff;
4def3b35 1162#endif
3698ae71
VZ
1163
1164 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1165 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 1166 {
dccce9ea 1167 if (buf)
ea8ce907 1168 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 1169 len++;
3698ae71 1170 }
561488ef
MW
1171 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1172 && cc == L'\\' && psz[0] == L'\\' )
1173 {
1174 if (buf)
1175 *buf++ = (char)cc;
1176 psz++;
1177 len++;
1178 }
3698ae71
VZ
1179 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1180 cc == L'\\' &&
1181 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 1182 {
dccce9ea 1183 if (buf)
3698ae71 1184 {
ef199164
DS
1185 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1186 (psz[1] - L'0') * 010 +
b2c13097 1187 (psz[2] - L'0'));
3698ae71
VZ
1188 }
1189
1190 psz += 3;
ea8ce907
RR
1191 len++;
1192 }
1193 else
1194 {
1195 unsigned cnt;
ef199164
DS
1196 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1197 {
1198 }
1199
ea8ce907 1200 if (!cnt)
4def3b35 1201 {
ea8ce907
RR
1202 // plain ASCII char
1203 if (buf)
1204 *buf++ = (char) cc;
1205 len++;
1206 }
ea8ce907
RR
1207 else
1208 {
1209 len += cnt + 1;
1210 if (buf)
1211 {
1212 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1213 while (cnt--)
1214 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1215 }
4def3b35
VS
1216 }
1217 }
6001e347 1218 }
4def3b35 1219
d16d0917 1220 if (srcLen == wxNO_LEN && buf && (len < n))
3698ae71 1221 *buf = 0;
adb45366 1222
d16d0917 1223 return len + 1;
6001e347
RR
1224}
1225
467e0479 1226// ============================================================================
c91830cb 1227// UTF-16
467e0479 1228// ============================================================================
c91830cb
VZ
1229
// The UTF-16 converters below are written in terms of two helper names:
// "straight" is the class whose byte order is the one selected by
// WORDS_BIGENDIAN (BE if defined, LE otherwise) and "swap" is the other one.
1230#ifdef WORDS_BIGENDIAN
bde4baac
VZ
1231 #define wxMBConvUTF16straight wxMBConvUTF16BE
1232 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 1233#else
bde4baac
VZ
1234 #define wxMBConvUTF16swap wxMBConvUTF16BE
1235 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
1236#endif
1237
467e0479
VZ
1238/* static */
1239size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1240{
1241 if ( srcLen == wxNO_LEN )
1242 {
1243 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1244 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1245 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1246 ;
c91830cb 1247
467e0479
VZ
1248 srcLen *= BYTES_PER_CHAR;
1249 }
1250 else // we already have the length
1251 {
1252 // we can only convert an entire number of UTF-16 characters
1253 if ( srcLen % BYTES_PER_CHAR )
1254 return wxCONV_FAILED;
1255 }
1256
1257 return srcLen;
1258}
1259
1260// case when in-memory representation is UTF-16 too
c91830cb
VZ
1261#ifdef WC_UTF16
1262
467e0479
VZ
1263// ----------------------------------------------------------------------------
1264// conversions without endianness change
1265// ----------------------------------------------------------------------------
1266
1267size_t
1268wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1269 const char *src, size_t srcLen) const
c91830cb 1270{
467e0479
VZ
1271 // set up the scene for using memcpy() (which is presumably more efficient
1272 // than copying the bytes one by one)
1273 srcLen = GetLength(src, srcLen);
1274 if ( srcLen == wxNO_LEN )
1275 return wxCONV_FAILED;
c91830cb 1276
ef199164 1277 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1278 if ( dst )
c91830cb 1279 {
467e0479
VZ
1280 if ( dstLen < inLen )
1281 return wxCONV_FAILED;
c91830cb 1282
467e0479 1283 memcpy(dst, src, srcLen);
c91830cb 1284 }
d32a507d 1285
467e0479 1286 return inLen;
c91830cb
VZ
1287}
1288
467e0479
VZ
1289size_t
1290wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1291 const wchar_t *src, size_t srcLen) const
c91830cb 1292{
467e0479
VZ
1293 if ( srcLen == wxNO_LEN )
1294 srcLen = wxWcslen(src) + 1;
c91830cb 1295
467e0479
VZ
1296 srcLen *= BYTES_PER_CHAR;
1297
1298 if ( dst )
c91830cb 1299 {
467e0479
VZ
1300 if ( dstLen < srcLen )
1301 return wxCONV_FAILED;
d32a507d 1302
467e0479 1303 memcpy(dst, src, srcLen);
c91830cb 1304 }
d32a507d 1305
467e0479 1306 return srcLen;
c91830cb
VZ
1307}
1308
467e0479
VZ
1309// ----------------------------------------------------------------------------
1310// endian-reversing conversions
1311// ----------------------------------------------------------------------------
c91830cb 1312
467e0479
VZ
1313size_t
1314wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1315 const char *src, size_t srcLen) const
c91830cb 1316{
467e0479
VZ
1317 srcLen = GetLength(src, srcLen);
1318 if ( srcLen == wxNO_LEN )
1319 return wxCONV_FAILED;
c91830cb 1320
467e0479
VZ
1321 srcLen /= BYTES_PER_CHAR;
1322
1323 if ( dst )
c91830cb 1324 {
467e0479
VZ
1325 if ( dstLen < srcLen )
1326 return wxCONV_FAILED;
1327
ef199164
DS
1328 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1329 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1330 {
ef199164 1331 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1332 }
c91830cb 1333 }
bfab25d4 1334
467e0479 1335 return srcLen;
c91830cb
VZ
1336}
1337
467e0479
VZ
1338size_t
1339wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1340 const wchar_t *src, size_t srcLen) const
c91830cb 1341{
467e0479
VZ
1342 if ( srcLen == wxNO_LEN )
1343 srcLen = wxWcslen(src) + 1;
c91830cb 1344
467e0479
VZ
1345 srcLen *= BYTES_PER_CHAR;
1346
1347 if ( dst )
c91830cb 1348 {
467e0479
VZ
1349 if ( dstLen < srcLen )
1350 return wxCONV_FAILED;
1351
ef199164 1352 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1353 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1354 {
ef199164 1355 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1356 }
c91830cb 1357 }
eec47cc6 1358
467e0479 1359 return srcLen;
c91830cb
VZ
1360}
1361
467e0479 1362#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1363
467e0479
VZ
1364// ----------------------------------------------------------------------------
1365// conversions without endianness change
1366// ----------------------------------------------------------------------------
c91830cb 1367
35d11700
VZ
1368size_t
1369wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1370 const char *src, size_t srcLen) const
c91830cb 1371{
35d11700
VZ
1372 srcLen = GetLength(src, srcLen);
1373 if ( srcLen == wxNO_LEN )
1374 return wxCONV_FAILED;
c91830cb 1375
ef199164 1376 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1377 if ( !dst )
c91830cb 1378 {
35d11700
VZ
1379 // optimization: return maximal space which could be needed for this
1380 // string even if the real size could be smaller if the buffer contains
1381 // any surrogates
1382 return inLen;
c91830cb 1383 }
c91830cb 1384
35d11700 1385 size_t outLen = 0;
ef199164
DS
1386 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1387 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1388 {
ef199164
DS
1389 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1390 if ( !inBuff )
35d11700
VZ
1391 return wxCONV_FAILED;
1392
1393 if ( ++outLen > dstLen )
1394 return wxCONV_FAILED;
c91830cb 1395
35d11700
VZ
1396 *dst++ = ch;
1397 }
1398
1399
1400 return outLen;
1401}
c91830cb 1402
35d11700
VZ
1403size_t
1404wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1405 const wchar_t *src, size_t srcLen) const
c91830cb 1406{
35d11700
VZ
1407 if ( srcLen == wxNO_LEN )
1408 srcLen = wxWcslen(src) + 1;
c91830cb 1409
35d11700 1410 size_t outLen = 0;
ef199164 1411 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1412 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1413 {
1414 wxUint16 cc[2];
35d11700
VZ
1415 const size_t numChars = encode_utf16(*src++, cc);
1416 if ( numChars == wxCONV_FAILED )
1417 return wxCONV_FAILED;
c91830cb 1418
ef199164
DS
1419 outLen += numChars * BYTES_PER_CHAR;
1420 if ( outBuff )
c91830cb 1421 {
35d11700
VZ
1422 if ( outLen > dstLen )
1423 return wxCONV_FAILED;
1424
ef199164 1425 *outBuff++ = cc[0];
35d11700 1426 if ( numChars == 2 )
69b80d28 1427 {
35d11700 1428 // second character of a surrogate
ef199164 1429 *outBuff++ = cc[1];
69b80d28 1430 }
c91830cb 1431 }
c91830cb 1432 }
c91830cb 1433
35d11700 1434 return outLen;
c91830cb
VZ
1435}
1436
467e0479
VZ
1437// ----------------------------------------------------------------------------
1438// endian-reversing conversions
1439// ----------------------------------------------------------------------------
c91830cb 1440
35d11700
VZ
1441size_t
1442wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1443 const char *src, size_t srcLen) const
c91830cb 1444{
35d11700
VZ
1445 srcLen = GetLength(src, srcLen);
1446 if ( srcLen == wxNO_LEN )
1447 return wxCONV_FAILED;
1448
ef199164 1449 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1450 if ( !dst )
1451 {
1452 // optimization: return maximal space which could be needed for this
1453 // string even if the real size could be smaller if the buffer contains
1454 // any surrogates
1455 return inLen;
1456 }
c91830cb 1457
35d11700 1458 size_t outLen = 0;
ef199164
DS
1459 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1460 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1461 {
35d11700
VZ
1462 wxUint32 ch;
1463 wxUint16 tmp[2];
ef199164
DS
1464
1465 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1466 inBuff++;
1467 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1468
35d11700
VZ
1469 const size_t numChars = decode_utf16(tmp, ch);
1470 if ( numChars == wxCONV_FAILED )
1471 return wxCONV_FAILED;
c91830cb 1472
35d11700 1473 if ( numChars == 2 )
ef199164 1474 inBuff++;
35d11700
VZ
1475
1476 if ( ++outLen > dstLen )
1477 return wxCONV_FAILED;
c91830cb 1478
35d11700 1479 *dst++ = ch;
c91830cb 1480 }
c91830cb 1481
c91830cb 1482
35d11700
VZ
1483 return outLen;
1484}
c91830cb 1485
35d11700
VZ
1486size_t
1487wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1488 const wchar_t *src, size_t srcLen) const
c91830cb 1489{
35d11700
VZ
1490 if ( srcLen == wxNO_LEN )
1491 srcLen = wxWcslen(src) + 1;
c91830cb 1492
35d11700 1493 size_t outLen = 0;
ef199164 1494 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1495 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1496 {
1497 wxUint16 cc[2];
35d11700
VZ
1498 const size_t numChars = encode_utf16(*src, cc);
1499 if ( numChars == wxCONV_FAILED )
1500 return wxCONV_FAILED;
c91830cb 1501
ef199164
DS
1502 outLen += numChars * BYTES_PER_CHAR;
1503 if ( outBuff )
c91830cb 1504 {
35d11700
VZ
1505 if ( outLen > dstLen )
1506 return wxCONV_FAILED;
1507
ef199164 1508 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1509 if ( numChars == 2 )
c91830cb 1510 {
35d11700 1511 // second character of a surrogate
ef199164 1512 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1513 }
1514 }
c91830cb 1515 }
c91830cb 1516
35d11700 1517 return outLen;
c91830cb
VZ
1518}
1519
467e0479 1520#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1521
1522
35d11700 1523// ============================================================================
c91830cb 1524// UTF-32
35d11700 1525// ============================================================================
c91830cb
VZ
1526
// The UTF-32 converters below are written in terms of two helper names:
// "straight" is the class whose byte order is the one selected by
// WORDS_BIGENDIAN (BE if defined, LE otherwise) and "swap" is the other one.
1527#ifdef WORDS_BIGENDIAN
467e0479
VZ
1528 #define wxMBConvUTF32straight wxMBConvUTF32BE
1529 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1530#else
467e0479
VZ
1531 #define wxMBConvUTF32swap wxMBConvUTF32BE
1532 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1533#endif
1534
1535
// definitions of the global UTF-32 converter objects, one per byte order
1536WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1537WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1538
467e0479
VZ
1539/* static */
1540size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1541{
1542 if ( srcLen == wxNO_LEN )
1543 {
1544 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1545 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1546 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1547 ;
c91830cb 1548
467e0479
VZ
1549 srcLen *= BYTES_PER_CHAR;
1550 }
1551 else // we already have the length
1552 {
1553 // we can only convert an entire number of UTF-32 characters
1554 if ( srcLen % BYTES_PER_CHAR )
1555 return wxCONV_FAILED;
1556 }
1557
1558 return srcLen;
1559}
1560
1561// case when in-memory representation is UTF-16
c91830cb
VZ
1562#ifdef WC_UTF16
1563
467e0479
VZ
1564// ----------------------------------------------------------------------------
1565// conversions without endianness change
1566// ----------------------------------------------------------------------------
1567
1568size_t
1569wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1570 const char *src, size_t srcLen) const
c91830cb 1571{
467e0479
VZ
1572 srcLen = GetLength(src, srcLen);
1573 if ( srcLen == wxNO_LEN )
1574 return wxCONV_FAILED;
c91830cb 1575
ef199164
DS
1576 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1577 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1578 size_t outLen = 0;
1579 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1580 {
1581 wxUint16 cc[2];
ef199164 1582 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1583 if ( numChars == wxCONV_FAILED )
1584 return wxCONV_FAILED;
c91830cb 1585
467e0479
VZ
1586 outLen += numChars;
1587 if ( dst )
c91830cb 1588 {
467e0479
VZ
1589 if ( outLen > dstLen )
1590 return wxCONV_FAILED;
d32a507d 1591
467e0479
VZ
1592 *dst++ = cc[0];
1593 if ( numChars == 2 )
1594 {
1595 // second character of a surrogate
1596 *dst++ = cc[1];
1597 }
1598 }
c91830cb 1599 }
d32a507d 1600
467e0479 1601 return outLen;
c91830cb
VZ
1602}
1603
467e0479
VZ
1604size_t
1605wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1606 const wchar_t *src, size_t srcLen) const
c91830cb 1607{
467e0479
VZ
1608 if ( srcLen == wxNO_LEN )
1609 srcLen = wxWcslen(src) + 1;
c91830cb 1610
467e0479 1611 if ( !dst )
c91830cb 1612 {
467e0479
VZ
1613 // optimization: return maximal space which could be needed for this
1614 // string instead of the exact amount which could be less if there are
1615 // any surrogates in the input
1616 //
1617 // we consider that surrogates are rare enough to make it worthwhile to
1618 // avoid running the loop below at the cost of slightly extra memory
1619 // consumption
ef199164 1620 return srcLen * BYTES_PER_CHAR;
467e0479 1621 }
c91830cb 1622
ef199164 1623 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1624 size_t outLen = 0;
1625 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1626 {
1627 const wxUint32 ch = wxDecodeSurrogate(&src);
1628 if ( !src )
1629 return wxCONV_FAILED;
c91830cb 1630
467e0479 1631 outLen += BYTES_PER_CHAR;
d32a507d 1632
467e0479
VZ
1633 if ( outLen > dstLen )
1634 return wxCONV_FAILED;
b5153fd8 1635
ef199164 1636 *outBuff++ = ch;
467e0479 1637 }
c91830cb 1638
467e0479 1639 return outLen;
c91830cb
VZ
1640}
1641
467e0479
VZ
1642// ----------------------------------------------------------------------------
1643// endian-reversing conversions
1644// ----------------------------------------------------------------------------
c91830cb 1645
467e0479
VZ
1646size_t
1647wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1648 const char *src, size_t srcLen) const
c91830cb 1649{
467e0479
VZ
1650 srcLen = GetLength(src, srcLen);
1651 if ( srcLen == wxNO_LEN )
1652 return wxCONV_FAILED;
c91830cb 1653
ef199164
DS
1654 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1655 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1656 size_t outLen = 0;
ef199164 1657 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1658 {
c91830cb 1659 wxUint16 cc[2];
ef199164 1660 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1661 if ( numChars == wxCONV_FAILED )
1662 return wxCONV_FAILED;
c91830cb 1663
467e0479
VZ
1664 outLen += numChars;
1665 if ( dst )
c91830cb 1666 {
467e0479
VZ
1667 if ( outLen > dstLen )
1668 return wxCONV_FAILED;
d32a507d 1669
467e0479
VZ
1670 *dst++ = cc[0];
1671 if ( numChars == 2 )
1672 {
1673 // second character of a surrogate
1674 *dst++ = cc[1];
1675 }
1676 }
c91830cb 1677 }
b5153fd8 1678
467e0479 1679 return outLen;
c91830cb
VZ
1680}
1681
467e0479
VZ
1682size_t
1683wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1684 const wchar_t *src, size_t srcLen) const
c91830cb 1685{
467e0479
VZ
1686 if ( srcLen == wxNO_LEN )
1687 srcLen = wxWcslen(src) + 1;
c91830cb 1688
467e0479 1689 if ( !dst )
c91830cb 1690 {
467e0479
VZ
1691 // optimization: return maximal space which could be needed for this
1692 // string instead of the exact amount which could be less if there are
1693 // any surrogates in the input
1694 //
1695 // we consider that surrogates are rare enough to make it worthwhile to
1696 // avoid running the loop below at the cost of slightly extra memory
1697 // consumption
1698 return srcLen*BYTES_PER_CHAR;
1699 }
c91830cb 1700
ef199164 1701 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1702 size_t outLen = 0;
1703 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1704 {
1705 const wxUint32 ch = wxDecodeSurrogate(&src);
1706 if ( !src )
1707 return wxCONV_FAILED;
c91830cb 1708
467e0479 1709 outLen += BYTES_PER_CHAR;
d32a507d 1710
467e0479
VZ
1711 if ( outLen > dstLen )
1712 return wxCONV_FAILED;
b5153fd8 1713
ef199164 1714 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1715 }
c91830cb 1716
467e0479 1717 return outLen;
c91830cb
VZ
1718}
1719
467e0479 1720#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1721
35d11700
VZ
1722// ----------------------------------------------------------------------------
1723// conversions without endianness change
1724// ----------------------------------------------------------------------------
1725
1726size_t
1727wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1728 const char *src, size_t srcLen) const
c91830cb 1729{
35d11700
VZ
1730 // use memcpy() as it should be much faster than hand-written loop
1731 srcLen = GetLength(src, srcLen);
1732 if ( srcLen == wxNO_LEN )
1733 return wxCONV_FAILED;
c91830cb 1734
35d11700
VZ
1735 const size_t inLen = srcLen/BYTES_PER_CHAR;
1736 if ( dst )
c91830cb 1737 {
35d11700
VZ
1738 if ( dstLen < inLen )
1739 return wxCONV_FAILED;
b5153fd8 1740
35d11700
VZ
1741 memcpy(dst, src, srcLen);
1742 }
c91830cb 1743
35d11700 1744 return inLen;
c91830cb
VZ
1745}
1746
35d11700
VZ
1747size_t
1748wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1749 const wchar_t *src, size_t srcLen) const
c91830cb 1750{
35d11700
VZ
1751 if ( srcLen == wxNO_LEN )
1752 srcLen = wxWcslen(src) + 1;
1753
1754 srcLen *= BYTES_PER_CHAR;
c91830cb 1755
35d11700 1756 if ( dst )
c91830cb 1757 {
35d11700
VZ
1758 if ( dstLen < srcLen )
1759 return wxCONV_FAILED;
c91830cb 1760
35d11700 1761 memcpy(dst, src, srcLen);
c91830cb
VZ
1762 }
1763
35d11700 1764 return srcLen;
c91830cb
VZ
1765}
1766
35d11700
VZ
1767// ----------------------------------------------------------------------------
1768// endian-reversing conversions
1769// ----------------------------------------------------------------------------
c91830cb 1770
35d11700
VZ
1771size_t
1772wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1773 const char *src, size_t srcLen) const
c91830cb 1774{
35d11700
VZ
1775 srcLen = GetLength(src, srcLen);
1776 if ( srcLen == wxNO_LEN )
1777 return wxCONV_FAILED;
1778
1779 srcLen /= BYTES_PER_CHAR;
c91830cb 1780
35d11700 1781 if ( dst )
c91830cb 1782 {
35d11700
VZ
1783 if ( dstLen < srcLen )
1784 return wxCONV_FAILED;
1785
ef199164
DS
1786 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1787 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1788 {
ef199164 1789 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1790 }
c91830cb 1791 }
b5153fd8 1792
35d11700 1793 return srcLen;
c91830cb
VZ
1794}
1795
35d11700
VZ
1796size_t
1797wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1798 const wchar_t *src, size_t srcLen) const
c91830cb 1799{
35d11700
VZ
1800 if ( srcLen == wxNO_LEN )
1801 srcLen = wxWcslen(src) + 1;
1802
1803 srcLen *= BYTES_PER_CHAR;
c91830cb 1804
35d11700 1805 if ( dst )
c91830cb 1806 {
35d11700
VZ
1807 if ( dstLen < srcLen )
1808 return wxCONV_FAILED;
1809
ef199164 1810 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1811 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1812 {
ef199164 1813 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1814 }
c91830cb 1815 }
b5153fd8 1816
35d11700 1817 return srcLen;
c91830cb
VZ
1818}
1819
467e0479 1820#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1821
1822
36acb880
VZ
1823// ============================================================================
1824// The classes doing conversion using the iconv_xxx() functions
1825// ============================================================================
3caec1bb 1826
b040e242 1827#ifdef HAVE_ICONV
3a0d76bc 1828
b1d547eb
VS
1829// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1830// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1831// (unless there's yet another bug in glibc) the only case when iconv()
1832// returns with (size_t)-1 (which means error) and says there are 0 bytes
1833// left in the input buffer -- when _real_ error occurs,
1834// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1835// iconv() failure.
3caec1bb
VS
1836// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() return value cres denotes
// a failure; for glibc 2.1 and older an E2BIG error with no input left
// (bufLeft == 0) is not counted as one.
//
// NOTE(review): the macro parameters are not parenthesized in the expansion,
// so only simple expressions should be passed as arguments.
1837#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1838#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1839 (errno != E2BIG || bufLeft != 0))
1840#else
1841#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1842#endif
1843
// cast the address of the input pointer to the type used for the second
// argument of iconv(), which may or may not be const (ICONV_CONST)
ab217dba 1844#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1845
74a7eb0b
VZ
// the value the iconv_t handles are compared with to detect that
// iconv_open() didn't succeed
1846#define ICONV_T_INVALID ((iconv_t)-1)
1847
// WC_BSWAP is the byte swapping macro matching the size of wchar_t and WC_ENC
// is the wxFontEncoding constant of the same width
1848#if SIZEOF_WCHAR_T == 4
1849 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1850 #define WC_ENC wxFONTENCODING_UTF32
1851#elif SIZEOF_WCHAR_T == 2
1852 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1853 #define WC_ENC wxFONTENCODING_UTF16
1854#else // sizeof(wchar_t) != 2 nor 4
1855 // does this ever happen?
1856 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1857#endif
1858
36acb880 1859// ----------------------------------------------------------------------------
e95354ec 1860// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1861// ----------------------------------------------------------------------------
1862
e95354ec 1863class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1864{
1865public:
86501081 1866 wxMBConv_iconv(const char *name);
e95354ec 1867 virtual ~wxMBConv_iconv();
36acb880 1868
bde4baac
VZ
1869 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1870 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1871
d36c9347 1872 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
7ef3ab50
VZ
1873 virtual size_t GetMBNulLen() const;
1874
ba98e032
VS
1875#if wxUSE_UNICODE_UTF8
1876 virtual bool IsUTF8() const;
1877#endif
1878
d36c9347
VZ
1879 virtual wxMBConv *Clone() const
1880 {
86501081 1881 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
d36c9347
VZ
1882 p->m_minMBCharWidth = m_minMBCharWidth;
1883 return p;
1884 }
1885
e95354ec 1886 bool IsOk() const
74a7eb0b 1887 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1888
1889protected:
ef199164
DS
1890 // the iconv handlers used to translate from multibyte
1891 // to wide char and in the other direction
36acb880
VZ
1892 iconv_t m2w,
1893 w2m;
ef199164 1894
b1d547eb
VS
1895#if wxUSE_THREADS
1896 // guards access to m2w and w2m objects
1897 wxMutex m_iconvMutex;
1898#endif
36acb880
VZ
1899
1900private:
e95354ec 1901 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1902 // available on this machine, it will remain NULL
74a7eb0b 1903 static wxString ms_wcCharsetName;
36acb880
VZ
1904
1905 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1906 // different endian-ness than the native one
405d8f46 1907 static bool ms_wcNeedsSwap;
eec47cc6 1908
d36c9347
VZ
1909
1910 // name of the encoding handled by this conversion
1911 wxString m_name;
1912
7ef3ab50 1913 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
1914 // initially
1915 size_t m_minMBCharWidth;
36acb880
VZ
1916};
1917
8f115891 1918// make the constructor available for unit testing
86501081 1919WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
8f115891
MW
1920{
1921 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1922 if ( !result->IsOk() )
1923 {
1924 delete result;
1925 return 0;
1926 }
ef199164 1927
8f115891
MW
1928 return result;
1929}
1930
// definitions of the static members of wxMBConv_iconv: the charset name
// starts out empty and no byte swapping is assumed until the first
// wxMBConv_iconv constructor call has determined them
422e411e 1931wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1932bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1933
86501081 1934wxMBConv_iconv::wxMBConv_iconv(const char *name)
d36c9347 1935 : m_name(name)
36acb880 1936{
c1464d9d 1937 m_minMBCharWidth = 0;
eec47cc6 1938
36acb880 1939 // check for charset that represents wchar_t:
74a7eb0b 1940 if ( ms_wcCharsetName.empty() )
f1339c56 1941 {
c2b83fdd
VZ
1942 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1943
74a7eb0b
VZ
1944#if wxUSE_FONTMAP
1945 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1946#else // !wxUSE_FONTMAP
91cb7f52 1947 static const wxChar *names_static[] =
36acb880 1948 {
74a7eb0b
VZ
1949#if SIZEOF_WCHAR_T == 4
1950 _T("UCS-4"),
1951#elif SIZEOF_WCHAR_T = 2
1952 _T("UCS-2"),
1953#endif
1954 NULL
1955 };
91cb7f52 1956 const wxChar **names = names_static;
74a7eb0b 1957#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1958
d1f024a8 1959 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1960 {
17a1ebd1 1961 const wxString nameCS(*names);
74a7eb0b
VZ
1962
1963 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1964 wxString nameXE(nameCS);
ef199164
DS
1965
1966#ifdef WORDS_BIGENDIAN
74a7eb0b 1967 nameXE += _T("BE");
ef199164 1968#else // little endian
74a7eb0b 1969 nameXE += _T("LE");
ef199164 1970#endif
74a7eb0b 1971
c2b83fdd
VZ
1972 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1973 nameXE.c_str());
1974
86501081 1975 m2w = iconv_open(nameXE.ToAscii(), name);
74a7eb0b 1976 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1977 {
74a7eb0b 1978 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1979 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1980 nameCS.c_str());
86501081 1981 m2w = iconv_open(nameCS.ToAscii(), name);
3a0d76bc 1982
74a7eb0b
VZ
1983 // and check for bytesex ourselves:
1984 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1985 {
74a7eb0b
VZ
1986 char buf[2], *bufPtr;
1987 wchar_t wbuf[2], *wbufPtr;
1988 size_t insz, outsz;
1989 size_t res;
1990
1991 buf[0] = 'A';
1992 buf[1] = 0;
1993 wbuf[0] = 0;
1994 insz = 2;
1995 outsz = SIZEOF_WCHAR_T * 2;
1996 wbufPtr = wbuf;
1997 bufPtr = buf;
1998
ef199164
DS
1999 res = iconv(
2000 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2001 (char**)&wbufPtr, &outsz);
74a7eb0b
VZ
2002
2003 if (ICONV_FAILED(res, insz))
2004 {
2005 wxLogLastError(wxT("iconv"));
422e411e 2006 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 2007 nameCS.c_str());
74a7eb0b
VZ
2008 }
2009 else // ok, can convert to this encoding, remember it
2010 {
17a1ebd1 2011 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
2012 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2013 }
3a0d76bc
VS
2014 }
2015 }
74a7eb0b 2016 else // use charset not requiring byte swapping
36acb880 2017 {
74a7eb0b 2018 ms_wcCharsetName = nameXE;
36acb880 2019 }
3a0d76bc 2020 }
74a7eb0b 2021
0944fceb 2022 wxLogTrace(TRACE_STRCONV,
74a7eb0b 2023 wxT("iconv wchar_t charset is \"%s\"%s"),
999020e1
VZ
2024 ms_wcCharsetName.empty() ? wxString("<none>")
2025 : ms_wcCharsetName,
74a7eb0b
VZ
2026 ms_wcNeedsSwap ? _T(" (needs swap)")
2027 : _T(""));
3a0d76bc 2028 }
36acb880 2029 else // we already have ms_wcCharsetName
3caec1bb 2030 {
86501081 2031 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
f1339c56 2032 }
dccce9ea 2033
74a7eb0b 2034 if ( ms_wcCharsetName.empty() )
f1339c56 2035 {
74a7eb0b 2036 w2m = ICONV_T_INVALID;
36acb880 2037 }
405d8f46
VZ
2038 else
2039 {
86501081 2040 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
74a7eb0b
VZ
2041 if ( w2m == ICONV_T_INVALID )
2042 {
2043 wxLogTrace(TRACE_STRCONV,
2044 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
86501081 2045 ms_wcCharsetName.c_str(), name);
74a7eb0b 2046 }
405d8f46 2047 }
36acb880 2048}
3caec1bb 2049
e95354ec 2050wxMBConv_iconv::~wxMBConv_iconv()
36acb880 2051{
74a7eb0b 2052 if ( m2w != ICONV_T_INVALID )
36acb880 2053 iconv_close(m2w);
74a7eb0b 2054 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
2055 iconv_close(w2m);
2056}
3a0d76bc 2057
bde4baac 2058size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 2059{
69373110
VZ
2060 // find the string length: notice that must be done differently for
2061 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2062 size_t inbuf;
7ef3ab50 2063 const size_t nulLen = GetMBNulLen();
69373110
VZ
2064 switch ( nulLen )
2065 {
2066 default:
467e0479 2067 return wxCONV_FAILED;
69373110
VZ
2068
2069 case 1:
2070 inbuf = strlen(psz); // arguably more optimized than our version
2071 break;
2072
2073 case 2:
2074 case 4:
2075 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2076 // they also have to start at character boundary and not span two
2077 // adjacent characters
2078 const char *p;
2079 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2080 ;
2081 inbuf = p - psz;
2082 break;
2083 }
2084
b1d547eb 2085#if wxUSE_THREADS
6a17b868
SN
2086 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2087 // Unfortunately there are a couple of global wxCSConv objects such as
b1d547eb
VS
2088 // wxConvLocal that are used all over wx code, so we have to make sure
2089 // the handle is used by at most one thread at the time. Otherwise
2090 // only a few wx classes would be safe to use from non-main threads
2091 // as MB<->WC conversion would fail "randomly".
2092 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
2093#endif // wxUSE_THREADS
2094
36acb880
VZ
2095 size_t outbuf = n * SIZEOF_WCHAR_T;
2096 size_t res, cres;
2097 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2098 wchar_t *bufPtr = buf;
2099 const char *pszPtr = psz;
2100
2101 if (buf)
2102 {
2103 // have destination buffer, convert there
2104 cres = iconv(m2w,
2105 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2106 (char**)&bufPtr, &outbuf);
2107 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 2108
36acb880 2109 if (ms_wcNeedsSwap)
3a0d76bc 2110 {
36acb880 2111 // convert to native endianness
17a1ebd1
VZ
2112 for ( unsigned i = 0; i < res; i++ )
2113 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 2114 }
adb45366 2115
69373110 2116 // NUL-terminate the string if there is any space left
49dd9820
VS
2117 if (res < n)
2118 buf[res] = 0;
36acb880
VZ
2119 }
2120 else
2121 {
2122 // no destination buffer... convert using temp buffer
2123 // to calculate destination buffer requirement
2124 wchar_t tbuf[8];
2125 res = 0;
ef199164
DS
2126
2127 do
2128 {
36acb880 2129 bufPtr = tbuf;
ef199164 2130 outbuf = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
2131
2132 cres = iconv(m2w,
2133 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2134 (char**)&bufPtr, &outbuf );
2135
ef199164
DS
2136 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2137 }
2138 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2139 }
dccce9ea 2140
36acb880 2141 if (ICONV_FAILED(cres, inbuf))
f1339c56 2142 {
36acb880 2143 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 2144 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2145 return wxCONV_FAILED;
36acb880
VZ
2146 }
2147
2148 return res;
2149}
2150
bde4baac 2151size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 2152{
b1d547eb
VS
2153#if wxUSE_THREADS
2154 // NB: explained in MB2WC
2155 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2156#endif
3698ae71 2157
156162ec
MW
2158 size_t inlen = wxWcslen(psz);
2159 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
2160 size_t outbuf = n;
2161 size_t res, cres;
3a0d76bc 2162
36acb880 2163 wchar_t *tmpbuf = 0;
3caec1bb 2164
36acb880
VZ
2165 if (ms_wcNeedsSwap)
2166 {
2167 // need to copy to temp buffer to switch endianness
74a7eb0b 2168 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 2169 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 2170 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
2171 for ( size_t i = 0; i < inlen; i++ )
2172 tmpbuf[n] = WC_BSWAP(psz[i]);
ef199164 2173
156162ec 2174 tmpbuf[inlen] = L'\0';
74a7eb0b 2175 psz = tmpbuf;
36acb880 2176 }
3a0d76bc 2177
36acb880
VZ
2178 if (buf)
2179 {
2180 // have destination buffer, convert there
2181 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 2182
ef199164 2183 res = n - outbuf;
adb45366 2184
49dd9820
VS
2185 // NB: iconv was given only wcslen(psz) characters on input, and so
2186 // it couldn't convert the trailing zero. Let's do it ourselves
2187 // if there's some room left for it in the output buffer.
2188 if (res < n)
2189 buf[0] = 0;
36acb880
VZ
2190 }
2191 else
2192 {
ef199164 2193 // no destination buffer: convert using temp buffer
36acb880
VZ
2194 // to calculate destination buffer requirement
2195 char tbuf[16];
2196 res = 0;
ef199164
DS
2197 do
2198 {
2199 buf = tbuf;
2200 outbuf = 16;
36acb880
VZ
2201
2202 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 2203
36acb880 2204 res += 16 - outbuf;
ef199164
DS
2205 }
2206 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 2207 }
dccce9ea 2208
36acb880
VZ
2209 if (ms_wcNeedsSwap)
2210 {
2211 free(tmpbuf);
2212 }
dccce9ea 2213
36acb880
VZ
2214 if (ICONV_FAILED(cres, inbuf))
2215 {
ce6f8d6f 2216 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 2217 return wxCONV_FAILED;
36acb880
VZ
2218 }
2219
2220 return res;
2221}
2222
7ef3ab50 2223size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 2224{
c1464d9d 2225 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
2226 {
2227 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2228
2229#if wxUSE_THREADS
2230 // NB: explained in MB2WC
2231 wxMutexLocker lock(self->m_iconvMutex);
2232#endif
2233
999020e1 2234 const wchar_t *wnul = L"";
c1464d9d 2235 char buf[8]; // should be enough for NUL in any encoding
356410fc 2236 size_t inLen = sizeof(wchar_t),
c1464d9d 2237 outLen = WXSIZEOF(buf);
ef199164
DS
2238 char *inBuff = (char *)wnul;
2239 char *outBuff = buf;
2240 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 2241 {
c1464d9d 2242 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
2243 }
2244 else // ok
2245 {
ef199164 2246 self->m_minMBCharWidth = outBuff - buf;
356410fc 2247 }
eec47cc6
VZ
2248 }
2249
c1464d9d 2250 return m_minMBCharWidth;
eec47cc6
VZ
2251}
2252
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// Check whether the charset name of this converter is one of the two
// spellings of UTF-8 recognized here (compared case-insensitively).
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    if ( wxStricmp(m_name, "UTF8") == 0 )
        return true;

    return false;
}
#endif
2260
b040e242 2261#endif // HAVE_ICONV
36acb880 2262
e95354ec 2263
36acb880
VZ
2264// ============================================================================
2265// Win32 conversion classes
2266// ============================================================================
1cd52418 2267
e95354ec 2268#ifdef wxHAVE_WIN32_MB2WC
373658eb 2269
8b04d4c4 2270// from utils.cpp
d775fa82 2271#if wxUSE_FONTMAP
86501081 2272extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
8b04d4c4 2273extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 2274#endif
373658eb 2275
e95354ec 2276class wxMBConv_win32 : public wxMBConv
1cd52418
OK
2277{
2278public:
bde4baac
VZ
2279 wxMBConv_win32()
2280 {
2281 m_CodePage = CP_ACP;
c1464d9d 2282 m_minMBCharWidth = 0;
bde4baac
VZ
2283 }
2284
d36c9347 2285 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2286 : wxMBConv()
d36c9347
VZ
2287 {
2288 m_CodePage = conv.m_CodePage;
2289 m_minMBCharWidth = conv.m_minMBCharWidth;
2290 }
2291
7608a683 2292#if wxUSE_FONTMAP
86501081 2293 wxMBConv_win32(const char* name)
bde4baac
VZ
2294 {
2295 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2296 m_minMBCharWidth = 0;
bde4baac 2297 }
dccce9ea 2298
e95354ec 2299 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2300 {
2301 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2302 m_minMBCharWidth = 0;
bde4baac 2303 }
eec47cc6 2304#endif // wxUSE_FONTMAP
8b04d4c4 2305
d36c9347 2306 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2307 {
02272c9c
VZ
2308 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2309 // the behaviour is not compatible with the Unix version (using iconv)
2310 // and break the library itself, e.g. wxTextInputStream::NextChar()
2311 // wouldn't work if reading an incomplete MB char didn't result in an
2312 // error
667e5b3e 2313 //
89028980 2314 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2315 // Win XP or newer and it is not supported for UTF-[78] so we always
2316 // use our own conversions in this case. See
89028980
VS
2317 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2318 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2319 if ( m_CodePage == CP_UTF8 )
89028980 2320 {
5487ff0f 2321 return wxMBConvUTF8().MB2WC(buf, psz, n);
89028980 2322 }
830f8f11
VZ
2323
2324 if ( m_CodePage == CP_UTF7 )
2325 {
5487ff0f 2326 return wxMBConvUTF7().MB2WC(buf, psz, n);
830f8f11
VZ
2327 }
2328
2329 int flags = 0;
2330 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2331 IsAtLeastWin2kSP4() )
89028980 2332 {
830f8f11 2333 flags = MB_ERR_INVALID_CHARS;
89028980 2334 }
667e5b3e 2335
2b5f62a0
VZ
2336 const size_t len = ::MultiByteToWideChar
2337 (
2338 m_CodePage, // code page
667e5b3e 2339 flags, // flags: fall on error
2b5f62a0
VZ
2340 psz, // input string
2341 -1, // its length (NUL-terminated)
b4da152e 2342 buf, // output string
2b5f62a0
VZ
2343 buf ? n : 0 // size of output buffer
2344 );
89028980
VS
2345 if ( !len )
2346 {
2347 // function totally failed
467e0479 2348 return wxCONV_FAILED;
89028980
VS
2349 }
2350
2351 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2352 // check if we succeeded, by doing a double trip:
2353 if ( !flags && buf )
2354 {
53c174fc
VZ
2355 const size_t mbLen = strlen(psz);
2356 wxCharBuffer mbBuf(mbLen);
89028980
VS
2357 if ( ::WideCharToMultiByte
2358 (
2359 m_CodePage,
2360 0,
2361 buf,
2362 -1,
2363 mbBuf.data(),
53c174fc 2364 mbLen + 1, // size in bytes, not length
89028980
VS
2365 NULL,
2366 NULL
2367 ) == 0 ||
2368 strcmp(mbBuf, psz) != 0 )
2369 {
2370 // we didn't obtain the same thing we started from, hence
2371 // the conversion was lossy and we consider that it failed
467e0479 2372 return wxCONV_FAILED;
89028980
VS
2373 }
2374 }
2b5f62a0 2375
03a991bc
VZ
2376 // note that it returns count of written chars for buf != NULL and size
2377 // of the needed buffer for buf == NULL so in either case the length of
2378 // the string (which never includes the terminating NUL) is one less
89028980 2379 return len - 1;
f1339c56 2380 }
dccce9ea 2381
d36c9347 2382 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2383 {
13dd924a
VZ
2384 /*
2385 we have a problem here: by default, WideCharToMultiByte() may
2386 replace characters unrepresentable in the target code page with bad
2387 quality approximations such as turning "1/2" symbol (U+00BD) into
2388 "1" for the code pages which don't have it and we, obviously, want
2389 to avoid this at any price
d775fa82 2390
13dd924a
VZ
2391 the trouble is that this function does it _silently_, i.e. it won't
2392 even tell us whether it did or not... Win98/2000 and higher provide
2393 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2394 we have to resort to a round trip, i.e. check that converting back
2395 results in the same string -- this is, of course, expensive but
2396 otherwise we simply can't be sure to not garble the data.
2397 */
2398
2399 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2400 // it doesn't work with CJK encodings (which we test for rather roughly
2401 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2402 // supporting it
907173e5
WS
2403 BOOL usedDef wxDUMMY_INITIALIZE(false);
2404 BOOL *pUsedDef;
13dd924a
VZ
2405 int flags;
2406 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2407 {
2408 // it's our lucky day
2409 flags = WC_NO_BEST_FIT_CHARS;
2410 pUsedDef = &usedDef;
2411 }
2412 else // old system or unsupported encoding
2413 {
2414 flags = 0;
2415 pUsedDef = NULL;
2416 }
2417
2b5f62a0
VZ
2418 const size_t len = ::WideCharToMultiByte
2419 (
2420 m_CodePage, // code page
13dd924a
VZ
2421 flags, // either none or no best fit
2422 pwz, // input string
2b5f62a0
VZ
2423 -1, // it is (wide) NUL-terminated
2424 buf, // output buffer
2425 buf ? n : 0, // and its size
2426 NULL, // default "replacement" char
13dd924a 2427 pUsedDef // [out] was it used?
2b5f62a0
VZ
2428 );
2429
13dd924a
VZ
2430 if ( !len )
2431 {
2432 // function totally failed
467e0479 2433 return wxCONV_FAILED;
13dd924a
VZ
2434 }
2435
765bdb4a
VZ
2436 // we did something, check if we really succeeded
2437 if ( flags )
13dd924a 2438 {
765bdb4a
VZ
2439 // check if the conversion failed, i.e. if any replacements
2440 // were done
2441 if ( usedDef )
2442 return wxCONV_FAILED;
2443 }
2444 else // we must resort to double tripping...
2445 {
2446 // first we need to ensure that we really have the MB data: this is
2447 // not the case if we're called with NULL buffer, in which case we
2448 // need to do the conversion yet again
2449 wxCharBuffer bufDef;
2450 if ( !buf )
13dd924a 2451 {
765bdb4a
VZ
2452 bufDef = wxCharBuffer(len);
2453 buf = bufDef.data();
2454 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2455 buf, len, NULL, NULL) )
467e0479 2456 return wxCONV_FAILED;
13dd924a 2457 }
765bdb4a
VZ
2458
2459 wxWCharBuffer wcBuf(n);
2460 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2461 wcscmp(wcBuf, pwz) != 0 )
13dd924a 2462 {
765bdb4a
VZ
2463 // we didn't obtain the same thing we started from, hence
2464 // the conversion was lossy and we consider that it failed
2465 return wxCONV_FAILED;
13dd924a
VZ
2466 }
2467 }
2468
03a991bc 2469 // see the comment above for the reason of "len - 1"
13dd924a 2470 return len - 1;
f1339c56 2471 }
dccce9ea 2472
7ef3ab50
VZ
2473 virtual size_t GetMBNulLen() const
2474 {
2475 if ( m_minMBCharWidth == 0 )
2476 {
2477 int len = ::WideCharToMultiByte
2478 (
2479 m_CodePage, // code page
2480 0, // no flags
2481 L"", // input string
2482 1, // translate just the NUL
2483 NULL, // output buffer
2484 0, // and its size
2485 NULL, // no replacement char
2486 NULL // [out] don't care if it was used
2487 );
2488
2489 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2490 switch ( len )
2491 {
2492 default:
2493 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2494 self->m_minMBCharWidth = (size_t)-1;
2495 break;
7ef3ab50
VZ
2496
2497 case 0:
2498 self->m_minMBCharWidth = (size_t)-1;
2499 break;
2500
2501 case 1:
2502 case 2:
2503 case 4:
2504 self->m_minMBCharWidth = len;
2505 break;
2506 }
2507 }
2508
2509 return m_minMBCharWidth;
2510 }
2511
d36c9347
VZ
2512 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2513
13dd924a
VZ
2514 bool IsOk() const { return m_CodePage != -1; }
2515
2516private:
2517 static bool CanUseNoBestFit()
2518 {
2519 static int s_isWin98Or2k = -1;
2520
2521 if ( s_isWin98Or2k == -1 )
2522 {
2523 int verMaj, verMin;
2524 switch ( wxGetOsVersion(&verMaj, &verMin) )
2525 {
406d283a 2526 case wxOS_WINDOWS_9X:
13dd924a
VZ
2527 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2528 break;
2529
406d283a 2530 case wxOS_WINDOWS_NT:
13dd924a
VZ
2531 s_isWin98Or2k = verMaj >= 5;
2532 break;
2533
2534 default:
ef199164 2535 // unknown: be conservative by default
13dd924a 2536 s_isWin98Or2k = 0;
ef199164 2537 break;
13dd924a
VZ
2538 }
2539
2540 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2541 }
2542
2543 return s_isWin98Or2k == 1;
2544 }
f1339c56 2545
89028980
VS
2546 static bool IsAtLeastWin2kSP4()
2547 {
8942f83a
WS
2548#ifdef __WXWINCE__
2549 return false;
2550#else
89028980
VS
2551 static int s_isAtLeastWin2kSP4 = -1;
2552
2553 if ( s_isAtLeastWin2kSP4 == -1 )
2554 {
2555 OSVERSIONINFOEX ver;
2556
2557 memset(&ver, 0, sizeof(ver));
2558 ver.dwOSVersionInfoSize = sizeof(ver);
2559 GetVersionEx((OSVERSIONINFO*)&ver);
2560
2561 s_isAtLeastWin2kSP4 =
2562 ((ver.dwMajorVersion > 5) || // Vista+
2563 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2564 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2565 ver.wServicePackMajor >= 4)) // 2000 SP4+
2566 ? 1 : 0;
2567 }
2568
2569 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2570#endif
89028980
VS
2571 }
2572
eec47cc6 2573
c1464d9d 2574 // the code page we're working with
b1d66b54 2575 long m_CodePage;
c1464d9d 2576
7ef3ab50 2577 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2578 // "unknown"
2579 size_t m_minMBCharWidth;
1cd52418 2580};
e95354ec
VZ
2581
2582#endif // wxHAVE_WIN32_MB2WC
2583
f7e98dee 2584
36acb880
VZ
2585// ============================================================================
2586// wxEncodingConverter based conversion classes
2587// ============================================================================
2588
1e6feb95 2589#if wxUSE_FONTMAP
1cd52418 2590
e95354ec 2591class wxMBConv_wxwin : public wxMBConv
1cd52418 2592{
8b04d4c4
VZ
2593private:
2594 void Init()
2595 {
6ac84a78
DE
2596 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2597 // The wxMBConv_cf class does a better job.
2598 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2599 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
8b04d4c4
VZ
2600 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2601 }
2602
6001e347 2603public:
f1339c56
RR
2604 // temporarily just use wxEncodingConverter stuff,
2605 // so that it works while a better implementation is built
86501081 2606 wxMBConv_wxwin(const char* name)
f1339c56
RR
2607 {
2608 if (name)
267e11c5 2609 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2610 else
2611 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2612
8b04d4c4
VZ
2613 Init();
2614 }
2615
e95354ec 2616 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2617 {
2618 m_enc = enc;
2619
2620 Init();
f1339c56 2621 }
dccce9ea 2622
bde4baac 2623 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2624 {
2625 size_t inbuf = strlen(psz);
dccce9ea 2626 if (buf)
c643a977 2627 {
ef199164 2628 if (!m2w.Convert(psz, buf))
467e0479 2629 return wxCONV_FAILED;
c643a977 2630 }
f1339c56
RR
2631 return inbuf;
2632 }
dccce9ea 2633
bde4baac 2634 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2635 {
f8d791e0 2636 const size_t inbuf = wxWcslen(psz);
f1339c56 2637 if (buf)
c643a977 2638 {
ef199164 2639 if (!w2m.Convert(psz, buf))
467e0479 2640 return wxCONV_FAILED;
c643a977 2641 }
dccce9ea 2642
f1339c56
RR
2643 return inbuf;
2644 }
dccce9ea 2645
7ef3ab50 2646 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2647 {
2648 switch ( m_enc )
2649 {
2650 case wxFONTENCODING_UTF16BE:
2651 case wxFONTENCODING_UTF16LE:
c1464d9d 2652 return 2;
eec47cc6
VZ
2653
2654 case wxFONTENCODING_UTF32BE:
2655 case wxFONTENCODING_UTF32LE:
c1464d9d 2656 return 4;
eec47cc6
VZ
2657
2658 default:
c1464d9d 2659 return 1;
eec47cc6
VZ
2660 }
2661 }
2662
d36c9347
VZ
2663 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2664
7ef3ab50
VZ
2665 bool IsOk() const { return m_ok; }
2666
2667public:
2668 wxFontEncoding m_enc;
2669 wxEncodingConverter m2w, w2m;
2670
2671private:
cafbf6fb
VZ
2672 // were we initialized successfully?
2673 bool m_ok;
fc7a2a60 2674
e95354ec 2675 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2676};
6001e347 2677
8f115891 2678// make the constructors available for unit testing
86501081 2679WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
8f115891
MW
2680{
2681 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2682 if ( !result->IsOk() )
2683 {
2684 delete result;
2685 return 0;
2686 }
ef199164 2687
8f115891
MW
2688 return result;
2689}
2690
1e6feb95
VZ
2691#endif // wxUSE_FONTMAP
2692
36acb880
VZ
2693// ============================================================================
2694// wxCSConv implementation
2695// ============================================================================
2696
8b04d4c4 2697void wxCSConv::Init()
6001e347 2698{
e95354ec
VZ
2699 m_name = NULL;
2700 m_convReal = NULL;
2701 m_deferred = true;
2702}
2703
86501081 2704wxCSConv::wxCSConv(const wxString& charset)
8b04d4c4
VZ
2705{
2706 Init();
82713003 2707
86501081 2708 if ( !charset.empty() )
e95354ec 2709 {
86501081 2710 SetName(charset.ToAscii());
e95354ec 2711 }
bda3d86a 2712
e4277538
VZ
2713#if wxUSE_FONTMAP
2714 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2715#else
bda3d86a 2716 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2717#endif
6001e347
RR
2718}
2719
8b04d4c4
VZ
2720wxCSConv::wxCSConv(wxFontEncoding encoding)
2721{
bda3d86a 2722 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2723 {
2724 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2725
2726 encoding = wxFONTENCODING_SYSTEM;
2727 }
2728
8b04d4c4
VZ
2729 Init();
2730
bda3d86a 2731 m_encoding = encoding;
8b04d4c4
VZ
2732}
2733
6001e347
RR
2734wxCSConv::~wxCSConv()
2735{
65e50848
JS
2736 Clear();
2737}
2738
54380f29 2739wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2740 : wxMBConv()
54380f29 2741{
8b04d4c4
VZ
2742 Init();
2743
54380f29 2744 SetName(conv.m_name);
8b04d4c4 2745 m_encoding = conv.m_encoding;
54380f29
GD
2746}
2747
2748wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2749{
2750 Clear();
8b04d4c4 2751
54380f29 2752 SetName(conv.m_name);
8b04d4c4
VZ
2753 m_encoding = conv.m_encoding;
2754
54380f29
GD
2755 return *this;
2756}
2757
65e50848
JS
2758void wxCSConv::Clear()
2759{
8b04d4c4 2760 free(m_name);
e95354ec 2761 delete m_convReal;
8b04d4c4 2762
65e50848 2763 m_name = NULL;
e95354ec 2764 m_convReal = NULL;
6001e347
RR
2765}
2766
86501081 2767void wxCSConv::SetName(const char *charset)
6001e347 2768{
f1339c56
RR
2769 if (charset)
2770 {
d6f2a891 2771 m_name = wxStrdup(charset);
e95354ec 2772 m_deferred = true;
f1339c56 2773 }
6001e347
RR
2774}
2775
#if wxUSE_FONTMAP

// map from the encoding to a charset name: wxCSConv::DoCreate() stores
// here the name which iconv accepted for this encoding or an empty string
// if none of the names worked
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
#endif
2783
e95354ec
VZ
2784wxMBConv *wxCSConv::DoCreate() const
2785{
ce6f8d6f
VZ
2786#if wxUSE_FONTMAP
2787 wxLogTrace(TRACE_STRCONV,
2788 wxT("creating conversion for %s"),
2789 (m_name ? m_name
86501081 2790 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
ce6f8d6f
VZ
2791#endif // wxUSE_FONTMAP
2792
c547282d
VZ
2793 // check for the special case of ASCII or ISO8859-1 charset: as we have
2794 // special knowledge of it anyhow, we don't need to create a special
2795 // conversion object
e4277538
VZ
2796 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2797 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2798 {
e95354ec
VZ
2799 // don't convert at all
2800 return NULL;
2801 }
dccce9ea 2802
e95354ec
VZ
2803 // we trust OS to do conversion better than we can so try external
2804 // conversion methods first
2805 //
2806 // the full order is:
2807 // 1. OS conversion (iconv() under Unix or Win32 API)
2808 // 2. hard coded conversions for UTF
2809 // 3. wxEncodingConverter as fall back
2810
2811 // step (1)
2812#ifdef HAVE_ICONV
c547282d 2813#if !wxUSE_FONTMAP
e95354ec 2814 if ( m_name )
c547282d 2815#endif // !wxUSE_FONTMAP
e95354ec 2816 {
3ef10cfc 2817#if wxUSE_FONTMAP
8b3eb85d 2818 wxFontEncoding encoding(m_encoding);
3ef10cfc 2819#endif
8b3eb85d 2820
86501081 2821 if ( m_name )
8b3eb85d 2822 {
86501081 2823 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
8b3eb85d
VZ
2824 if ( conv->IsOk() )
2825 return conv;
2826
2827 delete conv;
c547282d
VZ
2828
2829#if wxUSE_FONTMAP
8b3eb85d 2830 encoding =
86501081 2831 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2832#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2833 }
2834#if wxUSE_FONTMAP
2835 {
2836 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2837 if ( it != gs_nameCache.end() )
2838 {
2839 if ( it->second.empty() )
2840 return NULL;
c547282d 2841
86501081 2842 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
8b3eb85d
VZ
2843 if ( conv->IsOk() )
2844 return conv;
e95354ec 2845
8b3eb85d
VZ
2846 delete conv;
2847 }
2848
2849 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
86501081
VS
2850 // CS : in case this does not return valid names (eg for MacRoman)
2851 // encoding got a 'failure' entry in the cache all the same,
2852 // although it just has to be created using a different method, so
2853 // only store failed iconv creation attempts (or perhaps we
2854 // shoulnd't do this at all ?)
3c67ec06 2855 if ( names[0] != NULL )
8b3eb85d 2856 {
3c67ec06 2857 for ( ; *names; ++names )
8b3eb85d 2858 {
86501081
VS
2859 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2860 // will need changes that will obsolete this
2861 wxString name(*names);
2862 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3c67ec06
SC
2863 if ( conv->IsOk() )
2864 {
2865 gs_nameCache[encoding] = *names;
2866 return conv;
2867 }
2868
2869 delete conv;
8b3eb85d
VZ
2870 }
2871
3c67ec06 2872 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d 2873 }
8b3eb85d
VZ
2874 }
2875#endif // wxUSE_FONTMAP
e95354ec
VZ
2876 }
2877#endif // HAVE_ICONV
2878
2879#ifdef wxHAVE_WIN32_MB2WC
2880 {
7608a683 2881#if wxUSE_FONTMAP
e95354ec
VZ
2882 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2883 : new wxMBConv_win32(m_encoding);
2884 if ( conv->IsOk() )
2885 return conv;
2886
2887 delete conv;
7608a683
WS
2888#else
2889 return NULL;
2890#endif
e95354ec
VZ
2891 }
2892#endif // wxHAVE_WIN32_MB2WC
ef199164 2893
5c4ed98d 2894#ifdef __DARWIN__
f7e98dee 2895 {
6ff49cbc
DE
2896 // leave UTF16 and UTF32 to the built-ins of wx
2897 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2898 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
f7e98dee 2899 {
a6900d10 2900#if wxUSE_FONTMAP
5c4ed98d
DE
2901 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2902 : new wxMBConv_cf(m_encoding);
a6900d10 2903#else
5c4ed98d 2904 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
a6900d10 2905#endif
ef199164 2906
f7e98dee 2907 if ( conv->IsOk() )
d775fa82
WS
2908 return conv;
2909
2910 delete conv;
2911 }
335d31e0 2912 }
5c4ed98d
DE
2913#endif // __DARWIN__
2914
e95354ec
VZ
2915 // step (2)
2916 wxFontEncoding enc = m_encoding;
2917#if wxUSE_FONTMAP
c547282d
VZ
2918 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2919 {
2920 // use "false" to suppress interactive dialogs -- we can be called from
2921 // anywhere and popping up a dialog from here is the last thing we want to
2922 // do
267e11c5 2923 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2924 }
e95354ec
VZ
2925#endif // wxUSE_FONTMAP
2926
2927 switch ( enc )
2928 {
2929 case wxFONTENCODING_UTF7:
2930 return new wxMBConvUTF7;
2931
2932 case wxFONTENCODING_UTF8:
2933 return new wxMBConvUTF8;
2934
e95354ec
VZ
2935 case wxFONTENCODING_UTF16BE:
2936 return new wxMBConvUTF16BE;
2937
2938 case wxFONTENCODING_UTF16LE:
2939 return new wxMBConvUTF16LE;
2940
e95354ec
VZ
2941 case wxFONTENCODING_UTF32BE:
2942 return new wxMBConvUTF32BE;
2943
2944 case wxFONTENCODING_UTF32LE:
2945 return new wxMBConvUTF32LE;
2946
2947 default:
2948 // nothing to do but put here to suppress gcc warnings
ef199164 2949 break;
e95354ec
VZ
2950 }
2951
2952 // step (3)
2953#if wxUSE_FONTMAP
2954 {
2955 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2956 : new wxMBConv_wxwin(m_encoding);
2957 if ( conv->IsOk() )
2958 return conv;
2959
2960 delete conv;
2961 }
2962#endif // wxUSE_FONTMAP
2963
a58d4f4d
VS
2964 // NB: This is a hack to prevent deadlock. What could otherwise happen
2965 // in Unicode build: wxConvLocal creation ends up being here
2966 // because of some failure and logs the error. But wxLog will try to
6a17b868
SN
2967 // attach a timestamp, for which it will need wxConvLocal (to convert
2968 // time to char* and then wchar_t*), but that fails, tries to log the
2969 // error, but wxLog has an (already locked) critical section that
2970 // guards the static buffer.
a58d4f4d
VS
2971 static bool alreadyLoggingError = false;
2972 if (!alreadyLoggingError)
2973 {
2974 alreadyLoggingError = true;
2975 wxLogError(_("Cannot convert from the charset '%s'!"),
2976 m_name ? m_name
e95354ec
VZ
2977 :
2978#if wxUSE_FONTMAP
86501081 2979 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
e95354ec 2980#else // !wxUSE_FONTMAP
86501081 2981 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
e95354ec
VZ
2982#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2983 );
ef199164 2984
a58d4f4d
VS
2985 alreadyLoggingError = false;
2986 }
e95354ec
VZ
2987
2988 return NULL;
2989}
2990
2991void wxCSConv::CreateConvIfNeeded() const
2992{
2993 if ( m_deferred )
2994 {
2995 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a 2996
bda3d86a
VZ
2997 // if we don't have neither the name nor the encoding, use the default
2998 // encoding for this system
2999 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3000 {
4c75209f 3001#if wxUSE_INTL
02c7347b 3002 self->m_encoding = wxLocale::GetSystemEncoding();
4c75209f
VS
3003#else
3004 // fallback to some reasonable default:
3005 self->m_encoding = wxFONTENCODING_ISO8859_1;
bda3d86a 3006#endif // wxUSE_INTL
4c75209f 3007 }
bda3d86a 3008
e95354ec
VZ
3009 self->m_convReal = DoCreate();
3010 self->m_deferred = false;
6001e347 3011 }
6001e347
RR
3012}
3013
0f0298b1
VZ
3014bool wxCSConv::IsOk() const
3015{
3016 CreateConvIfNeeded();
3017
3018 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3019 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3020 return true; // always ok as we do it ourselves
3021
3022 // m_convReal->IsOk() is called at its own creation, so we know it must
3023 // be ok if m_convReal is non-NULL
3024 return m_convReal != NULL;
3025}
3026
1c714a5d
VZ
3027size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3028 const char *src, size_t srcLen) const
3029{
3030 CreateConvIfNeeded();
3031
2c74c558
VS
3032 if (m_convReal)
3033 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3034
3035 // latin-1 (direct)
3036 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3037}
3038
3039size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3040 const wchar_t *src, size_t srcLen) const
3041{
3042 CreateConvIfNeeded();
3043
2c74c558
VS
3044 if (m_convReal)
3045 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3046
3047 // latin-1 (direct)
3048 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
1c714a5d
VZ
3049}
3050
6001e347
RR
3051size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3052{
e95354ec 3053 CreateConvIfNeeded();
dccce9ea 3054
e95354ec
VZ
3055 if (m_convReal)
3056 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3057
3058 // latin-1 (direct)
4def3b35 3059 size_t len = strlen(psz);
dccce9ea 3060
f1339c56
RR
3061 if (buf)
3062 {
4def3b35 3063 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3064 buf[c] = (unsigned char)(psz[c]);
3065 }
dccce9ea 3066
f1339c56 3067 return len;
6001e347
RR
3068}
3069
3070size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3071{
e95354ec 3072 CreateConvIfNeeded();
dccce9ea 3073
e95354ec
VZ
3074 if (m_convReal)
3075 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3076
f1339c56 3077 // latin-1 (direct)
f8d791e0 3078 const size_t len = wxWcslen(psz);
f1339c56
RR
3079 if (buf)
3080 {
4def3b35 3081 for (size_t c = 0; c <= len; c++)
24642831
VS
3082 {
3083 if (psz[c] > 0xFF)
467e0479 3084 return wxCONV_FAILED;
ef199164 3085
907173e5 3086 buf[c] = (char)psz[c];
24642831
VS
3087 }
3088 }
3089 else
3090 {
3091 for (size_t c = 0; c <= len; c++)
3092 {
3093 if (psz[c] > 0xFF)
467e0479 3094 return wxCONV_FAILED;
24642831 3095 }
f1339c56 3096 }
dccce9ea 3097
f1339c56 3098 return len;
6001e347
RR
3099}
3100
7ef3ab50 3101size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3102{
3103 CreateConvIfNeeded();
3104
3105 if ( m_convReal )
3106 {
7ef3ab50 3107 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3108 }
3109
ba98e032 3110 // otherwise, we are ISO-8859-1
c1464d9d 3111 return 1;
eec47cc6
VZ
3112}
3113
ba98e032
VS
#if wxUSE_UNICODE_UTF8
// ask the real converter if we have one, otherwise we do ISO-8859-1
// ourselves and so are not UTF-8
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    return m_convReal ? m_convReal->IsUTF8() : false;
}
#endif
3128
69c928ef
VZ
3129
3130#if wxUSE_UNICODE
3131
3132wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3133{
3134 if ( !s )
3135 return wxWCharBuffer();
3136
3137 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3138 if ( !wbuf )
5487ff0f 3139 wbuf = wxMBConvUTF8().cMB2WX(s);
69c928ef
VZ
3140 if ( !wbuf )
3141 wbuf = wxConvISO8859_1.cMB2WX(s);
3142
3143 return wbuf;
3144}
3145
3146wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3147{
3148 if ( !ws )
3149 return wxCharBuffer();
3150
3151 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3152 if ( !buf )
3153 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3154
3155 return buf;
3156}
3157
3158#endif // wxUSE_UNICODE
f5a1953b 3159
1e50d914
VS
3160// ----------------------------------------------------------------------------
3161// globals
3162// ----------------------------------------------------------------------------
3163
3164// NB: The reason why we create converter objects in this convoluted way,
3165// using a factory function instead of global variable, is that they
3166// may be used at static initialization time (some of them are used by
3167// wxString ctors and there may be a global wxString object). In other
3168// words, possibly _before_ the converter global object would be
3169// initialized.
3170
3171#undef wxConvLibc
3172#undef wxConvUTF8
3173#undef wxConvUTF7
3174#undef wxConvLocal
3175#undef wxConvISO8859_1
3176
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines:
//  - an exported pointer variable name##Ptr of type klass*, initialised to
//    NULL (nothing inside this macro assigns it afterwards),
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args,
//  - a file-static pointer gs_##name##instance initialised by calling the
//    accessor, which forces the object to be created during static
//    initialisation.
// The expansion ends without a semicolon, the caller has to supply it.
3177#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3178 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
092ee46f 3179 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
1e50d914
VS
3180 { \
3181 static impl_klass name##Obj ctor_args; \
3182 return &name##Obj; \
3183 } \
3184 /* this ensures that all global converter objects are created */ \
3185 /* by the time static initialization is done, i.e. before any */ \
3186 /* thread is launched: */ \
3187 static klass* gs_##name##instance = wxGet_##name##Ptr()
3188
// Shorthand for the common case when the implementation class is the same as
// the class used in the declarations.
3189#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3190 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3191
// wxConvLibc is declared as a wxMBConv but implemented by wxMBConv_win32 when
// __WINDOWS__ is defined and by wxMBConvLibc otherwise.
// NOTE(review): wxEMPTY_PARAMETER_VALUE presumably expands to nothing so that
// the object is default-constructed -- confirm against its definition.
3192#ifdef __WINDOWS__
3193 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
1e50d914
VS
3194#else
3195 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3196#endif
3197
e1079eda
VZ
3198// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3199// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3200// provokes an error message about "not enough macro parameters"; and we
3201// can't use "()" here as the name##Obj declaration would be parsed as a
3202// function declaration then, so use a semicolon and live with an extra
3203// empty statement (and hope that no compiler warns about this)
3204WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3205WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
1e50d914
VS
3206
// both of these are wxCSConv objects, differing only in the font encoding
// passed to the constructor
3207WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3208WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3209
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to the
// wxConvLocal one; both are obtained through the accessor functions generated
// by the macros above
3210WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3211WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3212
6ac84a78
DE
3213#ifdef __DARWIN__
3214// The xnu kernel always communicates file paths in decomposed UTF-8.
3215// WARNING: Are we sure that CFString's conversion will cause decomposition?
3216static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
1e50d914 3217#endif
6ac84a78 3218
// wxConvFileName points to the file-static wxMBConv_cf UTF-8 object defined
// above when __DARWIN__ is defined and to the wxConvLibc object otherwise
1e50d914 3219WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
6ac84a78 3220#ifdef __DARWIN__
1e50d914 3221 &wxConvMacUTF8DObj;
6ac84a78 3222#else // !__DARWIN__
1e50d914 3223 wxGet_wxConvLibcPtr();
6ac84a78 3224#endif // __DARWIN__/!__DARWIN__
1e50d914 3225
bde4baac
VZ
3226#else // !wxUSE_WCHAR_T
3227
1e50d914 3228// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
bde4baac
VZ
3229// stand-ins in absence of wchar_t
// in this configuration the four converters are plain wxMBConv objects
// defined directly as globals rather than through the accessor functions
// used in the wxUSE_WCHAR_T branch
3230WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3231 wxConvISO8859_1,
3232 wxConvLocal,
3233 wxConvUTF8;
3234
3235#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T