]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
fix recently introduced memory leak of m_conv (bug 1466559)
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
373658eb
VZ
18#ifndef WX_PRECOMP
19 #include "wx/intl.h"
20 #include "wx/log.h"
ef199164 21#endif
373658eb 22
bde4baac
VZ
23#include "wx/strconv.h"
24
25#if wxUSE_WCHAR_T
26
7608a683 27#ifdef __WINDOWS__
532d575b 28 #include "wx/msw/private.h"
13dd924a 29 #include "wx/msw/missing.h"
0a1c1e62
GRG
30#endif
31
1c193821 32#ifndef __WXWINCE__
1cd52418 33#include <errno.h>
1c193821
JS
34#endif
35
6001e347
RR
36#include <ctype.h>
37#include <string.h>
38#include <stdlib.h>
39
e95354ec
VZ
40#if defined(__WIN32__) && !defined(__WXMICROWIN__)
41 #define wxHAVE_WIN32_MB2WC
ef199164 42#endif
e95354ec 43
6001e347 44#ifdef __SALFORDC__
373658eb 45 #include <clib.h>
6001e347
RR
46#endif
47
b040e242 48#ifdef HAVE_ICONV
373658eb 49 #include <iconv.h>
b1d547eb 50 #include "wx/thread.h"
1cd52418 51#endif
1cd52418 52
373658eb
VZ
53#include "wx/encconv.h"
54#include "wx/fontmap.h"
7608a683 55#include "wx/utils.h"
373658eb 56
335d31e0 57#ifdef __WXMAC__
40ba2f3b 58#ifndef __DARWIN__
4227afa4
SC
59#include <ATSUnicode.h>
60#include <TextCommon.h>
61#include <TextEncodingConverter.h>
40ba2f3b 62#endif
335d31e0 63
ef199164
DS
64// includes Mac headers
65#include "wx/mac/private.h"
335d31e0 66#endif
ce6f8d6f 67
ef199164 68
ce6f8d6f
VZ
69#define TRACE_STRCONV _T("strconv")
70
467e0479
VZ
71// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
72// be 4 bytes
4948c2b6 73#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
74 #define WC_UTF16
75#endif
76
ef199164 77
373658eb
VZ
78// ============================================================================
79// implementation
80// ============================================================================
81
69373110
VZ
// helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL and false if all of them are NUL
// (which includes the case n == 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( ; n != 0; --n, ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
90
373658eb 91// ----------------------------------------------------------------------------
467e0479 92// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 93// ----------------------------------------------------------------------------
6001e347 94
c91830cb 95static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 96{
ef199164 97 if (input <= 0xffff)
4def3b35 98 {
999836aa
VZ
99 if (output)
100 *output = (wxUint16) input;
ef199164 101
4def3b35 102 return 1;
dccce9ea 103 }
ef199164 104 else if (input >= 0x110000)
4def3b35 105 {
467e0479 106 return wxCONV_FAILED;
dccce9ea
VZ
107 }
108 else
4def3b35 109 {
dccce9ea 110 if (output)
4def3b35 111 {
ef199164
DS
112 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
113 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 114 }
ef199164 115
4def3b35 116 return 2;
1cd52418 117 }
1cd52418
OK
118}
119
c91830cb 120static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 121{
ef199164 122 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
123 {
124 output = *input;
125 return 1;
dccce9ea 126 }
ef199164 127 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
128 {
129 output = *input;
467e0479 130 return wxCONV_FAILED;
dccce9ea
VZ
131 }
132 else
4def3b35
VS
133 {
134 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
135 return 2;
136 }
1cd52418
OK
137}
138
467e0479 139#ifdef WC_UTF16
35d11700
VZ
140 typedef wchar_t wxDecodeSurrogate_t;
141#else // !WC_UTF16
142 typedef wxUint16 wxDecodeSurrogate_t;
143#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
144
145// returns the next UTF-32 character from the wchar_t buffer and advances the
146// pointer to the character after this one
147//
148// if an invalid character is found, *pSrc is set to NULL, the caller must
149// check for this
35d11700 150static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
151{
152 wxUint32 out;
8d3dd069
VZ
153 const size_t
154 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
155 if ( n == wxCONV_FAILED )
156 *pSrc = NULL;
157 else
158 *pSrc += n;
159
160 return out;
161}
162
f6bcfd97 163// ----------------------------------------------------------------------------
6001e347 164// wxMBConv
f6bcfd97 165// ----------------------------------------------------------------------------
2c53a80a 166
483b0434
VZ
167size_t
168wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
169 const char *src, size_t srcLen) const
6001e347 170{
483b0434
VZ
171 // although new conversion classes are supposed to implement this function
172 // directly, the existins ones only implement the old MB2WC() and so, to
173 // avoid to have to rewrite all conversion classes at once, we provide a
174 // default (but not efficient) implementation of this one in terms of the
175 // old function by copying the input to ensure that it's NUL-terminated and
176 // then using MB2WC() to convert it
6001e347 177
483b0434
VZ
178 // the number of chars [which would be] written to dst [if it were not NULL]
179 size_t dstWritten = 0;
eec47cc6 180
c1464d9d 181 // the number of NULs terminating this string
483b0434 182 size_t nulLen wxDUMMY_INITIALIZE(0);
eec47cc6 183
c1464d9d
VZ
184 // if we were not given the input size we just have to assume that the
185 // string is properly terminated as we have no way of knowing how long it
186 // is anyhow, but if we do have the size check whether there are enough
187 // NULs at the end
483b0434
VZ
188 wxCharBuffer bufTmp;
189 const char *srcEnd;
467e0479 190 if ( srcLen != wxNO_LEN )
eec47cc6 191 {
c1464d9d 192 // we need to know how to find the end of this string
7ef3ab50 193 nulLen = GetMBNulLen();
483b0434
VZ
194 if ( nulLen == wxCONV_FAILED )
195 return wxCONV_FAILED;
e4e3bbb4 196
c1464d9d 197 // if there are enough NULs we can avoid the copy
483b0434 198 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
199 {
200 // make a copy in order to properly NUL-terminate the string
483b0434 201 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 202 char * const p = bufTmp.data();
483b0434
VZ
203 memcpy(p, src, srcLen);
204 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 205 *s = '\0';
483b0434
VZ
206
207 src = bufTmp;
eec47cc6 208 }
e4e3bbb4 209
483b0434
VZ
210 srcEnd = src + srcLen;
211 }
212 else // quit after the first loop iteration
213 {
214 srcEnd = NULL;
215 }
e4e3bbb4 216
483b0434 217 for ( ;; )
eec47cc6 218 {
c1464d9d 219 // try to convert the current chunk
483b0434 220 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
221 if ( lenChunk == wxCONV_FAILED )
222 return wxCONV_FAILED;
e4e3bbb4 223
467e0479 224 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 225
483b0434 226 dstWritten += lenChunk;
f5fb6871 227
467e0479
VZ
228 if ( lenChunk == 1 )
229 {
230 // nothing left in the input string, conversion succeeded
231 break;
232 }
233
483b0434
VZ
234 if ( dst )
235 {
236 if ( dstWritten > dstLen )
237 return wxCONV_FAILED;
238
830f8f11 239 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
240 return wxCONV_FAILED;
241
242 dst += lenChunk;
243 }
c1464d9d 244
483b0434 245 if ( !srcEnd )
c1464d9d 246 {
467e0479
VZ
247 // we convert just one chunk in this case as this is the entire
248 // string anyhow
c1464d9d
VZ
249 break;
250 }
eec47cc6
VZ
251
252 // advance the input pointer past the end of this chunk
483b0434 253 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
254 {
255 // notice that we must skip over multiple bytes here as we suppose
256 // that if NUL takes 2 or 4 bytes, then all the other characters do
257 // too and so if advanced by a single byte we might erroneously
258 // detect sequences of NUL bytes in the middle of the input
483b0434 259 src += nulLen;
c1464d9d 260 }
e4e3bbb4 261
483b0434 262 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
263
264 // note that ">=" (and not just "==") is needed here as the terminator
265 // we skipped just above could be inside or just after the buffer
266 // delimited by inEnd
483b0434 267 if ( src >= srcEnd )
c1464d9d
VZ
268 break;
269 }
270
483b0434 271 return dstWritten;
e4e3bbb4
RN
272}
273
483b0434
VZ
274size_t
275wxMBConv::FromWChar(char *dst, size_t dstLen,
276 const wchar_t *src, size_t srcLen) const
e4e3bbb4 277{
483b0434
VZ
278 // the number of chars [which would be] written to dst [if it were not NULL]
279 size_t dstWritten = 0;
e4e3bbb4 280
eec47cc6
VZ
281 // make a copy of the input string unless it is already properly
282 // NUL-terminated
283 //
284 // if we don't know its length we have no choice but to assume that it is,
285 // indeed, properly terminated
286 wxWCharBuffer bufTmp;
467e0479 287 if ( srcLen == wxNO_LEN )
e4e3bbb4 288 {
483b0434 289 srcLen = wxWcslen(src) + 1;
eec47cc6 290 }
483b0434 291 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
292 {
293 // make a copy in order to properly NUL-terminate the string
483b0434 294 bufTmp = wxWCharBuffer(srcLen);
ef199164 295 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
296 src = bufTmp;
297 }
298
299 const size_t lenNul = GetMBNulLen();
300 for ( const wchar_t * const srcEnd = src + srcLen;
301 src < srcEnd;
302 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
303 {
304 // try to convert the current chunk
305 size_t lenChunk = WC2MB(NULL, src, 0);
306
307 if ( lenChunk == wxCONV_FAILED )
308 return wxCONV_FAILED;
309
310 lenChunk += lenNul;
311 dstWritten += lenChunk;
312
313 if ( dst )
314 {
315 if ( dstWritten > dstLen )
316 return wxCONV_FAILED;
317
318 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
319 return wxCONV_FAILED;
320
321 dst += lenChunk;
322 }
eec47cc6 323 }
e4e3bbb4 324
483b0434
VZ
325 return dstWritten;
326}
327
ef199164 328size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 329{
ef199164 330 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 331 if ( rc != wxCONV_FAILED )
509da451
VZ
332 {
333 // ToWChar() returns the buffer length, i.e. including the trailing
334 // NUL, while this method doesn't take it into account
335 rc--;
336 }
337
338 return rc;
339}
340
ef199164 341size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 342{
ef199164 343 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 344 if ( rc != wxCONV_FAILED )
509da451
VZ
345 {
346 rc -= GetMBNulLen();
347 }
348
349 return rc;
350}
351
483b0434
VZ
// out-of-line definition of the destructor, its body is intentionally empty
352wxMBConv::~wxMBConv()
353{
354 // nothing to do here (the definition is probably needed for Darwin linking)
355}
e4e3bbb4 356
483b0434
VZ
357const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
358{
359 if ( psz )
eec47cc6 360 {
483b0434
VZ
361 // calculate the length of the buffer needed first
362 const size_t nLen = MB2WC(NULL, psz, 0);
467e0479 363 if ( nLen != wxCONV_FAILED )
f5fb6871 364 {
483b0434
VZ
365 // now do the actual conversion
366 wxWCharBuffer buf(nLen /* +1 added implicitly */);
eec47cc6 367
483b0434
VZ
368 // +1 for the trailing NULL
369 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
370 return buf;
f5fb6871 371 }
483b0434 372 }
e4e3bbb4 373
483b0434
VZ
374 return wxWCharBuffer();
375}
3698ae71 376
483b0434
VZ
377const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
378{
379 if ( pwz )
380 {
381 const size_t nLen = WC2MB(NULL, pwz, 0);
467e0479 382 if ( nLen != wxCONV_FAILED )
483b0434
VZ
383 {
384 // extra space for trailing NUL(s)
385 static const size_t extraLen = GetMaxMBNulLen();
f5fb6871 386
483b0434
VZ
387 wxCharBuffer buf(nLen + extraLen - 1);
388 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
389 return buf;
390 }
391 }
392
393 return wxCharBuffer();
394}
e4e3bbb4 395
483b0434 396const wxWCharBuffer
ef199164 397wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 398{
ef199164 399 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 400 if ( dstLen != wxCONV_FAILED )
483b0434 401 {
830f8f11 402 wxWCharBuffer wbuf(dstLen - 1);
ef199164 403 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
404 {
405 if ( outLen )
467e0479
VZ
406 {
407 *outLen = dstLen;
408 if ( wbuf[dstLen - 1] == L'\0' )
409 (*outLen)--;
410 }
411
483b0434
VZ
412 return wbuf;
413 }
414 }
415
416 if ( outLen )
417 *outLen = 0;
418
419 return wxWCharBuffer();
420}
421
422const wxCharBuffer
ef199164 423wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 424{
ef199164 425 const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 426 if ( dstLen != wxCONV_FAILED )
483b0434 427 {
830f8f11 428 wxCharBuffer buf(dstLen - 1);
ef199164 429 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
430 {
431 if ( outLen )
467e0479
VZ
432 {
433 *outLen = dstLen;
434
435 const size_t nulLen = GetMBNulLen();
436 if ( !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
437 {
438 // in this case the output is NUL-terminated and we're not
439 // supposed to count NUL
440 (*outLen) -= nulLen;
441 }
442 }
d32a507d 443
483b0434
VZ
444 return buf;
445 }
e4e3bbb4
RN
446 }
447
eec47cc6
VZ
448 if ( outLen )
449 *outLen = 0;
450
451 return wxCharBuffer();
e4e3bbb4
RN
452}
453
6001e347 454// ----------------------------------------------------------------------------
bde4baac 455// wxMBConvLibc
6001e347
RR
456// ----------------------------------------------------------------------------
457
bde4baac
VZ
458size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
459{
460 return wxMB2WC(buf, psz, n);
461}
462
463size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
464{
465 return wxWC2MB(buf, psz, n);
466}
e1bfe89e
RR
467
468// ----------------------------------------------------------------------------
532d575b 469// wxConvBrokenFileNames
e1bfe89e
RR
470// ----------------------------------------------------------------------------
471
eec47cc6
VZ
472#ifdef __UNIX__
473
845905d5 474wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
ea8ce907 475{
845905d5
MW
476 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
477 || wxStricmp(charset, _T("UTF8")) == 0 )
478 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
479 else
480 m_conv = new wxCSConv(charset);
ea8ce907
RR
481}
482
eec47cc6 483#endif // __UNIX__
c12b7f79 484
bde4baac 485// ----------------------------------------------------------------------------
3698ae71 486// UTF-7
bde4baac 487// ----------------------------------------------------------------------------
6001e347 488
15f2ee32 489// Implementation (C) 2004 Fredrik Roubert
6001e347 490
15f2ee32
RN
491//
492// BASE64 decoding table
493//
// lookup table indexed by the byte value (256 entries): gives the 6 bit value
// of the BASE64 digits 'A'-'Z' (0x00-0x19), 'a'-'z' (0x1a-0x33), '0'-'9'
// (0x34-0x3d), '+' (0x3e) and '/' (0x3f) and 0xff for all the other bytes
494static const unsigned char utf7unb64[] =
6001e347 495{
15f2ee32
RN
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
502 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
503 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
504 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
505 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
506 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
507 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
509 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
510 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
511 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
528};
529
530size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
531{
15f2ee32
RN
532 size_t len = 0;
533
04a37834 534 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
535 {
536 unsigned char cc = *psz++;
537 if (cc != '+')
538 {
539 // plain ASCII char
540 if (buf)
541 *buf++ = cc;
542 len++;
543 }
544 else if (*psz == '-')
545 {
546 // encoded plus sign
547 if (buf)
548 *buf++ = cc;
549 len++;
550 psz++;
551 }
04a37834 552 else // start of BASE64 encoded string
15f2ee32 553 {
04a37834 554 bool lsb, ok;
15f2ee32 555 unsigned int d, l;
04a37834
VZ
556 for ( ok = lsb = false, d = 0, l = 0;
557 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
558 psz++ )
15f2ee32
RN
559 {
560 d <<= 6;
561 d += cc;
562 for (l += 6; l >= 8; lsb = !lsb)
563 {
04a37834 564 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
565 if (lsb)
566 {
567 if (buf)
568 *buf++ |= c;
569 len ++;
570 }
571 else
04a37834 572 {
15f2ee32 573 if (buf)
6356d52a 574 *buf = (wchar_t)(c << 8);
04a37834
VZ
575 }
576
577 ok = true;
15f2ee32
RN
578 }
579 }
04a37834
VZ
580
581 if ( !ok )
582 {
583 // in valid UTF7 we should have valid characters after '+'
467e0479 584 return wxCONV_FAILED;
04a37834
VZ
585 }
586
15f2ee32
RN
587 if (*psz == '-')
588 psz++;
589 }
590 }
04a37834
VZ
591
592 if ( buf && (len < n) )
593 *buf = '\0';
594
15f2ee32 595 return len;
6001e347
RR
596}
597
15f2ee32
RN
598//
599// BASE64 encoding table
600//
// the 64 BASE64 digits indexed by their 6 bit value: 'A'-'Z', 'a'-'z',
// '0'-'9', '+' and '/'
601static const unsigned char utf7enb64[] =
602{
603 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
604 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
605 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
606 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
607 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
608 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
609 'w', 'x', 'y', 'z', '0', '1', '2', '3',
610 '4', '5', '6', '7', '8', '9', '+', '/'
611};
612
613//
614// UTF-7 encoding table
615//
616// 0 - Set D (directly encoded characters)
617// 1 - Set O (optional direct characters)
618// 2 - whitespace characters (optional)
619// 3 - special characters
620//
// class (0-3) of each of the 128 ASCII characters, indexed by the character
// code; wxMBConvUTF7::WC2MB() copies the characters of class 0 as is and
// encodes all the others
621static const unsigned char utf7encode[128] =
6001e347 622{
15f2ee32
RN
623 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
624 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
625 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
626 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
627 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
628 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
629 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
630 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
631};
632
667e5b3e 633size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 634{
15f2ee32
RN
635 size_t len = 0;
636
637 while (*psz && ((!buf) || (len < n)))
638 {
639 wchar_t cc = *psz++;
640 if (cc < 0x80 && utf7encode[cc] < 1)
641 {
642 // plain ASCII char
643 if (buf)
644 *buf++ = (char)cc;
ef199164 645
15f2ee32
RN
646 len++;
647 }
648#ifndef WC_UTF16
79c78d42 649 else if (((wxUint32)cc) > 0xffff)
b2c13097 650 {
15f2ee32 651 // no surrogate pair generation (yet?)
467e0479 652 return wxCONV_FAILED;
15f2ee32
RN
653 }
654#endif
655 else
656 {
657 if (buf)
658 *buf++ = '+';
ef199164 659
15f2ee32
RN
660 len++;
661 if (cc != '+')
662 {
663 // BASE64 encode string
664 unsigned int lsb, d, l;
73c902d6 665 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
666 {
667 for (lsb = 0; lsb < 2; lsb ++)
668 {
669 d <<= 8;
670 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
671
672 for (l += 8; l >= 6; )
673 {
674 l -= 6;
675 if (buf)
676 *buf++ = utf7enb64[(d >> l) % 64];
677 len++;
678 }
679 }
ef199164 680
15f2ee32
RN
681 cc = *psz;
682 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
683 break;
684 }
ef199164 685
15f2ee32
RN
686 if (l != 0)
687 {
688 if (buf)
689 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
ef199164 690
15f2ee32
RN
691 len++;
692 }
693 }
ef199164 694
15f2ee32
RN
695 if (buf)
696 *buf++ = '-';
697 len++;
698 }
699 }
ef199164 700
15f2ee32
RN
701 if (buf && (len < n))
702 *buf = 0;
ef199164 703
15f2ee32 704 return len;
6001e347
RR
705}
706
f6bcfd97 707// ----------------------------------------------------------------------------
6001e347 708// UTF-8
f6bcfd97 709// ----------------------------------------------------------------------------
6001e347 710
// utf8_max[i] is the greatest value which can be encoded using i + 1 bytes in
// UTF-8, the last element is greater than or equal to any 32 bit value
dccce9ea 711static wxUint32 utf8_max[]=
4def3b35 712 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 713
3698ae71
VZ
714// boundaries of the private use area we use to (temporarily) remap the bytes
715// which are invalid in a UTF-8 encoded string (one character per byte value)
ea8ce907
RR
716const wxUint32 wxUnicodePUA = 0x100000;
717const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
718
6001e347
RR
719size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
720{
4def3b35
VS
721 size_t len = 0;
722
dccce9ea 723 while (*psz && ((!buf) || (len < n)))
4def3b35 724 {
ea8ce907
RR
725 const char *opsz = psz;
726 bool invalid = false;
4def3b35
VS
727 unsigned char cc = *psz++, fc = cc;
728 unsigned cnt;
dccce9ea 729 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 730 fc <<= 1;
ef199164 731
dccce9ea 732 if (!cnt)
4def3b35
VS
733 {
734 // plain ASCII char
dccce9ea 735 if (buf)
4def3b35
VS
736 *buf++ = cc;
737 len++;
561488ef
MW
738
739 // escape the escape character for octal escapes
740 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
741 && cc == '\\' && (!buf || len < n))
742 {
743 if (buf)
744 *buf++ = cc;
745 len++;
746 }
dccce9ea
VZ
747 }
748 else
4def3b35
VS
749 {
750 cnt--;
dccce9ea 751 if (!cnt)
4def3b35
VS
752 {
753 // invalid UTF-8 sequence
ea8ce907 754 invalid = true;
dccce9ea
VZ
755 }
756 else
4def3b35
VS
757 {
758 unsigned ocnt = cnt - 1;
759 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 760 while (cnt--)
4def3b35 761 {
ea8ce907 762 cc = *psz;
dccce9ea 763 if ((cc & 0xC0) != 0x80)
4def3b35
VS
764 {
765 // invalid UTF-8 sequence
ea8ce907
RR
766 invalid = true;
767 break;
4def3b35 768 }
ef199164 769
ea8ce907 770 psz++;
4def3b35
VS
771 res = (res << 6) | (cc & 0x3f);
772 }
ef199164 773
ea8ce907 774 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
775 {
776 // illegal UTF-8 encoding
ea8ce907 777 invalid = true;
4def3b35 778 }
ea8ce907
RR
779 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
780 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
781 {
782 // if one of our PUA characters turns up externally
783 // it must also be treated as an illegal sequence
784 // (a bit like you have to escape an escape character)
785 invalid = true;
786 }
787 else
788 {
1cd52418 789#ifdef WC_UTF16
ea8ce907
RR
790 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
791 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 792 if (pa == wxCONV_FAILED)
ea8ce907
RR
793 {
794 invalid = true;
795 }
796 else
797 {
798 if (buf)
799 buf += pa;
800 len += pa;
801 }
373658eb 802#else // !WC_UTF16
ea8ce907 803 if (buf)
38d4b1e4 804 *buf++ = (wchar_t)res;
ea8ce907 805 len++;
373658eb 806#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
807 }
808 }
ef199164 809
ea8ce907
RR
810 if (invalid)
811 {
812 if (m_options & MAP_INVALID_UTF8_TO_PUA)
813 {
814 while (opsz < psz && (!buf || len < n))
815 {
816#ifdef WC_UTF16
817 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
818 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 819 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
820 if (buf)
821 buf += pa;
822 opsz++;
823 len += pa;
824#else
825 if (buf)
38d4b1e4 826 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
827 opsz++;
828 len++;
829#endif
830 }
831 }
3698ae71 832 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
833 {
834 while (opsz < psz && (!buf || len < n))
835 {
3698ae71
VZ
836 if ( buf && len + 3 < n )
837 {
17a1ebd1 838 unsigned char on = *opsz;
3698ae71 839 *buf++ = L'\\';
17a1ebd1
VZ
840 *buf++ = (wchar_t)( L'0' + on / 0100 );
841 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
842 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 843 }
ef199164 844
ea8ce907
RR
845 opsz++;
846 len += 4;
847 }
848 }
3698ae71 849 else // MAP_INVALID_UTF8_NOT
ea8ce907 850 {
467e0479 851 return wxCONV_FAILED;
ea8ce907 852 }
4def3b35
VS
853 }
854 }
6001e347 855 }
ef199164 856
dccce9ea 857 if (buf && (len < n))
4def3b35 858 *buf = 0;
ef199164 859
4def3b35 860 return len;
6001e347
RR
861}
862
3698ae71
VZ
// returns true if wch is one of the octal digits L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
867
6001e347
RR
868size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
869{
4def3b35 870 size_t len = 0;
6001e347 871
dccce9ea 872 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
873 {
874 wxUint32 cc;
ef199164 875
1cd52418 876#ifdef WC_UTF16
b5153fd8
VZ
877 // cast is ok for WC_UTF16
878 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 879 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 880#else
ef199164 881 cc = (*psz++) & 0x7fffffff;
4def3b35 882#endif
3698ae71
VZ
883
884 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
885 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 886 {
dccce9ea 887 if (buf)
ea8ce907 888 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 889 len++;
3698ae71 890 }
561488ef
MW
891 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
892 && cc == L'\\' && psz[0] == L'\\' )
893 {
894 if (buf)
895 *buf++ = (char)cc;
896 psz++;
897 len++;
898 }
3698ae71
VZ
899 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
900 cc == L'\\' &&
901 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 902 {
dccce9ea 903 if (buf)
3698ae71 904 {
ef199164
DS
905 *buf++ = (char) ((psz[0] - L'0') * 0100 +
906 (psz[1] - L'0') * 010 +
b2c13097 907 (psz[2] - L'0'));
3698ae71
VZ
908 }
909
910 psz += 3;
ea8ce907
RR
911 len++;
912 }
913 else
914 {
915 unsigned cnt;
ef199164
DS
916 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
917 {
918 }
919
ea8ce907 920 if (!cnt)
4def3b35 921 {
ea8ce907
RR
922 // plain ASCII char
923 if (buf)
924 *buf++ = (char) cc;
925 len++;
926 }
ea8ce907
RR
927 else
928 {
929 len += cnt + 1;
930 if (buf)
931 {
932 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
933 while (cnt--)
934 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
935 }
4def3b35
VS
936 }
937 }
6001e347 938 }
4def3b35 939
ef199164 940 if (buf && (len < n))
3698ae71 941 *buf = 0;
adb45366 942
4def3b35 943 return len;
6001e347
RR
944}
945
467e0479 946// ============================================================================
c91830cb 947// UTF-16
467e0479 948// ============================================================================
c91830cb
VZ
949
950#ifdef WORDS_BIGENDIAN
bde4baac
VZ
951 #define wxMBConvUTF16straight wxMBConvUTF16BE
952 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 953#else
bde4baac
VZ
954 #define wxMBConvUTF16swap wxMBConvUTF16BE
955 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
956#endif
957
467e0479
VZ
958/* static */
959size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
960{
961 if ( srcLen == wxNO_LEN )
962 {
963 // count the number of bytes in input, including the trailing NULs
ef199164
DS
964 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
965 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 966 ;
c91830cb 967
467e0479
VZ
968 srcLen *= BYTES_PER_CHAR;
969 }
970 else // we already have the length
971 {
972 // we can only convert an entire number of UTF-16 characters
973 if ( srcLen % BYTES_PER_CHAR )
974 return wxCONV_FAILED;
975 }
976
977 return srcLen;
978}
979
980// case when in-memory representation is UTF-16 too
c91830cb
VZ
981#ifdef WC_UTF16
982
467e0479
VZ
983// ----------------------------------------------------------------------------
984// conversions without endianness change
985// ----------------------------------------------------------------------------
986
987size_t
988wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
989 const char *src, size_t srcLen) const
c91830cb 990{
467e0479
VZ
991 // set up the scene for using memcpy() (which is presumably more efficient
992 // than copying the bytes one by one)
993 srcLen = GetLength(src, srcLen);
994 if ( srcLen == wxNO_LEN )
995 return wxCONV_FAILED;
c91830cb 996
ef199164 997 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 998 if ( dst )
c91830cb 999 {
467e0479
VZ
1000 if ( dstLen < inLen )
1001 return wxCONV_FAILED;
c91830cb 1002
467e0479 1003 memcpy(dst, src, srcLen);
c91830cb 1004 }
d32a507d 1005
467e0479 1006 return inLen;
c91830cb
VZ
1007}
1008
467e0479
VZ
1009size_t
1010wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1011 const wchar_t *src, size_t srcLen) const
c91830cb 1012{
467e0479
VZ
1013 if ( srcLen == wxNO_LEN )
1014 srcLen = wxWcslen(src) + 1;
c91830cb 1015
467e0479
VZ
1016 srcLen *= BYTES_PER_CHAR;
1017
1018 if ( dst )
c91830cb 1019 {
467e0479
VZ
1020 if ( dstLen < srcLen )
1021 return wxCONV_FAILED;
d32a507d 1022
467e0479 1023 memcpy(dst, src, srcLen);
c91830cb 1024 }
d32a507d 1025
467e0479 1026 return srcLen;
c91830cb
VZ
1027}
1028
467e0479
VZ
1029// ----------------------------------------------------------------------------
1030// endian-reversing conversions
1031// ----------------------------------------------------------------------------
c91830cb 1032
467e0479
VZ
1033size_t
1034wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1035 const char *src, size_t srcLen) const
c91830cb 1036{
467e0479
VZ
1037 srcLen = GetLength(src, srcLen);
1038 if ( srcLen == wxNO_LEN )
1039 return wxCONV_FAILED;
c91830cb 1040
467e0479
VZ
1041 srcLen /= BYTES_PER_CHAR;
1042
1043 if ( dst )
c91830cb 1044 {
467e0479
VZ
1045 if ( dstLen < srcLen )
1046 return wxCONV_FAILED;
1047
ef199164
DS
1048 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1049 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1050 {
ef199164 1051 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1052 }
c91830cb 1053 }
bfab25d4 1054
467e0479 1055 return srcLen;
c91830cb
VZ
1056}
1057
467e0479
VZ
1058size_t
1059wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1060 const wchar_t *src, size_t srcLen) const
c91830cb 1061{
467e0479
VZ
1062 if ( srcLen == wxNO_LEN )
1063 srcLen = wxWcslen(src) + 1;
c91830cb 1064
467e0479
VZ
1065 srcLen *= BYTES_PER_CHAR;
1066
1067 if ( dst )
c91830cb 1068 {
467e0479
VZ
1069 if ( dstLen < srcLen )
1070 return wxCONV_FAILED;
1071
ef199164 1072 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1073 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1074 {
ef199164 1075 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1076 }
c91830cb 1077 }
eec47cc6 1078
467e0479 1079 return srcLen;
c91830cb
VZ
1080}
1081
467e0479 1082#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1083
467e0479
VZ
1084// ----------------------------------------------------------------------------
1085// conversions without endianness change
1086// ----------------------------------------------------------------------------
c91830cb 1087
35d11700
VZ
1088size_t
1089wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1090 const char *src, size_t srcLen) const
c91830cb 1091{
35d11700
VZ
1092 srcLen = GetLength(src, srcLen);
1093 if ( srcLen == wxNO_LEN )
1094 return wxCONV_FAILED;
c91830cb 1095
ef199164 1096 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1097 if ( !dst )
c91830cb 1098 {
35d11700
VZ
1099 // optimization: return maximal space which could be needed for this
1100 // string even if the real size could be smaller if the buffer contains
1101 // any surrogates
1102 return inLen;
c91830cb 1103 }
c91830cb 1104
35d11700 1105 size_t outLen = 0;
ef199164
DS
1106 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1107 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1108 {
ef199164
DS
1109 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1110 if ( !inBuff )
35d11700
VZ
1111 return wxCONV_FAILED;
1112
1113 if ( ++outLen > dstLen )
1114 return wxCONV_FAILED;
c91830cb 1115
35d11700
VZ
1116 *dst++ = ch;
1117 }
1118
1119
1120 return outLen;
1121}
c91830cb 1122
35d11700
VZ
1123size_t
1124wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1125 const wchar_t *src, size_t srcLen) const
c91830cb 1126{
35d11700
VZ
1127 if ( srcLen == wxNO_LEN )
1128 srcLen = wxWcslen(src) + 1;
c91830cb 1129
35d11700 1130 size_t outLen = 0;
ef199164 1131 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1132 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1133 {
1134 wxUint16 cc[2];
35d11700
VZ
1135 const size_t numChars = encode_utf16(*src++, cc);
1136 if ( numChars == wxCONV_FAILED )
1137 return wxCONV_FAILED;
c91830cb 1138
ef199164
DS
1139 outLen += numChars * BYTES_PER_CHAR;
1140 if ( outBuff )
c91830cb 1141 {
35d11700
VZ
1142 if ( outLen > dstLen )
1143 return wxCONV_FAILED;
1144
ef199164 1145 *outBuff++ = cc[0];
35d11700 1146 if ( numChars == 2 )
69b80d28 1147 {
35d11700 1148 // second character of a surrogate
ef199164 1149 *outBuff++ = cc[1];
69b80d28 1150 }
c91830cb 1151 }
c91830cb 1152 }
c91830cb 1153
35d11700 1154 return outLen;
c91830cb
VZ
1155}
1156
467e0479
VZ
1157// ----------------------------------------------------------------------------
1158// endian-reversing conversions
1159// ----------------------------------------------------------------------------
c91830cb 1160
35d11700
VZ
1161size_t
1162wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1163 const char *src, size_t srcLen) const
c91830cb 1164{
35d11700
VZ
1165 srcLen = GetLength(src, srcLen);
1166 if ( srcLen == wxNO_LEN )
1167 return wxCONV_FAILED;
1168
ef199164 1169 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1170 if ( !dst )
1171 {
1172 // optimization: return maximal space which could be needed for this
1173 // string even if the real size could be smaller if the buffer contains
1174 // any surrogates
1175 return inLen;
1176 }
c91830cb 1177
35d11700 1178 size_t outLen = 0;
ef199164
DS
1179 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1180 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1181 {
35d11700
VZ
1182 wxUint32 ch;
1183 wxUint16 tmp[2];
ef199164
DS
1184
1185 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1186 inBuff++;
1187 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1188
35d11700
VZ
1189 const size_t numChars = decode_utf16(tmp, ch);
1190 if ( numChars == wxCONV_FAILED )
1191 return wxCONV_FAILED;
c91830cb 1192
35d11700 1193 if ( numChars == 2 )
ef199164 1194 inBuff++;
35d11700
VZ
1195
1196 if ( ++outLen > dstLen )
1197 return wxCONV_FAILED;
c91830cb 1198
35d11700 1199 *dst++ = ch;
c91830cb 1200 }
c91830cb 1201
c91830cb 1202
35d11700
VZ
1203 return outLen;
1204}
c91830cb 1205
35d11700
VZ
1206size_t
1207wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1208 const wchar_t *src, size_t srcLen) const
c91830cb 1209{
35d11700
VZ
1210 if ( srcLen == wxNO_LEN )
1211 srcLen = wxWcslen(src) + 1;
c91830cb 1212
35d11700 1213 size_t outLen = 0;
ef199164 1214 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1215 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1216 {
1217 wxUint16 cc[2];
35d11700
VZ
1218 const size_t numChars = encode_utf16(*src, cc);
1219 if ( numChars == wxCONV_FAILED )
1220 return wxCONV_FAILED;
c91830cb 1221
ef199164
DS
1222 outLen += numChars * BYTES_PER_CHAR;
1223 if ( outBuff )
c91830cb 1224 {
35d11700
VZ
1225 if ( outLen > dstLen )
1226 return wxCONV_FAILED;
1227
ef199164 1228 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1229 if ( numChars == 2 )
c91830cb 1230 {
35d11700 1231 // second character of a surrogate
ef199164 1232 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1233 }
1234 }
c91830cb 1235 }
c91830cb 1236
35d11700 1237 return outLen;
c91830cb
VZ
1238}
1239
467e0479 1240#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1241
1242
35d11700 1243// ============================================================================
c91830cb 1244// UTF-32
35d11700 1245// ============================================================================
c91830cb
VZ
1246
1247#ifdef WORDS_BIGENDIAN
467e0479
VZ
1248 #define wxMBConvUTF32straight wxMBConvUTF32BE
1249 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1250#else
467e0479
VZ
1251 #define wxMBConvUTF32swap wxMBConvUTF32BE
1252 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1253#endif
1254
1255
1256WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1257WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1258
467e0479
VZ
1259/* static */
1260size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1261{
1262 if ( srcLen == wxNO_LEN )
1263 {
1264 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1265 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1266 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1267 ;
c91830cb 1268
467e0479
VZ
1269 srcLen *= BYTES_PER_CHAR;
1270 }
1271 else // we already have the length
1272 {
1273 // we can only convert an entire number of UTF-32 characters
1274 if ( srcLen % BYTES_PER_CHAR )
1275 return wxCONV_FAILED;
1276 }
1277
1278 return srcLen;
1279}
1280
1281// case when in-memory representation is UTF-16
c91830cb
VZ
1282#ifdef WC_UTF16
1283
467e0479
VZ
1284// ----------------------------------------------------------------------------
1285// conversions without endianness change
1286// ----------------------------------------------------------------------------
1287
1288size_t
1289wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1290 const char *src, size_t srcLen) const
c91830cb 1291{
467e0479
VZ
1292 srcLen = GetLength(src, srcLen);
1293 if ( srcLen == wxNO_LEN )
1294 return wxCONV_FAILED;
c91830cb 1295
ef199164
DS
1296 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1297 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1298 size_t outLen = 0;
1299 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1300 {
1301 wxUint16 cc[2];
ef199164 1302 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1303 if ( numChars == wxCONV_FAILED )
1304 return wxCONV_FAILED;
c91830cb 1305
467e0479
VZ
1306 outLen += numChars;
1307 if ( dst )
c91830cb 1308 {
467e0479
VZ
1309 if ( outLen > dstLen )
1310 return wxCONV_FAILED;
d32a507d 1311
467e0479
VZ
1312 *dst++ = cc[0];
1313 if ( numChars == 2 )
1314 {
1315 // second character of a surrogate
1316 *dst++ = cc[1];
1317 }
1318 }
c91830cb 1319 }
d32a507d 1320
467e0479 1321 return outLen;
c91830cb
VZ
1322}
1323
467e0479
VZ
1324size_t
1325wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1326 const wchar_t *src, size_t srcLen) const
c91830cb 1327{
467e0479
VZ
1328 if ( srcLen == wxNO_LEN )
1329 srcLen = wxWcslen(src) + 1;
c91830cb 1330
467e0479 1331 if ( !dst )
c91830cb 1332 {
467e0479
VZ
1333 // optimization: return maximal space which could be needed for this
1334 // string instead of the exact amount which could be less if there are
1335 // any surrogates in the input
1336 //
1337 // we consider that surrogates are rare enough to make it worthwhile to
1338 // avoid running the loop below at the cost of slightly extra memory
1339 // consumption
ef199164 1340 return srcLen * BYTES_PER_CHAR;
467e0479 1341 }
c91830cb 1342
ef199164 1343 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1344 size_t outLen = 0;
1345 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1346 {
1347 const wxUint32 ch = wxDecodeSurrogate(&src);
1348 if ( !src )
1349 return wxCONV_FAILED;
c91830cb 1350
467e0479 1351 outLen += BYTES_PER_CHAR;
d32a507d 1352
467e0479
VZ
1353 if ( outLen > dstLen )
1354 return wxCONV_FAILED;
b5153fd8 1355
ef199164 1356 *outBuff++ = ch;
467e0479 1357 }
c91830cb 1358
467e0479 1359 return outLen;
c91830cb
VZ
1360}
1361
467e0479
VZ
1362// ----------------------------------------------------------------------------
1363// endian-reversing conversions
1364// ----------------------------------------------------------------------------
c91830cb 1365
467e0479
VZ
1366size_t
1367wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1368 const char *src, size_t srcLen) const
c91830cb 1369{
467e0479
VZ
1370 srcLen = GetLength(src, srcLen);
1371 if ( srcLen == wxNO_LEN )
1372 return wxCONV_FAILED;
c91830cb 1373
ef199164
DS
1374 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1375 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1376 size_t outLen = 0;
ef199164 1377 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1378 {
c91830cb 1379 wxUint16 cc[2];
ef199164 1380 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1381 if ( numChars == wxCONV_FAILED )
1382 return wxCONV_FAILED;
c91830cb 1383
467e0479
VZ
1384 outLen += numChars;
1385 if ( dst )
c91830cb 1386 {
467e0479
VZ
1387 if ( outLen > dstLen )
1388 return wxCONV_FAILED;
d32a507d 1389
467e0479
VZ
1390 *dst++ = cc[0];
1391 if ( numChars == 2 )
1392 {
1393 // second character of a surrogate
1394 *dst++ = cc[1];
1395 }
1396 }
c91830cb 1397 }
b5153fd8 1398
467e0479 1399 return outLen;
c91830cb
VZ
1400}
1401
467e0479
VZ
1402size_t
1403wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1404 const wchar_t *src, size_t srcLen) const
c91830cb 1405{
467e0479
VZ
1406 if ( srcLen == wxNO_LEN )
1407 srcLen = wxWcslen(src) + 1;
c91830cb 1408
467e0479 1409 if ( !dst )
c91830cb 1410 {
467e0479
VZ
1411 // optimization: return maximal space which could be needed for this
1412 // string instead of the exact amount which could be less if there are
1413 // any surrogates in the input
1414 //
1415 // we consider that surrogates are rare enough to make it worthwhile to
1416 // avoid running the loop below at the cost of slightly extra memory
1417 // consumption
1418 return srcLen*BYTES_PER_CHAR;
1419 }
c91830cb 1420
ef199164 1421 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1422 size_t outLen = 0;
1423 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1424 {
1425 const wxUint32 ch = wxDecodeSurrogate(&src);
1426 if ( !src )
1427 return wxCONV_FAILED;
c91830cb 1428
467e0479 1429 outLen += BYTES_PER_CHAR;
d32a507d 1430
467e0479
VZ
1431 if ( outLen > dstLen )
1432 return wxCONV_FAILED;
b5153fd8 1433
ef199164 1434 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1435 }
c91830cb 1436
467e0479 1437 return outLen;
c91830cb
VZ
1438}
1439
467e0479 1440#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1441
35d11700
VZ
1442// ----------------------------------------------------------------------------
1443// conversions without endianness change
1444// ----------------------------------------------------------------------------
1445
1446size_t
1447wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1448 const char *src, size_t srcLen) const
c91830cb 1449{
35d11700
VZ
1450 // use memcpy() as it should be much faster than hand-written loop
1451 srcLen = GetLength(src, srcLen);
1452 if ( srcLen == wxNO_LEN )
1453 return wxCONV_FAILED;
c91830cb 1454
35d11700
VZ
1455 const size_t inLen = srcLen/BYTES_PER_CHAR;
1456 if ( dst )
c91830cb 1457 {
35d11700
VZ
1458 if ( dstLen < inLen )
1459 return wxCONV_FAILED;
b5153fd8 1460
35d11700
VZ
1461 memcpy(dst, src, srcLen);
1462 }
c91830cb 1463
35d11700 1464 return inLen;
c91830cb
VZ
1465}
1466
35d11700
VZ
1467size_t
1468wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1469 const wchar_t *src, size_t srcLen) const
c91830cb 1470{
35d11700
VZ
1471 if ( srcLen == wxNO_LEN )
1472 srcLen = wxWcslen(src) + 1;
1473
1474 srcLen *= BYTES_PER_CHAR;
c91830cb 1475
35d11700 1476 if ( dst )
c91830cb 1477 {
35d11700
VZ
1478 if ( dstLen < srcLen )
1479 return wxCONV_FAILED;
c91830cb 1480
35d11700 1481 memcpy(dst, src, srcLen);
c91830cb
VZ
1482 }
1483
35d11700 1484 return srcLen;
c91830cb
VZ
1485}
1486
35d11700
VZ
1487// ----------------------------------------------------------------------------
1488// endian-reversing conversions
1489// ----------------------------------------------------------------------------
c91830cb 1490
35d11700
VZ
1491size_t
1492wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1493 const char *src, size_t srcLen) const
c91830cb 1494{
35d11700
VZ
1495 srcLen = GetLength(src, srcLen);
1496 if ( srcLen == wxNO_LEN )
1497 return wxCONV_FAILED;
1498
1499 srcLen /= BYTES_PER_CHAR;
c91830cb 1500
35d11700 1501 if ( dst )
c91830cb 1502 {
35d11700
VZ
1503 if ( dstLen < srcLen )
1504 return wxCONV_FAILED;
1505
ef199164
DS
1506 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1507 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1508 {
ef199164 1509 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1510 }
c91830cb 1511 }
b5153fd8 1512
35d11700 1513 return srcLen;
c91830cb
VZ
1514}
1515
35d11700
VZ
1516size_t
1517wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1518 const wchar_t *src, size_t srcLen) const
c91830cb 1519{
35d11700
VZ
1520 if ( srcLen == wxNO_LEN )
1521 srcLen = wxWcslen(src) + 1;
1522
1523 srcLen *= BYTES_PER_CHAR;
c91830cb 1524
35d11700 1525 if ( dst )
c91830cb 1526 {
35d11700
VZ
1527 if ( dstLen < srcLen )
1528 return wxCONV_FAILED;
1529
ef199164 1530 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1531 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1532 {
ef199164 1533 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1534 }
c91830cb 1535 }
b5153fd8 1536
35d11700 1537 return srcLen;
c91830cb
VZ
1538}
1539
467e0479 1540#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1541
1542
36acb880
VZ
1543// ============================================================================
1544// The classes doing conversion using the iconv_xxx() functions
1545// ============================================================================
3caec1bb 1546
b040e242 1547#ifdef HAVE_ICONV
3a0d76bc 1548
b1d547eb
VS
1549// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1550// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1551// (unless there's yet another bug in glibc) the only case when iconv()
1552// returns with (size_t)-1 (which means error) and says there are 0 bytes
1553// left in the input buffer -- when _real_ error occurs,
1554// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1555// iconv() failure.
3caec1bb
VS
1556// [This bug does not appear in glibc 2.2.]
1557#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1558#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1559 (errno != E2BIG || bufLeft != 0))
1560#else
1561#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1562#endif
1563
ab217dba 1564#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1565
74a7eb0b
VZ
1566#define ICONV_T_INVALID ((iconv_t)-1)
1567
1568#if SIZEOF_WCHAR_T == 4
1569 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1570 #define WC_ENC wxFONTENCODING_UTF32
1571#elif SIZEOF_WCHAR_T == 2
1572 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1573 #define WC_ENC wxFONTENCODING_UTF16
1574#else // sizeof(wchar_t) != 2 nor 4
1575 // does this ever happen?
1576 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1577#endif
1578
36acb880 1579// ----------------------------------------------------------------------------
e95354ec 1580// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1581// ----------------------------------------------------------------------------
1582
e95354ec 1583class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1584{
1585public:
e95354ec
VZ
1586 wxMBConv_iconv(const wxChar *name);
1587 virtual ~wxMBConv_iconv();
36acb880 1588
bde4baac
VZ
1589 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1590 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1591
d36c9347 1592 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
7ef3ab50
VZ
1593 virtual size_t GetMBNulLen() const;
1594
d36c9347
VZ
1595 virtual wxMBConv *Clone() const
1596 {
1597 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1598 p->m_minMBCharWidth = m_minMBCharWidth;
1599 return p;
1600 }
1601
e95354ec 1602 bool IsOk() const
74a7eb0b 1603 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1604
1605protected:
ef199164
DS
1606 // the iconv handlers used to translate from multibyte
1607 // to wide char and in the other direction
36acb880
VZ
1608 iconv_t m2w,
1609 w2m;
ef199164 1610
b1d547eb
VS
1611#if wxUSE_THREADS
1612 // guards access to m2w and w2m objects
1613 wxMutex m_iconvMutex;
1614#endif
36acb880
VZ
1615
1616private:
e95354ec 1617 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1618 // available on this machine, it will remain NULL
74a7eb0b 1619 static wxString ms_wcCharsetName;
36acb880
VZ
1620
1621 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1622 // different endian-ness than the native one
405d8f46 1623 static bool ms_wcNeedsSwap;
eec47cc6 1624
d36c9347
VZ
1625
1626 // name of the encoding handled by this conversion
1627 wxString m_name;
1628
7ef3ab50 1629 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
1630 // initially
1631 size_t m_minMBCharWidth;
36acb880
VZ
1632};
1633
8f115891
MW
1634// make the constructor available for unit testing
1635WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1636{
1637 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1638 if ( !result->IsOk() )
1639 {
1640 delete result;
1641 return 0;
1642 }
ef199164 1643
8f115891
MW
1644 return result;
1645}
1646
422e411e 1647wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1648bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1649
e95354ec 1650wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
d36c9347 1651 : m_name(name)
36acb880 1652{
c1464d9d 1653 m_minMBCharWidth = 0;
eec47cc6 1654
0331b385
VZ
1655 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1656 // names for the charsets
200a9923 1657 const wxCharBuffer cname(wxString(name).ToAscii());
04c79127 1658
36acb880 1659 // check for charset that represents wchar_t:
74a7eb0b 1660 if ( ms_wcCharsetName.empty() )
f1339c56 1661 {
c2b83fdd
VZ
1662 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1663
74a7eb0b
VZ
1664#if wxUSE_FONTMAP
1665 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1666#else // !wxUSE_FONTMAP
1667 static const wxChar *names[] =
36acb880 1668 {
74a7eb0b
VZ
1669#if SIZEOF_WCHAR_T == 4
1670 _T("UCS-4"),
1671#elif SIZEOF_WCHAR_T = 2
1672 _T("UCS-2"),
1673#endif
1674 NULL
1675 };
1676#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1677
d1f024a8 1678 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1679 {
17a1ebd1 1680 const wxString nameCS(*names);
74a7eb0b
VZ
1681
1682 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1683 wxString nameXE(nameCS);
ef199164
DS
1684
1685#ifdef WORDS_BIGENDIAN
74a7eb0b 1686 nameXE += _T("BE");
ef199164 1687#else // little endian
74a7eb0b 1688 nameXE += _T("LE");
ef199164 1689#endif
74a7eb0b 1690
c2b83fdd
VZ
1691 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1692 nameXE.c_str());
1693
74a7eb0b
VZ
1694 m2w = iconv_open(nameXE.ToAscii(), cname);
1695 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1696 {
74a7eb0b 1697 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1698 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1699 nameCS.c_str());
17a1ebd1 1700 m2w = iconv_open(nameCS.ToAscii(), cname);
3a0d76bc 1701
74a7eb0b
VZ
1702 // and check for bytesex ourselves:
1703 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1704 {
74a7eb0b
VZ
1705 char buf[2], *bufPtr;
1706 wchar_t wbuf[2], *wbufPtr;
1707 size_t insz, outsz;
1708 size_t res;
1709
1710 buf[0] = 'A';
1711 buf[1] = 0;
1712 wbuf[0] = 0;
1713 insz = 2;
1714 outsz = SIZEOF_WCHAR_T * 2;
1715 wbufPtr = wbuf;
1716 bufPtr = buf;
1717
ef199164
DS
1718 res = iconv(
1719 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1720 (char**)&wbufPtr, &outsz);
74a7eb0b
VZ
1721
1722 if (ICONV_FAILED(res, insz))
1723 {
1724 wxLogLastError(wxT("iconv"));
422e411e 1725 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1726 nameCS.c_str());
74a7eb0b
VZ
1727 }
1728 else // ok, can convert to this encoding, remember it
1729 {
17a1ebd1 1730 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1731 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1732 }
3a0d76bc
VS
1733 }
1734 }
74a7eb0b 1735 else // use charset not requiring byte swapping
36acb880 1736 {
74a7eb0b 1737 ms_wcCharsetName = nameXE;
36acb880 1738 }
3a0d76bc 1739 }
74a7eb0b 1740
0944fceb 1741 wxLogTrace(TRACE_STRCONV,
74a7eb0b 1742 wxT("iconv wchar_t charset is \"%s\"%s"),
cae8f1bf 1743 ms_wcCharsetName.empty() ? _T("<none>")
74a7eb0b
VZ
1744 : ms_wcCharsetName.c_str(),
1745 ms_wcNeedsSwap ? _T(" (needs swap)")
1746 : _T(""));
3a0d76bc 1747 }
36acb880 1748 else // we already have ms_wcCharsetName
3caec1bb 1749 {
74a7eb0b 1750 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
f1339c56 1751 }
dccce9ea 1752
74a7eb0b 1753 if ( ms_wcCharsetName.empty() )
f1339c56 1754 {
74a7eb0b 1755 w2m = ICONV_T_INVALID;
36acb880 1756 }
405d8f46
VZ
1757 else
1758 {
74a7eb0b
VZ
1759 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1760 if ( w2m == ICONV_T_INVALID )
1761 {
1762 wxLogTrace(TRACE_STRCONV,
1763 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
422e411e 1764 ms_wcCharsetName.c_str(), cname.data());
74a7eb0b 1765 }
405d8f46 1766 }
36acb880 1767}
3caec1bb 1768
e95354ec 1769wxMBConv_iconv::~wxMBConv_iconv()
36acb880 1770{
74a7eb0b 1771 if ( m2w != ICONV_T_INVALID )
36acb880 1772 iconv_close(m2w);
74a7eb0b 1773 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
1774 iconv_close(w2m);
1775}
3a0d76bc 1776
bde4baac 1777size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 1778{
69373110
VZ
1779 // find the string length: notice that must be done differently for
1780 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1781 size_t inbuf;
7ef3ab50 1782 const size_t nulLen = GetMBNulLen();
69373110
VZ
1783 switch ( nulLen )
1784 {
1785 default:
467e0479 1786 return wxCONV_FAILED;
69373110
VZ
1787
1788 case 1:
1789 inbuf = strlen(psz); // arguably more optimized than our version
1790 break;
1791
1792 case 2:
1793 case 4:
1794 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1795 // they also have to start at character boundary and not span two
1796 // adjacent characters
1797 const char *p;
1798 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1799 ;
1800 inbuf = p - psz;
1801 break;
1802 }
1803
b1d547eb
VS
1804#if wxUSE_THREADS
1805 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1806 // Unfortunately there is a couple of global wxCSConv objects such as
1807 // wxConvLocal that are used all over wx code, so we have to make sure
1808 // the handle is used by at most one thread at the time. Otherwise
1809 // only a few wx classes would be safe to use from non-main threads
1810 // as MB<->WC conversion would fail "randomly".
1811 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
1812#endif // wxUSE_THREADS
1813
36acb880
VZ
1814 size_t outbuf = n * SIZEOF_WCHAR_T;
1815 size_t res, cres;
1816 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1817 wchar_t *bufPtr = buf;
1818 const char *pszPtr = psz;
1819
1820 if (buf)
1821 {
1822 // have destination buffer, convert there
1823 cres = iconv(m2w,
1824 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1825 (char**)&bufPtr, &outbuf);
1826 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1827
36acb880 1828 if (ms_wcNeedsSwap)
3a0d76bc 1829 {
36acb880 1830 // convert to native endianness
17a1ebd1
VZ
1831 for ( unsigned i = 0; i < res; i++ )
1832 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 1833 }
adb45366 1834
69373110 1835 // NUL-terminate the string if there is any space left
49dd9820
VS
1836 if (res < n)
1837 buf[res] = 0;
36acb880
VZ
1838 }
1839 else
1840 {
1841 // no destination buffer... convert using temp buffer
1842 // to calculate destination buffer requirement
1843 wchar_t tbuf[8];
1844 res = 0;
ef199164
DS
1845
1846 do
1847 {
36acb880 1848 bufPtr = tbuf;
ef199164 1849 outbuf = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
1850
1851 cres = iconv(m2w,
1852 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1853 (char**)&bufPtr, &outbuf );
1854
ef199164
DS
1855 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1856 }
1857 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 1858 }
dccce9ea 1859
36acb880 1860 if (ICONV_FAILED(cres, inbuf))
f1339c56 1861 {
36acb880 1862 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 1863 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 1864 return wxCONV_FAILED;
36acb880
VZ
1865 }
1866
1867 return res;
1868}
1869
bde4baac 1870size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1871{
b1d547eb
VS
1872#if wxUSE_THREADS
1873 // NB: explained in MB2WC
1874 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1875#endif
3698ae71 1876
156162ec
MW
1877 size_t inlen = wxWcslen(psz);
1878 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
1879 size_t outbuf = n;
1880 size_t res, cres;
3a0d76bc 1881
36acb880 1882 wchar_t *tmpbuf = 0;
3caec1bb 1883
36acb880
VZ
1884 if (ms_wcNeedsSwap)
1885 {
1886 // need to copy to temp buffer to switch endianness
74a7eb0b 1887 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 1888 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 1889 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
1890 for ( size_t i = 0; i < inlen; i++ )
1891 tmpbuf[n] = WC_BSWAP(psz[i]);
ef199164 1892
156162ec 1893 tmpbuf[inlen] = L'\0';
74a7eb0b 1894 psz = tmpbuf;
36acb880 1895 }
3a0d76bc 1896
36acb880
VZ
1897 if (buf)
1898 {
1899 // have destination buffer, convert there
1900 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1901
ef199164 1902 res = n - outbuf;
adb45366 1903
49dd9820
VS
1904 // NB: iconv was given only wcslen(psz) characters on input, and so
1905 // it couldn't convert the trailing zero. Let's do it ourselves
1906 // if there's some room left for it in the output buffer.
1907 if (res < n)
1908 buf[0] = 0;
36acb880
VZ
1909 }
1910 else
1911 {
ef199164 1912 // no destination buffer: convert using temp buffer
36acb880
VZ
1913 // to calculate destination buffer requirement
1914 char tbuf[16];
1915 res = 0;
ef199164
DS
1916 do
1917 {
1918 buf = tbuf;
1919 outbuf = 16;
36acb880
VZ
1920
1921 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1922
36acb880 1923 res += 16 - outbuf;
ef199164
DS
1924 }
1925 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 1926 }
dccce9ea 1927
36acb880
VZ
1928 if (ms_wcNeedsSwap)
1929 {
1930 free(tmpbuf);
1931 }
dccce9ea 1932
36acb880
VZ
1933 if (ICONV_FAILED(cres, inbuf))
1934 {
ce6f8d6f 1935 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 1936 return wxCONV_FAILED;
36acb880
VZ
1937 }
1938
1939 return res;
1940}
1941
7ef3ab50 1942size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 1943{
c1464d9d 1944 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
1945 {
1946 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1947
1948#if wxUSE_THREADS
1949 // NB: explained in MB2WC
1950 wxMutexLocker lock(self->m_iconvMutex);
1951#endif
1952
356410fc 1953 wchar_t *wnul = L"";
c1464d9d 1954 char buf[8]; // should be enough for NUL in any encoding
356410fc 1955 size_t inLen = sizeof(wchar_t),
c1464d9d 1956 outLen = WXSIZEOF(buf);
ef199164
DS
1957 char *inBuff = (char *)wnul;
1958 char *outBuff = buf;
1959 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 1960 {
c1464d9d 1961 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
1962 }
1963 else // ok
1964 {
ef199164 1965 self->m_minMBCharWidth = outBuff - buf;
356410fc 1966 }
eec47cc6
VZ
1967 }
1968
c1464d9d 1969 return m_minMBCharWidth;
eec47cc6
VZ
1970}
1971
b040e242 1972#endif // HAVE_ICONV
36acb880 1973
e95354ec 1974
36acb880
VZ
1975// ============================================================================
1976// Win32 conversion classes
1977// ============================================================================
1cd52418 1978
e95354ec 1979#ifdef wxHAVE_WIN32_MB2WC
373658eb 1980
8b04d4c4 1981// from utils.cpp
d775fa82 1982#if wxUSE_FONTMAP
8b04d4c4
VZ
1983extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1984extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1985#endif
373658eb 1986
e95354ec 1987class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1988{
1989public:
bde4baac
VZ
1990 wxMBConv_win32()
1991 {
1992 m_CodePage = CP_ACP;
c1464d9d 1993 m_minMBCharWidth = 0;
bde4baac
VZ
1994 }
1995
d36c9347
VZ
1996 wxMBConv_win32(const wxMBConv_win32& conv)
1997 {
1998 m_CodePage = conv.m_CodePage;
1999 m_minMBCharWidth = conv.m_minMBCharWidth;
2000 }
2001
7608a683 2002#if wxUSE_FONTMAP
e95354ec 2003 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
2004 {
2005 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2006 m_minMBCharWidth = 0;
bde4baac 2007 }
dccce9ea 2008
e95354ec 2009 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2010 {
2011 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2012 m_minMBCharWidth = 0;
bde4baac 2013 }
eec47cc6 2014#endif // wxUSE_FONTMAP
8b04d4c4 2015
d36c9347 2016 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2017 {
02272c9c
VZ
2018 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2019 // the behaviour is not compatible with the Unix version (using iconv)
2020 // and break the library itself, e.g. wxTextInputStream::NextChar()
2021 // wouldn't work if reading an incomplete MB char didn't result in an
2022 // error
667e5b3e 2023 //
89028980 2024 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2025 // Win XP or newer and it is not supported for UTF-[78] so we always
2026 // use our own conversions in this case. See
89028980
VS
2027 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2028 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2029 if ( m_CodePage == CP_UTF8 )
89028980 2030 {
830f8f11 2031 return wxConvUTF8.MB2WC(buf, psz, n);
89028980 2032 }
830f8f11
VZ
2033
2034 if ( m_CodePage == CP_UTF7 )
2035 {
2036 return wxConvUTF7.MB2WC(buf, psz, n);
2037 }
2038
2039 int flags = 0;
2040 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2041 IsAtLeastWin2kSP4() )
89028980 2042 {
830f8f11 2043 flags = MB_ERR_INVALID_CHARS;
89028980 2044 }
667e5b3e 2045
2b5f62a0
VZ
2046 const size_t len = ::MultiByteToWideChar
2047 (
2048 m_CodePage, // code page
667e5b3e 2049 flags, // flags: fall on error
2b5f62a0
VZ
2050 psz, // input string
2051 -1, // its length (NUL-terminated)
b4da152e 2052 buf, // output string
2b5f62a0
VZ
2053 buf ? n : 0 // size of output buffer
2054 );
89028980
VS
2055 if ( !len )
2056 {
2057 // function totally failed
467e0479 2058 return wxCONV_FAILED;
89028980
VS
2059 }
2060
2061 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2062 // check if we succeeded, by doing a double trip:
2063 if ( !flags && buf )
2064 {
53c174fc
VZ
2065 const size_t mbLen = strlen(psz);
2066 wxCharBuffer mbBuf(mbLen);
89028980
VS
2067 if ( ::WideCharToMultiByte
2068 (
2069 m_CodePage,
2070 0,
2071 buf,
2072 -1,
2073 mbBuf.data(),
53c174fc 2074 mbLen + 1, // size in bytes, not length
89028980
VS
2075 NULL,
2076 NULL
2077 ) == 0 ||
2078 strcmp(mbBuf, psz) != 0 )
2079 {
2080 // we didn't obtain the same thing we started from, hence
2081 // the conversion was lossy and we consider that it failed
467e0479 2082 return wxCONV_FAILED;
89028980
VS
2083 }
2084 }
2b5f62a0 2085
03a991bc
VZ
2086 // note that it returns count of written chars for buf != NULL and size
2087 // of the needed buffer for buf == NULL so in either case the length of
2088 // the string (which never includes the terminating NUL) is one less
89028980 2089 return len - 1;
f1339c56 2090 }
dccce9ea 2091
d36c9347 2092 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2093 {
13dd924a
VZ
2094 /*
2095 we have a problem here: by default, WideCharToMultiByte() may
2096 replace characters unrepresentable in the target code page with bad
2097 quality approximations such as turning "1/2" symbol (U+00BD) into
2098 "1" for the code pages which don't have it and we, obviously, want
2099 to avoid this at any price
d775fa82 2100
13dd924a
VZ
2101 the trouble is that this function does it _silently_, i.e. it won't
2102 even tell us whether it did or not... Win98/2000 and higher provide
2103 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2104 we have to resort to a round trip, i.e. check that converting back
2105 results in the same string -- this is, of course, expensive but
2106 otherwise we simply can't be sure to not garble the data.
2107 */
2108
2109 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2110 // it doesn't work with CJK encodings (which we test for rather roughly
2111 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2112 // supporting it
907173e5
WS
2113 BOOL usedDef wxDUMMY_INITIALIZE(false);
2114 BOOL *pUsedDef;
13dd924a
VZ
2115 int flags;
2116 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2117 {
2118 // it's our lucky day
2119 flags = WC_NO_BEST_FIT_CHARS;
2120 pUsedDef = &usedDef;
2121 }
2122 else // old system or unsupported encoding
2123 {
2124 flags = 0;
2125 pUsedDef = NULL;
2126 }
2127
2b5f62a0
VZ
2128 const size_t len = ::WideCharToMultiByte
2129 (
2130 m_CodePage, // code page
13dd924a
VZ
2131 flags, // either none or no best fit
2132 pwz, // input string
2b5f62a0
VZ
2133 -1, // it is (wide) NUL-terminated
2134 buf, // output buffer
2135 buf ? n : 0, // and its size
2136 NULL, // default "replacement" char
13dd924a 2137 pUsedDef // [out] was it used?
2b5f62a0
VZ
2138 );
2139
13dd924a
VZ
2140 if ( !len )
2141 {
2142 // function totally failed
467e0479 2143 return wxCONV_FAILED;
13dd924a
VZ
2144 }
2145
2146 // if we were really converting, check if we succeeded
2147 if ( buf )
2148 {
2149 if ( flags )
2150 {
2151 // check if the conversion failed, i.e. if any replacements
2152 // were done
2153 if ( usedDef )
467e0479 2154 return wxCONV_FAILED;
13dd924a
VZ
2155 }
2156 else // we must resort to double tripping...
2157 {
2158 wxWCharBuffer wcBuf(n);
467e0479 2159 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
13dd924a
VZ
2160 wcscmp(wcBuf, pwz) != 0 )
2161 {
2162 // we didn't obtain the same thing we started from, hence
2163 // the conversion was lossy and we consider that it failed
467e0479 2164 return wxCONV_FAILED;
13dd924a
VZ
2165 }
2166 }
2167 }
2168
03a991bc 2169 // see the comment above for the reason of "len - 1"
13dd924a 2170 return len - 1;
f1339c56 2171 }
dccce9ea 2172
7ef3ab50
VZ
2173 virtual size_t GetMBNulLen() const
2174 {
2175 if ( m_minMBCharWidth == 0 )
2176 {
2177 int len = ::WideCharToMultiByte
2178 (
2179 m_CodePage, // code page
2180 0, // no flags
2181 L"", // input string
2182 1, // translate just the NUL
2183 NULL, // output buffer
2184 0, // and its size
2185 NULL, // no replacement char
2186 NULL // [out] don't care if it was used
2187 );
2188
2189 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2190 switch ( len )
2191 {
2192 default:
2193 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2194 self->m_minMBCharWidth = (size_t)-1;
2195 break;
7ef3ab50
VZ
2196
2197 case 0:
2198 self->m_minMBCharWidth = (size_t)-1;
2199 break;
2200
2201 case 1:
2202 case 2:
2203 case 4:
2204 self->m_minMBCharWidth = len;
2205 break;
2206 }
2207 }
2208
2209 return m_minMBCharWidth;
2210 }
2211
d36c9347
VZ
2212 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2213
13dd924a
VZ
2214 bool IsOk() const { return m_CodePage != -1; }
2215
2216private:
2217 static bool CanUseNoBestFit()
2218 {
2219 static int s_isWin98Or2k = -1;
2220
2221 if ( s_isWin98Or2k == -1 )
2222 {
2223 int verMaj, verMin;
2224 switch ( wxGetOsVersion(&verMaj, &verMin) )
2225 {
2226 case wxWIN95:
2227 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2228 break;
2229
2230 case wxWINDOWS_NT:
2231 s_isWin98Or2k = verMaj >= 5;
2232 break;
2233
2234 default:
ef199164 2235 // unknown: be conservative by default
13dd924a 2236 s_isWin98Or2k = 0;
ef199164 2237 break;
13dd924a
VZ
2238 }
2239
2240 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2241 }
2242
2243 return s_isWin98Or2k == 1;
2244 }
f1339c56 2245
89028980
VS
2246 static bool IsAtLeastWin2kSP4()
2247 {
8942f83a
WS
2248#ifdef __WXWINCE__
2249 return false;
2250#else
89028980
VS
2251 static int s_isAtLeastWin2kSP4 = -1;
2252
2253 if ( s_isAtLeastWin2kSP4 == -1 )
2254 {
2255 OSVERSIONINFOEX ver;
2256
2257 memset(&ver, 0, sizeof(ver));
2258 ver.dwOSVersionInfoSize = sizeof(ver);
2259 GetVersionEx((OSVERSIONINFO*)&ver);
2260
2261 s_isAtLeastWin2kSP4 =
2262 ((ver.dwMajorVersion > 5) || // Vista+
2263 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2264 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2265 ver.wServicePackMajor >= 4)) // 2000 SP4+
2266 ? 1 : 0;
2267 }
2268
2269 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2270#endif
89028980
VS
2271 }
2272
eec47cc6 2273
c1464d9d 2274 // the code page we're working with
b1d66b54 2275 long m_CodePage;
c1464d9d 2276
7ef3ab50 2277 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2278 // "unknown"
2279 size_t m_minMBCharWidth;
1cd52418 2280};
e95354ec
VZ
2281
2282#endif // wxHAVE_WIN32_MB2WC
2283
f7e98dee
RN
2284// ============================================================================
2285// Cocoa conversion classes
2286// ============================================================================
2287
2288#if defined(__WXCOCOA__)
2289
ef199164
DS
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
//     Strangely enough, Core Foundation uses UTF-32 internally quite
//     a bit - it's just not public (yet).
f7e98dee
RN
2293
2294#include <CoreFoundation/CFString.h>
2295#include <CoreFoundation/CFStringEncodingExt.h>
2296
2297CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 2298{
638357a0 2299 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ef199164
DS
2300
2301 switch (encoding)
ecd9653b 2302 {
ef199164
DS
2303 case wxFONTENCODING_DEFAULT :
2304 enc = CFStringGetSystemEncoding();
2305 break ;
2306
ecd9653b
WS
2307 case wxFONTENCODING_ISO8859_1 :
2308 enc = kCFStringEncodingISOLatin1 ;
2309 break ;
2310 case wxFONTENCODING_ISO8859_2 :
2311 enc = kCFStringEncodingISOLatin2;
2312 break ;
2313 case wxFONTENCODING_ISO8859_3 :
2314 enc = kCFStringEncodingISOLatin3 ;
2315 break ;
2316 case wxFONTENCODING_ISO8859_4 :
2317 enc = kCFStringEncodingISOLatin4;
2318 break ;
2319 case wxFONTENCODING_ISO8859_5 :
2320 enc = kCFStringEncodingISOLatinCyrillic;
2321 break ;
2322 case wxFONTENCODING_ISO8859_6 :
2323 enc = kCFStringEncodingISOLatinArabic;
2324 break ;
2325 case wxFONTENCODING_ISO8859_7 :
2326 enc = kCFStringEncodingISOLatinGreek;
2327 break ;
2328 case wxFONTENCODING_ISO8859_8 :
2329 enc = kCFStringEncodingISOLatinHebrew;
2330 break ;
2331 case wxFONTENCODING_ISO8859_9 :
2332 enc = kCFStringEncodingISOLatin5;
2333 break ;
2334 case wxFONTENCODING_ISO8859_10 :
2335 enc = kCFStringEncodingISOLatin6;
2336 break ;
2337 case wxFONTENCODING_ISO8859_11 :
2338 enc = kCFStringEncodingISOLatinThai;
2339 break ;
2340 case wxFONTENCODING_ISO8859_13 :
2341 enc = kCFStringEncodingISOLatin7;
2342 break ;
2343 case wxFONTENCODING_ISO8859_14 :
2344 enc = kCFStringEncodingISOLatin8;
2345 break ;
2346 case wxFONTENCODING_ISO8859_15 :
2347 enc = kCFStringEncodingISOLatin9;
2348 break ;
2349
2350 case wxFONTENCODING_KOI8 :
2351 enc = kCFStringEncodingKOI8_R;
2352 break ;
2353 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2354 enc = kCFStringEncodingDOSRussian;
2355 break ;
2356
2357// case wxFONTENCODING_BULGARIAN :
2358// enc = ;
2359// break ;
2360
2361 case wxFONTENCODING_CP437 :
ef199164 2362 enc = kCFStringEncodingDOSLatinUS ;
ecd9653b
WS
2363 break ;
2364 case wxFONTENCODING_CP850 :
2365 enc = kCFStringEncodingDOSLatin1;
2366 break ;
2367 case wxFONTENCODING_CP852 :
2368 enc = kCFStringEncodingDOSLatin2;
2369 break ;
2370 case wxFONTENCODING_CP855 :
2371 enc = kCFStringEncodingDOSCyrillic;
2372 break ;
2373 case wxFONTENCODING_CP866 :
ef199164 2374 enc = kCFStringEncodingDOSRussian ;
ecd9653b
WS
2375 break ;
2376 case wxFONTENCODING_CP874 :
2377 enc = kCFStringEncodingDOSThai;
2378 break ;
2379 case wxFONTENCODING_CP932 :
2380 enc = kCFStringEncodingDOSJapanese;
2381 break ;
2382 case wxFONTENCODING_CP936 :
ef199164 2383 enc = kCFStringEncodingDOSChineseSimplif ;
ecd9653b
WS
2384 break ;
2385 case wxFONTENCODING_CP949 :
2386 enc = kCFStringEncodingDOSKorean;
2387 break ;
2388 case wxFONTENCODING_CP950 :
2389 enc = kCFStringEncodingDOSChineseTrad;
2390 break ;
ecd9653b
WS
2391 case wxFONTENCODING_CP1250 :
2392 enc = kCFStringEncodingWindowsLatin2;
2393 break ;
2394 case wxFONTENCODING_CP1251 :
ef199164 2395 enc = kCFStringEncodingWindowsCyrillic ;
ecd9653b
WS
2396 break ;
2397 case wxFONTENCODING_CP1252 :
ef199164 2398 enc = kCFStringEncodingWindowsLatin1 ;
ecd9653b
WS
2399 break ;
2400 case wxFONTENCODING_CP1253 :
2401 enc = kCFStringEncodingWindowsGreek;
2402 break ;
2403 case wxFONTENCODING_CP1254 :
2404 enc = kCFStringEncodingWindowsLatin5;
2405 break ;
2406 case wxFONTENCODING_CP1255 :
ef199164 2407 enc = kCFStringEncodingWindowsHebrew ;
ecd9653b
WS
2408 break ;
2409 case wxFONTENCODING_CP1256 :
ef199164 2410 enc = kCFStringEncodingWindowsArabic ;
ecd9653b
WS
2411 break ;
2412 case wxFONTENCODING_CP1257 :
2413 enc = kCFStringEncodingWindowsBalticRim;
2414 break ;
638357a0
RN
2415// This only really encodes to UTF7 (if that) evidently
2416// case wxFONTENCODING_UTF7 :
2417// enc = kCFStringEncodingNonLossyASCII ;
2418// break ;
ecd9653b
WS
2419 case wxFONTENCODING_UTF8 :
2420 enc = kCFStringEncodingUTF8 ;
2421 break ;
2422 case wxFONTENCODING_EUC_JP :
2423 enc = kCFStringEncodingEUC_JP;
2424 break ;
2425 case wxFONTENCODING_UTF16 :
f7e98dee 2426 enc = kCFStringEncodingUnicode ;
ecd9653b 2427 break ;
f7e98dee
RN
2428 case wxFONTENCODING_MACROMAN :
2429 enc = kCFStringEncodingMacRoman ;
2430 break ;
2431 case wxFONTENCODING_MACJAPANESE :
2432 enc = kCFStringEncodingMacJapanese ;
2433 break ;
2434 case wxFONTENCODING_MACCHINESETRAD :
2435 enc = kCFStringEncodingMacChineseTrad ;
2436 break ;
2437 case wxFONTENCODING_MACKOREAN :
2438 enc = kCFStringEncodingMacKorean ;
2439 break ;
2440 case wxFONTENCODING_MACARABIC :
2441 enc = kCFStringEncodingMacArabic ;
2442 break ;
2443 case wxFONTENCODING_MACHEBREW :
2444 enc = kCFStringEncodingMacHebrew ;
2445 break ;
2446 case wxFONTENCODING_MACGREEK :
2447 enc = kCFStringEncodingMacGreek ;
2448 break ;
2449 case wxFONTENCODING_MACCYRILLIC :
2450 enc = kCFStringEncodingMacCyrillic ;
2451 break ;
2452 case wxFONTENCODING_MACDEVANAGARI :
2453 enc = kCFStringEncodingMacDevanagari ;
2454 break ;
2455 case wxFONTENCODING_MACGURMUKHI :
2456 enc = kCFStringEncodingMacGurmukhi ;
2457 break ;
2458 case wxFONTENCODING_MACGUJARATI :
2459 enc = kCFStringEncodingMacGujarati ;
2460 break ;
2461 case wxFONTENCODING_MACORIYA :
2462 enc = kCFStringEncodingMacOriya ;
2463 break ;
2464 case wxFONTENCODING_MACBENGALI :
2465 enc = kCFStringEncodingMacBengali ;
2466 break ;
2467 case wxFONTENCODING_MACTAMIL :
2468 enc = kCFStringEncodingMacTamil ;
2469 break ;
2470 case wxFONTENCODING_MACTELUGU :
2471 enc = kCFStringEncodingMacTelugu ;
2472 break ;
2473 case wxFONTENCODING_MACKANNADA :
2474 enc = kCFStringEncodingMacKannada ;
2475 break ;
2476 case wxFONTENCODING_MACMALAJALAM :
2477 enc = kCFStringEncodingMacMalayalam ;
2478 break ;
2479 case wxFONTENCODING_MACSINHALESE :
2480 enc = kCFStringEncodingMacSinhalese ;
2481 break ;
2482 case wxFONTENCODING_MACBURMESE :
2483 enc = kCFStringEncodingMacBurmese ;
2484 break ;
2485 case wxFONTENCODING_MACKHMER :
2486 enc = kCFStringEncodingMacKhmer ;
2487 break ;
2488 case wxFONTENCODING_MACTHAI :
2489 enc = kCFStringEncodingMacThai ;
2490 break ;
2491 case wxFONTENCODING_MACLAOTIAN :
2492 enc = kCFStringEncodingMacLaotian ;
2493 break ;
2494 case wxFONTENCODING_MACGEORGIAN :
2495 enc = kCFStringEncodingMacGeorgian ;
2496 break ;
2497 case wxFONTENCODING_MACARMENIAN :
2498 enc = kCFStringEncodingMacArmenian ;
2499 break ;
2500 case wxFONTENCODING_MACCHINESESIMP :
2501 enc = kCFStringEncodingMacChineseSimp ;
2502 break ;
2503 case wxFONTENCODING_MACTIBETAN :
2504 enc = kCFStringEncodingMacTibetan ;
2505 break ;
2506 case wxFONTENCODING_MACMONGOLIAN :
2507 enc = kCFStringEncodingMacMongolian ;
2508 break ;
2509 case wxFONTENCODING_MACETHIOPIC :
2510 enc = kCFStringEncodingMacEthiopic ;
2511 break ;
2512 case wxFONTENCODING_MACCENTRALEUR :
2513 enc = kCFStringEncodingMacCentralEurRoman ;
2514 break ;
2515 case wxFONTENCODING_MACVIATNAMESE :
2516 enc = kCFStringEncodingMacVietnamese ;
2517 break ;
2518 case wxFONTENCODING_MACARABICEXT :
2519 enc = kCFStringEncodingMacExtArabic ;
2520 break ;
2521 case wxFONTENCODING_MACSYMBOL :
2522 enc = kCFStringEncodingMacSymbol ;
2523 break ;
2524 case wxFONTENCODING_MACDINGBATS :
2525 enc = kCFStringEncodingMacDingbats ;
2526 break ;
2527 case wxFONTENCODING_MACTURKISH :
2528 enc = kCFStringEncodingMacTurkish ;
2529 break ;
2530 case wxFONTENCODING_MACCROATIAN :
2531 enc = kCFStringEncodingMacCroatian ;
2532 break ;
2533 case wxFONTENCODING_MACICELANDIC :
2534 enc = kCFStringEncodingMacIcelandic ;
2535 break ;
2536 case wxFONTENCODING_MACROMANIAN :
2537 enc = kCFStringEncodingMacRomanian ;
2538 break ;
2539 case wxFONTENCODING_MACCELTIC :
2540 enc = kCFStringEncodingMacCeltic ;
2541 break ;
2542 case wxFONTENCODING_MACGAELIC :
2543 enc = kCFStringEncodingMacGaelic ;
2544 break ;
ecd9653b
WS
2545// case wxFONTENCODING_MACKEYBOARD :
2546// enc = kCFStringEncodingMacKeyboardGlyphs ;
2547// break ;
ef199164 2548
ecd9653b
WS
2549 default :
2550 // because gcc is picky
2551 break ;
ef199164
DS
2552 }
2553
ecd9653b 2554 return enc ;
f7e98dee
RN
2555}
2556
f7e98dee
RN
2557class wxMBConv_cocoa : public wxMBConv
2558{
2559public:
2560 wxMBConv_cocoa()
2561 {
2562 Init(CFStringGetSystemEncoding()) ;
2563 }
2564
d36c9347
VZ
2565 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2566 {
2567 m_encoding = conv.m_encoding;
2568 }
2569
a6900d10 2570#if wxUSE_FONTMAP
f7e98dee
RN
2571 wxMBConv_cocoa(const wxChar* name)
2572 {
267e11c5 2573 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
f7e98dee 2574 }
a6900d10 2575#endif
f7e98dee
RN
2576
2577 wxMBConv_cocoa(wxFontEncoding encoding)
2578 {
2579 Init( wxCFStringEncFromFontEnc(encoding) );
2580 }
2581
2582 ~wxMBConv_cocoa()
2583 {
2584 }
2585
2586 void Init( CFStringEncoding encoding)
2587 {
638357a0 2588 m_encoding = encoding ;
f7e98dee
RN
2589 }
2590
2591 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2592 {
2593 wxASSERT(szUnConv);
ecd9653b 2594
638357a0
RN
2595 CFStringRef theString = CFStringCreateWithBytes (
2596 NULL, //the allocator
2597 (const UInt8*)szUnConv,
2598 strlen(szUnConv),
2599 m_encoding,
2600 false //no BOM/external representation
f7e98dee
RN
2601 );
2602
2603 wxASSERT(theString);
2604
638357a0
RN
2605 size_t nOutLength = CFStringGetLength(theString);
2606
2607 if (szOut == NULL)
f7e98dee 2608 {
f7e98dee 2609 CFRelease(theString);
638357a0 2610 return nOutLength;
f7e98dee 2611 }
ecd9653b 2612
638357a0 2613 CFRange theRange = { 0, nOutSize };
ecd9653b 2614
638357a0
RN
2615#if SIZEOF_WCHAR_T == 4
2616 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2617#endif
3698ae71 2618
f7e98dee 2619 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
3698ae71 2620
f7e98dee 2621 CFRelease(theString);
ecd9653b 2622
ef199164 2623 szUniCharBuffer[nOutLength] = '\0';
f7e98dee
RN
2624
2625#if SIZEOF_WCHAR_T == 4
ef199164
DS
2626 wxMBConvUTF16 converter;
2627 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2628 delete [] szUniCharBuffer;
f7e98dee 2629#endif
3698ae71 2630
638357a0 2631 return nOutLength;
f7e98dee
RN
2632 }
2633
2634 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2635 {
638357a0 2636 wxASSERT(szUnConv);
3698ae71 2637
f7e98dee 2638 size_t nRealOutSize;
638357a0 2639 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 2640 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 2641
f7e98dee 2642#if SIZEOF_WCHAR_T == 4
d9d488cf 2643 wxMBConvUTF16 converter ;
ef199164
DS
2644 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2645 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2646 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
f7e98dee 2647 nBufSize /= sizeof(UniChar);
f7e98dee
RN
2648#endif
2649
2650 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2651 NULL, //allocator
2652 szUniBuffer,
2653 nBufSize,
638357a0 2654 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 2655 );
ecd9653b 2656
f7e98dee 2657 wxASSERT(theString);
ecd9653b 2658
f7e98dee 2659 //Note that CER puts a BOM when converting to unicode
638357a0
RN
2660 //so we check and use getchars instead in that case
2661 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 2662 {
638357a0
RN
2663 if (szOut != NULL)
2664 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
3698ae71 2665
638357a0
RN
2666 nRealOutSize = CFStringGetLength(theString) + 1;
2667 }
2668 else
2669 {
2670 CFStringGetBytes(
2671 theString,
2672 CFRangeMake(0, CFStringGetLength(theString)),
2673 m_encoding,
2674 0, //what to put in characters that can't be converted -
2675 //0 tells CFString to return NULL if it meets such a character
2676 false, //not an external representation
2677 (UInt8*) szOut,
3698ae71 2678 nOutSize,
638357a0
RN
2679 (CFIndex*) &nRealOutSize
2680 );
f7e98dee 2681 }
ecd9653b 2682
638357a0 2683 CFRelease(theString);
ecd9653b 2684
638357a0
RN
2685#if SIZEOF_WCHAR_T == 4
2686 delete[] szUniBuffer;
2687#endif
ecd9653b 2688
f7e98dee
RN
2689 return nRealOutSize - 1;
2690 }
2691
d36c9347
VZ
2692 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2693
f7e98dee 2694 bool IsOk() const
ecd9653b 2695 {
3698ae71 2696 return m_encoding != kCFStringEncodingInvalidId &&
638357a0 2697 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
2698 }
2699
2700private:
638357a0 2701 CFStringEncoding m_encoding ;
f7e98dee
RN
2702};
2703
2704#endif // defined(__WXCOCOA__)
2705
335d31e0
SC
2706// ============================================================================
2707// Mac conversion classes
2708// ============================================================================
2709
2710#if defined(__WXMAC__) && defined(TARGET_CARBON)
2711
2712class wxMBConv_mac : public wxMBConv
2713{
2714public:
2715 wxMBConv_mac()
2716 {
2717 Init(CFStringGetSystemEncoding()) ;
2718 }
2719
d36c9347
VZ
2720 wxMBConv_mac(const wxMBConv_mac& conv)
2721 {
2722 Init(conv.m_char_encoding);
2723 }
2724
2d1659cf 2725#if wxUSE_FONTMAP
335d31e0
SC
2726 wxMBConv_mac(const wxChar* name)
2727 {
ef199164 2728 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
335d31e0 2729 }
2d1659cf 2730#endif
335d31e0
SC
2731
2732 wxMBConv_mac(wxFontEncoding encoding)
2733 {
d775fa82
WS
2734 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2735 }
2736
2737 ~wxMBConv_mac()
2738 {
2739 OSStatus status = noErr ;
2740 status = TECDisposeConverter(m_MB2WC_converter);
2741 status = TECDisposeConverter(m_WC2MB_converter);
2742 }
2743
2744
2745 void Init( TextEncodingBase encoding)
2746 {
2747 OSStatus status = noErr ;
2748 m_char_encoding = encoding ;
ef199164 2749 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
d775fa82
WS
2750
2751 status = TECCreateConverter(&m_MB2WC_converter,
2752 m_char_encoding,
2753 m_unicode_encoding);
2754 status = TECCreateConverter(&m_WC2MB_converter,
2755 m_unicode_encoding,
2756 m_char_encoding);
2757 }
2758
335d31e0
SC
2759 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2760 {
d775fa82
WS
2761 OSStatus status = noErr ;
2762 ByteCount byteOutLen ;
9088c87b 2763 ByteCount byteInLen = strlen(psz) + 1;
d775fa82
WS
2764 wchar_t *tbuf = NULL ;
2765 UniChar* ubuf = NULL ;
2766 size_t res = 0 ;
2767
2768 if (buf == NULL)
2769 {
ef199164
DS
2770 // Apple specs say at least 32
2771 n = wxMax( 32, byteInLen ) ;
2772 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
d775fa82 2773 }
ef199164 2774
d775fa82 2775 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
ef199164 2776
f3a355ce 2777#if SIZEOF_WCHAR_T == 4
d775fa82 2778 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2779#else
d775fa82 2780 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2781#endif
ef199164
DS
2782
2783 status = TECConvertText(
2784 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2785 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2786
f3a355ce 2787#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2788 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2789 // is not properly terminated we get random characters at the end
2790 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d9d488cf 2791 wxMBConvUTF16 converter ;
ef199164 2792 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
d775fa82 2793 free( ubuf ) ;
f3a355ce 2794#else
d775fa82 2795 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2796#endif
ef199164 2797
d775fa82
WS
2798 if ( buf == NULL )
2799 free(tbuf) ;
335d31e0 2800
335d31e0
SC
2801 if ( buf && res < n)
2802 buf[res] = 0;
2803
d775fa82 2804 return res ;
335d31e0
SC
2805 }
2806
2807 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2808 {
2809 OSStatus status = noErr ;
2810 ByteCount byteOutLen ;
2811 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2812
2813 char *tbuf = NULL ;
2814
2815 if (buf == NULL)
2816 {
ef199164
DS
2817 // Apple specs say at least 32
2818 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2819 tbuf = (char*) malloc( n ) ;
2820 }
2821
2822 ByteCount byteBufferLen = n ;
2823 UniChar* ubuf = NULL ;
ef199164 2824
f3a355ce 2825#if SIZEOF_WCHAR_T == 4
d9d488cf 2826 wxMBConvUTF16 converter ;
ef199164 2827 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
d775fa82
WS
2828 byteInLen = unicharlen ;
2829 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
ef199164 2830 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
f3a355ce 2831#else
d775fa82 2832 ubuf = (UniChar*) psz ;
f3a355ce 2833#endif
ef199164
DS
2834
2835 status = TECConvertText(
2836 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2837 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2838
f3a355ce 2839#if SIZEOF_WCHAR_T == 4
d775fa82 2840 free( ubuf ) ;
f3a355ce 2841#endif
ef199164 2842
d775fa82
WS
2843 if ( buf == NULL )
2844 free(tbuf) ;
335d31e0 2845
d775fa82 2846 size_t res = byteOutLen ;
335d31e0 2847 if ( buf && res < n)
638357a0 2848 {
335d31e0 2849 buf[res] = 0;
3698ae71 2850
638357a0
RN
2851 //we need to double-trip to verify it didn't insert any ? in place
2852 //of bogus characters
2853 wxWCharBuffer wcBuf(n);
2854 size_t pszlen = wxWcslen(psz);
467e0479 2855 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
638357a0
RN
2856 wxWcslen(wcBuf) != pszlen ||
2857 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2858 {
2859 // we didn't obtain the same thing we started from, hence
2860 // the conversion was lossy and we consider that it failed
467e0479 2861 return wxCONV_FAILED;
638357a0
RN
2862 }
2863 }
335d31e0 2864
d775fa82 2865 return res ;
335d31e0
SC
2866 }
2867
d3478e2c 2868 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
d36c9347 2869
335d31e0 2870 bool IsOk() const
ef199164 2871 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
335d31e0
SC
2872
2873private:
ef199164
DS
2874 TECObjectRef m_MB2WC_converter;
2875 TECObjectRef m_WC2MB_converter;
d775fa82 2876
ef199164
DS
2877 TextEncodingBase m_char_encoding;
2878 TextEncodingBase m_unicode_encoding;
335d31e0
SC
2879};
2880
2881#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2882
36acb880
VZ
2883// ============================================================================
2884// wxEncodingConverter based conversion classes
2885// ============================================================================
2886
1e6feb95 2887#if wxUSE_FONTMAP
1cd52418 2888
e95354ec 2889class wxMBConv_wxwin : public wxMBConv
1cd52418 2890{
8b04d4c4
VZ
2891private:
2892 void Init()
2893 {
2894 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2895 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2896 }
2897
6001e347 2898public:
f1339c56
RR
2899 // temporarily just use wxEncodingConverter stuff,
2900 // so that it works while a better implementation is built
e95354ec 2901 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2902 {
2903 if (name)
267e11c5 2904 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2905 else
2906 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2907
8b04d4c4
VZ
2908 Init();
2909 }
2910
e95354ec 2911 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2912 {
2913 m_enc = enc;
2914
2915 Init();
f1339c56 2916 }
dccce9ea 2917
bde4baac 2918 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2919 {
2920 size_t inbuf = strlen(psz);
dccce9ea 2921 if (buf)
c643a977 2922 {
ef199164 2923 if (!m2w.Convert(psz, buf))
467e0479 2924 return wxCONV_FAILED;
c643a977 2925 }
f1339c56
RR
2926 return inbuf;
2927 }
dccce9ea 2928
bde4baac 2929 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2930 {
f8d791e0 2931 const size_t inbuf = wxWcslen(psz);
f1339c56 2932 if (buf)
c643a977 2933 {
ef199164 2934 if (!w2m.Convert(psz, buf))
467e0479 2935 return wxCONV_FAILED;
c643a977 2936 }
dccce9ea 2937
f1339c56
RR
2938 return inbuf;
2939 }
dccce9ea 2940
7ef3ab50 2941 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2942 {
2943 switch ( m_enc )
2944 {
2945 case wxFONTENCODING_UTF16BE:
2946 case wxFONTENCODING_UTF16LE:
c1464d9d 2947 return 2;
eec47cc6
VZ
2948
2949 case wxFONTENCODING_UTF32BE:
2950 case wxFONTENCODING_UTF32LE:
c1464d9d 2951 return 4;
eec47cc6
VZ
2952
2953 default:
c1464d9d 2954 return 1;
eec47cc6
VZ
2955 }
2956 }
2957
d36c9347
VZ
2958 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2959
7ef3ab50
VZ
2960 bool IsOk() const { return m_ok; }
2961
2962public:
2963 wxFontEncoding m_enc;
2964 wxEncodingConverter m2w, w2m;
2965
2966private:
cafbf6fb
VZ
2967 // were we initialized successfully?
2968 bool m_ok;
fc7a2a60 2969
e95354ec 2970 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2971};
6001e347 2972
8f115891
MW
2973// make the constructors available for unit testing
2974WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2975{
2976 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2977 if ( !result->IsOk() )
2978 {
2979 delete result;
2980 return 0;
2981 }
ef199164 2982
8f115891
MW
2983 return result;
2984}
2985
1e6feb95
VZ
2986#endif // wxUSE_FONTMAP
2987
36acb880
VZ
2988// ============================================================================
2989// wxCSConv implementation
2990// ============================================================================
2991
8b04d4c4 2992void wxCSConv::Init()
6001e347 2993{
e95354ec
VZ
2994 m_name = NULL;
2995 m_convReal = NULL;
2996 m_deferred = true;
2997}
2998
8b04d4c4
VZ
2999wxCSConv::wxCSConv(const wxChar *charset)
3000{
3001 Init();
82713003 3002
e95354ec
VZ
3003 if ( charset )
3004 {
e95354ec
VZ
3005 SetName(charset);
3006 }
bda3d86a 3007
e4277538
VZ
3008#if wxUSE_FONTMAP
3009 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3010#else
bda3d86a 3011 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 3012#endif
6001e347
RR
3013}
3014
8b04d4c4
VZ
3015wxCSConv::wxCSConv(wxFontEncoding encoding)
3016{
bda3d86a 3017 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
3018 {
3019 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3020
3021 encoding = wxFONTENCODING_SYSTEM;
3022 }
3023
8b04d4c4
VZ
3024 Init();
3025
bda3d86a 3026 m_encoding = encoding;
8b04d4c4
VZ
3027}
3028
6001e347
RR
3029wxCSConv::~wxCSConv()
3030{
65e50848
JS
3031 Clear();
3032}
3033
54380f29 3034wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 3035 : wxMBConv()
54380f29 3036{
8b04d4c4
VZ
3037 Init();
3038
54380f29 3039 SetName(conv.m_name);
8b04d4c4 3040 m_encoding = conv.m_encoding;
54380f29
GD
3041}
3042
3043wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3044{
3045 Clear();
8b04d4c4 3046
54380f29 3047 SetName(conv.m_name);
8b04d4c4
VZ
3048 m_encoding = conv.m_encoding;
3049
54380f29
GD
3050 return *this;
3051}
3052
65e50848
JS
3053void wxCSConv::Clear()
3054{
8b04d4c4 3055 free(m_name);
e95354ec 3056 delete m_convReal;
8b04d4c4 3057
65e50848 3058 m_name = NULL;
e95354ec 3059 m_convReal = NULL;
6001e347
RR
3060}
3061
3062void wxCSConv::SetName(const wxChar *charset)
3063{
f1339c56
RR
3064 if (charset)
3065 {
3066 m_name = wxStrdup(charset);
e95354ec 3067 m_deferred = true;
f1339c56 3068 }
6001e347
RR
3069}
3070
8b3eb85d
VZ
3071#if wxUSE_FONTMAP
3072#include "wx/hashmap.h"
3073
3074WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 3075 wxEncodingNameCache );
8b3eb85d
VZ
3076
3077static wxEncodingNameCache gs_nameCache;
3078#endif
3079
e95354ec
VZ
3080wxMBConv *wxCSConv::DoCreate() const
3081{
ce6f8d6f
VZ
3082#if wxUSE_FONTMAP
3083 wxLogTrace(TRACE_STRCONV,
3084 wxT("creating conversion for %s"),
3085 (m_name ? m_name
3086 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3087#endif // wxUSE_FONTMAP
3088
c547282d
VZ
3089 // check for the special case of ASCII or ISO8859-1 charset: as we have
3090 // special knowledge of it anyhow, we don't need to create a special
3091 // conversion object
e4277538
VZ
3092 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3093 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 3094 {
e95354ec
VZ
3095 // don't convert at all
3096 return NULL;
3097 }
dccce9ea 3098
e95354ec
VZ
3099 // we trust OS to do conversion better than we can so try external
3100 // conversion methods first
3101 //
3102 // the full order is:
3103 // 1. OS conversion (iconv() under Unix or Win32 API)
3104 // 2. hard coded conversions for UTF
3105 // 3. wxEncodingConverter as fall back
3106
3107 // step (1)
3108#ifdef HAVE_ICONV
c547282d 3109#if !wxUSE_FONTMAP
e95354ec 3110 if ( m_name )
c547282d 3111#endif // !wxUSE_FONTMAP
e95354ec 3112 {
c547282d 3113 wxString name(m_name);
8b3eb85d
VZ
3114 wxFontEncoding encoding(m_encoding);
3115
3116 if ( !name.empty() )
3117 {
3118 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3119 if ( conv->IsOk() )
3120 return conv;
3121
3122 delete conv;
c547282d
VZ
3123
3124#if wxUSE_FONTMAP
8b3eb85d
VZ
3125 encoding =
3126 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
c547282d 3127#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3128 }
3129#if wxUSE_FONTMAP
3130 {
3131 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3132 if ( it != gs_nameCache.end() )
3133 {
3134 if ( it->second.empty() )
3135 return NULL;
c547282d 3136
8b3eb85d
VZ
3137 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3138 if ( conv->IsOk() )
3139 return conv;
e95354ec 3140
8b3eb85d
VZ
3141 delete conv;
3142 }
3143
3144 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3145
3146 for ( ; *names; ++names )
3147 {
3148 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3149 if ( conv->IsOk() )
3150 {
3151 gs_nameCache[encoding] = *names;
3152 return conv;
3153 }
3154
3155 delete conv;
3156 }
3157
40711af8 3158 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d
VZ
3159 }
3160#endif // wxUSE_FONTMAP
e95354ec
VZ
3161 }
3162#endif // HAVE_ICONV
3163
3164#ifdef wxHAVE_WIN32_MB2WC
3165 {
7608a683 3166#if wxUSE_FONTMAP
e95354ec
VZ
3167 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3168 : new wxMBConv_win32(m_encoding);
3169 if ( conv->IsOk() )
3170 return conv;
3171
3172 delete conv;
7608a683
WS
3173#else
3174 return NULL;
3175#endif
e95354ec
VZ
3176 }
3177#endif // wxHAVE_WIN32_MB2WC
ef199164 3178
d775fa82
WS
3179#if defined(__WXMAC__)
3180 {
5c3c8676 3181 // leave UTF16 and UTF32 to the built-ins of wx
3698ae71 3182 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
5c3c8676 3183 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
d775fa82 3184 {
2d1659cf 3185#if wxUSE_FONTMAP
d775fa82
WS
3186 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3187 : new wxMBConv_mac(m_encoding);
2d1659cf
RN
3188#else
3189 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3190#endif
d775fa82 3191 if ( conv->IsOk() )
f7e98dee
RN
3192 return conv;
3193
3194 delete conv;
3195 }
3196 }
3197#endif
ef199164 3198
f7e98dee
RN
3199#if defined(__WXCOCOA__)
3200 {
3201 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3202 {
a6900d10 3203#if wxUSE_FONTMAP
f7e98dee
RN
3204 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3205 : new wxMBConv_cocoa(m_encoding);
a6900d10
RN
3206#else
3207 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3208#endif
ef199164 3209
f7e98dee 3210 if ( conv->IsOk() )
d775fa82
WS
3211 return conv;
3212
3213 delete conv;
3214 }
335d31e0
SC
3215 }
3216#endif
e95354ec
VZ
3217 // step (2)
3218 wxFontEncoding enc = m_encoding;
3219#if wxUSE_FONTMAP
c547282d
VZ
3220 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3221 {
3222 // use "false" to suppress interactive dialogs -- we can be called from
3223 // anywhere and popping up a dialog from here is the last thing we want to
3224 // do
267e11c5 3225 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3226 }
e95354ec
VZ
3227#endif // wxUSE_FONTMAP
3228
3229 switch ( enc )
3230 {
3231 case wxFONTENCODING_UTF7:
3232 return new wxMBConvUTF7;
3233
3234 case wxFONTENCODING_UTF8:
3235 return new wxMBConvUTF8;
3236
e95354ec
VZ
3237 case wxFONTENCODING_UTF16BE:
3238 return new wxMBConvUTF16BE;
3239
3240 case wxFONTENCODING_UTF16LE:
3241 return new wxMBConvUTF16LE;
3242
e95354ec
VZ
3243 case wxFONTENCODING_UTF32BE:
3244 return new wxMBConvUTF32BE;
3245
3246 case wxFONTENCODING_UTF32LE:
3247 return new wxMBConvUTF32LE;
3248
3249 default:
3250 // nothing to do but put here to suppress gcc warnings
ef199164 3251 break;
e95354ec
VZ
3252 }
3253
3254 // step (3)
3255#if wxUSE_FONTMAP
3256 {
3257 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3258 : new wxMBConv_wxwin(m_encoding);
3259 if ( conv->IsOk() )
3260 return conv;
3261
3262 delete conv;
3263 }
3264#endif // wxUSE_FONTMAP
3265
a58d4f4d
VS
3266 // NB: This is a hack to prevent deadlock. What could otherwise happen
3267 // in Unicode build: wxConvLocal creation ends up being here
3268 // because of some failure and logs the error. But wxLog will try to
3269 // attach timestamp, for which it will need wxConvLocal (to convert
3270 // time to char* and then wchar_t*), but that fails, tries to log
3271 // error, but wxLog has a (already locked) critical section that
3272 // guards static buffer.
3273 static bool alreadyLoggingError = false;
3274 if (!alreadyLoggingError)
3275 {
3276 alreadyLoggingError = true;
3277 wxLogError(_("Cannot convert from the charset '%s'!"),
3278 m_name ? m_name
e95354ec
VZ
3279 :
3280#if wxUSE_FONTMAP
267e11c5 3281 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
e95354ec
VZ
3282#else // !wxUSE_FONTMAP
3283 wxString::Format(_("encoding %s"), m_encoding).c_str()
3284#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3285 );
ef199164 3286
a58d4f4d
VS
3287 alreadyLoggingError = false;
3288 }
e95354ec
VZ
3289
3290 return NULL;
3291}
3292
3293void wxCSConv::CreateConvIfNeeded() const
3294{
3295 if ( m_deferred )
3296 {
3297 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
3298
3299#if wxUSE_INTL
3300 // if we don't have neither the name nor the encoding, use the default
3301 // encoding for this system
3302 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3303 {
4d312c22 3304 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
3305 }
3306#endif // wxUSE_INTL
3307
e95354ec
VZ
3308 self->m_convReal = DoCreate();
3309 self->m_deferred = false;
6001e347 3310 }
6001e347
RR
3311}
3312
3313size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3314{
e95354ec 3315 CreateConvIfNeeded();
dccce9ea 3316
e95354ec
VZ
3317 if (m_convReal)
3318 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3319
3320 // latin-1 (direct)
4def3b35 3321 size_t len = strlen(psz);
dccce9ea 3322
f1339c56
RR
3323 if (buf)
3324 {
4def3b35 3325 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3326 buf[c] = (unsigned char)(psz[c]);
3327 }
dccce9ea 3328
f1339c56 3329 return len;
6001e347
RR
3330}
3331
3332size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3333{
e95354ec 3334 CreateConvIfNeeded();
dccce9ea 3335
e95354ec
VZ
3336 if (m_convReal)
3337 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3338
f1339c56 3339 // latin-1 (direct)
f8d791e0 3340 const size_t len = wxWcslen(psz);
f1339c56
RR
3341 if (buf)
3342 {
4def3b35 3343 for (size_t c = 0; c <= len; c++)
24642831
VS
3344 {
3345 if (psz[c] > 0xFF)
467e0479 3346 return wxCONV_FAILED;
ef199164 3347
907173e5 3348 buf[c] = (char)psz[c];
24642831
VS
3349 }
3350 }
3351 else
3352 {
3353 for (size_t c = 0; c <= len; c++)
3354 {
3355 if (psz[c] > 0xFF)
467e0479 3356 return wxCONV_FAILED;
24642831 3357 }
f1339c56 3358 }
dccce9ea 3359
f1339c56 3360 return len;
6001e347
RR
3361}
3362
7ef3ab50 3363size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3364{
3365 CreateConvIfNeeded();
3366
3367 if ( m_convReal )
3368 {
7ef3ab50 3369 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3370 }
3371
c1464d9d 3372 return 1;
eec47cc6
VZ
3373}
3374
bde4baac
VZ
3375// ----------------------------------------------------------------------------
3376// globals
3377// ----------------------------------------------------------------------------
3378
3379#ifdef __WINDOWS__
3380 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
3381#elif defined(__WXMAC__) && !defined(__MACH__)
3382 static wxMBConv_mac wxConvLibcObj ;
bde4baac 3383#else
dcc8fac0 3384 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
3385#endif
3386
3387static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3388static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3389static wxMBConvUTF7 wxConvUTF7Obj;
3390static wxMBConvUTF8 wxConvUTF8Obj;
c12b7f79 3391
bde4baac
VZ
3392WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3393WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3394WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3395WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3396WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3397WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
f5a1953b
VZ
3398WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3399#ifdef __WXOSX__
ea8ce907 3400 wxConvUTF8Obj;
f5a1953b 3401#else
ea8ce907 3402 wxConvLibcObj;
f5a1953b
VZ
3403#endif
3404
bde4baac
VZ
3405
3406#else // !wxUSE_WCHAR_T
3407
3408// stand-ins in absence of wchar_t
3409WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3410 wxConvISO8859_1,
3411 wxConvLocal,
3412 wxConvUTF8;
3413
3414#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T