]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Code symetry for both directions of trimming towards fixing bug #1472688.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
373658eb
VZ
18#ifndef WX_PRECOMP
19 #include "wx/intl.h"
20 #include "wx/log.h"
de6185e2 21 #include "wx/utils.h"
ef199164 22#endif
373658eb 23
bde4baac
VZ
24#include "wx/strconv.h"
25
26#if wxUSE_WCHAR_T
27
7608a683 28#ifdef __WINDOWS__
532d575b 29 #include "wx/msw/private.h"
13dd924a 30 #include "wx/msw/missing.h"
0a1c1e62
GRG
31#endif
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec
VZ
41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #define wxHAVE_WIN32_MB2WC
ef199164 43#endif
e95354ec 44
6001e347 45#ifdef __SALFORDC__
373658eb 46 #include <clib.h>
6001e347
RR
47#endif
48
b040e242 49#ifdef HAVE_ICONV
373658eb 50 #include <iconv.h>
b1d547eb 51 #include "wx/thread.h"
1cd52418 52#endif
1cd52418 53
373658eb
VZ
54#include "wx/encconv.h"
55#include "wx/fontmap.h"
56
335d31e0 57#ifdef __WXMAC__
40ba2f3b 58#ifndef __DARWIN__
4227afa4
SC
59#include <ATSUnicode.h>
60#include <TextCommon.h>
61#include <TextEncodingConverter.h>
40ba2f3b 62#endif
335d31e0 63
ef199164
DS
64// includes Mac headers
65#include "wx/mac/private.h"
335d31e0 66#endif
ce6f8d6f 67
ef199164 68
ce6f8d6f
VZ
69#define TRACE_STRCONV _T("strconv")
70
467e0479
VZ
71// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
72// be 4 bytes
4948c2b6 73#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
74 #define WC_UTF16
75#endif
76
ef199164 77
373658eb
VZ
78// ============================================================================
79// implementation
80// ============================================================================
81
69373110
VZ
// helper function of cMB2WC(): returns true if at least one of the n bytes
// starting at p is not NUL (and hence false if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
90
373658eb 91// ----------------------------------------------------------------------------
467e0479 92// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 93// ----------------------------------------------------------------------------
6001e347 94
c91830cb 95static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 96{
ef199164 97 if (input <= 0xffff)
4def3b35 98 {
999836aa
VZ
99 if (output)
100 *output = (wxUint16) input;
ef199164 101
4def3b35 102 return 1;
dccce9ea 103 }
ef199164 104 else if (input >= 0x110000)
4def3b35 105 {
467e0479 106 return wxCONV_FAILED;
dccce9ea
VZ
107 }
108 else
4def3b35 109 {
dccce9ea 110 if (output)
4def3b35 111 {
ef199164
DS
112 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
113 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 114 }
ef199164 115
4def3b35 116 return 2;
1cd52418 117 }
1cd52418
OK
118}
119
c91830cb 120static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 121{
ef199164 122 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
123 {
124 output = *input;
125 return 1;
dccce9ea 126 }
ef199164 127 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
128 {
129 output = *input;
467e0479 130 return wxCONV_FAILED;
dccce9ea
VZ
131 }
132 else
4def3b35
VS
133 {
134 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
135 return 2;
136 }
1cd52418
OK
137}
138
467e0479 139#ifdef WC_UTF16
35d11700
VZ
140 typedef wchar_t wxDecodeSurrogate_t;
141#else // !WC_UTF16
142 typedef wxUint16 wxDecodeSurrogate_t;
143#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
144
145// returns the next UTF-32 character from the wchar_t buffer and advances the
146// pointer to the character after this one
147//
148// if an invalid character is found, *pSrc is set to NULL, the caller must
149// check for this
35d11700 150static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
151{
152 wxUint32 out;
8d3dd069
VZ
153 const size_t
154 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
155 if ( n == wxCONV_FAILED )
156 *pSrc = NULL;
157 else
158 *pSrc += n;
159
160 return out;
161}
162
f6bcfd97 163// ----------------------------------------------------------------------------
6001e347 164// wxMBConv
f6bcfd97 165// ----------------------------------------------------------------------------
2c53a80a 166
483b0434
VZ
167size_t
168wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
169 const char *src, size_t srcLen) const
6001e347 170{
483b0434
VZ
171 // although new conversion classes are supposed to implement this function
172 // directly, the existins ones only implement the old MB2WC() and so, to
173 // avoid to have to rewrite all conversion classes at once, we provide a
174 // default (but not efficient) implementation of this one in terms of the
175 // old function by copying the input to ensure that it's NUL-terminated and
176 // then using MB2WC() to convert it
6001e347 177
483b0434
VZ
178 // the number of chars [which would be] written to dst [if it were not NULL]
179 size_t dstWritten = 0;
eec47cc6 180
c1464d9d 181 // the number of NULs terminating this string
a78c43f1 182 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 183
c1464d9d
VZ
184 // if we were not given the input size we just have to assume that the
185 // string is properly terminated as we have no way of knowing how long it
186 // is anyhow, but if we do have the size check whether there are enough
187 // NULs at the end
483b0434
VZ
188 wxCharBuffer bufTmp;
189 const char *srcEnd;
467e0479 190 if ( srcLen != wxNO_LEN )
eec47cc6 191 {
c1464d9d 192 // we need to know how to find the end of this string
7ef3ab50 193 nulLen = GetMBNulLen();
483b0434
VZ
194 if ( nulLen == wxCONV_FAILED )
195 return wxCONV_FAILED;
e4e3bbb4 196
c1464d9d 197 // if there are enough NULs we can avoid the copy
483b0434 198 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
199 {
200 // make a copy in order to properly NUL-terminate the string
483b0434 201 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 202 char * const p = bufTmp.data();
483b0434
VZ
203 memcpy(p, src, srcLen);
204 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 205 *s = '\0';
483b0434
VZ
206
207 src = bufTmp;
eec47cc6 208 }
e4e3bbb4 209
483b0434
VZ
210 srcEnd = src + srcLen;
211 }
212 else // quit after the first loop iteration
213 {
214 srcEnd = NULL;
215 }
e4e3bbb4 216
483b0434 217 for ( ;; )
eec47cc6 218 {
c1464d9d 219 // try to convert the current chunk
483b0434 220 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
221 if ( lenChunk == wxCONV_FAILED )
222 return wxCONV_FAILED;
e4e3bbb4 223
467e0479 224 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 225
483b0434 226 dstWritten += lenChunk;
f5fb6871 227
467e0479
VZ
228 if ( lenChunk == 1 )
229 {
230 // nothing left in the input string, conversion succeeded
231 break;
232 }
233
483b0434
VZ
234 if ( dst )
235 {
236 if ( dstWritten > dstLen )
237 return wxCONV_FAILED;
238
830f8f11 239 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
240 return wxCONV_FAILED;
241
242 dst += lenChunk;
243 }
c1464d9d 244
483b0434 245 if ( !srcEnd )
c1464d9d 246 {
467e0479
VZ
247 // we convert just one chunk in this case as this is the entire
248 // string anyhow
c1464d9d
VZ
249 break;
250 }
eec47cc6
VZ
251
252 // advance the input pointer past the end of this chunk
483b0434 253 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
254 {
255 // notice that we must skip over multiple bytes here as we suppose
256 // that if NUL takes 2 or 4 bytes, then all the other characters do
257 // too and so if advanced by a single byte we might erroneously
258 // detect sequences of NUL bytes in the middle of the input
483b0434 259 src += nulLen;
c1464d9d 260 }
e4e3bbb4 261
483b0434 262 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
263
264 // note that ">=" (and not just "==") is needed here as the terminator
265 // we skipped just above could be inside or just after the buffer
266 // delimited by inEnd
483b0434 267 if ( src >= srcEnd )
c1464d9d
VZ
268 break;
269 }
270
483b0434 271 return dstWritten;
e4e3bbb4
RN
272}
273
483b0434
VZ
274size_t
275wxMBConv::FromWChar(char *dst, size_t dstLen,
276 const wchar_t *src, size_t srcLen) const
e4e3bbb4 277{
483b0434
VZ
278 // the number of chars [which would be] written to dst [if it were not NULL]
279 size_t dstWritten = 0;
e4e3bbb4 280
eec47cc6
VZ
281 // make a copy of the input string unless it is already properly
282 // NUL-terminated
283 //
284 // if we don't know its length we have no choice but to assume that it is,
285 // indeed, properly terminated
286 wxWCharBuffer bufTmp;
467e0479 287 if ( srcLen == wxNO_LEN )
e4e3bbb4 288 {
483b0434 289 srcLen = wxWcslen(src) + 1;
eec47cc6 290 }
483b0434 291 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
292 {
293 // make a copy in order to properly NUL-terminate the string
483b0434 294 bufTmp = wxWCharBuffer(srcLen);
ef199164 295 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
296 src = bufTmp;
297 }
298
299 const size_t lenNul = GetMBNulLen();
300 for ( const wchar_t * const srcEnd = src + srcLen;
301 src < srcEnd;
302 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
303 {
304 // try to convert the current chunk
305 size_t lenChunk = WC2MB(NULL, src, 0);
306
307 if ( lenChunk == wxCONV_FAILED )
308 return wxCONV_FAILED;
309
310 lenChunk += lenNul;
311 dstWritten += lenChunk;
312
313 if ( dst )
314 {
315 if ( dstWritten > dstLen )
316 return wxCONV_FAILED;
317
318 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
319 return wxCONV_FAILED;
320
321 dst += lenChunk;
322 }
eec47cc6 323 }
e4e3bbb4 324
483b0434
VZ
325 return dstWritten;
326}
327
ef199164 328size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 329{
ef199164 330 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 331 if ( rc != wxCONV_FAILED )
509da451
VZ
332 {
333 // ToWChar() returns the buffer length, i.e. including the trailing
334 // NUL, while this method doesn't take it into account
335 rc--;
336 }
337
338 return rc;
339}
340
ef199164 341size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 342{
ef199164 343 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 344 if ( rc != wxCONV_FAILED )
509da451
VZ
345 {
346 rc -= GetMBNulLen();
347 }
348
349 return rc;
350}
351
483b0434
VZ
352wxMBConv::~wxMBConv()
353{
354 // nothing to do here (necessary for Darwin linking probably)
355}
e4e3bbb4 356
483b0434
VZ
357const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
358{
359 if ( psz )
eec47cc6 360 {
483b0434
VZ
361 // calculate the length of the buffer needed first
362 const size_t nLen = MB2WC(NULL, psz, 0);
467e0479 363 if ( nLen != wxCONV_FAILED )
f5fb6871 364 {
483b0434
VZ
365 // now do the actual conversion
366 wxWCharBuffer buf(nLen /* +1 added implicitly */);
eec47cc6 367
483b0434
VZ
368 // +1 for the trailing NULL
369 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
370 return buf;
f5fb6871 371 }
483b0434 372 }
e4e3bbb4 373
483b0434
VZ
374 return wxWCharBuffer();
375}
3698ae71 376
483b0434
VZ
377const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
378{
379 if ( pwz )
380 {
381 const size_t nLen = WC2MB(NULL, pwz, 0);
467e0479 382 if ( nLen != wxCONV_FAILED )
483b0434
VZ
383 {
384 // extra space for trailing NUL(s)
385 static const size_t extraLen = GetMaxMBNulLen();
f5fb6871 386
483b0434
VZ
387 wxCharBuffer buf(nLen + extraLen - 1);
388 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
389 return buf;
390 }
391 }
392
393 return wxCharBuffer();
394}
e4e3bbb4 395
483b0434 396const wxWCharBuffer
ef199164 397wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 398{
ef199164 399 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 400 if ( dstLen != wxCONV_FAILED )
483b0434 401 {
830f8f11 402 wxWCharBuffer wbuf(dstLen - 1);
ef199164 403 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
404 {
405 if ( outLen )
467e0479
VZ
406 {
407 *outLen = dstLen;
408 if ( wbuf[dstLen - 1] == L'\0' )
409 (*outLen)--;
410 }
411
483b0434
VZ
412 return wbuf;
413 }
414 }
415
416 if ( outLen )
417 *outLen = 0;
418
419 return wxWCharBuffer();
420}
421
422const wxCharBuffer
ef199164 423wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 424{
13d92ad6 425 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 426 if ( dstLen != wxCONV_FAILED )
483b0434 427 {
168a76fe
VZ
428 // special case of empty input: can't allocate 0 size buffer below as
429 // wxCharBuffer insists on NUL-terminating it
430 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
ef199164 431 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
432 {
433 if ( outLen )
467e0479
VZ
434 {
435 *outLen = dstLen;
436
437 const size_t nulLen = GetMBNulLen();
13d92ad6
VZ
438 if ( dstLen >= nulLen &&
439 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
467e0479
VZ
440 {
441 // in this case the output is NUL-terminated and we're not
442 // supposed to count NUL
13d92ad6 443 *outLen -= nulLen;
467e0479
VZ
444 }
445 }
d32a507d 446
483b0434
VZ
447 return buf;
448 }
e4e3bbb4
RN
449 }
450
eec47cc6
VZ
451 if ( outLen )
452 *outLen = 0;
453
454 return wxCharBuffer();
e4e3bbb4
RN
455}
456
6001e347 457// ----------------------------------------------------------------------------
bde4baac 458// wxMBConvLibc
6001e347
RR
459// ----------------------------------------------------------------------------
460
bde4baac
VZ
461size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
462{
463 return wxMB2WC(buf, psz, n);
464}
465
466size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
467{
468 return wxWC2MB(buf, psz, n);
469}
e1bfe89e
RR
470
471// ----------------------------------------------------------------------------
532d575b 472// wxConvBrokenFileNames
e1bfe89e
RR
473// ----------------------------------------------------------------------------
474
eec47cc6
VZ
475#ifdef __UNIX__
476
845905d5 477wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
ea8ce907 478{
845905d5
MW
479 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
480 || wxStricmp(charset, _T("UTF8")) == 0 )
481 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
482 else
483 m_conv = new wxCSConv(charset);
ea8ce907
RR
484}
485
eec47cc6 486#endif // __UNIX__
c12b7f79 487
bde4baac 488// ----------------------------------------------------------------------------
3698ae71 489// UTF-7
bde4baac 490// ----------------------------------------------------------------------------
6001e347 491
15f2ee32 492// Implementation (C) 2004 Fredrik Roubert
6001e347 493
15f2ee32
RN
494//
495// BASE64 decoding table
496//
497static const unsigned char utf7unb64[] =
6001e347 498{
15f2ee32
RN
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
502 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
504 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
505 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
506 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
508 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
509 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
510 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
512 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
513 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
514 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
531};
532
533size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
534{
15f2ee32
RN
535 size_t len = 0;
536
04a37834 537 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
538 {
539 unsigned char cc = *psz++;
540 if (cc != '+')
541 {
542 // plain ASCII char
543 if (buf)
544 *buf++ = cc;
545 len++;
546 }
547 else if (*psz == '-')
548 {
549 // encoded plus sign
550 if (buf)
551 *buf++ = cc;
552 len++;
553 psz++;
554 }
04a37834 555 else // start of BASE64 encoded string
15f2ee32 556 {
04a37834 557 bool lsb, ok;
15f2ee32 558 unsigned int d, l;
04a37834
VZ
559 for ( ok = lsb = false, d = 0, l = 0;
560 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
561 psz++ )
15f2ee32
RN
562 {
563 d <<= 6;
564 d += cc;
565 for (l += 6; l >= 8; lsb = !lsb)
566 {
04a37834 567 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
568 if (lsb)
569 {
570 if (buf)
571 *buf++ |= c;
572 len ++;
573 }
574 else
04a37834 575 {
15f2ee32 576 if (buf)
6356d52a 577 *buf = (wchar_t)(c << 8);
04a37834
VZ
578 }
579
580 ok = true;
15f2ee32
RN
581 }
582 }
04a37834
VZ
583
584 if ( !ok )
585 {
586 // in valid UTF7 we should have valid characters after '+'
467e0479 587 return wxCONV_FAILED;
04a37834
VZ
588 }
589
15f2ee32
RN
590 if (*psz == '-')
591 psz++;
592 }
593 }
04a37834
VZ
594
595 if ( buf && (len < n) )
596 *buf = '\0';
597
15f2ee32 598 return len;
6001e347
RR
599}
600
15f2ee32
RN
601//
602// BASE64 encoding table
603//
604static const unsigned char utf7enb64[] =
605{
606 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
607 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
608 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
609 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
610 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
611 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
612 'w', 'x', 'y', 'z', '0', '1', '2', '3',
613 '4', '5', '6', '7', '8', '9', '+', '/'
614};
615
616//
617// UTF-7 encoding table
618//
619// 0 - Set D (directly encoded characters)
620// 1 - Set O (optional direct characters)
621// 2 - whitespace characters (optional)
622// 3 - special characters
623//
624static const unsigned char utf7encode[128] =
6001e347 625{
15f2ee32
RN
626 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
627 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
628 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
629 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
630 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
631 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
632 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
633 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
634};
635
667e5b3e 636size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 637{
15f2ee32
RN
638 size_t len = 0;
639
640 while (*psz && ((!buf) || (len < n)))
641 {
642 wchar_t cc = *psz++;
643 if (cc < 0x80 && utf7encode[cc] < 1)
644 {
645 // plain ASCII char
646 if (buf)
647 *buf++ = (char)cc;
ef199164 648
15f2ee32
RN
649 len++;
650 }
651#ifndef WC_UTF16
79c78d42 652 else if (((wxUint32)cc) > 0xffff)
b2c13097 653 {
15f2ee32 654 // no surrogate pair generation (yet?)
467e0479 655 return wxCONV_FAILED;
15f2ee32
RN
656 }
657#endif
658 else
659 {
660 if (buf)
661 *buf++ = '+';
ef199164 662
15f2ee32
RN
663 len++;
664 if (cc != '+')
665 {
666 // BASE64 encode string
667 unsigned int lsb, d, l;
73c902d6 668 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
669 {
670 for (lsb = 0; lsb < 2; lsb ++)
671 {
672 d <<= 8;
673 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
674
675 for (l += 8; l >= 6; )
676 {
677 l -= 6;
678 if (buf)
679 *buf++ = utf7enb64[(d >> l) % 64];
680 len++;
681 }
682 }
ef199164 683
15f2ee32
RN
684 cc = *psz;
685 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
686 break;
687 }
ef199164 688
15f2ee32
RN
689 if (l != 0)
690 {
691 if (buf)
692 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
ef199164 693
15f2ee32
RN
694 len++;
695 }
696 }
ef199164 697
15f2ee32
RN
698 if (buf)
699 *buf++ = '-';
700 len++;
701 }
702 }
ef199164 703
15f2ee32
RN
704 if (buf && (len < n))
705 *buf = 0;
ef199164 706
15f2ee32 707 return len;
6001e347
RR
708}
709
f6bcfd97 710// ----------------------------------------------------------------------------
6001e347 711// UTF-8
f6bcfd97 712// ----------------------------------------------------------------------------
6001e347 713
dccce9ea 714static wxUint32 utf8_max[]=
4def3b35 715 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 716
3698ae71
VZ
717// boundaries of the private use area we use to (temporarily) remap invalid
718// characters invalid in a UTF-8 encoded string
ea8ce907
RR
719const wxUint32 wxUnicodePUA = 0x100000;
720const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
721
6001e347
RR
722size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
723{
4def3b35
VS
724 size_t len = 0;
725
dccce9ea 726 while (*psz && ((!buf) || (len < n)))
4def3b35 727 {
ea8ce907
RR
728 const char *opsz = psz;
729 bool invalid = false;
4def3b35
VS
730 unsigned char cc = *psz++, fc = cc;
731 unsigned cnt;
dccce9ea 732 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 733 fc <<= 1;
ef199164 734
dccce9ea 735 if (!cnt)
4def3b35
VS
736 {
737 // plain ASCII char
dccce9ea 738 if (buf)
4def3b35
VS
739 *buf++ = cc;
740 len++;
561488ef
MW
741
742 // escape the escape character for octal escapes
743 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
744 && cc == '\\' && (!buf || len < n))
745 {
746 if (buf)
747 *buf++ = cc;
748 len++;
749 }
dccce9ea
VZ
750 }
751 else
4def3b35
VS
752 {
753 cnt--;
dccce9ea 754 if (!cnt)
4def3b35
VS
755 {
756 // invalid UTF-8 sequence
ea8ce907 757 invalid = true;
dccce9ea
VZ
758 }
759 else
4def3b35
VS
760 {
761 unsigned ocnt = cnt - 1;
762 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 763 while (cnt--)
4def3b35 764 {
ea8ce907 765 cc = *psz;
dccce9ea 766 if ((cc & 0xC0) != 0x80)
4def3b35
VS
767 {
768 // invalid UTF-8 sequence
ea8ce907
RR
769 invalid = true;
770 break;
4def3b35 771 }
ef199164 772
ea8ce907 773 psz++;
4def3b35
VS
774 res = (res << 6) | (cc & 0x3f);
775 }
ef199164 776
ea8ce907 777 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
778 {
779 // illegal UTF-8 encoding
ea8ce907 780 invalid = true;
4def3b35 781 }
ea8ce907
RR
782 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
783 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
784 {
785 // if one of our PUA characters turns up externally
786 // it must also be treated as an illegal sequence
787 // (a bit like you have to escape an escape character)
788 invalid = true;
789 }
790 else
791 {
1cd52418 792#ifdef WC_UTF16
ea8ce907
RR
793 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
794 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 795 if (pa == wxCONV_FAILED)
ea8ce907
RR
796 {
797 invalid = true;
798 }
799 else
800 {
801 if (buf)
802 buf += pa;
803 len += pa;
804 }
373658eb 805#else // !WC_UTF16
ea8ce907 806 if (buf)
38d4b1e4 807 *buf++ = (wchar_t)res;
ea8ce907 808 len++;
373658eb 809#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
810 }
811 }
ef199164 812
ea8ce907
RR
813 if (invalid)
814 {
815 if (m_options & MAP_INVALID_UTF8_TO_PUA)
816 {
817 while (opsz < psz && (!buf || len < n))
818 {
819#ifdef WC_UTF16
820 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
821 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 822 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
823 if (buf)
824 buf += pa;
825 opsz++;
826 len += pa;
827#else
828 if (buf)
38d4b1e4 829 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
830 opsz++;
831 len++;
832#endif
833 }
834 }
3698ae71 835 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
836 {
837 while (opsz < psz && (!buf || len < n))
838 {
3698ae71
VZ
839 if ( buf && len + 3 < n )
840 {
17a1ebd1 841 unsigned char on = *opsz;
3698ae71 842 *buf++ = L'\\';
17a1ebd1
VZ
843 *buf++ = (wchar_t)( L'0' + on / 0100 );
844 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
845 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 846 }
ef199164 847
ea8ce907
RR
848 opsz++;
849 len += 4;
850 }
851 }
3698ae71 852 else // MAP_INVALID_UTF8_NOT
ea8ce907 853 {
467e0479 854 return wxCONV_FAILED;
ea8ce907 855 }
4def3b35
VS
856 }
857 }
6001e347 858 }
ef199164 859
dccce9ea 860 if (buf && (len < n))
4def3b35 861 *buf = 0;
ef199164 862
4def3b35 863 return len;
6001e347
RR
864}
865
3698ae71
VZ
// returns true if wch is one of the octal digits L'0' to L'7'
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
870
6001e347
RR
871size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
872{
4def3b35 873 size_t len = 0;
6001e347 874
dccce9ea 875 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
876 {
877 wxUint32 cc;
ef199164 878
1cd52418 879#ifdef WC_UTF16
b5153fd8
VZ
880 // cast is ok for WC_UTF16
881 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 882 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 883#else
ef199164 884 cc = (*psz++) & 0x7fffffff;
4def3b35 885#endif
3698ae71
VZ
886
887 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
888 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 889 {
dccce9ea 890 if (buf)
ea8ce907 891 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 892 len++;
3698ae71 893 }
561488ef
MW
894 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
895 && cc == L'\\' && psz[0] == L'\\' )
896 {
897 if (buf)
898 *buf++ = (char)cc;
899 psz++;
900 len++;
901 }
3698ae71
VZ
902 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
903 cc == L'\\' &&
904 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 905 {
dccce9ea 906 if (buf)
3698ae71 907 {
ef199164
DS
908 *buf++ = (char) ((psz[0] - L'0') * 0100 +
909 (psz[1] - L'0') * 010 +
b2c13097 910 (psz[2] - L'0'));
3698ae71
VZ
911 }
912
913 psz += 3;
ea8ce907
RR
914 len++;
915 }
916 else
917 {
918 unsigned cnt;
ef199164
DS
919 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
920 {
921 }
922
ea8ce907 923 if (!cnt)
4def3b35 924 {
ea8ce907
RR
925 // plain ASCII char
926 if (buf)
927 *buf++ = (char) cc;
928 len++;
929 }
ea8ce907
RR
930 else
931 {
932 len += cnt + 1;
933 if (buf)
934 {
935 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
936 while (cnt--)
937 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
938 }
4def3b35
VS
939 }
940 }
6001e347 941 }
4def3b35 942
ef199164 943 if (buf && (len < n))
3698ae71 944 *buf = 0;
adb45366 945
4def3b35 946 return len;
6001e347
RR
947}
948
467e0479 949// ============================================================================
c91830cb 950// UTF-16
467e0479 951// ============================================================================
c91830cb
VZ
952
953#ifdef WORDS_BIGENDIAN
bde4baac
VZ
954 #define wxMBConvUTF16straight wxMBConvUTF16BE
955 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 956#else
bde4baac
VZ
957 #define wxMBConvUTF16swap wxMBConvUTF16BE
958 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
959#endif
960
467e0479
VZ
961/* static */
962size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
963{
964 if ( srcLen == wxNO_LEN )
965 {
966 // count the number of bytes in input, including the trailing NULs
ef199164
DS
967 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
968 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 969 ;
c91830cb 970
467e0479
VZ
971 srcLen *= BYTES_PER_CHAR;
972 }
973 else // we already have the length
974 {
975 // we can only convert an entire number of UTF-16 characters
976 if ( srcLen % BYTES_PER_CHAR )
977 return wxCONV_FAILED;
978 }
979
980 return srcLen;
981}
982
983// case when in-memory representation is UTF-16 too
c91830cb
VZ
984#ifdef WC_UTF16
985
467e0479
VZ
986// ----------------------------------------------------------------------------
987// conversions without endianness change
988// ----------------------------------------------------------------------------
989
990size_t
991wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
992 const char *src, size_t srcLen) const
c91830cb 993{
467e0479
VZ
994 // set up the scene for using memcpy() (which is presumably more efficient
995 // than copying the bytes one by one)
996 srcLen = GetLength(src, srcLen);
997 if ( srcLen == wxNO_LEN )
998 return wxCONV_FAILED;
c91830cb 999
ef199164 1000 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1001 if ( dst )
c91830cb 1002 {
467e0479
VZ
1003 if ( dstLen < inLen )
1004 return wxCONV_FAILED;
c91830cb 1005
467e0479 1006 memcpy(dst, src, srcLen);
c91830cb 1007 }
d32a507d 1008
467e0479 1009 return inLen;
c91830cb
VZ
1010}
1011
467e0479
VZ
1012size_t
1013wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1014 const wchar_t *src, size_t srcLen) const
c91830cb 1015{
467e0479
VZ
1016 if ( srcLen == wxNO_LEN )
1017 srcLen = wxWcslen(src) + 1;
c91830cb 1018
467e0479
VZ
1019 srcLen *= BYTES_PER_CHAR;
1020
1021 if ( dst )
c91830cb 1022 {
467e0479
VZ
1023 if ( dstLen < srcLen )
1024 return wxCONV_FAILED;
d32a507d 1025
467e0479 1026 memcpy(dst, src, srcLen);
c91830cb 1027 }
d32a507d 1028
467e0479 1029 return srcLen;
c91830cb
VZ
1030}
1031
467e0479
VZ
1032// ----------------------------------------------------------------------------
1033// endian-reversing conversions
1034// ----------------------------------------------------------------------------
c91830cb 1035
467e0479
VZ
1036size_t
1037wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1038 const char *src, size_t srcLen) const
c91830cb 1039{
467e0479
VZ
1040 srcLen = GetLength(src, srcLen);
1041 if ( srcLen == wxNO_LEN )
1042 return wxCONV_FAILED;
c91830cb 1043
467e0479
VZ
1044 srcLen /= BYTES_PER_CHAR;
1045
1046 if ( dst )
c91830cb 1047 {
467e0479
VZ
1048 if ( dstLen < srcLen )
1049 return wxCONV_FAILED;
1050
ef199164
DS
1051 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1052 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1053 {
ef199164 1054 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1055 }
c91830cb 1056 }
bfab25d4 1057
467e0479 1058 return srcLen;
c91830cb
VZ
1059}
1060
467e0479
VZ
1061size_t
1062wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1063 const wchar_t *src, size_t srcLen) const
c91830cb 1064{
467e0479
VZ
1065 if ( srcLen == wxNO_LEN )
1066 srcLen = wxWcslen(src) + 1;
c91830cb 1067
467e0479
VZ
1068 srcLen *= BYTES_PER_CHAR;
1069
1070 if ( dst )
c91830cb 1071 {
467e0479
VZ
1072 if ( dstLen < srcLen )
1073 return wxCONV_FAILED;
1074
ef199164 1075 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1076 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1077 {
ef199164 1078 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1079 }
c91830cb 1080 }
eec47cc6 1081
467e0479 1082 return srcLen;
c91830cb
VZ
1083}
1084
467e0479 1085#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1086
467e0479
VZ
1087// ----------------------------------------------------------------------------
1088// conversions without endianness change
1089// ----------------------------------------------------------------------------
c91830cb 1090
35d11700
VZ
1091size_t
1092wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1093 const char *src, size_t srcLen) const
c91830cb 1094{
35d11700
VZ
1095 srcLen = GetLength(src, srcLen);
1096 if ( srcLen == wxNO_LEN )
1097 return wxCONV_FAILED;
c91830cb 1098
ef199164 1099 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1100 if ( !dst )
c91830cb 1101 {
35d11700
VZ
1102 // optimization: return maximal space which could be needed for this
1103 // string even if the real size could be smaller if the buffer contains
1104 // any surrogates
1105 return inLen;
c91830cb 1106 }
c91830cb 1107
35d11700 1108 size_t outLen = 0;
ef199164
DS
1109 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1110 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1111 {
ef199164
DS
1112 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1113 if ( !inBuff )
35d11700
VZ
1114 return wxCONV_FAILED;
1115
1116 if ( ++outLen > dstLen )
1117 return wxCONV_FAILED;
c91830cb 1118
35d11700
VZ
1119 *dst++ = ch;
1120 }
1121
1122
1123 return outLen;
1124}
c91830cb 1125
35d11700
VZ
1126size_t
1127wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1128 const wchar_t *src, size_t srcLen) const
c91830cb 1129{
35d11700
VZ
1130 if ( srcLen == wxNO_LEN )
1131 srcLen = wxWcslen(src) + 1;
c91830cb 1132
35d11700 1133 size_t outLen = 0;
ef199164 1134 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1135 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1136 {
1137 wxUint16 cc[2];
35d11700
VZ
1138 const size_t numChars = encode_utf16(*src++, cc);
1139 if ( numChars == wxCONV_FAILED )
1140 return wxCONV_FAILED;
c91830cb 1141
ef199164
DS
1142 outLen += numChars * BYTES_PER_CHAR;
1143 if ( outBuff )
c91830cb 1144 {
35d11700
VZ
1145 if ( outLen > dstLen )
1146 return wxCONV_FAILED;
1147
ef199164 1148 *outBuff++ = cc[0];
35d11700 1149 if ( numChars == 2 )
69b80d28 1150 {
35d11700 1151 // second character of a surrogate
ef199164 1152 *outBuff++ = cc[1];
69b80d28 1153 }
c91830cb 1154 }
c91830cb 1155 }
c91830cb 1156
35d11700 1157 return outLen;
c91830cb
VZ
1158}
1159
467e0479
VZ
1160// ----------------------------------------------------------------------------
1161// endian-reversing conversions
1162// ----------------------------------------------------------------------------
c91830cb 1163
35d11700
VZ
1164size_t
1165wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1166 const char *src, size_t srcLen) const
c91830cb 1167{
35d11700
VZ
1168 srcLen = GetLength(src, srcLen);
1169 if ( srcLen == wxNO_LEN )
1170 return wxCONV_FAILED;
1171
ef199164 1172 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1173 if ( !dst )
1174 {
1175 // optimization: return maximal space which could be needed for this
1176 // string even if the real size could be smaller if the buffer contains
1177 // any surrogates
1178 return inLen;
1179 }
c91830cb 1180
35d11700 1181 size_t outLen = 0;
ef199164
DS
1182 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1183 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1184 {
35d11700
VZ
1185 wxUint32 ch;
1186 wxUint16 tmp[2];
ef199164
DS
1187
1188 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1189 inBuff++;
1190 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1191
35d11700
VZ
1192 const size_t numChars = decode_utf16(tmp, ch);
1193 if ( numChars == wxCONV_FAILED )
1194 return wxCONV_FAILED;
c91830cb 1195
35d11700 1196 if ( numChars == 2 )
ef199164 1197 inBuff++;
35d11700
VZ
1198
1199 if ( ++outLen > dstLen )
1200 return wxCONV_FAILED;
c91830cb 1201
35d11700 1202 *dst++ = ch;
c91830cb 1203 }
c91830cb 1204
c91830cb 1205
35d11700
VZ
1206 return outLen;
1207}
c91830cb 1208
35d11700
VZ
1209size_t
1210wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1211 const wchar_t *src, size_t srcLen) const
c91830cb 1212{
35d11700
VZ
1213 if ( srcLen == wxNO_LEN )
1214 srcLen = wxWcslen(src) + 1;
c91830cb 1215
35d11700 1216 size_t outLen = 0;
ef199164 1217 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1218 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1219 {
1220 wxUint16 cc[2];
35d11700
VZ
1221 const size_t numChars = encode_utf16(*src, cc);
1222 if ( numChars == wxCONV_FAILED )
1223 return wxCONV_FAILED;
c91830cb 1224
ef199164
DS
1225 outLen += numChars * BYTES_PER_CHAR;
1226 if ( outBuff )
c91830cb 1227 {
35d11700
VZ
1228 if ( outLen > dstLen )
1229 return wxCONV_FAILED;
1230
ef199164 1231 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1232 if ( numChars == 2 )
c91830cb 1233 {
35d11700 1234 // second character of a surrogate
ef199164 1235 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1236 }
1237 }
c91830cb 1238 }
c91830cb 1239
35d11700 1240 return outLen;
c91830cb
VZ
1241}
1242
467e0479 1243#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1244
1245
35d11700 1246// ============================================================================
c91830cb 1247// UTF-32
35d11700 1248// ============================================================================
c91830cb
VZ
1249
1250#ifdef WORDS_BIGENDIAN
467e0479
VZ
1251 #define wxMBConvUTF32straight wxMBConvUTF32BE
1252 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1253#else
467e0479
VZ
1254 #define wxMBConvUTF32swap wxMBConvUTF32BE
1255 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1256#endif
1257
1258
1259WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1260WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1261
467e0479
VZ
1262/* static */
1263size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1264{
1265 if ( srcLen == wxNO_LEN )
1266 {
1267 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1268 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1269 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1270 ;
c91830cb 1271
467e0479
VZ
1272 srcLen *= BYTES_PER_CHAR;
1273 }
1274 else // we already have the length
1275 {
1276 // we can only convert an entire number of UTF-32 characters
1277 if ( srcLen % BYTES_PER_CHAR )
1278 return wxCONV_FAILED;
1279 }
1280
1281 return srcLen;
1282}
1283
1284// case when in-memory representation is UTF-16
c91830cb
VZ
1285#ifdef WC_UTF16
1286
467e0479
VZ
1287// ----------------------------------------------------------------------------
1288// conversions without endianness change
1289// ----------------------------------------------------------------------------
1290
1291size_t
1292wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1293 const char *src, size_t srcLen) const
c91830cb 1294{
467e0479
VZ
1295 srcLen = GetLength(src, srcLen);
1296 if ( srcLen == wxNO_LEN )
1297 return wxCONV_FAILED;
c91830cb 1298
ef199164
DS
1299 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1300 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1301 size_t outLen = 0;
1302 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1303 {
1304 wxUint16 cc[2];
ef199164 1305 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1306 if ( numChars == wxCONV_FAILED )
1307 return wxCONV_FAILED;
c91830cb 1308
467e0479
VZ
1309 outLen += numChars;
1310 if ( dst )
c91830cb 1311 {
467e0479
VZ
1312 if ( outLen > dstLen )
1313 return wxCONV_FAILED;
d32a507d 1314
467e0479
VZ
1315 *dst++ = cc[0];
1316 if ( numChars == 2 )
1317 {
1318 // second character of a surrogate
1319 *dst++ = cc[1];
1320 }
1321 }
c91830cb 1322 }
d32a507d 1323
467e0479 1324 return outLen;
c91830cb
VZ
1325}
1326
467e0479
VZ
1327size_t
1328wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1329 const wchar_t *src, size_t srcLen) const
c91830cb 1330{
467e0479
VZ
1331 if ( srcLen == wxNO_LEN )
1332 srcLen = wxWcslen(src) + 1;
c91830cb 1333
467e0479 1334 if ( !dst )
c91830cb 1335 {
467e0479
VZ
1336 // optimization: return maximal space which could be needed for this
1337 // string instead of the exact amount which could be less if there are
1338 // any surrogates in the input
1339 //
1340 // we consider that surrogates are rare enough to make it worthwhile to
1341 // avoid running the loop below at the cost of slightly extra memory
1342 // consumption
ef199164 1343 return srcLen * BYTES_PER_CHAR;
467e0479 1344 }
c91830cb 1345
ef199164 1346 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1347 size_t outLen = 0;
1348 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1349 {
1350 const wxUint32 ch = wxDecodeSurrogate(&src);
1351 if ( !src )
1352 return wxCONV_FAILED;
c91830cb 1353
467e0479 1354 outLen += BYTES_PER_CHAR;
d32a507d 1355
467e0479
VZ
1356 if ( outLen > dstLen )
1357 return wxCONV_FAILED;
b5153fd8 1358
ef199164 1359 *outBuff++ = ch;
467e0479 1360 }
c91830cb 1361
467e0479 1362 return outLen;
c91830cb
VZ
1363}
1364
467e0479
VZ
1365// ----------------------------------------------------------------------------
1366// endian-reversing conversions
1367// ----------------------------------------------------------------------------
c91830cb 1368
467e0479
VZ
1369size_t
1370wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1371 const char *src, size_t srcLen) const
c91830cb 1372{
467e0479
VZ
1373 srcLen = GetLength(src, srcLen);
1374 if ( srcLen == wxNO_LEN )
1375 return wxCONV_FAILED;
c91830cb 1376
ef199164
DS
1377 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1378 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1379 size_t outLen = 0;
ef199164 1380 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1381 {
c91830cb 1382 wxUint16 cc[2];
ef199164 1383 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1384 if ( numChars == wxCONV_FAILED )
1385 return wxCONV_FAILED;
c91830cb 1386
467e0479
VZ
1387 outLen += numChars;
1388 if ( dst )
c91830cb 1389 {
467e0479
VZ
1390 if ( outLen > dstLen )
1391 return wxCONV_FAILED;
d32a507d 1392
467e0479
VZ
1393 *dst++ = cc[0];
1394 if ( numChars == 2 )
1395 {
1396 // second character of a surrogate
1397 *dst++ = cc[1];
1398 }
1399 }
c91830cb 1400 }
b5153fd8 1401
467e0479 1402 return outLen;
c91830cb
VZ
1403}
1404
467e0479
VZ
1405size_t
1406wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1407 const wchar_t *src, size_t srcLen) const
c91830cb 1408{
467e0479
VZ
1409 if ( srcLen == wxNO_LEN )
1410 srcLen = wxWcslen(src) + 1;
c91830cb 1411
467e0479 1412 if ( !dst )
c91830cb 1413 {
467e0479
VZ
1414 // optimization: return maximal space which could be needed for this
1415 // string instead of the exact amount which could be less if there are
1416 // any surrogates in the input
1417 //
1418 // we consider that surrogates are rare enough to make it worthwhile to
1419 // avoid running the loop below at the cost of slightly extra memory
1420 // consumption
1421 return srcLen*BYTES_PER_CHAR;
1422 }
c91830cb 1423
ef199164 1424 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1425 size_t outLen = 0;
1426 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1427 {
1428 const wxUint32 ch = wxDecodeSurrogate(&src);
1429 if ( !src )
1430 return wxCONV_FAILED;
c91830cb 1431
467e0479 1432 outLen += BYTES_PER_CHAR;
d32a507d 1433
467e0479
VZ
1434 if ( outLen > dstLen )
1435 return wxCONV_FAILED;
b5153fd8 1436
ef199164 1437 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1438 }
c91830cb 1439
467e0479 1440 return outLen;
c91830cb
VZ
1441}
1442
467e0479 1443#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1444
35d11700
VZ
1445// ----------------------------------------------------------------------------
1446// conversions without endianness change
1447// ----------------------------------------------------------------------------
1448
1449size_t
1450wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1451 const char *src, size_t srcLen) const
c91830cb 1452{
35d11700
VZ
1453 // use memcpy() as it should be much faster than hand-written loop
1454 srcLen = GetLength(src, srcLen);
1455 if ( srcLen == wxNO_LEN )
1456 return wxCONV_FAILED;
c91830cb 1457
35d11700
VZ
1458 const size_t inLen = srcLen/BYTES_PER_CHAR;
1459 if ( dst )
c91830cb 1460 {
35d11700
VZ
1461 if ( dstLen < inLen )
1462 return wxCONV_FAILED;
b5153fd8 1463
35d11700
VZ
1464 memcpy(dst, src, srcLen);
1465 }
c91830cb 1466
35d11700 1467 return inLen;
c91830cb
VZ
1468}
1469
35d11700
VZ
1470size_t
1471wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1472 const wchar_t *src, size_t srcLen) const
c91830cb 1473{
35d11700
VZ
1474 if ( srcLen == wxNO_LEN )
1475 srcLen = wxWcslen(src) + 1;
1476
1477 srcLen *= BYTES_PER_CHAR;
c91830cb 1478
35d11700 1479 if ( dst )
c91830cb 1480 {
35d11700
VZ
1481 if ( dstLen < srcLen )
1482 return wxCONV_FAILED;
c91830cb 1483
35d11700 1484 memcpy(dst, src, srcLen);
c91830cb
VZ
1485 }
1486
35d11700 1487 return srcLen;
c91830cb
VZ
1488}
1489
35d11700
VZ
1490// ----------------------------------------------------------------------------
1491// endian-reversing conversions
1492// ----------------------------------------------------------------------------
c91830cb 1493
35d11700
VZ
1494size_t
1495wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1496 const char *src, size_t srcLen) const
c91830cb 1497{
35d11700
VZ
1498 srcLen = GetLength(src, srcLen);
1499 if ( srcLen == wxNO_LEN )
1500 return wxCONV_FAILED;
1501
1502 srcLen /= BYTES_PER_CHAR;
c91830cb 1503
35d11700 1504 if ( dst )
c91830cb 1505 {
35d11700
VZ
1506 if ( dstLen < srcLen )
1507 return wxCONV_FAILED;
1508
ef199164
DS
1509 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1510 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1511 {
ef199164 1512 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1513 }
c91830cb 1514 }
b5153fd8 1515
35d11700 1516 return srcLen;
c91830cb
VZ
1517}
1518
35d11700
VZ
1519size_t
1520wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1521 const wchar_t *src, size_t srcLen) const
c91830cb 1522{
35d11700
VZ
1523 if ( srcLen == wxNO_LEN )
1524 srcLen = wxWcslen(src) + 1;
1525
1526 srcLen *= BYTES_PER_CHAR;
c91830cb 1527
35d11700 1528 if ( dst )
c91830cb 1529 {
35d11700
VZ
1530 if ( dstLen < srcLen )
1531 return wxCONV_FAILED;
1532
ef199164 1533 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1534 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1535 {
ef199164 1536 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1537 }
c91830cb 1538 }
b5153fd8 1539
35d11700 1540 return srcLen;
c91830cb
VZ
1541}
1542
467e0479 1543#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1544
1545
36acb880
VZ
1546// ============================================================================
1547// The classes doing conversion using the iconv_xxx() functions
1548// ============================================================================
3caec1bb 1549
b040e242 1550#ifdef HAVE_ICONV
3a0d76bc 1551
b1d547eb
VS
1552// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1553// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1554// (unless there's yet another bug in glibc) the only case when iconv()
1555// returns with (size_t)-1 (which means error) and says there are 0 bytes
1556// left in the input buffer -- when _real_ error occurs,
1557// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1558// iconv() failure.
3caec1bb
VS
1559// [This bug does not appear in glibc 2.2.]
1560#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1561#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1562 (errno != E2BIG || bufLeft != 0))
1563#else
1564#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1565#endif
1566
ab217dba 1567#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1568
74a7eb0b
VZ
1569#define ICONV_T_INVALID ((iconv_t)-1)
1570
1571#if SIZEOF_WCHAR_T == 4
1572 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1573 #define WC_ENC wxFONTENCODING_UTF32
1574#elif SIZEOF_WCHAR_T == 2
1575 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1576 #define WC_ENC wxFONTENCODING_UTF16
1577#else // sizeof(wchar_t) != 2 nor 4
1578 // does this ever happen?
1579 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1580#endif
1581
36acb880 1582// ----------------------------------------------------------------------------
e95354ec 1583// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1584// ----------------------------------------------------------------------------
1585
e95354ec 1586class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1587{
1588public:
e95354ec
VZ
1589 wxMBConv_iconv(const wxChar *name);
1590 virtual ~wxMBConv_iconv();
36acb880 1591
bde4baac
VZ
1592 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1593 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1594
d36c9347 1595 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
7ef3ab50
VZ
1596 virtual size_t GetMBNulLen() const;
1597
d36c9347
VZ
1598 virtual wxMBConv *Clone() const
1599 {
1600 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1601 p->m_minMBCharWidth = m_minMBCharWidth;
1602 return p;
1603 }
1604
e95354ec 1605 bool IsOk() const
74a7eb0b 1606 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1607
1608protected:
ef199164
DS
1609 // the iconv handlers used to translate from multibyte
1610 // to wide char and in the other direction
36acb880
VZ
1611 iconv_t m2w,
1612 w2m;
ef199164 1613
b1d547eb
VS
1614#if wxUSE_THREADS
1615 // guards access to m2w and w2m objects
1616 wxMutex m_iconvMutex;
1617#endif
36acb880
VZ
1618
1619private:
e95354ec 1620 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1621 // available on this machine, it will remain NULL
74a7eb0b 1622 static wxString ms_wcCharsetName;
36acb880
VZ
1623
1624 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1625 // different endian-ness than the native one
405d8f46 1626 static bool ms_wcNeedsSwap;
eec47cc6 1627
d36c9347
VZ
1628
1629 // name of the encoding handled by this conversion
1630 wxString m_name;
1631
7ef3ab50 1632 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
1633 // initially
1634 size_t m_minMBCharWidth;
36acb880
VZ
1635};
1636
8f115891
MW
1637// make the constructor available for unit testing
1638WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1639{
1640 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1641 if ( !result->IsOk() )
1642 {
1643 delete result;
1644 return 0;
1645 }
ef199164 1646
8f115891
MW
1647 return result;
1648}
1649
422e411e 1650wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1651bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1652
e95354ec 1653wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
d36c9347 1654 : m_name(name)
36acb880 1655{
c1464d9d 1656 m_minMBCharWidth = 0;
eec47cc6 1657
0331b385
VZ
1658 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1659 // names for the charsets
200a9923 1660 const wxCharBuffer cname(wxString(name).ToAscii());
04c79127 1661
36acb880 1662 // check for charset that represents wchar_t:
74a7eb0b 1663 if ( ms_wcCharsetName.empty() )
f1339c56 1664 {
c2b83fdd
VZ
1665 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1666
74a7eb0b
VZ
1667#if wxUSE_FONTMAP
1668 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1669#else // !wxUSE_FONTMAP
1670 static const wxChar *names[] =
36acb880 1671 {
74a7eb0b
VZ
1672#if SIZEOF_WCHAR_T == 4
1673 _T("UCS-4"),
1674#elif SIZEOF_WCHAR_T = 2
1675 _T("UCS-2"),
1676#endif
1677 NULL
1678 };
1679#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1680
d1f024a8 1681 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1682 {
17a1ebd1 1683 const wxString nameCS(*names);
74a7eb0b
VZ
1684
1685 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1686 wxString nameXE(nameCS);
ef199164
DS
1687
1688#ifdef WORDS_BIGENDIAN
74a7eb0b 1689 nameXE += _T("BE");
ef199164 1690#else // little endian
74a7eb0b 1691 nameXE += _T("LE");
ef199164 1692#endif
74a7eb0b 1693
c2b83fdd
VZ
1694 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1695 nameXE.c_str());
1696
74a7eb0b
VZ
1697 m2w = iconv_open(nameXE.ToAscii(), cname);
1698 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1699 {
74a7eb0b 1700 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1701 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1702 nameCS.c_str());
17a1ebd1 1703 m2w = iconv_open(nameCS.ToAscii(), cname);
3a0d76bc 1704
74a7eb0b
VZ
1705 // and check for bytesex ourselves:
1706 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1707 {
74a7eb0b
VZ
1708 char buf[2], *bufPtr;
1709 wchar_t wbuf[2], *wbufPtr;
1710 size_t insz, outsz;
1711 size_t res;
1712
1713 buf[0] = 'A';
1714 buf[1] = 0;
1715 wbuf[0] = 0;
1716 insz = 2;
1717 outsz = SIZEOF_WCHAR_T * 2;
1718 wbufPtr = wbuf;
1719 bufPtr = buf;
1720
ef199164
DS
1721 res = iconv(
1722 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1723 (char**)&wbufPtr, &outsz);
74a7eb0b
VZ
1724
1725 if (ICONV_FAILED(res, insz))
1726 {
1727 wxLogLastError(wxT("iconv"));
422e411e 1728 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1729 nameCS.c_str());
74a7eb0b
VZ
1730 }
1731 else // ok, can convert to this encoding, remember it
1732 {
17a1ebd1 1733 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1734 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1735 }
3a0d76bc
VS
1736 }
1737 }
74a7eb0b 1738 else // use charset not requiring byte swapping
36acb880 1739 {
74a7eb0b 1740 ms_wcCharsetName = nameXE;
36acb880 1741 }
3a0d76bc 1742 }
74a7eb0b 1743
0944fceb 1744 wxLogTrace(TRACE_STRCONV,
74a7eb0b 1745 wxT("iconv wchar_t charset is \"%s\"%s"),
cae8f1bf 1746 ms_wcCharsetName.empty() ? _T("<none>")
74a7eb0b
VZ
1747 : ms_wcCharsetName.c_str(),
1748 ms_wcNeedsSwap ? _T(" (needs swap)")
1749 : _T(""));
3a0d76bc 1750 }
36acb880 1751 else // we already have ms_wcCharsetName
3caec1bb 1752 {
74a7eb0b 1753 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
f1339c56 1754 }
dccce9ea 1755
74a7eb0b 1756 if ( ms_wcCharsetName.empty() )
f1339c56 1757 {
74a7eb0b 1758 w2m = ICONV_T_INVALID;
36acb880 1759 }
405d8f46
VZ
1760 else
1761 {
74a7eb0b
VZ
1762 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1763 if ( w2m == ICONV_T_INVALID )
1764 {
1765 wxLogTrace(TRACE_STRCONV,
1766 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
422e411e 1767 ms_wcCharsetName.c_str(), cname.data());
74a7eb0b 1768 }
405d8f46 1769 }
36acb880 1770}
3caec1bb 1771
e95354ec 1772wxMBConv_iconv::~wxMBConv_iconv()
36acb880 1773{
74a7eb0b 1774 if ( m2w != ICONV_T_INVALID )
36acb880 1775 iconv_close(m2w);
74a7eb0b 1776 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
1777 iconv_close(w2m);
1778}
3a0d76bc 1779
bde4baac 1780size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 1781{
69373110
VZ
1782 // find the string length: notice that must be done differently for
1783 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1784 size_t inbuf;
7ef3ab50 1785 const size_t nulLen = GetMBNulLen();
69373110
VZ
1786 switch ( nulLen )
1787 {
1788 default:
467e0479 1789 return wxCONV_FAILED;
69373110
VZ
1790
1791 case 1:
1792 inbuf = strlen(psz); // arguably more optimized than our version
1793 break;
1794
1795 case 2:
1796 case 4:
1797 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1798 // they also have to start at character boundary and not span two
1799 // adjacent characters
1800 const char *p;
1801 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1802 ;
1803 inbuf = p - psz;
1804 break;
1805 }
1806
b1d547eb
VS
1807#if wxUSE_THREADS
1808 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1809 // Unfortunately there is a couple of global wxCSConv objects such as
1810 // wxConvLocal that are used all over wx code, so we have to make sure
1811 // the handle is used by at most one thread at the time. Otherwise
1812 // only a few wx classes would be safe to use from non-main threads
1813 // as MB<->WC conversion would fail "randomly".
1814 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
1815#endif // wxUSE_THREADS
1816
36acb880
VZ
1817 size_t outbuf = n * SIZEOF_WCHAR_T;
1818 size_t res, cres;
1819 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1820 wchar_t *bufPtr = buf;
1821 const char *pszPtr = psz;
1822
1823 if (buf)
1824 {
1825 // have destination buffer, convert there
1826 cres = iconv(m2w,
1827 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1828 (char**)&bufPtr, &outbuf);
1829 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1830
36acb880 1831 if (ms_wcNeedsSwap)
3a0d76bc 1832 {
36acb880 1833 // convert to native endianness
17a1ebd1
VZ
1834 for ( unsigned i = 0; i < res; i++ )
1835 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 1836 }
adb45366 1837
69373110 1838 // NUL-terminate the string if there is any space left
49dd9820
VS
1839 if (res < n)
1840 buf[res] = 0;
36acb880
VZ
1841 }
1842 else
1843 {
1844 // no destination buffer... convert using temp buffer
1845 // to calculate destination buffer requirement
1846 wchar_t tbuf[8];
1847 res = 0;
ef199164
DS
1848
1849 do
1850 {
36acb880 1851 bufPtr = tbuf;
ef199164 1852 outbuf = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
1853
1854 cres = iconv(m2w,
1855 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1856 (char**)&bufPtr, &outbuf );
1857
ef199164
DS
1858 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1859 }
1860 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 1861 }
dccce9ea 1862
36acb880 1863 if (ICONV_FAILED(cres, inbuf))
f1339c56 1864 {
36acb880 1865 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 1866 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 1867 return wxCONV_FAILED;
36acb880
VZ
1868 }
1869
1870 return res;
1871}
1872
bde4baac 1873size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1874{
b1d547eb
VS
1875#if wxUSE_THREADS
1876 // NB: explained in MB2WC
1877 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1878#endif
3698ae71 1879
156162ec
MW
1880 size_t inlen = wxWcslen(psz);
1881 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
1882 size_t outbuf = n;
1883 size_t res, cres;
3a0d76bc 1884
36acb880 1885 wchar_t *tmpbuf = 0;
3caec1bb 1886
36acb880
VZ
1887 if (ms_wcNeedsSwap)
1888 {
1889 // need to copy to temp buffer to switch endianness
74a7eb0b 1890 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 1891 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 1892 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
1893 for ( size_t i = 0; i < inlen; i++ )
1894 tmpbuf[n] = WC_BSWAP(psz[i]);
ef199164 1895
156162ec 1896 tmpbuf[inlen] = L'\0';
74a7eb0b 1897 psz = tmpbuf;
36acb880 1898 }
3a0d76bc 1899
36acb880
VZ
1900 if (buf)
1901 {
1902 // have destination buffer, convert there
1903 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1904
ef199164 1905 res = n - outbuf;
adb45366 1906
49dd9820
VS
1907 // NB: iconv was given only wcslen(psz) characters on input, and so
1908 // it couldn't convert the trailing zero. Let's do it ourselves
1909 // if there's some room left for it in the output buffer.
1910 if (res < n)
1911 buf[0] = 0;
36acb880
VZ
1912 }
1913 else
1914 {
ef199164 1915 // no destination buffer: convert using temp buffer
36acb880
VZ
1916 // to calculate destination buffer requirement
1917 char tbuf[16];
1918 res = 0;
ef199164
DS
1919 do
1920 {
1921 buf = tbuf;
1922 outbuf = 16;
36acb880
VZ
1923
1924 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1925
36acb880 1926 res += 16 - outbuf;
ef199164
DS
1927 }
1928 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 1929 }
dccce9ea 1930
36acb880
VZ
1931 if (ms_wcNeedsSwap)
1932 {
1933 free(tmpbuf);
1934 }
dccce9ea 1935
36acb880
VZ
1936 if (ICONV_FAILED(cres, inbuf))
1937 {
ce6f8d6f 1938 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 1939 return wxCONV_FAILED;
36acb880
VZ
1940 }
1941
1942 return res;
1943}
1944
7ef3ab50 1945size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 1946{
c1464d9d 1947 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
1948 {
1949 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1950
1951#if wxUSE_THREADS
1952 // NB: explained in MB2WC
1953 wxMutexLocker lock(self->m_iconvMutex);
1954#endif
1955
356410fc 1956 wchar_t *wnul = L"";
c1464d9d 1957 char buf[8]; // should be enough for NUL in any encoding
356410fc 1958 size_t inLen = sizeof(wchar_t),
c1464d9d 1959 outLen = WXSIZEOF(buf);
ef199164
DS
1960 char *inBuff = (char *)wnul;
1961 char *outBuff = buf;
1962 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 1963 {
c1464d9d 1964 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
1965 }
1966 else // ok
1967 {
ef199164 1968 self->m_minMBCharWidth = outBuff - buf;
356410fc 1969 }
eec47cc6
VZ
1970 }
1971
c1464d9d 1972 return m_minMBCharWidth;
eec47cc6
VZ
1973}
1974
b040e242 1975#endif // HAVE_ICONV
36acb880 1976
e95354ec 1977
36acb880
VZ
1978// ============================================================================
1979// Win32 conversion classes
1980// ============================================================================
1cd52418 1981
e95354ec 1982#ifdef wxHAVE_WIN32_MB2WC
373658eb 1983
8b04d4c4 1984// from utils.cpp
d775fa82 1985#if wxUSE_FONTMAP
8b04d4c4
VZ
1986extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1987extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1988#endif
373658eb 1989
e95354ec 1990class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1991{
1992public:
bde4baac
VZ
1993 wxMBConv_win32()
1994 {
1995 m_CodePage = CP_ACP;
c1464d9d 1996 m_minMBCharWidth = 0;
bde4baac
VZ
1997 }
1998
d36c9347 1999 wxMBConv_win32(const wxMBConv_win32& conv)
1e1c5d62 2000 : wxMBConv()
d36c9347
VZ
2001 {
2002 m_CodePage = conv.m_CodePage;
2003 m_minMBCharWidth = conv.m_minMBCharWidth;
2004 }
2005
7608a683 2006#if wxUSE_FONTMAP
e95354ec 2007 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
2008 {
2009 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2010 m_minMBCharWidth = 0;
bde4baac 2011 }
dccce9ea 2012
e95354ec 2013 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2014 {
2015 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2016 m_minMBCharWidth = 0;
bde4baac 2017 }
eec47cc6 2018#endif // wxUSE_FONTMAP
8b04d4c4 2019
d36c9347 2020 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2021 {
02272c9c
VZ
2022 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2023 // the behaviour is not compatible with the Unix version (using iconv)
2024 // and break the library itself, e.g. wxTextInputStream::NextChar()
2025 // wouldn't work if reading an incomplete MB char didn't result in an
2026 // error
667e5b3e 2027 //
89028980 2028 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2029 // Win XP or newer and it is not supported for UTF-[78] so we always
2030 // use our own conversions in this case. See
89028980
VS
2031 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2032 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2033 if ( m_CodePage == CP_UTF8 )
89028980 2034 {
830f8f11 2035 return wxConvUTF8.MB2WC(buf, psz, n);
89028980 2036 }
830f8f11
VZ
2037
2038 if ( m_CodePage == CP_UTF7 )
2039 {
2040 return wxConvUTF7.MB2WC(buf, psz, n);
2041 }
2042
2043 int flags = 0;
2044 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2045 IsAtLeastWin2kSP4() )
89028980 2046 {
830f8f11 2047 flags = MB_ERR_INVALID_CHARS;
89028980 2048 }
667e5b3e 2049
2b5f62a0
VZ
2050 const size_t len = ::MultiByteToWideChar
2051 (
2052 m_CodePage, // code page
667e5b3e 2053 flags, // flags: fall on error
2b5f62a0
VZ
2054 psz, // input string
2055 -1, // its length (NUL-terminated)
b4da152e 2056 buf, // output string
2b5f62a0
VZ
2057 buf ? n : 0 // size of output buffer
2058 );
89028980
VS
2059 if ( !len )
2060 {
2061 // function totally failed
467e0479 2062 return wxCONV_FAILED;
89028980
VS
2063 }
2064
2065 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2066 // check if we succeeded, by doing a double trip:
2067 if ( !flags && buf )
2068 {
53c174fc
VZ
2069 const size_t mbLen = strlen(psz);
2070 wxCharBuffer mbBuf(mbLen);
89028980
VS
2071 if ( ::WideCharToMultiByte
2072 (
2073 m_CodePage,
2074 0,
2075 buf,
2076 -1,
2077 mbBuf.data(),
53c174fc 2078 mbLen + 1, // size in bytes, not length
89028980
VS
2079 NULL,
2080 NULL
2081 ) == 0 ||
2082 strcmp(mbBuf, psz) != 0 )
2083 {
2084 // we didn't obtain the same thing we started from, hence
2085 // the conversion was lossy and we consider that it failed
467e0479 2086 return wxCONV_FAILED;
89028980
VS
2087 }
2088 }
2b5f62a0 2089
03a991bc
VZ
2090 // note that it returns count of written chars for buf != NULL and size
2091 // of the needed buffer for buf == NULL so in either case the length of
2092 // the string (which never includes the terminating NUL) is one less
89028980 2093 return len - 1;
f1339c56 2094 }
dccce9ea 2095
d36c9347 2096 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2097 {
13dd924a
VZ
2098 /*
2099 we have a problem here: by default, WideCharToMultiByte() may
2100 replace characters unrepresentable in the target code page with bad
2101 quality approximations such as turning "1/2" symbol (U+00BD) into
2102 "1" for the code pages which don't have it and we, obviously, want
2103 to avoid this at any price
d775fa82 2104
13dd924a
VZ
2105 the trouble is that this function does it _silently_, i.e. it won't
2106 even tell us whether it did or not... Win98/2000 and higher provide
2107 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2108 we have to resort to a round trip, i.e. check that converting back
2109 results in the same string -- this is, of course, expensive but
2110 otherwise we simply can't be sure to not garble the data.
2111 */
2112
2113 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2114 // it doesn't work with CJK encodings (which we test for rather roughly
2115 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2116 // supporting it
907173e5
WS
2117 BOOL usedDef wxDUMMY_INITIALIZE(false);
2118 BOOL *pUsedDef;
13dd924a
VZ
2119 int flags;
2120 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2121 {
2122 // it's our lucky day
2123 flags = WC_NO_BEST_FIT_CHARS;
2124 pUsedDef = &usedDef;
2125 }
2126 else // old system or unsupported encoding
2127 {
2128 flags = 0;
2129 pUsedDef = NULL;
2130 }
2131
2b5f62a0
VZ
2132 const size_t len = ::WideCharToMultiByte
2133 (
2134 m_CodePage, // code page
13dd924a
VZ
2135 flags, // either none or no best fit
2136 pwz, // input string
2b5f62a0
VZ
2137 -1, // it is (wide) NUL-terminated
2138 buf, // output buffer
2139 buf ? n : 0, // and its size
2140 NULL, // default "replacement" char
13dd924a 2141 pUsedDef // [out] was it used?
2b5f62a0
VZ
2142 );
2143
13dd924a
VZ
2144 if ( !len )
2145 {
2146 // function totally failed
467e0479 2147 return wxCONV_FAILED;
13dd924a
VZ
2148 }
2149
2150 // if we were really converting, check if we succeeded
2151 if ( buf )
2152 {
2153 if ( flags )
2154 {
2155 // check if the conversion failed, i.e. if any replacements
2156 // were done
2157 if ( usedDef )
467e0479 2158 return wxCONV_FAILED;
13dd924a
VZ
2159 }
2160 else // we must resort to double tripping...
2161 {
2162 wxWCharBuffer wcBuf(n);
467e0479 2163 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
13dd924a
VZ
2164 wcscmp(wcBuf, pwz) != 0 )
2165 {
2166 // we didn't obtain the same thing we started from, hence
2167 // the conversion was lossy and we consider that it failed
467e0479 2168 return wxCONV_FAILED;
13dd924a
VZ
2169 }
2170 }
2171 }
2172
03a991bc 2173 // see the comment above for the reason of "len - 1"
13dd924a 2174 return len - 1;
f1339c56 2175 }
dccce9ea 2176
7ef3ab50
VZ
2177 virtual size_t GetMBNulLen() const
2178 {
2179 if ( m_minMBCharWidth == 0 )
2180 {
2181 int len = ::WideCharToMultiByte
2182 (
2183 m_CodePage, // code page
2184 0, // no flags
2185 L"", // input string
2186 1, // translate just the NUL
2187 NULL, // output buffer
2188 0, // and its size
2189 NULL, // no replacement char
2190 NULL // [out] don't care if it was used
2191 );
2192
2193 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2194 switch ( len )
2195 {
2196 default:
2197 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2198 self->m_minMBCharWidth = (size_t)-1;
2199 break;
7ef3ab50
VZ
2200
2201 case 0:
2202 self->m_minMBCharWidth = (size_t)-1;
2203 break;
2204
2205 case 1:
2206 case 2:
2207 case 4:
2208 self->m_minMBCharWidth = len;
2209 break;
2210 }
2211 }
2212
2213 return m_minMBCharWidth;
2214 }
2215
d36c9347
VZ
2216 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2217
13dd924a
VZ
2218 bool IsOk() const { return m_CodePage != -1; }
2219
2220private:
2221 static bool CanUseNoBestFit()
2222 {
2223 static int s_isWin98Or2k = -1;
2224
2225 if ( s_isWin98Or2k == -1 )
2226 {
2227 int verMaj, verMin;
2228 switch ( wxGetOsVersion(&verMaj, &verMin) )
2229 {
2230 case wxWIN95:
2231 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2232 break;
2233
2234 case wxWINDOWS_NT:
2235 s_isWin98Or2k = verMaj >= 5;
2236 break;
2237
2238 default:
ef199164 2239 // unknown: be conservative by default
13dd924a 2240 s_isWin98Or2k = 0;
ef199164 2241 break;
13dd924a
VZ
2242 }
2243
2244 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2245 }
2246
2247 return s_isWin98Or2k == 1;
2248 }
f1339c56 2249
89028980
VS
2250 static bool IsAtLeastWin2kSP4()
2251 {
8942f83a
WS
2252#ifdef __WXWINCE__
2253 return false;
2254#else
89028980
VS
2255 static int s_isAtLeastWin2kSP4 = -1;
2256
2257 if ( s_isAtLeastWin2kSP4 == -1 )
2258 {
2259 OSVERSIONINFOEX ver;
2260
2261 memset(&ver, 0, sizeof(ver));
2262 ver.dwOSVersionInfoSize = sizeof(ver);
2263 GetVersionEx((OSVERSIONINFO*)&ver);
2264
2265 s_isAtLeastWin2kSP4 =
2266 ((ver.dwMajorVersion > 5) || // Vista+
2267 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2268 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2269 ver.wServicePackMajor >= 4)) // 2000 SP4+
2270 ? 1 : 0;
2271 }
2272
2273 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2274#endif
89028980
VS
2275 }
2276
eec47cc6 2277
c1464d9d 2278 // the code page we're working with
b1d66b54 2279 long m_CodePage;
c1464d9d 2280
7ef3ab50 2281 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2282 // "unknown"
2283 size_t m_minMBCharWidth;
1cd52418 2284};
e95354ec
VZ
2285
2286#endif // wxHAVE_WIN32_MB2WC
2287
f7e98dee
RN
2288// ============================================================================
2289// Cocoa conversion classes
2290// ============================================================================
2291
2292#if defined(__WXCOCOA__)
2293
ef199164
DS
2294// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2295// Strangely enough, internally Core Foundation uses
2296// UTF-32 internally quite a bit - its just not public (yet).
f7e98dee
RN
2297
2298#include <CoreFoundation/CFString.h>
2299#include <CoreFoundation/CFStringEncodingExt.h>
2300
2301CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 2302{
638357a0 2303 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ef199164
DS
2304
2305 switch (encoding)
ecd9653b 2306 {
ef199164
DS
2307 case wxFONTENCODING_DEFAULT :
2308 enc = CFStringGetSystemEncoding();
2309 break ;
2310
ecd9653b
WS
2311 case wxFONTENCODING_ISO8859_1 :
2312 enc = kCFStringEncodingISOLatin1 ;
2313 break ;
2314 case wxFONTENCODING_ISO8859_2 :
2315 enc = kCFStringEncodingISOLatin2;
2316 break ;
2317 case wxFONTENCODING_ISO8859_3 :
2318 enc = kCFStringEncodingISOLatin3 ;
2319 break ;
2320 case wxFONTENCODING_ISO8859_4 :
2321 enc = kCFStringEncodingISOLatin4;
2322 break ;
2323 case wxFONTENCODING_ISO8859_5 :
2324 enc = kCFStringEncodingISOLatinCyrillic;
2325 break ;
2326 case wxFONTENCODING_ISO8859_6 :
2327 enc = kCFStringEncodingISOLatinArabic;
2328 break ;
2329 case wxFONTENCODING_ISO8859_7 :
2330 enc = kCFStringEncodingISOLatinGreek;
2331 break ;
2332 case wxFONTENCODING_ISO8859_8 :
2333 enc = kCFStringEncodingISOLatinHebrew;
2334 break ;
2335 case wxFONTENCODING_ISO8859_9 :
2336 enc = kCFStringEncodingISOLatin5;
2337 break ;
2338 case wxFONTENCODING_ISO8859_10 :
2339 enc = kCFStringEncodingISOLatin6;
2340 break ;
2341 case wxFONTENCODING_ISO8859_11 :
2342 enc = kCFStringEncodingISOLatinThai;
2343 break ;
2344 case wxFONTENCODING_ISO8859_13 :
2345 enc = kCFStringEncodingISOLatin7;
2346 break ;
2347 case wxFONTENCODING_ISO8859_14 :
2348 enc = kCFStringEncodingISOLatin8;
2349 break ;
2350 case wxFONTENCODING_ISO8859_15 :
2351 enc = kCFStringEncodingISOLatin9;
2352 break ;
2353
2354 case wxFONTENCODING_KOI8 :
2355 enc = kCFStringEncodingKOI8_R;
2356 break ;
2357 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2358 enc = kCFStringEncodingDOSRussian;
2359 break ;
2360
2361// case wxFONTENCODING_BULGARIAN :
2362// enc = ;
2363// break ;
2364
2365 case wxFONTENCODING_CP437 :
ef199164 2366 enc = kCFStringEncodingDOSLatinUS ;
ecd9653b
WS
2367 break ;
2368 case wxFONTENCODING_CP850 :
2369 enc = kCFStringEncodingDOSLatin1;
2370 break ;
2371 case wxFONTENCODING_CP852 :
2372 enc = kCFStringEncodingDOSLatin2;
2373 break ;
2374 case wxFONTENCODING_CP855 :
2375 enc = kCFStringEncodingDOSCyrillic;
2376 break ;
2377 case wxFONTENCODING_CP866 :
ef199164 2378 enc = kCFStringEncodingDOSRussian ;
ecd9653b
WS
2379 break ;
2380 case wxFONTENCODING_CP874 :
2381 enc = kCFStringEncodingDOSThai;
2382 break ;
2383 case wxFONTENCODING_CP932 :
2384 enc = kCFStringEncodingDOSJapanese;
2385 break ;
2386 case wxFONTENCODING_CP936 :
ef199164 2387 enc = kCFStringEncodingDOSChineseSimplif ;
ecd9653b
WS
2388 break ;
2389 case wxFONTENCODING_CP949 :
2390 enc = kCFStringEncodingDOSKorean;
2391 break ;
2392 case wxFONTENCODING_CP950 :
2393 enc = kCFStringEncodingDOSChineseTrad;
2394 break ;
ecd9653b
WS
2395 case wxFONTENCODING_CP1250 :
2396 enc = kCFStringEncodingWindowsLatin2;
2397 break ;
2398 case wxFONTENCODING_CP1251 :
ef199164 2399 enc = kCFStringEncodingWindowsCyrillic ;
ecd9653b
WS
2400 break ;
2401 case wxFONTENCODING_CP1252 :
ef199164 2402 enc = kCFStringEncodingWindowsLatin1 ;
ecd9653b
WS
2403 break ;
2404 case wxFONTENCODING_CP1253 :
2405 enc = kCFStringEncodingWindowsGreek;
2406 break ;
2407 case wxFONTENCODING_CP1254 :
2408 enc = kCFStringEncodingWindowsLatin5;
2409 break ;
2410 case wxFONTENCODING_CP1255 :
ef199164 2411 enc = kCFStringEncodingWindowsHebrew ;
ecd9653b
WS
2412 break ;
2413 case wxFONTENCODING_CP1256 :
ef199164 2414 enc = kCFStringEncodingWindowsArabic ;
ecd9653b
WS
2415 break ;
2416 case wxFONTENCODING_CP1257 :
2417 enc = kCFStringEncodingWindowsBalticRim;
2418 break ;
638357a0
RN
2419// This only really encodes to UTF7 (if that) evidently
2420// case wxFONTENCODING_UTF7 :
2421// enc = kCFStringEncodingNonLossyASCII ;
2422// break ;
ecd9653b
WS
2423 case wxFONTENCODING_UTF8 :
2424 enc = kCFStringEncodingUTF8 ;
2425 break ;
2426 case wxFONTENCODING_EUC_JP :
2427 enc = kCFStringEncodingEUC_JP;
2428 break ;
2429 case wxFONTENCODING_UTF16 :
f7e98dee 2430 enc = kCFStringEncodingUnicode ;
ecd9653b 2431 break ;
f7e98dee
RN
2432 case wxFONTENCODING_MACROMAN :
2433 enc = kCFStringEncodingMacRoman ;
2434 break ;
2435 case wxFONTENCODING_MACJAPANESE :
2436 enc = kCFStringEncodingMacJapanese ;
2437 break ;
2438 case wxFONTENCODING_MACCHINESETRAD :
2439 enc = kCFStringEncodingMacChineseTrad ;
2440 break ;
2441 case wxFONTENCODING_MACKOREAN :
2442 enc = kCFStringEncodingMacKorean ;
2443 break ;
2444 case wxFONTENCODING_MACARABIC :
2445 enc = kCFStringEncodingMacArabic ;
2446 break ;
2447 case wxFONTENCODING_MACHEBREW :
2448 enc = kCFStringEncodingMacHebrew ;
2449 break ;
2450 case wxFONTENCODING_MACGREEK :
2451 enc = kCFStringEncodingMacGreek ;
2452 break ;
2453 case wxFONTENCODING_MACCYRILLIC :
2454 enc = kCFStringEncodingMacCyrillic ;
2455 break ;
2456 case wxFONTENCODING_MACDEVANAGARI :
2457 enc = kCFStringEncodingMacDevanagari ;
2458 break ;
2459 case wxFONTENCODING_MACGURMUKHI :
2460 enc = kCFStringEncodingMacGurmukhi ;
2461 break ;
2462 case wxFONTENCODING_MACGUJARATI :
2463 enc = kCFStringEncodingMacGujarati ;
2464 break ;
2465 case wxFONTENCODING_MACORIYA :
2466 enc = kCFStringEncodingMacOriya ;
2467 break ;
2468 case wxFONTENCODING_MACBENGALI :
2469 enc = kCFStringEncodingMacBengali ;
2470 break ;
2471 case wxFONTENCODING_MACTAMIL :
2472 enc = kCFStringEncodingMacTamil ;
2473 break ;
2474 case wxFONTENCODING_MACTELUGU :
2475 enc = kCFStringEncodingMacTelugu ;
2476 break ;
2477 case wxFONTENCODING_MACKANNADA :
2478 enc = kCFStringEncodingMacKannada ;
2479 break ;
2480 case wxFONTENCODING_MACMALAJALAM :
2481 enc = kCFStringEncodingMacMalayalam ;
2482 break ;
2483 case wxFONTENCODING_MACSINHALESE :
2484 enc = kCFStringEncodingMacSinhalese ;
2485 break ;
2486 case wxFONTENCODING_MACBURMESE :
2487 enc = kCFStringEncodingMacBurmese ;
2488 break ;
2489 case wxFONTENCODING_MACKHMER :
2490 enc = kCFStringEncodingMacKhmer ;
2491 break ;
2492 case wxFONTENCODING_MACTHAI :
2493 enc = kCFStringEncodingMacThai ;
2494 break ;
2495 case wxFONTENCODING_MACLAOTIAN :
2496 enc = kCFStringEncodingMacLaotian ;
2497 break ;
2498 case wxFONTENCODING_MACGEORGIAN :
2499 enc = kCFStringEncodingMacGeorgian ;
2500 break ;
2501 case wxFONTENCODING_MACARMENIAN :
2502 enc = kCFStringEncodingMacArmenian ;
2503 break ;
2504 case wxFONTENCODING_MACCHINESESIMP :
2505 enc = kCFStringEncodingMacChineseSimp ;
2506 break ;
2507 case wxFONTENCODING_MACTIBETAN :
2508 enc = kCFStringEncodingMacTibetan ;
2509 break ;
2510 case wxFONTENCODING_MACMONGOLIAN :
2511 enc = kCFStringEncodingMacMongolian ;
2512 break ;
2513 case wxFONTENCODING_MACETHIOPIC :
2514 enc = kCFStringEncodingMacEthiopic ;
2515 break ;
2516 case wxFONTENCODING_MACCENTRALEUR :
2517 enc = kCFStringEncodingMacCentralEurRoman ;
2518 break ;
2519 case wxFONTENCODING_MACVIATNAMESE :
2520 enc = kCFStringEncodingMacVietnamese ;
2521 break ;
2522 case wxFONTENCODING_MACARABICEXT :
2523 enc = kCFStringEncodingMacExtArabic ;
2524 break ;
2525 case wxFONTENCODING_MACSYMBOL :
2526 enc = kCFStringEncodingMacSymbol ;
2527 break ;
2528 case wxFONTENCODING_MACDINGBATS :
2529 enc = kCFStringEncodingMacDingbats ;
2530 break ;
2531 case wxFONTENCODING_MACTURKISH :
2532 enc = kCFStringEncodingMacTurkish ;
2533 break ;
2534 case wxFONTENCODING_MACCROATIAN :
2535 enc = kCFStringEncodingMacCroatian ;
2536 break ;
2537 case wxFONTENCODING_MACICELANDIC :
2538 enc = kCFStringEncodingMacIcelandic ;
2539 break ;
2540 case wxFONTENCODING_MACROMANIAN :
2541 enc = kCFStringEncodingMacRomanian ;
2542 break ;
2543 case wxFONTENCODING_MACCELTIC :
2544 enc = kCFStringEncodingMacCeltic ;
2545 break ;
2546 case wxFONTENCODING_MACGAELIC :
2547 enc = kCFStringEncodingMacGaelic ;
2548 break ;
ecd9653b
WS
2549// case wxFONTENCODING_MACKEYBOARD :
2550// enc = kCFStringEncodingMacKeyboardGlyphs ;
2551// break ;
ef199164 2552
ecd9653b
WS
2553 default :
2554 // because gcc is picky
2555 break ;
ef199164
DS
2556 }
2557
ecd9653b 2558 return enc ;
f7e98dee
RN
2559}
2560
f7e98dee
RN
2561class wxMBConv_cocoa : public wxMBConv
2562{
2563public:
2564 wxMBConv_cocoa()
2565 {
2566 Init(CFStringGetSystemEncoding()) ;
2567 }
2568
d36c9347
VZ
2569 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2570 {
2571 m_encoding = conv.m_encoding;
2572 }
2573
a6900d10 2574#if wxUSE_FONTMAP
f7e98dee
RN
2575 wxMBConv_cocoa(const wxChar* name)
2576 {
267e11c5 2577 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
f7e98dee 2578 }
a6900d10 2579#endif
f7e98dee
RN
2580
2581 wxMBConv_cocoa(wxFontEncoding encoding)
2582 {
2583 Init( wxCFStringEncFromFontEnc(encoding) );
2584 }
2585
2586 ~wxMBConv_cocoa()
2587 {
2588 }
2589
2590 void Init( CFStringEncoding encoding)
2591 {
638357a0 2592 m_encoding = encoding ;
f7e98dee
RN
2593 }
2594
2595 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2596 {
2597 wxASSERT(szUnConv);
ecd9653b 2598
638357a0
RN
2599 CFStringRef theString = CFStringCreateWithBytes (
2600 NULL, //the allocator
2601 (const UInt8*)szUnConv,
2602 strlen(szUnConv),
2603 m_encoding,
2604 false //no BOM/external representation
f7e98dee
RN
2605 );
2606
2607 wxASSERT(theString);
2608
638357a0
RN
2609 size_t nOutLength = CFStringGetLength(theString);
2610
2611 if (szOut == NULL)
f7e98dee 2612 {
f7e98dee 2613 CFRelease(theString);
638357a0 2614 return nOutLength;
f7e98dee 2615 }
ecd9653b 2616
638357a0 2617 CFRange theRange = { 0, nOutSize };
ecd9653b 2618
638357a0
RN
2619#if SIZEOF_WCHAR_T == 4
2620 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2621#endif
3698ae71 2622
f7e98dee 2623 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
3698ae71 2624
f7e98dee 2625 CFRelease(theString);
ecd9653b 2626
ef199164 2627 szUniCharBuffer[nOutLength] = '\0';
f7e98dee
RN
2628
2629#if SIZEOF_WCHAR_T == 4
ef199164
DS
2630 wxMBConvUTF16 converter;
2631 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2632 delete [] szUniCharBuffer;
f7e98dee 2633#endif
3698ae71 2634
638357a0 2635 return nOutLength;
f7e98dee
RN
2636 }
2637
2638 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2639 {
638357a0 2640 wxASSERT(szUnConv);
3698ae71 2641
f7e98dee 2642 size_t nRealOutSize;
638357a0 2643 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 2644 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 2645
f7e98dee 2646#if SIZEOF_WCHAR_T == 4
d9d488cf 2647 wxMBConvUTF16 converter ;
ef199164
DS
2648 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2649 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2650 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
f7e98dee 2651 nBufSize /= sizeof(UniChar);
f7e98dee
RN
2652#endif
2653
2654 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2655 NULL, //allocator
2656 szUniBuffer,
2657 nBufSize,
638357a0 2658 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 2659 );
ecd9653b 2660
f7e98dee 2661 wxASSERT(theString);
ecd9653b 2662
f7e98dee 2663 //Note that CER puts a BOM when converting to unicode
638357a0
RN
2664 //so we check and use getchars instead in that case
2665 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 2666 {
638357a0
RN
2667 if (szOut != NULL)
2668 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
3698ae71 2669
638357a0
RN
2670 nRealOutSize = CFStringGetLength(theString) + 1;
2671 }
2672 else
2673 {
2674 CFStringGetBytes(
2675 theString,
2676 CFRangeMake(0, CFStringGetLength(theString)),
2677 m_encoding,
2678 0, //what to put in characters that can't be converted -
2679 //0 tells CFString to return NULL if it meets such a character
2680 false, //not an external representation
2681 (UInt8*) szOut,
3698ae71 2682 nOutSize,
638357a0
RN
2683 (CFIndex*) &nRealOutSize
2684 );
f7e98dee 2685 }
ecd9653b 2686
638357a0 2687 CFRelease(theString);
ecd9653b 2688
638357a0
RN
2689#if SIZEOF_WCHAR_T == 4
2690 delete[] szUniBuffer;
2691#endif
ecd9653b 2692
f7e98dee
RN
2693 return nRealOutSize - 1;
2694 }
2695
d36c9347
VZ
2696 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2697
f7e98dee 2698 bool IsOk() const
ecd9653b 2699 {
3698ae71 2700 return m_encoding != kCFStringEncodingInvalidId &&
638357a0 2701 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
2702 }
2703
2704private:
638357a0 2705 CFStringEncoding m_encoding ;
f7e98dee
RN
2706};
2707
2708#endif // defined(__WXCOCOA__)
2709
335d31e0
SC
2710// ============================================================================
2711// Mac conversion classes
2712// ============================================================================
2713
2714#if defined(__WXMAC__) && defined(TARGET_CARBON)
2715
2716class wxMBConv_mac : public wxMBConv
2717{
2718public:
2719 wxMBConv_mac()
2720 {
2721 Init(CFStringGetSystemEncoding()) ;
2722 }
2723
d36c9347
VZ
2724 wxMBConv_mac(const wxMBConv_mac& conv)
2725 {
2726 Init(conv.m_char_encoding);
2727 }
2728
2d1659cf 2729#if wxUSE_FONTMAP
335d31e0
SC
2730 wxMBConv_mac(const wxChar* name)
2731 {
ef199164 2732 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
335d31e0 2733 }
2d1659cf 2734#endif
335d31e0
SC
2735
2736 wxMBConv_mac(wxFontEncoding encoding)
2737 {
d775fa82
WS
2738 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2739 }
2740
2741 ~wxMBConv_mac()
2742 {
2743 OSStatus status = noErr ;
2744 status = TECDisposeConverter(m_MB2WC_converter);
2745 status = TECDisposeConverter(m_WC2MB_converter);
2746 }
2747
2748
2749 void Init( TextEncodingBase encoding)
2750 {
2751 OSStatus status = noErr ;
2752 m_char_encoding = encoding ;
ef199164 2753 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
d775fa82
WS
2754
2755 status = TECCreateConverter(&m_MB2WC_converter,
2756 m_char_encoding,
2757 m_unicode_encoding);
2758 status = TECCreateConverter(&m_WC2MB_converter,
2759 m_unicode_encoding,
2760 m_char_encoding);
2761 }
2762
335d31e0
SC
2763 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2764 {
d775fa82
WS
2765 OSStatus status = noErr ;
2766 ByteCount byteOutLen ;
9088c87b 2767 ByteCount byteInLen = strlen(psz) + 1;
d775fa82
WS
2768 wchar_t *tbuf = NULL ;
2769 UniChar* ubuf = NULL ;
2770 size_t res = 0 ;
2771
2772 if (buf == NULL)
2773 {
ef199164
DS
2774 // Apple specs say at least 32
2775 n = wxMax( 32, byteInLen ) ;
2776 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
d775fa82 2777 }
ef199164 2778
d775fa82 2779 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
ef199164 2780
f3a355ce 2781#if SIZEOF_WCHAR_T == 4
d775fa82 2782 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2783#else
d775fa82 2784 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2785#endif
ef199164
DS
2786
2787 status = TECConvertText(
2788 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2789 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2790
f3a355ce 2791#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2792 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2793 // is not properly terminated we get random characters at the end
2794 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d9d488cf 2795 wxMBConvUTF16 converter ;
ef199164 2796 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
d775fa82 2797 free( ubuf ) ;
f3a355ce 2798#else
d775fa82 2799 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2800#endif
ef199164 2801
d775fa82
WS
2802 if ( buf == NULL )
2803 free(tbuf) ;
335d31e0 2804
335d31e0
SC
2805 if ( buf && res < n)
2806 buf[res] = 0;
2807
d775fa82 2808 return res ;
335d31e0
SC
2809 }
2810
2811 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2812 {
2813 OSStatus status = noErr ;
2814 ByteCount byteOutLen ;
2815 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2816
2817 char *tbuf = NULL ;
2818
2819 if (buf == NULL)
2820 {
ef199164
DS
2821 // Apple specs say at least 32
2822 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2823 tbuf = (char*) malloc( n ) ;
2824 }
2825
2826 ByteCount byteBufferLen = n ;
2827 UniChar* ubuf = NULL ;
ef199164 2828
f3a355ce 2829#if SIZEOF_WCHAR_T == 4
d9d488cf 2830 wxMBConvUTF16 converter ;
ef199164 2831 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
d775fa82
WS
2832 byteInLen = unicharlen ;
2833 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
ef199164 2834 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
f3a355ce 2835#else
d775fa82 2836 ubuf = (UniChar*) psz ;
f3a355ce 2837#endif
ef199164
DS
2838
2839 status = TECConvertText(
2840 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2841 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2842
f3a355ce 2843#if SIZEOF_WCHAR_T == 4
d775fa82 2844 free( ubuf ) ;
f3a355ce 2845#endif
ef199164 2846
d775fa82
WS
2847 if ( buf == NULL )
2848 free(tbuf) ;
335d31e0 2849
d775fa82 2850 size_t res = byteOutLen ;
335d31e0 2851 if ( buf && res < n)
638357a0 2852 {
335d31e0 2853 buf[res] = 0;
3698ae71 2854
638357a0
RN
2855 //we need to double-trip to verify it didn't insert any ? in place
2856 //of bogus characters
2857 wxWCharBuffer wcBuf(n);
2858 size_t pszlen = wxWcslen(psz);
467e0479 2859 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
638357a0
RN
2860 wxWcslen(wcBuf) != pszlen ||
2861 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2862 {
2863 // we didn't obtain the same thing we started from, hence
2864 // the conversion was lossy and we consider that it failed
467e0479 2865 return wxCONV_FAILED;
638357a0
RN
2866 }
2867 }
335d31e0 2868
d775fa82 2869 return res ;
335d31e0
SC
2870 }
2871
d3478e2c 2872 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
d36c9347 2873
335d31e0 2874 bool IsOk() const
ef199164 2875 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
335d31e0
SC
2876
2877private:
ef199164
DS
2878 TECObjectRef m_MB2WC_converter;
2879 TECObjectRef m_WC2MB_converter;
d775fa82 2880
ef199164
DS
2881 TextEncodingBase m_char_encoding;
2882 TextEncodingBase m_unicode_encoding;
335d31e0
SC
2883};
2884
2885#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2886
36acb880
VZ
2887// ============================================================================
2888// wxEncodingConverter based conversion classes
2889// ============================================================================
2890
1e6feb95 2891#if wxUSE_FONTMAP
1cd52418 2892
e95354ec 2893class wxMBConv_wxwin : public wxMBConv
1cd52418 2894{
8b04d4c4
VZ
2895private:
2896 void Init()
2897 {
2898 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2899 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2900 }
2901
6001e347 2902public:
f1339c56
RR
2903 // temporarily just use wxEncodingConverter stuff,
2904 // so that it works while a better implementation is built
e95354ec 2905 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2906 {
2907 if (name)
267e11c5 2908 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2909 else
2910 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2911
8b04d4c4
VZ
2912 Init();
2913 }
2914
e95354ec 2915 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2916 {
2917 m_enc = enc;
2918
2919 Init();
f1339c56 2920 }
dccce9ea 2921
bde4baac 2922 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2923 {
2924 size_t inbuf = strlen(psz);
dccce9ea 2925 if (buf)
c643a977 2926 {
ef199164 2927 if (!m2w.Convert(psz, buf))
467e0479 2928 return wxCONV_FAILED;
c643a977 2929 }
f1339c56
RR
2930 return inbuf;
2931 }
dccce9ea 2932
bde4baac 2933 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2934 {
f8d791e0 2935 const size_t inbuf = wxWcslen(psz);
f1339c56 2936 if (buf)
c643a977 2937 {
ef199164 2938 if (!w2m.Convert(psz, buf))
467e0479 2939 return wxCONV_FAILED;
c643a977 2940 }
dccce9ea 2941
f1339c56
RR
2942 return inbuf;
2943 }
dccce9ea 2944
7ef3ab50 2945 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2946 {
2947 switch ( m_enc )
2948 {
2949 case wxFONTENCODING_UTF16BE:
2950 case wxFONTENCODING_UTF16LE:
c1464d9d 2951 return 2;
eec47cc6
VZ
2952
2953 case wxFONTENCODING_UTF32BE:
2954 case wxFONTENCODING_UTF32LE:
c1464d9d 2955 return 4;
eec47cc6
VZ
2956
2957 default:
c1464d9d 2958 return 1;
eec47cc6
VZ
2959 }
2960 }
2961
d36c9347
VZ
2962 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2963
7ef3ab50
VZ
2964 bool IsOk() const { return m_ok; }
2965
2966public:
2967 wxFontEncoding m_enc;
2968 wxEncodingConverter m2w, w2m;
2969
2970private:
cafbf6fb
VZ
2971 // were we initialized successfully?
2972 bool m_ok;
fc7a2a60 2973
e95354ec 2974 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2975};
6001e347 2976
8f115891
MW
2977// make the constructors available for unit testing
2978WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2979{
2980 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2981 if ( !result->IsOk() )
2982 {
2983 delete result;
2984 return 0;
2985 }
ef199164 2986
8f115891
MW
2987 return result;
2988}
2989
1e6feb95
VZ
2990#endif // wxUSE_FONTMAP
2991
36acb880
VZ
2992// ============================================================================
2993// wxCSConv implementation
2994// ============================================================================
2995
8b04d4c4 2996void wxCSConv::Init()
6001e347 2997{
e95354ec
VZ
2998 m_name = NULL;
2999 m_convReal = NULL;
3000 m_deferred = true;
3001}
3002
8b04d4c4
VZ
3003wxCSConv::wxCSConv(const wxChar *charset)
3004{
3005 Init();
82713003 3006
e95354ec
VZ
3007 if ( charset )
3008 {
e95354ec
VZ
3009 SetName(charset);
3010 }
bda3d86a 3011
e4277538
VZ
3012#if wxUSE_FONTMAP
3013 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3014#else
bda3d86a 3015 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 3016#endif
6001e347
RR
3017}
3018
8b04d4c4
VZ
3019wxCSConv::wxCSConv(wxFontEncoding encoding)
3020{
bda3d86a 3021 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
3022 {
3023 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3024
3025 encoding = wxFONTENCODING_SYSTEM;
3026 }
3027
8b04d4c4
VZ
3028 Init();
3029
bda3d86a 3030 m_encoding = encoding;
8b04d4c4
VZ
3031}
3032
6001e347
RR
3033wxCSConv::~wxCSConv()
3034{
65e50848
JS
3035 Clear();
3036}
3037
54380f29 3038wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 3039 : wxMBConv()
54380f29 3040{
8b04d4c4
VZ
3041 Init();
3042
54380f29 3043 SetName(conv.m_name);
8b04d4c4 3044 m_encoding = conv.m_encoding;
54380f29
GD
3045}
3046
3047wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3048{
3049 Clear();
8b04d4c4 3050
54380f29 3051 SetName(conv.m_name);
8b04d4c4
VZ
3052 m_encoding = conv.m_encoding;
3053
54380f29
GD
3054 return *this;
3055}
3056
65e50848
JS
3057void wxCSConv::Clear()
3058{
8b04d4c4 3059 free(m_name);
e95354ec 3060 delete m_convReal;
8b04d4c4 3061
65e50848 3062 m_name = NULL;
e95354ec 3063 m_convReal = NULL;
6001e347
RR
3064}
3065
3066void wxCSConv::SetName(const wxChar *charset)
3067{
f1339c56
RR
3068 if (charset)
3069 {
3070 m_name = wxStrdup(charset);
e95354ec 3071 m_deferred = true;
f1339c56 3072 }
6001e347
RR
3073}
3074
8b3eb85d
VZ
3075#if wxUSE_FONTMAP
3076#include "wx/hashmap.h"
3077
3078WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 3079 wxEncodingNameCache );
8b3eb85d
VZ
3080
3081static wxEncodingNameCache gs_nameCache;
3082#endif
3083
e95354ec
VZ
3084wxMBConv *wxCSConv::DoCreate() const
3085{
ce6f8d6f
VZ
3086#if wxUSE_FONTMAP
3087 wxLogTrace(TRACE_STRCONV,
3088 wxT("creating conversion for %s"),
3089 (m_name ? m_name
3090 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3091#endif // wxUSE_FONTMAP
3092
c547282d
VZ
3093 // check for the special case of ASCII or ISO8859-1 charset: as we have
3094 // special knowledge of it anyhow, we don't need to create a special
3095 // conversion object
e4277538
VZ
3096 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3097 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 3098 {
e95354ec
VZ
3099 // don't convert at all
3100 return NULL;
3101 }
dccce9ea 3102
e95354ec
VZ
3103 // we trust OS to do conversion better than we can so try external
3104 // conversion methods first
3105 //
3106 // the full order is:
3107 // 1. OS conversion (iconv() under Unix or Win32 API)
3108 // 2. hard coded conversions for UTF
3109 // 3. wxEncodingConverter as fall back
3110
3111 // step (1)
3112#ifdef HAVE_ICONV
c547282d 3113#if !wxUSE_FONTMAP
e95354ec 3114 if ( m_name )
c547282d 3115#endif // !wxUSE_FONTMAP
e95354ec 3116 {
c547282d 3117 wxString name(m_name);
8b3eb85d
VZ
3118 wxFontEncoding encoding(m_encoding);
3119
3120 if ( !name.empty() )
3121 {
3122 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3123 if ( conv->IsOk() )
3124 return conv;
3125
3126 delete conv;
c547282d
VZ
3127
3128#if wxUSE_FONTMAP
8b3eb85d
VZ
3129 encoding =
3130 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
c547282d 3131#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3132 }
3133#if wxUSE_FONTMAP
3134 {
3135 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3136 if ( it != gs_nameCache.end() )
3137 {
3138 if ( it->second.empty() )
3139 return NULL;
c547282d 3140
8b3eb85d
VZ
3141 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3142 if ( conv->IsOk() )
3143 return conv;
e95354ec 3144
8b3eb85d
VZ
3145 delete conv;
3146 }
3147
3148 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3149
3150 for ( ; *names; ++names )
3151 {
3152 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3153 if ( conv->IsOk() )
3154 {
3155 gs_nameCache[encoding] = *names;
3156 return conv;
3157 }
3158
3159 delete conv;
3160 }
3161
40711af8 3162 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d
VZ
3163 }
3164#endif // wxUSE_FONTMAP
e95354ec
VZ
3165 }
3166#endif // HAVE_ICONV
3167
3168#ifdef wxHAVE_WIN32_MB2WC
3169 {
7608a683 3170#if wxUSE_FONTMAP
e95354ec
VZ
3171 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3172 : new wxMBConv_win32(m_encoding);
3173 if ( conv->IsOk() )
3174 return conv;
3175
3176 delete conv;
7608a683
WS
3177#else
3178 return NULL;
3179#endif
e95354ec
VZ
3180 }
3181#endif // wxHAVE_WIN32_MB2WC
ef199164 3182
d775fa82
WS
3183#if defined(__WXMAC__)
3184 {
5c3c8676 3185 // leave UTF16 and UTF32 to the built-ins of wx
3698ae71 3186 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
5c3c8676 3187 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
d775fa82 3188 {
2d1659cf 3189#if wxUSE_FONTMAP
d775fa82
WS
3190 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3191 : new wxMBConv_mac(m_encoding);
2d1659cf
RN
3192#else
3193 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3194#endif
d775fa82 3195 if ( conv->IsOk() )
f7e98dee
RN
3196 return conv;
3197
3198 delete conv;
3199 }
3200 }
3201#endif
ef199164 3202
f7e98dee
RN
3203#if defined(__WXCOCOA__)
3204 {
3205 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3206 {
a6900d10 3207#if wxUSE_FONTMAP
f7e98dee
RN
3208 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3209 : new wxMBConv_cocoa(m_encoding);
a6900d10
RN
3210#else
3211 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3212#endif
ef199164 3213
f7e98dee 3214 if ( conv->IsOk() )
d775fa82
WS
3215 return conv;
3216
3217 delete conv;
3218 }
335d31e0
SC
3219 }
3220#endif
e95354ec
VZ
3221 // step (2)
3222 wxFontEncoding enc = m_encoding;
3223#if wxUSE_FONTMAP
c547282d
VZ
3224 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3225 {
3226 // use "false" to suppress interactive dialogs -- we can be called from
3227 // anywhere and popping up a dialog from here is the last thing we want to
3228 // do
267e11c5 3229 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3230 }
e95354ec
VZ
3231#endif // wxUSE_FONTMAP
3232
3233 switch ( enc )
3234 {
3235 case wxFONTENCODING_UTF7:
3236 return new wxMBConvUTF7;
3237
3238 case wxFONTENCODING_UTF8:
3239 return new wxMBConvUTF8;
3240
e95354ec
VZ
3241 case wxFONTENCODING_UTF16BE:
3242 return new wxMBConvUTF16BE;
3243
3244 case wxFONTENCODING_UTF16LE:
3245 return new wxMBConvUTF16LE;
3246
e95354ec
VZ
3247 case wxFONTENCODING_UTF32BE:
3248 return new wxMBConvUTF32BE;
3249
3250 case wxFONTENCODING_UTF32LE:
3251 return new wxMBConvUTF32LE;
3252
3253 default:
3254 // nothing to do but put here to suppress gcc warnings
ef199164 3255 break;
e95354ec
VZ
3256 }
3257
3258 // step (3)
3259#if wxUSE_FONTMAP
3260 {
3261 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3262 : new wxMBConv_wxwin(m_encoding);
3263 if ( conv->IsOk() )
3264 return conv;
3265
3266 delete conv;
3267 }
3268#endif // wxUSE_FONTMAP
3269
a58d4f4d
VS
3270 // NB: This is a hack to prevent deadlock. What could otherwise happen
3271 // in Unicode build: wxConvLocal creation ends up being here
3272 // because of some failure and logs the error. But wxLog will try to
3273 // attach timestamp, for which it will need wxConvLocal (to convert
3274 // time to char* and then wchar_t*), but that fails, tries to log
3275 // error, but wxLog has a (already locked) critical section that
3276 // guards static buffer.
3277 static bool alreadyLoggingError = false;
3278 if (!alreadyLoggingError)
3279 {
3280 alreadyLoggingError = true;
3281 wxLogError(_("Cannot convert from the charset '%s'!"),
3282 m_name ? m_name
e95354ec
VZ
3283 :
3284#if wxUSE_FONTMAP
267e11c5 3285 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
e95354ec
VZ
3286#else // !wxUSE_FONTMAP
3287 wxString::Format(_("encoding %s"), m_encoding).c_str()
3288#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3289 );
ef199164 3290
a58d4f4d
VS
3291 alreadyLoggingError = false;
3292 }
e95354ec
VZ
3293
3294 return NULL;
3295}
3296
3297void wxCSConv::CreateConvIfNeeded() const
3298{
3299 if ( m_deferred )
3300 {
3301 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
3302
3303#if wxUSE_INTL
3304 // if we don't have neither the name nor the encoding, use the default
3305 // encoding for this system
3306 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3307 {
4d312c22 3308 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
3309 }
3310#endif // wxUSE_INTL
3311
e95354ec
VZ
3312 self->m_convReal = DoCreate();
3313 self->m_deferred = false;
6001e347 3314 }
6001e347
RR
3315}
3316
3317size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3318{
e95354ec 3319 CreateConvIfNeeded();
dccce9ea 3320
e95354ec
VZ
3321 if (m_convReal)
3322 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3323
3324 // latin-1 (direct)
4def3b35 3325 size_t len = strlen(psz);
dccce9ea 3326
f1339c56
RR
3327 if (buf)
3328 {
4def3b35 3329 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3330 buf[c] = (unsigned char)(psz[c]);
3331 }
dccce9ea 3332
f1339c56 3333 return len;
6001e347
RR
3334}
3335
3336size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3337{
e95354ec 3338 CreateConvIfNeeded();
dccce9ea 3339
e95354ec
VZ
3340 if (m_convReal)
3341 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3342
f1339c56 3343 // latin-1 (direct)
f8d791e0 3344 const size_t len = wxWcslen(psz);
f1339c56
RR
3345 if (buf)
3346 {
4def3b35 3347 for (size_t c = 0; c <= len; c++)
24642831
VS
3348 {
3349 if (psz[c] > 0xFF)
467e0479 3350 return wxCONV_FAILED;
ef199164 3351
907173e5 3352 buf[c] = (char)psz[c];
24642831
VS
3353 }
3354 }
3355 else
3356 {
3357 for (size_t c = 0; c <= len; c++)
3358 {
3359 if (psz[c] > 0xFF)
467e0479 3360 return wxCONV_FAILED;
24642831 3361 }
f1339c56 3362 }
dccce9ea 3363
f1339c56 3364 return len;
6001e347
RR
3365}
3366
7ef3ab50 3367size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3368{
3369 CreateConvIfNeeded();
3370
3371 if ( m_convReal )
3372 {
7ef3ab50 3373 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3374 }
3375
c1464d9d 3376 return 1;
eec47cc6
VZ
3377}
3378
bde4baac
VZ
3379// ----------------------------------------------------------------------------
3380// globals
3381// ----------------------------------------------------------------------------
3382
// The converter objects themselves are file-local statics; the rest of the
// library only sees them through the exported references and pointers
// defined below.

// object behind wxConvLibc: its concrete class depends on the platform
3383#ifdef __WINDOWS__
3384 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
3385#elif defined(__WXMAC__) && !defined(__MACH__)
3386 static wxMBConv_mac wxConvLibcObj ;
bde4baac 3387#else
dcc8fac0 3388 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
3389#endif
3390
// objects behind wxConvLocal, wxConvISO8859_1, wxConvUTF7 and wxConvUTF8
3391static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3392static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3393static wxMBConvUTF7 wxConvUTF7Obj;
3394static wxMBConvUTF8 wxConvUTF8Obj;
c12b7f79 3395
bde4baac
VZ
// exported aliases of the objects above
3396WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3397WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3398WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3399WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3400WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// exported pointers: wxConvCurrent starts as the libc converter and wxConvUI
// as the local one
3401WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
d5bef0a3 3402WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
f5a1953b
VZ
// wxConvFileName points to the UTF-8 converter when __WXOSX__ is defined and
// to the libc converter otherwise
3403WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3404#ifdef __WXOSX__
ea8ce907 3405 wxConvUTF8Obj;
f5a1953b 3406#else
ea8ce907 3407 wxConvLibcObj;
f5a1953b
VZ
3408#endif
3409
bde4baac
VZ
3410#else // !wxUSE_WCHAR_T
3411
3412// stand-ins in absence of wchar_t
3413WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3414 wxConvISO8859_1,
3415 wxConvLocal,
3416 wxConvUTF8;
3417
3418#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
3417
3418#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T