]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
use wxTE_PROCESS_ENTER for the text control part
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
6001e347
RR
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
373658eb
VZ
18#ifndef WX_PRECOMP
19 #include "wx/intl.h"
20 #include "wx/log.h"
de6185e2 21 #include "wx/utils.h"
ef199164 22#endif
373658eb 23
bde4baac
VZ
24#include "wx/strconv.h"
25
26#if wxUSE_WCHAR_T
27
7608a683 28#ifdef __WINDOWS__
532d575b 29 #include "wx/msw/private.h"
13dd924a 30 #include "wx/msw/missing.h"
0a1c1e62
GRG
31#endif
32
1c193821 33#ifndef __WXWINCE__
1cd52418 34#include <errno.h>
1c193821
JS
35#endif
36
6001e347
RR
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
e95354ec
VZ
41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #define wxHAVE_WIN32_MB2WC
ef199164 43#endif
e95354ec 44
6001e347 45#ifdef __SALFORDC__
373658eb 46 #include <clib.h>
6001e347
RR
47#endif
48
b040e242 49#ifdef HAVE_ICONV
373658eb 50 #include <iconv.h>
b1d547eb 51 #include "wx/thread.h"
1cd52418 52#endif
1cd52418 53
373658eb
VZ
54#include "wx/encconv.h"
55#include "wx/fontmap.h"
56
335d31e0 57#ifdef __WXMAC__
40ba2f3b 58#ifndef __DARWIN__
4227afa4
SC
59#include <ATSUnicode.h>
60#include <TextCommon.h>
61#include <TextEncodingConverter.h>
40ba2f3b 62#endif
335d31e0 63
ef199164
DS
64// includes Mac headers
65#include "wx/mac/private.h"
335d31e0 66#endif
ce6f8d6f 67
ef199164 68
ce6f8d6f
VZ
69#define TRACE_STRCONV _T("strconv")
70
467e0479
VZ
71// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
72// be 4 bytes
4948c2b6 73#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
74 #define WC_UTF16
75#endif
76
ef199164 77
373658eb
VZ
78// ============================================================================
79// implementation
80// ============================================================================
81
69373110
VZ
// Helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL and false if all of them are
// NUL (which includes the degenerate case of n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; i++ )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
90
373658eb 91// ----------------------------------------------------------------------------
467e0479 92// UTF-16 en/decoding to/from UCS-4 with surrogates handling
373658eb 93// ----------------------------------------------------------------------------
6001e347 94
c91830cb 95static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 96{
ef199164 97 if (input <= 0xffff)
4def3b35 98 {
999836aa
VZ
99 if (output)
100 *output = (wxUint16) input;
ef199164 101
4def3b35 102 return 1;
dccce9ea 103 }
ef199164 104 else if (input >= 0x110000)
4def3b35 105 {
467e0479 106 return wxCONV_FAILED;
dccce9ea
VZ
107 }
108 else
4def3b35 109 {
dccce9ea 110 if (output)
4def3b35 111 {
ef199164
DS
112 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
113 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
4def3b35 114 }
ef199164 115
4def3b35 116 return 2;
1cd52418 117 }
1cd52418
OK
118}
119
c91830cb 120static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 121{
ef199164 122 if ((*input < 0xd800) || (*input > 0xdfff))
4def3b35
VS
123 {
124 output = *input;
125 return 1;
dccce9ea 126 }
ef199164 127 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
4def3b35
VS
128 {
129 output = *input;
467e0479 130 return wxCONV_FAILED;
dccce9ea
VZ
131 }
132 else
4def3b35
VS
133 {
134 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
135 return 2;
136 }
1cd52418
OK
137}
138
467e0479 139#ifdef WC_UTF16
35d11700
VZ
140 typedef wchar_t wxDecodeSurrogate_t;
141#else // !WC_UTF16
142 typedef wxUint16 wxDecodeSurrogate_t;
143#endif // WC_UTF16/!WC_UTF16
467e0479
VZ
144
145// returns the next UTF-32 character from the wchar_t buffer and advances the
146// pointer to the character after this one
147//
148// if an invalid character is found, *pSrc is set to NULL, the caller must
149// check for this
35d11700 150static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
467e0479
VZ
151{
152 wxUint32 out;
8d3dd069
VZ
153 const size_t
154 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
467e0479
VZ
155 if ( n == wxCONV_FAILED )
156 *pSrc = NULL;
157 else
158 *pSrc += n;
159
160 return out;
161}
162
f6bcfd97 163// ----------------------------------------------------------------------------
6001e347 164// wxMBConv
f6bcfd97 165// ----------------------------------------------------------------------------
2c53a80a 166
483b0434
VZ
167size_t
168wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
169 const char *src, size_t srcLen) const
6001e347 170{
483b0434
VZ
171 // although new conversion classes are supposed to implement this function
172 // directly, the existins ones only implement the old MB2WC() and so, to
173 // avoid to have to rewrite all conversion classes at once, we provide a
174 // default (but not efficient) implementation of this one in terms of the
175 // old function by copying the input to ensure that it's NUL-terminated and
176 // then using MB2WC() to convert it
6001e347 177
483b0434
VZ
178 // the number of chars [which would be] written to dst [if it were not NULL]
179 size_t dstWritten = 0;
eec47cc6 180
c1464d9d 181 // the number of NULs terminating this string
a78c43f1 182 size_t nulLen = 0; // not really needed, but just to avoid warnings
eec47cc6 183
c1464d9d
VZ
184 // if we were not given the input size we just have to assume that the
185 // string is properly terminated as we have no way of knowing how long it
186 // is anyhow, but if we do have the size check whether there are enough
187 // NULs at the end
483b0434
VZ
188 wxCharBuffer bufTmp;
189 const char *srcEnd;
467e0479 190 if ( srcLen != wxNO_LEN )
eec47cc6 191 {
c1464d9d 192 // we need to know how to find the end of this string
7ef3ab50 193 nulLen = GetMBNulLen();
483b0434
VZ
194 if ( nulLen == wxCONV_FAILED )
195 return wxCONV_FAILED;
e4e3bbb4 196
c1464d9d 197 // if there are enough NULs we can avoid the copy
483b0434 198 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
eec47cc6
VZ
199 {
200 // make a copy in order to properly NUL-terminate the string
483b0434 201 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
c1464d9d 202 char * const p = bufTmp.data();
483b0434
VZ
203 memcpy(p, src, srcLen);
204 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
c1464d9d 205 *s = '\0';
483b0434
VZ
206
207 src = bufTmp;
eec47cc6 208 }
e4e3bbb4 209
483b0434
VZ
210 srcEnd = src + srcLen;
211 }
212 else // quit after the first loop iteration
213 {
214 srcEnd = NULL;
215 }
e4e3bbb4 216
483b0434 217 for ( ;; )
eec47cc6 218 {
c1464d9d 219 // try to convert the current chunk
483b0434 220 size_t lenChunk = MB2WC(NULL, src, 0);
483b0434
VZ
221 if ( lenChunk == wxCONV_FAILED )
222 return wxCONV_FAILED;
e4e3bbb4 223
467e0479 224 lenChunk++; // for the L'\0' at the end of this chunk
e4e3bbb4 225
483b0434 226 dstWritten += lenChunk;
f5fb6871 227
467e0479
VZ
228 if ( lenChunk == 1 )
229 {
230 // nothing left in the input string, conversion succeeded
231 break;
232 }
233
483b0434
VZ
234 if ( dst )
235 {
236 if ( dstWritten > dstLen )
237 return wxCONV_FAILED;
238
830f8f11 239 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
483b0434
VZ
240 return wxCONV_FAILED;
241
242 dst += lenChunk;
243 }
c1464d9d 244
483b0434 245 if ( !srcEnd )
c1464d9d 246 {
467e0479
VZ
247 // we convert just one chunk in this case as this is the entire
248 // string anyhow
c1464d9d
VZ
249 break;
250 }
eec47cc6
VZ
251
252 // advance the input pointer past the end of this chunk
483b0434 253 while ( NotAllNULs(src, nulLen) )
c1464d9d
VZ
254 {
255 // notice that we must skip over multiple bytes here as we suppose
256 // that if NUL takes 2 or 4 bytes, then all the other characters do
257 // too and so if advanced by a single byte we might erroneously
258 // detect sequences of NUL bytes in the middle of the input
483b0434 259 src += nulLen;
c1464d9d 260 }
e4e3bbb4 261
483b0434 262 src += nulLen; // skipping over its terminator as well
c1464d9d
VZ
263
264 // note that ">=" (and not just "==") is needed here as the terminator
265 // we skipped just above could be inside or just after the buffer
266 // delimited by inEnd
483b0434 267 if ( src >= srcEnd )
c1464d9d
VZ
268 break;
269 }
270
483b0434 271 return dstWritten;
e4e3bbb4
RN
272}
273
483b0434
VZ
274size_t
275wxMBConv::FromWChar(char *dst, size_t dstLen,
276 const wchar_t *src, size_t srcLen) const
e4e3bbb4 277{
483b0434
VZ
278 // the number of chars [which would be] written to dst [if it were not NULL]
279 size_t dstWritten = 0;
e4e3bbb4 280
eec47cc6
VZ
281 // make a copy of the input string unless it is already properly
282 // NUL-terminated
283 //
284 // if we don't know its length we have no choice but to assume that it is,
285 // indeed, properly terminated
286 wxWCharBuffer bufTmp;
467e0479 287 if ( srcLen == wxNO_LEN )
e4e3bbb4 288 {
483b0434 289 srcLen = wxWcslen(src) + 1;
eec47cc6 290 }
483b0434 291 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
eec47cc6
VZ
292 {
293 // make a copy in order to properly NUL-terminate the string
483b0434 294 bufTmp = wxWCharBuffer(srcLen);
ef199164 295 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
483b0434
VZ
296 src = bufTmp;
297 }
298
299 const size_t lenNul = GetMBNulLen();
300 for ( const wchar_t * const srcEnd = src + srcLen;
301 src < srcEnd;
302 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
303 {
304 // try to convert the current chunk
305 size_t lenChunk = WC2MB(NULL, src, 0);
306
307 if ( lenChunk == wxCONV_FAILED )
308 return wxCONV_FAILED;
309
310 lenChunk += lenNul;
311 dstWritten += lenChunk;
312
313 if ( dst )
314 {
315 if ( dstWritten > dstLen )
316 return wxCONV_FAILED;
317
318 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
319 return wxCONV_FAILED;
320
321 dst += lenChunk;
322 }
eec47cc6 323 }
e4e3bbb4 324
483b0434
VZ
325 return dstWritten;
326}
327
ef199164 328size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
509da451 329{
ef199164 330 size_t rc = ToWChar(outBuff, outLen, inBuff);
467e0479 331 if ( rc != wxCONV_FAILED )
509da451
VZ
332 {
333 // ToWChar() returns the buffer length, i.e. including the trailing
334 // NUL, while this method doesn't take it into account
335 rc--;
336 }
337
338 return rc;
339}
340
ef199164 341size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
509da451 342{
ef199164 343 size_t rc = FromWChar(outBuff, outLen, inBuff);
467e0479 344 if ( rc != wxCONV_FAILED )
509da451
VZ
345 {
346 rc -= GetMBNulLen();
347 }
348
349 return rc;
350}
351
483b0434
VZ
352wxMBConv::~wxMBConv()
353{
354 // nothing to do here (necessary for Darwin linking probably)
355}
e4e3bbb4 356
483b0434
VZ
357const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
358{
359 if ( psz )
eec47cc6 360 {
483b0434
VZ
361 // calculate the length of the buffer needed first
362 const size_t nLen = MB2WC(NULL, psz, 0);
467e0479 363 if ( nLen != wxCONV_FAILED )
f5fb6871 364 {
483b0434
VZ
365 // now do the actual conversion
366 wxWCharBuffer buf(nLen /* +1 added implicitly */);
eec47cc6 367
483b0434
VZ
368 // +1 for the trailing NULL
369 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
370 return buf;
f5fb6871 371 }
483b0434 372 }
e4e3bbb4 373
483b0434
VZ
374 return wxWCharBuffer();
375}
3698ae71 376
483b0434
VZ
377const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
378{
379 if ( pwz )
380 {
381 const size_t nLen = WC2MB(NULL, pwz, 0);
467e0479 382 if ( nLen != wxCONV_FAILED )
483b0434
VZ
383 {
384 // extra space for trailing NUL(s)
385 static const size_t extraLen = GetMaxMBNulLen();
f5fb6871 386
483b0434
VZ
387 wxCharBuffer buf(nLen + extraLen - 1);
388 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
389 return buf;
390 }
391 }
392
393 return wxCharBuffer();
394}
e4e3bbb4 395
483b0434 396const wxWCharBuffer
ef199164 397wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
483b0434 398{
ef199164 399 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
467e0479 400 if ( dstLen != wxCONV_FAILED )
483b0434 401 {
830f8f11 402 wxWCharBuffer wbuf(dstLen - 1);
ef199164 403 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
404 {
405 if ( outLen )
467e0479
VZ
406 {
407 *outLen = dstLen;
408 if ( wbuf[dstLen - 1] == L'\0' )
409 (*outLen)--;
410 }
411
483b0434
VZ
412 return wbuf;
413 }
414 }
415
416 if ( outLen )
417 *outLen = 0;
418
419 return wxWCharBuffer();
420}
421
422const wxCharBuffer
ef199164 423wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
483b0434 424{
13d92ad6 425 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
467e0479 426 if ( dstLen != wxCONV_FAILED )
483b0434 427 {
168a76fe
VZ
428 // special case of empty input: can't allocate 0 size buffer below as
429 // wxCharBuffer insists on NUL-terminating it
430 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
ef199164 431 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
483b0434
VZ
432 {
433 if ( outLen )
467e0479
VZ
434 {
435 *outLen = dstLen;
436
437 const size_t nulLen = GetMBNulLen();
13d92ad6
VZ
438 if ( dstLen >= nulLen &&
439 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
467e0479
VZ
440 {
441 // in this case the output is NUL-terminated and we're not
442 // supposed to count NUL
13d92ad6 443 *outLen -= nulLen;
467e0479
VZ
444 }
445 }
d32a507d 446
483b0434
VZ
447 return buf;
448 }
e4e3bbb4
RN
449 }
450
eec47cc6
VZ
451 if ( outLen )
452 *outLen = 0;
453
454 return wxCharBuffer();
e4e3bbb4
RN
455}
456
6001e347 457// ----------------------------------------------------------------------------
bde4baac 458// wxMBConvLibc
6001e347
RR
459// ----------------------------------------------------------------------------
460
bde4baac
VZ
461size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
462{
463 return wxMB2WC(buf, psz, n);
464}
465
466size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
467{
468 return wxWC2MB(buf, psz, n);
469}
e1bfe89e
RR
470
471// ----------------------------------------------------------------------------
532d575b 472// wxConvBrokenFileNames
e1bfe89e
RR
473// ----------------------------------------------------------------------------
474
eec47cc6
VZ
475#ifdef __UNIX__
476
845905d5 477wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
ea8ce907 478{
845905d5
MW
479 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
480 || wxStricmp(charset, _T("UTF8")) == 0 )
481 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
482 else
483 m_conv = new wxCSConv(charset);
ea8ce907
RR
484}
485
eec47cc6 486#endif // __UNIX__
c12b7f79 487
bde4baac 488// ----------------------------------------------------------------------------
3698ae71 489// UTF-7
bde4baac 490// ----------------------------------------------------------------------------
6001e347 491
15f2ee32 492// Implementation (C) 2004 Fredrik Roubert
6001e347 493
15f2ee32
RN
//
// BASE64 decoding table: maps a byte to the 6 bit value it encodes or to
// 0xff if the byte is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f: control characters
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: only '+' and '/' are valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: non-ASCII bytes are never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
532
533size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
534{
15f2ee32
RN
535 size_t len = 0;
536
04a37834 537 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
538 {
539 unsigned char cc = *psz++;
540 if (cc != '+')
541 {
542 // plain ASCII char
543 if (buf)
544 *buf++ = cc;
545 len++;
546 }
547 else if (*psz == '-')
548 {
549 // encoded plus sign
550 if (buf)
551 *buf++ = cc;
552 len++;
553 psz++;
554 }
04a37834 555 else // start of BASE64 encoded string
15f2ee32 556 {
04a37834 557 bool lsb, ok;
15f2ee32 558 unsigned int d, l;
04a37834
VZ
559 for ( ok = lsb = false, d = 0, l = 0;
560 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
561 psz++ )
15f2ee32
RN
562 {
563 d <<= 6;
564 d += cc;
565 for (l += 6; l >= 8; lsb = !lsb)
566 {
04a37834 567 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
568 if (lsb)
569 {
570 if (buf)
571 *buf++ |= c;
572 len ++;
573 }
574 else
04a37834 575 {
15f2ee32 576 if (buf)
6356d52a 577 *buf = (wchar_t)(c << 8);
04a37834
VZ
578 }
579
580 ok = true;
15f2ee32
RN
581 }
582 }
04a37834
VZ
583
584 if ( !ok )
585 {
586 // in valid UTF7 we should have valid characters after '+'
467e0479 587 return wxCONV_FAILED;
04a37834
VZ
588 }
589
15f2ee32
RN
590 if (*psz == '-')
591 psz++;
592 }
593 }
04a37834
VZ
594
595 if ( buf && (len < n) )
596 *buf = '\0';
597
15f2ee32 598 return len;
6001e347
RR
599}
600
15f2ee32
RN
//
// BASE64 encoding table: maps a 6 bit value to the character encoding it
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
615
//
// UTF-7 encoding table: gives the class of each ASCII character
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    // 0x00..0x1f
    3, 3, 3, 3, 3, 3, 3, 3,  3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    // 0x20..0x3f
    2, 1, 1, 1, 1, 1, 1, 0,  0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 1, 0,
    // 0x40..0x5f
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 3, 1, 1, 1,
    // 0x60..0x7f
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 3, 3
};
635
667e5b3e 636size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 637{
15f2ee32
RN
638 size_t len = 0;
639
640 while (*psz && ((!buf) || (len < n)))
641 {
642 wchar_t cc = *psz++;
643 if (cc < 0x80 && utf7encode[cc] < 1)
644 {
645 // plain ASCII char
646 if (buf)
647 *buf++ = (char)cc;
ef199164 648
15f2ee32
RN
649 len++;
650 }
651#ifndef WC_UTF16
79c78d42 652 else if (((wxUint32)cc) > 0xffff)
b2c13097 653 {
15f2ee32 654 // no surrogate pair generation (yet?)
467e0479 655 return wxCONV_FAILED;
15f2ee32
RN
656 }
657#endif
658 else
659 {
660 if (buf)
661 *buf++ = '+';
ef199164 662
15f2ee32
RN
663 len++;
664 if (cc != '+')
665 {
666 // BASE64 encode string
667 unsigned int lsb, d, l;
73c902d6 668 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
669 {
670 for (lsb = 0; lsb < 2; lsb ++)
671 {
672 d <<= 8;
673 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
674
675 for (l += 8; l >= 6; )
676 {
677 l -= 6;
678 if (buf)
679 *buf++ = utf7enb64[(d >> l) % 64];
680 len++;
681 }
682 }
ef199164 683
15f2ee32
RN
684 cc = *psz;
685 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
686 break;
687 }
ef199164 688
15f2ee32
RN
689 if (l != 0)
690 {
691 if (buf)
692 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
ef199164 693
15f2ee32
RN
694 len++;
695 }
696 }
ef199164 697
15f2ee32
RN
698 if (buf)
699 *buf++ = '-';
700 len++;
701 }
702 }
ef199164 703
15f2ee32
RN
704 if (buf && (len < n))
705 *buf = 0;
ef199164 706
15f2ee32 707 return len;
6001e347
RR
708}
709
f6bcfd97 710// ----------------------------------------------------------------------------
6001e347 711// UTF-8
f6bcfd97 712// ----------------------------------------------------------------------------
6001e347 713
dccce9ea 714static wxUint32 utf8_max[]=
4def3b35 715 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 716
3698ae71
VZ
717// boundaries of the private use area we use to (temporarily) remap invalid
718// characters invalid in a UTF-8 encoded string
ea8ce907
RR
719const wxUint32 wxUnicodePUA = 0x100000;
720const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
721
6001e347
RR
722size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
723{
4def3b35
VS
724 size_t len = 0;
725
dccce9ea 726 while (*psz && ((!buf) || (len < n)))
4def3b35 727 {
ea8ce907
RR
728 const char *opsz = psz;
729 bool invalid = false;
4def3b35
VS
730 unsigned char cc = *psz++, fc = cc;
731 unsigned cnt;
dccce9ea 732 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 733 fc <<= 1;
ef199164 734
dccce9ea 735 if (!cnt)
4def3b35
VS
736 {
737 // plain ASCII char
dccce9ea 738 if (buf)
4def3b35
VS
739 *buf++ = cc;
740 len++;
561488ef
MW
741
742 // escape the escape character for octal escapes
743 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
744 && cc == '\\' && (!buf || len < n))
745 {
746 if (buf)
747 *buf++ = cc;
748 len++;
749 }
dccce9ea
VZ
750 }
751 else
4def3b35
VS
752 {
753 cnt--;
dccce9ea 754 if (!cnt)
4def3b35
VS
755 {
756 // invalid UTF-8 sequence
ea8ce907 757 invalid = true;
dccce9ea
VZ
758 }
759 else
4def3b35
VS
760 {
761 unsigned ocnt = cnt - 1;
762 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 763 while (cnt--)
4def3b35 764 {
ea8ce907 765 cc = *psz;
dccce9ea 766 if ((cc & 0xC0) != 0x80)
4def3b35
VS
767 {
768 // invalid UTF-8 sequence
ea8ce907
RR
769 invalid = true;
770 break;
4def3b35 771 }
ef199164 772
ea8ce907 773 psz++;
4def3b35
VS
774 res = (res << 6) | (cc & 0x3f);
775 }
ef199164 776
ea8ce907 777 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
778 {
779 // illegal UTF-8 encoding
ea8ce907 780 invalid = true;
4def3b35 781 }
ea8ce907
RR
782 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
783 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
784 {
785 // if one of our PUA characters turns up externally
786 // it must also be treated as an illegal sequence
787 // (a bit like you have to escape an escape character)
788 invalid = true;
789 }
790 else
791 {
1cd52418 792#ifdef WC_UTF16
ea8ce907
RR
793 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
794 size_t pa = encode_utf16(res, (wxUint16 *)buf);
467e0479 795 if (pa == wxCONV_FAILED)
ea8ce907
RR
796 {
797 invalid = true;
798 }
799 else
800 {
801 if (buf)
802 buf += pa;
803 len += pa;
804 }
373658eb 805#else // !WC_UTF16
ea8ce907 806 if (buf)
38d4b1e4 807 *buf++ = (wchar_t)res;
ea8ce907 808 len++;
373658eb 809#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
810 }
811 }
ef199164 812
ea8ce907
RR
813 if (invalid)
814 {
815 if (m_options & MAP_INVALID_UTF8_TO_PUA)
816 {
817 while (opsz < psz && (!buf || len < n))
818 {
819#ifdef WC_UTF16
820 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
821 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
467e0479 822 wxASSERT(pa != wxCONV_FAILED);
ea8ce907
RR
823 if (buf)
824 buf += pa;
825 opsz++;
826 len += pa;
827#else
828 if (buf)
38d4b1e4 829 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
830 opsz++;
831 len++;
832#endif
833 }
834 }
3698ae71 835 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
836 {
837 while (opsz < psz && (!buf || len < n))
838 {
3698ae71
VZ
839 if ( buf && len + 3 < n )
840 {
17a1ebd1 841 unsigned char on = *opsz;
3698ae71 842 *buf++ = L'\\';
17a1ebd1
VZ
843 *buf++ = (wchar_t)( L'0' + on / 0100 );
844 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
845 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 846 }
ef199164 847
ea8ce907
RR
848 opsz++;
849 len += 4;
850 }
851 }
3698ae71 852 else // MAP_INVALID_UTF8_NOT
ea8ce907 853 {
467e0479 854 return wxCONV_FAILED;
ea8ce907 855 }
4def3b35
VS
856 }
857 }
6001e347 858 }
ef199164 859
dccce9ea 860 if (buf && (len < n))
4def3b35 861 *buf = 0;
ef199164 862
4def3b35 863 return len;
6001e347
RR
864}
865
3698ae71
VZ
// returns true if wch is an octal digit, i.e. one of L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
870
6001e347
RR
871size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
872{
4def3b35 873 size_t len = 0;
6001e347 874
dccce9ea 875 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
876 {
877 wxUint32 cc;
ef199164 878
1cd52418 879#ifdef WC_UTF16
b5153fd8
VZ
880 // cast is ok for WC_UTF16
881 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
467e0479 882 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1cd52418 883#else
ef199164 884 cc = (*psz++) & 0x7fffffff;
4def3b35 885#endif
3698ae71
VZ
886
887 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
888 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 889 {
dccce9ea 890 if (buf)
ea8ce907 891 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 892 len++;
3698ae71 893 }
561488ef
MW
894 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
895 && cc == L'\\' && psz[0] == L'\\' )
896 {
897 if (buf)
898 *buf++ = (char)cc;
899 psz++;
900 len++;
901 }
3698ae71
VZ
902 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
903 cc == L'\\' &&
904 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 905 {
dccce9ea 906 if (buf)
3698ae71 907 {
ef199164
DS
908 *buf++ = (char) ((psz[0] - L'0') * 0100 +
909 (psz[1] - L'0') * 010 +
b2c13097 910 (psz[2] - L'0'));
3698ae71
VZ
911 }
912
913 psz += 3;
ea8ce907
RR
914 len++;
915 }
916 else
917 {
918 unsigned cnt;
ef199164
DS
919 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
920 {
921 }
922
ea8ce907 923 if (!cnt)
4def3b35 924 {
ea8ce907
RR
925 // plain ASCII char
926 if (buf)
927 *buf++ = (char) cc;
928 len++;
929 }
ea8ce907
RR
930 else
931 {
932 len += cnt + 1;
933 if (buf)
934 {
935 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
936 while (cnt--)
937 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
938 }
4def3b35
VS
939 }
940 }
6001e347 941 }
4def3b35 942
ef199164 943 if (buf && (len < n))
3698ae71 944 *buf = 0;
adb45366 945
4def3b35 946 return len;
6001e347
RR
947}
948
467e0479 949// ============================================================================
c91830cb 950// UTF-16
467e0479 951// ============================================================================
c91830cb
VZ
952
// "straight" is the UTF-16 conversion class whose byte order is the same as
// the native one and "swap" the class whose byte order is the opposite one
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else // little endian
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#endif // WORDS_BIGENDIAN/!WORDS_BIGENDIAN
960
467e0479
VZ
961/* static */
962size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
963{
964 if ( srcLen == wxNO_LEN )
965 {
966 // count the number of bytes in input, including the trailing NULs
ef199164
DS
967 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
968 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 969 ;
c91830cb 970
467e0479
VZ
971 srcLen *= BYTES_PER_CHAR;
972 }
973 else // we already have the length
974 {
975 // we can only convert an entire number of UTF-16 characters
976 if ( srcLen % BYTES_PER_CHAR )
977 return wxCONV_FAILED;
978 }
979
980 return srcLen;
981}
982
983// case when in-memory representation is UTF-16 too
c91830cb
VZ
984#ifdef WC_UTF16
985
467e0479
VZ
986// ----------------------------------------------------------------------------
987// conversions without endianness change
988// ----------------------------------------------------------------------------
989
990size_t
991wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
992 const char *src, size_t srcLen) const
c91830cb 993{
467e0479
VZ
994 // set up the scene for using memcpy() (which is presumably more efficient
995 // than copying the bytes one by one)
996 srcLen = GetLength(src, srcLen);
997 if ( srcLen == wxNO_LEN )
998 return wxCONV_FAILED;
c91830cb 999
ef199164 1000 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1001 if ( dst )
c91830cb 1002 {
467e0479
VZ
1003 if ( dstLen < inLen )
1004 return wxCONV_FAILED;
c91830cb 1005
467e0479 1006 memcpy(dst, src, srcLen);
c91830cb 1007 }
d32a507d 1008
467e0479 1009 return inLen;
c91830cb
VZ
1010}
1011
467e0479
VZ
1012size_t
1013wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1014 const wchar_t *src, size_t srcLen) const
c91830cb 1015{
467e0479
VZ
1016 if ( srcLen == wxNO_LEN )
1017 srcLen = wxWcslen(src) + 1;
c91830cb 1018
467e0479
VZ
1019 srcLen *= BYTES_PER_CHAR;
1020
1021 if ( dst )
c91830cb 1022 {
467e0479
VZ
1023 if ( dstLen < srcLen )
1024 return wxCONV_FAILED;
d32a507d 1025
467e0479 1026 memcpy(dst, src, srcLen);
c91830cb 1027 }
d32a507d 1028
467e0479 1029 return srcLen;
c91830cb
VZ
1030}
1031
467e0479
VZ
1032// ----------------------------------------------------------------------------
1033// endian-reversing conversions
1034// ----------------------------------------------------------------------------
c91830cb 1035
467e0479
VZ
1036size_t
1037wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1038 const char *src, size_t srcLen) const
c91830cb 1039{
467e0479
VZ
1040 srcLen = GetLength(src, srcLen);
1041 if ( srcLen == wxNO_LEN )
1042 return wxCONV_FAILED;
c91830cb 1043
467e0479
VZ
1044 srcLen /= BYTES_PER_CHAR;
1045
1046 if ( dst )
c91830cb 1047 {
467e0479
VZ
1048 if ( dstLen < srcLen )
1049 return wxCONV_FAILED;
1050
ef199164
DS
1051 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1052 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1053 {
ef199164 1054 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1055 }
c91830cb 1056 }
bfab25d4 1057
467e0479 1058 return srcLen;
c91830cb
VZ
1059}
1060
467e0479
VZ
1061size_t
1062wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1063 const wchar_t *src, size_t srcLen) const
c91830cb 1064{
467e0479
VZ
1065 if ( srcLen == wxNO_LEN )
1066 srcLen = wxWcslen(src) + 1;
c91830cb 1067
467e0479
VZ
1068 srcLen *= BYTES_PER_CHAR;
1069
1070 if ( dst )
c91830cb 1071 {
467e0479
VZ
1072 if ( dstLen < srcLen )
1073 return wxCONV_FAILED;
1074
ef199164 1075 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
467e0479 1076 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1077 {
ef199164 1078 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
c91830cb 1079 }
c91830cb 1080 }
eec47cc6 1081
467e0479 1082 return srcLen;
c91830cb
VZ
1083}
1084
467e0479 1085#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1086
467e0479
VZ
1087// ----------------------------------------------------------------------------
1088// conversions without endianness change
1089// ----------------------------------------------------------------------------
c91830cb 1090
35d11700
VZ
1091size_t
1092wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1093 const char *src, size_t srcLen) const
c91830cb 1094{
35d11700
VZ
1095 srcLen = GetLength(src, srcLen);
1096 if ( srcLen == wxNO_LEN )
1097 return wxCONV_FAILED;
c91830cb 1098
ef199164 1099 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700 1100 if ( !dst )
c91830cb 1101 {
35d11700
VZ
1102 // optimization: return maximal space which could be needed for this
1103 // string even if the real size could be smaller if the buffer contains
1104 // any surrogates
1105 return inLen;
c91830cb 1106 }
c91830cb 1107
35d11700 1108 size_t outLen = 0;
ef199164
DS
1109 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1110 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
35d11700 1111 {
ef199164
DS
1112 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1113 if ( !inBuff )
35d11700
VZ
1114 return wxCONV_FAILED;
1115
1116 if ( ++outLen > dstLen )
1117 return wxCONV_FAILED;
c91830cb 1118
35d11700
VZ
1119 *dst++ = ch;
1120 }
1121
1122
1123 return outLen;
1124}
c91830cb 1125
35d11700
VZ
1126size_t
1127wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1128 const wchar_t *src, size_t srcLen) const
c91830cb 1129{
35d11700
VZ
1130 if ( srcLen == wxNO_LEN )
1131 srcLen = wxWcslen(src) + 1;
c91830cb 1132
35d11700 1133 size_t outLen = 0;
ef199164 1134 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1135 for ( size_t n = 0; n < srcLen; n++ )
c91830cb
VZ
1136 {
1137 wxUint16 cc[2];
35d11700
VZ
1138 const size_t numChars = encode_utf16(*src++, cc);
1139 if ( numChars == wxCONV_FAILED )
1140 return wxCONV_FAILED;
c91830cb 1141
ef199164
DS
1142 outLen += numChars * BYTES_PER_CHAR;
1143 if ( outBuff )
c91830cb 1144 {
35d11700
VZ
1145 if ( outLen > dstLen )
1146 return wxCONV_FAILED;
1147
ef199164 1148 *outBuff++ = cc[0];
35d11700 1149 if ( numChars == 2 )
69b80d28 1150 {
35d11700 1151 // second character of a surrogate
ef199164 1152 *outBuff++ = cc[1];
69b80d28 1153 }
c91830cb 1154 }
c91830cb 1155 }
c91830cb 1156
35d11700 1157 return outLen;
c91830cb
VZ
1158}
1159
467e0479
VZ
1160// ----------------------------------------------------------------------------
1161// endian-reversing conversions
1162// ----------------------------------------------------------------------------
c91830cb 1163
35d11700
VZ
1164size_t
1165wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1166 const char *src, size_t srcLen) const
c91830cb 1167{
35d11700
VZ
1168 srcLen = GetLength(src, srcLen);
1169 if ( srcLen == wxNO_LEN )
1170 return wxCONV_FAILED;
1171
ef199164 1172 const size_t inLen = srcLen / BYTES_PER_CHAR;
35d11700
VZ
1173 if ( !dst )
1174 {
1175 // optimization: return maximal space which could be needed for this
1176 // string even if the real size could be smaller if the buffer contains
1177 // any surrogates
1178 return inLen;
1179 }
c91830cb 1180
35d11700 1181 size_t outLen = 0;
ef199164
DS
1182 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1183 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
c91830cb 1184 {
35d11700
VZ
1185 wxUint32 ch;
1186 wxUint16 tmp[2];
ef199164
DS
1187
1188 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1189 inBuff++;
1190 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
c91830cb 1191
35d11700
VZ
1192 const size_t numChars = decode_utf16(tmp, ch);
1193 if ( numChars == wxCONV_FAILED )
1194 return wxCONV_FAILED;
c91830cb 1195
35d11700 1196 if ( numChars == 2 )
ef199164 1197 inBuff++;
35d11700
VZ
1198
1199 if ( ++outLen > dstLen )
1200 return wxCONV_FAILED;
c91830cb 1201
35d11700 1202 *dst++ = ch;
c91830cb 1203 }
c91830cb 1204
c91830cb 1205
35d11700
VZ
1206 return outLen;
1207}
c91830cb 1208
35d11700
VZ
1209size_t
1210wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1211 const wchar_t *src, size_t srcLen) const
c91830cb 1212{
35d11700
VZ
1213 if ( srcLen == wxNO_LEN )
1214 srcLen = wxWcslen(src) + 1;
c91830cb 1215
35d11700 1216 size_t outLen = 0;
ef199164 1217 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
35d11700 1218 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
c91830cb
VZ
1219 {
1220 wxUint16 cc[2];
35d11700
VZ
1221 const size_t numChars = encode_utf16(*src, cc);
1222 if ( numChars == wxCONV_FAILED )
1223 return wxCONV_FAILED;
c91830cb 1224
ef199164
DS
1225 outLen += numChars * BYTES_PER_CHAR;
1226 if ( outBuff )
c91830cb 1227 {
35d11700
VZ
1228 if ( outLen > dstLen )
1229 return wxCONV_FAILED;
1230
ef199164 1231 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
35d11700 1232 if ( numChars == 2 )
c91830cb 1233 {
35d11700 1234 // second character of a surrogate
ef199164 1235 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
c91830cb
VZ
1236 }
1237 }
c91830cb 1238 }
c91830cb 1239
35d11700 1240 return outLen;
c91830cb
VZ
1241}
1242
467e0479 1243#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1244
1245
35d11700 1246// ============================================================================
c91830cb 1247// UTF-32
35d11700 1248// ============================================================================
c91830cb
VZ
1249
1250#ifdef WORDS_BIGENDIAN
467e0479
VZ
1251 #define wxMBConvUTF32straight wxMBConvUTF32BE
1252 #define wxMBConvUTF32swap wxMBConvUTF32LE
c91830cb 1253#else
467e0479
VZ
1254 #define wxMBConvUTF32swap wxMBConvUTF32BE
1255 #define wxMBConvUTF32straight wxMBConvUTF32LE
c91830cb
VZ
1256#endif
1257
1258
1259WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1260WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1261
467e0479
VZ
1262/* static */
1263size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1264{
1265 if ( srcLen == wxNO_LEN )
1266 {
1267 // count the number of bytes in input, including the trailing NULs
ef199164
DS
1268 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1269 for ( srcLen = 1; *inBuff++; srcLen++ )
467e0479 1270 ;
c91830cb 1271
467e0479
VZ
1272 srcLen *= BYTES_PER_CHAR;
1273 }
1274 else // we already have the length
1275 {
1276 // we can only convert an entire number of UTF-32 characters
1277 if ( srcLen % BYTES_PER_CHAR )
1278 return wxCONV_FAILED;
1279 }
1280
1281 return srcLen;
1282}
1283
1284// case when in-memory representation is UTF-16
c91830cb
VZ
1285#ifdef WC_UTF16
1286
467e0479
VZ
1287// ----------------------------------------------------------------------------
1288// conversions without endianness change
1289// ----------------------------------------------------------------------------
1290
1291size_t
1292wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1293 const char *src, size_t srcLen) const
c91830cb 1294{
467e0479
VZ
1295 srcLen = GetLength(src, srcLen);
1296 if ( srcLen == wxNO_LEN )
1297 return wxCONV_FAILED;
c91830cb 1298
ef199164
DS
1299 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1300 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479
VZ
1301 size_t outLen = 0;
1302 for ( size_t n = 0; n < inLen; n++ )
c91830cb
VZ
1303 {
1304 wxUint16 cc[2];
ef199164 1305 const size_t numChars = encode_utf16(*inBuff++, cc);
467e0479
VZ
1306 if ( numChars == wxCONV_FAILED )
1307 return wxCONV_FAILED;
c91830cb 1308
467e0479
VZ
1309 outLen += numChars;
1310 if ( dst )
c91830cb 1311 {
467e0479
VZ
1312 if ( outLen > dstLen )
1313 return wxCONV_FAILED;
d32a507d 1314
467e0479
VZ
1315 *dst++ = cc[0];
1316 if ( numChars == 2 )
1317 {
1318 // second character of a surrogate
1319 *dst++ = cc[1];
1320 }
1321 }
c91830cb 1322 }
d32a507d 1323
467e0479 1324 return outLen;
c91830cb
VZ
1325}
1326
467e0479
VZ
1327size_t
1328wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1329 const wchar_t *src, size_t srcLen) const
c91830cb 1330{
467e0479
VZ
1331 if ( srcLen == wxNO_LEN )
1332 srcLen = wxWcslen(src) + 1;
c91830cb 1333
467e0479 1334 if ( !dst )
c91830cb 1335 {
467e0479
VZ
1336 // optimization: return maximal space which could be needed for this
1337 // string instead of the exact amount which could be less if there are
1338 // any surrogates in the input
1339 //
1340 // we consider that surrogates are rare enough to make it worthwhile to
1341 // avoid running the loop below at the cost of slightly extra memory
1342 // consumption
ef199164 1343 return srcLen * BYTES_PER_CHAR;
467e0479 1344 }
c91830cb 1345
ef199164 1346 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1347 size_t outLen = 0;
1348 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1349 {
1350 const wxUint32 ch = wxDecodeSurrogate(&src);
1351 if ( !src )
1352 return wxCONV_FAILED;
c91830cb 1353
467e0479 1354 outLen += BYTES_PER_CHAR;
d32a507d 1355
467e0479
VZ
1356 if ( outLen > dstLen )
1357 return wxCONV_FAILED;
b5153fd8 1358
ef199164 1359 *outBuff++ = ch;
467e0479 1360 }
c91830cb 1361
467e0479 1362 return outLen;
c91830cb
VZ
1363}
1364
467e0479
VZ
1365// ----------------------------------------------------------------------------
1366// endian-reversing conversions
1367// ----------------------------------------------------------------------------
c91830cb 1368
467e0479
VZ
1369size_t
1370wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1371 const char *src, size_t srcLen) const
c91830cb 1372{
467e0479
VZ
1373 srcLen = GetLength(src, srcLen);
1374 if ( srcLen == wxNO_LEN )
1375 return wxCONV_FAILED;
c91830cb 1376
ef199164
DS
1377 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1378 const size_t inLen = srcLen / BYTES_PER_CHAR;
467e0479 1379 size_t outLen = 0;
ef199164 1380 for ( size_t n = 0; n < inLen; n++, inBuff++ )
c91830cb 1381 {
c91830cb 1382 wxUint16 cc[2];
ef199164 1383 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
467e0479
VZ
1384 if ( numChars == wxCONV_FAILED )
1385 return wxCONV_FAILED;
c91830cb 1386
467e0479
VZ
1387 outLen += numChars;
1388 if ( dst )
c91830cb 1389 {
467e0479
VZ
1390 if ( outLen > dstLen )
1391 return wxCONV_FAILED;
d32a507d 1392
467e0479
VZ
1393 *dst++ = cc[0];
1394 if ( numChars == 2 )
1395 {
1396 // second character of a surrogate
1397 *dst++ = cc[1];
1398 }
1399 }
c91830cb 1400 }
b5153fd8 1401
467e0479 1402 return outLen;
c91830cb
VZ
1403}
1404
467e0479
VZ
1405size_t
1406wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1407 const wchar_t *src, size_t srcLen) const
c91830cb 1408{
467e0479
VZ
1409 if ( srcLen == wxNO_LEN )
1410 srcLen = wxWcslen(src) + 1;
c91830cb 1411
467e0479 1412 if ( !dst )
c91830cb 1413 {
467e0479
VZ
1414 // optimization: return maximal space which could be needed for this
1415 // string instead of the exact amount which could be less if there are
1416 // any surrogates in the input
1417 //
1418 // we consider that surrogates are rare enough to make it worthwhile to
1419 // avoid running the loop below at the cost of slightly extra memory
1420 // consumption
1421 return srcLen*BYTES_PER_CHAR;
1422 }
c91830cb 1423
ef199164 1424 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
467e0479
VZ
1425 size_t outLen = 0;
1426 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1427 {
1428 const wxUint32 ch = wxDecodeSurrogate(&src);
1429 if ( !src )
1430 return wxCONV_FAILED;
c91830cb 1431
467e0479 1432 outLen += BYTES_PER_CHAR;
d32a507d 1433
467e0479
VZ
1434 if ( outLen > dstLen )
1435 return wxCONV_FAILED;
b5153fd8 1436
ef199164 1437 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
467e0479 1438 }
c91830cb 1439
467e0479 1440 return outLen;
c91830cb
VZ
1441}
1442
467e0479 1443#else // !WC_UTF16: wchar_t is UTF-32
c91830cb 1444
35d11700
VZ
1445// ----------------------------------------------------------------------------
1446// conversions without endianness change
1447// ----------------------------------------------------------------------------
1448
1449size_t
1450wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1451 const char *src, size_t srcLen) const
c91830cb 1452{
35d11700
VZ
1453 // use memcpy() as it should be much faster than hand-written loop
1454 srcLen = GetLength(src, srcLen);
1455 if ( srcLen == wxNO_LEN )
1456 return wxCONV_FAILED;
c91830cb 1457
35d11700
VZ
1458 const size_t inLen = srcLen/BYTES_PER_CHAR;
1459 if ( dst )
c91830cb 1460 {
35d11700
VZ
1461 if ( dstLen < inLen )
1462 return wxCONV_FAILED;
b5153fd8 1463
35d11700
VZ
1464 memcpy(dst, src, srcLen);
1465 }
c91830cb 1466
35d11700 1467 return inLen;
c91830cb
VZ
1468}
1469
35d11700
VZ
1470size_t
1471wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1472 const wchar_t *src, size_t srcLen) const
c91830cb 1473{
35d11700
VZ
1474 if ( srcLen == wxNO_LEN )
1475 srcLen = wxWcslen(src) + 1;
1476
1477 srcLen *= BYTES_PER_CHAR;
c91830cb 1478
35d11700 1479 if ( dst )
c91830cb 1480 {
35d11700
VZ
1481 if ( dstLen < srcLen )
1482 return wxCONV_FAILED;
c91830cb 1483
35d11700 1484 memcpy(dst, src, srcLen);
c91830cb
VZ
1485 }
1486
35d11700 1487 return srcLen;
c91830cb
VZ
1488}
1489
35d11700
VZ
1490// ----------------------------------------------------------------------------
1491// endian-reversing conversions
1492// ----------------------------------------------------------------------------
c91830cb 1493
35d11700
VZ
1494size_t
1495wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1496 const char *src, size_t srcLen) const
c91830cb 1497{
35d11700
VZ
1498 srcLen = GetLength(src, srcLen);
1499 if ( srcLen == wxNO_LEN )
1500 return wxCONV_FAILED;
1501
1502 srcLen /= BYTES_PER_CHAR;
c91830cb 1503
35d11700 1504 if ( dst )
c91830cb 1505 {
35d11700
VZ
1506 if ( dstLen < srcLen )
1507 return wxCONV_FAILED;
1508
ef199164
DS
1509 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1510 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
c91830cb 1511 {
ef199164 1512 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
c91830cb 1513 }
c91830cb 1514 }
b5153fd8 1515
35d11700 1516 return srcLen;
c91830cb
VZ
1517}
1518
35d11700
VZ
1519size_t
1520wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1521 const wchar_t *src, size_t srcLen) const
c91830cb 1522{
35d11700
VZ
1523 if ( srcLen == wxNO_LEN )
1524 srcLen = wxWcslen(src) + 1;
1525
1526 srcLen *= BYTES_PER_CHAR;
c91830cb 1527
35d11700 1528 if ( dst )
c91830cb 1529 {
35d11700
VZ
1530 if ( dstLen < srcLen )
1531 return wxCONV_FAILED;
1532
ef199164 1533 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
35d11700 1534 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
c91830cb 1535 {
ef199164 1536 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
c91830cb 1537 }
c91830cb 1538 }
b5153fd8 1539
35d11700 1540 return srcLen;
c91830cb
VZ
1541}
1542
467e0479 1543#endif // WC_UTF16/!WC_UTF16
c91830cb
VZ
1544
1545
36acb880
VZ
1546// ============================================================================
1547// The classes doing conversion using the iconv_xxx() functions
1548// ============================================================================
3caec1bb 1549
b040e242 1550#ifdef HAVE_ICONV
3a0d76bc 1551
b1d547eb
VS
1552// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1553// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1554// (unless there's yet another bug in glibc) the only case when iconv()
1555// returns with (size_t)-1 (which means error) and says there are 0 bytes
1556// left in the input buffer -- when _real_ error occurs,
1557// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1558// iconv() failure.
3caec1bb
VS
1559// [This bug does not appear in glibc 2.2.]
1560#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1561#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1562 (errno != E2BIG || bufLeft != 0))
1563#else
1564#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1565#endif
1566
ab217dba 1567#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1568
74a7eb0b
VZ
1569#define ICONV_T_INVALID ((iconv_t)-1)
1570
1571#if SIZEOF_WCHAR_T == 4
1572 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1573 #define WC_ENC wxFONTENCODING_UTF32
1574#elif SIZEOF_WCHAR_T == 2
1575 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1576 #define WC_ENC wxFONTENCODING_UTF16
1577#else // sizeof(wchar_t) != 2 nor 4
1578 // does this ever happen?
1579 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1580#endif
1581
36acb880 1582// ----------------------------------------------------------------------------
e95354ec 1583// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1584// ----------------------------------------------------------------------------
1585
e95354ec 1586class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1587{
1588public:
e95354ec
VZ
1589 wxMBConv_iconv(const wxChar *name);
1590 virtual ~wxMBConv_iconv();
36acb880 1591
bde4baac
VZ
1592 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1593 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1594
d36c9347 1595 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
7ef3ab50
VZ
1596 virtual size_t GetMBNulLen() const;
1597
d36c9347
VZ
1598 virtual wxMBConv *Clone() const
1599 {
1600 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1601 p->m_minMBCharWidth = m_minMBCharWidth;
1602 return p;
1603 }
1604
e95354ec 1605 bool IsOk() const
74a7eb0b 1606 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1607
1608protected:
ef199164
DS
1609 // the iconv handlers used to translate from multibyte
1610 // to wide char and in the other direction
36acb880
VZ
1611 iconv_t m2w,
1612 w2m;
ef199164 1613
b1d547eb
VS
1614#if wxUSE_THREADS
1615 // guards access to m2w and w2m objects
1616 wxMutex m_iconvMutex;
1617#endif
36acb880
VZ
1618
1619private:
e95354ec 1620 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1621 // available on this machine, it will remain NULL
74a7eb0b 1622 static wxString ms_wcCharsetName;
36acb880
VZ
1623
1624 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1625 // different endian-ness than the native one
405d8f46 1626 static bool ms_wcNeedsSwap;
eec47cc6 1627
d36c9347
VZ
1628
1629 // name of the encoding handled by this conversion
1630 wxString m_name;
1631
7ef3ab50 1632 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
c1464d9d
VZ
1633 // initially
1634 size_t m_minMBCharWidth;
36acb880
VZ
1635};
1636
8f115891
MW
1637// make the constructor available for unit testing
1638WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1639{
1640 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1641 if ( !result->IsOk() )
1642 {
1643 delete result;
1644 return 0;
1645 }
ef199164 1646
8f115891
MW
1647 return result;
1648}
1649
422e411e 1650wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1651bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1652
e95354ec 1653wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
d36c9347 1654 : m_name(name)
36acb880 1655{
c1464d9d 1656 m_minMBCharWidth = 0;
eec47cc6 1657
0331b385
VZ
1658 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1659 // names for the charsets
200a9923 1660 const wxCharBuffer cname(wxString(name).ToAscii());
04c79127 1661
36acb880 1662 // check for charset that represents wchar_t:
74a7eb0b 1663 if ( ms_wcCharsetName.empty() )
f1339c56 1664 {
c2b83fdd
VZ
1665 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1666
74a7eb0b
VZ
1667#if wxUSE_FONTMAP
1668 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1669#else // !wxUSE_FONTMAP
1670 static const wxChar *names[] =
36acb880 1671 {
74a7eb0b
VZ
1672#if SIZEOF_WCHAR_T == 4
1673 _T("UCS-4"),
1674#elif SIZEOF_WCHAR_T = 2
1675 _T("UCS-2"),
1676#endif
1677 NULL
1678 };
1679#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1680
d1f024a8 1681 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1682 {
17a1ebd1 1683 const wxString nameCS(*names);
74a7eb0b
VZ
1684
1685 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1686 wxString nameXE(nameCS);
ef199164
DS
1687
1688#ifdef WORDS_BIGENDIAN
74a7eb0b 1689 nameXE += _T("BE");
ef199164 1690#else // little endian
74a7eb0b 1691 nameXE += _T("LE");
ef199164 1692#endif
74a7eb0b 1693
c2b83fdd
VZ
1694 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1695 nameXE.c_str());
1696
74a7eb0b
VZ
1697 m2w = iconv_open(nameXE.ToAscii(), cname);
1698 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1699 {
74a7eb0b 1700 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1701 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1702 nameCS.c_str());
17a1ebd1 1703 m2w = iconv_open(nameCS.ToAscii(), cname);
3a0d76bc 1704
74a7eb0b
VZ
1705 // and check for bytesex ourselves:
1706 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1707 {
74a7eb0b
VZ
1708 char buf[2], *bufPtr;
1709 wchar_t wbuf[2], *wbufPtr;
1710 size_t insz, outsz;
1711 size_t res;
1712
1713 buf[0] = 'A';
1714 buf[1] = 0;
1715 wbuf[0] = 0;
1716 insz = 2;
1717 outsz = SIZEOF_WCHAR_T * 2;
1718 wbufPtr = wbuf;
1719 bufPtr = buf;
1720
ef199164
DS
1721 res = iconv(
1722 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1723 (char**)&wbufPtr, &outsz);
74a7eb0b
VZ
1724
1725 if (ICONV_FAILED(res, insz))
1726 {
1727 wxLogLastError(wxT("iconv"));
422e411e 1728 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1729 nameCS.c_str());
74a7eb0b
VZ
1730 }
1731 else // ok, can convert to this encoding, remember it
1732 {
17a1ebd1 1733 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1734 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1735 }
3a0d76bc
VS
1736 }
1737 }
74a7eb0b 1738 else // use charset not requiring byte swapping
36acb880 1739 {
74a7eb0b 1740 ms_wcCharsetName = nameXE;
36acb880 1741 }
3a0d76bc 1742 }
74a7eb0b 1743
0944fceb 1744 wxLogTrace(TRACE_STRCONV,
74a7eb0b 1745 wxT("iconv wchar_t charset is \"%s\"%s"),
cae8f1bf 1746 ms_wcCharsetName.empty() ? _T("<none>")
74a7eb0b
VZ
1747 : ms_wcCharsetName.c_str(),
1748 ms_wcNeedsSwap ? _T(" (needs swap)")
1749 : _T(""));
3a0d76bc 1750 }
36acb880 1751 else // we already have ms_wcCharsetName
3caec1bb 1752 {
74a7eb0b 1753 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
f1339c56 1754 }
dccce9ea 1755
74a7eb0b 1756 if ( ms_wcCharsetName.empty() )
f1339c56 1757 {
74a7eb0b 1758 w2m = ICONV_T_INVALID;
36acb880 1759 }
405d8f46
VZ
1760 else
1761 {
74a7eb0b
VZ
1762 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1763 if ( w2m == ICONV_T_INVALID )
1764 {
1765 wxLogTrace(TRACE_STRCONV,
1766 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
422e411e 1767 ms_wcCharsetName.c_str(), cname.data());
74a7eb0b 1768 }
405d8f46 1769 }
36acb880 1770}
3caec1bb 1771
e95354ec 1772wxMBConv_iconv::~wxMBConv_iconv()
36acb880 1773{
74a7eb0b 1774 if ( m2w != ICONV_T_INVALID )
36acb880 1775 iconv_close(m2w);
74a7eb0b 1776 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
1777 iconv_close(w2m);
1778}
3a0d76bc 1779
bde4baac 1780size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 1781{
69373110
VZ
1782 // find the string length: notice that must be done differently for
1783 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1784 size_t inbuf;
7ef3ab50 1785 const size_t nulLen = GetMBNulLen();
69373110
VZ
1786 switch ( nulLen )
1787 {
1788 default:
467e0479 1789 return wxCONV_FAILED;
69373110
VZ
1790
1791 case 1:
1792 inbuf = strlen(psz); // arguably more optimized than our version
1793 break;
1794
1795 case 2:
1796 case 4:
1797 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1798 // they also have to start at character boundary and not span two
1799 // adjacent characters
1800 const char *p;
1801 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1802 ;
1803 inbuf = p - psz;
1804 break;
1805 }
1806
b1d547eb
VS
1807#if wxUSE_THREADS
1808 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1809 // Unfortunately there is a couple of global wxCSConv objects such as
1810 // wxConvLocal that are used all over wx code, so we have to make sure
1811 // the handle is used by at most one thread at the time. Otherwise
1812 // only a few wx classes would be safe to use from non-main threads
1813 // as MB<->WC conversion would fail "randomly".
1814 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
69373110
VZ
1815#endif // wxUSE_THREADS
1816
36acb880
VZ
1817 size_t outbuf = n * SIZEOF_WCHAR_T;
1818 size_t res, cres;
1819 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1820 wchar_t *bufPtr = buf;
1821 const char *pszPtr = psz;
1822
1823 if (buf)
1824 {
1825 // have destination buffer, convert there
1826 cres = iconv(m2w,
1827 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1828 (char**)&bufPtr, &outbuf);
1829 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1830
36acb880 1831 if (ms_wcNeedsSwap)
3a0d76bc 1832 {
36acb880 1833 // convert to native endianness
17a1ebd1
VZ
1834 for ( unsigned i = 0; i < res; i++ )
1835 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 1836 }
adb45366 1837
69373110 1838 // NUL-terminate the string if there is any space left
49dd9820
VS
1839 if (res < n)
1840 buf[res] = 0;
36acb880
VZ
1841 }
1842 else
1843 {
1844 // no destination buffer... convert using temp buffer
1845 // to calculate destination buffer requirement
1846 wchar_t tbuf[8];
1847 res = 0;
ef199164
DS
1848
1849 do
1850 {
36acb880 1851 bufPtr = tbuf;
ef199164 1852 outbuf = 8 * SIZEOF_WCHAR_T;
36acb880
VZ
1853
1854 cres = iconv(m2w,
1855 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1856 (char**)&bufPtr, &outbuf );
1857
ef199164
DS
1858 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1859 }
1860 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 1861 }
dccce9ea 1862
36acb880 1863 if (ICONV_FAILED(cres, inbuf))
f1339c56 1864 {
36acb880 1865 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 1866 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 1867 return wxCONV_FAILED;
36acb880
VZ
1868 }
1869
1870 return res;
1871}
1872
bde4baac 1873size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1874{
b1d547eb
VS
1875#if wxUSE_THREADS
1876 // NB: explained in MB2WC
1877 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1878#endif
3698ae71 1879
156162ec
MW
1880 size_t inlen = wxWcslen(psz);
1881 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
1882 size_t outbuf = n;
1883 size_t res, cres;
3a0d76bc 1884
36acb880 1885 wchar_t *tmpbuf = 0;
3caec1bb 1886
36acb880
VZ
1887 if (ms_wcNeedsSwap)
1888 {
1889 // need to copy to temp buffer to switch endianness
74a7eb0b 1890 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 1891 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 1892 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
1893 for ( size_t i = 0; i < inlen; i++ )
1894 tmpbuf[n] = WC_BSWAP(psz[i]);
ef199164 1895
156162ec 1896 tmpbuf[inlen] = L'\0';
74a7eb0b 1897 psz = tmpbuf;
36acb880 1898 }
3a0d76bc 1899
36acb880
VZ
1900 if (buf)
1901 {
1902 // have destination buffer, convert there
1903 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1904
ef199164 1905 res = n - outbuf;
adb45366 1906
49dd9820
VS
1907 // NB: iconv was given only wcslen(psz) characters on input, and so
1908 // it couldn't convert the trailing zero. Let's do it ourselves
1909 // if there's some room left for it in the output buffer.
1910 if (res < n)
1911 buf[0] = 0;
36acb880
VZ
1912 }
1913 else
1914 {
ef199164 1915 // no destination buffer: convert using temp buffer
36acb880
VZ
1916 // to calculate destination buffer requirement
1917 char tbuf[16];
1918 res = 0;
ef199164
DS
1919 do
1920 {
1921 buf = tbuf;
1922 outbuf = 16;
36acb880
VZ
1923
1924 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1925
36acb880 1926 res += 16 - outbuf;
ef199164
DS
1927 }
1928 while ((cres == (size_t)-1) && (errno == E2BIG));
f1339c56 1929 }
dccce9ea 1930
36acb880
VZ
1931 if (ms_wcNeedsSwap)
1932 {
1933 free(tmpbuf);
1934 }
dccce9ea 1935
36acb880
VZ
1936 if (ICONV_FAILED(cres, inbuf))
1937 {
ce6f8d6f 1938 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
467e0479 1939 return wxCONV_FAILED;
36acb880
VZ
1940 }
1941
1942 return res;
1943}
1944
7ef3ab50 1945size_t wxMBConv_iconv::GetMBNulLen() const
eec47cc6 1946{
c1464d9d 1947 if ( m_minMBCharWidth == 0 )
eec47cc6
VZ
1948 {
1949 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1950
1951#if wxUSE_THREADS
1952 // NB: explained in MB2WC
1953 wxMutexLocker lock(self->m_iconvMutex);
1954#endif
1955
356410fc 1956 wchar_t *wnul = L"";
c1464d9d 1957 char buf[8]; // should be enough for NUL in any encoding
356410fc 1958 size_t inLen = sizeof(wchar_t),
c1464d9d 1959 outLen = WXSIZEOF(buf);
ef199164
DS
1960 char *inBuff = (char *)wnul;
1961 char *outBuff = buf;
1962 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
356410fc 1963 {
c1464d9d 1964 self->m_minMBCharWidth = (size_t)-1;
356410fc
VZ
1965 }
1966 else // ok
1967 {
ef199164 1968 self->m_minMBCharWidth = outBuff - buf;
356410fc 1969 }
eec47cc6
VZ
1970 }
1971
c1464d9d 1972 return m_minMBCharWidth;
eec47cc6
VZ
1973}
1974
b040e242 1975#endif // HAVE_ICONV
36acb880 1976
e95354ec 1977
36acb880
VZ
1978// ============================================================================
1979// Win32 conversion classes
1980// ============================================================================
1cd52418 1981
e95354ec 1982#ifdef wxHAVE_WIN32_MB2WC
373658eb 1983
8b04d4c4 1984// from utils.cpp
d775fa82 1985#if wxUSE_FONTMAP
8b04d4c4
VZ
1986extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1987extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1988#endif
373658eb 1989
e95354ec 1990class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1991{
1992public:
bde4baac
VZ
1993 wxMBConv_win32()
1994 {
1995 m_CodePage = CP_ACP;
c1464d9d 1996 m_minMBCharWidth = 0;
bde4baac
VZ
1997 }
1998
d36c9347
VZ
1999 wxMBConv_win32(const wxMBConv_win32& conv)
2000 {
2001 m_CodePage = conv.m_CodePage;
2002 m_minMBCharWidth = conv.m_minMBCharWidth;
2003 }
2004
7608a683 2005#if wxUSE_FONTMAP
e95354ec 2006 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
2007 {
2008 m_CodePage = wxCharsetToCodepage(name);
c1464d9d 2009 m_minMBCharWidth = 0;
bde4baac 2010 }
dccce9ea 2011
e95354ec 2012 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
2013 {
2014 m_CodePage = wxEncodingToCodepage(encoding);
c1464d9d 2015 m_minMBCharWidth = 0;
bde4baac 2016 }
eec47cc6 2017#endif // wxUSE_FONTMAP
8b04d4c4 2018
d36c9347 2019 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 2020 {
02272c9c
VZ
2021 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2022 // the behaviour is not compatible with the Unix version (using iconv)
2023 // and break the library itself, e.g. wxTextInputStream::NextChar()
2024 // wouldn't work if reading an incomplete MB char didn't result in an
2025 // error
667e5b3e 2026 //
89028980 2027 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
830f8f11
VZ
2028 // Win XP or newer and it is not supported for UTF-[78] so we always
2029 // use our own conversions in this case. See
89028980
VS
2030 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2031 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
830f8f11 2032 if ( m_CodePage == CP_UTF8 )
89028980 2033 {
830f8f11 2034 return wxConvUTF8.MB2WC(buf, psz, n);
89028980 2035 }
830f8f11
VZ
2036
2037 if ( m_CodePage == CP_UTF7 )
2038 {
2039 return wxConvUTF7.MB2WC(buf, psz, n);
2040 }
2041
2042 int flags = 0;
2043 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2044 IsAtLeastWin2kSP4() )
89028980 2045 {
830f8f11 2046 flags = MB_ERR_INVALID_CHARS;
89028980 2047 }
667e5b3e 2048
2b5f62a0
VZ
2049 const size_t len = ::MultiByteToWideChar
2050 (
2051 m_CodePage, // code page
667e5b3e 2052 flags, // flags: fall on error
2b5f62a0
VZ
2053 psz, // input string
2054 -1, // its length (NUL-terminated)
b4da152e 2055 buf, // output string
2b5f62a0
VZ
2056 buf ? n : 0 // size of output buffer
2057 );
89028980
VS
2058 if ( !len )
2059 {
2060 // function totally failed
467e0479 2061 return wxCONV_FAILED;
89028980
VS
2062 }
2063
2064 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2065 // check if we succeeded, by doing a double trip:
2066 if ( !flags && buf )
2067 {
53c174fc
VZ
2068 const size_t mbLen = strlen(psz);
2069 wxCharBuffer mbBuf(mbLen);
89028980
VS
2070 if ( ::WideCharToMultiByte
2071 (
2072 m_CodePage,
2073 0,
2074 buf,
2075 -1,
2076 mbBuf.data(),
53c174fc 2077 mbLen + 1, // size in bytes, not length
89028980
VS
2078 NULL,
2079 NULL
2080 ) == 0 ||
2081 strcmp(mbBuf, psz) != 0 )
2082 {
2083 // we didn't obtain the same thing we started from, hence
2084 // the conversion was lossy and we consider that it failed
467e0479 2085 return wxCONV_FAILED;
89028980
VS
2086 }
2087 }
2b5f62a0 2088
03a991bc
VZ
2089 // note that it returns count of written chars for buf != NULL and size
2090 // of the needed buffer for buf == NULL so in either case the length of
2091 // the string (which never includes the terminating NUL) is one less
89028980 2092 return len - 1;
f1339c56 2093 }
dccce9ea 2094
d36c9347 2095 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 2096 {
13dd924a
VZ
2097 /*
2098 we have a problem here: by default, WideCharToMultiByte() may
2099 replace characters unrepresentable in the target code page with bad
2100 quality approximations such as turning "1/2" symbol (U+00BD) into
2101 "1" for the code pages which don't have it and we, obviously, want
2102 to avoid this at any price
d775fa82 2103
13dd924a
VZ
2104 the trouble is that this function does it _silently_, i.e. it won't
2105 even tell us whether it did or not... Win98/2000 and higher provide
2106 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2107 we have to resort to a round trip, i.e. check that converting back
2108 results in the same string -- this is, of course, expensive but
2109 otherwise we simply can't be sure to not garble the data.
2110 */
2111
2112 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2113 // it doesn't work with CJK encodings (which we test for rather roughly
2114 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2115 // supporting it
907173e5
WS
2116 BOOL usedDef wxDUMMY_INITIALIZE(false);
2117 BOOL *pUsedDef;
13dd924a
VZ
2118 int flags;
2119 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2120 {
2121 // it's our lucky day
2122 flags = WC_NO_BEST_FIT_CHARS;
2123 pUsedDef = &usedDef;
2124 }
2125 else // old system or unsupported encoding
2126 {
2127 flags = 0;
2128 pUsedDef = NULL;
2129 }
2130
2b5f62a0
VZ
2131 const size_t len = ::WideCharToMultiByte
2132 (
2133 m_CodePage, // code page
13dd924a
VZ
2134 flags, // either none or no best fit
2135 pwz, // input string
2b5f62a0
VZ
2136 -1, // it is (wide) NUL-terminated
2137 buf, // output buffer
2138 buf ? n : 0, // and its size
2139 NULL, // default "replacement" char
13dd924a 2140 pUsedDef // [out] was it used?
2b5f62a0
VZ
2141 );
2142
13dd924a
VZ
2143 if ( !len )
2144 {
2145 // function totally failed
467e0479 2146 return wxCONV_FAILED;
13dd924a
VZ
2147 }
2148
2149 // if we were really converting, check if we succeeded
2150 if ( buf )
2151 {
2152 if ( flags )
2153 {
2154 // check if the conversion failed, i.e. if any replacements
2155 // were done
2156 if ( usedDef )
467e0479 2157 return wxCONV_FAILED;
13dd924a
VZ
2158 }
2159 else // we must resort to double tripping...
2160 {
2161 wxWCharBuffer wcBuf(n);
467e0479 2162 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
13dd924a
VZ
2163 wcscmp(wcBuf, pwz) != 0 )
2164 {
2165 // we didn't obtain the same thing we started from, hence
2166 // the conversion was lossy and we consider that it failed
467e0479 2167 return wxCONV_FAILED;
13dd924a
VZ
2168 }
2169 }
2170 }
2171
03a991bc 2172 // see the comment above for the reason of "len - 1"
13dd924a 2173 return len - 1;
f1339c56 2174 }
dccce9ea 2175
7ef3ab50
VZ
2176 virtual size_t GetMBNulLen() const
2177 {
2178 if ( m_minMBCharWidth == 0 )
2179 {
2180 int len = ::WideCharToMultiByte
2181 (
2182 m_CodePage, // code page
2183 0, // no flags
2184 L"", // input string
2185 1, // translate just the NUL
2186 NULL, // output buffer
2187 0, // and its size
2188 NULL, // no replacement char
2189 NULL // [out] don't care if it was used
2190 );
2191
2192 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2193 switch ( len )
2194 {
2195 default:
2196 wxLogDebug(_T("Unexpected NUL length %d"), len);
ef199164
DS
2197 self->m_minMBCharWidth = (size_t)-1;
2198 break;
7ef3ab50
VZ
2199
2200 case 0:
2201 self->m_minMBCharWidth = (size_t)-1;
2202 break;
2203
2204 case 1:
2205 case 2:
2206 case 4:
2207 self->m_minMBCharWidth = len;
2208 break;
2209 }
2210 }
2211
2212 return m_minMBCharWidth;
2213 }
2214
d36c9347
VZ
2215 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2216
13dd924a
VZ
2217 bool IsOk() const { return m_CodePage != -1; }
2218
2219private:
2220 static bool CanUseNoBestFit()
2221 {
2222 static int s_isWin98Or2k = -1;
2223
2224 if ( s_isWin98Or2k == -1 )
2225 {
2226 int verMaj, verMin;
2227 switch ( wxGetOsVersion(&verMaj, &verMin) )
2228 {
2229 case wxWIN95:
2230 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2231 break;
2232
2233 case wxWINDOWS_NT:
2234 s_isWin98Or2k = verMaj >= 5;
2235 break;
2236
2237 default:
ef199164 2238 // unknown: be conservative by default
13dd924a 2239 s_isWin98Or2k = 0;
ef199164 2240 break;
13dd924a
VZ
2241 }
2242
2243 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2244 }
2245
2246 return s_isWin98Or2k == 1;
2247 }
f1339c56 2248
89028980
VS
2249 static bool IsAtLeastWin2kSP4()
2250 {
8942f83a
WS
2251#ifdef __WXWINCE__
2252 return false;
2253#else
89028980
VS
2254 static int s_isAtLeastWin2kSP4 = -1;
2255
2256 if ( s_isAtLeastWin2kSP4 == -1 )
2257 {
2258 OSVERSIONINFOEX ver;
2259
2260 memset(&ver, 0, sizeof(ver));
2261 ver.dwOSVersionInfoSize = sizeof(ver);
2262 GetVersionEx((OSVERSIONINFO*)&ver);
2263
2264 s_isAtLeastWin2kSP4 =
2265 ((ver.dwMajorVersion > 5) || // Vista+
2266 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2267 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2268 ver.wServicePackMajor >= 4)) // 2000 SP4+
2269 ? 1 : 0;
2270 }
2271
2272 return s_isAtLeastWin2kSP4 == 1;
8942f83a 2273#endif
89028980
VS
2274 }
2275
eec47cc6 2276
c1464d9d 2277 // the code page we're working with
b1d66b54 2278 long m_CodePage;
c1464d9d 2279
7ef3ab50 2280 // cached result of GetMBNulLen(), set to 0 initially meaning
c1464d9d
VZ
2281 // "unknown"
2282 size_t m_minMBCharWidth;
1cd52418 2283};
e95354ec
VZ
2284
2285#endif // wxHAVE_WIN32_MB2WC
2286
f7e98dee
RN
2287// ============================================================================
2288// Cocoa conversion classes
2289// ============================================================================
2290
2291#if defined(__WXCOCOA__)
2292
ef199164
DS
2293// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2294// Strangely enough, Core Foundation uses UTF-32
2295// internally quite a bit - it's just not public (yet).
f7e98dee
RN
2296
2297#include <CoreFoundation/CFString.h>
2298#include <CoreFoundation/CFStringEncodingExt.h>
2299
2300CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 2301{
638357a0 2302 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ef199164
DS
2303
2304 switch (encoding)
ecd9653b 2305 {
ef199164
DS
2306 case wxFONTENCODING_DEFAULT :
2307 enc = CFStringGetSystemEncoding();
2308 break ;
2309
ecd9653b
WS
2310 case wxFONTENCODING_ISO8859_1 :
2311 enc = kCFStringEncodingISOLatin1 ;
2312 break ;
2313 case wxFONTENCODING_ISO8859_2 :
2314 enc = kCFStringEncodingISOLatin2;
2315 break ;
2316 case wxFONTENCODING_ISO8859_3 :
2317 enc = kCFStringEncodingISOLatin3 ;
2318 break ;
2319 case wxFONTENCODING_ISO8859_4 :
2320 enc = kCFStringEncodingISOLatin4;
2321 break ;
2322 case wxFONTENCODING_ISO8859_5 :
2323 enc = kCFStringEncodingISOLatinCyrillic;
2324 break ;
2325 case wxFONTENCODING_ISO8859_6 :
2326 enc = kCFStringEncodingISOLatinArabic;
2327 break ;
2328 case wxFONTENCODING_ISO8859_7 :
2329 enc = kCFStringEncodingISOLatinGreek;
2330 break ;
2331 case wxFONTENCODING_ISO8859_8 :
2332 enc = kCFStringEncodingISOLatinHebrew;
2333 break ;
2334 case wxFONTENCODING_ISO8859_9 :
2335 enc = kCFStringEncodingISOLatin5;
2336 break ;
2337 case wxFONTENCODING_ISO8859_10 :
2338 enc = kCFStringEncodingISOLatin6;
2339 break ;
2340 case wxFONTENCODING_ISO8859_11 :
2341 enc = kCFStringEncodingISOLatinThai;
2342 break ;
2343 case wxFONTENCODING_ISO8859_13 :
2344 enc = kCFStringEncodingISOLatin7;
2345 break ;
2346 case wxFONTENCODING_ISO8859_14 :
2347 enc = kCFStringEncodingISOLatin8;
2348 break ;
2349 case wxFONTENCODING_ISO8859_15 :
2350 enc = kCFStringEncodingISOLatin9;
2351 break ;
2352
2353 case wxFONTENCODING_KOI8 :
2354 enc = kCFStringEncodingKOI8_R;
2355 break ;
2356 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2357 enc = kCFStringEncodingDOSRussian;
2358 break ;
2359
2360// case wxFONTENCODING_BULGARIAN :
2361// enc = ;
2362// break ;
2363
2364 case wxFONTENCODING_CP437 :
ef199164 2365 enc = kCFStringEncodingDOSLatinUS ;
ecd9653b
WS
2366 break ;
2367 case wxFONTENCODING_CP850 :
2368 enc = kCFStringEncodingDOSLatin1;
2369 break ;
2370 case wxFONTENCODING_CP852 :
2371 enc = kCFStringEncodingDOSLatin2;
2372 break ;
2373 case wxFONTENCODING_CP855 :
2374 enc = kCFStringEncodingDOSCyrillic;
2375 break ;
2376 case wxFONTENCODING_CP866 :
ef199164 2377 enc = kCFStringEncodingDOSRussian ;
ecd9653b
WS
2378 break ;
2379 case wxFONTENCODING_CP874 :
2380 enc = kCFStringEncodingDOSThai;
2381 break ;
2382 case wxFONTENCODING_CP932 :
2383 enc = kCFStringEncodingDOSJapanese;
2384 break ;
2385 case wxFONTENCODING_CP936 :
ef199164 2386 enc = kCFStringEncodingDOSChineseSimplif ;
ecd9653b
WS
2387 break ;
2388 case wxFONTENCODING_CP949 :
2389 enc = kCFStringEncodingDOSKorean;
2390 break ;
2391 case wxFONTENCODING_CP950 :
2392 enc = kCFStringEncodingDOSChineseTrad;
2393 break ;
ecd9653b
WS
2394 case wxFONTENCODING_CP1250 :
2395 enc = kCFStringEncodingWindowsLatin2;
2396 break ;
2397 case wxFONTENCODING_CP1251 :
ef199164 2398 enc = kCFStringEncodingWindowsCyrillic ;
ecd9653b
WS
2399 break ;
2400 case wxFONTENCODING_CP1252 :
ef199164 2401 enc = kCFStringEncodingWindowsLatin1 ;
ecd9653b
WS
2402 break ;
2403 case wxFONTENCODING_CP1253 :
2404 enc = kCFStringEncodingWindowsGreek;
2405 break ;
2406 case wxFONTENCODING_CP1254 :
2407 enc = kCFStringEncodingWindowsLatin5;
2408 break ;
2409 case wxFONTENCODING_CP1255 :
ef199164 2410 enc = kCFStringEncodingWindowsHebrew ;
ecd9653b
WS
2411 break ;
2412 case wxFONTENCODING_CP1256 :
ef199164 2413 enc = kCFStringEncodingWindowsArabic ;
ecd9653b
WS
2414 break ;
2415 case wxFONTENCODING_CP1257 :
2416 enc = kCFStringEncodingWindowsBalticRim;
2417 break ;
638357a0
RN
2418// This only really encodes to UTF7 (if that) evidently
2419// case wxFONTENCODING_UTF7 :
2420// enc = kCFStringEncodingNonLossyASCII ;
2421// break ;
ecd9653b
WS
2422 case wxFONTENCODING_UTF8 :
2423 enc = kCFStringEncodingUTF8 ;
2424 break ;
2425 case wxFONTENCODING_EUC_JP :
2426 enc = kCFStringEncodingEUC_JP;
2427 break ;
2428 case wxFONTENCODING_UTF16 :
f7e98dee 2429 enc = kCFStringEncodingUnicode ;
ecd9653b 2430 break ;
f7e98dee
RN
2431 case wxFONTENCODING_MACROMAN :
2432 enc = kCFStringEncodingMacRoman ;
2433 break ;
2434 case wxFONTENCODING_MACJAPANESE :
2435 enc = kCFStringEncodingMacJapanese ;
2436 break ;
2437 case wxFONTENCODING_MACCHINESETRAD :
2438 enc = kCFStringEncodingMacChineseTrad ;
2439 break ;
2440 case wxFONTENCODING_MACKOREAN :
2441 enc = kCFStringEncodingMacKorean ;
2442 break ;
2443 case wxFONTENCODING_MACARABIC :
2444 enc = kCFStringEncodingMacArabic ;
2445 break ;
2446 case wxFONTENCODING_MACHEBREW :
2447 enc = kCFStringEncodingMacHebrew ;
2448 break ;
2449 case wxFONTENCODING_MACGREEK :
2450 enc = kCFStringEncodingMacGreek ;
2451 break ;
2452 case wxFONTENCODING_MACCYRILLIC :
2453 enc = kCFStringEncodingMacCyrillic ;
2454 break ;
2455 case wxFONTENCODING_MACDEVANAGARI :
2456 enc = kCFStringEncodingMacDevanagari ;
2457 break ;
2458 case wxFONTENCODING_MACGURMUKHI :
2459 enc = kCFStringEncodingMacGurmukhi ;
2460 break ;
2461 case wxFONTENCODING_MACGUJARATI :
2462 enc = kCFStringEncodingMacGujarati ;
2463 break ;
2464 case wxFONTENCODING_MACORIYA :
2465 enc = kCFStringEncodingMacOriya ;
2466 break ;
2467 case wxFONTENCODING_MACBENGALI :
2468 enc = kCFStringEncodingMacBengali ;
2469 break ;
2470 case wxFONTENCODING_MACTAMIL :
2471 enc = kCFStringEncodingMacTamil ;
2472 break ;
2473 case wxFONTENCODING_MACTELUGU :
2474 enc = kCFStringEncodingMacTelugu ;
2475 break ;
2476 case wxFONTENCODING_MACKANNADA :
2477 enc = kCFStringEncodingMacKannada ;
2478 break ;
2479 case wxFONTENCODING_MACMALAJALAM :
2480 enc = kCFStringEncodingMacMalayalam ;
2481 break ;
2482 case wxFONTENCODING_MACSINHALESE :
2483 enc = kCFStringEncodingMacSinhalese ;
2484 break ;
2485 case wxFONTENCODING_MACBURMESE :
2486 enc = kCFStringEncodingMacBurmese ;
2487 break ;
2488 case wxFONTENCODING_MACKHMER :
2489 enc = kCFStringEncodingMacKhmer ;
2490 break ;
2491 case wxFONTENCODING_MACTHAI :
2492 enc = kCFStringEncodingMacThai ;
2493 break ;
2494 case wxFONTENCODING_MACLAOTIAN :
2495 enc = kCFStringEncodingMacLaotian ;
2496 break ;
2497 case wxFONTENCODING_MACGEORGIAN :
2498 enc = kCFStringEncodingMacGeorgian ;
2499 break ;
2500 case wxFONTENCODING_MACARMENIAN :
2501 enc = kCFStringEncodingMacArmenian ;
2502 break ;
2503 case wxFONTENCODING_MACCHINESESIMP :
2504 enc = kCFStringEncodingMacChineseSimp ;
2505 break ;
2506 case wxFONTENCODING_MACTIBETAN :
2507 enc = kCFStringEncodingMacTibetan ;
2508 break ;
2509 case wxFONTENCODING_MACMONGOLIAN :
2510 enc = kCFStringEncodingMacMongolian ;
2511 break ;
2512 case wxFONTENCODING_MACETHIOPIC :
2513 enc = kCFStringEncodingMacEthiopic ;
2514 break ;
2515 case wxFONTENCODING_MACCENTRALEUR :
2516 enc = kCFStringEncodingMacCentralEurRoman ;
2517 break ;
2518 case wxFONTENCODING_MACVIATNAMESE :
2519 enc = kCFStringEncodingMacVietnamese ;
2520 break ;
2521 case wxFONTENCODING_MACARABICEXT :
2522 enc = kCFStringEncodingMacExtArabic ;
2523 break ;
2524 case wxFONTENCODING_MACSYMBOL :
2525 enc = kCFStringEncodingMacSymbol ;
2526 break ;
2527 case wxFONTENCODING_MACDINGBATS :
2528 enc = kCFStringEncodingMacDingbats ;
2529 break ;
2530 case wxFONTENCODING_MACTURKISH :
2531 enc = kCFStringEncodingMacTurkish ;
2532 break ;
2533 case wxFONTENCODING_MACCROATIAN :
2534 enc = kCFStringEncodingMacCroatian ;
2535 break ;
2536 case wxFONTENCODING_MACICELANDIC :
2537 enc = kCFStringEncodingMacIcelandic ;
2538 break ;
2539 case wxFONTENCODING_MACROMANIAN :
2540 enc = kCFStringEncodingMacRomanian ;
2541 break ;
2542 case wxFONTENCODING_MACCELTIC :
2543 enc = kCFStringEncodingMacCeltic ;
2544 break ;
2545 case wxFONTENCODING_MACGAELIC :
2546 enc = kCFStringEncodingMacGaelic ;
2547 break ;
ecd9653b
WS
2548// case wxFONTENCODING_MACKEYBOARD :
2549// enc = kCFStringEncodingMacKeyboardGlyphs ;
2550// break ;
ef199164 2551
ecd9653b
WS
2552 default :
2553 // because gcc is picky
2554 break ;
ef199164
DS
2555 }
2556
ecd9653b 2557 return enc ;
f7e98dee
RN
2558}
2559
f7e98dee
RN
2560class wxMBConv_cocoa : public wxMBConv
2561{
2562public:
2563 wxMBConv_cocoa()
2564 {
2565 Init(CFStringGetSystemEncoding()) ;
2566 }
2567
d36c9347
VZ
2568 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2569 {
2570 m_encoding = conv.m_encoding;
2571 }
2572
a6900d10 2573#if wxUSE_FONTMAP
f7e98dee
RN
2574 wxMBConv_cocoa(const wxChar* name)
2575 {
267e11c5 2576 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
f7e98dee 2577 }
a6900d10 2578#endif
f7e98dee
RN
2579
2580 wxMBConv_cocoa(wxFontEncoding encoding)
2581 {
2582 Init( wxCFStringEncFromFontEnc(encoding) );
2583 }
2584
2585 ~wxMBConv_cocoa()
2586 {
2587 }
2588
2589 void Init( CFStringEncoding encoding)
2590 {
638357a0 2591 m_encoding = encoding ;
f7e98dee
RN
2592 }
2593
2594 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2595 {
2596 wxASSERT(szUnConv);
ecd9653b 2597
638357a0
RN
2598 CFStringRef theString = CFStringCreateWithBytes (
2599 NULL, //the allocator
2600 (const UInt8*)szUnConv,
2601 strlen(szUnConv),
2602 m_encoding,
2603 false //no BOM/external representation
f7e98dee
RN
2604 );
2605
2606 wxASSERT(theString);
2607
638357a0
RN
2608 size_t nOutLength = CFStringGetLength(theString);
2609
2610 if (szOut == NULL)
f7e98dee 2611 {
f7e98dee 2612 CFRelease(theString);
638357a0 2613 return nOutLength;
f7e98dee 2614 }
ecd9653b 2615
638357a0 2616 CFRange theRange = { 0, nOutSize };
ecd9653b 2617
638357a0
RN
2618#if SIZEOF_WCHAR_T == 4
2619 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2620#endif
3698ae71 2621
f7e98dee 2622 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
3698ae71 2623
f7e98dee 2624 CFRelease(theString);
ecd9653b 2625
ef199164 2626 szUniCharBuffer[nOutLength] = '\0';
f7e98dee
RN
2627
2628#if SIZEOF_WCHAR_T == 4
ef199164
DS
2629 wxMBConvUTF16 converter;
2630 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2631 delete [] szUniCharBuffer;
f7e98dee 2632#endif
3698ae71 2633
638357a0 2634 return nOutLength;
f7e98dee
RN
2635 }
2636
2637 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2638 {
638357a0 2639 wxASSERT(szUnConv);
3698ae71 2640
f7e98dee 2641 size_t nRealOutSize;
638357a0 2642 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 2643 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 2644
f7e98dee 2645#if SIZEOF_WCHAR_T == 4
d9d488cf 2646 wxMBConvUTF16 converter ;
ef199164
DS
2647 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2648 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2649 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
f7e98dee 2650 nBufSize /= sizeof(UniChar);
f7e98dee
RN
2651#endif
2652
2653 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2654 NULL, //allocator
2655 szUniBuffer,
2656 nBufSize,
638357a0 2657 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 2658 );
ecd9653b 2659
f7e98dee 2660 wxASSERT(theString);
ecd9653b 2661
f7e98dee 2662 //Note that CER puts a BOM when converting to unicode
638357a0
RN
2663 //so we check and use getchars instead in that case
2664 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 2665 {
638357a0
RN
2666 if (szOut != NULL)
2667 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
3698ae71 2668
638357a0
RN
2669 nRealOutSize = CFStringGetLength(theString) + 1;
2670 }
2671 else
2672 {
2673 CFStringGetBytes(
2674 theString,
2675 CFRangeMake(0, CFStringGetLength(theString)),
2676 m_encoding,
2677 0, //what to put in characters that can't be converted -
2678 //0 tells CFString to return NULL if it meets such a character
2679 false, //not an external representation
2680 (UInt8*) szOut,
3698ae71 2681 nOutSize,
638357a0
RN
2682 (CFIndex*) &nRealOutSize
2683 );
f7e98dee 2684 }
ecd9653b 2685
638357a0 2686 CFRelease(theString);
ecd9653b 2687
638357a0
RN
2688#if SIZEOF_WCHAR_T == 4
2689 delete[] szUniBuffer;
2690#endif
ecd9653b 2691
f7e98dee
RN
2692 return nRealOutSize - 1;
2693 }
2694
d36c9347
VZ
2695 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2696
f7e98dee 2697 bool IsOk() const
ecd9653b 2698 {
3698ae71 2699 return m_encoding != kCFStringEncodingInvalidId &&
638357a0 2700 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
2701 }
2702
2703private:
638357a0 2704 CFStringEncoding m_encoding ;
f7e98dee
RN
2705};
2706
2707#endif // defined(__WXCOCOA__)
2708
335d31e0
SC
2709// ============================================================================
2710// Mac conversion classes
2711// ============================================================================
2712
2713#if defined(__WXMAC__) && defined(TARGET_CARBON)
2714
2715class wxMBConv_mac : public wxMBConv
2716{
2717public:
2718 wxMBConv_mac()
2719 {
2720 Init(CFStringGetSystemEncoding()) ;
2721 }
2722
d36c9347
VZ
2723 wxMBConv_mac(const wxMBConv_mac& conv)
2724 {
2725 Init(conv.m_char_encoding);
2726 }
2727
2d1659cf 2728#if wxUSE_FONTMAP
335d31e0
SC
2729 wxMBConv_mac(const wxChar* name)
2730 {
ef199164 2731 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
335d31e0 2732 }
2d1659cf 2733#endif
335d31e0
SC
2734
2735 wxMBConv_mac(wxFontEncoding encoding)
2736 {
d775fa82
WS
2737 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2738 }
2739
2740 ~wxMBConv_mac()
2741 {
2742 OSStatus status = noErr ;
2743 status = TECDisposeConverter(m_MB2WC_converter);
2744 status = TECDisposeConverter(m_WC2MB_converter);
2745 }
2746
2747
2748 void Init( TextEncodingBase encoding)
2749 {
2750 OSStatus status = noErr ;
2751 m_char_encoding = encoding ;
ef199164 2752 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
d775fa82
WS
2753
2754 status = TECCreateConverter(&m_MB2WC_converter,
2755 m_char_encoding,
2756 m_unicode_encoding);
2757 status = TECCreateConverter(&m_WC2MB_converter,
2758 m_unicode_encoding,
2759 m_char_encoding);
2760 }
2761
335d31e0
SC
2762 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2763 {
d775fa82
WS
2764 OSStatus status = noErr ;
2765 ByteCount byteOutLen ;
9088c87b 2766 ByteCount byteInLen = strlen(psz) + 1;
d775fa82
WS
2767 wchar_t *tbuf = NULL ;
2768 UniChar* ubuf = NULL ;
2769 size_t res = 0 ;
2770
2771 if (buf == NULL)
2772 {
ef199164
DS
2773 // Apple specs say at least 32
2774 n = wxMax( 32, byteInLen ) ;
2775 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
d775fa82 2776 }
ef199164 2777
d775fa82 2778 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
ef199164 2779
f3a355ce 2780#if SIZEOF_WCHAR_T == 4
d775fa82 2781 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2782#else
d775fa82 2783 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2784#endif
ef199164
DS
2785
2786 status = TECConvertText(
2787 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2788 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2789
f3a355ce 2790#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2791 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2792 // is not properly terminated we get random characters at the end
2793 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d9d488cf 2794 wxMBConvUTF16 converter ;
ef199164 2795 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
d775fa82 2796 free( ubuf ) ;
f3a355ce 2797#else
d775fa82 2798 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2799#endif
ef199164 2800
d775fa82
WS
2801 if ( buf == NULL )
2802 free(tbuf) ;
335d31e0 2803
335d31e0
SC
2804 if ( buf && res < n)
2805 buf[res] = 0;
2806
d775fa82 2807 return res ;
335d31e0
SC
2808 }
2809
2810 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2811 {
2812 OSStatus status = noErr ;
2813 ByteCount byteOutLen ;
2814 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2815
2816 char *tbuf = NULL ;
2817
2818 if (buf == NULL)
2819 {
ef199164
DS
2820 // Apple specs say at least 32
2821 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2822 tbuf = (char*) malloc( n ) ;
2823 }
2824
2825 ByteCount byteBufferLen = n ;
2826 UniChar* ubuf = NULL ;
ef199164 2827
f3a355ce 2828#if SIZEOF_WCHAR_T == 4
d9d488cf 2829 wxMBConvUTF16 converter ;
ef199164 2830 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
d775fa82
WS
2831 byteInLen = unicharlen ;
2832 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
ef199164 2833 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
f3a355ce 2834#else
d775fa82 2835 ubuf = (UniChar*) psz ;
f3a355ce 2836#endif
ef199164
DS
2837
2838 status = TECConvertText(
2839 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2840 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2841
f3a355ce 2842#if SIZEOF_WCHAR_T == 4
d775fa82 2843 free( ubuf ) ;
f3a355ce 2844#endif
ef199164 2845
d775fa82
WS
2846 if ( buf == NULL )
2847 free(tbuf) ;
335d31e0 2848
d775fa82 2849 size_t res = byteOutLen ;
335d31e0 2850 if ( buf && res < n)
638357a0 2851 {
335d31e0 2852 buf[res] = 0;
3698ae71 2853
638357a0
RN
2854 //we need to double-trip to verify it didn't insert any ? in place
2855 //of bogus characters
2856 wxWCharBuffer wcBuf(n);
2857 size_t pszlen = wxWcslen(psz);
467e0479 2858 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
638357a0
RN
2859 wxWcslen(wcBuf) != pszlen ||
2860 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2861 {
2862 // we didn't obtain the same thing we started from, hence
2863 // the conversion was lossy and we consider that it failed
467e0479 2864 return wxCONV_FAILED;
638357a0
RN
2865 }
2866 }
335d31e0 2867
d775fa82 2868 return res ;
335d31e0
SC
2869 }
2870
d3478e2c 2871 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
d36c9347 2872
335d31e0 2873 bool IsOk() const
ef199164 2874 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
335d31e0
SC
2875
2876private:
ef199164
DS
2877 TECObjectRef m_MB2WC_converter;
2878 TECObjectRef m_WC2MB_converter;
d775fa82 2879
ef199164
DS
2880 TextEncodingBase m_char_encoding;
2881 TextEncodingBase m_unicode_encoding;
335d31e0
SC
2882};
2883
2884#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2885
36acb880
VZ
2886// ============================================================================
2887// wxEncodingConverter based conversion classes
2888// ============================================================================
2889
1e6feb95 2890#if wxUSE_FONTMAP
1cd52418 2891
e95354ec 2892class wxMBConv_wxwin : public wxMBConv
1cd52418 2893{
8b04d4c4
VZ
2894private:
2895 void Init()
2896 {
2897 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2898 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2899 }
2900
6001e347 2901public:
f1339c56
RR
2902 // temporarily just use wxEncodingConverter stuff,
2903 // so that it works while a better implementation is built
e95354ec 2904 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2905 {
2906 if (name)
267e11c5 2907 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2908 else
2909 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2910
8b04d4c4
VZ
2911 Init();
2912 }
2913
e95354ec 2914 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2915 {
2916 m_enc = enc;
2917
2918 Init();
f1339c56 2919 }
dccce9ea 2920
bde4baac 2921 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2922 {
2923 size_t inbuf = strlen(psz);
dccce9ea 2924 if (buf)
c643a977 2925 {
ef199164 2926 if (!m2w.Convert(psz, buf))
467e0479 2927 return wxCONV_FAILED;
c643a977 2928 }
f1339c56
RR
2929 return inbuf;
2930 }
dccce9ea 2931
bde4baac 2932 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2933 {
f8d791e0 2934 const size_t inbuf = wxWcslen(psz);
f1339c56 2935 if (buf)
c643a977 2936 {
ef199164 2937 if (!w2m.Convert(psz, buf))
467e0479 2938 return wxCONV_FAILED;
c643a977 2939 }
dccce9ea 2940
f1339c56
RR
2941 return inbuf;
2942 }
dccce9ea 2943
7ef3ab50 2944 virtual size_t GetMBNulLen() const
eec47cc6
VZ
2945 {
2946 switch ( m_enc )
2947 {
2948 case wxFONTENCODING_UTF16BE:
2949 case wxFONTENCODING_UTF16LE:
c1464d9d 2950 return 2;
eec47cc6
VZ
2951
2952 case wxFONTENCODING_UTF32BE:
2953 case wxFONTENCODING_UTF32LE:
c1464d9d 2954 return 4;
eec47cc6
VZ
2955
2956 default:
c1464d9d 2957 return 1;
eec47cc6
VZ
2958 }
2959 }
2960
d36c9347
VZ
2961 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2962
7ef3ab50
VZ
2963 bool IsOk() const { return m_ok; }
2964
2965public:
2966 wxFontEncoding m_enc;
2967 wxEncodingConverter m2w, w2m;
2968
2969private:
cafbf6fb
VZ
2970 // were we initialized successfully?
2971 bool m_ok;
fc7a2a60 2972
e95354ec 2973 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2974};
6001e347 2975
8f115891
MW
2976// make the constructors available for unit testing
2977WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2978{
2979 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2980 if ( !result->IsOk() )
2981 {
2982 delete result;
2983 return 0;
2984 }
ef199164 2985
8f115891
MW
2986 return result;
2987}
2988
1e6feb95
VZ
2989#endif // wxUSE_FONTMAP
2990
36acb880
VZ
2991// ============================================================================
2992// wxCSConv implementation
2993// ============================================================================
2994
8b04d4c4 2995void wxCSConv::Init()
6001e347 2996{
e95354ec
VZ
2997 m_name = NULL;
2998 m_convReal = NULL;
2999 m_deferred = true;
3000}
3001
8b04d4c4
VZ
3002wxCSConv::wxCSConv(const wxChar *charset)
3003{
3004 Init();
82713003 3005
e95354ec
VZ
3006 if ( charset )
3007 {
e95354ec
VZ
3008 SetName(charset);
3009 }
bda3d86a 3010
e4277538
VZ
3011#if wxUSE_FONTMAP
3012 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3013#else
bda3d86a 3014 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 3015#endif
6001e347
RR
3016}
3017
8b04d4c4
VZ
3018wxCSConv::wxCSConv(wxFontEncoding encoding)
3019{
bda3d86a 3020 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
3021 {
3022 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3023
3024 encoding = wxFONTENCODING_SYSTEM;
3025 }
3026
8b04d4c4
VZ
3027 Init();
3028
bda3d86a 3029 m_encoding = encoding;
8b04d4c4
VZ
3030}
3031
6001e347
RR
3032wxCSConv::~wxCSConv()
3033{
65e50848
JS
3034 Clear();
3035}
3036
54380f29 3037wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 3038 : wxMBConv()
54380f29 3039{
8b04d4c4
VZ
3040 Init();
3041
54380f29 3042 SetName(conv.m_name);
8b04d4c4 3043 m_encoding = conv.m_encoding;
54380f29
GD
3044}
3045
3046wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3047{
3048 Clear();
8b04d4c4 3049
54380f29 3050 SetName(conv.m_name);
8b04d4c4
VZ
3051 m_encoding = conv.m_encoding;
3052
54380f29
GD
3053 return *this;
3054}
3055
65e50848
JS
3056void wxCSConv::Clear()
3057{
8b04d4c4 3058 free(m_name);
e95354ec 3059 delete m_convReal;
8b04d4c4 3060
65e50848 3061 m_name = NULL;
e95354ec 3062 m_convReal = NULL;
6001e347
RR
3063}
3064
3065void wxCSConv::SetName(const wxChar *charset)
3066{
f1339c56
RR
3067 if (charset)
3068 {
3069 m_name = wxStrdup(charset);
e95354ec 3070 m_deferred = true;
f1339c56 3071 }
6001e347
RR
3072}
3073
8b3eb85d
VZ
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map from a font encoding to a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

// cache of the names under which the encodings were successfully opened by
// iconv, an empty name meaning that none of them worked
static wxEncodingNameCache gs_nameCache;
#endif
3082
e95354ec
VZ
3083wxMBConv *wxCSConv::DoCreate() const
3084{
ce6f8d6f
VZ
3085#if wxUSE_FONTMAP
3086 wxLogTrace(TRACE_STRCONV,
3087 wxT("creating conversion for %s"),
3088 (m_name ? m_name
3089 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3090#endif // wxUSE_FONTMAP
3091
c547282d
VZ
3092 // check for the special case of ASCII or ISO8859-1 charset: as we have
3093 // special knowledge of it anyhow, we don't need to create a special
3094 // conversion object
e4277538
VZ
3095 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3096 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 3097 {
e95354ec
VZ
3098 // don't convert at all
3099 return NULL;
3100 }
dccce9ea 3101
e95354ec
VZ
3102 // we trust OS to do conversion better than we can so try external
3103 // conversion methods first
3104 //
3105 // the full order is:
3106 // 1. OS conversion (iconv() under Unix or Win32 API)
3107 // 2. hard coded conversions for UTF
3108 // 3. wxEncodingConverter as fall back
3109
3110 // step (1)
3111#ifdef HAVE_ICONV
c547282d 3112#if !wxUSE_FONTMAP
e95354ec 3113 if ( m_name )
c547282d 3114#endif // !wxUSE_FONTMAP
e95354ec 3115 {
c547282d 3116 wxString name(m_name);
8b3eb85d
VZ
3117 wxFontEncoding encoding(m_encoding);
3118
3119 if ( !name.empty() )
3120 {
3121 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3122 if ( conv->IsOk() )
3123 return conv;
3124
3125 delete conv;
c547282d
VZ
3126
3127#if wxUSE_FONTMAP
8b3eb85d
VZ
3128 encoding =
3129 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
c547282d 3130#endif // wxUSE_FONTMAP
8b3eb85d
VZ
3131 }
3132#if wxUSE_FONTMAP
3133 {
3134 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3135 if ( it != gs_nameCache.end() )
3136 {
3137 if ( it->second.empty() )
3138 return NULL;
c547282d 3139
8b3eb85d
VZ
3140 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3141 if ( conv->IsOk() )
3142 return conv;
e95354ec 3143
8b3eb85d
VZ
3144 delete conv;
3145 }
3146
3147 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3148
3149 for ( ; *names; ++names )
3150 {
3151 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3152 if ( conv->IsOk() )
3153 {
3154 gs_nameCache[encoding] = *names;
3155 return conv;
3156 }
3157
3158 delete conv;
3159 }
3160
40711af8 3161 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d
VZ
3162 }
3163#endif // wxUSE_FONTMAP
e95354ec
VZ
3164 }
3165#endif // HAVE_ICONV
3166
3167#ifdef wxHAVE_WIN32_MB2WC
3168 {
7608a683 3169#if wxUSE_FONTMAP
e95354ec
VZ
3170 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3171 : new wxMBConv_win32(m_encoding);
3172 if ( conv->IsOk() )
3173 return conv;
3174
3175 delete conv;
7608a683
WS
3176#else
3177 return NULL;
3178#endif
e95354ec
VZ
3179 }
3180#endif // wxHAVE_WIN32_MB2WC
ef199164 3181
d775fa82
WS
3182#if defined(__WXMAC__)
3183 {
5c3c8676 3184 // leave UTF16 and UTF32 to the built-ins of wx
3698ae71 3185 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
5c3c8676 3186 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
d775fa82 3187 {
2d1659cf 3188#if wxUSE_FONTMAP
d775fa82
WS
3189 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3190 : new wxMBConv_mac(m_encoding);
2d1659cf
RN
3191#else
3192 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3193#endif
d775fa82 3194 if ( conv->IsOk() )
f7e98dee
RN
3195 return conv;
3196
3197 delete conv;
3198 }
3199 }
3200#endif
ef199164 3201
f7e98dee
RN
3202#if defined(__WXCOCOA__)
3203 {
3204 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3205 {
a6900d10 3206#if wxUSE_FONTMAP
f7e98dee
RN
3207 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3208 : new wxMBConv_cocoa(m_encoding);
a6900d10
RN
3209#else
3210 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3211#endif
ef199164 3212
f7e98dee 3213 if ( conv->IsOk() )
d775fa82
WS
3214 return conv;
3215
3216 delete conv;
3217 }
335d31e0
SC
3218 }
3219#endif
e95354ec
VZ
3220 // step (2)
3221 wxFontEncoding enc = m_encoding;
3222#if wxUSE_FONTMAP
c547282d
VZ
3223 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3224 {
3225 // use "false" to suppress interactive dialogs -- we can be called from
3226 // anywhere and popping up a dialog from here is the last thing we want to
3227 // do
267e11c5 3228 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 3229 }
e95354ec
VZ
3230#endif // wxUSE_FONTMAP
3231
3232 switch ( enc )
3233 {
3234 case wxFONTENCODING_UTF7:
3235 return new wxMBConvUTF7;
3236
3237 case wxFONTENCODING_UTF8:
3238 return new wxMBConvUTF8;
3239
e95354ec
VZ
3240 case wxFONTENCODING_UTF16BE:
3241 return new wxMBConvUTF16BE;
3242
3243 case wxFONTENCODING_UTF16LE:
3244 return new wxMBConvUTF16LE;
3245
e95354ec
VZ
3246 case wxFONTENCODING_UTF32BE:
3247 return new wxMBConvUTF32BE;
3248
3249 case wxFONTENCODING_UTF32LE:
3250 return new wxMBConvUTF32LE;
3251
3252 default:
3253 // nothing to do but put here to suppress gcc warnings
ef199164 3254 break;
e95354ec
VZ
3255 }
3256
3257 // step (3)
3258#if wxUSE_FONTMAP
3259 {
3260 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3261 : new wxMBConv_wxwin(m_encoding);
3262 if ( conv->IsOk() )
3263 return conv;
3264
3265 delete conv;
3266 }
3267#endif // wxUSE_FONTMAP
3268
a58d4f4d
VS
3269 // NB: This is a hack to prevent deadlock. What could otherwise happen
3270 // in Unicode build: wxConvLocal creation ends up being here
3271 // because of some failure and logs the error. But wxLog will try to
3272 // attach timestamp, for which it will need wxConvLocal (to convert
3273 // time to char* and then wchar_t*), but that fails, tries to log
3274 // error, but wxLog has a (already locked) critical section that
3275 // guards static buffer.
3276 static bool alreadyLoggingError = false;
3277 if (!alreadyLoggingError)
3278 {
3279 alreadyLoggingError = true;
3280 wxLogError(_("Cannot convert from the charset '%s'!"),
3281 m_name ? m_name
e95354ec
VZ
3282 :
3283#if wxUSE_FONTMAP
267e11c5 3284 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
e95354ec
VZ
3285#else // !wxUSE_FONTMAP
3286 wxString::Format(_("encoding %s"), m_encoding).c_str()
3287#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3288 );
ef199164 3289
a58d4f4d
VS
3290 alreadyLoggingError = false;
3291 }
e95354ec
VZ
3292
3293 return NULL;
3294}
3295
3296void wxCSConv::CreateConvIfNeeded() const
3297{
3298 if ( m_deferred )
3299 {
3300 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
3301
3302#if wxUSE_INTL
3303 // if we don't have neither the name nor the encoding, use the default
3304 // encoding for this system
3305 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3306 {
4d312c22 3307 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
3308 }
3309#endif // wxUSE_INTL
3310
e95354ec
VZ
3311 self->m_convReal = DoCreate();
3312 self->m_deferred = false;
6001e347 3313 }
6001e347
RR
3314}
3315
3316size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3317{
e95354ec 3318 CreateConvIfNeeded();
dccce9ea 3319
e95354ec
VZ
3320 if (m_convReal)
3321 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
3322
3323 // latin-1 (direct)
4def3b35 3324 size_t len = strlen(psz);
dccce9ea 3325
f1339c56
RR
3326 if (buf)
3327 {
4def3b35 3328 for (size_t c = 0; c <= len; c++)
f1339c56
RR
3329 buf[c] = (unsigned char)(psz[c]);
3330 }
dccce9ea 3331
f1339c56 3332 return len;
6001e347
RR
3333}
3334
3335size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3336{
e95354ec 3337 CreateConvIfNeeded();
dccce9ea 3338
e95354ec
VZ
3339 if (m_convReal)
3340 return m_convReal->WC2MB(buf, psz, n);
1cd52418 3341
f1339c56 3342 // latin-1 (direct)
f8d791e0 3343 const size_t len = wxWcslen(psz);
f1339c56
RR
3344 if (buf)
3345 {
4def3b35 3346 for (size_t c = 0; c <= len; c++)
24642831
VS
3347 {
3348 if (psz[c] > 0xFF)
467e0479 3349 return wxCONV_FAILED;
ef199164 3350
907173e5 3351 buf[c] = (char)psz[c];
24642831
VS
3352 }
3353 }
3354 else
3355 {
3356 for (size_t c = 0; c <= len; c++)
3357 {
3358 if (psz[c] > 0xFF)
467e0479 3359 return wxCONV_FAILED;
24642831 3360 }
f1339c56 3361 }
dccce9ea 3362
f1339c56 3363 return len;
6001e347
RR
3364}
3365
7ef3ab50 3366size_t wxCSConv::GetMBNulLen() const
eec47cc6
VZ
3367{
3368 CreateConvIfNeeded();
3369
3370 if ( m_convReal )
3371 {
7ef3ab50 3372 return m_convReal->GetMBNulLen();
eec47cc6
VZ
3373 }
3374
c1464d9d 3375 return 1;
eec47cc6
VZ
3376}
3377
bde4baac
VZ
3378// ----------------------------------------------------------------------------
3379// globals
3380// ----------------------------------------------------------------------------
3381
// the object behind wxConvLibc: its class is selected depending on the
// platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// the exported converters are references (or pointers) to the file-static
// objects defined above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
// file names use the UTF-8 converter under __WXOSX__ and the libc one
// everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3408
bde4baac
VZ
3409#else // !wxUSE_WCHAR_T
3410
// stand-ins in absence of wchar_t: plain wxMBConv objects are defined under
// the names of the predefined converters
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3416
3417#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T