]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
let the UTF7 test fail but not crash
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
f6bcfd97
BP
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
6001e347
RR
23// For compilers that support precompilation, includes "wx.h".
24#include "wx/wxprec.h"
25
26#ifdef __BORLANDC__
27 #pragma hdrstop
28#endif
29
373658eb
VZ
30#ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33#endif // WX_PRECOMP
34
bde4baac
VZ
35#include "wx/strconv.h"
36
37#if wxUSE_WCHAR_T
38
7608a683 39#ifdef __WINDOWS__
532d575b 40 #include "wx/msw/private.h"
13dd924a 41 #include "wx/msw/missing.h"
0a1c1e62
GRG
42#endif
43
1c193821 44#ifndef __WXWINCE__
1cd52418 45#include <errno.h>
1c193821
JS
46#endif
47
6001e347
RR
48#include <ctype.h>
49#include <string.h>
50#include <stdlib.h>
51
e95354ec
VZ
52#if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54#endif // __WIN32__ but !__WXMICROWIN__
55
6001e347 56#ifdef __SALFORDC__
373658eb 57 #include <clib.h>
6001e347
RR
58#endif
59
b040e242 60#ifdef HAVE_ICONV
373658eb 61 #include <iconv.h>
b1d547eb 62 #include "wx/thread.h"
1cd52418 63#endif
1cd52418 64
373658eb
VZ
65#include "wx/encconv.h"
66#include "wx/fontmap.h"
7608a683 67#include "wx/utils.h"
373658eb 68
335d31e0 69#ifdef __WXMAC__
40ba2f3b 70#ifndef __DARWIN__
4227afa4
SC
71#include <ATSUnicode.h>
72#include <TextCommon.h>
73#include <TextEncodingConverter.h>
40ba2f3b 74#endif
335d31e0
SC
75
76#include "wx/mac/private.h" // includes mac headers
77#endif
ce6f8d6f
VZ
78
79#define TRACE_STRCONV _T("strconv")
80
4948c2b6 81#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
82 #define WC_UTF16
83#endif
84
373658eb
VZ
85// ============================================================================
86// implementation
87// ============================================================================
88
89// ----------------------------------------------------------------------------
c91830cb 90// UTF-16 en/decoding to/from UCS-4
373658eb 91// ----------------------------------------------------------------------------
6001e347 92
b0a6bb75 93
c91830cb 94static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 95{
dccce9ea 96 if (input<=0xffff)
4def3b35 97 {
999836aa
VZ
98 if (output)
99 *output = (wxUint16) input;
4def3b35 100 return 1;
dccce9ea
VZ
101 }
102 else if (input>=0x110000)
4def3b35
VS
103 {
104 return (size_t)-1;
dccce9ea
VZ
105 }
106 else
4def3b35 107 {
dccce9ea 108 if (output)
4def3b35 109 {
c91830cb 110 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
999836aa 111 *output = (wxUint16) ((input&0x3ff)+0xdc00);
4def3b35
VS
112 }
113 return 2;
1cd52418 114 }
1cd52418
OK
115}
116
c91830cb 117static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 118{
dccce9ea 119 if ((*input<0xd800) || (*input>0xdfff))
4def3b35
VS
120 {
121 output = *input;
122 return 1;
dccce9ea 123 }
cdb14ecb 124 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
4def3b35
VS
125 {
126 output = *input;
127 return (size_t)-1;
dccce9ea
VZ
128 }
129 else
4def3b35
VS
130 {
131 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
132 return 2;
133 }
1cd52418
OK
134}
135
b0a6bb75 136
f6bcfd97 137// ----------------------------------------------------------------------------
6001e347 138// wxMBConv
f6bcfd97 139// ----------------------------------------------------------------------------
2c53a80a
WS
140
141wxMBConv::~wxMBConv()
142{
143 // nothing to do here (necessary for Darwin linking probably)
144}
6001e347 145
6001e347
RR
146const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
147{
2b5f62a0 148 if ( psz )
6001e347 149 {
2b5f62a0
VZ
150 // calculate the length of the buffer needed first
151 size_t nLen = MB2WC(NULL, psz, 0);
152 if ( nLen != (size_t)-1 )
153 {
154 // now do the actual conversion
155 wxWCharBuffer buf(nLen);
635f33ce
VS
156 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
157 if ( nLen != (size_t)-1 )
158 {
159 return buf;
160 }
2b5f62a0 161 }
f6bcfd97 162 }
2b5f62a0
VZ
163
164 wxWCharBuffer buf((wchar_t *)NULL);
165
166 return buf;
6001e347
RR
167}
168
e5cceba0 169const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
6001e347 170{
2b5f62a0
VZ
171 if ( pwz )
172 {
173 size_t nLen = WC2MB(NULL, pwz, 0);
174 if ( nLen != (size_t)-1 )
175 {
c91830cb 176 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
635f33ce
VS
177 nLen = WC2MB(buf.data(), pwz, nLen + 4);
178 if ( nLen != (size_t)-1 )
179 {
180 return buf;
181 }
2b5f62a0
VZ
182 }
183 }
184
185 wxCharBuffer buf((char *)NULL);
e5cceba0 186
e5cceba0 187 return buf;
6001e347
RR
188}
189
eec47cc6
VZ
190const wxWCharBuffer
191wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
e4e3bbb4 192{
eec47cc6
VZ
193 // the currently accumulated wide characters
194 wxWCharBuffer wbuf;
195
196 // the current length of wbuf
197 size_t lenBuf = 0;
198
199 // we need to know the representation of L'\0' for this conversion
200 size_t nulLen;
201 const char * const nul = GetMBNul(&nulLen);
202 if ( nulLen == (size_t)-1 || nulLen == 0 )
203 return wxWCharBuffer();
204
205 // make a copy of the input string unless it is already properly
206 // NUL-terminated
207 wxCharBuffer bufTmp;
208
209 // now we can compute the input size if we were not given it: notice that
210 // in this case the string must be properly NUL-terminated, of course, as
211 // otherwise we have no way of knowing how long it is
212 if ( inLen == (size_t)-1 )
213 {
214 // not the most efficient algorithm but it shouldn't matter as normally
215 // there are not many NULs in the string and so normally memcmp()
216 // should stop on the first character
22886fb3
VZ
217 const char *p = in;
218 while ( memcmp(p, nul, nulLen) != 0 )
219 p++;
e4e3bbb4 220
eec47cc6
VZ
221 inLen = p - in + nulLen;
222 }
223 else // we already have the size
e4e3bbb4 224 {
eec47cc6
VZ
225 // check if it's not already NUL-terminated too to avoid the copy
226 if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 )
227 {
228 // make a copy in order to properly NUL-terminate the string
229 bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
230 memcpy(bufTmp.data(), in, inLen);
231 memcpy(bufTmp.data() + inLen, nul, nulLen);
232 }
233 }
e4e3bbb4 234
eec47cc6
VZ
235 if ( bufTmp )
236 in = bufTmp;
e4e3bbb4 237
eec47cc6
VZ
238 for ( const char * const inEnd = in + inLen;; )
239 {
240 // try to convert the current chunk if anything left
241 size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0;
242 if ( lenChunk == 0 )
f5fb6871 243 {
eec47cc6
VZ
244 // nothing left in the input string, conversion succeeded
245 if ( outLen )
246 {
247 // we shouldn't include the last NUL in the result length
248 *outLen = lenBuf ? lenBuf - 1 : 0;
249 }
250
251 return wbuf;
f5fb6871
RN
252 }
253
eec47cc6
VZ
254 if ( lenChunk == (size_t)-1 )
255 break;
e4e3bbb4 256
eec47cc6
VZ
257 const size_t lenBufNew = lenBuf + lenChunk;
258 if ( !wbuf.extend(lenBufNew) )
259 break;
e4e3bbb4 260
eec47cc6
VZ
261 lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
262 if ( lenChunk == (size_t)-1 )
263 break;
f5fb6871 264
eec47cc6
VZ
265 // +! for the embedded NUL (if something follows)
266 lenBuf = lenBufNew + 1;
267
268 // advance the input pointer past the end of this chunk
269 while ( memcmp(in, nul, nulLen) != 0 )
270 in++;
e4e3bbb4 271
eec47cc6 272 in += nulLen; // skipping over its terminator as well
e4e3bbb4
RN
273 }
274
eec47cc6
VZ
275 // conversion failed
276 if ( outLen )
277 *outLen = 0;
278
279 return wxWCharBuffer();
e4e3bbb4
RN
280}
281
eec47cc6
VZ
282const wxCharBuffer
283wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
e4e3bbb4 284{
eec47cc6
VZ
285 // the currently accumulated multibyte characters
286 wxCharBuffer buf;
f5fb6871 287
eec47cc6
VZ
288 // the current length of buf
289 size_t lenBuf = 0;
e4e3bbb4 290
eec47cc6
VZ
291 // make a copy of the input string unless it is already properly
292 // NUL-terminated
293 //
294 // if we don't know its length we have no choice but to assume that it is,
295 // indeed, properly terminated
296 wxWCharBuffer bufTmp;
297 if ( inLen == (size_t)-1 )
e4e3bbb4 298 {
eec47cc6
VZ
299 inLen = wxWcslen(in) + 1;
300 }
301 else if ( inLen != 0 && in[inLen - 1] != L'\0' )
302 {
303 // make a copy in order to properly NUL-terminate the string
304 bufTmp = wxWCharBuffer(inLen);
305 memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t));
306 }
e4e3bbb4 307
eec47cc6
VZ
308 if ( bufTmp )
309 in = bufTmp;
e4e3bbb4 310
eec47cc6
VZ
311 for ( const wchar_t * const inEnd = in + inLen;; )
312 {
313 // try to convert the current chunk, if anything left
314 size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0;
315 if ( lenChunk == 0 )
f5fb6871 316 {
eec47cc6
VZ
317 // nothing left in the input string, conversion succeeded
318 if ( outLen )
319 *outLen = lenBuf ? lenBuf - 1 : lenBuf;
320
321 return buf;
f5fb6871 322 }
e4e3bbb4 323
eec47cc6
VZ
324 if ( lenChunk == (size_t)-1 )
325 break;
3698ae71 326
eec47cc6
VZ
327 const size_t lenBufNew = lenBuf + lenChunk;
328 if ( !buf.extend(lenBufNew) )
329 break;
f5fb6871 330
eec47cc6
VZ
331 lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
332 if ( lenChunk == (size_t)-1 )
333 break;
e4e3bbb4 334
eec47cc6
VZ
335 // chunk successfully converted, go to the next one
336 in += wxWcslen(in) + 1 /* skip NUL too */;
337 lenBuf = lenBufNew + 1;
e4e3bbb4
RN
338 }
339
eec47cc6
VZ
340 // conversion failed
341 if ( outLen )
342 *outLen = 0;
343
344 return wxCharBuffer();
e4e3bbb4
RN
345}
346
6001e347 347// ----------------------------------------------------------------------------
bde4baac 348// wxMBConvLibc
6001e347
RR
349// ----------------------------------------------------------------------------
350
bde4baac
VZ
351size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
352{
353 return wxMB2WC(buf, psz, n);
354}
355
356size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
357{
358 return wxWC2MB(buf, psz, n);
359}
e1bfe89e
RR
360
361// ----------------------------------------------------------------------------
532d575b 362// wxConvBrokenFileNames
e1bfe89e
RR
363// ----------------------------------------------------------------------------
364
eec47cc6
VZ
365#ifdef __UNIX__
366
845905d5 367wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
ea8ce907 368{
845905d5
MW
369 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
370 || wxStricmp(charset, _T("UTF8")) == 0 )
371 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
372 else
373 m_conv = new wxCSConv(charset);
ea8ce907
RR
374}
375
eec47cc6 376#endif // __UNIX__
c12b7f79 377
bde4baac 378// ----------------------------------------------------------------------------
3698ae71 379// UTF-7
bde4baac 380// ----------------------------------------------------------------------------
6001e347 381
15f2ee32 382// Implementation (C) 2004 Fredrik Roubert
6001e347 383
15f2ee32
RN
//
// BASE64 decoding table
//
// Indexed by the byte value, it gives the 6 bit value of the BASE64 digits
// ('A'-'Z', 'a'-'z', '0'-'9', '+' and '/') and 0xff for all the other bytes.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x00 - 0x07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x08 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x10 - 0x17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x18 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x20 - 0x27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 0x28 - 0x2f: '+', '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // 0x30 - 0x37: '0'-'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x38 - 0x3f: '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 0x40 - 0x47: 'A'-'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 0x48 - 0x4f: 'H'-'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 0x50 - 0x57: 'P'-'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x58 - 0x5f: 'X'-'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 0x60 - 0x67: 'a'-'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 0x68 - 0x6f: 'h'-'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 0x70 - 0x77: 'p'-'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x78 - 0x7f: 'x'-'z'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x80 - 0x87
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x88 - 0x8f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x90 - 0x97
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x98 - 0x9f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa0 - 0xa7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa8 - 0xaf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb0 - 0xb7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb8 - 0xbf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc0 - 0xc7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc8 - 0xcf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd0 - 0xd7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd8 - 0xdf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe0 - 0xe7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe8 - 0xef
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xf0 - 0xf7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // 0xf8 - 0xff
};
422
423size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
424{
15f2ee32
RN
425 size_t len = 0;
426
04a37834 427 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
428 {
429 unsigned char cc = *psz++;
430 if (cc != '+')
431 {
432 // plain ASCII char
433 if (buf)
434 *buf++ = cc;
435 len++;
436 }
437 else if (*psz == '-')
438 {
439 // encoded plus sign
440 if (buf)
441 *buf++ = cc;
442 len++;
443 psz++;
444 }
04a37834 445 else // start of BASE64 encoded string
15f2ee32 446 {
04a37834 447 bool lsb, ok;
15f2ee32 448 unsigned int d, l;
04a37834
VZ
449 for ( ok = lsb = false, d = 0, l = 0;
450 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
451 psz++ )
15f2ee32
RN
452 {
453 d <<= 6;
454 d += cc;
455 for (l += 6; l >= 8; lsb = !lsb)
456 {
04a37834 457 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
458 if (lsb)
459 {
460 if (buf)
461 *buf++ |= c;
462 len ++;
463 }
464 else
04a37834 465 {
15f2ee32 466 if (buf)
6356d52a 467 *buf = (wchar_t)(c << 8);
04a37834
VZ
468 }
469
470 ok = true;
15f2ee32
RN
471 }
472 }
04a37834
VZ
473
474 if ( !ok )
475 {
476 // in valid UTF7 we should have valid characters after '+'
477 return (size_t)-1;
478 }
479
15f2ee32
RN
480 if (*psz == '-')
481 psz++;
482 }
483 }
04a37834
VZ
484
485 if ( buf && (len < n) )
486 *buf = '\0';
487
15f2ee32 488 return len;
6001e347
RR
489}
490
15f2ee32
RN
//
// BASE64 encoding table
//
// Gives the BASE64 digit corresponding to each of the 64 possible values of
// a group of 6 bits.
//
static const unsigned char utf7enb64[] =
{
    // 0 - 15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16 - 31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32 - 47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48 - 63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
505
//
// UTF-7 encoding table
//
// Gives the class of each of the 128 ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 0x00 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0x10 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 0x20 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x30 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 0x50 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 0x70 - 0x7f
};
525
667e5b3e 526size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 527{
15f2ee32
RN
528 size_t len = 0;
529
530 while (*psz && ((!buf) || (len < n)))
531 {
532 wchar_t cc = *psz++;
533 if (cc < 0x80 && utf7encode[cc] < 1)
534 {
535 // plain ASCII char
536 if (buf)
537 *buf++ = (char)cc;
538 len++;
539 }
540#ifndef WC_UTF16
79c78d42 541 else if (((wxUint32)cc) > 0xffff)
b2c13097 542 {
15f2ee32
RN
543 // no surrogate pair generation (yet?)
544 return (size_t)-1;
545 }
546#endif
547 else
548 {
549 if (buf)
550 *buf++ = '+';
551 len++;
552 if (cc != '+')
553 {
554 // BASE64 encode string
555 unsigned int lsb, d, l;
73c902d6 556 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
557 {
558 for (lsb = 0; lsb < 2; lsb ++)
559 {
560 d <<= 8;
561 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
562
563 for (l += 8; l >= 6; )
564 {
565 l -= 6;
566 if (buf)
567 *buf++ = utf7enb64[(d >> l) % 64];
568 len++;
569 }
570 }
571 cc = *psz;
572 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
573 break;
574 }
575 if (l != 0)
576 {
577 if (buf)
578 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
579 len++;
580 }
581 }
582 if (buf)
583 *buf++ = '-';
584 len++;
585 }
586 }
587 if (buf && (len < n))
588 *buf = 0;
589 return len;
6001e347
RR
590}
591
f6bcfd97 592// ----------------------------------------------------------------------------
6001e347 593// UTF-8
f6bcfd97 594// ----------------------------------------------------------------------------
6001e347 595
dccce9ea 596static wxUint32 utf8_max[]=
4def3b35 597 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 598
3698ae71
VZ
599// boundaries of the private use area we use to (temporarily) remap invalid
600// characters invalid in a UTF-8 encoded string
ea8ce907
RR
601const wxUint32 wxUnicodePUA = 0x100000;
602const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
603
6001e347
RR
604size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
605{
4def3b35
VS
606 size_t len = 0;
607
dccce9ea 608 while (*psz && ((!buf) || (len < n)))
4def3b35 609 {
ea8ce907
RR
610 const char *opsz = psz;
611 bool invalid = false;
4def3b35
VS
612 unsigned char cc = *psz++, fc = cc;
613 unsigned cnt;
dccce9ea 614 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 615 fc <<= 1;
dccce9ea 616 if (!cnt)
4def3b35
VS
617 {
618 // plain ASCII char
dccce9ea 619 if (buf)
4def3b35
VS
620 *buf++ = cc;
621 len++;
561488ef
MW
622
623 // escape the escape character for octal escapes
624 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
625 && cc == '\\' && (!buf || len < n))
626 {
627 if (buf)
628 *buf++ = cc;
629 len++;
630 }
dccce9ea
VZ
631 }
632 else
4def3b35
VS
633 {
634 cnt--;
dccce9ea 635 if (!cnt)
4def3b35
VS
636 {
637 // invalid UTF-8 sequence
ea8ce907 638 invalid = true;
dccce9ea
VZ
639 }
640 else
4def3b35
VS
641 {
642 unsigned ocnt = cnt - 1;
643 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 644 while (cnt--)
4def3b35 645 {
ea8ce907 646 cc = *psz;
dccce9ea 647 if ((cc & 0xC0) != 0x80)
4def3b35
VS
648 {
649 // invalid UTF-8 sequence
ea8ce907
RR
650 invalid = true;
651 break;
4def3b35 652 }
ea8ce907 653 psz++;
4def3b35
VS
654 res = (res << 6) | (cc & 0x3f);
655 }
ea8ce907 656 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
657 {
658 // illegal UTF-8 encoding
ea8ce907 659 invalid = true;
4def3b35 660 }
ea8ce907
RR
661 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
662 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
663 {
664 // if one of our PUA characters turns up externally
665 // it must also be treated as an illegal sequence
666 // (a bit like you have to escape an escape character)
667 invalid = true;
668 }
669 else
670 {
1cd52418 671#ifdef WC_UTF16
ea8ce907
RR
672 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
673 size_t pa = encode_utf16(res, (wxUint16 *)buf);
674 if (pa == (size_t)-1)
675 {
676 invalid = true;
677 }
678 else
679 {
680 if (buf)
681 buf += pa;
682 len += pa;
683 }
373658eb 684#else // !WC_UTF16
ea8ce907 685 if (buf)
38d4b1e4 686 *buf++ = (wchar_t)res;
ea8ce907 687 len++;
373658eb 688#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
689 }
690 }
691 if (invalid)
692 {
693 if (m_options & MAP_INVALID_UTF8_TO_PUA)
694 {
695 while (opsz < psz && (!buf || len < n))
696 {
697#ifdef WC_UTF16
698 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
699 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
700 wxASSERT(pa != (size_t)-1);
701 if (buf)
702 buf += pa;
703 opsz++;
704 len += pa;
705#else
706 if (buf)
38d4b1e4 707 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
708 opsz++;
709 len++;
710#endif
711 }
712 }
3698ae71 713 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
714 {
715 while (opsz < psz && (!buf || len < n))
716 {
3698ae71
VZ
717 if ( buf && len + 3 < n )
718 {
17a1ebd1 719 unsigned char on = *opsz;
3698ae71 720 *buf++ = L'\\';
17a1ebd1
VZ
721 *buf++ = (wchar_t)( L'0' + on / 0100 );
722 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
723 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 724 }
ea8ce907
RR
725 opsz++;
726 len += 4;
727 }
728 }
3698ae71 729 else // MAP_INVALID_UTF8_NOT
ea8ce907
RR
730 {
731 return (size_t)-1;
732 }
4def3b35
VS
733 }
734 }
6001e347 735 }
dccce9ea 736 if (buf && (len < n))
4def3b35
VS
737 *buf = 0;
738 return len;
6001e347
RR
739}
740
3698ae71
VZ
// Return true if the given wide character is an octal digit ('0' to '7').
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
745
6001e347
RR
746size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
747{
4def3b35 748 size_t len = 0;
6001e347 749
dccce9ea 750 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
751 {
752 wxUint32 cc;
1cd52418 753#ifdef WC_UTF16
b5153fd8
VZ
754 // cast is ok for WC_UTF16
755 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
4def3b35 756 psz += (pa == (size_t)-1) ? 1 : pa;
1cd52418 757#else
4def3b35
VS
758 cc=(*psz++) & 0x7fffffff;
759#endif
3698ae71
VZ
760
761 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
762 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 763 {
dccce9ea 764 if (buf)
ea8ce907 765 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 766 len++;
3698ae71 767 }
561488ef
MW
768 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
769 && cc == L'\\' && psz[0] == L'\\' )
770 {
771 if (buf)
772 *buf++ = (char)cc;
773 psz++;
774 len++;
775 }
3698ae71
VZ
776 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
777 cc == L'\\' &&
778 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 779 {
dccce9ea 780 if (buf)
3698ae71 781 {
b2c13097
WS
782 *buf++ = (char) ((psz[0] - L'0')*0100 +
783 (psz[1] - L'0')*010 +
784 (psz[2] - L'0'));
3698ae71
VZ
785 }
786
787 psz += 3;
ea8ce907
RR
788 len++;
789 }
790 else
791 {
792 unsigned cnt;
793 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
794 if (!cnt)
4def3b35 795 {
ea8ce907
RR
796 // plain ASCII char
797 if (buf)
798 *buf++ = (char) cc;
799 len++;
800 }
801
802 else
803 {
804 len += cnt + 1;
805 if (buf)
806 {
807 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
808 while (cnt--)
809 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
810 }
4def3b35
VS
811 }
812 }
6001e347 813 }
4def3b35 814
3698ae71
VZ
815 if (buf && (len<n))
816 *buf = 0;
adb45366 817
4def3b35 818 return len;
6001e347
RR
819}
820
c91830cb
VZ
821// ----------------------------------------------------------------------------
822// UTF-16
823// ----------------------------------------------------------------------------
824
825#ifdef WORDS_BIGENDIAN
bde4baac
VZ
826 #define wxMBConvUTF16straight wxMBConvUTF16BE
827 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 828#else
bde4baac
VZ
829 #define wxMBConvUTF16swap wxMBConvUTF16BE
830 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
831#endif
832
833
c91830cb
VZ
834#ifdef WC_UTF16
835
c91830cb
VZ
836// copy 16bit MB to 16bit String
837size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
838{
839 size_t len=0;
840
841 while (*(wxUint16*)psz && (!buf || len < n))
842 {
843 if (buf)
844 *buf++ = *(wxUint16*)psz;
845 len++;
846
847 psz += sizeof(wxUint16);
848 }
849 if (buf && len<n) *buf=0;
850
851 return len;
852}
853
854
855// copy 16bit String to 16bit MB
856size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
857{
858 size_t len=0;
859
860 while (*psz && (!buf || len < n))
861 {
862 if (buf)
863 {
864 *(wxUint16*)buf = *psz;
865 buf += sizeof(wxUint16);
866 }
867 len += sizeof(wxUint16);
868 psz++;
869 }
870 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
871
872 return len;
873}
874
875
876// swap 16bit MB to 16bit String
877size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
878{
bfab25d4 879 size_t len = 0;
c91830cb 880
da12017a
VZ
881 // UTF16 string must be terminated by 2 NULs as single NULs may occur
882 // inside the string
883 while ( (psz[0] || psz[1]) && (!buf || len < n) )
c91830cb 884 {
bfab25d4 885 if ( buf )
c91830cb
VZ
886 {
887 ((char *)buf)[0] = psz[1];
888 ((char *)buf)[1] = psz[0];
889 buf++;
890 }
891 len++;
bfab25d4 892 psz += 2;
c91830cb 893 }
bfab25d4
VZ
894
895 if ( buf && len < n )
896 *buf = L'\0';
c91830cb
VZ
897
898 return len;
899}
900
901
902// swap 16bit MB to 16bit String
903size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
904{
eec47cc6 905 size_t len = 0;
c91830cb 906
eec47cc6 907 while ( *psz && (!buf || len < n) )
c91830cb 908 {
eec47cc6 909 if ( buf )
c91830cb
VZ
910 {
911 *buf++ = ((char*)psz)[1];
912 *buf++ = ((char*)psz)[0];
913 }
eec47cc6 914 len += 2;
c91830cb
VZ
915 psz++;
916 }
eec47cc6
VZ
917
918 if ( buf && len < n )
919 *buf = '\0';
c91830cb
VZ
920
921 return len;
922}
923
924
925#else // WC_UTF16
926
927
928// copy 16bit MB to 32bit String
929size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
930{
931 size_t len=0;
932
933 while (*(wxUint16*)psz && (!buf || len < n))
934 {
935 wxUint32 cc;
936 size_t pa=decode_utf16((wxUint16*)psz, cc);
937 if (pa == (size_t)-1)
938 return pa;
939
940 if (buf)
38d4b1e4 941 *buf++ = (wchar_t)cc;
c91830cb
VZ
942 len++;
943 psz += pa * sizeof(wxUint16);
944 }
945 if (buf && len<n) *buf=0;
946
947 return len;
948}
949
950
951// copy 32bit String to 16bit MB
952size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
953{
954 size_t len=0;
955
956 while (*psz && (!buf || len < n))
957 {
958 wxUint16 cc[2];
959 size_t pa=encode_utf16(*psz, cc);
960
961 if (pa == (size_t)-1)
962 return pa;
963
964 if (buf)
965 {
69b80d28 966 *(wxUint16*)buf = cc[0];
b5153fd8 967 buf += sizeof(wxUint16);
c91830cb 968 if (pa > 1)
69b80d28
VZ
969 {
970 *(wxUint16*)buf = cc[1];
971 buf += sizeof(wxUint16);
972 }
c91830cb
VZ
973 }
974
975 len += pa*sizeof(wxUint16);
976 psz++;
977 }
978 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
979
980 return len;
981}
982
983
984// swap 16bit MB to 32bit String
985size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
986{
987 size_t len=0;
988
989 while (*(wxUint16*)psz && (!buf || len < n))
990 {
991 wxUint32 cc;
992 char tmp[4];
993 tmp[0]=psz[1]; tmp[1]=psz[0];
994 tmp[2]=psz[3]; tmp[3]=psz[2];
995
996 size_t pa=decode_utf16((wxUint16*)tmp, cc);
997 if (pa == (size_t)-1)
998 return pa;
999
1000 if (buf)
38d4b1e4 1001 *buf++ = (wchar_t)cc;
c91830cb
VZ
1002
1003 len++;
1004 psz += pa * sizeof(wxUint16);
1005 }
1006 if (buf && len<n) *buf=0;
1007
1008 return len;
1009}
1010
1011
1012// swap 32bit String to 16bit MB
1013size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1014{
1015 size_t len=0;
1016
1017 while (*psz && (!buf || len < n))
1018 {
1019 wxUint16 cc[2];
1020 size_t pa=encode_utf16(*psz, cc);
1021
1022 if (pa == (size_t)-1)
1023 return pa;
1024
1025 if (buf)
1026 {
1027 *buf++ = ((char*)cc)[1];
1028 *buf++ = ((char*)cc)[0];
1029 if (pa > 1)
1030 {
1031 *buf++ = ((char*)cc)[3];
1032 *buf++ = ((char*)cc)[2];
1033 }
1034 }
1035
1036 len += pa*sizeof(wxUint16);
1037 psz++;
1038 }
1039 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1040
1041 return len;
1042}
1043
1044#endif // WC_UTF16
1045
1046
1047// ----------------------------------------------------------------------------
1048// UTF-32
1049// ----------------------------------------------------------------------------
1050
1051#ifdef WORDS_BIGENDIAN
1052#define wxMBConvUTF32straight wxMBConvUTF32BE
1053#define wxMBConvUTF32swap wxMBConvUTF32LE
1054#else
1055#define wxMBConvUTF32swap wxMBConvUTF32BE
1056#define wxMBConvUTF32straight wxMBConvUTF32LE
1057#endif
1058
1059
1060WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1061WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1062
1063
1064#ifdef WC_UTF16
1065
1066// copy 32bit MB to 16bit String
1067size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1068{
1069 size_t len=0;
1070
1071 while (*(wxUint32*)psz && (!buf || len < n))
1072 {
1073 wxUint16 cc[2];
1074
1075 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1076 if (pa == (size_t)-1)
1077 return pa;
1078
1079 if (buf)
1080 {
1081 *buf++ = cc[0];
1082 if (pa > 1)
1083 *buf++ = cc[1];
1084 }
1085 len += pa;
1086 psz += sizeof(wxUint32);
1087 }
1088 if (buf && len<n) *buf=0;
1089
1090 return len;
1091}
1092
1093
1094// copy 16bit String to 32bit MB
1095size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1096{
1097 size_t len=0;
1098
1099 while (*psz && (!buf || len < n))
1100 {
1101 wxUint32 cc;
1102
b5153fd8
VZ
1103 // cast is ok for WC_UTF16
1104 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
c91830cb
VZ
1105 if (pa == (size_t)-1)
1106 return pa;
1107
1108 if (buf)
1109 {
1110 *(wxUint32*)buf = cc;
1111 buf += sizeof(wxUint32);
1112 }
1113 len += sizeof(wxUint32);
1114 psz += pa;
1115 }
b5153fd8
VZ
1116
1117 if (buf && len<=n-sizeof(wxUint32))
1118 *(wxUint32*)buf=0;
c91830cb
VZ
1119
1120 return len;
1121}
1122
1123
1124
1125// swap 32bit MB to 16bit String
1126size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1127{
1128 size_t len=0;
1129
1130 while (*(wxUint32*)psz && (!buf || len < n))
1131 {
1132 char tmp[4];
1133 tmp[0] = psz[3]; tmp[1] = psz[2];
1134 tmp[2] = psz[1]; tmp[3] = psz[0];
1135
1136
1137 wxUint16 cc[2];
1138
1139 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1140 if (pa == (size_t)-1)
1141 return pa;
1142
1143 if (buf)
1144 {
1145 *buf++ = cc[0];
1146 if (pa > 1)
1147 *buf++ = cc[1];
1148 }
1149 len += pa;
1150 psz += sizeof(wxUint32);
1151 }
b5153fd8
VZ
1152
1153 if (buf && len<n)
1154 *buf=0;
c91830cb
VZ
1155
1156 return len;
1157}
1158
1159
1160// swap 16bit String to 32bit MB
1161size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1162{
1163 size_t len=0;
1164
1165 while (*psz && (!buf || len < n))
1166 {
1167 char cc[4];
1168
b5153fd8
VZ
1169 // cast is ok for WC_UTF16
1170 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
c91830cb
VZ
1171 if (pa == (size_t)-1)
1172 return pa;
1173
1174 if (buf)
1175 {
1176 *buf++ = cc[3];
1177 *buf++ = cc[2];
1178 *buf++ = cc[1];
1179 *buf++ = cc[0];
1180 }
1181 len += sizeof(wxUint32);
1182 psz += pa;
1183 }
b5153fd8
VZ
1184
1185 if (buf && len<=n-sizeof(wxUint32))
1186 *(wxUint32*)buf=0;
c91830cb
VZ
1187
1188 return len;
1189}
1190
1191#else // WC_UTF16
1192
1193
1194// copy 32bit MB to 32bit String
1195size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1196{
1197 size_t len=0;
1198
1199 while (*(wxUint32*)psz && (!buf || len < n))
1200 {
1201 if (buf)
38d4b1e4 1202 *buf++ = (wchar_t)(*(wxUint32*)psz);
c91830cb
VZ
1203 len++;
1204 psz += sizeof(wxUint32);
1205 }
b5153fd8
VZ
1206
1207 if (buf && len<n)
1208 *buf=0;
c91830cb
VZ
1209
1210 return len;
1211}
1212
1213
1214// copy 32bit String to 32bit MB
1215size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1216{
1217 size_t len=0;
1218
1219 while (*psz && (!buf || len < n))
1220 {
1221 if (buf)
1222 {
1223 *(wxUint32*)buf = *psz;
1224 buf += sizeof(wxUint32);
1225 }
1226
1227 len += sizeof(wxUint32);
1228 psz++;
1229 }
1230
b5153fd8
VZ
1231 if (buf && len<=n-sizeof(wxUint32))
1232 *(wxUint32*)buf=0;
c91830cb
VZ
1233
1234 return len;
1235}
1236
1237
1238// swap 32bit MB to 32bit String
1239size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1240{
1241 size_t len=0;
1242
1243 while (*(wxUint32*)psz && (!buf || len < n))
1244 {
1245 if (buf)
1246 {
1247 ((char *)buf)[0] = psz[3];
1248 ((char *)buf)[1] = psz[2];
1249 ((char *)buf)[2] = psz[1];
1250 ((char *)buf)[3] = psz[0];
1251 buf++;
1252 }
1253 len++;
1254 psz += sizeof(wxUint32);
1255 }
b5153fd8
VZ
1256
1257 if (buf && len<n)
1258 *buf=0;
c91830cb
VZ
1259
1260 return len;
1261}
1262
1263
1264// swap 32bit String to 32bit MB
1265size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1266{
1267 size_t len=0;
1268
1269 while (*psz && (!buf || len < n))
1270 {
1271 if (buf)
1272 {
1273 *buf++ = ((char *)psz)[3];
1274 *buf++ = ((char *)psz)[2];
1275 *buf++ = ((char *)psz)[1];
1276 *buf++ = ((char *)psz)[0];
1277 }
1278 len += sizeof(wxUint32);
1279 psz++;
1280 }
b5153fd8
VZ
1281
1282 if (buf && len<=n-sizeof(wxUint32))
1283 *(wxUint32*)buf=0;
c91830cb
VZ
1284
1285 return len;
1286}
1287
1288
1289#endif // WC_UTF16
1290
1291
36acb880
VZ
1292// ============================================================================
1293// The classes doing conversion using the iconv_xxx() functions
1294// ============================================================================
3caec1bb 1295
b040e242 1296#ifdef HAVE_ICONV
3a0d76bc 1297
b1d547eb
VS
1298// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1299// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1300// (unless there's yet another bug in glibc) the only case when iconv()
1301// returns with (size_t)-1 (which means error) and says there are 0 bytes
1302// left in the input buffer -- when _real_ error occurs,
1303// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1304// iconv() failure.
3caec1bb
VS
1305// [This bug does not appear in glibc 2.2.]
1306#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1307#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1308 (errno != E2BIG || bufLeft != 0))
1309#else
1310#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1311#endif
1312
ab217dba 1313#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1314
74a7eb0b
VZ
1315#define ICONV_T_INVALID ((iconv_t)-1)
1316
1317#if SIZEOF_WCHAR_T == 4
1318 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1319 #define WC_ENC wxFONTENCODING_UTF32
1320#elif SIZEOF_WCHAR_T == 2
1321 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1322 #define WC_ENC wxFONTENCODING_UTF16
1323#else // sizeof(wchar_t) != 2 nor 4
1324 // does this ever happen?
1325 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1326#endif
1327
36acb880 1328// ----------------------------------------------------------------------------
e95354ec 1329// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1330// ----------------------------------------------------------------------------
1331
// Converter between the multibyte charset named in the constructor and
// wchar_t, implemented using a pair of iconv() conversion descriptors.
class wxMBConv_iconv : public wxMBConv
{
public:
    // opens the descriptors for the given charset; check IsOk() afterwards
    wxMBConv_iconv(const wxChar *name);
    virtual ~wxMBConv_iconv();

    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
    virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;

    // true only if the descriptors for both directions are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte to wide char and in
    // the other direction
    iconv_t m2w,
            w2m;
#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    virtual const char *GetMBNul(size_t *nulLen) const;

    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;

    // NUL representation: m_nulLen is (size_t)-2 until GetMBNul() computes
    // it and (size_t)-1 if converting the NUL failed
    size_t m_nulLen;
    char m_nulBuf[8];
};
1369
8f115891
MW
1370// make the constructor available for unit testing
1371WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1372{
1373 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1374 if ( !result->IsOk() )
1375 {
1376 delete result;
1377 return 0;
1378 }
1379 return result;
1380}
1381
422e411e 1382wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1383bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1384
e95354ec 1385wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
36acb880 1386{
eec47cc6
VZ
1387 m_nulLen = (size_t)-2;
1388
0331b385
VZ
1389 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1390 // names for the charsets
200a9923 1391 const wxCharBuffer cname(wxString(name).ToAscii());
04c79127 1392
36acb880 1393 // check for charset that represents wchar_t:
74a7eb0b 1394 if ( ms_wcCharsetName.empty() )
f1339c56 1395 {
c2b83fdd
VZ
1396 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1397
74a7eb0b
VZ
1398#if wxUSE_FONTMAP
1399 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1400#else // !wxUSE_FONTMAP
1401 static const wxChar *names[] =
36acb880 1402 {
74a7eb0b
VZ
1403#if SIZEOF_WCHAR_T == 4
1404 _T("UCS-4"),
1405#elif SIZEOF_WCHAR_T = 2
1406 _T("UCS-2"),
1407#endif
1408 NULL
1409 };
1410#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1411
d1f024a8 1412 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1413 {
17a1ebd1 1414 const wxString nameCS(*names);
74a7eb0b
VZ
1415
1416 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1417 wxString nameXE(nameCS);
74a7eb0b
VZ
1418 #ifdef WORDS_BIGENDIAN
1419 nameXE += _T("BE");
1420 #else // little endian
1421 nameXE += _T("LE");
1422 #endif
1423
c2b83fdd
VZ
1424 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1425 nameXE.c_str());
1426
74a7eb0b
VZ
1427 m2w = iconv_open(nameXE.ToAscii(), cname);
1428 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1429 {
74a7eb0b 1430 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1431 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1432 nameCS.c_str());
17a1ebd1 1433 m2w = iconv_open(nameCS.ToAscii(), cname);
3a0d76bc 1434
74a7eb0b
VZ
1435 // and check for bytesex ourselves:
1436 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1437 {
74a7eb0b
VZ
1438 char buf[2], *bufPtr;
1439 wchar_t wbuf[2], *wbufPtr;
1440 size_t insz, outsz;
1441 size_t res;
1442
1443 buf[0] = 'A';
1444 buf[1] = 0;
1445 wbuf[0] = 0;
1446 insz = 2;
1447 outsz = SIZEOF_WCHAR_T * 2;
1448 wbufPtr = wbuf;
1449 bufPtr = buf;
1450
1451 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1452 (char**)&wbufPtr, &outsz);
1453
1454 if (ICONV_FAILED(res, insz))
1455 {
1456 wxLogLastError(wxT("iconv"));
422e411e 1457 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1458 nameCS.c_str());
74a7eb0b
VZ
1459 }
1460 else // ok, can convert to this encoding, remember it
1461 {
17a1ebd1 1462 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1463 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1464 }
3a0d76bc
VS
1465 }
1466 }
74a7eb0b 1467 else // use charset not requiring byte swapping
36acb880 1468 {
74a7eb0b 1469 ms_wcCharsetName = nameXE;
36acb880 1470 }
3a0d76bc 1471 }
74a7eb0b 1472
0944fceb 1473 wxLogTrace(TRACE_STRCONV,
74a7eb0b 1474 wxT("iconv wchar_t charset is \"%s\"%s"),
cae8f1bf 1475 ms_wcCharsetName.empty() ? _T("<none>")
74a7eb0b
VZ
1476 : ms_wcCharsetName.c_str(),
1477 ms_wcNeedsSwap ? _T(" (needs swap)")
1478 : _T(""));
3a0d76bc 1479 }
36acb880 1480 else // we already have ms_wcCharsetName
3caec1bb 1481 {
74a7eb0b 1482 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
f1339c56 1483 }
dccce9ea 1484
74a7eb0b 1485 if ( ms_wcCharsetName.empty() )
f1339c56 1486 {
74a7eb0b 1487 w2m = ICONV_T_INVALID;
36acb880 1488 }
405d8f46
VZ
1489 else
1490 {
74a7eb0b
VZ
1491 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1492 if ( w2m == ICONV_T_INVALID )
1493 {
1494 wxLogTrace(TRACE_STRCONV,
1495 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
422e411e 1496 ms_wcCharsetName.c_str(), cname.data());
74a7eb0b 1497 }
405d8f46 1498 }
36acb880 1499}
3caec1bb 1500
e95354ec 1501wxMBConv_iconv::~wxMBConv_iconv()
36acb880 1502{
74a7eb0b 1503 if ( m2w != ICONV_T_INVALID )
36acb880 1504 iconv_close(m2w);
74a7eb0b 1505 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
1506 iconv_close(w2m);
1507}
3a0d76bc 1508
bde4baac 1509size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 1510{
b1d547eb
VS
1511#if wxUSE_THREADS
1512 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1513 // Unfortunately there is a couple of global wxCSConv objects such as
1514 // wxConvLocal that are used all over wx code, so we have to make sure
1515 // the handle is used by at most one thread at the time. Otherwise
1516 // only a few wx classes would be safe to use from non-main threads
1517 // as MB<->WC conversion would fail "randomly".
1518 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1519#endif
3698ae71 1520
36acb880
VZ
1521 size_t inbuf = strlen(psz);
1522 size_t outbuf = n * SIZEOF_WCHAR_T;
1523 size_t res, cres;
1524 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1525 wchar_t *bufPtr = buf;
1526 const char *pszPtr = psz;
1527
1528 if (buf)
1529 {
1530 // have destination buffer, convert there
1531 cres = iconv(m2w,
1532 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1533 (char**)&bufPtr, &outbuf);
1534 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1535
36acb880 1536 if (ms_wcNeedsSwap)
3a0d76bc 1537 {
36acb880 1538 // convert to native endianness
17a1ebd1
VZ
1539 for ( unsigned i = 0; i < res; i++ )
1540 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 1541 }
adb45366 1542
49dd9820
VS
1543 // NB: iconv was given only strlen(psz) characters on input, and so
1544 // it couldn't convert the trailing zero. Let's do it ourselves
1545 // if there's some room left for it in the output buffer.
1546 if (res < n)
1547 buf[res] = 0;
36acb880
VZ
1548 }
1549 else
1550 {
1551 // no destination buffer... convert using temp buffer
1552 // to calculate destination buffer requirement
1553 wchar_t tbuf[8];
1554 res = 0;
1555 do {
1556 bufPtr = tbuf;
1557 outbuf = 8*SIZEOF_WCHAR_T;
1558
1559 cres = iconv(m2w,
1560 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1561 (char**)&bufPtr, &outbuf );
1562
1563 res += 8-(outbuf/SIZEOF_WCHAR_T);
1564 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1565 }
dccce9ea 1566
36acb880 1567 if (ICONV_FAILED(cres, inbuf))
f1339c56 1568 {
36acb880 1569 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 1570 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1571 return (size_t)-1;
1572 }
1573
1574 return res;
1575}
1576
bde4baac 1577size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1578{
b1d547eb
VS
1579#if wxUSE_THREADS
1580 // NB: explained in MB2WC
1581 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1582#endif
3698ae71 1583
156162ec
MW
1584 size_t inlen = wxWcslen(psz);
1585 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
1586 size_t outbuf = n;
1587 size_t res, cres;
3a0d76bc 1588
36acb880 1589 wchar_t *tmpbuf = 0;
3caec1bb 1590
36acb880
VZ
1591 if (ms_wcNeedsSwap)
1592 {
1593 // need to copy to temp buffer to switch endianness
74a7eb0b 1594 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 1595 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 1596 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
1597 for ( size_t i = 0; i < inlen; i++ )
1598 tmpbuf[n] = WC_BSWAP(psz[i]);
156162ec 1599 tmpbuf[inlen] = L'\0';
74a7eb0b 1600 psz = tmpbuf;
36acb880 1601 }
3a0d76bc 1602
36acb880
VZ
1603 if (buf)
1604 {
1605 // have destination buffer, convert there
1606 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1607
36acb880 1608 res = n-outbuf;
adb45366 1609
49dd9820
VS
1610 // NB: iconv was given only wcslen(psz) characters on input, and so
1611 // it couldn't convert the trailing zero. Let's do it ourselves
1612 // if there's some room left for it in the output buffer.
1613 if (res < n)
1614 buf[0] = 0;
36acb880
VZ
1615 }
1616 else
1617 {
1618 // no destination buffer... convert using temp buffer
1619 // to calculate destination buffer requirement
1620 char tbuf[16];
1621 res = 0;
1622 do {
1623 buf = tbuf; outbuf = 16;
1624
1625 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1626
36acb880
VZ
1627 res += 16 - outbuf;
1628 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1629 }
dccce9ea 1630
36acb880
VZ
1631 if (ms_wcNeedsSwap)
1632 {
1633 free(tmpbuf);
1634 }
dccce9ea 1635
36acb880
VZ
1636 if (ICONV_FAILED(cres, inbuf))
1637 {
ce6f8d6f 1638 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1639 return (size_t)-1;
1640 }
1641
1642 return res;
1643}
1644
eec47cc6
VZ
1645const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
1646{
1647 if ( m_nulLen == (size_t)-2 )
1648 {
1649 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1650
1651#if wxUSE_THREADS
1652 // NB: explained in MB2WC
1653 wxMutexLocker lock(self->m_iconvMutex);
1654#endif
1655
356410fc
VZ
1656 wchar_t *wnul = L"";
1657 size_t inLen = sizeof(wchar_t),
eec47cc6 1658 outLen = WXSIZEOF(m_nulBuf);
356410fc
VZ
1659 char *in = (char *)wnul,
1660 *out = self->m_nulBuf;
1661 if ( iconv(w2m, &in, &inLen, &out, &outLen) == (size_t)-1 )
1662 {
1663 self->m_nulLen = (size_t)-1;
1664 }
1665 else // ok
1666 {
1667 self->m_nulLen = out - m_nulBuf;
1668 }
eec47cc6
VZ
1669 }
1670
1671 *nulLen = m_nulLen;
1672 return m_nulBuf;
1673}
1674
b040e242 1675#endif // HAVE_ICONV
36acb880 1676
e95354ec 1677
36acb880
VZ
1678// ============================================================================
1679// Win32 conversion classes
1680// ============================================================================
1cd52418 1681
e95354ec 1682#ifdef wxHAVE_WIN32_MB2WC
373658eb 1683
8b04d4c4 1684// from utils.cpp
d775fa82 1685#if wxUSE_FONTMAP
8b04d4c4
VZ
1686extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1687extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1688#endif
373658eb 1689
e95354ec 1690class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1691{
1692public:
bde4baac
VZ
1693 wxMBConv_win32()
1694 {
1695 m_CodePage = CP_ACP;
eec47cc6 1696 m_nulLen = (size_t)-2;
bde4baac
VZ
1697 }
1698
7608a683 1699#if wxUSE_FONTMAP
e95354ec 1700 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
1701 {
1702 m_CodePage = wxCharsetToCodepage(name);
eec47cc6 1703 m_nulLen = (size_t)-2;
bde4baac 1704 }
dccce9ea 1705
e95354ec 1706 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
1707 {
1708 m_CodePage = wxEncodingToCodepage(encoding);
eec47cc6 1709 m_nulLen = (size_t)-2;
bde4baac 1710 }
eec47cc6 1711#endif // wxUSE_FONTMAP
8b04d4c4 1712
bde4baac 1713 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 1714 {
02272c9c
VZ
1715 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1716 // the behaviour is not compatible with the Unix version (using iconv)
1717 // and break the library itself, e.g. wxTextInputStream::NextChar()
1718 // wouldn't work if reading an incomplete MB char didn't result in an
1719 // error
667e5b3e
VZ
1720 //
1721 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1722 // an error (tested under Windows Server 2003) and apparently it is
1723 // done on purpose, i.e. the function accepts any input in this case
1724 // and although I'd prefer to return error on ill-formed output, our
1725 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1726 // explicitly ill-formed according to RFC 2152) neither so we don't
1727 // even have any fallback here...
89028980
VS
1728 //
1729 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1730 // Win XP or newer and if it is specified on older versions, conversion
1731 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1732 // fails. So we can only use the flag on newer Windows versions.
1733 // Additionally, the flag is not supported by UTF7, symbol and CJK
1734 // encodings. See here:
1735 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1736 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1737 int flags = 0;
1738 if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1739 m_CodePage < 50000 &&
1740 IsAtLeastWin2kSP4() )
1741 {
1742 flags = MB_ERR_INVALID_CHARS;
1743 }
1744 else if ( m_CodePage == CP_UTF8 )
1745 {
1746 // Avoid round-trip in the special case of UTF-8 by using our
1747 // own UTF-8 conversion code:
1748 return wxMBConvUTF8().MB2WC(buf, psz, n);
1749 }
667e5b3e 1750
2b5f62a0
VZ
1751 const size_t len = ::MultiByteToWideChar
1752 (
1753 m_CodePage, // code page
667e5b3e 1754 flags, // flags: fall on error
2b5f62a0
VZ
1755 psz, // input string
1756 -1, // its length (NUL-terminated)
b4da152e 1757 buf, // output string
2b5f62a0
VZ
1758 buf ? n : 0 // size of output buffer
1759 );
89028980
VS
1760 if ( !len )
1761 {
1762 // function totally failed
1763 return (size_t)-1;
1764 }
1765
1766 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1767 // check if we succeeded, by doing a double trip:
1768 if ( !flags && buf )
1769 {
53c174fc
VZ
1770 const size_t mbLen = strlen(psz);
1771 wxCharBuffer mbBuf(mbLen);
89028980
VS
1772 if ( ::WideCharToMultiByte
1773 (
1774 m_CodePage,
1775 0,
1776 buf,
1777 -1,
1778 mbBuf.data(),
53c174fc 1779 mbLen + 1, // size in bytes, not length
89028980
VS
1780 NULL,
1781 NULL
1782 ) == 0 ||
1783 strcmp(mbBuf, psz) != 0 )
1784 {
1785 // we didn't obtain the same thing we started from, hence
1786 // the conversion was lossy and we consider that it failed
1787 return (size_t)-1;
1788 }
1789 }
2b5f62a0 1790
03a991bc
VZ
1791 // note that it returns count of written chars for buf != NULL and size
1792 // of the needed buffer for buf == NULL so in either case the length of
1793 // the string (which never includes the terminating NUL) is one less
89028980 1794 return len - 1;
f1339c56 1795 }
dccce9ea 1796
13dd924a 1797 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 1798 {
13dd924a
VZ
1799 /*
1800 we have a problem here: by default, WideCharToMultiByte() may
1801 replace characters unrepresentable in the target code page with bad
1802 quality approximations such as turning "1/2" symbol (U+00BD) into
1803 "1" for the code pages which don't have it and we, obviously, want
1804 to avoid this at any price
d775fa82 1805
13dd924a
VZ
1806 the trouble is that this function does it _silently_, i.e. it won't
1807 even tell us whether it did or not... Win98/2000 and higher provide
1808 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1809 we have to resort to a round trip, i.e. check that converting back
1810 results in the same string -- this is, of course, expensive but
1811 otherwise we simply can't be sure to not garble the data.
1812 */
1813
1814 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1815 // it doesn't work with CJK encodings (which we test for rather roughly
1816 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1817 // supporting it
907173e5
WS
1818 BOOL usedDef wxDUMMY_INITIALIZE(false);
1819 BOOL *pUsedDef;
13dd924a
VZ
1820 int flags;
1821 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1822 {
1823 // it's our lucky day
1824 flags = WC_NO_BEST_FIT_CHARS;
1825 pUsedDef = &usedDef;
1826 }
1827 else // old system or unsupported encoding
1828 {
1829 flags = 0;
1830 pUsedDef = NULL;
1831 }
1832
2b5f62a0
VZ
1833 const size_t len = ::WideCharToMultiByte
1834 (
1835 m_CodePage, // code page
13dd924a
VZ
1836 flags, // either none or no best fit
1837 pwz, // input string
2b5f62a0
VZ
1838 -1, // it is (wide) NUL-terminated
1839 buf, // output buffer
1840 buf ? n : 0, // and its size
1841 NULL, // default "replacement" char
13dd924a 1842 pUsedDef // [out] was it used?
2b5f62a0
VZ
1843 );
1844
13dd924a
VZ
1845 if ( !len )
1846 {
1847 // function totally failed
1848 return (size_t)-1;
1849 }
1850
1851 // if we were really converting, check if we succeeded
1852 if ( buf )
1853 {
1854 if ( flags )
1855 {
1856 // check if the conversion failed, i.e. if any replacements
1857 // were done
1858 if ( usedDef )
1859 return (size_t)-1;
1860 }
1861 else // we must resort to double tripping...
1862 {
1863 wxWCharBuffer wcBuf(n);
1864 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1865 wcscmp(wcBuf, pwz) != 0 )
1866 {
1867 // we didn't obtain the same thing we started from, hence
1868 // the conversion was lossy and we consider that it failed
1869 return (size_t)-1;
1870 }
1871 }
1872 }
1873
03a991bc 1874 // see the comment above for the reason of "len - 1"
13dd924a 1875 return len - 1;
f1339c56 1876 }
dccce9ea 1877
13dd924a
VZ
1878 bool IsOk() const { return m_CodePage != -1; }
1879
1880private:
1881 static bool CanUseNoBestFit()
1882 {
1883 static int s_isWin98Or2k = -1;
1884
1885 if ( s_isWin98Or2k == -1 )
1886 {
1887 int verMaj, verMin;
1888 switch ( wxGetOsVersion(&verMaj, &verMin) )
1889 {
1890 case wxWIN95:
1891 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1892 break;
1893
1894 case wxWINDOWS_NT:
1895 s_isWin98Or2k = verMaj >= 5;
1896 break;
1897
1898 default:
1899 // unknown, be conseravtive by default
1900 s_isWin98Or2k = 0;
1901 }
1902
1903 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1904 }
1905
1906 return s_isWin98Or2k == 1;
1907 }
f1339c56 1908
89028980
VS
1909 static bool IsAtLeastWin2kSP4()
1910 {
8942f83a
WS
1911#ifdef __WXWINCE__
1912 return false;
1913#else
89028980
VS
1914 static int s_isAtLeastWin2kSP4 = -1;
1915
1916 if ( s_isAtLeastWin2kSP4 == -1 )
1917 {
1918 OSVERSIONINFOEX ver;
1919
1920 memset(&ver, 0, sizeof(ver));
1921 ver.dwOSVersionInfoSize = sizeof(ver);
1922 GetVersionEx((OSVERSIONINFO*)&ver);
1923
1924 s_isAtLeastWin2kSP4 =
1925 ((ver.dwMajorVersion > 5) || // Vista+
1926 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
1927 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
1928 ver.wServicePackMajor >= 4)) // 2000 SP4+
1929 ? 1 : 0;
1930 }
1931
1932 return s_isAtLeastWin2kSP4 == 1;
8942f83a 1933#endif
89028980
VS
1934 }
1935
eec47cc6
VZ
1936 virtual const char *GetMBNul(size_t *nulLen) const
1937 {
1938 if ( m_nulLen == (size_t)-2 )
1939 {
1940 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1941
1942 self->m_nulLen = ::WideCharToMultiByte
1943 (
1944 m_CodePage, // code page
1945 0, // no flags
1946 L"", // input string
1947 1, // translate just NUL
1948 self->m_nulBuf, // output buffer
1949 WXSIZEOF(m_nulBuf), // and its size
1950 NULL, // "replacement" char
1951 NULL // [out] was it used?
1952 );
1953
1954 if ( m_nulLen == 0 )
1955 self->m_nulLen = (size_t)-1;
1956 }
1957
1958 *nulLen = m_nulLen;
1959 return m_nulBuf;
1960 }
1961
b1d66b54 1962 long m_CodePage;
eec47cc6
VZ
1963 size_t m_nulLen;
1964 char m_nulBuf[8];
1cd52418 1965};
e95354ec
VZ
1966
1967#endif // wxHAVE_WIN32_MB2WC
1968
f7e98dee
RN
1969// ============================================================================
1970// Cocoa conversion classes
1971// ============================================================================
1972
1973#if defined(__WXCOCOA__)
1974
ecd9653b 1975// RN: There is no UTF-32 support in either Core Foundation or
f7e98dee
RN
1976// Cocoa. Strangely enough, internally Core Foundation uses
1977// UTF 32 internally quite a bit - its just not public (yet).
1978
1979#include <CoreFoundation/CFString.h>
1980#include <CoreFoundation/CFStringEncodingExt.h>
1981
1982CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 1983{
638357a0 1984 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ecd9653b
WS
1985 if ( encoding == wxFONTENCODING_DEFAULT )
1986 {
638357a0 1987 enc = CFStringGetSystemEncoding();
ecd9653b
WS
1988 }
1989 else switch( encoding)
1990 {
1991 case wxFONTENCODING_ISO8859_1 :
1992 enc = kCFStringEncodingISOLatin1 ;
1993 break ;
1994 case wxFONTENCODING_ISO8859_2 :
1995 enc = kCFStringEncodingISOLatin2;
1996 break ;
1997 case wxFONTENCODING_ISO8859_3 :
1998 enc = kCFStringEncodingISOLatin3 ;
1999 break ;
2000 case wxFONTENCODING_ISO8859_4 :
2001 enc = kCFStringEncodingISOLatin4;
2002 break ;
2003 case wxFONTENCODING_ISO8859_5 :
2004 enc = kCFStringEncodingISOLatinCyrillic;
2005 break ;
2006 case wxFONTENCODING_ISO8859_6 :
2007 enc = kCFStringEncodingISOLatinArabic;
2008 break ;
2009 case wxFONTENCODING_ISO8859_7 :
2010 enc = kCFStringEncodingISOLatinGreek;
2011 break ;
2012 case wxFONTENCODING_ISO8859_8 :
2013 enc = kCFStringEncodingISOLatinHebrew;
2014 break ;
2015 case wxFONTENCODING_ISO8859_9 :
2016 enc = kCFStringEncodingISOLatin5;
2017 break ;
2018 case wxFONTENCODING_ISO8859_10 :
2019 enc = kCFStringEncodingISOLatin6;
2020 break ;
2021 case wxFONTENCODING_ISO8859_11 :
2022 enc = kCFStringEncodingISOLatinThai;
2023 break ;
2024 case wxFONTENCODING_ISO8859_13 :
2025 enc = kCFStringEncodingISOLatin7;
2026 break ;
2027 case wxFONTENCODING_ISO8859_14 :
2028 enc = kCFStringEncodingISOLatin8;
2029 break ;
2030 case wxFONTENCODING_ISO8859_15 :
2031 enc = kCFStringEncodingISOLatin9;
2032 break ;
2033
2034 case wxFONTENCODING_KOI8 :
2035 enc = kCFStringEncodingKOI8_R;
2036 break ;
2037 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2038 enc = kCFStringEncodingDOSRussian;
2039 break ;
2040
2041// case wxFONTENCODING_BULGARIAN :
2042// enc = ;
2043// break ;
2044
2045 case wxFONTENCODING_CP437 :
2046 enc =kCFStringEncodingDOSLatinUS ;
2047 break ;
2048 case wxFONTENCODING_CP850 :
2049 enc = kCFStringEncodingDOSLatin1;
2050 break ;
2051 case wxFONTENCODING_CP852 :
2052 enc = kCFStringEncodingDOSLatin2;
2053 break ;
2054 case wxFONTENCODING_CP855 :
2055 enc = kCFStringEncodingDOSCyrillic;
2056 break ;
2057 case wxFONTENCODING_CP866 :
2058 enc =kCFStringEncodingDOSRussian ;
2059 break ;
2060 case wxFONTENCODING_CP874 :
2061 enc = kCFStringEncodingDOSThai;
2062 break ;
2063 case wxFONTENCODING_CP932 :
2064 enc = kCFStringEncodingDOSJapanese;
2065 break ;
2066 case wxFONTENCODING_CP936 :
2067 enc =kCFStringEncodingDOSChineseSimplif ;
2068 break ;
2069 case wxFONTENCODING_CP949 :
2070 enc = kCFStringEncodingDOSKorean;
2071 break ;
2072 case wxFONTENCODING_CP950 :
2073 enc = kCFStringEncodingDOSChineseTrad;
2074 break ;
ecd9653b
WS
2075 case wxFONTENCODING_CP1250 :
2076 enc = kCFStringEncodingWindowsLatin2;
2077 break ;
2078 case wxFONTENCODING_CP1251 :
2079 enc =kCFStringEncodingWindowsCyrillic ;
2080 break ;
2081 case wxFONTENCODING_CP1252 :
2082 enc =kCFStringEncodingWindowsLatin1 ;
2083 break ;
2084 case wxFONTENCODING_CP1253 :
2085 enc = kCFStringEncodingWindowsGreek;
2086 break ;
2087 case wxFONTENCODING_CP1254 :
2088 enc = kCFStringEncodingWindowsLatin5;
2089 break ;
2090 case wxFONTENCODING_CP1255 :
2091 enc =kCFStringEncodingWindowsHebrew ;
2092 break ;
2093 case wxFONTENCODING_CP1256 :
2094 enc =kCFStringEncodingWindowsArabic ;
2095 break ;
2096 case wxFONTENCODING_CP1257 :
2097 enc = kCFStringEncodingWindowsBalticRim;
2098 break ;
638357a0
RN
2099// This only really encodes to UTF7 (if that) evidently
2100// case wxFONTENCODING_UTF7 :
2101// enc = kCFStringEncodingNonLossyASCII ;
2102// break ;
ecd9653b
WS
2103 case wxFONTENCODING_UTF8 :
2104 enc = kCFStringEncodingUTF8 ;
2105 break ;
2106 case wxFONTENCODING_EUC_JP :
2107 enc = kCFStringEncodingEUC_JP;
2108 break ;
2109 case wxFONTENCODING_UTF16 :
f7e98dee 2110 enc = kCFStringEncodingUnicode ;
ecd9653b 2111 break ;
f7e98dee
RN
2112 case wxFONTENCODING_MACROMAN :
2113 enc = kCFStringEncodingMacRoman ;
2114 break ;
2115 case wxFONTENCODING_MACJAPANESE :
2116 enc = kCFStringEncodingMacJapanese ;
2117 break ;
2118 case wxFONTENCODING_MACCHINESETRAD :
2119 enc = kCFStringEncodingMacChineseTrad ;
2120 break ;
2121 case wxFONTENCODING_MACKOREAN :
2122 enc = kCFStringEncodingMacKorean ;
2123 break ;
2124 case wxFONTENCODING_MACARABIC :
2125 enc = kCFStringEncodingMacArabic ;
2126 break ;
2127 case wxFONTENCODING_MACHEBREW :
2128 enc = kCFStringEncodingMacHebrew ;
2129 break ;
2130 case wxFONTENCODING_MACGREEK :
2131 enc = kCFStringEncodingMacGreek ;
2132 break ;
2133 case wxFONTENCODING_MACCYRILLIC :
2134 enc = kCFStringEncodingMacCyrillic ;
2135 break ;
2136 case wxFONTENCODING_MACDEVANAGARI :
2137 enc = kCFStringEncodingMacDevanagari ;
2138 break ;
2139 case wxFONTENCODING_MACGURMUKHI :
2140 enc = kCFStringEncodingMacGurmukhi ;
2141 break ;
2142 case wxFONTENCODING_MACGUJARATI :
2143 enc = kCFStringEncodingMacGujarati ;
2144 break ;
2145 case wxFONTENCODING_MACORIYA :
2146 enc = kCFStringEncodingMacOriya ;
2147 break ;
2148 case wxFONTENCODING_MACBENGALI :
2149 enc = kCFStringEncodingMacBengali ;
2150 break ;
2151 case wxFONTENCODING_MACTAMIL :
2152 enc = kCFStringEncodingMacTamil ;
2153 break ;
2154 case wxFONTENCODING_MACTELUGU :
2155 enc = kCFStringEncodingMacTelugu ;
2156 break ;
2157 case wxFONTENCODING_MACKANNADA :
2158 enc = kCFStringEncodingMacKannada ;
2159 break ;
2160 case wxFONTENCODING_MACMALAJALAM :
2161 enc = kCFStringEncodingMacMalayalam ;
2162 break ;
2163 case wxFONTENCODING_MACSINHALESE :
2164 enc = kCFStringEncodingMacSinhalese ;
2165 break ;
2166 case wxFONTENCODING_MACBURMESE :
2167 enc = kCFStringEncodingMacBurmese ;
2168 break ;
2169 case wxFONTENCODING_MACKHMER :
2170 enc = kCFStringEncodingMacKhmer ;
2171 break ;
2172 case wxFONTENCODING_MACTHAI :
2173 enc = kCFStringEncodingMacThai ;
2174 break ;
2175 case wxFONTENCODING_MACLAOTIAN :
2176 enc = kCFStringEncodingMacLaotian ;
2177 break ;
2178 case wxFONTENCODING_MACGEORGIAN :
2179 enc = kCFStringEncodingMacGeorgian ;
2180 break ;
2181 case wxFONTENCODING_MACARMENIAN :
2182 enc = kCFStringEncodingMacArmenian ;
2183 break ;
2184 case wxFONTENCODING_MACCHINESESIMP :
2185 enc = kCFStringEncodingMacChineseSimp ;
2186 break ;
2187 case wxFONTENCODING_MACTIBETAN :
2188 enc = kCFStringEncodingMacTibetan ;
2189 break ;
2190 case wxFONTENCODING_MACMONGOLIAN :
2191 enc = kCFStringEncodingMacMongolian ;
2192 break ;
2193 case wxFONTENCODING_MACETHIOPIC :
2194 enc = kCFStringEncodingMacEthiopic ;
2195 break ;
2196 case wxFONTENCODING_MACCENTRALEUR :
2197 enc = kCFStringEncodingMacCentralEurRoman ;
2198 break ;
2199 case wxFONTENCODING_MACVIATNAMESE :
2200 enc = kCFStringEncodingMacVietnamese ;
2201 break ;
2202 case wxFONTENCODING_MACARABICEXT :
2203 enc = kCFStringEncodingMacExtArabic ;
2204 break ;
2205 case wxFONTENCODING_MACSYMBOL :
2206 enc = kCFStringEncodingMacSymbol ;
2207 break ;
2208 case wxFONTENCODING_MACDINGBATS :
2209 enc = kCFStringEncodingMacDingbats ;
2210 break ;
2211 case wxFONTENCODING_MACTURKISH :
2212 enc = kCFStringEncodingMacTurkish ;
2213 break ;
2214 case wxFONTENCODING_MACCROATIAN :
2215 enc = kCFStringEncodingMacCroatian ;
2216 break ;
2217 case wxFONTENCODING_MACICELANDIC :
2218 enc = kCFStringEncodingMacIcelandic ;
2219 break ;
2220 case wxFONTENCODING_MACROMANIAN :
2221 enc = kCFStringEncodingMacRomanian ;
2222 break ;
2223 case wxFONTENCODING_MACCELTIC :
2224 enc = kCFStringEncodingMacCeltic ;
2225 break ;
2226 case wxFONTENCODING_MACGAELIC :
2227 enc = kCFStringEncodingMacGaelic ;
2228 break ;
ecd9653b
WS
2229// case wxFONTENCODING_MACKEYBOARD :
2230// enc = kCFStringEncodingMacKeyboardGlyphs ;
2231// break ;
2232 default :
2233 // because gcc is picky
2234 break ;
2235 } ;
2236 return enc ;
f7e98dee
RN
2237}
2238
f7e98dee
RN
2239class wxMBConv_cocoa : public wxMBConv
2240{
2241public:
2242 wxMBConv_cocoa()
2243 {
2244 Init(CFStringGetSystemEncoding()) ;
2245 }
2246
a6900d10 2247#if wxUSE_FONTMAP
f7e98dee
RN
2248 wxMBConv_cocoa(const wxChar* name)
2249 {
267e11c5 2250 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
f7e98dee 2251 }
a6900d10 2252#endif
f7e98dee
RN
2253
2254 wxMBConv_cocoa(wxFontEncoding encoding)
2255 {
2256 Init( wxCFStringEncFromFontEnc(encoding) );
2257 }
2258
2259 ~wxMBConv_cocoa()
2260 {
2261 }
2262
2263 void Init( CFStringEncoding encoding)
2264 {
638357a0 2265 m_encoding = encoding ;
f7e98dee
RN
2266 }
2267
2268 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2269 {
2270 wxASSERT(szUnConv);
ecd9653b 2271
638357a0
RN
2272 CFStringRef theString = CFStringCreateWithBytes (
2273 NULL, //the allocator
2274 (const UInt8*)szUnConv,
2275 strlen(szUnConv),
2276 m_encoding,
2277 false //no BOM/external representation
f7e98dee
RN
2278 );
2279
2280 wxASSERT(theString);
2281
638357a0
RN
2282 size_t nOutLength = CFStringGetLength(theString);
2283
2284 if (szOut == NULL)
f7e98dee 2285 {
f7e98dee 2286 CFRelease(theString);
638357a0 2287 return nOutLength;
f7e98dee 2288 }
ecd9653b 2289
638357a0 2290 CFRange theRange = { 0, nOutSize };
ecd9653b 2291
638357a0
RN
2292#if SIZEOF_WCHAR_T == 4
2293 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2294#endif
3698ae71 2295
f7e98dee 2296 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
3698ae71 2297
f7e98dee 2298 CFRelease(theString);
ecd9653b 2299
638357a0 2300 szUniCharBuffer[nOutLength] = '\0' ;
f7e98dee
RN
2301
2302#if SIZEOF_WCHAR_T == 4
2303 wxMBConvUTF16 converter ;
638357a0 2304 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
f7e98dee
RN
2305 delete[] szUniCharBuffer;
2306#endif
3698ae71 2307
638357a0 2308 return nOutLength;
f7e98dee
RN
2309 }
2310
2311 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2312 {
638357a0 2313 wxASSERT(szUnConv);
3698ae71 2314
f7e98dee 2315 size_t nRealOutSize;
638357a0 2316 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 2317 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 2318
f7e98dee 2319#if SIZEOF_WCHAR_T == 4
d9d488cf 2320 wxMBConvUTF16 converter ;
f7e98dee
RN
2321 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2322 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2323 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2324 nBufSize /= sizeof(UniChar);
f7e98dee
RN
2325#endif
2326
2327 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2328 NULL, //allocator
2329 szUniBuffer,
2330 nBufSize,
638357a0 2331 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 2332 );
ecd9653b 2333
f7e98dee 2334 wxASSERT(theString);
ecd9653b 2335
f7e98dee 2336 //Note that CER puts a BOM when converting to unicode
638357a0
RN
2337 //so we check and use getchars instead in that case
2338 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 2339 {
638357a0
RN
2340 if (szOut != NULL)
2341 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
3698ae71 2342
638357a0
RN
2343 nRealOutSize = CFStringGetLength(theString) + 1;
2344 }
2345 else
2346 {
2347 CFStringGetBytes(
2348 theString,
2349 CFRangeMake(0, CFStringGetLength(theString)),
2350 m_encoding,
2351 0, //what to put in characters that can't be converted -
2352 //0 tells CFString to return NULL if it meets such a character
2353 false, //not an external representation
2354 (UInt8*) szOut,
3698ae71 2355 nOutSize,
638357a0
RN
2356 (CFIndex*) &nRealOutSize
2357 );
f7e98dee 2358 }
ecd9653b 2359
638357a0 2360 CFRelease(theString);
ecd9653b 2361
638357a0
RN
2362#if SIZEOF_WCHAR_T == 4
2363 delete[] szUniBuffer;
2364#endif
ecd9653b 2365
f7e98dee
RN
2366 return nRealOutSize - 1;
2367 }
2368
2369 bool IsOk() const
ecd9653b 2370 {
3698ae71 2371 return m_encoding != kCFStringEncodingInvalidId &&
638357a0 2372 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
2373 }
2374
2375private:
638357a0 2376 CFStringEncoding m_encoding ;
f7e98dee
RN
2377};
2378
2379#endif // defined(__WXCOCOA__)
2380
335d31e0
SC
2381// ============================================================================
2382// Mac conversion classes
2383// ============================================================================
2384
2385#if defined(__WXMAC__) && defined(TARGET_CARBON)
2386
2387class wxMBConv_mac : public wxMBConv
2388{
2389public:
2390 wxMBConv_mac()
2391 {
2392 Init(CFStringGetSystemEncoding()) ;
2393 }
2394
2d1659cf 2395#if wxUSE_FONTMAP
335d31e0
SC
2396 wxMBConv_mac(const wxChar* name)
2397 {
267e11c5 2398 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
335d31e0 2399 }
2d1659cf 2400#endif
335d31e0
SC
2401
2402 wxMBConv_mac(wxFontEncoding encoding)
2403 {
d775fa82
WS
2404 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2405 }
2406
2407 ~wxMBConv_mac()
2408 {
2409 OSStatus status = noErr ;
2410 status = TECDisposeConverter(m_MB2WC_converter);
2411 status = TECDisposeConverter(m_WC2MB_converter);
2412 }
2413
2414
2415 void Init( TextEncodingBase encoding)
2416 {
2417 OSStatus status = noErr ;
2418 m_char_encoding = encoding ;
2419 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2420
2421 status = TECCreateConverter(&m_MB2WC_converter,
2422 m_char_encoding,
2423 m_unicode_encoding);
2424 status = TECCreateConverter(&m_WC2MB_converter,
2425 m_unicode_encoding,
2426 m_char_encoding);
2427 }
2428
335d31e0
SC
2429 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2430 {
d775fa82
WS
2431 OSStatus status = noErr ;
2432 ByteCount byteOutLen ;
2433 ByteCount byteInLen = strlen(psz) ;
2434 wchar_t *tbuf = NULL ;
2435 UniChar* ubuf = NULL ;
2436 size_t res = 0 ;
2437
2438 if (buf == NULL)
2439 {
638357a0 2440 //apple specs say at least 32
c543817b 2441 n = wxMax( 32 , byteInLen ) ;
d775fa82
WS
2442 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2443 }
2444 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
f3a355ce 2445#if SIZEOF_WCHAR_T == 4
d775fa82 2446 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2447#else
d775fa82 2448 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2449#endif
d775fa82
WS
2450 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2451 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
f3a355ce 2452#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2453 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2454 // is not properly terminated we get random characters at the end
2455 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d9d488cf 2456 wxMBConvUTF16 converter ;
d775fa82
WS
2457 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2458 free( ubuf ) ;
f3a355ce 2459#else
d775fa82 2460 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2461#endif
d775fa82
WS
2462 if ( buf == NULL )
2463 free(tbuf) ;
335d31e0 2464
335d31e0
SC
2465 if ( buf && res < n)
2466 buf[res] = 0;
2467
d775fa82 2468 return res ;
335d31e0
SC
2469 }
2470
2471 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2472 {
2473 OSStatus status = noErr ;
2474 ByteCount byteOutLen ;
2475 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2476
2477 char *tbuf = NULL ;
2478
2479 if (buf == NULL)
2480 {
638357a0 2481 //apple specs say at least 32
c543817b 2482 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2483 tbuf = (char*) malloc( n ) ;
2484 }
2485
2486 ByteCount byteBufferLen = n ;
2487 UniChar* ubuf = NULL ;
f3a355ce 2488#if SIZEOF_WCHAR_T == 4
d9d488cf 2489 wxMBConvUTF16 converter ;
d775fa82
WS
2490 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2491 byteInLen = unicharlen ;
2492 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2493 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
f3a355ce 2494#else
d775fa82 2495 ubuf = (UniChar*) psz ;
f3a355ce 2496#endif
d775fa82
WS
2497 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2498 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
f3a355ce 2499#if SIZEOF_WCHAR_T == 4
d775fa82 2500 free( ubuf ) ;
f3a355ce 2501#endif
d775fa82
WS
2502 if ( buf == NULL )
2503 free(tbuf) ;
335d31e0 2504
d775fa82 2505 size_t res = byteOutLen ;
335d31e0 2506 if ( buf && res < n)
638357a0 2507 {
335d31e0 2508 buf[res] = 0;
3698ae71 2509
638357a0
RN
2510 //we need to double-trip to verify it didn't insert any ? in place
2511 //of bogus characters
2512 wxWCharBuffer wcBuf(n);
2513 size_t pszlen = wxWcslen(psz);
2514 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2515 wxWcslen(wcBuf) != pszlen ||
2516 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2517 {
2518 // we didn't obtain the same thing we started from, hence
2519 // the conversion was lossy and we consider that it failed
2520 return (size_t)-1;
2521 }
2522 }
335d31e0 2523
d775fa82 2524 return res ;
335d31e0
SC
2525 }
2526
2527 bool IsOk() const
2528 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2529
2530private:
d775fa82
WS
2531 TECObjectRef m_MB2WC_converter ;
2532 TECObjectRef m_WC2MB_converter ;
2533
2534 TextEncodingBase m_char_encoding ;
2535 TextEncodingBase m_unicode_encoding ;
335d31e0
SC
2536};
2537
2538#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2539
36acb880
VZ
2540// ============================================================================
2541// wxEncodingConverter based conversion classes
2542// ============================================================================
2543
1e6feb95 2544#if wxUSE_FONTMAP
1cd52418 2545
e95354ec 2546class wxMBConv_wxwin : public wxMBConv
1cd52418 2547{
8b04d4c4
VZ
2548private:
2549 void Init()
2550 {
2551 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2552 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2553 }
2554
6001e347 2555public:
f1339c56
RR
2556 // temporarily just use wxEncodingConverter stuff,
2557 // so that it works while a better implementation is built
e95354ec 2558 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2559 {
2560 if (name)
267e11c5 2561 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2562 else
2563 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2564
8b04d4c4
VZ
2565 Init();
2566 }
2567
e95354ec 2568 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2569 {
2570 m_enc = enc;
2571
2572 Init();
f1339c56 2573 }
dccce9ea 2574
bde4baac 2575 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2576 {
2577 size_t inbuf = strlen(psz);
dccce9ea 2578 if (buf)
c643a977
VS
2579 {
2580 if (!m2w.Convert(psz,buf))
2581 return (size_t)-1;
2582 }
f1339c56
RR
2583 return inbuf;
2584 }
dccce9ea 2585
bde4baac 2586 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2587 {
f8d791e0 2588 const size_t inbuf = wxWcslen(psz);
f1339c56 2589 if (buf)
c643a977
VS
2590 {
2591 if (!w2m.Convert(psz,buf))
2592 return (size_t)-1;
2593 }
dccce9ea 2594
f1339c56
RR
2595 return inbuf;
2596 }
dccce9ea 2597
e95354ec 2598 bool IsOk() const { return m_ok; }
f1339c56
RR
2599
2600public:
8b04d4c4 2601 wxFontEncoding m_enc;
f1339c56 2602 wxEncodingConverter m2w, w2m;
cafbf6fb 2603
eec47cc6
VZ
2604private:
2605 virtual const char *GetMBNul(size_t *nulLen) const
2606 {
2607 switch ( m_enc )
2608 {
2609 case wxFONTENCODING_UTF16BE:
2610 case wxFONTENCODING_UTF16LE:
2611 *nulLen = 2;
2612 return "\0";
2613
2614 case wxFONTENCODING_UTF32BE:
2615 case wxFONTENCODING_UTF32LE:
2616 *nulLen = 4;
2617 return "\0\0\0";
2618
2619 default:
2620 *nulLen = 1;
2621 return "";
2622 }
2623 }
2624
cafbf6fb
VZ
2625 // were we initialized successfully?
2626 bool m_ok;
fc7a2a60 2627
e95354ec 2628 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2629};
6001e347 2630
8f115891
MW
2631// make the constructors available for unit testing
2632WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2633{
2634 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2635 if ( !result->IsOk() )
2636 {
2637 delete result;
2638 return 0;
2639 }
2640 return result;
2641}
2642
1e6feb95
VZ
2643#endif // wxUSE_FONTMAP
2644
36acb880
VZ
2645// ============================================================================
2646// wxCSConv implementation
2647// ============================================================================
2648
8b04d4c4 2649void wxCSConv::Init()
6001e347 2650{
e95354ec
VZ
2651 m_name = NULL;
2652 m_convReal = NULL;
2653 m_deferred = true;
2654}
2655
8b04d4c4
VZ
2656wxCSConv::wxCSConv(const wxChar *charset)
2657{
2658 Init();
82713003 2659
e95354ec
VZ
2660 if ( charset )
2661 {
e95354ec
VZ
2662 SetName(charset);
2663 }
bda3d86a 2664
e4277538
VZ
2665#if wxUSE_FONTMAP
2666 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2667#else
bda3d86a 2668 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2669#endif
6001e347
RR
2670}
2671
8b04d4c4
VZ
2672wxCSConv::wxCSConv(wxFontEncoding encoding)
2673{
bda3d86a 2674 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2675 {
2676 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2677
2678 encoding = wxFONTENCODING_SYSTEM;
2679 }
2680
8b04d4c4
VZ
2681 Init();
2682
bda3d86a 2683 m_encoding = encoding;
8b04d4c4
VZ
2684}
2685
6001e347
RR
2686wxCSConv::~wxCSConv()
2687{
65e50848
JS
2688 Clear();
2689}
2690
54380f29 2691wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2692 : wxMBConv()
54380f29 2693{
8b04d4c4
VZ
2694 Init();
2695
54380f29 2696 SetName(conv.m_name);
8b04d4c4 2697 m_encoding = conv.m_encoding;
54380f29
GD
2698}
2699
2700wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2701{
2702 Clear();
8b04d4c4 2703
54380f29 2704 SetName(conv.m_name);
8b04d4c4
VZ
2705 m_encoding = conv.m_encoding;
2706
54380f29
GD
2707 return *this;
2708}
2709
65e50848
JS
2710void wxCSConv::Clear()
2711{
8b04d4c4 2712 free(m_name);
e95354ec 2713 delete m_convReal;
8b04d4c4 2714
65e50848 2715 m_name = NULL;
e95354ec 2716 m_convReal = NULL;
6001e347
RR
2717}
2718
2719void wxCSConv::SetName(const wxChar *charset)
2720{
f1339c56
RR
2721 if (charset)
2722 {
2723 m_name = wxStrdup(charset);
e95354ec 2724 m_deferred = true;
f1339c56 2725 }
6001e347
RR
2726}
2727
8b3eb85d
VZ
2728#if wxUSE_FONTMAP
2729#include "wx/hashmap.h"
2730
// hash map type associating a wxString with each wxFontEncoding
2731WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2732 wxEncodingNameCache );
8b3eb85d
VZ
2733
// cache used by wxCSConv::DoCreate(): for each encoding it holds the name
// which iconv accepted for it or an empty string if none of its names worked
2734static wxEncodingNameCache gs_nameCache;
2735#endif
2736
e95354ec
VZ
2737wxMBConv *wxCSConv::DoCreate() const
2738{
ce6f8d6f
VZ
2739#if wxUSE_FONTMAP
2740 wxLogTrace(TRACE_STRCONV,
2741 wxT("creating conversion for %s"),
2742 (m_name ? m_name
2743 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2744#endif // wxUSE_FONTMAP
2745
c547282d
VZ
2746 // check for the special case of ASCII or ISO8859-1 charset: as we have
2747 // special knowledge of it anyhow, we don't need to create a special
2748 // conversion object
e4277538
VZ
2749 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2750 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2751 {
e95354ec
VZ
2752 // don't convert at all
2753 return NULL;
2754 }
dccce9ea 2755
e95354ec
VZ
2756 // we trust OS to do conversion better than we can so try external
2757 // conversion methods first
2758 //
2759 // the full order is:
2760 // 1. OS conversion (iconv() under Unix or Win32 API)
2761 // 2. hard coded conversions for UTF
2762 // 3. wxEncodingConverter as fall back
2763
2764 // step (1)
2765#ifdef HAVE_ICONV
c547282d 2766#if !wxUSE_FONTMAP
e95354ec 2767 if ( m_name )
c547282d 2768#endif // !wxUSE_FONTMAP
e95354ec 2769 {
c547282d 2770 wxString name(m_name);
8b3eb85d
VZ
2771 wxFontEncoding encoding(m_encoding);
2772
2773 if ( !name.empty() )
2774 {
2775 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2776 if ( conv->IsOk() )
2777 return conv;
2778
2779 delete conv;
c547282d
VZ
2780
2781#if wxUSE_FONTMAP
8b3eb85d
VZ
2782 encoding =
2783 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
c547282d 2784#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2785 }
2786#if wxUSE_FONTMAP
2787 {
2788 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2789 if ( it != gs_nameCache.end() )
2790 {
2791 if ( it->second.empty() )
2792 return NULL;
c547282d 2793
8b3eb85d
VZ
2794 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2795 if ( conv->IsOk() )
2796 return conv;
e95354ec 2797
8b3eb85d
VZ
2798 delete conv;
2799 }
2800
2801 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2802
2803 for ( ; *names; ++names )
2804 {
2805 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2806 if ( conv->IsOk() )
2807 {
2808 gs_nameCache[encoding] = *names;
2809 return conv;
2810 }
2811
2812 delete conv;
2813 }
2814
40711af8 2815 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d
VZ
2816 }
2817#endif // wxUSE_FONTMAP
e95354ec
VZ
2818 }
2819#endif // HAVE_ICONV
2820
2821#ifdef wxHAVE_WIN32_MB2WC
2822 {
7608a683 2823#if wxUSE_FONTMAP
e95354ec
VZ
2824 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2825 : new wxMBConv_win32(m_encoding);
2826 if ( conv->IsOk() )
2827 return conv;
2828
2829 delete conv;
7608a683
WS
2830#else
2831 return NULL;
2832#endif
e95354ec
VZ
2833 }
2834#endif // wxHAVE_WIN32_MB2WC
d775fa82
WS
2835#if defined(__WXMAC__)
2836 {
5c3c8676 2837 // leave UTF16 and UTF32 to the built-ins of wx
3698ae71 2838 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
5c3c8676 2839 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
d775fa82
WS
2840 {
2841
2d1659cf 2842#if wxUSE_FONTMAP
d775fa82
WS
2843 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2844 : new wxMBConv_mac(m_encoding);
2d1659cf
RN
2845#else
2846 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2847#endif
d775fa82 2848 if ( conv->IsOk() )
f7e98dee
RN
2849 return conv;
2850
2851 delete conv;
2852 }
2853 }
2854#endif
2855#if defined(__WXCOCOA__)
2856 {
2857 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2858 {
2859
a6900d10 2860#if wxUSE_FONTMAP
f7e98dee
RN
2861 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2862 : new wxMBConv_cocoa(m_encoding);
a6900d10
RN
2863#else
2864 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2865#endif
f7e98dee 2866 if ( conv->IsOk() )
d775fa82
WS
2867 return conv;
2868
2869 delete conv;
2870 }
335d31e0
SC
2871 }
2872#endif
e95354ec
VZ
2873 // step (2)
2874 wxFontEncoding enc = m_encoding;
2875#if wxUSE_FONTMAP
c547282d
VZ
2876 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2877 {
2878 // use "false" to suppress interactive dialogs -- we can be called from
2879 // anywhere and popping up a dialog from here is the last thing we want to
2880 // do
267e11c5 2881 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2882 }
e95354ec
VZ
2883#endif // wxUSE_FONTMAP
2884
2885 switch ( enc )
2886 {
2887 case wxFONTENCODING_UTF7:
2888 return new wxMBConvUTF7;
2889
2890 case wxFONTENCODING_UTF8:
2891 return new wxMBConvUTF8;
2892
e95354ec
VZ
2893 case wxFONTENCODING_UTF16BE:
2894 return new wxMBConvUTF16BE;
2895
2896 case wxFONTENCODING_UTF16LE:
2897 return new wxMBConvUTF16LE;
2898
e95354ec
VZ
2899 case wxFONTENCODING_UTF32BE:
2900 return new wxMBConvUTF32BE;
2901
2902 case wxFONTENCODING_UTF32LE:
2903 return new wxMBConvUTF32LE;
2904
2905 default:
2906 // nothing to do but put here to suppress gcc warnings
2907 ;
2908 }
2909
2910 // step (3)
2911#if wxUSE_FONTMAP
2912 {
2913 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2914 : new wxMBConv_wxwin(m_encoding);
2915 if ( conv->IsOk() )
2916 return conv;
2917
2918 delete conv;
2919 }
2920#endif // wxUSE_FONTMAP
2921
a58d4f4d
VS
2922 // NB: This is a hack to prevent deadlock. What could otherwise happen
2923 // in Unicode build: wxConvLocal creation ends up being here
2924 // because of some failure and logs the error. But wxLog will try to
2925 // attach timestamp, for which it will need wxConvLocal (to convert
2926 // time to char* and then wchar_t*), but that fails, tries to log
2927 // error, but wxLog has a (already locked) critical section that
2928 // guards static buffer.
2929 static bool alreadyLoggingError = false;
2930 if (!alreadyLoggingError)
2931 {
2932 alreadyLoggingError = true;
2933 wxLogError(_("Cannot convert from the charset '%s'!"),
2934 m_name ? m_name
e95354ec
VZ
2935 :
2936#if wxUSE_FONTMAP
267e11c5 2937 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
e95354ec
VZ
2938#else // !wxUSE_FONTMAP
2939 wxString::Format(_("encoding %s"), m_encoding).c_str()
2940#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2941 );
a58d4f4d
VS
2942 alreadyLoggingError = false;
2943 }
e95354ec
VZ
2944
2945 return NULL;
2946}
2947
2948void wxCSConv::CreateConvIfNeeded() const
2949{
2950 if ( m_deferred )
2951 {
2952 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
2953
2954#if wxUSE_INTL
2955 // if we don't have neither the name nor the encoding, use the default
2956 // encoding for this system
2957 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2958 {
4d312c22 2959 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
2960 }
2961#endif // wxUSE_INTL
2962
e95354ec
VZ
2963 self->m_convReal = DoCreate();
2964 self->m_deferred = false;
6001e347 2965 }
6001e347
RR
2966}
2967
2968size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2969{
e95354ec 2970 CreateConvIfNeeded();
dccce9ea 2971
e95354ec
VZ
2972 if (m_convReal)
2973 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
2974
2975 // latin-1 (direct)
4def3b35 2976 size_t len = strlen(psz);
dccce9ea 2977
f1339c56
RR
2978 if (buf)
2979 {
4def3b35 2980 for (size_t c = 0; c <= len; c++)
f1339c56
RR
2981 buf[c] = (unsigned char)(psz[c]);
2982 }
dccce9ea 2983
f1339c56 2984 return len;
6001e347
RR
2985}
2986
2987size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2988{
e95354ec 2989 CreateConvIfNeeded();
dccce9ea 2990
e95354ec
VZ
2991 if (m_convReal)
2992 return m_convReal->WC2MB(buf, psz, n);
1cd52418 2993
f1339c56 2994 // latin-1 (direct)
f8d791e0 2995 const size_t len = wxWcslen(psz);
f1339c56
RR
2996 if (buf)
2997 {
4def3b35 2998 for (size_t c = 0; c <= len; c++)
24642831
VS
2999 {
3000 if (psz[c] > 0xFF)
3001 return (size_t)-1;
907173e5 3002 buf[c] = (char)psz[c];
24642831
VS
3003 }
3004 }
3005 else
3006 {
3007 for (size_t c = 0; c <= len; c++)
3008 {
3009 if (psz[c] > 0xFF)
3010 return (size_t)-1;
3011 }
f1339c56 3012 }
dccce9ea 3013
f1339c56 3014 return len;
6001e347
RR
3015}
3016
eec47cc6
VZ
3017const char *wxCSConv::GetMBNul(size_t *nulLen) const
3018{
3019 CreateConvIfNeeded();
3020
3021 if ( m_convReal )
3022 {
3023 // cast needed just to call private function of m_convReal
3024 return ((wxCSConv *)m_convReal)->GetMBNul(nulLen);
3025 }
3026
3027 *nulLen = 1;
3028 return "";
3029}
3030
bde4baac
VZ
3031// ----------------------------------------------------------------------------
3032// globals
3033// ----------------------------------------------------------------------------
3034
// the object behind wxConvLibc: its class is selected at compile time
3035#ifdef __WINDOWS__
3036 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
3037#elif defined(__WXMAC__) && !defined(__MACH__)
3038 static wxMBConv_mac wxConvLibcObj ;
bde4baac 3039#else
dcc8fac0 3040 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
3041#endif
3042
// the objects behind the other predefined conversions
3043static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3044static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3045static wxMBConvUTF7 wxConvUTF7Obj;
3046static wxMBConvUTF8 wxConvUTF8Obj;
c12b7f79 3047
// the exported names are references (or pointers) to the static objects
// defined above
bde4baac
VZ
3048WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3049WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3050WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3051WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3052WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3053WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
f5a1953b
VZ
// wxConvFileName points to the UTF-8 converter if __WXOSX__ is defined and
// to the libc one otherwise
3054WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3055#ifdef __WXOSX__
ea8ce907 3056 wxConvUTF8Obj;
f5a1953b 3057#else
ea8ce907 3058 wxConvLibcObj;
f5a1953b
VZ
3059#endif
3060
bde4baac
VZ
3061
3062#else // !wxUSE_WCHAR_T
3063
3064// stand-ins in absence of wchar_t
// without wchar_t support all the predefined conversions are plain wxMBConv
// objects
3065WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3066 wxConvISO8859_1,
3067 wxConvLocal,
3068 wxConvUTF8;
3069
3070#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T