]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
added NUL command
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
f6bcfd97
BP
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
6001e347
RR
23// For compilers that support precompilation, includes "wx.h".
24#include "wx/wxprec.h"
25
26#ifdef __BORLANDC__
27 #pragma hdrstop
28#endif
29
373658eb
VZ
30#ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33#endif // WX_PRECOMP
34
bde4baac
VZ
35#include "wx/strconv.h"
36
37#if wxUSE_WCHAR_T
38
7608a683 39#ifdef __WINDOWS__
532d575b 40 #include "wx/msw/private.h"
13dd924a 41 #include "wx/msw/missing.h"
0a1c1e62
GRG
42#endif
43
1c193821 44#ifndef __WXWINCE__
1cd52418 45#include <errno.h>
1c193821
JS
46#endif
47
6001e347
RR
48#include <ctype.h>
49#include <string.h>
50#include <stdlib.h>
51
e95354ec
VZ
52#if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54#endif // __WIN32__ but !__WXMICROWIN__
55
6001e347 56#ifdef __SALFORDC__
373658eb 57 #include <clib.h>
6001e347
RR
58#endif
59
b040e242 60#ifdef HAVE_ICONV
373658eb 61 #include <iconv.h>
b1d547eb 62 #include "wx/thread.h"
1cd52418 63#endif
1cd52418 64
373658eb
VZ
65#include "wx/encconv.h"
66#include "wx/fontmap.h"
7608a683 67#include "wx/utils.h"
373658eb 68
335d31e0 69#ifdef __WXMAC__
40ba2f3b 70#ifndef __DARWIN__
4227afa4
SC
71#include <ATSUnicode.h>
72#include <TextCommon.h>
73#include <TextEncodingConverter.h>
40ba2f3b 74#endif
335d31e0
SC
75
76#include "wx/mac/private.h" // includes mac headers
77#endif
ce6f8d6f
VZ
78
79#define TRACE_STRCONV _T("strconv")
80
4948c2b6 81#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
82 #define WC_UTF16
83#endif
84
373658eb
VZ
85// ============================================================================
86// implementation
87// ============================================================================
88
89// ----------------------------------------------------------------------------
c91830cb 90// UTF-16 en/decoding to/from UCS-4
373658eb 91// ----------------------------------------------------------------------------
6001e347 92
b0a6bb75 93
c91830cb 94static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 95{
dccce9ea 96 if (input<=0xffff)
4def3b35 97 {
999836aa
VZ
98 if (output)
99 *output = (wxUint16) input;
4def3b35 100 return 1;
dccce9ea
VZ
101 }
102 else if (input>=0x110000)
4def3b35
VS
103 {
104 return (size_t)-1;
dccce9ea
VZ
105 }
106 else
4def3b35 107 {
dccce9ea 108 if (output)
4def3b35 109 {
c91830cb 110 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
999836aa 111 *output = (wxUint16) ((input&0x3ff)+0xdc00);
4def3b35
VS
112 }
113 return 2;
1cd52418 114 }
1cd52418
OK
115}
116
c91830cb 117static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 118{
dccce9ea 119 if ((*input<0xd800) || (*input>0xdfff))
4def3b35
VS
120 {
121 output = *input;
122 return 1;
dccce9ea 123 }
cdb14ecb 124 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
4def3b35
VS
125 {
126 output = *input;
127 return (size_t)-1;
dccce9ea
VZ
128 }
129 else
4def3b35
VS
130 {
131 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
132 return 2;
133 }
1cd52418
OK
134}
135
b0a6bb75 136
f6bcfd97 137// ----------------------------------------------------------------------------
6001e347 138// wxMBConv
f6bcfd97 139// ----------------------------------------------------------------------------
2c53a80a
WS
140
141wxMBConv::~wxMBConv()
142{
143 // nothing to do here (necessary for Darwin linking probably)
144}
6001e347 145
6001e347
RR
146const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
147{
2b5f62a0 148 if ( psz )
6001e347 149 {
2b5f62a0
VZ
150 // calculate the length of the buffer needed first
151 size_t nLen = MB2WC(NULL, psz, 0);
152 if ( nLen != (size_t)-1 )
153 {
154 // now do the actual conversion
155 wxWCharBuffer buf(nLen);
635f33ce
VS
156 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
157 if ( nLen != (size_t)-1 )
158 {
159 return buf;
160 }
2b5f62a0 161 }
f6bcfd97 162 }
2b5f62a0
VZ
163
164 wxWCharBuffer buf((wchar_t *)NULL);
165
166 return buf;
6001e347
RR
167}
168
e5cceba0 169const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
6001e347 170{
2b5f62a0
VZ
171 if ( pwz )
172 {
173 size_t nLen = WC2MB(NULL, pwz, 0);
174 if ( nLen != (size_t)-1 )
175 {
c91830cb 176 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
635f33ce
VS
177 nLen = WC2MB(buf.data(), pwz, nLen + 4);
178 if ( nLen != (size_t)-1 )
179 {
180 return buf;
181 }
2b5f62a0
VZ
182 }
183 }
184
185 wxCharBuffer buf((char *)NULL);
e5cceba0 186
e5cceba0 187 return buf;
6001e347
RR
188}
189
eec47cc6
VZ
190const wxWCharBuffer
191wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
e4e3bbb4 192{
eec47cc6
VZ
193 // the currently accumulated wide characters
194 wxWCharBuffer wbuf;
195
196 // the current length of wbuf
197 size_t lenBuf = 0;
198
199 // we need to know the representation of L'\0' for this conversion
200 size_t nulLen;
201 const char * const nul = GetMBNul(&nulLen);
202 if ( nulLen == (size_t)-1 || nulLen == 0 )
203 return wxWCharBuffer();
204
205 // make a copy of the input string unless it is already properly
206 // NUL-terminated
207 wxCharBuffer bufTmp;
208
209 // now we can compute the input size if we were not given it: notice that
210 // in this case the string must be properly NUL-terminated, of course, as
211 // otherwise we have no way of knowing how long it is
212 if ( inLen == (size_t)-1 )
213 {
214 // not the most efficient algorithm but it shouldn't matter as normally
215 // there are not many NULs in the string and so normally memcmp()
216 // should stop on the first character
217 for ( const char *p = in; ; p++ )
218 {
219 if ( memcmp(p, nul, nulLen) == 0 )
220 break;
221 }
e4e3bbb4 222
eec47cc6
VZ
223 inLen = p - in + nulLen;
224 }
225 else // we already have the size
e4e3bbb4 226 {
eec47cc6
VZ
227 // check if it's not already NUL-terminated too to avoid the copy
228 if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 )
229 {
230 // make a copy in order to properly NUL-terminate the string
231 bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
232 memcpy(bufTmp.data(), in, inLen);
233 memcpy(bufTmp.data() + inLen, nul, nulLen);
234 }
235 }
e4e3bbb4 236
eec47cc6
VZ
237 if ( bufTmp )
238 in = bufTmp;
e4e3bbb4 239
eec47cc6
VZ
240 for ( const char * const inEnd = in + inLen;; )
241 {
242 // try to convert the current chunk if anything left
243 size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0;
244 if ( lenChunk == 0 )
f5fb6871 245 {
eec47cc6
VZ
246 // nothing left in the input string, conversion succeeded
247 if ( outLen )
248 {
249 // we shouldn't include the last NUL in the result length
250 *outLen = lenBuf ? lenBuf - 1 : 0;
251 }
252
253 return wbuf;
f5fb6871
RN
254 }
255
eec47cc6
VZ
256 if ( lenChunk == (size_t)-1 )
257 break;
e4e3bbb4 258
eec47cc6
VZ
259 const size_t lenBufNew = lenBuf + lenChunk;
260 if ( !wbuf.extend(lenBufNew) )
261 break;
e4e3bbb4 262
eec47cc6
VZ
263 lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
264 if ( lenChunk == (size_t)-1 )
265 break;
f5fb6871 266
eec47cc6
VZ
267 // +! for the embedded NUL (if something follows)
268 lenBuf = lenBufNew + 1;
269
270 // advance the input pointer past the end of this chunk
271 while ( memcmp(in, nul, nulLen) != 0 )
272 in++;
e4e3bbb4 273
eec47cc6 274 in += nulLen; // skipping over its terminator as well
e4e3bbb4
RN
275 }
276
eec47cc6
VZ
277 // conversion failed
278 if ( outLen )
279 *outLen = 0;
280
281 return wxWCharBuffer();
e4e3bbb4
RN
282}
283
eec47cc6
VZ
284const wxCharBuffer
285wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
e4e3bbb4 286{
eec47cc6
VZ
287 // the currently accumulated multibyte characters
288 wxCharBuffer buf;
f5fb6871 289
eec47cc6
VZ
290 // the current length of buf
291 size_t lenBuf = 0;
e4e3bbb4 292
eec47cc6
VZ
293 // make a copy of the input string unless it is already properly
294 // NUL-terminated
295 //
296 // if we don't know its length we have no choice but to assume that it is,
297 // indeed, properly terminated
298 wxWCharBuffer bufTmp;
299 if ( inLen == (size_t)-1 )
e4e3bbb4 300 {
eec47cc6
VZ
301 inLen = wxWcslen(in) + 1;
302 }
303 else if ( inLen != 0 && in[inLen - 1] != L'\0' )
304 {
305 // make a copy in order to properly NUL-terminate the string
306 bufTmp = wxWCharBuffer(inLen);
307 memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t));
308 }
e4e3bbb4 309
eec47cc6
VZ
310 if ( bufTmp )
311 in = bufTmp;
e4e3bbb4 312
eec47cc6
VZ
313 for ( const wchar_t * const inEnd = in + inLen;; )
314 {
315 // try to convert the current chunk, if anything left
316 size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0;
317 if ( lenChunk == 0 )
f5fb6871 318 {
eec47cc6
VZ
319 // nothing left in the input string, conversion succeeded
320 if ( outLen )
321 *outLen = lenBuf ? lenBuf - 1 : lenBuf;
322
323 return buf;
f5fb6871 324 }
e4e3bbb4 325
eec47cc6
VZ
326 if ( lenChunk == (size_t)-1 )
327 break;
3698ae71 328
eec47cc6
VZ
329 const size_t lenBufNew = lenBuf + lenChunk;
330 if ( !buf.extend(lenBufNew) )
331 break;
f5fb6871 332
eec47cc6
VZ
333 lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
334 if ( lenChunk == (size_t)-1 )
335 break;
e4e3bbb4 336
eec47cc6
VZ
337 // chunk successfully converted, go to the next one
338 in += wxWcslen(in) + 1 /* skip NUL too */;
339 lenBuf = lenBufNew + 1;
e4e3bbb4
RN
340 }
341
eec47cc6
VZ
342 // conversion failed
343 if ( outLen )
344 *outLen = 0;
345
346 return wxCharBuffer();
e4e3bbb4
RN
347}
348
6001e347 349// ----------------------------------------------------------------------------
bde4baac 350// wxMBConvLibc
6001e347
RR
351// ----------------------------------------------------------------------------
352
bde4baac
VZ
353size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
354{
355 return wxMB2WC(buf, psz, n);
356}
357
358size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
359{
360 return wxWC2MB(buf, psz, n);
361}
e1bfe89e
RR
362
363// ----------------------------------------------------------------------------
532d575b 364// wxConvBrokenFileNames
e1bfe89e
RR
365// ----------------------------------------------------------------------------
366
eec47cc6
VZ
367#ifdef __UNIX__
368
845905d5 369wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
ea8ce907 370{
845905d5
MW
371 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
372 || wxStricmp(charset, _T("UTF8")) == 0 )
373 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
374 else
375 m_conv = new wxCSConv(charset);
ea8ce907
RR
376}
377
eec47cc6 378#endif // __UNIX__
c12b7f79 379
bde4baac 380// ----------------------------------------------------------------------------
3698ae71 381// UTF-7
bde4baac 382// ----------------------------------------------------------------------------
6001e347 383
15f2ee32 384// Implementation (C) 2004 Fredrik Roubert
6001e347 385
15f2ee32
RN
386//
387// BASE64 decoding table
388//
// Maps a byte to the 6 bit value it stands for in BASE64, or to 0xff if the
// byte is not a BASE64 character. Each row below covers 16 byte values.
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
424
425size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
426{
15f2ee32
RN
427 size_t len = 0;
428
04a37834 429 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
430 {
431 unsigned char cc = *psz++;
432 if (cc != '+')
433 {
434 // plain ASCII char
435 if (buf)
436 *buf++ = cc;
437 len++;
438 }
439 else if (*psz == '-')
440 {
441 // encoded plus sign
442 if (buf)
443 *buf++ = cc;
444 len++;
445 psz++;
446 }
04a37834 447 else // start of BASE64 encoded string
15f2ee32 448 {
04a37834 449 bool lsb, ok;
15f2ee32 450 unsigned int d, l;
04a37834
VZ
451 for ( ok = lsb = false, d = 0, l = 0;
452 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
453 psz++ )
15f2ee32
RN
454 {
455 d <<= 6;
456 d += cc;
457 for (l += 6; l >= 8; lsb = !lsb)
458 {
04a37834 459 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
460 if (lsb)
461 {
462 if (buf)
463 *buf++ |= c;
464 len ++;
465 }
466 else
04a37834 467 {
15f2ee32 468 if (buf)
6356d52a 469 *buf = (wchar_t)(c << 8);
04a37834
VZ
470 }
471
472 ok = true;
15f2ee32
RN
473 }
474 }
04a37834
VZ
475
476 if ( !ok )
477 {
478 // in valid UTF7 we should have valid characters after '+'
479 return (size_t)-1;
480 }
481
15f2ee32
RN
482 if (*psz == '-')
483 psz++;
484 }
485 }
04a37834
VZ
486
487 if ( buf && (len < n) )
488 *buf = '\0';
489
15f2ee32 490 return len;
6001e347
RR
491}
492
15f2ee32
RN
493//
494// BASE64 encoding table
495//
// Maps a 6 bit value to the BASE64 character representing it.
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62 and 63
    '+', '/'
};
507
508//
509// UTF-7 encoding table
510//
511// 0 - Set D (directly encoded characters)
512// 1 - Set O (optional direct characters)
513// 2 - whitespace characters (optional)
514// 3 - special characters
515//
// Class of each of the 128 ASCII characters for the UTF-7 encoder, using the
// values 0 (set D), 1 (set O), 2 (whitespace) and 3 (special) described
// above. Each row below covers 16 characters.
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
527
667e5b3e 528size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 529{
15f2ee32
RN
530 size_t len = 0;
531
532 while (*psz && ((!buf) || (len < n)))
533 {
534 wchar_t cc = *psz++;
535 if (cc < 0x80 && utf7encode[cc] < 1)
536 {
537 // plain ASCII char
538 if (buf)
539 *buf++ = (char)cc;
540 len++;
541 }
542#ifndef WC_UTF16
79c78d42 543 else if (((wxUint32)cc) > 0xffff)
b2c13097 544 {
15f2ee32
RN
545 // no surrogate pair generation (yet?)
546 return (size_t)-1;
547 }
548#endif
549 else
550 {
551 if (buf)
552 *buf++ = '+';
553 len++;
554 if (cc != '+')
555 {
556 // BASE64 encode string
557 unsigned int lsb, d, l;
73c902d6 558 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
559 {
560 for (lsb = 0; lsb < 2; lsb ++)
561 {
562 d <<= 8;
563 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
564
565 for (l += 8; l >= 6; )
566 {
567 l -= 6;
568 if (buf)
569 *buf++ = utf7enb64[(d >> l) % 64];
570 len++;
571 }
572 }
573 cc = *psz;
574 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
575 break;
576 }
577 if (l != 0)
578 {
579 if (buf)
580 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
581 len++;
582 }
583 }
584 if (buf)
585 *buf++ = '-';
586 len++;
587 }
588 }
589 if (buf && (len < n))
590 *buf = 0;
591 return len;
6001e347
RR
592}
593
f6bcfd97 594// ----------------------------------------------------------------------------
6001e347 595// UTF-8
f6bcfd97 596// ----------------------------------------------------------------------------
6001e347 597
dccce9ea 598static wxUint32 utf8_max[]=
4def3b35 599 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 600
3698ae71
VZ
601// boundaries of the private use area we use to (temporarily) remap invalid
602// characters invalid in a UTF-8 encoded string
ea8ce907
RR
603const wxUint32 wxUnicodePUA = 0x100000;
604const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
605
6001e347
RR
606size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
607{
4def3b35
VS
608 size_t len = 0;
609
dccce9ea 610 while (*psz && ((!buf) || (len < n)))
4def3b35 611 {
ea8ce907
RR
612 const char *opsz = psz;
613 bool invalid = false;
4def3b35
VS
614 unsigned char cc = *psz++, fc = cc;
615 unsigned cnt;
dccce9ea 616 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 617 fc <<= 1;
dccce9ea 618 if (!cnt)
4def3b35
VS
619 {
620 // plain ASCII char
dccce9ea 621 if (buf)
4def3b35
VS
622 *buf++ = cc;
623 len++;
561488ef
MW
624
625 // escape the escape character for octal escapes
626 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
627 && cc == '\\' && (!buf || len < n))
628 {
629 if (buf)
630 *buf++ = cc;
631 len++;
632 }
dccce9ea
VZ
633 }
634 else
4def3b35
VS
635 {
636 cnt--;
dccce9ea 637 if (!cnt)
4def3b35
VS
638 {
639 // invalid UTF-8 sequence
ea8ce907 640 invalid = true;
dccce9ea
VZ
641 }
642 else
4def3b35
VS
643 {
644 unsigned ocnt = cnt - 1;
645 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 646 while (cnt--)
4def3b35 647 {
ea8ce907 648 cc = *psz;
dccce9ea 649 if ((cc & 0xC0) != 0x80)
4def3b35
VS
650 {
651 // invalid UTF-8 sequence
ea8ce907
RR
652 invalid = true;
653 break;
4def3b35 654 }
ea8ce907 655 psz++;
4def3b35
VS
656 res = (res << 6) | (cc & 0x3f);
657 }
ea8ce907 658 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
659 {
660 // illegal UTF-8 encoding
ea8ce907 661 invalid = true;
4def3b35 662 }
ea8ce907
RR
663 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
664 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
665 {
666 // if one of our PUA characters turns up externally
667 // it must also be treated as an illegal sequence
668 // (a bit like you have to escape an escape character)
669 invalid = true;
670 }
671 else
672 {
1cd52418 673#ifdef WC_UTF16
ea8ce907
RR
674 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
675 size_t pa = encode_utf16(res, (wxUint16 *)buf);
676 if (pa == (size_t)-1)
677 {
678 invalid = true;
679 }
680 else
681 {
682 if (buf)
683 buf += pa;
684 len += pa;
685 }
373658eb 686#else // !WC_UTF16
ea8ce907 687 if (buf)
38d4b1e4 688 *buf++ = (wchar_t)res;
ea8ce907 689 len++;
373658eb 690#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
691 }
692 }
693 if (invalid)
694 {
695 if (m_options & MAP_INVALID_UTF8_TO_PUA)
696 {
697 while (opsz < psz && (!buf || len < n))
698 {
699#ifdef WC_UTF16
700 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
701 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
702 wxASSERT(pa != (size_t)-1);
703 if (buf)
704 buf += pa;
705 opsz++;
706 len += pa;
707#else
708 if (buf)
38d4b1e4 709 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
710 opsz++;
711 len++;
712#endif
713 }
714 }
3698ae71 715 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
716 {
717 while (opsz < psz && (!buf || len < n))
718 {
3698ae71
VZ
719 if ( buf && len + 3 < n )
720 {
17a1ebd1 721 unsigned char on = *opsz;
3698ae71 722 *buf++ = L'\\';
17a1ebd1
VZ
723 *buf++ = (wchar_t)( L'0' + on / 0100 );
724 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
725 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 726 }
ea8ce907
RR
727 opsz++;
728 len += 4;
729 }
730 }
3698ae71 731 else // MAP_INVALID_UTF8_NOT
ea8ce907
RR
732 {
733 return (size_t)-1;
734 }
4def3b35
VS
735 }
736 }
6001e347 737 }
dccce9ea 738 if (buf && (len < n))
4def3b35
VS
739 *buf = 0;
740 return len;
6001e347
RR
741}
742
3698ae71
VZ
// Return true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
747
6001e347
RR
748size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
749{
4def3b35 750 size_t len = 0;
6001e347 751
dccce9ea 752 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
753 {
754 wxUint32 cc;
1cd52418 755#ifdef WC_UTF16
b5153fd8
VZ
756 // cast is ok for WC_UTF16
757 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
4def3b35 758 psz += (pa == (size_t)-1) ? 1 : pa;
1cd52418 759#else
4def3b35
VS
760 cc=(*psz++) & 0x7fffffff;
761#endif
3698ae71
VZ
762
763 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
764 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 765 {
dccce9ea 766 if (buf)
ea8ce907 767 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 768 len++;
3698ae71 769 }
561488ef
MW
770 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
771 && cc == L'\\' && psz[0] == L'\\' )
772 {
773 if (buf)
774 *buf++ = (char)cc;
775 psz++;
776 len++;
777 }
3698ae71
VZ
778 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
779 cc == L'\\' &&
780 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 781 {
dccce9ea 782 if (buf)
3698ae71 783 {
b2c13097
WS
784 *buf++ = (char) ((psz[0] - L'0')*0100 +
785 (psz[1] - L'0')*010 +
786 (psz[2] - L'0'));
3698ae71
VZ
787 }
788
789 psz += 3;
ea8ce907
RR
790 len++;
791 }
792 else
793 {
794 unsigned cnt;
795 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
796 if (!cnt)
4def3b35 797 {
ea8ce907
RR
798 // plain ASCII char
799 if (buf)
800 *buf++ = (char) cc;
801 len++;
802 }
803
804 else
805 {
806 len += cnt + 1;
807 if (buf)
808 {
809 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
810 while (cnt--)
811 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
812 }
4def3b35
VS
813 }
814 }
6001e347 815 }
4def3b35 816
3698ae71
VZ
817 if (buf && (len<n))
818 *buf = 0;
adb45366 819
4def3b35 820 return len;
6001e347
RR
821}
822
c91830cb
VZ
823// ----------------------------------------------------------------------------
824// UTF-16
825// ----------------------------------------------------------------------------
826
// "straight" is the UTF-16 converter using the same byte order as this
// machine and "swap" the one using the opposite byte order
#if defined(WORDS_BIGENDIAN)
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else // !WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#endif // WORDS_BIGENDIAN/!WORDS_BIGENDIAN
834
835
c91830cb
VZ
836#ifdef WC_UTF16
837
c91830cb
VZ
838// copy 16bit MB to 16bit String
839size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
840{
841 size_t len=0;
842
843 while (*(wxUint16*)psz && (!buf || len < n))
844 {
845 if (buf)
846 *buf++ = *(wxUint16*)psz;
847 len++;
848
849 psz += sizeof(wxUint16);
850 }
851 if (buf && len<n) *buf=0;
852
853 return len;
854}
855
856
857// copy 16bit String to 16bit MB
858size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
859{
860 size_t len=0;
861
862 while (*psz && (!buf || len < n))
863 {
864 if (buf)
865 {
866 *(wxUint16*)buf = *psz;
867 buf += sizeof(wxUint16);
868 }
869 len += sizeof(wxUint16);
870 psz++;
871 }
872 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
873
874 return len;
875}
876
877
878// swap 16bit MB to 16bit String
879size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
880{
bfab25d4 881 size_t len = 0;
c91830cb 882
da12017a
VZ
883 // UTF16 string must be terminated by 2 NULs as single NULs may occur
884 // inside the string
885 while ( (psz[0] || psz[1]) && (!buf || len < n) )
c91830cb 886 {
bfab25d4 887 if ( buf )
c91830cb
VZ
888 {
889 ((char *)buf)[0] = psz[1];
890 ((char *)buf)[1] = psz[0];
891 buf++;
892 }
893 len++;
bfab25d4 894 psz += 2;
c91830cb 895 }
bfab25d4
VZ
896
897 if ( buf && len < n )
898 *buf = L'\0';
c91830cb
VZ
899
900 return len;
901}
902
903
904// swap 16bit MB to 16bit String
905size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
906{
eec47cc6 907 size_t len = 0;
c91830cb 908
eec47cc6 909 while ( *psz && (!buf || len < n) )
c91830cb 910 {
eec47cc6 911 if ( buf )
c91830cb
VZ
912 {
913 *buf++ = ((char*)psz)[1];
914 *buf++ = ((char*)psz)[0];
915 }
eec47cc6 916 len += 2;
c91830cb
VZ
917 psz++;
918 }
eec47cc6
VZ
919
920 if ( buf && len < n )
921 *buf = '\0';
c91830cb
VZ
922
923 return len;
924}
925
926
927#else // WC_UTF16
928
929
930// copy 16bit MB to 32bit String
931size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
932{
933 size_t len=0;
934
935 while (*(wxUint16*)psz && (!buf || len < n))
936 {
937 wxUint32 cc;
938 size_t pa=decode_utf16((wxUint16*)psz, cc);
939 if (pa == (size_t)-1)
940 return pa;
941
942 if (buf)
38d4b1e4 943 *buf++ = (wchar_t)cc;
c91830cb
VZ
944 len++;
945 psz += pa * sizeof(wxUint16);
946 }
947 if (buf && len<n) *buf=0;
948
949 return len;
950}
951
952
953// copy 32bit String to 16bit MB
954size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
955{
956 size_t len=0;
957
958 while (*psz && (!buf || len < n))
959 {
960 wxUint16 cc[2];
961 size_t pa=encode_utf16(*psz, cc);
962
963 if (pa == (size_t)-1)
964 return pa;
965
966 if (buf)
967 {
69b80d28 968 *(wxUint16*)buf = cc[0];
b5153fd8 969 buf += sizeof(wxUint16);
c91830cb 970 if (pa > 1)
69b80d28
VZ
971 {
972 *(wxUint16*)buf = cc[1];
973 buf += sizeof(wxUint16);
974 }
c91830cb
VZ
975 }
976
977 len += pa*sizeof(wxUint16);
978 psz++;
979 }
980 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
981
982 return len;
983}
984
985
986// swap 16bit MB to 32bit String
987size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
988{
989 size_t len=0;
990
991 while (*(wxUint16*)psz && (!buf || len < n))
992 {
993 wxUint32 cc;
994 char tmp[4];
995 tmp[0]=psz[1]; tmp[1]=psz[0];
996 tmp[2]=psz[3]; tmp[3]=psz[2];
997
998 size_t pa=decode_utf16((wxUint16*)tmp, cc);
999 if (pa == (size_t)-1)
1000 return pa;
1001
1002 if (buf)
38d4b1e4 1003 *buf++ = (wchar_t)cc;
c91830cb
VZ
1004
1005 len++;
1006 psz += pa * sizeof(wxUint16);
1007 }
1008 if (buf && len<n) *buf=0;
1009
1010 return len;
1011}
1012
1013
1014// swap 32bit String to 16bit MB
1015size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1016{
1017 size_t len=0;
1018
1019 while (*psz && (!buf || len < n))
1020 {
1021 wxUint16 cc[2];
1022 size_t pa=encode_utf16(*psz, cc);
1023
1024 if (pa == (size_t)-1)
1025 return pa;
1026
1027 if (buf)
1028 {
1029 *buf++ = ((char*)cc)[1];
1030 *buf++ = ((char*)cc)[0];
1031 if (pa > 1)
1032 {
1033 *buf++ = ((char*)cc)[3];
1034 *buf++ = ((char*)cc)[2];
1035 }
1036 }
1037
1038 len += pa*sizeof(wxUint16);
1039 psz++;
1040 }
1041 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1042
1043 return len;
1044}
1045
1046#endif // WC_UTF16
1047
1048
1049// ----------------------------------------------------------------------------
1050// UTF-32
1051// ----------------------------------------------------------------------------
1052
// "straight" is the UTF-32 converter using the same byte order as this
// machine and "swap" the one using the opposite byte order
#if defined(WORDS_BIGENDIAN)
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap     wxMBConvUTF32LE
#else // !WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32LE
    #define wxMBConvUTF32swap     wxMBConvUTF32BE
#endif // WORDS_BIGENDIAN/!WORDS_BIGENDIAN
1060
1061
1062WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1063WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1064
1065
1066#ifdef WC_UTF16
1067
1068// copy 32bit MB to 16bit String
1069size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1070{
1071 size_t len=0;
1072
1073 while (*(wxUint32*)psz && (!buf || len < n))
1074 {
1075 wxUint16 cc[2];
1076
1077 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1078 if (pa == (size_t)-1)
1079 return pa;
1080
1081 if (buf)
1082 {
1083 *buf++ = cc[0];
1084 if (pa > 1)
1085 *buf++ = cc[1];
1086 }
1087 len += pa;
1088 psz += sizeof(wxUint32);
1089 }
1090 if (buf && len<n) *buf=0;
1091
1092 return len;
1093}
1094
1095
1096// copy 16bit String to 32bit MB
1097size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1098{
1099 size_t len=0;
1100
1101 while (*psz && (!buf || len < n))
1102 {
1103 wxUint32 cc;
1104
b5153fd8
VZ
1105 // cast is ok for WC_UTF16
1106 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
c91830cb
VZ
1107 if (pa == (size_t)-1)
1108 return pa;
1109
1110 if (buf)
1111 {
1112 *(wxUint32*)buf = cc;
1113 buf += sizeof(wxUint32);
1114 }
1115 len += sizeof(wxUint32);
1116 psz += pa;
1117 }
b5153fd8
VZ
1118
1119 if (buf && len<=n-sizeof(wxUint32))
1120 *(wxUint32*)buf=0;
c91830cb
VZ
1121
1122 return len;
1123}
1124
1125
1126
1127// swap 32bit MB to 16bit String
1128size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1129{
1130 size_t len=0;
1131
1132 while (*(wxUint32*)psz && (!buf || len < n))
1133 {
1134 char tmp[4];
1135 tmp[0] = psz[3]; tmp[1] = psz[2];
1136 tmp[2] = psz[1]; tmp[3] = psz[0];
1137
1138
1139 wxUint16 cc[2];
1140
1141 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1142 if (pa == (size_t)-1)
1143 return pa;
1144
1145 if (buf)
1146 {
1147 *buf++ = cc[0];
1148 if (pa > 1)
1149 *buf++ = cc[1];
1150 }
1151 len += pa;
1152 psz += sizeof(wxUint32);
1153 }
b5153fd8
VZ
1154
1155 if (buf && len<n)
1156 *buf=0;
c91830cb
VZ
1157
1158 return len;
1159}
1160
1161
1162// swap 16bit String to 32bit MB
1163size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1164{
1165 size_t len=0;
1166
1167 while (*psz && (!buf || len < n))
1168 {
1169 char cc[4];
1170
b5153fd8
VZ
1171 // cast is ok for WC_UTF16
1172 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
c91830cb
VZ
1173 if (pa == (size_t)-1)
1174 return pa;
1175
1176 if (buf)
1177 {
1178 *buf++ = cc[3];
1179 *buf++ = cc[2];
1180 *buf++ = cc[1];
1181 *buf++ = cc[0];
1182 }
1183 len += sizeof(wxUint32);
1184 psz += pa;
1185 }
b5153fd8
VZ
1186
1187 if (buf && len<=n-sizeof(wxUint32))
1188 *(wxUint32*)buf=0;
c91830cb
VZ
1189
1190 return len;
1191}
1192
1193#else // WC_UTF16
1194
1195
1196// copy 32bit MB to 32bit String
1197size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1198{
1199 size_t len=0;
1200
1201 while (*(wxUint32*)psz && (!buf || len < n))
1202 {
1203 if (buf)
38d4b1e4 1204 *buf++ = (wchar_t)(*(wxUint32*)psz);
c91830cb
VZ
1205 len++;
1206 psz += sizeof(wxUint32);
1207 }
b5153fd8
VZ
1208
1209 if (buf && len<n)
1210 *buf=0;
c91830cb
VZ
1211
1212 return len;
1213}
1214
1215
1216// copy 32bit String to 32bit MB
1217size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1218{
1219 size_t len=0;
1220
1221 while (*psz && (!buf || len < n))
1222 {
1223 if (buf)
1224 {
1225 *(wxUint32*)buf = *psz;
1226 buf += sizeof(wxUint32);
1227 }
1228
1229 len += sizeof(wxUint32);
1230 psz++;
1231 }
1232
b5153fd8
VZ
1233 if (buf && len<=n-sizeof(wxUint32))
1234 *(wxUint32*)buf=0;
c91830cb
VZ
1235
1236 return len;
1237}
1238
1239
1240// swap 32bit MB to 32bit String
1241size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1242{
1243 size_t len=0;
1244
1245 while (*(wxUint32*)psz && (!buf || len < n))
1246 {
1247 if (buf)
1248 {
1249 ((char *)buf)[0] = psz[3];
1250 ((char *)buf)[1] = psz[2];
1251 ((char *)buf)[2] = psz[1];
1252 ((char *)buf)[3] = psz[0];
1253 buf++;
1254 }
1255 len++;
1256 psz += sizeof(wxUint32);
1257 }
b5153fd8
VZ
1258
1259 if (buf && len<n)
1260 *buf=0;
c91830cb
VZ
1261
1262 return len;
1263}
1264
1265
1266// swap 32bit String to 32bit MB
1267size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1268{
1269 size_t len=0;
1270
1271 while (*psz && (!buf || len < n))
1272 {
1273 if (buf)
1274 {
1275 *buf++ = ((char *)psz)[3];
1276 *buf++ = ((char *)psz)[2];
1277 *buf++ = ((char *)psz)[1];
1278 *buf++ = ((char *)psz)[0];
1279 }
1280 len += sizeof(wxUint32);
1281 psz++;
1282 }
b5153fd8
VZ
1283
1284 if (buf && len<=n-sizeof(wxUint32))
1285 *(wxUint32*)buf=0;
c91830cb
VZ
1286
1287 return len;
1288}
1289
1290
1291#endif // WC_UTF16
1292
1293
36acb880
VZ
1294// ============================================================================
1295// The classes doing conversion using the iconv_xxx() functions
1296// ============================================================================
3caec1bb 1297
b040e242 1298#ifdef HAVE_ICONV
3a0d76bc 1299
b1d547eb
VS
1300// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1301// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1302// (unless there's yet another bug in glibc) the only case when iconv()
1303// returns with (size_t)-1 (which means error) and says there are 0 bytes
1304// left in the input buffer -- when _real_ error occurs,
1305// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1306// iconv() failure.
3caec1bb
VS
1307// [This bug does not appear in glibc 2.2.]
1308#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1309#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1310 (errno != E2BIG || bufLeft != 0))
1311#else
1312#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1313#endif
1314
ab217dba 1315#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1316
74a7eb0b
VZ
1317#define ICONV_T_INVALID ((iconv_t)-1)
1318
1319#if SIZEOF_WCHAR_T == 4
1320 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1321 #define WC_ENC wxFONTENCODING_UTF32
1322#elif SIZEOF_WCHAR_T == 2
1323 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1324 #define WC_ENC wxFONTENCODING_UTF16
1325#else // sizeof(wchar_t) != 2 nor 4
1326 // does this ever happen?
1327 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1328#endif
1329
36acb880 1330// ----------------------------------------------------------------------------
e95354ec 1331// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1332// ----------------------------------------------------------------------------
1333
e95354ec 1334class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1335{
1336public:
e95354ec
VZ
1337 wxMBConv_iconv(const wxChar *name);
1338 virtual ~wxMBConv_iconv();
36acb880 1339
bde4baac
VZ
1340 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1341 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1342
e95354ec 1343 bool IsOk() const
74a7eb0b 1344 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1345
1346protected:
1347 // the iconv handlers used to translate from multibyte to wide char and in
1348 // the other direction
1349 iconv_t m2w,
1350 w2m;
b1d547eb
VS
1351#if wxUSE_THREADS
1352 // guards access to m2w and w2m objects
1353 wxMutex m_iconvMutex;
1354#endif
36acb880
VZ
1355
1356private:
eec47cc6
VZ
1357 virtual const char *GetMBNul(size_t *nulLen) const;
1358
e95354ec 1359 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1360 // available on this machine, it will remain NULL
74a7eb0b 1361 static wxString ms_wcCharsetName;
36acb880
VZ
1362
1363 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1364 // different endian-ness than the native one
405d8f46 1365 static bool ms_wcNeedsSwap;
eec47cc6
VZ
1366
1367 // NUL representation
1368 size_t m_nulLen;
1369 char m_nulBuf[8];
36acb880
VZ
1370};
1371
8f115891
MW
1372// make the constructor available for unit testing
1373WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1374{
1375 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1376 if ( !result->IsOk() )
1377 {
1378 delete result;
1379 return 0;
1380 }
1381 return result;
1382}
1383
422e411e 1384wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1385bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1386
e95354ec 1387wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
36acb880 1388{
eec47cc6
VZ
1389 m_nulLen = (size_t)-2;
1390
0331b385
VZ
1391 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1392 // names for the charsets
200a9923 1393 const wxCharBuffer cname(wxString(name).ToAscii());
04c79127 1394
36acb880 1395 // check for charset that represents wchar_t:
74a7eb0b 1396 if ( ms_wcCharsetName.empty() )
f1339c56 1397 {
c2b83fdd
VZ
1398 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1399
74a7eb0b
VZ
1400#if wxUSE_FONTMAP
1401 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1402#else // !wxUSE_FONTMAP
1403 static const wxChar *names[] =
36acb880 1404 {
74a7eb0b
VZ
1405#if SIZEOF_WCHAR_T == 4
1406 _T("UCS-4"),
1407#elif SIZEOF_WCHAR_T = 2
1408 _T("UCS-2"),
1409#endif
1410 NULL
1411 };
1412#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1413
d1f024a8 1414 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1415 {
17a1ebd1 1416 const wxString nameCS(*names);
74a7eb0b
VZ
1417
1418 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1419 wxString nameXE(nameCS);
74a7eb0b
VZ
1420 #ifdef WORDS_BIGENDIAN
1421 nameXE += _T("BE");
1422 #else // little endian
1423 nameXE += _T("LE");
1424 #endif
1425
c2b83fdd
VZ
1426 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1427 nameXE.c_str());
1428
74a7eb0b
VZ
1429 m2w = iconv_open(nameXE.ToAscii(), cname);
1430 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1431 {
74a7eb0b 1432 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1433 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1434 nameCS.c_str());
17a1ebd1 1435 m2w = iconv_open(nameCS.ToAscii(), cname);
3a0d76bc 1436
74a7eb0b
VZ
1437 // and check for bytesex ourselves:
1438 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1439 {
74a7eb0b
VZ
1440 char buf[2], *bufPtr;
1441 wchar_t wbuf[2], *wbufPtr;
1442 size_t insz, outsz;
1443 size_t res;
1444
1445 buf[0] = 'A';
1446 buf[1] = 0;
1447 wbuf[0] = 0;
1448 insz = 2;
1449 outsz = SIZEOF_WCHAR_T * 2;
1450 wbufPtr = wbuf;
1451 bufPtr = buf;
1452
1453 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1454 (char**)&wbufPtr, &outsz);
1455
1456 if (ICONV_FAILED(res, insz))
1457 {
1458 wxLogLastError(wxT("iconv"));
422e411e 1459 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1460 nameCS.c_str());
74a7eb0b
VZ
1461 }
1462 else // ok, can convert to this encoding, remember it
1463 {
17a1ebd1 1464 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1465 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1466 }
3a0d76bc
VS
1467 }
1468 }
74a7eb0b 1469 else // use charset not requiring byte swapping
36acb880 1470 {
74a7eb0b 1471 ms_wcCharsetName = nameXE;
36acb880 1472 }
3a0d76bc 1473 }
74a7eb0b 1474
0944fceb 1475 wxLogTrace(TRACE_STRCONV,
74a7eb0b 1476 wxT("iconv wchar_t charset is \"%s\"%s"),
cae8f1bf 1477 ms_wcCharsetName.empty() ? _T("<none>")
74a7eb0b
VZ
1478 : ms_wcCharsetName.c_str(),
1479 ms_wcNeedsSwap ? _T(" (needs swap)")
1480 : _T(""));
3a0d76bc 1481 }
36acb880 1482 else // we already have ms_wcCharsetName
3caec1bb 1483 {
74a7eb0b 1484 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
f1339c56 1485 }
dccce9ea 1486
74a7eb0b 1487 if ( ms_wcCharsetName.empty() )
f1339c56 1488 {
74a7eb0b 1489 w2m = ICONV_T_INVALID;
36acb880 1490 }
405d8f46
VZ
1491 else
1492 {
74a7eb0b
VZ
1493 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1494 if ( w2m == ICONV_T_INVALID )
1495 {
1496 wxLogTrace(TRACE_STRCONV,
1497 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
422e411e 1498 ms_wcCharsetName.c_str(), cname.data());
74a7eb0b 1499 }
405d8f46 1500 }
36acb880 1501}
3caec1bb 1502
e95354ec 1503wxMBConv_iconv::~wxMBConv_iconv()
36acb880 1504{
74a7eb0b 1505 if ( m2w != ICONV_T_INVALID )
36acb880 1506 iconv_close(m2w);
74a7eb0b 1507 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
1508 iconv_close(w2m);
1509}
3a0d76bc 1510
bde4baac 1511size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 1512{
b1d547eb
VS
1513#if wxUSE_THREADS
1514 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1515 // Unfortunately there is a couple of global wxCSConv objects such as
1516 // wxConvLocal that are used all over wx code, so we have to make sure
1517 // the handle is used by at most one thread at the time. Otherwise
1518 // only a few wx classes would be safe to use from non-main threads
1519 // as MB<->WC conversion would fail "randomly".
1520 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1521#endif
3698ae71 1522
36acb880
VZ
1523 size_t inbuf = strlen(psz);
1524 size_t outbuf = n * SIZEOF_WCHAR_T;
1525 size_t res, cres;
1526 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1527 wchar_t *bufPtr = buf;
1528 const char *pszPtr = psz;
1529
1530 if (buf)
1531 {
1532 // have destination buffer, convert there
1533 cres = iconv(m2w,
1534 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1535 (char**)&bufPtr, &outbuf);
1536 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1537
36acb880 1538 if (ms_wcNeedsSwap)
3a0d76bc 1539 {
36acb880 1540 // convert to native endianness
17a1ebd1
VZ
1541 for ( unsigned i = 0; i < res; i++ )
1542 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 1543 }
adb45366 1544
49dd9820
VS
1545 // NB: iconv was given only strlen(psz) characters on input, and so
1546 // it couldn't convert the trailing zero. Let's do it ourselves
1547 // if there's some room left for it in the output buffer.
1548 if (res < n)
1549 buf[res] = 0;
36acb880
VZ
1550 }
1551 else
1552 {
1553 // no destination buffer... convert using temp buffer
1554 // to calculate destination buffer requirement
1555 wchar_t tbuf[8];
1556 res = 0;
1557 do {
1558 bufPtr = tbuf;
1559 outbuf = 8*SIZEOF_WCHAR_T;
1560
1561 cres = iconv(m2w,
1562 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1563 (char**)&bufPtr, &outbuf );
1564
1565 res += 8-(outbuf/SIZEOF_WCHAR_T);
1566 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1567 }
dccce9ea 1568
36acb880 1569 if (ICONV_FAILED(cres, inbuf))
f1339c56 1570 {
36acb880 1571 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 1572 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1573 return (size_t)-1;
1574 }
1575
1576 return res;
1577}
1578
bde4baac 1579size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1580{
b1d547eb
VS
1581#if wxUSE_THREADS
1582 // NB: explained in MB2WC
1583 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1584#endif
3698ae71 1585
156162ec
MW
1586 size_t inlen = wxWcslen(psz);
1587 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
1588 size_t outbuf = n;
1589 size_t res, cres;
3a0d76bc 1590
36acb880 1591 wchar_t *tmpbuf = 0;
3caec1bb 1592
36acb880
VZ
1593 if (ms_wcNeedsSwap)
1594 {
1595 // need to copy to temp buffer to switch endianness
74a7eb0b 1596 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 1597 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 1598 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
1599 for ( size_t i = 0; i < inlen; i++ )
1600 tmpbuf[n] = WC_BSWAP(psz[i]);
156162ec 1601 tmpbuf[inlen] = L'\0';
74a7eb0b 1602 psz = tmpbuf;
36acb880 1603 }
3a0d76bc 1604
36acb880
VZ
1605 if (buf)
1606 {
1607 // have destination buffer, convert there
1608 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1609
36acb880 1610 res = n-outbuf;
adb45366 1611
49dd9820
VS
1612 // NB: iconv was given only wcslen(psz) characters on input, and so
1613 // it couldn't convert the trailing zero. Let's do it ourselves
1614 // if there's some room left for it in the output buffer.
1615 if (res < n)
1616 buf[0] = 0;
36acb880
VZ
1617 }
1618 else
1619 {
1620 // no destination buffer... convert using temp buffer
1621 // to calculate destination buffer requirement
1622 char tbuf[16];
1623 res = 0;
1624 do {
1625 buf = tbuf; outbuf = 16;
1626
1627 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1628
36acb880
VZ
1629 res += 16 - outbuf;
1630 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1631 }
dccce9ea 1632
36acb880
VZ
1633 if (ms_wcNeedsSwap)
1634 {
1635 free(tmpbuf);
1636 }
dccce9ea 1637
36acb880
VZ
1638 if (ICONV_FAILED(cres, inbuf))
1639 {
ce6f8d6f 1640 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1641 return (size_t)-1;
1642 }
1643
1644 return res;
1645}
1646
eec47cc6
VZ
1647const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
1648{
1649 if ( m_nulLen == (size_t)-2 )
1650 {
1651 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1652
1653#if wxUSE_THREADS
1654 // NB: explained in MB2WC
1655 wxMutexLocker lock(self->m_iconvMutex);
1656#endif
1657
1658 size_t inLen = 1,
1659 outLen = WXSIZEOF(m_nulBuf);
1660 self->m_nulLen = iconv(w2m, ICONV_CHAR_CAST(L""), &inLen,
1661 &self->m_nulBuf, &outLen);
1662 }
1663
1664 *nulLen = m_nulLen;
1665 return m_nulBuf;
1666}
1667
b040e242 1668#endif // HAVE_ICONV
36acb880 1669
e95354ec 1670
36acb880
VZ
1671// ============================================================================
1672// Win32 conversion classes
1673// ============================================================================
1cd52418 1674
e95354ec 1675#ifdef wxHAVE_WIN32_MB2WC
373658eb 1676
8b04d4c4 1677// from utils.cpp
d775fa82 1678#if wxUSE_FONTMAP
8b04d4c4
VZ
1679extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1680extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1681#endif
373658eb 1682
e95354ec 1683class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1684{
1685public:
bde4baac
VZ
1686 wxMBConv_win32()
1687 {
1688 m_CodePage = CP_ACP;
eec47cc6 1689 m_nulLen = (size_t)-2;
bde4baac
VZ
1690 }
1691
7608a683 1692#if wxUSE_FONTMAP
e95354ec 1693 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
1694 {
1695 m_CodePage = wxCharsetToCodepage(name);
eec47cc6 1696 m_nulLen = (size_t)-2;
bde4baac 1697 }
dccce9ea 1698
e95354ec 1699 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
1700 {
1701 m_CodePage = wxEncodingToCodepage(encoding);
eec47cc6 1702 m_nulLen = (size_t)-2;
bde4baac 1703 }
eec47cc6 1704#endif // wxUSE_FONTMAP
8b04d4c4 1705
bde4baac 1706 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 1707 {
02272c9c
VZ
1708 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1709 // the behaviour is not compatible with the Unix version (using iconv)
1710 // and break the library itself, e.g. wxTextInputStream::NextChar()
1711 // wouldn't work if reading an incomplete MB char didn't result in an
1712 // error
667e5b3e
VZ
1713 //
1714 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1715 // an error (tested under Windows Server 2003) and apparently it is
1716 // done on purpose, i.e. the function accepts any input in this case
1717 // and although I'd prefer to return error on ill-formed output, our
1718 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1719 // explicitly ill-formed according to RFC 2152) neither so we don't
1720 // even have any fallback here...
89028980
VS
1721 //
1722 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1723 // Win XP or newer and if it is specified on older versions, conversion
1724 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1725 // fails. So we can only use the flag on newer Windows versions.
1726 // Additionally, the flag is not supported by UTF7, symbol and CJK
1727 // encodings. See here:
1728 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1729 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1730 int flags = 0;
1731 if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1732 m_CodePage < 50000 &&
1733 IsAtLeastWin2kSP4() )
1734 {
1735 flags = MB_ERR_INVALID_CHARS;
1736 }
1737 else if ( m_CodePage == CP_UTF8 )
1738 {
1739 // Avoid round-trip in the special case of UTF-8 by using our
1740 // own UTF-8 conversion code:
1741 return wxMBConvUTF8().MB2WC(buf, psz, n);
1742 }
667e5b3e 1743
2b5f62a0
VZ
1744 const size_t len = ::MultiByteToWideChar
1745 (
1746 m_CodePage, // code page
667e5b3e 1747 flags, // flags: fall on error
2b5f62a0
VZ
1748 psz, // input string
1749 -1, // its length (NUL-terminated)
b4da152e 1750 buf, // output string
2b5f62a0
VZ
1751 buf ? n : 0 // size of output buffer
1752 );
89028980
VS
1753 if ( !len )
1754 {
1755 // function totally failed
1756 return (size_t)-1;
1757 }
1758
1759 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1760 // check if we succeeded, by doing a double trip:
1761 if ( !flags && buf )
1762 {
53c174fc
VZ
1763 const size_t mbLen = strlen(psz);
1764 wxCharBuffer mbBuf(mbLen);
89028980
VS
1765 if ( ::WideCharToMultiByte
1766 (
1767 m_CodePage,
1768 0,
1769 buf,
1770 -1,
1771 mbBuf.data(),
53c174fc 1772 mbLen + 1, // size in bytes, not length
89028980
VS
1773 NULL,
1774 NULL
1775 ) == 0 ||
1776 strcmp(mbBuf, psz) != 0 )
1777 {
1778 // we didn't obtain the same thing we started from, hence
1779 // the conversion was lossy and we consider that it failed
1780 return (size_t)-1;
1781 }
1782 }
2b5f62a0 1783
03a991bc
VZ
1784 // note that it returns count of written chars for buf != NULL and size
1785 // of the needed buffer for buf == NULL so in either case the length of
1786 // the string (which never includes the terminating NUL) is one less
89028980 1787 return len - 1;
f1339c56 1788 }
dccce9ea 1789
13dd924a 1790 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 1791 {
13dd924a
VZ
1792 /*
1793 we have a problem here: by default, WideCharToMultiByte() may
1794 replace characters unrepresentable in the target code page with bad
1795 quality approximations such as turning "1/2" symbol (U+00BD) into
1796 "1" for the code pages which don't have it and we, obviously, want
1797 to avoid this at any price
d775fa82 1798
13dd924a
VZ
1799 the trouble is that this function does it _silently_, i.e. it won't
1800 even tell us whether it did or not... Win98/2000 and higher provide
1801 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1802 we have to resort to a round trip, i.e. check that converting back
1803 results in the same string -- this is, of course, expensive but
1804 otherwise we simply can't be sure to not garble the data.
1805 */
1806
1807 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1808 // it doesn't work with CJK encodings (which we test for rather roughly
1809 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1810 // supporting it
907173e5
WS
1811 BOOL usedDef wxDUMMY_INITIALIZE(false);
1812 BOOL *pUsedDef;
13dd924a
VZ
1813 int flags;
1814 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1815 {
1816 // it's our lucky day
1817 flags = WC_NO_BEST_FIT_CHARS;
1818 pUsedDef = &usedDef;
1819 }
1820 else // old system or unsupported encoding
1821 {
1822 flags = 0;
1823 pUsedDef = NULL;
1824 }
1825
2b5f62a0
VZ
1826 const size_t len = ::WideCharToMultiByte
1827 (
1828 m_CodePage, // code page
13dd924a
VZ
1829 flags, // either none or no best fit
1830 pwz, // input string
2b5f62a0
VZ
1831 -1, // it is (wide) NUL-terminated
1832 buf, // output buffer
1833 buf ? n : 0, // and its size
1834 NULL, // default "replacement" char
13dd924a 1835 pUsedDef // [out] was it used?
2b5f62a0
VZ
1836 );
1837
13dd924a
VZ
1838 if ( !len )
1839 {
1840 // function totally failed
1841 return (size_t)-1;
1842 }
1843
1844 // if we were really converting, check if we succeeded
1845 if ( buf )
1846 {
1847 if ( flags )
1848 {
1849 // check if the conversion failed, i.e. if any replacements
1850 // were done
1851 if ( usedDef )
1852 return (size_t)-1;
1853 }
1854 else // we must resort to double tripping...
1855 {
1856 wxWCharBuffer wcBuf(n);
1857 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1858 wcscmp(wcBuf, pwz) != 0 )
1859 {
1860 // we didn't obtain the same thing we started from, hence
1861 // the conversion was lossy and we consider that it failed
1862 return (size_t)-1;
1863 }
1864 }
1865 }
1866
03a991bc 1867 // see the comment above for the reason of "len - 1"
13dd924a 1868 return len - 1;
f1339c56 1869 }
dccce9ea 1870
13dd924a
VZ
1871 bool IsOk() const { return m_CodePage != -1; }
1872
1873private:
1874 static bool CanUseNoBestFit()
1875 {
1876 static int s_isWin98Or2k = -1;
1877
1878 if ( s_isWin98Or2k == -1 )
1879 {
1880 int verMaj, verMin;
1881 switch ( wxGetOsVersion(&verMaj, &verMin) )
1882 {
1883 case wxWIN95:
1884 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1885 break;
1886
1887 case wxWINDOWS_NT:
1888 s_isWin98Or2k = verMaj >= 5;
1889 break;
1890
1891 default:
1892 // unknown, be conseravtive by default
1893 s_isWin98Or2k = 0;
1894 }
1895
1896 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1897 }
1898
1899 return s_isWin98Or2k == 1;
1900 }
f1339c56 1901
89028980
VS
1902 static bool IsAtLeastWin2kSP4()
1903 {
8942f83a
WS
1904#ifdef __WXWINCE__
1905 return false;
1906#else
89028980
VS
1907 static int s_isAtLeastWin2kSP4 = -1;
1908
1909 if ( s_isAtLeastWin2kSP4 == -1 )
1910 {
1911 OSVERSIONINFOEX ver;
1912
1913 memset(&ver, 0, sizeof(ver));
1914 ver.dwOSVersionInfoSize = sizeof(ver);
1915 GetVersionEx((OSVERSIONINFO*)&ver);
1916
1917 s_isAtLeastWin2kSP4 =
1918 ((ver.dwMajorVersion > 5) || // Vista+
1919 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
1920 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
1921 ver.wServicePackMajor >= 4)) // 2000 SP4+
1922 ? 1 : 0;
1923 }
1924
1925 return s_isAtLeastWin2kSP4 == 1;
8942f83a 1926#endif
89028980
VS
1927 }
1928
eec47cc6
VZ
1929 virtual const char *GetMBNul(size_t *nulLen) const
1930 {
1931 if ( m_nulLen == (size_t)-2 )
1932 {
1933 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1934
1935 self->m_nulLen = ::WideCharToMultiByte
1936 (
1937 m_CodePage, // code page
1938 0, // no flags
1939 L"", // input string
1940 1, // translate just NUL
1941 self->m_nulBuf, // output buffer
1942 WXSIZEOF(m_nulBuf), // and its size
1943 NULL, // "replacement" char
1944 NULL // [out] was it used?
1945 );
1946
1947 if ( m_nulLen == 0 )
1948 self->m_nulLen = (size_t)-1;
1949 }
1950
1951 *nulLen = m_nulLen;
1952 return m_nulBuf;
1953 }
1954
b1d66b54 1955 long m_CodePage;
eec47cc6
VZ
1956 size_t m_nulLen;
1957 char m_nulBuf[8];
1cd52418 1958};
e95354ec
VZ
1959
1960#endif // wxHAVE_WIN32_MB2WC
1961
f7e98dee
RN
1962// ============================================================================
1963// Cocoa conversion classes
1964// ============================================================================
1965
1966#if defined(__WXCOCOA__)
1967
ecd9653b 1968// RN: There is no UTF-32 support in either Core Foundation or
f7e98dee
RN
1969// Cocoa. Strangely enough, internally Core Foundation uses
1970// UTF 32 internally quite a bit - its just not public (yet).
1971
1972#include <CoreFoundation/CFString.h>
1973#include <CoreFoundation/CFStringEncodingExt.h>
1974
1975CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 1976{
638357a0 1977 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ecd9653b
WS
1978 if ( encoding == wxFONTENCODING_DEFAULT )
1979 {
638357a0 1980 enc = CFStringGetSystemEncoding();
ecd9653b
WS
1981 }
1982 else switch( encoding)
1983 {
1984 case wxFONTENCODING_ISO8859_1 :
1985 enc = kCFStringEncodingISOLatin1 ;
1986 break ;
1987 case wxFONTENCODING_ISO8859_2 :
1988 enc = kCFStringEncodingISOLatin2;
1989 break ;
1990 case wxFONTENCODING_ISO8859_3 :
1991 enc = kCFStringEncodingISOLatin3 ;
1992 break ;
1993 case wxFONTENCODING_ISO8859_4 :
1994 enc = kCFStringEncodingISOLatin4;
1995 break ;
1996 case wxFONTENCODING_ISO8859_5 :
1997 enc = kCFStringEncodingISOLatinCyrillic;
1998 break ;
1999 case wxFONTENCODING_ISO8859_6 :
2000 enc = kCFStringEncodingISOLatinArabic;
2001 break ;
2002 case wxFONTENCODING_ISO8859_7 :
2003 enc = kCFStringEncodingISOLatinGreek;
2004 break ;
2005 case wxFONTENCODING_ISO8859_8 :
2006 enc = kCFStringEncodingISOLatinHebrew;
2007 break ;
2008 case wxFONTENCODING_ISO8859_9 :
2009 enc = kCFStringEncodingISOLatin5;
2010 break ;
2011 case wxFONTENCODING_ISO8859_10 :
2012 enc = kCFStringEncodingISOLatin6;
2013 break ;
2014 case wxFONTENCODING_ISO8859_11 :
2015 enc = kCFStringEncodingISOLatinThai;
2016 break ;
2017 case wxFONTENCODING_ISO8859_13 :
2018 enc = kCFStringEncodingISOLatin7;
2019 break ;
2020 case wxFONTENCODING_ISO8859_14 :
2021 enc = kCFStringEncodingISOLatin8;
2022 break ;
2023 case wxFONTENCODING_ISO8859_15 :
2024 enc = kCFStringEncodingISOLatin9;
2025 break ;
2026
2027 case wxFONTENCODING_KOI8 :
2028 enc = kCFStringEncodingKOI8_R;
2029 break ;
2030 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2031 enc = kCFStringEncodingDOSRussian;
2032 break ;
2033
2034// case wxFONTENCODING_BULGARIAN :
2035// enc = ;
2036// break ;
2037
2038 case wxFONTENCODING_CP437 :
2039 enc =kCFStringEncodingDOSLatinUS ;
2040 break ;
2041 case wxFONTENCODING_CP850 :
2042 enc = kCFStringEncodingDOSLatin1;
2043 break ;
2044 case wxFONTENCODING_CP852 :
2045 enc = kCFStringEncodingDOSLatin2;
2046 break ;
2047 case wxFONTENCODING_CP855 :
2048 enc = kCFStringEncodingDOSCyrillic;
2049 break ;
2050 case wxFONTENCODING_CP866 :
2051 enc =kCFStringEncodingDOSRussian ;
2052 break ;
2053 case wxFONTENCODING_CP874 :
2054 enc = kCFStringEncodingDOSThai;
2055 break ;
2056 case wxFONTENCODING_CP932 :
2057 enc = kCFStringEncodingDOSJapanese;
2058 break ;
2059 case wxFONTENCODING_CP936 :
2060 enc =kCFStringEncodingDOSChineseSimplif ;
2061 break ;
2062 case wxFONTENCODING_CP949 :
2063 enc = kCFStringEncodingDOSKorean;
2064 break ;
2065 case wxFONTENCODING_CP950 :
2066 enc = kCFStringEncodingDOSChineseTrad;
2067 break ;
ecd9653b
WS
2068 case wxFONTENCODING_CP1250 :
2069 enc = kCFStringEncodingWindowsLatin2;
2070 break ;
2071 case wxFONTENCODING_CP1251 :
2072 enc =kCFStringEncodingWindowsCyrillic ;
2073 break ;
2074 case wxFONTENCODING_CP1252 :
2075 enc =kCFStringEncodingWindowsLatin1 ;
2076 break ;
2077 case wxFONTENCODING_CP1253 :
2078 enc = kCFStringEncodingWindowsGreek;
2079 break ;
2080 case wxFONTENCODING_CP1254 :
2081 enc = kCFStringEncodingWindowsLatin5;
2082 break ;
2083 case wxFONTENCODING_CP1255 :
2084 enc =kCFStringEncodingWindowsHebrew ;
2085 break ;
2086 case wxFONTENCODING_CP1256 :
2087 enc =kCFStringEncodingWindowsArabic ;
2088 break ;
2089 case wxFONTENCODING_CP1257 :
2090 enc = kCFStringEncodingWindowsBalticRim;
2091 break ;
638357a0
RN
2092// This only really encodes to UTF7 (if that) evidently
2093// case wxFONTENCODING_UTF7 :
2094// enc = kCFStringEncodingNonLossyASCII ;
2095// break ;
ecd9653b
WS
2096 case wxFONTENCODING_UTF8 :
2097 enc = kCFStringEncodingUTF8 ;
2098 break ;
2099 case wxFONTENCODING_EUC_JP :
2100 enc = kCFStringEncodingEUC_JP;
2101 break ;
2102 case wxFONTENCODING_UTF16 :
f7e98dee 2103 enc = kCFStringEncodingUnicode ;
ecd9653b 2104 break ;
f7e98dee
RN
2105 case wxFONTENCODING_MACROMAN :
2106 enc = kCFStringEncodingMacRoman ;
2107 break ;
2108 case wxFONTENCODING_MACJAPANESE :
2109 enc = kCFStringEncodingMacJapanese ;
2110 break ;
2111 case wxFONTENCODING_MACCHINESETRAD :
2112 enc = kCFStringEncodingMacChineseTrad ;
2113 break ;
2114 case wxFONTENCODING_MACKOREAN :
2115 enc = kCFStringEncodingMacKorean ;
2116 break ;
2117 case wxFONTENCODING_MACARABIC :
2118 enc = kCFStringEncodingMacArabic ;
2119 break ;
2120 case wxFONTENCODING_MACHEBREW :
2121 enc = kCFStringEncodingMacHebrew ;
2122 break ;
2123 case wxFONTENCODING_MACGREEK :
2124 enc = kCFStringEncodingMacGreek ;
2125 break ;
2126 case wxFONTENCODING_MACCYRILLIC :
2127 enc = kCFStringEncodingMacCyrillic ;
2128 break ;
2129 case wxFONTENCODING_MACDEVANAGARI :
2130 enc = kCFStringEncodingMacDevanagari ;
2131 break ;
2132 case wxFONTENCODING_MACGURMUKHI :
2133 enc = kCFStringEncodingMacGurmukhi ;
2134 break ;
2135 case wxFONTENCODING_MACGUJARATI :
2136 enc = kCFStringEncodingMacGujarati ;
2137 break ;
2138 case wxFONTENCODING_MACORIYA :
2139 enc = kCFStringEncodingMacOriya ;
2140 break ;
2141 case wxFONTENCODING_MACBENGALI :
2142 enc = kCFStringEncodingMacBengali ;
2143 break ;
2144 case wxFONTENCODING_MACTAMIL :
2145 enc = kCFStringEncodingMacTamil ;
2146 break ;
2147 case wxFONTENCODING_MACTELUGU :
2148 enc = kCFStringEncodingMacTelugu ;
2149 break ;
2150 case wxFONTENCODING_MACKANNADA :
2151 enc = kCFStringEncodingMacKannada ;
2152 break ;
2153 case wxFONTENCODING_MACMALAJALAM :
2154 enc = kCFStringEncodingMacMalayalam ;
2155 break ;
2156 case wxFONTENCODING_MACSINHALESE :
2157 enc = kCFStringEncodingMacSinhalese ;
2158 break ;
2159 case wxFONTENCODING_MACBURMESE :
2160 enc = kCFStringEncodingMacBurmese ;
2161 break ;
2162 case wxFONTENCODING_MACKHMER :
2163 enc = kCFStringEncodingMacKhmer ;
2164 break ;
2165 case wxFONTENCODING_MACTHAI :
2166 enc = kCFStringEncodingMacThai ;
2167 break ;
2168 case wxFONTENCODING_MACLAOTIAN :
2169 enc = kCFStringEncodingMacLaotian ;
2170 break ;
2171 case wxFONTENCODING_MACGEORGIAN :
2172 enc = kCFStringEncodingMacGeorgian ;
2173 break ;
2174 case wxFONTENCODING_MACARMENIAN :
2175 enc = kCFStringEncodingMacArmenian ;
2176 break ;
2177 case wxFONTENCODING_MACCHINESESIMP :
2178 enc = kCFStringEncodingMacChineseSimp ;
2179 break ;
2180 case wxFONTENCODING_MACTIBETAN :
2181 enc = kCFStringEncodingMacTibetan ;
2182 break ;
2183 case wxFONTENCODING_MACMONGOLIAN :
2184 enc = kCFStringEncodingMacMongolian ;
2185 break ;
2186 case wxFONTENCODING_MACETHIOPIC :
2187 enc = kCFStringEncodingMacEthiopic ;
2188 break ;
2189 case wxFONTENCODING_MACCENTRALEUR :
2190 enc = kCFStringEncodingMacCentralEurRoman ;
2191 break ;
2192 case wxFONTENCODING_MACVIATNAMESE :
2193 enc = kCFStringEncodingMacVietnamese ;
2194 break ;
2195 case wxFONTENCODING_MACARABICEXT :
2196 enc = kCFStringEncodingMacExtArabic ;
2197 break ;
2198 case wxFONTENCODING_MACSYMBOL :
2199 enc = kCFStringEncodingMacSymbol ;
2200 break ;
2201 case wxFONTENCODING_MACDINGBATS :
2202 enc = kCFStringEncodingMacDingbats ;
2203 break ;
2204 case wxFONTENCODING_MACTURKISH :
2205 enc = kCFStringEncodingMacTurkish ;
2206 break ;
2207 case wxFONTENCODING_MACCROATIAN :
2208 enc = kCFStringEncodingMacCroatian ;
2209 break ;
2210 case wxFONTENCODING_MACICELANDIC :
2211 enc = kCFStringEncodingMacIcelandic ;
2212 break ;
2213 case wxFONTENCODING_MACROMANIAN :
2214 enc = kCFStringEncodingMacRomanian ;
2215 break ;
2216 case wxFONTENCODING_MACCELTIC :
2217 enc = kCFStringEncodingMacCeltic ;
2218 break ;
2219 case wxFONTENCODING_MACGAELIC :
2220 enc = kCFStringEncodingMacGaelic ;
2221 break ;
ecd9653b
WS
2222// case wxFONTENCODING_MACKEYBOARD :
2223// enc = kCFStringEncodingMacKeyboardGlyphs ;
2224// break ;
2225 default :
2226 // because gcc is picky
2227 break ;
2228 } ;
2229 return enc ;
f7e98dee
RN
2230}
2231
f7e98dee
RN
2232class wxMBConv_cocoa : public wxMBConv
2233{
2234public:
2235 wxMBConv_cocoa()
2236 {
2237 Init(CFStringGetSystemEncoding()) ;
2238 }
2239
a6900d10 2240#if wxUSE_FONTMAP
f7e98dee
RN
2241 wxMBConv_cocoa(const wxChar* name)
2242 {
267e11c5 2243 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
f7e98dee 2244 }
a6900d10 2245#endif
f7e98dee
RN
2246
2247 wxMBConv_cocoa(wxFontEncoding encoding)
2248 {
2249 Init( wxCFStringEncFromFontEnc(encoding) );
2250 }
2251
2252 ~wxMBConv_cocoa()
2253 {
2254 }
2255
2256 void Init( CFStringEncoding encoding)
2257 {
638357a0 2258 m_encoding = encoding ;
f7e98dee
RN
2259 }
2260
2261 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2262 {
2263 wxASSERT(szUnConv);
ecd9653b 2264
638357a0
RN
2265 CFStringRef theString = CFStringCreateWithBytes (
2266 NULL, //the allocator
2267 (const UInt8*)szUnConv,
2268 strlen(szUnConv),
2269 m_encoding,
2270 false //no BOM/external representation
f7e98dee
RN
2271 );
2272
2273 wxASSERT(theString);
2274
638357a0
RN
2275 size_t nOutLength = CFStringGetLength(theString);
2276
2277 if (szOut == NULL)
f7e98dee 2278 {
f7e98dee 2279 CFRelease(theString);
638357a0 2280 return nOutLength;
f7e98dee 2281 }
ecd9653b 2282
638357a0 2283 CFRange theRange = { 0, nOutSize };
ecd9653b 2284
638357a0
RN
2285#if SIZEOF_WCHAR_T == 4
2286 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2287#endif
3698ae71 2288
f7e98dee 2289 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
3698ae71 2290
f7e98dee 2291 CFRelease(theString);
ecd9653b 2292
638357a0 2293 szUniCharBuffer[nOutLength] = '\0' ;
f7e98dee
RN
2294
2295#if SIZEOF_WCHAR_T == 4
2296 wxMBConvUTF16 converter ;
638357a0 2297 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
f7e98dee
RN
2298 delete[] szUniCharBuffer;
2299#endif
3698ae71 2300
638357a0 2301 return nOutLength;
f7e98dee
RN
2302 }
2303
2304 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2305 {
638357a0 2306 wxASSERT(szUnConv);
3698ae71 2307
f7e98dee 2308 size_t nRealOutSize;
638357a0 2309 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 2310 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 2311
f7e98dee 2312#if SIZEOF_WCHAR_T == 4
d9d488cf 2313 wxMBConvUTF16 converter ;
f7e98dee
RN
2314 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2315 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2316 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2317 nBufSize /= sizeof(UniChar);
f7e98dee
RN
2318#endif
2319
2320 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2321 NULL, //allocator
2322 szUniBuffer,
2323 nBufSize,
638357a0 2324 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 2325 );
ecd9653b 2326
f7e98dee 2327 wxASSERT(theString);
ecd9653b 2328
f7e98dee 2329 //Note that CER puts a BOM when converting to unicode
638357a0
RN
2330 //so we check and use getchars instead in that case
2331 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 2332 {
638357a0
RN
2333 if (szOut != NULL)
2334 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
3698ae71 2335
638357a0
RN
2336 nRealOutSize = CFStringGetLength(theString) + 1;
2337 }
2338 else
2339 {
2340 CFStringGetBytes(
2341 theString,
2342 CFRangeMake(0, CFStringGetLength(theString)),
2343 m_encoding,
2344 0, //what to put in characters that can't be converted -
2345 //0 tells CFString to return NULL if it meets such a character
2346 false, //not an external representation
2347 (UInt8*) szOut,
3698ae71 2348 nOutSize,
638357a0
RN
2349 (CFIndex*) &nRealOutSize
2350 );
f7e98dee 2351 }
ecd9653b 2352
638357a0 2353 CFRelease(theString);
ecd9653b 2354
638357a0
RN
2355#if SIZEOF_WCHAR_T == 4
2356 delete[] szUniBuffer;
2357#endif
ecd9653b 2358
f7e98dee
RN
2359 return nRealOutSize - 1;
2360 }
2361
2362 bool IsOk() const
ecd9653b 2363 {
3698ae71 2364 return m_encoding != kCFStringEncodingInvalidId &&
638357a0 2365 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
2366 }
2367
2368private:
638357a0 2369 CFStringEncoding m_encoding ;
f7e98dee
RN
2370};
2371
2372#endif // defined(__WXCOCOA__)
2373
335d31e0
SC
2374// ============================================================================
2375// Mac conversion classes
2376// ============================================================================
2377
2378#if defined(__WXMAC__) && defined(TARGET_CARBON)
2379
2380class wxMBConv_mac : public wxMBConv
2381{
2382public:
2383 wxMBConv_mac()
2384 {
2385 Init(CFStringGetSystemEncoding()) ;
2386 }
2387
2d1659cf 2388#if wxUSE_FONTMAP
335d31e0
SC
2389 wxMBConv_mac(const wxChar* name)
2390 {
267e11c5 2391 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
335d31e0 2392 }
2d1659cf 2393#endif
335d31e0
SC
2394
2395 wxMBConv_mac(wxFontEncoding encoding)
2396 {
d775fa82
WS
2397 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2398 }
2399
2400 ~wxMBConv_mac()
2401 {
2402 OSStatus status = noErr ;
2403 status = TECDisposeConverter(m_MB2WC_converter);
2404 status = TECDisposeConverter(m_WC2MB_converter);
2405 }
2406
2407
2408 void Init( TextEncodingBase encoding)
2409 {
2410 OSStatus status = noErr ;
2411 m_char_encoding = encoding ;
2412 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2413
2414 status = TECCreateConverter(&m_MB2WC_converter,
2415 m_char_encoding,
2416 m_unicode_encoding);
2417 status = TECCreateConverter(&m_WC2MB_converter,
2418 m_unicode_encoding,
2419 m_char_encoding);
2420 }
2421
335d31e0
SC
2422 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2423 {
d775fa82
WS
2424 OSStatus status = noErr ;
2425 ByteCount byteOutLen ;
2426 ByteCount byteInLen = strlen(psz) ;
2427 wchar_t *tbuf = NULL ;
2428 UniChar* ubuf = NULL ;
2429 size_t res = 0 ;
2430
2431 if (buf == NULL)
2432 {
638357a0 2433 //apple specs say at least 32
c543817b 2434 n = wxMax( 32 , byteInLen ) ;
d775fa82
WS
2435 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2436 }
2437 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
f3a355ce 2438#if SIZEOF_WCHAR_T == 4
d775fa82 2439 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2440#else
d775fa82 2441 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2442#endif
d775fa82
WS
2443 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2444 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
f3a355ce 2445#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2446 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2447 // is not properly terminated we get random characters at the end
2448 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d9d488cf 2449 wxMBConvUTF16 converter ;
d775fa82
WS
2450 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2451 free( ubuf ) ;
f3a355ce 2452#else
d775fa82 2453 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2454#endif
d775fa82
WS
2455 if ( buf == NULL )
2456 free(tbuf) ;
335d31e0 2457
335d31e0
SC
2458 if ( buf && res < n)
2459 buf[res] = 0;
2460
d775fa82 2461 return res ;
335d31e0
SC
2462 }
2463
2464 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2465 {
2466 OSStatus status = noErr ;
2467 ByteCount byteOutLen ;
2468 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2469
2470 char *tbuf = NULL ;
2471
2472 if (buf == NULL)
2473 {
638357a0 2474 //apple specs say at least 32
c543817b 2475 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2476 tbuf = (char*) malloc( n ) ;
2477 }
2478
2479 ByteCount byteBufferLen = n ;
2480 UniChar* ubuf = NULL ;
f3a355ce 2481#if SIZEOF_WCHAR_T == 4
d9d488cf 2482 wxMBConvUTF16 converter ;
d775fa82
WS
2483 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2484 byteInLen = unicharlen ;
2485 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2486 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
f3a355ce 2487#else
d775fa82 2488 ubuf = (UniChar*) psz ;
f3a355ce 2489#endif
d775fa82
WS
2490 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2491 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
f3a355ce 2492#if SIZEOF_WCHAR_T == 4
d775fa82 2493 free( ubuf ) ;
f3a355ce 2494#endif
d775fa82
WS
2495 if ( buf == NULL )
2496 free(tbuf) ;
335d31e0 2497
d775fa82 2498 size_t res = byteOutLen ;
335d31e0 2499 if ( buf && res < n)
638357a0 2500 {
335d31e0 2501 buf[res] = 0;
3698ae71 2502
638357a0
RN
2503 //we need to double-trip to verify it didn't insert any ? in place
2504 //of bogus characters
2505 wxWCharBuffer wcBuf(n);
2506 size_t pszlen = wxWcslen(psz);
2507 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2508 wxWcslen(wcBuf) != pszlen ||
2509 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2510 {
2511 // we didn't obtain the same thing we started from, hence
2512 // the conversion was lossy and we consider that it failed
2513 return (size_t)-1;
2514 }
2515 }
335d31e0 2516
d775fa82 2517 return res ;
335d31e0
SC
2518 }
2519
2520 bool IsOk() const
2521 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2522
2523private:
d775fa82
WS
2524 TECObjectRef m_MB2WC_converter ;
2525 TECObjectRef m_WC2MB_converter ;
2526
2527 TextEncodingBase m_char_encoding ;
2528 TextEncodingBase m_unicode_encoding ;
335d31e0
SC
2529};
2530
2531#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2532
36acb880
VZ
2533// ============================================================================
2534// wxEncodingConverter based conversion classes
2535// ============================================================================
2536
1e6feb95 2537#if wxUSE_FONTMAP
1cd52418 2538
e95354ec 2539class wxMBConv_wxwin : public wxMBConv
1cd52418 2540{
8b04d4c4
VZ
2541private:
2542 void Init()
2543 {
2544 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2545 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2546 }
2547
6001e347 2548public:
f1339c56
RR
2549 // temporarily just use wxEncodingConverter stuff,
2550 // so that it works while a better implementation is built
e95354ec 2551 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2552 {
2553 if (name)
267e11c5 2554 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2555 else
2556 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2557
8b04d4c4
VZ
2558 Init();
2559 }
2560
e95354ec 2561 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2562 {
2563 m_enc = enc;
2564
2565 Init();
f1339c56 2566 }
dccce9ea 2567
bde4baac 2568 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2569 {
2570 size_t inbuf = strlen(psz);
dccce9ea 2571 if (buf)
c643a977
VS
2572 {
2573 if (!m2w.Convert(psz,buf))
2574 return (size_t)-1;
2575 }
f1339c56
RR
2576 return inbuf;
2577 }
dccce9ea 2578
bde4baac 2579 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2580 {
f8d791e0 2581 const size_t inbuf = wxWcslen(psz);
f1339c56 2582 if (buf)
c643a977
VS
2583 {
2584 if (!w2m.Convert(psz,buf))
2585 return (size_t)-1;
2586 }
dccce9ea 2587
f1339c56
RR
2588 return inbuf;
2589 }
dccce9ea 2590
e95354ec 2591 bool IsOk() const { return m_ok; }
f1339c56
RR
2592
2593public:
8b04d4c4 2594 wxFontEncoding m_enc;
f1339c56 2595 wxEncodingConverter m2w, w2m;
cafbf6fb 2596
eec47cc6
VZ
2597private:
2598 virtual const char *GetMBNul(size_t *nulLen) const
2599 {
2600 switch ( m_enc )
2601 {
2602 case wxFONTENCODING_UTF16BE:
2603 case wxFONTENCODING_UTF16LE:
2604 *nulLen = 2;
2605 return "\0";
2606
2607 case wxFONTENCODING_UTF32BE:
2608 case wxFONTENCODING_UTF32LE:
2609 *nulLen = 4;
2610 return "\0\0\0";
2611
2612 default:
2613 *nulLen = 1;
2614 return "";
2615 }
2616 }
2617
cafbf6fb
VZ
2618 // were we initialized successfully?
2619 bool m_ok;
fc7a2a60 2620
e95354ec 2621 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2622};
6001e347 2623
8f115891
MW
2624// make the constructors available for unit testing
2625WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2626{
2627 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2628 if ( !result->IsOk() )
2629 {
2630 delete result;
2631 return 0;
2632 }
2633 return result;
2634}
2635
1e6feb95
VZ
2636#endif // wxUSE_FONTMAP
2637
36acb880
VZ
2638// ============================================================================
2639// wxCSConv implementation
2640// ============================================================================
2641
8b04d4c4 2642void wxCSConv::Init()
6001e347 2643{
e95354ec
VZ
2644 m_name = NULL;
2645 m_convReal = NULL;
2646 m_deferred = true;
2647}
2648
8b04d4c4
VZ
2649wxCSConv::wxCSConv(const wxChar *charset)
2650{
2651 Init();
82713003 2652
e95354ec
VZ
2653 if ( charset )
2654 {
e95354ec
VZ
2655 SetName(charset);
2656 }
bda3d86a 2657
e4277538
VZ
2658#if wxUSE_FONTMAP
2659 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2660#else
bda3d86a 2661 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2662#endif
6001e347
RR
2663}
2664
8b04d4c4
VZ
2665wxCSConv::wxCSConv(wxFontEncoding encoding)
2666{
bda3d86a 2667 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2668 {
2669 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2670
2671 encoding = wxFONTENCODING_SYSTEM;
2672 }
2673
8b04d4c4
VZ
2674 Init();
2675
bda3d86a 2676 m_encoding = encoding;
8b04d4c4
VZ
2677}
2678
6001e347
RR
// Release the charset name and the real converter, if any.
wxCSConv::~wxCSConv()
{
    Clear();
}
2683
54380f29 2684wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2685 : wxMBConv()
54380f29 2686{
8b04d4c4
VZ
2687 Init();
2688
54380f29 2689 SetName(conv.m_name);
8b04d4c4 2690 m_encoding = conv.m_encoding;
54380f29
GD
2691}
2692
2693wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2694{
2695 Clear();
8b04d4c4 2696
54380f29 2697 SetName(conv.m_name);
8b04d4c4
VZ
2698 m_encoding = conv.m_encoding;
2699
54380f29
GD
2700 return *this;
2701}
2702
65e50848
JS
2703void wxCSConv::Clear()
2704{
8b04d4c4 2705 free(m_name);
e95354ec 2706 delete m_convReal;
8b04d4c4 2707
65e50848 2708 m_name = NULL;
e95354ec 2709 m_convReal = NULL;
6001e347
RR
2710}
2711
2712void wxCSConv::SetName(const wxChar *charset)
2713{
f1339c56
RR
2714 if (charset)
2715 {
2716 m_name = wxStrdup(charset);
e95354ec 2717 m_deferred = true;
f1339c56 2718 }
6001e347
RR
2719}
2720
8b3eb85d
VZ
2721#if wxUSE_FONTMAP
2722#include "wx/hashmap.h"
2723
// Map from a font encoding to a charset name: wxCSConv::DoCreate() stores
// here the first name for which an iconv converter could be created, or an
// empty string if none of the names known for the encoding worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
2728#endif
2729
e95354ec
VZ
2730wxMBConv *wxCSConv::DoCreate() const
2731{
ce6f8d6f
VZ
2732#if wxUSE_FONTMAP
2733 wxLogTrace(TRACE_STRCONV,
2734 wxT("creating conversion for %s"),
2735 (m_name ? m_name
2736 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2737#endif // wxUSE_FONTMAP
2738
c547282d
VZ
2739 // check for the special case of ASCII or ISO8859-1 charset: as we have
2740 // special knowledge of it anyhow, we don't need to create a special
2741 // conversion object
e4277538
VZ
2742 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2743 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2744 {
e95354ec
VZ
2745 // don't convert at all
2746 return NULL;
2747 }
dccce9ea 2748
e95354ec
VZ
2749 // we trust OS to do conversion better than we can so try external
2750 // conversion methods first
2751 //
2752 // the full order is:
2753 // 1. OS conversion (iconv() under Unix or Win32 API)
2754 // 2. hard coded conversions for UTF
2755 // 3. wxEncodingConverter as fall back
2756
2757 // step (1)
2758#ifdef HAVE_ICONV
c547282d 2759#if !wxUSE_FONTMAP
e95354ec 2760 if ( m_name )
c547282d 2761#endif // !wxUSE_FONTMAP
e95354ec 2762 {
c547282d 2763 wxString name(m_name);
8b3eb85d
VZ
2764 wxFontEncoding encoding(m_encoding);
2765
2766 if ( !name.empty() )
2767 {
2768 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2769 if ( conv->IsOk() )
2770 return conv;
2771
2772 delete conv;
c547282d
VZ
2773
2774#if wxUSE_FONTMAP
8b3eb85d
VZ
2775 encoding =
2776 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
c547282d 2777#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2778 }
2779#if wxUSE_FONTMAP
2780 {
2781 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2782 if ( it != gs_nameCache.end() )
2783 {
2784 if ( it->second.empty() )
2785 return NULL;
c547282d 2786
8b3eb85d
VZ
2787 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2788 if ( conv->IsOk() )
2789 return conv;
e95354ec 2790
8b3eb85d
VZ
2791 delete conv;
2792 }
2793
2794 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2795
2796 for ( ; *names; ++names )
2797 {
2798 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2799 if ( conv->IsOk() )
2800 {
2801 gs_nameCache[encoding] = *names;
2802 return conv;
2803 }
2804
2805 delete conv;
2806 }
2807
40711af8 2808 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d
VZ
2809 }
2810#endif // wxUSE_FONTMAP
e95354ec
VZ
2811 }
2812#endif // HAVE_ICONV
2813
2814#ifdef wxHAVE_WIN32_MB2WC
2815 {
7608a683 2816#if wxUSE_FONTMAP
e95354ec
VZ
2817 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2818 : new wxMBConv_win32(m_encoding);
2819 if ( conv->IsOk() )
2820 return conv;
2821
2822 delete conv;
7608a683
WS
2823#else
2824 return NULL;
2825#endif
e95354ec
VZ
2826 }
2827#endif // wxHAVE_WIN32_MB2WC
d775fa82
WS
2828#if defined(__WXMAC__)
2829 {
5c3c8676 2830 // leave UTF16 and UTF32 to the built-ins of wx
3698ae71 2831 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
5c3c8676 2832 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
d775fa82
WS
2833 {
2834
2d1659cf 2835#if wxUSE_FONTMAP
d775fa82
WS
2836 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2837 : new wxMBConv_mac(m_encoding);
2d1659cf
RN
2838#else
2839 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2840#endif
d775fa82 2841 if ( conv->IsOk() )
f7e98dee
RN
2842 return conv;
2843
2844 delete conv;
2845 }
2846 }
2847#endif
2848#if defined(__WXCOCOA__)
2849 {
2850 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2851 {
2852
a6900d10 2853#if wxUSE_FONTMAP
f7e98dee
RN
2854 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2855 : new wxMBConv_cocoa(m_encoding);
a6900d10
RN
2856#else
2857 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2858#endif
f7e98dee 2859 if ( conv->IsOk() )
d775fa82
WS
2860 return conv;
2861
2862 delete conv;
2863 }
335d31e0
SC
2864 }
2865#endif
e95354ec
VZ
2866 // step (2)
2867 wxFontEncoding enc = m_encoding;
2868#if wxUSE_FONTMAP
c547282d
VZ
2869 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2870 {
2871 // use "false" to suppress interactive dialogs -- we can be called from
2872 // anywhere and popping up a dialog from here is the last thing we want to
2873 // do
267e11c5 2874 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2875 }
e95354ec
VZ
2876#endif // wxUSE_FONTMAP
2877
2878 switch ( enc )
2879 {
2880 case wxFONTENCODING_UTF7:
2881 return new wxMBConvUTF7;
2882
2883 case wxFONTENCODING_UTF8:
2884 return new wxMBConvUTF8;
2885
e95354ec
VZ
2886 case wxFONTENCODING_UTF16BE:
2887 return new wxMBConvUTF16BE;
2888
2889 case wxFONTENCODING_UTF16LE:
2890 return new wxMBConvUTF16LE;
2891
e95354ec
VZ
2892 case wxFONTENCODING_UTF32BE:
2893 return new wxMBConvUTF32BE;
2894
2895 case wxFONTENCODING_UTF32LE:
2896 return new wxMBConvUTF32LE;
2897
2898 default:
2899 // nothing to do but put here to suppress gcc warnings
2900 ;
2901 }
2902
2903 // step (3)
2904#if wxUSE_FONTMAP
2905 {
2906 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2907 : new wxMBConv_wxwin(m_encoding);
2908 if ( conv->IsOk() )
2909 return conv;
2910
2911 delete conv;
2912 }
2913#endif // wxUSE_FONTMAP
2914
a58d4f4d
VS
2915 // NB: This is a hack to prevent deadlock. What could otherwise happen
2916 // in Unicode build: wxConvLocal creation ends up being here
2917 // because of some failure and logs the error. But wxLog will try to
2918 // attach timestamp, for which it will need wxConvLocal (to convert
2919 // time to char* and then wchar_t*), but that fails, tries to log
2920 // error, but wxLog has a (already locked) critical section that
2921 // guards static buffer.
2922 static bool alreadyLoggingError = false;
2923 if (!alreadyLoggingError)
2924 {
2925 alreadyLoggingError = true;
2926 wxLogError(_("Cannot convert from the charset '%s'!"),
2927 m_name ? m_name
e95354ec
VZ
2928 :
2929#if wxUSE_FONTMAP
267e11c5 2930 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
e95354ec
VZ
2931#else // !wxUSE_FONTMAP
2932 wxString::Format(_("encoding %s"), m_encoding).c_str()
2933#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2934 );
a58d4f4d
VS
2935 alreadyLoggingError = false;
2936 }
e95354ec
VZ
2937
2938 return NULL;
2939}
2940
2941void wxCSConv::CreateConvIfNeeded() const
2942{
2943 if ( m_deferred )
2944 {
2945 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
2946
2947#if wxUSE_INTL
2948 // if we don't have neither the name nor the encoding, use the default
2949 // encoding for this system
2950 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2951 {
4d312c22 2952 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
2953 }
2954#endif // wxUSE_INTL
2955
e95354ec
VZ
2956 self->m_convReal = DoCreate();
2957 self->m_deferred = false;
6001e347 2958 }
6001e347
RR
2959}
2960
2961size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2962{
e95354ec 2963 CreateConvIfNeeded();
dccce9ea 2964
e95354ec
VZ
2965 if (m_convReal)
2966 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
2967
2968 // latin-1 (direct)
4def3b35 2969 size_t len = strlen(psz);
dccce9ea 2970
f1339c56
RR
2971 if (buf)
2972 {
4def3b35 2973 for (size_t c = 0; c <= len; c++)
f1339c56
RR
2974 buf[c] = (unsigned char)(psz[c]);
2975 }
dccce9ea 2976
f1339c56 2977 return len;
6001e347
RR
2978}
2979
2980size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2981{
e95354ec 2982 CreateConvIfNeeded();
dccce9ea 2983
e95354ec
VZ
2984 if (m_convReal)
2985 return m_convReal->WC2MB(buf, psz, n);
1cd52418 2986
f1339c56 2987 // latin-1 (direct)
f8d791e0 2988 const size_t len = wxWcslen(psz);
f1339c56
RR
2989 if (buf)
2990 {
4def3b35 2991 for (size_t c = 0; c <= len; c++)
24642831
VS
2992 {
2993 if (psz[c] > 0xFF)
2994 return (size_t)-1;
907173e5 2995 buf[c] = (char)psz[c];
24642831
VS
2996 }
2997 }
2998 else
2999 {
3000 for (size_t c = 0; c <= len; c++)
3001 {
3002 if (psz[c] > 0xFF)
3003 return (size_t)-1;
3004 }
f1339c56 3005 }
dccce9ea 3006
f1339c56 3007 return len;
6001e347
RR
3008}
3009
eec47cc6
VZ
3010const char *wxCSConv::GetMBNul(size_t *nulLen) const
3011{
3012 CreateConvIfNeeded();
3013
3014 if ( m_convReal )
3015 {
3016 // cast needed just to call private function of m_convReal
3017 return ((wxCSConv *)m_convReal)->GetMBNul(nulLen);
3018 }
3019
3020 *nulLen = 1;
3021 return "";
3022}
3023
bde4baac
VZ
3024// ----------------------------------------------------------------------------
3025// globals
3026// ----------------------------------------------------------------------------
3027
3028#ifdef __WINDOWS__
3029 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
3030#elif defined(__WXMAC__) && !defined(__MACH__)
3031 static wxMBConv_mac wxConvLibcObj ;
bde4baac 3032#else
dcc8fac0 3033 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
3034#endif
3035
3036static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3037static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3038static wxMBConvUTF7 wxConvUTF7Obj;
3039static wxMBConvUTF8 wxConvUTF8Obj;
c12b7f79 3040
bde4baac
VZ
3041WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3042WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3043WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3044WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3045WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3046WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
f5a1953b
VZ
3047WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3048#ifdef __WXOSX__
ea8ce907 3049 wxConvUTF8Obj;
f5a1953b 3050#else
ea8ce907 3051 wxConvLibcObj;
f5a1953b
VZ
3052#endif
3053
bde4baac
VZ
3054
3055#else // !wxUSE_WCHAR_T
3056
3057// stand-ins in absence of wchar_t
3058WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3059 wxConvISO8859_1,
3060 wxConvLocal,
3061 wxConvUTF8;
3062
3063#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T