]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Unix compilation fixes after last commit
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
f6bcfd97
BP
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
6001e347
RR
23// For compilers that support precompilation, includes "wx.h".
24#include "wx/wxprec.h"
25
26#ifdef __BORLANDC__
27 #pragma hdrstop
28#endif
29
373658eb
VZ
30#ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33#endif // WX_PRECOMP
34
bde4baac
VZ
35#include "wx/strconv.h"
36
37#if wxUSE_WCHAR_T
38
7608a683 39#ifdef __WINDOWS__
532d575b 40 #include "wx/msw/private.h"
13dd924a 41 #include "wx/msw/missing.h"
0a1c1e62
GRG
42#endif
43
1c193821 44#ifndef __WXWINCE__
1cd52418 45#include <errno.h>
1c193821
JS
46#endif
47
6001e347
RR
48#include <ctype.h>
49#include <string.h>
50#include <stdlib.h>
51
e95354ec
VZ
52#if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54#endif // __WIN32__ but !__WXMICROWIN__
55
6001e347 56#ifdef __SALFORDC__
373658eb 57 #include <clib.h>
6001e347
RR
58#endif
59
b040e242 60#ifdef HAVE_ICONV
373658eb 61 #include <iconv.h>
b1d547eb 62 #include "wx/thread.h"
1cd52418 63#endif
1cd52418 64
373658eb
VZ
65#include "wx/encconv.h"
66#include "wx/fontmap.h"
7608a683 67#include "wx/utils.h"
373658eb 68
335d31e0 69#ifdef __WXMAC__
40ba2f3b 70#ifndef __DARWIN__
4227afa4
SC
71#include <ATSUnicode.h>
72#include <TextCommon.h>
73#include <TextEncodingConverter.h>
40ba2f3b 74#endif
335d31e0
SC
75
76#include "wx/mac/private.h" // includes mac headers
77#endif
ce6f8d6f
VZ
78
79#define TRACE_STRCONV _T("strconv")
80
4948c2b6 81#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
82 #define WC_UTF16
83#endif
84
373658eb
VZ
85// ============================================================================
86// implementation
87// ============================================================================
88
89// ----------------------------------------------------------------------------
c91830cb 90// UTF-16 en/decoding to/from UCS-4
373658eb 91// ----------------------------------------------------------------------------
6001e347 92
b0a6bb75 93
c91830cb 94static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 95{
dccce9ea 96 if (input<=0xffff)
4def3b35 97 {
999836aa
VZ
98 if (output)
99 *output = (wxUint16) input;
4def3b35 100 return 1;
dccce9ea
VZ
101 }
102 else if (input>=0x110000)
4def3b35
VS
103 {
104 return (size_t)-1;
dccce9ea
VZ
105 }
106 else
4def3b35 107 {
dccce9ea 108 if (output)
4def3b35 109 {
c91830cb 110 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
999836aa 111 *output = (wxUint16) ((input&0x3ff)+0xdc00);
4def3b35
VS
112 }
113 return 2;
1cd52418 114 }
1cd52418
OK
115}
116
c91830cb 117static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 118{
dccce9ea 119 if ((*input<0xd800) || (*input>0xdfff))
4def3b35
VS
120 {
121 output = *input;
122 return 1;
dccce9ea 123 }
cdb14ecb 124 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
4def3b35
VS
125 {
126 output = *input;
127 return (size_t)-1;
dccce9ea
VZ
128 }
129 else
4def3b35
VS
130 {
131 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
132 return 2;
133 }
1cd52418
OK
134}
135
b0a6bb75 136
f6bcfd97 137// ----------------------------------------------------------------------------
6001e347 138// wxMBConv
f6bcfd97 139// ----------------------------------------------------------------------------
2c53a80a
WS
140
141wxMBConv::~wxMBConv()
142{
143 // nothing to do here (necessary for Darwin linking probably)
144}
6001e347 145
6001e347
RR
146const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
147{
2b5f62a0 148 if ( psz )
6001e347 149 {
2b5f62a0
VZ
150 // calculate the length of the buffer needed first
151 size_t nLen = MB2WC(NULL, psz, 0);
152 if ( nLen != (size_t)-1 )
153 {
154 // now do the actual conversion
155 wxWCharBuffer buf(nLen);
635f33ce
VS
156 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
157 if ( nLen != (size_t)-1 )
158 {
159 return buf;
160 }
2b5f62a0 161 }
f6bcfd97 162 }
2b5f62a0
VZ
163
164 wxWCharBuffer buf((wchar_t *)NULL);
165
166 return buf;
6001e347
RR
167}
168
e5cceba0 169const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
6001e347 170{
2b5f62a0
VZ
171 if ( pwz )
172 {
173 size_t nLen = WC2MB(NULL, pwz, 0);
174 if ( nLen != (size_t)-1 )
175 {
c91830cb 176 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
635f33ce
VS
177 nLen = WC2MB(buf.data(), pwz, nLen + 4);
178 if ( nLen != (size_t)-1 )
179 {
180 return buf;
181 }
2b5f62a0
VZ
182 }
183 }
184
185 wxCharBuffer buf((char *)NULL);
e5cceba0 186
e5cceba0 187 return buf;
6001e347
RR
188}
189
eec47cc6
VZ
190const wxWCharBuffer
191wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
e4e3bbb4 192{
eec47cc6
VZ
193 // the currently accumulated wide characters
194 wxWCharBuffer wbuf;
195
196 // the current length of wbuf
197 size_t lenBuf = 0;
198
199 // we need to know the representation of L'\0' for this conversion
200 size_t nulLen;
201 const char * const nul = GetMBNul(&nulLen);
202 if ( nulLen == (size_t)-1 || nulLen == 0 )
203 return wxWCharBuffer();
204
205 // make a copy of the input string unless it is already properly
206 // NUL-terminated
207 wxCharBuffer bufTmp;
208
209 // now we can compute the input size if we were not given it: notice that
210 // in this case the string must be properly NUL-terminated, of course, as
211 // otherwise we have no way of knowing how long it is
212 if ( inLen == (size_t)-1 )
213 {
214 // not the most efficient algorithm but it shouldn't matter as normally
215 // there are not many NULs in the string and so normally memcmp()
216 // should stop on the first character
22886fb3
VZ
217 const char *p = in;
218 while ( memcmp(p, nul, nulLen) != 0 )
219 p++;
e4e3bbb4 220
eec47cc6
VZ
221 inLen = p - in + nulLen;
222 }
223 else // we already have the size
e4e3bbb4 224 {
eec47cc6
VZ
225 // check if it's not already NUL-terminated too to avoid the copy
226 if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 )
227 {
228 // make a copy in order to properly NUL-terminate the string
229 bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
230 memcpy(bufTmp.data(), in, inLen);
231 memcpy(bufTmp.data() + inLen, nul, nulLen);
232 }
233 }
e4e3bbb4 234
eec47cc6
VZ
235 if ( bufTmp )
236 in = bufTmp;
e4e3bbb4 237
eec47cc6
VZ
238 for ( const char * const inEnd = in + inLen;; )
239 {
240 // try to convert the current chunk if anything left
241 size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0;
242 if ( lenChunk == 0 )
f5fb6871 243 {
eec47cc6
VZ
244 // nothing left in the input string, conversion succeeded
245 if ( outLen )
246 {
247 // we shouldn't include the last NUL in the result length
248 *outLen = lenBuf ? lenBuf - 1 : 0;
249 }
250
251 return wbuf;
f5fb6871
RN
252 }
253
eec47cc6
VZ
254 if ( lenChunk == (size_t)-1 )
255 break;
e4e3bbb4 256
eec47cc6
VZ
257 const size_t lenBufNew = lenBuf + lenChunk;
258 if ( !wbuf.extend(lenBufNew) )
259 break;
e4e3bbb4 260
eec47cc6
VZ
261 lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
262 if ( lenChunk == (size_t)-1 )
263 break;
f5fb6871 264
eec47cc6
VZ
265 // +! for the embedded NUL (if something follows)
266 lenBuf = lenBufNew + 1;
267
268 // advance the input pointer past the end of this chunk
269 while ( memcmp(in, nul, nulLen) != 0 )
270 in++;
e4e3bbb4 271
eec47cc6 272 in += nulLen; // skipping over its terminator as well
e4e3bbb4
RN
273 }
274
eec47cc6
VZ
275 // conversion failed
276 if ( outLen )
277 *outLen = 0;
278
279 return wxWCharBuffer();
e4e3bbb4
RN
280}
281
eec47cc6
VZ
282const wxCharBuffer
283wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
e4e3bbb4 284{
eec47cc6
VZ
285 // the currently accumulated multibyte characters
286 wxCharBuffer buf;
f5fb6871 287
eec47cc6
VZ
288 // the current length of buf
289 size_t lenBuf = 0;
e4e3bbb4 290
eec47cc6
VZ
291 // make a copy of the input string unless it is already properly
292 // NUL-terminated
293 //
294 // if we don't know its length we have no choice but to assume that it is,
295 // indeed, properly terminated
296 wxWCharBuffer bufTmp;
297 if ( inLen == (size_t)-1 )
e4e3bbb4 298 {
eec47cc6
VZ
299 inLen = wxWcslen(in) + 1;
300 }
301 else if ( inLen != 0 && in[inLen - 1] != L'\0' )
302 {
303 // make a copy in order to properly NUL-terminate the string
304 bufTmp = wxWCharBuffer(inLen);
305 memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t));
306 }
e4e3bbb4 307
eec47cc6
VZ
308 if ( bufTmp )
309 in = bufTmp;
e4e3bbb4 310
eec47cc6
VZ
311 for ( const wchar_t * const inEnd = in + inLen;; )
312 {
313 // try to convert the current chunk, if anything left
314 size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0;
315 if ( lenChunk == 0 )
f5fb6871 316 {
eec47cc6
VZ
317 // nothing left in the input string, conversion succeeded
318 if ( outLen )
319 *outLen = lenBuf ? lenBuf - 1 : lenBuf;
320
321 return buf;
f5fb6871 322 }
e4e3bbb4 323
eec47cc6
VZ
324 if ( lenChunk == (size_t)-1 )
325 break;
3698ae71 326
eec47cc6
VZ
327 const size_t lenBufNew = lenBuf + lenChunk;
328 if ( !buf.extend(lenBufNew) )
329 break;
f5fb6871 330
eec47cc6
VZ
331 lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
332 if ( lenChunk == (size_t)-1 )
333 break;
e4e3bbb4 334
eec47cc6
VZ
335 // chunk successfully converted, go to the next one
336 in += wxWcslen(in) + 1 /* skip NUL too */;
337 lenBuf = lenBufNew + 1;
e4e3bbb4
RN
338 }
339
eec47cc6
VZ
340 // conversion failed
341 if ( outLen )
342 *outLen = 0;
343
344 return wxCharBuffer();
e4e3bbb4
RN
345}
346
6001e347 347// ----------------------------------------------------------------------------
bde4baac 348// wxMBConvLibc
6001e347
RR
349// ----------------------------------------------------------------------------
350
bde4baac
VZ
351size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
352{
353 return wxMB2WC(buf, psz, n);
354}
355
356size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
357{
358 return wxWC2MB(buf, psz, n);
359}
e1bfe89e
RR
360
361// ----------------------------------------------------------------------------
532d575b 362// wxConvBrokenFileNames
e1bfe89e
RR
363// ----------------------------------------------------------------------------
364
eec47cc6
VZ
365#ifdef __UNIX__
366
845905d5 367wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
ea8ce907 368{
845905d5
MW
369 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
370 || wxStricmp(charset, _T("UTF8")) == 0 )
371 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
372 else
373 m_conv = new wxCSConv(charset);
ea8ce907
RR
374}
375
eec47cc6 376#endif // __UNIX__
c12b7f79 377
bde4baac 378// ----------------------------------------------------------------------------
3698ae71 379// UTF-7
bde4baac 380// ----------------------------------------------------------------------------
6001e347 381
15f2ee32 382// Implementation (C) 2004 Fredrik Roubert
6001e347 383
15f2ee32
RN
384//
385// BASE64 decoding table
386//
// Maps a byte to its 6 bit BASE64 value ('A'-'Z' => 0-25, 'a'-'z' => 26-51,
// '0'-'9' => 52-61, '+' => 62, '/' => 63) or to 0xff if the byte is not a
// BASE64 character. Each row below covers 16 consecutive byte values.
static const unsigned char utf7unb64[] =
{
    // 0x00
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40: 'A'-'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50: 'P'-'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60: 'a'-'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70: 'p'-'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x90
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xa0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xb0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xc0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xd0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xe0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xf0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
422
423size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
424{
15f2ee32
RN
425 size_t len = 0;
426
04a37834 427 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
428 {
429 unsigned char cc = *psz++;
430 if (cc != '+')
431 {
432 // plain ASCII char
433 if (buf)
434 *buf++ = cc;
435 len++;
436 }
437 else if (*psz == '-')
438 {
439 // encoded plus sign
440 if (buf)
441 *buf++ = cc;
442 len++;
443 psz++;
444 }
04a37834 445 else // start of BASE64 encoded string
15f2ee32 446 {
04a37834 447 bool lsb, ok;
15f2ee32 448 unsigned int d, l;
04a37834
VZ
449 for ( ok = lsb = false, d = 0, l = 0;
450 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
451 psz++ )
15f2ee32
RN
452 {
453 d <<= 6;
454 d += cc;
455 for (l += 6; l >= 8; lsb = !lsb)
456 {
04a37834 457 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
458 if (lsb)
459 {
460 if (buf)
461 *buf++ |= c;
462 len ++;
463 }
464 else
04a37834 465 {
15f2ee32 466 if (buf)
6356d52a 467 *buf = (wchar_t)(c << 8);
04a37834
VZ
468 }
469
470 ok = true;
15f2ee32
RN
471 }
472 }
04a37834
VZ
473
474 if ( !ok )
475 {
476 // in valid UTF7 we should have valid characters after '+'
477 return (size_t)-1;
478 }
479
15f2ee32
RN
480 if (*psz == '-')
481 psz++;
482 }
483 }
04a37834
VZ
484
485 if ( buf && (len < n) )
486 *buf = '\0';
487
15f2ee32 488 return len;
6001e347
RR
489}
490
15f2ee32
RN
491//
492// BASE64 encoding table
493//
// Maps a 6 bit value (0..63) to the corresponding BASE64 character.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
505
506//
507// UTF-7 encoding table
508//
509// 0 - Set D (directly encoded characters)
510// 1 - Set O (optional direct characters)
511// 2 - whitespace characters (optional)
512// 3 - special characters
513//
// Class of each of the 128 ASCII characters (see the list of the classes
// above): only the characters with class 0 are copied directly by
// wxMBConvUTF7::WC2MB(), all the others are BASE64 encoded. Each row covers
// 16 consecutive characters.
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
525
667e5b3e 526size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32 527{
15f2ee32
RN
528 size_t len = 0;
529
530 while (*psz && ((!buf) || (len < n)))
531 {
532 wchar_t cc = *psz++;
533 if (cc < 0x80 && utf7encode[cc] < 1)
534 {
535 // plain ASCII char
536 if (buf)
537 *buf++ = (char)cc;
538 len++;
539 }
540#ifndef WC_UTF16
79c78d42 541 else if (((wxUint32)cc) > 0xffff)
b2c13097 542 {
15f2ee32
RN
543 // no surrogate pair generation (yet?)
544 return (size_t)-1;
545 }
546#endif
547 else
548 {
549 if (buf)
550 *buf++ = '+';
551 len++;
552 if (cc != '+')
553 {
554 // BASE64 encode string
555 unsigned int lsb, d, l;
73c902d6 556 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
557 {
558 for (lsb = 0; lsb < 2; lsb ++)
559 {
560 d <<= 8;
561 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
562
563 for (l += 8; l >= 6; )
564 {
565 l -= 6;
566 if (buf)
567 *buf++ = utf7enb64[(d >> l) % 64];
568 len++;
569 }
570 }
571 cc = *psz;
572 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
573 break;
574 }
575 if (l != 0)
576 {
577 if (buf)
578 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
579 len++;
580 }
581 }
582 if (buf)
583 *buf++ = '-';
584 len++;
585 }
586 }
587 if (buf && (len < n))
588 *buf = 0;
589 return len;
6001e347
RR
590}
591
f6bcfd97 592// ----------------------------------------------------------------------------
6001e347 593// UTF-8
f6bcfd97 594// ----------------------------------------------------------------------------
6001e347 595
dccce9ea 596static wxUint32 utf8_max[]=
4def3b35 597 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 598
3698ae71
VZ
599// boundaries of the private use area we use to (temporarily) remap invalid
600// characters invalid in a UTF-8 encoded string
ea8ce907
RR
601const wxUint32 wxUnicodePUA = 0x100000;
602const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
603
6001e347
RR
604size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
605{
4def3b35
VS
606 size_t len = 0;
607
dccce9ea 608 while (*psz && ((!buf) || (len < n)))
4def3b35 609 {
ea8ce907
RR
610 const char *opsz = psz;
611 bool invalid = false;
4def3b35
VS
612 unsigned char cc = *psz++, fc = cc;
613 unsigned cnt;
dccce9ea 614 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 615 fc <<= 1;
dccce9ea 616 if (!cnt)
4def3b35
VS
617 {
618 // plain ASCII char
dccce9ea 619 if (buf)
4def3b35
VS
620 *buf++ = cc;
621 len++;
561488ef
MW
622
623 // escape the escape character for octal escapes
624 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
625 && cc == '\\' && (!buf || len < n))
626 {
627 if (buf)
628 *buf++ = cc;
629 len++;
630 }
dccce9ea
VZ
631 }
632 else
4def3b35
VS
633 {
634 cnt--;
dccce9ea 635 if (!cnt)
4def3b35
VS
636 {
637 // invalid UTF-8 sequence
ea8ce907 638 invalid = true;
dccce9ea
VZ
639 }
640 else
4def3b35
VS
641 {
642 unsigned ocnt = cnt - 1;
643 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 644 while (cnt--)
4def3b35 645 {
ea8ce907 646 cc = *psz;
dccce9ea 647 if ((cc & 0xC0) != 0x80)
4def3b35
VS
648 {
649 // invalid UTF-8 sequence
ea8ce907
RR
650 invalid = true;
651 break;
4def3b35 652 }
ea8ce907 653 psz++;
4def3b35
VS
654 res = (res << 6) | (cc & 0x3f);
655 }
ea8ce907 656 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
657 {
658 // illegal UTF-8 encoding
ea8ce907 659 invalid = true;
4def3b35 660 }
ea8ce907
RR
661 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
662 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
663 {
664 // if one of our PUA characters turns up externally
665 // it must also be treated as an illegal sequence
666 // (a bit like you have to escape an escape character)
667 invalid = true;
668 }
669 else
670 {
1cd52418 671#ifdef WC_UTF16
ea8ce907
RR
672 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
673 size_t pa = encode_utf16(res, (wxUint16 *)buf);
674 if (pa == (size_t)-1)
675 {
676 invalid = true;
677 }
678 else
679 {
680 if (buf)
681 buf += pa;
682 len += pa;
683 }
373658eb 684#else // !WC_UTF16
ea8ce907 685 if (buf)
38d4b1e4 686 *buf++ = (wchar_t)res;
ea8ce907 687 len++;
373658eb 688#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
689 }
690 }
691 if (invalid)
692 {
693 if (m_options & MAP_INVALID_UTF8_TO_PUA)
694 {
695 while (opsz < psz && (!buf || len < n))
696 {
697#ifdef WC_UTF16
698 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
699 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
700 wxASSERT(pa != (size_t)-1);
701 if (buf)
702 buf += pa;
703 opsz++;
704 len += pa;
705#else
706 if (buf)
38d4b1e4 707 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
708 opsz++;
709 len++;
710#endif
711 }
712 }
3698ae71 713 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
714 {
715 while (opsz < psz && (!buf || len < n))
716 {
3698ae71
VZ
717 if ( buf && len + 3 < n )
718 {
17a1ebd1 719 unsigned char on = *opsz;
3698ae71 720 *buf++ = L'\\';
17a1ebd1
VZ
721 *buf++ = (wchar_t)( L'0' + on / 0100 );
722 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
723 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 724 }
ea8ce907
RR
725 opsz++;
726 len += 4;
727 }
728 }
3698ae71 729 else // MAP_INVALID_UTF8_NOT
ea8ce907
RR
730 {
731 return (size_t)-1;
732 }
4def3b35
VS
733 }
734 }
6001e347 735 }
dccce9ea 736 if (buf && (len < n))
4def3b35
VS
737 *buf = 0;
738 return len;
6001e347
RR
739}
740
3698ae71
VZ
// Returns true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
745
6001e347
RR
746size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
747{
4def3b35 748 size_t len = 0;
6001e347 749
dccce9ea 750 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
751 {
752 wxUint32 cc;
1cd52418 753#ifdef WC_UTF16
b5153fd8
VZ
754 // cast is ok for WC_UTF16
755 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
4def3b35 756 psz += (pa == (size_t)-1) ? 1 : pa;
1cd52418 757#else
4def3b35
VS
758 cc=(*psz++) & 0x7fffffff;
759#endif
3698ae71
VZ
760
761 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
762 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 763 {
dccce9ea 764 if (buf)
ea8ce907 765 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 766 len++;
3698ae71 767 }
561488ef
MW
768 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
769 && cc == L'\\' && psz[0] == L'\\' )
770 {
771 if (buf)
772 *buf++ = (char)cc;
773 psz++;
774 len++;
775 }
3698ae71
VZ
776 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
777 cc == L'\\' &&
778 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 779 {
dccce9ea 780 if (buf)
3698ae71 781 {
b2c13097
WS
782 *buf++ = (char) ((psz[0] - L'0')*0100 +
783 (psz[1] - L'0')*010 +
784 (psz[2] - L'0'));
3698ae71
VZ
785 }
786
787 psz += 3;
ea8ce907
RR
788 len++;
789 }
790 else
791 {
792 unsigned cnt;
793 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
794 if (!cnt)
4def3b35 795 {
ea8ce907
RR
796 // plain ASCII char
797 if (buf)
798 *buf++ = (char) cc;
799 len++;
800 }
801
802 else
803 {
804 len += cnt + 1;
805 if (buf)
806 {
807 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
808 while (cnt--)
809 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
810 }
4def3b35
VS
811 }
812 }
6001e347 813 }
4def3b35 814
3698ae71
VZ
815 if (buf && (len<n))
816 *buf = 0;
adb45366 817
4def3b35 818 return len;
6001e347
RR
819}
820
c91830cb
VZ
821// ----------------------------------------------------------------------------
822// UTF-16
823// ----------------------------------------------------------------------------
824
825#ifdef WORDS_BIGENDIAN
bde4baac
VZ
826 #define wxMBConvUTF16straight wxMBConvUTF16BE
827 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 828#else
bde4baac
VZ
829 #define wxMBConvUTF16swap wxMBConvUTF16BE
830 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
831#endif
832
833
c91830cb
VZ
834#ifdef WC_UTF16
835
c91830cb
VZ
836// copy 16bit MB to 16bit String
837size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
838{
839 size_t len=0;
840
841 while (*(wxUint16*)psz && (!buf || len < n))
842 {
843 if (buf)
844 *buf++ = *(wxUint16*)psz;
845 len++;
846
847 psz += sizeof(wxUint16);
848 }
849 if (buf && len<n) *buf=0;
850
851 return len;
852}
853
854
855// copy 16bit String to 16bit MB
856size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
857{
858 size_t len=0;
859
860 while (*psz && (!buf || len < n))
861 {
862 if (buf)
863 {
864 *(wxUint16*)buf = *psz;
865 buf += sizeof(wxUint16);
866 }
867 len += sizeof(wxUint16);
868 psz++;
869 }
870 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
871
872 return len;
873}
874
875
876// swap 16bit MB to 16bit String
877size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
878{
bfab25d4 879 size_t len = 0;
c91830cb 880
da12017a
VZ
881 // UTF16 string must be terminated by 2 NULs as single NULs may occur
882 // inside the string
883 while ( (psz[0] || psz[1]) && (!buf || len < n) )
c91830cb 884 {
bfab25d4 885 if ( buf )
c91830cb
VZ
886 {
887 ((char *)buf)[0] = psz[1];
888 ((char *)buf)[1] = psz[0];
889 buf++;
890 }
891 len++;
bfab25d4 892 psz += 2;
c91830cb 893 }
bfab25d4
VZ
894
895 if ( buf && len < n )
896 *buf = L'\0';
c91830cb
VZ
897
898 return len;
899}
900
901
902// swap 16bit MB to 16bit String
903size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
904{
eec47cc6 905 size_t len = 0;
c91830cb 906
eec47cc6 907 while ( *psz && (!buf || len < n) )
c91830cb 908 {
eec47cc6 909 if ( buf )
c91830cb
VZ
910 {
911 *buf++ = ((char*)psz)[1];
912 *buf++ = ((char*)psz)[0];
913 }
eec47cc6 914 len += 2;
c91830cb
VZ
915 psz++;
916 }
eec47cc6
VZ
917
918 if ( buf && len < n )
919 *buf = '\0';
c91830cb
VZ
920
921 return len;
922}
923
924
925#else // WC_UTF16
926
927
928// copy 16bit MB to 32bit String
929size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
930{
931 size_t len=0;
932
933 while (*(wxUint16*)psz && (!buf || len < n))
934 {
935 wxUint32 cc;
936 size_t pa=decode_utf16((wxUint16*)psz, cc);
937 if (pa == (size_t)-1)
938 return pa;
939
940 if (buf)
38d4b1e4 941 *buf++ = (wchar_t)cc;
c91830cb
VZ
942 len++;
943 psz += pa * sizeof(wxUint16);
944 }
945 if (buf && len<n) *buf=0;
946
947 return len;
948}
949
950
951// copy 32bit String to 16bit MB
952size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
953{
954 size_t len=0;
955
956 while (*psz && (!buf || len < n))
957 {
958 wxUint16 cc[2];
959 size_t pa=encode_utf16(*psz, cc);
960
961 if (pa == (size_t)-1)
962 return pa;
963
964 if (buf)
965 {
69b80d28 966 *(wxUint16*)buf = cc[0];
b5153fd8 967 buf += sizeof(wxUint16);
c91830cb 968 if (pa > 1)
69b80d28
VZ
969 {
970 *(wxUint16*)buf = cc[1];
971 buf += sizeof(wxUint16);
972 }
c91830cb
VZ
973 }
974
975 len += pa*sizeof(wxUint16);
976 psz++;
977 }
978 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
979
980 return len;
981}
982
983
984// swap 16bit MB to 32bit String
985size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
986{
987 size_t len=0;
988
989 while (*(wxUint16*)psz && (!buf || len < n))
990 {
991 wxUint32 cc;
992 char tmp[4];
993 tmp[0]=psz[1]; tmp[1]=psz[0];
994 tmp[2]=psz[3]; tmp[3]=psz[2];
995
996 size_t pa=decode_utf16((wxUint16*)tmp, cc);
997 if (pa == (size_t)-1)
998 return pa;
999
1000 if (buf)
38d4b1e4 1001 *buf++ = (wchar_t)cc;
c91830cb
VZ
1002
1003 len++;
1004 psz += pa * sizeof(wxUint16);
1005 }
1006 if (buf && len<n) *buf=0;
1007
1008 return len;
1009}
1010
1011
1012// swap 32bit String to 16bit MB
1013size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1014{
1015 size_t len=0;
1016
1017 while (*psz && (!buf || len < n))
1018 {
1019 wxUint16 cc[2];
1020 size_t pa=encode_utf16(*psz, cc);
1021
1022 if (pa == (size_t)-1)
1023 return pa;
1024
1025 if (buf)
1026 {
1027 *buf++ = ((char*)cc)[1];
1028 *buf++ = ((char*)cc)[0];
1029 if (pa > 1)
1030 {
1031 *buf++ = ((char*)cc)[3];
1032 *buf++ = ((char*)cc)[2];
1033 }
1034 }
1035
1036 len += pa*sizeof(wxUint16);
1037 psz++;
1038 }
1039 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1040
1041 return len;
1042}
1043
1044#endif // WC_UTF16
1045
1046
1047// ----------------------------------------------------------------------------
1048// UTF-32
1049// ----------------------------------------------------------------------------
1050
1051#ifdef WORDS_BIGENDIAN
1052#define wxMBConvUTF32straight wxMBConvUTF32BE
1053#define wxMBConvUTF32swap wxMBConvUTF32LE
1054#else
1055#define wxMBConvUTF32swap wxMBConvUTF32BE
1056#define wxMBConvUTF32straight wxMBConvUTF32LE
1057#endif
1058
1059
1060WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1061WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1062
1063
1064#ifdef WC_UTF16
1065
1066// copy 32bit MB to 16bit String
1067size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1068{
1069 size_t len=0;
1070
1071 while (*(wxUint32*)psz && (!buf || len < n))
1072 {
1073 wxUint16 cc[2];
1074
1075 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1076 if (pa == (size_t)-1)
1077 return pa;
1078
1079 if (buf)
1080 {
1081 *buf++ = cc[0];
1082 if (pa > 1)
1083 *buf++ = cc[1];
1084 }
1085 len += pa;
1086 psz += sizeof(wxUint32);
1087 }
1088 if (buf && len<n) *buf=0;
1089
1090 return len;
1091}
1092
1093
1094// copy 16bit String to 32bit MB
1095size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1096{
1097 size_t len=0;
1098
1099 while (*psz && (!buf || len < n))
1100 {
1101 wxUint32 cc;
1102
b5153fd8
VZ
1103 // cast is ok for WC_UTF16
1104 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
c91830cb
VZ
1105 if (pa == (size_t)-1)
1106 return pa;
1107
1108 if (buf)
1109 {
1110 *(wxUint32*)buf = cc;
1111 buf += sizeof(wxUint32);
1112 }
1113 len += sizeof(wxUint32);
1114 psz += pa;
1115 }
b5153fd8
VZ
1116
1117 if (buf && len<=n-sizeof(wxUint32))
1118 *(wxUint32*)buf=0;
c91830cb
VZ
1119
1120 return len;
1121}
1122
1123
1124
1125// swap 32bit MB to 16bit String
1126size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1127{
1128 size_t len=0;
1129
1130 while (*(wxUint32*)psz && (!buf || len < n))
1131 {
1132 char tmp[4];
1133 tmp[0] = psz[3]; tmp[1] = psz[2];
1134 tmp[2] = psz[1]; tmp[3] = psz[0];
1135
1136
1137 wxUint16 cc[2];
1138
1139 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1140 if (pa == (size_t)-1)
1141 return pa;
1142
1143 if (buf)
1144 {
1145 *buf++ = cc[0];
1146 if (pa > 1)
1147 *buf++ = cc[1];
1148 }
1149 len += pa;
1150 psz += sizeof(wxUint32);
1151 }
b5153fd8
VZ
1152
1153 if (buf && len<n)
1154 *buf=0;
c91830cb
VZ
1155
1156 return len;
1157}
1158
1159
1160// swap 16bit String to 32bit MB
1161size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1162{
1163 size_t len=0;
1164
1165 while (*psz && (!buf || len < n))
1166 {
1167 char cc[4];
1168
b5153fd8
VZ
1169 // cast is ok for WC_UTF16
1170 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
c91830cb
VZ
1171 if (pa == (size_t)-1)
1172 return pa;
1173
1174 if (buf)
1175 {
1176 *buf++ = cc[3];
1177 *buf++ = cc[2];
1178 *buf++ = cc[1];
1179 *buf++ = cc[0];
1180 }
1181 len += sizeof(wxUint32);
1182 psz += pa;
1183 }
b5153fd8
VZ
1184
1185 if (buf && len<=n-sizeof(wxUint32))
1186 *(wxUint32*)buf=0;
c91830cb
VZ
1187
1188 return len;
1189}
1190
1191#else // WC_UTF16
1192
1193
1194// copy 32bit MB to 32bit String
1195size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1196{
1197 size_t len=0;
1198
1199 while (*(wxUint32*)psz && (!buf || len < n))
1200 {
1201 if (buf)
38d4b1e4 1202 *buf++ = (wchar_t)(*(wxUint32*)psz);
c91830cb
VZ
1203 len++;
1204 psz += sizeof(wxUint32);
1205 }
b5153fd8
VZ
1206
1207 if (buf && len<n)
1208 *buf=0;
c91830cb
VZ
1209
1210 return len;
1211}
1212
1213
1214// copy 32bit String to 32bit MB
1215size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1216{
1217 size_t len=0;
1218
1219 while (*psz && (!buf || len < n))
1220 {
1221 if (buf)
1222 {
1223 *(wxUint32*)buf = *psz;
1224 buf += sizeof(wxUint32);
1225 }
1226
1227 len += sizeof(wxUint32);
1228 psz++;
1229 }
1230
b5153fd8
VZ
1231 if (buf && len<=n-sizeof(wxUint32))
1232 *(wxUint32*)buf=0;
c91830cb
VZ
1233
1234 return len;
1235}
1236
1237
1238// swap 32bit MB to 32bit String
1239size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1240{
1241 size_t len=0;
1242
1243 while (*(wxUint32*)psz && (!buf || len < n))
1244 {
1245 if (buf)
1246 {
1247 ((char *)buf)[0] = psz[3];
1248 ((char *)buf)[1] = psz[2];
1249 ((char *)buf)[2] = psz[1];
1250 ((char *)buf)[3] = psz[0];
1251 buf++;
1252 }
1253 len++;
1254 psz += sizeof(wxUint32);
1255 }
b5153fd8
VZ
1256
1257 if (buf && len<n)
1258 *buf=0;
c91830cb
VZ
1259
1260 return len;
1261}
1262
1263
1264// swap 32bit String to 32bit MB
1265size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1266{
1267 size_t len=0;
1268
1269 while (*psz && (!buf || len < n))
1270 {
1271 if (buf)
1272 {
1273 *buf++ = ((char *)psz)[3];
1274 *buf++ = ((char *)psz)[2];
1275 *buf++ = ((char *)psz)[1];
1276 *buf++ = ((char *)psz)[0];
1277 }
1278 len += sizeof(wxUint32);
1279 psz++;
1280 }
b5153fd8
VZ
1281
1282 if (buf && len<=n-sizeof(wxUint32))
1283 *(wxUint32*)buf=0;
c91830cb
VZ
1284
1285 return len;
1286}
1287
1288
1289#endif // WC_UTF16
1290
1291
36acb880
VZ
1292// ============================================================================
1293// The classes doing conversion using the iconv_xxx() functions
1294// ============================================================================
3caec1bb 1295
b040e242 1296#ifdef HAVE_ICONV
3a0d76bc 1297
b1d547eb
VS
1298// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1299// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1300// (unless there's yet another bug in glibc) the only case when iconv()
1301// returns with (size_t)-1 (which means error) and says there are 0 bytes
1302// left in the input buffer -- when _real_ error occurs,
1303// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1304// iconv() failure.
3caec1bb
VS
1305// [This bug does not appear in glibc 2.2.]
1306#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1307#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1308 (errno != E2BIG || bufLeft != 0))
1309#else
1310#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1311#endif
1312
ab217dba 1313#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1314
74a7eb0b
VZ
1315#define ICONV_T_INVALID ((iconv_t)-1)
1316
1317#if SIZEOF_WCHAR_T == 4
1318 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1319 #define WC_ENC wxFONTENCODING_UTF32
1320#elif SIZEOF_WCHAR_T == 2
1321 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1322 #define WC_ENC wxFONTENCODING_UTF16
1323#else // sizeof(wchar_t) != 2 nor 4
1324 // does this ever happen?
1325 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1326#endif
1327
36acb880 1328// ----------------------------------------------------------------------------
e95354ec 1329// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1330// ----------------------------------------------------------------------------
1331
e95354ec 1332class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1333{
1334public:
e95354ec
VZ
1335 wxMBConv_iconv(const wxChar *name);
1336 virtual ~wxMBConv_iconv();
36acb880 1337
bde4baac
VZ
1338 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1339 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1340
e95354ec 1341 bool IsOk() const
74a7eb0b 1342 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1343
1344protected:
1345 // the iconv handlers used to translate from multibyte to wide char and in
1346 // the other direction
1347 iconv_t m2w,
1348 w2m;
b1d547eb
VS
1349#if wxUSE_THREADS
1350 // guards access to m2w and w2m objects
1351 wxMutex m_iconvMutex;
1352#endif
36acb880
VZ
1353
1354private:
eec47cc6
VZ
1355 virtual const char *GetMBNul(size_t *nulLen) const;
1356
e95354ec 1357 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1358 // available on this machine, it will remain NULL
74a7eb0b 1359 static wxString ms_wcCharsetName;
36acb880
VZ
1360
1361 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1362 // different endian-ness than the native one
405d8f46 1363 static bool ms_wcNeedsSwap;
eec47cc6
VZ
1364
1365 // NUL representation
1366 size_t m_nulLen;
1367 char m_nulBuf[8];
36acb880
VZ
1368};
1369
8f115891
MW
1370// make the constructor available for unit testing
1371WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1372{
1373 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1374 if ( !result->IsOk() )
1375 {
1376 delete result;
1377 return 0;
1378 }
1379 return result;
1380}
1381
422e411e 1382wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1383bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1384
e95354ec 1385wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
36acb880 1386{
eec47cc6
VZ
1387 m_nulLen = (size_t)-2;
1388
0331b385
VZ
1389 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1390 // names for the charsets
200a9923 1391 const wxCharBuffer cname(wxString(name).ToAscii());
04c79127 1392
36acb880 1393 // check for charset that represents wchar_t:
74a7eb0b 1394 if ( ms_wcCharsetName.empty() )
f1339c56 1395 {
c2b83fdd
VZ
1396 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1397
74a7eb0b
VZ
1398#if wxUSE_FONTMAP
1399 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1400#else // !wxUSE_FONTMAP
1401 static const wxChar *names[] =
36acb880 1402 {
74a7eb0b
VZ
1403#if SIZEOF_WCHAR_T == 4
1404 _T("UCS-4"),
1405#elif SIZEOF_WCHAR_T = 2
1406 _T("UCS-2"),
1407#endif
1408 NULL
1409 };
1410#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1411
d1f024a8 1412 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1413 {
17a1ebd1 1414 const wxString nameCS(*names);
74a7eb0b
VZ
1415
1416 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1417 wxString nameXE(nameCS);
74a7eb0b
VZ
1418 #ifdef WORDS_BIGENDIAN
1419 nameXE += _T("BE");
1420 #else // little endian
1421 nameXE += _T("LE");
1422 #endif
1423
c2b83fdd
VZ
1424 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1425 nameXE.c_str());
1426
74a7eb0b
VZ
1427 m2w = iconv_open(nameXE.ToAscii(), cname);
1428 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1429 {
74a7eb0b 1430 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1431 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1432 nameCS.c_str());
17a1ebd1 1433 m2w = iconv_open(nameCS.ToAscii(), cname);
3a0d76bc 1434
74a7eb0b
VZ
1435 // and check for bytesex ourselves:
1436 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1437 {
74a7eb0b
VZ
1438 char buf[2], *bufPtr;
1439 wchar_t wbuf[2], *wbufPtr;
1440 size_t insz, outsz;
1441 size_t res;
1442
1443 buf[0] = 'A';
1444 buf[1] = 0;
1445 wbuf[0] = 0;
1446 insz = 2;
1447 outsz = SIZEOF_WCHAR_T * 2;
1448 wbufPtr = wbuf;
1449 bufPtr = buf;
1450
1451 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1452 (char**)&wbufPtr, &outsz);
1453
1454 if (ICONV_FAILED(res, insz))
1455 {
1456 wxLogLastError(wxT("iconv"));
422e411e 1457 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1458 nameCS.c_str());
74a7eb0b
VZ
1459 }
1460 else // ok, can convert to this encoding, remember it
1461 {
17a1ebd1 1462 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1463 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1464 }
3a0d76bc
VS
1465 }
1466 }
74a7eb0b 1467 else // use charset not requiring byte swapping
36acb880 1468 {
74a7eb0b 1469 ms_wcCharsetName = nameXE;
36acb880 1470 }
3a0d76bc 1471 }
74a7eb0b 1472
0944fceb 1473 wxLogTrace(TRACE_STRCONV,
74a7eb0b 1474 wxT("iconv wchar_t charset is \"%s\"%s"),
cae8f1bf 1475 ms_wcCharsetName.empty() ? _T("<none>")
74a7eb0b
VZ
1476 : ms_wcCharsetName.c_str(),
1477 ms_wcNeedsSwap ? _T(" (needs swap)")
1478 : _T(""));
3a0d76bc 1479 }
36acb880 1480 else // we already have ms_wcCharsetName
3caec1bb 1481 {
74a7eb0b 1482 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
f1339c56 1483 }
dccce9ea 1484
74a7eb0b 1485 if ( ms_wcCharsetName.empty() )
f1339c56 1486 {
74a7eb0b 1487 w2m = ICONV_T_INVALID;
36acb880 1488 }
405d8f46
VZ
1489 else
1490 {
74a7eb0b
VZ
1491 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1492 if ( w2m == ICONV_T_INVALID )
1493 {
1494 wxLogTrace(TRACE_STRCONV,
1495 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
422e411e 1496 ms_wcCharsetName.c_str(), cname.data());
74a7eb0b 1497 }
405d8f46 1498 }
36acb880 1499}
3caec1bb 1500
e95354ec 1501wxMBConv_iconv::~wxMBConv_iconv()
36acb880 1502{
74a7eb0b 1503 if ( m2w != ICONV_T_INVALID )
36acb880 1504 iconv_close(m2w);
74a7eb0b 1505 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
1506 iconv_close(w2m);
1507}
3a0d76bc 1508
bde4baac 1509size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 1510{
b1d547eb
VS
1511#if wxUSE_THREADS
1512 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1513 // Unfortunately there is a couple of global wxCSConv objects such as
1514 // wxConvLocal that are used all over wx code, so we have to make sure
1515 // the handle is used by at most one thread at the time. Otherwise
1516 // only a few wx classes would be safe to use from non-main threads
1517 // as MB<->WC conversion would fail "randomly".
1518 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1519#endif
3698ae71 1520
36acb880
VZ
1521 size_t inbuf = strlen(psz);
1522 size_t outbuf = n * SIZEOF_WCHAR_T;
1523 size_t res, cres;
1524 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1525 wchar_t *bufPtr = buf;
1526 const char *pszPtr = psz;
1527
1528 if (buf)
1529 {
1530 // have destination buffer, convert there
1531 cres = iconv(m2w,
1532 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1533 (char**)&bufPtr, &outbuf);
1534 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1535
36acb880 1536 if (ms_wcNeedsSwap)
3a0d76bc 1537 {
36acb880 1538 // convert to native endianness
17a1ebd1
VZ
1539 for ( unsigned i = 0; i < res; i++ )
1540 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 1541 }
adb45366 1542
49dd9820
VS
1543 // NB: iconv was given only strlen(psz) characters on input, and so
1544 // it couldn't convert the trailing zero. Let's do it ourselves
1545 // if there's some room left for it in the output buffer.
1546 if (res < n)
1547 buf[res] = 0;
36acb880
VZ
1548 }
1549 else
1550 {
1551 // no destination buffer... convert using temp buffer
1552 // to calculate destination buffer requirement
1553 wchar_t tbuf[8];
1554 res = 0;
1555 do {
1556 bufPtr = tbuf;
1557 outbuf = 8*SIZEOF_WCHAR_T;
1558
1559 cres = iconv(m2w,
1560 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1561 (char**)&bufPtr, &outbuf );
1562
1563 res += 8-(outbuf/SIZEOF_WCHAR_T);
1564 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1565 }
dccce9ea 1566
36acb880 1567 if (ICONV_FAILED(cres, inbuf))
f1339c56 1568 {
36acb880 1569 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 1570 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1571 return (size_t)-1;
1572 }
1573
1574 return res;
1575}
1576
bde4baac 1577size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1578{
b1d547eb
VS
1579#if wxUSE_THREADS
1580 // NB: explained in MB2WC
1581 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1582#endif
3698ae71 1583
156162ec
MW
1584 size_t inlen = wxWcslen(psz);
1585 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
1586 size_t outbuf = n;
1587 size_t res, cres;
3a0d76bc 1588
36acb880 1589 wchar_t *tmpbuf = 0;
3caec1bb 1590
36acb880
VZ
1591 if (ms_wcNeedsSwap)
1592 {
1593 // need to copy to temp buffer to switch endianness
74a7eb0b 1594 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 1595 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 1596 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
1597 for ( size_t i = 0; i < inlen; i++ )
1598 tmpbuf[n] = WC_BSWAP(psz[i]);
156162ec 1599 tmpbuf[inlen] = L'\0';
74a7eb0b 1600 psz = tmpbuf;
36acb880 1601 }
3a0d76bc 1602
36acb880
VZ
1603 if (buf)
1604 {
1605 // have destination buffer, convert there
1606 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1607
36acb880 1608 res = n-outbuf;
adb45366 1609
49dd9820
VS
1610 // NB: iconv was given only wcslen(psz) characters on input, and so
1611 // it couldn't convert the trailing zero. Let's do it ourselves
1612 // if there's some room left for it in the output buffer.
1613 if (res < n)
1614 buf[0] = 0;
36acb880
VZ
1615 }
1616 else
1617 {
1618 // no destination buffer... convert using temp buffer
1619 // to calculate destination buffer requirement
1620 char tbuf[16];
1621 res = 0;
1622 do {
1623 buf = tbuf; outbuf = 16;
1624
1625 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1626
36acb880
VZ
1627 res += 16 - outbuf;
1628 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1629 }
dccce9ea 1630
36acb880
VZ
1631 if (ms_wcNeedsSwap)
1632 {
1633 free(tmpbuf);
1634 }
dccce9ea 1635
36acb880
VZ
1636 if (ICONV_FAILED(cres, inbuf))
1637 {
ce6f8d6f 1638 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1639 return (size_t)-1;
1640 }
1641
1642 return res;
1643}
1644
eec47cc6
VZ
1645const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
1646{
1647 if ( m_nulLen == (size_t)-2 )
1648 {
1649 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1650
1651#if wxUSE_THREADS
1652 // NB: explained in MB2WC
1653 wxMutexLocker lock(self->m_iconvMutex);
1654#endif
1655
1656 size_t inLen = 1,
1657 outLen = WXSIZEOF(m_nulBuf);
1658 self->m_nulLen = iconv(w2m, ICONV_CHAR_CAST(L""), &inLen,
22886fb3 1659 (char **)&self->m_nulBuf, &outLen);
eec47cc6
VZ
1660 }
1661
1662 *nulLen = m_nulLen;
1663 return m_nulBuf;
1664}
1665
b040e242 1666#endif // HAVE_ICONV
36acb880 1667
e95354ec 1668
36acb880
VZ
1669// ============================================================================
1670// Win32 conversion classes
1671// ============================================================================
1cd52418 1672
e95354ec 1673#ifdef wxHAVE_WIN32_MB2WC
373658eb 1674
// helpers mapping wx charsets/encodings to Windows code pages, implemented in
// utils.cpp
#if wxUSE_FONTMAP
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
#endif // wxUSE_FONTMAP
373658eb 1680
e95354ec 1681class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1682{
1683public:
bde4baac
VZ
1684 wxMBConv_win32()
1685 {
1686 m_CodePage = CP_ACP;
eec47cc6 1687 m_nulLen = (size_t)-2;
bde4baac
VZ
1688 }
1689
7608a683 1690#if wxUSE_FONTMAP
e95354ec 1691 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
1692 {
1693 m_CodePage = wxCharsetToCodepage(name);
eec47cc6 1694 m_nulLen = (size_t)-2;
bde4baac 1695 }
dccce9ea 1696
e95354ec 1697 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
1698 {
1699 m_CodePage = wxEncodingToCodepage(encoding);
eec47cc6 1700 m_nulLen = (size_t)-2;
bde4baac 1701 }
eec47cc6 1702#endif // wxUSE_FONTMAP
8b04d4c4 1703
bde4baac 1704 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 1705 {
02272c9c
VZ
1706 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1707 // the behaviour is not compatible with the Unix version (using iconv)
1708 // and break the library itself, e.g. wxTextInputStream::NextChar()
1709 // wouldn't work if reading an incomplete MB char didn't result in an
1710 // error
667e5b3e
VZ
1711 //
1712 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1713 // an error (tested under Windows Server 2003) and apparently it is
1714 // done on purpose, i.e. the function accepts any input in this case
1715 // and although I'd prefer to return error on ill-formed output, our
1716 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1717 // explicitly ill-formed according to RFC 2152) neither so we don't
1718 // even have any fallback here...
89028980
VS
1719 //
1720 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1721 // Win XP or newer and if it is specified on older versions, conversion
1722 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1723 // fails. So we can only use the flag on newer Windows versions.
1724 // Additionally, the flag is not supported by UTF7, symbol and CJK
1725 // encodings. See here:
1726 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1727 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1728 int flags = 0;
1729 if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1730 m_CodePage < 50000 &&
1731 IsAtLeastWin2kSP4() )
1732 {
1733 flags = MB_ERR_INVALID_CHARS;
1734 }
1735 else if ( m_CodePage == CP_UTF8 )
1736 {
1737 // Avoid round-trip in the special case of UTF-8 by using our
1738 // own UTF-8 conversion code:
1739 return wxMBConvUTF8().MB2WC(buf, psz, n);
1740 }
667e5b3e 1741
2b5f62a0
VZ
1742 const size_t len = ::MultiByteToWideChar
1743 (
1744 m_CodePage, // code page
667e5b3e 1745 flags, // flags: fall on error
2b5f62a0
VZ
1746 psz, // input string
1747 -1, // its length (NUL-terminated)
b4da152e 1748 buf, // output string
2b5f62a0
VZ
1749 buf ? n : 0 // size of output buffer
1750 );
89028980
VS
1751 if ( !len )
1752 {
1753 // function totally failed
1754 return (size_t)-1;
1755 }
1756
1757 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1758 // check if we succeeded, by doing a double trip:
1759 if ( !flags && buf )
1760 {
53c174fc
VZ
1761 const size_t mbLen = strlen(psz);
1762 wxCharBuffer mbBuf(mbLen);
89028980
VS
1763 if ( ::WideCharToMultiByte
1764 (
1765 m_CodePage,
1766 0,
1767 buf,
1768 -1,
1769 mbBuf.data(),
53c174fc 1770 mbLen + 1, // size in bytes, not length
89028980
VS
1771 NULL,
1772 NULL
1773 ) == 0 ||
1774 strcmp(mbBuf, psz) != 0 )
1775 {
1776 // we didn't obtain the same thing we started from, hence
1777 // the conversion was lossy and we consider that it failed
1778 return (size_t)-1;
1779 }
1780 }
2b5f62a0 1781
03a991bc
VZ
1782 // note that it returns count of written chars for buf != NULL and size
1783 // of the needed buffer for buf == NULL so in either case the length of
1784 // the string (which never includes the terminating NUL) is one less
89028980 1785 return len - 1;
f1339c56 1786 }
dccce9ea 1787
13dd924a 1788 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 1789 {
13dd924a
VZ
1790 /*
1791 we have a problem here: by default, WideCharToMultiByte() may
1792 replace characters unrepresentable in the target code page with bad
1793 quality approximations such as turning "1/2" symbol (U+00BD) into
1794 "1" for the code pages which don't have it and we, obviously, want
1795 to avoid this at any price
d775fa82 1796
13dd924a
VZ
1797 the trouble is that this function does it _silently_, i.e. it won't
1798 even tell us whether it did or not... Win98/2000 and higher provide
1799 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1800 we have to resort to a round trip, i.e. check that converting back
1801 results in the same string -- this is, of course, expensive but
1802 otherwise we simply can't be sure to not garble the data.
1803 */
1804
1805 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1806 // it doesn't work with CJK encodings (which we test for rather roughly
1807 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1808 // supporting it
907173e5
WS
1809 BOOL usedDef wxDUMMY_INITIALIZE(false);
1810 BOOL *pUsedDef;
13dd924a
VZ
1811 int flags;
1812 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1813 {
1814 // it's our lucky day
1815 flags = WC_NO_BEST_FIT_CHARS;
1816 pUsedDef = &usedDef;
1817 }
1818 else // old system or unsupported encoding
1819 {
1820 flags = 0;
1821 pUsedDef = NULL;
1822 }
1823
2b5f62a0
VZ
1824 const size_t len = ::WideCharToMultiByte
1825 (
1826 m_CodePage, // code page
13dd924a
VZ
1827 flags, // either none or no best fit
1828 pwz, // input string
2b5f62a0
VZ
1829 -1, // it is (wide) NUL-terminated
1830 buf, // output buffer
1831 buf ? n : 0, // and its size
1832 NULL, // default "replacement" char
13dd924a 1833 pUsedDef // [out] was it used?
2b5f62a0
VZ
1834 );
1835
13dd924a
VZ
1836 if ( !len )
1837 {
1838 // function totally failed
1839 return (size_t)-1;
1840 }
1841
1842 // if we were really converting, check if we succeeded
1843 if ( buf )
1844 {
1845 if ( flags )
1846 {
1847 // check if the conversion failed, i.e. if any replacements
1848 // were done
1849 if ( usedDef )
1850 return (size_t)-1;
1851 }
1852 else // we must resort to double tripping...
1853 {
1854 wxWCharBuffer wcBuf(n);
1855 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1856 wcscmp(wcBuf, pwz) != 0 )
1857 {
1858 // we didn't obtain the same thing we started from, hence
1859 // the conversion was lossy and we consider that it failed
1860 return (size_t)-1;
1861 }
1862 }
1863 }
1864
03a991bc 1865 // see the comment above for the reason of "len - 1"
13dd924a 1866 return len - 1;
f1339c56 1867 }
dccce9ea 1868
13dd924a
VZ
1869 bool IsOk() const { return m_CodePage != -1; }
1870
1871private:
1872 static bool CanUseNoBestFit()
1873 {
1874 static int s_isWin98Or2k = -1;
1875
1876 if ( s_isWin98Or2k == -1 )
1877 {
1878 int verMaj, verMin;
1879 switch ( wxGetOsVersion(&verMaj, &verMin) )
1880 {
1881 case wxWIN95:
1882 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1883 break;
1884
1885 case wxWINDOWS_NT:
1886 s_isWin98Or2k = verMaj >= 5;
1887 break;
1888
1889 default:
1890 // unknown, be conseravtive by default
1891 s_isWin98Or2k = 0;
1892 }
1893
1894 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1895 }
1896
1897 return s_isWin98Or2k == 1;
1898 }
f1339c56 1899
89028980
VS
1900 static bool IsAtLeastWin2kSP4()
1901 {
8942f83a
WS
1902#ifdef __WXWINCE__
1903 return false;
1904#else
89028980
VS
1905 static int s_isAtLeastWin2kSP4 = -1;
1906
1907 if ( s_isAtLeastWin2kSP4 == -1 )
1908 {
1909 OSVERSIONINFOEX ver;
1910
1911 memset(&ver, 0, sizeof(ver));
1912 ver.dwOSVersionInfoSize = sizeof(ver);
1913 GetVersionEx((OSVERSIONINFO*)&ver);
1914
1915 s_isAtLeastWin2kSP4 =
1916 ((ver.dwMajorVersion > 5) || // Vista+
1917 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
1918 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
1919 ver.wServicePackMajor >= 4)) // 2000 SP4+
1920 ? 1 : 0;
1921 }
1922
1923 return s_isAtLeastWin2kSP4 == 1;
8942f83a 1924#endif
89028980
VS
1925 }
1926
eec47cc6
VZ
1927 virtual const char *GetMBNul(size_t *nulLen) const
1928 {
1929 if ( m_nulLen == (size_t)-2 )
1930 {
1931 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1932
1933 self->m_nulLen = ::WideCharToMultiByte
1934 (
1935 m_CodePage, // code page
1936 0, // no flags
1937 L"", // input string
1938 1, // translate just NUL
1939 self->m_nulBuf, // output buffer
1940 WXSIZEOF(m_nulBuf), // and its size
1941 NULL, // "replacement" char
1942 NULL // [out] was it used?
1943 );
1944
1945 if ( m_nulLen == 0 )
1946 self->m_nulLen = (size_t)-1;
1947 }
1948
1949 *nulLen = m_nulLen;
1950 return m_nulBuf;
1951 }
1952
b1d66b54 1953 long m_CodePage;
eec47cc6
VZ
1954 size_t m_nulLen;
1955 char m_nulBuf[8];
1cd52418 1956};
e95354ec
VZ
1957
1958#endif // wxHAVE_WIN32_MB2WC
1959
f7e98dee
RN
1960// ============================================================================
1961// Cocoa conversion classes
1962// ============================================================================
1963
1964#if defined(__WXCOCOA__)
1965
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1969
1970#include <CoreFoundation/CFString.h>
1971#include <CoreFoundation/CFStringEncodingExt.h>
1972
1973CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 1974{
638357a0 1975 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ecd9653b
WS
1976 if ( encoding == wxFONTENCODING_DEFAULT )
1977 {
638357a0 1978 enc = CFStringGetSystemEncoding();
ecd9653b
WS
1979 }
1980 else switch( encoding)
1981 {
1982 case wxFONTENCODING_ISO8859_1 :
1983 enc = kCFStringEncodingISOLatin1 ;
1984 break ;
1985 case wxFONTENCODING_ISO8859_2 :
1986 enc = kCFStringEncodingISOLatin2;
1987 break ;
1988 case wxFONTENCODING_ISO8859_3 :
1989 enc = kCFStringEncodingISOLatin3 ;
1990 break ;
1991 case wxFONTENCODING_ISO8859_4 :
1992 enc = kCFStringEncodingISOLatin4;
1993 break ;
1994 case wxFONTENCODING_ISO8859_5 :
1995 enc = kCFStringEncodingISOLatinCyrillic;
1996 break ;
1997 case wxFONTENCODING_ISO8859_6 :
1998 enc = kCFStringEncodingISOLatinArabic;
1999 break ;
2000 case wxFONTENCODING_ISO8859_7 :
2001 enc = kCFStringEncodingISOLatinGreek;
2002 break ;
2003 case wxFONTENCODING_ISO8859_8 :
2004 enc = kCFStringEncodingISOLatinHebrew;
2005 break ;
2006 case wxFONTENCODING_ISO8859_9 :
2007 enc = kCFStringEncodingISOLatin5;
2008 break ;
2009 case wxFONTENCODING_ISO8859_10 :
2010 enc = kCFStringEncodingISOLatin6;
2011 break ;
2012 case wxFONTENCODING_ISO8859_11 :
2013 enc = kCFStringEncodingISOLatinThai;
2014 break ;
2015 case wxFONTENCODING_ISO8859_13 :
2016 enc = kCFStringEncodingISOLatin7;
2017 break ;
2018 case wxFONTENCODING_ISO8859_14 :
2019 enc = kCFStringEncodingISOLatin8;
2020 break ;
2021 case wxFONTENCODING_ISO8859_15 :
2022 enc = kCFStringEncodingISOLatin9;
2023 break ;
2024
2025 case wxFONTENCODING_KOI8 :
2026 enc = kCFStringEncodingKOI8_R;
2027 break ;
2028 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2029 enc = kCFStringEncodingDOSRussian;
2030 break ;
2031
2032// case wxFONTENCODING_BULGARIAN :
2033// enc = ;
2034// break ;
2035
2036 case wxFONTENCODING_CP437 :
2037 enc =kCFStringEncodingDOSLatinUS ;
2038 break ;
2039 case wxFONTENCODING_CP850 :
2040 enc = kCFStringEncodingDOSLatin1;
2041 break ;
2042 case wxFONTENCODING_CP852 :
2043 enc = kCFStringEncodingDOSLatin2;
2044 break ;
2045 case wxFONTENCODING_CP855 :
2046 enc = kCFStringEncodingDOSCyrillic;
2047 break ;
2048 case wxFONTENCODING_CP866 :
2049 enc =kCFStringEncodingDOSRussian ;
2050 break ;
2051 case wxFONTENCODING_CP874 :
2052 enc = kCFStringEncodingDOSThai;
2053 break ;
2054 case wxFONTENCODING_CP932 :
2055 enc = kCFStringEncodingDOSJapanese;
2056 break ;
2057 case wxFONTENCODING_CP936 :
2058 enc =kCFStringEncodingDOSChineseSimplif ;
2059 break ;
2060 case wxFONTENCODING_CP949 :
2061 enc = kCFStringEncodingDOSKorean;
2062 break ;
2063 case wxFONTENCODING_CP950 :
2064 enc = kCFStringEncodingDOSChineseTrad;
2065 break ;
ecd9653b
WS
2066 case wxFONTENCODING_CP1250 :
2067 enc = kCFStringEncodingWindowsLatin2;
2068 break ;
2069 case wxFONTENCODING_CP1251 :
2070 enc =kCFStringEncodingWindowsCyrillic ;
2071 break ;
2072 case wxFONTENCODING_CP1252 :
2073 enc =kCFStringEncodingWindowsLatin1 ;
2074 break ;
2075 case wxFONTENCODING_CP1253 :
2076 enc = kCFStringEncodingWindowsGreek;
2077 break ;
2078 case wxFONTENCODING_CP1254 :
2079 enc = kCFStringEncodingWindowsLatin5;
2080 break ;
2081 case wxFONTENCODING_CP1255 :
2082 enc =kCFStringEncodingWindowsHebrew ;
2083 break ;
2084 case wxFONTENCODING_CP1256 :
2085 enc =kCFStringEncodingWindowsArabic ;
2086 break ;
2087 case wxFONTENCODING_CP1257 :
2088 enc = kCFStringEncodingWindowsBalticRim;
2089 break ;
638357a0
RN
2090// This only really encodes to UTF7 (if that) evidently
2091// case wxFONTENCODING_UTF7 :
2092// enc = kCFStringEncodingNonLossyASCII ;
2093// break ;
ecd9653b
WS
2094 case wxFONTENCODING_UTF8 :
2095 enc = kCFStringEncodingUTF8 ;
2096 break ;
2097 case wxFONTENCODING_EUC_JP :
2098 enc = kCFStringEncodingEUC_JP;
2099 break ;
2100 case wxFONTENCODING_UTF16 :
f7e98dee 2101 enc = kCFStringEncodingUnicode ;
ecd9653b 2102 break ;
f7e98dee
RN
2103 case wxFONTENCODING_MACROMAN :
2104 enc = kCFStringEncodingMacRoman ;
2105 break ;
2106 case wxFONTENCODING_MACJAPANESE :
2107 enc = kCFStringEncodingMacJapanese ;
2108 break ;
2109 case wxFONTENCODING_MACCHINESETRAD :
2110 enc = kCFStringEncodingMacChineseTrad ;
2111 break ;
2112 case wxFONTENCODING_MACKOREAN :
2113 enc = kCFStringEncodingMacKorean ;
2114 break ;
2115 case wxFONTENCODING_MACARABIC :
2116 enc = kCFStringEncodingMacArabic ;
2117 break ;
2118 case wxFONTENCODING_MACHEBREW :
2119 enc = kCFStringEncodingMacHebrew ;
2120 break ;
2121 case wxFONTENCODING_MACGREEK :
2122 enc = kCFStringEncodingMacGreek ;
2123 break ;
2124 case wxFONTENCODING_MACCYRILLIC :
2125 enc = kCFStringEncodingMacCyrillic ;
2126 break ;
2127 case wxFONTENCODING_MACDEVANAGARI :
2128 enc = kCFStringEncodingMacDevanagari ;
2129 break ;
2130 case wxFONTENCODING_MACGURMUKHI :
2131 enc = kCFStringEncodingMacGurmukhi ;
2132 break ;
2133 case wxFONTENCODING_MACGUJARATI :
2134 enc = kCFStringEncodingMacGujarati ;
2135 break ;
2136 case wxFONTENCODING_MACORIYA :
2137 enc = kCFStringEncodingMacOriya ;
2138 break ;
2139 case wxFONTENCODING_MACBENGALI :
2140 enc = kCFStringEncodingMacBengali ;
2141 break ;
2142 case wxFONTENCODING_MACTAMIL :
2143 enc = kCFStringEncodingMacTamil ;
2144 break ;
2145 case wxFONTENCODING_MACTELUGU :
2146 enc = kCFStringEncodingMacTelugu ;
2147 break ;
2148 case wxFONTENCODING_MACKANNADA :
2149 enc = kCFStringEncodingMacKannada ;
2150 break ;
2151 case wxFONTENCODING_MACMALAJALAM :
2152 enc = kCFStringEncodingMacMalayalam ;
2153 break ;
2154 case wxFONTENCODING_MACSINHALESE :
2155 enc = kCFStringEncodingMacSinhalese ;
2156 break ;
2157 case wxFONTENCODING_MACBURMESE :
2158 enc = kCFStringEncodingMacBurmese ;
2159 break ;
2160 case wxFONTENCODING_MACKHMER :
2161 enc = kCFStringEncodingMacKhmer ;
2162 break ;
2163 case wxFONTENCODING_MACTHAI :
2164 enc = kCFStringEncodingMacThai ;
2165 break ;
2166 case wxFONTENCODING_MACLAOTIAN :
2167 enc = kCFStringEncodingMacLaotian ;
2168 break ;
2169 case wxFONTENCODING_MACGEORGIAN :
2170 enc = kCFStringEncodingMacGeorgian ;
2171 break ;
2172 case wxFONTENCODING_MACARMENIAN :
2173 enc = kCFStringEncodingMacArmenian ;
2174 break ;
2175 case wxFONTENCODING_MACCHINESESIMP :
2176 enc = kCFStringEncodingMacChineseSimp ;
2177 break ;
2178 case wxFONTENCODING_MACTIBETAN :
2179 enc = kCFStringEncodingMacTibetan ;
2180 break ;
2181 case wxFONTENCODING_MACMONGOLIAN :
2182 enc = kCFStringEncodingMacMongolian ;
2183 break ;
2184 case wxFONTENCODING_MACETHIOPIC :
2185 enc = kCFStringEncodingMacEthiopic ;
2186 break ;
2187 case wxFONTENCODING_MACCENTRALEUR :
2188 enc = kCFStringEncodingMacCentralEurRoman ;
2189 break ;
2190 case wxFONTENCODING_MACVIATNAMESE :
2191 enc = kCFStringEncodingMacVietnamese ;
2192 break ;
2193 case wxFONTENCODING_MACARABICEXT :
2194 enc = kCFStringEncodingMacExtArabic ;
2195 break ;
2196 case wxFONTENCODING_MACSYMBOL :
2197 enc = kCFStringEncodingMacSymbol ;
2198 break ;
2199 case wxFONTENCODING_MACDINGBATS :
2200 enc = kCFStringEncodingMacDingbats ;
2201 break ;
2202 case wxFONTENCODING_MACTURKISH :
2203 enc = kCFStringEncodingMacTurkish ;
2204 break ;
2205 case wxFONTENCODING_MACCROATIAN :
2206 enc = kCFStringEncodingMacCroatian ;
2207 break ;
2208 case wxFONTENCODING_MACICELANDIC :
2209 enc = kCFStringEncodingMacIcelandic ;
2210 break ;
2211 case wxFONTENCODING_MACROMANIAN :
2212 enc = kCFStringEncodingMacRomanian ;
2213 break ;
2214 case wxFONTENCODING_MACCELTIC :
2215 enc = kCFStringEncodingMacCeltic ;
2216 break ;
2217 case wxFONTENCODING_MACGAELIC :
2218 enc = kCFStringEncodingMacGaelic ;
2219 break ;
ecd9653b
WS
2220// case wxFONTENCODING_MACKEYBOARD :
2221// enc = kCFStringEncodingMacKeyboardGlyphs ;
2222// break ;
2223 default :
2224 // because gcc is picky
2225 break ;
2226 } ;
2227 return enc ;
f7e98dee
RN
2228}
2229
f7e98dee
RN
2230class wxMBConv_cocoa : public wxMBConv
2231{
2232public:
2233 wxMBConv_cocoa()
2234 {
2235 Init(CFStringGetSystemEncoding()) ;
2236 }
2237
a6900d10 2238#if wxUSE_FONTMAP
f7e98dee
RN
2239 wxMBConv_cocoa(const wxChar* name)
2240 {
267e11c5 2241 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
f7e98dee 2242 }
a6900d10 2243#endif
f7e98dee
RN
2244
2245 wxMBConv_cocoa(wxFontEncoding encoding)
2246 {
2247 Init( wxCFStringEncFromFontEnc(encoding) );
2248 }
2249
2250 ~wxMBConv_cocoa()
2251 {
2252 }
2253
2254 void Init( CFStringEncoding encoding)
2255 {
638357a0 2256 m_encoding = encoding ;
f7e98dee
RN
2257 }
2258
2259 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2260 {
2261 wxASSERT(szUnConv);
ecd9653b 2262
638357a0
RN
2263 CFStringRef theString = CFStringCreateWithBytes (
2264 NULL, //the allocator
2265 (const UInt8*)szUnConv,
2266 strlen(szUnConv),
2267 m_encoding,
2268 false //no BOM/external representation
f7e98dee
RN
2269 );
2270
2271 wxASSERT(theString);
2272
638357a0
RN
2273 size_t nOutLength = CFStringGetLength(theString);
2274
2275 if (szOut == NULL)
f7e98dee 2276 {
f7e98dee 2277 CFRelease(theString);
638357a0 2278 return nOutLength;
f7e98dee 2279 }
ecd9653b 2280
638357a0 2281 CFRange theRange = { 0, nOutSize };
ecd9653b 2282
638357a0
RN
2283#if SIZEOF_WCHAR_T == 4
2284 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2285#endif
3698ae71 2286
f7e98dee 2287 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
3698ae71 2288
f7e98dee 2289 CFRelease(theString);
ecd9653b 2290
638357a0 2291 szUniCharBuffer[nOutLength] = '\0' ;
f7e98dee
RN
2292
2293#if SIZEOF_WCHAR_T == 4
2294 wxMBConvUTF16 converter ;
638357a0 2295 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
f7e98dee
RN
2296 delete[] szUniCharBuffer;
2297#endif
3698ae71 2298
638357a0 2299 return nOutLength;
f7e98dee
RN
2300 }
2301
2302 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2303 {
638357a0 2304 wxASSERT(szUnConv);
3698ae71 2305
f7e98dee 2306 size_t nRealOutSize;
638357a0 2307 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 2308 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 2309
f7e98dee 2310#if SIZEOF_WCHAR_T == 4
d9d488cf 2311 wxMBConvUTF16 converter ;
f7e98dee
RN
2312 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2313 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2314 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2315 nBufSize /= sizeof(UniChar);
f7e98dee
RN
2316#endif
2317
2318 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2319 NULL, //allocator
2320 szUniBuffer,
2321 nBufSize,
638357a0 2322 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 2323 );
ecd9653b 2324
f7e98dee 2325 wxASSERT(theString);
ecd9653b 2326
f7e98dee 2327 //Note that CER puts a BOM when converting to unicode
638357a0
RN
2328 //so we check and use getchars instead in that case
2329 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 2330 {
638357a0
RN
2331 if (szOut != NULL)
2332 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
3698ae71 2333
638357a0
RN
2334 nRealOutSize = CFStringGetLength(theString) + 1;
2335 }
2336 else
2337 {
2338 CFStringGetBytes(
2339 theString,
2340 CFRangeMake(0, CFStringGetLength(theString)),
2341 m_encoding,
2342 0, //what to put in characters that can't be converted -
2343 //0 tells CFString to return NULL if it meets such a character
2344 false, //not an external representation
2345 (UInt8*) szOut,
3698ae71 2346 nOutSize,
638357a0
RN
2347 (CFIndex*) &nRealOutSize
2348 );
f7e98dee 2349 }
ecd9653b 2350
638357a0 2351 CFRelease(theString);
ecd9653b 2352
638357a0
RN
2353#if SIZEOF_WCHAR_T == 4
2354 delete[] szUniBuffer;
2355#endif
ecd9653b 2356
f7e98dee
RN
2357 return nRealOutSize - 1;
2358 }
2359
2360 bool IsOk() const
ecd9653b 2361 {
3698ae71 2362 return m_encoding != kCFStringEncodingInvalidId &&
638357a0 2363 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
2364 }
2365
2366private:
638357a0 2367 CFStringEncoding m_encoding ;
f7e98dee
RN
2368};
2369
2370#endif // defined(__WXCOCOA__)
2371
335d31e0
SC
2372// ============================================================================
2373// Mac conversion classes
2374// ============================================================================
2375
2376#if defined(__WXMAC__) && defined(TARGET_CARBON)
2377
2378class wxMBConv_mac : public wxMBConv
2379{
2380public:
2381 wxMBConv_mac()
2382 {
2383 Init(CFStringGetSystemEncoding()) ;
2384 }
2385
2d1659cf 2386#if wxUSE_FONTMAP
335d31e0
SC
2387 wxMBConv_mac(const wxChar* name)
2388 {
267e11c5 2389 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
335d31e0 2390 }
2d1659cf 2391#endif
335d31e0
SC
2392
2393 wxMBConv_mac(wxFontEncoding encoding)
2394 {
d775fa82
WS
2395 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2396 }
2397
2398 ~wxMBConv_mac()
2399 {
2400 OSStatus status = noErr ;
2401 status = TECDisposeConverter(m_MB2WC_converter);
2402 status = TECDisposeConverter(m_WC2MB_converter);
2403 }
2404
2405
2406 void Init( TextEncodingBase encoding)
2407 {
2408 OSStatus status = noErr ;
2409 m_char_encoding = encoding ;
2410 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2411
2412 status = TECCreateConverter(&m_MB2WC_converter,
2413 m_char_encoding,
2414 m_unicode_encoding);
2415 status = TECCreateConverter(&m_WC2MB_converter,
2416 m_unicode_encoding,
2417 m_char_encoding);
2418 }
2419
335d31e0
SC
2420 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2421 {
d775fa82
WS
2422 OSStatus status = noErr ;
2423 ByteCount byteOutLen ;
2424 ByteCount byteInLen = strlen(psz) ;
2425 wchar_t *tbuf = NULL ;
2426 UniChar* ubuf = NULL ;
2427 size_t res = 0 ;
2428
2429 if (buf == NULL)
2430 {
638357a0 2431 //apple specs say at least 32
c543817b 2432 n = wxMax( 32 , byteInLen ) ;
d775fa82
WS
2433 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2434 }
2435 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
f3a355ce 2436#if SIZEOF_WCHAR_T == 4
d775fa82 2437 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2438#else
d775fa82 2439 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2440#endif
d775fa82
WS
2441 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2442 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
f3a355ce 2443#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2444 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2445 // is not properly terminated we get random characters at the end
2446 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d9d488cf 2447 wxMBConvUTF16 converter ;
d775fa82
WS
2448 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2449 free( ubuf ) ;
f3a355ce 2450#else
d775fa82 2451 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2452#endif
d775fa82
WS
2453 if ( buf == NULL )
2454 free(tbuf) ;
335d31e0 2455
335d31e0
SC
2456 if ( buf && res < n)
2457 buf[res] = 0;
2458
d775fa82 2459 return res ;
335d31e0
SC
2460 }
2461
2462 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2463 {
2464 OSStatus status = noErr ;
2465 ByteCount byteOutLen ;
2466 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2467
2468 char *tbuf = NULL ;
2469
2470 if (buf == NULL)
2471 {
638357a0 2472 //apple specs say at least 32
c543817b 2473 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2474 tbuf = (char*) malloc( n ) ;
2475 }
2476
2477 ByteCount byteBufferLen = n ;
2478 UniChar* ubuf = NULL ;
f3a355ce 2479#if SIZEOF_WCHAR_T == 4
d9d488cf 2480 wxMBConvUTF16 converter ;
d775fa82
WS
2481 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2482 byteInLen = unicharlen ;
2483 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2484 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
f3a355ce 2485#else
d775fa82 2486 ubuf = (UniChar*) psz ;
f3a355ce 2487#endif
d775fa82
WS
2488 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2489 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
f3a355ce 2490#if SIZEOF_WCHAR_T == 4
d775fa82 2491 free( ubuf ) ;
f3a355ce 2492#endif
d775fa82
WS
2493 if ( buf == NULL )
2494 free(tbuf) ;
335d31e0 2495
d775fa82 2496 size_t res = byteOutLen ;
335d31e0 2497 if ( buf && res < n)
638357a0 2498 {
335d31e0 2499 buf[res] = 0;
3698ae71 2500
638357a0
RN
2501 //we need to double-trip to verify it didn't insert any ? in place
2502 //of bogus characters
2503 wxWCharBuffer wcBuf(n);
2504 size_t pszlen = wxWcslen(psz);
2505 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2506 wxWcslen(wcBuf) != pszlen ||
2507 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2508 {
2509 // we didn't obtain the same thing we started from, hence
2510 // the conversion was lossy and we consider that it failed
2511 return (size_t)-1;
2512 }
2513 }
335d31e0 2514
d775fa82 2515 return res ;
335d31e0
SC
2516 }
2517
2518 bool IsOk() const
2519 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2520
2521private:
d775fa82
WS
2522 TECObjectRef m_MB2WC_converter ;
2523 TECObjectRef m_WC2MB_converter ;
2524
2525 TextEncodingBase m_char_encoding ;
2526 TextEncodingBase m_unicode_encoding ;
335d31e0
SC
2527};
2528
2529#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2530
36acb880
VZ
2531// ============================================================================
2532// wxEncodingConverter based conversion classes
2533// ============================================================================
2534
1e6feb95 2535#if wxUSE_FONTMAP
1cd52418 2536
e95354ec 2537class wxMBConv_wxwin : public wxMBConv
1cd52418 2538{
8b04d4c4
VZ
2539private:
2540 void Init()
2541 {
2542 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2543 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2544 }
2545
6001e347 2546public:
f1339c56
RR
2547 // temporarily just use wxEncodingConverter stuff,
2548 // so that it works while a better implementation is built
e95354ec 2549 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2550 {
2551 if (name)
267e11c5 2552 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2553 else
2554 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2555
8b04d4c4
VZ
2556 Init();
2557 }
2558
e95354ec 2559 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2560 {
2561 m_enc = enc;
2562
2563 Init();
f1339c56 2564 }
dccce9ea 2565
bde4baac 2566 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2567 {
2568 size_t inbuf = strlen(psz);
dccce9ea 2569 if (buf)
c643a977
VS
2570 {
2571 if (!m2w.Convert(psz,buf))
2572 return (size_t)-1;
2573 }
f1339c56
RR
2574 return inbuf;
2575 }
dccce9ea 2576
bde4baac 2577 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2578 {
f8d791e0 2579 const size_t inbuf = wxWcslen(psz);
f1339c56 2580 if (buf)
c643a977
VS
2581 {
2582 if (!w2m.Convert(psz,buf))
2583 return (size_t)-1;
2584 }
dccce9ea 2585
f1339c56
RR
2586 return inbuf;
2587 }
dccce9ea 2588
e95354ec 2589 bool IsOk() const { return m_ok; }
f1339c56
RR
2590
2591public:
8b04d4c4 2592 wxFontEncoding m_enc;
f1339c56 2593 wxEncodingConverter m2w, w2m;
cafbf6fb 2594
eec47cc6
VZ
2595private:
2596 virtual const char *GetMBNul(size_t *nulLen) const
2597 {
2598 switch ( m_enc )
2599 {
2600 case wxFONTENCODING_UTF16BE:
2601 case wxFONTENCODING_UTF16LE:
2602 *nulLen = 2;
2603 return "\0";
2604
2605 case wxFONTENCODING_UTF32BE:
2606 case wxFONTENCODING_UTF32LE:
2607 *nulLen = 4;
2608 return "\0\0\0";
2609
2610 default:
2611 *nulLen = 1;
2612 return "";
2613 }
2614 }
2615
cafbf6fb
VZ
2616 // were we initialized successfully?
2617 bool m_ok;
fc7a2a60 2618
e95354ec 2619 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2620};
6001e347 2621
8f115891
MW
2622// make the constructors available for unit testing
2623WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2624{
2625 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2626 if ( !result->IsOk() )
2627 {
2628 delete result;
2629 return 0;
2630 }
2631 return result;
2632}
2633
1e6feb95
VZ
2634#endif // wxUSE_FONTMAP
2635
36acb880
VZ
2636// ============================================================================
2637// wxCSConv implementation
2638// ============================================================================
2639
8b04d4c4 2640void wxCSConv::Init()
6001e347 2641{
e95354ec
VZ
2642 m_name = NULL;
2643 m_convReal = NULL;
2644 m_deferred = true;
2645}
2646
8b04d4c4
VZ
2647wxCSConv::wxCSConv(const wxChar *charset)
2648{
2649 Init();
82713003 2650
e95354ec
VZ
2651 if ( charset )
2652 {
e95354ec
VZ
2653 SetName(charset);
2654 }
bda3d86a 2655
e4277538
VZ
2656#if wxUSE_FONTMAP
2657 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2658#else
bda3d86a 2659 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2660#endif
6001e347
RR
2661}
2662
8b04d4c4
VZ
2663wxCSConv::wxCSConv(wxFontEncoding encoding)
2664{
bda3d86a 2665 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2666 {
2667 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2668
2669 encoding = wxFONTENCODING_SYSTEM;
2670 }
2671
8b04d4c4
VZ
2672 Init();
2673
bda3d86a 2674 m_encoding = encoding;
8b04d4c4
VZ
2675}
2676
6001e347
RR
2677wxCSConv::~wxCSConv()
2678{
65e50848
JS
2679 Clear();
2680}
2681
54380f29 2682wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2683 : wxMBConv()
54380f29 2684{
8b04d4c4
VZ
2685 Init();
2686
54380f29 2687 SetName(conv.m_name);
8b04d4c4 2688 m_encoding = conv.m_encoding;
54380f29
GD
2689}
2690
2691wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2692{
2693 Clear();
8b04d4c4 2694
54380f29 2695 SetName(conv.m_name);
8b04d4c4
VZ
2696 m_encoding = conv.m_encoding;
2697
54380f29
GD
2698 return *this;
2699}
2700
65e50848
JS
2701void wxCSConv::Clear()
2702{
8b04d4c4 2703 free(m_name);
e95354ec 2704 delete m_convReal;
8b04d4c4 2705
65e50848 2706 m_name = NULL;
e95354ec 2707 m_convReal = NULL;
6001e347
RR
2708}
2709
2710void wxCSConv::SetName(const wxChar *charset)
2711{
f1339c56
RR
2712 if (charset)
2713 {
2714 m_name = wxStrdup(charset);
e95354ec 2715 m_deferred = true;
f1339c56 2716 }
6001e347
RR
2717}
2718
8b3eb85d
VZ
// Cache used by wxCSConv::DoCreate() when trying iconv: it maps a font
// encoding to the charset name for which an iconv converter could be
// created, or to an empty string if none of the encoding names worked.
2719#if wxUSE_FONTMAP
2720#include "wx/hashmap.h"
2721
2722WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3f5c62f9 2723 wxEncodingNameCache );
8b3eb85d
VZ
2724
// the only instance of the cache, private to this file
2725static wxEncodingNameCache gs_nameCache;
2726#endif
2727
e95354ec
VZ
2728wxMBConv *wxCSConv::DoCreate() const
2729{
ce6f8d6f
VZ
2730#if wxUSE_FONTMAP
2731 wxLogTrace(TRACE_STRCONV,
2732 wxT("creating conversion for %s"),
2733 (m_name ? m_name
2734 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2735#endif // wxUSE_FONTMAP
2736
c547282d
VZ
2737 // check for the special case of ASCII or ISO8859-1 charset: as we have
2738 // special knowledge of it anyhow, we don't need to create a special
2739 // conversion object
e4277538
VZ
2740 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2741 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2742 {
e95354ec
VZ
2743 // don't convert at all
2744 return NULL;
2745 }
dccce9ea 2746
e95354ec
VZ
2747 // we trust OS to do conversion better than we can so try external
2748 // conversion methods first
2749 //
2750 // the full order is:
2751 // 1. OS conversion (iconv() under Unix or Win32 API)
2752 // 2. hard coded conversions for UTF
2753 // 3. wxEncodingConverter as fall back
2754
2755 // step (1)
2756#ifdef HAVE_ICONV
c547282d 2757#if !wxUSE_FONTMAP
e95354ec 2758 if ( m_name )
c547282d 2759#endif // !wxUSE_FONTMAP
e95354ec 2760 {
c547282d 2761 wxString name(m_name);
8b3eb85d
VZ
2762 wxFontEncoding encoding(m_encoding);
2763
2764 if ( !name.empty() )
2765 {
2766 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2767 if ( conv->IsOk() )
2768 return conv;
2769
2770 delete conv;
c547282d
VZ
2771
2772#if wxUSE_FONTMAP
8b3eb85d
VZ
2773 encoding =
2774 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
c547282d 2775#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2776 }
2777#if wxUSE_FONTMAP
2778 {
2779 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2780 if ( it != gs_nameCache.end() )
2781 {
2782 if ( it->second.empty() )
2783 return NULL;
c547282d 2784
8b3eb85d
VZ
2785 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2786 if ( conv->IsOk() )
2787 return conv;
e95354ec 2788
8b3eb85d
VZ
2789 delete conv;
2790 }
2791
2792 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2793
2794 for ( ; *names; ++names )
2795 {
2796 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2797 if ( conv->IsOk() )
2798 {
2799 gs_nameCache[encoding] = *names;
2800 return conv;
2801 }
2802
2803 delete conv;
2804 }
2805
40711af8 2806 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d
VZ
2807 }
2808#endif // wxUSE_FONTMAP
e95354ec
VZ
2809 }
2810#endif // HAVE_ICONV
2811
2812#ifdef wxHAVE_WIN32_MB2WC
2813 {
7608a683 2814#if wxUSE_FONTMAP
e95354ec
VZ
2815 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2816 : new wxMBConv_win32(m_encoding);
2817 if ( conv->IsOk() )
2818 return conv;
2819
2820 delete conv;
7608a683
WS
2821#else
2822 return NULL;
2823#endif
e95354ec
VZ
2824 }
2825#endif // wxHAVE_WIN32_MB2WC
d775fa82
WS
2826#if defined(__WXMAC__)
2827 {
5c3c8676 2828 // leave UTF16 and UTF32 to the built-ins of wx
3698ae71 2829 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
5c3c8676 2830 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
d775fa82
WS
2831 {
2832
2d1659cf 2833#if wxUSE_FONTMAP
d775fa82
WS
2834 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2835 : new wxMBConv_mac(m_encoding);
2d1659cf
RN
2836#else
2837 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2838#endif
d775fa82 2839 if ( conv->IsOk() )
f7e98dee
RN
2840 return conv;
2841
2842 delete conv;
2843 }
2844 }
2845#endif
2846#if defined(__WXCOCOA__)
2847 {
2848 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2849 {
2850
a6900d10 2851#if wxUSE_FONTMAP
f7e98dee
RN
2852 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2853 : new wxMBConv_cocoa(m_encoding);
a6900d10
RN
2854#else
2855 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2856#endif
f7e98dee 2857 if ( conv->IsOk() )
d775fa82
WS
2858 return conv;
2859
2860 delete conv;
2861 }
335d31e0
SC
2862 }
2863#endif
e95354ec
VZ
2864 // step (2)
2865 wxFontEncoding enc = m_encoding;
2866#if wxUSE_FONTMAP
c547282d
VZ
2867 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2868 {
2869 // use "false" to suppress interactive dialogs -- we can be called from
2870 // anywhere and popping up a dialog from here is the last thing we want to
2871 // do
267e11c5 2872 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2873 }
e95354ec
VZ
2874#endif // wxUSE_FONTMAP
2875
2876 switch ( enc )
2877 {
2878 case wxFONTENCODING_UTF7:
2879 return new wxMBConvUTF7;
2880
2881 case wxFONTENCODING_UTF8:
2882 return new wxMBConvUTF8;
2883
e95354ec
VZ
2884 case wxFONTENCODING_UTF16BE:
2885 return new wxMBConvUTF16BE;
2886
2887 case wxFONTENCODING_UTF16LE:
2888 return new wxMBConvUTF16LE;
2889
e95354ec
VZ
2890 case wxFONTENCODING_UTF32BE:
2891 return new wxMBConvUTF32BE;
2892
2893 case wxFONTENCODING_UTF32LE:
2894 return new wxMBConvUTF32LE;
2895
2896 default:
2897 // nothing to do but put here to suppress gcc warnings
2898 ;
2899 }
2900
2901 // step (3)
2902#if wxUSE_FONTMAP
2903 {
2904 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2905 : new wxMBConv_wxwin(m_encoding);
2906 if ( conv->IsOk() )
2907 return conv;
2908
2909 delete conv;
2910 }
2911#endif // wxUSE_FONTMAP
2912
a58d4f4d
VS
2913 // NB: This is a hack to prevent deadlock. What could otherwise happen
2914 // in Unicode build: wxConvLocal creation ends up being here
2915 // because of some failure and logs the error. But wxLog will try to
2916 // attach timestamp, for which it will need wxConvLocal (to convert
2917 // time to char* and then wchar_t*), but that fails, tries to log
2918 // error, but wxLog has a (already locked) critical section that
2919 // guards static buffer.
2920 static bool alreadyLoggingError = false;
2921 if (!alreadyLoggingError)
2922 {
2923 alreadyLoggingError = true;
2924 wxLogError(_("Cannot convert from the charset '%s'!"),
2925 m_name ? m_name
e95354ec
VZ
2926 :
2927#if wxUSE_FONTMAP
267e11c5 2928 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
e95354ec
VZ
2929#else // !wxUSE_FONTMAP
2930 wxString::Format(_("encoding %s"), m_encoding).c_str()
2931#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2932 );
a58d4f4d
VS
2933 alreadyLoggingError = false;
2934 }
e95354ec
VZ
2935
2936 return NULL;
2937}
2938
2939void wxCSConv::CreateConvIfNeeded() const
2940{
2941 if ( m_deferred )
2942 {
2943 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
2944
2945#if wxUSE_INTL
2946 // if we don't have neither the name nor the encoding, use the default
2947 // encoding for this system
2948 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2949 {
4d312c22 2950 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
2951 }
2952#endif // wxUSE_INTL
2953
e95354ec
VZ
2954 self->m_convReal = DoCreate();
2955 self->m_deferred = false;
6001e347 2956 }
6001e347
RR
2957}
2958
2959size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2960{
e95354ec 2961 CreateConvIfNeeded();
dccce9ea 2962
e95354ec
VZ
2963 if (m_convReal)
2964 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
2965
2966 // latin-1 (direct)
4def3b35 2967 size_t len = strlen(psz);
dccce9ea 2968
f1339c56
RR
2969 if (buf)
2970 {
4def3b35 2971 for (size_t c = 0; c <= len; c++)
f1339c56
RR
2972 buf[c] = (unsigned char)(psz[c]);
2973 }
dccce9ea 2974
f1339c56 2975 return len;
6001e347
RR
2976}
2977
2978size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2979{
e95354ec 2980 CreateConvIfNeeded();
dccce9ea 2981
e95354ec
VZ
2982 if (m_convReal)
2983 return m_convReal->WC2MB(buf, psz, n);
1cd52418 2984
f1339c56 2985 // latin-1 (direct)
f8d791e0 2986 const size_t len = wxWcslen(psz);
f1339c56
RR
2987 if (buf)
2988 {
4def3b35 2989 for (size_t c = 0; c <= len; c++)
24642831
VS
2990 {
2991 if (psz[c] > 0xFF)
2992 return (size_t)-1;
907173e5 2993 buf[c] = (char)psz[c];
24642831
VS
2994 }
2995 }
2996 else
2997 {
2998 for (size_t c = 0; c <= len; c++)
2999 {
3000 if (psz[c] > 0xFF)
3001 return (size_t)-1;
3002 }
f1339c56 3003 }
dccce9ea 3004
f1339c56 3005 return len;
6001e347
RR
3006}
3007
eec47cc6
VZ
3008const char *wxCSConv::GetMBNul(size_t *nulLen) const
3009{
3010 CreateConvIfNeeded();
3011
3012 if ( m_convReal )
3013 {
3014 // cast needed just to call private function of m_convReal
3015 return ((wxCSConv *)m_convReal)->GetMBNul(nulLen);
3016 }
3017
3018 *nulLen = 1;
3019 return "";
3020}
3021
bde4baac
VZ
3022// ----------------------------------------------------------------------------
3023// globals
3024// ----------------------------------------------------------------------------
3025
// The object behind wxConvLibc: its class depends on the platform, the
// libc-based wxMBConvLibc being used when neither of the first two
// conditions holds.
3026#ifdef __WINDOWS__
3027 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
3028#elif defined(__WXMAC__) && !defined(__MACH__)
3029 static wxMBConv_mac wxConvLibcObj ;
bde4baac 3030#else
dcc8fac0 3031 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
3032#endif
3033
// the objects behind the other predefined conversions
3034static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3035static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3036static wxMBConvUTF7 wxConvUTF7Obj;
3037static wxMBConvUTF8 wxConvUTF8Obj;
c12b7f79 3038
bde4baac
VZ
// the exported globals are references (or pointers) to the static objects
// defined above
3039WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3040WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3041WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3042WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3043WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3044WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
f5a1953b
VZ
// wxConvFileName points to the UTF-8 converter when __WXOSX__ is defined
// and to the libc one otherwise
3045WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3046#ifdef __WXOSX__
ea8ce907 3047 wxConvUTF8Obj;
f5a1953b 3048#else
ea8ce907 3049 wxConvLibcObj;
f5a1953b
VZ
3050#endif
3051
bde4baac
VZ
3052
3053#else // !wxUSE_WCHAR_T
3054
3055// stand-ins in absence of wchar_t
3056WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3057 wxConvISO8859_1,
3058 wxConvLocal,
3059 wxConvUTF8;
3060
3061#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T