]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
really pass the proper output buffer size in wxMBConv_win32 round trip check, at...
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
f6bcfd97
BP
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
6001e347
RR
23// For compilers that support precompilation, includes "wx.h".
24#include "wx/wxprec.h"
25
26#ifdef __BORLANDC__
27 #pragma hdrstop
28#endif
29
373658eb
VZ
30#ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33#endif // WX_PRECOMP
34
bde4baac
VZ
35#include "wx/strconv.h"
36
37#if wxUSE_WCHAR_T
38
7608a683 39#ifdef __WINDOWS__
532d575b 40 #include "wx/msw/private.h"
13dd924a 41 #include "wx/msw/missing.h"
0a1c1e62
GRG
42#endif
43
1c193821 44#ifndef __WXWINCE__
1cd52418 45#include <errno.h>
1c193821
JS
46#endif
47
6001e347
RR
48#include <ctype.h>
49#include <string.h>
50#include <stdlib.h>
51
e95354ec
VZ
52#if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54#endif // __WIN32__ but !__WXMICROWIN__
55
6001e347 56#ifdef __SALFORDC__
373658eb 57 #include <clib.h>
6001e347
RR
58#endif
59
b040e242 60#ifdef HAVE_ICONV
373658eb 61 #include <iconv.h>
b1d547eb 62 #include "wx/thread.h"
1cd52418 63#endif
1cd52418 64
373658eb
VZ
65#include "wx/encconv.h"
66#include "wx/fontmap.h"
7608a683 67#include "wx/utils.h"
373658eb 68
335d31e0 69#ifdef __WXMAC__
40ba2f3b 70#ifndef __DARWIN__
4227afa4
SC
71#include <ATSUnicode.h>
72#include <TextCommon.h>
73#include <TextEncodingConverter.h>
40ba2f3b 74#endif
335d31e0
SC
75
76#include "wx/mac/private.h" // includes mac headers
77#endif
ce6f8d6f
VZ
78
79#define TRACE_STRCONV _T("strconv")
80
4948c2b6 81#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
82 #define WC_UTF16
83#endif
84
373658eb
VZ
85// ============================================================================
86// implementation
87// ============================================================================
88
89// ----------------------------------------------------------------------------
c91830cb 90// UTF-16 en/decoding to/from UCS-4
373658eb 91// ----------------------------------------------------------------------------
6001e347 92
b0a6bb75 93
c91830cb 94static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 95{
dccce9ea 96 if (input<=0xffff)
4def3b35 97 {
999836aa
VZ
98 if (output)
99 *output = (wxUint16) input;
4def3b35 100 return 1;
dccce9ea
VZ
101 }
102 else if (input>=0x110000)
4def3b35
VS
103 {
104 return (size_t)-1;
dccce9ea
VZ
105 }
106 else
4def3b35 107 {
dccce9ea 108 if (output)
4def3b35 109 {
c91830cb 110 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
999836aa 111 *output = (wxUint16) ((input&0x3ff)+0xdc00);
4def3b35
VS
112 }
113 return 2;
1cd52418 114 }
1cd52418
OK
115}
116
c91830cb 117static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 118{
dccce9ea 119 if ((*input<0xd800) || (*input>0xdfff))
4def3b35
VS
120 {
121 output = *input;
122 return 1;
dccce9ea 123 }
cdb14ecb 124 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
4def3b35
VS
125 {
126 output = *input;
127 return (size_t)-1;
dccce9ea
VZ
128 }
129 else
4def3b35
VS
130 {
131 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
132 return 2;
133 }
1cd52418
OK
134}
135
b0a6bb75 136
f6bcfd97 137// ----------------------------------------------------------------------------
6001e347 138// wxMBConv
f6bcfd97 139// ----------------------------------------------------------------------------
2c53a80a
WS
140
141wxMBConv::~wxMBConv()
142{
143 // nothing to do here (necessary for Darwin linking probably)
144}
6001e347 145
6001e347
RR
146const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
147{
2b5f62a0 148 if ( psz )
6001e347 149 {
2b5f62a0
VZ
150 // calculate the length of the buffer needed first
151 size_t nLen = MB2WC(NULL, psz, 0);
152 if ( nLen != (size_t)-1 )
153 {
154 // now do the actual conversion
155 wxWCharBuffer buf(nLen);
635f33ce
VS
156 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
157 if ( nLen != (size_t)-1 )
158 {
159 return buf;
160 }
2b5f62a0 161 }
f6bcfd97 162 }
2b5f62a0
VZ
163
164 wxWCharBuffer buf((wchar_t *)NULL);
165
166 return buf;
6001e347
RR
167}
168
e5cceba0 169const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
6001e347 170{
2b5f62a0
VZ
171 if ( pwz )
172 {
173 size_t nLen = WC2MB(NULL, pwz, 0);
174 if ( nLen != (size_t)-1 )
175 {
c91830cb 176 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
635f33ce
VS
177 nLen = WC2MB(buf.data(), pwz, nLen + 4);
178 if ( nLen != (size_t)-1 )
179 {
180 return buf;
181 }
2b5f62a0
VZ
182 }
183 }
184
185 wxCharBuffer buf((char *)NULL);
e5cceba0 186
e5cceba0 187 return buf;
6001e347
RR
188}
189
f5fb6871 190const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
e4e3bbb4 191{
f5fb6871
RN
192 wxASSERT(pOutSize != NULL);
193
e4e3bbb4
RN
194 const char* szEnd = szString + nStringLen + 1;
195 const char* szPos = szString;
196 const char* szStart = szPos;
197
198 size_t nActualLength = 0;
f5fb6871
RN
199 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
200
201 wxWCharBuffer theBuffer(nCurrentSize);
e4e3bbb4
RN
202
203 //Convert the string until the length() is reached, continuing the
204 //loop every time a null character is reached
205 while(szPos != szEnd)
206 {
207 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
208
209 //Get the length of the current (sub)string
210 size_t nLen = MB2WC(NULL, szPos, 0);
211
212 //Invalid conversion?
213 if( nLen == (size_t)-1 )
f5fb6871
RN
214 {
215 *pOutSize = 0;
216 theBuffer.data()[0u] = wxT('\0');
217 return theBuffer;
218 }
219
e4e3bbb4
RN
220
221 //Increase the actual length (+1 for current null character)
222 nActualLength += nLen + 1;
223
f5fb6871
RN
224 //if buffer too big, realloc the buffer
225 if (nActualLength > (nCurrentSize+1))
226 {
227 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
228 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
229 theBuffer = theNewBuffer;
230 nCurrentSize <<= 1;
231 }
232
233 //Convert the current (sub)string
234 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
e4e3bbb4 235 {
f5fb6871
RN
236 *pOutSize = 0;
237 theBuffer.data()[0u] = wxT('\0');
238 return theBuffer;
e4e3bbb4
RN
239 }
240
241 //Increment to next (sub)string
3103e8a9
JS
242 //Note that we have to use strlen instead of nLen here
243 //because XX2XX gives us the size of the output buffer,
244 //which is not necessarily the length of the string
e4e3bbb4
RN
245 szPos += strlen(szPos) + 1;
246 }
247
f5fb6871
RN
248 //success - return actual length and the buffer
249 *pOutSize = nActualLength;
3698ae71 250 return theBuffer;
e4e3bbb4
RN
251}
252
f5fb6871 253const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
e4e3bbb4 254{
f5fb6871
RN
255 wxASSERT(pOutSize != NULL);
256
e4e3bbb4
RN
257 const wchar_t* szEnd = szString + nStringLen + 1;
258 const wchar_t* szPos = szString;
259 const wchar_t* szStart = szPos;
260
261 size_t nActualLength = 0;
f5fb6871
RN
262 size_t nCurrentSize = nStringLen << 2; //try * 4 first
263
264 wxCharBuffer theBuffer(nCurrentSize);
e4e3bbb4
RN
265
266 //Convert the string until the length() is reached, continuing the
267 //loop every time a null character is reached
268 while(szPos != szEnd)
269 {
270 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
271
272 //Get the length of the current (sub)string
273 size_t nLen = WC2MB(NULL, szPos, 0);
274
275 //Invalid conversion?
276 if( nLen == (size_t)-1 )
f5fb6871
RN
277 {
278 *pOutSize = 0;
279 theBuffer.data()[0u] = wxT('\0');
280 return theBuffer;
281 }
e4e3bbb4
RN
282
283 //Increase the actual length (+1 for current null character)
284 nActualLength += nLen + 1;
3698ae71 285
f5fb6871
RN
286 //if buffer too big, realloc the buffer
287 if (nActualLength > (nCurrentSize+1))
288 {
289 wxCharBuffer theNewBuffer(nCurrentSize << 1);
290 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
291 theBuffer = theNewBuffer;
292 nCurrentSize <<= 1;
293 }
294
295 //Convert the current (sub)string
296 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
e4e3bbb4 297 {
f5fb6871
RN
298 *pOutSize = 0;
299 theBuffer.data()[0u] = wxT('\0');
300 return theBuffer;
e4e3bbb4
RN
301 }
302
303 //Increment to next (sub)string
3103e8a9
JS
304 //Note that we have to use wxWcslen instead of nLen here
305 //because XX2XX gives us the size of the output buffer,
306 //which is not necessarily the length of the string
e4e3bbb4
RN
307 szPos += wxWcslen(szPos) + 1;
308 }
309
f5fb6871
RN
310 //success - return actual length and the buffer
311 *pOutSize = nActualLength;
3698ae71 312 return theBuffer;
e4e3bbb4
RN
313}
314
6001e347 315// ----------------------------------------------------------------------------
bde4baac 316// wxMBConvLibc
6001e347
RR
317// ----------------------------------------------------------------------------
318
bde4baac
VZ
319size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
320{
321 return wxMB2WC(buf, psz, n);
322}
323
324size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
325{
326 return wxWC2MB(buf, psz, n);
327}
e1bfe89e 328
66bf0099 329#ifdef __UNIX__
c12b7f79 330
e1bfe89e 331// ----------------------------------------------------------------------------
532d575b 332// wxConvBrokenFileNames
e1bfe89e
RR
333// ----------------------------------------------------------------------------
334
845905d5 335wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
ea8ce907 336{
845905d5
MW
337 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
338 || wxStricmp(charset, _T("UTF8")) == 0 )
339 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
340 else
341 m_conv = new wxCSConv(charset);
ea8ce907
RR
342}
343
c12b7f79
VZ
344size_t
345wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
346 const char *psz,
347 size_t outputSize) const
e1bfe89e 348{
c12b7f79 349 return m_conv->MB2WC( outputBuf, psz, outputSize );
e1bfe89e
RR
350}
351
c12b7f79
VZ
352size_t
353wxConvBrokenFileNames::WC2MB(char *outputBuf,
354 const wchar_t *psz,
355 size_t outputSize) const
e1bfe89e 356{
c12b7f79 357 return m_conv->WC2MB( outputBuf, psz, outputSize );
e1bfe89e
RR
358}
359
66bf0099 360#endif
c12b7f79 361
bde4baac 362// ----------------------------------------------------------------------------
3698ae71 363// UTF-7
bde4baac 364// ----------------------------------------------------------------------------
6001e347 365
15f2ee32 366// Implementation (C) 2004 Fredrik Roubert
6001e347 367
15f2ee32
RN
368//
369// BASE64 decoding table
370//
// Maps a byte value to its 6-bit BASE64 value; 0xff marks bytes that are not
// part of the BASE64 alphabet ('A'-'Z', 'a'-'z', '0'-'9', '+', '/').
// One row per 16 byte values.
static const unsigned char utf7unb64[] =
{
    /* 0x00 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x10 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x20 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    /* 0x30 */ 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x40 */ 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    /* 0x50 */ 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x60 */ 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    /* 0x70 */ 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x80 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x90 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xa0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xb0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xc0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xd0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xe0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xf0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
406
407size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
408{
15f2ee32
RN
409 size_t len = 0;
410
411 while (*psz && ((!buf) || (len < n)))
412 {
413 unsigned char cc = *psz++;
414 if (cc != '+')
415 {
416 // plain ASCII char
417 if (buf)
418 *buf++ = cc;
419 len++;
420 }
421 else if (*psz == '-')
422 {
423 // encoded plus sign
424 if (buf)
425 *buf++ = cc;
426 len++;
427 psz++;
428 }
429 else
430 {
431 // BASE64 encoded string
432 bool lsb;
433 unsigned char c;
434 unsigned int d, l;
435 for (lsb = false, d = 0, l = 0;
436 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
437 {
438 d <<= 6;
439 d += cc;
440 for (l += 6; l >= 8; lsb = !lsb)
441 {
6356d52a 442 c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
443 if (lsb)
444 {
445 if (buf)
446 *buf++ |= c;
447 len ++;
448 }
449 else
450 if (buf)
6356d52a 451 *buf = (wchar_t)(c << 8);
15f2ee32
RN
452 }
453 }
454 if (*psz == '-')
455 psz++;
456 }
457 }
458 if (buf && (len < n))
459 *buf = 0;
460 return len;
6001e347
RR
461}
462
15f2ee32
RN
463//
464// BASE64 encoding table
465//
// The BASE64 alphabet, indexed by 6-bit value (16 entries per row).
static const unsigned char utf7enb64[] =
{
    /*  0 */ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    /* 16 */ 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    /* 32 */ 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    /* 48 */ 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
477
478//
479// UTF-7 encoding table
480//
481// 0 - Set D (directly encoded characters)
482// 1 - Set O (optional direct characters)
483// 2 - whitespace characters (optional)
484// 3 - special characters
485//
// Per-character UTF-7 class for the 128 ASCII values (one row per 16 values):
//   0 - Set D (directly encoded characters)
//   1 - Set O (optional direct characters)
//   2 - whitespace characters (optional)
//   3 - special characters
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
497
667e5b3e 498size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32
RN
499{
500
501
502 size_t len = 0;
503
504 while (*psz && ((!buf) || (len < n)))
505 {
506 wchar_t cc = *psz++;
507 if (cc < 0x80 && utf7encode[cc] < 1)
508 {
509 // plain ASCII char
510 if (buf)
511 *buf++ = (char)cc;
512 len++;
513 }
514#ifndef WC_UTF16
79c78d42 515 else if (((wxUint32)cc) > 0xffff)
b2c13097 516 {
15f2ee32
RN
517 // no surrogate pair generation (yet?)
518 return (size_t)-1;
519 }
520#endif
521 else
522 {
523 if (buf)
524 *buf++ = '+';
525 len++;
526 if (cc != '+')
527 {
528 // BASE64 encode string
529 unsigned int lsb, d, l;
73c902d6 530 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
531 {
532 for (lsb = 0; lsb < 2; lsb ++)
533 {
534 d <<= 8;
535 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
536
537 for (l += 8; l >= 6; )
538 {
539 l -= 6;
540 if (buf)
541 *buf++ = utf7enb64[(d >> l) % 64];
542 len++;
543 }
544 }
545 cc = *psz;
546 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
547 break;
548 }
549 if (l != 0)
550 {
551 if (buf)
552 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
553 len++;
554 }
555 }
556 if (buf)
557 *buf++ = '-';
558 len++;
559 }
560 }
561 if (buf && (len < n))
562 *buf = 0;
563 return len;
6001e347
RR
564}
565
f6bcfd97 566// ----------------------------------------------------------------------------
6001e347 567// UTF-8
f6bcfd97 568// ----------------------------------------------------------------------------
6001e347 569
dccce9ea 570static wxUint32 utf8_max[]=
4def3b35 571 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 572
3698ae71
VZ
573// boundaries of the private use area we use to (temporarily) remap invalid
574// characters invalid in a UTF-8 encoded string
ea8ce907
RR
575const wxUint32 wxUnicodePUA = 0x100000;
576const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
577
6001e347
RR
578size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
579{
4def3b35
VS
580 size_t len = 0;
581
dccce9ea 582 while (*psz && ((!buf) || (len < n)))
4def3b35 583 {
ea8ce907
RR
584 const char *opsz = psz;
585 bool invalid = false;
4def3b35
VS
586 unsigned char cc = *psz++, fc = cc;
587 unsigned cnt;
dccce9ea 588 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 589 fc <<= 1;
dccce9ea 590 if (!cnt)
4def3b35
VS
591 {
592 // plain ASCII char
dccce9ea 593 if (buf)
4def3b35
VS
594 *buf++ = cc;
595 len++;
561488ef
MW
596
597 // escape the escape character for octal escapes
598 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
599 && cc == '\\' && (!buf || len < n))
600 {
601 if (buf)
602 *buf++ = cc;
603 len++;
604 }
dccce9ea
VZ
605 }
606 else
4def3b35
VS
607 {
608 cnt--;
dccce9ea 609 if (!cnt)
4def3b35
VS
610 {
611 // invalid UTF-8 sequence
ea8ce907 612 invalid = true;
dccce9ea
VZ
613 }
614 else
4def3b35
VS
615 {
616 unsigned ocnt = cnt - 1;
617 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 618 while (cnt--)
4def3b35 619 {
ea8ce907 620 cc = *psz;
dccce9ea 621 if ((cc & 0xC0) != 0x80)
4def3b35
VS
622 {
623 // invalid UTF-8 sequence
ea8ce907
RR
624 invalid = true;
625 break;
4def3b35 626 }
ea8ce907 627 psz++;
4def3b35
VS
628 res = (res << 6) | (cc & 0x3f);
629 }
ea8ce907 630 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
631 {
632 // illegal UTF-8 encoding
ea8ce907 633 invalid = true;
4def3b35 634 }
ea8ce907
RR
635 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
636 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
637 {
638 // if one of our PUA characters turns up externally
639 // it must also be treated as an illegal sequence
640 // (a bit like you have to escape an escape character)
641 invalid = true;
642 }
643 else
644 {
1cd52418 645#ifdef WC_UTF16
ea8ce907
RR
646 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
647 size_t pa = encode_utf16(res, (wxUint16 *)buf);
648 if (pa == (size_t)-1)
649 {
650 invalid = true;
651 }
652 else
653 {
654 if (buf)
655 buf += pa;
656 len += pa;
657 }
373658eb 658#else // !WC_UTF16
ea8ce907 659 if (buf)
38d4b1e4 660 *buf++ = (wchar_t)res;
ea8ce907 661 len++;
373658eb 662#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
663 }
664 }
665 if (invalid)
666 {
667 if (m_options & MAP_INVALID_UTF8_TO_PUA)
668 {
669 while (opsz < psz && (!buf || len < n))
670 {
671#ifdef WC_UTF16
672 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
673 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
674 wxASSERT(pa != (size_t)-1);
675 if (buf)
676 buf += pa;
677 opsz++;
678 len += pa;
679#else
680 if (buf)
38d4b1e4 681 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
682 opsz++;
683 len++;
684#endif
685 }
686 }
3698ae71 687 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
688 {
689 while (opsz < psz && (!buf || len < n))
690 {
3698ae71
VZ
691 if ( buf && len + 3 < n )
692 {
17a1ebd1 693 unsigned char on = *opsz;
3698ae71 694 *buf++ = L'\\';
17a1ebd1
VZ
695 *buf++ = (wchar_t)( L'0' + on / 0100 );
696 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
697 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 698 }
ea8ce907
RR
699 opsz++;
700 len += 4;
701 }
702 }
3698ae71 703 else // MAP_INVALID_UTF8_NOT
ea8ce907
RR
704 {
705 return (size_t)-1;
706 }
4def3b35
VS
707 }
708 }
6001e347 709 }
dccce9ea 710 if (buf && (len < n))
4def3b35
VS
711 *buf = 0;
712 return len;
6001e347
RR
713}
714
3698ae71
VZ
// True if `wch` is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    if (wch < L'0')
        return false;

    return wch <= L'7';
}
719
6001e347
RR
720size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
721{
4def3b35 722 size_t len = 0;
6001e347 723
dccce9ea 724 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
725 {
726 wxUint32 cc;
1cd52418 727#ifdef WC_UTF16
b5153fd8
VZ
728 // cast is ok for WC_UTF16
729 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
4def3b35 730 psz += (pa == (size_t)-1) ? 1 : pa;
1cd52418 731#else
4def3b35
VS
732 cc=(*psz++) & 0x7fffffff;
733#endif
3698ae71
VZ
734
735 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
736 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 737 {
dccce9ea 738 if (buf)
ea8ce907 739 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 740 len++;
3698ae71 741 }
561488ef
MW
742 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
743 && cc == L'\\' && psz[0] == L'\\' )
744 {
745 if (buf)
746 *buf++ = (char)cc;
747 psz++;
748 len++;
749 }
3698ae71
VZ
750 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
751 cc == L'\\' &&
752 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 753 {
dccce9ea 754 if (buf)
3698ae71 755 {
b2c13097
WS
756 *buf++ = (char) ((psz[0] - L'0')*0100 +
757 (psz[1] - L'0')*010 +
758 (psz[2] - L'0'));
3698ae71
VZ
759 }
760
761 psz += 3;
ea8ce907
RR
762 len++;
763 }
764 else
765 {
766 unsigned cnt;
767 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
768 if (!cnt)
4def3b35 769 {
ea8ce907
RR
770 // plain ASCII char
771 if (buf)
772 *buf++ = (char) cc;
773 len++;
774 }
775
776 else
777 {
778 len += cnt + 1;
779 if (buf)
780 {
781 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
782 while (cnt--)
783 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
784 }
4def3b35
VS
785 }
786 }
6001e347 787 }
4def3b35 788
3698ae71
VZ
789 if (buf && (len<n))
790 *buf = 0;
adb45366 791
4def3b35 792 return len;
6001e347
RR
793}
794
c91830cb
VZ
795// ----------------------------------------------------------------------------
796// UTF-16
797// ----------------------------------------------------------------------------
798
799#ifdef WORDS_BIGENDIAN
bde4baac
VZ
800 #define wxMBConvUTF16straight wxMBConvUTF16BE
801 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 802#else
bde4baac
VZ
803 #define wxMBConvUTF16swap wxMBConvUTF16BE
804 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
805#endif
806
807
c91830cb
VZ
808#ifdef WC_UTF16
809
c91830cb
VZ
810// copy 16bit MB to 16bit String
811size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
812{
813 size_t len=0;
814
815 while (*(wxUint16*)psz && (!buf || len < n))
816 {
817 if (buf)
818 *buf++ = *(wxUint16*)psz;
819 len++;
820
821 psz += sizeof(wxUint16);
822 }
823 if (buf && len<n) *buf=0;
824
825 return len;
826}
827
828
829// copy 16bit String to 16bit MB
830size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
831{
832 size_t len=0;
833
834 while (*psz && (!buf || len < n))
835 {
836 if (buf)
837 {
838 *(wxUint16*)buf = *psz;
839 buf += sizeof(wxUint16);
840 }
841 len += sizeof(wxUint16);
842 psz++;
843 }
844 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
845
846 return len;
847}
848
849
850// swap 16bit MB to 16bit String
851size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
852{
853 size_t len=0;
854
855 while (*(wxUint16*)psz && (!buf || len < n))
856 {
857 if (buf)
858 {
859 ((char *)buf)[0] = psz[1];
860 ((char *)buf)[1] = psz[0];
861 buf++;
862 }
863 len++;
864 psz += sizeof(wxUint16);
865 }
866 if (buf && len<n) *buf=0;
867
868 return len;
869}
870
871
872// swap 16bit MB to 16bit String
873size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
874{
875 size_t len=0;
876
877 while (*psz && (!buf || len < n))
878 {
879 if (buf)
880 {
881 *buf++ = ((char*)psz)[1];
882 *buf++ = ((char*)psz)[0];
883 }
884 len += sizeof(wxUint16);
885 psz++;
886 }
887 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
888
889 return len;
890}
891
892
893#else // WC_UTF16
894
895
896// copy 16bit MB to 32bit String
897size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
898{
899 size_t len=0;
900
901 while (*(wxUint16*)psz && (!buf || len < n))
902 {
903 wxUint32 cc;
904 size_t pa=decode_utf16((wxUint16*)psz, cc);
905 if (pa == (size_t)-1)
906 return pa;
907
908 if (buf)
38d4b1e4 909 *buf++ = (wchar_t)cc;
c91830cb
VZ
910 len++;
911 psz += pa * sizeof(wxUint16);
912 }
913 if (buf && len<n) *buf=0;
914
915 return len;
916}
917
918
919// copy 32bit String to 16bit MB
920size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
921{
922 size_t len=0;
923
924 while (*psz && (!buf || len < n))
925 {
926 wxUint16 cc[2];
927 size_t pa=encode_utf16(*psz, cc);
928
929 if (pa == (size_t)-1)
930 return pa;
931
932 if (buf)
933 {
69b80d28 934 *(wxUint16*)buf = cc[0];
b5153fd8 935 buf += sizeof(wxUint16);
c91830cb 936 if (pa > 1)
69b80d28
VZ
937 {
938 *(wxUint16*)buf = cc[1];
939 buf += sizeof(wxUint16);
940 }
c91830cb
VZ
941 }
942
943 len += pa*sizeof(wxUint16);
944 psz++;
945 }
946 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
947
948 return len;
949}
950
951
952// swap 16bit MB to 32bit String
953size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
954{
955 size_t len=0;
956
957 while (*(wxUint16*)psz && (!buf || len < n))
958 {
959 wxUint32 cc;
960 char tmp[4];
961 tmp[0]=psz[1]; tmp[1]=psz[0];
962 tmp[2]=psz[3]; tmp[3]=psz[2];
963
964 size_t pa=decode_utf16((wxUint16*)tmp, cc);
965 if (pa == (size_t)-1)
966 return pa;
967
968 if (buf)
38d4b1e4 969 *buf++ = (wchar_t)cc;
c91830cb
VZ
970
971 len++;
972 psz += pa * sizeof(wxUint16);
973 }
974 if (buf && len<n) *buf=0;
975
976 return len;
977}
978
979
980// swap 32bit String to 16bit MB
981size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
982{
983 size_t len=0;
984
985 while (*psz && (!buf || len < n))
986 {
987 wxUint16 cc[2];
988 size_t pa=encode_utf16(*psz, cc);
989
990 if (pa == (size_t)-1)
991 return pa;
992
993 if (buf)
994 {
995 *buf++ = ((char*)cc)[1];
996 *buf++ = ((char*)cc)[0];
997 if (pa > 1)
998 {
999 *buf++ = ((char*)cc)[3];
1000 *buf++ = ((char*)cc)[2];
1001 }
1002 }
1003
1004 len += pa*sizeof(wxUint16);
1005 psz++;
1006 }
1007 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1008
1009 return len;
1010}
1011
1012#endif // WC_UTF16
1013
1014
1015// ----------------------------------------------------------------------------
1016// UTF-32
1017// ----------------------------------------------------------------------------
1018
1019#ifdef WORDS_BIGENDIAN
1020#define wxMBConvUTF32straight wxMBConvUTF32BE
1021#define wxMBConvUTF32swap wxMBConvUTF32LE
1022#else
1023#define wxMBConvUTF32swap wxMBConvUTF32BE
1024#define wxMBConvUTF32straight wxMBConvUTF32LE
1025#endif
1026
1027
1028WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1029WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1030
1031
1032#ifdef WC_UTF16
1033
1034// copy 32bit MB to 16bit String
1035size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1036{
1037 size_t len=0;
1038
1039 while (*(wxUint32*)psz && (!buf || len < n))
1040 {
1041 wxUint16 cc[2];
1042
1043 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1044 if (pa == (size_t)-1)
1045 return pa;
1046
1047 if (buf)
1048 {
1049 *buf++ = cc[0];
1050 if (pa > 1)
1051 *buf++ = cc[1];
1052 }
1053 len += pa;
1054 psz += sizeof(wxUint32);
1055 }
1056 if (buf && len<n) *buf=0;
1057
1058 return len;
1059}
1060
1061
1062// copy 16bit String to 32bit MB
1063size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1064{
1065 size_t len=0;
1066
1067 while (*psz && (!buf || len < n))
1068 {
1069 wxUint32 cc;
1070
b5153fd8
VZ
1071 // cast is ok for WC_UTF16
1072 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
c91830cb
VZ
1073 if (pa == (size_t)-1)
1074 return pa;
1075
1076 if (buf)
1077 {
1078 *(wxUint32*)buf = cc;
1079 buf += sizeof(wxUint32);
1080 }
1081 len += sizeof(wxUint32);
1082 psz += pa;
1083 }
b5153fd8
VZ
1084
1085 if (buf && len<=n-sizeof(wxUint32))
1086 *(wxUint32*)buf=0;
c91830cb
VZ
1087
1088 return len;
1089}
1090
1091
1092
1093// swap 32bit MB to 16bit String
1094size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1095{
1096 size_t len=0;
1097
1098 while (*(wxUint32*)psz && (!buf || len < n))
1099 {
1100 char tmp[4];
1101 tmp[0] = psz[3]; tmp[1] = psz[2];
1102 tmp[2] = psz[1]; tmp[3] = psz[0];
1103
1104
1105 wxUint16 cc[2];
1106
1107 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1108 if (pa == (size_t)-1)
1109 return pa;
1110
1111 if (buf)
1112 {
1113 *buf++ = cc[0];
1114 if (pa > 1)
1115 *buf++ = cc[1];
1116 }
1117 len += pa;
1118 psz += sizeof(wxUint32);
1119 }
b5153fd8
VZ
1120
1121 if (buf && len<n)
1122 *buf=0;
c91830cb
VZ
1123
1124 return len;
1125}
1126
1127
1128// swap 16bit String to 32bit MB
1129size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1130{
1131 size_t len=0;
1132
1133 while (*psz && (!buf || len < n))
1134 {
1135 char cc[4];
1136
b5153fd8
VZ
1137 // cast is ok for WC_UTF16
1138 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
c91830cb
VZ
1139 if (pa == (size_t)-1)
1140 return pa;
1141
1142 if (buf)
1143 {
1144 *buf++ = cc[3];
1145 *buf++ = cc[2];
1146 *buf++ = cc[1];
1147 *buf++ = cc[0];
1148 }
1149 len += sizeof(wxUint32);
1150 psz += pa;
1151 }
b5153fd8
VZ
1152
1153 if (buf && len<=n-sizeof(wxUint32))
1154 *(wxUint32*)buf=0;
c91830cb
VZ
1155
1156 return len;
1157}
1158
1159#else // WC_UTF16
1160
1161
1162// copy 32bit MB to 32bit String
1163size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1164{
1165 size_t len=0;
1166
1167 while (*(wxUint32*)psz && (!buf || len < n))
1168 {
1169 if (buf)
38d4b1e4 1170 *buf++ = (wchar_t)(*(wxUint32*)psz);
c91830cb
VZ
1171 len++;
1172 psz += sizeof(wxUint32);
1173 }
b5153fd8
VZ
1174
1175 if (buf && len<n)
1176 *buf=0;
c91830cb
VZ
1177
1178 return len;
1179}
1180
1181
1182// copy 32bit String to 32bit MB
1183size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1184{
1185 size_t len=0;
1186
1187 while (*psz && (!buf || len < n))
1188 {
1189 if (buf)
1190 {
1191 *(wxUint32*)buf = *psz;
1192 buf += sizeof(wxUint32);
1193 }
1194
1195 len += sizeof(wxUint32);
1196 psz++;
1197 }
1198
b5153fd8
VZ
1199 if (buf && len<=n-sizeof(wxUint32))
1200 *(wxUint32*)buf=0;
c91830cb
VZ
1201
1202 return len;
1203}
1204
1205
1206// swap 32bit MB to 32bit String
1207size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1208{
1209 size_t len=0;
1210
1211 while (*(wxUint32*)psz && (!buf || len < n))
1212 {
1213 if (buf)
1214 {
1215 ((char *)buf)[0] = psz[3];
1216 ((char *)buf)[1] = psz[2];
1217 ((char *)buf)[2] = psz[1];
1218 ((char *)buf)[3] = psz[0];
1219 buf++;
1220 }
1221 len++;
1222 psz += sizeof(wxUint32);
1223 }
b5153fd8
VZ
1224
1225 if (buf && len<n)
1226 *buf=0;
c91830cb
VZ
1227
1228 return len;
1229}
1230
1231
1232// swap 32bit String to 32bit MB
1233size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1234{
1235 size_t len=0;
1236
1237 while (*psz && (!buf || len < n))
1238 {
1239 if (buf)
1240 {
1241 *buf++ = ((char *)psz)[3];
1242 *buf++ = ((char *)psz)[2];
1243 *buf++ = ((char *)psz)[1];
1244 *buf++ = ((char *)psz)[0];
1245 }
1246 len += sizeof(wxUint32);
1247 psz++;
1248 }
b5153fd8
VZ
1249
1250 if (buf && len<=n-sizeof(wxUint32))
1251 *(wxUint32*)buf=0;
c91830cb
VZ
1252
1253 return len;
1254}
1255
1256
1257#endif // WC_UTF16
1258
1259
36acb880
VZ
1260// ============================================================================
1261// The classes doing conversion using the iconv_xxx() functions
1262// ============================================================================
3caec1bb 1263
b040e242 1264#ifdef HAVE_ICONV
3a0d76bc 1265
b1d547eb
VS
1266// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1267// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1268// (unless there's yet another bug in glibc) the only case when iconv()
1269// returns with (size_t)-1 (which means error) and says there are 0 bytes
1270// left in the input buffer -- when _real_ error occurs,
1271// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1272// iconv() failure.
3caec1bb
VS
1273// [This bug does not appear in glibc 2.2.]
1274#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1275#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1276 (errno != E2BIG || bufLeft != 0))
1277#else
1278#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1279#endif
1280
ab217dba 1281#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1282
74a7eb0b
VZ
1283#define ICONV_T_INVALID ((iconv_t)-1)
1284
1285#if SIZEOF_WCHAR_T == 4
1286 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1287 #define WC_ENC wxFONTENCODING_UTF32
1288#elif SIZEOF_WCHAR_T == 2
1289 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1290 #define WC_ENC wxFONTENCODING_UTF16
1291#else // sizeof(wchar_t) != 2 nor 4
1292 // does this ever happen?
1293 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1294#endif
1295
36acb880 1296// ----------------------------------------------------------------------------
e95354ec 1297// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1298// ----------------------------------------------------------------------------
1299
e95354ec 1300class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1301{
1302public:
e95354ec
VZ
1303 wxMBConv_iconv(const wxChar *name);
1304 virtual ~wxMBConv_iconv();
36acb880 1305
bde4baac
VZ
1306 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1307 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1308
e95354ec 1309 bool IsOk() const
74a7eb0b 1310 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1311
1312protected:
1313 // the iconv handlers used to translate from multibyte to wide char and in
1314 // the other direction
1315 iconv_t m2w,
1316 w2m;
b1d547eb
VS
1317#if wxUSE_THREADS
1318 // guards access to m2w and w2m objects
1319 wxMutex m_iconvMutex;
1320#endif
36acb880
VZ
1321
1322private:
e95354ec 1323 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1324 // available on this machine, it will remain NULL
74a7eb0b 1325 static wxString ms_wcCharsetName;
36acb880
VZ
1326
1327 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1328 // different endian-ness than the native one
405d8f46 1329 static bool ms_wcNeedsSwap;
36acb880
VZ
1330};
1331
8f115891
MW
1332// make the constructor available for unit testing
1333WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1334{
1335 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1336 if ( !result->IsOk() )
1337 {
1338 delete result;
1339 return 0;
1340 }
1341 return result;
1342}
1343
422e411e 1344wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1345bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1346
e95354ec 1347wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
36acb880 1348{
0331b385
VZ
1349 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1350 // names for the charsets
200a9923 1351 const wxCharBuffer cname(wxString(name).ToAscii());
04c79127 1352
36acb880 1353 // check for charset that represents wchar_t:
74a7eb0b 1354 if ( ms_wcCharsetName.empty() )
f1339c56 1355 {
c2b83fdd
VZ
1356 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1357
74a7eb0b
VZ
1358#if wxUSE_FONTMAP
1359 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1360#else // !wxUSE_FONTMAP
1361 static const wxChar *names[] =
36acb880 1362 {
74a7eb0b
VZ
1363#if SIZEOF_WCHAR_T == 4
1364 _T("UCS-4"),
1365#elif SIZEOF_WCHAR_T = 2
1366 _T("UCS-2"),
1367#endif
1368 NULL
1369 };
1370#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1371
d1f024a8 1372 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1373 {
17a1ebd1 1374 const wxString nameCS(*names);
74a7eb0b
VZ
1375
1376 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1377 wxString nameXE(nameCS);
74a7eb0b
VZ
1378 #ifdef WORDS_BIGENDIAN
1379 nameXE += _T("BE");
1380 #else // little endian
1381 nameXE += _T("LE");
1382 #endif
1383
c2b83fdd
VZ
1384 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1385 nameXE.c_str());
1386
74a7eb0b
VZ
1387 m2w = iconv_open(nameXE.ToAscii(), cname);
1388 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1389 {
74a7eb0b 1390 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1391 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1392 nameCS.c_str());
17a1ebd1 1393 m2w = iconv_open(nameCS.ToAscii(), cname);
3a0d76bc 1394
74a7eb0b
VZ
1395 // and check for bytesex ourselves:
1396 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1397 {
74a7eb0b
VZ
1398 char buf[2], *bufPtr;
1399 wchar_t wbuf[2], *wbufPtr;
1400 size_t insz, outsz;
1401 size_t res;
1402
1403 buf[0] = 'A';
1404 buf[1] = 0;
1405 wbuf[0] = 0;
1406 insz = 2;
1407 outsz = SIZEOF_WCHAR_T * 2;
1408 wbufPtr = wbuf;
1409 bufPtr = buf;
1410
1411 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1412 (char**)&wbufPtr, &outsz);
1413
1414 if (ICONV_FAILED(res, insz))
1415 {
1416 wxLogLastError(wxT("iconv"));
422e411e 1417 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1418 nameCS.c_str());
74a7eb0b
VZ
1419 }
1420 else // ok, can convert to this encoding, remember it
1421 {
17a1ebd1 1422 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1423 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1424 }
3a0d76bc
VS
1425 }
1426 }
74a7eb0b 1427 else // use charset not requiring byte swapping
36acb880 1428 {
74a7eb0b 1429 ms_wcCharsetName = nameXE;
36acb880 1430 }
3a0d76bc 1431 }
74a7eb0b 1432
0944fceb 1433 wxLogTrace(TRACE_STRCONV,
74a7eb0b 1434 wxT("iconv wchar_t charset is \"%s\"%s"),
cae8f1bf 1435 ms_wcCharsetName.empty() ? _T("<none>")
74a7eb0b
VZ
1436 : ms_wcCharsetName.c_str(),
1437 ms_wcNeedsSwap ? _T(" (needs swap)")
1438 : _T(""));
3a0d76bc 1439 }
36acb880 1440 else // we already have ms_wcCharsetName
3caec1bb 1441 {
74a7eb0b 1442 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
f1339c56 1443 }
dccce9ea 1444
74a7eb0b 1445 if ( ms_wcCharsetName.empty() )
f1339c56 1446 {
74a7eb0b 1447 w2m = ICONV_T_INVALID;
36acb880 1448 }
405d8f46
VZ
1449 else
1450 {
74a7eb0b
VZ
1451 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1452 if ( w2m == ICONV_T_INVALID )
1453 {
1454 wxLogTrace(TRACE_STRCONV,
1455 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
422e411e 1456 ms_wcCharsetName.c_str(), cname.data());
74a7eb0b 1457 }
405d8f46 1458 }
36acb880 1459}
3caec1bb 1460
e95354ec 1461wxMBConv_iconv::~wxMBConv_iconv()
36acb880 1462{
74a7eb0b 1463 if ( m2w != ICONV_T_INVALID )
36acb880 1464 iconv_close(m2w);
74a7eb0b 1465 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
1466 iconv_close(w2m);
1467}
3a0d76bc 1468
bde4baac 1469size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 1470{
b1d547eb
VS
1471#if wxUSE_THREADS
1472 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1473 // Unfortunately there is a couple of global wxCSConv objects such as
1474 // wxConvLocal that are used all over wx code, so we have to make sure
1475 // the handle is used by at most one thread at the time. Otherwise
1476 // only a few wx classes would be safe to use from non-main threads
1477 // as MB<->WC conversion would fail "randomly".
1478 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1479#endif
3698ae71 1480
36acb880
VZ
1481 size_t inbuf = strlen(psz);
1482 size_t outbuf = n * SIZEOF_WCHAR_T;
1483 size_t res, cres;
1484 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1485 wchar_t *bufPtr = buf;
1486 const char *pszPtr = psz;
1487
1488 if (buf)
1489 {
1490 // have destination buffer, convert there
1491 cres = iconv(m2w,
1492 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1493 (char**)&bufPtr, &outbuf);
1494 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1495
36acb880 1496 if (ms_wcNeedsSwap)
3a0d76bc 1497 {
36acb880 1498 // convert to native endianness
17a1ebd1
VZ
1499 for ( unsigned i = 0; i < res; i++ )
1500 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 1501 }
adb45366 1502
49dd9820
VS
1503 // NB: iconv was given only strlen(psz) characters on input, and so
1504 // it couldn't convert the trailing zero. Let's do it ourselves
1505 // if there's some room left for it in the output buffer.
1506 if (res < n)
1507 buf[res] = 0;
36acb880
VZ
1508 }
1509 else
1510 {
1511 // no destination buffer... convert using temp buffer
1512 // to calculate destination buffer requirement
1513 wchar_t tbuf[8];
1514 res = 0;
1515 do {
1516 bufPtr = tbuf;
1517 outbuf = 8*SIZEOF_WCHAR_T;
1518
1519 cres = iconv(m2w,
1520 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1521 (char**)&bufPtr, &outbuf );
1522
1523 res += 8-(outbuf/SIZEOF_WCHAR_T);
1524 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1525 }
dccce9ea 1526
36acb880 1527 if (ICONV_FAILED(cres, inbuf))
f1339c56 1528 {
36acb880 1529 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 1530 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1531 return (size_t)-1;
1532 }
1533
1534 return res;
1535}
1536
bde4baac 1537size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1538{
b1d547eb
VS
1539#if wxUSE_THREADS
1540 // NB: explained in MB2WC
1541 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1542#endif
3698ae71 1543
156162ec
MW
1544 size_t inlen = wxWcslen(psz);
1545 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
1546 size_t outbuf = n;
1547 size_t res, cres;
3a0d76bc 1548
36acb880 1549 wchar_t *tmpbuf = 0;
3caec1bb 1550
36acb880
VZ
1551 if (ms_wcNeedsSwap)
1552 {
1553 // need to copy to temp buffer to switch endianness
74a7eb0b 1554 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 1555 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 1556 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
1557 for ( size_t i = 0; i < inlen; i++ )
1558 tmpbuf[n] = WC_BSWAP(psz[i]);
156162ec 1559 tmpbuf[inlen] = L'\0';
74a7eb0b 1560 psz = tmpbuf;
36acb880 1561 }
3a0d76bc 1562
36acb880
VZ
1563 if (buf)
1564 {
1565 // have destination buffer, convert there
1566 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1567
36acb880 1568 res = n-outbuf;
adb45366 1569
49dd9820
VS
1570 // NB: iconv was given only wcslen(psz) characters on input, and so
1571 // it couldn't convert the trailing zero. Let's do it ourselves
1572 // if there's some room left for it in the output buffer.
1573 if (res < n)
1574 buf[0] = 0;
36acb880
VZ
1575 }
1576 else
1577 {
1578 // no destination buffer... convert using temp buffer
1579 // to calculate destination buffer requirement
1580 char tbuf[16];
1581 res = 0;
1582 do {
1583 buf = tbuf; outbuf = 16;
1584
1585 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1586
36acb880
VZ
1587 res += 16 - outbuf;
1588 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1589 }
dccce9ea 1590
36acb880
VZ
1591 if (ms_wcNeedsSwap)
1592 {
1593 free(tmpbuf);
1594 }
dccce9ea 1595
36acb880
VZ
1596 if (ICONV_FAILED(cres, inbuf))
1597 {
ce6f8d6f 1598 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1599 return (size_t)-1;
1600 }
1601
1602 return res;
1603}
1604
b040e242 1605#endif // HAVE_ICONV
36acb880 1606
e95354ec 1607
36acb880
VZ
1608// ============================================================================
1609// Win32 conversion classes
1610// ============================================================================
1cd52418 1611
e95354ec 1612#ifdef wxHAVE_WIN32_MB2WC
373658eb 1613
8b04d4c4 1614// from utils.cpp
d775fa82 1615#if wxUSE_FONTMAP
8b04d4c4
VZ
1616extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1617extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1618#endif
373658eb 1619
e95354ec 1620class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1621{
1622public:
bde4baac
VZ
1623 wxMBConv_win32()
1624 {
1625 m_CodePage = CP_ACP;
1626 }
1627
7608a683 1628#if wxUSE_FONTMAP
e95354ec 1629 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
1630 {
1631 m_CodePage = wxCharsetToCodepage(name);
1632 }
dccce9ea 1633
e95354ec 1634 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
1635 {
1636 m_CodePage = wxEncodingToCodepage(encoding);
1637 }
7608a683 1638#endif
8b04d4c4 1639
bde4baac 1640 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 1641 {
02272c9c
VZ
1642 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1643 // the behaviour is not compatible with the Unix version (using iconv)
1644 // and break the library itself, e.g. wxTextInputStream::NextChar()
1645 // wouldn't work if reading an incomplete MB char didn't result in an
1646 // error
667e5b3e
VZ
1647 //
1648 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1649 // an error (tested under Windows Server 2003) and apparently it is
1650 // done on purpose, i.e. the function accepts any input in this case
1651 // and although I'd prefer to return error on ill-formed output, our
1652 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1653 // explicitly ill-formed according to RFC 2152) neither so we don't
1654 // even have any fallback here...
89028980
VS
1655 //
1656 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1657 // Win XP or newer and if it is specified on older versions, conversion
1658 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1659 // fails. So we can only use the flag on newer Windows versions.
1660 // Additionally, the flag is not supported by UTF7, symbol and CJK
1661 // encodings. See here:
1662 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1663 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1664 int flags = 0;
1665 if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1666 m_CodePage < 50000 &&
1667 IsAtLeastWin2kSP4() )
1668 {
1669 flags = MB_ERR_INVALID_CHARS;
1670 }
1671 else if ( m_CodePage == CP_UTF8 )
1672 {
1673 // Avoid round-trip in the special case of UTF-8 by using our
1674 // own UTF-8 conversion code:
1675 return wxMBConvUTF8().MB2WC(buf, psz, n);
1676 }
667e5b3e 1677
2b5f62a0
VZ
1678 const size_t len = ::MultiByteToWideChar
1679 (
1680 m_CodePage, // code page
667e5b3e 1681 flags, // flags: fall on error
2b5f62a0
VZ
1682 psz, // input string
1683 -1, // its length (NUL-terminated)
b4da152e 1684 buf, // output string
2b5f62a0
VZ
1685 buf ? n : 0 // size of output buffer
1686 );
89028980
VS
1687 if ( !len )
1688 {
1689 // function totally failed
1690 return (size_t)-1;
1691 }
1692
1693 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1694 // check if we succeeded, by doing a double trip:
1695 if ( !flags && buf )
1696 {
53c174fc
VZ
1697 const size_t mbLen = strlen(psz);
1698 wxCharBuffer mbBuf(mbLen);
89028980
VS
1699 if ( ::WideCharToMultiByte
1700 (
1701 m_CodePage,
1702 0,
1703 buf,
1704 -1,
1705 mbBuf.data(),
53c174fc 1706 mbLen + 1, // size in bytes, not length
89028980
VS
1707 NULL,
1708 NULL
1709 ) == 0 ||
1710 strcmp(mbBuf, psz) != 0 )
1711 {
1712 // we didn't obtain the same thing we started from, hence
1713 // the conversion was lossy and we consider that it failed
1714 return (size_t)-1;
1715 }
1716 }
2b5f62a0 1717
03a991bc
VZ
1718 // note that it returns count of written chars for buf != NULL and size
1719 // of the needed buffer for buf == NULL so in either case the length of
1720 // the string (which never includes the terminating NUL) is one less
89028980 1721 return len - 1;
f1339c56 1722 }
dccce9ea 1723
13dd924a 1724 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 1725 {
13dd924a
VZ
1726 /*
1727 we have a problem here: by default, WideCharToMultiByte() may
1728 replace characters unrepresentable in the target code page with bad
1729 quality approximations such as turning "1/2" symbol (U+00BD) into
1730 "1" for the code pages which don't have it and we, obviously, want
1731 to avoid this at any price
d775fa82 1732
13dd924a
VZ
1733 the trouble is that this function does it _silently_, i.e. it won't
1734 even tell us whether it did or not... Win98/2000 and higher provide
1735 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1736 we have to resort to a round trip, i.e. check that converting back
1737 results in the same string -- this is, of course, expensive but
1738 otherwise we simply can't be sure to not garble the data.
1739 */
1740
1741 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1742 // it doesn't work with CJK encodings (which we test for rather roughly
1743 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1744 // supporting it
907173e5
WS
1745 BOOL usedDef wxDUMMY_INITIALIZE(false);
1746 BOOL *pUsedDef;
13dd924a
VZ
1747 int flags;
1748 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1749 {
1750 // it's our lucky day
1751 flags = WC_NO_BEST_FIT_CHARS;
1752 pUsedDef = &usedDef;
1753 }
1754 else // old system or unsupported encoding
1755 {
1756 flags = 0;
1757 pUsedDef = NULL;
1758 }
1759
2b5f62a0
VZ
1760 const size_t len = ::WideCharToMultiByte
1761 (
1762 m_CodePage, // code page
13dd924a
VZ
1763 flags, // either none or no best fit
1764 pwz, // input string
2b5f62a0
VZ
1765 -1, // it is (wide) NUL-terminated
1766 buf, // output buffer
1767 buf ? n : 0, // and its size
1768 NULL, // default "replacement" char
13dd924a 1769 pUsedDef // [out] was it used?
2b5f62a0
VZ
1770 );
1771
13dd924a
VZ
1772 if ( !len )
1773 {
1774 // function totally failed
1775 return (size_t)-1;
1776 }
1777
1778 // if we were really converting, check if we succeeded
1779 if ( buf )
1780 {
1781 if ( flags )
1782 {
1783 // check if the conversion failed, i.e. if any replacements
1784 // were done
1785 if ( usedDef )
1786 return (size_t)-1;
1787 }
1788 else // we must resort to double tripping...
1789 {
1790 wxWCharBuffer wcBuf(n);
1791 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1792 wcscmp(wcBuf, pwz) != 0 )
1793 {
1794 // we didn't obtain the same thing we started from, hence
1795 // the conversion was lossy and we consider that it failed
1796 return (size_t)-1;
1797 }
1798 }
1799 }
1800
03a991bc 1801 // see the comment above for the reason of "len - 1"
13dd924a 1802 return len - 1;
f1339c56 1803 }
dccce9ea 1804
13dd924a
VZ
1805 bool IsOk() const { return m_CodePage != -1; }
1806
1807private:
1808 static bool CanUseNoBestFit()
1809 {
1810 static int s_isWin98Or2k = -1;
1811
1812 if ( s_isWin98Or2k == -1 )
1813 {
1814 int verMaj, verMin;
1815 switch ( wxGetOsVersion(&verMaj, &verMin) )
1816 {
1817 case wxWIN95:
1818 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1819 break;
1820
1821 case wxWINDOWS_NT:
1822 s_isWin98Or2k = verMaj >= 5;
1823 break;
1824
1825 default:
1826 // unknown, be conseravtive by default
1827 s_isWin98Or2k = 0;
1828 }
1829
1830 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1831 }
1832
1833 return s_isWin98Or2k == 1;
1834 }
f1339c56 1835
89028980
VS
1836 static bool IsAtLeastWin2kSP4()
1837 {
8942f83a
WS
1838#ifdef __WXWINCE__
1839 return false;
1840#else
89028980
VS
1841 static int s_isAtLeastWin2kSP4 = -1;
1842
1843 if ( s_isAtLeastWin2kSP4 == -1 )
1844 {
1845 OSVERSIONINFOEX ver;
1846
1847 memset(&ver, 0, sizeof(ver));
1848 ver.dwOSVersionInfoSize = sizeof(ver);
1849 GetVersionEx((OSVERSIONINFO*)&ver);
1850
1851 s_isAtLeastWin2kSP4 =
1852 ((ver.dwMajorVersion > 5) || // Vista+
1853 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
1854 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
1855 ver.wServicePackMajor >= 4)) // 2000 SP4+
1856 ? 1 : 0;
1857 }
1858
1859 return s_isAtLeastWin2kSP4 == 1;
8942f83a 1860#endif
89028980
VS
1861 }
1862
b1d66b54 1863 long m_CodePage;
1cd52418 1864};
e95354ec
VZ
1865
1866#endif // wxHAVE_WIN32_MB2WC
1867
f7e98dee
RN
1868// ============================================================================
1869// Cocoa conversion classes
1870// ============================================================================
1871
1872#if defined(__WXCOCOA__)
1873
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1877
1878#include <CoreFoundation/CFString.h>
1879#include <CoreFoundation/CFStringEncodingExt.h>
1880
1881CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 1882{
638357a0 1883 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ecd9653b
WS
1884 if ( encoding == wxFONTENCODING_DEFAULT )
1885 {
638357a0 1886 enc = CFStringGetSystemEncoding();
ecd9653b
WS
1887 }
1888 else switch( encoding)
1889 {
1890 case wxFONTENCODING_ISO8859_1 :
1891 enc = kCFStringEncodingISOLatin1 ;
1892 break ;
1893 case wxFONTENCODING_ISO8859_2 :
1894 enc = kCFStringEncodingISOLatin2;
1895 break ;
1896 case wxFONTENCODING_ISO8859_3 :
1897 enc = kCFStringEncodingISOLatin3 ;
1898 break ;
1899 case wxFONTENCODING_ISO8859_4 :
1900 enc = kCFStringEncodingISOLatin4;
1901 break ;
1902 case wxFONTENCODING_ISO8859_5 :
1903 enc = kCFStringEncodingISOLatinCyrillic;
1904 break ;
1905 case wxFONTENCODING_ISO8859_6 :
1906 enc = kCFStringEncodingISOLatinArabic;
1907 break ;
1908 case wxFONTENCODING_ISO8859_7 :
1909 enc = kCFStringEncodingISOLatinGreek;
1910 break ;
1911 case wxFONTENCODING_ISO8859_8 :
1912 enc = kCFStringEncodingISOLatinHebrew;
1913 break ;
1914 case wxFONTENCODING_ISO8859_9 :
1915 enc = kCFStringEncodingISOLatin5;
1916 break ;
1917 case wxFONTENCODING_ISO8859_10 :
1918 enc = kCFStringEncodingISOLatin6;
1919 break ;
1920 case wxFONTENCODING_ISO8859_11 :
1921 enc = kCFStringEncodingISOLatinThai;
1922 break ;
1923 case wxFONTENCODING_ISO8859_13 :
1924 enc = kCFStringEncodingISOLatin7;
1925 break ;
1926 case wxFONTENCODING_ISO8859_14 :
1927 enc = kCFStringEncodingISOLatin8;
1928 break ;
1929 case wxFONTENCODING_ISO8859_15 :
1930 enc = kCFStringEncodingISOLatin9;
1931 break ;
1932
1933 case wxFONTENCODING_KOI8 :
1934 enc = kCFStringEncodingKOI8_R;
1935 break ;
1936 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1937 enc = kCFStringEncodingDOSRussian;
1938 break ;
1939
1940// case wxFONTENCODING_BULGARIAN :
1941// enc = ;
1942// break ;
1943
1944 case wxFONTENCODING_CP437 :
1945 enc =kCFStringEncodingDOSLatinUS ;
1946 break ;
1947 case wxFONTENCODING_CP850 :
1948 enc = kCFStringEncodingDOSLatin1;
1949 break ;
1950 case wxFONTENCODING_CP852 :
1951 enc = kCFStringEncodingDOSLatin2;
1952 break ;
1953 case wxFONTENCODING_CP855 :
1954 enc = kCFStringEncodingDOSCyrillic;
1955 break ;
1956 case wxFONTENCODING_CP866 :
1957 enc =kCFStringEncodingDOSRussian ;
1958 break ;
1959 case wxFONTENCODING_CP874 :
1960 enc = kCFStringEncodingDOSThai;
1961 break ;
1962 case wxFONTENCODING_CP932 :
1963 enc = kCFStringEncodingDOSJapanese;
1964 break ;
1965 case wxFONTENCODING_CP936 :
1966 enc =kCFStringEncodingDOSChineseSimplif ;
1967 break ;
1968 case wxFONTENCODING_CP949 :
1969 enc = kCFStringEncodingDOSKorean;
1970 break ;
1971 case wxFONTENCODING_CP950 :
1972 enc = kCFStringEncodingDOSChineseTrad;
1973 break ;
ecd9653b
WS
1974 case wxFONTENCODING_CP1250 :
1975 enc = kCFStringEncodingWindowsLatin2;
1976 break ;
1977 case wxFONTENCODING_CP1251 :
1978 enc =kCFStringEncodingWindowsCyrillic ;
1979 break ;
1980 case wxFONTENCODING_CP1252 :
1981 enc =kCFStringEncodingWindowsLatin1 ;
1982 break ;
1983 case wxFONTENCODING_CP1253 :
1984 enc = kCFStringEncodingWindowsGreek;
1985 break ;
1986 case wxFONTENCODING_CP1254 :
1987 enc = kCFStringEncodingWindowsLatin5;
1988 break ;
1989 case wxFONTENCODING_CP1255 :
1990 enc =kCFStringEncodingWindowsHebrew ;
1991 break ;
1992 case wxFONTENCODING_CP1256 :
1993 enc =kCFStringEncodingWindowsArabic ;
1994 break ;
1995 case wxFONTENCODING_CP1257 :
1996 enc = kCFStringEncodingWindowsBalticRim;
1997 break ;
638357a0
RN
1998// This only really encodes to UTF7 (if that) evidently
1999// case wxFONTENCODING_UTF7 :
2000// enc = kCFStringEncodingNonLossyASCII ;
2001// break ;
ecd9653b
WS
2002 case wxFONTENCODING_UTF8 :
2003 enc = kCFStringEncodingUTF8 ;
2004 break ;
2005 case wxFONTENCODING_EUC_JP :
2006 enc = kCFStringEncodingEUC_JP;
2007 break ;
2008 case wxFONTENCODING_UTF16 :
f7e98dee 2009 enc = kCFStringEncodingUnicode ;
ecd9653b 2010 break ;
f7e98dee
RN
2011 case wxFONTENCODING_MACROMAN :
2012 enc = kCFStringEncodingMacRoman ;
2013 break ;
2014 case wxFONTENCODING_MACJAPANESE :
2015 enc = kCFStringEncodingMacJapanese ;
2016 break ;
2017 case wxFONTENCODING_MACCHINESETRAD :
2018 enc = kCFStringEncodingMacChineseTrad ;
2019 break ;
2020 case wxFONTENCODING_MACKOREAN :
2021 enc = kCFStringEncodingMacKorean ;
2022 break ;
2023 case wxFONTENCODING_MACARABIC :
2024 enc = kCFStringEncodingMacArabic ;
2025 break ;
2026 case wxFONTENCODING_MACHEBREW :
2027 enc = kCFStringEncodingMacHebrew ;
2028 break ;
2029 case wxFONTENCODING_MACGREEK :
2030 enc = kCFStringEncodingMacGreek ;
2031 break ;
2032 case wxFONTENCODING_MACCYRILLIC :
2033 enc = kCFStringEncodingMacCyrillic ;
2034 break ;
2035 case wxFONTENCODING_MACDEVANAGARI :
2036 enc = kCFStringEncodingMacDevanagari ;
2037 break ;
2038 case wxFONTENCODING_MACGURMUKHI :
2039 enc = kCFStringEncodingMacGurmukhi ;
2040 break ;
2041 case wxFONTENCODING_MACGUJARATI :
2042 enc = kCFStringEncodingMacGujarati ;
2043 break ;
2044 case wxFONTENCODING_MACORIYA :
2045 enc = kCFStringEncodingMacOriya ;
2046 break ;
2047 case wxFONTENCODING_MACBENGALI :
2048 enc = kCFStringEncodingMacBengali ;
2049 break ;
2050 case wxFONTENCODING_MACTAMIL :
2051 enc = kCFStringEncodingMacTamil ;
2052 break ;
2053 case wxFONTENCODING_MACTELUGU :
2054 enc = kCFStringEncodingMacTelugu ;
2055 break ;
2056 case wxFONTENCODING_MACKANNADA :
2057 enc = kCFStringEncodingMacKannada ;
2058 break ;
2059 case wxFONTENCODING_MACMALAJALAM :
2060 enc = kCFStringEncodingMacMalayalam ;
2061 break ;
2062 case wxFONTENCODING_MACSINHALESE :
2063 enc = kCFStringEncodingMacSinhalese ;
2064 break ;
2065 case wxFONTENCODING_MACBURMESE :
2066 enc = kCFStringEncodingMacBurmese ;
2067 break ;
2068 case wxFONTENCODING_MACKHMER :
2069 enc = kCFStringEncodingMacKhmer ;
2070 break ;
2071 case wxFONTENCODING_MACTHAI :
2072 enc = kCFStringEncodingMacThai ;
2073 break ;
2074 case wxFONTENCODING_MACLAOTIAN :
2075 enc = kCFStringEncodingMacLaotian ;
2076 break ;
2077 case wxFONTENCODING_MACGEORGIAN :
2078 enc = kCFStringEncodingMacGeorgian ;
2079 break ;
2080 case wxFONTENCODING_MACARMENIAN :
2081 enc = kCFStringEncodingMacArmenian ;
2082 break ;
2083 case wxFONTENCODING_MACCHINESESIMP :
2084 enc = kCFStringEncodingMacChineseSimp ;
2085 break ;
2086 case wxFONTENCODING_MACTIBETAN :
2087 enc = kCFStringEncodingMacTibetan ;
2088 break ;
2089 case wxFONTENCODING_MACMONGOLIAN :
2090 enc = kCFStringEncodingMacMongolian ;
2091 break ;
2092 case wxFONTENCODING_MACETHIOPIC :
2093 enc = kCFStringEncodingMacEthiopic ;
2094 break ;
2095 case wxFONTENCODING_MACCENTRALEUR :
2096 enc = kCFStringEncodingMacCentralEurRoman ;
2097 break ;
2098 case wxFONTENCODING_MACVIATNAMESE :
2099 enc = kCFStringEncodingMacVietnamese ;
2100 break ;
2101 case wxFONTENCODING_MACARABICEXT :
2102 enc = kCFStringEncodingMacExtArabic ;
2103 break ;
2104 case wxFONTENCODING_MACSYMBOL :
2105 enc = kCFStringEncodingMacSymbol ;
2106 break ;
2107 case wxFONTENCODING_MACDINGBATS :
2108 enc = kCFStringEncodingMacDingbats ;
2109 break ;
2110 case wxFONTENCODING_MACTURKISH :
2111 enc = kCFStringEncodingMacTurkish ;
2112 break ;
2113 case wxFONTENCODING_MACCROATIAN :
2114 enc = kCFStringEncodingMacCroatian ;
2115 break ;
2116 case wxFONTENCODING_MACICELANDIC :
2117 enc = kCFStringEncodingMacIcelandic ;
2118 break ;
2119 case wxFONTENCODING_MACROMANIAN :
2120 enc = kCFStringEncodingMacRomanian ;
2121 break ;
2122 case wxFONTENCODING_MACCELTIC :
2123 enc = kCFStringEncodingMacCeltic ;
2124 break ;
2125 case wxFONTENCODING_MACGAELIC :
2126 enc = kCFStringEncodingMacGaelic ;
2127 break ;
ecd9653b
WS
2128// case wxFONTENCODING_MACKEYBOARD :
2129// enc = kCFStringEncodingMacKeyboardGlyphs ;
2130// break ;
2131 default :
2132 // because gcc is picky
2133 break ;
2134 } ;
2135 return enc ;
f7e98dee
RN
2136}
2137
f7e98dee
RN
2138class wxMBConv_cocoa : public wxMBConv
2139{
2140public:
2141 wxMBConv_cocoa()
2142 {
2143 Init(CFStringGetSystemEncoding()) ;
2144 }
2145
a6900d10 2146#if wxUSE_FONTMAP
f7e98dee
RN
2147 wxMBConv_cocoa(const wxChar* name)
2148 {
267e11c5 2149 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
f7e98dee 2150 }
a6900d10 2151#endif
f7e98dee
RN
2152
2153 wxMBConv_cocoa(wxFontEncoding encoding)
2154 {
2155 Init( wxCFStringEncFromFontEnc(encoding) );
2156 }
2157
2158 ~wxMBConv_cocoa()
2159 {
2160 }
2161
2162 void Init( CFStringEncoding encoding)
2163 {
638357a0 2164 m_encoding = encoding ;
f7e98dee
RN
2165 }
2166
2167 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2168 {
2169 wxASSERT(szUnConv);
ecd9653b 2170
638357a0
RN
2171 CFStringRef theString = CFStringCreateWithBytes (
2172 NULL, //the allocator
2173 (const UInt8*)szUnConv,
2174 strlen(szUnConv),
2175 m_encoding,
2176 false //no BOM/external representation
f7e98dee
RN
2177 );
2178
2179 wxASSERT(theString);
2180
638357a0
RN
2181 size_t nOutLength = CFStringGetLength(theString);
2182
2183 if (szOut == NULL)
f7e98dee 2184 {
f7e98dee 2185 CFRelease(theString);
638357a0 2186 return nOutLength;
f7e98dee 2187 }
ecd9653b 2188
638357a0 2189 CFRange theRange = { 0, nOutSize };
ecd9653b 2190
638357a0
RN
2191#if SIZEOF_WCHAR_T == 4
2192 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2193#endif
3698ae71 2194
f7e98dee 2195 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
3698ae71 2196
f7e98dee 2197 CFRelease(theString);
ecd9653b 2198
638357a0 2199 szUniCharBuffer[nOutLength] = '\0' ;
f7e98dee
RN
2200
2201#if SIZEOF_WCHAR_T == 4
2202 wxMBConvUTF16 converter ;
638357a0 2203 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
f7e98dee
RN
2204 delete[] szUniCharBuffer;
2205#endif
3698ae71 2206
638357a0 2207 return nOutLength;
f7e98dee
RN
2208 }
2209
2210 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2211 {
638357a0 2212 wxASSERT(szUnConv);
3698ae71 2213
f7e98dee 2214 size_t nRealOutSize;
638357a0 2215 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 2216 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 2217
f7e98dee 2218#if SIZEOF_WCHAR_T == 4
d9d488cf 2219 wxMBConvUTF16 converter ;
f7e98dee
RN
2220 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2221 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2222 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2223 nBufSize /= sizeof(UniChar);
f7e98dee
RN
2224#endif
2225
2226 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2227 NULL, //allocator
2228 szUniBuffer,
2229 nBufSize,
638357a0 2230 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 2231 );
ecd9653b 2232
f7e98dee 2233 wxASSERT(theString);
ecd9653b 2234
f7e98dee 2235 //Note that CER puts a BOM when converting to unicode
638357a0
RN
2236 //so we check and use getchars instead in that case
2237 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 2238 {
638357a0
RN
2239 if (szOut != NULL)
2240 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
3698ae71 2241
638357a0
RN
2242 nRealOutSize = CFStringGetLength(theString) + 1;
2243 }
2244 else
2245 {
2246 CFStringGetBytes(
2247 theString,
2248 CFRangeMake(0, CFStringGetLength(theString)),
2249 m_encoding,
2250 0, //what to put in characters that can't be converted -
2251 //0 tells CFString to return NULL if it meets such a character
2252 false, //not an external representation
2253 (UInt8*) szOut,
3698ae71 2254 nOutSize,
638357a0
RN
2255 (CFIndex*) &nRealOutSize
2256 );
f7e98dee 2257 }
ecd9653b 2258
638357a0 2259 CFRelease(theString);
ecd9653b 2260
638357a0
RN
2261#if SIZEOF_WCHAR_T == 4
2262 delete[] szUniBuffer;
2263#endif
ecd9653b 2264
f7e98dee
RN
2265 return nRealOutSize - 1;
2266 }
2267
2268 bool IsOk() const
ecd9653b 2269 {
3698ae71 2270 return m_encoding != kCFStringEncodingInvalidId &&
638357a0 2271 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
2272 }
2273
2274private:
638357a0 2275 CFStringEncoding m_encoding ;
f7e98dee
RN
2276};
2277
2278#endif // defined(__WXCOCOA__)
2279
335d31e0
SC
2280// ============================================================================
2281// Mac conversion classes
2282// ============================================================================
2283
2284#if defined(__WXMAC__) && defined(TARGET_CARBON)
2285
2286class wxMBConv_mac : public wxMBConv
2287{
2288public:
2289 wxMBConv_mac()
2290 {
2291 Init(CFStringGetSystemEncoding()) ;
2292 }
2293
2d1659cf 2294#if wxUSE_FONTMAP
335d31e0
SC
2295 wxMBConv_mac(const wxChar* name)
2296 {
267e11c5 2297 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
335d31e0 2298 }
2d1659cf 2299#endif
335d31e0
SC
2300
2301 wxMBConv_mac(wxFontEncoding encoding)
2302 {
d775fa82
WS
2303 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2304 }
2305
2306 ~wxMBConv_mac()
2307 {
2308 OSStatus status = noErr ;
2309 status = TECDisposeConverter(m_MB2WC_converter);
2310 status = TECDisposeConverter(m_WC2MB_converter);
2311 }
2312
2313
2314 void Init( TextEncodingBase encoding)
2315 {
2316 OSStatus status = noErr ;
2317 m_char_encoding = encoding ;
2318 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2319
2320 status = TECCreateConverter(&m_MB2WC_converter,
2321 m_char_encoding,
2322 m_unicode_encoding);
2323 status = TECCreateConverter(&m_WC2MB_converter,
2324 m_unicode_encoding,
2325 m_char_encoding);
2326 }
2327
335d31e0
SC
2328 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2329 {
d775fa82
WS
2330 OSStatus status = noErr ;
2331 ByteCount byteOutLen ;
2332 ByteCount byteInLen = strlen(psz) ;
2333 wchar_t *tbuf = NULL ;
2334 UniChar* ubuf = NULL ;
2335 size_t res = 0 ;
2336
2337 if (buf == NULL)
2338 {
638357a0 2339 //apple specs say at least 32
c543817b 2340 n = wxMax( 32 , byteInLen ) ;
d775fa82
WS
2341 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2342 }
2343 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
f3a355ce 2344#if SIZEOF_WCHAR_T == 4
d775fa82 2345 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2346#else
d775fa82 2347 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2348#endif
d775fa82
WS
2349 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2350 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
f3a355ce 2351#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2352 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2353 // is not properly terminated we get random characters at the end
2354 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d9d488cf 2355 wxMBConvUTF16 converter ;
d775fa82
WS
2356 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2357 free( ubuf ) ;
f3a355ce 2358#else
d775fa82 2359 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2360#endif
d775fa82
WS
2361 if ( buf == NULL )
2362 free(tbuf) ;
335d31e0 2363
335d31e0
SC
2364 if ( buf && res < n)
2365 buf[res] = 0;
2366
d775fa82 2367 return res ;
335d31e0
SC
2368 }
2369
2370 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2371 {
2372 OSStatus status = noErr ;
2373 ByteCount byteOutLen ;
2374 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2375
2376 char *tbuf = NULL ;
2377
2378 if (buf == NULL)
2379 {
638357a0 2380 //apple specs say at least 32
c543817b 2381 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2382 tbuf = (char*) malloc( n ) ;
2383 }
2384
2385 ByteCount byteBufferLen = n ;
2386 UniChar* ubuf = NULL ;
f3a355ce 2387#if SIZEOF_WCHAR_T == 4
d9d488cf 2388 wxMBConvUTF16 converter ;
d775fa82
WS
2389 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2390 byteInLen = unicharlen ;
2391 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2392 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
f3a355ce 2393#else
d775fa82 2394 ubuf = (UniChar*) psz ;
f3a355ce 2395#endif
d775fa82
WS
2396 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2397 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
f3a355ce 2398#if SIZEOF_WCHAR_T == 4
d775fa82 2399 free( ubuf ) ;
f3a355ce 2400#endif
d775fa82
WS
2401 if ( buf == NULL )
2402 free(tbuf) ;
335d31e0 2403
d775fa82 2404 size_t res = byteOutLen ;
335d31e0 2405 if ( buf && res < n)
638357a0 2406 {
335d31e0 2407 buf[res] = 0;
3698ae71 2408
638357a0
RN
2409 //we need to double-trip to verify it didn't insert any ? in place
2410 //of bogus characters
2411 wxWCharBuffer wcBuf(n);
2412 size_t pszlen = wxWcslen(psz);
2413 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2414 wxWcslen(wcBuf) != pszlen ||
2415 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2416 {
2417 // we didn't obtain the same thing we started from, hence
2418 // the conversion was lossy and we consider that it failed
2419 return (size_t)-1;
2420 }
2421 }
335d31e0 2422
d775fa82 2423 return res ;
335d31e0
SC
2424 }
2425
2426 bool IsOk() const
2427 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2428
2429private:
d775fa82
WS
2430 TECObjectRef m_MB2WC_converter ;
2431 TECObjectRef m_WC2MB_converter ;
2432
2433 TextEncodingBase m_char_encoding ;
2434 TextEncodingBase m_unicode_encoding ;
335d31e0
SC
2435};
2436
2437#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2438
36acb880
VZ
2439// ============================================================================
2440// wxEncodingConverter based conversion classes
2441// ============================================================================
2442
1e6feb95 2443#if wxUSE_FONTMAP
1cd52418 2444
e95354ec 2445class wxMBConv_wxwin : public wxMBConv
1cd52418 2446{
8b04d4c4
VZ
2447private:
2448 void Init()
2449 {
2450 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2451 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2452 }
2453
6001e347 2454public:
f1339c56
RR
2455 // temporarily just use wxEncodingConverter stuff,
2456 // so that it works while a better implementation is built
e95354ec 2457 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2458 {
2459 if (name)
267e11c5 2460 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2461 else
2462 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2463
8b04d4c4
VZ
2464 Init();
2465 }
2466
e95354ec 2467 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2468 {
2469 m_enc = enc;
2470
2471 Init();
f1339c56 2472 }
dccce9ea 2473
bde4baac 2474 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2475 {
2476 size_t inbuf = strlen(psz);
dccce9ea 2477 if (buf)
c643a977
VS
2478 {
2479 if (!m2w.Convert(psz,buf))
2480 return (size_t)-1;
2481 }
f1339c56
RR
2482 return inbuf;
2483 }
dccce9ea 2484
bde4baac 2485 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2486 {
f8d791e0 2487 const size_t inbuf = wxWcslen(psz);
f1339c56 2488 if (buf)
c643a977
VS
2489 {
2490 if (!w2m.Convert(psz,buf))
2491 return (size_t)-1;
2492 }
dccce9ea 2493
f1339c56
RR
2494 return inbuf;
2495 }
dccce9ea 2496
e95354ec 2497 bool IsOk() const { return m_ok; }
f1339c56
RR
2498
2499public:
8b04d4c4 2500 wxFontEncoding m_enc;
f1339c56 2501 wxEncodingConverter m2w, w2m;
cafbf6fb
VZ
2502
2503 // were we initialized successfully?
2504 bool m_ok;
fc7a2a60 2505
e95354ec 2506 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2507};
6001e347 2508
8f115891
MW
2509// make the constructors available for unit testing
2510WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2511{
2512 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2513 if ( !result->IsOk() )
2514 {
2515 delete result;
2516 return 0;
2517 }
2518 return result;
2519}
2520
1e6feb95
VZ
2521#endif // wxUSE_FONTMAP
2522
36acb880
VZ
2523// ============================================================================
2524// wxCSConv implementation
2525// ============================================================================
2526
8b04d4c4 2527void wxCSConv::Init()
6001e347 2528{
e95354ec
VZ
2529 m_name = NULL;
2530 m_convReal = NULL;
2531 m_deferred = true;
2532}
2533
8b04d4c4
VZ
2534wxCSConv::wxCSConv(const wxChar *charset)
2535{
2536 Init();
82713003 2537
e95354ec
VZ
2538 if ( charset )
2539 {
e95354ec
VZ
2540 SetName(charset);
2541 }
bda3d86a 2542
e4277538
VZ
2543#if wxUSE_FONTMAP
2544 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2545#else
bda3d86a 2546 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2547#endif
6001e347
RR
2548}
2549
8b04d4c4
VZ
2550wxCSConv::wxCSConv(wxFontEncoding encoding)
2551{
bda3d86a 2552 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2553 {
2554 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2555
2556 encoding = wxFONTENCODING_SYSTEM;
2557 }
2558
8b04d4c4
VZ
2559 Init();
2560
bda3d86a 2561 m_encoding = encoding;
8b04d4c4
VZ
2562}
2563
6001e347
RR
2564wxCSConv::~wxCSConv()
2565{
65e50848
JS
2566 Clear();
2567}
2568
54380f29 2569wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2570 : wxMBConv()
54380f29 2571{
8b04d4c4
VZ
2572 Init();
2573
54380f29 2574 SetName(conv.m_name);
8b04d4c4 2575 m_encoding = conv.m_encoding;
54380f29
GD
2576}
2577
2578wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2579{
2580 Clear();
8b04d4c4 2581
54380f29 2582 SetName(conv.m_name);
8b04d4c4
VZ
2583 m_encoding = conv.m_encoding;
2584
54380f29
GD
2585 return *this;
2586}
2587
65e50848
JS
2588void wxCSConv::Clear()
2589{
8b04d4c4 2590 free(m_name);
e95354ec 2591 delete m_convReal;
8b04d4c4 2592
65e50848 2593 m_name = NULL;
e95354ec 2594 m_convReal = NULL;
6001e347
RR
2595}
2596
2597void wxCSConv::SetName(const wxChar *charset)
2598{
f1339c56
RR
2599 if (charset)
2600 {
2601 m_name = wxStrdup(charset);
e95354ec 2602 m_deferred = true;
f1339c56 2603 }
6001e347
RR
2604}
2605
8b3eb85d
VZ
#if wxUSE_FONTMAP

#include "wx/hashmap.h"

// map from a font encoding to a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString,
                     wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// file-wide cache of charset names, keyed by encoding
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2614
e95354ec
VZ
2615wxMBConv *wxCSConv::DoCreate() const
2616{
ce6f8d6f
VZ
2617#if wxUSE_FONTMAP
2618 wxLogTrace(TRACE_STRCONV,
2619 wxT("creating conversion for %s"),
2620 (m_name ? m_name
2621 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2622#endif // wxUSE_FONTMAP
2623
c547282d
VZ
2624 // check for the special case of ASCII or ISO8859-1 charset: as we have
2625 // special knowledge of it anyhow, we don't need to create a special
2626 // conversion object
e4277538
VZ
2627 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2628 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2629 {
e95354ec
VZ
2630 // don't convert at all
2631 return NULL;
2632 }
dccce9ea 2633
e95354ec
VZ
2634 // we trust OS to do conversion better than we can so try external
2635 // conversion methods first
2636 //
2637 // the full order is:
2638 // 1. OS conversion (iconv() under Unix or Win32 API)
2639 // 2. hard coded conversions for UTF
2640 // 3. wxEncodingConverter as fall back
2641
2642 // step (1)
2643#ifdef HAVE_ICONV
c547282d 2644#if !wxUSE_FONTMAP
e95354ec 2645 if ( m_name )
c547282d 2646#endif // !wxUSE_FONTMAP
e95354ec 2647 {
c547282d 2648 wxString name(m_name);
8b3eb85d
VZ
2649 wxFontEncoding encoding(m_encoding);
2650
2651 if ( !name.empty() )
2652 {
2653 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2654 if ( conv->IsOk() )
2655 return conv;
2656
2657 delete conv;
c547282d
VZ
2658
2659#if wxUSE_FONTMAP
8b3eb85d
VZ
2660 encoding =
2661 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
c547282d 2662#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2663 }
2664#if wxUSE_FONTMAP
2665 {
2666 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2667 if ( it != gs_nameCache.end() )
2668 {
2669 if ( it->second.empty() )
2670 return NULL;
c547282d 2671
8b3eb85d
VZ
2672 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2673 if ( conv->IsOk() )
2674 return conv;
e95354ec 2675
8b3eb85d
VZ
2676 delete conv;
2677 }
2678
2679 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2680
2681 for ( ; *names; ++names )
2682 {
2683 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2684 if ( conv->IsOk() )
2685 {
2686 gs_nameCache[encoding] = *names;
2687 return conv;
2688 }
2689
2690 delete conv;
2691 }
2692
40711af8 2693 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d
VZ
2694 }
2695#endif // wxUSE_FONTMAP
e95354ec
VZ
2696 }
2697#endif // HAVE_ICONV
2698
2699#ifdef wxHAVE_WIN32_MB2WC
2700 {
7608a683 2701#if wxUSE_FONTMAP
e95354ec
VZ
2702 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2703 : new wxMBConv_win32(m_encoding);
2704 if ( conv->IsOk() )
2705 return conv;
2706
2707 delete conv;
7608a683
WS
2708#else
2709 return NULL;
2710#endif
e95354ec
VZ
2711 }
2712#endif // wxHAVE_WIN32_MB2WC
d775fa82
WS
2713#if defined(__WXMAC__)
2714 {
5c3c8676 2715 // leave UTF16 and UTF32 to the built-ins of wx
3698ae71 2716 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
5c3c8676 2717 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
d775fa82
WS
2718 {
2719
2d1659cf 2720#if wxUSE_FONTMAP
d775fa82
WS
2721 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2722 : new wxMBConv_mac(m_encoding);
2d1659cf
RN
2723#else
2724 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2725#endif
d775fa82 2726 if ( conv->IsOk() )
f7e98dee
RN
2727 return conv;
2728
2729 delete conv;
2730 }
2731 }
2732#endif
2733#if defined(__WXCOCOA__)
2734 {
2735 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2736 {
2737
a6900d10 2738#if wxUSE_FONTMAP
f7e98dee
RN
2739 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2740 : new wxMBConv_cocoa(m_encoding);
a6900d10
RN
2741#else
2742 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2743#endif
f7e98dee 2744 if ( conv->IsOk() )
d775fa82
WS
2745 return conv;
2746
2747 delete conv;
2748 }
335d31e0
SC
2749 }
2750#endif
e95354ec
VZ
2751 // step (2)
2752 wxFontEncoding enc = m_encoding;
2753#if wxUSE_FONTMAP
c547282d
VZ
2754 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2755 {
2756 // use "false" to suppress interactive dialogs -- we can be called from
2757 // anywhere and popping up a dialog from here is the last thing we want to
2758 // do
267e11c5 2759 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2760 }
e95354ec
VZ
2761#endif // wxUSE_FONTMAP
2762
2763 switch ( enc )
2764 {
2765 case wxFONTENCODING_UTF7:
2766 return new wxMBConvUTF7;
2767
2768 case wxFONTENCODING_UTF8:
2769 return new wxMBConvUTF8;
2770
e95354ec
VZ
2771 case wxFONTENCODING_UTF16BE:
2772 return new wxMBConvUTF16BE;
2773
2774 case wxFONTENCODING_UTF16LE:
2775 return new wxMBConvUTF16LE;
2776
e95354ec
VZ
2777 case wxFONTENCODING_UTF32BE:
2778 return new wxMBConvUTF32BE;
2779
2780 case wxFONTENCODING_UTF32LE:
2781 return new wxMBConvUTF32LE;
2782
2783 default:
2784 // nothing to do but put here to suppress gcc warnings
2785 ;
2786 }
2787
2788 // step (3)
2789#if wxUSE_FONTMAP
2790 {
2791 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2792 : new wxMBConv_wxwin(m_encoding);
2793 if ( conv->IsOk() )
2794 return conv;
2795
2796 delete conv;
2797 }
2798#endif // wxUSE_FONTMAP
2799
a58d4f4d
VS
2800 // NB: This is a hack to prevent deadlock. What could otherwise happen
2801 // in Unicode build: wxConvLocal creation ends up being here
2802 // because of some failure and logs the error. But wxLog will try to
2803 // attach timestamp, for which it will need wxConvLocal (to convert
2804 // time to char* and then wchar_t*), but that fails, tries to log
2805 // error, but wxLog has a (already locked) critical section that
2806 // guards static buffer.
2807 static bool alreadyLoggingError = false;
2808 if (!alreadyLoggingError)
2809 {
2810 alreadyLoggingError = true;
2811 wxLogError(_("Cannot convert from the charset '%s'!"),
2812 m_name ? m_name
e95354ec
VZ
2813 :
2814#if wxUSE_FONTMAP
267e11c5 2815 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
e95354ec
VZ
2816#else // !wxUSE_FONTMAP
2817 wxString::Format(_("encoding %s"), m_encoding).c_str()
2818#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2819 );
a58d4f4d
VS
2820 alreadyLoggingError = false;
2821 }
e95354ec
VZ
2822
2823 return NULL;
2824}
2825
2826void wxCSConv::CreateConvIfNeeded() const
2827{
2828 if ( m_deferred )
2829 {
2830 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
2831
2832#if wxUSE_INTL
2833 // if we don't have neither the name nor the encoding, use the default
2834 // encoding for this system
2835 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2836 {
4d312c22 2837 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
2838 }
2839#endif // wxUSE_INTL
2840
e95354ec
VZ
2841 self->m_convReal = DoCreate();
2842 self->m_deferred = false;
6001e347 2843 }
6001e347
RR
2844}
2845
2846size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2847{
e95354ec 2848 CreateConvIfNeeded();
dccce9ea 2849
e95354ec
VZ
2850 if (m_convReal)
2851 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
2852
2853 // latin-1 (direct)
4def3b35 2854 size_t len = strlen(psz);
dccce9ea 2855
f1339c56
RR
2856 if (buf)
2857 {
4def3b35 2858 for (size_t c = 0; c <= len; c++)
f1339c56
RR
2859 buf[c] = (unsigned char)(psz[c]);
2860 }
dccce9ea 2861
f1339c56 2862 return len;
6001e347
RR
2863}
2864
2865size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2866{
e95354ec 2867 CreateConvIfNeeded();
dccce9ea 2868
e95354ec
VZ
2869 if (m_convReal)
2870 return m_convReal->WC2MB(buf, psz, n);
1cd52418 2871
f1339c56 2872 // latin-1 (direct)
f8d791e0 2873 const size_t len = wxWcslen(psz);
f1339c56
RR
2874 if (buf)
2875 {
4def3b35 2876 for (size_t c = 0; c <= len; c++)
24642831
VS
2877 {
2878 if (psz[c] > 0xFF)
2879 return (size_t)-1;
907173e5 2880 buf[c] = (char)psz[c];
24642831
VS
2881 }
2882 }
2883 else
2884 {
2885 for (size_t c = 0; c <= len; c++)
2886 {
2887 if (psz[c] > 0xFF)
2888 return (size_t)-1;
2889 }
f1339c56 2890 }
dccce9ea 2891
f1339c56 2892 return len;
6001e347
RR
2893}
2894
bde4baac
VZ
2895// ----------------------------------------------------------------------------
2896// globals
2897// ----------------------------------------------------------------------------
2898
2899#ifdef __WINDOWS__
2900 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
2901#elif defined(__WXMAC__) && !defined(__MACH__)
2902 static wxMBConv_mac wxConvLibcObj ;
bde4baac 2903#else
dcc8fac0 2904 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
2905#endif
2906
2907static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2908static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2909static wxMBConvUTF7 wxConvUTF7Obj;
2910static wxMBConvUTF8 wxConvUTF8Obj;
c12b7f79 2911
bde4baac
VZ
2912WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2913WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2914WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2915WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2916WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2917WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
f5a1953b
VZ
2918WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2919#ifdef __WXOSX__
ea8ce907 2920 wxConvUTF8Obj;
f5a1953b 2921#else
ea8ce907 2922 wxConvLibcObj;
f5a1953b
VZ
2923#endif
2924
bde4baac
VZ
2925
2926#else // !wxUSE_WCHAR_T
2927
2928// stand-ins in absence of wchar_t
2929WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2930 wxConvISO8859_1,
2931 wxConvLocal,
2932 wxConvUTF8;
2933
2934#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T