]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
made DoTestConversion() work with strings containing NULs
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
f6bcfd97
BP
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
6001e347
RR
23// For compilers that support precompilation, includes "wx.h".
24#include "wx/wxprec.h"
25
26#ifdef __BORLANDC__
27 #pragma hdrstop
28#endif
29
373658eb
VZ
30#ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33#endif // WX_PRECOMP
34
bde4baac
VZ
35#include "wx/strconv.h"
36
37#if wxUSE_WCHAR_T
38
7608a683 39#ifdef __WINDOWS__
532d575b 40 #include "wx/msw/private.h"
13dd924a 41 #include "wx/msw/missing.h"
0a1c1e62
GRG
42#endif
43
1c193821 44#ifndef __WXWINCE__
1cd52418 45#include <errno.h>
1c193821
JS
46#endif
47
6001e347
RR
48#include <ctype.h>
49#include <string.h>
50#include <stdlib.h>
51
e95354ec
VZ
52#if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54#endif // __WIN32__ but !__WXMICROWIN__
55
6001e347 56#ifdef __SALFORDC__
373658eb 57 #include <clib.h>
6001e347
RR
58#endif
59
b040e242 60#ifdef HAVE_ICONV
373658eb 61 #include <iconv.h>
b1d547eb 62 #include "wx/thread.h"
1cd52418 63#endif
1cd52418 64
373658eb
VZ
65#include "wx/encconv.h"
66#include "wx/fontmap.h"
7608a683 67#include "wx/utils.h"
373658eb 68
335d31e0 69#ifdef __WXMAC__
40ba2f3b 70#ifndef __DARWIN__
4227afa4
SC
71#include <ATSUnicode.h>
72#include <TextCommon.h>
73#include <TextEncodingConverter.h>
40ba2f3b 74#endif
335d31e0
SC
75
76#include "wx/mac/private.h" // includes mac headers
77#endif
ce6f8d6f
VZ
78
79#define TRACE_STRCONV _T("strconv")
80
4948c2b6 81#if SIZEOF_WCHAR_T == 2
ac11db3a
MW
82 #define WC_UTF16
83#endif
84
373658eb
VZ
85// ============================================================================
86// implementation
87// ============================================================================
88
89// ----------------------------------------------------------------------------
c91830cb 90// UTF-16 en/decoding to/from UCS-4
373658eb 91// ----------------------------------------------------------------------------
6001e347 92
b0a6bb75 93
c91830cb 94static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 95{
dccce9ea 96 if (input<=0xffff)
4def3b35 97 {
999836aa
VZ
98 if (output)
99 *output = (wxUint16) input;
4def3b35 100 return 1;
dccce9ea
VZ
101 }
102 else if (input>=0x110000)
4def3b35
VS
103 {
104 return (size_t)-1;
dccce9ea
VZ
105 }
106 else
4def3b35 107 {
dccce9ea 108 if (output)
4def3b35 109 {
c91830cb 110 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
999836aa 111 *output = (wxUint16) ((input&0x3ff)+0xdc00);
4def3b35
VS
112 }
113 return 2;
1cd52418 114 }
1cd52418
OK
115}
116
c91830cb 117static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 118{
dccce9ea 119 if ((*input<0xd800) || (*input>0xdfff))
4def3b35
VS
120 {
121 output = *input;
122 return 1;
dccce9ea 123 }
cdb14ecb 124 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
4def3b35
VS
125 {
126 output = *input;
127 return (size_t)-1;
dccce9ea
VZ
128 }
129 else
4def3b35
VS
130 {
131 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
132 return 2;
133 }
1cd52418
OK
134}
135
b0a6bb75 136
f6bcfd97 137// ----------------------------------------------------------------------------
6001e347 138// wxMBConv
f6bcfd97 139// ----------------------------------------------------------------------------
2c53a80a
WS
140
141wxMBConv::~wxMBConv()
142{
143 // nothing to do here (necessary for Darwin linking probably)
144}
6001e347 145
6001e347
RR
146const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
147{
2b5f62a0 148 if ( psz )
6001e347 149 {
2b5f62a0
VZ
150 // calculate the length of the buffer needed first
151 size_t nLen = MB2WC(NULL, psz, 0);
152 if ( nLen != (size_t)-1 )
153 {
154 // now do the actual conversion
155 wxWCharBuffer buf(nLen);
635f33ce
VS
156 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
157 if ( nLen != (size_t)-1 )
158 {
159 return buf;
160 }
2b5f62a0 161 }
f6bcfd97 162 }
2b5f62a0
VZ
163
164 wxWCharBuffer buf((wchar_t *)NULL);
165
166 return buf;
6001e347
RR
167}
168
e5cceba0 169const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
6001e347 170{
2b5f62a0
VZ
171 if ( pwz )
172 {
173 size_t nLen = WC2MB(NULL, pwz, 0);
174 if ( nLen != (size_t)-1 )
175 {
c91830cb 176 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
635f33ce
VS
177 nLen = WC2MB(buf.data(), pwz, nLen + 4);
178 if ( nLen != (size_t)-1 )
179 {
180 return buf;
181 }
2b5f62a0
VZ
182 }
183 }
184
185 wxCharBuffer buf((char *)NULL);
e5cceba0 186
e5cceba0 187 return buf;
6001e347
RR
188}
189
f5fb6871 190const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
e4e3bbb4 191{
f5fb6871
RN
192 wxASSERT(pOutSize != NULL);
193
e4e3bbb4
RN
194 const char* szEnd = szString + nStringLen + 1;
195 const char* szPos = szString;
196 const char* szStart = szPos;
197
198 size_t nActualLength = 0;
f5fb6871
RN
199 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
200
201 wxWCharBuffer theBuffer(nCurrentSize);
e4e3bbb4
RN
202
203 //Convert the string until the length() is reached, continuing the
204 //loop every time a null character is reached
205 while(szPos != szEnd)
206 {
207 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
208
209 //Get the length of the current (sub)string
210 size_t nLen = MB2WC(NULL, szPos, 0);
211
212 //Invalid conversion?
213 if( nLen == (size_t)-1 )
f5fb6871
RN
214 {
215 *pOutSize = 0;
216 theBuffer.data()[0u] = wxT('\0');
217 return theBuffer;
218 }
219
e4e3bbb4
RN
220
221 //Increase the actual length (+1 for current null character)
222 nActualLength += nLen + 1;
223
f5fb6871
RN
224 //if buffer too big, realloc the buffer
225 if (nActualLength > (nCurrentSize+1))
226 {
227 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
228 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
229 theBuffer = theNewBuffer;
230 nCurrentSize <<= 1;
231 }
232
233 //Convert the current (sub)string
234 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
e4e3bbb4 235 {
f5fb6871
RN
236 *pOutSize = 0;
237 theBuffer.data()[0u] = wxT('\0');
238 return theBuffer;
e4e3bbb4
RN
239 }
240
241 //Increment to next (sub)string
3103e8a9
JS
242 //Note that we have to use strlen instead of nLen here
243 //because XX2XX gives us the size of the output buffer,
244 //which is not necessarily the length of the string
e4e3bbb4
RN
245 szPos += strlen(szPos) + 1;
246 }
247
f5fb6871
RN
248 //success - return actual length and the buffer
249 *pOutSize = nActualLength;
3698ae71 250 return theBuffer;
e4e3bbb4
RN
251}
252
f5fb6871 253const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
e4e3bbb4 254{
f5fb6871
RN
255 wxASSERT(pOutSize != NULL);
256
e4e3bbb4
RN
257 const wchar_t* szEnd = szString + nStringLen + 1;
258 const wchar_t* szPos = szString;
259 const wchar_t* szStart = szPos;
260
261 size_t nActualLength = 0;
f5fb6871
RN
262 size_t nCurrentSize = nStringLen << 2; //try * 4 first
263
264 wxCharBuffer theBuffer(nCurrentSize);
e4e3bbb4
RN
265
266 //Convert the string until the length() is reached, continuing the
267 //loop every time a null character is reached
268 while(szPos != szEnd)
269 {
270 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
271
272 //Get the length of the current (sub)string
273 size_t nLen = WC2MB(NULL, szPos, 0);
274
275 //Invalid conversion?
276 if( nLen == (size_t)-1 )
f5fb6871
RN
277 {
278 *pOutSize = 0;
279 theBuffer.data()[0u] = wxT('\0');
280 return theBuffer;
281 }
e4e3bbb4
RN
282
283 //Increase the actual length (+1 for current null character)
284 nActualLength += nLen + 1;
3698ae71 285
f5fb6871
RN
286 //if buffer too big, realloc the buffer
287 if (nActualLength > (nCurrentSize+1))
288 {
289 wxCharBuffer theNewBuffer(nCurrentSize << 1);
290 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
291 theBuffer = theNewBuffer;
292 nCurrentSize <<= 1;
293 }
294
295 //Convert the current (sub)string
296 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
e4e3bbb4 297 {
f5fb6871
RN
298 *pOutSize = 0;
299 theBuffer.data()[0u] = wxT('\0');
300 return theBuffer;
e4e3bbb4
RN
301 }
302
303 //Increment to next (sub)string
3103e8a9
JS
304 //Note that we have to use wxWcslen instead of nLen here
305 //because XX2XX gives us the size of the output buffer,
306 //which is not necessarily the length of the string
e4e3bbb4
RN
307 szPos += wxWcslen(szPos) + 1;
308 }
309
f5fb6871
RN
310 //success - return actual length and the buffer
311 *pOutSize = nActualLength;
3698ae71 312 return theBuffer;
e4e3bbb4
RN
313}
314
6001e347 315// ----------------------------------------------------------------------------
bde4baac 316// wxMBConvLibc
6001e347
RR
317// ----------------------------------------------------------------------------
318
bde4baac
VZ
319size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
320{
321 return wxMB2WC(buf, psz, n);
322}
323
324size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
325{
326 return wxWC2MB(buf, psz, n);
327}
e1bfe89e 328
66bf0099 329#ifdef __UNIX__
c12b7f79 330
e1bfe89e 331// ----------------------------------------------------------------------------
532d575b 332// wxConvBrokenFileNames
e1bfe89e
RR
333// ----------------------------------------------------------------------------
334
845905d5 335wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
ea8ce907 336{
845905d5
MW
337 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
338 || wxStricmp(charset, _T("UTF8")) == 0 )
339 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
340 else
341 m_conv = new wxCSConv(charset);
ea8ce907
RR
342}
343
c12b7f79
VZ
344size_t
345wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
346 const char *psz,
347 size_t outputSize) const
e1bfe89e 348{
c12b7f79 349 return m_conv->MB2WC( outputBuf, psz, outputSize );
e1bfe89e
RR
350}
351
c12b7f79
VZ
352size_t
353wxConvBrokenFileNames::WC2MB(char *outputBuf,
354 const wchar_t *psz,
355 size_t outputSize) const
e1bfe89e 356{
c12b7f79 357 return m_conv->WC2MB( outputBuf, psz, outputSize );
e1bfe89e
RR
358}
359
66bf0099 360#endif
c12b7f79 361
bde4baac 362// ----------------------------------------------------------------------------
3698ae71 363// UTF-7
bde4baac 364// ----------------------------------------------------------------------------
6001e347 365
15f2ee32 366// Implementation (C) 2004 Fredrik Roubert
6001e347 367
15f2ee32
RN
//
// BASE64 decoding table
//
// Indexed by the (unsigned) input byte: gives the 6 bit value of the BASE64
// digits 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/' and 0xff for any other byte.
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x3f: '+', '/' and the digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
406
407size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
408{
15f2ee32
RN
409 size_t len = 0;
410
04a37834 411 while ( *psz && (!buf || (len < n)) )
15f2ee32
RN
412 {
413 unsigned char cc = *psz++;
414 if (cc != '+')
415 {
416 // plain ASCII char
417 if (buf)
418 *buf++ = cc;
419 len++;
420 }
421 else if (*psz == '-')
422 {
423 // encoded plus sign
424 if (buf)
425 *buf++ = cc;
426 len++;
427 psz++;
428 }
04a37834 429 else // start of BASE64 encoded string
15f2ee32 430 {
04a37834 431 bool lsb, ok;
15f2ee32 432 unsigned int d, l;
04a37834
VZ
433 for ( ok = lsb = false, d = 0, l = 0;
434 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
435 psz++ )
15f2ee32
RN
436 {
437 d <<= 6;
438 d += cc;
439 for (l += 6; l >= 8; lsb = !lsb)
440 {
04a37834 441 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
442 if (lsb)
443 {
444 if (buf)
445 *buf++ |= c;
446 len ++;
447 }
448 else
04a37834 449 {
15f2ee32 450 if (buf)
6356d52a 451 *buf = (wchar_t)(c << 8);
04a37834
VZ
452 }
453
454 ok = true;
15f2ee32
RN
455 }
456 }
04a37834
VZ
457
458 if ( !ok )
459 {
460 // in valid UTF7 we should have valid characters after '+'
461 return (size_t)-1;
462 }
463
15f2ee32
RN
464 if (*psz == '-')
465 psz++;
466 }
467 }
04a37834
VZ
468
469 if ( buf && (len < n) )
470 *buf = '\0';
471
15f2ee32 472 return len;
6001e347
RR
473}
474
15f2ee32
RN
//
// BASE64 encoding table
//
// Maps a 6 bit value (0..63) to the corresponding BASE64 digit.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',     //  0 ..  7
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',     //  8 .. 15
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',     // 16 .. 23
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',     // 24 .. 31
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',     // 32 .. 39
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',     // 40 .. 47
    'w', 'x', 'y', 'z', '0', '1', '2', '3',     // 48 .. 55
    '4', '5', '6', '7', '8', '9', '+', '/'      // 56 .. 63
};
489
//
// UTF-7 encoding table
//
// Gives the class of each of the 128 ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 0x20 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 0x30 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x40 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 0x50 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x60 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 0x70 - 0x7f
};
509
667e5b3e 510size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32
RN
511{
512
513
514 size_t len = 0;
515
516 while (*psz && ((!buf) || (len < n)))
517 {
518 wchar_t cc = *psz++;
519 if (cc < 0x80 && utf7encode[cc] < 1)
520 {
521 // plain ASCII char
522 if (buf)
523 *buf++ = (char)cc;
524 len++;
525 }
526#ifndef WC_UTF16
79c78d42 527 else if (((wxUint32)cc) > 0xffff)
b2c13097 528 {
15f2ee32
RN
529 // no surrogate pair generation (yet?)
530 return (size_t)-1;
531 }
532#endif
533 else
534 {
535 if (buf)
536 *buf++ = '+';
537 len++;
538 if (cc != '+')
539 {
540 // BASE64 encode string
541 unsigned int lsb, d, l;
73c902d6 542 for (d = 0, l = 0; /*nothing*/; psz++)
15f2ee32
RN
543 {
544 for (lsb = 0; lsb < 2; lsb ++)
545 {
546 d <<= 8;
547 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
548
549 for (l += 8; l >= 6; )
550 {
551 l -= 6;
552 if (buf)
553 *buf++ = utf7enb64[(d >> l) % 64];
554 len++;
555 }
556 }
557 cc = *psz;
558 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
559 break;
560 }
561 if (l != 0)
562 {
563 if (buf)
564 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
565 len++;
566 }
567 }
568 if (buf)
569 *buf++ = '-';
570 len++;
571 }
572 }
573 if (buf && (len < n))
574 *buf = 0;
575 return len;
6001e347
RR
576}
577
f6bcfd97 578// ----------------------------------------------------------------------------
6001e347 579// UTF-8
f6bcfd97 580// ----------------------------------------------------------------------------
6001e347 581
dccce9ea 582static wxUint32 utf8_max[]=
4def3b35 583 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 584
3698ae71
VZ
585// boundaries of the private use area we use to (temporarily) remap invalid
586// characters invalid in a UTF-8 encoded string
ea8ce907
RR
587const wxUint32 wxUnicodePUA = 0x100000;
588const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
589
6001e347
RR
590size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
591{
4def3b35
VS
592 size_t len = 0;
593
dccce9ea 594 while (*psz && ((!buf) || (len < n)))
4def3b35 595 {
ea8ce907
RR
596 const char *opsz = psz;
597 bool invalid = false;
4def3b35
VS
598 unsigned char cc = *psz++, fc = cc;
599 unsigned cnt;
dccce9ea 600 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 601 fc <<= 1;
dccce9ea 602 if (!cnt)
4def3b35
VS
603 {
604 // plain ASCII char
dccce9ea 605 if (buf)
4def3b35
VS
606 *buf++ = cc;
607 len++;
561488ef
MW
608
609 // escape the escape character for octal escapes
610 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
611 && cc == '\\' && (!buf || len < n))
612 {
613 if (buf)
614 *buf++ = cc;
615 len++;
616 }
dccce9ea
VZ
617 }
618 else
4def3b35
VS
619 {
620 cnt--;
dccce9ea 621 if (!cnt)
4def3b35
VS
622 {
623 // invalid UTF-8 sequence
ea8ce907 624 invalid = true;
dccce9ea
VZ
625 }
626 else
4def3b35
VS
627 {
628 unsigned ocnt = cnt - 1;
629 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 630 while (cnt--)
4def3b35 631 {
ea8ce907 632 cc = *psz;
dccce9ea 633 if ((cc & 0xC0) != 0x80)
4def3b35
VS
634 {
635 // invalid UTF-8 sequence
ea8ce907
RR
636 invalid = true;
637 break;
4def3b35 638 }
ea8ce907 639 psz++;
4def3b35
VS
640 res = (res << 6) | (cc & 0x3f);
641 }
ea8ce907 642 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
643 {
644 // illegal UTF-8 encoding
ea8ce907 645 invalid = true;
4def3b35 646 }
ea8ce907
RR
647 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
648 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
649 {
650 // if one of our PUA characters turns up externally
651 // it must also be treated as an illegal sequence
652 // (a bit like you have to escape an escape character)
653 invalid = true;
654 }
655 else
656 {
1cd52418 657#ifdef WC_UTF16
ea8ce907
RR
658 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
659 size_t pa = encode_utf16(res, (wxUint16 *)buf);
660 if (pa == (size_t)-1)
661 {
662 invalid = true;
663 }
664 else
665 {
666 if (buf)
667 buf += pa;
668 len += pa;
669 }
373658eb 670#else // !WC_UTF16
ea8ce907 671 if (buf)
38d4b1e4 672 *buf++ = (wchar_t)res;
ea8ce907 673 len++;
373658eb 674#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
675 }
676 }
677 if (invalid)
678 {
679 if (m_options & MAP_INVALID_UTF8_TO_PUA)
680 {
681 while (opsz < psz && (!buf || len < n))
682 {
683#ifdef WC_UTF16
684 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
685 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
686 wxASSERT(pa != (size_t)-1);
687 if (buf)
688 buf += pa;
689 opsz++;
690 len += pa;
691#else
692 if (buf)
38d4b1e4 693 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
694 opsz++;
695 len++;
696#endif
697 }
698 }
3698ae71 699 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
700 {
701 while (opsz < psz && (!buf || len < n))
702 {
3698ae71
VZ
703 if ( buf && len + 3 < n )
704 {
17a1ebd1 705 unsigned char on = *opsz;
3698ae71 706 *buf++ = L'\\';
17a1ebd1
VZ
707 *buf++ = (wchar_t)( L'0' + on / 0100 );
708 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
709 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 710 }
ea8ce907
RR
711 opsz++;
712 len += 4;
713 }
714 }
3698ae71 715 else // MAP_INVALID_UTF8_NOT
ea8ce907
RR
716 {
717 return (size_t)-1;
718 }
4def3b35
VS
719 }
720 }
6001e347 721 }
dccce9ea 722 if (buf && (len < n))
4def3b35
VS
723 *buf = 0;
724 return len;
6001e347
RR
725}
726
3698ae71
VZ
// Return true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
731
6001e347
RR
732size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
733{
4def3b35 734 size_t len = 0;
6001e347 735
dccce9ea 736 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
737 {
738 wxUint32 cc;
1cd52418 739#ifdef WC_UTF16
b5153fd8
VZ
740 // cast is ok for WC_UTF16
741 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
4def3b35 742 psz += (pa == (size_t)-1) ? 1 : pa;
1cd52418 743#else
4def3b35
VS
744 cc=(*psz++) & 0x7fffffff;
745#endif
3698ae71
VZ
746
747 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
748 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 749 {
dccce9ea 750 if (buf)
ea8ce907 751 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 752 len++;
3698ae71 753 }
561488ef
MW
754 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
755 && cc == L'\\' && psz[0] == L'\\' )
756 {
757 if (buf)
758 *buf++ = (char)cc;
759 psz++;
760 len++;
761 }
3698ae71
VZ
762 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
763 cc == L'\\' &&
764 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 765 {
dccce9ea 766 if (buf)
3698ae71 767 {
b2c13097
WS
768 *buf++ = (char) ((psz[0] - L'0')*0100 +
769 (psz[1] - L'0')*010 +
770 (psz[2] - L'0'));
3698ae71
VZ
771 }
772
773 psz += 3;
ea8ce907
RR
774 len++;
775 }
776 else
777 {
778 unsigned cnt;
779 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
780 if (!cnt)
4def3b35 781 {
ea8ce907
RR
782 // plain ASCII char
783 if (buf)
784 *buf++ = (char) cc;
785 len++;
786 }
787
788 else
789 {
790 len += cnt + 1;
791 if (buf)
792 {
793 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
794 while (cnt--)
795 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
796 }
4def3b35
VS
797 }
798 }
6001e347 799 }
4def3b35 800
3698ae71
VZ
801 if (buf && (len<n))
802 *buf = 0;
adb45366 803
4def3b35 804 return len;
6001e347
RR
805}
806
c91830cb
VZ
807// ----------------------------------------------------------------------------
808// UTF-16
809// ----------------------------------------------------------------------------
810
811#ifdef WORDS_BIGENDIAN
bde4baac
VZ
812 #define wxMBConvUTF16straight wxMBConvUTF16BE
813 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 814#else
bde4baac
VZ
815 #define wxMBConvUTF16swap wxMBConvUTF16BE
816 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
817#endif
818
819
c91830cb
VZ
820#ifdef WC_UTF16
821
c91830cb
VZ
822// copy 16bit MB to 16bit String
823size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
824{
825 size_t len=0;
826
827 while (*(wxUint16*)psz && (!buf || len < n))
828 {
829 if (buf)
830 *buf++ = *(wxUint16*)psz;
831 len++;
832
833 psz += sizeof(wxUint16);
834 }
835 if (buf && len<n) *buf=0;
836
837 return len;
838}
839
840
841// copy 16bit String to 16bit MB
842size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
843{
844 size_t len=0;
845
846 while (*psz && (!buf || len < n))
847 {
848 if (buf)
849 {
850 *(wxUint16*)buf = *psz;
851 buf += sizeof(wxUint16);
852 }
853 len += sizeof(wxUint16);
854 psz++;
855 }
856 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
857
858 return len;
859}
860
861
862// swap 16bit MB to 16bit String
863size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
864{
bfab25d4 865 size_t len = 0;
c91830cb 866
da12017a
VZ
867 // UTF16 string must be terminated by 2 NULs as single NULs may occur
868 // inside the string
869 while ( (psz[0] || psz[1]) && (!buf || len < n) )
c91830cb 870 {
bfab25d4 871 if ( buf )
c91830cb
VZ
872 {
873 ((char *)buf)[0] = psz[1];
874 ((char *)buf)[1] = psz[0];
875 buf++;
876 }
877 len++;
bfab25d4 878 psz += 2;
c91830cb 879 }
bfab25d4
VZ
880
881 if ( buf && len < n )
882 *buf = L'\0';
c91830cb
VZ
883
884 return len;
885}
886
887
888// swap 16bit MB to 16bit String
889size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
890{
891 size_t len=0;
892
893 while (*psz && (!buf || len < n))
894 {
895 if (buf)
896 {
897 *buf++ = ((char*)psz)[1];
898 *buf++ = ((char*)psz)[0];
899 }
900 len += sizeof(wxUint16);
901 psz++;
902 }
903 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
904
905 return len;
906}
907
908
909#else // WC_UTF16
910
911
912// copy 16bit MB to 32bit String
913size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
914{
915 size_t len=0;
916
917 while (*(wxUint16*)psz && (!buf || len < n))
918 {
919 wxUint32 cc;
920 size_t pa=decode_utf16((wxUint16*)psz, cc);
921 if (pa == (size_t)-1)
922 return pa;
923
924 if (buf)
38d4b1e4 925 *buf++ = (wchar_t)cc;
c91830cb
VZ
926 len++;
927 psz += pa * sizeof(wxUint16);
928 }
929 if (buf && len<n) *buf=0;
930
931 return len;
932}
933
934
935// copy 32bit String to 16bit MB
936size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
937{
938 size_t len=0;
939
940 while (*psz && (!buf || len < n))
941 {
942 wxUint16 cc[2];
943 size_t pa=encode_utf16(*psz, cc);
944
945 if (pa == (size_t)-1)
946 return pa;
947
948 if (buf)
949 {
69b80d28 950 *(wxUint16*)buf = cc[0];
b5153fd8 951 buf += sizeof(wxUint16);
c91830cb 952 if (pa > 1)
69b80d28
VZ
953 {
954 *(wxUint16*)buf = cc[1];
955 buf += sizeof(wxUint16);
956 }
c91830cb
VZ
957 }
958
959 len += pa*sizeof(wxUint16);
960 psz++;
961 }
962 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
963
964 return len;
965}
966
967
968// swap 16bit MB to 32bit String
969size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
970{
971 size_t len=0;
972
973 while (*(wxUint16*)psz && (!buf || len < n))
974 {
975 wxUint32 cc;
976 char tmp[4];
977 tmp[0]=psz[1]; tmp[1]=psz[0];
978 tmp[2]=psz[3]; tmp[3]=psz[2];
979
980 size_t pa=decode_utf16((wxUint16*)tmp, cc);
981 if (pa == (size_t)-1)
982 return pa;
983
984 if (buf)
38d4b1e4 985 *buf++ = (wchar_t)cc;
c91830cb
VZ
986
987 len++;
988 psz += pa * sizeof(wxUint16);
989 }
990 if (buf && len<n) *buf=0;
991
992 return len;
993}
994
995
996// swap 32bit String to 16bit MB
997size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
998{
999 size_t len=0;
1000
1001 while (*psz && (!buf || len < n))
1002 {
1003 wxUint16 cc[2];
1004 size_t pa=encode_utf16(*psz, cc);
1005
1006 if (pa == (size_t)-1)
1007 return pa;
1008
1009 if (buf)
1010 {
1011 *buf++ = ((char*)cc)[1];
1012 *buf++ = ((char*)cc)[0];
1013 if (pa > 1)
1014 {
1015 *buf++ = ((char*)cc)[3];
1016 *buf++ = ((char*)cc)[2];
1017 }
1018 }
1019
1020 len += pa*sizeof(wxUint16);
1021 psz++;
1022 }
1023 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1024
1025 return len;
1026}
1027
1028#endif // WC_UTF16
1029
1030
1031// ----------------------------------------------------------------------------
1032// UTF-32
1033// ----------------------------------------------------------------------------
1034
1035#ifdef WORDS_BIGENDIAN
1036#define wxMBConvUTF32straight wxMBConvUTF32BE
1037#define wxMBConvUTF32swap wxMBConvUTF32LE
1038#else
1039#define wxMBConvUTF32swap wxMBConvUTF32BE
1040#define wxMBConvUTF32straight wxMBConvUTF32LE
1041#endif
1042
1043
1044WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1045WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1046
1047
1048#ifdef WC_UTF16
1049
1050// copy 32bit MB to 16bit String
1051size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1052{
1053 size_t len=0;
1054
1055 while (*(wxUint32*)psz && (!buf || len < n))
1056 {
1057 wxUint16 cc[2];
1058
1059 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1060 if (pa == (size_t)-1)
1061 return pa;
1062
1063 if (buf)
1064 {
1065 *buf++ = cc[0];
1066 if (pa > 1)
1067 *buf++ = cc[1];
1068 }
1069 len += pa;
1070 psz += sizeof(wxUint32);
1071 }
1072 if (buf && len<n) *buf=0;
1073
1074 return len;
1075}
1076
1077
1078// copy 16bit String to 32bit MB
1079size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1080{
1081 size_t len=0;
1082
1083 while (*psz && (!buf || len < n))
1084 {
1085 wxUint32 cc;
1086
b5153fd8
VZ
1087 // cast is ok for WC_UTF16
1088 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
c91830cb
VZ
1089 if (pa == (size_t)-1)
1090 return pa;
1091
1092 if (buf)
1093 {
1094 *(wxUint32*)buf = cc;
1095 buf += sizeof(wxUint32);
1096 }
1097 len += sizeof(wxUint32);
1098 psz += pa;
1099 }
b5153fd8
VZ
1100
1101 if (buf && len<=n-sizeof(wxUint32))
1102 *(wxUint32*)buf=0;
c91830cb
VZ
1103
1104 return len;
1105}
1106
1107
1108
1109// swap 32bit MB to 16bit String
1110size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1111{
1112 size_t len=0;
1113
1114 while (*(wxUint32*)psz && (!buf || len < n))
1115 {
1116 char tmp[4];
1117 tmp[0] = psz[3]; tmp[1] = psz[2];
1118 tmp[2] = psz[1]; tmp[3] = psz[0];
1119
1120
1121 wxUint16 cc[2];
1122
1123 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1124 if (pa == (size_t)-1)
1125 return pa;
1126
1127 if (buf)
1128 {
1129 *buf++ = cc[0];
1130 if (pa > 1)
1131 *buf++ = cc[1];
1132 }
1133 len += pa;
1134 psz += sizeof(wxUint32);
1135 }
b5153fd8
VZ
1136
1137 if (buf && len<n)
1138 *buf=0;
c91830cb
VZ
1139
1140 return len;
1141}
1142
1143
1144// swap 16bit String to 32bit MB
1145size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1146{
1147 size_t len=0;
1148
1149 while (*psz && (!buf || len < n))
1150 {
1151 char cc[4];
1152
b5153fd8
VZ
1153 // cast is ok for WC_UTF16
1154 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
c91830cb
VZ
1155 if (pa == (size_t)-1)
1156 return pa;
1157
1158 if (buf)
1159 {
1160 *buf++ = cc[3];
1161 *buf++ = cc[2];
1162 *buf++ = cc[1];
1163 *buf++ = cc[0];
1164 }
1165 len += sizeof(wxUint32);
1166 psz += pa;
1167 }
b5153fd8
VZ
1168
1169 if (buf && len<=n-sizeof(wxUint32))
1170 *(wxUint32*)buf=0;
c91830cb
VZ
1171
1172 return len;
1173}
1174
1175#else // WC_UTF16
1176
1177
1178// copy 32bit MB to 32bit String
1179size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1180{
1181 size_t len=0;
1182
1183 while (*(wxUint32*)psz && (!buf || len < n))
1184 {
1185 if (buf)
38d4b1e4 1186 *buf++ = (wchar_t)(*(wxUint32*)psz);
c91830cb
VZ
1187 len++;
1188 psz += sizeof(wxUint32);
1189 }
b5153fd8
VZ
1190
1191 if (buf && len<n)
1192 *buf=0;
c91830cb
VZ
1193
1194 return len;
1195}
1196
1197
1198// copy 32bit String to 32bit MB
1199size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1200{
1201 size_t len=0;
1202
1203 while (*psz && (!buf || len < n))
1204 {
1205 if (buf)
1206 {
1207 *(wxUint32*)buf = *psz;
1208 buf += sizeof(wxUint32);
1209 }
1210
1211 len += sizeof(wxUint32);
1212 psz++;
1213 }
1214
b5153fd8
VZ
1215 if (buf && len<=n-sizeof(wxUint32))
1216 *(wxUint32*)buf=0;
c91830cb
VZ
1217
1218 return len;
1219}
1220
1221
1222// swap 32bit MB to 32bit String
1223size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1224{
1225 size_t len=0;
1226
1227 while (*(wxUint32*)psz && (!buf || len < n))
1228 {
1229 if (buf)
1230 {
1231 ((char *)buf)[0] = psz[3];
1232 ((char *)buf)[1] = psz[2];
1233 ((char *)buf)[2] = psz[1];
1234 ((char *)buf)[3] = psz[0];
1235 buf++;
1236 }
1237 len++;
1238 psz += sizeof(wxUint32);
1239 }
b5153fd8
VZ
1240
1241 if (buf && len<n)
1242 *buf=0;
c91830cb
VZ
1243
1244 return len;
1245}
1246
1247
1248// swap 32bit String to 32bit MB
1249size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1250{
1251 size_t len=0;
1252
1253 while (*psz && (!buf || len < n))
1254 {
1255 if (buf)
1256 {
1257 *buf++ = ((char *)psz)[3];
1258 *buf++ = ((char *)psz)[2];
1259 *buf++ = ((char *)psz)[1];
1260 *buf++ = ((char *)psz)[0];
1261 }
1262 len += sizeof(wxUint32);
1263 psz++;
1264 }
b5153fd8
VZ
1265
1266 if (buf && len<=n-sizeof(wxUint32))
1267 *(wxUint32*)buf=0;
c91830cb
VZ
1268
1269 return len;
1270}
1271
1272
1273#endif // WC_UTF16
1274
1275
36acb880
VZ
1276// ============================================================================
1277// The classes doing conversion using the iconv_xxx() functions
1278// ============================================================================
3caec1bb 1279
b040e242 1280#ifdef HAVE_ICONV
3a0d76bc 1281
b1d547eb
VS
1282// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1283// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1284// (unless there's yet another bug in glibc) the only case when iconv()
1285// returns with (size_t)-1 (which means error) and says there are 0 bytes
1286// left in the input buffer -- when _real_ error occurs,
1287// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1288// iconv() failure.
3caec1bb
VS
1289// [This bug does not appear in glibc 2.2.]
1290#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1291#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1292 (errno != E2BIG || bufLeft != 0))
1293#else
1294#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1295#endif
1296
ab217dba 1297#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1298
74a7eb0b
VZ
1299#define ICONV_T_INVALID ((iconv_t)-1)
1300
1301#if SIZEOF_WCHAR_T == 4
1302 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1303 #define WC_ENC wxFONTENCODING_UTF32
1304#elif SIZEOF_WCHAR_T == 2
1305 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1306 #define WC_ENC wxFONTENCODING_UTF16
1307#else // sizeof(wchar_t) != 2 nor 4
1308 // does this ever happen?
1309 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1310#endif
1311
36acb880 1312// ----------------------------------------------------------------------------
e95354ec 1313// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1314// ----------------------------------------------------------------------------
1315
e95354ec 1316class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1317{
1318public:
e95354ec
VZ
1319 wxMBConv_iconv(const wxChar *name);
1320 virtual ~wxMBConv_iconv();
36acb880 1321
bde4baac
VZ
1322 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1323 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1324
e95354ec 1325 bool IsOk() const
74a7eb0b 1326 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1327
1328protected:
1329 // the iconv handlers used to translate from multibyte to wide char and in
1330 // the other direction
1331 iconv_t m2w,
1332 w2m;
b1d547eb
VS
1333#if wxUSE_THREADS
1334 // guards access to m2w and w2m objects
1335 wxMutex m_iconvMutex;
1336#endif
36acb880
VZ
1337
1338private:
e95354ec 1339 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1340 // available on this machine, it will remain NULL
74a7eb0b 1341 static wxString ms_wcCharsetName;
36acb880
VZ
1342
1343 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1344 // different endian-ness than the native one
405d8f46 1345 static bool ms_wcNeedsSwap;
36acb880
VZ
1346};
1347
8f115891
MW
1348// make the constructor available for unit testing
1349WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1350{
1351 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1352 if ( !result->IsOk() )
1353 {
1354 delete result;
1355 return 0;
1356 }
1357 return result;
1358}
1359
422e411e 1360wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1361bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1362
e95354ec 1363wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
36acb880 1364{
0331b385
VZ
1365 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1366 // names for the charsets
200a9923 1367 const wxCharBuffer cname(wxString(name).ToAscii());
04c79127 1368
36acb880 1369 // check for charset that represents wchar_t:
74a7eb0b 1370 if ( ms_wcCharsetName.empty() )
f1339c56 1371 {
c2b83fdd
VZ
1372 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1373
74a7eb0b
VZ
1374#if wxUSE_FONTMAP
1375 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1376#else // !wxUSE_FONTMAP
1377 static const wxChar *names[] =
36acb880 1378 {
74a7eb0b
VZ
1379#if SIZEOF_WCHAR_T == 4
1380 _T("UCS-4"),
1381#elif SIZEOF_WCHAR_T = 2
1382 _T("UCS-2"),
1383#endif
1384 NULL
1385 };
1386#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1387
d1f024a8 1388 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1389 {
17a1ebd1 1390 const wxString nameCS(*names);
74a7eb0b
VZ
1391
1392 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1393 wxString nameXE(nameCS);
74a7eb0b
VZ
1394 #ifdef WORDS_BIGENDIAN
1395 nameXE += _T("BE");
1396 #else // little endian
1397 nameXE += _T("LE");
1398 #endif
1399
c2b83fdd
VZ
1400 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1401 nameXE.c_str());
1402
74a7eb0b
VZ
1403 m2w = iconv_open(nameXE.ToAscii(), cname);
1404 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1405 {
74a7eb0b 1406 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1407 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1408 nameCS.c_str());
17a1ebd1 1409 m2w = iconv_open(nameCS.ToAscii(), cname);
3a0d76bc 1410
74a7eb0b
VZ
1411 // and check for bytesex ourselves:
1412 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1413 {
74a7eb0b
VZ
1414 char buf[2], *bufPtr;
1415 wchar_t wbuf[2], *wbufPtr;
1416 size_t insz, outsz;
1417 size_t res;
1418
1419 buf[0] = 'A';
1420 buf[1] = 0;
1421 wbuf[0] = 0;
1422 insz = 2;
1423 outsz = SIZEOF_WCHAR_T * 2;
1424 wbufPtr = wbuf;
1425 bufPtr = buf;
1426
1427 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1428 (char**)&wbufPtr, &outsz);
1429
1430 if (ICONV_FAILED(res, insz))
1431 {
1432 wxLogLastError(wxT("iconv"));
422e411e 1433 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1434 nameCS.c_str());
74a7eb0b
VZ
1435 }
1436 else // ok, can convert to this encoding, remember it
1437 {
17a1ebd1 1438 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1439 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1440 }
3a0d76bc
VS
1441 }
1442 }
74a7eb0b 1443 else // use charset not requiring byte swapping
36acb880 1444 {
74a7eb0b 1445 ms_wcCharsetName = nameXE;
36acb880 1446 }
3a0d76bc 1447 }
74a7eb0b 1448
0944fceb 1449 wxLogTrace(TRACE_STRCONV,
74a7eb0b 1450 wxT("iconv wchar_t charset is \"%s\"%s"),
cae8f1bf 1451 ms_wcCharsetName.empty() ? _T("<none>")
74a7eb0b
VZ
1452 : ms_wcCharsetName.c_str(),
1453 ms_wcNeedsSwap ? _T(" (needs swap)")
1454 : _T(""));
3a0d76bc 1455 }
36acb880 1456 else // we already have ms_wcCharsetName
3caec1bb 1457 {
74a7eb0b 1458 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
f1339c56 1459 }
dccce9ea 1460
74a7eb0b 1461 if ( ms_wcCharsetName.empty() )
f1339c56 1462 {
74a7eb0b 1463 w2m = ICONV_T_INVALID;
36acb880 1464 }
405d8f46
VZ
1465 else
1466 {
74a7eb0b
VZ
1467 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1468 if ( w2m == ICONV_T_INVALID )
1469 {
1470 wxLogTrace(TRACE_STRCONV,
1471 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
422e411e 1472 ms_wcCharsetName.c_str(), cname.data());
74a7eb0b 1473 }
405d8f46 1474 }
36acb880 1475}
3caec1bb 1476
e95354ec 1477wxMBConv_iconv::~wxMBConv_iconv()
36acb880 1478{
74a7eb0b 1479 if ( m2w != ICONV_T_INVALID )
36acb880 1480 iconv_close(m2w);
74a7eb0b 1481 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
1482 iconv_close(w2m);
1483}
3a0d76bc 1484
bde4baac 1485size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 1486{
b1d547eb
VS
1487#if wxUSE_THREADS
1488 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1489 // Unfortunately there is a couple of global wxCSConv objects such as
1490 // wxConvLocal that are used all over wx code, so we have to make sure
1491 // the handle is used by at most one thread at the time. Otherwise
1492 // only a few wx classes would be safe to use from non-main threads
1493 // as MB<->WC conversion would fail "randomly".
1494 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1495#endif
3698ae71 1496
36acb880
VZ
1497 size_t inbuf = strlen(psz);
1498 size_t outbuf = n * SIZEOF_WCHAR_T;
1499 size_t res, cres;
1500 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1501 wchar_t *bufPtr = buf;
1502 const char *pszPtr = psz;
1503
1504 if (buf)
1505 {
1506 // have destination buffer, convert there
1507 cres = iconv(m2w,
1508 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1509 (char**)&bufPtr, &outbuf);
1510 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1511
36acb880 1512 if (ms_wcNeedsSwap)
3a0d76bc 1513 {
36acb880 1514 // convert to native endianness
17a1ebd1
VZ
1515 for ( unsigned i = 0; i < res; i++ )
1516 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 1517 }
adb45366 1518
49dd9820
VS
1519 // NB: iconv was given only strlen(psz) characters on input, and so
1520 // it couldn't convert the trailing zero. Let's do it ourselves
1521 // if there's some room left for it in the output buffer.
1522 if (res < n)
1523 buf[res] = 0;
36acb880
VZ
1524 }
1525 else
1526 {
1527 // no destination buffer... convert using temp buffer
1528 // to calculate destination buffer requirement
1529 wchar_t tbuf[8];
1530 res = 0;
1531 do {
1532 bufPtr = tbuf;
1533 outbuf = 8*SIZEOF_WCHAR_T;
1534
1535 cres = iconv(m2w,
1536 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1537 (char**)&bufPtr, &outbuf );
1538
1539 res += 8-(outbuf/SIZEOF_WCHAR_T);
1540 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1541 }
dccce9ea 1542
36acb880 1543 if (ICONV_FAILED(cres, inbuf))
f1339c56 1544 {
36acb880 1545 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 1546 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1547 return (size_t)-1;
1548 }
1549
1550 return res;
1551}
1552
bde4baac 1553size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1554{
b1d547eb
VS
1555#if wxUSE_THREADS
1556 // NB: explained in MB2WC
1557 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1558#endif
3698ae71 1559
156162ec
MW
1560 size_t inlen = wxWcslen(psz);
1561 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
1562 size_t outbuf = n;
1563 size_t res, cres;
3a0d76bc 1564
36acb880 1565 wchar_t *tmpbuf = 0;
3caec1bb 1566
36acb880
VZ
1567 if (ms_wcNeedsSwap)
1568 {
1569 // need to copy to temp buffer to switch endianness
74a7eb0b 1570 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 1571 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 1572 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
1573 for ( size_t i = 0; i < inlen; i++ )
1574 tmpbuf[n] = WC_BSWAP(psz[i]);
156162ec 1575 tmpbuf[inlen] = L'\0';
74a7eb0b 1576 psz = tmpbuf;
36acb880 1577 }
3a0d76bc 1578
36acb880
VZ
1579 if (buf)
1580 {
1581 // have destination buffer, convert there
1582 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1583
36acb880 1584 res = n-outbuf;
adb45366 1585
49dd9820
VS
1586 // NB: iconv was given only wcslen(psz) characters on input, and so
1587 // it couldn't convert the trailing zero. Let's do it ourselves
1588 // if there's some room left for it in the output buffer.
1589 if (res < n)
1590 buf[0] = 0;
36acb880
VZ
1591 }
1592 else
1593 {
1594 // no destination buffer... convert using temp buffer
1595 // to calculate destination buffer requirement
1596 char tbuf[16];
1597 res = 0;
1598 do {
1599 buf = tbuf; outbuf = 16;
1600
1601 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1602
36acb880
VZ
1603 res += 16 - outbuf;
1604 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1605 }
dccce9ea 1606
36acb880
VZ
1607 if (ms_wcNeedsSwap)
1608 {
1609 free(tmpbuf);
1610 }
dccce9ea 1611
36acb880
VZ
1612 if (ICONV_FAILED(cres, inbuf))
1613 {
ce6f8d6f 1614 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1615 return (size_t)-1;
1616 }
1617
1618 return res;
1619}
1620
b040e242 1621#endif // HAVE_ICONV
36acb880 1622
e95354ec 1623
36acb880
VZ
1624// ============================================================================
1625// Win32 conversion classes
1626// ============================================================================
1cd52418 1627
e95354ec 1628#ifdef wxHAVE_WIN32_MB2WC
373658eb 1629
8b04d4c4 1630// from utils.cpp
d775fa82 1631#if wxUSE_FONTMAP
8b04d4c4
VZ
1632extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1633extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1634#endif
373658eb 1635
e95354ec 1636class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1637{
1638public:
bde4baac
VZ
1639 wxMBConv_win32()
1640 {
1641 m_CodePage = CP_ACP;
1642 }
1643
7608a683 1644#if wxUSE_FONTMAP
e95354ec 1645 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
1646 {
1647 m_CodePage = wxCharsetToCodepage(name);
1648 }
dccce9ea 1649
e95354ec 1650 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
1651 {
1652 m_CodePage = wxEncodingToCodepage(encoding);
1653 }
7608a683 1654#endif
8b04d4c4 1655
bde4baac 1656 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 1657 {
02272c9c
VZ
1658 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1659 // the behaviour is not compatible with the Unix version (using iconv)
1660 // and break the library itself, e.g. wxTextInputStream::NextChar()
1661 // wouldn't work if reading an incomplete MB char didn't result in an
1662 // error
667e5b3e
VZ
1663 //
1664 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1665 // an error (tested under Windows Server 2003) and apparently it is
1666 // done on purpose, i.e. the function accepts any input in this case
1667 // and although I'd prefer to return error on ill-formed output, our
1668 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1669 // explicitly ill-formed according to RFC 2152) neither so we don't
1670 // even have any fallback here...
89028980
VS
1671 //
1672 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1673 // Win XP or newer and if it is specified on older versions, conversion
1674 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1675 // fails. So we can only use the flag on newer Windows versions.
1676 // Additionally, the flag is not supported by UTF7, symbol and CJK
1677 // encodings. See here:
1678 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1679 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1680 int flags = 0;
1681 if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1682 m_CodePage < 50000 &&
1683 IsAtLeastWin2kSP4() )
1684 {
1685 flags = MB_ERR_INVALID_CHARS;
1686 }
1687 else if ( m_CodePage == CP_UTF8 )
1688 {
1689 // Avoid round-trip in the special case of UTF-8 by using our
1690 // own UTF-8 conversion code:
1691 return wxMBConvUTF8().MB2WC(buf, psz, n);
1692 }
667e5b3e 1693
2b5f62a0
VZ
1694 const size_t len = ::MultiByteToWideChar
1695 (
1696 m_CodePage, // code page
667e5b3e 1697 flags, // flags: fall on error
2b5f62a0
VZ
1698 psz, // input string
1699 -1, // its length (NUL-terminated)
b4da152e 1700 buf, // output string
2b5f62a0
VZ
1701 buf ? n : 0 // size of output buffer
1702 );
89028980
VS
1703 if ( !len )
1704 {
1705 // function totally failed
1706 return (size_t)-1;
1707 }
1708
1709 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1710 // check if we succeeded, by doing a double trip:
1711 if ( !flags && buf )
1712 {
53c174fc
VZ
1713 const size_t mbLen = strlen(psz);
1714 wxCharBuffer mbBuf(mbLen);
89028980
VS
1715 if ( ::WideCharToMultiByte
1716 (
1717 m_CodePage,
1718 0,
1719 buf,
1720 -1,
1721 mbBuf.data(),
53c174fc 1722 mbLen + 1, // size in bytes, not length
89028980
VS
1723 NULL,
1724 NULL
1725 ) == 0 ||
1726 strcmp(mbBuf, psz) != 0 )
1727 {
1728 // we didn't obtain the same thing we started from, hence
1729 // the conversion was lossy and we consider that it failed
1730 return (size_t)-1;
1731 }
1732 }
2b5f62a0 1733
03a991bc
VZ
1734 // note that it returns count of written chars for buf != NULL and size
1735 // of the needed buffer for buf == NULL so in either case the length of
1736 // the string (which never includes the terminating NUL) is one less
89028980 1737 return len - 1;
f1339c56 1738 }
dccce9ea 1739
13dd924a 1740 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 1741 {
13dd924a
VZ
1742 /*
1743 we have a problem here: by default, WideCharToMultiByte() may
1744 replace characters unrepresentable in the target code page with bad
1745 quality approximations such as turning "1/2" symbol (U+00BD) into
1746 "1" for the code pages which don't have it and we, obviously, want
1747 to avoid this at any price
d775fa82 1748
13dd924a
VZ
1749 the trouble is that this function does it _silently_, i.e. it won't
1750 even tell us whether it did or not... Win98/2000 and higher provide
1751 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1752 we have to resort to a round trip, i.e. check that converting back
1753 results in the same string -- this is, of course, expensive but
1754 otherwise we simply can't be sure to not garble the data.
1755 */
1756
1757 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1758 // it doesn't work with CJK encodings (which we test for rather roughly
1759 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1760 // supporting it
907173e5
WS
1761 BOOL usedDef wxDUMMY_INITIALIZE(false);
1762 BOOL *pUsedDef;
13dd924a
VZ
1763 int flags;
1764 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1765 {
1766 // it's our lucky day
1767 flags = WC_NO_BEST_FIT_CHARS;
1768 pUsedDef = &usedDef;
1769 }
1770 else // old system or unsupported encoding
1771 {
1772 flags = 0;
1773 pUsedDef = NULL;
1774 }
1775
2b5f62a0
VZ
1776 const size_t len = ::WideCharToMultiByte
1777 (
1778 m_CodePage, // code page
13dd924a
VZ
1779 flags, // either none or no best fit
1780 pwz, // input string
2b5f62a0
VZ
1781 -1, // it is (wide) NUL-terminated
1782 buf, // output buffer
1783 buf ? n : 0, // and its size
1784 NULL, // default "replacement" char
13dd924a 1785 pUsedDef // [out] was it used?
2b5f62a0
VZ
1786 );
1787
13dd924a
VZ
1788 if ( !len )
1789 {
1790 // function totally failed
1791 return (size_t)-1;
1792 }
1793
1794 // if we were really converting, check if we succeeded
1795 if ( buf )
1796 {
1797 if ( flags )
1798 {
1799 // check if the conversion failed, i.e. if any replacements
1800 // were done
1801 if ( usedDef )
1802 return (size_t)-1;
1803 }
1804 else // we must resort to double tripping...
1805 {
1806 wxWCharBuffer wcBuf(n);
1807 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1808 wcscmp(wcBuf, pwz) != 0 )
1809 {
1810 // we didn't obtain the same thing we started from, hence
1811 // the conversion was lossy and we consider that it failed
1812 return (size_t)-1;
1813 }
1814 }
1815 }
1816
03a991bc 1817 // see the comment above for the reason of "len - 1"
13dd924a 1818 return len - 1;
f1339c56 1819 }
dccce9ea 1820
13dd924a
VZ
1821 bool IsOk() const { return m_CodePage != -1; }
1822
1823private:
1824 static bool CanUseNoBestFit()
1825 {
1826 static int s_isWin98Or2k = -1;
1827
1828 if ( s_isWin98Or2k == -1 )
1829 {
1830 int verMaj, verMin;
1831 switch ( wxGetOsVersion(&verMaj, &verMin) )
1832 {
1833 case wxWIN95:
1834 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1835 break;
1836
1837 case wxWINDOWS_NT:
1838 s_isWin98Or2k = verMaj >= 5;
1839 break;
1840
1841 default:
1842 // unknown, be conseravtive by default
1843 s_isWin98Or2k = 0;
1844 }
1845
1846 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1847 }
1848
1849 return s_isWin98Or2k == 1;
1850 }
f1339c56 1851
89028980
VS
1852 static bool IsAtLeastWin2kSP4()
1853 {
8942f83a
WS
1854#ifdef __WXWINCE__
1855 return false;
1856#else
89028980
VS
1857 static int s_isAtLeastWin2kSP4 = -1;
1858
1859 if ( s_isAtLeastWin2kSP4 == -1 )
1860 {
1861 OSVERSIONINFOEX ver;
1862
1863 memset(&ver, 0, sizeof(ver));
1864 ver.dwOSVersionInfoSize = sizeof(ver);
1865 GetVersionEx((OSVERSIONINFO*)&ver);
1866
1867 s_isAtLeastWin2kSP4 =
1868 ((ver.dwMajorVersion > 5) || // Vista+
1869 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
1870 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
1871 ver.wServicePackMajor >= 4)) // 2000 SP4+
1872 ? 1 : 0;
1873 }
1874
1875 return s_isAtLeastWin2kSP4 == 1;
8942f83a 1876#endif
89028980
VS
1877 }
1878
b1d66b54 1879 long m_CodePage;
1cd52418 1880};
e95354ec
VZ
1881
1882#endif // wxHAVE_WIN32_MB2WC
1883
f7e98dee
RN
1884// ============================================================================
1885// Cocoa conversion classes
1886// ============================================================================
1887
1888#if defined(__WXCOCOA__)
1889
ecd9653b 1890// RN: There is no UTF-32 support in either Core Foundation or
f7e98dee
RN
1891// Cocoa. Strangely enough, internally Core Foundation uses
1892// UTF 32 internally quite a bit - its just not public (yet).
1893
1894#include <CoreFoundation/CFString.h>
1895#include <CoreFoundation/CFStringEncodingExt.h>
1896
1897CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 1898{
638357a0 1899 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ecd9653b
WS
1900 if ( encoding == wxFONTENCODING_DEFAULT )
1901 {
638357a0 1902 enc = CFStringGetSystemEncoding();
ecd9653b
WS
1903 }
1904 else switch( encoding)
1905 {
1906 case wxFONTENCODING_ISO8859_1 :
1907 enc = kCFStringEncodingISOLatin1 ;
1908 break ;
1909 case wxFONTENCODING_ISO8859_2 :
1910 enc = kCFStringEncodingISOLatin2;
1911 break ;
1912 case wxFONTENCODING_ISO8859_3 :
1913 enc = kCFStringEncodingISOLatin3 ;
1914 break ;
1915 case wxFONTENCODING_ISO8859_4 :
1916 enc = kCFStringEncodingISOLatin4;
1917 break ;
1918 case wxFONTENCODING_ISO8859_5 :
1919 enc = kCFStringEncodingISOLatinCyrillic;
1920 break ;
1921 case wxFONTENCODING_ISO8859_6 :
1922 enc = kCFStringEncodingISOLatinArabic;
1923 break ;
1924 case wxFONTENCODING_ISO8859_7 :
1925 enc = kCFStringEncodingISOLatinGreek;
1926 break ;
1927 case wxFONTENCODING_ISO8859_8 :
1928 enc = kCFStringEncodingISOLatinHebrew;
1929 break ;
1930 case wxFONTENCODING_ISO8859_9 :
1931 enc = kCFStringEncodingISOLatin5;
1932 break ;
1933 case wxFONTENCODING_ISO8859_10 :
1934 enc = kCFStringEncodingISOLatin6;
1935 break ;
1936 case wxFONTENCODING_ISO8859_11 :
1937 enc = kCFStringEncodingISOLatinThai;
1938 break ;
1939 case wxFONTENCODING_ISO8859_13 :
1940 enc = kCFStringEncodingISOLatin7;
1941 break ;
1942 case wxFONTENCODING_ISO8859_14 :
1943 enc = kCFStringEncodingISOLatin8;
1944 break ;
1945 case wxFONTENCODING_ISO8859_15 :
1946 enc = kCFStringEncodingISOLatin9;
1947 break ;
1948
1949 case wxFONTENCODING_KOI8 :
1950 enc = kCFStringEncodingKOI8_R;
1951 break ;
1952 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1953 enc = kCFStringEncodingDOSRussian;
1954 break ;
1955
1956// case wxFONTENCODING_BULGARIAN :
1957// enc = ;
1958// break ;
1959
1960 case wxFONTENCODING_CP437 :
1961 enc =kCFStringEncodingDOSLatinUS ;
1962 break ;
1963 case wxFONTENCODING_CP850 :
1964 enc = kCFStringEncodingDOSLatin1;
1965 break ;
1966 case wxFONTENCODING_CP852 :
1967 enc = kCFStringEncodingDOSLatin2;
1968 break ;
1969 case wxFONTENCODING_CP855 :
1970 enc = kCFStringEncodingDOSCyrillic;
1971 break ;
1972 case wxFONTENCODING_CP866 :
1973 enc =kCFStringEncodingDOSRussian ;
1974 break ;
1975 case wxFONTENCODING_CP874 :
1976 enc = kCFStringEncodingDOSThai;
1977 break ;
1978 case wxFONTENCODING_CP932 :
1979 enc = kCFStringEncodingDOSJapanese;
1980 break ;
1981 case wxFONTENCODING_CP936 :
1982 enc =kCFStringEncodingDOSChineseSimplif ;
1983 break ;
1984 case wxFONTENCODING_CP949 :
1985 enc = kCFStringEncodingDOSKorean;
1986 break ;
1987 case wxFONTENCODING_CP950 :
1988 enc = kCFStringEncodingDOSChineseTrad;
1989 break ;
ecd9653b
WS
1990 case wxFONTENCODING_CP1250 :
1991 enc = kCFStringEncodingWindowsLatin2;
1992 break ;
1993 case wxFONTENCODING_CP1251 :
1994 enc =kCFStringEncodingWindowsCyrillic ;
1995 break ;
1996 case wxFONTENCODING_CP1252 :
1997 enc =kCFStringEncodingWindowsLatin1 ;
1998 break ;
1999 case wxFONTENCODING_CP1253 :
2000 enc = kCFStringEncodingWindowsGreek;
2001 break ;
2002 case wxFONTENCODING_CP1254 :
2003 enc = kCFStringEncodingWindowsLatin5;
2004 break ;
2005 case wxFONTENCODING_CP1255 :
2006 enc =kCFStringEncodingWindowsHebrew ;
2007 break ;
2008 case wxFONTENCODING_CP1256 :
2009 enc =kCFStringEncodingWindowsArabic ;
2010 break ;
2011 case wxFONTENCODING_CP1257 :
2012 enc = kCFStringEncodingWindowsBalticRim;
2013 break ;
638357a0
RN
2014// This only really encodes to UTF7 (if that) evidently
2015// case wxFONTENCODING_UTF7 :
2016// enc = kCFStringEncodingNonLossyASCII ;
2017// break ;
ecd9653b
WS
2018 case wxFONTENCODING_UTF8 :
2019 enc = kCFStringEncodingUTF8 ;
2020 break ;
2021 case wxFONTENCODING_EUC_JP :
2022 enc = kCFStringEncodingEUC_JP;
2023 break ;
2024 case wxFONTENCODING_UTF16 :
f7e98dee 2025 enc = kCFStringEncodingUnicode ;
ecd9653b 2026 break ;
f7e98dee
RN
2027 case wxFONTENCODING_MACROMAN :
2028 enc = kCFStringEncodingMacRoman ;
2029 break ;
2030 case wxFONTENCODING_MACJAPANESE :
2031 enc = kCFStringEncodingMacJapanese ;
2032 break ;
2033 case wxFONTENCODING_MACCHINESETRAD :
2034 enc = kCFStringEncodingMacChineseTrad ;
2035 break ;
2036 case wxFONTENCODING_MACKOREAN :
2037 enc = kCFStringEncodingMacKorean ;
2038 break ;
2039 case wxFONTENCODING_MACARABIC :
2040 enc = kCFStringEncodingMacArabic ;
2041 break ;
2042 case wxFONTENCODING_MACHEBREW :
2043 enc = kCFStringEncodingMacHebrew ;
2044 break ;
2045 case wxFONTENCODING_MACGREEK :
2046 enc = kCFStringEncodingMacGreek ;
2047 break ;
2048 case wxFONTENCODING_MACCYRILLIC :
2049 enc = kCFStringEncodingMacCyrillic ;
2050 break ;
2051 case wxFONTENCODING_MACDEVANAGARI :
2052 enc = kCFStringEncodingMacDevanagari ;
2053 break ;
2054 case wxFONTENCODING_MACGURMUKHI :
2055 enc = kCFStringEncodingMacGurmukhi ;
2056 break ;
2057 case wxFONTENCODING_MACGUJARATI :
2058 enc = kCFStringEncodingMacGujarati ;
2059 break ;
2060 case wxFONTENCODING_MACORIYA :
2061 enc = kCFStringEncodingMacOriya ;
2062 break ;
2063 case wxFONTENCODING_MACBENGALI :
2064 enc = kCFStringEncodingMacBengali ;
2065 break ;
2066 case wxFONTENCODING_MACTAMIL :
2067 enc = kCFStringEncodingMacTamil ;
2068 break ;
2069 case wxFONTENCODING_MACTELUGU :
2070 enc = kCFStringEncodingMacTelugu ;
2071 break ;
2072 case wxFONTENCODING_MACKANNADA :
2073 enc = kCFStringEncodingMacKannada ;
2074 break ;
2075 case wxFONTENCODING_MACMALAJALAM :
2076 enc = kCFStringEncodingMacMalayalam ;
2077 break ;
2078 case wxFONTENCODING_MACSINHALESE :
2079 enc = kCFStringEncodingMacSinhalese ;
2080 break ;
2081 case wxFONTENCODING_MACBURMESE :
2082 enc = kCFStringEncodingMacBurmese ;
2083 break ;
2084 case wxFONTENCODING_MACKHMER :
2085 enc = kCFStringEncodingMacKhmer ;
2086 break ;
2087 case wxFONTENCODING_MACTHAI :
2088 enc = kCFStringEncodingMacThai ;
2089 break ;
2090 case wxFONTENCODING_MACLAOTIAN :
2091 enc = kCFStringEncodingMacLaotian ;
2092 break ;
2093 case wxFONTENCODING_MACGEORGIAN :
2094 enc = kCFStringEncodingMacGeorgian ;
2095 break ;
2096 case wxFONTENCODING_MACARMENIAN :
2097 enc = kCFStringEncodingMacArmenian ;
2098 break ;
2099 case wxFONTENCODING_MACCHINESESIMP :
2100 enc = kCFStringEncodingMacChineseSimp ;
2101 break ;
2102 case wxFONTENCODING_MACTIBETAN :
2103 enc = kCFStringEncodingMacTibetan ;
2104 break ;
2105 case wxFONTENCODING_MACMONGOLIAN :
2106 enc = kCFStringEncodingMacMongolian ;
2107 break ;
2108 case wxFONTENCODING_MACETHIOPIC :
2109 enc = kCFStringEncodingMacEthiopic ;
2110 break ;
2111 case wxFONTENCODING_MACCENTRALEUR :
2112 enc = kCFStringEncodingMacCentralEurRoman ;
2113 break ;
2114 case wxFONTENCODING_MACVIATNAMESE :
2115 enc = kCFStringEncodingMacVietnamese ;
2116 break ;
2117 case wxFONTENCODING_MACARABICEXT :
2118 enc = kCFStringEncodingMacExtArabic ;
2119 break ;
2120 case wxFONTENCODING_MACSYMBOL :
2121 enc = kCFStringEncodingMacSymbol ;
2122 break ;
2123 case wxFONTENCODING_MACDINGBATS :
2124 enc = kCFStringEncodingMacDingbats ;
2125 break ;
2126 case wxFONTENCODING_MACTURKISH :
2127 enc = kCFStringEncodingMacTurkish ;
2128 break ;
2129 case wxFONTENCODING_MACCROATIAN :
2130 enc = kCFStringEncodingMacCroatian ;
2131 break ;
2132 case wxFONTENCODING_MACICELANDIC :
2133 enc = kCFStringEncodingMacIcelandic ;
2134 break ;
2135 case wxFONTENCODING_MACROMANIAN :
2136 enc = kCFStringEncodingMacRomanian ;
2137 break ;
2138 case wxFONTENCODING_MACCELTIC :
2139 enc = kCFStringEncodingMacCeltic ;
2140 break ;
2141 case wxFONTENCODING_MACGAELIC :
2142 enc = kCFStringEncodingMacGaelic ;
2143 break ;
ecd9653b
WS
2144// case wxFONTENCODING_MACKEYBOARD :
2145// enc = kCFStringEncodingMacKeyboardGlyphs ;
2146// break ;
2147 default :
2148 // because gcc is picky
2149 break ;
2150 } ;
2151 return enc ;
f7e98dee
RN
2152}
2153
f7e98dee
RN
2154class wxMBConv_cocoa : public wxMBConv
2155{
2156public:
2157 wxMBConv_cocoa()
2158 {
2159 Init(CFStringGetSystemEncoding()) ;
2160 }
2161
a6900d10 2162#if wxUSE_FONTMAP
f7e98dee
RN
2163 wxMBConv_cocoa(const wxChar* name)
2164 {
267e11c5 2165 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
f7e98dee 2166 }
a6900d10 2167#endif
f7e98dee
RN
2168
2169 wxMBConv_cocoa(wxFontEncoding encoding)
2170 {
2171 Init( wxCFStringEncFromFontEnc(encoding) );
2172 }
2173
2174 ~wxMBConv_cocoa()
2175 {
2176 }
2177
2178 void Init( CFStringEncoding encoding)
2179 {
638357a0 2180 m_encoding = encoding ;
f7e98dee
RN
2181 }
2182
2183 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2184 {
2185 wxASSERT(szUnConv);
ecd9653b 2186
638357a0
RN
2187 CFStringRef theString = CFStringCreateWithBytes (
2188 NULL, //the allocator
2189 (const UInt8*)szUnConv,
2190 strlen(szUnConv),
2191 m_encoding,
2192 false //no BOM/external representation
f7e98dee
RN
2193 );
2194
2195 wxASSERT(theString);
2196
638357a0
RN
2197 size_t nOutLength = CFStringGetLength(theString);
2198
2199 if (szOut == NULL)
f7e98dee 2200 {
f7e98dee 2201 CFRelease(theString);
638357a0 2202 return nOutLength;
f7e98dee 2203 }
ecd9653b 2204
638357a0 2205 CFRange theRange = { 0, nOutSize };
ecd9653b 2206
638357a0
RN
2207#if SIZEOF_WCHAR_T == 4
2208 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2209#endif
3698ae71 2210
f7e98dee 2211 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
3698ae71 2212
f7e98dee 2213 CFRelease(theString);
ecd9653b 2214
638357a0 2215 szUniCharBuffer[nOutLength] = '\0' ;
f7e98dee
RN
2216
2217#if SIZEOF_WCHAR_T == 4
2218 wxMBConvUTF16 converter ;
638357a0 2219 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
f7e98dee
RN
2220 delete[] szUniCharBuffer;
2221#endif
3698ae71 2222
638357a0 2223 return nOutLength;
f7e98dee
RN
2224 }
2225
2226 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2227 {
638357a0 2228 wxASSERT(szUnConv);
3698ae71 2229
f7e98dee 2230 size_t nRealOutSize;
638357a0 2231 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 2232 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 2233
f7e98dee 2234#if SIZEOF_WCHAR_T == 4
d9d488cf 2235 wxMBConvUTF16 converter ;
f7e98dee
RN
2236 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2237 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2238 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2239 nBufSize /= sizeof(UniChar);
f7e98dee
RN
2240#endif
2241
2242 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2243 NULL, //allocator
2244 szUniBuffer,
2245 nBufSize,
638357a0 2246 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 2247 );
ecd9653b 2248
f7e98dee 2249 wxASSERT(theString);
ecd9653b 2250
f7e98dee 2251 //Note that CER puts a BOM when converting to unicode
638357a0
RN
2252 //so we check and use getchars instead in that case
2253 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 2254 {
638357a0
RN
2255 if (szOut != NULL)
2256 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
3698ae71 2257
638357a0
RN
2258 nRealOutSize = CFStringGetLength(theString) + 1;
2259 }
2260 else
2261 {
2262 CFStringGetBytes(
2263 theString,
2264 CFRangeMake(0, CFStringGetLength(theString)),
2265 m_encoding,
2266 0, //what to put in characters that can't be converted -
2267 //0 tells CFString to return NULL if it meets such a character
2268 false, //not an external representation
2269 (UInt8*) szOut,
3698ae71 2270 nOutSize,
638357a0
RN
2271 (CFIndex*) &nRealOutSize
2272 );
f7e98dee 2273 }
ecd9653b 2274
638357a0 2275 CFRelease(theString);
ecd9653b 2276
638357a0
RN
2277#if SIZEOF_WCHAR_T == 4
2278 delete[] szUniBuffer;
2279#endif
ecd9653b 2280
f7e98dee
RN
2281 return nRealOutSize - 1;
2282 }
2283
2284 bool IsOk() const
ecd9653b 2285 {
3698ae71 2286 return m_encoding != kCFStringEncodingInvalidId &&
638357a0 2287 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
2288 }
2289
2290private:
638357a0 2291 CFStringEncoding m_encoding ;
f7e98dee
RN
2292};
2293
2294#endif // defined(__WXCOCOA__)
2295
335d31e0
SC
2296// ============================================================================
2297// Mac conversion classes
2298// ============================================================================
2299
2300#if defined(__WXMAC__) && defined(TARGET_CARBON)
2301
2302class wxMBConv_mac : public wxMBConv
2303{
2304public:
2305 wxMBConv_mac()
2306 {
2307 Init(CFStringGetSystemEncoding()) ;
2308 }
2309
2d1659cf 2310#if wxUSE_FONTMAP
335d31e0
SC
2311 wxMBConv_mac(const wxChar* name)
2312 {
267e11c5 2313 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
335d31e0 2314 }
2d1659cf 2315#endif
335d31e0
SC
2316
2317 wxMBConv_mac(wxFontEncoding encoding)
2318 {
d775fa82
WS
2319 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2320 }
2321
2322 ~wxMBConv_mac()
2323 {
2324 OSStatus status = noErr ;
2325 status = TECDisposeConverter(m_MB2WC_converter);
2326 status = TECDisposeConverter(m_WC2MB_converter);
2327 }
2328
2329
2330 void Init( TextEncodingBase encoding)
2331 {
2332 OSStatus status = noErr ;
2333 m_char_encoding = encoding ;
2334 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2335
2336 status = TECCreateConverter(&m_MB2WC_converter,
2337 m_char_encoding,
2338 m_unicode_encoding);
2339 status = TECCreateConverter(&m_WC2MB_converter,
2340 m_unicode_encoding,
2341 m_char_encoding);
2342 }
2343
335d31e0
SC
2344 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2345 {
d775fa82
WS
2346 OSStatus status = noErr ;
2347 ByteCount byteOutLen ;
2348 ByteCount byteInLen = strlen(psz) ;
2349 wchar_t *tbuf = NULL ;
2350 UniChar* ubuf = NULL ;
2351 size_t res = 0 ;
2352
2353 if (buf == NULL)
2354 {
638357a0 2355 //apple specs say at least 32
c543817b 2356 n = wxMax( 32 , byteInLen ) ;
d775fa82
WS
2357 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2358 }
2359 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
f3a355ce 2360#if SIZEOF_WCHAR_T == 4
d775fa82 2361 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2362#else
d775fa82 2363 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2364#endif
d775fa82
WS
2365 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2366 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
f3a355ce 2367#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2368 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2369 // is not properly terminated we get random characters at the end
2370 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d9d488cf 2371 wxMBConvUTF16 converter ;
d775fa82
WS
2372 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2373 free( ubuf ) ;
f3a355ce 2374#else
d775fa82 2375 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2376#endif
d775fa82
WS
2377 if ( buf == NULL )
2378 free(tbuf) ;
335d31e0 2379
335d31e0
SC
2380 if ( buf && res < n)
2381 buf[res] = 0;
2382
d775fa82 2383 return res ;
335d31e0
SC
2384 }
2385
2386 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2387 {
2388 OSStatus status = noErr ;
2389 ByteCount byteOutLen ;
2390 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2391
2392 char *tbuf = NULL ;
2393
2394 if (buf == NULL)
2395 {
638357a0 2396 //apple specs say at least 32
c543817b 2397 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2398 tbuf = (char*) malloc( n ) ;
2399 }
2400
2401 ByteCount byteBufferLen = n ;
2402 UniChar* ubuf = NULL ;
f3a355ce 2403#if SIZEOF_WCHAR_T == 4
d9d488cf 2404 wxMBConvUTF16 converter ;
d775fa82
WS
2405 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2406 byteInLen = unicharlen ;
2407 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2408 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
f3a355ce 2409#else
d775fa82 2410 ubuf = (UniChar*) psz ;
f3a355ce 2411#endif
d775fa82
WS
2412 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2413 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
f3a355ce 2414#if SIZEOF_WCHAR_T == 4
d775fa82 2415 free( ubuf ) ;
f3a355ce 2416#endif
d775fa82
WS
2417 if ( buf == NULL )
2418 free(tbuf) ;
335d31e0 2419
d775fa82 2420 size_t res = byteOutLen ;
335d31e0 2421 if ( buf && res < n)
638357a0 2422 {
335d31e0 2423 buf[res] = 0;
3698ae71 2424
638357a0
RN
2425 //we need to double-trip to verify it didn't insert any ? in place
2426 //of bogus characters
2427 wxWCharBuffer wcBuf(n);
2428 size_t pszlen = wxWcslen(psz);
2429 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2430 wxWcslen(wcBuf) != pszlen ||
2431 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2432 {
2433 // we didn't obtain the same thing we started from, hence
2434 // the conversion was lossy and we consider that it failed
2435 return (size_t)-1;
2436 }
2437 }
335d31e0 2438
d775fa82 2439 return res ;
335d31e0
SC
2440 }
2441
2442 bool IsOk() const
2443 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2444
2445private:
d775fa82
WS
2446 TECObjectRef m_MB2WC_converter ;
2447 TECObjectRef m_WC2MB_converter ;
2448
2449 TextEncodingBase m_char_encoding ;
2450 TextEncodingBase m_unicode_encoding ;
335d31e0
SC
2451};
2452
2453#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2454
36acb880
VZ
2455// ============================================================================
2456// wxEncodingConverter based conversion classes
2457// ============================================================================
2458
1e6feb95 2459#if wxUSE_FONTMAP
1cd52418 2460
e95354ec 2461class wxMBConv_wxwin : public wxMBConv
1cd52418 2462{
8b04d4c4
VZ
2463private:
2464 void Init()
2465 {
2466 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2467 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2468 }
2469
6001e347 2470public:
f1339c56
RR
2471 // temporarily just use wxEncodingConverter stuff,
2472 // so that it works while a better implementation is built
e95354ec 2473 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2474 {
2475 if (name)
267e11c5 2476 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2477 else
2478 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2479
8b04d4c4
VZ
2480 Init();
2481 }
2482
e95354ec 2483 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2484 {
2485 m_enc = enc;
2486
2487 Init();
f1339c56 2488 }
dccce9ea 2489
bde4baac 2490 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2491 {
2492 size_t inbuf = strlen(psz);
dccce9ea 2493 if (buf)
c643a977
VS
2494 {
2495 if (!m2w.Convert(psz,buf))
2496 return (size_t)-1;
2497 }
f1339c56
RR
2498 return inbuf;
2499 }
dccce9ea 2500
bde4baac 2501 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2502 {
f8d791e0 2503 const size_t inbuf = wxWcslen(psz);
f1339c56 2504 if (buf)
c643a977
VS
2505 {
2506 if (!w2m.Convert(psz,buf))
2507 return (size_t)-1;
2508 }
dccce9ea 2509
f1339c56
RR
2510 return inbuf;
2511 }
dccce9ea 2512
e95354ec 2513 bool IsOk() const { return m_ok; }
f1339c56
RR
2514
2515public:
8b04d4c4 2516 wxFontEncoding m_enc;
f1339c56 2517 wxEncodingConverter m2w, w2m;
cafbf6fb
VZ
2518
2519 // were we initialized successfully?
2520 bool m_ok;
fc7a2a60 2521
e95354ec 2522 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2523};
6001e347 2524
8f115891
MW
2525// make the constructors available for unit testing
2526WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2527{
2528 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2529 if ( !result->IsOk() )
2530 {
2531 delete result;
2532 return 0;
2533 }
2534 return result;
2535}
2536
1e6feb95
VZ
2537#endif // wxUSE_FONTMAP
2538
36acb880
VZ
2539// ============================================================================
2540// wxCSConv implementation
2541// ============================================================================
2542
8b04d4c4 2543void wxCSConv::Init()
6001e347 2544{
e95354ec
VZ
2545 m_name = NULL;
2546 m_convReal = NULL;
2547 m_deferred = true;
2548}
2549
8b04d4c4
VZ
2550wxCSConv::wxCSConv(const wxChar *charset)
2551{
2552 Init();
82713003 2553
e95354ec
VZ
2554 if ( charset )
2555 {
e95354ec
VZ
2556 SetName(charset);
2557 }
bda3d86a 2558
e4277538
VZ
2559#if wxUSE_FONTMAP
2560 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2561#else
bda3d86a 2562 m_encoding = wxFONTENCODING_SYSTEM;
e4277538 2563#endif
6001e347
RR
2564}
2565
8b04d4c4
VZ
2566wxCSConv::wxCSConv(wxFontEncoding encoding)
2567{
bda3d86a 2568 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2569 {
2570 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2571
2572 encoding = wxFONTENCODING_SYSTEM;
2573 }
2574
8b04d4c4
VZ
2575 Init();
2576
bda3d86a 2577 m_encoding = encoding;
8b04d4c4
VZ
2578}
2579
6001e347
RR
2580wxCSConv::~wxCSConv()
2581{
65e50848
JS
2582 Clear();
2583}
2584
54380f29 2585wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2586 : wxMBConv()
54380f29 2587{
8b04d4c4
VZ
2588 Init();
2589
54380f29 2590 SetName(conv.m_name);
8b04d4c4 2591 m_encoding = conv.m_encoding;
54380f29
GD
2592}
2593
2594wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2595{
2596 Clear();
8b04d4c4 2597
54380f29 2598 SetName(conv.m_name);
8b04d4c4
VZ
2599 m_encoding = conv.m_encoding;
2600
54380f29
GD
2601 return *this;
2602}
2603
65e50848
JS
2604void wxCSConv::Clear()
2605{
8b04d4c4 2606 free(m_name);
e95354ec 2607 delete m_convReal;
8b04d4c4 2608
65e50848 2609 m_name = NULL;
e95354ec 2610 m_convReal = NULL;
6001e347
RR
2611}
2612
2613void wxCSConv::SetName(const wxChar *charset)
2614{
f1339c56
RR
2615 if (charset)
2616 {
2617 m_name = wxStrdup(charset);
e95354ec 2618 m_deferred = true;
f1339c56 2619 }
6001e347
RR
2620}
2621
8b3eb85d
VZ
#if wxUSE_FONTMAP

#include "wx/hashmap.h"

// map from a font encoding to a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate() to remember the name under which iconv
// accepted an encoding (or an empty string if no name worked)
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2630
e95354ec
VZ
2631wxMBConv *wxCSConv::DoCreate() const
2632{
ce6f8d6f
VZ
2633#if wxUSE_FONTMAP
2634 wxLogTrace(TRACE_STRCONV,
2635 wxT("creating conversion for %s"),
2636 (m_name ? m_name
2637 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2638#endif // wxUSE_FONTMAP
2639
c547282d
VZ
2640 // check for the special case of ASCII or ISO8859-1 charset: as we have
2641 // special knowledge of it anyhow, we don't need to create a special
2642 // conversion object
e4277538
VZ
2643 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2644 m_encoding == wxFONTENCODING_DEFAULT )
f1339c56 2645 {
e95354ec
VZ
2646 // don't convert at all
2647 return NULL;
2648 }
dccce9ea 2649
e95354ec
VZ
2650 // we trust OS to do conversion better than we can so try external
2651 // conversion methods first
2652 //
2653 // the full order is:
2654 // 1. OS conversion (iconv() under Unix or Win32 API)
2655 // 2. hard coded conversions for UTF
2656 // 3. wxEncodingConverter as fall back
2657
2658 // step (1)
2659#ifdef HAVE_ICONV
c547282d 2660#if !wxUSE_FONTMAP
e95354ec 2661 if ( m_name )
c547282d 2662#endif // !wxUSE_FONTMAP
e95354ec 2663 {
c547282d 2664 wxString name(m_name);
8b3eb85d
VZ
2665 wxFontEncoding encoding(m_encoding);
2666
2667 if ( !name.empty() )
2668 {
2669 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2670 if ( conv->IsOk() )
2671 return conv;
2672
2673 delete conv;
c547282d
VZ
2674
2675#if wxUSE_FONTMAP
8b3eb85d
VZ
2676 encoding =
2677 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
c547282d 2678#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2679 }
2680#if wxUSE_FONTMAP
2681 {
2682 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2683 if ( it != gs_nameCache.end() )
2684 {
2685 if ( it->second.empty() )
2686 return NULL;
c547282d 2687
8b3eb85d
VZ
2688 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2689 if ( conv->IsOk() )
2690 return conv;
e95354ec 2691
8b3eb85d
VZ
2692 delete conv;
2693 }
2694
2695 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2696
2697 for ( ; *names; ++names )
2698 {
2699 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2700 if ( conv->IsOk() )
2701 {
2702 gs_nameCache[encoding] = *names;
2703 return conv;
2704 }
2705
2706 delete conv;
2707 }
2708
40711af8 2709 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d
VZ
2710 }
2711#endif // wxUSE_FONTMAP
e95354ec
VZ
2712 }
2713#endif // HAVE_ICONV
2714
2715#ifdef wxHAVE_WIN32_MB2WC
2716 {
7608a683 2717#if wxUSE_FONTMAP
e95354ec
VZ
2718 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2719 : new wxMBConv_win32(m_encoding);
2720 if ( conv->IsOk() )
2721 return conv;
2722
2723 delete conv;
7608a683
WS
2724#else
2725 return NULL;
2726#endif
e95354ec
VZ
2727 }
2728#endif // wxHAVE_WIN32_MB2WC
d775fa82
WS
2729#if defined(__WXMAC__)
2730 {
5c3c8676 2731 // leave UTF16 and UTF32 to the built-ins of wx
3698ae71 2732 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
5c3c8676 2733 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
d775fa82
WS
2734 {
2735
2d1659cf 2736#if wxUSE_FONTMAP
d775fa82
WS
2737 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2738 : new wxMBConv_mac(m_encoding);
2d1659cf
RN
2739#else
2740 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2741#endif
d775fa82 2742 if ( conv->IsOk() )
f7e98dee
RN
2743 return conv;
2744
2745 delete conv;
2746 }
2747 }
2748#endif
2749#if defined(__WXCOCOA__)
2750 {
2751 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2752 {
2753
a6900d10 2754#if wxUSE_FONTMAP
f7e98dee
RN
2755 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2756 : new wxMBConv_cocoa(m_encoding);
a6900d10
RN
2757#else
2758 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2759#endif
f7e98dee 2760 if ( conv->IsOk() )
d775fa82
WS
2761 return conv;
2762
2763 delete conv;
2764 }
335d31e0
SC
2765 }
2766#endif
e95354ec
VZ
2767 // step (2)
2768 wxFontEncoding enc = m_encoding;
2769#if wxUSE_FONTMAP
c547282d
VZ
2770 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2771 {
2772 // use "false" to suppress interactive dialogs -- we can be called from
2773 // anywhere and popping up a dialog from here is the last thing we want to
2774 // do
267e11c5 2775 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2776 }
e95354ec
VZ
2777#endif // wxUSE_FONTMAP
2778
2779 switch ( enc )
2780 {
2781 case wxFONTENCODING_UTF7:
2782 return new wxMBConvUTF7;
2783
2784 case wxFONTENCODING_UTF8:
2785 return new wxMBConvUTF8;
2786
e95354ec
VZ
2787 case wxFONTENCODING_UTF16BE:
2788 return new wxMBConvUTF16BE;
2789
2790 case wxFONTENCODING_UTF16LE:
2791 return new wxMBConvUTF16LE;
2792
e95354ec
VZ
2793 case wxFONTENCODING_UTF32BE:
2794 return new wxMBConvUTF32BE;
2795
2796 case wxFONTENCODING_UTF32LE:
2797 return new wxMBConvUTF32LE;
2798
2799 default:
2800 // nothing to do but put here to suppress gcc warnings
2801 ;
2802 }
2803
2804 // step (3)
2805#if wxUSE_FONTMAP
2806 {
2807 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2808 : new wxMBConv_wxwin(m_encoding);
2809 if ( conv->IsOk() )
2810 return conv;
2811
2812 delete conv;
2813 }
2814#endif // wxUSE_FONTMAP
2815
a58d4f4d
VS
2816 // NB: This is a hack to prevent deadlock. What could otherwise happen
2817 // in Unicode build: wxConvLocal creation ends up being here
2818 // because of some failure and logs the error. But wxLog will try to
2819 // attach timestamp, for which it will need wxConvLocal (to convert
2820 // time to char* and then wchar_t*), but that fails, tries to log
2821 // error, but wxLog has a (already locked) critical section that
2822 // guards static buffer.
2823 static bool alreadyLoggingError = false;
2824 if (!alreadyLoggingError)
2825 {
2826 alreadyLoggingError = true;
2827 wxLogError(_("Cannot convert from the charset '%s'!"),
2828 m_name ? m_name
e95354ec
VZ
2829 :
2830#if wxUSE_FONTMAP
267e11c5 2831 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
e95354ec
VZ
2832#else // !wxUSE_FONTMAP
2833 wxString::Format(_("encoding %s"), m_encoding).c_str()
2834#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2835 );
a58d4f4d
VS
2836 alreadyLoggingError = false;
2837 }
e95354ec
VZ
2838
2839 return NULL;
2840}
2841
2842void wxCSConv::CreateConvIfNeeded() const
2843{
2844 if ( m_deferred )
2845 {
2846 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
2847
2848#if wxUSE_INTL
2849 // if we don't have neither the name nor the encoding, use the default
2850 // encoding for this system
2851 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2852 {
4d312c22 2853 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
2854 }
2855#endif // wxUSE_INTL
2856
e95354ec
VZ
2857 self->m_convReal = DoCreate();
2858 self->m_deferred = false;
6001e347 2859 }
6001e347
RR
2860}
2861
2862size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2863{
e95354ec 2864 CreateConvIfNeeded();
dccce9ea 2865
e95354ec
VZ
2866 if (m_convReal)
2867 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
2868
2869 // latin-1 (direct)
4def3b35 2870 size_t len = strlen(psz);
dccce9ea 2871
f1339c56
RR
2872 if (buf)
2873 {
4def3b35 2874 for (size_t c = 0; c <= len; c++)
f1339c56
RR
2875 buf[c] = (unsigned char)(psz[c]);
2876 }
dccce9ea 2877
f1339c56 2878 return len;
6001e347
RR
2879}
2880
2881size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2882{
e95354ec 2883 CreateConvIfNeeded();
dccce9ea 2884
e95354ec
VZ
2885 if (m_convReal)
2886 return m_convReal->WC2MB(buf, psz, n);
1cd52418 2887
f1339c56 2888 // latin-1 (direct)
f8d791e0 2889 const size_t len = wxWcslen(psz);
f1339c56
RR
2890 if (buf)
2891 {
4def3b35 2892 for (size_t c = 0; c <= len; c++)
24642831
VS
2893 {
2894 if (psz[c] > 0xFF)
2895 return (size_t)-1;
907173e5 2896 buf[c] = (char)psz[c];
24642831
VS
2897 }
2898 }
2899 else
2900 {
2901 for (size_t c = 0; c <= len; c++)
2902 {
2903 if (psz[c] > 0xFF)
2904 return (size_t)-1;
2905 }
f1339c56 2906 }
dccce9ea 2907
f1339c56 2908 return len;
6001e347
RR
2909}
2910
bde4baac
VZ
2911// ----------------------------------------------------------------------------
2912// globals
2913// ----------------------------------------------------------------------------
2914
2915#ifdef __WINDOWS__
2916 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
2917#elif defined(__WXMAC__) && !defined(__MACH__)
2918 static wxMBConv_mac wxConvLibcObj ;
bde4baac 2919#else
dcc8fac0 2920 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
2921#endif
2922
2923static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2924static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2925static wxMBConvUTF7 wxConvUTF7Obj;
2926static wxMBConvUTF8 wxConvUTF8Obj;
c12b7f79 2927
bde4baac
VZ
2928WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2929WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2930WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2931WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2932WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2933WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
f5a1953b
VZ
2934WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2935#ifdef __WXOSX__
ea8ce907 2936 wxConvUTF8Obj;
f5a1953b 2937#else
ea8ce907 2938 wxConvLibcObj;
f5a1953b
VZ
2939#endif
2940
bde4baac
VZ
2941
2942#else // !wxUSE_WCHAR_T
2943
2944// stand-ins in absence of wchar_t
2945WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2946 wxConvISO8859_1,
2947 wxConvLocal,
2948 wxConvUTF8;
2949
2950#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T