]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
wxMotif for OS/2 adjustements. Source cleaning.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347 1/////////////////////////////////////////////////////////////////////////////
38d4b1e4 2// Name: src/common/strconv.cpp
6001e347 3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
f6bcfd97
BP
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
6001e347
RR
23// For compilers that support precompilation, includes "wx.h".
24#include "wx/wxprec.h"
25
26#ifdef __BORLANDC__
27 #pragma hdrstop
28#endif
29
373658eb
VZ
30#ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33#endif // WX_PRECOMP
34
bde4baac
VZ
35#include "wx/strconv.h"
36
37#if wxUSE_WCHAR_T
38
7608a683 39#ifdef __WINDOWS__
532d575b 40 #include "wx/msw/private.h"
13dd924a 41 #include "wx/msw/missing.h"
0a1c1e62
GRG
42#endif
43
1c193821 44#ifndef __WXWINCE__
1cd52418 45#include <errno.h>
1c193821
JS
46#endif
47
6001e347
RR
48#include <ctype.h>
49#include <string.h>
50#include <stdlib.h>
51
e95354ec
VZ
52#if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54#endif // __WIN32__ but !__WXMICROWIN__
55
6001e347 56#ifdef __SALFORDC__
373658eb 57 #include <clib.h>
6001e347
RR
58#endif
59
b040e242 60#ifdef HAVE_ICONV
373658eb 61 #include <iconv.h>
b1d547eb 62 #include "wx/thread.h"
1cd52418 63#endif
1cd52418 64
373658eb
VZ
65#include "wx/encconv.h"
66#include "wx/fontmap.h"
7608a683 67#include "wx/utils.h"
373658eb 68
335d31e0 69#ifdef __WXMAC__
40ba2f3b 70#ifndef __DARWIN__
4227afa4
SC
71#include <ATSUnicode.h>
72#include <TextCommon.h>
73#include <TextEncodingConverter.h>
40ba2f3b 74#endif
335d31e0
SC
75
76#include "wx/mac/private.h" // includes mac headers
77#endif
ce6f8d6f
VZ
78
79#define TRACE_STRCONV _T("strconv")
80
373658eb
VZ
81// ============================================================================
82// implementation
83// ============================================================================
84
85// ----------------------------------------------------------------------------
c91830cb 86// UTF-16 en/decoding to/from UCS-4
373658eb 87// ----------------------------------------------------------------------------
6001e347 88
b0a6bb75 89
c91830cb 90static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 91{
dccce9ea 92 if (input<=0xffff)
4def3b35 93 {
999836aa
VZ
94 if (output)
95 *output = (wxUint16) input;
4def3b35 96 return 1;
dccce9ea
VZ
97 }
98 else if (input>=0x110000)
4def3b35
VS
99 {
100 return (size_t)-1;
dccce9ea
VZ
101 }
102 else
4def3b35 103 {
dccce9ea 104 if (output)
4def3b35 105 {
c91830cb 106 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
999836aa 107 *output = (wxUint16) ((input&0x3ff)+0xdc00);
4def3b35
VS
108 }
109 return 2;
1cd52418 110 }
1cd52418
OK
111}
112
c91830cb 113static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 114{
dccce9ea 115 if ((*input<0xd800) || (*input>0xdfff))
4def3b35
VS
116 {
117 output = *input;
118 return 1;
dccce9ea 119 }
cdb14ecb 120 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
4def3b35
VS
121 {
122 output = *input;
123 return (size_t)-1;
dccce9ea
VZ
124 }
125 else
4def3b35
VS
126 {
127 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
128 return 2;
129 }
1cd52418
OK
130}
131
b0a6bb75 132
f6bcfd97 133// ----------------------------------------------------------------------------
6001e347 134// wxMBConv
f6bcfd97 135// ----------------------------------------------------------------------------
2c53a80a
WS
136
137wxMBConv::~wxMBConv()
138{
139 // nothing to do here (necessary for Darwin linking probably)
140}
6001e347 141
6001e347
RR
142const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
143{
2b5f62a0 144 if ( psz )
6001e347 145 {
2b5f62a0
VZ
146 // calculate the length of the buffer needed first
147 size_t nLen = MB2WC(NULL, psz, 0);
148 if ( nLen != (size_t)-1 )
149 {
150 // now do the actual conversion
151 wxWCharBuffer buf(nLen);
635f33ce
VS
152 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
153 if ( nLen != (size_t)-1 )
154 {
155 return buf;
156 }
2b5f62a0 157 }
f6bcfd97 158 }
2b5f62a0
VZ
159
160 wxWCharBuffer buf((wchar_t *)NULL);
161
162 return buf;
6001e347
RR
163}
164
e5cceba0 165const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
6001e347 166{
2b5f62a0
VZ
167 if ( pwz )
168 {
169 size_t nLen = WC2MB(NULL, pwz, 0);
170 if ( nLen != (size_t)-1 )
171 {
c91830cb 172 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
635f33ce
VS
173 nLen = WC2MB(buf.data(), pwz, nLen + 4);
174 if ( nLen != (size_t)-1 )
175 {
176 return buf;
177 }
2b5f62a0
VZ
178 }
179 }
180
181 wxCharBuffer buf((char *)NULL);
e5cceba0 182
e5cceba0 183 return buf;
6001e347
RR
184}
185
f5fb6871 186const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
e4e3bbb4 187{
f5fb6871
RN
188 wxASSERT(pOutSize != NULL);
189
e4e3bbb4
RN
190 const char* szEnd = szString + nStringLen + 1;
191 const char* szPos = szString;
192 const char* szStart = szPos;
193
194 size_t nActualLength = 0;
f5fb6871
RN
195 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
196
197 wxWCharBuffer theBuffer(nCurrentSize);
e4e3bbb4
RN
198
199 //Convert the string until the length() is reached, continuing the
200 //loop every time a null character is reached
201 while(szPos != szEnd)
202 {
203 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
204
205 //Get the length of the current (sub)string
206 size_t nLen = MB2WC(NULL, szPos, 0);
207
208 //Invalid conversion?
209 if( nLen == (size_t)-1 )
f5fb6871
RN
210 {
211 *pOutSize = 0;
212 theBuffer.data()[0u] = wxT('\0');
213 return theBuffer;
214 }
215
e4e3bbb4
RN
216
217 //Increase the actual length (+1 for current null character)
218 nActualLength += nLen + 1;
219
f5fb6871
RN
220 //if buffer too big, realloc the buffer
221 if (nActualLength > (nCurrentSize+1))
222 {
223 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
224 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
225 theBuffer = theNewBuffer;
226 nCurrentSize <<= 1;
227 }
228
229 //Convert the current (sub)string
230 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
e4e3bbb4 231 {
f5fb6871
RN
232 *pOutSize = 0;
233 theBuffer.data()[0u] = wxT('\0');
234 return theBuffer;
e4e3bbb4
RN
235 }
236
237 //Increment to next (sub)string
3103e8a9
JS
238 //Note that we have to use strlen instead of nLen here
239 //because XX2XX gives us the size of the output buffer,
240 //which is not necessarily the length of the string
e4e3bbb4
RN
241 szPos += strlen(szPos) + 1;
242 }
243
f5fb6871
RN
244 //success - return actual length and the buffer
245 *pOutSize = nActualLength;
3698ae71 246 return theBuffer;
e4e3bbb4
RN
247}
248
f5fb6871 249const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
e4e3bbb4 250{
f5fb6871
RN
251 wxASSERT(pOutSize != NULL);
252
e4e3bbb4
RN
253 const wchar_t* szEnd = szString + nStringLen + 1;
254 const wchar_t* szPos = szString;
255 const wchar_t* szStart = szPos;
256
257 size_t nActualLength = 0;
f5fb6871
RN
258 size_t nCurrentSize = nStringLen << 2; //try * 4 first
259
260 wxCharBuffer theBuffer(nCurrentSize);
e4e3bbb4
RN
261
262 //Convert the string until the length() is reached, continuing the
263 //loop every time a null character is reached
264 while(szPos != szEnd)
265 {
266 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
267
268 //Get the length of the current (sub)string
269 size_t nLen = WC2MB(NULL, szPos, 0);
270
271 //Invalid conversion?
272 if( nLen == (size_t)-1 )
f5fb6871
RN
273 {
274 *pOutSize = 0;
275 theBuffer.data()[0u] = wxT('\0');
276 return theBuffer;
277 }
e4e3bbb4
RN
278
279 //Increase the actual length (+1 for current null character)
280 nActualLength += nLen + 1;
3698ae71 281
f5fb6871
RN
282 //if buffer too big, realloc the buffer
283 if (nActualLength > (nCurrentSize+1))
284 {
285 wxCharBuffer theNewBuffer(nCurrentSize << 1);
286 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
287 theBuffer = theNewBuffer;
288 nCurrentSize <<= 1;
289 }
290
291 //Convert the current (sub)string
292 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
e4e3bbb4 293 {
f5fb6871
RN
294 *pOutSize = 0;
295 theBuffer.data()[0u] = wxT('\0');
296 return theBuffer;
e4e3bbb4
RN
297 }
298
299 //Increment to next (sub)string
3103e8a9
JS
300 //Note that we have to use wxWcslen instead of nLen here
301 //because XX2XX gives us the size of the output buffer,
302 //which is not necessarily the length of the string
e4e3bbb4
RN
303 szPos += wxWcslen(szPos) + 1;
304 }
305
f5fb6871
RN
306 //success - return actual length and the buffer
307 *pOutSize = nActualLength;
3698ae71 308 return theBuffer;
e4e3bbb4
RN
309}
310
6001e347 311// ----------------------------------------------------------------------------
bde4baac 312// wxMBConvLibc
6001e347
RR
313// ----------------------------------------------------------------------------
314
bde4baac
VZ
315size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
316{
317 return wxMB2WC(buf, psz, n);
318}
319
320size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
321{
322 return wxWC2MB(buf, psz, n);
323}
e1bfe89e 324
66bf0099 325#ifdef __UNIX__
c12b7f79 326
e1bfe89e 327// ----------------------------------------------------------------------------
532d575b 328// wxConvBrokenFileNames
e1bfe89e
RR
329// ----------------------------------------------------------------------------
330
845905d5 331wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
ea8ce907 332{
845905d5
MW
333 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
334 || wxStricmp(charset, _T("UTF8")) == 0 )
335 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
336 else
337 m_conv = new wxCSConv(charset);
ea8ce907
RR
338}
339
c12b7f79
VZ
340size_t
341wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
342 const char *psz,
343 size_t outputSize) const
e1bfe89e 344{
c12b7f79 345 return m_conv->MB2WC( outputBuf, psz, outputSize );
e1bfe89e
RR
346}
347
c12b7f79
VZ
348size_t
349wxConvBrokenFileNames::WC2MB(char *outputBuf,
350 const wchar_t *psz,
351 size_t outputSize) const
e1bfe89e 352{
c12b7f79 353 return m_conv->WC2MB( outputBuf, psz, outputSize );
e1bfe89e
RR
354}
355
66bf0099 356#endif
c12b7f79 357
bde4baac 358// ----------------------------------------------------------------------------
3698ae71 359// UTF-7
bde4baac 360// ----------------------------------------------------------------------------
6001e347 361
15f2ee32 362// Implementation (C) 2004 Fredrik Roubert
6001e347 363
15f2ee32
RN
364//
365// BASE64 decoding table
366//
// Indexed by byte value: the 6-bit BASE64 value of the characters
// 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/', and 0xff for every other byte.
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x27
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x28 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x5f: 'A' - 'Z'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x7f: 'a' - 'z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
402
403size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
404{
15f2ee32
RN
405 size_t len = 0;
406
407 while (*psz && ((!buf) || (len < n)))
408 {
409 unsigned char cc = *psz++;
410 if (cc != '+')
411 {
412 // plain ASCII char
413 if (buf)
414 *buf++ = cc;
415 len++;
416 }
417 else if (*psz == '-')
418 {
419 // encoded plus sign
420 if (buf)
421 *buf++ = cc;
422 len++;
423 psz++;
424 }
425 else
426 {
427 // BASE64 encoded string
428 bool lsb;
429 unsigned char c;
430 unsigned int d, l;
431 for (lsb = false, d = 0, l = 0;
432 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
433 {
434 d <<= 6;
435 d += cc;
436 for (l += 6; l >= 8; lsb = !lsb)
437 {
6356d52a 438 c = (unsigned char)((d >> (l -= 8)) % 256);
15f2ee32
RN
439 if (lsb)
440 {
441 if (buf)
442 *buf++ |= c;
443 len ++;
444 }
445 else
446 if (buf)
6356d52a 447 *buf = (wchar_t)(c << 8);
15f2ee32
RN
448 }
449 }
450 if (*psz == '-')
451 psz++;
452 }
453 }
454 if (buf && (len < n))
455 *buf = 0;
456 return len;
6001e347
RR
457}
458
15f2ee32
RN
459//
460// BASE64 encoding table
461//
// Indexed by a 6-bit value: the BASE64 character representing it.
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62, 63
    '+', '/'
};
473
474//
475// UTF-7 encoding table
476//
477// 0 - Set D (directly encoded characters)
478// 1 - Set O (optional direct characters)
479// 2 - whitespace characters (optional)
480// 3 - special characters
481//
// Indexed by ASCII code; see the legend above for the meaning of 0..3.
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
493
667e5b3e 494size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
15f2ee32
RN
495{
496
497
498 size_t len = 0;
499
500 while (*psz && ((!buf) || (len < n)))
501 {
502 wchar_t cc = *psz++;
503 if (cc < 0x80 && utf7encode[cc] < 1)
504 {
505 // plain ASCII char
506 if (buf)
507 *buf++ = (char)cc;
508 len++;
509 }
510#ifndef WC_UTF16
79c78d42 511 else if (((wxUint32)cc) > 0xffff)
b2c13097 512 {
15f2ee32
RN
513 // no surrogate pair generation (yet?)
514 return (size_t)-1;
515 }
516#endif
517 else
518 {
519 if (buf)
520 *buf++ = '+';
521 len++;
522 if (cc != '+')
523 {
524 // BASE64 encode string
525 unsigned int lsb, d, l;
526 for (d = 0, l = 0;; psz++)
527 {
528 for (lsb = 0; lsb < 2; lsb ++)
529 {
530 d <<= 8;
531 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
532
533 for (l += 8; l >= 6; )
534 {
535 l -= 6;
536 if (buf)
537 *buf++ = utf7enb64[(d >> l) % 64];
538 len++;
539 }
540 }
541 cc = *psz;
542 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
543 break;
544 }
545 if (l != 0)
546 {
547 if (buf)
548 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
549 len++;
550 }
551 }
552 if (buf)
553 *buf++ = '-';
554 len++;
555 }
556 }
557 if (buf && (len < n))
558 *buf = 0;
559 return len;
6001e347
RR
560}
561
f6bcfd97 562// ----------------------------------------------------------------------------
6001e347 563// UTF-8
f6bcfd97 564// ----------------------------------------------------------------------------
6001e347 565
dccce9ea 566static wxUint32 utf8_max[]=
4def3b35 567 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347 568
3698ae71
VZ
// boundaries of the private use area we use to (temporarily) remap bytes
// which are invalid in a UTF-8 encoded string
ea8ce907
RR
571const wxUint32 wxUnicodePUA = 0x100000;
572const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
573
6001e347
RR
574size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
575{
4def3b35
VS
576 size_t len = 0;
577
dccce9ea 578 while (*psz && ((!buf) || (len < n)))
4def3b35 579 {
ea8ce907
RR
580 const char *opsz = psz;
581 bool invalid = false;
4def3b35
VS
582 unsigned char cc = *psz++, fc = cc;
583 unsigned cnt;
dccce9ea 584 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 585 fc <<= 1;
dccce9ea 586 if (!cnt)
4def3b35
VS
587 {
588 // plain ASCII char
dccce9ea 589 if (buf)
4def3b35
VS
590 *buf++ = cc;
591 len++;
561488ef
MW
592
593 // escape the escape character for octal escapes
594 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
595 && cc == '\\' && (!buf || len < n))
596 {
597 if (buf)
598 *buf++ = cc;
599 len++;
600 }
dccce9ea
VZ
601 }
602 else
4def3b35
VS
603 {
604 cnt--;
dccce9ea 605 if (!cnt)
4def3b35
VS
606 {
607 // invalid UTF-8 sequence
ea8ce907 608 invalid = true;
dccce9ea
VZ
609 }
610 else
4def3b35
VS
611 {
612 unsigned ocnt = cnt - 1;
613 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 614 while (cnt--)
4def3b35 615 {
ea8ce907 616 cc = *psz;
dccce9ea 617 if ((cc & 0xC0) != 0x80)
4def3b35
VS
618 {
619 // invalid UTF-8 sequence
ea8ce907
RR
620 invalid = true;
621 break;
4def3b35 622 }
ea8ce907 623 psz++;
4def3b35
VS
624 res = (res << 6) | (cc & 0x3f);
625 }
ea8ce907 626 if (invalid || res <= utf8_max[ocnt])
4def3b35
VS
627 {
628 // illegal UTF-8 encoding
ea8ce907 629 invalid = true;
4def3b35 630 }
ea8ce907
RR
631 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
632 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
633 {
634 // if one of our PUA characters turns up externally
635 // it must also be treated as an illegal sequence
636 // (a bit like you have to escape an escape character)
637 invalid = true;
638 }
639 else
640 {
1cd52418 641#ifdef WC_UTF16
ea8ce907
RR
642 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
643 size_t pa = encode_utf16(res, (wxUint16 *)buf);
644 if (pa == (size_t)-1)
645 {
646 invalid = true;
647 }
648 else
649 {
650 if (buf)
651 buf += pa;
652 len += pa;
653 }
373658eb 654#else // !WC_UTF16
ea8ce907 655 if (buf)
38d4b1e4 656 *buf++ = (wchar_t)res;
ea8ce907 657 len++;
373658eb 658#endif // WC_UTF16/!WC_UTF16
ea8ce907
RR
659 }
660 }
661 if (invalid)
662 {
663 if (m_options & MAP_INVALID_UTF8_TO_PUA)
664 {
665 while (opsz < psz && (!buf || len < n))
666 {
667#ifdef WC_UTF16
668 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
669 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
670 wxASSERT(pa != (size_t)-1);
671 if (buf)
672 buf += pa;
673 opsz++;
674 len += pa;
675#else
676 if (buf)
38d4b1e4 677 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
ea8ce907
RR
678 opsz++;
679 len++;
680#endif
681 }
682 }
3698ae71 683 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
ea8ce907
RR
684 {
685 while (opsz < psz && (!buf || len < n))
686 {
3698ae71
VZ
687 if ( buf && len + 3 < n )
688 {
17a1ebd1 689 unsigned char on = *opsz;
3698ae71 690 *buf++ = L'\\';
17a1ebd1
VZ
691 *buf++ = (wchar_t)( L'0' + on / 0100 );
692 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
693 *buf++ = (wchar_t)( L'0' + on % 010 );
3698ae71 694 }
ea8ce907
RR
695 opsz++;
696 len += 4;
697 }
698 }
3698ae71 699 else // MAP_INVALID_UTF8_NOT
ea8ce907
RR
700 {
701 return (size_t)-1;
702 }
4def3b35
VS
703 }
704 }
6001e347 705 }
dccce9ea 706 if (buf && (len < n))
4def3b35
VS
707 *buf = 0;
708 return len;
6001e347
RR
709}
710
3698ae71
VZ
// Returns true if 'wch' is one of the digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
715
6001e347
RR
716size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
717{
4def3b35 718 size_t len = 0;
6001e347 719
dccce9ea 720 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
721 {
722 wxUint32 cc;
1cd52418 723#ifdef WC_UTF16
b5153fd8
VZ
724 // cast is ok for WC_UTF16
725 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
4def3b35 726 psz += (pa == (size_t)-1) ? 1 : pa;
1cd52418 727#else
4def3b35
VS
728 cc=(*psz++) & 0x7fffffff;
729#endif
3698ae71
VZ
730
731 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
732 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
4def3b35 733 {
dccce9ea 734 if (buf)
ea8ce907 735 *buf++ = (char)(cc - wxUnicodePUA);
4def3b35 736 len++;
3698ae71 737 }
561488ef
MW
738 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
739 && cc == L'\\' && psz[0] == L'\\' )
740 {
741 if (buf)
742 *buf++ = (char)cc;
743 psz++;
744 len++;
745 }
3698ae71
VZ
746 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
747 cc == L'\\' &&
748 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
4def3b35 749 {
dccce9ea 750 if (buf)
3698ae71 751 {
b2c13097
WS
752 *buf++ = (char) ((psz[0] - L'0')*0100 +
753 (psz[1] - L'0')*010 +
754 (psz[2] - L'0'));
3698ae71
VZ
755 }
756
757 psz += 3;
ea8ce907
RR
758 len++;
759 }
760 else
761 {
762 unsigned cnt;
763 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
764 if (!cnt)
4def3b35 765 {
ea8ce907
RR
766 // plain ASCII char
767 if (buf)
768 *buf++ = (char) cc;
769 len++;
770 }
771
772 else
773 {
774 len += cnt + 1;
775 if (buf)
776 {
777 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
778 while (cnt--)
779 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
780 }
4def3b35
VS
781 }
782 }
6001e347 783 }
4def3b35 784
3698ae71
VZ
785 if (buf && (len<n))
786 *buf = 0;
adb45366 787
4def3b35 788 return len;
6001e347
RR
789}
790
c91830cb
VZ
791// ----------------------------------------------------------------------------
792// UTF-16
793// ----------------------------------------------------------------------------
794
795#ifdef WORDS_BIGENDIAN
bde4baac
VZ
796 #define wxMBConvUTF16straight wxMBConvUTF16BE
797 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 798#else
bde4baac
VZ
799 #define wxMBConvUTF16swap wxMBConvUTF16BE
800 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
801#endif
802
803
c91830cb
VZ
804#ifdef WC_UTF16
805
c91830cb
VZ
806// copy 16bit MB to 16bit String
807size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
808{
809 size_t len=0;
810
811 while (*(wxUint16*)psz && (!buf || len < n))
812 {
813 if (buf)
814 *buf++ = *(wxUint16*)psz;
815 len++;
816
817 psz += sizeof(wxUint16);
818 }
819 if (buf && len<n) *buf=0;
820
821 return len;
822}
823
824
825// copy 16bit String to 16bit MB
826size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
827{
828 size_t len=0;
829
830 while (*psz && (!buf || len < n))
831 {
832 if (buf)
833 {
834 *(wxUint16*)buf = *psz;
835 buf += sizeof(wxUint16);
836 }
837 len += sizeof(wxUint16);
838 psz++;
839 }
840 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
841
842 return len;
843}
844
845
846// swap 16bit MB to 16bit String
847size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
848{
849 size_t len=0;
850
851 while (*(wxUint16*)psz && (!buf || len < n))
852 {
853 if (buf)
854 {
855 ((char *)buf)[0] = psz[1];
856 ((char *)buf)[1] = psz[0];
857 buf++;
858 }
859 len++;
860 psz += sizeof(wxUint16);
861 }
862 if (buf && len<n) *buf=0;
863
864 return len;
865}
866
867
// swap 16bit String to 16bit MB
869size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
870{
871 size_t len=0;
872
873 while (*psz && (!buf || len < n))
874 {
875 if (buf)
876 {
877 *buf++ = ((char*)psz)[1];
878 *buf++ = ((char*)psz)[0];
879 }
880 len += sizeof(wxUint16);
881 psz++;
882 }
883 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
884
885 return len;
886}
887
888
889#else // WC_UTF16
890
891
892// copy 16bit MB to 32bit String
893size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
894{
895 size_t len=0;
896
897 while (*(wxUint16*)psz && (!buf || len < n))
898 {
899 wxUint32 cc;
900 size_t pa=decode_utf16((wxUint16*)psz, cc);
901 if (pa == (size_t)-1)
902 return pa;
903
904 if (buf)
38d4b1e4 905 *buf++ = (wchar_t)cc;
c91830cb
VZ
906 len++;
907 psz += pa * sizeof(wxUint16);
908 }
909 if (buf && len<n) *buf=0;
910
911 return len;
912}
913
914
915// copy 32bit String to 16bit MB
916size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
917{
918 size_t len=0;
919
920 while (*psz && (!buf || len < n))
921 {
922 wxUint16 cc[2];
923 size_t pa=encode_utf16(*psz, cc);
924
925 if (pa == (size_t)-1)
926 return pa;
927
928 if (buf)
929 {
69b80d28 930 *(wxUint16*)buf = cc[0];
b5153fd8 931 buf += sizeof(wxUint16);
c91830cb 932 if (pa > 1)
69b80d28
VZ
933 {
934 *(wxUint16*)buf = cc[1];
935 buf += sizeof(wxUint16);
936 }
c91830cb
VZ
937 }
938
939 len += pa*sizeof(wxUint16);
940 psz++;
941 }
942 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
943
944 return len;
945}
946
947
948// swap 16bit MB to 32bit String
949size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
950{
951 size_t len=0;
952
953 while (*(wxUint16*)psz && (!buf || len < n))
954 {
955 wxUint32 cc;
956 char tmp[4];
957 tmp[0]=psz[1]; tmp[1]=psz[0];
958 tmp[2]=psz[3]; tmp[3]=psz[2];
959
960 size_t pa=decode_utf16((wxUint16*)tmp, cc);
961 if (pa == (size_t)-1)
962 return pa;
963
964 if (buf)
38d4b1e4 965 *buf++ = (wchar_t)cc;
c91830cb
VZ
966
967 len++;
968 psz += pa * sizeof(wxUint16);
969 }
970 if (buf && len<n) *buf=0;
971
972 return len;
973}
974
975
976// swap 32bit String to 16bit MB
977size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
978{
979 size_t len=0;
980
981 while (*psz && (!buf || len < n))
982 {
983 wxUint16 cc[2];
984 size_t pa=encode_utf16(*psz, cc);
985
986 if (pa == (size_t)-1)
987 return pa;
988
989 if (buf)
990 {
991 *buf++ = ((char*)cc)[1];
992 *buf++ = ((char*)cc)[0];
993 if (pa > 1)
994 {
995 *buf++ = ((char*)cc)[3];
996 *buf++ = ((char*)cc)[2];
997 }
998 }
999
1000 len += pa*sizeof(wxUint16);
1001 psz++;
1002 }
1003 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1004
1005 return len;
1006}
1007
1008#endif // WC_UTF16
1009
1010
1011// ----------------------------------------------------------------------------
1012// UTF-32
1013// ----------------------------------------------------------------------------
1014
1015#ifdef WORDS_BIGENDIAN
1016#define wxMBConvUTF32straight wxMBConvUTF32BE
1017#define wxMBConvUTF32swap wxMBConvUTF32LE
1018#else
1019#define wxMBConvUTF32swap wxMBConvUTF32BE
1020#define wxMBConvUTF32straight wxMBConvUTF32LE
1021#endif
1022
1023
1024WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1025WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1026
1027
1028#ifdef WC_UTF16
1029
1030// copy 32bit MB to 16bit String
1031size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1032{
1033 size_t len=0;
1034
1035 while (*(wxUint32*)psz && (!buf || len < n))
1036 {
1037 wxUint16 cc[2];
1038
1039 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1040 if (pa == (size_t)-1)
1041 return pa;
1042
1043 if (buf)
1044 {
1045 *buf++ = cc[0];
1046 if (pa > 1)
1047 *buf++ = cc[1];
1048 }
1049 len += pa;
1050 psz += sizeof(wxUint32);
1051 }
1052 if (buf && len<n) *buf=0;
1053
1054 return len;
1055}
1056
1057
1058// copy 16bit String to 32bit MB
1059size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1060{
1061 size_t len=0;
1062
1063 while (*psz && (!buf || len < n))
1064 {
1065 wxUint32 cc;
1066
b5153fd8
VZ
1067 // cast is ok for WC_UTF16
1068 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
c91830cb
VZ
1069 if (pa == (size_t)-1)
1070 return pa;
1071
1072 if (buf)
1073 {
1074 *(wxUint32*)buf = cc;
1075 buf += sizeof(wxUint32);
1076 }
1077 len += sizeof(wxUint32);
1078 psz += pa;
1079 }
b5153fd8
VZ
1080
1081 if (buf && len<=n-sizeof(wxUint32))
1082 *(wxUint32*)buf=0;
c91830cb
VZ
1083
1084 return len;
1085}
1086
1087
1088
1089// swap 32bit MB to 16bit String
1090size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1091{
1092 size_t len=0;
1093
1094 while (*(wxUint32*)psz && (!buf || len < n))
1095 {
1096 char tmp[4];
1097 tmp[0] = psz[3]; tmp[1] = psz[2];
1098 tmp[2] = psz[1]; tmp[3] = psz[0];
1099
1100
1101 wxUint16 cc[2];
1102
1103 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1104 if (pa == (size_t)-1)
1105 return pa;
1106
1107 if (buf)
1108 {
1109 *buf++ = cc[0];
1110 if (pa > 1)
1111 *buf++ = cc[1];
1112 }
1113 len += pa;
1114 psz += sizeof(wxUint32);
1115 }
b5153fd8
VZ
1116
1117 if (buf && len<n)
1118 *buf=0;
c91830cb
VZ
1119
1120 return len;
1121}
1122
1123
1124// swap 16bit String to 32bit MB
1125size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1126{
1127 size_t len=0;
1128
1129 while (*psz && (!buf || len < n))
1130 {
1131 char cc[4];
1132
b5153fd8
VZ
1133 // cast is ok for WC_UTF16
1134 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
c91830cb
VZ
1135 if (pa == (size_t)-1)
1136 return pa;
1137
1138 if (buf)
1139 {
1140 *buf++ = cc[3];
1141 *buf++ = cc[2];
1142 *buf++ = cc[1];
1143 *buf++ = cc[0];
1144 }
1145 len += sizeof(wxUint32);
1146 psz += pa;
1147 }
b5153fd8
VZ
1148
1149 if (buf && len<=n-sizeof(wxUint32))
1150 *(wxUint32*)buf=0;
c91830cb
VZ
1151
1152 return len;
1153}
1154
1155#else // WC_UTF16
1156
1157
1158// copy 32bit MB to 32bit String
1159size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1160{
1161 size_t len=0;
1162
1163 while (*(wxUint32*)psz && (!buf || len < n))
1164 {
1165 if (buf)
38d4b1e4 1166 *buf++ = (wchar_t)(*(wxUint32*)psz);
c91830cb
VZ
1167 len++;
1168 psz += sizeof(wxUint32);
1169 }
b5153fd8
VZ
1170
1171 if (buf && len<n)
1172 *buf=0;
c91830cb
VZ
1173
1174 return len;
1175}
1176
1177
1178// copy 32bit String to 32bit MB
1179size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1180{
1181 size_t len=0;
1182
1183 while (*psz && (!buf || len < n))
1184 {
1185 if (buf)
1186 {
1187 *(wxUint32*)buf = *psz;
1188 buf += sizeof(wxUint32);
1189 }
1190
1191 len += sizeof(wxUint32);
1192 psz++;
1193 }
1194
b5153fd8
VZ
1195 if (buf && len<=n-sizeof(wxUint32))
1196 *(wxUint32*)buf=0;
c91830cb
VZ
1197
1198 return len;
1199}
1200
1201
1202// swap 32bit MB to 32bit String
1203size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1204{
1205 size_t len=0;
1206
1207 while (*(wxUint32*)psz && (!buf || len < n))
1208 {
1209 if (buf)
1210 {
1211 ((char *)buf)[0] = psz[3];
1212 ((char *)buf)[1] = psz[2];
1213 ((char *)buf)[2] = psz[1];
1214 ((char *)buf)[3] = psz[0];
1215 buf++;
1216 }
1217 len++;
1218 psz += sizeof(wxUint32);
1219 }
b5153fd8
VZ
1220
1221 if (buf && len<n)
1222 *buf=0;
c91830cb
VZ
1223
1224 return len;
1225}
1226
1227
1228// swap 32bit String to 32bit MB
1229size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1230{
1231 size_t len=0;
1232
1233 while (*psz && (!buf || len < n))
1234 {
1235 if (buf)
1236 {
1237 *buf++ = ((char *)psz)[3];
1238 *buf++ = ((char *)psz)[2];
1239 *buf++ = ((char *)psz)[1];
1240 *buf++ = ((char *)psz)[0];
1241 }
1242 len += sizeof(wxUint32);
1243 psz++;
1244 }
b5153fd8
VZ
1245
1246 if (buf && len<=n-sizeof(wxUint32))
1247 *(wxUint32*)buf=0;
c91830cb
VZ
1248
1249 return len;
1250}
1251
1252
1253#endif // WC_UTF16
1254
1255
36acb880
VZ
1256// ============================================================================
1257// The classes doing conversion using the iconv_xxx() functions
1258// ============================================================================
3caec1bb 1259
b040e242 1260#ifdef HAVE_ICONV
3a0d76bc 1261
b1d547eb
VS
1262// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1263// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1264// (unless there's yet another bug in glibc) the only case when iconv()
1265// returns with (size_t)-1 (which means error) and says there are 0 bytes
1266// left in the input buffer -- when _real_ error occurs,
1267// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1268// iconv() failure.
3caec1bb
VS
1269// [This bug does not appear in glibc 2.2.]
1270#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1271#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1272 (errno != E2BIG || bufLeft != 0))
1273#else
1274#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1275#endif
1276
ab217dba 1277#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880 1278
74a7eb0b
VZ
1279#define ICONV_T_INVALID ((iconv_t)-1)
1280
1281#if SIZEOF_WCHAR_T == 4
1282 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1283 #define WC_ENC wxFONTENCODING_UTF32
1284#elif SIZEOF_WCHAR_T == 2
1285 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1286 #define WC_ENC wxFONTENCODING_UTF16
1287#else // sizeof(wchar_t) != 2 nor 4
1288 // does this ever happen?
1289 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1290#endif
1291
36acb880 1292// ----------------------------------------------------------------------------
e95354ec 1293// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1294// ----------------------------------------------------------------------------
1295
e95354ec 1296class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1297{
1298public:
e95354ec
VZ
1299 wxMBConv_iconv(const wxChar *name);
1300 virtual ~wxMBConv_iconv();
36acb880 1301
bde4baac
VZ
1302 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1303 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1304
e95354ec 1305 bool IsOk() const
74a7eb0b 1306 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
36acb880
VZ
1307
1308protected:
1309 // the iconv handlers used to translate from multibyte to wide char and in
1310 // the other direction
1311 iconv_t m2w,
1312 w2m;
b1d547eb
VS
1313#if wxUSE_THREADS
1314 // guards access to m2w and w2m objects
1315 wxMutex m_iconvMutex;
1316#endif
36acb880
VZ
1317
1318private:
e95354ec 1319 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880 1320 // available on this machine, it will remain NULL
74a7eb0b 1321 static wxString ms_wcCharsetName;
36acb880
VZ
1322
1323 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1324 // different endian-ness than the native one
405d8f46 1325 static bool ms_wcNeedsSwap;
36acb880
VZ
1326};
1327
8f115891
MW
1328// make the constructor available for unit testing
1329WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1330{
1331 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1332 if ( !result->IsOk() )
1333 {
1334 delete result;
1335 return 0;
1336 }
1337 return result;
1338}
1339
422e411e 1340wxString wxMBConv_iconv::ms_wcCharsetName;
e95354ec 1341bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1342
e95354ec 1343wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
36acb880 1344{
0331b385
VZ
1345 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1346 // names for the charsets
200a9923 1347 const wxCharBuffer cname(wxString(name).ToAscii());
04c79127 1348
36acb880 1349 // check for charset that represents wchar_t:
74a7eb0b 1350 if ( ms_wcCharsetName.empty() )
f1339c56 1351 {
c2b83fdd
VZ
1352 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1353
74a7eb0b
VZ
1354#if wxUSE_FONTMAP
1355 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1356#else // !wxUSE_FONTMAP
1357 static const wxChar *names[] =
36acb880 1358 {
74a7eb0b
VZ
1359#if SIZEOF_WCHAR_T == 4
1360 _T("UCS-4"),
1361#elif SIZEOF_WCHAR_T = 2
1362 _T("UCS-2"),
1363#endif
1364 NULL
1365 };
1366#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
36acb880 1367
d1f024a8 1368 for ( ; *names && ms_wcCharsetName.empty(); ++names )
74a7eb0b 1369 {
17a1ebd1 1370 const wxString nameCS(*names);
74a7eb0b
VZ
1371
1372 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
17a1ebd1 1373 wxString nameXE(nameCS);
74a7eb0b
VZ
1374 #ifdef WORDS_BIGENDIAN
1375 nameXE += _T("BE");
1376 #else // little endian
1377 nameXE += _T("LE");
1378 #endif
1379
c2b83fdd
VZ
1380 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1381 nameXE.c_str());
1382
74a7eb0b
VZ
1383 m2w = iconv_open(nameXE.ToAscii(), cname);
1384 if ( m2w == ICONV_T_INVALID )
3a0d76bc 1385 {
74a7eb0b 1386 // try charset w/o bytesex info (e.g. "UCS4")
c2b83fdd
VZ
1387 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1388 nameCS.c_str());
17a1ebd1 1389 m2w = iconv_open(nameCS.ToAscii(), cname);
3a0d76bc 1390
74a7eb0b
VZ
1391 // and check for bytesex ourselves:
1392 if ( m2w != ICONV_T_INVALID )
3a0d76bc 1393 {
74a7eb0b
VZ
1394 char buf[2], *bufPtr;
1395 wchar_t wbuf[2], *wbufPtr;
1396 size_t insz, outsz;
1397 size_t res;
1398
1399 buf[0] = 'A';
1400 buf[1] = 0;
1401 wbuf[0] = 0;
1402 insz = 2;
1403 outsz = SIZEOF_WCHAR_T * 2;
1404 wbufPtr = wbuf;
1405 bufPtr = buf;
1406
1407 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1408 (char**)&wbufPtr, &outsz);
1409
1410 if (ICONV_FAILED(res, insz))
1411 {
1412 wxLogLastError(wxT("iconv"));
422e411e 1413 wxLogError(_("Conversion to charset '%s' doesn't work."),
17a1ebd1 1414 nameCS.c_str());
74a7eb0b
VZ
1415 }
1416 else // ok, can convert to this encoding, remember it
1417 {
17a1ebd1 1418 ms_wcCharsetName = nameCS;
74a7eb0b
VZ
1419 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1420 }
3a0d76bc
VS
1421 }
1422 }
74a7eb0b 1423 else // use charset not requiring byte swapping
36acb880 1424 {
74a7eb0b 1425 ms_wcCharsetName = nameXE;
36acb880 1426 }
3a0d76bc 1427 }
74a7eb0b 1428
0944fceb 1429 wxLogTrace(TRACE_STRCONV,
74a7eb0b 1430 wxT("iconv wchar_t charset is \"%s\"%s"),
cae8f1bf 1431 ms_wcCharsetName.empty() ? _T("<none>")
74a7eb0b
VZ
1432 : ms_wcCharsetName.c_str(),
1433 ms_wcNeedsSwap ? _T(" (needs swap)")
1434 : _T(""));
3a0d76bc 1435 }
36acb880 1436 else // we already have ms_wcCharsetName
3caec1bb 1437 {
74a7eb0b 1438 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
f1339c56 1439 }
dccce9ea 1440
74a7eb0b 1441 if ( ms_wcCharsetName.empty() )
f1339c56 1442 {
74a7eb0b 1443 w2m = ICONV_T_INVALID;
36acb880 1444 }
405d8f46
VZ
1445 else
1446 {
74a7eb0b
VZ
1447 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1448 if ( w2m == ICONV_T_INVALID )
1449 {
1450 wxLogTrace(TRACE_STRCONV,
1451 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
422e411e 1452 ms_wcCharsetName.c_str(), cname.data());
74a7eb0b 1453 }
405d8f46 1454 }
36acb880 1455}
3caec1bb 1456
e95354ec 1457wxMBConv_iconv::~wxMBConv_iconv()
36acb880 1458{
74a7eb0b 1459 if ( m2w != ICONV_T_INVALID )
36acb880 1460 iconv_close(m2w);
74a7eb0b 1461 if ( w2m != ICONV_T_INVALID )
36acb880
VZ
1462 iconv_close(w2m);
1463}
3a0d76bc 1464
bde4baac 1465size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880 1466{
b1d547eb
VS
1467#if wxUSE_THREADS
1468 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1469 // Unfortunately there is a couple of global wxCSConv objects such as
1470 // wxConvLocal that are used all over wx code, so we have to make sure
1471 // the handle is used by at most one thread at the time. Otherwise
1472 // only a few wx classes would be safe to use from non-main threads
1473 // as MB<->WC conversion would fail "randomly".
1474 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1475#endif
3698ae71 1476
36acb880
VZ
1477 size_t inbuf = strlen(psz);
1478 size_t outbuf = n * SIZEOF_WCHAR_T;
1479 size_t res, cres;
1480 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1481 wchar_t *bufPtr = buf;
1482 const char *pszPtr = psz;
1483
1484 if (buf)
1485 {
1486 // have destination buffer, convert there
1487 cres = iconv(m2w,
1488 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1489 (char**)&bufPtr, &outbuf);
1490 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1491
36acb880 1492 if (ms_wcNeedsSwap)
3a0d76bc 1493 {
36acb880 1494 // convert to native endianness
17a1ebd1
VZ
1495 for ( unsigned i = 0; i < res; i++ )
1496 buf[n] = WC_BSWAP(buf[i]);
3a0d76bc 1497 }
adb45366 1498
49dd9820
VS
1499 // NB: iconv was given only strlen(psz) characters on input, and so
1500 // it couldn't convert the trailing zero. Let's do it ourselves
1501 // if there's some room left for it in the output buffer.
1502 if (res < n)
1503 buf[res] = 0;
36acb880
VZ
1504 }
1505 else
1506 {
1507 // no destination buffer... convert using temp buffer
1508 // to calculate destination buffer requirement
1509 wchar_t tbuf[8];
1510 res = 0;
1511 do {
1512 bufPtr = tbuf;
1513 outbuf = 8*SIZEOF_WCHAR_T;
1514
1515 cres = iconv(m2w,
1516 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1517 (char**)&bufPtr, &outbuf );
1518
1519 res += 8-(outbuf/SIZEOF_WCHAR_T);
1520 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1521 }
dccce9ea 1522
36acb880 1523 if (ICONV_FAILED(cres, inbuf))
f1339c56 1524 {
36acb880 1525 //VS: it is ok if iconv fails, hence trace only
ce6f8d6f 1526 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1527 return (size_t)-1;
1528 }
1529
1530 return res;
1531}
1532
bde4baac 1533size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1534{
b1d547eb
VS
1535#if wxUSE_THREADS
1536 // NB: explained in MB2WC
1537 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1538#endif
3698ae71 1539
156162ec
MW
1540 size_t inlen = wxWcslen(psz);
1541 size_t inbuf = inlen * SIZEOF_WCHAR_T;
36acb880
VZ
1542 size_t outbuf = n;
1543 size_t res, cres;
3a0d76bc 1544
36acb880 1545 wchar_t *tmpbuf = 0;
3caec1bb 1546
36acb880
VZ
1547 if (ms_wcNeedsSwap)
1548 {
1549 // need to copy to temp buffer to switch endianness
74a7eb0b 1550 // (doing WC_BSWAP twice on the original buffer won't help, as it
36acb880 1551 // could be in read-only memory, or be accessed in some other thread)
74a7eb0b 1552 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
17a1ebd1
VZ
1553 for ( size_t i = 0; i < inlen; i++ )
1554 tmpbuf[n] = WC_BSWAP(psz[i]);
156162ec 1555 tmpbuf[inlen] = L'\0';
74a7eb0b 1556 psz = tmpbuf;
36acb880 1557 }
3a0d76bc 1558
36acb880
VZ
1559 if (buf)
1560 {
1561 // have destination buffer, convert there
1562 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1563
36acb880 1564 res = n-outbuf;
adb45366 1565
49dd9820
VS
1566 // NB: iconv was given only wcslen(psz) characters on input, and so
1567 // it couldn't convert the trailing zero. Let's do it ourselves
1568 // if there's some room left for it in the output buffer.
1569 if (res < n)
1570 buf[0] = 0;
36acb880
VZ
1571 }
1572 else
1573 {
1574 // no destination buffer... convert using temp buffer
1575 // to calculate destination buffer requirement
1576 char tbuf[16];
1577 res = 0;
1578 do {
1579 buf = tbuf; outbuf = 16;
1580
1581 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1582
36acb880
VZ
1583 res += 16 - outbuf;
1584 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1585 }
dccce9ea 1586
36acb880
VZ
1587 if (ms_wcNeedsSwap)
1588 {
1589 free(tmpbuf);
1590 }
dccce9ea 1591
36acb880
VZ
1592 if (ICONV_FAILED(cres, inbuf))
1593 {
ce6f8d6f 1594 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
36acb880
VZ
1595 return (size_t)-1;
1596 }
1597
1598 return res;
1599}
1600
b040e242 1601#endif // HAVE_ICONV
36acb880 1602
e95354ec 1603
36acb880
VZ
1604// ============================================================================
1605// Win32 conversion classes
1606// ============================================================================
1cd52418 1607
e95354ec 1608#ifdef wxHAVE_WIN32_MB2WC
373658eb 1609
8b04d4c4 1610// from utils.cpp
d775fa82 1611#if wxUSE_FONTMAP
8b04d4c4
VZ
1612extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1613extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1614#endif
373658eb 1615
e95354ec 1616class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1617{
1618public:
bde4baac
VZ
1619 wxMBConv_win32()
1620 {
1621 m_CodePage = CP_ACP;
1622 }
1623
7608a683 1624#if wxUSE_FONTMAP
e95354ec 1625 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
1626 {
1627 m_CodePage = wxCharsetToCodepage(name);
1628 }
dccce9ea 1629
e95354ec 1630 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
1631 {
1632 m_CodePage = wxEncodingToCodepage(encoding);
1633 }
7608a683 1634#endif
8b04d4c4 1635
bde4baac 1636 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 1637 {
02272c9c
VZ
1638 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1639 // the behaviour is not compatible with the Unix version (using iconv)
1640 // and break the library itself, e.g. wxTextInputStream::NextChar()
1641 // wouldn't work if reading an incomplete MB char didn't result in an
1642 // error
667e5b3e
VZ
1643 //
1644 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1645 // an error (tested under Windows Server 2003) and apparently it is
1646 // done on purpose, i.e. the function accepts any input in this case
1647 // and although I'd prefer to return error on ill-formed output, our
1648 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1649 // explicitly ill-formed according to RFC 2152) neither so we don't
1650 // even have any fallback here...
1651 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1652
2b5f62a0
VZ
1653 const size_t len = ::MultiByteToWideChar
1654 (
1655 m_CodePage, // code page
667e5b3e 1656 flags, // flags: fall on error
2b5f62a0
VZ
1657 psz, // input string
1658 -1, // its length (NUL-terminated)
b4da152e 1659 buf, // output string
2b5f62a0
VZ
1660 buf ? n : 0 // size of output buffer
1661 );
1662
03a991bc
VZ
1663 // note that it returns count of written chars for buf != NULL and size
1664 // of the needed buffer for buf == NULL so in either case the length of
1665 // the string (which never includes the terminating NUL) is one less
1666 return len ? len - 1 : (size_t)-1;
f1339c56 1667 }
dccce9ea 1668
13dd924a 1669 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 1670 {
13dd924a
VZ
1671 /*
1672 we have a problem here: by default, WideCharToMultiByte() may
1673 replace characters unrepresentable in the target code page with bad
1674 quality approximations such as turning "1/2" symbol (U+00BD) into
1675 "1" for the code pages which don't have it and we, obviously, want
1676 to avoid this at any price
d775fa82 1677
13dd924a
VZ
1678 the trouble is that this function does it _silently_, i.e. it won't
1679 even tell us whether it did or not... Win98/2000 and higher provide
1680 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1681 we have to resort to a round trip, i.e. check that converting back
1682 results in the same string -- this is, of course, expensive but
1683 otherwise we simply can't be sure to not garble the data.
1684 */
1685
1686 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1687 // it doesn't work with CJK encodings (which we test for rather roughly
1688 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1689 // supporting it
907173e5
WS
1690 BOOL usedDef wxDUMMY_INITIALIZE(false);
1691 BOOL *pUsedDef;
13dd924a
VZ
1692 int flags;
1693 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1694 {
1695 // it's our lucky day
1696 flags = WC_NO_BEST_FIT_CHARS;
1697 pUsedDef = &usedDef;
1698 }
1699 else // old system or unsupported encoding
1700 {
1701 flags = 0;
1702 pUsedDef = NULL;
1703 }
1704
2b5f62a0
VZ
1705 const size_t len = ::WideCharToMultiByte
1706 (
1707 m_CodePage, // code page
13dd924a
VZ
1708 flags, // either none or no best fit
1709 pwz, // input string
2b5f62a0
VZ
1710 -1, // it is (wide) NUL-terminated
1711 buf, // output buffer
1712 buf ? n : 0, // and its size
1713 NULL, // default "replacement" char
13dd924a 1714 pUsedDef // [out] was it used?
2b5f62a0
VZ
1715 );
1716
13dd924a
VZ
1717 if ( !len )
1718 {
1719 // function totally failed
1720 return (size_t)-1;
1721 }
1722
1723 // if we were really converting, check if we succeeded
1724 if ( buf )
1725 {
1726 if ( flags )
1727 {
1728 // check if the conversion failed, i.e. if any replacements
1729 // were done
1730 if ( usedDef )
1731 return (size_t)-1;
1732 }
1733 else // we must resort to double tripping...
1734 {
1735 wxWCharBuffer wcBuf(n);
1736 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1737 wcscmp(wcBuf, pwz) != 0 )
1738 {
1739 // we didn't obtain the same thing we started from, hence
1740 // the conversion was lossy and we consider that it failed
1741 return (size_t)-1;
1742 }
1743 }
1744 }
1745
03a991bc 1746 // see the comment above for the reason of "len - 1"
13dd924a 1747 return len - 1;
f1339c56 1748 }
dccce9ea 1749
13dd924a
VZ
1750 bool IsOk() const { return m_CodePage != -1; }
1751
1752private:
1753 static bool CanUseNoBestFit()
1754 {
1755 static int s_isWin98Or2k = -1;
1756
1757 if ( s_isWin98Or2k == -1 )
1758 {
1759 int verMaj, verMin;
1760 switch ( wxGetOsVersion(&verMaj, &verMin) )
1761 {
1762 case wxWIN95:
1763 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1764 break;
1765
1766 case wxWINDOWS_NT:
1767 s_isWin98Or2k = verMaj >= 5;
1768 break;
1769
1770 default:
1771 // unknown, be conseravtive by default
1772 s_isWin98Or2k = 0;
1773 }
1774
1775 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1776 }
1777
1778 return s_isWin98Or2k == 1;
1779 }
f1339c56 1780
b1d66b54 1781 long m_CodePage;
1cd52418 1782};
e95354ec
VZ
1783
1784#endif // wxHAVE_WIN32_MB2WC
1785
f7e98dee
RN
1786// ============================================================================
1787// Cocoa conversion classes
1788// ============================================================================
1789
1790#if defined(__WXCOCOA__)
1791
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1795
1796#include <CoreFoundation/CFString.h>
1797#include <CoreFoundation/CFStringEncodingExt.h>
1798
1799CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 1800{
638357a0 1801 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ecd9653b
WS
1802 if ( encoding == wxFONTENCODING_DEFAULT )
1803 {
638357a0 1804 enc = CFStringGetSystemEncoding();
ecd9653b
WS
1805 }
1806 else switch( encoding)
1807 {
1808 case wxFONTENCODING_ISO8859_1 :
1809 enc = kCFStringEncodingISOLatin1 ;
1810 break ;
1811 case wxFONTENCODING_ISO8859_2 :
1812 enc = kCFStringEncodingISOLatin2;
1813 break ;
1814 case wxFONTENCODING_ISO8859_3 :
1815 enc = kCFStringEncodingISOLatin3 ;
1816 break ;
1817 case wxFONTENCODING_ISO8859_4 :
1818 enc = kCFStringEncodingISOLatin4;
1819 break ;
1820 case wxFONTENCODING_ISO8859_5 :
1821 enc = kCFStringEncodingISOLatinCyrillic;
1822 break ;
1823 case wxFONTENCODING_ISO8859_6 :
1824 enc = kCFStringEncodingISOLatinArabic;
1825 break ;
1826 case wxFONTENCODING_ISO8859_7 :
1827 enc = kCFStringEncodingISOLatinGreek;
1828 break ;
1829 case wxFONTENCODING_ISO8859_8 :
1830 enc = kCFStringEncodingISOLatinHebrew;
1831 break ;
1832 case wxFONTENCODING_ISO8859_9 :
1833 enc = kCFStringEncodingISOLatin5;
1834 break ;
1835 case wxFONTENCODING_ISO8859_10 :
1836 enc = kCFStringEncodingISOLatin6;
1837 break ;
1838 case wxFONTENCODING_ISO8859_11 :
1839 enc = kCFStringEncodingISOLatinThai;
1840 break ;
1841 case wxFONTENCODING_ISO8859_13 :
1842 enc = kCFStringEncodingISOLatin7;
1843 break ;
1844 case wxFONTENCODING_ISO8859_14 :
1845 enc = kCFStringEncodingISOLatin8;
1846 break ;
1847 case wxFONTENCODING_ISO8859_15 :
1848 enc = kCFStringEncodingISOLatin9;
1849 break ;
1850
1851 case wxFONTENCODING_KOI8 :
1852 enc = kCFStringEncodingKOI8_R;
1853 break ;
1854 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1855 enc = kCFStringEncodingDOSRussian;
1856 break ;
1857
1858// case wxFONTENCODING_BULGARIAN :
1859// enc = ;
1860// break ;
1861
1862 case wxFONTENCODING_CP437 :
1863 enc =kCFStringEncodingDOSLatinUS ;
1864 break ;
1865 case wxFONTENCODING_CP850 :
1866 enc = kCFStringEncodingDOSLatin1;
1867 break ;
1868 case wxFONTENCODING_CP852 :
1869 enc = kCFStringEncodingDOSLatin2;
1870 break ;
1871 case wxFONTENCODING_CP855 :
1872 enc = kCFStringEncodingDOSCyrillic;
1873 break ;
1874 case wxFONTENCODING_CP866 :
1875 enc =kCFStringEncodingDOSRussian ;
1876 break ;
1877 case wxFONTENCODING_CP874 :
1878 enc = kCFStringEncodingDOSThai;
1879 break ;
1880 case wxFONTENCODING_CP932 :
1881 enc = kCFStringEncodingDOSJapanese;
1882 break ;
1883 case wxFONTENCODING_CP936 :
1884 enc =kCFStringEncodingDOSChineseSimplif ;
1885 break ;
1886 case wxFONTENCODING_CP949 :
1887 enc = kCFStringEncodingDOSKorean;
1888 break ;
1889 case wxFONTENCODING_CP950 :
1890 enc = kCFStringEncodingDOSChineseTrad;
1891 break ;
ecd9653b
WS
1892 case wxFONTENCODING_CP1250 :
1893 enc = kCFStringEncodingWindowsLatin2;
1894 break ;
1895 case wxFONTENCODING_CP1251 :
1896 enc =kCFStringEncodingWindowsCyrillic ;
1897 break ;
1898 case wxFONTENCODING_CP1252 :
1899 enc =kCFStringEncodingWindowsLatin1 ;
1900 break ;
1901 case wxFONTENCODING_CP1253 :
1902 enc = kCFStringEncodingWindowsGreek;
1903 break ;
1904 case wxFONTENCODING_CP1254 :
1905 enc = kCFStringEncodingWindowsLatin5;
1906 break ;
1907 case wxFONTENCODING_CP1255 :
1908 enc =kCFStringEncodingWindowsHebrew ;
1909 break ;
1910 case wxFONTENCODING_CP1256 :
1911 enc =kCFStringEncodingWindowsArabic ;
1912 break ;
1913 case wxFONTENCODING_CP1257 :
1914 enc = kCFStringEncodingWindowsBalticRim;
1915 break ;
638357a0
RN
1916// This only really encodes to UTF7 (if that) evidently
1917// case wxFONTENCODING_UTF7 :
1918// enc = kCFStringEncodingNonLossyASCII ;
1919// break ;
ecd9653b
WS
1920 case wxFONTENCODING_UTF8 :
1921 enc = kCFStringEncodingUTF8 ;
1922 break ;
1923 case wxFONTENCODING_EUC_JP :
1924 enc = kCFStringEncodingEUC_JP;
1925 break ;
1926 case wxFONTENCODING_UTF16 :
f7e98dee 1927 enc = kCFStringEncodingUnicode ;
ecd9653b 1928 break ;
f7e98dee
RN
1929 case wxFONTENCODING_MACROMAN :
1930 enc = kCFStringEncodingMacRoman ;
1931 break ;
1932 case wxFONTENCODING_MACJAPANESE :
1933 enc = kCFStringEncodingMacJapanese ;
1934 break ;
1935 case wxFONTENCODING_MACCHINESETRAD :
1936 enc = kCFStringEncodingMacChineseTrad ;
1937 break ;
1938 case wxFONTENCODING_MACKOREAN :
1939 enc = kCFStringEncodingMacKorean ;
1940 break ;
1941 case wxFONTENCODING_MACARABIC :
1942 enc = kCFStringEncodingMacArabic ;
1943 break ;
1944 case wxFONTENCODING_MACHEBREW :
1945 enc = kCFStringEncodingMacHebrew ;
1946 break ;
1947 case wxFONTENCODING_MACGREEK :
1948 enc = kCFStringEncodingMacGreek ;
1949 break ;
1950 case wxFONTENCODING_MACCYRILLIC :
1951 enc = kCFStringEncodingMacCyrillic ;
1952 break ;
1953 case wxFONTENCODING_MACDEVANAGARI :
1954 enc = kCFStringEncodingMacDevanagari ;
1955 break ;
1956 case wxFONTENCODING_MACGURMUKHI :
1957 enc = kCFStringEncodingMacGurmukhi ;
1958 break ;
1959 case wxFONTENCODING_MACGUJARATI :
1960 enc = kCFStringEncodingMacGujarati ;
1961 break ;
1962 case wxFONTENCODING_MACORIYA :
1963 enc = kCFStringEncodingMacOriya ;
1964 break ;
1965 case wxFONTENCODING_MACBENGALI :
1966 enc = kCFStringEncodingMacBengali ;
1967 break ;
1968 case wxFONTENCODING_MACTAMIL :
1969 enc = kCFStringEncodingMacTamil ;
1970 break ;
1971 case wxFONTENCODING_MACTELUGU :
1972 enc = kCFStringEncodingMacTelugu ;
1973 break ;
1974 case wxFONTENCODING_MACKANNADA :
1975 enc = kCFStringEncodingMacKannada ;
1976 break ;
1977 case wxFONTENCODING_MACMALAJALAM :
1978 enc = kCFStringEncodingMacMalayalam ;
1979 break ;
1980 case wxFONTENCODING_MACSINHALESE :
1981 enc = kCFStringEncodingMacSinhalese ;
1982 break ;
1983 case wxFONTENCODING_MACBURMESE :
1984 enc = kCFStringEncodingMacBurmese ;
1985 break ;
1986 case wxFONTENCODING_MACKHMER :
1987 enc = kCFStringEncodingMacKhmer ;
1988 break ;
1989 case wxFONTENCODING_MACTHAI :
1990 enc = kCFStringEncodingMacThai ;
1991 break ;
1992 case wxFONTENCODING_MACLAOTIAN :
1993 enc = kCFStringEncodingMacLaotian ;
1994 break ;
1995 case wxFONTENCODING_MACGEORGIAN :
1996 enc = kCFStringEncodingMacGeorgian ;
1997 break ;
1998 case wxFONTENCODING_MACARMENIAN :
1999 enc = kCFStringEncodingMacArmenian ;
2000 break ;
2001 case wxFONTENCODING_MACCHINESESIMP :
2002 enc = kCFStringEncodingMacChineseSimp ;
2003 break ;
2004 case wxFONTENCODING_MACTIBETAN :
2005 enc = kCFStringEncodingMacTibetan ;
2006 break ;
2007 case wxFONTENCODING_MACMONGOLIAN :
2008 enc = kCFStringEncodingMacMongolian ;
2009 break ;
2010 case wxFONTENCODING_MACETHIOPIC :
2011 enc = kCFStringEncodingMacEthiopic ;
2012 break ;
2013 case wxFONTENCODING_MACCENTRALEUR :
2014 enc = kCFStringEncodingMacCentralEurRoman ;
2015 break ;
2016 case wxFONTENCODING_MACVIATNAMESE :
2017 enc = kCFStringEncodingMacVietnamese ;
2018 break ;
2019 case wxFONTENCODING_MACARABICEXT :
2020 enc = kCFStringEncodingMacExtArabic ;
2021 break ;
2022 case wxFONTENCODING_MACSYMBOL :
2023 enc = kCFStringEncodingMacSymbol ;
2024 break ;
2025 case wxFONTENCODING_MACDINGBATS :
2026 enc = kCFStringEncodingMacDingbats ;
2027 break ;
2028 case wxFONTENCODING_MACTURKISH :
2029 enc = kCFStringEncodingMacTurkish ;
2030 break ;
2031 case wxFONTENCODING_MACCROATIAN :
2032 enc = kCFStringEncodingMacCroatian ;
2033 break ;
2034 case wxFONTENCODING_MACICELANDIC :
2035 enc = kCFStringEncodingMacIcelandic ;
2036 break ;
2037 case wxFONTENCODING_MACROMANIAN :
2038 enc = kCFStringEncodingMacRomanian ;
2039 break ;
2040 case wxFONTENCODING_MACCELTIC :
2041 enc = kCFStringEncodingMacCeltic ;
2042 break ;
2043 case wxFONTENCODING_MACGAELIC :
2044 enc = kCFStringEncodingMacGaelic ;
2045 break ;
ecd9653b
WS
2046// case wxFONTENCODING_MACKEYBOARD :
2047// enc = kCFStringEncodingMacKeyboardGlyphs ;
2048// break ;
2049 default :
2050 // because gcc is picky
2051 break ;
2052 } ;
2053 return enc ;
f7e98dee
RN
2054}
2055
f7e98dee
RN
2056class wxMBConv_cocoa : public wxMBConv
2057{
2058public:
2059 wxMBConv_cocoa()
2060 {
2061 Init(CFStringGetSystemEncoding()) ;
2062 }
2063
a6900d10 2064#if wxUSE_FONTMAP
f7e98dee
RN
2065 wxMBConv_cocoa(const wxChar* name)
2066 {
267e11c5 2067 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
f7e98dee 2068 }
a6900d10 2069#endif
f7e98dee
RN
2070
2071 wxMBConv_cocoa(wxFontEncoding encoding)
2072 {
2073 Init( wxCFStringEncFromFontEnc(encoding) );
2074 }
2075
2076 ~wxMBConv_cocoa()
2077 {
2078 }
2079
2080 void Init( CFStringEncoding encoding)
2081 {
638357a0 2082 m_encoding = encoding ;
f7e98dee
RN
2083 }
2084
2085 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2086 {
2087 wxASSERT(szUnConv);
ecd9653b 2088
638357a0
RN
2089 CFStringRef theString = CFStringCreateWithBytes (
2090 NULL, //the allocator
2091 (const UInt8*)szUnConv,
2092 strlen(szUnConv),
2093 m_encoding,
2094 false //no BOM/external representation
f7e98dee
RN
2095 );
2096
2097 wxASSERT(theString);
2098
638357a0
RN
2099 size_t nOutLength = CFStringGetLength(theString);
2100
2101 if (szOut == NULL)
f7e98dee 2102 {
f7e98dee 2103 CFRelease(theString);
638357a0 2104 return nOutLength;
f7e98dee 2105 }
ecd9653b 2106
638357a0 2107 CFRange theRange = { 0, nOutSize };
ecd9653b 2108
638357a0
RN
2109#if SIZEOF_WCHAR_T == 4
2110 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2111#endif
3698ae71 2112
f7e98dee 2113 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
3698ae71 2114
f7e98dee 2115 CFRelease(theString);
ecd9653b 2116
638357a0 2117 szUniCharBuffer[nOutLength] = '\0' ;
f7e98dee
RN
2118
2119#if SIZEOF_WCHAR_T == 4
2120 wxMBConvUTF16 converter ;
638357a0 2121 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
f7e98dee
RN
2122 delete[] szUniCharBuffer;
2123#endif
3698ae71 2124
638357a0 2125 return nOutLength;
f7e98dee
RN
2126 }
2127
2128 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2129 {
638357a0 2130 wxASSERT(szUnConv);
3698ae71 2131
f7e98dee 2132 size_t nRealOutSize;
638357a0 2133 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 2134 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 2135
f7e98dee 2136#if SIZEOF_WCHAR_T == 4
d9d488cf 2137 wxMBConvUTF16 converter ;
f7e98dee
RN
2138 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2139 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2140 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2141 nBufSize /= sizeof(UniChar);
f7e98dee
RN
2142#endif
2143
2144 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2145 NULL, //allocator
2146 szUniBuffer,
2147 nBufSize,
638357a0 2148 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 2149 );
ecd9653b 2150
f7e98dee 2151 wxASSERT(theString);
ecd9653b 2152
f7e98dee 2153 //Note that CER puts a BOM when converting to unicode
638357a0
RN
2154 //so we check and use getchars instead in that case
2155 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 2156 {
638357a0
RN
2157 if (szOut != NULL)
2158 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
3698ae71 2159
638357a0
RN
2160 nRealOutSize = CFStringGetLength(theString) + 1;
2161 }
2162 else
2163 {
2164 CFStringGetBytes(
2165 theString,
2166 CFRangeMake(0, CFStringGetLength(theString)),
2167 m_encoding,
2168 0, //what to put in characters that can't be converted -
2169 //0 tells CFString to return NULL if it meets such a character
2170 false, //not an external representation
2171 (UInt8*) szOut,
3698ae71 2172 nOutSize,
638357a0
RN
2173 (CFIndex*) &nRealOutSize
2174 );
f7e98dee 2175 }
ecd9653b 2176
638357a0 2177 CFRelease(theString);
ecd9653b 2178
638357a0
RN
2179#if SIZEOF_WCHAR_T == 4
2180 delete[] szUniBuffer;
2181#endif
ecd9653b 2182
f7e98dee
RN
2183 return nRealOutSize - 1;
2184 }
2185
2186 bool IsOk() const
ecd9653b 2187 {
3698ae71 2188 return m_encoding != kCFStringEncodingInvalidId &&
638357a0 2189 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
2190 }
2191
2192private:
638357a0 2193 CFStringEncoding m_encoding ;
f7e98dee
RN
2194};
2195
2196#endif // defined(__WXCOCOA__)
2197
335d31e0
SC
2198// ============================================================================
2199// Mac conversion classes
2200// ============================================================================
2201
2202#if defined(__WXMAC__) && defined(TARGET_CARBON)
2203
2204class wxMBConv_mac : public wxMBConv
2205{
2206public:
2207 wxMBConv_mac()
2208 {
2209 Init(CFStringGetSystemEncoding()) ;
2210 }
2211
2d1659cf 2212#if wxUSE_FONTMAP
335d31e0
SC
2213 wxMBConv_mac(const wxChar* name)
2214 {
267e11c5 2215 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
335d31e0 2216 }
2d1659cf 2217#endif
335d31e0
SC
2218
2219 wxMBConv_mac(wxFontEncoding encoding)
2220 {
d775fa82
WS
2221 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2222 }
2223
2224 ~wxMBConv_mac()
2225 {
2226 OSStatus status = noErr ;
2227 status = TECDisposeConverter(m_MB2WC_converter);
2228 status = TECDisposeConverter(m_WC2MB_converter);
2229 }
2230
2231
2232 void Init( TextEncodingBase encoding)
2233 {
2234 OSStatus status = noErr ;
2235 m_char_encoding = encoding ;
2236 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2237
2238 status = TECCreateConverter(&m_MB2WC_converter,
2239 m_char_encoding,
2240 m_unicode_encoding);
2241 status = TECCreateConverter(&m_WC2MB_converter,
2242 m_unicode_encoding,
2243 m_char_encoding);
2244 }
2245
335d31e0
SC
2246 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2247 {
d775fa82
WS
2248 OSStatus status = noErr ;
2249 ByteCount byteOutLen ;
2250 ByteCount byteInLen = strlen(psz) ;
2251 wchar_t *tbuf = NULL ;
2252 UniChar* ubuf = NULL ;
2253 size_t res = 0 ;
2254
2255 if (buf == NULL)
2256 {
638357a0 2257 //apple specs say at least 32
c543817b 2258 n = wxMax( 32 , byteInLen ) ;
d775fa82
WS
2259 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2260 }
2261 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
f3a355ce 2262#if SIZEOF_WCHAR_T == 4
d775fa82 2263 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2264#else
d775fa82 2265 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2266#endif
d775fa82
WS
2267 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2268 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
f3a355ce 2269#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2270 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2271 // is not properly terminated we get random characters at the end
2272 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d9d488cf 2273 wxMBConvUTF16 converter ;
d775fa82
WS
2274 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2275 free( ubuf ) ;
f3a355ce 2276#else
d775fa82 2277 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2278#endif
d775fa82
WS
2279 if ( buf == NULL )
2280 free(tbuf) ;
335d31e0 2281
335d31e0
SC
2282 if ( buf && res < n)
2283 buf[res] = 0;
2284
d775fa82 2285 return res ;
335d31e0
SC
2286 }
2287
2288 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2289 {
2290 OSStatus status = noErr ;
2291 ByteCount byteOutLen ;
2292 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2293
2294 char *tbuf = NULL ;
2295
2296 if (buf == NULL)
2297 {
638357a0 2298 //apple specs say at least 32
c543817b 2299 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2300 tbuf = (char*) malloc( n ) ;
2301 }
2302
2303 ByteCount byteBufferLen = n ;
2304 UniChar* ubuf = NULL ;
f3a355ce 2305#if SIZEOF_WCHAR_T == 4
d9d488cf 2306 wxMBConvUTF16 converter ;
d775fa82
WS
2307 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2308 byteInLen = unicharlen ;
2309 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2310 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
f3a355ce 2311#else
d775fa82 2312 ubuf = (UniChar*) psz ;
f3a355ce 2313#endif
d775fa82
WS
2314 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2315 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
f3a355ce 2316#if SIZEOF_WCHAR_T == 4
d775fa82 2317 free( ubuf ) ;
f3a355ce 2318#endif
d775fa82
WS
2319 if ( buf == NULL )
2320 free(tbuf) ;
335d31e0 2321
d775fa82 2322 size_t res = byteOutLen ;
335d31e0 2323 if ( buf && res < n)
638357a0 2324 {
335d31e0 2325 buf[res] = 0;
3698ae71 2326
638357a0
RN
2327 //we need to double-trip to verify it didn't insert any ? in place
2328 //of bogus characters
2329 wxWCharBuffer wcBuf(n);
2330 size_t pszlen = wxWcslen(psz);
2331 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2332 wxWcslen(wcBuf) != pszlen ||
2333 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2334 {
2335 // we didn't obtain the same thing we started from, hence
2336 // the conversion was lossy and we consider that it failed
2337 return (size_t)-1;
2338 }
2339 }
335d31e0 2340
d775fa82 2341 return res ;
335d31e0
SC
2342 }
2343
2344 bool IsOk() const
2345 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2346
2347private:
d775fa82
WS
2348 TECObjectRef m_MB2WC_converter ;
2349 TECObjectRef m_WC2MB_converter ;
2350
2351 TextEncodingBase m_char_encoding ;
2352 TextEncodingBase m_unicode_encoding ;
335d31e0
SC
2353};
2354
2355#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2356
36acb880
VZ
2357// ============================================================================
2358// wxEncodingConverter based conversion classes
2359// ============================================================================
2360
1e6feb95 2361#if wxUSE_FONTMAP
1cd52418 2362
e95354ec 2363class wxMBConv_wxwin : public wxMBConv
1cd52418 2364{
8b04d4c4
VZ
2365private:
2366 void Init()
2367 {
2368 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2369 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2370 }
2371
6001e347 2372public:
f1339c56
RR
2373 // temporarily just use wxEncodingConverter stuff,
2374 // so that it works while a better implementation is built
e95354ec 2375 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2376 {
2377 if (name)
267e11c5 2378 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2379 else
2380 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2381
8b04d4c4
VZ
2382 Init();
2383 }
2384
e95354ec 2385 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2386 {
2387 m_enc = enc;
2388
2389 Init();
f1339c56 2390 }
dccce9ea 2391
bde4baac 2392 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2393 {
2394 size_t inbuf = strlen(psz);
dccce9ea 2395 if (buf)
c643a977
VS
2396 {
2397 if (!m2w.Convert(psz,buf))
2398 return (size_t)-1;
2399 }
f1339c56
RR
2400 return inbuf;
2401 }
dccce9ea 2402
bde4baac 2403 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2404 {
f8d791e0 2405 const size_t inbuf = wxWcslen(psz);
f1339c56 2406 if (buf)
c643a977
VS
2407 {
2408 if (!w2m.Convert(psz,buf))
2409 return (size_t)-1;
2410 }
dccce9ea 2411
f1339c56
RR
2412 return inbuf;
2413 }
dccce9ea 2414
e95354ec 2415 bool IsOk() const { return m_ok; }
f1339c56
RR
2416
2417public:
8b04d4c4 2418 wxFontEncoding m_enc;
f1339c56 2419 wxEncodingConverter m2w, w2m;
cafbf6fb
VZ
2420
2421 // were we initialized successfully?
2422 bool m_ok;
fc7a2a60 2423
e95354ec 2424 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2425};
6001e347 2426
8f115891
MW
2427// make the constructors available for unit testing
2428WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2429{
2430 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2431 if ( !result->IsOk() )
2432 {
2433 delete result;
2434 return 0;
2435 }
2436 return result;
2437}
2438
1e6feb95
VZ
2439#endif // wxUSE_FONTMAP
2440
36acb880
VZ
2441// ============================================================================
2442// wxCSConv implementation
2443// ============================================================================
2444
8b04d4c4 2445void wxCSConv::Init()
6001e347 2446{
e95354ec
VZ
2447 m_name = NULL;
2448 m_convReal = NULL;
2449 m_deferred = true;
2450}
2451
8b04d4c4
VZ
2452wxCSConv::wxCSConv(const wxChar *charset)
2453{
2454 Init();
82713003 2455
e95354ec
VZ
2456 if ( charset )
2457 {
e95354ec
VZ
2458 SetName(charset);
2459 }
bda3d86a
VZ
2460
2461 m_encoding = wxFONTENCODING_SYSTEM;
6001e347
RR
2462}
2463
8b04d4c4
VZ
2464wxCSConv::wxCSConv(wxFontEncoding encoding)
2465{
bda3d86a 2466 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2467 {
2468 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2469
2470 encoding = wxFONTENCODING_SYSTEM;
2471 }
2472
8b04d4c4
VZ
2473 Init();
2474
bda3d86a 2475 m_encoding = encoding;
8b04d4c4
VZ
2476}
2477
6001e347
RR
2478wxCSConv::~wxCSConv()
2479{
65e50848
JS
2480 Clear();
2481}
2482
54380f29 2483wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2484 : wxMBConv()
54380f29 2485{
8b04d4c4
VZ
2486 Init();
2487
54380f29 2488 SetName(conv.m_name);
8b04d4c4 2489 m_encoding = conv.m_encoding;
54380f29
GD
2490}
2491
2492wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2493{
2494 Clear();
8b04d4c4 2495
54380f29 2496 SetName(conv.m_name);
8b04d4c4
VZ
2497 m_encoding = conv.m_encoding;
2498
54380f29
GD
2499 return *this;
2500}
2501
65e50848
JS
2502void wxCSConv::Clear()
2503{
8b04d4c4 2504 free(m_name);
e95354ec 2505 delete m_convReal;
8b04d4c4 2506
65e50848 2507 m_name = NULL;
e95354ec 2508 m_convReal = NULL;
6001e347
RR
2509}
2510
2511void wxCSConv::SetName(const wxChar *charset)
2512{
f1339c56
RR
2513 if (charset)
2514 {
2515 m_name = wxStrdup(charset);
e95354ec 2516 m_deferred = true;
f1339c56 2517 }
6001e347
RR
2518}
2519
8b3eb85d
VZ
2520#if wxUSE_FONTMAP
2521#include "wx/hashmap.h"
2522
2523WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2524 wxEncodingNameCache );
2525
2526static wxEncodingNameCache gs_nameCache;
2527#endif
2528
e95354ec
VZ
2529wxMBConv *wxCSConv::DoCreate() const
2530{
ce6f8d6f
VZ
2531#if wxUSE_FONTMAP
2532 wxLogTrace(TRACE_STRCONV,
2533 wxT("creating conversion for %s"),
2534 (m_name ? m_name
2535 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2536#endif // wxUSE_FONTMAP
2537
c547282d
VZ
2538 // check for the special case of ASCII or ISO8859-1 charset: as we have
2539 // special knowledge of it anyhow, we don't need to create a special
2540 // conversion object
2541 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
f1339c56 2542 {
e95354ec
VZ
2543 // don't convert at all
2544 return NULL;
2545 }
dccce9ea 2546
e95354ec
VZ
2547 // we trust OS to do conversion better than we can so try external
2548 // conversion methods first
2549 //
2550 // the full order is:
2551 // 1. OS conversion (iconv() under Unix or Win32 API)
2552 // 2. hard coded conversions for UTF
2553 // 3. wxEncodingConverter as fall back
2554
2555 // step (1)
2556#ifdef HAVE_ICONV
c547282d 2557#if !wxUSE_FONTMAP
e95354ec 2558 if ( m_name )
c547282d 2559#endif // !wxUSE_FONTMAP
e95354ec 2560 {
c547282d 2561 wxString name(m_name);
8b3eb85d
VZ
2562 wxFontEncoding encoding(m_encoding);
2563
2564 if ( !name.empty() )
2565 {
2566 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2567 if ( conv->IsOk() )
2568 return conv;
2569
2570 delete conv;
c547282d
VZ
2571
2572#if wxUSE_FONTMAP
8b3eb85d
VZ
2573 encoding =
2574 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
c547282d 2575#endif // wxUSE_FONTMAP
8b3eb85d
VZ
2576 }
2577#if wxUSE_FONTMAP
2578 {
2579 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2580 if ( it != gs_nameCache.end() )
2581 {
2582 if ( it->second.empty() )
2583 return NULL;
c547282d 2584
8b3eb85d
VZ
2585 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2586 if ( conv->IsOk() )
2587 return conv;
e95354ec 2588
8b3eb85d
VZ
2589 delete conv;
2590 }
2591
2592 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2593
2594 for ( ; *names; ++names )
2595 {
2596 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2597 if ( conv->IsOk() )
2598 {
2599 gs_nameCache[encoding] = *names;
2600 return conv;
2601 }
2602
2603 delete conv;
2604 }
2605
40711af8 2606 gs_nameCache[encoding] = _T(""); // cache the failure
8b3eb85d
VZ
2607 }
2608#endif // wxUSE_FONTMAP
e95354ec
VZ
2609 }
2610#endif // HAVE_ICONV
2611
2612#ifdef wxHAVE_WIN32_MB2WC
2613 {
7608a683 2614#if wxUSE_FONTMAP
e95354ec
VZ
2615 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2616 : new wxMBConv_win32(m_encoding);
2617 if ( conv->IsOk() )
2618 return conv;
2619
2620 delete conv;
7608a683
WS
2621#else
2622 return NULL;
2623#endif
e95354ec
VZ
2624 }
2625#endif // wxHAVE_WIN32_MB2WC
d775fa82
WS
2626#if defined(__WXMAC__)
2627 {
5c3c8676 2628 // leave UTF16 and UTF32 to the built-ins of wx
3698ae71 2629 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
5c3c8676 2630 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
d775fa82
WS
2631 {
2632
2d1659cf 2633#if wxUSE_FONTMAP
d775fa82
WS
2634 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2635 : new wxMBConv_mac(m_encoding);
2d1659cf
RN
2636#else
2637 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2638#endif
d775fa82 2639 if ( conv->IsOk() )
f7e98dee
RN
2640 return conv;
2641
2642 delete conv;
2643 }
2644 }
2645#endif
2646#if defined(__WXCOCOA__)
2647 {
2648 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2649 {
2650
a6900d10 2651#if wxUSE_FONTMAP
f7e98dee
RN
2652 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2653 : new wxMBConv_cocoa(m_encoding);
a6900d10
RN
2654#else
2655 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2656#endif
f7e98dee 2657 if ( conv->IsOk() )
d775fa82
WS
2658 return conv;
2659
2660 delete conv;
2661 }
335d31e0
SC
2662 }
2663#endif
e95354ec
VZ
2664 // step (2)
2665 wxFontEncoding enc = m_encoding;
2666#if wxUSE_FONTMAP
c547282d
VZ
2667 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2668 {
2669 // use "false" to suppress interactive dialogs -- we can be called from
2670 // anywhere and popping up a dialog from here is the last thing we want to
2671 // do
267e11c5 2672 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
c547282d 2673 }
e95354ec
VZ
2674#endif // wxUSE_FONTMAP
2675
2676 switch ( enc )
2677 {
2678 case wxFONTENCODING_UTF7:
2679 return new wxMBConvUTF7;
2680
2681 case wxFONTENCODING_UTF8:
2682 return new wxMBConvUTF8;
2683
e95354ec
VZ
2684 case wxFONTENCODING_UTF16BE:
2685 return new wxMBConvUTF16BE;
2686
2687 case wxFONTENCODING_UTF16LE:
2688 return new wxMBConvUTF16LE;
2689
e95354ec
VZ
2690 case wxFONTENCODING_UTF32BE:
2691 return new wxMBConvUTF32BE;
2692
2693 case wxFONTENCODING_UTF32LE:
2694 return new wxMBConvUTF32LE;
2695
2696 default:
2697 // nothing to do but put here to suppress gcc warnings
2698 ;
2699 }
2700
2701 // step (3)
2702#if wxUSE_FONTMAP
2703 {
2704 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2705 : new wxMBConv_wxwin(m_encoding);
2706 if ( conv->IsOk() )
2707 return conv;
2708
2709 delete conv;
2710 }
2711#endif // wxUSE_FONTMAP
2712
a58d4f4d
VS
2713 // NB: This is a hack to prevent deadlock. What could otherwise happen
2714 // in Unicode build: wxConvLocal creation ends up being here
2715 // because of some failure and logs the error. But wxLog will try to
2716 // attach timestamp, for which it will need wxConvLocal (to convert
2717 // time to char* and then wchar_t*), but that fails, tries to log
2718 // error, but wxLog has a (already locked) critical section that
2719 // guards static buffer.
2720 static bool alreadyLoggingError = false;
2721 if (!alreadyLoggingError)
2722 {
2723 alreadyLoggingError = true;
2724 wxLogError(_("Cannot convert from the charset '%s'!"),
2725 m_name ? m_name
e95354ec
VZ
2726 :
2727#if wxUSE_FONTMAP
267e11c5 2728 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
e95354ec
VZ
2729#else // !wxUSE_FONTMAP
2730 wxString::Format(_("encoding %s"), m_encoding).c_str()
2731#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2732 );
a58d4f4d
VS
2733 alreadyLoggingError = false;
2734 }
e95354ec
VZ
2735
2736 return NULL;
2737}
2738
2739void wxCSConv::CreateConvIfNeeded() const
2740{
2741 if ( m_deferred )
2742 {
2743 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
2744
2745#if wxUSE_INTL
2746 // if we don't have neither the name nor the encoding, use the default
2747 // encoding for this system
2748 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2749 {
4d312c22 2750 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
2751 }
2752#endif // wxUSE_INTL
2753
e95354ec
VZ
2754 self->m_convReal = DoCreate();
2755 self->m_deferred = false;
6001e347 2756 }
6001e347
RR
2757}
2758
2759size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2760{
e95354ec 2761 CreateConvIfNeeded();
dccce9ea 2762
e95354ec
VZ
2763 if (m_convReal)
2764 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
2765
2766 // latin-1 (direct)
4def3b35 2767 size_t len = strlen(psz);
dccce9ea 2768
f1339c56
RR
2769 if (buf)
2770 {
4def3b35 2771 for (size_t c = 0; c <= len; c++)
f1339c56
RR
2772 buf[c] = (unsigned char)(psz[c]);
2773 }
dccce9ea 2774
f1339c56 2775 return len;
6001e347
RR
2776}
2777
2778size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2779{
e95354ec 2780 CreateConvIfNeeded();
dccce9ea 2781
e95354ec
VZ
2782 if (m_convReal)
2783 return m_convReal->WC2MB(buf, psz, n);
1cd52418 2784
f1339c56 2785 // latin-1 (direct)
f8d791e0 2786 const size_t len = wxWcslen(psz);
f1339c56
RR
2787 if (buf)
2788 {
4def3b35 2789 for (size_t c = 0; c <= len; c++)
24642831
VS
2790 {
2791 if (psz[c] > 0xFF)
2792 return (size_t)-1;
907173e5 2793 buf[c] = (char)psz[c];
24642831
VS
2794 }
2795 }
2796 else
2797 {
2798 for (size_t c = 0; c <= len; c++)
2799 {
2800 if (psz[c] > 0xFF)
2801 return (size_t)-1;
2802 }
f1339c56 2803 }
dccce9ea 2804
f1339c56 2805 return len;
6001e347
RR
2806}
2807
bde4baac
VZ
2808// ----------------------------------------------------------------------------
2809// globals
2810// ----------------------------------------------------------------------------
2811
2812#ifdef __WINDOWS__
2813 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
2814#elif defined(__WXMAC__) && !defined(__MACH__)
2815 static wxMBConv_mac wxConvLibcObj ;
bde4baac 2816#else
dcc8fac0 2817 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
2818#endif
2819
2820static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2821static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2822static wxMBConvUTF7 wxConvUTF7Obj;
2823static wxMBConvUTF8 wxConvUTF8Obj;
c12b7f79 2824
bde4baac
VZ
2825WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2826WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2827WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2828WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2829WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2830WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
f5a1953b
VZ
2831WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2832#ifdef __WXOSX__
ea8ce907 2833 wxConvUTF8Obj;
f5a1953b 2834#else
ea8ce907 2835 wxConvLibcObj;
f5a1953b
VZ
2836#endif
2837
bde4baac
VZ
2838
2839#else // !wxUSE_WCHAR_T
2840
2841// stand-ins in absence of wchar_t
2842WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2843 wxConvISO8859_1,
2844 wxConvLocal,
2845 wxConvUTF8;
2846
2847#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T