]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
Compare and assign wxChars to wxChar instead char local variable.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347
RR
1/////////////////////////////////////////////////////////////////////////////
2// Name: strconv.cpp
3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
f6bcfd97
BP
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
14f355c2 23#if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
6001e347
RR
24 #pragma implementation "strconv.h"
25#endif
26
27// For compilers that support precompilation, includes "wx.h".
28#include "wx/wxprec.h"
29
30#ifdef __BORLANDC__
31 #pragma hdrstop
32#endif
33
373658eb
VZ
34#ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37#endif // WX_PRECOMP
38
bde4baac
VZ
39#include "wx/strconv.h"
40
41#if wxUSE_WCHAR_T
42
0a1c1e62 43#ifdef __WXMSW__
373658eb 44 #include "wx/msw/private.h"
7608a683
WS
45#endif
46
47#ifdef __WINDOWS__
13dd924a 48 #include "wx/msw/missing.h"
0a1c1e62
GRG
49#endif
50
1c193821 51#ifndef __WXWINCE__
1cd52418 52#include <errno.h>
1c193821
JS
53#endif
54
6001e347
RR
55#include <ctype.h>
56#include <string.h>
57#include <stdlib.h>
58
e95354ec
VZ
59#if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61#endif // __WIN32__ but !__WXMICROWIN__
62
373658eb
VZ
63// ----------------------------------------------------------------------------
64// headers
65// ----------------------------------------------------------------------------
7af284fd 66
6001e347 67#ifdef __SALFORDC__
373658eb 68 #include <clib.h>
6001e347
RR
69#endif
70
b040e242 71#ifdef HAVE_ICONV
373658eb 72 #include <iconv.h>
1cd52418 73#endif
1cd52418 74
373658eb
VZ
75#include "wx/encconv.h"
76#include "wx/fontmap.h"
7608a683 77#include "wx/utils.h"
373658eb 78
335d31e0 79#ifdef __WXMAC__
4227afa4
SC
80#include <ATSUnicode.h>
81#include <TextCommon.h>
82#include <TextEncodingConverter.h>
335d31e0
SC
83
84#include "wx/mac/private.h" // includes mac headers
85#endif
373658eb
VZ
86// ----------------------------------------------------------------------------
87// macros
88// ----------------------------------------------------------------------------
3e61dfb0 89
// Byte-swap, in place, the first `len` elements of `str`.
// The arguments are parenthesised so that expressions such as `p + 1` or
// `a + b` can be passed safely.
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT32_SWAP_ALWAYS((str)[_c]); }
#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT16_SWAP_ALWAYS((str)[_c]); }
1cd52418
OK
92
93#if SIZEOF_WCHAR_T == 4
3a0d76bc
VS
94 #define WC_NAME "UCS4"
95 #define WC_BSWAP BSWAP_UCS4
96 #ifdef WORDS_BIGENDIAN
97 #define WC_NAME_BEST "UCS-4BE"
98 #else
99 #define WC_NAME_BEST "UCS-4LE"
100 #endif
1cd52418 101#elif SIZEOF_WCHAR_T == 2
3a0d76bc
VS
102 #define WC_NAME "UTF16"
103 #define WC_BSWAP BSWAP_UTF16
a3f2769e 104 #define WC_UTF16
3a0d76bc
VS
105 #ifdef WORDS_BIGENDIAN
106 #define WC_NAME_BEST "UTF-16BE"
107 #else
108 #define WC_NAME_BEST "UTF-16LE"
109 #endif
bab1e722 110#else // sizeof(wchar_t) != 2 nor 4
bde4baac
VZ
111 // does this ever happen?
112 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1cd52418
OK
113#endif
114
373658eb
VZ
115// ============================================================================
116// implementation
117// ============================================================================
118
119// ----------------------------------------------------------------------------
c91830cb 120// UTF-16 en/decoding to/from UCS-4
373658eb 121// ----------------------------------------------------------------------------
6001e347 122
b0a6bb75 123
c91830cb 124static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 125{
dccce9ea 126 if (input<=0xffff)
4def3b35 127 {
999836aa
VZ
128 if (output)
129 *output = (wxUint16) input;
4def3b35 130 return 1;
dccce9ea
VZ
131 }
132 else if (input>=0x110000)
4def3b35
VS
133 {
134 return (size_t)-1;
dccce9ea
VZ
135 }
136 else
4def3b35 137 {
dccce9ea 138 if (output)
4def3b35 139 {
c91830cb 140 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
999836aa 141 *output = (wxUint16) ((input&0x3ff)+0xdc00);
4def3b35
VS
142 }
143 return 2;
1cd52418 144 }
1cd52418
OK
145}
146
c91830cb 147static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 148{
dccce9ea 149 if ((*input<0xd800) || (*input>0xdfff))
4def3b35
VS
150 {
151 output = *input;
152 return 1;
dccce9ea
VZ
153 }
154 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
4def3b35
VS
155 {
156 output = *input;
157 return (size_t)-1;
dccce9ea
VZ
158 }
159 else
4def3b35
VS
160 {
161 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
162 return 2;
163 }
1cd52418
OK
164}
165
b0a6bb75 166
f6bcfd97 167// ----------------------------------------------------------------------------
6001e347 168// wxMBConv
f6bcfd97 169// ----------------------------------------------------------------------------
2c53a80a
WS
170
// Out-of-line destructor with an empty body.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
6001e347 175
6001e347
RR
176const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
177{
2b5f62a0 178 if ( psz )
6001e347 179 {
2b5f62a0
VZ
180 // calculate the length of the buffer needed first
181 size_t nLen = MB2WC(NULL, psz, 0);
182 if ( nLen != (size_t)-1 )
183 {
184 // now do the actual conversion
185 wxWCharBuffer buf(nLen);
635f33ce
VS
186 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
187 if ( nLen != (size_t)-1 )
188 {
189 return buf;
190 }
2b5f62a0 191 }
f6bcfd97 192 }
2b5f62a0
VZ
193
194 wxWCharBuffer buf((wchar_t *)NULL);
195
196 return buf;
6001e347
RR
197}
198
e5cceba0 199const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
6001e347 200{
2b5f62a0
VZ
201 if ( pwz )
202 {
203 size_t nLen = WC2MB(NULL, pwz, 0);
204 if ( nLen != (size_t)-1 )
205 {
c91830cb 206 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
635f33ce
VS
207 nLen = WC2MB(buf.data(), pwz, nLen + 4);
208 if ( nLen != (size_t)-1 )
209 {
210 return buf;
211 }
2b5f62a0
VZ
212 }
213 }
214
215 wxCharBuffer buf((char *)NULL);
e5cceba0 216
e5cceba0 217 return buf;
6001e347
RR
218}
219
e4e3bbb4
RN
220size_t wxMBConv::MB2WC(wchar_t* szBuffer, const char* szString,
221 size_t outsize, size_t nStringLen) const
222{
223 const char* szEnd = szString + nStringLen + 1;
224 const char* szPos = szString;
225 const char* szStart = szPos;
226
227 size_t nActualLength = 0;
228
229 //Convert the string until the length() is reached, continuing the
230 //loop every time a null character is reached
231 while(szPos != szEnd)
232 {
233 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
234
235 //Get the length of the current (sub)string
236 size_t nLen = MB2WC(NULL, szPos, 0);
237
238 //Invalid conversion?
239 if( nLen == (size_t)-1 )
240 return nLen;
241
242 //Increase the actual length (+1 for current null character)
243 nActualLength += nLen + 1;
244
245 //Only copy data in if buffer size is big enough
246 if (szBuffer != NULL &&
247 nActualLength <= outsize)
248 {
249 //Convert the current (sub)string
250 if ( MB2WC(&szBuffer[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
251 return (size_t)-1;
252 }
253
254 //Increment to next (sub)string
255 //Note that we have to use strlen here instead of nLen
256 //here because XX2XX gives us the size of the output buffer,
257 //not neccessarly the length of the string
258 szPos += strlen(szPos) + 1;
259 }
260
261 return nActualLength - 1; //success - return actual length
262}
263
264size_t wxMBConv::WC2MB(char* szBuffer, const wchar_t* szString,
265 size_t outsize, size_t nStringLen) const
266{
267 const wchar_t* szEnd = szString + nStringLen + 1;
268 const wchar_t* szPos = szString;
269 const wchar_t* szStart = szPos;
270
271 size_t nActualLength = 0;
272
273 //Convert the string until the length() is reached, continuing the
274 //loop every time a null character is reached
275 while(szPos != szEnd)
276 {
277 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
278
279 //Get the length of the current (sub)string
280 size_t nLen = WC2MB(NULL, szPos, 0);
281
282 //Invalid conversion?
283 if( nLen == (size_t)-1 )
284 return nLen;
285
286 //Increase the actual length (+1 for current null character)
287 nActualLength += nLen + 1;
288
289 //Only copy data in if buffer size is big enough
290 if (szBuffer != NULL &&
291 nActualLength <= outsize)
292 {
293 //Convert the current (sub)string
294 if(WC2MB(&szBuffer[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
295 return (size_t)-1;
296 }
297
298 //Increment to next (sub)string
299 //Note that we have to use wxWcslen here instead of nLen
300 //here because XX2XX gives us the size of the output buffer,
301 //not neccessarly the length of the string
302 szPos += wxWcslen(szPos) + 1;
303 }
304
305 return nActualLength - 1; //success - return actual length
306}
307
6001e347 308// ----------------------------------------------------------------------------
bde4baac 309// wxMBConvLibc
6001e347
RR
310// ----------------------------------------------------------------------------
311
bde4baac
VZ
// Multibyte -> wide conversion: forwards all arguments to wxMB2WC() and
// returns its result unchanged.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
316
// Wide -> multibyte conversion: forwards all arguments to wxWC2MB() and
// returns its result unchanged.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
bde4baac 321// ----------------------------------------------------------------------------
15f2ee32 322// UTF-7
bde4baac 323// ----------------------------------------------------------------------------
6001e347 324
15f2ee32 325// Implementation (C) 2004 Fredrik Roubert
6001e347 326
15f2ee32
RN
327//
328// BASE64 decoding table
329//
// Indexed by byte value: the 6-bit BASE64 value of that character, or 0xff
// if the byte is not part of the BASE64 alphabet ('A'-'Z', 'a'-'z', '0'-'9',
// '+', '/').
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
365
366size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
367{
368
369 size_t len = 0;
370
371 while (*psz && ((!buf) || (len < n)))
372 {
373 unsigned char cc = *psz++;
374 if (cc != '+')
375 {
376 // plain ASCII char
377 if (buf)
378 *buf++ = cc;
379 len++;
380 }
381 else if (*psz == '-')
382 {
383 // encoded plus sign
384 if (buf)
385 *buf++ = cc;
386 len++;
387 psz++;
388 }
389 else
390 {
391 // BASE64 encoded string
392 bool lsb;
393 unsigned char c;
394 unsigned int d, l;
395 for (lsb = false, d = 0, l = 0;
396 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
397 {
398 d <<= 6;
399 d += cc;
400 for (l += 6; l >= 8; lsb = !lsb)
401 {
402 c = (d >> (l -= 8)) % 256;
403 if (lsb)
404 {
405 if (buf)
406 *buf++ |= c;
407 len ++;
408 }
409 else
410 if (buf)
411 *buf = c << 8;
412 }
413 }
414 if (*psz == '-')
415 psz++;
416 }
417 }
418 if (buf && (len < n))
419 *buf = 0;
420 return len;
6001e347
RR
421}
422
15f2ee32
RN
423//
424// BASE64 encoding table
425//
// Indexed by a 6-bit value (0..63): the BASE64 character that encodes it.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
437
438//
439// UTF-7 encoding table
440//
441// 0 - Set D (directly encoded characters)
442// 1 - Set O (optional direct characters)
443// 2 - whitespace characters (optional)
444// 3 - special characters
445//
// Indexed by ASCII code (0..127): the UTF-7 class of the character
// (0 = set D, 1 = set O, 2 = whitespace, 3 = special).
// The encoder in this file writes only class 0 characters directly.
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
457
// Encode a NUL-terminated wide string as UTF-7.
//
// buf: output buffer, or NULL to only compute the output length
// psz: wide character input
// n:   capacity of buf in bytes (ignored when buf is NULL)
//
// Returns the number of bytes produced, not counting the terminating NUL
// (written only if there is room), or (size_t)-1 when a run starts with a
// character above 0xffff in builds where wchar_t is wider than 16 bits.
//
// NOTE(review): n is only checked at the top of the outer loop; the writes
// inside a BASE64 run are not bounded by it -- confirm callers always size
// the buffer from a prior counting call.
// NOTE(review): the > 0xffff check applies only to the character that starts
// a run; later characters of the same run are packed using their low 16 bits
// only -- confirm whether that is intended.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t
*psz, size_t n) const
{


    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;
            len++;
        }
#ifndef WC_UTF16
#ifdef __VMS
        else if (cc > 0xffff)
#else
        else if (cc > ((const wchar_t)0xffff))
#endif
        {
            // no surrogate pair generation (yet?)
            return (size_t)-1;
        }
#endif
        else
        {
            // start of an encoded run
            if (buf)
                *buf++ = '+';
            len++;
            if (cc != '+')
            {
                // BASE64 encode string: d accumulates input bits, l is the
                // number of bits in d that have not been output yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0;; psz++)
                {
                    // feed the high byte, then the low byte, of cc
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // emit every complete group of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }
                    // stop at the end of the string or at the next
                    // directly encodable character (left for the outer loop)
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }
                if (l != 0)
                {
                    // flush the remaining l bits, padded with zero bits
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
                    len++;
                }
            }
            // a literal '+' ends up as "+-"; any other run is closed by '-'
            if (buf)
                *buf++ = '-';
            len++;
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
530
f6bcfd97 531// ----------------------------------------------------------------------------
6001e347 532// UTF-8
f6bcfd97 533// ----------------------------------------------------------------------------
6001e347 534
// utf8_max[i] is the largest value that a UTF-8 sequence of i+1 bytes can
// encode; the last entry acts as a catch-all upper bound.
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347
RR
537
538size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
539{
4def3b35
VS
540 size_t len = 0;
541
dccce9ea 542 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
543 {
544 unsigned char cc = *psz++, fc = cc;
545 unsigned cnt;
dccce9ea 546 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 547 fc <<= 1;
dccce9ea 548 if (!cnt)
4def3b35
VS
549 {
550 // plain ASCII char
dccce9ea 551 if (buf)
4def3b35
VS
552 *buf++ = cc;
553 len++;
dccce9ea
VZ
554 }
555 else
4def3b35
VS
556 {
557 cnt--;
dccce9ea 558 if (!cnt)
4def3b35
VS
559 {
560 // invalid UTF-8 sequence
561 return (size_t)-1;
dccce9ea
VZ
562 }
563 else
4def3b35
VS
564 {
565 unsigned ocnt = cnt - 1;
566 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 567 while (cnt--)
4def3b35
VS
568 {
569 cc = *psz++;
dccce9ea 570 if ((cc & 0xC0) != 0x80)
4def3b35
VS
571 {
572 // invalid UTF-8 sequence
573 return (size_t)-1;
574 }
575 res = (res << 6) | (cc & 0x3f);
576 }
dccce9ea 577 if (res <= utf8_max[ocnt])
4def3b35
VS
578 {
579 // illegal UTF-8 encoding
580 return (size_t)-1;
581 }
1cd52418 582#ifdef WC_UTF16
b5153fd8
VZ
583 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
584 size_t pa = encode_utf16(res, (wxUint16 *)buf);
4def3b35
VS
585 if (pa == (size_t)-1)
586 return (size_t)-1;
dccce9ea 587 if (buf)
4def3b35
VS
588 buf += pa;
589 len += pa;
373658eb 590#else // !WC_UTF16
dccce9ea 591 if (buf)
4def3b35
VS
592 *buf++ = res;
593 len++;
373658eb 594#endif // WC_UTF16/!WC_UTF16
4def3b35
VS
595 }
596 }
6001e347 597 }
dccce9ea 598 if (buf && (len < n))
4def3b35
VS
599 *buf = 0;
600 return len;
6001e347
RR
601}
602
603size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
604{
4def3b35 605 size_t len = 0;
6001e347 606
dccce9ea 607 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
608 {
609 wxUint32 cc;
1cd52418 610#ifdef WC_UTF16
b5153fd8
VZ
611 // cast is ok for WC_UTF16
612 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
4def3b35 613 psz += (pa == (size_t)-1) ? 1 : pa;
1cd52418 614#else
4def3b35
VS
615 cc=(*psz++) & 0x7fffffff;
616#endif
617 unsigned cnt;
618 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
dccce9ea 619 if (!cnt)
4def3b35
VS
620 {
621 // plain ASCII char
dccce9ea 622 if (buf)
574c939e 623 *buf++ = (char) cc;
4def3b35 624 len++;
dccce9ea
VZ
625 }
626
627 else
4def3b35
VS
628 {
629 len += cnt + 1;
dccce9ea 630 if (buf)
4def3b35 631 {
574c939e 632 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
4def3b35 633 while (cnt--)
574c939e 634 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
4def3b35
VS
635 }
636 }
6001e347 637 }
4def3b35
VS
638
639 if (buf && (len<n)) *buf = 0;
adb45366 640
4def3b35 641 return len;
6001e347
RR
642}
643
c91830cb
VZ
644
645
646
647// ----------------------------------------------------------------------------
648// UTF-16
649// ----------------------------------------------------------------------------
650
651#ifdef WORDS_BIGENDIAN
bde4baac
VZ
652 #define wxMBConvUTF16straight wxMBConvUTF16BE
653 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 654#else
bde4baac
VZ
655 #define wxMBConvUTF16swap wxMBConvUTF16BE
656 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
657#endif
658
659
c91830cb
VZ
660#ifdef WC_UTF16
661
c91830cb
VZ
662// copy 16bit MB to 16bit String
663size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
664{
665 size_t len=0;
666
667 while (*(wxUint16*)psz && (!buf || len < n))
668 {
669 if (buf)
670 *buf++ = *(wxUint16*)psz;
671 len++;
672
673 psz += sizeof(wxUint16);
674 }
675 if (buf && len<n) *buf=0;
676
677 return len;
678}
679
680
681// copy 16bit String to 16bit MB
682size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
683{
684 size_t len=0;
685
686 while (*psz && (!buf || len < n))
687 {
688 if (buf)
689 {
690 *(wxUint16*)buf = *psz;
691 buf += sizeof(wxUint16);
692 }
693 len += sizeof(wxUint16);
694 psz++;
695 }
696 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
697
698 return len;
699}
700
701
702// swap 16bit MB to 16bit String
703size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
704{
705 size_t len=0;
706
707 while (*(wxUint16*)psz && (!buf || len < n))
708 {
709 if (buf)
710 {
711 ((char *)buf)[0] = psz[1];
712 ((char *)buf)[1] = psz[0];
713 buf++;
714 }
715 len++;
716 psz += sizeof(wxUint16);
717 }
718 if (buf && len<n) *buf=0;
719
720 return len;
721}
722
723
724// swap 16bit MB to 16bit String
725size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
726{
727 size_t len=0;
728
729 while (*psz && (!buf || len < n))
730 {
731 if (buf)
732 {
733 *buf++ = ((char*)psz)[1];
734 *buf++ = ((char*)psz)[0];
735 }
736 len += sizeof(wxUint16);
737 psz++;
738 }
739 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
740
741 return len;
742}
743
744
745#else // WC_UTF16
746
747
748// copy 16bit MB to 32bit String
749size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
750{
751 size_t len=0;
752
753 while (*(wxUint16*)psz && (!buf || len < n))
754 {
755 wxUint32 cc;
756 size_t pa=decode_utf16((wxUint16*)psz, cc);
757 if (pa == (size_t)-1)
758 return pa;
759
760 if (buf)
761 *buf++ = cc;
762 len++;
763 psz += pa * sizeof(wxUint16);
764 }
765 if (buf && len<n) *buf=0;
766
767 return len;
768}
769
770
771// copy 32bit String to 16bit MB
772size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
773{
774 size_t len=0;
775
776 while (*psz && (!buf || len < n))
777 {
778 wxUint16 cc[2];
779 size_t pa=encode_utf16(*psz, cc);
780
781 if (pa == (size_t)-1)
782 return pa;
783
784 if (buf)
785 {
69b80d28 786 *(wxUint16*)buf = cc[0];
b5153fd8 787 buf += sizeof(wxUint16);
c91830cb 788 if (pa > 1)
69b80d28
VZ
789 {
790 *(wxUint16*)buf = cc[1];
791 buf += sizeof(wxUint16);
792 }
c91830cb
VZ
793 }
794
795 len += pa*sizeof(wxUint16);
796 psz++;
797 }
798 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
799
800 return len;
801}
802
803
804// swap 16bit MB to 32bit String
805size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
806{
807 size_t len=0;
808
809 while (*(wxUint16*)psz && (!buf || len < n))
810 {
811 wxUint32 cc;
812 char tmp[4];
813 tmp[0]=psz[1]; tmp[1]=psz[0];
814 tmp[2]=psz[3]; tmp[3]=psz[2];
815
816 size_t pa=decode_utf16((wxUint16*)tmp, cc);
817 if (pa == (size_t)-1)
818 return pa;
819
820 if (buf)
821 *buf++ = cc;
822
823 len++;
824 psz += pa * sizeof(wxUint16);
825 }
826 if (buf && len<n) *buf=0;
827
828 return len;
829}
830
831
832// swap 32bit String to 16bit MB
833size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
834{
835 size_t len=0;
836
837 while (*psz && (!buf || len < n))
838 {
839 wxUint16 cc[2];
840 size_t pa=encode_utf16(*psz, cc);
841
842 if (pa == (size_t)-1)
843 return pa;
844
845 if (buf)
846 {
847 *buf++ = ((char*)cc)[1];
848 *buf++ = ((char*)cc)[0];
849 if (pa > 1)
850 {
851 *buf++ = ((char*)cc)[3];
852 *buf++ = ((char*)cc)[2];
853 }
854 }
855
856 len += pa*sizeof(wxUint16);
857 psz++;
858 }
859 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
860
861 return len;
862}
863
864#endif // WC_UTF16
865
866
867// ----------------------------------------------------------------------------
868// UTF-32
869// ----------------------------------------------------------------------------
870
871#ifdef WORDS_BIGENDIAN
872#define wxMBConvUTF32straight wxMBConvUTF32BE
873#define wxMBConvUTF32swap wxMBConvUTF32LE
874#else
875#define wxMBConvUTF32swap wxMBConvUTF32BE
876#define wxMBConvUTF32straight wxMBConvUTF32LE
877#endif
878
879
// global instances of the little- and big-endian UTF-32 converters
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
882
883
884#ifdef WC_UTF16
885
886// copy 32bit MB to 16bit String
887size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
888{
889 size_t len=0;
890
891 while (*(wxUint32*)psz && (!buf || len < n))
892 {
893 wxUint16 cc[2];
894
895 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
896 if (pa == (size_t)-1)
897 return pa;
898
899 if (buf)
900 {
901 *buf++ = cc[0];
902 if (pa > 1)
903 *buf++ = cc[1];
904 }
905 len += pa;
906 psz += sizeof(wxUint32);
907 }
908 if (buf && len<n) *buf=0;
909
910 return len;
911}
912
913
914// copy 16bit String to 32bit MB
915size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
916{
917 size_t len=0;
918
919 while (*psz && (!buf || len < n))
920 {
921 wxUint32 cc;
922
b5153fd8
VZ
923 // cast is ok for WC_UTF16
924 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
c91830cb
VZ
925 if (pa == (size_t)-1)
926 return pa;
927
928 if (buf)
929 {
930 *(wxUint32*)buf = cc;
931 buf += sizeof(wxUint32);
932 }
933 len += sizeof(wxUint32);
934 psz += pa;
935 }
b5153fd8
VZ
936
937 if (buf && len<=n-sizeof(wxUint32))
938 *(wxUint32*)buf=0;
c91830cb
VZ
939
940 return len;
941}
942
943
944
945// swap 32bit MB to 16bit String
946size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
947{
948 size_t len=0;
949
950 while (*(wxUint32*)psz && (!buf || len < n))
951 {
952 char tmp[4];
953 tmp[0] = psz[3]; tmp[1] = psz[2];
954 tmp[2] = psz[1]; tmp[3] = psz[0];
955
956
957 wxUint16 cc[2];
958
959 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
960 if (pa == (size_t)-1)
961 return pa;
962
963 if (buf)
964 {
965 *buf++ = cc[0];
966 if (pa > 1)
967 *buf++ = cc[1];
968 }
969 len += pa;
970 psz += sizeof(wxUint32);
971 }
b5153fd8
VZ
972
973 if (buf && len<n)
974 *buf=0;
c91830cb
VZ
975
976 return len;
977}
978
979
980// swap 16bit String to 32bit MB
981size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
982{
983 size_t len=0;
984
985 while (*psz && (!buf || len < n))
986 {
987 char cc[4];
988
b5153fd8
VZ
989 // cast is ok for WC_UTF16
990 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
c91830cb
VZ
991 if (pa == (size_t)-1)
992 return pa;
993
994 if (buf)
995 {
996 *buf++ = cc[3];
997 *buf++ = cc[2];
998 *buf++ = cc[1];
999 *buf++ = cc[0];
1000 }
1001 len += sizeof(wxUint32);
1002 psz += pa;
1003 }
b5153fd8
VZ
1004
1005 if (buf && len<=n-sizeof(wxUint32))
1006 *(wxUint32*)buf=0;
c91830cb
VZ
1007
1008 return len;
1009}
1010
1011#else // WC_UTF16
1012
1013
1014// copy 32bit MB to 32bit String
1015size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1016{
1017 size_t len=0;
1018
1019 while (*(wxUint32*)psz && (!buf || len < n))
1020 {
1021 if (buf)
1022 *buf++ = *(wxUint32*)psz;
1023 len++;
1024 psz += sizeof(wxUint32);
1025 }
b5153fd8
VZ
1026
1027 if (buf && len<n)
1028 *buf=0;
c91830cb
VZ
1029
1030 return len;
1031}
1032
1033
1034// copy 32bit String to 32bit MB
1035size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1036{
1037 size_t len=0;
1038
1039 while (*psz && (!buf || len < n))
1040 {
1041 if (buf)
1042 {
1043 *(wxUint32*)buf = *psz;
1044 buf += sizeof(wxUint32);
1045 }
1046
1047 len += sizeof(wxUint32);
1048 psz++;
1049 }
1050
b5153fd8
VZ
1051 if (buf && len<=n-sizeof(wxUint32))
1052 *(wxUint32*)buf=0;
c91830cb
VZ
1053
1054 return len;
1055}
1056
1057
1058// swap 32bit MB to 32bit String
1059size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1060{
1061 size_t len=0;
1062
1063 while (*(wxUint32*)psz && (!buf || len < n))
1064 {
1065 if (buf)
1066 {
1067 ((char *)buf)[0] = psz[3];
1068 ((char *)buf)[1] = psz[2];
1069 ((char *)buf)[2] = psz[1];
1070 ((char *)buf)[3] = psz[0];
1071 buf++;
1072 }
1073 len++;
1074 psz += sizeof(wxUint32);
1075 }
b5153fd8
VZ
1076
1077 if (buf && len<n)
1078 *buf=0;
c91830cb
VZ
1079
1080 return len;
1081}
1082
1083
1084// swap 32bit String to 32bit MB
1085size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1086{
1087 size_t len=0;
1088
1089 while (*psz && (!buf || len < n))
1090 {
1091 if (buf)
1092 {
1093 *buf++ = ((char *)psz)[3];
1094 *buf++ = ((char *)psz)[2];
1095 *buf++ = ((char *)psz)[1];
1096 *buf++ = ((char *)psz)[0];
1097 }
1098 len += sizeof(wxUint32);
1099 psz++;
1100 }
b5153fd8
VZ
1101
1102 if (buf && len<=n-sizeof(wxUint32))
1103 *(wxUint32*)buf=0;
c91830cb
VZ
1104
1105 return len;
1106}
1107
1108
1109#endif // WC_UTF16
1110
1111
36acb880
VZ
1112// ============================================================================
1113// The classes doing conversion using the iconv_xxx() functions
1114// ============================================================================
3caec1bb 1115
b040e242 1116#ifdef HAVE_ICONV
3a0d76bc 1117
3caec1bb
VS
1118// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
1119// if output buffer is _exactly_ as big as needed. Such case is (unless there's
1120// yet another bug in glibc) the only case when iconv() returns with (size_t)-1
1121// (which means error) and says there are 0 bytes left in the input buffer --
1122// when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
1123// this alternative test for iconv() failure.
1124// [This bug does not appear in glibc 2.2.]
1125#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1126#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1127 (errno != E2BIG || bufLeft != 0))
1128#else
1129#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1130#endif
1131
ab217dba 1132#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880
VZ
1133
1134// ----------------------------------------------------------------------------
e95354ec 1135// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1136// ----------------------------------------------------------------------------
1137
// Converter between a named charset and wchar_t that holds one iconv
// descriptor per direction.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the charset name handed to iconv_open()
    wxMBConv_iconv(const wxChar *name);
    virtual ~wxMBConv_iconv();

    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
    virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;

    // true only if both conversion descriptors were opened successfully
    bool IsOk() const
        { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }

protected:
    // the iconv handlers used to translate from multibyte to wide char and in
    // the other direction
    iconv_t m2w,
            w2m;

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain NULL
    static const char *ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;
};

// both are set by the wxMBConv_iconv constructor while ms_wcCharsetName is
// still NULL
const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1168
e95354ec 1169wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
36acb880 1170{
04c79127
RR
1171 // Do it the hard way
1172 char cname[100];
1173 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1174 cname[i] = (char) name[i];
1175
36acb880
VZ
1176 // check for charset that represents wchar_t:
1177 if (ms_wcCharsetName == NULL)
f1339c56 1178 {
e95354ec 1179 ms_wcNeedsSwap = false;
dccce9ea 1180
36acb880
VZ
1181 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1182 ms_wcCharsetName = WC_NAME_BEST;
04c79127 1183 m2w = iconv_open(ms_wcCharsetName, cname);
3a0d76bc 1184
36acb880
VZ
1185 if (m2w == (iconv_t)-1)
1186 {
1187 // try charset w/o bytesex info (e.g. "UCS4")
1188 // and check for bytesex ourselves:
1189 ms_wcCharsetName = WC_NAME;
04c79127 1190 m2w = iconv_open(ms_wcCharsetName, cname);
36acb880
VZ
1191
1192 // last bet, try if it knows WCHAR_T pseudo-charset
3a0d76bc
VS
1193 if (m2w == (iconv_t)-1)
1194 {
36acb880 1195 ms_wcCharsetName = "WCHAR_T";
04c79127 1196 m2w = iconv_open(ms_wcCharsetName, cname);
36acb880 1197 }
3a0d76bc 1198
36acb880
VZ
1199 if (m2w != (iconv_t)-1)
1200 {
1201 char buf[2], *bufPtr;
1202 wchar_t wbuf[2], *wbufPtr;
1203 size_t insz, outsz;
1204 size_t res;
1205
1206 buf[0] = 'A';
1207 buf[1] = 0;
1208 wbuf[0] = 0;
1209 insz = 2;
1210 outsz = SIZEOF_WCHAR_T * 2;
1211 wbufPtr = wbuf;
1212 bufPtr = buf;
1213
1214 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1215 (char**)&wbufPtr, &outsz);
1216
1217 if (ICONV_FAILED(res, insz))
3a0d76bc 1218 {
36acb880
VZ
1219 ms_wcCharsetName = NULL;
1220 wxLogLastError(wxT("iconv"));
2b5f62a0 1221 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
3a0d76bc
VS
1222 }
1223 else
1224 {
36acb880 1225 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
3a0d76bc
VS
1226 }
1227 }
36acb880
VZ
1228 else
1229 {
1230 ms_wcCharsetName = NULL;
373658eb 1231
77ffb593 1232 // VS: we must not output an error here, since wxWidgets will safely
957686c8
VS
1233 // fall back to using wxEncodingConverter.
1234 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1235 //wxLogError(
36acb880 1236 }
3a0d76bc 1237 }
36acb880 1238 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
3a0d76bc 1239 }
36acb880 1240 else // we already have ms_wcCharsetName
3caec1bb 1241 {
04c79127 1242 m2w = iconv_open(ms_wcCharsetName, cname);
f1339c56 1243 }
dccce9ea 1244
36acb880
VZ
1245 // NB: don't ever pass NULL to iconv_open(), it may crash!
1246 if ( ms_wcCharsetName )
f1339c56 1247 {
04c79127 1248 w2m = iconv_open( cname, ms_wcCharsetName);
36acb880 1249 }
405d8f46
VZ
1250 else
1251 {
1252 w2m = (iconv_t)-1;
1253 }
36acb880 1254}
3caec1bb 1255
e95354ec 1256wxMBConv_iconv::~wxMBConv_iconv()
36acb880
VZ
1257{
1258 if ( m2w != (iconv_t)-1 )
1259 iconv_close(m2w);
1260 if ( w2m != (iconv_t)-1 )
1261 iconv_close(w2m);
1262}
3a0d76bc 1263
bde4baac 1264size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880
VZ
1265{
1266 size_t inbuf = strlen(psz);
1267 size_t outbuf = n * SIZEOF_WCHAR_T;
1268 size_t res, cres;
1269 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1270 wchar_t *bufPtr = buf;
1271 const char *pszPtr = psz;
1272
1273 if (buf)
1274 {
1275 // have destination buffer, convert there
1276 cres = iconv(m2w,
1277 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1278 (char**)&bufPtr, &outbuf);
1279 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1280
36acb880 1281 if (ms_wcNeedsSwap)
3a0d76bc 1282 {
36acb880
VZ
1283 // convert to native endianness
1284 WC_BSWAP(buf /* _not_ bufPtr */, res)
3a0d76bc 1285 }
adb45366 1286
49dd9820
VS
1287 // NB: iconv was given only strlen(psz) characters on input, and so
1288 // it couldn't convert the trailing zero. Let's do it ourselves
1289 // if there's some room left for it in the output buffer.
1290 if (res < n)
1291 buf[res] = 0;
36acb880
VZ
1292 }
1293 else
1294 {
1295 // no destination buffer... convert using temp buffer
1296 // to calculate destination buffer requirement
1297 wchar_t tbuf[8];
1298 res = 0;
1299 do {
1300 bufPtr = tbuf;
1301 outbuf = 8*SIZEOF_WCHAR_T;
1302
1303 cres = iconv(m2w,
1304 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1305 (char**)&bufPtr, &outbuf );
1306
1307 res += 8-(outbuf/SIZEOF_WCHAR_T);
1308 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1309 }
dccce9ea 1310
36acb880 1311 if (ICONV_FAILED(cres, inbuf))
f1339c56 1312 {
36acb880
VZ
1313 //VS: it is ok if iconv fails, hence trace only
1314 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1315 return (size_t)-1;
1316 }
1317
1318 return res;
1319}
1320
bde4baac 1321size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1322{
f8d791e0 1323 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
36acb880
VZ
1324 size_t outbuf = n;
1325 size_t res, cres;
3a0d76bc 1326
36acb880 1327 wchar_t *tmpbuf = 0;
3caec1bb 1328
36acb880
VZ
1329 if (ms_wcNeedsSwap)
1330 {
1331 // need to copy to temp buffer to switch endianness
1332 // this absolutely doesn't rock!
1333 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1334 // could be in read-only memory, or be accessed in some other thread)
1335 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1336 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1337 WC_BSWAP(tmpbuf, inbuf)
1338 psz=tmpbuf;
1339 }
3a0d76bc 1340
36acb880
VZ
1341 if (buf)
1342 {
1343 // have destination buffer, convert there
1344 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1345
36acb880 1346 res = n-outbuf;
adb45366 1347
49dd9820
VS
1348 // NB: iconv was given only wcslen(psz) characters on input, and so
1349 // it couldn't convert the trailing zero. Let's do it ourselves
1350 // if there's some room left for it in the output buffer.
1351 if (res < n)
1352 buf[0] = 0;
36acb880
VZ
1353 }
1354 else
1355 {
1356 // no destination buffer... convert using temp buffer
1357 // to calculate destination buffer requirement
1358 char tbuf[16];
1359 res = 0;
1360 do {
1361 buf = tbuf; outbuf = 16;
1362
1363 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1364
36acb880
VZ
1365 res += 16 - outbuf;
1366 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1367 }
dccce9ea 1368
36acb880
VZ
1369 if (ms_wcNeedsSwap)
1370 {
1371 free(tmpbuf);
1372 }
dccce9ea 1373
36acb880
VZ
1374 if (ICONV_FAILED(cres, inbuf))
1375 {
1376 //VS: it is ok if iconv fails, hence trace only
1377 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1378 return (size_t)-1;
1379 }
1380
1381 return res;
1382}
1383
b040e242 1384#endif // HAVE_ICONV
36acb880 1385
e95354ec 1386
36acb880
VZ
1387// ============================================================================
1388// Win32 conversion classes
1389// ============================================================================
1cd52418 1390
e95354ec 1391#ifdef wxHAVE_WIN32_MB2WC
373658eb 1392
8b04d4c4 1393// from utils.cpp
d775fa82 1394#if wxUSE_FONTMAP
8b04d4c4
VZ
1395extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1396extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1397#endif
373658eb 1398
e95354ec 1399class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1400{
1401public:
bde4baac
VZ
1402 wxMBConv_win32()
1403 {
1404 m_CodePage = CP_ACP;
1405 }
1406
7608a683 1407#if wxUSE_FONTMAP
e95354ec 1408 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
1409 {
1410 m_CodePage = wxCharsetToCodepage(name);
1411 }
dccce9ea 1412
e95354ec 1413 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
1414 {
1415 m_CodePage = wxEncodingToCodepage(encoding);
1416 }
7608a683 1417#endif
8b04d4c4 1418
bde4baac 1419 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 1420 {
02272c9c
VZ
1421 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1422 // the behaviour is not compatible with the Unix version (using iconv)
1423 // and break the library itself, e.g. wxTextInputStream::NextChar()
1424 // wouldn't work if reading an incomplete MB char didn't result in an
1425 // error
2b5f62a0
VZ
1426 const size_t len = ::MultiByteToWideChar
1427 (
1428 m_CodePage, // code page
02272c9c 1429 MB_ERR_INVALID_CHARS, // flags: fall on error
2b5f62a0
VZ
1430 psz, // input string
1431 -1, // its length (NUL-terminated)
b4da152e 1432 buf, // output string
2b5f62a0
VZ
1433 buf ? n : 0 // size of output buffer
1434 );
1435
03a991bc
VZ
1436 // note that it returns count of written chars for buf != NULL and size
1437 // of the needed buffer for buf == NULL so in either case the length of
1438 // the string (which never includes the terminating NUL) is one less
1439 return len ? len - 1 : (size_t)-1;
f1339c56 1440 }
dccce9ea 1441
13dd924a 1442 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 1443 {
13dd924a
VZ
1444 /*
1445 we have a problem here: by default, WideCharToMultiByte() may
1446 replace characters unrepresentable in the target code page with bad
1447 quality approximations such as turning "1/2" symbol (U+00BD) into
1448 "1" for the code pages which don't have it and we, obviously, want
1449 to avoid this at any price
d775fa82 1450
13dd924a
VZ
1451 the trouble is that this function does it _silently_, i.e. it won't
1452 even tell us whether it did or not... Win98/2000 and higher provide
1453 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1454 we have to resort to a round trip, i.e. check that converting back
1455 results in the same string -- this is, of course, expensive but
1456 otherwise we simply can't be sure to not garble the data.
1457 */
1458
1459 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1460 // it doesn't work with CJK encodings (which we test for rather roughly
1461 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1462 // supporting it
907173e5
WS
1463 BOOL usedDef wxDUMMY_INITIALIZE(false);
1464 BOOL *pUsedDef;
13dd924a
VZ
1465 int flags;
1466 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1467 {
1468 // it's our lucky day
1469 flags = WC_NO_BEST_FIT_CHARS;
1470 pUsedDef = &usedDef;
1471 }
1472 else // old system or unsupported encoding
1473 {
1474 flags = 0;
1475 pUsedDef = NULL;
1476 }
1477
2b5f62a0
VZ
1478 const size_t len = ::WideCharToMultiByte
1479 (
1480 m_CodePage, // code page
13dd924a
VZ
1481 flags, // either none or no best fit
1482 pwz, // input string
2b5f62a0
VZ
1483 -1, // it is (wide) NUL-terminated
1484 buf, // output buffer
1485 buf ? n : 0, // and its size
1486 NULL, // default "replacement" char
13dd924a 1487 pUsedDef // [out] was it used?
2b5f62a0
VZ
1488 );
1489
13dd924a
VZ
1490 if ( !len )
1491 {
1492 // function totally failed
1493 return (size_t)-1;
1494 }
1495
1496 // if we were really converting, check if we succeeded
1497 if ( buf )
1498 {
1499 if ( flags )
1500 {
1501 // check if the conversion failed, i.e. if any replacements
1502 // were done
1503 if ( usedDef )
1504 return (size_t)-1;
1505 }
1506 else // we must resort to double tripping...
1507 {
1508 wxWCharBuffer wcBuf(n);
1509 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1510 wcscmp(wcBuf, pwz) != 0 )
1511 {
1512 // we didn't obtain the same thing we started from, hence
1513 // the conversion was lossy and we consider that it failed
1514 return (size_t)-1;
1515 }
1516 }
1517 }
1518
03a991bc 1519 // see the comment above for the reason of "len - 1"
13dd924a 1520 return len - 1;
f1339c56 1521 }
dccce9ea 1522
13dd924a
VZ
1523 bool IsOk() const { return m_CodePage != -1; }
1524
1525private:
1526 static bool CanUseNoBestFit()
1527 {
1528 static int s_isWin98Or2k = -1;
1529
1530 if ( s_isWin98Or2k == -1 )
1531 {
1532 int verMaj, verMin;
1533 switch ( wxGetOsVersion(&verMaj, &verMin) )
1534 {
1535 case wxWIN95:
1536 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1537 break;
1538
1539 case wxWINDOWS_NT:
1540 s_isWin98Or2k = verMaj >= 5;
1541 break;
1542
1543 default:
1544 // unknown, be conseravtive by default
1545 s_isWin98Or2k = 0;
1546 }
1547
1548 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1549 }
1550
1551 return s_isWin98Or2k == 1;
1552 }
f1339c56 1553
b1d66b54 1554 long m_CodePage;
1cd52418 1555};
e95354ec
VZ
1556
1557#endif // wxHAVE_WIN32_MB2WC
1558
f7e98dee
RN
1559// ============================================================================
1560// Cocoa conversion classes
1561// ============================================================================
1562
1563#if defined(__WXCOCOA__)
1564
// RN:  There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a bit -
// it's just not public (yet).
1568
1569#include <CoreFoundation/CFString.h>
1570#include <CoreFoundation/CFStringEncodingExt.h>
1571
1572CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 1573{
638357a0 1574 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ecd9653b
WS
1575 if ( encoding == wxFONTENCODING_DEFAULT )
1576 {
638357a0 1577 enc = CFStringGetSystemEncoding();
ecd9653b
WS
1578 }
1579 else switch( encoding)
1580 {
1581 case wxFONTENCODING_ISO8859_1 :
1582 enc = kCFStringEncodingISOLatin1 ;
1583 break ;
1584 case wxFONTENCODING_ISO8859_2 :
1585 enc = kCFStringEncodingISOLatin2;
1586 break ;
1587 case wxFONTENCODING_ISO8859_3 :
1588 enc = kCFStringEncodingISOLatin3 ;
1589 break ;
1590 case wxFONTENCODING_ISO8859_4 :
1591 enc = kCFStringEncodingISOLatin4;
1592 break ;
1593 case wxFONTENCODING_ISO8859_5 :
1594 enc = kCFStringEncodingISOLatinCyrillic;
1595 break ;
1596 case wxFONTENCODING_ISO8859_6 :
1597 enc = kCFStringEncodingISOLatinArabic;
1598 break ;
1599 case wxFONTENCODING_ISO8859_7 :
1600 enc = kCFStringEncodingISOLatinGreek;
1601 break ;
1602 case wxFONTENCODING_ISO8859_8 :
1603 enc = kCFStringEncodingISOLatinHebrew;
1604 break ;
1605 case wxFONTENCODING_ISO8859_9 :
1606 enc = kCFStringEncodingISOLatin5;
1607 break ;
1608 case wxFONTENCODING_ISO8859_10 :
1609 enc = kCFStringEncodingISOLatin6;
1610 break ;
1611 case wxFONTENCODING_ISO8859_11 :
1612 enc = kCFStringEncodingISOLatinThai;
1613 break ;
1614 case wxFONTENCODING_ISO8859_13 :
1615 enc = kCFStringEncodingISOLatin7;
1616 break ;
1617 case wxFONTENCODING_ISO8859_14 :
1618 enc = kCFStringEncodingISOLatin8;
1619 break ;
1620 case wxFONTENCODING_ISO8859_15 :
1621 enc = kCFStringEncodingISOLatin9;
1622 break ;
1623
1624 case wxFONTENCODING_KOI8 :
1625 enc = kCFStringEncodingKOI8_R;
1626 break ;
1627 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1628 enc = kCFStringEncodingDOSRussian;
1629 break ;
1630
1631// case wxFONTENCODING_BULGARIAN :
1632// enc = ;
1633// break ;
1634
1635 case wxFONTENCODING_CP437 :
1636 enc =kCFStringEncodingDOSLatinUS ;
1637 break ;
1638 case wxFONTENCODING_CP850 :
1639 enc = kCFStringEncodingDOSLatin1;
1640 break ;
1641 case wxFONTENCODING_CP852 :
1642 enc = kCFStringEncodingDOSLatin2;
1643 break ;
1644 case wxFONTENCODING_CP855 :
1645 enc = kCFStringEncodingDOSCyrillic;
1646 break ;
1647 case wxFONTENCODING_CP866 :
1648 enc =kCFStringEncodingDOSRussian ;
1649 break ;
1650 case wxFONTENCODING_CP874 :
1651 enc = kCFStringEncodingDOSThai;
1652 break ;
1653 case wxFONTENCODING_CP932 :
1654 enc = kCFStringEncodingDOSJapanese;
1655 break ;
1656 case wxFONTENCODING_CP936 :
1657 enc =kCFStringEncodingDOSChineseSimplif ;
1658 break ;
1659 case wxFONTENCODING_CP949 :
1660 enc = kCFStringEncodingDOSKorean;
1661 break ;
1662 case wxFONTENCODING_CP950 :
1663 enc = kCFStringEncodingDOSChineseTrad;
1664 break ;
ecd9653b
WS
1665 case wxFONTENCODING_CP1250 :
1666 enc = kCFStringEncodingWindowsLatin2;
1667 break ;
1668 case wxFONTENCODING_CP1251 :
1669 enc =kCFStringEncodingWindowsCyrillic ;
1670 break ;
1671 case wxFONTENCODING_CP1252 :
1672 enc =kCFStringEncodingWindowsLatin1 ;
1673 break ;
1674 case wxFONTENCODING_CP1253 :
1675 enc = kCFStringEncodingWindowsGreek;
1676 break ;
1677 case wxFONTENCODING_CP1254 :
1678 enc = kCFStringEncodingWindowsLatin5;
1679 break ;
1680 case wxFONTENCODING_CP1255 :
1681 enc =kCFStringEncodingWindowsHebrew ;
1682 break ;
1683 case wxFONTENCODING_CP1256 :
1684 enc =kCFStringEncodingWindowsArabic ;
1685 break ;
1686 case wxFONTENCODING_CP1257 :
1687 enc = kCFStringEncodingWindowsBalticRim;
1688 break ;
638357a0
RN
1689// This only really encodes to UTF7 (if that) evidently
1690// case wxFONTENCODING_UTF7 :
1691// enc = kCFStringEncodingNonLossyASCII ;
1692// break ;
ecd9653b
WS
1693 case wxFONTENCODING_UTF8 :
1694 enc = kCFStringEncodingUTF8 ;
1695 break ;
1696 case wxFONTENCODING_EUC_JP :
1697 enc = kCFStringEncodingEUC_JP;
1698 break ;
1699 case wxFONTENCODING_UTF16 :
f7e98dee 1700 enc = kCFStringEncodingUnicode ;
ecd9653b 1701 break ;
f7e98dee
RN
1702 case wxFONTENCODING_MACROMAN :
1703 enc = kCFStringEncodingMacRoman ;
1704 break ;
1705 case wxFONTENCODING_MACJAPANESE :
1706 enc = kCFStringEncodingMacJapanese ;
1707 break ;
1708 case wxFONTENCODING_MACCHINESETRAD :
1709 enc = kCFStringEncodingMacChineseTrad ;
1710 break ;
1711 case wxFONTENCODING_MACKOREAN :
1712 enc = kCFStringEncodingMacKorean ;
1713 break ;
1714 case wxFONTENCODING_MACARABIC :
1715 enc = kCFStringEncodingMacArabic ;
1716 break ;
1717 case wxFONTENCODING_MACHEBREW :
1718 enc = kCFStringEncodingMacHebrew ;
1719 break ;
1720 case wxFONTENCODING_MACGREEK :
1721 enc = kCFStringEncodingMacGreek ;
1722 break ;
1723 case wxFONTENCODING_MACCYRILLIC :
1724 enc = kCFStringEncodingMacCyrillic ;
1725 break ;
1726 case wxFONTENCODING_MACDEVANAGARI :
1727 enc = kCFStringEncodingMacDevanagari ;
1728 break ;
1729 case wxFONTENCODING_MACGURMUKHI :
1730 enc = kCFStringEncodingMacGurmukhi ;
1731 break ;
1732 case wxFONTENCODING_MACGUJARATI :
1733 enc = kCFStringEncodingMacGujarati ;
1734 break ;
1735 case wxFONTENCODING_MACORIYA :
1736 enc = kCFStringEncodingMacOriya ;
1737 break ;
1738 case wxFONTENCODING_MACBENGALI :
1739 enc = kCFStringEncodingMacBengali ;
1740 break ;
1741 case wxFONTENCODING_MACTAMIL :
1742 enc = kCFStringEncodingMacTamil ;
1743 break ;
1744 case wxFONTENCODING_MACTELUGU :
1745 enc = kCFStringEncodingMacTelugu ;
1746 break ;
1747 case wxFONTENCODING_MACKANNADA :
1748 enc = kCFStringEncodingMacKannada ;
1749 break ;
1750 case wxFONTENCODING_MACMALAJALAM :
1751 enc = kCFStringEncodingMacMalayalam ;
1752 break ;
1753 case wxFONTENCODING_MACSINHALESE :
1754 enc = kCFStringEncodingMacSinhalese ;
1755 break ;
1756 case wxFONTENCODING_MACBURMESE :
1757 enc = kCFStringEncodingMacBurmese ;
1758 break ;
1759 case wxFONTENCODING_MACKHMER :
1760 enc = kCFStringEncodingMacKhmer ;
1761 break ;
1762 case wxFONTENCODING_MACTHAI :
1763 enc = kCFStringEncodingMacThai ;
1764 break ;
1765 case wxFONTENCODING_MACLAOTIAN :
1766 enc = kCFStringEncodingMacLaotian ;
1767 break ;
1768 case wxFONTENCODING_MACGEORGIAN :
1769 enc = kCFStringEncodingMacGeorgian ;
1770 break ;
1771 case wxFONTENCODING_MACARMENIAN :
1772 enc = kCFStringEncodingMacArmenian ;
1773 break ;
1774 case wxFONTENCODING_MACCHINESESIMP :
1775 enc = kCFStringEncodingMacChineseSimp ;
1776 break ;
1777 case wxFONTENCODING_MACTIBETAN :
1778 enc = kCFStringEncodingMacTibetan ;
1779 break ;
1780 case wxFONTENCODING_MACMONGOLIAN :
1781 enc = kCFStringEncodingMacMongolian ;
1782 break ;
1783 case wxFONTENCODING_MACETHIOPIC :
1784 enc = kCFStringEncodingMacEthiopic ;
1785 break ;
1786 case wxFONTENCODING_MACCENTRALEUR :
1787 enc = kCFStringEncodingMacCentralEurRoman ;
1788 break ;
1789 case wxFONTENCODING_MACVIATNAMESE :
1790 enc = kCFStringEncodingMacVietnamese ;
1791 break ;
1792 case wxFONTENCODING_MACARABICEXT :
1793 enc = kCFStringEncodingMacExtArabic ;
1794 break ;
1795 case wxFONTENCODING_MACSYMBOL :
1796 enc = kCFStringEncodingMacSymbol ;
1797 break ;
1798 case wxFONTENCODING_MACDINGBATS :
1799 enc = kCFStringEncodingMacDingbats ;
1800 break ;
1801 case wxFONTENCODING_MACTURKISH :
1802 enc = kCFStringEncodingMacTurkish ;
1803 break ;
1804 case wxFONTENCODING_MACCROATIAN :
1805 enc = kCFStringEncodingMacCroatian ;
1806 break ;
1807 case wxFONTENCODING_MACICELANDIC :
1808 enc = kCFStringEncodingMacIcelandic ;
1809 break ;
1810 case wxFONTENCODING_MACROMANIAN :
1811 enc = kCFStringEncodingMacRomanian ;
1812 break ;
1813 case wxFONTENCODING_MACCELTIC :
1814 enc = kCFStringEncodingMacCeltic ;
1815 break ;
1816 case wxFONTENCODING_MACGAELIC :
1817 enc = kCFStringEncodingMacGaelic ;
1818 break ;
ecd9653b
WS
1819// case wxFONTENCODING_MACKEYBOARD :
1820// enc = kCFStringEncodingMacKeyboardGlyphs ;
1821// break ;
1822 default :
1823 // because gcc is picky
1824 break ;
1825 } ;
1826 return enc ;
f7e98dee
RN
1827}
1828
f7e98dee
RN
1829class wxMBConv_cocoa : public wxMBConv
1830{
1831public:
1832 wxMBConv_cocoa()
1833 {
1834 Init(CFStringGetSystemEncoding()) ;
1835 }
1836
1837 wxMBConv_cocoa(const wxChar* name)
1838 {
1839 Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
1840 }
1841
1842 wxMBConv_cocoa(wxFontEncoding encoding)
1843 {
1844 Init( wxCFStringEncFromFontEnc(encoding) );
1845 }
1846
1847 ~wxMBConv_cocoa()
1848 {
1849 }
1850
1851 void Init( CFStringEncoding encoding)
1852 {
638357a0 1853 m_encoding = encoding ;
f7e98dee
RN
1854 }
1855
1856 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1857 {
1858 wxASSERT(szUnConv);
ecd9653b 1859
638357a0
RN
1860 CFStringRef theString = CFStringCreateWithBytes (
1861 NULL, //the allocator
1862 (const UInt8*)szUnConv,
1863 strlen(szUnConv),
1864 m_encoding,
1865 false //no BOM/external representation
f7e98dee
RN
1866 );
1867
1868 wxASSERT(theString);
1869
638357a0
RN
1870 size_t nOutLength = CFStringGetLength(theString);
1871
1872 if (szOut == NULL)
f7e98dee 1873 {
f7e98dee 1874 CFRelease(theString);
638357a0 1875 return nOutLength;
f7e98dee 1876 }
ecd9653b 1877
638357a0 1878 CFRange theRange = { 0, nOutSize };
ecd9653b 1879
638357a0
RN
1880#if SIZEOF_WCHAR_T == 4
1881 UniChar* szUniCharBuffer = new UniChar[nOutSize];
1882#endif
1883
f7e98dee 1884 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
638357a0 1885
f7e98dee 1886 CFRelease(theString);
ecd9653b 1887
638357a0 1888 szUniCharBuffer[nOutLength] = '\0' ;
f7e98dee
RN
1889
1890#if SIZEOF_WCHAR_T == 4
1891 wxMBConvUTF16 converter ;
638357a0 1892 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
f7e98dee
RN
1893 delete[] szUniCharBuffer;
1894#endif
638357a0
RN
1895
1896 return nOutLength;
f7e98dee
RN
1897 }
1898
1899 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1900 {
638357a0
RN
1901 wxASSERT(szUnConv);
1902
f7e98dee 1903 size_t nRealOutSize;
638357a0 1904 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 1905 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 1906
f7e98dee
RN
1907#if SIZEOF_WCHAR_T == 4
1908 wxMBConvUTF16BE converter ;
1909 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
1910 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
1911 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
1912 nBufSize /= sizeof(UniChar);
f7e98dee
RN
1913#endif
1914
1915 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
1916 NULL, //allocator
1917 szUniBuffer,
1918 nBufSize,
638357a0 1919 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 1920 );
ecd9653b 1921
f7e98dee 1922 wxASSERT(theString);
ecd9653b 1923
f7e98dee 1924 //Note that CER puts a BOM when converting to unicode
638357a0
RN
1925 //so we check and use getchars instead in that case
1926 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 1927 {
638357a0
RN
1928 if (szOut != NULL)
1929 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
1930
1931 nRealOutSize = CFStringGetLength(theString) + 1;
1932 }
1933 else
1934 {
1935 CFStringGetBytes(
1936 theString,
1937 CFRangeMake(0, CFStringGetLength(theString)),
1938 m_encoding,
1939 0, //what to put in characters that can't be converted -
1940 //0 tells CFString to return NULL if it meets such a character
1941 false, //not an external representation
1942 (UInt8*) szOut,
1943 nOutSize,
1944 (CFIndex*) &nRealOutSize
1945 );
f7e98dee 1946 }
ecd9653b 1947
638357a0 1948 CFRelease(theString);
ecd9653b 1949
638357a0
RN
1950#if SIZEOF_WCHAR_T == 4
1951 delete[] szUniBuffer;
1952#endif
ecd9653b 1953
f7e98dee
RN
1954 return nRealOutSize - 1;
1955 }
1956
1957 bool IsOk() const
ecd9653b 1958 {
638357a0
RN
1959 return m_encoding != kCFStringEncodingInvalidId &&
1960 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
1961 }
1962
1963private:
638357a0 1964 CFStringEncoding m_encoding ;
f7e98dee
RN
1965};
1966
1967#endif // defined(__WXCOCOA__)
1968
335d31e0
SC
1969// ============================================================================
1970// Mac conversion classes
1971// ============================================================================
1972
1973#if defined(__WXMAC__) && defined(TARGET_CARBON)
1974
1975class wxMBConv_mac : public wxMBConv
1976{
1977public:
1978 wxMBConv_mac()
1979 {
1980 Init(CFStringGetSystemEncoding()) ;
1981 }
1982
1983 wxMBConv_mac(const wxChar* name)
1984 {
d775fa82 1985 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
335d31e0
SC
1986 }
1987
1988 wxMBConv_mac(wxFontEncoding encoding)
1989 {
d775fa82
WS
1990 Init( wxMacGetSystemEncFromFontEnc(encoding) );
1991 }
1992
1993 ~wxMBConv_mac()
1994 {
1995 OSStatus status = noErr ;
1996 status = TECDisposeConverter(m_MB2WC_converter);
1997 status = TECDisposeConverter(m_WC2MB_converter);
1998 }
1999
2000
2001 void Init( TextEncodingBase encoding)
2002 {
2003 OSStatus status = noErr ;
2004 m_char_encoding = encoding ;
2005 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2006
2007 status = TECCreateConverter(&m_MB2WC_converter,
2008 m_char_encoding,
2009 m_unicode_encoding);
2010 status = TECCreateConverter(&m_WC2MB_converter,
2011 m_unicode_encoding,
2012 m_char_encoding);
2013 }
2014
335d31e0
SC
2015 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2016 {
d775fa82
WS
2017 OSStatus status = noErr ;
2018 ByteCount byteOutLen ;
2019 ByteCount byteInLen = strlen(psz) ;
2020 wchar_t *tbuf = NULL ;
2021 UniChar* ubuf = NULL ;
2022 size_t res = 0 ;
2023
2024 if (buf == NULL)
2025 {
638357a0 2026 //apple specs say at least 32
c543817b 2027 n = wxMax( 32 , byteInLen ) ;
d775fa82
WS
2028 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2029 }
2030 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
f3a355ce 2031#if SIZEOF_WCHAR_T == 4
d775fa82 2032 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2033#else
d775fa82 2034 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2035#endif
d775fa82
WS
2036 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2037 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
f3a355ce 2038#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2039 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2040 // is not properly terminated we get random characters at the end
2041 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d775fa82
WS
2042 wxMBConvUTF16BE converter ;
2043 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2044 free( ubuf ) ;
f3a355ce 2045#else
d775fa82 2046 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2047#endif
d775fa82
WS
2048 if ( buf == NULL )
2049 free(tbuf) ;
335d31e0 2050
335d31e0
SC
2051 if ( buf && res < n)
2052 buf[res] = 0;
2053
d775fa82 2054 return res ;
335d31e0
SC
2055 }
2056
2057 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2058 {
2059 OSStatus status = noErr ;
2060 ByteCount byteOutLen ;
2061 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2062
2063 char *tbuf = NULL ;
2064
2065 if (buf == NULL)
2066 {
638357a0 2067 //apple specs say at least 32
c543817b 2068 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2069 tbuf = (char*) malloc( n ) ;
2070 }
2071
2072 ByteCount byteBufferLen = n ;
2073 UniChar* ubuf = NULL ;
f3a355ce 2074#if SIZEOF_WCHAR_T == 4
d775fa82
WS
2075 wxMBConvUTF16BE converter ;
2076 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2077 byteInLen = unicharlen ;
2078 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2079 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
f3a355ce 2080#else
d775fa82 2081 ubuf = (UniChar*) psz ;
f3a355ce 2082#endif
d775fa82
WS
2083 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2084 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
f3a355ce 2085#if SIZEOF_WCHAR_T == 4
d775fa82 2086 free( ubuf ) ;
f3a355ce 2087#endif
d775fa82
WS
2088 if ( buf == NULL )
2089 free(tbuf) ;
335d31e0 2090
d775fa82 2091 size_t res = byteOutLen ;
335d31e0 2092 if ( buf && res < n)
638357a0 2093 {
335d31e0 2094 buf[res] = 0;
638357a0
RN
2095
2096 //we need to double-trip to verify it didn't insert any ? in place
2097 //of bogus characters
2098 wxWCharBuffer wcBuf(n);
2099 size_t pszlen = wxWcslen(psz);
2100 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2101 wxWcslen(wcBuf) != pszlen ||
2102 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2103 {
2104 // we didn't obtain the same thing we started from, hence
2105 // the conversion was lossy and we consider that it failed
2106 return (size_t)-1;
2107 }
2108 }
335d31e0 2109
d775fa82 2110 return res ;
335d31e0
SC
2111 }
2112
2113 bool IsOk() const
2114 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2115
2116private:
d775fa82
WS
2117 TECObjectRef m_MB2WC_converter ;
2118 TECObjectRef m_WC2MB_converter ;
2119
2120 TextEncodingBase m_char_encoding ;
2121 TextEncodingBase m_unicode_encoding ;
335d31e0
SC
2122};
2123
2124#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2125
36acb880
VZ
2126// ============================================================================
2127// wxEncodingConverter based conversion classes
2128// ============================================================================
2129
1e6feb95 2130#if wxUSE_FONTMAP
1cd52418 2131
e95354ec 2132class wxMBConv_wxwin : public wxMBConv
1cd52418 2133{
8b04d4c4
VZ
2134private:
2135 void Init()
2136 {
2137 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2138 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2139 }
2140
6001e347 2141public:
f1339c56
RR
2142 // temporarily just use wxEncodingConverter stuff,
2143 // so that it works while a better implementation is built
e95354ec 2144 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2145 {
2146 if (name)
e95354ec 2147 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2148 else
2149 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2150
8b04d4c4
VZ
2151 Init();
2152 }
2153
e95354ec 2154 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2155 {
2156 m_enc = enc;
2157
2158 Init();
f1339c56 2159 }
dccce9ea 2160
bde4baac 2161 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2162 {
2163 size_t inbuf = strlen(psz);
dccce9ea 2164 if (buf)
4def3b35 2165 m2w.Convert(psz,buf);
f1339c56
RR
2166 return inbuf;
2167 }
dccce9ea 2168
bde4baac 2169 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2170 {
f8d791e0 2171 const size_t inbuf = wxWcslen(psz);
f1339c56
RR
2172 if (buf)
2173 w2m.Convert(psz,buf);
dccce9ea 2174
f1339c56
RR
2175 return inbuf;
2176 }
dccce9ea 2177
e95354ec 2178 bool IsOk() const { return m_ok; }
f1339c56
RR
2179
2180public:
8b04d4c4 2181 wxFontEncoding m_enc;
f1339c56 2182 wxEncodingConverter m2w, w2m;
cafbf6fb
VZ
2183
2184 // were we initialized successfully?
2185 bool m_ok;
fc7a2a60 2186
e95354ec 2187 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2188};
6001e347 2189
1e6feb95
VZ
2190#endif // wxUSE_FONTMAP
2191
36acb880
VZ
2192// ============================================================================
2193// wxCSConv implementation
2194// ============================================================================
2195
8b04d4c4 2196void wxCSConv::Init()
6001e347 2197{
e95354ec
VZ
2198 m_name = NULL;
2199 m_convReal = NULL;
2200 m_deferred = true;
2201}
2202
8b04d4c4
VZ
2203wxCSConv::wxCSConv(const wxChar *charset)
2204{
2205 Init();
82713003 2206
e95354ec
VZ
2207 if ( charset )
2208 {
e95354ec
VZ
2209 SetName(charset);
2210 }
bda3d86a
VZ
2211
2212 m_encoding = wxFONTENCODING_SYSTEM;
6001e347
RR
2213}
2214
8b04d4c4
VZ
2215wxCSConv::wxCSConv(wxFontEncoding encoding)
2216{
bda3d86a 2217 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2218 {
2219 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2220
2221 encoding = wxFONTENCODING_SYSTEM;
2222 }
2223
8b04d4c4
VZ
2224 Init();
2225
bda3d86a 2226 m_encoding = encoding;
8b04d4c4
VZ
2227}
2228
6001e347
RR
2229wxCSConv::~wxCSConv()
2230{
65e50848
JS
2231 Clear();
2232}
2233
54380f29 2234wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2235 : wxMBConv()
54380f29 2236{
8b04d4c4
VZ
2237 Init();
2238
54380f29 2239 SetName(conv.m_name);
8b04d4c4 2240 m_encoding = conv.m_encoding;
54380f29
GD
2241}
2242
2243wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2244{
2245 Clear();
8b04d4c4 2246
54380f29 2247 SetName(conv.m_name);
8b04d4c4
VZ
2248 m_encoding = conv.m_encoding;
2249
54380f29
GD
2250 return *this;
2251}
2252
65e50848
JS
2253void wxCSConv::Clear()
2254{
8b04d4c4 2255 free(m_name);
e95354ec 2256 delete m_convReal;
8b04d4c4 2257
65e50848 2258 m_name = NULL;
e95354ec 2259 m_convReal = NULL;
6001e347
RR
2260}
2261
2262void wxCSConv::SetName(const wxChar *charset)
2263{
f1339c56
RR
2264 if (charset)
2265 {
2266 m_name = wxStrdup(charset);
e95354ec 2267 m_deferred = true;
f1339c56 2268 }
6001e347
RR
2269}
2270
e95354ec
VZ
2271wxMBConv *wxCSConv::DoCreate() const
2272{
c547282d
VZ
2273 // check for the special case of ASCII or ISO8859-1 charset: as we have
2274 // special knowledge of it anyhow, we don't need to create a special
2275 // conversion object
2276 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
f1339c56 2277 {
e95354ec
VZ
2278 // don't convert at all
2279 return NULL;
2280 }
dccce9ea 2281
e95354ec
VZ
2282 // we trust OS to do conversion better than we can so try external
2283 // conversion methods first
2284 //
2285 // the full order is:
2286 // 1. OS conversion (iconv() under Unix or Win32 API)
2287 // 2. hard coded conversions for UTF
2288 // 3. wxEncodingConverter as fall back
2289
2290 // step (1)
2291#ifdef HAVE_ICONV
c547282d 2292#if !wxUSE_FONTMAP
e95354ec 2293 if ( m_name )
c547282d 2294#endif // !wxUSE_FONTMAP
e95354ec 2295 {
c547282d
VZ
2296 wxString name(m_name);
2297
2298#if wxUSE_FONTMAP
2299 if ( name.empty() )
2300 name = wxFontMapper::Get()->GetEncodingName(m_encoding);
2301#endif // wxUSE_FONTMAP
2302
2303 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
e95354ec
VZ
2304 if ( conv->IsOk() )
2305 return conv;
2306
2307 delete conv;
2308 }
2309#endif // HAVE_ICONV
2310
2311#ifdef wxHAVE_WIN32_MB2WC
2312 {
7608a683 2313#if wxUSE_FONTMAP
e95354ec
VZ
2314 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2315 : new wxMBConv_win32(m_encoding);
2316 if ( conv->IsOk() )
2317 return conv;
2318
2319 delete conv;
7608a683
WS
2320#else
2321 return NULL;
2322#endif
e95354ec
VZ
2323 }
2324#endif // wxHAVE_WIN32_MB2WC
d775fa82
WS
2325#if defined(__WXMAC__)
2326 {
2327 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
2328 {
2329
2330 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2331 : new wxMBConv_mac(m_encoding);
2332 if ( conv->IsOk() )
f7e98dee
RN
2333 return conv;
2334
2335 delete conv;
2336 }
2337 }
2338#endif
2339#if defined(__WXCOCOA__)
2340 {
2341 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2342 {
2343
2344 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2345 : new wxMBConv_cocoa(m_encoding);
2346 if ( conv->IsOk() )
d775fa82
WS
2347 return conv;
2348
2349 delete conv;
2350 }
335d31e0
SC
2351 }
2352#endif
e95354ec
VZ
2353 // step (2)
2354 wxFontEncoding enc = m_encoding;
2355#if wxUSE_FONTMAP
c547282d
VZ
2356 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2357 {
2358 // use "false" to suppress interactive dialogs -- we can be called from
2359 // anywhere and popping up a dialog from here is the last thing we want to
2360 // do
2361 enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
2362 }
e95354ec
VZ
2363#endif // wxUSE_FONTMAP
2364
2365 switch ( enc )
2366 {
2367 case wxFONTENCODING_UTF7:
2368 return new wxMBConvUTF7;
2369
2370 case wxFONTENCODING_UTF8:
2371 return new wxMBConvUTF8;
2372
e95354ec
VZ
2373 case wxFONTENCODING_UTF16BE:
2374 return new wxMBConvUTF16BE;
2375
2376 case wxFONTENCODING_UTF16LE:
2377 return new wxMBConvUTF16LE;
2378
e95354ec
VZ
2379 case wxFONTENCODING_UTF32BE:
2380 return new wxMBConvUTF32BE;
2381
2382 case wxFONTENCODING_UTF32LE:
2383 return new wxMBConvUTF32LE;
2384
2385 default:
2386 // nothing to do but put here to suppress gcc warnings
2387 ;
2388 }
2389
2390 // step (3)
2391#if wxUSE_FONTMAP
2392 {
2393 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2394 : new wxMBConv_wxwin(m_encoding);
2395 if ( conv->IsOk() )
2396 return conv;
2397
2398 delete conv;
2399 }
2400#endif // wxUSE_FONTMAP
2401
a58d4f4d
VS
2402 // NB: This is a hack to prevent deadlock. What could otherwise happen
2403 // in Unicode build: wxConvLocal creation ends up being here
2404 // because of some failure and logs the error. But wxLog will try to
2405 // attach timestamp, for which it will need wxConvLocal (to convert
2406 // time to char* and then wchar_t*), but that fails, tries to log
2407 // error, but wxLog has a (already locked) critical section that
2408 // guards static buffer.
2409 static bool alreadyLoggingError = false;
2410 if (!alreadyLoggingError)
2411 {
2412 alreadyLoggingError = true;
2413 wxLogError(_("Cannot convert from the charset '%s'!"),
2414 m_name ? m_name
e95354ec
VZ
2415 :
2416#if wxUSE_FONTMAP
2417 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
2418#else // !wxUSE_FONTMAP
2419 wxString::Format(_("encoding %s"), m_encoding).c_str()
2420#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2421 );
a58d4f4d
VS
2422 alreadyLoggingError = false;
2423 }
e95354ec
VZ
2424
2425 return NULL;
2426}
2427
2428void wxCSConv::CreateConvIfNeeded() const
2429{
2430 if ( m_deferred )
2431 {
2432 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
2433
2434#if wxUSE_INTL
2435 // if we don't have neither the name nor the encoding, use the default
2436 // encoding for this system
2437 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2438 {
4d312c22 2439 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
2440 }
2441#endif // wxUSE_INTL
2442
e95354ec
VZ
2443 self->m_convReal = DoCreate();
2444 self->m_deferred = false;
6001e347 2445 }
6001e347
RR
2446}
2447
2448size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2449{
e95354ec 2450 CreateConvIfNeeded();
dccce9ea 2451
e95354ec
VZ
2452 if (m_convReal)
2453 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
2454
2455 // latin-1 (direct)
4def3b35 2456 size_t len = strlen(psz);
dccce9ea 2457
f1339c56
RR
2458 if (buf)
2459 {
4def3b35 2460 for (size_t c = 0; c <= len; c++)
f1339c56
RR
2461 buf[c] = (unsigned char)(psz[c]);
2462 }
dccce9ea 2463
f1339c56 2464 return len;
6001e347
RR
2465}
2466
2467size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2468{
e95354ec 2469 CreateConvIfNeeded();
dccce9ea 2470
e95354ec
VZ
2471 if (m_convReal)
2472 return m_convReal->WC2MB(buf, psz, n);
1cd52418 2473
f1339c56 2474 // latin-1 (direct)
f8d791e0 2475 const size_t len = wxWcslen(psz);
f1339c56
RR
2476 if (buf)
2477 {
4def3b35 2478 for (size_t c = 0; c <= len; c++)
24642831
VS
2479 {
2480 if (psz[c] > 0xFF)
2481 return (size_t)-1;
907173e5 2482 buf[c] = (char)psz[c];
24642831
VS
2483 }
2484 }
2485 else
2486 {
2487 for (size_t c = 0; c <= len; c++)
2488 {
2489 if (psz[c] > 0xFF)
2490 return (size_t)-1;
2491 }
f1339c56 2492 }
dccce9ea 2493
f1339c56 2494 return len;
6001e347
RR
2495}
2496
bde4baac
VZ
2497// ----------------------------------------------------------------------------
2498// globals
2499// ----------------------------------------------------------------------------
2500
2501#ifdef __WINDOWS__
2502 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
2503#elif defined(__WXMAC__) && !defined(__MACH__)
2504 static wxMBConv_mac wxConvLibcObj ;
bde4baac 2505#else
dcc8fac0 2506 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
2507#endif
2508
2509static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2510static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2511static wxMBConvUTF7 wxConvUTF7Obj;
2512static wxMBConvUTF8 wxConvUTF8Obj;
2513
2514
2515WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2516WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2517WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2518WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2519WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2520WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2521
2522#else // !wxUSE_WCHAR_T
2523
2524// stand-ins in absence of wchar_t
2525WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2526 wxConvISO8859_1,
2527 wxConvLocal,
2528 wxConvUTF8;
2529
2530#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
6001e347
RR
2531
2532