]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
We need a char, not wxChar.
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347
RR
1/////////////////////////////////////////////////////////////////////////////
2// Name: strconv.cpp
3// Purpose: Unicode conversion classes
15f2ee32
RN
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6001e347
RR
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
e95354ec
VZ
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
15f2ee32 11// (c) 2004 Ryan Norton, Fredrik Roubert
65571936 12// Licence: wxWindows licence
6001e347
RR
13/////////////////////////////////////////////////////////////////////////////
14
f6bcfd97
BP
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
14f355c2 23#if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
6001e347
RR
24 #pragma implementation "strconv.h"
25#endif
26
27// For compilers that support precompilation, includes "wx.h".
28#include "wx/wxprec.h"
29
30#ifdef __BORLANDC__
31 #pragma hdrstop
32#endif
33
373658eb
VZ
34#ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37#endif // WX_PRECOMP
38
bde4baac
VZ
39#include "wx/strconv.h"
40
41#if wxUSE_WCHAR_T
42
0a1c1e62 43#ifdef __WXMSW__
373658eb 44 #include "wx/msw/private.h"
7608a683
WS
45#endif
46
47#ifdef __WINDOWS__
13dd924a 48 #include "wx/msw/missing.h"
0a1c1e62
GRG
49#endif
50
1c193821 51#ifndef __WXWINCE__
1cd52418 52#include <errno.h>
1c193821
JS
53#endif
54
6001e347
RR
55#include <ctype.h>
56#include <string.h>
57#include <stdlib.h>
58
e95354ec
VZ
59#if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61#endif // __WIN32__ but !__WXMICROWIN__
62
373658eb
VZ
63// ----------------------------------------------------------------------------
64// headers
65// ----------------------------------------------------------------------------
7af284fd 66
6001e347 67#ifdef __SALFORDC__
373658eb 68 #include <clib.h>
6001e347
RR
69#endif
70
b040e242 71#ifdef HAVE_ICONV
373658eb 72 #include <iconv.h>
1cd52418 73#endif
1cd52418 74
373658eb
VZ
75#include "wx/encconv.h"
76#include "wx/fontmap.h"
7608a683 77#include "wx/utils.h"
373658eb 78
335d31e0 79#ifdef __WXMAC__
4227afa4
SC
80#include <ATSUnicode.h>
81#include <TextCommon.h>
82#include <TextEncodingConverter.h>
335d31e0
SC
83
84#include "wx/mac/private.h" // includes mac headers
85#endif
373658eb
VZ
86// ----------------------------------------------------------------------------
87// macros
88// ----------------------------------------------------------------------------
3e61dfb0 89
1cd52418 90#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
3a0d76bc 91#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
1cd52418
OK
92
93#if SIZEOF_WCHAR_T == 4
3a0d76bc
VS
94 #define WC_NAME "UCS4"
95 #define WC_BSWAP BSWAP_UCS4
96 #ifdef WORDS_BIGENDIAN
97 #define WC_NAME_BEST "UCS-4BE"
98 #else
99 #define WC_NAME_BEST "UCS-4LE"
100 #endif
1cd52418 101#elif SIZEOF_WCHAR_T == 2
3a0d76bc
VS
102 #define WC_NAME "UTF16"
103 #define WC_BSWAP BSWAP_UTF16
a3f2769e 104 #define WC_UTF16
3a0d76bc
VS
105 #ifdef WORDS_BIGENDIAN
106 #define WC_NAME_BEST "UTF-16BE"
107 #else
108 #define WC_NAME_BEST "UTF-16LE"
109 #endif
bab1e722 110#else // sizeof(wchar_t) != 2 nor 4
bde4baac
VZ
111 // does this ever happen?
112 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1cd52418
OK
113#endif
114
373658eb
VZ
115// ============================================================================
116// implementation
117// ============================================================================
118
119// ----------------------------------------------------------------------------
c91830cb 120// UTF-16 en/decoding to/from UCS-4
373658eb 121// ----------------------------------------------------------------------------
6001e347 122
b0a6bb75 123
c91830cb 124static size_t encode_utf16(wxUint32 input, wxUint16 *output)
1cd52418 125{
dccce9ea 126 if (input<=0xffff)
4def3b35 127 {
999836aa
VZ
128 if (output)
129 *output = (wxUint16) input;
4def3b35 130 return 1;
dccce9ea
VZ
131 }
132 else if (input>=0x110000)
4def3b35
VS
133 {
134 return (size_t)-1;
dccce9ea
VZ
135 }
136 else
4def3b35 137 {
dccce9ea 138 if (output)
4def3b35 139 {
c91830cb 140 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
999836aa 141 *output = (wxUint16) ((input&0x3ff)+0xdc00);
4def3b35
VS
142 }
143 return 2;
1cd52418 144 }
1cd52418
OK
145}
146
c91830cb 147static size_t decode_utf16(const wxUint16* input, wxUint32& output)
1cd52418 148{
dccce9ea 149 if ((*input<0xd800) || (*input>0xdfff))
4def3b35
VS
150 {
151 output = *input;
152 return 1;
dccce9ea
VZ
153 }
154 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
4def3b35
VS
155 {
156 output = *input;
157 return (size_t)-1;
dccce9ea
VZ
158 }
159 else
4def3b35
VS
160 {
161 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
162 return 2;
163 }
1cd52418
OK
164}
165
b0a6bb75 166
f6bcfd97 167// ----------------------------------------------------------------------------
6001e347 168// wxMBConv
f6bcfd97 169// ----------------------------------------------------------------------------
2c53a80a
WS
170
171wxMBConv::~wxMBConv()
172{
173 // nothing to do here (necessary for Darwin linking probably)
174}
6001e347 175
6001e347
RR
176const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
177{
2b5f62a0 178 if ( psz )
6001e347 179 {
2b5f62a0
VZ
180 // calculate the length of the buffer needed first
181 size_t nLen = MB2WC(NULL, psz, 0);
182 if ( nLen != (size_t)-1 )
183 {
184 // now do the actual conversion
185 wxWCharBuffer buf(nLen);
635f33ce
VS
186 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
187 if ( nLen != (size_t)-1 )
188 {
189 return buf;
190 }
2b5f62a0 191 }
f6bcfd97 192 }
2b5f62a0
VZ
193
194 wxWCharBuffer buf((wchar_t *)NULL);
195
196 return buf;
6001e347
RR
197}
198
e5cceba0 199const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
6001e347 200{
2b5f62a0
VZ
201 if ( pwz )
202 {
203 size_t nLen = WC2MB(NULL, pwz, 0);
204 if ( nLen != (size_t)-1 )
205 {
c91830cb 206 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
635f33ce
VS
207 nLen = WC2MB(buf.data(), pwz, nLen + 4);
208 if ( nLen != (size_t)-1 )
209 {
210 return buf;
211 }
2b5f62a0
VZ
212 }
213 }
214
215 wxCharBuffer buf((char *)NULL);
e5cceba0 216
e5cceba0 217 return buf;
6001e347
RR
218}
219
e4e3bbb4
RN
220size_t wxMBConv::MB2WC(wchar_t* szBuffer, const char* szString,
221 size_t outsize, size_t nStringLen) const
222{
223 const char* szEnd = szString + nStringLen + 1;
224 const char* szPos = szString;
225 const char* szStart = szPos;
226
227 size_t nActualLength = 0;
228
229 //Convert the string until the length() is reached, continuing the
230 //loop every time a null character is reached
231 while(szPos != szEnd)
232 {
233 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
234
235 //Get the length of the current (sub)string
236 size_t nLen = MB2WC(NULL, szPos, 0);
237
238 //Invalid conversion?
239 if( nLen == (size_t)-1 )
240 return nLen;
241
242 //Increase the actual length (+1 for current null character)
243 nActualLength += nLen + 1;
244
245 //Only copy data in if buffer size is big enough
246 if (szBuffer != NULL &&
247 nActualLength <= outsize)
248 {
249 //Convert the current (sub)string
250 if ( MB2WC(&szBuffer[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
251 return (size_t)-1;
252 }
253
254 //Increment to next (sub)string
255 //Note that we have to use strlen here instead of nLen
256 //here because XX2XX gives us the size of the output buffer,
257 //not neccessarly the length of the string
258 szPos += strlen(szPos) + 1;
259 }
260
261 return nActualLength - 1; //success - return actual length
262}
263
264size_t wxMBConv::WC2MB(char* szBuffer, const wchar_t* szString,
265 size_t outsize, size_t nStringLen) const
266{
267 const wchar_t* szEnd = szString + nStringLen + 1;
268 const wchar_t* szPos = szString;
269 const wchar_t* szStart = szPos;
270
271 size_t nActualLength = 0;
272
273 //Convert the string until the length() is reached, continuing the
274 //loop every time a null character is reached
275 while(szPos != szEnd)
276 {
277 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
278
279 //Get the length of the current (sub)string
280 size_t nLen = WC2MB(NULL, szPos, 0);
281
282 //Invalid conversion?
283 if( nLen == (size_t)-1 )
284 return nLen;
285
286 //Increase the actual length (+1 for current null character)
287 nActualLength += nLen + 1;
288
289 //Only copy data in if buffer size is big enough
290 if (szBuffer != NULL &&
291 nActualLength <= outsize)
292 {
293 //Convert the current (sub)string
294 if(WC2MB(&szBuffer[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
295 return (size_t)-1;
296 }
297
298 //Increment to next (sub)string
299 //Note that we have to use wxWcslen here instead of nLen
300 //here because XX2XX gives us the size of the output buffer,
301 //not neccessarly the length of the string
302 szPos += wxWcslen(szPos) + 1;
303 }
304
305 return nActualLength - 1; //success - return actual length
306}
307
6001e347 308// ----------------------------------------------------------------------------
bde4baac 309// wxMBConvLibc
6001e347
RR
310// ----------------------------------------------------------------------------
311
bde4baac
VZ
312size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
313{
314 return wxMB2WC(buf, psz, n);
315}
316
317size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
318{
319 return wxWC2MB(buf, psz, n);
320}
bde4baac 321// ----------------------------------------------------------------------------
15f2ee32 322// UTF-7
bde4baac 323// ----------------------------------------------------------------------------
6001e347 324
15f2ee32 325// Implementation (C) 2004 Fredrik Roubert
6001e347 326
15f2ee32
RN
327//
328// BASE64 decoding table
329//
// Maps a byte to its 6 bit BASE64 value; 0xff marks bytes which are not
// BASE64 digits. The table has an entry for each of the 256 byte values.
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x27
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x28 - 0x3f: '+', '/' and the digits
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
365
366size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
367{
368
369 size_t len = 0;
370
371 while (*psz && ((!buf) || (len < n)))
372 {
373 unsigned char cc = *psz++;
374 if (cc != '+')
375 {
376 // plain ASCII char
377 if (buf)
378 *buf++ = cc;
379 len++;
380 }
381 else if (*psz == '-')
382 {
383 // encoded plus sign
384 if (buf)
385 *buf++ = cc;
386 len++;
387 psz++;
388 }
389 else
390 {
391 // BASE64 encoded string
392 bool lsb;
393 unsigned char c;
394 unsigned int d, l;
395 for (lsb = false, d = 0, l = 0;
396 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
397 {
398 d <<= 6;
399 d += cc;
400 for (l += 6; l >= 8; lsb = !lsb)
401 {
402 c = (d >> (l -= 8)) % 256;
403 if (lsb)
404 {
405 if (buf)
406 *buf++ |= c;
407 len ++;
408 }
409 else
410 if (buf)
411 *buf = c << 8;
412 }
413 }
414 if (*psz == '-')
415 psz++;
416 }
417 }
418 if (buf && (len < n))
419 *buf = 0;
420 return len;
6001e347
RR
421}
422
15f2ee32
RN
423//
424// BASE64 encoding table
425//
// Maps a 6 bit value (0..63) to the corresponding BASE64 digit.
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62, 63
    '+', '/'
};
437
438//
439// UTF-7 encoding table
440//
441// 0 - Set D (directly encoded characters)
442// 1 - Set O (optional direct characters)
443// 2 - whitespace characters (optional)
444// 3 - special characters
445//
// Class of each of the 128 ASCII characters, see the legend above: only the
// characters of class 0 are copied to the output unencoded by the encoder.
static const unsigned char utf7encode[128] =
{
    // 0x00 - 0x1f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0x20 - 0x3f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    // 0x40 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    // 0x60 - 0x7f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
457
458size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t
459*psz, size_t n) const
460{
461
462
463 size_t len = 0;
464
465 while (*psz && ((!buf) || (len < n)))
466 {
467 wchar_t cc = *psz++;
468 if (cc < 0x80 && utf7encode[cc] < 1)
469 {
470 // plain ASCII char
471 if (buf)
472 *buf++ = (char)cc;
473 len++;
474 }
475#ifndef WC_UTF16
6e394fc6
RN
476 else if (((wxUint16)cc) > 0xffff)
477 {
15f2ee32
RN
478 // no surrogate pair generation (yet?)
479 return (size_t)-1;
480 }
481#endif
482 else
483 {
484 if (buf)
485 *buf++ = '+';
486 len++;
487 if (cc != '+')
488 {
489 // BASE64 encode string
490 unsigned int lsb, d, l;
491 for (d = 0, l = 0;; psz++)
492 {
493 for (lsb = 0; lsb < 2; lsb ++)
494 {
495 d <<= 8;
496 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
497
498 for (l += 8; l >= 6; )
499 {
500 l -= 6;
501 if (buf)
502 *buf++ = utf7enb64[(d >> l) % 64];
503 len++;
504 }
505 }
506 cc = *psz;
507 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
508 break;
509 }
510 if (l != 0)
511 {
512 if (buf)
513 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
514 len++;
515 }
516 }
517 if (buf)
518 *buf++ = '-';
519 len++;
520 }
521 }
522 if (buf && (len < n))
523 *buf = 0;
524 return len;
6001e347
RR
525}
526
f6bcfd97 527// ----------------------------------------------------------------------------
6001e347 528// UTF-8
f6bcfd97 529// ----------------------------------------------------------------------------
6001e347 530
dccce9ea 531static wxUint32 utf8_max[]=
4def3b35 532 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
6001e347
RR
533
534size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
535{
4def3b35
VS
536 size_t len = 0;
537
dccce9ea 538 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
539 {
540 unsigned char cc = *psz++, fc = cc;
541 unsigned cnt;
dccce9ea 542 for (cnt = 0; fc & 0x80; cnt++)
4def3b35 543 fc <<= 1;
dccce9ea 544 if (!cnt)
4def3b35
VS
545 {
546 // plain ASCII char
dccce9ea 547 if (buf)
4def3b35
VS
548 *buf++ = cc;
549 len++;
dccce9ea
VZ
550 }
551 else
4def3b35
VS
552 {
553 cnt--;
dccce9ea 554 if (!cnt)
4def3b35
VS
555 {
556 // invalid UTF-8 sequence
557 return (size_t)-1;
dccce9ea
VZ
558 }
559 else
4def3b35
VS
560 {
561 unsigned ocnt = cnt - 1;
562 wxUint32 res = cc & (0x3f >> cnt);
dccce9ea 563 while (cnt--)
4def3b35
VS
564 {
565 cc = *psz++;
dccce9ea 566 if ((cc & 0xC0) != 0x80)
4def3b35
VS
567 {
568 // invalid UTF-8 sequence
569 return (size_t)-1;
570 }
571 res = (res << 6) | (cc & 0x3f);
572 }
dccce9ea 573 if (res <= utf8_max[ocnt])
4def3b35
VS
574 {
575 // illegal UTF-8 encoding
576 return (size_t)-1;
577 }
1cd52418 578#ifdef WC_UTF16
b5153fd8
VZ
579 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
580 size_t pa = encode_utf16(res, (wxUint16 *)buf);
4def3b35
VS
581 if (pa == (size_t)-1)
582 return (size_t)-1;
dccce9ea 583 if (buf)
4def3b35
VS
584 buf += pa;
585 len += pa;
373658eb 586#else // !WC_UTF16
dccce9ea 587 if (buf)
4def3b35
VS
588 *buf++ = res;
589 len++;
373658eb 590#endif // WC_UTF16/!WC_UTF16
4def3b35
VS
591 }
592 }
6001e347 593 }
dccce9ea 594 if (buf && (len < n))
4def3b35
VS
595 *buf = 0;
596 return len;
6001e347
RR
597}
598
599size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
600{
4def3b35 601 size_t len = 0;
6001e347 602
dccce9ea 603 while (*psz && ((!buf) || (len < n)))
4def3b35
VS
604 {
605 wxUint32 cc;
1cd52418 606#ifdef WC_UTF16
b5153fd8
VZ
607 // cast is ok for WC_UTF16
608 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
4def3b35 609 psz += (pa == (size_t)-1) ? 1 : pa;
1cd52418 610#else
4def3b35
VS
611 cc=(*psz++) & 0x7fffffff;
612#endif
613 unsigned cnt;
614 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
dccce9ea 615 if (!cnt)
4def3b35
VS
616 {
617 // plain ASCII char
dccce9ea 618 if (buf)
574c939e 619 *buf++ = (char) cc;
4def3b35 620 len++;
dccce9ea
VZ
621 }
622
623 else
4def3b35
VS
624 {
625 len += cnt + 1;
dccce9ea 626 if (buf)
4def3b35 627 {
574c939e 628 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
4def3b35 629 while (cnt--)
574c939e 630 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
4def3b35
VS
631 }
632 }
6001e347 633 }
4def3b35
VS
634
635 if (buf && (len<n)) *buf = 0;
adb45366 636
4def3b35 637 return len;
6001e347
RR
638}
639
c91830cb
VZ
640
641
642
643// ----------------------------------------------------------------------------
644// UTF-16
645// ----------------------------------------------------------------------------
646
647#ifdef WORDS_BIGENDIAN
bde4baac
VZ
648 #define wxMBConvUTF16straight wxMBConvUTF16BE
649 #define wxMBConvUTF16swap wxMBConvUTF16LE
c91830cb 650#else
bde4baac
VZ
651 #define wxMBConvUTF16swap wxMBConvUTF16BE
652 #define wxMBConvUTF16straight wxMBConvUTF16LE
c91830cb
VZ
653#endif
654
655
c91830cb
VZ
656#ifdef WC_UTF16
657
c91830cb
VZ
658// copy 16bit MB to 16bit String
659size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
660{
661 size_t len=0;
662
663 while (*(wxUint16*)psz && (!buf || len < n))
664 {
665 if (buf)
666 *buf++ = *(wxUint16*)psz;
667 len++;
668
669 psz += sizeof(wxUint16);
670 }
671 if (buf && len<n) *buf=0;
672
673 return len;
674}
675
676
677// copy 16bit String to 16bit MB
678size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
679{
680 size_t len=0;
681
682 while (*psz && (!buf || len < n))
683 {
684 if (buf)
685 {
686 *(wxUint16*)buf = *psz;
687 buf += sizeof(wxUint16);
688 }
689 len += sizeof(wxUint16);
690 psz++;
691 }
692 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
693
694 return len;
695}
696
697
698// swap 16bit MB to 16bit String
699size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
700{
701 size_t len=0;
702
703 while (*(wxUint16*)psz && (!buf || len < n))
704 {
705 if (buf)
706 {
707 ((char *)buf)[0] = psz[1];
708 ((char *)buf)[1] = psz[0];
709 buf++;
710 }
711 len++;
712 psz += sizeof(wxUint16);
713 }
714 if (buf && len<n) *buf=0;
715
716 return len;
717}
718
719
720// swap 16bit String to 16bit MB
721size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
722{
723 size_t len=0;
724
725 while (*psz && (!buf || len < n))
726 {
727 if (buf)
728 {
729 *buf++ = ((char*)psz)[1];
730 *buf++ = ((char*)psz)[0];
731 }
732 len += sizeof(wxUint16);
733 psz++;
734 }
735 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
736
737 return len;
738}
739
740
741#else // WC_UTF16
742
743
744// copy 16bit MB to 32bit String
745size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
746{
747 size_t len=0;
748
749 while (*(wxUint16*)psz && (!buf || len < n))
750 {
751 wxUint32 cc;
752 size_t pa=decode_utf16((wxUint16*)psz, cc);
753 if (pa == (size_t)-1)
754 return pa;
755
756 if (buf)
757 *buf++ = cc;
758 len++;
759 psz += pa * sizeof(wxUint16);
760 }
761 if (buf && len<n) *buf=0;
762
763 return len;
764}
765
766
767// copy 32bit String to 16bit MB
768size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
769{
770 size_t len=0;
771
772 while (*psz && (!buf || len < n))
773 {
774 wxUint16 cc[2];
775 size_t pa=encode_utf16(*psz, cc);
776
777 if (pa == (size_t)-1)
778 return pa;
779
780 if (buf)
781 {
69b80d28 782 *(wxUint16*)buf = cc[0];
b5153fd8 783 buf += sizeof(wxUint16);
c91830cb 784 if (pa > 1)
69b80d28
VZ
785 {
786 *(wxUint16*)buf = cc[1];
787 buf += sizeof(wxUint16);
788 }
c91830cb
VZ
789 }
790
791 len += pa*sizeof(wxUint16);
792 psz++;
793 }
794 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
795
796 return len;
797}
798
799
800// swap 16bit MB to 32bit String
801size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
802{
803 size_t len=0;
804
805 while (*(wxUint16*)psz && (!buf || len < n))
806 {
807 wxUint32 cc;
808 char tmp[4];
809 tmp[0]=psz[1]; tmp[1]=psz[0];
810 tmp[2]=psz[3]; tmp[3]=psz[2];
811
812 size_t pa=decode_utf16((wxUint16*)tmp, cc);
813 if (pa == (size_t)-1)
814 return pa;
815
816 if (buf)
817 *buf++ = cc;
818
819 len++;
820 psz += pa * sizeof(wxUint16);
821 }
822 if (buf && len<n) *buf=0;
823
824 return len;
825}
826
827
828// swap 32bit String to 16bit MB
829size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
830{
831 size_t len=0;
832
833 while (*psz && (!buf || len < n))
834 {
835 wxUint16 cc[2];
836 size_t pa=encode_utf16(*psz, cc);
837
838 if (pa == (size_t)-1)
839 return pa;
840
841 if (buf)
842 {
843 *buf++ = ((char*)cc)[1];
844 *buf++ = ((char*)cc)[0];
845 if (pa > 1)
846 {
847 *buf++ = ((char*)cc)[3];
848 *buf++ = ((char*)cc)[2];
849 }
850 }
851
852 len += pa*sizeof(wxUint16);
853 psz++;
854 }
855 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
856
857 return len;
858}
859
860#endif // WC_UTF16
861
862
863// ----------------------------------------------------------------------------
864// UTF-32
865// ----------------------------------------------------------------------------
866
867#ifdef WORDS_BIGENDIAN
868#define wxMBConvUTF32straight wxMBConvUTF32BE
869#define wxMBConvUTF32swap wxMBConvUTF32LE
870#else
871#define wxMBConvUTF32swap wxMBConvUTF32BE
872#define wxMBConvUTF32straight wxMBConvUTF32LE
873#endif
874
875
876WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
877WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
878
879
880#ifdef WC_UTF16
881
882// copy 32bit MB to 16bit String
883size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
884{
885 size_t len=0;
886
887 while (*(wxUint32*)psz && (!buf || len < n))
888 {
889 wxUint16 cc[2];
890
891 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
892 if (pa == (size_t)-1)
893 return pa;
894
895 if (buf)
896 {
897 *buf++ = cc[0];
898 if (pa > 1)
899 *buf++ = cc[1];
900 }
901 len += pa;
902 psz += sizeof(wxUint32);
903 }
904 if (buf && len<n) *buf=0;
905
906 return len;
907}
908
909
910// copy 16bit String to 32bit MB
911size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
912{
913 size_t len=0;
914
915 while (*psz && (!buf || len < n))
916 {
917 wxUint32 cc;
918
b5153fd8
VZ
919 // cast is ok for WC_UTF16
920 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
c91830cb
VZ
921 if (pa == (size_t)-1)
922 return pa;
923
924 if (buf)
925 {
926 *(wxUint32*)buf = cc;
927 buf += sizeof(wxUint32);
928 }
929 len += sizeof(wxUint32);
930 psz += pa;
931 }
b5153fd8
VZ
932
933 if (buf && len<=n-sizeof(wxUint32))
934 *(wxUint32*)buf=0;
c91830cb
VZ
935
936 return len;
937}
938
939
940
941// swap 32bit MB to 16bit String
942size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
943{
944 size_t len=0;
945
946 while (*(wxUint32*)psz && (!buf || len < n))
947 {
948 char tmp[4];
949 tmp[0] = psz[3]; tmp[1] = psz[2];
950 tmp[2] = psz[1]; tmp[3] = psz[0];
951
952
953 wxUint16 cc[2];
954
955 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
956 if (pa == (size_t)-1)
957 return pa;
958
959 if (buf)
960 {
961 *buf++ = cc[0];
962 if (pa > 1)
963 *buf++ = cc[1];
964 }
965 len += pa;
966 psz += sizeof(wxUint32);
967 }
b5153fd8
VZ
968
969 if (buf && len<n)
970 *buf=0;
c91830cb
VZ
971
972 return len;
973}
974
975
976// swap 16bit String to 32bit MB
977size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
978{
979 size_t len=0;
980
981 while (*psz && (!buf || len < n))
982 {
983 char cc[4];
984
b5153fd8
VZ
985 // cast is ok for WC_UTF16
986 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
c91830cb
VZ
987 if (pa == (size_t)-1)
988 return pa;
989
990 if (buf)
991 {
992 *buf++ = cc[3];
993 *buf++ = cc[2];
994 *buf++ = cc[1];
995 *buf++ = cc[0];
996 }
997 len += sizeof(wxUint32);
998 psz += pa;
999 }
b5153fd8
VZ
1000
1001 if (buf && len<=n-sizeof(wxUint32))
1002 *(wxUint32*)buf=0;
c91830cb
VZ
1003
1004 return len;
1005}
1006
1007#else // WC_UTF16
1008
1009
1010// copy 32bit MB to 32bit String
1011size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1012{
1013 size_t len=0;
1014
1015 while (*(wxUint32*)psz && (!buf || len < n))
1016 {
1017 if (buf)
1018 *buf++ = *(wxUint32*)psz;
1019 len++;
1020 psz += sizeof(wxUint32);
1021 }
b5153fd8
VZ
1022
1023 if (buf && len<n)
1024 *buf=0;
c91830cb
VZ
1025
1026 return len;
1027}
1028
1029
1030// copy 32bit String to 32bit MB
1031size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1032{
1033 size_t len=0;
1034
1035 while (*psz && (!buf || len < n))
1036 {
1037 if (buf)
1038 {
1039 *(wxUint32*)buf = *psz;
1040 buf += sizeof(wxUint32);
1041 }
1042
1043 len += sizeof(wxUint32);
1044 psz++;
1045 }
1046
b5153fd8
VZ
1047 if (buf && len<=n-sizeof(wxUint32))
1048 *(wxUint32*)buf=0;
c91830cb
VZ
1049
1050 return len;
1051}
1052
1053
1054// swap 32bit MB to 32bit String
1055size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1056{
1057 size_t len=0;
1058
1059 while (*(wxUint32*)psz && (!buf || len < n))
1060 {
1061 if (buf)
1062 {
1063 ((char *)buf)[0] = psz[3];
1064 ((char *)buf)[1] = psz[2];
1065 ((char *)buf)[2] = psz[1];
1066 ((char *)buf)[3] = psz[0];
1067 buf++;
1068 }
1069 len++;
1070 psz += sizeof(wxUint32);
1071 }
b5153fd8
VZ
1072
1073 if (buf && len<n)
1074 *buf=0;
c91830cb
VZ
1075
1076 return len;
1077}
1078
1079
1080// swap 32bit String to 32bit MB
1081size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1082{
1083 size_t len=0;
1084
1085 while (*psz && (!buf || len < n))
1086 {
1087 if (buf)
1088 {
1089 *buf++ = ((char *)psz)[3];
1090 *buf++ = ((char *)psz)[2];
1091 *buf++ = ((char *)psz)[1];
1092 *buf++ = ((char *)psz)[0];
1093 }
1094 len += sizeof(wxUint32);
1095 psz++;
1096 }
b5153fd8
VZ
1097
1098 if (buf && len<=n-sizeof(wxUint32))
1099 *(wxUint32*)buf=0;
c91830cb
VZ
1100
1101 return len;
1102}
1103
1104
1105#endif // WC_UTF16
1106
1107
36acb880
VZ
1108// ============================================================================
1109// The classes doing conversion using the iconv_xxx() functions
1110// ============================================================================
3caec1bb 1111
b040e242 1112#ifdef HAVE_ICONV
3a0d76bc 1113
3caec1bb
VS
1114// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
1115// if output buffer is _exactly_ as big as needed. Such case is (unless there's
1116// yet another bug in glibc) the only case when iconv() returns with (size_t)-1
1117// (which means error) and says there are 0 bytes left in the input buffer --
1118// when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
1119// this alternative test for iconv() failure.
1120// [This bug does not appear in glibc 2.2.]
1121#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1122#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1123 (errno != E2BIG || bufLeft != 0))
1124#else
1125#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1126#endif
1127
ab217dba 1128#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
36acb880
VZ
1129
1130// ----------------------------------------------------------------------------
e95354ec 1131// wxMBConv_iconv: encapsulates an iconv character set
36acb880
VZ
1132// ----------------------------------------------------------------------------
1133
e95354ec 1134class wxMBConv_iconv : public wxMBConv
1cd52418
OK
1135{
1136public:
e95354ec
VZ
1137 wxMBConv_iconv(const wxChar *name);
1138 virtual ~wxMBConv_iconv();
36acb880 1139
bde4baac
VZ
1140 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1141 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
36acb880 1142
e95354ec 1143 bool IsOk() const
36acb880
VZ
1144 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1145
1146protected:
1147 // the iconv handlers used to translate from multibyte to wide char and in
1148 // the other direction
1149 iconv_t m2w,
1150 w2m;
1151
1152private:
e95354ec 1153 // the name (for iconv_open()) of a wide char charset -- if none is
36acb880
VZ
1154 // available on this machine, it will remain NULL
1155 static const char *ms_wcCharsetName;
1156
1157 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1158 // different endian-ness than the native one
405d8f46 1159 static bool ms_wcNeedsSwap;
36acb880
VZ
1160};
1161
e95354ec
VZ
1162const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1163bool wxMBConv_iconv::ms_wcNeedsSwap = false;
36acb880 1164
e95354ec 1165wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
36acb880 1166{
04c79127
RR
1167 // Do it the hard way
1168 char cname[100];
1169 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1170 cname[i] = (char) name[i];
1171
36acb880
VZ
1172 // check for charset that represents wchar_t:
1173 if (ms_wcCharsetName == NULL)
f1339c56 1174 {
e95354ec 1175 ms_wcNeedsSwap = false;
dccce9ea 1176
36acb880
VZ
1177 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1178 ms_wcCharsetName = WC_NAME_BEST;
04c79127 1179 m2w = iconv_open(ms_wcCharsetName, cname);
3a0d76bc 1180
36acb880
VZ
1181 if (m2w == (iconv_t)-1)
1182 {
1183 // try charset w/o bytesex info (e.g. "UCS4")
1184 // and check for bytesex ourselves:
1185 ms_wcCharsetName = WC_NAME;
04c79127 1186 m2w = iconv_open(ms_wcCharsetName, cname);
36acb880
VZ
1187
1188 // last bet, try if it knows WCHAR_T pseudo-charset
3a0d76bc
VS
1189 if (m2w == (iconv_t)-1)
1190 {
36acb880 1191 ms_wcCharsetName = "WCHAR_T";
04c79127 1192 m2w = iconv_open(ms_wcCharsetName, cname);
36acb880 1193 }
3a0d76bc 1194
36acb880
VZ
1195 if (m2w != (iconv_t)-1)
1196 {
1197 char buf[2], *bufPtr;
1198 wchar_t wbuf[2], *wbufPtr;
1199 size_t insz, outsz;
1200 size_t res;
1201
1202 buf[0] = 'A';
1203 buf[1] = 0;
1204 wbuf[0] = 0;
1205 insz = 2;
1206 outsz = SIZEOF_WCHAR_T * 2;
1207 wbufPtr = wbuf;
1208 bufPtr = buf;
1209
1210 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1211 (char**)&wbufPtr, &outsz);
1212
1213 if (ICONV_FAILED(res, insz))
3a0d76bc 1214 {
36acb880
VZ
1215 ms_wcCharsetName = NULL;
1216 wxLogLastError(wxT("iconv"));
2b5f62a0 1217 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
3a0d76bc
VS
1218 }
1219 else
1220 {
36acb880 1221 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
3a0d76bc
VS
1222 }
1223 }
36acb880
VZ
1224 else
1225 {
1226 ms_wcCharsetName = NULL;
373658eb 1227
77ffb593 1228 // VS: we must not output an error here, since wxWidgets will safely
957686c8
VS
1229 // fall back to using wxEncodingConverter.
1230 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1231 //wxLogError(
36acb880 1232 }
3a0d76bc 1233 }
36acb880 1234 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
3a0d76bc 1235 }
36acb880 1236 else // we already have ms_wcCharsetName
3caec1bb 1237 {
04c79127 1238 m2w = iconv_open(ms_wcCharsetName, cname);
f1339c56 1239 }
dccce9ea 1240
36acb880
VZ
1241 // NB: don't ever pass NULL to iconv_open(), it may crash!
1242 if ( ms_wcCharsetName )
f1339c56 1243 {
04c79127 1244 w2m = iconv_open( cname, ms_wcCharsetName);
36acb880 1245 }
405d8f46
VZ
1246 else
1247 {
1248 w2m = (iconv_t)-1;
1249 }
36acb880 1250}
3caec1bb 1251
e95354ec 1252wxMBConv_iconv::~wxMBConv_iconv()
36acb880
VZ
1253{
1254 if ( m2w != (iconv_t)-1 )
1255 iconv_close(m2w);
1256 if ( w2m != (iconv_t)-1 )
1257 iconv_close(w2m);
1258}
3a0d76bc 1259
bde4baac 1260size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
36acb880
VZ
1261{
1262 size_t inbuf = strlen(psz);
1263 size_t outbuf = n * SIZEOF_WCHAR_T;
1264 size_t res, cres;
1265 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1266 wchar_t *bufPtr = buf;
1267 const char *pszPtr = psz;
1268
1269 if (buf)
1270 {
1271 // have destination buffer, convert there
1272 cres = iconv(m2w,
1273 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1274 (char**)&bufPtr, &outbuf);
1275 res = n - (outbuf / SIZEOF_WCHAR_T);
dccce9ea 1276
36acb880 1277 if (ms_wcNeedsSwap)
3a0d76bc 1278 {
36acb880
VZ
1279 // convert to native endianness
1280 WC_BSWAP(buf /* _not_ bufPtr */, res)
3a0d76bc 1281 }
adb45366 1282
49dd9820
VS
1283 // NB: iconv was given only strlen(psz) characters on input, and so
1284 // it couldn't convert the trailing zero. Let's do it ourselves
1285 // if there's some room left for it in the output buffer.
1286 if (res < n)
1287 buf[res] = 0;
36acb880
VZ
1288 }
1289 else
1290 {
1291 // no destination buffer... convert using temp buffer
1292 // to calculate destination buffer requirement
1293 wchar_t tbuf[8];
1294 res = 0;
1295 do {
1296 bufPtr = tbuf;
1297 outbuf = 8*SIZEOF_WCHAR_T;
1298
1299 cres = iconv(m2w,
1300 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1301 (char**)&bufPtr, &outbuf );
1302
1303 res += 8-(outbuf/SIZEOF_WCHAR_T);
1304 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1305 }
dccce9ea 1306
36acb880 1307 if (ICONV_FAILED(cres, inbuf))
f1339c56 1308 {
36acb880
VZ
1309 //VS: it is ok if iconv fails, hence trace only
1310 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1311 return (size_t)-1;
1312 }
1313
1314 return res;
1315}
1316
bde4baac 1317size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
36acb880 1318{
f8d791e0 1319 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
36acb880
VZ
1320 size_t outbuf = n;
1321 size_t res, cres;
3a0d76bc 1322
36acb880 1323 wchar_t *tmpbuf = 0;
3caec1bb 1324
36acb880
VZ
1325 if (ms_wcNeedsSwap)
1326 {
1327 // need to copy to temp buffer to switch endianness
1328 // this absolutely doesn't rock!
1329 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1330 // could be in read-only memory, or be accessed in some other thread)
1331 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1332 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1333 WC_BSWAP(tmpbuf, inbuf)
1334 psz=tmpbuf;
1335 }
3a0d76bc 1336
36acb880
VZ
1337 if (buf)
1338 {
1339 // have destination buffer, convert there
1340 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
3a0d76bc 1341
36acb880 1342 res = n-outbuf;
adb45366 1343
49dd9820
VS
1344 // NB: iconv was given only wcslen(psz) characters on input, and so
1345 // it couldn't convert the trailing zero. Let's do it ourselves
1346 // if there's some room left for it in the output buffer.
1347 if (res < n)
1348 buf[0] = 0;
36acb880
VZ
1349 }
1350 else
1351 {
1352 // no destination buffer... convert using temp buffer
1353 // to calculate destination buffer requirement
1354 char tbuf[16];
1355 res = 0;
1356 do {
1357 buf = tbuf; outbuf = 16;
1358
1359 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
dccce9ea 1360
36acb880
VZ
1361 res += 16 - outbuf;
1362 } while ((cres==(size_t)-1) && (errno==E2BIG));
f1339c56 1363 }
dccce9ea 1364
36acb880
VZ
1365 if (ms_wcNeedsSwap)
1366 {
1367 free(tmpbuf);
1368 }
dccce9ea 1369
36acb880
VZ
1370 if (ICONV_FAILED(cres, inbuf))
1371 {
1372 //VS: it is ok if iconv fails, hence trace only
1373 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1374 return (size_t)-1;
1375 }
1376
1377 return res;
1378}
1379
b040e242 1380#endif // HAVE_ICONV
36acb880 1381
e95354ec 1382
36acb880
VZ
1383// ============================================================================
1384// Win32 conversion classes
1385// ============================================================================
1cd52418 1386
e95354ec 1387#ifdef wxHAVE_WIN32_MB2WC
373658eb 1388
8b04d4c4 1389// from utils.cpp
d775fa82 1390#if wxUSE_FONTMAP
8b04d4c4
VZ
1391extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1392extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
7608a683 1393#endif
373658eb 1394
e95354ec 1395class wxMBConv_win32 : public wxMBConv
1cd52418
OK
1396{
1397public:
bde4baac
VZ
1398 wxMBConv_win32()
1399 {
1400 m_CodePage = CP_ACP;
1401 }
1402
7608a683 1403#if wxUSE_FONTMAP
e95354ec 1404 wxMBConv_win32(const wxChar* name)
bde4baac
VZ
1405 {
1406 m_CodePage = wxCharsetToCodepage(name);
1407 }
dccce9ea 1408
e95354ec 1409 wxMBConv_win32(wxFontEncoding encoding)
bde4baac
VZ
1410 {
1411 m_CodePage = wxEncodingToCodepage(encoding);
1412 }
7608a683 1413#endif
8b04d4c4 1414
bde4baac 1415 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
f1339c56 1416 {
02272c9c
VZ
1417 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1418 // the behaviour is not compatible with the Unix version (using iconv)
1419 // and break the library itself, e.g. wxTextInputStream::NextChar()
1420 // wouldn't work if reading an incomplete MB char didn't result in an
1421 // error
2b5f62a0
VZ
1422 const size_t len = ::MultiByteToWideChar
1423 (
1424 m_CodePage, // code page
02272c9c 1425 MB_ERR_INVALID_CHARS, // flags: fall on error
2b5f62a0
VZ
1426 psz, // input string
1427 -1, // its length (NUL-terminated)
b4da152e 1428 buf, // output string
2b5f62a0
VZ
1429 buf ? n : 0 // size of output buffer
1430 );
1431
03a991bc
VZ
1432 // note that it returns count of written chars for buf != NULL and size
1433 // of the needed buffer for buf == NULL so in either case the length of
1434 // the string (which never includes the terminating NUL) is one less
1435 return len ? len - 1 : (size_t)-1;
f1339c56 1436 }
dccce9ea 1437
13dd924a 1438 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
f1339c56 1439 {
13dd924a
VZ
1440 /*
1441 we have a problem here: by default, WideCharToMultiByte() may
1442 replace characters unrepresentable in the target code page with bad
1443 quality approximations such as turning "1/2" symbol (U+00BD) into
1444 "1" for the code pages which don't have it and we, obviously, want
1445 to avoid this at any price
d775fa82 1446
13dd924a
VZ
1447 the trouble is that this function does it _silently_, i.e. it won't
1448 even tell us whether it did or not... Win98/2000 and higher provide
1449 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1450 we have to resort to a round trip, i.e. check that converting back
1451 results in the same string -- this is, of course, expensive but
1452 otherwise we simply can't be sure to not garble the data.
1453 */
1454
1455 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1456 // it doesn't work with CJK encodings (which we test for rather roughly
1457 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1458 // supporting it
907173e5
WS
1459 BOOL usedDef wxDUMMY_INITIALIZE(false);
1460 BOOL *pUsedDef;
13dd924a
VZ
1461 int flags;
1462 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1463 {
1464 // it's our lucky day
1465 flags = WC_NO_BEST_FIT_CHARS;
1466 pUsedDef = &usedDef;
1467 }
1468 else // old system or unsupported encoding
1469 {
1470 flags = 0;
1471 pUsedDef = NULL;
1472 }
1473
2b5f62a0
VZ
1474 const size_t len = ::WideCharToMultiByte
1475 (
1476 m_CodePage, // code page
13dd924a
VZ
1477 flags, // either none or no best fit
1478 pwz, // input string
2b5f62a0
VZ
1479 -1, // it is (wide) NUL-terminated
1480 buf, // output buffer
1481 buf ? n : 0, // and its size
1482 NULL, // default "replacement" char
13dd924a 1483 pUsedDef // [out] was it used?
2b5f62a0
VZ
1484 );
1485
13dd924a
VZ
1486 if ( !len )
1487 {
1488 // function totally failed
1489 return (size_t)-1;
1490 }
1491
1492 // if we were really converting, check if we succeeded
1493 if ( buf )
1494 {
1495 if ( flags )
1496 {
1497 // check if the conversion failed, i.e. if any replacements
1498 // were done
1499 if ( usedDef )
1500 return (size_t)-1;
1501 }
1502 else // we must resort to double tripping...
1503 {
1504 wxWCharBuffer wcBuf(n);
1505 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1506 wcscmp(wcBuf, pwz) != 0 )
1507 {
1508 // we didn't obtain the same thing we started from, hence
1509 // the conversion was lossy and we consider that it failed
1510 return (size_t)-1;
1511 }
1512 }
1513 }
1514
03a991bc 1515 // see the comment above for the reason of "len - 1"
13dd924a 1516 return len - 1;
f1339c56 1517 }
dccce9ea 1518
13dd924a
VZ
1519 bool IsOk() const { return m_CodePage != -1; }
1520
1521private:
1522 static bool CanUseNoBestFit()
1523 {
1524 static int s_isWin98Or2k = -1;
1525
1526 if ( s_isWin98Or2k == -1 )
1527 {
1528 int verMaj, verMin;
1529 switch ( wxGetOsVersion(&verMaj, &verMin) )
1530 {
1531 case wxWIN95:
1532 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1533 break;
1534
1535 case wxWINDOWS_NT:
1536 s_isWin98Or2k = verMaj >= 5;
1537 break;
1538
1539 default:
1540 // unknown, be conseravtive by default
1541 s_isWin98Or2k = 0;
1542 }
1543
1544 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1545 }
1546
1547 return s_isWin98Or2k == 1;
1548 }
f1339c56 1549
b1d66b54 1550 long m_CodePage;
1cd52418 1551};
e95354ec
VZ
1552
1553#endif // wxHAVE_WIN32_MB2WC
1554
f7e98dee
RN
1555// ============================================================================
1556// Cocoa conversion classes
1557// ============================================================================
1558
1559#if defined(__WXCOCOA__)
1560
ecd9653b 1561// RN: There is no UTF-32 support in either Core Foundation or
f7e98dee
RN
1562// Cocoa. Strangely enough, internally Core Foundation uses
1563// UTF 32 internally quite a bit - its just not public (yet).
1564
1565#include <CoreFoundation/CFString.h>
1566#include <CoreFoundation/CFStringEncodingExt.h>
1567
1568CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
ecd9653b 1569{
638357a0 1570 CFStringEncoding enc = kCFStringEncodingInvalidId ;
ecd9653b
WS
1571 if ( encoding == wxFONTENCODING_DEFAULT )
1572 {
638357a0 1573 enc = CFStringGetSystemEncoding();
ecd9653b
WS
1574 }
1575 else switch( encoding)
1576 {
1577 case wxFONTENCODING_ISO8859_1 :
1578 enc = kCFStringEncodingISOLatin1 ;
1579 break ;
1580 case wxFONTENCODING_ISO8859_2 :
1581 enc = kCFStringEncodingISOLatin2;
1582 break ;
1583 case wxFONTENCODING_ISO8859_3 :
1584 enc = kCFStringEncodingISOLatin3 ;
1585 break ;
1586 case wxFONTENCODING_ISO8859_4 :
1587 enc = kCFStringEncodingISOLatin4;
1588 break ;
1589 case wxFONTENCODING_ISO8859_5 :
1590 enc = kCFStringEncodingISOLatinCyrillic;
1591 break ;
1592 case wxFONTENCODING_ISO8859_6 :
1593 enc = kCFStringEncodingISOLatinArabic;
1594 break ;
1595 case wxFONTENCODING_ISO8859_7 :
1596 enc = kCFStringEncodingISOLatinGreek;
1597 break ;
1598 case wxFONTENCODING_ISO8859_8 :
1599 enc = kCFStringEncodingISOLatinHebrew;
1600 break ;
1601 case wxFONTENCODING_ISO8859_9 :
1602 enc = kCFStringEncodingISOLatin5;
1603 break ;
1604 case wxFONTENCODING_ISO8859_10 :
1605 enc = kCFStringEncodingISOLatin6;
1606 break ;
1607 case wxFONTENCODING_ISO8859_11 :
1608 enc = kCFStringEncodingISOLatinThai;
1609 break ;
1610 case wxFONTENCODING_ISO8859_13 :
1611 enc = kCFStringEncodingISOLatin7;
1612 break ;
1613 case wxFONTENCODING_ISO8859_14 :
1614 enc = kCFStringEncodingISOLatin8;
1615 break ;
1616 case wxFONTENCODING_ISO8859_15 :
1617 enc = kCFStringEncodingISOLatin9;
1618 break ;
1619
1620 case wxFONTENCODING_KOI8 :
1621 enc = kCFStringEncodingKOI8_R;
1622 break ;
1623 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1624 enc = kCFStringEncodingDOSRussian;
1625 break ;
1626
1627// case wxFONTENCODING_BULGARIAN :
1628// enc = ;
1629// break ;
1630
1631 case wxFONTENCODING_CP437 :
1632 enc =kCFStringEncodingDOSLatinUS ;
1633 break ;
1634 case wxFONTENCODING_CP850 :
1635 enc = kCFStringEncodingDOSLatin1;
1636 break ;
1637 case wxFONTENCODING_CP852 :
1638 enc = kCFStringEncodingDOSLatin2;
1639 break ;
1640 case wxFONTENCODING_CP855 :
1641 enc = kCFStringEncodingDOSCyrillic;
1642 break ;
1643 case wxFONTENCODING_CP866 :
1644 enc =kCFStringEncodingDOSRussian ;
1645 break ;
1646 case wxFONTENCODING_CP874 :
1647 enc = kCFStringEncodingDOSThai;
1648 break ;
1649 case wxFONTENCODING_CP932 :
1650 enc = kCFStringEncodingDOSJapanese;
1651 break ;
1652 case wxFONTENCODING_CP936 :
1653 enc =kCFStringEncodingDOSChineseSimplif ;
1654 break ;
1655 case wxFONTENCODING_CP949 :
1656 enc = kCFStringEncodingDOSKorean;
1657 break ;
1658 case wxFONTENCODING_CP950 :
1659 enc = kCFStringEncodingDOSChineseTrad;
1660 break ;
ecd9653b
WS
1661 case wxFONTENCODING_CP1250 :
1662 enc = kCFStringEncodingWindowsLatin2;
1663 break ;
1664 case wxFONTENCODING_CP1251 :
1665 enc =kCFStringEncodingWindowsCyrillic ;
1666 break ;
1667 case wxFONTENCODING_CP1252 :
1668 enc =kCFStringEncodingWindowsLatin1 ;
1669 break ;
1670 case wxFONTENCODING_CP1253 :
1671 enc = kCFStringEncodingWindowsGreek;
1672 break ;
1673 case wxFONTENCODING_CP1254 :
1674 enc = kCFStringEncodingWindowsLatin5;
1675 break ;
1676 case wxFONTENCODING_CP1255 :
1677 enc =kCFStringEncodingWindowsHebrew ;
1678 break ;
1679 case wxFONTENCODING_CP1256 :
1680 enc =kCFStringEncodingWindowsArabic ;
1681 break ;
1682 case wxFONTENCODING_CP1257 :
1683 enc = kCFStringEncodingWindowsBalticRim;
1684 break ;
638357a0
RN
1685// This only really encodes to UTF7 (if that) evidently
1686// case wxFONTENCODING_UTF7 :
1687// enc = kCFStringEncodingNonLossyASCII ;
1688// break ;
ecd9653b
WS
1689 case wxFONTENCODING_UTF8 :
1690 enc = kCFStringEncodingUTF8 ;
1691 break ;
1692 case wxFONTENCODING_EUC_JP :
1693 enc = kCFStringEncodingEUC_JP;
1694 break ;
1695 case wxFONTENCODING_UTF16 :
f7e98dee 1696 enc = kCFStringEncodingUnicode ;
ecd9653b 1697 break ;
f7e98dee
RN
1698 case wxFONTENCODING_MACROMAN :
1699 enc = kCFStringEncodingMacRoman ;
1700 break ;
1701 case wxFONTENCODING_MACJAPANESE :
1702 enc = kCFStringEncodingMacJapanese ;
1703 break ;
1704 case wxFONTENCODING_MACCHINESETRAD :
1705 enc = kCFStringEncodingMacChineseTrad ;
1706 break ;
1707 case wxFONTENCODING_MACKOREAN :
1708 enc = kCFStringEncodingMacKorean ;
1709 break ;
1710 case wxFONTENCODING_MACARABIC :
1711 enc = kCFStringEncodingMacArabic ;
1712 break ;
1713 case wxFONTENCODING_MACHEBREW :
1714 enc = kCFStringEncodingMacHebrew ;
1715 break ;
1716 case wxFONTENCODING_MACGREEK :
1717 enc = kCFStringEncodingMacGreek ;
1718 break ;
1719 case wxFONTENCODING_MACCYRILLIC :
1720 enc = kCFStringEncodingMacCyrillic ;
1721 break ;
1722 case wxFONTENCODING_MACDEVANAGARI :
1723 enc = kCFStringEncodingMacDevanagari ;
1724 break ;
1725 case wxFONTENCODING_MACGURMUKHI :
1726 enc = kCFStringEncodingMacGurmukhi ;
1727 break ;
1728 case wxFONTENCODING_MACGUJARATI :
1729 enc = kCFStringEncodingMacGujarati ;
1730 break ;
1731 case wxFONTENCODING_MACORIYA :
1732 enc = kCFStringEncodingMacOriya ;
1733 break ;
1734 case wxFONTENCODING_MACBENGALI :
1735 enc = kCFStringEncodingMacBengali ;
1736 break ;
1737 case wxFONTENCODING_MACTAMIL :
1738 enc = kCFStringEncodingMacTamil ;
1739 break ;
1740 case wxFONTENCODING_MACTELUGU :
1741 enc = kCFStringEncodingMacTelugu ;
1742 break ;
1743 case wxFONTENCODING_MACKANNADA :
1744 enc = kCFStringEncodingMacKannada ;
1745 break ;
1746 case wxFONTENCODING_MACMALAJALAM :
1747 enc = kCFStringEncodingMacMalayalam ;
1748 break ;
1749 case wxFONTENCODING_MACSINHALESE :
1750 enc = kCFStringEncodingMacSinhalese ;
1751 break ;
1752 case wxFONTENCODING_MACBURMESE :
1753 enc = kCFStringEncodingMacBurmese ;
1754 break ;
1755 case wxFONTENCODING_MACKHMER :
1756 enc = kCFStringEncodingMacKhmer ;
1757 break ;
1758 case wxFONTENCODING_MACTHAI :
1759 enc = kCFStringEncodingMacThai ;
1760 break ;
1761 case wxFONTENCODING_MACLAOTIAN :
1762 enc = kCFStringEncodingMacLaotian ;
1763 break ;
1764 case wxFONTENCODING_MACGEORGIAN :
1765 enc = kCFStringEncodingMacGeorgian ;
1766 break ;
1767 case wxFONTENCODING_MACARMENIAN :
1768 enc = kCFStringEncodingMacArmenian ;
1769 break ;
1770 case wxFONTENCODING_MACCHINESESIMP :
1771 enc = kCFStringEncodingMacChineseSimp ;
1772 break ;
1773 case wxFONTENCODING_MACTIBETAN :
1774 enc = kCFStringEncodingMacTibetan ;
1775 break ;
1776 case wxFONTENCODING_MACMONGOLIAN :
1777 enc = kCFStringEncodingMacMongolian ;
1778 break ;
1779 case wxFONTENCODING_MACETHIOPIC :
1780 enc = kCFStringEncodingMacEthiopic ;
1781 break ;
1782 case wxFONTENCODING_MACCENTRALEUR :
1783 enc = kCFStringEncodingMacCentralEurRoman ;
1784 break ;
1785 case wxFONTENCODING_MACVIATNAMESE :
1786 enc = kCFStringEncodingMacVietnamese ;
1787 break ;
1788 case wxFONTENCODING_MACARABICEXT :
1789 enc = kCFStringEncodingMacExtArabic ;
1790 break ;
1791 case wxFONTENCODING_MACSYMBOL :
1792 enc = kCFStringEncodingMacSymbol ;
1793 break ;
1794 case wxFONTENCODING_MACDINGBATS :
1795 enc = kCFStringEncodingMacDingbats ;
1796 break ;
1797 case wxFONTENCODING_MACTURKISH :
1798 enc = kCFStringEncodingMacTurkish ;
1799 break ;
1800 case wxFONTENCODING_MACCROATIAN :
1801 enc = kCFStringEncodingMacCroatian ;
1802 break ;
1803 case wxFONTENCODING_MACICELANDIC :
1804 enc = kCFStringEncodingMacIcelandic ;
1805 break ;
1806 case wxFONTENCODING_MACROMANIAN :
1807 enc = kCFStringEncodingMacRomanian ;
1808 break ;
1809 case wxFONTENCODING_MACCELTIC :
1810 enc = kCFStringEncodingMacCeltic ;
1811 break ;
1812 case wxFONTENCODING_MACGAELIC :
1813 enc = kCFStringEncodingMacGaelic ;
1814 break ;
ecd9653b
WS
1815// case wxFONTENCODING_MACKEYBOARD :
1816// enc = kCFStringEncodingMacKeyboardGlyphs ;
1817// break ;
1818 default :
1819 // because gcc is picky
1820 break ;
1821 } ;
1822 return enc ;
f7e98dee
RN
1823}
1824
f7e98dee
RN
1825class wxMBConv_cocoa : public wxMBConv
1826{
1827public:
1828 wxMBConv_cocoa()
1829 {
1830 Init(CFStringGetSystemEncoding()) ;
1831 }
1832
1833 wxMBConv_cocoa(const wxChar* name)
1834 {
1835 Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
1836 }
1837
1838 wxMBConv_cocoa(wxFontEncoding encoding)
1839 {
1840 Init( wxCFStringEncFromFontEnc(encoding) );
1841 }
1842
1843 ~wxMBConv_cocoa()
1844 {
1845 }
1846
1847 void Init( CFStringEncoding encoding)
1848 {
638357a0 1849 m_encoding = encoding ;
f7e98dee
RN
1850 }
1851
1852 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1853 {
1854 wxASSERT(szUnConv);
ecd9653b 1855
638357a0
RN
1856 CFStringRef theString = CFStringCreateWithBytes (
1857 NULL, //the allocator
1858 (const UInt8*)szUnConv,
1859 strlen(szUnConv),
1860 m_encoding,
1861 false //no BOM/external representation
f7e98dee
RN
1862 );
1863
1864 wxASSERT(theString);
1865
638357a0
RN
1866 size_t nOutLength = CFStringGetLength(theString);
1867
1868 if (szOut == NULL)
f7e98dee 1869 {
f7e98dee 1870 CFRelease(theString);
638357a0 1871 return nOutLength;
f7e98dee 1872 }
ecd9653b 1873
638357a0 1874 CFRange theRange = { 0, nOutSize };
ecd9653b 1875
638357a0
RN
1876#if SIZEOF_WCHAR_T == 4
1877 UniChar* szUniCharBuffer = new UniChar[nOutSize];
1878#endif
1879
f7e98dee 1880 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
638357a0 1881
f7e98dee 1882 CFRelease(theString);
ecd9653b 1883
638357a0 1884 szUniCharBuffer[nOutLength] = '\0' ;
f7e98dee
RN
1885
1886#if SIZEOF_WCHAR_T == 4
1887 wxMBConvUTF16 converter ;
638357a0 1888 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
f7e98dee
RN
1889 delete[] szUniCharBuffer;
1890#endif
638357a0
RN
1891
1892 return nOutLength;
f7e98dee
RN
1893 }
1894
1895 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1896 {
638357a0
RN
1897 wxASSERT(szUnConv);
1898
f7e98dee 1899 size_t nRealOutSize;
638357a0 1900 size_t nBufSize = wxWcslen(szUnConv);
f7e98dee 1901 UniChar* szUniBuffer = (UniChar*) szUnConv;
ecd9653b 1902
f7e98dee
RN
1903#if SIZEOF_WCHAR_T == 4
1904 wxMBConvUTF16BE converter ;
1905 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
1906 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
1907 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
1908 nBufSize /= sizeof(UniChar);
f7e98dee
RN
1909#endif
1910
1911 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
1912 NULL, //allocator
1913 szUniBuffer,
1914 nBufSize,
638357a0 1915 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
f7e98dee 1916 );
ecd9653b 1917
f7e98dee 1918 wxASSERT(theString);
ecd9653b 1919
f7e98dee 1920 //Note that CER puts a BOM when converting to unicode
638357a0
RN
1921 //so we check and use getchars instead in that case
1922 if (m_encoding == kCFStringEncodingUnicode)
f7e98dee 1923 {
638357a0
RN
1924 if (szOut != NULL)
1925 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
1926
1927 nRealOutSize = CFStringGetLength(theString) + 1;
1928 }
1929 else
1930 {
1931 CFStringGetBytes(
1932 theString,
1933 CFRangeMake(0, CFStringGetLength(theString)),
1934 m_encoding,
1935 0, //what to put in characters that can't be converted -
1936 //0 tells CFString to return NULL if it meets such a character
1937 false, //not an external representation
1938 (UInt8*) szOut,
1939 nOutSize,
1940 (CFIndex*) &nRealOutSize
1941 );
f7e98dee 1942 }
ecd9653b 1943
638357a0 1944 CFRelease(theString);
ecd9653b 1945
638357a0
RN
1946#if SIZEOF_WCHAR_T == 4
1947 delete[] szUniBuffer;
1948#endif
ecd9653b 1949
f7e98dee
RN
1950 return nRealOutSize - 1;
1951 }
1952
1953 bool IsOk() const
ecd9653b 1954 {
638357a0
RN
1955 return m_encoding != kCFStringEncodingInvalidId &&
1956 CFStringIsEncodingAvailable(m_encoding);
f7e98dee
RN
1957 }
1958
1959private:
638357a0 1960 CFStringEncoding m_encoding ;
f7e98dee
RN
1961};
1962
1963#endif // defined(__WXCOCOA__)
1964
335d31e0
SC
1965// ============================================================================
1966// Mac conversion classes
1967// ============================================================================
1968
1969#if defined(__WXMAC__) && defined(TARGET_CARBON)
1970
1971class wxMBConv_mac : public wxMBConv
1972{
1973public:
1974 wxMBConv_mac()
1975 {
1976 Init(CFStringGetSystemEncoding()) ;
1977 }
1978
1979 wxMBConv_mac(const wxChar* name)
1980 {
d775fa82 1981 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
335d31e0
SC
1982 }
1983
1984 wxMBConv_mac(wxFontEncoding encoding)
1985 {
d775fa82
WS
1986 Init( wxMacGetSystemEncFromFontEnc(encoding) );
1987 }
1988
1989 ~wxMBConv_mac()
1990 {
1991 OSStatus status = noErr ;
1992 status = TECDisposeConverter(m_MB2WC_converter);
1993 status = TECDisposeConverter(m_WC2MB_converter);
1994 }
1995
1996
1997 void Init( TextEncodingBase encoding)
1998 {
1999 OSStatus status = noErr ;
2000 m_char_encoding = encoding ;
2001 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2002
2003 status = TECCreateConverter(&m_MB2WC_converter,
2004 m_char_encoding,
2005 m_unicode_encoding);
2006 status = TECCreateConverter(&m_WC2MB_converter,
2007 m_unicode_encoding,
2008 m_char_encoding);
2009 }
2010
335d31e0
SC
2011 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2012 {
d775fa82
WS
2013 OSStatus status = noErr ;
2014 ByteCount byteOutLen ;
2015 ByteCount byteInLen = strlen(psz) ;
2016 wchar_t *tbuf = NULL ;
2017 UniChar* ubuf = NULL ;
2018 size_t res = 0 ;
2019
2020 if (buf == NULL)
2021 {
638357a0 2022 //apple specs say at least 32
c543817b 2023 n = wxMax( 32 , byteInLen ) ;
d775fa82
WS
2024 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2025 }
2026 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
f3a355ce 2027#if SIZEOF_WCHAR_T == 4
d775fa82 2028 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
f3a355ce 2029#else
d775fa82 2030 ubuf = (UniChar*) (buf ? buf : tbuf) ;
f3a355ce 2031#endif
d775fa82
WS
2032 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2033 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
f3a355ce 2034#if SIZEOF_WCHAR_T == 4
8471ea90
SC
2035 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2036 // is not properly terminated we get random characters at the end
2037 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
d775fa82
WS
2038 wxMBConvUTF16BE converter ;
2039 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2040 free( ubuf ) ;
f3a355ce 2041#else
d775fa82 2042 res = byteOutLen / sizeof( UniChar ) ;
f3a355ce 2043#endif
d775fa82
WS
2044 if ( buf == NULL )
2045 free(tbuf) ;
335d31e0 2046
335d31e0
SC
2047 if ( buf && res < n)
2048 buf[res] = 0;
2049
d775fa82 2050 return res ;
335d31e0
SC
2051 }
2052
2053 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
d775fa82
WS
2054 {
2055 OSStatus status = noErr ;
2056 ByteCount byteOutLen ;
2057 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2058
2059 char *tbuf = NULL ;
2060
2061 if (buf == NULL)
2062 {
638357a0 2063 //apple specs say at least 32
c543817b 2064 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
d775fa82
WS
2065 tbuf = (char*) malloc( n ) ;
2066 }
2067
2068 ByteCount byteBufferLen = n ;
2069 UniChar* ubuf = NULL ;
f3a355ce 2070#if SIZEOF_WCHAR_T == 4
d775fa82
WS
2071 wxMBConvUTF16BE converter ;
2072 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2073 byteInLen = unicharlen ;
2074 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2075 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
f3a355ce 2076#else
d775fa82 2077 ubuf = (UniChar*) psz ;
f3a355ce 2078#endif
d775fa82
WS
2079 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2080 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
f3a355ce 2081#if SIZEOF_WCHAR_T == 4
d775fa82 2082 free( ubuf ) ;
f3a355ce 2083#endif
d775fa82
WS
2084 if ( buf == NULL )
2085 free(tbuf) ;
335d31e0 2086
d775fa82 2087 size_t res = byteOutLen ;
335d31e0 2088 if ( buf && res < n)
638357a0 2089 {
335d31e0 2090 buf[res] = 0;
638357a0
RN
2091
2092 //we need to double-trip to verify it didn't insert any ? in place
2093 //of bogus characters
2094 wxWCharBuffer wcBuf(n);
2095 size_t pszlen = wxWcslen(psz);
2096 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2097 wxWcslen(wcBuf) != pszlen ||
2098 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2099 {
2100 // we didn't obtain the same thing we started from, hence
2101 // the conversion was lossy and we consider that it failed
2102 return (size_t)-1;
2103 }
2104 }
335d31e0 2105
d775fa82 2106 return res ;
335d31e0
SC
2107 }
2108
2109 bool IsOk() const
2110 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2111
2112private:
d775fa82
WS
2113 TECObjectRef m_MB2WC_converter ;
2114 TECObjectRef m_WC2MB_converter ;
2115
2116 TextEncodingBase m_char_encoding ;
2117 TextEncodingBase m_unicode_encoding ;
335d31e0
SC
2118};
2119
2120#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1e6feb95 2121
36acb880
VZ
2122// ============================================================================
2123// wxEncodingConverter based conversion classes
2124// ============================================================================
2125
1e6feb95 2126#if wxUSE_FONTMAP
1cd52418 2127
e95354ec 2128class wxMBConv_wxwin : public wxMBConv
1cd52418 2129{
8b04d4c4
VZ
2130private:
2131 void Init()
2132 {
2133 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2134 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2135 }
2136
6001e347 2137public:
f1339c56
RR
2138 // temporarily just use wxEncodingConverter stuff,
2139 // so that it works while a better implementation is built
e95354ec 2140 wxMBConv_wxwin(const wxChar* name)
f1339c56
RR
2141 {
2142 if (name)
e95354ec 2143 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
8b04d4c4
VZ
2144 else
2145 m_enc = wxFONTENCODING_SYSTEM;
cafbf6fb 2146
8b04d4c4
VZ
2147 Init();
2148 }
2149
e95354ec 2150 wxMBConv_wxwin(wxFontEncoding enc)
8b04d4c4
VZ
2151 {
2152 m_enc = enc;
2153
2154 Init();
f1339c56 2155 }
dccce9ea 2156
bde4baac 2157 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
f1339c56
RR
2158 {
2159 size_t inbuf = strlen(psz);
dccce9ea 2160 if (buf)
4def3b35 2161 m2w.Convert(psz,buf);
f1339c56
RR
2162 return inbuf;
2163 }
dccce9ea 2164
bde4baac 2165 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
f1339c56 2166 {
f8d791e0 2167 const size_t inbuf = wxWcslen(psz);
f1339c56
RR
2168 if (buf)
2169 w2m.Convert(psz,buf);
dccce9ea 2170
f1339c56
RR
2171 return inbuf;
2172 }
dccce9ea 2173
e95354ec 2174 bool IsOk() const { return m_ok; }
f1339c56
RR
2175
2176public:
8b04d4c4 2177 wxFontEncoding m_enc;
f1339c56 2178 wxEncodingConverter m2w, w2m;
cafbf6fb
VZ
2179
2180 // were we initialized successfully?
2181 bool m_ok;
fc7a2a60 2182
e95354ec 2183 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
f6bcfd97 2184};
6001e347 2185
1e6feb95
VZ
2186#endif // wxUSE_FONTMAP
2187
36acb880
VZ
2188// ============================================================================
2189// wxCSConv implementation
2190// ============================================================================
2191
8b04d4c4 2192void wxCSConv::Init()
6001e347 2193{
e95354ec
VZ
2194 m_name = NULL;
2195 m_convReal = NULL;
2196 m_deferred = true;
2197}
2198
8b04d4c4
VZ
2199wxCSConv::wxCSConv(const wxChar *charset)
2200{
2201 Init();
82713003 2202
e95354ec
VZ
2203 if ( charset )
2204 {
e95354ec
VZ
2205 SetName(charset);
2206 }
bda3d86a
VZ
2207
2208 m_encoding = wxFONTENCODING_SYSTEM;
6001e347
RR
2209}
2210
8b04d4c4
VZ
2211wxCSConv::wxCSConv(wxFontEncoding encoding)
2212{
bda3d86a 2213 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
e95354ec
VZ
2214 {
2215 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2216
2217 encoding = wxFONTENCODING_SYSTEM;
2218 }
2219
8b04d4c4
VZ
2220 Init();
2221
bda3d86a 2222 m_encoding = encoding;
8b04d4c4
VZ
2223}
2224
6001e347
RR
2225wxCSConv::~wxCSConv()
2226{
65e50848
JS
2227 Clear();
2228}
2229
54380f29 2230wxCSConv::wxCSConv(const wxCSConv& conv)
8b04d4c4 2231 : wxMBConv()
54380f29 2232{
8b04d4c4
VZ
2233 Init();
2234
54380f29 2235 SetName(conv.m_name);
8b04d4c4 2236 m_encoding = conv.m_encoding;
54380f29
GD
2237}
2238
2239wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2240{
2241 Clear();
8b04d4c4 2242
54380f29 2243 SetName(conv.m_name);
8b04d4c4
VZ
2244 m_encoding = conv.m_encoding;
2245
54380f29
GD
2246 return *this;
2247}
2248
65e50848
JS
2249void wxCSConv::Clear()
2250{
8b04d4c4 2251 free(m_name);
e95354ec 2252 delete m_convReal;
8b04d4c4 2253
65e50848 2254 m_name = NULL;
e95354ec 2255 m_convReal = NULL;
6001e347
RR
2256}
2257
2258void wxCSConv::SetName(const wxChar *charset)
2259{
f1339c56
RR
2260 if (charset)
2261 {
2262 m_name = wxStrdup(charset);
e95354ec 2263 m_deferred = true;
f1339c56 2264 }
6001e347
RR
2265}
2266
e95354ec
VZ
2267wxMBConv *wxCSConv::DoCreate() const
2268{
c547282d
VZ
2269 // check for the special case of ASCII or ISO8859-1 charset: as we have
2270 // special knowledge of it anyhow, we don't need to create a special
2271 // conversion object
2272 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
f1339c56 2273 {
e95354ec
VZ
2274 // don't convert at all
2275 return NULL;
2276 }
dccce9ea 2277
e95354ec
VZ
2278 // we trust OS to do conversion better than we can so try external
2279 // conversion methods first
2280 //
2281 // the full order is:
2282 // 1. OS conversion (iconv() under Unix or Win32 API)
2283 // 2. hard coded conversions for UTF
2284 // 3. wxEncodingConverter as fall back
2285
2286 // step (1)
2287#ifdef HAVE_ICONV
c547282d 2288#if !wxUSE_FONTMAP
e95354ec 2289 if ( m_name )
c547282d 2290#endif // !wxUSE_FONTMAP
e95354ec 2291 {
c547282d
VZ
2292 wxString name(m_name);
2293
2294#if wxUSE_FONTMAP
2295 if ( name.empty() )
2296 name = wxFontMapper::Get()->GetEncodingName(m_encoding);
2297#endif // wxUSE_FONTMAP
2298
2299 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
e95354ec
VZ
2300 if ( conv->IsOk() )
2301 return conv;
2302
2303 delete conv;
2304 }
2305#endif // HAVE_ICONV
2306
2307#ifdef wxHAVE_WIN32_MB2WC
2308 {
7608a683 2309#if wxUSE_FONTMAP
e95354ec
VZ
2310 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2311 : new wxMBConv_win32(m_encoding);
2312 if ( conv->IsOk() )
2313 return conv;
2314
2315 delete conv;
7608a683
WS
2316#else
2317 return NULL;
2318#endif
e95354ec
VZ
2319 }
2320#endif // wxHAVE_WIN32_MB2WC
d775fa82
WS
2321#if defined(__WXMAC__)
2322 {
2323 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
2324 {
2325
2326 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2327 : new wxMBConv_mac(m_encoding);
2328 if ( conv->IsOk() )
f7e98dee
RN
2329 return conv;
2330
2331 delete conv;
2332 }
2333 }
2334#endif
2335#if defined(__WXCOCOA__)
2336 {
2337 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2338 {
2339
2340 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2341 : new wxMBConv_cocoa(m_encoding);
2342 if ( conv->IsOk() )
d775fa82
WS
2343 return conv;
2344
2345 delete conv;
2346 }
335d31e0
SC
2347 }
2348#endif
e95354ec
VZ
2349 // step (2)
2350 wxFontEncoding enc = m_encoding;
2351#if wxUSE_FONTMAP
c547282d
VZ
2352 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2353 {
2354 // use "false" to suppress interactive dialogs -- we can be called from
2355 // anywhere and popping up a dialog from here is the last thing we want to
2356 // do
2357 enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
2358 }
e95354ec
VZ
2359#endif // wxUSE_FONTMAP
2360
2361 switch ( enc )
2362 {
2363 case wxFONTENCODING_UTF7:
2364 return new wxMBConvUTF7;
2365
2366 case wxFONTENCODING_UTF8:
2367 return new wxMBConvUTF8;
2368
e95354ec
VZ
2369 case wxFONTENCODING_UTF16BE:
2370 return new wxMBConvUTF16BE;
2371
2372 case wxFONTENCODING_UTF16LE:
2373 return new wxMBConvUTF16LE;
2374
e95354ec
VZ
2375 case wxFONTENCODING_UTF32BE:
2376 return new wxMBConvUTF32BE;
2377
2378 case wxFONTENCODING_UTF32LE:
2379 return new wxMBConvUTF32LE;
2380
2381 default:
2382 // nothing to do but put here to suppress gcc warnings
2383 ;
2384 }
2385
2386 // step (3)
2387#if wxUSE_FONTMAP
2388 {
2389 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2390 : new wxMBConv_wxwin(m_encoding);
2391 if ( conv->IsOk() )
2392 return conv;
2393
2394 delete conv;
2395 }
2396#endif // wxUSE_FONTMAP
2397
a58d4f4d
VS
2398 // NB: This is a hack to prevent deadlock. What could otherwise happen
2399 // in Unicode build: wxConvLocal creation ends up being here
2400 // because of some failure and logs the error. But wxLog will try to
2401 // attach timestamp, for which it will need wxConvLocal (to convert
2402 // time to char* and then wchar_t*), but that fails, tries to log
2403 // error, but wxLog has a (already locked) critical section that
2404 // guards static buffer.
2405 static bool alreadyLoggingError = false;
2406 if (!alreadyLoggingError)
2407 {
2408 alreadyLoggingError = true;
2409 wxLogError(_("Cannot convert from the charset '%s'!"),
2410 m_name ? m_name
e95354ec
VZ
2411 :
2412#if wxUSE_FONTMAP
2413 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
2414#else // !wxUSE_FONTMAP
2415 wxString::Format(_("encoding %s"), m_encoding).c_str()
2416#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2417 );
a58d4f4d
VS
2418 alreadyLoggingError = false;
2419 }
e95354ec
VZ
2420
2421 return NULL;
2422}
2423
2424void wxCSConv::CreateConvIfNeeded() const
2425{
2426 if ( m_deferred )
2427 {
2428 wxCSConv *self = (wxCSConv *)this; // const_cast
bda3d86a
VZ
2429
2430#if wxUSE_INTL
2431 // if we don't have neither the name nor the encoding, use the default
2432 // encoding for this system
2433 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2434 {
4d312c22 2435 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
bda3d86a
VZ
2436 }
2437#endif // wxUSE_INTL
2438
e95354ec
VZ
2439 self->m_convReal = DoCreate();
2440 self->m_deferred = false;
6001e347 2441 }
6001e347
RR
2442}
2443
2444size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2445{
e95354ec 2446 CreateConvIfNeeded();
dccce9ea 2447
e95354ec
VZ
2448 if (m_convReal)
2449 return m_convReal->MB2WC(buf, psz, n);
f1339c56
RR
2450
2451 // latin-1 (direct)
4def3b35 2452 size_t len = strlen(psz);
dccce9ea 2453
f1339c56
RR
2454 if (buf)
2455 {
4def3b35 2456 for (size_t c = 0; c <= len; c++)
f1339c56
RR
2457 buf[c] = (unsigned char)(psz[c]);
2458 }
dccce9ea 2459
f1339c56 2460 return len;
6001e347
RR
2461}
2462
2463size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2464{
e95354ec 2465 CreateConvIfNeeded();
dccce9ea 2466
e95354ec
VZ
2467 if (m_convReal)
2468 return m_convReal->WC2MB(buf, psz, n);
1cd52418 2469
f1339c56 2470 // latin-1 (direct)
f8d791e0 2471 const size_t len = wxWcslen(psz);
f1339c56
RR
2472 if (buf)
2473 {
4def3b35 2474 for (size_t c = 0; c <= len; c++)
24642831
VS
2475 {
2476 if (psz[c] > 0xFF)
2477 return (size_t)-1;
907173e5 2478 buf[c] = (char)psz[c];
24642831
VS
2479 }
2480 }
2481 else
2482 {
2483 for (size_t c = 0; c <= len; c++)
2484 {
2485 if (psz[c] > 0xFF)
2486 return (size_t)-1;
2487 }
f1339c56 2488 }
dccce9ea 2489
f1339c56 2490 return len;
6001e347
RR
2491}
2492
bde4baac
VZ
2493// ----------------------------------------------------------------------------
2494// globals
2495// ----------------------------------------------------------------------------
2496
// the object behind wxConvLibc: which converter class is used depends on
// the platform
2497#ifdef __WINDOWS__
2498 static wxMBConv_win32 wxConvLibcObj;
f81f5901
SC
2499#elif defined(__WXMAC__) && !defined(__MACH__)
2500 static wxMBConv_mac wxConvLibcObj ;
bde4baac 2501#else
dcc8fac0 2502 static wxMBConvLibc wxConvLibcObj;
bde4baac
VZ
2503#endif
2504
// the objects behind the other predefined converters; they are file-static
// and only accessible via the exported references below
2505static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2506static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2507static wxMBConvUTF7 wxConvUTF7Obj;
2508static wxMBConvUTF8 wxConvUTF8Obj;
2509
2510
// exported references bound to the static objects defined above
2511WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2512WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2513WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2514WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2515WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// wxConvCurrent is a pointer, not a reference, and initially points to the
// libc converter object
2516WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2517
2518#else // !wxUSE_WCHAR_T
2519
2520// stand-ins in absence of wchar_t: plain wxMBConv objects are defined
// here instead of the references to static converter objects used in the
// wxUSE_WCHAR_T branch
2521WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2522 wxConvISO8859_1,
2523 wxConvLocal,
2524 wxConvUTF8;
2525
2526#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
6001e347
RR
2527
2528