]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
4741650e156b640c5b4712b60576c49b351f0991
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
62
63 // ----------------------------------------------------------------------------
64 // headers
65 // ----------------------------------------------------------------------------
66
67 #ifdef __SALFORDC__
68 #include <clib.h>
69 #endif
70
71 #ifdef HAVE_ICONV
72 #include <iconv.h>
73 #include "wx/thread.h"
74 #endif
75
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
78 #include "wx/utils.h"
79
80 #ifdef __WXMAC__
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
84
85 #include "wx/mac/private.h" // includes mac headers
86 #endif
87 // ----------------------------------------------------------------------------
88 // macros
89 // ----------------------------------------------------------------------------
90
91 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
92 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
93
94 #if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
99 #else
100 #define WC_NAME_BEST "UCS-4LE"
101 #endif
102 #elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
105 #define WC_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
108 #else
109 #define WC_NAME_BEST "UTF-16LE"
110 #endif
111 #else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
114 #endif
115
116 // ============================================================================
117 // implementation
118 // ============================================================================
119
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
123
124
// Encode a single UCS-4 value as UTF-16.
//
// Returns the number of 16 bit units required (1 or 2), or (size_t)-1 if
// input is 0x110000 or above and so can't be represented. The units are only
// stored if output is non-NULL; with NULL only the length is computed.
static size_t encode_utf16(wxUint32 input, wxUint16 *output)
{
    if (input >= 0x110000)
        return (size_t)-1;

    const bool needsPair = input > 0xffff;

    if (output)
    {
        if (needsPair)
        {
            // 0xd7c0 is 0xd800 - (0x10000 >> 10): adding it both removes
            // the 0x10000 bias and moves the value to the lead unit range
            output[0] = (wxUint16) ((input >> 10) + 0xd7c0);
            output[1] = (wxUint16) ((input & 0x3ff) + 0xdc00);
        }
        else
        {
            output[0] = (wxUint16) input;
        }
    }

    return needsPair ? 2 : 1;
}
147
// Decode one character from the UTF-16 units starting at input.
//
// Returns the number of 16 bit units consumed (1 or 2) and stores the decoded
// value in output. If the first unit is a surrogate which doesn't start a
// valid pair, (size_t)-1 is returned and output is set to that first unit so
// that the caller can decide how to handle it.
//
// input[1] is only read if input[0] is a lead (high) surrogate.
static size_t decode_utf16(const wxUint16* input, wxUint32& output)
{
    const wxUint16 first = input[0];

    if ((first < 0xd800) || (first > 0xdfff))
    {
        // not a surrogate at all: the unit is the character
        output = first;
        return 1;
    }

    // a pair must be a lead surrogate (0xd800..0xdbff) followed by a trail
    // surrogate (0xdc00..0xdfff); in particular a trail surrogate in first
    // position is an error and must not be combined with the next unit
    if ((first > 0xdbff) || (input[1] < 0xdc00) || (input[1] > 0xdfff))
    {
        output = first;
        return (size_t)-1;
    }

    output = ((first - 0xd7c0) << 10) + (input[1] - 0xdc00);
    return 2;
}
166
167
168 // ----------------------------------------------------------------------------
169 // wxMBConv
170 // ----------------------------------------------------------------------------
171
172 wxMBConv::~wxMBConv()
173 {
174 // nothing to do here (necessary for Darwin linking probably)
175 }
176
177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
178 {
179 if ( psz )
180 {
181 // calculate the length of the buffer needed first
182 size_t nLen = MB2WC(NULL, psz, 0);
183 if ( nLen != (size_t)-1 )
184 {
185 // now do the actual conversion
186 wxWCharBuffer buf(nLen);
187 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
188 if ( nLen != (size_t)-1 )
189 {
190 return buf;
191 }
192 }
193 }
194
195 wxWCharBuffer buf((wchar_t *)NULL);
196
197 return buf;
198 }
199
200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
201 {
202 if ( pwz )
203 {
204 size_t nLen = WC2MB(NULL, pwz, 0);
205 if ( nLen != (size_t)-1 )
206 {
207 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
208 nLen = WC2MB(buf.data(), pwz, nLen + 4);
209 if ( nLen != (size_t)-1 )
210 {
211 return buf;
212 }
213 }
214 }
215
216 wxCharBuffer buf((char *)NULL);
217
218 return buf;
219 }
220
221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
222 {
223 wxASSERT(pOutSize != NULL);
224
225 const char* szEnd = szString + nStringLen + 1;
226 const char* szPos = szString;
227 const char* szStart = szPos;
228
229 size_t nActualLength = 0;
230 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
231
232 wxWCharBuffer theBuffer(nCurrentSize);
233
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos != szEnd)
237 {
238 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
239
240 //Get the length of the current (sub)string
241 size_t nLen = MB2WC(NULL, szPos, 0);
242
243 //Invalid conversion?
244 if( nLen == (size_t)-1 )
245 {
246 *pOutSize = 0;
247 theBuffer.data()[0u] = wxT('\0');
248 return theBuffer;
249 }
250
251
252 //Increase the actual length (+1 for current null character)
253 nActualLength += nLen + 1;
254
255 //if buffer too big, realloc the buffer
256 if (nActualLength > (nCurrentSize+1))
257 {
258 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
259 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
260 theBuffer = theNewBuffer;
261 nCurrentSize <<= 1;
262 }
263
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
266 {
267 *pOutSize = 0;
268 theBuffer.data()[0u] = wxT('\0');
269 return theBuffer;
270 }
271
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //here because XX2XX gives us the size of the output buffer,
275 //not neccessarly the length of the string
276 szPos += strlen(szPos) + 1;
277 }
278
279 //success - return actual length and the buffer
280 *pOutSize = nActualLength;
281 return theBuffer;
282 }
283
284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
285 {
286 wxASSERT(pOutSize != NULL);
287
288 const wchar_t* szEnd = szString + nStringLen + 1;
289 const wchar_t* szPos = szString;
290 const wchar_t* szStart = szPos;
291
292 size_t nActualLength = 0;
293 size_t nCurrentSize = nStringLen << 2; //try * 4 first
294
295 wxCharBuffer theBuffer(nCurrentSize);
296
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos != szEnd)
300 {
301 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
302
303 //Get the length of the current (sub)string
304 size_t nLen = WC2MB(NULL, szPos, 0);
305
306 //Invalid conversion?
307 if( nLen == (size_t)-1 )
308 {
309 *pOutSize = 0;
310 theBuffer.data()[0u] = wxT('\0');
311 return theBuffer;
312 }
313
314 //Increase the actual length (+1 for current null character)
315 nActualLength += nLen + 1;
316
317 //if buffer too big, realloc the buffer
318 if (nActualLength > (nCurrentSize+1))
319 {
320 wxCharBuffer theNewBuffer(nCurrentSize << 1);
321 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
322 theBuffer = theNewBuffer;
323 nCurrentSize <<= 1;
324 }
325
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
328 {
329 *pOutSize = 0;
330 theBuffer.data()[0u] = wxT('\0');
331 return theBuffer;
332 }
333
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //here because XX2XX gives us the size of the output buffer,
337 //not neccessarly the length of the string
338 szPos += wxWcslen(szPos) + 1;
339 }
340
341 //success - return actual length and the buffer
342 *pOutSize = nActualLength;
343 return theBuffer;
344 }
345
346 // ----------------------------------------------------------------------------
347 // wxMBConvLibc
348 // ----------------------------------------------------------------------------
349
350 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
351 {
352 return wxMB2WC(buf, psz, n);
353 }
354
355 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
356 {
357 return wxWC2MB(buf, psz, n);
358 }
359
360 #ifdef __UNIX__
361
362 // ----------------------------------------------------------------------------
363 // wxConvBrokenFileNames
364 // ----------------------------------------------------------------------------
365
366 wxConvBrokenFileNames::wxConvBrokenFileNames()
367 {
368 // decide which conversion to use for the file names
369
370 // (1) this variable exists for the sole purpose of specifying the encoding
371 // of the filenames for GTK+ programs, so use it if it is set
372 const wxChar *encName = wxGetenv(_T("G_FILENAME_ENCODING"));
373 if ( encName )
374 {
375 m_conv = new wxCSConv(encName);
376 }
377 else // no G_FILENAME_ENCODING
378 {
379 // (2) if a non default locale is set, assume that the user wants his
380 // filenames in this locale too
381 switch ( wxLocale::GetSystemEncoding() )
382 {
383 default:
384 m_conv = new wxMBConvLibc;
385 break;
386
387 // (3) finally use UTF-8 by default
388 case wxFONTENCODING_SYSTEM:
389 case wxFONTENCODING_UTF8:
390 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
391 break;
392 }
393 }
394 }
395
396 size_t
397 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
398 const char *psz,
399 size_t outputSize) const
400 {
401 return m_conv->MB2WC( outputBuf, psz, outputSize );
402 }
403
404 size_t
405 wxConvBrokenFileNames::WC2MB(char *outputBuf,
406 const wchar_t *psz,
407 size_t outputSize) const
408 {
409 return m_conv->WC2MB( outputBuf, psz, outputSize );
410 }
411
412 #endif
413
414 // ----------------------------------------------------------------------------
415 // UTF-7
416 // ----------------------------------------------------------------------------
417
418 // Implementation (C) 2004 Fredrik Roubert
419
//
// BASE64 decoding table: indexed by the (unsigned) character, gives its 6 bit
// value or 0xff if the character is not part of the BASE64 alphabet
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x5f: 'A' - 'Z'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x7f: 'a' - 'z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
458
459 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
460 {
461 size_t len = 0;
462
463 while (*psz && ((!buf) || (len < n)))
464 {
465 unsigned char cc = *psz++;
466 if (cc != '+')
467 {
468 // plain ASCII char
469 if (buf)
470 *buf++ = cc;
471 len++;
472 }
473 else if (*psz == '-')
474 {
475 // encoded plus sign
476 if (buf)
477 *buf++ = cc;
478 len++;
479 psz++;
480 }
481 else
482 {
483 // BASE64 encoded string
484 bool lsb;
485 unsigned char c;
486 unsigned int d, l;
487 for (lsb = false, d = 0, l = 0;
488 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
489 {
490 d <<= 6;
491 d += cc;
492 for (l += 6; l >= 8; lsb = !lsb)
493 {
494 c = (unsigned char)((d >> (l -= 8)) % 256);
495 if (lsb)
496 {
497 if (buf)
498 *buf++ |= c;
499 len ++;
500 }
501 else
502 if (buf)
503 *buf = (wchar_t)(c << 8);
504 }
505 }
506 if (*psz == '-')
507 psz++;
508 }
509 }
510 if (buf && (len < n))
511 *buf = 0;
512 return len;
513 }
514
//
// BASE64 encoding table: maps a 6 bit value to its BASE64 character
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
529
//
// UTF-7 encoding table, indexed by the ASCII code of the character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78
};
549
550 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
551 {
552
553
554 size_t len = 0;
555
556 while (*psz && ((!buf) || (len < n)))
557 {
558 wchar_t cc = *psz++;
559 if (cc < 0x80 && utf7encode[cc] < 1)
560 {
561 // plain ASCII char
562 if (buf)
563 *buf++ = (char)cc;
564 len++;
565 }
566 #ifndef WC_UTF16
567 else if (((wxUint32)cc) > 0xffff)
568 {
569 // no surrogate pair generation (yet?)
570 return (size_t)-1;
571 }
572 #endif
573 else
574 {
575 if (buf)
576 *buf++ = '+';
577 len++;
578 if (cc != '+')
579 {
580 // BASE64 encode string
581 unsigned int lsb, d, l;
582 for (d = 0, l = 0;; psz++)
583 {
584 for (lsb = 0; lsb < 2; lsb ++)
585 {
586 d <<= 8;
587 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
588
589 for (l += 8; l >= 6; )
590 {
591 l -= 6;
592 if (buf)
593 *buf++ = utf7enb64[(d >> l) % 64];
594 len++;
595 }
596 }
597 cc = *psz;
598 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
599 break;
600 }
601 if (l != 0)
602 {
603 if (buf)
604 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
605 len++;
606 }
607 }
608 if (buf)
609 *buf++ = '-';
610 len++;
611 }
612 }
613 if (buf && (len < n))
614 *buf = 0;
615 return len;
616 }
617
618 // ----------------------------------------------------------------------------
619 // UTF-8
620 // ----------------------------------------------------------------------------
621
// utf8_max[k] is the biggest value which can be encoded using k + 1 bytes in
// UTF-8, the last element stops the search for any 32 bit value
static wxUint32 utf8_max[]=
{
    0x7f,
    0x7ff,
    0xffff,
    0x1fffff,
    0x3ffffff,
    0x7fffffff,
    0xffffffff
};

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
629
630 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
631 {
632 size_t len = 0;
633
634 while (*psz && ((!buf) || (len < n)))
635 {
636 const char *opsz = psz;
637 bool invalid = false;
638 unsigned char cc = *psz++, fc = cc;
639 unsigned cnt;
640 for (cnt = 0; fc & 0x80; cnt++)
641 fc <<= 1;
642 if (!cnt)
643 {
644 // plain ASCII char
645 if (buf)
646 *buf++ = cc;
647 len++;
648 }
649 else
650 {
651 cnt--;
652 if (!cnt)
653 {
654 // invalid UTF-8 sequence
655 invalid = true;
656 }
657 else
658 {
659 unsigned ocnt = cnt - 1;
660 wxUint32 res = cc & (0x3f >> cnt);
661 while (cnt--)
662 {
663 cc = *psz;
664 if ((cc & 0xC0) != 0x80)
665 {
666 // invalid UTF-8 sequence
667 invalid = true;
668 break;
669 }
670 psz++;
671 res = (res << 6) | (cc & 0x3f);
672 }
673 if (invalid || res <= utf8_max[ocnt])
674 {
675 // illegal UTF-8 encoding
676 invalid = true;
677 }
678 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
679 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
680 {
681 // if one of our PUA characters turns up externally
682 // it must also be treated as an illegal sequence
683 // (a bit like you have to escape an escape character)
684 invalid = true;
685 }
686 else
687 {
688 #ifdef WC_UTF16
689 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
690 size_t pa = encode_utf16(res, (wxUint16 *)buf);
691 if (pa == (size_t)-1)
692 {
693 invalid = true;
694 }
695 else
696 {
697 if (buf)
698 buf += pa;
699 len += pa;
700 }
701 #else // !WC_UTF16
702 if (buf)
703 *buf++ = res;
704 len++;
705 #endif // WC_UTF16/!WC_UTF16
706 }
707 }
708 if (invalid)
709 {
710 if (m_options & MAP_INVALID_UTF8_TO_PUA)
711 {
712 while (opsz < psz && (!buf || len < n))
713 {
714 #ifdef WC_UTF16
715 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
716 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
717 wxASSERT(pa != (size_t)-1);
718 if (buf)
719 buf += pa;
720 opsz++;
721 len += pa;
722 #else
723 if (buf)
724 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
725 opsz++;
726 len++;
727 #endif
728 }
729 }
730 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
731 {
732 while (opsz < psz && (!buf || len < n))
733 {
734 if ( buf && len + 3 < n )
735 {
736 unsigned char n = *opsz;
737 *buf++ = L'\\';
738 *buf++ = (wchar_t)( L'0' + n / 0100 );
739 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
740 *buf++ = (wchar_t)( L'0' + n % 010 );
741 }
742 opsz++;
743 len += 4;
744 }
745 }
746 else // MAP_INVALID_UTF8_NOT
747 {
748 return (size_t)-1;
749 }
750 }
751 }
752 }
753 if (buf && (len < n))
754 *buf = 0;
755 return len;
756 }
757
// return true if wch is one of the digits '0' to '7'
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
762
763 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
764 {
765 size_t len = 0;
766
767 while (*psz && ((!buf) || (len < n)))
768 {
769 wxUint32 cc;
770 #ifdef WC_UTF16
771 // cast is ok for WC_UTF16
772 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
773 psz += (pa == (size_t)-1) ? 1 : pa;
774 #else
775 cc=(*psz++) & 0x7fffffff;
776 #endif
777
778 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
779 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
780 {
781 if (buf)
782 *buf++ = (char)(cc - wxUnicodePUA);
783 len++;
784 }
785 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
786 cc == L'\\' &&
787 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
788 {
789 if (buf)
790 {
791 *buf++ = (char) ((psz[0] - L'0')*0100 +
792 (psz[1] - L'0')*010 +
793 (psz[2] - L'0'));
794 }
795
796 psz += 3;
797 len++;
798 }
799 else
800 {
801 unsigned cnt;
802 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
803 if (!cnt)
804 {
805 // plain ASCII char
806 if (buf)
807 *buf++ = (char) cc;
808 len++;
809 }
810
811 else
812 {
813 len += cnt + 1;
814 if (buf)
815 {
816 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
817 while (cnt--)
818 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
819 }
820 }
821 }
822 }
823
824 if (buf && (len<n))
825 *buf = 0;
826
827 return len;
828 }
829
830 // ----------------------------------------------------------------------------
831 // UTF-16
832 // ----------------------------------------------------------------------------
833
834 #ifdef WORDS_BIGENDIAN
835 #define wxMBConvUTF16straight wxMBConvUTF16BE
836 #define wxMBConvUTF16swap wxMBConvUTF16LE
837 #else
838 #define wxMBConvUTF16swap wxMBConvUTF16BE
839 #define wxMBConvUTF16straight wxMBConvUTF16LE
840 #endif
841
842
843 #ifdef WC_UTF16
844
845 // copy 16bit MB to 16bit String
846 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
847 {
848 size_t len=0;
849
850 while (*(wxUint16*)psz && (!buf || len < n))
851 {
852 if (buf)
853 *buf++ = *(wxUint16*)psz;
854 len++;
855
856 psz += sizeof(wxUint16);
857 }
858 if (buf && len<n) *buf=0;
859
860 return len;
861 }
862
863
864 // copy 16bit String to 16bit MB
865 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
866 {
867 size_t len=0;
868
869 while (*psz && (!buf || len < n))
870 {
871 if (buf)
872 {
873 *(wxUint16*)buf = *psz;
874 buf += sizeof(wxUint16);
875 }
876 len += sizeof(wxUint16);
877 psz++;
878 }
879 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
880
881 return len;
882 }
883
884
885 // swap 16bit MB to 16bit String
886 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
887 {
888 size_t len=0;
889
890 while (*(wxUint16*)psz && (!buf || len < n))
891 {
892 if (buf)
893 {
894 ((char *)buf)[0] = psz[1];
895 ((char *)buf)[1] = psz[0];
896 buf++;
897 }
898 len++;
899 psz += sizeof(wxUint16);
900 }
901 if (buf && len<n) *buf=0;
902
903 return len;
904 }
905
906
907 // swap 16bit MB to 16bit String
908 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
909 {
910 size_t len=0;
911
912 while (*psz && (!buf || len < n))
913 {
914 if (buf)
915 {
916 *buf++ = ((char*)psz)[1];
917 *buf++ = ((char*)psz)[0];
918 }
919 len += sizeof(wxUint16);
920 psz++;
921 }
922 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
923
924 return len;
925 }
926
927
928 #else // WC_UTF16
929
930
931 // copy 16bit MB to 32bit String
932 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
933 {
934 size_t len=0;
935
936 while (*(wxUint16*)psz && (!buf || len < n))
937 {
938 wxUint32 cc;
939 size_t pa=decode_utf16((wxUint16*)psz, cc);
940 if (pa == (size_t)-1)
941 return pa;
942
943 if (buf)
944 *buf++ = cc;
945 len++;
946 psz += pa * sizeof(wxUint16);
947 }
948 if (buf && len<n) *buf=0;
949
950 return len;
951 }
952
953
954 // copy 32bit String to 16bit MB
955 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
956 {
957 size_t len=0;
958
959 while (*psz && (!buf || len < n))
960 {
961 wxUint16 cc[2];
962 size_t pa=encode_utf16(*psz, cc);
963
964 if (pa == (size_t)-1)
965 return pa;
966
967 if (buf)
968 {
969 *(wxUint16*)buf = cc[0];
970 buf += sizeof(wxUint16);
971 if (pa > 1)
972 {
973 *(wxUint16*)buf = cc[1];
974 buf += sizeof(wxUint16);
975 }
976 }
977
978 len += pa*sizeof(wxUint16);
979 psz++;
980 }
981 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
982
983 return len;
984 }
985
986
987 // swap 16bit MB to 32bit String
988 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
989 {
990 size_t len=0;
991
992 while (*(wxUint16*)psz && (!buf || len < n))
993 {
994 wxUint32 cc;
995 char tmp[4];
996 tmp[0]=psz[1]; tmp[1]=psz[0];
997 tmp[2]=psz[3]; tmp[3]=psz[2];
998
999 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1000 if (pa == (size_t)-1)
1001 return pa;
1002
1003 if (buf)
1004 *buf++ = cc;
1005
1006 len++;
1007 psz += pa * sizeof(wxUint16);
1008 }
1009 if (buf && len<n) *buf=0;
1010
1011 return len;
1012 }
1013
1014
1015 // swap 32bit String to 16bit MB
1016 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1017 {
1018 size_t len=0;
1019
1020 while (*psz && (!buf || len < n))
1021 {
1022 wxUint16 cc[2];
1023 size_t pa=encode_utf16(*psz, cc);
1024
1025 if (pa == (size_t)-1)
1026 return pa;
1027
1028 if (buf)
1029 {
1030 *buf++ = ((char*)cc)[1];
1031 *buf++ = ((char*)cc)[0];
1032 if (pa > 1)
1033 {
1034 *buf++ = ((char*)cc)[3];
1035 *buf++ = ((char*)cc)[2];
1036 }
1037 }
1038
1039 len += pa*sizeof(wxUint16);
1040 psz++;
1041 }
1042 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1043
1044 return len;
1045 }
1046
1047 #endif // WC_UTF16
1048
1049
1050 // ----------------------------------------------------------------------------
1051 // UTF-32
1052 // ----------------------------------------------------------------------------
1053
1054 #ifdef WORDS_BIGENDIAN
1055 #define wxMBConvUTF32straight wxMBConvUTF32BE
1056 #define wxMBConvUTF32swap wxMBConvUTF32LE
1057 #else
1058 #define wxMBConvUTF32swap wxMBConvUTF32BE
1059 #define wxMBConvUTF32straight wxMBConvUTF32LE
1060 #endif
1061
1062
1063 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1064 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1065
1066
1067 #ifdef WC_UTF16
1068
1069 // copy 32bit MB to 16bit String
1070 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1071 {
1072 size_t len=0;
1073
1074 while (*(wxUint32*)psz && (!buf || len < n))
1075 {
1076 wxUint16 cc[2];
1077
1078 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1079 if (pa == (size_t)-1)
1080 return pa;
1081
1082 if (buf)
1083 {
1084 *buf++ = cc[0];
1085 if (pa > 1)
1086 *buf++ = cc[1];
1087 }
1088 len += pa;
1089 psz += sizeof(wxUint32);
1090 }
1091 if (buf && len<n) *buf=0;
1092
1093 return len;
1094 }
1095
1096
1097 // copy 16bit String to 32bit MB
1098 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1099 {
1100 size_t len=0;
1101
1102 while (*psz && (!buf || len < n))
1103 {
1104 wxUint32 cc;
1105
1106 // cast is ok for WC_UTF16
1107 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1108 if (pa == (size_t)-1)
1109 return pa;
1110
1111 if (buf)
1112 {
1113 *(wxUint32*)buf = cc;
1114 buf += sizeof(wxUint32);
1115 }
1116 len += sizeof(wxUint32);
1117 psz += pa;
1118 }
1119
1120 if (buf && len<=n-sizeof(wxUint32))
1121 *(wxUint32*)buf=0;
1122
1123 return len;
1124 }
1125
1126
1127
1128 // swap 32bit MB to 16bit String
1129 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1130 {
1131 size_t len=0;
1132
1133 while (*(wxUint32*)psz && (!buf || len < n))
1134 {
1135 char tmp[4];
1136 tmp[0] = psz[3]; tmp[1] = psz[2];
1137 tmp[2] = psz[1]; tmp[3] = psz[0];
1138
1139
1140 wxUint16 cc[2];
1141
1142 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1143 if (pa == (size_t)-1)
1144 return pa;
1145
1146 if (buf)
1147 {
1148 *buf++ = cc[0];
1149 if (pa > 1)
1150 *buf++ = cc[1];
1151 }
1152 len += pa;
1153 psz += sizeof(wxUint32);
1154 }
1155
1156 if (buf && len<n)
1157 *buf=0;
1158
1159 return len;
1160 }
1161
1162
1163 // swap 16bit String to 32bit MB
1164 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1165 {
1166 size_t len=0;
1167
1168 while (*psz && (!buf || len < n))
1169 {
1170 char cc[4];
1171
1172 // cast is ok for WC_UTF16
1173 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1174 if (pa == (size_t)-1)
1175 return pa;
1176
1177 if (buf)
1178 {
1179 *buf++ = cc[3];
1180 *buf++ = cc[2];
1181 *buf++ = cc[1];
1182 *buf++ = cc[0];
1183 }
1184 len += sizeof(wxUint32);
1185 psz += pa;
1186 }
1187
1188 if (buf && len<=n-sizeof(wxUint32))
1189 *(wxUint32*)buf=0;
1190
1191 return len;
1192 }
1193
1194 #else // WC_UTF16
1195
1196
1197 // copy 32bit MB to 32bit String
1198 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1199 {
1200 size_t len=0;
1201
1202 while (*(wxUint32*)psz && (!buf || len < n))
1203 {
1204 if (buf)
1205 *buf++ = *(wxUint32*)psz;
1206 len++;
1207 psz += sizeof(wxUint32);
1208 }
1209
1210 if (buf && len<n)
1211 *buf=0;
1212
1213 return len;
1214 }
1215
1216
1217 // copy 32bit String to 32bit MB
1218 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1219 {
1220 size_t len=0;
1221
1222 while (*psz && (!buf || len < n))
1223 {
1224 if (buf)
1225 {
1226 *(wxUint32*)buf = *psz;
1227 buf += sizeof(wxUint32);
1228 }
1229
1230 len += sizeof(wxUint32);
1231 psz++;
1232 }
1233
1234 if (buf && len<=n-sizeof(wxUint32))
1235 *(wxUint32*)buf=0;
1236
1237 return len;
1238 }
1239
1240
1241 // swap 32bit MB to 32bit String
1242 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1243 {
1244 size_t len=0;
1245
1246 while (*(wxUint32*)psz && (!buf || len < n))
1247 {
1248 if (buf)
1249 {
1250 ((char *)buf)[0] = psz[3];
1251 ((char *)buf)[1] = psz[2];
1252 ((char *)buf)[2] = psz[1];
1253 ((char *)buf)[3] = psz[0];
1254 buf++;
1255 }
1256 len++;
1257 psz += sizeof(wxUint32);
1258 }
1259
1260 if (buf && len<n)
1261 *buf=0;
1262
1263 return len;
1264 }
1265
1266
1267 // swap 32bit String to 32bit MB
1268 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1269 {
1270 size_t len=0;
1271
1272 while (*psz && (!buf || len < n))
1273 {
1274 if (buf)
1275 {
1276 *buf++ = ((char *)psz)[3];
1277 *buf++ = ((char *)psz)[2];
1278 *buf++ = ((char *)psz)[1];
1279 *buf++ = ((char *)psz)[0];
1280 }
1281 len += sizeof(wxUint32);
1282 psz++;
1283 }
1284
1285 if (buf && len<=n-sizeof(wxUint32))
1286 *(wxUint32*)buf=0;
1287
1288 return len;
1289 }
1290
1291
1292 #endif // WC_UTF16
1293
1294
1295 // ============================================================================
1296 // The classes doing conversion using the iconv_xxx() functions
1297 // ============================================================================
1298
1299 #ifdef HAVE_ICONV
1300
1301 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1302 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1303 // (unless there's yet another bug in glibc) the only case when iconv()
1304 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1305 // left in the input buffer -- when _real_ error occurs,
1306 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1307 // iconv() failure.
1308 // [This bug does not appear in glibc 2.2.]
1309 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1310 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1311 (errno != E2BIG || bufLeft != 0))
1312 #else
1313 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1314 #endif
1315
1316 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1317
1318 // ----------------------------------------------------------------------------
1319 // wxMBConv_iconv: encapsulates an iconv character set
1320 // ----------------------------------------------------------------------------
1321
1322 class wxMBConv_iconv : public wxMBConv
1323 {
1324 public:
1325 wxMBConv_iconv(const wxChar *name);
1326 virtual ~wxMBConv_iconv();
1327
1328 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1329 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1330
1331 bool IsOk() const
1332 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1333
1334 protected:
1335 // the iconv handlers used to translate from multibyte to wide char and in
1336 // the other direction
1337 iconv_t m2w,
1338 w2m;
1339 #if wxUSE_THREADS
1340 // guards access to m2w and w2m objects
1341 wxMutex m_iconvMutex;
1342 #endif
1343
1344 private:
1345 // the name (for iconv_open()) of a wide char charset -- if none is
1346 // available on this machine, it will remain NULL
1347 static const char *ms_wcCharsetName;
1348
1349 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1350 // different endian-ness than the native one
1351 static bool ms_wcNeedsSwap;
1352 };
1353
1354 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1355 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1356
1357 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1358 {
1359 // Do it the hard way
1360 char cname[100];
1361 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1362 cname[i] = (char) name[i];
1363
1364 // check for charset that represents wchar_t:
1365 if (ms_wcCharsetName == NULL)
1366 {
1367 ms_wcNeedsSwap = false;
1368
1369 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1370 ms_wcCharsetName = WC_NAME_BEST;
1371 m2w = iconv_open(ms_wcCharsetName, cname);
1372
1373 if (m2w == (iconv_t)-1)
1374 {
1375 // try charset w/o bytesex info (e.g. "UCS4")
1376 // and check for bytesex ourselves:
1377 ms_wcCharsetName = WC_NAME;
1378 m2w = iconv_open(ms_wcCharsetName, cname);
1379
1380 // last bet, try if it knows WCHAR_T pseudo-charset
1381 if (m2w == (iconv_t)-1)
1382 {
1383 ms_wcCharsetName = "WCHAR_T";
1384 m2w = iconv_open(ms_wcCharsetName, cname);
1385 }
1386
1387 if (m2w != (iconv_t)-1)
1388 {
1389 char buf[2], *bufPtr;
1390 wchar_t wbuf[2], *wbufPtr;
1391 size_t insz, outsz;
1392 size_t res;
1393
1394 buf[0] = 'A';
1395 buf[1] = 0;
1396 wbuf[0] = 0;
1397 insz = 2;
1398 outsz = SIZEOF_WCHAR_T * 2;
1399 wbufPtr = wbuf;
1400 bufPtr = buf;
1401
1402 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1403 (char**)&wbufPtr, &outsz);
1404
1405 if (ICONV_FAILED(res, insz))
1406 {
1407 ms_wcCharsetName = NULL;
1408 wxLogLastError(wxT("iconv"));
1409 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1410 }
1411 else
1412 {
1413 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1414 }
1415 }
1416 else
1417 {
1418 ms_wcCharsetName = NULL;
1419
1420 // VS: we must not output an error here, since wxWidgets will safely
1421 // fall back to using wxEncodingConverter.
1422 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1423 //wxLogError(
1424 }
1425 }
1426 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1427 }
1428 else // we already have ms_wcCharsetName
1429 {
1430 m2w = iconv_open(ms_wcCharsetName, cname);
1431 }
1432
1433 // NB: don't ever pass NULL to iconv_open(), it may crash!
1434 if ( ms_wcCharsetName )
1435 {
1436 w2m = iconv_open( cname, ms_wcCharsetName);
1437 }
1438 else
1439 {
1440 w2m = (iconv_t)-1;
1441 }
1442 }
1443
1444 wxMBConv_iconv::~wxMBConv_iconv()
1445 {
1446 if ( m2w != (iconv_t)-1 )
1447 iconv_close(m2w);
1448 if ( w2m != (iconv_t)-1 )
1449 iconv_close(w2m);
1450 }
1451
1452 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1453 {
1454 #if wxUSE_THREADS
1455 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1456 // Unfortunately there is a couple of global wxCSConv objects such as
1457 // wxConvLocal that are used all over wx code, so we have to make sure
1458 // the handle is used by at most one thread at the time. Otherwise
1459 // only a few wx classes would be safe to use from non-main threads
1460 // as MB<->WC conversion would fail "randomly".
1461 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1462 #endif
1463
1464 size_t inbuf = strlen(psz);
1465 size_t outbuf = n * SIZEOF_WCHAR_T;
1466 size_t res, cres;
1467 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1468 wchar_t *bufPtr = buf;
1469 const char *pszPtr = psz;
1470
1471 if (buf)
1472 {
1473 // have destination buffer, convert there
1474 cres = iconv(m2w,
1475 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1476 (char**)&bufPtr, &outbuf);
1477 res = n - (outbuf / SIZEOF_WCHAR_T);
1478
1479 if (ms_wcNeedsSwap)
1480 {
1481 // convert to native endianness
1482 WC_BSWAP(buf /* _not_ bufPtr */, res)
1483 }
1484
1485 // NB: iconv was given only strlen(psz) characters on input, and so
1486 // it couldn't convert the trailing zero. Let's do it ourselves
1487 // if there's some room left for it in the output buffer.
1488 if (res < n)
1489 buf[res] = 0;
1490 }
1491 else
1492 {
1493 // no destination buffer... convert using temp buffer
1494 // to calculate destination buffer requirement
1495 wchar_t tbuf[8];
1496 res = 0;
1497 do {
1498 bufPtr = tbuf;
1499 outbuf = 8*SIZEOF_WCHAR_T;
1500
1501 cres = iconv(m2w,
1502 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1503 (char**)&bufPtr, &outbuf );
1504
1505 res += 8-(outbuf/SIZEOF_WCHAR_T);
1506 } while ((cres==(size_t)-1) && (errno==E2BIG));
1507 }
1508
1509 if (ICONV_FAILED(cres, inbuf))
1510 {
1511 //VS: it is ok if iconv fails, hence trace only
1512 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1513 return (size_t)-1;
1514 }
1515
1516 return res;
1517 }
1518
1519 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1520 {
1521 #if wxUSE_THREADS
1522 // NB: explained in MB2WC
1523 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1524 #endif
1525
1526 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1527 size_t outbuf = n;
1528 size_t res, cres;
1529
1530 wchar_t *tmpbuf = 0;
1531
1532 if (ms_wcNeedsSwap)
1533 {
1534 // need to copy to temp buffer to switch endianness
1535 // this absolutely doesn't rock!
1536 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1537 // could be in read-only memory, or be accessed in some other thread)
1538 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1539 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1540 WC_BSWAP(tmpbuf, inbuf)
1541 psz=tmpbuf;
1542 }
1543
1544 if (buf)
1545 {
1546 // have destination buffer, convert there
1547 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1548
1549 res = n-outbuf;
1550
1551 // NB: iconv was given only wcslen(psz) characters on input, and so
1552 // it couldn't convert the trailing zero. Let's do it ourselves
1553 // if there's some room left for it in the output buffer.
1554 if (res < n)
1555 buf[0] = 0;
1556 }
1557 else
1558 {
1559 // no destination buffer... convert using temp buffer
1560 // to calculate destination buffer requirement
1561 char tbuf[16];
1562 res = 0;
1563 do {
1564 buf = tbuf; outbuf = 16;
1565
1566 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1567
1568 res += 16 - outbuf;
1569 } while ((cres==(size_t)-1) && (errno==E2BIG));
1570 }
1571
1572 if (ms_wcNeedsSwap)
1573 {
1574 free(tmpbuf);
1575 }
1576
1577 if (ICONV_FAILED(cres, inbuf))
1578 {
1579 //VS: it is ok if iconv fails, hence trace only
1580 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1581 return (size_t)-1;
1582 }
1583
1584 return res;
1585 }
1586
1587 #endif // HAVE_ICONV
1588
1589
1590 // ============================================================================
1591 // Win32 conversion classes
1592 // ============================================================================
1593
1594 #ifdef wxHAVE_WIN32_MB2WC
1595
1596 // from utils.cpp
1597 #if wxUSE_FONTMAP
1598 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1599 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1600 #endif
1601
1602 class wxMBConv_win32 : public wxMBConv
1603 {
1604 public:
1605 wxMBConv_win32()
1606 {
1607 m_CodePage = CP_ACP;
1608 }
1609
1610 #if wxUSE_FONTMAP
1611 wxMBConv_win32(const wxChar* name)
1612 {
1613 m_CodePage = wxCharsetToCodepage(name);
1614 }
1615
1616 wxMBConv_win32(wxFontEncoding encoding)
1617 {
1618 m_CodePage = wxEncodingToCodepage(encoding);
1619 }
1620 #endif
1621
1622 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1623 {
1624 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1625 // the behaviour is not compatible with the Unix version (using iconv)
1626 // and break the library itself, e.g. wxTextInputStream::NextChar()
1627 // wouldn't work if reading an incomplete MB char didn't result in an
1628 // error
1629 //
1630 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1631 // an error (tested under Windows Server 2003) and apparently it is
1632 // done on purpose, i.e. the function accepts any input in this case
1633 // and although I'd prefer to return error on ill-formed output, our
1634 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1635 // explicitly ill-formed according to RFC 2152) neither so we don't
1636 // even have any fallback here...
1637 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1638
1639 const size_t len = ::MultiByteToWideChar
1640 (
1641 m_CodePage, // code page
1642 flags, // flags: fall on error
1643 psz, // input string
1644 -1, // its length (NUL-terminated)
1645 buf, // output string
1646 buf ? n : 0 // size of output buffer
1647 );
1648
1649 // note that it returns count of written chars for buf != NULL and size
1650 // of the needed buffer for buf == NULL so in either case the length of
1651 // the string (which never includes the terminating NUL) is one less
1652 return len ? len - 1 : (size_t)-1;
1653 }
1654
1655 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1656 {
1657 /*
1658 we have a problem here: by default, WideCharToMultiByte() may
1659 replace characters unrepresentable in the target code page with bad
1660 quality approximations such as turning "1/2" symbol (U+00BD) into
1661 "1" for the code pages which don't have it and we, obviously, want
1662 to avoid this at any price
1663
1664 the trouble is that this function does it _silently_, i.e. it won't
1665 even tell us whether it did or not... Win98/2000 and higher provide
1666 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1667 we have to resort to a round trip, i.e. check that converting back
1668 results in the same string -- this is, of course, expensive but
1669 otherwise we simply can't be sure to not garble the data.
1670 */
1671
1672 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1673 // it doesn't work with CJK encodings (which we test for rather roughly
1674 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1675 // supporting it
1676 BOOL usedDef wxDUMMY_INITIALIZE(false);
1677 BOOL *pUsedDef;
1678 int flags;
1679 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1680 {
1681 // it's our lucky day
1682 flags = WC_NO_BEST_FIT_CHARS;
1683 pUsedDef = &usedDef;
1684 }
1685 else // old system or unsupported encoding
1686 {
1687 flags = 0;
1688 pUsedDef = NULL;
1689 }
1690
1691 const size_t len = ::WideCharToMultiByte
1692 (
1693 m_CodePage, // code page
1694 flags, // either none or no best fit
1695 pwz, // input string
1696 -1, // it is (wide) NUL-terminated
1697 buf, // output buffer
1698 buf ? n : 0, // and its size
1699 NULL, // default "replacement" char
1700 pUsedDef // [out] was it used?
1701 );
1702
1703 if ( !len )
1704 {
1705 // function totally failed
1706 return (size_t)-1;
1707 }
1708
1709 // if we were really converting, check if we succeeded
1710 if ( buf )
1711 {
1712 if ( flags )
1713 {
1714 // check if the conversion failed, i.e. if any replacements
1715 // were done
1716 if ( usedDef )
1717 return (size_t)-1;
1718 }
1719 else // we must resort to double tripping...
1720 {
1721 wxWCharBuffer wcBuf(n);
1722 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1723 wcscmp(wcBuf, pwz) != 0 )
1724 {
1725 // we didn't obtain the same thing we started from, hence
1726 // the conversion was lossy and we consider that it failed
1727 return (size_t)-1;
1728 }
1729 }
1730 }
1731
1732 // see the comment above for the reason of "len - 1"
1733 return len - 1;
1734 }
1735
1736 bool IsOk() const { return m_CodePage != -1; }
1737
1738 private:
1739 static bool CanUseNoBestFit()
1740 {
1741 static int s_isWin98Or2k = -1;
1742
1743 if ( s_isWin98Or2k == -1 )
1744 {
1745 int verMaj, verMin;
1746 switch ( wxGetOsVersion(&verMaj, &verMin) )
1747 {
1748 case wxWIN95:
1749 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1750 break;
1751
1752 case wxWINDOWS_NT:
1753 s_isWin98Or2k = verMaj >= 5;
1754 break;
1755
1756 default:
1757 // unknown, be conseravtive by default
1758 s_isWin98Or2k = 0;
1759 }
1760
1761 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1762 }
1763
1764 return s_isWin98Or2k == 1;
1765 }
1766
1767 long m_CodePage;
1768 };
1769
1770 #endif // wxHAVE_WIN32_MB2WC
1771
1772 // ============================================================================
1773 // Cocoa conversion classes
1774 // ============================================================================
1775
1776 #if defined(__WXCOCOA__)
1777
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1781
1782 #include <CoreFoundation/CFString.h>
1783 #include <CoreFoundation/CFStringEncodingExt.h>
1784
1785 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1786 {
1787 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1788 if ( encoding == wxFONTENCODING_DEFAULT )
1789 {
1790 enc = CFStringGetSystemEncoding();
1791 }
1792 else switch( encoding)
1793 {
1794 case wxFONTENCODING_ISO8859_1 :
1795 enc = kCFStringEncodingISOLatin1 ;
1796 break ;
1797 case wxFONTENCODING_ISO8859_2 :
1798 enc = kCFStringEncodingISOLatin2;
1799 break ;
1800 case wxFONTENCODING_ISO8859_3 :
1801 enc = kCFStringEncodingISOLatin3 ;
1802 break ;
1803 case wxFONTENCODING_ISO8859_4 :
1804 enc = kCFStringEncodingISOLatin4;
1805 break ;
1806 case wxFONTENCODING_ISO8859_5 :
1807 enc = kCFStringEncodingISOLatinCyrillic;
1808 break ;
1809 case wxFONTENCODING_ISO8859_6 :
1810 enc = kCFStringEncodingISOLatinArabic;
1811 break ;
1812 case wxFONTENCODING_ISO8859_7 :
1813 enc = kCFStringEncodingISOLatinGreek;
1814 break ;
1815 case wxFONTENCODING_ISO8859_8 :
1816 enc = kCFStringEncodingISOLatinHebrew;
1817 break ;
1818 case wxFONTENCODING_ISO8859_9 :
1819 enc = kCFStringEncodingISOLatin5;
1820 break ;
1821 case wxFONTENCODING_ISO8859_10 :
1822 enc = kCFStringEncodingISOLatin6;
1823 break ;
1824 case wxFONTENCODING_ISO8859_11 :
1825 enc = kCFStringEncodingISOLatinThai;
1826 break ;
1827 case wxFONTENCODING_ISO8859_13 :
1828 enc = kCFStringEncodingISOLatin7;
1829 break ;
1830 case wxFONTENCODING_ISO8859_14 :
1831 enc = kCFStringEncodingISOLatin8;
1832 break ;
1833 case wxFONTENCODING_ISO8859_15 :
1834 enc = kCFStringEncodingISOLatin9;
1835 break ;
1836
1837 case wxFONTENCODING_KOI8 :
1838 enc = kCFStringEncodingKOI8_R;
1839 break ;
1840 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1841 enc = kCFStringEncodingDOSRussian;
1842 break ;
1843
1844 // case wxFONTENCODING_BULGARIAN :
1845 // enc = ;
1846 // break ;
1847
1848 case wxFONTENCODING_CP437 :
1849 enc =kCFStringEncodingDOSLatinUS ;
1850 break ;
1851 case wxFONTENCODING_CP850 :
1852 enc = kCFStringEncodingDOSLatin1;
1853 break ;
1854 case wxFONTENCODING_CP852 :
1855 enc = kCFStringEncodingDOSLatin2;
1856 break ;
1857 case wxFONTENCODING_CP855 :
1858 enc = kCFStringEncodingDOSCyrillic;
1859 break ;
1860 case wxFONTENCODING_CP866 :
1861 enc =kCFStringEncodingDOSRussian ;
1862 break ;
1863 case wxFONTENCODING_CP874 :
1864 enc = kCFStringEncodingDOSThai;
1865 break ;
1866 case wxFONTENCODING_CP932 :
1867 enc = kCFStringEncodingDOSJapanese;
1868 break ;
1869 case wxFONTENCODING_CP936 :
1870 enc =kCFStringEncodingDOSChineseSimplif ;
1871 break ;
1872 case wxFONTENCODING_CP949 :
1873 enc = kCFStringEncodingDOSKorean;
1874 break ;
1875 case wxFONTENCODING_CP950 :
1876 enc = kCFStringEncodingDOSChineseTrad;
1877 break ;
1878 case wxFONTENCODING_CP1250 :
1879 enc = kCFStringEncodingWindowsLatin2;
1880 break ;
1881 case wxFONTENCODING_CP1251 :
1882 enc =kCFStringEncodingWindowsCyrillic ;
1883 break ;
1884 case wxFONTENCODING_CP1252 :
1885 enc =kCFStringEncodingWindowsLatin1 ;
1886 break ;
1887 case wxFONTENCODING_CP1253 :
1888 enc = kCFStringEncodingWindowsGreek;
1889 break ;
1890 case wxFONTENCODING_CP1254 :
1891 enc = kCFStringEncodingWindowsLatin5;
1892 break ;
1893 case wxFONTENCODING_CP1255 :
1894 enc =kCFStringEncodingWindowsHebrew ;
1895 break ;
1896 case wxFONTENCODING_CP1256 :
1897 enc =kCFStringEncodingWindowsArabic ;
1898 break ;
1899 case wxFONTENCODING_CP1257 :
1900 enc = kCFStringEncodingWindowsBalticRim;
1901 break ;
1902 // This only really encodes to UTF7 (if that) evidently
1903 // case wxFONTENCODING_UTF7 :
1904 // enc = kCFStringEncodingNonLossyASCII ;
1905 // break ;
1906 case wxFONTENCODING_UTF8 :
1907 enc = kCFStringEncodingUTF8 ;
1908 break ;
1909 case wxFONTENCODING_EUC_JP :
1910 enc = kCFStringEncodingEUC_JP;
1911 break ;
1912 case wxFONTENCODING_UTF16 :
1913 enc = kCFStringEncodingUnicode ;
1914 break ;
1915 case wxFONTENCODING_MACROMAN :
1916 enc = kCFStringEncodingMacRoman ;
1917 break ;
1918 case wxFONTENCODING_MACJAPANESE :
1919 enc = kCFStringEncodingMacJapanese ;
1920 break ;
1921 case wxFONTENCODING_MACCHINESETRAD :
1922 enc = kCFStringEncodingMacChineseTrad ;
1923 break ;
1924 case wxFONTENCODING_MACKOREAN :
1925 enc = kCFStringEncodingMacKorean ;
1926 break ;
1927 case wxFONTENCODING_MACARABIC :
1928 enc = kCFStringEncodingMacArabic ;
1929 break ;
1930 case wxFONTENCODING_MACHEBREW :
1931 enc = kCFStringEncodingMacHebrew ;
1932 break ;
1933 case wxFONTENCODING_MACGREEK :
1934 enc = kCFStringEncodingMacGreek ;
1935 break ;
1936 case wxFONTENCODING_MACCYRILLIC :
1937 enc = kCFStringEncodingMacCyrillic ;
1938 break ;
1939 case wxFONTENCODING_MACDEVANAGARI :
1940 enc = kCFStringEncodingMacDevanagari ;
1941 break ;
1942 case wxFONTENCODING_MACGURMUKHI :
1943 enc = kCFStringEncodingMacGurmukhi ;
1944 break ;
1945 case wxFONTENCODING_MACGUJARATI :
1946 enc = kCFStringEncodingMacGujarati ;
1947 break ;
1948 case wxFONTENCODING_MACORIYA :
1949 enc = kCFStringEncodingMacOriya ;
1950 break ;
1951 case wxFONTENCODING_MACBENGALI :
1952 enc = kCFStringEncodingMacBengali ;
1953 break ;
1954 case wxFONTENCODING_MACTAMIL :
1955 enc = kCFStringEncodingMacTamil ;
1956 break ;
1957 case wxFONTENCODING_MACTELUGU :
1958 enc = kCFStringEncodingMacTelugu ;
1959 break ;
1960 case wxFONTENCODING_MACKANNADA :
1961 enc = kCFStringEncodingMacKannada ;
1962 break ;
1963 case wxFONTENCODING_MACMALAJALAM :
1964 enc = kCFStringEncodingMacMalayalam ;
1965 break ;
1966 case wxFONTENCODING_MACSINHALESE :
1967 enc = kCFStringEncodingMacSinhalese ;
1968 break ;
1969 case wxFONTENCODING_MACBURMESE :
1970 enc = kCFStringEncodingMacBurmese ;
1971 break ;
1972 case wxFONTENCODING_MACKHMER :
1973 enc = kCFStringEncodingMacKhmer ;
1974 break ;
1975 case wxFONTENCODING_MACTHAI :
1976 enc = kCFStringEncodingMacThai ;
1977 break ;
1978 case wxFONTENCODING_MACLAOTIAN :
1979 enc = kCFStringEncodingMacLaotian ;
1980 break ;
1981 case wxFONTENCODING_MACGEORGIAN :
1982 enc = kCFStringEncodingMacGeorgian ;
1983 break ;
1984 case wxFONTENCODING_MACARMENIAN :
1985 enc = kCFStringEncodingMacArmenian ;
1986 break ;
1987 case wxFONTENCODING_MACCHINESESIMP :
1988 enc = kCFStringEncodingMacChineseSimp ;
1989 break ;
1990 case wxFONTENCODING_MACTIBETAN :
1991 enc = kCFStringEncodingMacTibetan ;
1992 break ;
1993 case wxFONTENCODING_MACMONGOLIAN :
1994 enc = kCFStringEncodingMacMongolian ;
1995 break ;
1996 case wxFONTENCODING_MACETHIOPIC :
1997 enc = kCFStringEncodingMacEthiopic ;
1998 break ;
1999 case wxFONTENCODING_MACCENTRALEUR :
2000 enc = kCFStringEncodingMacCentralEurRoman ;
2001 break ;
2002 case wxFONTENCODING_MACVIATNAMESE :
2003 enc = kCFStringEncodingMacVietnamese ;
2004 break ;
2005 case wxFONTENCODING_MACARABICEXT :
2006 enc = kCFStringEncodingMacExtArabic ;
2007 break ;
2008 case wxFONTENCODING_MACSYMBOL :
2009 enc = kCFStringEncodingMacSymbol ;
2010 break ;
2011 case wxFONTENCODING_MACDINGBATS :
2012 enc = kCFStringEncodingMacDingbats ;
2013 break ;
2014 case wxFONTENCODING_MACTURKISH :
2015 enc = kCFStringEncodingMacTurkish ;
2016 break ;
2017 case wxFONTENCODING_MACCROATIAN :
2018 enc = kCFStringEncodingMacCroatian ;
2019 break ;
2020 case wxFONTENCODING_MACICELANDIC :
2021 enc = kCFStringEncodingMacIcelandic ;
2022 break ;
2023 case wxFONTENCODING_MACROMANIAN :
2024 enc = kCFStringEncodingMacRomanian ;
2025 break ;
2026 case wxFONTENCODING_MACCELTIC :
2027 enc = kCFStringEncodingMacCeltic ;
2028 break ;
2029 case wxFONTENCODING_MACGAELIC :
2030 enc = kCFStringEncodingMacGaelic ;
2031 break ;
2032 // case wxFONTENCODING_MACKEYBOARD :
2033 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2034 // break ;
2035 default :
2036 // because gcc is picky
2037 break ;
2038 } ;
2039 return enc ;
2040 }
2041
2042 class wxMBConv_cocoa : public wxMBConv
2043 {
2044 public:
2045 wxMBConv_cocoa()
2046 {
2047 Init(CFStringGetSystemEncoding()) ;
2048 }
2049
2050 #if wxUSE_FONTMAP
2051 wxMBConv_cocoa(const wxChar* name)
2052 {
2053 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2054 }
2055 #endif
2056
2057 wxMBConv_cocoa(wxFontEncoding encoding)
2058 {
2059 Init( wxCFStringEncFromFontEnc(encoding) );
2060 }
2061
2062 ~wxMBConv_cocoa()
2063 {
2064 }
2065
2066 void Init( CFStringEncoding encoding)
2067 {
2068 m_encoding = encoding ;
2069 }
2070
2071 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2072 {
2073 wxASSERT(szUnConv);
2074
2075 CFStringRef theString = CFStringCreateWithBytes (
2076 NULL, //the allocator
2077 (const UInt8*)szUnConv,
2078 strlen(szUnConv),
2079 m_encoding,
2080 false //no BOM/external representation
2081 );
2082
2083 wxASSERT(theString);
2084
2085 size_t nOutLength = CFStringGetLength(theString);
2086
2087 if (szOut == NULL)
2088 {
2089 CFRelease(theString);
2090 return nOutLength;
2091 }
2092
2093 CFRange theRange = { 0, nOutSize };
2094
2095 #if SIZEOF_WCHAR_T == 4
2096 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2097 #endif
2098
2099 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2100
2101 CFRelease(theString);
2102
2103 szUniCharBuffer[nOutLength] = '\0' ;
2104
2105 #if SIZEOF_WCHAR_T == 4
2106 wxMBConvUTF16 converter ;
2107 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2108 delete[] szUniCharBuffer;
2109 #endif
2110
2111 return nOutLength;
2112 }
2113
2114 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2115 {
2116 wxASSERT(szUnConv);
2117
2118 size_t nRealOutSize;
2119 size_t nBufSize = wxWcslen(szUnConv);
2120 UniChar* szUniBuffer = (UniChar*) szUnConv;
2121
2122 #if SIZEOF_WCHAR_T == 4
2123 wxMBConvUTF16BE converter ;
2124 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2125 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2126 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2127 nBufSize /= sizeof(UniChar);
2128 #endif
2129
2130 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2131 NULL, //allocator
2132 szUniBuffer,
2133 nBufSize,
2134 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2135 );
2136
2137 wxASSERT(theString);
2138
2139 //Note that CER puts a BOM when converting to unicode
2140 //so we check and use getchars instead in that case
2141 if (m_encoding == kCFStringEncodingUnicode)
2142 {
2143 if (szOut != NULL)
2144 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2145
2146 nRealOutSize = CFStringGetLength(theString) + 1;
2147 }
2148 else
2149 {
2150 CFStringGetBytes(
2151 theString,
2152 CFRangeMake(0, CFStringGetLength(theString)),
2153 m_encoding,
2154 0, //what to put in characters that can't be converted -
2155 //0 tells CFString to return NULL if it meets such a character
2156 false, //not an external representation
2157 (UInt8*) szOut,
2158 nOutSize,
2159 (CFIndex*) &nRealOutSize
2160 );
2161 }
2162
2163 CFRelease(theString);
2164
2165 #if SIZEOF_WCHAR_T == 4
2166 delete[] szUniBuffer;
2167 #endif
2168
2169 return nRealOutSize - 1;
2170 }
2171
2172 bool IsOk() const
2173 {
2174 return m_encoding != kCFStringEncodingInvalidId &&
2175 CFStringIsEncodingAvailable(m_encoding);
2176 }
2177
2178 private:
2179 CFStringEncoding m_encoding ;
2180 };
2181
2182 #endif // defined(__WXCOCOA__)
2183
2184 // ============================================================================
2185 // Mac conversion classes
2186 // ============================================================================
2187
2188 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2189
2190 class wxMBConv_mac : public wxMBConv
2191 {
2192 public:
2193 wxMBConv_mac()
2194 {
2195 Init(CFStringGetSystemEncoding()) ;
2196 }
2197
2198 #if wxUSE_FONTMAP
2199 wxMBConv_mac(const wxChar* name)
2200 {
2201 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2202 }
2203 #endif
2204
2205 wxMBConv_mac(wxFontEncoding encoding)
2206 {
2207 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2208 }
2209
2210 ~wxMBConv_mac()
2211 {
2212 OSStatus status = noErr ;
2213 status = TECDisposeConverter(m_MB2WC_converter);
2214 status = TECDisposeConverter(m_WC2MB_converter);
2215 }
2216
2217
2218 void Init( TextEncodingBase encoding)
2219 {
2220 OSStatus status = noErr ;
2221 m_char_encoding = encoding ;
2222 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2223
2224 status = TECCreateConverter(&m_MB2WC_converter,
2225 m_char_encoding,
2226 m_unicode_encoding);
2227 status = TECCreateConverter(&m_WC2MB_converter,
2228 m_unicode_encoding,
2229 m_char_encoding);
2230 }
2231
2232 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2233 {
2234 OSStatus status = noErr ;
2235 ByteCount byteOutLen ;
2236 ByteCount byteInLen = strlen(psz) ;
2237 wchar_t *tbuf = NULL ;
2238 UniChar* ubuf = NULL ;
2239 size_t res = 0 ;
2240
2241 if (buf == NULL)
2242 {
2243 //apple specs say at least 32
2244 n = wxMax( 32 , byteInLen ) ;
2245 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2246 }
2247 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2248 #if SIZEOF_WCHAR_T == 4
2249 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2250 #else
2251 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2252 #endif
2253 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2254 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2255 #if SIZEOF_WCHAR_T == 4
2256 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2257 // is not properly terminated we get random characters at the end
2258 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2259 wxMBConvUTF16BE converter ;
2260 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2261 free( ubuf ) ;
2262 #else
2263 res = byteOutLen / sizeof( UniChar ) ;
2264 #endif
2265 if ( buf == NULL )
2266 free(tbuf) ;
2267
2268 if ( buf && res < n)
2269 buf[res] = 0;
2270
2271 return res ;
2272 }
2273
2274 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2275 {
2276 OSStatus status = noErr ;
2277 ByteCount byteOutLen ;
2278 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2279
2280 char *tbuf = NULL ;
2281
2282 if (buf == NULL)
2283 {
2284 //apple specs say at least 32
2285 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2286 tbuf = (char*) malloc( n ) ;
2287 }
2288
2289 ByteCount byteBufferLen = n ;
2290 UniChar* ubuf = NULL ;
2291 #if SIZEOF_WCHAR_T == 4
2292 wxMBConvUTF16BE converter ;
2293 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2294 byteInLen = unicharlen ;
2295 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2296 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2297 #else
2298 ubuf = (UniChar*) psz ;
2299 #endif
2300 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2301 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2302 #if SIZEOF_WCHAR_T == 4
2303 free( ubuf ) ;
2304 #endif
2305 if ( buf == NULL )
2306 free(tbuf) ;
2307
2308 size_t res = byteOutLen ;
2309 if ( buf && res < n)
2310 {
2311 buf[res] = 0;
2312
2313 //we need to double-trip to verify it didn't insert any ? in place
2314 //of bogus characters
2315 wxWCharBuffer wcBuf(n);
2316 size_t pszlen = wxWcslen(psz);
2317 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2318 wxWcslen(wcBuf) != pszlen ||
2319 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2320 {
2321 // we didn't obtain the same thing we started from, hence
2322 // the conversion was lossy and we consider that it failed
2323 return (size_t)-1;
2324 }
2325 }
2326
2327 return res ;
2328 }
2329
2330 bool IsOk() const
2331 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2332
2333 private:
2334 TECObjectRef m_MB2WC_converter ;
2335 TECObjectRef m_WC2MB_converter ;
2336
2337 TextEncodingBase m_char_encoding ;
2338 TextEncodingBase m_unicode_encoding ;
2339 };
2340
2341 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2342
2343 // ============================================================================
2344 // wxEncodingConverter based conversion classes
2345 // ============================================================================
2346
2347 #if wxUSE_FONTMAP
2348
2349 class wxMBConv_wxwin : public wxMBConv
2350 {
2351 private:
2352 void Init()
2353 {
2354 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2355 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2356 }
2357
2358 public:
2359 // temporarily just use wxEncodingConverter stuff,
2360 // so that it works while a better implementation is built
2361 wxMBConv_wxwin(const wxChar* name)
2362 {
2363 if (name)
2364 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2365 else
2366 m_enc = wxFONTENCODING_SYSTEM;
2367
2368 Init();
2369 }
2370
2371 wxMBConv_wxwin(wxFontEncoding enc)
2372 {
2373 m_enc = enc;
2374
2375 Init();
2376 }
2377
2378 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2379 {
2380 size_t inbuf = strlen(psz);
2381 if (buf)
2382 {
2383 if (!m2w.Convert(psz,buf))
2384 return (size_t)-1;
2385 }
2386 return inbuf;
2387 }
2388
2389 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2390 {
2391 const size_t inbuf = wxWcslen(psz);
2392 if (buf)
2393 {
2394 if (!w2m.Convert(psz,buf))
2395 return (size_t)-1;
2396 }
2397
2398 return inbuf;
2399 }
2400
2401 bool IsOk() const { return m_ok; }
2402
2403 public:
2404 wxFontEncoding m_enc;
2405 wxEncodingConverter m2w, w2m;
2406
2407 // were we initialized successfully?
2408 bool m_ok;
2409
2410 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2411 };
2412
2413 #endif // wxUSE_FONTMAP
2414
2415 // ============================================================================
2416 // wxCSConv implementation
2417 // ============================================================================
2418
2419 void wxCSConv::Init()
2420 {
2421 m_name = NULL;
2422 m_convReal = NULL;
2423 m_deferred = true;
2424 }
2425
2426 wxCSConv::wxCSConv(const wxChar *charset)
2427 {
2428 Init();
2429
2430 if ( charset )
2431 {
2432 SetName(charset);
2433 }
2434
2435 m_encoding = wxFONTENCODING_SYSTEM;
2436 }
2437
2438 wxCSConv::wxCSConv(wxFontEncoding encoding)
2439 {
2440 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2441 {
2442 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2443
2444 encoding = wxFONTENCODING_SYSTEM;
2445 }
2446
2447 Init();
2448
2449 m_encoding = encoding;
2450 }
2451
2452 wxCSConv::~wxCSConv()
2453 {
2454 Clear();
2455 }
2456
2457 wxCSConv::wxCSConv(const wxCSConv& conv)
2458 : wxMBConv()
2459 {
2460 Init();
2461
2462 SetName(conv.m_name);
2463 m_encoding = conv.m_encoding;
2464 }
2465
2466 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2467 {
2468 Clear();
2469
2470 SetName(conv.m_name);
2471 m_encoding = conv.m_encoding;
2472
2473 return *this;
2474 }
2475
2476 void wxCSConv::Clear()
2477 {
2478 free(m_name);
2479 delete m_convReal;
2480
2481 m_name = NULL;
2482 m_convReal = NULL;
2483 }
2484
2485 void wxCSConv::SetName(const wxChar *charset)
2486 {
2487 if (charset)
2488 {
2489 m_name = wxStrdup(charset);
2490 m_deferred = true;
2491 }
2492 }
2493
2494 wxMBConv *wxCSConv::DoCreate() const
2495 {
2496 // check for the special case of ASCII or ISO8859-1 charset: as we have
2497 // special knowledge of it anyhow, we don't need to create a special
2498 // conversion object
2499 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2500 {
2501 // don't convert at all
2502 return NULL;
2503 }
2504
2505 // we trust OS to do conversion better than we can so try external
2506 // conversion methods first
2507 //
2508 // the full order is:
2509 // 1. OS conversion (iconv() under Unix or Win32 API)
2510 // 2. hard coded conversions for UTF
2511 // 3. wxEncodingConverter as fall back
2512
2513 // step (1)
2514 #ifdef HAVE_ICONV
2515 #if !wxUSE_FONTMAP
2516 if ( m_name )
2517 #endif // !wxUSE_FONTMAP
2518 {
2519 wxString name(m_name);
2520
2521 #if wxUSE_FONTMAP
2522 if ( name.empty() )
2523 name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2524 #endif // wxUSE_FONTMAP
2525
2526 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2527 if ( conv->IsOk() )
2528 return conv;
2529
2530 delete conv;
2531 }
2532 #endif // HAVE_ICONV
2533
2534 #ifdef wxHAVE_WIN32_MB2WC
2535 {
2536 #if wxUSE_FONTMAP
2537 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2538 : new wxMBConv_win32(m_encoding);
2539 if ( conv->IsOk() )
2540 return conv;
2541
2542 delete conv;
2543 #else
2544 return NULL;
2545 #endif
2546 }
2547 #endif // wxHAVE_WIN32_MB2WC
2548 #if defined(__WXMAC__)
2549 {
2550 // leave UTF16 and UTF32 to the built-ins of wx
2551 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2552 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2553 {
2554
2555 #if wxUSE_FONTMAP
2556 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2557 : new wxMBConv_mac(m_encoding);
2558 #else
2559 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2560 #endif
2561 if ( conv->IsOk() )
2562 return conv;
2563
2564 delete conv;
2565 }
2566 }
2567 #endif
2568 #if defined(__WXCOCOA__)
2569 {
2570 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2571 {
2572
2573 #if wxUSE_FONTMAP
2574 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2575 : new wxMBConv_cocoa(m_encoding);
2576 #else
2577 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2578 #endif
2579 if ( conv->IsOk() )
2580 return conv;
2581
2582 delete conv;
2583 }
2584 }
2585 #endif
2586 // step (2)
2587 wxFontEncoding enc = m_encoding;
2588 #if wxUSE_FONTMAP
2589 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2590 {
2591 // use "false" to suppress interactive dialogs -- we can be called from
2592 // anywhere and popping up a dialog from here is the last thing we want to
2593 // do
2594 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2595 }
2596 #endif // wxUSE_FONTMAP
2597
2598 switch ( enc )
2599 {
2600 case wxFONTENCODING_UTF7:
2601 return new wxMBConvUTF7;
2602
2603 case wxFONTENCODING_UTF8:
2604 return new wxMBConvUTF8;
2605
2606 case wxFONTENCODING_UTF16BE:
2607 return new wxMBConvUTF16BE;
2608
2609 case wxFONTENCODING_UTF16LE:
2610 return new wxMBConvUTF16LE;
2611
2612 case wxFONTENCODING_UTF32BE:
2613 return new wxMBConvUTF32BE;
2614
2615 case wxFONTENCODING_UTF32LE:
2616 return new wxMBConvUTF32LE;
2617
2618 default:
2619 // nothing to do but put here to suppress gcc warnings
2620 ;
2621 }
2622
2623 // step (3)
2624 #if wxUSE_FONTMAP
2625 {
2626 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2627 : new wxMBConv_wxwin(m_encoding);
2628 if ( conv->IsOk() )
2629 return conv;
2630
2631 delete conv;
2632 }
2633 #endif // wxUSE_FONTMAP
2634
2635 // NB: This is a hack to prevent deadlock. What could otherwise happen
2636 // in Unicode build: wxConvLocal creation ends up being here
2637 // because of some failure and logs the error. But wxLog will try to
2638 // attach timestamp, for which it will need wxConvLocal (to convert
2639 // time to char* and then wchar_t*), but that fails, tries to log
2640 // error, but wxLog has a (already locked) critical section that
2641 // guards static buffer.
2642 static bool alreadyLoggingError = false;
2643 if (!alreadyLoggingError)
2644 {
2645 alreadyLoggingError = true;
2646 wxLogError(_("Cannot convert from the charset '%s'!"),
2647 m_name ? m_name
2648 :
2649 #if wxUSE_FONTMAP
2650 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2651 #else // !wxUSE_FONTMAP
2652 wxString::Format(_("encoding %s"), m_encoding).c_str()
2653 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2654 );
2655 alreadyLoggingError = false;
2656 }
2657
2658 return NULL;
2659 }
2660
2661 void wxCSConv::CreateConvIfNeeded() const
2662 {
2663 if ( m_deferred )
2664 {
2665 wxCSConv *self = (wxCSConv *)this; // const_cast
2666
2667 #if wxUSE_INTL
2668 // if we don't have neither the name nor the encoding, use the default
2669 // encoding for this system
2670 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2671 {
2672 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2673 }
2674 #endif // wxUSE_INTL
2675
2676 self->m_convReal = DoCreate();
2677 self->m_deferred = false;
2678 }
2679 }
2680
2681 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2682 {
2683 CreateConvIfNeeded();
2684
2685 if (m_convReal)
2686 return m_convReal->MB2WC(buf, psz, n);
2687
2688 // latin-1 (direct)
2689 size_t len = strlen(psz);
2690
2691 if (buf)
2692 {
2693 for (size_t c = 0; c <= len; c++)
2694 buf[c] = (unsigned char)(psz[c]);
2695 }
2696
2697 return len;
2698 }
2699
2700 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2701 {
2702 CreateConvIfNeeded();
2703
2704 if (m_convReal)
2705 return m_convReal->WC2MB(buf, psz, n);
2706
2707 // latin-1 (direct)
2708 const size_t len = wxWcslen(psz);
2709 if (buf)
2710 {
2711 for (size_t c = 0; c <= len; c++)
2712 {
2713 if (psz[c] > 0xFF)
2714 return (size_t)-1;
2715 buf[c] = (char)psz[c];
2716 }
2717 }
2718 else
2719 {
2720 for (size_t c = 0; c <= len; c++)
2721 {
2722 if (psz[c] > 0xFF)
2723 return (size_t)-1;
2724 }
2725 }
2726
2727 return len;
2728 }
2729
2730 // ----------------------------------------------------------------------------
2731 // globals
2732 // ----------------------------------------------------------------------------
2733
2734 #ifdef __WINDOWS__
2735 static wxMBConv_win32 wxConvLibcObj;
2736 #elif defined(__WXMAC__) && !defined(__MACH__)
2737 static wxMBConv_mac wxConvLibcObj ;
2738 #else
2739 static wxMBConvLibc wxConvLibcObj;
2740 #endif
2741
2742 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2743 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2744 static wxMBConvUTF7 wxConvUTF7Obj;
2745 static wxMBConvUTF8 wxConvUTF8Obj;
2746
2747 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2748 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2749 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2750 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2751 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2752 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2753 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2754 #ifdef __WXOSX__
2755 wxConvUTF8Obj;
2756 #else
2757 wxConvLibcObj;
2758 #endif
2759
2760
2761 #else // !wxUSE_WCHAR_T
2762
2763 // stand-ins in absence of wchar_t
2764 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2765 wxConvISO8859_1,
2766 wxConvLocal,
2767 wxConvUTF8;
2768
2769 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2770
2771