]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
use popen() instead of wxExecute(), it works inside wxYield() unlike the latter
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58 #ifdef HAVE_LANGINFO_H
59 #include <langinfo.h>
60 #endif
61
62 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
63 #define wxHAVE_WIN32_MB2WC
64 #endif // __WIN32__ but !__WXMICROWIN__
65
66 // ----------------------------------------------------------------------------
67 // headers
68 // ----------------------------------------------------------------------------
69
70 #ifdef __SALFORDC__
71 #include <clib.h>
72 #endif
73
74 #ifdef HAVE_ICONV
75 #include <iconv.h>
76 #include "wx/thread.h"
77 #endif
78
79 #include "wx/encconv.h"
80 #include "wx/fontmap.h"
81 #include "wx/utils.h"
82
83 #ifdef __WXMAC__
84 #include <ATSUnicode.h>
85 #include <TextCommon.h>
86 #include <TextEncodingConverter.h>
87
88 #include "wx/mac/private.h" // includes mac headers
89 #endif
90 // ----------------------------------------------------------------------------
91 // macros
92 // ----------------------------------------------------------------------------
93
// Byte-swap, in place, each of the first len elements of str (32 bit and
// 16 bit elements respectively). Both expand to a braced block declaring a
// local counter named _c, so str and len must not be expressions using _c.
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }

// Select, from the size of wchar_t:
//  - WC_NAME:      charset name corresponding to wchar_t without byte order
//  - WC_NAME_BEST: same, but with the byte order of this machine spelled out
//  - WC_BSWAP:     the matching byte swapping macro defined above
//  - WC_UTF16:     defined only if wchar_t is 16 bit wide, i.e. wide strings
//                  hold UTF-16 code units and may contain surrogate pairs
#if SIZEOF_WCHAR_T == 4
    #define WC_NAME "UCS4"
    #define WC_BSWAP BSWAP_UCS4
    #ifdef WORDS_BIGENDIAN
        #define WC_NAME_BEST "UCS-4BE"
    #else
        #define WC_NAME_BEST "UCS-4LE"
    #endif
#elif SIZEOF_WCHAR_T == 2
    #define WC_NAME "UTF16"
    #define WC_BSWAP BSWAP_UTF16
    #define WC_UTF16
    #ifdef WORDS_BIGENDIAN
        #define WC_NAME_BEST "UTF-16BE"
    #else
        #define WC_NAME_BEST "UTF-16LE"
    #endif
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
118
119 // ============================================================================
120 // implementation
121 // ============================================================================
122
123 // ----------------------------------------------------------------------------
124 // UTF-16 en/decoding to/from UCS-4
125 // ----------------------------------------------------------------------------
126
127
128 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
129 {
130 if (input<=0xffff)
131 {
132 if (output)
133 *output = (wxUint16) input;
134 return 1;
135 }
136 else if (input>=0x110000)
137 {
138 return (size_t)-1;
139 }
140 else
141 {
142 if (output)
143 {
144 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
145 *output = (wxUint16) ((input&0x3ff)+0xdc00);
146 }
147 return 2;
148 }
149 }
150
151 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
152 {
153 if ((*input<0xd800) || (*input>0xdfff))
154 {
155 output = *input;
156 return 1;
157 }
158 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
159 {
160 output = *input;
161 return (size_t)-1;
162 }
163 else
164 {
165 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
166 return 2;
167 }
168 }
169
170
171 // ----------------------------------------------------------------------------
172 // wxMBConv
173 // ----------------------------------------------------------------------------
174
// The destructor has no work to do, it is merely defined out of line.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
179
180 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
181 {
182 if ( psz )
183 {
184 // calculate the length of the buffer needed first
185 size_t nLen = MB2WC(NULL, psz, 0);
186 if ( nLen != (size_t)-1 )
187 {
188 // now do the actual conversion
189 wxWCharBuffer buf(nLen);
190 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
191 if ( nLen != (size_t)-1 )
192 {
193 return buf;
194 }
195 }
196 }
197
198 wxWCharBuffer buf((wchar_t *)NULL);
199
200 return buf;
201 }
202
203 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
204 {
205 if ( pwz )
206 {
207 size_t nLen = WC2MB(NULL, pwz, 0);
208 if ( nLen != (size_t)-1 )
209 {
210 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
211 nLen = WC2MB(buf.data(), pwz, nLen + 4);
212 if ( nLen != (size_t)-1 )
213 {
214 return buf;
215 }
216 }
217 }
218
219 wxCharBuffer buf((char *)NULL);
220
221 return buf;
222 }
223
224 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
225 {
226 wxASSERT(pOutSize != NULL);
227
228 const char* szEnd = szString + nStringLen + 1;
229 const char* szPos = szString;
230 const char* szStart = szPos;
231
232 size_t nActualLength = 0;
233 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
234
235 wxWCharBuffer theBuffer(nCurrentSize);
236
237 //Convert the string until the length() is reached, continuing the
238 //loop every time a null character is reached
239 while(szPos != szEnd)
240 {
241 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
242
243 //Get the length of the current (sub)string
244 size_t nLen = MB2WC(NULL, szPos, 0);
245
246 //Invalid conversion?
247 if( nLen == (size_t)-1 )
248 {
249 *pOutSize = 0;
250 theBuffer.data()[0u] = wxT('\0');
251 return theBuffer;
252 }
253
254
255 //Increase the actual length (+1 for current null character)
256 nActualLength += nLen + 1;
257
258 //if buffer too big, realloc the buffer
259 if (nActualLength > (nCurrentSize+1))
260 {
261 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
262 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
263 theBuffer = theNewBuffer;
264 nCurrentSize <<= 1;
265 }
266
267 //Convert the current (sub)string
268 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
269 {
270 *pOutSize = 0;
271 theBuffer.data()[0u] = wxT('\0');
272 return theBuffer;
273 }
274
275 //Increment to next (sub)string
276 //Note that we have to use strlen here instead of nLen
277 //here because XX2XX gives us the size of the output buffer,
278 //not neccessarly the length of the string
279 szPos += strlen(szPos) + 1;
280 }
281
282 //success - return actual length and the buffer
283 *pOutSize = nActualLength;
284 return theBuffer;
285 }
286
287 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
288 {
289 wxASSERT(pOutSize != NULL);
290
291 const wchar_t* szEnd = szString + nStringLen + 1;
292 const wchar_t* szPos = szString;
293 const wchar_t* szStart = szPos;
294
295 size_t nActualLength = 0;
296 size_t nCurrentSize = nStringLen << 2; //try * 4 first
297
298 wxCharBuffer theBuffer(nCurrentSize);
299
300 //Convert the string until the length() is reached, continuing the
301 //loop every time a null character is reached
302 while(szPos != szEnd)
303 {
304 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
305
306 //Get the length of the current (sub)string
307 size_t nLen = WC2MB(NULL, szPos, 0);
308
309 //Invalid conversion?
310 if( nLen == (size_t)-1 )
311 {
312 *pOutSize = 0;
313 theBuffer.data()[0u] = wxT('\0');
314 return theBuffer;
315 }
316
317 //Increase the actual length (+1 for current null character)
318 nActualLength += nLen + 1;
319
320 //if buffer too big, realloc the buffer
321 if (nActualLength > (nCurrentSize+1))
322 {
323 wxCharBuffer theNewBuffer(nCurrentSize << 1);
324 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
325 theBuffer = theNewBuffer;
326 nCurrentSize <<= 1;
327 }
328
329 //Convert the current (sub)string
330 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
331 {
332 *pOutSize = 0;
333 theBuffer.data()[0u] = wxT('\0');
334 return theBuffer;
335 }
336
337 //Increment to next (sub)string
338 //Note that we have to use wxWcslen here instead of nLen
339 //here because XX2XX gives us the size of the output buffer,
340 //not neccessarly the length of the string
341 szPos += wxWcslen(szPos) + 1;
342 }
343
344 //success - return actual length and the buffer
345 *pOutSize = nActualLength;
346 return theBuffer;
347 }
348
349 // ----------------------------------------------------------------------------
350 // wxMBConvLibc
351 // ----------------------------------------------------------------------------
352
// Multibyte to wide char conversion: simply forwards all its arguments to
// wxMB2WC() and returns its result unchanged.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
357
// Wide char to multibyte conversion: simply forwards all its arguments to
// wxWC2MB() and returns its result unchanged.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
362
363 // ----------------------------------------------------------------------------
364 // wxConvBrokenFileNames is made for GTK2 in Unicode mode when
365 // files are accidentally written in an encoding which is not
366 // the system encoding. Typically, the system encoding will be
367 // UTF8 but there might be files stored in ISO8859-1 on disk.
368 // ----------------------------------------------------------------------------
369
// Conversion which uses either its own UTF-8 converter or the wxMBConvLibc
// base class implementation, the choice being made by UseUTF8() each time
// one of the conversion functions is called.
class wxConvBrokenFileNames: public wxMBConvLibc
{
public:
    // the UTF-8 converter is set up to map invalid input to octal escapes
    wxConvBrokenFileNames() : m_utf8conv(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL) { }

    // both dispatch to m_utf8conv if UseUTF8() is true and to wxMBConvLibc
    // otherwise
    virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
    virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;

    // true if nl_langinfo(CODESET) is available and returns "UTF-8"
    inline bool UseUTF8() const;

private:
    // used for the conversions when UseUTF8() is true
    wxMBConvUTF8 m_utf8conv;
};
380
381 bool wxConvBrokenFileNames::UseUTF8() const
382 {
383 #if defined HAVE_LANGINFO_H && defined CODESET
384 char *codeset = nl_langinfo(CODESET);
385 return strcmp(codeset, "UTF-8") == 0;
386 #else
387 return false;
388 #endif
389 }
390
391 size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const
392 {
393 if (UseUTF8())
394 return m_utf8conv.MB2WC( outputBuf, psz, outputSize );
395 else
396 return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize );
397 }
398
399 size_t wxConvBrokenFileNames::WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const
400 {
401 if (UseUTF8())
402 return m_utf8conv.WC2MB( outputBuf, psz, outputSize );
403 else
404 return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize );
405 }
406
407 // ----------------------------------------------------------------------------
408 // UTF-7
409 // ----------------------------------------------------------------------------
410
411 // Implementation (C) 2004 Fredrik Roubert
412
//
// BASE64 decoding table
//
// Indexed by the value of an input byte (0..255). The entry is the 6 bit
// value (0x00..0x3f) of the byte seen as a BASE64 digit, i.e. 'A'..'Z' map
// to 0x00..0x19, 'a'..'z' to 0x1a..0x33, '0'..'9' to 0x34..0x3d, '+' to 0x3e
// and '/' to 0x3f. All the other bytes map to 0xff which wxMBConvUTF7::MB2WC()
// takes as the end of the encoded run.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
451
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is NULL nothing is stored and the whole input is scanned, so that
// the return value is the number of wide characters needed (without the
// trailing NUL). Otherwise the outer loop stops once n characters have been
// produced and a trailing NUL is stored only if there is still room for it.
//
// Returns the number of wide characters produced. This function never
// returns (size_t)-1: bytes which are not BASE64 digits simply end the
// encoded run they appear in.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else
        {
            // BASE64 encoded string
            //
            // d accumulates the decoded bits, l is the number of bits in it
            // which were not consumed yet and lsb tells whether the next
            // byte extracted is the low (true) or the high (false) half of
            // a 16 bit unit
            //
            // NOTE(review): len is not compared with n inside this loop, so
            // a long encoded run could produce more than n characters --
            // to be confirmed against what the callers pass
            bool lsb;
            unsigned char c;
            unsigned int d, l;
            for (lsb = false, d = 0, l = 0;
                (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
            {
                // each BASE64 digit contributes 6 more bits
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the topmost 8 unconsumed bits
                    c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: completes the unit started below
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                        // high byte: starts a new unit but doesn't advance
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                }
            }
            // the '-' ending the encoded run, if present, is not output
            if (*psz == '-')
                psz++;
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
507
//
// BASE64 encoding table
//
// Maps a 6 bit value (0..63) to the character used as the corresponding
// BASE64 digit.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
522
//
// UTF-7 encoding table
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The table is indexed by the character code and so must only be used for
// characters below 0x80. wxMBConvUTF7::WC2MB() copies only the characters
// of class 0 to the output as is, all the others are BASE64 encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
542
// Encode the NUL-terminated wide string psz as UTF-7.
//
// If buf is NULL nothing is stored and the whole input is scanned, so that
// the return value is the number of bytes needed (without the trailing
// NUL). Otherwise the outer loop stops once n bytes have been produced and
// a trailing NUL is stored only if there is still room for it.
//
// Returns the number of bytes produced or, in the builds where wchar_t is
// not UTF-16, (size_t)-1 if a run of encoded characters starts with a
// character above 0xffff.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{


    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        // NOTE(review): if wchar_t is a signed type, a negative cc passes
        // the "< 0x80" test and is then used as the table index -- confirm
        // that such values can't occur in the input
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;
            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return (size_t)-1;
        }
#endif
        else
        {
            // an encoded run starts with '+' and ends with '-'; a '+' in
            // the input becomes just "+-"
            if (buf)
                *buf++ = '+';
            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in it which were not output yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0;; psz++)
                {
                    // feed the high and then the low byte of cc into d,
                    // emitting a BASE64 digit for every 6 bits available
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }
                    // peek at the next character: the run ends at the end
                    // of the string or at a directly encoded character,
                    // neither of which is consumed here
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }
                if (l != 0)
                {
                    // output the remaining bits as one last digit, padded
                    // with zero bits on the right
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
                    len++;
                }
            }
            if (buf)
                *buf++ = '-';
            len++;
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
610
611 // ----------------------------------------------------------------------------
612 // UTF-8
613 // ----------------------------------------------------------------------------
614
// utf8_max[i] is the biggest value which can be encoded using i + 1 bytes.
// The last element is not exceeded by any 32 bit value and so ends the
// searches in this table.
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area to which we (temporarily) remap the
// bytes which are invalid in a UTF-8 encoded string: there is one code
// point for each of the 256 possible byte values
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
622
// Decode the NUL-terminated UTF-8 string psz.
//
// If buf is NULL nothing is stored and the whole input is scanned, so that
// the return value is the number of wide characters needed (without the
// trailing NUL). Otherwise the outer loop stops once n characters have been
// produced and a trailing NUL is stored only if there is still room for it.
//
// What happens with the invalid input depends on m_options: each byte of
// the invalid sequence is either mapped to wxUnicodePUA + the byte value
// (MAP_INVALID_UTF8_TO_PUA) or replaced with a backslash followed by 3 octal
// digits (MAP_INVALID_UTF8_TO_OCTAL) or, if neither is set, the conversion
// fails.
//
// Returns the number of wide characters produced or (size_t)-1 on failure.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // start of the current sequence, needed if it turns out to be bad
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;
        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;
        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else
        {
            // from now on cnt is the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                // (a continuation byte where a lead byte is expected)
                invalid = true;
            }
            else
            {
                // index of the utf8_max entry for a sequence one byte shorter
                unsigned ocnt = cnt - 1;
                // payload bits of the lead byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        // (the offending byte is left for the next iteration)
                        invalid = true;
                        break;
                    }
                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    // (the value would have fit into a shorter sequence)
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == (size_t)-1)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }
            if (invalid)
            {
                // deal with the bytes from opsz up to (excluding) psz one
                // by one
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != (size_t)-1);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = wxUnicodePUA + (unsigned char)*opsz;
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is stored only if all 4 characters fit
                        // but is always counted in the length
                        if ( buf && len + 3 < n )
                        {
                            // NB: this n is the byte being escaped, it hides
                            //     the function parameter of the same name
                            unsigned char n = *opsz;
                            *buf++ = L'\\';
                            *buf++ = L'0' + n / 0100;
                            *buf++ = L'0' + (n % 0100) / 010;
                            *buf++ = L'0' + n % 010;
                        }
                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return (size_t)-1;
                }
            }
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
750
// Return true if and only if wch is one of the octal digits '0' to '7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
755
// Encode the NUL-terminated wide string psz as UTF-8.
//
// If buf is NULL nothing is stored and the whole input is scanned, so that
// the return value is the number of bytes needed (without the trailing
// NUL). Otherwise the loop stops once n bytes have been produced and a
// trailing NUL is stored only if there is still room for it.
//
// The mappings done by MB2WC() for the invalid input are undone here if the
// corresponding bit is set in m_options: a character in the range
// wxUnicodePUA..wxUnicodePUAEnd becomes a single byte and so does a
// backslash followed by 3 octal digits.
//
// Returns the number of bytes produced.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;
#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
        // a unit which can't be decoded is skipped on its own, cc is set to
        // its value by decode_utf16() in this case
        psz += (pa == (size_t)-1) ? 1 : pa;
#else
        cc=(*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // restore the original byte from its octal escape; the tests
            // above stop at the first non digit and so don't read beyond
            // the terminating NUL
            if (buf)
            {
                *buf++ = (char) (psz[0] - L'0')*0100 +
                                (psz[1] - L'0')*010 +
                                (psz[2] - L'0');
            }

            psz += 3;
            len++;
        }
        else
        {
            // cnt is the number of continuation bytes needed
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }

            else
            {
                // NOTE(review): the cnt + 1 bytes are written without
                // comparing them with the space remaining in buf -- to be
                // confirmed against what the callers pass as n
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte starts with cnt + 1 one bits (this
                    // relies on >> of a negative value shifting in ones)
                    // followed by the topmost payload bits
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
                    // each continuation byte is 10xxxxxx with the next 6
                    // payload bits
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len<n))
        *buf = 0;

    return len;
}
822
823 // ----------------------------------------------------------------------------
824 // UTF-16
825 // ----------------------------------------------------------------------------
826
// The functions below are written in terms of "straight" (the byte order of
// the data is the same as the one of this machine) and "swap" (it is the
// opposite one) conversions, map these names to the real classes here.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
834
835
836 #ifdef WC_UTF16
837
838 // copy 16bit MB to 16bit String
839 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
840 {
841 size_t len=0;
842
843 while (*(wxUint16*)psz && (!buf || len < n))
844 {
845 if (buf)
846 *buf++ = *(wxUint16*)psz;
847 len++;
848
849 psz += sizeof(wxUint16);
850 }
851 if (buf && len<n) *buf=0;
852
853 return len;
854 }
855
856
857 // copy 16bit String to 16bit MB
858 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
859 {
860 size_t len=0;
861
862 while (*psz && (!buf || len < n))
863 {
864 if (buf)
865 {
866 *(wxUint16*)buf = *psz;
867 buf += sizeof(wxUint16);
868 }
869 len += sizeof(wxUint16);
870 psz++;
871 }
872 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
873
874 return len;
875 }
876
877
878 // swap 16bit MB to 16bit String
879 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
880 {
881 size_t len=0;
882
883 while (*(wxUint16*)psz && (!buf || len < n))
884 {
885 if (buf)
886 {
887 ((char *)buf)[0] = psz[1];
888 ((char *)buf)[1] = psz[0];
889 buf++;
890 }
891 len++;
892 psz += sizeof(wxUint16);
893 }
894 if (buf && len<n) *buf=0;
895
896 return len;
897 }
898
899
900 // swap 16bit MB to 16bit String
901 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
902 {
903 size_t len=0;
904
905 while (*psz && (!buf || len < n))
906 {
907 if (buf)
908 {
909 *buf++ = ((char*)psz)[1];
910 *buf++ = ((char*)psz)[0];
911 }
912 len += sizeof(wxUint16);
913 psz++;
914 }
915 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
916
917 return len;
918 }
919
920
921 #else // WC_UTF16
922
923
924 // copy 16bit MB to 32bit String
925 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
926 {
927 size_t len=0;
928
929 while (*(wxUint16*)psz && (!buf || len < n))
930 {
931 wxUint32 cc;
932 size_t pa=decode_utf16((wxUint16*)psz, cc);
933 if (pa == (size_t)-1)
934 return pa;
935
936 if (buf)
937 *buf++ = cc;
938 len++;
939 psz += pa * sizeof(wxUint16);
940 }
941 if (buf && len<n) *buf=0;
942
943 return len;
944 }
945
946
947 // copy 32bit String to 16bit MB
948 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
949 {
950 size_t len=0;
951
952 while (*psz && (!buf || len < n))
953 {
954 wxUint16 cc[2];
955 size_t pa=encode_utf16(*psz, cc);
956
957 if (pa == (size_t)-1)
958 return pa;
959
960 if (buf)
961 {
962 *(wxUint16*)buf = cc[0];
963 buf += sizeof(wxUint16);
964 if (pa > 1)
965 {
966 *(wxUint16*)buf = cc[1];
967 buf += sizeof(wxUint16);
968 }
969 }
970
971 len += pa*sizeof(wxUint16);
972 psz++;
973 }
974 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
975
976 return len;
977 }
978
979
980 // swap 16bit MB to 32bit String
981 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
982 {
983 size_t len=0;
984
985 while (*(wxUint16*)psz && (!buf || len < n))
986 {
987 wxUint32 cc;
988 char tmp[4];
989 tmp[0]=psz[1]; tmp[1]=psz[0];
990 tmp[2]=psz[3]; tmp[3]=psz[2];
991
992 size_t pa=decode_utf16((wxUint16*)tmp, cc);
993 if (pa == (size_t)-1)
994 return pa;
995
996 if (buf)
997 *buf++ = cc;
998
999 len++;
1000 psz += pa * sizeof(wxUint16);
1001 }
1002 if (buf && len<n) *buf=0;
1003
1004 return len;
1005 }
1006
1007
1008 // swap 32bit String to 16bit MB
1009 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1010 {
1011 size_t len=0;
1012
1013 while (*psz && (!buf || len < n))
1014 {
1015 wxUint16 cc[2];
1016 size_t pa=encode_utf16(*psz, cc);
1017
1018 if (pa == (size_t)-1)
1019 return pa;
1020
1021 if (buf)
1022 {
1023 *buf++ = ((char*)cc)[1];
1024 *buf++ = ((char*)cc)[0];
1025 if (pa > 1)
1026 {
1027 *buf++ = ((char*)cc)[3];
1028 *buf++ = ((char*)cc)[2];
1029 }
1030 }
1031
1032 len += pa*sizeof(wxUint16);
1033 psz++;
1034 }
1035 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1036
1037 return len;
1038 }
1039
1040 #endif // WC_UTF16
1041
1042
1043 // ----------------------------------------------------------------------------
1044 // UTF-32
1045 // ----------------------------------------------------------------------------
1046
// As for UTF-16 above, the functions below are written in terms of
// "straight" (same byte order as this machine) and "swap" (the opposite
// one) conversions, map these names to the real classes here.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap wxMBConvUTF32BE
    #define wxMBConvUTF32straight wxMBConvUTF32LE
#endif


// the global UTF-32 converter objects, one for each byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1058
1059
1060 #ifdef WC_UTF16
1061
1062 // copy 32bit MB to 16bit String
1063 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1064 {
1065 size_t len=0;
1066
1067 while (*(wxUint32*)psz && (!buf || len < n))
1068 {
1069 wxUint16 cc[2];
1070
1071 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1072 if (pa == (size_t)-1)
1073 return pa;
1074
1075 if (buf)
1076 {
1077 *buf++ = cc[0];
1078 if (pa > 1)
1079 *buf++ = cc[1];
1080 }
1081 len += pa;
1082 psz += sizeof(wxUint32);
1083 }
1084 if (buf && len<n) *buf=0;
1085
1086 return len;
1087 }
1088
1089
1090 // copy 16bit String to 32bit MB
1091 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1092 {
1093 size_t len=0;
1094
1095 while (*psz && (!buf || len < n))
1096 {
1097 wxUint32 cc;
1098
1099 // cast is ok for WC_UTF16
1100 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1101 if (pa == (size_t)-1)
1102 return pa;
1103
1104 if (buf)
1105 {
1106 *(wxUint32*)buf = cc;
1107 buf += sizeof(wxUint32);
1108 }
1109 len += sizeof(wxUint32);
1110 psz += pa;
1111 }
1112
1113 if (buf && len<=n-sizeof(wxUint32))
1114 *(wxUint32*)buf=0;
1115
1116 return len;
1117 }
1118
1119
1120
1121 // swap 32bit MB to 16bit String
1122 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1123 {
1124 size_t len=0;
1125
1126 while (*(wxUint32*)psz && (!buf || len < n))
1127 {
1128 char tmp[4];
1129 tmp[0] = psz[3]; tmp[1] = psz[2];
1130 tmp[2] = psz[1]; tmp[3] = psz[0];
1131
1132
1133 wxUint16 cc[2];
1134
1135 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1136 if (pa == (size_t)-1)
1137 return pa;
1138
1139 if (buf)
1140 {
1141 *buf++ = cc[0];
1142 if (pa > 1)
1143 *buf++ = cc[1];
1144 }
1145 len += pa;
1146 psz += sizeof(wxUint32);
1147 }
1148
1149 if (buf && len<n)
1150 *buf=0;
1151
1152 return len;
1153 }
1154
1155
1156 // swap 16bit String to 32bit MB
1157 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1158 {
1159 size_t len=0;
1160
1161 while (*psz && (!buf || len < n))
1162 {
1163 char cc[4];
1164
1165 // cast is ok for WC_UTF16
1166 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1167 if (pa == (size_t)-1)
1168 return pa;
1169
1170 if (buf)
1171 {
1172 *buf++ = cc[3];
1173 *buf++ = cc[2];
1174 *buf++ = cc[1];
1175 *buf++ = cc[0];
1176 }
1177 len += sizeof(wxUint32);
1178 psz += pa;
1179 }
1180
1181 if (buf && len<=n-sizeof(wxUint32))
1182 *(wxUint32*)buf=0;
1183
1184 return len;
1185 }
1186
1187 #else // WC_UTF16
1188
1189
1190 // copy 32bit MB to 32bit String
1191 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1192 {
1193 size_t len=0;
1194
1195 while (*(wxUint32*)psz && (!buf || len < n))
1196 {
1197 if (buf)
1198 *buf++ = *(wxUint32*)psz;
1199 len++;
1200 psz += sizeof(wxUint32);
1201 }
1202
1203 if (buf && len<n)
1204 *buf=0;
1205
1206 return len;
1207 }
1208
1209
1210 // copy 32bit String to 32bit MB
1211 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1212 {
1213 size_t len=0;
1214
1215 while (*psz && (!buf || len < n))
1216 {
1217 if (buf)
1218 {
1219 *(wxUint32*)buf = *psz;
1220 buf += sizeof(wxUint32);
1221 }
1222
1223 len += sizeof(wxUint32);
1224 psz++;
1225 }
1226
1227 if (buf && len<=n-sizeof(wxUint32))
1228 *(wxUint32*)buf=0;
1229
1230 return len;
1231 }
1232
1233
1234 // swap 32bit MB to 32bit String
1235 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1236 {
1237 size_t len=0;
1238
1239 while (*(wxUint32*)psz && (!buf || len < n))
1240 {
1241 if (buf)
1242 {
1243 ((char *)buf)[0] = psz[3];
1244 ((char *)buf)[1] = psz[2];
1245 ((char *)buf)[2] = psz[1];
1246 ((char *)buf)[3] = psz[0];
1247 buf++;
1248 }
1249 len++;
1250 psz += sizeof(wxUint32);
1251 }
1252
1253 if (buf && len<n)
1254 *buf=0;
1255
1256 return len;
1257 }
1258
1259
1260 // swap 32bit String to 32bit MB
1261 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1262 {
1263 size_t len=0;
1264
1265 while (*psz && (!buf || len < n))
1266 {
1267 if (buf)
1268 {
1269 *buf++ = ((char *)psz)[3];
1270 *buf++ = ((char *)psz)[2];
1271 *buf++ = ((char *)psz)[1];
1272 *buf++ = ((char *)psz)[0];
1273 }
1274 len += sizeof(wxUint32);
1275 psz++;
1276 }
1277
1278 if (buf && len<=n-sizeof(wxUint32))
1279 *(wxUint32*)buf=0;
1280
1281 return len;
1282 }
1283
1284
1285 #endif // WC_UTF16
1286
1287
1288 // ============================================================================
1289 // The classes doing conversion using the iconv_xxx() functions
1290 // ============================================================================
1291
1292 #ifdef HAVE_ICONV
1293
1294 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1295 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1296 // (unless there's yet another bug in glibc) the only case when iconv()
1297 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1298 // left in the input buffer -- when _real_ error occurs,
1299 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1300 // iconv() failure.
1301 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tells whether an iconv() call failed, given
// its return value and the number of bytes left afterwards: with glibc up
// to 2.1 a failure with E2BIG and nothing left is not counted as an error,
// everywhere else any (size_t)-1 result is.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
#endif

// cast a pointer to "ICONV_CONST char **"; ICONV_CONST is not defined in
// this file
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1310
1311 // ----------------------------------------------------------------------------
1312 // wxMBConv_iconv: encapsulates an iconv character set
1313 // ----------------------------------------------------------------------------
1314
1315 class wxMBConv_iconv : public wxMBConv
1316 {
1317 public:
1318 wxMBConv_iconv(const wxChar *name);
1319 virtual ~wxMBConv_iconv();
1320
1321 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1322 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1323
1324 bool IsOk() const
1325 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1326
1327 protected:
1328 // the iconv handlers used to translate from multibyte to wide char and in
1329 // the other direction
1330 iconv_t m2w,
1331 w2m;
1332 #if wxUSE_THREADS
1333 // guards access to m2w and w2m objects
1334 wxMutex m_iconvMutex;
1335 #endif
1336
1337 private:
1338 // the name (for iconv_open()) of a wide char charset -- if none is
1339 // available on this machine, it will remain NULL
1340 static const char *ms_wcCharsetName;
1341
1342 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1343 // different endian-ness than the native one
1344 static bool ms_wcNeedsSwap;
1345 };
1346
1347 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1348 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1349
1350 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1351 {
1352 // Do it the hard way
1353 char cname[100];
1354 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1355 cname[i] = (char) name[i];
1356
1357 // check for charset that represents wchar_t:
1358 if (ms_wcCharsetName == NULL)
1359 {
1360 ms_wcNeedsSwap = false;
1361
1362 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1363 ms_wcCharsetName = WC_NAME_BEST;
1364 m2w = iconv_open(ms_wcCharsetName, cname);
1365
1366 if (m2w == (iconv_t)-1)
1367 {
1368 // try charset w/o bytesex info (e.g. "UCS4")
1369 // and check for bytesex ourselves:
1370 ms_wcCharsetName = WC_NAME;
1371 m2w = iconv_open(ms_wcCharsetName, cname);
1372
1373 // last bet, try if it knows WCHAR_T pseudo-charset
1374 if (m2w == (iconv_t)-1)
1375 {
1376 ms_wcCharsetName = "WCHAR_T";
1377 m2w = iconv_open(ms_wcCharsetName, cname);
1378 }
1379
1380 if (m2w != (iconv_t)-1)
1381 {
1382 char buf[2], *bufPtr;
1383 wchar_t wbuf[2], *wbufPtr;
1384 size_t insz, outsz;
1385 size_t res;
1386
1387 buf[0] = 'A';
1388 buf[1] = 0;
1389 wbuf[0] = 0;
1390 insz = 2;
1391 outsz = SIZEOF_WCHAR_T * 2;
1392 wbufPtr = wbuf;
1393 bufPtr = buf;
1394
1395 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1396 (char**)&wbufPtr, &outsz);
1397
1398 if (ICONV_FAILED(res, insz))
1399 {
1400 ms_wcCharsetName = NULL;
1401 wxLogLastError(wxT("iconv"));
1402 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1403 }
1404 else
1405 {
1406 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1407 }
1408 }
1409 else
1410 {
1411 ms_wcCharsetName = NULL;
1412
1413 // VS: we must not output an error here, since wxWidgets will safely
1414 // fall back to using wxEncodingConverter.
1415 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1416 //wxLogError(
1417 }
1418 }
1419 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1420 }
1421 else // we already have ms_wcCharsetName
1422 {
1423 m2w = iconv_open(ms_wcCharsetName, cname);
1424 }
1425
1426 // NB: don't ever pass NULL to iconv_open(), it may crash!
1427 if ( ms_wcCharsetName )
1428 {
1429 w2m = iconv_open( cname, ms_wcCharsetName);
1430 }
1431 else
1432 {
1433 w2m = (iconv_t)-1;
1434 }
1435 }
1436
1437 wxMBConv_iconv::~wxMBConv_iconv()
1438 {
1439 if ( m2w != (iconv_t)-1 )
1440 iconv_close(m2w);
1441 if ( w2m != (iconv_t)-1 )
1442 iconv_close(w2m);
1443 }
1444
1445 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1446 {
1447 #if wxUSE_THREADS
1448 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1449 // Unfortunately there is a couple of global wxCSConv objects such as
1450 // wxConvLocal that are used all over wx code, so we have to make sure
1451 // the handle is used by at most one thread at the time. Otherwise
1452 // only a few wx classes would be safe to use from non-main threads
1453 // as MB<->WC conversion would fail "randomly".
1454 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1455 #endif
1456
1457 size_t inbuf = strlen(psz);
1458 size_t outbuf = n * SIZEOF_WCHAR_T;
1459 size_t res, cres;
1460 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1461 wchar_t *bufPtr = buf;
1462 const char *pszPtr = psz;
1463
1464 if (buf)
1465 {
1466 // have destination buffer, convert there
1467 cres = iconv(m2w,
1468 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1469 (char**)&bufPtr, &outbuf);
1470 res = n - (outbuf / SIZEOF_WCHAR_T);
1471
1472 if (ms_wcNeedsSwap)
1473 {
1474 // convert to native endianness
1475 WC_BSWAP(buf /* _not_ bufPtr */, res)
1476 }
1477
1478 // NB: iconv was given only strlen(psz) characters on input, and so
1479 // it couldn't convert the trailing zero. Let's do it ourselves
1480 // if there's some room left for it in the output buffer.
1481 if (res < n)
1482 buf[res] = 0;
1483 }
1484 else
1485 {
1486 // no destination buffer... convert using temp buffer
1487 // to calculate destination buffer requirement
1488 wchar_t tbuf[8];
1489 res = 0;
1490 do {
1491 bufPtr = tbuf;
1492 outbuf = 8*SIZEOF_WCHAR_T;
1493
1494 cres = iconv(m2w,
1495 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1496 (char**)&bufPtr, &outbuf );
1497
1498 res += 8-(outbuf/SIZEOF_WCHAR_T);
1499 } while ((cres==(size_t)-1) && (errno==E2BIG));
1500 }
1501
1502 if (ICONV_FAILED(cres, inbuf))
1503 {
1504 //VS: it is ok if iconv fails, hence trace only
1505 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1506 return (size_t)-1;
1507 }
1508
1509 return res;
1510 }
1511
1512 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1513 {
1514 #if wxUSE_THREADS
1515 // NB: explained in MB2WC
1516 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1517 #endif
1518
1519 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1520 size_t outbuf = n;
1521 size_t res, cres;
1522
1523 wchar_t *tmpbuf = 0;
1524
1525 if (ms_wcNeedsSwap)
1526 {
1527 // need to copy to temp buffer to switch endianness
1528 // this absolutely doesn't rock!
1529 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1530 // could be in read-only memory, or be accessed in some other thread)
1531 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1532 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1533 WC_BSWAP(tmpbuf, inbuf)
1534 psz=tmpbuf;
1535 }
1536
1537 if (buf)
1538 {
1539 // have destination buffer, convert there
1540 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1541
1542 res = n-outbuf;
1543
1544 // NB: iconv was given only wcslen(psz) characters on input, and so
1545 // it couldn't convert the trailing zero. Let's do it ourselves
1546 // if there's some room left for it in the output buffer.
1547 if (res < n)
1548 buf[0] = 0;
1549 }
1550 else
1551 {
1552 // no destination buffer... convert using temp buffer
1553 // to calculate destination buffer requirement
1554 char tbuf[16];
1555 res = 0;
1556 do {
1557 buf = tbuf; outbuf = 16;
1558
1559 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1560
1561 res += 16 - outbuf;
1562 } while ((cres==(size_t)-1) && (errno==E2BIG));
1563 }
1564
1565 if (ms_wcNeedsSwap)
1566 {
1567 free(tmpbuf);
1568 }
1569
1570 if (ICONV_FAILED(cres, inbuf))
1571 {
1572 //VS: it is ok if iconv fails, hence trace only
1573 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1574 return (size_t)-1;
1575 }
1576
1577 return res;
1578 }
1579
1580 #endif // HAVE_ICONV
1581
1582
1583 // ============================================================================
1584 // Win32 conversion classes
1585 // ============================================================================
1586
1587 #ifdef wxHAVE_WIN32_MB2WC
1588
1589 // from utils.cpp
1590 #if wxUSE_FONTMAP
1591 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1592 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1593 #endif
1594
1595 class wxMBConv_win32 : public wxMBConv
1596 {
1597 public:
1598 wxMBConv_win32()
1599 {
1600 m_CodePage = CP_ACP;
1601 }
1602
1603 #if wxUSE_FONTMAP
1604 wxMBConv_win32(const wxChar* name)
1605 {
1606 m_CodePage = wxCharsetToCodepage(name);
1607 }
1608
1609 wxMBConv_win32(wxFontEncoding encoding)
1610 {
1611 m_CodePage = wxEncodingToCodepage(encoding);
1612 }
1613 #endif
1614
1615 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1616 {
1617 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1618 // the behaviour is not compatible with the Unix version (using iconv)
1619 // and break the library itself, e.g. wxTextInputStream::NextChar()
1620 // wouldn't work if reading an incomplete MB char didn't result in an
1621 // error
1622 //
1623 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1624 // an error (tested under Windows Server 2003) and apparently it is
1625 // done on purpose, i.e. the function accepts any input in this case
1626 // and although I'd prefer to return error on ill-formed output, our
1627 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1628 // explicitly ill-formed according to RFC 2152) neither so we don't
1629 // even have any fallback here...
1630 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1631
1632 const size_t len = ::MultiByteToWideChar
1633 (
1634 m_CodePage, // code page
1635 flags, // flags: fall on error
1636 psz, // input string
1637 -1, // its length (NUL-terminated)
1638 buf, // output string
1639 buf ? n : 0 // size of output buffer
1640 );
1641
1642 // note that it returns count of written chars for buf != NULL and size
1643 // of the needed buffer for buf == NULL so in either case the length of
1644 // the string (which never includes the terminating NUL) is one less
1645 return len ? len - 1 : (size_t)-1;
1646 }
1647
1648 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1649 {
1650 /*
1651 we have a problem here: by default, WideCharToMultiByte() may
1652 replace characters unrepresentable in the target code page with bad
1653 quality approximations such as turning "1/2" symbol (U+00BD) into
1654 "1" for the code pages which don't have it and we, obviously, want
1655 to avoid this at any price
1656
1657 the trouble is that this function does it _silently_, i.e. it won't
1658 even tell us whether it did or not... Win98/2000 and higher provide
1659 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1660 we have to resort to a round trip, i.e. check that converting back
1661 results in the same string -- this is, of course, expensive but
1662 otherwise we simply can't be sure to not garble the data.
1663 */
1664
1665 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1666 // it doesn't work with CJK encodings (which we test for rather roughly
1667 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1668 // supporting it
1669 BOOL usedDef wxDUMMY_INITIALIZE(false);
1670 BOOL *pUsedDef;
1671 int flags;
1672 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1673 {
1674 // it's our lucky day
1675 flags = WC_NO_BEST_FIT_CHARS;
1676 pUsedDef = &usedDef;
1677 }
1678 else // old system or unsupported encoding
1679 {
1680 flags = 0;
1681 pUsedDef = NULL;
1682 }
1683
1684 const size_t len = ::WideCharToMultiByte
1685 (
1686 m_CodePage, // code page
1687 flags, // either none or no best fit
1688 pwz, // input string
1689 -1, // it is (wide) NUL-terminated
1690 buf, // output buffer
1691 buf ? n : 0, // and its size
1692 NULL, // default "replacement" char
1693 pUsedDef // [out] was it used?
1694 );
1695
1696 if ( !len )
1697 {
1698 // function totally failed
1699 return (size_t)-1;
1700 }
1701
1702 // if we were really converting, check if we succeeded
1703 if ( buf )
1704 {
1705 if ( flags )
1706 {
1707 // check if the conversion failed, i.e. if any replacements
1708 // were done
1709 if ( usedDef )
1710 return (size_t)-1;
1711 }
1712 else // we must resort to double tripping...
1713 {
1714 wxWCharBuffer wcBuf(n);
1715 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1716 wcscmp(wcBuf, pwz) != 0 )
1717 {
1718 // we didn't obtain the same thing we started from, hence
1719 // the conversion was lossy and we consider that it failed
1720 return (size_t)-1;
1721 }
1722 }
1723 }
1724
1725 // see the comment above for the reason of "len - 1"
1726 return len - 1;
1727 }
1728
1729 bool IsOk() const { return m_CodePage != -1; }
1730
1731 private:
1732 static bool CanUseNoBestFit()
1733 {
1734 static int s_isWin98Or2k = -1;
1735
1736 if ( s_isWin98Or2k == -1 )
1737 {
1738 int verMaj, verMin;
1739 switch ( wxGetOsVersion(&verMaj, &verMin) )
1740 {
1741 case wxWIN95:
1742 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1743 break;
1744
1745 case wxWINDOWS_NT:
1746 s_isWin98Or2k = verMaj >= 5;
1747 break;
1748
1749 default:
1750 // unknown, be conseravtive by default
1751 s_isWin98Or2k = 0;
1752 }
1753
1754 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1755 }
1756
1757 return s_isWin98Or2k == 1;
1758 }
1759
1760 long m_CodePage;
1761 };
1762
1763 #endif // wxHAVE_WIN32_MB2WC
1764
1765 // ============================================================================
1766 // Cocoa conversion classes
1767 // ============================================================================
1768
1769 #if defined(__WXCOCOA__)
1770
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1774
1775 #include <CoreFoundation/CFString.h>
1776 #include <CoreFoundation/CFStringEncodingExt.h>
1777
1778 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1779 {
1780 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1781 if ( encoding == wxFONTENCODING_DEFAULT )
1782 {
1783 enc = CFStringGetSystemEncoding();
1784 }
1785 else switch( encoding)
1786 {
1787 case wxFONTENCODING_ISO8859_1 :
1788 enc = kCFStringEncodingISOLatin1 ;
1789 break ;
1790 case wxFONTENCODING_ISO8859_2 :
1791 enc = kCFStringEncodingISOLatin2;
1792 break ;
1793 case wxFONTENCODING_ISO8859_3 :
1794 enc = kCFStringEncodingISOLatin3 ;
1795 break ;
1796 case wxFONTENCODING_ISO8859_4 :
1797 enc = kCFStringEncodingISOLatin4;
1798 break ;
1799 case wxFONTENCODING_ISO8859_5 :
1800 enc = kCFStringEncodingISOLatinCyrillic;
1801 break ;
1802 case wxFONTENCODING_ISO8859_6 :
1803 enc = kCFStringEncodingISOLatinArabic;
1804 break ;
1805 case wxFONTENCODING_ISO8859_7 :
1806 enc = kCFStringEncodingISOLatinGreek;
1807 break ;
1808 case wxFONTENCODING_ISO8859_8 :
1809 enc = kCFStringEncodingISOLatinHebrew;
1810 break ;
1811 case wxFONTENCODING_ISO8859_9 :
1812 enc = kCFStringEncodingISOLatin5;
1813 break ;
1814 case wxFONTENCODING_ISO8859_10 :
1815 enc = kCFStringEncodingISOLatin6;
1816 break ;
1817 case wxFONTENCODING_ISO8859_11 :
1818 enc = kCFStringEncodingISOLatinThai;
1819 break ;
1820 case wxFONTENCODING_ISO8859_13 :
1821 enc = kCFStringEncodingISOLatin7;
1822 break ;
1823 case wxFONTENCODING_ISO8859_14 :
1824 enc = kCFStringEncodingISOLatin8;
1825 break ;
1826 case wxFONTENCODING_ISO8859_15 :
1827 enc = kCFStringEncodingISOLatin9;
1828 break ;
1829
1830 case wxFONTENCODING_KOI8 :
1831 enc = kCFStringEncodingKOI8_R;
1832 break ;
1833 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1834 enc = kCFStringEncodingDOSRussian;
1835 break ;
1836
1837 // case wxFONTENCODING_BULGARIAN :
1838 // enc = ;
1839 // break ;
1840
1841 case wxFONTENCODING_CP437 :
1842 enc =kCFStringEncodingDOSLatinUS ;
1843 break ;
1844 case wxFONTENCODING_CP850 :
1845 enc = kCFStringEncodingDOSLatin1;
1846 break ;
1847 case wxFONTENCODING_CP852 :
1848 enc = kCFStringEncodingDOSLatin2;
1849 break ;
1850 case wxFONTENCODING_CP855 :
1851 enc = kCFStringEncodingDOSCyrillic;
1852 break ;
1853 case wxFONTENCODING_CP866 :
1854 enc =kCFStringEncodingDOSRussian ;
1855 break ;
1856 case wxFONTENCODING_CP874 :
1857 enc = kCFStringEncodingDOSThai;
1858 break ;
1859 case wxFONTENCODING_CP932 :
1860 enc = kCFStringEncodingDOSJapanese;
1861 break ;
1862 case wxFONTENCODING_CP936 :
1863 enc =kCFStringEncodingDOSChineseSimplif ;
1864 break ;
1865 case wxFONTENCODING_CP949 :
1866 enc = kCFStringEncodingDOSKorean;
1867 break ;
1868 case wxFONTENCODING_CP950 :
1869 enc = kCFStringEncodingDOSChineseTrad;
1870 break ;
1871 case wxFONTENCODING_CP1250 :
1872 enc = kCFStringEncodingWindowsLatin2;
1873 break ;
1874 case wxFONTENCODING_CP1251 :
1875 enc =kCFStringEncodingWindowsCyrillic ;
1876 break ;
1877 case wxFONTENCODING_CP1252 :
1878 enc =kCFStringEncodingWindowsLatin1 ;
1879 break ;
1880 case wxFONTENCODING_CP1253 :
1881 enc = kCFStringEncodingWindowsGreek;
1882 break ;
1883 case wxFONTENCODING_CP1254 :
1884 enc = kCFStringEncodingWindowsLatin5;
1885 break ;
1886 case wxFONTENCODING_CP1255 :
1887 enc =kCFStringEncodingWindowsHebrew ;
1888 break ;
1889 case wxFONTENCODING_CP1256 :
1890 enc =kCFStringEncodingWindowsArabic ;
1891 break ;
1892 case wxFONTENCODING_CP1257 :
1893 enc = kCFStringEncodingWindowsBalticRim;
1894 break ;
1895 // This only really encodes to UTF7 (if that) evidently
1896 // case wxFONTENCODING_UTF7 :
1897 // enc = kCFStringEncodingNonLossyASCII ;
1898 // break ;
1899 case wxFONTENCODING_UTF8 :
1900 enc = kCFStringEncodingUTF8 ;
1901 break ;
1902 case wxFONTENCODING_EUC_JP :
1903 enc = kCFStringEncodingEUC_JP;
1904 break ;
1905 case wxFONTENCODING_UTF16 :
1906 enc = kCFStringEncodingUnicode ;
1907 break ;
1908 case wxFONTENCODING_MACROMAN :
1909 enc = kCFStringEncodingMacRoman ;
1910 break ;
1911 case wxFONTENCODING_MACJAPANESE :
1912 enc = kCFStringEncodingMacJapanese ;
1913 break ;
1914 case wxFONTENCODING_MACCHINESETRAD :
1915 enc = kCFStringEncodingMacChineseTrad ;
1916 break ;
1917 case wxFONTENCODING_MACKOREAN :
1918 enc = kCFStringEncodingMacKorean ;
1919 break ;
1920 case wxFONTENCODING_MACARABIC :
1921 enc = kCFStringEncodingMacArabic ;
1922 break ;
1923 case wxFONTENCODING_MACHEBREW :
1924 enc = kCFStringEncodingMacHebrew ;
1925 break ;
1926 case wxFONTENCODING_MACGREEK :
1927 enc = kCFStringEncodingMacGreek ;
1928 break ;
1929 case wxFONTENCODING_MACCYRILLIC :
1930 enc = kCFStringEncodingMacCyrillic ;
1931 break ;
1932 case wxFONTENCODING_MACDEVANAGARI :
1933 enc = kCFStringEncodingMacDevanagari ;
1934 break ;
1935 case wxFONTENCODING_MACGURMUKHI :
1936 enc = kCFStringEncodingMacGurmukhi ;
1937 break ;
1938 case wxFONTENCODING_MACGUJARATI :
1939 enc = kCFStringEncodingMacGujarati ;
1940 break ;
1941 case wxFONTENCODING_MACORIYA :
1942 enc = kCFStringEncodingMacOriya ;
1943 break ;
1944 case wxFONTENCODING_MACBENGALI :
1945 enc = kCFStringEncodingMacBengali ;
1946 break ;
1947 case wxFONTENCODING_MACTAMIL :
1948 enc = kCFStringEncodingMacTamil ;
1949 break ;
1950 case wxFONTENCODING_MACTELUGU :
1951 enc = kCFStringEncodingMacTelugu ;
1952 break ;
1953 case wxFONTENCODING_MACKANNADA :
1954 enc = kCFStringEncodingMacKannada ;
1955 break ;
1956 case wxFONTENCODING_MACMALAJALAM :
1957 enc = kCFStringEncodingMacMalayalam ;
1958 break ;
1959 case wxFONTENCODING_MACSINHALESE :
1960 enc = kCFStringEncodingMacSinhalese ;
1961 break ;
1962 case wxFONTENCODING_MACBURMESE :
1963 enc = kCFStringEncodingMacBurmese ;
1964 break ;
1965 case wxFONTENCODING_MACKHMER :
1966 enc = kCFStringEncodingMacKhmer ;
1967 break ;
1968 case wxFONTENCODING_MACTHAI :
1969 enc = kCFStringEncodingMacThai ;
1970 break ;
1971 case wxFONTENCODING_MACLAOTIAN :
1972 enc = kCFStringEncodingMacLaotian ;
1973 break ;
1974 case wxFONTENCODING_MACGEORGIAN :
1975 enc = kCFStringEncodingMacGeorgian ;
1976 break ;
1977 case wxFONTENCODING_MACARMENIAN :
1978 enc = kCFStringEncodingMacArmenian ;
1979 break ;
1980 case wxFONTENCODING_MACCHINESESIMP :
1981 enc = kCFStringEncodingMacChineseSimp ;
1982 break ;
1983 case wxFONTENCODING_MACTIBETAN :
1984 enc = kCFStringEncodingMacTibetan ;
1985 break ;
1986 case wxFONTENCODING_MACMONGOLIAN :
1987 enc = kCFStringEncodingMacMongolian ;
1988 break ;
1989 case wxFONTENCODING_MACETHIOPIC :
1990 enc = kCFStringEncodingMacEthiopic ;
1991 break ;
1992 case wxFONTENCODING_MACCENTRALEUR :
1993 enc = kCFStringEncodingMacCentralEurRoman ;
1994 break ;
1995 case wxFONTENCODING_MACVIATNAMESE :
1996 enc = kCFStringEncodingMacVietnamese ;
1997 break ;
1998 case wxFONTENCODING_MACARABICEXT :
1999 enc = kCFStringEncodingMacExtArabic ;
2000 break ;
2001 case wxFONTENCODING_MACSYMBOL :
2002 enc = kCFStringEncodingMacSymbol ;
2003 break ;
2004 case wxFONTENCODING_MACDINGBATS :
2005 enc = kCFStringEncodingMacDingbats ;
2006 break ;
2007 case wxFONTENCODING_MACTURKISH :
2008 enc = kCFStringEncodingMacTurkish ;
2009 break ;
2010 case wxFONTENCODING_MACCROATIAN :
2011 enc = kCFStringEncodingMacCroatian ;
2012 break ;
2013 case wxFONTENCODING_MACICELANDIC :
2014 enc = kCFStringEncodingMacIcelandic ;
2015 break ;
2016 case wxFONTENCODING_MACROMANIAN :
2017 enc = kCFStringEncodingMacRomanian ;
2018 break ;
2019 case wxFONTENCODING_MACCELTIC :
2020 enc = kCFStringEncodingMacCeltic ;
2021 break ;
2022 case wxFONTENCODING_MACGAELIC :
2023 enc = kCFStringEncodingMacGaelic ;
2024 break ;
2025 // case wxFONTENCODING_MACKEYBOARD :
2026 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2027 // break ;
2028 default :
2029 // because gcc is picky
2030 break ;
2031 } ;
2032 return enc ;
2033 }
2034
2035 class wxMBConv_cocoa : public wxMBConv
2036 {
2037 public:
2038 wxMBConv_cocoa()
2039 {
2040 Init(CFStringGetSystemEncoding()) ;
2041 }
2042
2043 #if wxUSE_FONTMAP
2044 wxMBConv_cocoa(const wxChar* name)
2045 {
2046 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2047 }
2048 #endif
2049
2050 wxMBConv_cocoa(wxFontEncoding encoding)
2051 {
2052 Init( wxCFStringEncFromFontEnc(encoding) );
2053 }
2054
2055 ~wxMBConv_cocoa()
2056 {
2057 }
2058
2059 void Init( CFStringEncoding encoding)
2060 {
2061 m_encoding = encoding ;
2062 }
2063
2064 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2065 {
2066 wxASSERT(szUnConv);
2067
2068 CFStringRef theString = CFStringCreateWithBytes (
2069 NULL, //the allocator
2070 (const UInt8*)szUnConv,
2071 strlen(szUnConv),
2072 m_encoding,
2073 false //no BOM/external representation
2074 );
2075
2076 wxASSERT(theString);
2077
2078 size_t nOutLength = CFStringGetLength(theString);
2079
2080 if (szOut == NULL)
2081 {
2082 CFRelease(theString);
2083 return nOutLength;
2084 }
2085
2086 CFRange theRange = { 0, nOutSize };
2087
2088 #if SIZEOF_WCHAR_T == 4
2089 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2090 #endif
2091
2092 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2093
2094 CFRelease(theString);
2095
2096 szUniCharBuffer[nOutLength] = '\0' ;
2097
2098 #if SIZEOF_WCHAR_T == 4
2099 wxMBConvUTF16 converter ;
2100 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2101 delete[] szUniCharBuffer;
2102 #endif
2103
2104 return nOutLength;
2105 }
2106
2107 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2108 {
2109 wxASSERT(szUnConv);
2110
2111 size_t nRealOutSize;
2112 size_t nBufSize = wxWcslen(szUnConv);
2113 UniChar* szUniBuffer = (UniChar*) szUnConv;
2114
2115 #if SIZEOF_WCHAR_T == 4
2116 wxMBConvUTF16BE converter ;
2117 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2118 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2119 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2120 nBufSize /= sizeof(UniChar);
2121 #endif
2122
2123 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2124 NULL, //allocator
2125 szUniBuffer,
2126 nBufSize,
2127 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2128 );
2129
2130 wxASSERT(theString);
2131
2132 //Note that CER puts a BOM when converting to unicode
2133 //so we check and use getchars instead in that case
2134 if (m_encoding == kCFStringEncodingUnicode)
2135 {
2136 if (szOut != NULL)
2137 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2138
2139 nRealOutSize = CFStringGetLength(theString) + 1;
2140 }
2141 else
2142 {
2143 CFStringGetBytes(
2144 theString,
2145 CFRangeMake(0, CFStringGetLength(theString)),
2146 m_encoding,
2147 0, //what to put in characters that can't be converted -
2148 //0 tells CFString to return NULL if it meets such a character
2149 false, //not an external representation
2150 (UInt8*) szOut,
2151 nOutSize,
2152 (CFIndex*) &nRealOutSize
2153 );
2154 }
2155
2156 CFRelease(theString);
2157
2158 #if SIZEOF_WCHAR_T == 4
2159 delete[] szUniBuffer;
2160 #endif
2161
2162 return nRealOutSize - 1;
2163 }
2164
2165 bool IsOk() const
2166 {
2167 return m_encoding != kCFStringEncodingInvalidId &&
2168 CFStringIsEncodingAvailable(m_encoding);
2169 }
2170
2171 private:
2172 CFStringEncoding m_encoding ;
2173 };
2174
2175 #endif // defined(__WXCOCOA__)
2176
2177 // ============================================================================
2178 // Mac conversion classes
2179 // ============================================================================
2180
2181 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2182
2183 class wxMBConv_mac : public wxMBConv
2184 {
2185 public:
2186 wxMBConv_mac()
2187 {
2188 Init(CFStringGetSystemEncoding()) ;
2189 }
2190
2191 #if wxUSE_FONTMAP
2192 wxMBConv_mac(const wxChar* name)
2193 {
2194 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2195 }
2196 #endif
2197
2198 wxMBConv_mac(wxFontEncoding encoding)
2199 {
2200 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2201 }
2202
2203 ~wxMBConv_mac()
2204 {
2205 OSStatus status = noErr ;
2206 status = TECDisposeConverter(m_MB2WC_converter);
2207 status = TECDisposeConverter(m_WC2MB_converter);
2208 }
2209
2210
2211 void Init( TextEncodingBase encoding)
2212 {
2213 OSStatus status = noErr ;
2214 m_char_encoding = encoding ;
2215 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2216
2217 status = TECCreateConverter(&m_MB2WC_converter,
2218 m_char_encoding,
2219 m_unicode_encoding);
2220 status = TECCreateConverter(&m_WC2MB_converter,
2221 m_unicode_encoding,
2222 m_char_encoding);
2223 }
2224
2225 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2226 {
2227 OSStatus status = noErr ;
2228 ByteCount byteOutLen ;
2229 ByteCount byteInLen = strlen(psz) ;
2230 wchar_t *tbuf = NULL ;
2231 UniChar* ubuf = NULL ;
2232 size_t res = 0 ;
2233
2234 if (buf == NULL)
2235 {
2236 //apple specs say at least 32
2237 n = wxMax( 32 , byteInLen ) ;
2238 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2239 }
2240 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2241 #if SIZEOF_WCHAR_T == 4
2242 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2243 #else
2244 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2245 #endif
2246 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2247 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2248 #if SIZEOF_WCHAR_T == 4
2249 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2250 // is not properly terminated we get random characters at the end
2251 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2252 wxMBConvUTF16BE converter ;
2253 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2254 free( ubuf ) ;
2255 #else
2256 res = byteOutLen / sizeof( UniChar ) ;
2257 #endif
2258 if ( buf == NULL )
2259 free(tbuf) ;
2260
2261 if ( buf && res < n)
2262 buf[res] = 0;
2263
2264 return res ;
2265 }
2266
2267 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2268 {
2269 OSStatus status = noErr ;
2270 ByteCount byteOutLen ;
2271 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2272
2273 char *tbuf = NULL ;
2274
2275 if (buf == NULL)
2276 {
2277 //apple specs say at least 32
2278 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2279 tbuf = (char*) malloc( n ) ;
2280 }
2281
2282 ByteCount byteBufferLen = n ;
2283 UniChar* ubuf = NULL ;
2284 #if SIZEOF_WCHAR_T == 4
2285 wxMBConvUTF16BE converter ;
2286 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2287 byteInLen = unicharlen ;
2288 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2289 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2290 #else
2291 ubuf = (UniChar*) psz ;
2292 #endif
2293 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2294 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2295 #if SIZEOF_WCHAR_T == 4
2296 free( ubuf ) ;
2297 #endif
2298 if ( buf == NULL )
2299 free(tbuf) ;
2300
2301 size_t res = byteOutLen ;
2302 if ( buf && res < n)
2303 {
2304 buf[res] = 0;
2305
2306 //we need to double-trip to verify it didn't insert any ? in place
2307 //of bogus characters
2308 wxWCharBuffer wcBuf(n);
2309 size_t pszlen = wxWcslen(psz);
2310 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2311 wxWcslen(wcBuf) != pszlen ||
2312 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2313 {
2314 // we didn't obtain the same thing we started from, hence
2315 // the conversion was lossy and we consider that it failed
2316 return (size_t)-1;
2317 }
2318 }
2319
2320 return res ;
2321 }
2322
2323 bool IsOk() const
2324 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2325
2326 private:
2327 TECObjectRef m_MB2WC_converter ;
2328 TECObjectRef m_WC2MB_converter ;
2329
2330 TextEncodingBase m_char_encoding ;
2331 TextEncodingBase m_unicode_encoding ;
2332 };
2333
2334 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2335
2336 // ============================================================================
2337 // wxEncodingConverter based conversion classes
2338 // ============================================================================
2339
2340 #if wxUSE_FONTMAP
2341
2342 class wxMBConv_wxwin : public wxMBConv
2343 {
2344 private:
2345 void Init()
2346 {
2347 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2348 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2349 }
2350
2351 public:
2352 // temporarily just use wxEncodingConverter stuff,
2353 // so that it works while a better implementation is built
2354 wxMBConv_wxwin(const wxChar* name)
2355 {
2356 if (name)
2357 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2358 else
2359 m_enc = wxFONTENCODING_SYSTEM;
2360
2361 Init();
2362 }
2363
2364 wxMBConv_wxwin(wxFontEncoding enc)
2365 {
2366 m_enc = enc;
2367
2368 Init();
2369 }
2370
2371 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2372 {
2373 size_t inbuf = strlen(psz);
2374 if (buf)
2375 {
2376 if (!m2w.Convert(psz,buf))
2377 return (size_t)-1;
2378 }
2379 return inbuf;
2380 }
2381
2382 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2383 {
2384 const size_t inbuf = wxWcslen(psz);
2385 if (buf)
2386 {
2387 if (!w2m.Convert(psz,buf))
2388 return (size_t)-1;
2389 }
2390
2391 return inbuf;
2392 }
2393
2394 bool IsOk() const { return m_ok; }
2395
2396 public:
2397 wxFontEncoding m_enc;
2398 wxEncodingConverter m2w, w2m;
2399
2400 // were we initialized successfully?
2401 bool m_ok;
2402
2403 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2404 };
2405
2406 #endif // wxUSE_FONTMAP
2407
2408 // ============================================================================
2409 // wxCSConv implementation
2410 // ============================================================================
2411
2412 void wxCSConv::Init()
2413 {
2414 m_name = NULL;
2415 m_convReal = NULL;
2416 m_deferred = true;
2417 }
2418
2419 wxCSConv::wxCSConv(const wxChar *charset)
2420 {
2421 Init();
2422
2423 if ( charset )
2424 {
2425 SetName(charset);
2426 }
2427
2428 m_encoding = wxFONTENCODING_SYSTEM;
2429 }
2430
2431 wxCSConv::wxCSConv(wxFontEncoding encoding)
2432 {
2433 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2434 {
2435 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2436
2437 encoding = wxFONTENCODING_SYSTEM;
2438 }
2439
2440 Init();
2441
2442 m_encoding = encoding;
2443 }
2444
2445 wxCSConv::~wxCSConv()
2446 {
2447 Clear();
2448 }
2449
2450 wxCSConv::wxCSConv(const wxCSConv& conv)
2451 : wxMBConv()
2452 {
2453 Init();
2454
2455 SetName(conv.m_name);
2456 m_encoding = conv.m_encoding;
2457 }
2458
2459 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2460 {
2461 Clear();
2462
2463 SetName(conv.m_name);
2464 m_encoding = conv.m_encoding;
2465
2466 return *this;
2467 }
2468
2469 void wxCSConv::Clear()
2470 {
2471 free(m_name);
2472 delete m_convReal;
2473
2474 m_name = NULL;
2475 m_convReal = NULL;
2476 }
2477
2478 void wxCSConv::SetName(const wxChar *charset)
2479 {
2480 if (charset)
2481 {
2482 m_name = wxStrdup(charset);
2483 m_deferred = true;
2484 }
2485 }
2486
2487 wxMBConv *wxCSConv::DoCreate() const
2488 {
2489 // check for the special case of ASCII or ISO8859-1 charset: as we have
2490 // special knowledge of it anyhow, we don't need to create a special
2491 // conversion object
2492 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2493 {
2494 // don't convert at all
2495 return NULL;
2496 }
2497
2498 // we trust OS to do conversion better than we can so try external
2499 // conversion methods first
2500 //
2501 // the full order is:
2502 // 1. OS conversion (iconv() under Unix or Win32 API)
2503 // 2. hard coded conversions for UTF
2504 // 3. wxEncodingConverter as fall back
2505
2506 // step (1)
2507 #ifdef HAVE_ICONV
2508 #if !wxUSE_FONTMAP
2509 if ( m_name )
2510 #endif // !wxUSE_FONTMAP
2511 {
2512 wxString name(m_name);
2513
2514 #if wxUSE_FONTMAP
2515 if ( name.empty() )
2516 name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2517 #endif // wxUSE_FONTMAP
2518
2519 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2520 if ( conv->IsOk() )
2521 return conv;
2522
2523 delete conv;
2524 }
2525 #endif // HAVE_ICONV
2526
2527 #ifdef wxHAVE_WIN32_MB2WC
2528 {
2529 #if wxUSE_FONTMAP
2530 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2531 : new wxMBConv_win32(m_encoding);
2532 if ( conv->IsOk() )
2533 return conv;
2534
2535 delete conv;
2536 #else
2537 return NULL;
2538 #endif
2539 }
2540 #endif // wxHAVE_WIN32_MB2WC
2541 #if defined(__WXMAC__)
2542 {
2543 // leave UTF16 and UTF32 to the built-ins of wx
2544 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2545 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2546 {
2547
2548 #if wxUSE_FONTMAP
2549 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2550 : new wxMBConv_mac(m_encoding);
2551 #else
2552 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2553 #endif
2554 if ( conv->IsOk() )
2555 return conv;
2556
2557 delete conv;
2558 }
2559 }
2560 #endif
2561 #if defined(__WXCOCOA__)
2562 {
2563 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2564 {
2565
2566 #if wxUSE_FONTMAP
2567 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2568 : new wxMBConv_cocoa(m_encoding);
2569 #else
2570 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2571 #endif
2572 if ( conv->IsOk() )
2573 return conv;
2574
2575 delete conv;
2576 }
2577 }
2578 #endif
2579 // step (2)
2580 wxFontEncoding enc = m_encoding;
2581 #if wxUSE_FONTMAP
2582 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2583 {
2584 // use "false" to suppress interactive dialogs -- we can be called from
2585 // anywhere and popping up a dialog from here is the last thing we want to
2586 // do
2587 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2588 }
2589 #endif // wxUSE_FONTMAP
2590
2591 switch ( enc )
2592 {
2593 case wxFONTENCODING_UTF7:
2594 return new wxMBConvUTF7;
2595
2596 case wxFONTENCODING_UTF8:
2597 return new wxMBConvUTF8;
2598
2599 case wxFONTENCODING_UTF16BE:
2600 return new wxMBConvUTF16BE;
2601
2602 case wxFONTENCODING_UTF16LE:
2603 return new wxMBConvUTF16LE;
2604
2605 case wxFONTENCODING_UTF32BE:
2606 return new wxMBConvUTF32BE;
2607
2608 case wxFONTENCODING_UTF32LE:
2609 return new wxMBConvUTF32LE;
2610
2611 default:
2612 // nothing to do but put here to suppress gcc warnings
2613 ;
2614 }
2615
2616 // step (3)
2617 #if wxUSE_FONTMAP
2618 {
2619 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2620 : new wxMBConv_wxwin(m_encoding);
2621 if ( conv->IsOk() )
2622 return conv;
2623
2624 delete conv;
2625 }
2626 #endif // wxUSE_FONTMAP
2627
2628 // NB: This is a hack to prevent deadlock. What could otherwise happen
2629 // in Unicode build: wxConvLocal creation ends up being here
2630 // because of some failure and logs the error. But wxLog will try to
2631 // attach timestamp, for which it will need wxConvLocal (to convert
2632 // time to char* and then wchar_t*), but that fails, tries to log
2633 // error, but wxLog has a (already locked) critical section that
2634 // guards static buffer.
2635 static bool alreadyLoggingError = false;
2636 if (!alreadyLoggingError)
2637 {
2638 alreadyLoggingError = true;
2639 wxLogError(_("Cannot convert from the charset '%s'!"),
2640 m_name ? m_name
2641 :
2642 #if wxUSE_FONTMAP
2643 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2644 #else // !wxUSE_FONTMAP
2645 wxString::Format(_("encoding %s"), m_encoding).c_str()
2646 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2647 );
2648 alreadyLoggingError = false;
2649 }
2650
2651 return NULL;
2652 }
2653
2654 void wxCSConv::CreateConvIfNeeded() const
2655 {
2656 if ( m_deferred )
2657 {
2658 wxCSConv *self = (wxCSConv *)this; // const_cast
2659
2660 #if wxUSE_INTL
2661 // if we don't have neither the name nor the encoding, use the default
2662 // encoding for this system
2663 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2664 {
2665 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2666 }
2667 #endif // wxUSE_INTL
2668
2669 self->m_convReal = DoCreate();
2670 self->m_deferred = false;
2671 }
2672 }
2673
2674 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2675 {
2676 CreateConvIfNeeded();
2677
2678 if (m_convReal)
2679 return m_convReal->MB2WC(buf, psz, n);
2680
2681 // latin-1 (direct)
2682 size_t len = strlen(psz);
2683
2684 if (buf)
2685 {
2686 for (size_t c = 0; c <= len; c++)
2687 buf[c] = (unsigned char)(psz[c]);
2688 }
2689
2690 return len;
2691 }
2692
2693 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2694 {
2695 CreateConvIfNeeded();
2696
2697 if (m_convReal)
2698 return m_convReal->WC2MB(buf, psz, n);
2699
2700 // latin-1 (direct)
2701 const size_t len = wxWcslen(psz);
2702 if (buf)
2703 {
2704 for (size_t c = 0; c <= len; c++)
2705 {
2706 if (psz[c] > 0xFF)
2707 return (size_t)-1;
2708 buf[c] = (char)psz[c];
2709 }
2710 }
2711 else
2712 {
2713 for (size_t c = 0; c <= len; c++)
2714 {
2715 if (psz[c] > 0xFF)
2716 return (size_t)-1;
2717 }
2718 }
2719
2720 return len;
2721 }
2722
2723 // ----------------------------------------------------------------------------
2724 // globals
2725 // ----------------------------------------------------------------------------
2726
2727 #ifdef __WINDOWS__
2728 static wxMBConv_win32 wxConvLibcObj;
2729 #elif defined(__WXMAC__) && !defined(__MACH__)
2730 static wxMBConv_mac wxConvLibcObj ;
2731 #else
2732 static wxMBConvLibc wxConvLibcObj;
2733 #endif
2734
2735 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2736 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2737 static wxMBConvUTF7 wxConvUTF7Obj;
2738 static wxMBConvUTF8 wxConvUTF8Obj;
2739 static wxConvBrokenFileNames wxConvBrokenFileNamesObj;
2740
2741 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2742 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2743 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2744 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2745 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2746 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2747 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2748 #ifdef __WXOSX__
2749 wxConvUTF8Obj;
2750 #elif __WXGTK20__
2751 wxConvBrokenFileNamesObj;
2752 #else
2753 wxConvLibcObj;
2754 #endif
2755
2756
2757 #else // !wxUSE_WCHAR_T
2758
2759 // stand-ins in absence of wchar_t
2760 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2761 wxConvISO8859_1,
2762 wxConvLocal,
2763 wxConvUTF8;
2764
2765 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2766
2767