]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Commit empty wxConvBrokenFileNames
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
62
63 // ----------------------------------------------------------------------------
64 // headers
65 // ----------------------------------------------------------------------------
66
67 #ifdef __SALFORDC__
68 #include <clib.h>
69 #endif
70
71 #ifdef HAVE_ICONV
72 #include <iconv.h>
73 #include "wx/thread.h"
74 #endif
75
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
78 #include "wx/utils.h"
79
80 #ifdef __WXMAC__
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
84
85 #include "wx/mac/private.h" // includes mac headers
86 #endif
87 // ----------------------------------------------------------------------------
88 // macros
89 // ----------------------------------------------------------------------------
90
// In-place byte swapping of the first len elements of str: BSWAP_UCS4 swaps
// 32 bit units, BSWAP_UTF16 swaps 16 bit units. Both expand to a braced block
// and expand their arguments more than once, so pass side-effect-free
// expressions only.
91 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
92 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
93
// Select, from SIZEOF_WCHAR_T, the charset names describing wchar_t
// (WC_NAME without and WC_NAME_BEST with explicit byte order, the latter
// chosen by WORDS_BIGENDIAN) and the matching swap macro (WC_BSWAP).
// WC_UTF16 is defined only when wchar_t is 2 bytes wide.
94 #if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
99 #else
100 #define WC_NAME_BEST "UCS-4LE"
101 #endif
102 #elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
105 #define WC_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
108 #else
109 #define WC_NAME_BEST "UTF-16LE"
110 #endif
111 #else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
114 #endif
115
116 // ============================================================================
117 // implementation
118 // ============================================================================
119
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
123
124
125 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
126 {
127 if (input<=0xffff)
128 {
129 if (output)
130 *output = (wxUint16) input;
131 return 1;
132 }
133 else if (input>=0x110000)
134 {
135 return (size_t)-1;
136 }
137 else
138 {
139 if (output)
140 {
141 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
142 *output = (wxUint16) ((input&0x3ff)+0xdc00);
143 }
144 return 2;
145 }
146 }
147
148 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
149 {
150 if ((*input<0xd800) || (*input>0xdfff))
151 {
152 output = *input;
153 return 1;
154 }
155 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
156 {
157 output = *input;
158 return (size_t)-1;
159 }
160 else
161 {
162 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
163 return 2;
164 }
165 }
166
167
168 // ----------------------------------------------------------------------------
169 // wxMBConv
170 // ----------------------------------------------------------------------------
171
// Out-of-line destructor; the body is intentionally empty.
172 wxMBConv::~wxMBConv()
173 {
174 // nothing to do here (necessary for Darwin linking probably)
175 }
176
177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
178 {
179 if ( psz )
180 {
181 // calculate the length of the buffer needed first
182 size_t nLen = MB2WC(NULL, psz, 0);
183 if ( nLen != (size_t)-1 )
184 {
185 // now do the actual conversion
186 wxWCharBuffer buf(nLen);
187 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
188 if ( nLen != (size_t)-1 )
189 {
190 return buf;
191 }
192 }
193 }
194
195 wxWCharBuffer buf((wchar_t *)NULL);
196
197 return buf;
198 }
199
200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
201 {
202 if ( pwz )
203 {
204 size_t nLen = WC2MB(NULL, pwz, 0);
205 if ( nLen != (size_t)-1 )
206 {
207 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
208 nLen = WC2MB(buf.data(), pwz, nLen + 4);
209 if ( nLen != (size_t)-1 )
210 {
211 return buf;
212 }
213 }
214 }
215
216 wxCharBuffer buf((char *)NULL);
217
218 return buf;
219 }
220
221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
222 {
223 wxASSERT(pOutSize != NULL);
224
225 const char* szEnd = szString + nStringLen + 1;
226 const char* szPos = szString;
227 const char* szStart = szPos;
228
229 size_t nActualLength = 0;
230 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
231
232 wxWCharBuffer theBuffer(nCurrentSize);
233
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos != szEnd)
237 {
238 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
239
240 //Get the length of the current (sub)string
241 size_t nLen = MB2WC(NULL, szPos, 0);
242
243 //Invalid conversion?
244 if( nLen == (size_t)-1 )
245 {
246 *pOutSize = 0;
247 theBuffer.data()[0u] = wxT('\0');
248 return theBuffer;
249 }
250
251
252 //Increase the actual length (+1 for current null character)
253 nActualLength += nLen + 1;
254
255 //if buffer too big, realloc the buffer
256 if (nActualLength > (nCurrentSize+1))
257 {
258 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
259 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
260 theBuffer = theNewBuffer;
261 nCurrentSize <<= 1;
262 }
263
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
266 {
267 *pOutSize = 0;
268 theBuffer.data()[0u] = wxT('\0');
269 return theBuffer;
270 }
271
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //here because XX2XX gives us the size of the output buffer,
275 //not neccessarly the length of the string
276 szPos += strlen(szPos) + 1;
277 }
278
279 //success - return actual length and the buffer
280 *pOutSize = nActualLength;
281 return theBuffer;
282 }
283
284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
285 {
286 wxASSERT(pOutSize != NULL);
287
288 const wchar_t* szEnd = szString + nStringLen + 1;
289 const wchar_t* szPos = szString;
290 const wchar_t* szStart = szPos;
291
292 size_t nActualLength = 0;
293 size_t nCurrentSize = nStringLen << 2; //try * 4 first
294
295 wxCharBuffer theBuffer(nCurrentSize);
296
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos != szEnd)
300 {
301 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
302
303 //Get the length of the current (sub)string
304 size_t nLen = WC2MB(NULL, szPos, 0);
305
306 //Invalid conversion?
307 if( nLen == (size_t)-1 )
308 {
309 *pOutSize = 0;
310 theBuffer.data()[0u] = wxT('\0');
311 return theBuffer;
312 }
313
314 //Increase the actual length (+1 for current null character)
315 nActualLength += nLen + 1;
316
317 //if buffer too big, realloc the buffer
318 if (nActualLength > (nCurrentSize+1))
319 {
320 wxCharBuffer theNewBuffer(nCurrentSize << 1);
321 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
322 theBuffer = theNewBuffer;
323 nCurrentSize <<= 1;
324 }
325
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
328 {
329 *pOutSize = 0;
330 theBuffer.data()[0u] = wxT('\0');
331 return theBuffer;
332 }
333
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //here because XX2XX gives us the size of the output buffer,
337 //not neccessarly the length of the string
338 szPos += wxWcslen(szPos) + 1;
339 }
340
341 //success - return actual length and the buffer
342 *pOutSize = nActualLength;
343 return theBuffer;
344 }
345
346 // ----------------------------------------------------------------------------
347 // wxMBConvLibc
348 // ----------------------------------------------------------------------------
349
350 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
351 {
352 return wxMB2WC(buf, psz, n);
353 }
354
355 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
356 {
357 return wxWC2MB(buf, psz, n);
358 }
359
360 // ----------------------------------------------------------------------------
361 // wxConvBrokenFileNames is made for GTK2 in Unicode mode when
362 // files are accidentally written in an encoding which is not
363 // the system encoding. Typically, the system encoding will be
364 // UTF8 but there might be files stored in ISO8859-1 in disk.
365 // ----------------------------------------------------------------------------
366
// Converter derived from wxMBConvLibc which overrides both conversion
// directions of the base class.
367 class wxConvBrokenFileNames: public wxMBConvLibc
368 {
369 public:
// multibyte -> wide character conversion
370 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
// wide character -> multibyte conversion
371 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
372 };
373
374 size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const
375 {
376 #if 0
377 if (we find some invalid characters)
378 {
379 Convert to Unicode range.
380 }
381 else
382 #endif
383 return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize );
384 }
385
386 size_t wxConvBrokenFileNames::WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const
387 {
388 #if 0
389 Convert back from Unicode range.
390 #endif
391 return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize );
392 }
393
394 // ----------------------------------------------------------------------------
395 // UTF-7
396 // ----------------------------------------------------------------------------
397
398 // Implementation (C) 2004 Fredrik Roubert
399
400 //
401 // BASE64 decoding table
402 //
// Indexed by byte value (256 entries, 8 per row): gives the 6 bit value of a
// BASE64 character ('A'-'Z' -> 0x00-0x19, 'a'-'z' -> 0x1a-0x33,
// '0'-'9' -> 0x34-0x3d, '+' -> 0x3e, '/' -> 0x3f) and 0xff for every byte
// which is not a BASE64 character.
403 static const unsigned char utf7unb64[] =
404 {
405 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
406 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
407 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
408 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
409 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
410 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
411 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
412 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
413 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
414 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
415 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
416 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
417 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
418 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
419 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
420 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
421 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
422 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
423 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
424 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
425 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
426 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
427 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
428 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
429 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
430 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
431 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
432 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
433 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
434 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
435 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
436 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
437 };
438
439 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
440 {
441 size_t len = 0;
442
443 while (*psz && ((!buf) || (len < n)))
444 {
445 unsigned char cc = *psz++;
446 if (cc != '+')
447 {
448 // plain ASCII char
449 if (buf)
450 *buf++ = cc;
451 len++;
452 }
453 else if (*psz == '-')
454 {
455 // encoded plus sign
456 if (buf)
457 *buf++ = cc;
458 len++;
459 psz++;
460 }
461 else
462 {
463 // BASE64 encoded string
464 bool lsb;
465 unsigned char c;
466 unsigned int d, l;
467 for (lsb = false, d = 0, l = 0;
468 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
469 {
470 d <<= 6;
471 d += cc;
472 for (l += 6; l >= 8; lsb = !lsb)
473 {
474 c = (unsigned char)((d >> (l -= 8)) % 256);
475 if (lsb)
476 {
477 if (buf)
478 *buf++ |= c;
479 len ++;
480 }
481 else
482 if (buf)
483 *buf = (wchar_t)(c << 8);
484 }
485 }
486 if (*psz == '-')
487 psz++;
488 }
489 }
490 if (buf && (len < n))
491 *buf = 0;
492 return len;
493 }
494
495 //
496 // BASE64 encoding table
497 //
// Indexed by a 6 bit value (0..63): gives the BASE64 character used to
// represent it.
498 static const unsigned char utf7enb64[] =
499 {
500 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
501 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
502 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
503 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
504 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
505 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
506 'w', 'x', 'y', 'z', '0', '1', '2', '3',
507 '4', '5', '6', '7', '8', '9', '+', '/'
508 };
509
510 //
511 // UTF-7 encoding table
512 //
513 // 0 - Set D (directly encoded characters)
514 // 1 - Set O (optional direct characters)
515 // 2 - whitespace characters (optional)
516 // 3 - special characters
517 //
// Indexed by 7 bit ASCII code, 16 entries per row. The encoder in this file
// writes only characters of class 0 directly, all the others go through
// BASE64.
518 static const unsigned char utf7encode[128] =
519 {
520 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
521 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
522 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
523 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
524 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
525 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
526 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
527 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
528 };
529
530 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
531 {
532
533
534 size_t len = 0;
535
536 while (*psz && ((!buf) || (len < n)))
537 {
538 wchar_t cc = *psz++;
539 if (cc < 0x80 && utf7encode[cc] < 1)
540 {
541 // plain ASCII char
542 if (buf)
543 *buf++ = (char)cc;
544 len++;
545 }
546 #ifndef WC_UTF16
547 else if (((wxUint32)cc) > 0xffff)
548 {
549 // no surrogate pair generation (yet?)
550 return (size_t)-1;
551 }
552 #endif
553 else
554 {
555 if (buf)
556 *buf++ = '+';
557 len++;
558 if (cc != '+')
559 {
560 // BASE64 encode string
561 unsigned int lsb, d, l;
562 for (d = 0, l = 0;; psz++)
563 {
564 for (lsb = 0; lsb < 2; lsb ++)
565 {
566 d <<= 8;
567 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
568
569 for (l += 8; l >= 6; )
570 {
571 l -= 6;
572 if (buf)
573 *buf++ = utf7enb64[(d >> l) % 64];
574 len++;
575 }
576 }
577 cc = *psz;
578 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
579 break;
580 }
581 if (l != 0)
582 {
583 if (buf)
584 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
585 len++;
586 }
587 }
588 if (buf)
589 *buf++ = '-';
590 len++;
591 }
592 }
593 if (buf && (len < n))
594 *buf = 0;
595 return len;
596 }
597
598 // ----------------------------------------------------------------------------
599 // UTF-8
600 // ----------------------------------------------------------------------------
601
// utf8_max[i] is the largest value encoded with i + 1 bytes by the UTF-8
// functions in this file; the last entry (0xffffffff) makes the search for
// the byte count of any 32 bit value terminate.
602 static wxUint32 utf8_max[]=
603 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
604
605 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
606 {
607 size_t len = 0;
608
609 while (*psz && ((!buf) || (len < n)))
610 {
611 unsigned char cc = *psz++, fc = cc;
612 unsigned cnt;
613 for (cnt = 0; fc & 0x80; cnt++)
614 fc <<= 1;
615 if (!cnt)
616 {
617 // plain ASCII char
618 if (buf)
619 *buf++ = cc;
620 len++;
621 }
622 else
623 {
624 cnt--;
625 if (!cnt)
626 {
627 // invalid UTF-8 sequence
628 return (size_t)-1;
629 }
630 else
631 {
632 unsigned ocnt = cnt - 1;
633 wxUint32 res = cc & (0x3f >> cnt);
634 while (cnt--)
635 {
636 cc = *psz++;
637 if ((cc & 0xC0) != 0x80)
638 {
639 // invalid UTF-8 sequence
640 return (size_t)-1;
641 }
642 res = (res << 6) | (cc & 0x3f);
643 }
644 if (res <= utf8_max[ocnt])
645 {
646 // illegal UTF-8 encoding
647 return (size_t)-1;
648 }
649 #ifdef WC_UTF16
650 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
651 size_t pa = encode_utf16(res, (wxUint16 *)buf);
652 if (pa == (size_t)-1)
653 return (size_t)-1;
654 if (buf)
655 buf += pa;
656 len += pa;
657 #else // !WC_UTF16
658 if (buf)
659 *buf++ = res;
660 len++;
661 #endif // WC_UTF16/!WC_UTF16
662 }
663 }
664 }
665 if (buf && (len < n))
666 *buf = 0;
667 return len;
668 }
669
670 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
671 {
672 size_t len = 0;
673
674 while (*psz && ((!buf) || (len < n)))
675 {
676 wxUint32 cc;
677 #ifdef WC_UTF16
678 // cast is ok for WC_UTF16
679 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
680 psz += (pa == (size_t)-1) ? 1 : pa;
681 #else
682 cc=(*psz++) & 0x7fffffff;
683 #endif
684 unsigned cnt;
685 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
686 if (!cnt)
687 {
688 // plain ASCII char
689 if (buf)
690 *buf++ = (char) cc;
691 len++;
692 }
693
694 else
695 {
696 len += cnt + 1;
697 if (buf)
698 {
699 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
700 while (cnt--)
701 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
702 }
703 }
704 }
705
706 if (buf && (len<n)) *buf = 0;
707
708 return len;
709 }
710
711
712
713
714 // ----------------------------------------------------------------------------
715 // UTF-16
716 // ----------------------------------------------------------------------------
717
// The member functions below are written once for the converter whose byte
// order matches the host ("straight") and once for the opposite byte order
// ("swap"); WORDS_BIGENDIAN decides which of the BE/LE classes each name
// stands for.
718 #ifdef WORDS_BIGENDIAN
719 #define wxMBConvUTF16straight wxMBConvUTF16BE
720 #define wxMBConvUTF16swap wxMBConvUTF16LE
721 #else
722 #define wxMBConvUTF16swap wxMBConvUTF16BE
723 #define wxMBConvUTF16straight wxMBConvUTF16LE
724 #endif
725
726
727 #ifdef WC_UTF16
728
729 // copy 16bit MB to 16bit String
730 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
731 {
732 size_t len=0;
733
734 while (*(wxUint16*)psz && (!buf || len < n))
735 {
736 if (buf)
737 *buf++ = *(wxUint16*)psz;
738 len++;
739
740 psz += sizeof(wxUint16);
741 }
742 if (buf && len<n) *buf=0;
743
744 return len;
745 }
746
747
748 // copy 16bit String to 16bit MB
749 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
750 {
751 size_t len=0;
752
753 while (*psz && (!buf || len < n))
754 {
755 if (buf)
756 {
757 *(wxUint16*)buf = *psz;
758 buf += sizeof(wxUint16);
759 }
760 len += sizeof(wxUint16);
761 psz++;
762 }
763 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
764
765 return len;
766 }
767
768
769 // swap 16bit MB to 16bit String
770 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
771 {
772 size_t len=0;
773
774 while (*(wxUint16*)psz && (!buf || len < n))
775 {
776 if (buf)
777 {
778 ((char *)buf)[0] = psz[1];
779 ((char *)buf)[1] = psz[0];
780 buf++;
781 }
782 len++;
783 psz += sizeof(wxUint16);
784 }
785 if (buf && len<n) *buf=0;
786
787 return len;
788 }
789
790
791 // swap 16bit MB to 16bit String
792 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
793 {
794 size_t len=0;
795
796 while (*psz && (!buf || len < n))
797 {
798 if (buf)
799 {
800 *buf++ = ((char*)psz)[1];
801 *buf++ = ((char*)psz)[0];
802 }
803 len += sizeof(wxUint16);
804 psz++;
805 }
806 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
807
808 return len;
809 }
810
811
812 #else // WC_UTF16
813
814
815 // copy 16bit MB to 32bit String
816 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
817 {
818 size_t len=0;
819
820 while (*(wxUint16*)psz && (!buf || len < n))
821 {
822 wxUint32 cc;
823 size_t pa=decode_utf16((wxUint16*)psz, cc);
824 if (pa == (size_t)-1)
825 return pa;
826
827 if (buf)
828 *buf++ = cc;
829 len++;
830 psz += pa * sizeof(wxUint16);
831 }
832 if (buf && len<n) *buf=0;
833
834 return len;
835 }
836
837
838 // copy 32bit String to 16bit MB
839 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
840 {
841 size_t len=0;
842
843 while (*psz && (!buf || len < n))
844 {
845 wxUint16 cc[2];
846 size_t pa=encode_utf16(*psz, cc);
847
848 if (pa == (size_t)-1)
849 return pa;
850
851 if (buf)
852 {
853 *(wxUint16*)buf = cc[0];
854 buf += sizeof(wxUint16);
855 if (pa > 1)
856 {
857 *(wxUint16*)buf = cc[1];
858 buf += sizeof(wxUint16);
859 }
860 }
861
862 len += pa*sizeof(wxUint16);
863 psz++;
864 }
865 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
866
867 return len;
868 }
869
870
871 // swap 16bit MB to 32bit String
872 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
873 {
874 size_t len=0;
875
876 while (*(wxUint16*)psz && (!buf || len < n))
877 {
878 wxUint32 cc;
879 char tmp[4];
880 tmp[0]=psz[1]; tmp[1]=psz[0];
881 tmp[2]=psz[3]; tmp[3]=psz[2];
882
883 size_t pa=decode_utf16((wxUint16*)tmp, cc);
884 if (pa == (size_t)-1)
885 return pa;
886
887 if (buf)
888 *buf++ = cc;
889
890 len++;
891 psz += pa * sizeof(wxUint16);
892 }
893 if (buf && len<n) *buf=0;
894
895 return len;
896 }
897
898
899 // swap 32bit String to 16bit MB
900 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
901 {
902 size_t len=0;
903
904 while (*psz && (!buf || len < n))
905 {
906 wxUint16 cc[2];
907 size_t pa=encode_utf16(*psz, cc);
908
909 if (pa == (size_t)-1)
910 return pa;
911
912 if (buf)
913 {
914 *buf++ = ((char*)cc)[1];
915 *buf++ = ((char*)cc)[0];
916 if (pa > 1)
917 {
918 *buf++ = ((char*)cc)[3];
919 *buf++ = ((char*)cc)[2];
920 }
921 }
922
923 len += pa*sizeof(wxUint16);
924 psz++;
925 }
926 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
927
928 return len;
929 }
930
931 #endif // WC_UTF16
932
933
934 // ----------------------------------------------------------------------------
935 // UTF-32
936 // ----------------------------------------------------------------------------
937
// As for UTF-16: "straight" is the converter whose byte order matches the
// host and "swap" the one with the opposite byte order, selected by
// WORDS_BIGENDIAN.
938 #ifdef WORDS_BIGENDIAN
939 #define wxMBConvUTF32straight wxMBConvUTF32BE
940 #define wxMBConvUTF32swap wxMBConvUTF32LE
941 #else
942 #define wxMBConvUTF32swap wxMBConvUTF32BE
943 #define wxMBConvUTF32straight wxMBConvUTF32LE
944 #endif
945
946
// NOTE(review): presumably these define the global UTF-32 converter objects;
// the exact expansion of WXDLLIMPEXP_DATA_BASE is not visible here.
947 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
948 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
949
950
951 #ifdef WC_UTF16
952
953 // copy 32bit MB to 16bit String
954 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
955 {
956 size_t len=0;
957
958 while (*(wxUint32*)psz && (!buf || len < n))
959 {
960 wxUint16 cc[2];
961
962 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
963 if (pa == (size_t)-1)
964 return pa;
965
966 if (buf)
967 {
968 *buf++ = cc[0];
969 if (pa > 1)
970 *buf++ = cc[1];
971 }
972 len += pa;
973 psz += sizeof(wxUint32);
974 }
975 if (buf && len<n) *buf=0;
976
977 return len;
978 }
979
980
981 // copy 16bit String to 32bit MB
982 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
983 {
984 size_t len=0;
985
986 while (*psz && (!buf || len < n))
987 {
988 wxUint32 cc;
989
990 // cast is ok for WC_UTF16
991 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
992 if (pa == (size_t)-1)
993 return pa;
994
995 if (buf)
996 {
997 *(wxUint32*)buf = cc;
998 buf += sizeof(wxUint32);
999 }
1000 len += sizeof(wxUint32);
1001 psz += pa;
1002 }
1003
1004 if (buf && len<=n-sizeof(wxUint32))
1005 *(wxUint32*)buf=0;
1006
1007 return len;
1008 }
1009
1010
1011
1012 // swap 32bit MB to 16bit String
1013 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1014 {
1015 size_t len=0;
1016
1017 while (*(wxUint32*)psz && (!buf || len < n))
1018 {
1019 char tmp[4];
1020 tmp[0] = psz[3]; tmp[1] = psz[2];
1021 tmp[2] = psz[1]; tmp[3] = psz[0];
1022
1023
1024 wxUint16 cc[2];
1025
1026 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1027 if (pa == (size_t)-1)
1028 return pa;
1029
1030 if (buf)
1031 {
1032 *buf++ = cc[0];
1033 if (pa > 1)
1034 *buf++ = cc[1];
1035 }
1036 len += pa;
1037 psz += sizeof(wxUint32);
1038 }
1039
1040 if (buf && len<n)
1041 *buf=0;
1042
1043 return len;
1044 }
1045
1046
1047 // swap 16bit String to 32bit MB
1048 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1049 {
1050 size_t len=0;
1051
1052 while (*psz && (!buf || len < n))
1053 {
1054 char cc[4];
1055
1056 // cast is ok for WC_UTF16
1057 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1058 if (pa == (size_t)-1)
1059 return pa;
1060
1061 if (buf)
1062 {
1063 *buf++ = cc[3];
1064 *buf++ = cc[2];
1065 *buf++ = cc[1];
1066 *buf++ = cc[0];
1067 }
1068 len += sizeof(wxUint32);
1069 psz += pa;
1070 }
1071
1072 if (buf && len<=n-sizeof(wxUint32))
1073 *(wxUint32*)buf=0;
1074
1075 return len;
1076 }
1077
1078 #else // WC_UTF16
1079
1080
1081 // copy 32bit MB to 32bit String
1082 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1083 {
1084 size_t len=0;
1085
1086 while (*(wxUint32*)psz && (!buf || len < n))
1087 {
1088 if (buf)
1089 *buf++ = *(wxUint32*)psz;
1090 len++;
1091 psz += sizeof(wxUint32);
1092 }
1093
1094 if (buf && len<n)
1095 *buf=0;
1096
1097 return len;
1098 }
1099
1100
1101 // copy 32bit String to 32bit MB
1102 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1103 {
1104 size_t len=0;
1105
1106 while (*psz && (!buf || len < n))
1107 {
1108 if (buf)
1109 {
1110 *(wxUint32*)buf = *psz;
1111 buf += sizeof(wxUint32);
1112 }
1113
1114 len += sizeof(wxUint32);
1115 psz++;
1116 }
1117
1118 if (buf && len<=n-sizeof(wxUint32))
1119 *(wxUint32*)buf=0;
1120
1121 return len;
1122 }
1123
1124
1125 // swap 32bit MB to 32bit String
1126 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1127 {
1128 size_t len=0;
1129
1130 while (*(wxUint32*)psz && (!buf || len < n))
1131 {
1132 if (buf)
1133 {
1134 ((char *)buf)[0] = psz[3];
1135 ((char *)buf)[1] = psz[2];
1136 ((char *)buf)[2] = psz[1];
1137 ((char *)buf)[3] = psz[0];
1138 buf++;
1139 }
1140 len++;
1141 psz += sizeof(wxUint32);
1142 }
1143
1144 if (buf && len<n)
1145 *buf=0;
1146
1147 return len;
1148 }
1149
1150
1151 // swap 32bit String to 32bit MB
1152 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1153 {
1154 size_t len=0;
1155
1156 while (*psz && (!buf || len < n))
1157 {
1158 if (buf)
1159 {
1160 *buf++ = ((char *)psz)[3];
1161 *buf++ = ((char *)psz)[2];
1162 *buf++ = ((char *)psz)[1];
1163 *buf++ = ((char *)psz)[0];
1164 }
1165 len += sizeof(wxUint32);
1166 psz++;
1167 }
1168
1169 if (buf && len<=n-sizeof(wxUint32))
1170 *(wxUint32*)buf=0;
1171
1172 return len;
1173 }
1174
1175
1176 #endif // WC_UTF16
1177
1178
1179 // ============================================================================
1180 // The classes doing conversion using the iconv_xxx() functions
1181 // ============================================================================
1182
1183 #ifdef HAVE_ICONV
1184
1185 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1186 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1187 // (unless there's yet another bug in glibc) the only case when iconv()
1188 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1189 // left in the input buffer -- when _real_ error occurs,
1190 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1191 // iconv() failure.
1192 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call returning cres with
// bufLeft bytes of input left must be treated as failed. For glibc 2.0/2.1
// the E2BIG-with-nothing-left case is not counted as a failure.
1193 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1194 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1195 (errno != E2BIG || bufLeft != 0))
1196 #else
1197 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1198 #endif
1199
// Cast applied to the input buffer pointer passed to iconv(); ICONV_CONST is
// defined outside of this file.
1200 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1201
1202 // ----------------------------------------------------------------------------
1203 // wxMBConv_iconv: encapsulates an iconv character set
1204 // ----------------------------------------------------------------------------
1205
1206 class wxMBConv_iconv : public wxMBConv
1207 {
1208 public:
1209 wxMBConv_iconv(const wxChar *name);
1210 virtual ~wxMBConv_iconv();
1211
1212 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1213 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1214
1215 bool IsOk() const
1216 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1217
1218 protected:
1219 // the iconv handlers used to translate from multibyte to wide char and in
1220 // the other direction
1221 iconv_t m2w,
1222 w2m;
1223 #if wxUSE_THREADS
1224 // guards access to m2w and w2m objects
1225 wxMutex m_iconvMutex;
1226 #endif
1227
1228 private:
1229 // the name (for iconv_open()) of a wide char charset -- if none is
1230 // available on this machine, it will remain NULL
1231 static const char *ms_wcCharsetName;
1232
1233 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1234 // different endian-ness than the native one
1235 static bool ms_wcNeedsSwap;
1236 };
1237
// Static members shared by all wxMBConv_iconv objects; they are filled in by
// the first constructor call which finds ms_wcCharsetName still NULL.
1238 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1239 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1240
1241 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1242 {
1243 // Do it the hard way
1244 char cname[100];
1245 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1246 cname[i] = (char) name[i];
1247
1248 // check for charset that represents wchar_t:
1249 if (ms_wcCharsetName == NULL)
1250 {
1251 ms_wcNeedsSwap = false;
1252
1253 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1254 ms_wcCharsetName = WC_NAME_BEST;
1255 m2w = iconv_open(ms_wcCharsetName, cname);
1256
1257 if (m2w == (iconv_t)-1)
1258 {
1259 // try charset w/o bytesex info (e.g. "UCS4")
1260 // and check for bytesex ourselves:
1261 ms_wcCharsetName = WC_NAME;
1262 m2w = iconv_open(ms_wcCharsetName, cname);
1263
1264 // last bet, try if it knows WCHAR_T pseudo-charset
1265 if (m2w == (iconv_t)-1)
1266 {
1267 ms_wcCharsetName = "WCHAR_T";
1268 m2w = iconv_open(ms_wcCharsetName, cname);
1269 }
1270
1271 if (m2w != (iconv_t)-1)
1272 {
1273 char buf[2], *bufPtr;
1274 wchar_t wbuf[2], *wbufPtr;
1275 size_t insz, outsz;
1276 size_t res;
1277
1278 buf[0] = 'A';
1279 buf[1] = 0;
1280 wbuf[0] = 0;
1281 insz = 2;
1282 outsz = SIZEOF_WCHAR_T * 2;
1283 wbufPtr = wbuf;
1284 bufPtr = buf;
1285
1286 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1287 (char**)&wbufPtr, &outsz);
1288
1289 if (ICONV_FAILED(res, insz))
1290 {
1291 ms_wcCharsetName = NULL;
1292 wxLogLastError(wxT("iconv"));
1293 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1294 }
1295 else
1296 {
1297 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1298 }
1299 }
1300 else
1301 {
1302 ms_wcCharsetName = NULL;
1303
1304 // VS: we must not output an error here, since wxWidgets will safely
1305 // fall back to using wxEncodingConverter.
1306 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1307 //wxLogError(
1308 }
1309 }
1310 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1311 }
1312 else // we already have ms_wcCharsetName
1313 {
1314 m2w = iconv_open(ms_wcCharsetName, cname);
1315 }
1316
1317 // NB: don't ever pass NULL to iconv_open(), it may crash!
1318 if ( ms_wcCharsetName )
1319 {
1320 w2m = iconv_open( cname, ms_wcCharsetName);
1321 }
1322 else
1323 {
1324 w2m = (iconv_t)-1;
1325 }
1326 }
1327
1328 wxMBConv_iconv::~wxMBConv_iconv()
1329 {
1330 if ( m2w != (iconv_t)-1 )
1331 iconv_close(m2w);
1332 if ( w2m != (iconv_t)-1 )
1333 iconv_close(w2m);
1334 }
1335
1336 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1337 {
1338 #if wxUSE_THREADS
1339 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1340 // Unfortunately there is a couple of global wxCSConv objects such as
1341 // wxConvLocal that are used all over wx code, so we have to make sure
1342 // the handle is used by at most one thread at the time. Otherwise
1343 // only a few wx classes would be safe to use from non-main threads
1344 // as MB<->WC conversion would fail "randomly".
1345 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1346 #endif
1347
1348 size_t inbuf = strlen(psz);
1349 size_t outbuf = n * SIZEOF_WCHAR_T;
1350 size_t res, cres;
1351 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1352 wchar_t *bufPtr = buf;
1353 const char *pszPtr = psz;
1354
1355 if (buf)
1356 {
1357 // have destination buffer, convert there
1358 cres = iconv(m2w,
1359 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1360 (char**)&bufPtr, &outbuf);
1361 res = n - (outbuf / SIZEOF_WCHAR_T);
1362
1363 if (ms_wcNeedsSwap)
1364 {
1365 // convert to native endianness
1366 WC_BSWAP(buf /* _not_ bufPtr */, res)
1367 }
1368
1369 // NB: iconv was given only strlen(psz) characters on input, and so
1370 // it couldn't convert the trailing zero. Let's do it ourselves
1371 // if there's some room left for it in the output buffer.
1372 if (res < n)
1373 buf[res] = 0;
1374 }
1375 else
1376 {
1377 // no destination buffer... convert using temp buffer
1378 // to calculate destination buffer requirement
1379 wchar_t tbuf[8];
1380 res = 0;
1381 do {
1382 bufPtr = tbuf;
1383 outbuf = 8*SIZEOF_WCHAR_T;
1384
1385 cres = iconv(m2w,
1386 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1387 (char**)&bufPtr, &outbuf );
1388
1389 res += 8-(outbuf/SIZEOF_WCHAR_T);
1390 } while ((cres==(size_t)-1) && (errno==E2BIG));
1391 }
1392
1393 if (ICONV_FAILED(cres, inbuf))
1394 {
1395 //VS: it is ok if iconv fails, hence trace only
1396 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1397 return (size_t)-1;
1398 }
1399
1400 return res;
1401 }
1402
1403 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1404 {
1405 #if wxUSE_THREADS
1406 // NB: explained in MB2WC
1407 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1408 #endif
1409
1410 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1411 size_t outbuf = n;
1412 size_t res, cres;
1413
1414 wchar_t *tmpbuf = 0;
1415
1416 if (ms_wcNeedsSwap)
1417 {
1418 // need to copy to temp buffer to switch endianness
1419 // this absolutely doesn't rock!
1420 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1421 // could be in read-only memory, or be accessed in some other thread)
1422 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1423 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1424 WC_BSWAP(tmpbuf, inbuf)
1425 psz=tmpbuf;
1426 }
1427
1428 if (buf)
1429 {
1430 // have destination buffer, convert there
1431 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1432
1433 res = n-outbuf;
1434
1435 // NB: iconv was given only wcslen(psz) characters on input, and so
1436 // it couldn't convert the trailing zero. Let's do it ourselves
1437 // if there's some room left for it in the output buffer.
1438 if (res < n)
1439 buf[0] = 0;
1440 }
1441 else
1442 {
1443 // no destination buffer... convert using temp buffer
1444 // to calculate destination buffer requirement
1445 char tbuf[16];
1446 res = 0;
1447 do {
1448 buf = tbuf; outbuf = 16;
1449
1450 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1451
1452 res += 16 - outbuf;
1453 } while ((cres==(size_t)-1) && (errno==E2BIG));
1454 }
1455
1456 if (ms_wcNeedsSwap)
1457 {
1458 free(tmpbuf);
1459 }
1460
1461 if (ICONV_FAILED(cres, inbuf))
1462 {
1463 //VS: it is ok if iconv fails, hence trace only
1464 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1465 return (size_t)-1;
1466 }
1467
1468 return res;
1469 }
1470
1471 #endif // HAVE_ICONV
1472
1473
1474 // ============================================================================
1475 // Win32 conversion classes
1476 // ============================================================================
1477
1478 #ifdef wxHAVE_WIN32_MB2WC
1479
1480 // from utils.cpp
1481 #if wxUSE_FONTMAP
1482 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1483 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1484 #endif
1485
1486 class wxMBConv_win32 : public wxMBConv
1487 {
1488 public:
1489 wxMBConv_win32()
1490 {
1491 m_CodePage = CP_ACP;
1492 }
1493
1494 #if wxUSE_FONTMAP
1495 wxMBConv_win32(const wxChar* name)
1496 {
1497 m_CodePage = wxCharsetToCodepage(name);
1498 }
1499
1500 wxMBConv_win32(wxFontEncoding encoding)
1501 {
1502 m_CodePage = wxEncodingToCodepage(encoding);
1503 }
1504 #endif
1505
1506 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1507 {
1508 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1509 // the behaviour is not compatible with the Unix version (using iconv)
1510 // and break the library itself, e.g. wxTextInputStream::NextChar()
1511 // wouldn't work if reading an incomplete MB char didn't result in an
1512 // error
1513 //
1514 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1515 // an error (tested under Windows Server 2003) and apparently it is
1516 // done on purpose, i.e. the function accepts any input in this case
1517 // and although I'd prefer to return error on ill-formed output, our
1518 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1519 // explicitly ill-formed according to RFC 2152) neither so we don't
1520 // even have any fallback here...
1521 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1522
1523 const size_t len = ::MultiByteToWideChar
1524 (
1525 m_CodePage, // code page
1526 flags, // flags: fall on error
1527 psz, // input string
1528 -1, // its length (NUL-terminated)
1529 buf, // output string
1530 buf ? n : 0 // size of output buffer
1531 );
1532
1533 // note that it returns count of written chars for buf != NULL and size
1534 // of the needed buffer for buf == NULL so in either case the length of
1535 // the string (which never includes the terminating NUL) is one less
1536 return len ? len - 1 : (size_t)-1;
1537 }
1538
1539 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1540 {
1541 /*
1542 we have a problem here: by default, WideCharToMultiByte() may
1543 replace characters unrepresentable in the target code page with bad
1544 quality approximations such as turning "1/2" symbol (U+00BD) into
1545 "1" for the code pages which don't have it and we, obviously, want
1546 to avoid this at any price
1547
1548 the trouble is that this function does it _silently_, i.e. it won't
1549 even tell us whether it did or not... Win98/2000 and higher provide
1550 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1551 we have to resort to a round trip, i.e. check that converting back
1552 results in the same string -- this is, of course, expensive but
1553 otherwise we simply can't be sure to not garble the data.
1554 */
1555
1556 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1557 // it doesn't work with CJK encodings (which we test for rather roughly
1558 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1559 // supporting it
1560 BOOL usedDef wxDUMMY_INITIALIZE(false);
1561 BOOL *pUsedDef;
1562 int flags;
1563 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1564 {
1565 // it's our lucky day
1566 flags = WC_NO_BEST_FIT_CHARS;
1567 pUsedDef = &usedDef;
1568 }
1569 else // old system or unsupported encoding
1570 {
1571 flags = 0;
1572 pUsedDef = NULL;
1573 }
1574
1575 const size_t len = ::WideCharToMultiByte
1576 (
1577 m_CodePage, // code page
1578 flags, // either none or no best fit
1579 pwz, // input string
1580 -1, // it is (wide) NUL-terminated
1581 buf, // output buffer
1582 buf ? n : 0, // and its size
1583 NULL, // default "replacement" char
1584 pUsedDef // [out] was it used?
1585 );
1586
1587 if ( !len )
1588 {
1589 // function totally failed
1590 return (size_t)-1;
1591 }
1592
1593 // if we were really converting, check if we succeeded
1594 if ( buf )
1595 {
1596 if ( flags )
1597 {
1598 // check if the conversion failed, i.e. if any replacements
1599 // were done
1600 if ( usedDef )
1601 return (size_t)-1;
1602 }
1603 else // we must resort to double tripping...
1604 {
1605 wxWCharBuffer wcBuf(n);
1606 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1607 wcscmp(wcBuf, pwz) != 0 )
1608 {
1609 // we didn't obtain the same thing we started from, hence
1610 // the conversion was lossy and we consider that it failed
1611 return (size_t)-1;
1612 }
1613 }
1614 }
1615
1616 // see the comment above for the reason of "len - 1"
1617 return len - 1;
1618 }
1619
1620 bool IsOk() const { return m_CodePage != -1; }
1621
1622 private:
1623 static bool CanUseNoBestFit()
1624 {
1625 static int s_isWin98Or2k = -1;
1626
1627 if ( s_isWin98Or2k == -1 )
1628 {
1629 int verMaj, verMin;
1630 switch ( wxGetOsVersion(&verMaj, &verMin) )
1631 {
1632 case wxWIN95:
1633 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1634 break;
1635
1636 case wxWINDOWS_NT:
1637 s_isWin98Or2k = verMaj >= 5;
1638 break;
1639
1640 default:
1641 // unknown, be conseravtive by default
1642 s_isWin98Or2k = 0;
1643 }
1644
1645 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1646 }
1647
1648 return s_isWin98Or2k == 1;
1649 }
1650
1651 long m_CodePage;
1652 };
1653
1654 #endif // wxHAVE_WIN32_MB2WC
1655
1656 // ============================================================================
1657 // Cocoa conversion classes
1658 // ============================================================================
1659
1660 #if defined(__WXCOCOA__)
1661
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1665
1666 #include <CoreFoundation/CFString.h>
1667 #include <CoreFoundation/CFStringEncodingExt.h>
1668
1669 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1670 {
1671 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1672 if ( encoding == wxFONTENCODING_DEFAULT )
1673 {
1674 enc = CFStringGetSystemEncoding();
1675 }
1676 else switch( encoding)
1677 {
1678 case wxFONTENCODING_ISO8859_1 :
1679 enc = kCFStringEncodingISOLatin1 ;
1680 break ;
1681 case wxFONTENCODING_ISO8859_2 :
1682 enc = kCFStringEncodingISOLatin2;
1683 break ;
1684 case wxFONTENCODING_ISO8859_3 :
1685 enc = kCFStringEncodingISOLatin3 ;
1686 break ;
1687 case wxFONTENCODING_ISO8859_4 :
1688 enc = kCFStringEncodingISOLatin4;
1689 break ;
1690 case wxFONTENCODING_ISO8859_5 :
1691 enc = kCFStringEncodingISOLatinCyrillic;
1692 break ;
1693 case wxFONTENCODING_ISO8859_6 :
1694 enc = kCFStringEncodingISOLatinArabic;
1695 break ;
1696 case wxFONTENCODING_ISO8859_7 :
1697 enc = kCFStringEncodingISOLatinGreek;
1698 break ;
1699 case wxFONTENCODING_ISO8859_8 :
1700 enc = kCFStringEncodingISOLatinHebrew;
1701 break ;
1702 case wxFONTENCODING_ISO8859_9 :
1703 enc = kCFStringEncodingISOLatin5;
1704 break ;
1705 case wxFONTENCODING_ISO8859_10 :
1706 enc = kCFStringEncodingISOLatin6;
1707 break ;
1708 case wxFONTENCODING_ISO8859_11 :
1709 enc = kCFStringEncodingISOLatinThai;
1710 break ;
1711 case wxFONTENCODING_ISO8859_13 :
1712 enc = kCFStringEncodingISOLatin7;
1713 break ;
1714 case wxFONTENCODING_ISO8859_14 :
1715 enc = kCFStringEncodingISOLatin8;
1716 break ;
1717 case wxFONTENCODING_ISO8859_15 :
1718 enc = kCFStringEncodingISOLatin9;
1719 break ;
1720
1721 case wxFONTENCODING_KOI8 :
1722 enc = kCFStringEncodingKOI8_R;
1723 break ;
1724 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1725 enc = kCFStringEncodingDOSRussian;
1726 break ;
1727
1728 // case wxFONTENCODING_BULGARIAN :
1729 // enc = ;
1730 // break ;
1731
1732 case wxFONTENCODING_CP437 :
1733 enc =kCFStringEncodingDOSLatinUS ;
1734 break ;
1735 case wxFONTENCODING_CP850 :
1736 enc = kCFStringEncodingDOSLatin1;
1737 break ;
1738 case wxFONTENCODING_CP852 :
1739 enc = kCFStringEncodingDOSLatin2;
1740 break ;
1741 case wxFONTENCODING_CP855 :
1742 enc = kCFStringEncodingDOSCyrillic;
1743 break ;
1744 case wxFONTENCODING_CP866 :
1745 enc =kCFStringEncodingDOSRussian ;
1746 break ;
1747 case wxFONTENCODING_CP874 :
1748 enc = kCFStringEncodingDOSThai;
1749 break ;
1750 case wxFONTENCODING_CP932 :
1751 enc = kCFStringEncodingDOSJapanese;
1752 break ;
1753 case wxFONTENCODING_CP936 :
1754 enc =kCFStringEncodingDOSChineseSimplif ;
1755 break ;
1756 case wxFONTENCODING_CP949 :
1757 enc = kCFStringEncodingDOSKorean;
1758 break ;
1759 case wxFONTENCODING_CP950 :
1760 enc = kCFStringEncodingDOSChineseTrad;
1761 break ;
1762 case wxFONTENCODING_CP1250 :
1763 enc = kCFStringEncodingWindowsLatin2;
1764 break ;
1765 case wxFONTENCODING_CP1251 :
1766 enc =kCFStringEncodingWindowsCyrillic ;
1767 break ;
1768 case wxFONTENCODING_CP1252 :
1769 enc =kCFStringEncodingWindowsLatin1 ;
1770 break ;
1771 case wxFONTENCODING_CP1253 :
1772 enc = kCFStringEncodingWindowsGreek;
1773 break ;
1774 case wxFONTENCODING_CP1254 :
1775 enc = kCFStringEncodingWindowsLatin5;
1776 break ;
1777 case wxFONTENCODING_CP1255 :
1778 enc =kCFStringEncodingWindowsHebrew ;
1779 break ;
1780 case wxFONTENCODING_CP1256 :
1781 enc =kCFStringEncodingWindowsArabic ;
1782 break ;
1783 case wxFONTENCODING_CP1257 :
1784 enc = kCFStringEncodingWindowsBalticRim;
1785 break ;
1786 // This only really encodes to UTF7 (if that) evidently
1787 // case wxFONTENCODING_UTF7 :
1788 // enc = kCFStringEncodingNonLossyASCII ;
1789 // break ;
1790 case wxFONTENCODING_UTF8 :
1791 enc = kCFStringEncodingUTF8 ;
1792 break ;
1793 case wxFONTENCODING_EUC_JP :
1794 enc = kCFStringEncodingEUC_JP;
1795 break ;
1796 case wxFONTENCODING_UTF16 :
1797 enc = kCFStringEncodingUnicode ;
1798 break ;
1799 case wxFONTENCODING_MACROMAN :
1800 enc = kCFStringEncodingMacRoman ;
1801 break ;
1802 case wxFONTENCODING_MACJAPANESE :
1803 enc = kCFStringEncodingMacJapanese ;
1804 break ;
1805 case wxFONTENCODING_MACCHINESETRAD :
1806 enc = kCFStringEncodingMacChineseTrad ;
1807 break ;
1808 case wxFONTENCODING_MACKOREAN :
1809 enc = kCFStringEncodingMacKorean ;
1810 break ;
1811 case wxFONTENCODING_MACARABIC :
1812 enc = kCFStringEncodingMacArabic ;
1813 break ;
1814 case wxFONTENCODING_MACHEBREW :
1815 enc = kCFStringEncodingMacHebrew ;
1816 break ;
1817 case wxFONTENCODING_MACGREEK :
1818 enc = kCFStringEncodingMacGreek ;
1819 break ;
1820 case wxFONTENCODING_MACCYRILLIC :
1821 enc = kCFStringEncodingMacCyrillic ;
1822 break ;
1823 case wxFONTENCODING_MACDEVANAGARI :
1824 enc = kCFStringEncodingMacDevanagari ;
1825 break ;
1826 case wxFONTENCODING_MACGURMUKHI :
1827 enc = kCFStringEncodingMacGurmukhi ;
1828 break ;
1829 case wxFONTENCODING_MACGUJARATI :
1830 enc = kCFStringEncodingMacGujarati ;
1831 break ;
1832 case wxFONTENCODING_MACORIYA :
1833 enc = kCFStringEncodingMacOriya ;
1834 break ;
1835 case wxFONTENCODING_MACBENGALI :
1836 enc = kCFStringEncodingMacBengali ;
1837 break ;
1838 case wxFONTENCODING_MACTAMIL :
1839 enc = kCFStringEncodingMacTamil ;
1840 break ;
1841 case wxFONTENCODING_MACTELUGU :
1842 enc = kCFStringEncodingMacTelugu ;
1843 break ;
1844 case wxFONTENCODING_MACKANNADA :
1845 enc = kCFStringEncodingMacKannada ;
1846 break ;
1847 case wxFONTENCODING_MACMALAJALAM :
1848 enc = kCFStringEncodingMacMalayalam ;
1849 break ;
1850 case wxFONTENCODING_MACSINHALESE :
1851 enc = kCFStringEncodingMacSinhalese ;
1852 break ;
1853 case wxFONTENCODING_MACBURMESE :
1854 enc = kCFStringEncodingMacBurmese ;
1855 break ;
1856 case wxFONTENCODING_MACKHMER :
1857 enc = kCFStringEncodingMacKhmer ;
1858 break ;
1859 case wxFONTENCODING_MACTHAI :
1860 enc = kCFStringEncodingMacThai ;
1861 break ;
1862 case wxFONTENCODING_MACLAOTIAN :
1863 enc = kCFStringEncodingMacLaotian ;
1864 break ;
1865 case wxFONTENCODING_MACGEORGIAN :
1866 enc = kCFStringEncodingMacGeorgian ;
1867 break ;
1868 case wxFONTENCODING_MACARMENIAN :
1869 enc = kCFStringEncodingMacArmenian ;
1870 break ;
1871 case wxFONTENCODING_MACCHINESESIMP :
1872 enc = kCFStringEncodingMacChineseSimp ;
1873 break ;
1874 case wxFONTENCODING_MACTIBETAN :
1875 enc = kCFStringEncodingMacTibetan ;
1876 break ;
1877 case wxFONTENCODING_MACMONGOLIAN :
1878 enc = kCFStringEncodingMacMongolian ;
1879 break ;
1880 case wxFONTENCODING_MACETHIOPIC :
1881 enc = kCFStringEncodingMacEthiopic ;
1882 break ;
1883 case wxFONTENCODING_MACCENTRALEUR :
1884 enc = kCFStringEncodingMacCentralEurRoman ;
1885 break ;
1886 case wxFONTENCODING_MACVIATNAMESE :
1887 enc = kCFStringEncodingMacVietnamese ;
1888 break ;
1889 case wxFONTENCODING_MACARABICEXT :
1890 enc = kCFStringEncodingMacExtArabic ;
1891 break ;
1892 case wxFONTENCODING_MACSYMBOL :
1893 enc = kCFStringEncodingMacSymbol ;
1894 break ;
1895 case wxFONTENCODING_MACDINGBATS :
1896 enc = kCFStringEncodingMacDingbats ;
1897 break ;
1898 case wxFONTENCODING_MACTURKISH :
1899 enc = kCFStringEncodingMacTurkish ;
1900 break ;
1901 case wxFONTENCODING_MACCROATIAN :
1902 enc = kCFStringEncodingMacCroatian ;
1903 break ;
1904 case wxFONTENCODING_MACICELANDIC :
1905 enc = kCFStringEncodingMacIcelandic ;
1906 break ;
1907 case wxFONTENCODING_MACROMANIAN :
1908 enc = kCFStringEncodingMacRomanian ;
1909 break ;
1910 case wxFONTENCODING_MACCELTIC :
1911 enc = kCFStringEncodingMacCeltic ;
1912 break ;
1913 case wxFONTENCODING_MACGAELIC :
1914 enc = kCFStringEncodingMacGaelic ;
1915 break ;
1916 // case wxFONTENCODING_MACKEYBOARD :
1917 // enc = kCFStringEncodingMacKeyboardGlyphs ;
1918 // break ;
1919 default :
1920 // because gcc is picky
1921 break ;
1922 } ;
1923 return enc ;
1924 }
1925
1926 class wxMBConv_cocoa : public wxMBConv
1927 {
1928 public:
1929 wxMBConv_cocoa()
1930 {
1931 Init(CFStringGetSystemEncoding()) ;
1932 }
1933
1934 #if wxUSE_FONTMAP
1935 wxMBConv_cocoa(const wxChar* name)
1936 {
1937 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
1938 }
1939 #endif
1940
1941 wxMBConv_cocoa(wxFontEncoding encoding)
1942 {
1943 Init( wxCFStringEncFromFontEnc(encoding) );
1944 }
1945
1946 ~wxMBConv_cocoa()
1947 {
1948 }
1949
1950 void Init( CFStringEncoding encoding)
1951 {
1952 m_encoding = encoding ;
1953 }
1954
1955 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1956 {
1957 wxASSERT(szUnConv);
1958
1959 CFStringRef theString = CFStringCreateWithBytes (
1960 NULL, //the allocator
1961 (const UInt8*)szUnConv,
1962 strlen(szUnConv),
1963 m_encoding,
1964 false //no BOM/external representation
1965 );
1966
1967 wxASSERT(theString);
1968
1969 size_t nOutLength = CFStringGetLength(theString);
1970
1971 if (szOut == NULL)
1972 {
1973 CFRelease(theString);
1974 return nOutLength;
1975 }
1976
1977 CFRange theRange = { 0, nOutSize };
1978
1979 #if SIZEOF_WCHAR_T == 4
1980 UniChar* szUniCharBuffer = new UniChar[nOutSize];
1981 #endif
1982
1983 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
1984
1985 CFRelease(theString);
1986
1987 szUniCharBuffer[nOutLength] = '\0' ;
1988
1989 #if SIZEOF_WCHAR_T == 4
1990 wxMBConvUTF16 converter ;
1991 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
1992 delete[] szUniCharBuffer;
1993 #endif
1994
1995 return nOutLength;
1996 }
1997
1998 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1999 {
2000 wxASSERT(szUnConv);
2001
2002 size_t nRealOutSize;
2003 size_t nBufSize = wxWcslen(szUnConv);
2004 UniChar* szUniBuffer = (UniChar*) szUnConv;
2005
2006 #if SIZEOF_WCHAR_T == 4
2007 wxMBConvUTF16BE converter ;
2008 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2009 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2010 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2011 nBufSize /= sizeof(UniChar);
2012 #endif
2013
2014 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2015 NULL, //allocator
2016 szUniBuffer,
2017 nBufSize,
2018 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2019 );
2020
2021 wxASSERT(theString);
2022
2023 //Note that CER puts a BOM when converting to unicode
2024 //so we check and use getchars instead in that case
2025 if (m_encoding == kCFStringEncodingUnicode)
2026 {
2027 if (szOut != NULL)
2028 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2029
2030 nRealOutSize = CFStringGetLength(theString) + 1;
2031 }
2032 else
2033 {
2034 CFStringGetBytes(
2035 theString,
2036 CFRangeMake(0, CFStringGetLength(theString)),
2037 m_encoding,
2038 0, //what to put in characters that can't be converted -
2039 //0 tells CFString to return NULL if it meets such a character
2040 false, //not an external representation
2041 (UInt8*) szOut,
2042 nOutSize,
2043 (CFIndex*) &nRealOutSize
2044 );
2045 }
2046
2047 CFRelease(theString);
2048
2049 #if SIZEOF_WCHAR_T == 4
2050 delete[] szUniBuffer;
2051 #endif
2052
2053 return nRealOutSize - 1;
2054 }
2055
2056 bool IsOk() const
2057 {
2058 return m_encoding != kCFStringEncodingInvalidId &&
2059 CFStringIsEncodingAvailable(m_encoding);
2060 }
2061
2062 private:
2063 CFStringEncoding m_encoding ;
2064 };
2065
2066 #endif // defined(__WXCOCOA__)
2067
2068 // ============================================================================
2069 // Mac conversion classes
2070 // ============================================================================
2071
2072 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2073
2074 class wxMBConv_mac : public wxMBConv
2075 {
2076 public:
2077 wxMBConv_mac()
2078 {
2079 Init(CFStringGetSystemEncoding()) ;
2080 }
2081
2082 #if wxUSE_FONTMAP
2083 wxMBConv_mac(const wxChar* name)
2084 {
2085 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2086 }
2087 #endif
2088
2089 wxMBConv_mac(wxFontEncoding encoding)
2090 {
2091 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2092 }
2093
2094 ~wxMBConv_mac()
2095 {
2096 OSStatus status = noErr ;
2097 status = TECDisposeConverter(m_MB2WC_converter);
2098 status = TECDisposeConverter(m_WC2MB_converter);
2099 }
2100
2101
2102 void Init( TextEncodingBase encoding)
2103 {
2104 OSStatus status = noErr ;
2105 m_char_encoding = encoding ;
2106 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2107
2108 status = TECCreateConverter(&m_MB2WC_converter,
2109 m_char_encoding,
2110 m_unicode_encoding);
2111 status = TECCreateConverter(&m_WC2MB_converter,
2112 m_unicode_encoding,
2113 m_char_encoding);
2114 }
2115
2116 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2117 {
2118 OSStatus status = noErr ;
2119 ByteCount byteOutLen ;
2120 ByteCount byteInLen = strlen(psz) ;
2121 wchar_t *tbuf = NULL ;
2122 UniChar* ubuf = NULL ;
2123 size_t res = 0 ;
2124
2125 if (buf == NULL)
2126 {
2127 //apple specs say at least 32
2128 n = wxMax( 32 , byteInLen ) ;
2129 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2130 }
2131 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2132 #if SIZEOF_WCHAR_T == 4
2133 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2134 #else
2135 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2136 #endif
2137 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2138 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2139 #if SIZEOF_WCHAR_T == 4
2140 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2141 // is not properly terminated we get random characters at the end
2142 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2143 wxMBConvUTF16BE converter ;
2144 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2145 free( ubuf ) ;
2146 #else
2147 res = byteOutLen / sizeof( UniChar ) ;
2148 #endif
2149 if ( buf == NULL )
2150 free(tbuf) ;
2151
2152 if ( buf && res < n)
2153 buf[res] = 0;
2154
2155 return res ;
2156 }
2157
2158 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2159 {
2160 OSStatus status = noErr ;
2161 ByteCount byteOutLen ;
2162 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2163
2164 char *tbuf = NULL ;
2165
2166 if (buf == NULL)
2167 {
2168 //apple specs say at least 32
2169 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2170 tbuf = (char*) malloc( n ) ;
2171 }
2172
2173 ByteCount byteBufferLen = n ;
2174 UniChar* ubuf = NULL ;
2175 #if SIZEOF_WCHAR_T == 4
2176 wxMBConvUTF16BE converter ;
2177 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2178 byteInLen = unicharlen ;
2179 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2180 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2181 #else
2182 ubuf = (UniChar*) psz ;
2183 #endif
2184 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2185 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2186 #if SIZEOF_WCHAR_T == 4
2187 free( ubuf ) ;
2188 #endif
2189 if ( buf == NULL )
2190 free(tbuf) ;
2191
2192 size_t res = byteOutLen ;
2193 if ( buf && res < n)
2194 {
2195 buf[res] = 0;
2196
2197 //we need to double-trip to verify it didn't insert any ? in place
2198 //of bogus characters
2199 wxWCharBuffer wcBuf(n);
2200 size_t pszlen = wxWcslen(psz);
2201 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2202 wxWcslen(wcBuf) != pszlen ||
2203 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2204 {
2205 // we didn't obtain the same thing we started from, hence
2206 // the conversion was lossy and we consider that it failed
2207 return (size_t)-1;
2208 }
2209 }
2210
2211 return res ;
2212 }
2213
2214 bool IsOk() const
2215 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2216
2217 private:
2218 TECObjectRef m_MB2WC_converter ;
2219 TECObjectRef m_WC2MB_converter ;
2220
2221 TextEncodingBase m_char_encoding ;
2222 TextEncodingBase m_unicode_encoding ;
2223 };
2224
2225 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2226
2227 // ============================================================================
2228 // wxEncodingConverter based conversion classes
2229 // ============================================================================
2230
2231 #if wxUSE_FONTMAP
2232
2233 class wxMBConv_wxwin : public wxMBConv
2234 {
2235 private:
2236 void Init()
2237 {
2238 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2239 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2240 }
2241
2242 public:
2243 // temporarily just use wxEncodingConverter stuff,
2244 // so that it works while a better implementation is built
2245 wxMBConv_wxwin(const wxChar* name)
2246 {
2247 if (name)
2248 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2249 else
2250 m_enc = wxFONTENCODING_SYSTEM;
2251
2252 Init();
2253 }
2254
2255 wxMBConv_wxwin(wxFontEncoding enc)
2256 {
2257 m_enc = enc;
2258
2259 Init();
2260 }
2261
2262 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2263 {
2264 size_t inbuf = strlen(psz);
2265 if (buf)
2266 {
2267 if (!m2w.Convert(psz,buf))
2268 return (size_t)-1;
2269 }
2270 return inbuf;
2271 }
2272
2273 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2274 {
2275 const size_t inbuf = wxWcslen(psz);
2276 if (buf)
2277 {
2278 if (!w2m.Convert(psz,buf))
2279 return (size_t)-1;
2280 }
2281
2282 return inbuf;
2283 }
2284
2285 bool IsOk() const { return m_ok; }
2286
2287 public:
2288 wxFontEncoding m_enc;
2289 wxEncodingConverter m2w, w2m;
2290
2291 // were we initialized successfully?
2292 bool m_ok;
2293
2294 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2295 };
2296
2297 #endif // wxUSE_FONTMAP
2298
2299 // ============================================================================
2300 // wxCSConv implementation
2301 // ============================================================================
2302
2303 void wxCSConv::Init()
2304 {
2305 m_name = NULL;
2306 m_convReal = NULL;
2307 m_deferred = true;
2308 }
2309
2310 wxCSConv::wxCSConv(const wxChar *charset)
2311 {
2312 Init();
2313
2314 if ( charset )
2315 {
2316 SetName(charset);
2317 }
2318
2319 m_encoding = wxFONTENCODING_SYSTEM;
2320 }
2321
2322 wxCSConv::wxCSConv(wxFontEncoding encoding)
2323 {
2324 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2325 {
2326 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2327
2328 encoding = wxFONTENCODING_SYSTEM;
2329 }
2330
2331 Init();
2332
2333 m_encoding = encoding;
2334 }
2335
2336 wxCSConv::~wxCSConv()
2337 {
2338 Clear();
2339 }
2340
2341 wxCSConv::wxCSConv(const wxCSConv& conv)
2342 : wxMBConv()
2343 {
2344 Init();
2345
2346 SetName(conv.m_name);
2347 m_encoding = conv.m_encoding;
2348 }
2349
2350 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2351 {
2352 Clear();
2353
2354 SetName(conv.m_name);
2355 m_encoding = conv.m_encoding;
2356
2357 return *this;
2358 }
2359
2360 void wxCSConv::Clear()
2361 {
2362 free(m_name);
2363 delete m_convReal;
2364
2365 m_name = NULL;
2366 m_convReal = NULL;
2367 }
2368
2369 void wxCSConv::SetName(const wxChar *charset)
2370 {
2371 if (charset)
2372 {
2373 m_name = wxStrdup(charset);
2374 m_deferred = true;
2375 }
2376 }
2377
2378 wxMBConv *wxCSConv::DoCreate() const
2379 {
2380 // check for the special case of ASCII or ISO8859-1 charset: as we have
2381 // special knowledge of it anyhow, we don't need to create a special
2382 // conversion object
2383 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2384 {
2385 // don't convert at all
2386 return NULL;
2387 }
2388
2389 // we trust OS to do conversion better than we can so try external
2390 // conversion methods first
2391 //
2392 // the full order is:
2393 // 1. OS conversion (iconv() under Unix or Win32 API)
2394 // 2. hard coded conversions for UTF
2395 // 3. wxEncodingConverter as fall back
2396
2397 // step (1)
2398 #ifdef HAVE_ICONV
2399 #if !wxUSE_FONTMAP
2400 if ( m_name )
2401 #endif // !wxUSE_FONTMAP
2402 {
2403 wxString name(m_name);
2404
2405 #if wxUSE_FONTMAP
2406 if ( name.empty() )
2407 name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2408 #endif // wxUSE_FONTMAP
2409
2410 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2411 if ( conv->IsOk() )
2412 return conv;
2413
2414 delete conv;
2415 }
2416 #endif // HAVE_ICONV
2417
2418 #ifdef wxHAVE_WIN32_MB2WC
2419 {
2420 #if wxUSE_FONTMAP
2421 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2422 : new wxMBConv_win32(m_encoding);
2423 if ( conv->IsOk() )
2424 return conv;
2425
2426 delete conv;
2427 #else
2428 return NULL;
2429 #endif
2430 }
2431 #endif // wxHAVE_WIN32_MB2WC
2432 #if defined(__WXMAC__)
2433 {
2434 // leave UTF16 and UTF32 to the built-ins of wx
2435 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2436 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2437 {
2438
2439 #if wxUSE_FONTMAP
2440 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2441 : new wxMBConv_mac(m_encoding);
2442 #else
2443 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2444 #endif
2445 if ( conv->IsOk() )
2446 return conv;
2447
2448 delete conv;
2449 }
2450 }
2451 #endif
2452 #if defined(__WXCOCOA__)
2453 {
2454 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2455 {
2456
2457 #if wxUSE_FONTMAP
2458 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2459 : new wxMBConv_cocoa(m_encoding);
2460 #else
2461 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2462 #endif
2463 if ( conv->IsOk() )
2464 return conv;
2465
2466 delete conv;
2467 }
2468 }
2469 #endif
2470 // step (2)
2471 wxFontEncoding enc = m_encoding;
2472 #if wxUSE_FONTMAP
2473 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2474 {
2475 // use "false" to suppress interactive dialogs -- we can be called from
2476 // anywhere and popping up a dialog from here is the last thing we want to
2477 // do
2478 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2479 }
2480 #endif // wxUSE_FONTMAP
2481
2482 switch ( enc )
2483 {
2484 case wxFONTENCODING_UTF7:
2485 return new wxMBConvUTF7;
2486
2487 case wxFONTENCODING_UTF8:
2488 return new wxMBConvUTF8;
2489
2490 case wxFONTENCODING_UTF16BE:
2491 return new wxMBConvUTF16BE;
2492
2493 case wxFONTENCODING_UTF16LE:
2494 return new wxMBConvUTF16LE;
2495
2496 case wxFONTENCODING_UTF32BE:
2497 return new wxMBConvUTF32BE;
2498
2499 case wxFONTENCODING_UTF32LE:
2500 return new wxMBConvUTF32LE;
2501
2502 default:
2503 // nothing to do but put here to suppress gcc warnings
2504 ;
2505 }
2506
2507 // step (3)
2508 #if wxUSE_FONTMAP
2509 {
2510 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2511 : new wxMBConv_wxwin(m_encoding);
2512 if ( conv->IsOk() )
2513 return conv;
2514
2515 delete conv;
2516 }
2517 #endif // wxUSE_FONTMAP
2518
2519 // NB: This is a hack to prevent deadlock. What could otherwise happen
2520 // in Unicode build: wxConvLocal creation ends up being here
2521 // because of some failure and logs the error. But wxLog will try to
2522 // attach timestamp, for which it will need wxConvLocal (to convert
2523 // time to char* and then wchar_t*), but that fails, tries to log
2524 // error, but wxLog has a (already locked) critical section that
2525 // guards static buffer.
2526 static bool alreadyLoggingError = false;
2527 if (!alreadyLoggingError)
2528 {
2529 alreadyLoggingError = true;
2530 wxLogError(_("Cannot convert from the charset '%s'!"),
2531 m_name ? m_name
2532 :
2533 #if wxUSE_FONTMAP
2534 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2535 #else // !wxUSE_FONTMAP
2536 wxString::Format(_("encoding %s"), m_encoding).c_str()
2537 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2538 );
2539 alreadyLoggingError = false;
2540 }
2541
2542 return NULL;
2543 }
2544
2545 void wxCSConv::CreateConvIfNeeded() const
2546 {
2547 if ( m_deferred )
2548 {
2549 wxCSConv *self = (wxCSConv *)this; // const_cast
2550
2551 #if wxUSE_INTL
2552 // if we don't have neither the name nor the encoding, use the default
2553 // encoding for this system
2554 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2555 {
2556 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2557 }
2558 #endif // wxUSE_INTL
2559
2560 self->m_convReal = DoCreate();
2561 self->m_deferred = false;
2562 }
2563 }
2564
2565 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2566 {
2567 CreateConvIfNeeded();
2568
2569 if (m_convReal)
2570 return m_convReal->MB2WC(buf, psz, n);
2571
2572 // latin-1 (direct)
2573 size_t len = strlen(psz);
2574
2575 if (buf)
2576 {
2577 for (size_t c = 0; c <= len; c++)
2578 buf[c] = (unsigned char)(psz[c]);
2579 }
2580
2581 return len;
2582 }
2583
2584 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2585 {
2586 CreateConvIfNeeded();
2587
2588 if (m_convReal)
2589 return m_convReal->WC2MB(buf, psz, n);
2590
2591 // latin-1 (direct)
2592 const size_t len = wxWcslen(psz);
2593 if (buf)
2594 {
2595 for (size_t c = 0; c <= len; c++)
2596 {
2597 if (psz[c] > 0xFF)
2598 return (size_t)-1;
2599 buf[c] = (char)psz[c];
2600 }
2601 }
2602 else
2603 {
2604 for (size_t c = 0; c <= len; c++)
2605 {
2606 if (psz[c] > 0xFF)
2607 return (size_t)-1;
2608 }
2609 }
2610
2611 return len;
2612 }
2613
2614 // ----------------------------------------------------------------------------
2615 // globals
2616 // ----------------------------------------------------------------------------
2617
2618 #ifdef __WINDOWS__
2619 static wxMBConv_win32 wxConvLibcObj;
2620 #elif defined(__WXMAC__) && !defined(__MACH__)
2621 static wxMBConv_mac wxConvLibcObj ;
2622 #else
2623 static wxMBConvLibc wxConvLibcObj;
2624 #endif
2625
2626 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2627 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2628 static wxMBConvUTF7 wxConvUTF7Obj;
2629 static wxMBConvUTF8 wxConvUTF8Obj;
2630
2631 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2632 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2633 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2634 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2635 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2636 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2637 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2638 #ifdef __WXOSX__
2639 wxConvUTF8Obj;
2640 #else
2641 wxConvLibcObj;
2642 #endif
2643
2644
2645 #else // !wxUSE_WCHAR_T
2646
2647 // stand-ins in absence of wchar_t
2648 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2649 wxConvISO8859_1,
2650 wxConvLocal,
2651 wxConvUTF8;
2652
2653 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2654
2655