]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
e88ccba1842da4afa4e7e44caf49ab2fafda2a71
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WINDOWS__
44 #include "wx/msw/private.h"
45 #include "wx/msw/missing.h"
46 #endif
47
48 #ifndef __WXWINCE__
49 #include <errno.h>
50 #endif
51
52 #include <ctype.h>
53 #include <string.h>
54 #include <stdlib.h>
55
56 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
57 #define wxHAVE_WIN32_MB2WC
58 #endif // __WIN32__ but !__WXMICROWIN__
59
60 // ----------------------------------------------------------------------------
61 // headers
62 // ----------------------------------------------------------------------------
63
64 #ifdef __SALFORDC__
65 #include <clib.h>
66 #endif
67
68 #ifdef HAVE_ICONV
69 #include <iconv.h>
70 #include "wx/thread.h"
71 #endif
72
73 #include "wx/encconv.h"
74 #include "wx/fontmap.h"
75 #include "wx/utils.h"
76
77 #ifdef __WXMAC__
78 #ifndef __DARWIN__
79 #include <ATSUnicode.h>
80 #include <TextCommon.h>
81 #include <TextEncodingConverter.h>
82 #endif
83
84 #include "wx/mac/private.h" // includes mac headers
85 #endif
86 // ----------------------------------------------------------------------------
87 // macros
88 // ----------------------------------------------------------------------------
89
// Apply wxUINT32_SWAP_ALWAYS, in place, to each of the first `len` elements
// of `str`.
//
// NB: this expands to a bare compound statement and evaluates its arguments
//     several times, so pass only simple, side-effect free expressions.
#define BSWAP_UCS4(str, len) \
    { unsigned _c; for (_c=0; _c<len; _c++) \
        str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }

// Same as BSWAP_UCS4 but applies wxUINT16_SWAP_ALWAYS to each element.
#define BSWAP_UTF16(str, len) \
    { unsigned _c; for (_c=0; _c<len; _c++) \
        str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
92
93 #if SIZEOF_WCHAR_T == 4
94 #define WC_NAME "UCS4"
95 #define WC_BSWAP BSWAP_UCS4
96 #ifdef WORDS_BIGENDIAN
97 #define WC_NAME_BEST "UCS-4BE"
98 #else
99 #define WC_NAME_BEST "UCS-4LE"
100 #endif
101 #elif SIZEOF_WCHAR_T == 2
102 #define WC_NAME "UTF16"
103 #define WC_BSWAP BSWAP_UTF16
104 #define WC_UTF16
105 #ifdef WORDS_BIGENDIAN
106 #define WC_NAME_BEST "UTF-16BE"
107 #else
108 #define WC_NAME_BEST "UTF-16LE"
109 #endif
110 #else // sizeof(wchar_t) != 2 nor 4
111 // does this ever happen?
112 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
113 #endif
114
115 // ============================================================================
116 // implementation
117 // ============================================================================
118
119 // ----------------------------------------------------------------------------
120 // UTF-16 en/decoding to/from UCS-4
121 // ----------------------------------------------------------------------------
122
123
124 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
125 {
126 if (input<=0xffff)
127 {
128 if (output)
129 *output = (wxUint16) input;
130 return 1;
131 }
132 else if (input>=0x110000)
133 {
134 return (size_t)-1;
135 }
136 else
137 {
138 if (output)
139 {
140 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
141 *output = (wxUint16) ((input&0x3ff)+0xdc00);
142 }
143 return 2;
144 }
145 }
146
147 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
148 {
149 if ((*input<0xd800) || (*input>0xdfff))
150 {
151 output = *input;
152 return 1;
153 }
154 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
155 {
156 output = *input;
157 return (size_t)-1;
158 }
159 else
160 {
161 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
162 return 2;
163 }
164 }
165
166
167 // ----------------------------------------------------------------------------
168 // wxMBConv
169 // ----------------------------------------------------------------------------
170
171 wxMBConv::~wxMBConv()
172 {
173 // nothing to do here (necessary for Darwin linking probably)
174 }
175
176 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
177 {
178 if ( psz )
179 {
180 // calculate the length of the buffer needed first
181 size_t nLen = MB2WC(NULL, psz, 0);
182 if ( nLen != (size_t)-1 )
183 {
184 // now do the actual conversion
185 wxWCharBuffer buf(nLen);
186 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
187 if ( nLen != (size_t)-1 )
188 {
189 return buf;
190 }
191 }
192 }
193
194 wxWCharBuffer buf((wchar_t *)NULL);
195
196 return buf;
197 }
198
199 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
200 {
201 if ( pwz )
202 {
203 size_t nLen = WC2MB(NULL, pwz, 0);
204 if ( nLen != (size_t)-1 )
205 {
206 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
207 nLen = WC2MB(buf.data(), pwz, nLen + 4);
208 if ( nLen != (size_t)-1 )
209 {
210 return buf;
211 }
212 }
213 }
214
215 wxCharBuffer buf((char *)NULL);
216
217 return buf;
218 }
219
220 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
221 {
222 wxASSERT(pOutSize != NULL);
223
224 const char* szEnd = szString + nStringLen + 1;
225 const char* szPos = szString;
226 const char* szStart = szPos;
227
228 size_t nActualLength = 0;
229 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
230
231 wxWCharBuffer theBuffer(nCurrentSize);
232
233 //Convert the string until the length() is reached, continuing the
234 //loop every time a null character is reached
235 while(szPos != szEnd)
236 {
237 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
238
239 //Get the length of the current (sub)string
240 size_t nLen = MB2WC(NULL, szPos, 0);
241
242 //Invalid conversion?
243 if( nLen == (size_t)-1 )
244 {
245 *pOutSize = 0;
246 theBuffer.data()[0u] = wxT('\0');
247 return theBuffer;
248 }
249
250
251 //Increase the actual length (+1 for current null character)
252 nActualLength += nLen + 1;
253
254 //if buffer too big, realloc the buffer
255 if (nActualLength > (nCurrentSize+1))
256 {
257 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
258 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
259 theBuffer = theNewBuffer;
260 nCurrentSize <<= 1;
261 }
262
263 //Convert the current (sub)string
264 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
265 {
266 *pOutSize = 0;
267 theBuffer.data()[0u] = wxT('\0');
268 return theBuffer;
269 }
270
271 //Increment to next (sub)string
272 //Note that we have to use strlen instead of nLen here
273 //because XX2XX gives us the size of the output buffer,
274 //which is not necessarily the length of the string
275 szPos += strlen(szPos) + 1;
276 }
277
278 //success - return actual length and the buffer
279 *pOutSize = nActualLength;
280 return theBuffer;
281 }
282
283 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
284 {
285 wxASSERT(pOutSize != NULL);
286
287 const wchar_t* szEnd = szString + nStringLen + 1;
288 const wchar_t* szPos = szString;
289 const wchar_t* szStart = szPos;
290
291 size_t nActualLength = 0;
292 size_t nCurrentSize = nStringLen << 2; //try * 4 first
293
294 wxCharBuffer theBuffer(nCurrentSize);
295
296 //Convert the string until the length() is reached, continuing the
297 //loop every time a null character is reached
298 while(szPos != szEnd)
299 {
300 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
301
302 //Get the length of the current (sub)string
303 size_t nLen = WC2MB(NULL, szPos, 0);
304
305 //Invalid conversion?
306 if( nLen == (size_t)-1 )
307 {
308 *pOutSize = 0;
309 theBuffer.data()[0u] = wxT('\0');
310 return theBuffer;
311 }
312
313 //Increase the actual length (+1 for current null character)
314 nActualLength += nLen + 1;
315
316 //if buffer too big, realloc the buffer
317 if (nActualLength > (nCurrentSize+1))
318 {
319 wxCharBuffer theNewBuffer(nCurrentSize << 1);
320 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
321 theBuffer = theNewBuffer;
322 nCurrentSize <<= 1;
323 }
324
325 //Convert the current (sub)string
326 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
327 {
328 *pOutSize = 0;
329 theBuffer.data()[0u] = wxT('\0');
330 return theBuffer;
331 }
332
333 //Increment to next (sub)string
334 //Note that we have to use wxWcslen instead of nLen here
335 //because XX2XX gives us the size of the output buffer,
336 //which is not necessarily the length of the string
337 szPos += wxWcslen(szPos) + 1;
338 }
339
340 //success - return actual length and the buffer
341 *pOutSize = nActualLength;
342 return theBuffer;
343 }
344
345 // ----------------------------------------------------------------------------
346 // wxMBConvLibc
347 // ----------------------------------------------------------------------------
348
349 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
350 {
351 return wxMB2WC(buf, psz, n);
352 }
353
354 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
355 {
356 return wxWC2MB(buf, psz, n);
357 }
358
359 #ifdef __UNIX__
360
361 // ----------------------------------------------------------------------------
362 // wxConvBrokenFileNames
363 // ----------------------------------------------------------------------------
364
365 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
366 {
367 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
368 || wxStricmp(charset, _T("UTF8")) == 0 )
369 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
370 else
371 m_conv = new wxCSConv(charset);
372 }
373
374 size_t
375 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
376 const char *psz,
377 size_t outputSize) const
378 {
379 return m_conv->MB2WC( outputBuf, psz, outputSize );
380 }
381
382 size_t
383 wxConvBrokenFileNames::WC2MB(char *outputBuf,
384 const wchar_t *psz,
385 size_t outputSize) const
386 {
387 return m_conv->WC2MB( outputBuf, psz, outputSize );
388 }
389
390 #endif
391
392 // ----------------------------------------------------------------------------
393 // UTF-7
394 // ----------------------------------------------------------------------------
395
396 // Implementation (C) 2004 Fredrik Roubert
397
//
// BASE64 decoding table: indexed by the byte value, gives the 6 bit value of
// the corresponding BASE64 character or 0xff if the byte is not one of them
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
436
437 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
438 {
439 size_t len = 0;
440
441 while (*psz && ((!buf) || (len < n)))
442 {
443 unsigned char cc = *psz++;
444 if (cc != '+')
445 {
446 // plain ASCII char
447 if (buf)
448 *buf++ = cc;
449 len++;
450 }
451 else if (*psz == '-')
452 {
453 // encoded plus sign
454 if (buf)
455 *buf++ = cc;
456 len++;
457 psz++;
458 }
459 else
460 {
461 // BASE64 encoded string
462 bool lsb;
463 unsigned char c;
464 unsigned int d, l;
465 for (lsb = false, d = 0, l = 0;
466 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
467 {
468 d <<= 6;
469 d += cc;
470 for (l += 6; l >= 8; lsb = !lsb)
471 {
472 c = (unsigned char)((d >> (l -= 8)) % 256);
473 if (lsb)
474 {
475 if (buf)
476 *buf++ |= c;
477 len ++;
478 }
479 else
480 if (buf)
481 *buf = (wchar_t)(c << 8);
482 }
483 }
484 if (*psz == '-')
485 psz++;
486 }
487 }
488 if (buf && (len < n))
489 *buf = 0;
490 return len;
491 }
492
//
// BASE64 encoding table: maps a 6 bit value to the character encoding it
//
static const unsigned char utf7enb64[] =
{
    // 0x00 - 0x0f
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 0x10 - 0x1f
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 0x20 - 0x2f
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 0x30 - 0x3f
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
507
//
// UTF-7 encoding table: class of each of the 128 ASCII characters
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00 - 0x07
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10 - 0x17
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20 - 0x27
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30 - 0x37
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40 - 0x47
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50 - 0x57
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60 - 0x67
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70 - 0x77
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78 - 0x7f
};
527
528 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
529 {
530
531
532 size_t len = 0;
533
534 while (*psz && ((!buf) || (len < n)))
535 {
536 wchar_t cc = *psz++;
537 if (cc < 0x80 && utf7encode[cc] < 1)
538 {
539 // plain ASCII char
540 if (buf)
541 *buf++ = (char)cc;
542 len++;
543 }
544 #ifndef WC_UTF16
545 else if (((wxUint32)cc) > 0xffff)
546 {
547 // no surrogate pair generation (yet?)
548 return (size_t)-1;
549 }
550 #endif
551 else
552 {
553 if (buf)
554 *buf++ = '+';
555 len++;
556 if (cc != '+')
557 {
558 // BASE64 encode string
559 unsigned int lsb, d, l;
560 for (d = 0, l = 0;; psz++)
561 {
562 for (lsb = 0; lsb < 2; lsb ++)
563 {
564 d <<= 8;
565 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
566
567 for (l += 8; l >= 6; )
568 {
569 l -= 6;
570 if (buf)
571 *buf++ = utf7enb64[(d >> l) % 64];
572 len++;
573 }
574 }
575 cc = *psz;
576 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
577 break;
578 }
579 if (l != 0)
580 {
581 if (buf)
582 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
583 len++;
584 }
585 }
586 if (buf)
587 *buf++ = '-';
588 len++;
589 }
590 }
591 if (buf && (len < n))
592 *buf = 0;
593 return len;
594 }
595
596 // ----------------------------------------------------------------------------
597 // UTF-8
598 // ----------------------------------------------------------------------------
599
600 static wxUint32 utf8_max[]=
601 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
602
603 // boundaries of the private use area we use to (temporarily) remap invalid
604 // characters invalid in a UTF-8 encoded string
605 const wxUint32 wxUnicodePUA = 0x100000;
606 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
607
608 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
609 {
610 size_t len = 0;
611
612 while (*psz && ((!buf) || (len < n)))
613 {
614 const char *opsz = psz;
615 bool invalid = false;
616 unsigned char cc = *psz++, fc = cc;
617 unsigned cnt;
618 for (cnt = 0; fc & 0x80; cnt++)
619 fc <<= 1;
620 if (!cnt)
621 {
622 // plain ASCII char
623 if (buf)
624 *buf++ = cc;
625 len++;
626
627 // escape the escape character for octal escapes
628 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
629 && cc == '\\' && (!buf || len < n))
630 {
631 if (buf)
632 *buf++ = cc;
633 len++;
634 }
635 }
636 else
637 {
638 cnt--;
639 if (!cnt)
640 {
641 // invalid UTF-8 sequence
642 invalid = true;
643 }
644 else
645 {
646 unsigned ocnt = cnt - 1;
647 wxUint32 res = cc & (0x3f >> cnt);
648 while (cnt--)
649 {
650 cc = *psz;
651 if ((cc & 0xC0) != 0x80)
652 {
653 // invalid UTF-8 sequence
654 invalid = true;
655 break;
656 }
657 psz++;
658 res = (res << 6) | (cc & 0x3f);
659 }
660 if (invalid || res <= utf8_max[ocnt])
661 {
662 // illegal UTF-8 encoding
663 invalid = true;
664 }
665 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
666 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
667 {
668 // if one of our PUA characters turns up externally
669 // it must also be treated as an illegal sequence
670 // (a bit like you have to escape an escape character)
671 invalid = true;
672 }
673 else
674 {
675 #ifdef WC_UTF16
676 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
677 size_t pa = encode_utf16(res, (wxUint16 *)buf);
678 if (pa == (size_t)-1)
679 {
680 invalid = true;
681 }
682 else
683 {
684 if (buf)
685 buf += pa;
686 len += pa;
687 }
688 #else // !WC_UTF16
689 if (buf)
690 *buf++ = res;
691 len++;
692 #endif // WC_UTF16/!WC_UTF16
693 }
694 }
695 if (invalid)
696 {
697 if (m_options & MAP_INVALID_UTF8_TO_PUA)
698 {
699 while (opsz < psz && (!buf || len < n))
700 {
701 #ifdef WC_UTF16
702 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
703 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
704 wxASSERT(pa != (size_t)-1);
705 if (buf)
706 buf += pa;
707 opsz++;
708 len += pa;
709 #else
710 if (buf)
711 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
712 opsz++;
713 len++;
714 #endif
715 }
716 }
717 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
718 {
719 while (opsz < psz && (!buf || len < n))
720 {
721 if ( buf && len + 3 < n )
722 {
723 unsigned char n = *opsz;
724 *buf++ = L'\\';
725 *buf++ = (wchar_t)( L'0' + n / 0100 );
726 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
727 *buf++ = (wchar_t)( L'0' + n % 010 );
728 }
729 opsz++;
730 len += 4;
731 }
732 }
733 else // MAP_INVALID_UTF8_NOT
734 {
735 return (size_t)-1;
736 }
737 }
738 }
739 }
740 if (buf && (len < n))
741 *buf = 0;
742 return len;
743 }
744
// Return true if wch is one of the octal digits L'0' .. L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
749
750 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
751 {
752 size_t len = 0;
753
754 while (*psz && ((!buf) || (len < n)))
755 {
756 wxUint32 cc;
757 #ifdef WC_UTF16
758 // cast is ok for WC_UTF16
759 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
760 psz += (pa == (size_t)-1) ? 1 : pa;
761 #else
762 cc=(*psz++) & 0x7fffffff;
763 #endif
764
765 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
766 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
767 {
768 if (buf)
769 *buf++ = (char)(cc - wxUnicodePUA);
770 len++;
771 }
772 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
773 && cc == L'\\' && psz[0] == L'\\' )
774 {
775 if (buf)
776 *buf++ = (char)cc;
777 psz++;
778 len++;
779 }
780 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
781 cc == L'\\' &&
782 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
783 {
784 if (buf)
785 {
786 *buf++ = (char) ((psz[0] - L'0')*0100 +
787 (psz[1] - L'0')*010 +
788 (psz[2] - L'0'));
789 }
790
791 psz += 3;
792 len++;
793 }
794 else
795 {
796 unsigned cnt;
797 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
798 if (!cnt)
799 {
800 // plain ASCII char
801 if (buf)
802 *buf++ = (char) cc;
803 len++;
804 }
805
806 else
807 {
808 len += cnt + 1;
809 if (buf)
810 {
811 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
812 while (cnt--)
813 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
814 }
815 }
816 }
817 }
818
819 if (buf && (len<n))
820 *buf = 0;
821
822 return len;
823 }
824
825 // ----------------------------------------------------------------------------
826 // UTF-16
827 // ----------------------------------------------------------------------------
828
829 #ifdef WORDS_BIGENDIAN
830 #define wxMBConvUTF16straight wxMBConvUTF16BE
831 #define wxMBConvUTF16swap wxMBConvUTF16LE
832 #else
833 #define wxMBConvUTF16swap wxMBConvUTF16BE
834 #define wxMBConvUTF16straight wxMBConvUTF16LE
835 #endif
836
837
838 #ifdef WC_UTF16
839
840 // copy 16bit MB to 16bit String
841 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
842 {
843 size_t len=0;
844
845 while (*(wxUint16*)psz && (!buf || len < n))
846 {
847 if (buf)
848 *buf++ = *(wxUint16*)psz;
849 len++;
850
851 psz += sizeof(wxUint16);
852 }
853 if (buf && len<n) *buf=0;
854
855 return len;
856 }
857
858
859 // copy 16bit String to 16bit MB
860 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
861 {
862 size_t len=0;
863
864 while (*psz && (!buf || len < n))
865 {
866 if (buf)
867 {
868 *(wxUint16*)buf = *psz;
869 buf += sizeof(wxUint16);
870 }
871 len += sizeof(wxUint16);
872 psz++;
873 }
874 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
875
876 return len;
877 }
878
879
880 // swap 16bit MB to 16bit String
881 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
882 {
883 size_t len=0;
884
885 while (*(wxUint16*)psz && (!buf || len < n))
886 {
887 if (buf)
888 {
889 ((char *)buf)[0] = psz[1];
890 ((char *)buf)[1] = psz[0];
891 buf++;
892 }
893 len++;
894 psz += sizeof(wxUint16);
895 }
896 if (buf && len<n) *buf=0;
897
898 return len;
899 }
900
901
902 // swap 16bit MB to 16bit String
903 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
904 {
905 size_t len=0;
906
907 while (*psz && (!buf || len < n))
908 {
909 if (buf)
910 {
911 *buf++ = ((char*)psz)[1];
912 *buf++ = ((char*)psz)[0];
913 }
914 len += sizeof(wxUint16);
915 psz++;
916 }
917 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
918
919 return len;
920 }
921
922
923 #else // WC_UTF16
924
925
926 // copy 16bit MB to 32bit String
927 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
928 {
929 size_t len=0;
930
931 while (*(wxUint16*)psz && (!buf || len < n))
932 {
933 wxUint32 cc;
934 size_t pa=decode_utf16((wxUint16*)psz, cc);
935 if (pa == (size_t)-1)
936 return pa;
937
938 if (buf)
939 *buf++ = cc;
940 len++;
941 psz += pa * sizeof(wxUint16);
942 }
943 if (buf && len<n) *buf=0;
944
945 return len;
946 }
947
948
949 // copy 32bit String to 16bit MB
950 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
951 {
952 size_t len=0;
953
954 while (*psz && (!buf || len < n))
955 {
956 wxUint16 cc[2];
957 size_t pa=encode_utf16(*psz, cc);
958
959 if (pa == (size_t)-1)
960 return pa;
961
962 if (buf)
963 {
964 *(wxUint16*)buf = cc[0];
965 buf += sizeof(wxUint16);
966 if (pa > 1)
967 {
968 *(wxUint16*)buf = cc[1];
969 buf += sizeof(wxUint16);
970 }
971 }
972
973 len += pa*sizeof(wxUint16);
974 psz++;
975 }
976 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
977
978 return len;
979 }
980
981
982 // swap 16bit MB to 32bit String
983 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
984 {
985 size_t len=0;
986
987 while (*(wxUint16*)psz && (!buf || len < n))
988 {
989 wxUint32 cc;
990 char tmp[4];
991 tmp[0]=psz[1]; tmp[1]=psz[0];
992 tmp[2]=psz[3]; tmp[3]=psz[2];
993
994 size_t pa=decode_utf16((wxUint16*)tmp, cc);
995 if (pa == (size_t)-1)
996 return pa;
997
998 if (buf)
999 *buf++ = cc;
1000
1001 len++;
1002 psz += pa * sizeof(wxUint16);
1003 }
1004 if (buf && len<n) *buf=0;
1005
1006 return len;
1007 }
1008
1009
1010 // swap 32bit String to 16bit MB
1011 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1012 {
1013 size_t len=0;
1014
1015 while (*psz && (!buf || len < n))
1016 {
1017 wxUint16 cc[2];
1018 size_t pa=encode_utf16(*psz, cc);
1019
1020 if (pa == (size_t)-1)
1021 return pa;
1022
1023 if (buf)
1024 {
1025 *buf++ = ((char*)cc)[1];
1026 *buf++ = ((char*)cc)[0];
1027 if (pa > 1)
1028 {
1029 *buf++ = ((char*)cc)[3];
1030 *buf++ = ((char*)cc)[2];
1031 }
1032 }
1033
1034 len += pa*sizeof(wxUint16);
1035 psz++;
1036 }
1037 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1038
1039 return len;
1040 }
1041
1042 #endif // WC_UTF16
1043
1044
1045 // ----------------------------------------------------------------------------
1046 // UTF-32
1047 // ----------------------------------------------------------------------------
1048
1049 #ifdef WORDS_BIGENDIAN
1050 #define wxMBConvUTF32straight wxMBConvUTF32BE
1051 #define wxMBConvUTF32swap wxMBConvUTF32LE
1052 #else
1053 #define wxMBConvUTF32swap wxMBConvUTF32BE
1054 #define wxMBConvUTF32straight wxMBConvUTF32LE
1055 #endif
1056
1057
1058 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1059 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1060
1061
1062 #ifdef WC_UTF16
1063
1064 // copy 32bit MB to 16bit String
1065 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1066 {
1067 size_t len=0;
1068
1069 while (*(wxUint32*)psz && (!buf || len < n))
1070 {
1071 wxUint16 cc[2];
1072
1073 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1074 if (pa == (size_t)-1)
1075 return pa;
1076
1077 if (buf)
1078 {
1079 *buf++ = cc[0];
1080 if (pa > 1)
1081 *buf++ = cc[1];
1082 }
1083 len += pa;
1084 psz += sizeof(wxUint32);
1085 }
1086 if (buf && len<n) *buf=0;
1087
1088 return len;
1089 }
1090
1091
1092 // copy 16bit String to 32bit MB
1093 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1094 {
1095 size_t len=0;
1096
1097 while (*psz && (!buf || len < n))
1098 {
1099 wxUint32 cc;
1100
1101 // cast is ok for WC_UTF16
1102 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1103 if (pa == (size_t)-1)
1104 return pa;
1105
1106 if (buf)
1107 {
1108 *(wxUint32*)buf = cc;
1109 buf += sizeof(wxUint32);
1110 }
1111 len += sizeof(wxUint32);
1112 psz += pa;
1113 }
1114
1115 if (buf && len<=n-sizeof(wxUint32))
1116 *(wxUint32*)buf=0;
1117
1118 return len;
1119 }
1120
1121
1122
1123 // swap 32bit MB to 16bit String
1124 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1125 {
1126 size_t len=0;
1127
1128 while (*(wxUint32*)psz && (!buf || len < n))
1129 {
1130 char tmp[4];
1131 tmp[0] = psz[3]; tmp[1] = psz[2];
1132 tmp[2] = psz[1]; tmp[3] = psz[0];
1133
1134
1135 wxUint16 cc[2];
1136
1137 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1138 if (pa == (size_t)-1)
1139 return pa;
1140
1141 if (buf)
1142 {
1143 *buf++ = cc[0];
1144 if (pa > 1)
1145 *buf++ = cc[1];
1146 }
1147 len += pa;
1148 psz += sizeof(wxUint32);
1149 }
1150
1151 if (buf && len<n)
1152 *buf=0;
1153
1154 return len;
1155 }
1156
1157
1158 // swap 16bit String to 32bit MB
1159 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1160 {
1161 size_t len=0;
1162
1163 while (*psz && (!buf || len < n))
1164 {
1165 char cc[4];
1166
1167 // cast is ok for WC_UTF16
1168 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1169 if (pa == (size_t)-1)
1170 return pa;
1171
1172 if (buf)
1173 {
1174 *buf++ = cc[3];
1175 *buf++ = cc[2];
1176 *buf++ = cc[1];
1177 *buf++ = cc[0];
1178 }
1179 len += sizeof(wxUint32);
1180 psz += pa;
1181 }
1182
1183 if (buf && len<=n-sizeof(wxUint32))
1184 *(wxUint32*)buf=0;
1185
1186 return len;
1187 }
1188
1189 #else // WC_UTF16
1190
1191
1192 // copy 32bit MB to 32bit String
1193 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1194 {
1195 size_t len=0;
1196
1197 while (*(wxUint32*)psz && (!buf || len < n))
1198 {
1199 if (buf)
1200 *buf++ = *(wxUint32*)psz;
1201 len++;
1202 psz += sizeof(wxUint32);
1203 }
1204
1205 if (buf && len<n)
1206 *buf=0;
1207
1208 return len;
1209 }
1210
1211
1212 // copy 32bit String to 32bit MB
1213 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1214 {
1215 size_t len=0;
1216
1217 while (*psz && (!buf || len < n))
1218 {
1219 if (buf)
1220 {
1221 *(wxUint32*)buf = *psz;
1222 buf += sizeof(wxUint32);
1223 }
1224
1225 len += sizeof(wxUint32);
1226 psz++;
1227 }
1228
1229 if (buf && len<=n-sizeof(wxUint32))
1230 *(wxUint32*)buf=0;
1231
1232 return len;
1233 }
1234
1235
1236 // swap 32bit MB to 32bit String
1237 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1238 {
1239 size_t len=0;
1240
1241 while (*(wxUint32*)psz && (!buf || len < n))
1242 {
1243 if (buf)
1244 {
1245 ((char *)buf)[0] = psz[3];
1246 ((char *)buf)[1] = psz[2];
1247 ((char *)buf)[2] = psz[1];
1248 ((char *)buf)[3] = psz[0];
1249 buf++;
1250 }
1251 len++;
1252 psz += sizeof(wxUint32);
1253 }
1254
1255 if (buf && len<n)
1256 *buf=0;
1257
1258 return len;
1259 }
1260
1261
1262 // swap 32bit String to 32bit MB
1263 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1264 {
1265 size_t len=0;
1266
1267 while (*psz && (!buf || len < n))
1268 {
1269 if (buf)
1270 {
1271 *buf++ = ((char *)psz)[3];
1272 *buf++ = ((char *)psz)[2];
1273 *buf++ = ((char *)psz)[1];
1274 *buf++ = ((char *)psz)[0];
1275 }
1276 len += sizeof(wxUint32);
1277 psz++;
1278 }
1279
1280 if (buf && len<=n-sizeof(wxUint32))
1281 *(wxUint32*)buf=0;
1282
1283 return len;
1284 }
1285
1286
1287 #endif // WC_UTF16
1288
1289
1290 // ============================================================================
1291 // The classes doing conversion using the iconv_xxx() functions
1292 // ============================================================================
1293
1294 #ifdef HAVE_ICONV
1295
1296 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1297 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1298 // (unless there's yet another bug in glibc) the only case when iconv()
1299 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1300 // left in the input buffer -- when _real_ error occurs,
1301 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1302 // iconv() failure.
1303 // [This bug does not appear in glibc 2.2.]
1304 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1305 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1306 (errno != E2BIG || bufLeft != 0))
1307 #else
1308 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1309 #endif
1310
1311 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1312
1313 // ----------------------------------------------------------------------------
1314 // wxMBConv_iconv: encapsulates an iconv character set
1315 // ----------------------------------------------------------------------------
1316
1317 class wxMBConv_iconv : public wxMBConv
1318 {
1319 public:
1320 wxMBConv_iconv(const wxChar *name);
1321 virtual ~wxMBConv_iconv();
1322
1323 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1324 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1325
1326 bool IsOk() const
1327 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1328
1329 protected:
1330 // the iconv handlers used to translate from multibyte to wide char and in
1331 // the other direction
1332 iconv_t m2w,
1333 w2m;
1334 #if wxUSE_THREADS
1335 // guards access to m2w and w2m objects
1336 wxMutex m_iconvMutex;
1337 #endif
1338
1339 private:
1340 // the name (for iconv_open()) of a wide char charset -- if none is
1341 // available on this machine, it will remain NULL
1342 static const char *ms_wcCharsetName;
1343
1344 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1345 // different endian-ness than the native one
1346 static bool ms_wcNeedsSwap;
1347 };
1348
1349 // make the constructor available for unit testing
1350 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1351 {
1352 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1353 if ( !result->IsOk() )
1354 {
1355 delete result;
1356 return 0;
1357 }
1358 return result;
1359 }
1360
1361 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1362 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1363
1364 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1365 {
1366 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1367 // names for the charsets
1368 const wxCharBuffer cname(wxString::ToAscii(name));
1369
1370 // check for charset that represents wchar_t:
1371 if (ms_wcCharsetName == NULL)
1372 {
1373 ms_wcNeedsSwap = false;
1374
1375 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1376 ms_wcCharsetName = WC_NAME_BEST;
1377 m2w = iconv_open(ms_wcCharsetName, cname);
1378
1379 if (m2w == (iconv_t)-1)
1380 {
1381 // try charset w/o bytesex info (e.g. "UCS4")
1382 // and check for bytesex ourselves:
1383 ms_wcCharsetName = WC_NAME;
1384 m2w = iconv_open(ms_wcCharsetName, cname);
1385
1386 // last bet, try if it knows WCHAR_T pseudo-charset
1387 if (m2w == (iconv_t)-1)
1388 {
1389 ms_wcCharsetName = "WCHAR_T";
1390 m2w = iconv_open(ms_wcCharsetName, cname);
1391 }
1392
1393 if (m2w != (iconv_t)-1)
1394 {
1395 char buf[2], *bufPtr;
1396 wchar_t wbuf[2], *wbufPtr;
1397 size_t insz, outsz;
1398 size_t res;
1399
1400 buf[0] = 'A';
1401 buf[1] = 0;
1402 wbuf[0] = 0;
1403 insz = 2;
1404 outsz = SIZEOF_WCHAR_T * 2;
1405 wbufPtr = wbuf;
1406 bufPtr = buf;
1407
1408 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1409 (char**)&wbufPtr, &outsz);
1410
1411 if (ICONV_FAILED(res, insz))
1412 {
1413 ms_wcCharsetName = NULL;
1414 wxLogLastError(wxT("iconv"));
1415 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1416 }
1417 else
1418 {
1419 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1420 }
1421 }
1422 else
1423 {
1424 ms_wcCharsetName = NULL;
1425
1426 // VS: we must not output an error here, since wxWidgets will safely
1427 // fall back to using wxEncodingConverter.
1428 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1429 //wxLogError(
1430 }
1431 }
1432 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1433 }
1434 else // we already have ms_wcCharsetName
1435 {
1436 m2w = iconv_open(ms_wcCharsetName, cname);
1437 }
1438
1439 // NB: don't ever pass NULL to iconv_open(), it may crash!
1440 if ( ms_wcCharsetName )
1441 {
1442 w2m = iconv_open( cname, ms_wcCharsetName);
1443 }
1444 else
1445 {
1446 w2m = (iconv_t)-1;
1447 }
1448 }
1449
1450 wxMBConv_iconv::~wxMBConv_iconv()
1451 {
1452 if ( m2w != (iconv_t)-1 )
1453 iconv_close(m2w);
1454 if ( w2m != (iconv_t)-1 )
1455 iconv_close(w2m);
1456 }
1457
1458 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1459 {
1460 #if wxUSE_THREADS
1461 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1462 // Unfortunately there is a couple of global wxCSConv objects such as
1463 // wxConvLocal that are used all over wx code, so we have to make sure
1464 // the handle is used by at most one thread at the time. Otherwise
1465 // only a few wx classes would be safe to use from non-main threads
1466 // as MB<->WC conversion would fail "randomly".
1467 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1468 #endif
1469
1470 size_t inbuf = strlen(psz);
1471 size_t outbuf = n * SIZEOF_WCHAR_T;
1472 size_t res, cres;
1473 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1474 wchar_t *bufPtr = buf;
1475 const char *pszPtr = psz;
1476
1477 if (buf)
1478 {
1479 // have destination buffer, convert there
1480 cres = iconv(m2w,
1481 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1482 (char**)&bufPtr, &outbuf);
1483 res = n - (outbuf / SIZEOF_WCHAR_T);
1484
1485 if (ms_wcNeedsSwap)
1486 {
1487 // convert to native endianness
1488 WC_BSWAP(buf /* _not_ bufPtr */, res)
1489 }
1490
1491 // NB: iconv was given only strlen(psz) characters on input, and so
1492 // it couldn't convert the trailing zero. Let's do it ourselves
1493 // if there's some room left for it in the output buffer.
1494 if (res < n)
1495 buf[res] = 0;
1496 }
1497 else
1498 {
1499 // no destination buffer... convert using temp buffer
1500 // to calculate destination buffer requirement
1501 wchar_t tbuf[8];
1502 res = 0;
1503 do {
1504 bufPtr = tbuf;
1505 outbuf = 8*SIZEOF_WCHAR_T;
1506
1507 cres = iconv(m2w,
1508 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1509 (char**)&bufPtr, &outbuf );
1510
1511 res += 8-(outbuf/SIZEOF_WCHAR_T);
1512 } while ((cres==(size_t)-1) && (errno==E2BIG));
1513 }
1514
1515 if (ICONV_FAILED(cres, inbuf))
1516 {
1517 //VS: it is ok if iconv fails, hence trace only
1518 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1519 return (size_t)-1;
1520 }
1521
1522 return res;
1523 }
1524
1525 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1526 {
1527 #if wxUSE_THREADS
1528 // NB: explained in MB2WC
1529 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1530 #endif
1531
1532 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1533 size_t outbuf = n;
1534 size_t res, cres;
1535
1536 wchar_t *tmpbuf = 0;
1537
1538 if (ms_wcNeedsSwap)
1539 {
1540 // need to copy to temp buffer to switch endianness
1541 // this absolutely doesn't rock!
1542 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1543 // could be in read-only memory, or be accessed in some other thread)
1544 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1545 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1546 WC_BSWAP(tmpbuf, inbuf)
1547 psz=tmpbuf;
1548 }
1549
1550 if (buf)
1551 {
1552 // have destination buffer, convert there
1553 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1554
1555 res = n-outbuf;
1556
1557 // NB: iconv was given only wcslen(psz) characters on input, and so
1558 // it couldn't convert the trailing zero. Let's do it ourselves
1559 // if there's some room left for it in the output buffer.
1560 if (res < n)
1561 buf[0] = 0;
1562 }
1563 else
1564 {
1565 // no destination buffer... convert using temp buffer
1566 // to calculate destination buffer requirement
1567 char tbuf[16];
1568 res = 0;
1569 do {
1570 buf = tbuf; outbuf = 16;
1571
1572 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1573
1574 res += 16 - outbuf;
1575 } while ((cres==(size_t)-1) && (errno==E2BIG));
1576 }
1577
1578 if (ms_wcNeedsSwap)
1579 {
1580 free(tmpbuf);
1581 }
1582
1583 if (ICONV_FAILED(cres, inbuf))
1584 {
1585 //VS: it is ok if iconv fails, hence trace only
1586 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1587 return (size_t)-1;
1588 }
1589
1590 return res;
1591 }
1592
1593 #endif // HAVE_ICONV
1594
1595
1596 // ============================================================================
1597 // Win32 conversion classes
1598 // ============================================================================
1599
1600 #ifdef wxHAVE_WIN32_MB2WC
1601
1602 // from utils.cpp
1603 #if wxUSE_FONTMAP
1604 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1605 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1606 #endif
1607
1608 class wxMBConv_win32 : public wxMBConv
1609 {
1610 public:
1611 wxMBConv_win32()
1612 {
1613 m_CodePage = CP_ACP;
1614 }
1615
1616 #if wxUSE_FONTMAP
1617 wxMBConv_win32(const wxChar* name)
1618 {
1619 m_CodePage = wxCharsetToCodepage(name);
1620 }
1621
1622 wxMBConv_win32(wxFontEncoding encoding)
1623 {
1624 m_CodePage = wxEncodingToCodepage(encoding);
1625 }
1626 #endif
1627
1628 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1629 {
1630 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1631 // the behaviour is not compatible with the Unix version (using iconv)
1632 // and break the library itself, e.g. wxTextInputStream::NextChar()
1633 // wouldn't work if reading an incomplete MB char didn't result in an
1634 // error
1635 //
1636 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1637 // an error (tested under Windows Server 2003) and apparently it is
1638 // done on purpose, i.e. the function accepts any input in this case
1639 // and although I'd prefer to return error on ill-formed output, our
1640 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1641 // explicitly ill-formed according to RFC 2152) neither so we don't
1642 // even have any fallback here...
1643 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1644
1645 const size_t len = ::MultiByteToWideChar
1646 (
1647 m_CodePage, // code page
1648 flags, // flags: fall on error
1649 psz, // input string
1650 -1, // its length (NUL-terminated)
1651 buf, // output string
1652 buf ? n : 0 // size of output buffer
1653 );
1654
1655 // note that it returns count of written chars for buf != NULL and size
1656 // of the needed buffer for buf == NULL so in either case the length of
1657 // the string (which never includes the terminating NUL) is one less
1658 return len ? len - 1 : (size_t)-1;
1659 }
1660
1661 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1662 {
1663 /*
1664 we have a problem here: by default, WideCharToMultiByte() may
1665 replace characters unrepresentable in the target code page with bad
1666 quality approximations such as turning "1/2" symbol (U+00BD) into
1667 "1" for the code pages which don't have it and we, obviously, want
1668 to avoid this at any price
1669
1670 the trouble is that this function does it _silently_, i.e. it won't
1671 even tell us whether it did or not... Win98/2000 and higher provide
1672 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1673 we have to resort to a round trip, i.e. check that converting back
1674 results in the same string -- this is, of course, expensive but
1675 otherwise we simply can't be sure to not garble the data.
1676 */
1677
1678 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1679 // it doesn't work with CJK encodings (which we test for rather roughly
1680 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1681 // supporting it
1682 BOOL usedDef wxDUMMY_INITIALIZE(false);
1683 BOOL *pUsedDef;
1684 int flags;
1685 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1686 {
1687 // it's our lucky day
1688 flags = WC_NO_BEST_FIT_CHARS;
1689 pUsedDef = &usedDef;
1690 }
1691 else // old system or unsupported encoding
1692 {
1693 flags = 0;
1694 pUsedDef = NULL;
1695 }
1696
1697 const size_t len = ::WideCharToMultiByte
1698 (
1699 m_CodePage, // code page
1700 flags, // either none or no best fit
1701 pwz, // input string
1702 -1, // it is (wide) NUL-terminated
1703 buf, // output buffer
1704 buf ? n : 0, // and its size
1705 NULL, // default "replacement" char
1706 pUsedDef // [out] was it used?
1707 );
1708
1709 if ( !len )
1710 {
1711 // function totally failed
1712 return (size_t)-1;
1713 }
1714
1715 // if we were really converting, check if we succeeded
1716 if ( buf )
1717 {
1718 if ( flags )
1719 {
1720 // check if the conversion failed, i.e. if any replacements
1721 // were done
1722 if ( usedDef )
1723 return (size_t)-1;
1724 }
1725 else // we must resort to double tripping...
1726 {
1727 wxWCharBuffer wcBuf(n);
1728 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1729 wcscmp(wcBuf, pwz) != 0 )
1730 {
1731 // we didn't obtain the same thing we started from, hence
1732 // the conversion was lossy and we consider that it failed
1733 return (size_t)-1;
1734 }
1735 }
1736 }
1737
1738 // see the comment above for the reason of "len - 1"
1739 return len - 1;
1740 }
1741
1742 bool IsOk() const { return m_CodePage != -1; }
1743
1744 private:
1745 static bool CanUseNoBestFit()
1746 {
1747 static int s_isWin98Or2k = -1;
1748
1749 if ( s_isWin98Or2k == -1 )
1750 {
1751 int verMaj, verMin;
1752 switch ( wxGetOsVersion(&verMaj, &verMin) )
1753 {
1754 case wxWIN95:
1755 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1756 break;
1757
1758 case wxWINDOWS_NT:
1759 s_isWin98Or2k = verMaj >= 5;
1760 break;
1761
1762 default:
1763 // unknown, be conseravtive by default
1764 s_isWin98Or2k = 0;
1765 }
1766
1767 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1768 }
1769
1770 return s_isWin98Or2k == 1;
1771 }
1772
1773 long m_CodePage;
1774 };
1775
1776 #endif // wxHAVE_WIN32_MB2WC
1777
1778 // ============================================================================
1779 // Cocoa conversion classes
1780 // ============================================================================
1781
1782 #if defined(__WXCOCOA__)
1783
1784 // RN: There is no UTF-32 support in either Core Foundation or
1785 // Cocoa. Strangely enough, Core Foundation uses UTF-32
1786 // internally quite a bit - it's just not public (yet).
1787
1788 #include <CoreFoundation/CFString.h>
1789 #include <CoreFoundation/CFStringEncodingExt.h>
1790
1791 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1792 {
1793 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1794 if ( encoding == wxFONTENCODING_DEFAULT )
1795 {
1796 enc = CFStringGetSystemEncoding();
1797 }
1798 else switch( encoding)
1799 {
1800 case wxFONTENCODING_ISO8859_1 :
1801 enc = kCFStringEncodingISOLatin1 ;
1802 break ;
1803 case wxFONTENCODING_ISO8859_2 :
1804 enc = kCFStringEncodingISOLatin2;
1805 break ;
1806 case wxFONTENCODING_ISO8859_3 :
1807 enc = kCFStringEncodingISOLatin3 ;
1808 break ;
1809 case wxFONTENCODING_ISO8859_4 :
1810 enc = kCFStringEncodingISOLatin4;
1811 break ;
1812 case wxFONTENCODING_ISO8859_5 :
1813 enc = kCFStringEncodingISOLatinCyrillic;
1814 break ;
1815 case wxFONTENCODING_ISO8859_6 :
1816 enc = kCFStringEncodingISOLatinArabic;
1817 break ;
1818 case wxFONTENCODING_ISO8859_7 :
1819 enc = kCFStringEncodingISOLatinGreek;
1820 break ;
1821 case wxFONTENCODING_ISO8859_8 :
1822 enc = kCFStringEncodingISOLatinHebrew;
1823 break ;
1824 case wxFONTENCODING_ISO8859_9 :
1825 enc = kCFStringEncodingISOLatin5;
1826 break ;
1827 case wxFONTENCODING_ISO8859_10 :
1828 enc = kCFStringEncodingISOLatin6;
1829 break ;
1830 case wxFONTENCODING_ISO8859_11 :
1831 enc = kCFStringEncodingISOLatinThai;
1832 break ;
1833 case wxFONTENCODING_ISO8859_13 :
1834 enc = kCFStringEncodingISOLatin7;
1835 break ;
1836 case wxFONTENCODING_ISO8859_14 :
1837 enc = kCFStringEncodingISOLatin8;
1838 break ;
1839 case wxFONTENCODING_ISO8859_15 :
1840 enc = kCFStringEncodingISOLatin9;
1841 break ;
1842
1843 case wxFONTENCODING_KOI8 :
1844 enc = kCFStringEncodingKOI8_R;
1845 break ;
1846 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1847 enc = kCFStringEncodingDOSRussian;
1848 break ;
1849
1850 // case wxFONTENCODING_BULGARIAN :
1851 // enc = ;
1852 // break ;
1853
1854 case wxFONTENCODING_CP437 :
1855 enc =kCFStringEncodingDOSLatinUS ;
1856 break ;
1857 case wxFONTENCODING_CP850 :
1858 enc = kCFStringEncodingDOSLatin1;
1859 break ;
1860 case wxFONTENCODING_CP852 :
1861 enc = kCFStringEncodingDOSLatin2;
1862 break ;
1863 case wxFONTENCODING_CP855 :
1864 enc = kCFStringEncodingDOSCyrillic;
1865 break ;
1866 case wxFONTENCODING_CP866 :
1867 enc =kCFStringEncodingDOSRussian ;
1868 break ;
1869 case wxFONTENCODING_CP874 :
1870 enc = kCFStringEncodingDOSThai;
1871 break ;
1872 case wxFONTENCODING_CP932 :
1873 enc = kCFStringEncodingDOSJapanese;
1874 break ;
1875 case wxFONTENCODING_CP936 :
1876 enc =kCFStringEncodingDOSChineseSimplif ;
1877 break ;
1878 case wxFONTENCODING_CP949 :
1879 enc = kCFStringEncodingDOSKorean;
1880 break ;
1881 case wxFONTENCODING_CP950 :
1882 enc = kCFStringEncodingDOSChineseTrad;
1883 break ;
1884 case wxFONTENCODING_CP1250 :
1885 enc = kCFStringEncodingWindowsLatin2;
1886 break ;
1887 case wxFONTENCODING_CP1251 :
1888 enc =kCFStringEncodingWindowsCyrillic ;
1889 break ;
1890 case wxFONTENCODING_CP1252 :
1891 enc =kCFStringEncodingWindowsLatin1 ;
1892 break ;
1893 case wxFONTENCODING_CP1253 :
1894 enc = kCFStringEncodingWindowsGreek;
1895 break ;
1896 case wxFONTENCODING_CP1254 :
1897 enc = kCFStringEncodingWindowsLatin5;
1898 break ;
1899 case wxFONTENCODING_CP1255 :
1900 enc =kCFStringEncodingWindowsHebrew ;
1901 break ;
1902 case wxFONTENCODING_CP1256 :
1903 enc =kCFStringEncodingWindowsArabic ;
1904 break ;
1905 case wxFONTENCODING_CP1257 :
1906 enc = kCFStringEncodingWindowsBalticRim;
1907 break ;
1908 // This only really encodes to UTF7 (if that) evidently
1909 // case wxFONTENCODING_UTF7 :
1910 // enc = kCFStringEncodingNonLossyASCII ;
1911 // break ;
1912 case wxFONTENCODING_UTF8 :
1913 enc = kCFStringEncodingUTF8 ;
1914 break ;
1915 case wxFONTENCODING_EUC_JP :
1916 enc = kCFStringEncodingEUC_JP;
1917 break ;
1918 case wxFONTENCODING_UTF16 :
1919 enc = kCFStringEncodingUnicode ;
1920 break ;
1921 case wxFONTENCODING_MACROMAN :
1922 enc = kCFStringEncodingMacRoman ;
1923 break ;
1924 case wxFONTENCODING_MACJAPANESE :
1925 enc = kCFStringEncodingMacJapanese ;
1926 break ;
1927 case wxFONTENCODING_MACCHINESETRAD :
1928 enc = kCFStringEncodingMacChineseTrad ;
1929 break ;
1930 case wxFONTENCODING_MACKOREAN :
1931 enc = kCFStringEncodingMacKorean ;
1932 break ;
1933 case wxFONTENCODING_MACARABIC :
1934 enc = kCFStringEncodingMacArabic ;
1935 break ;
1936 case wxFONTENCODING_MACHEBREW :
1937 enc = kCFStringEncodingMacHebrew ;
1938 break ;
1939 case wxFONTENCODING_MACGREEK :
1940 enc = kCFStringEncodingMacGreek ;
1941 break ;
1942 case wxFONTENCODING_MACCYRILLIC :
1943 enc = kCFStringEncodingMacCyrillic ;
1944 break ;
1945 case wxFONTENCODING_MACDEVANAGARI :
1946 enc = kCFStringEncodingMacDevanagari ;
1947 break ;
1948 case wxFONTENCODING_MACGURMUKHI :
1949 enc = kCFStringEncodingMacGurmukhi ;
1950 break ;
1951 case wxFONTENCODING_MACGUJARATI :
1952 enc = kCFStringEncodingMacGujarati ;
1953 break ;
1954 case wxFONTENCODING_MACORIYA :
1955 enc = kCFStringEncodingMacOriya ;
1956 break ;
1957 case wxFONTENCODING_MACBENGALI :
1958 enc = kCFStringEncodingMacBengali ;
1959 break ;
1960 case wxFONTENCODING_MACTAMIL :
1961 enc = kCFStringEncodingMacTamil ;
1962 break ;
1963 case wxFONTENCODING_MACTELUGU :
1964 enc = kCFStringEncodingMacTelugu ;
1965 break ;
1966 case wxFONTENCODING_MACKANNADA :
1967 enc = kCFStringEncodingMacKannada ;
1968 break ;
1969 case wxFONTENCODING_MACMALAJALAM :
1970 enc = kCFStringEncodingMacMalayalam ;
1971 break ;
1972 case wxFONTENCODING_MACSINHALESE :
1973 enc = kCFStringEncodingMacSinhalese ;
1974 break ;
1975 case wxFONTENCODING_MACBURMESE :
1976 enc = kCFStringEncodingMacBurmese ;
1977 break ;
1978 case wxFONTENCODING_MACKHMER :
1979 enc = kCFStringEncodingMacKhmer ;
1980 break ;
1981 case wxFONTENCODING_MACTHAI :
1982 enc = kCFStringEncodingMacThai ;
1983 break ;
1984 case wxFONTENCODING_MACLAOTIAN :
1985 enc = kCFStringEncodingMacLaotian ;
1986 break ;
1987 case wxFONTENCODING_MACGEORGIAN :
1988 enc = kCFStringEncodingMacGeorgian ;
1989 break ;
1990 case wxFONTENCODING_MACARMENIAN :
1991 enc = kCFStringEncodingMacArmenian ;
1992 break ;
1993 case wxFONTENCODING_MACCHINESESIMP :
1994 enc = kCFStringEncodingMacChineseSimp ;
1995 break ;
1996 case wxFONTENCODING_MACTIBETAN :
1997 enc = kCFStringEncodingMacTibetan ;
1998 break ;
1999 case wxFONTENCODING_MACMONGOLIAN :
2000 enc = kCFStringEncodingMacMongolian ;
2001 break ;
2002 case wxFONTENCODING_MACETHIOPIC :
2003 enc = kCFStringEncodingMacEthiopic ;
2004 break ;
2005 case wxFONTENCODING_MACCENTRALEUR :
2006 enc = kCFStringEncodingMacCentralEurRoman ;
2007 break ;
2008 case wxFONTENCODING_MACVIATNAMESE :
2009 enc = kCFStringEncodingMacVietnamese ;
2010 break ;
2011 case wxFONTENCODING_MACARABICEXT :
2012 enc = kCFStringEncodingMacExtArabic ;
2013 break ;
2014 case wxFONTENCODING_MACSYMBOL :
2015 enc = kCFStringEncodingMacSymbol ;
2016 break ;
2017 case wxFONTENCODING_MACDINGBATS :
2018 enc = kCFStringEncodingMacDingbats ;
2019 break ;
2020 case wxFONTENCODING_MACTURKISH :
2021 enc = kCFStringEncodingMacTurkish ;
2022 break ;
2023 case wxFONTENCODING_MACCROATIAN :
2024 enc = kCFStringEncodingMacCroatian ;
2025 break ;
2026 case wxFONTENCODING_MACICELANDIC :
2027 enc = kCFStringEncodingMacIcelandic ;
2028 break ;
2029 case wxFONTENCODING_MACROMANIAN :
2030 enc = kCFStringEncodingMacRomanian ;
2031 break ;
2032 case wxFONTENCODING_MACCELTIC :
2033 enc = kCFStringEncodingMacCeltic ;
2034 break ;
2035 case wxFONTENCODING_MACGAELIC :
2036 enc = kCFStringEncodingMacGaelic ;
2037 break ;
2038 // case wxFONTENCODING_MACKEYBOARD :
2039 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2040 // break ;
2041 default :
2042 // because gcc is picky
2043 break ;
2044 } ;
2045 return enc ;
2046 }
2047
2048 class wxMBConv_cocoa : public wxMBConv
2049 {
2050 public:
2051 wxMBConv_cocoa()
2052 {
2053 Init(CFStringGetSystemEncoding()) ;
2054 }
2055
2056 #if wxUSE_FONTMAP
2057 wxMBConv_cocoa(const wxChar* name)
2058 {
2059 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2060 }
2061 #endif
2062
2063 wxMBConv_cocoa(wxFontEncoding encoding)
2064 {
2065 Init( wxCFStringEncFromFontEnc(encoding) );
2066 }
2067
2068 ~wxMBConv_cocoa()
2069 {
2070 }
2071
2072 void Init( CFStringEncoding encoding)
2073 {
2074 m_encoding = encoding ;
2075 }
2076
2077 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2078 {
2079 wxASSERT(szUnConv);
2080
2081 CFStringRef theString = CFStringCreateWithBytes (
2082 NULL, //the allocator
2083 (const UInt8*)szUnConv,
2084 strlen(szUnConv),
2085 m_encoding,
2086 false //no BOM/external representation
2087 );
2088
2089 wxASSERT(theString);
2090
2091 size_t nOutLength = CFStringGetLength(theString);
2092
2093 if (szOut == NULL)
2094 {
2095 CFRelease(theString);
2096 return nOutLength;
2097 }
2098
2099 CFRange theRange = { 0, nOutSize };
2100
2101 #if SIZEOF_WCHAR_T == 4
2102 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2103 #endif
2104
2105 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2106
2107 CFRelease(theString);
2108
2109 szUniCharBuffer[nOutLength] = '\0' ;
2110
2111 #if SIZEOF_WCHAR_T == 4
2112 wxMBConvUTF16 converter ;
2113 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2114 delete[] szUniCharBuffer;
2115 #endif
2116
2117 return nOutLength;
2118 }
2119
2120 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2121 {
2122 wxASSERT(szUnConv);
2123
2124 size_t nRealOutSize;
2125 size_t nBufSize = wxWcslen(szUnConv);
2126 UniChar* szUniBuffer = (UniChar*) szUnConv;
2127
2128 #if SIZEOF_WCHAR_T == 4
2129 wxMBConvUTF16 converter ;
2130 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2131 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2132 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2133 nBufSize /= sizeof(UniChar);
2134 #endif
2135
2136 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2137 NULL, //allocator
2138 szUniBuffer,
2139 nBufSize,
2140 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2141 );
2142
2143 wxASSERT(theString);
2144
2145 //Note that CER puts a BOM when converting to unicode
2146 //so we check and use getchars instead in that case
2147 if (m_encoding == kCFStringEncodingUnicode)
2148 {
2149 if (szOut != NULL)
2150 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2151
2152 nRealOutSize = CFStringGetLength(theString) + 1;
2153 }
2154 else
2155 {
2156 CFStringGetBytes(
2157 theString,
2158 CFRangeMake(0, CFStringGetLength(theString)),
2159 m_encoding,
2160 0, //what to put in characters that can't be converted -
2161 //0 tells CFString to return NULL if it meets such a character
2162 false, //not an external representation
2163 (UInt8*) szOut,
2164 nOutSize,
2165 (CFIndex*) &nRealOutSize
2166 );
2167 }
2168
2169 CFRelease(theString);
2170
2171 #if SIZEOF_WCHAR_T == 4
2172 delete[] szUniBuffer;
2173 #endif
2174
2175 return nRealOutSize - 1;
2176 }
2177
2178 bool IsOk() const
2179 {
2180 return m_encoding != kCFStringEncodingInvalidId &&
2181 CFStringIsEncodingAvailable(m_encoding);
2182 }
2183
2184 private:
2185 CFStringEncoding m_encoding ;
2186 };
2187
2188 #endif // defined(__WXCOCOA__)
2189
2190 // ============================================================================
2191 // Mac conversion classes
2192 // ============================================================================
2193
2194 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2195
2196 class wxMBConv_mac : public wxMBConv
2197 {
2198 public:
2199 wxMBConv_mac()
2200 {
2201 Init(CFStringGetSystemEncoding()) ;
2202 }
2203
2204 #if wxUSE_FONTMAP
2205 wxMBConv_mac(const wxChar* name)
2206 {
2207 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2208 }
2209 #endif
2210
2211 wxMBConv_mac(wxFontEncoding encoding)
2212 {
2213 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2214 }
2215
2216 ~wxMBConv_mac()
2217 {
2218 OSStatus status = noErr ;
2219 status = TECDisposeConverter(m_MB2WC_converter);
2220 status = TECDisposeConverter(m_WC2MB_converter);
2221 }
2222
2223
2224 void Init( TextEncodingBase encoding)
2225 {
2226 OSStatus status = noErr ;
2227 m_char_encoding = encoding ;
2228 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2229
2230 status = TECCreateConverter(&m_MB2WC_converter,
2231 m_char_encoding,
2232 m_unicode_encoding);
2233 status = TECCreateConverter(&m_WC2MB_converter,
2234 m_unicode_encoding,
2235 m_char_encoding);
2236 }
2237
2238 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2239 {
2240 OSStatus status = noErr ;
2241 ByteCount byteOutLen ;
2242 ByteCount byteInLen = strlen(psz) ;
2243 wchar_t *tbuf = NULL ;
2244 UniChar* ubuf = NULL ;
2245 size_t res = 0 ;
2246
2247 if (buf == NULL)
2248 {
2249 //apple specs say at least 32
2250 n = wxMax( 32 , byteInLen ) ;
2251 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2252 }
2253 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2254 #if SIZEOF_WCHAR_T == 4
2255 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2256 #else
2257 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2258 #endif
2259 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2260 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2261 #if SIZEOF_WCHAR_T == 4
2262 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2263 // is not properly terminated we get random characters at the end
2264 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2265 wxMBConvUTF16 converter ;
2266 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2267 free( ubuf ) ;
2268 #else
2269 res = byteOutLen / sizeof( UniChar ) ;
2270 #endif
2271 if ( buf == NULL )
2272 free(tbuf) ;
2273
2274 if ( buf && res < n)
2275 buf[res] = 0;
2276
2277 return res ;
2278 }
2279
2280 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2281 {
2282 OSStatus status = noErr ;
2283 ByteCount byteOutLen ;
2284 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2285
2286 char *tbuf = NULL ;
2287
2288 if (buf == NULL)
2289 {
2290 //apple specs say at least 32
2291 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2292 tbuf = (char*) malloc( n ) ;
2293 }
2294
2295 ByteCount byteBufferLen = n ;
2296 UniChar* ubuf = NULL ;
2297 #if SIZEOF_WCHAR_T == 4
2298 wxMBConvUTF16 converter ;
2299 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2300 byteInLen = unicharlen ;
2301 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2302 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2303 #else
2304 ubuf = (UniChar*) psz ;
2305 #endif
2306 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2307 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2308 #if SIZEOF_WCHAR_T == 4
2309 free( ubuf ) ;
2310 #endif
2311 if ( buf == NULL )
2312 free(tbuf) ;
2313
2314 size_t res = byteOutLen ;
2315 if ( buf && res < n)
2316 {
2317 buf[res] = 0;
2318
2319 //we need to double-trip to verify it didn't insert any ? in place
2320 //of bogus characters
2321 wxWCharBuffer wcBuf(n);
2322 size_t pszlen = wxWcslen(psz);
2323 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2324 wxWcslen(wcBuf) != pszlen ||
2325 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2326 {
2327 // we didn't obtain the same thing we started from, hence
2328 // the conversion was lossy and we consider that it failed
2329 return (size_t)-1;
2330 }
2331 }
2332
2333 return res ;
2334 }
2335
2336 bool IsOk() const
2337 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2338
2339 private:
2340 TECObjectRef m_MB2WC_converter ;
2341 TECObjectRef m_WC2MB_converter ;
2342
2343 TextEncodingBase m_char_encoding ;
2344 TextEncodingBase m_unicode_encoding ;
2345 };
2346
2347 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2348
2349 // ============================================================================
2350 // wxEncodingConverter based conversion classes
2351 // ============================================================================
2352
2353 #if wxUSE_FONTMAP
2354
2355 class wxMBConv_wxwin : public wxMBConv
2356 {
2357 private:
2358 void Init()
2359 {
2360 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2361 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2362 }
2363
2364 public:
2365 // temporarily just use wxEncodingConverter stuff,
2366 // so that it works while a better implementation is built
2367 wxMBConv_wxwin(const wxChar* name)
2368 {
2369 if (name)
2370 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2371 else
2372 m_enc = wxFONTENCODING_SYSTEM;
2373
2374 Init();
2375 }
2376
2377 wxMBConv_wxwin(wxFontEncoding enc)
2378 {
2379 m_enc = enc;
2380
2381 Init();
2382 }
2383
2384 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2385 {
2386 size_t inbuf = strlen(psz);
2387 if (buf)
2388 {
2389 if (!m2w.Convert(psz,buf))
2390 return (size_t)-1;
2391 }
2392 return inbuf;
2393 }
2394
2395 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2396 {
2397 const size_t inbuf = wxWcslen(psz);
2398 if (buf)
2399 {
2400 if (!w2m.Convert(psz,buf))
2401 return (size_t)-1;
2402 }
2403
2404 return inbuf;
2405 }
2406
2407 bool IsOk() const { return m_ok; }
2408
2409 public:
2410 wxFontEncoding m_enc;
2411 wxEncodingConverter m2w, w2m;
2412
2413 // were we initialized successfully?
2414 bool m_ok;
2415
2416 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2417 };
2418
2419 // make the constructors available for unit testing
2420 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2421 {
2422 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2423 if ( !result->IsOk() )
2424 {
2425 delete result;
2426 return 0;
2427 }
2428 return result;
2429 }
2430
2431 #endif // wxUSE_FONTMAP
2432
2433 // ============================================================================
2434 // wxCSConv implementation
2435 // ============================================================================
2436
2437 void wxCSConv::Init()
2438 {
2439 m_name = NULL;
2440 m_convReal = NULL;
2441 m_deferred = true;
2442 }
2443
2444 wxCSConv::wxCSConv(const wxChar *charset)
2445 {
2446 Init();
2447
2448 if ( charset )
2449 {
2450 SetName(charset);
2451 }
2452
2453 m_encoding = wxFONTENCODING_SYSTEM;
2454 }
2455
2456 wxCSConv::wxCSConv(wxFontEncoding encoding)
2457 {
2458 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2459 {
2460 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2461
2462 encoding = wxFONTENCODING_SYSTEM;
2463 }
2464
2465 Init();
2466
2467 m_encoding = encoding;
2468 }
2469
2470 wxCSConv::~wxCSConv()
2471 {
2472 Clear();
2473 }
2474
2475 wxCSConv::wxCSConv(const wxCSConv& conv)
2476 : wxMBConv()
2477 {
2478 Init();
2479
2480 SetName(conv.m_name);
2481 m_encoding = conv.m_encoding;
2482 }
2483
2484 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2485 {
2486 Clear();
2487
2488 SetName(conv.m_name);
2489 m_encoding = conv.m_encoding;
2490
2491 return *this;
2492 }
2493
2494 void wxCSConv::Clear()
2495 {
2496 free(m_name);
2497 delete m_convReal;
2498
2499 m_name = NULL;
2500 m_convReal = NULL;
2501 }
2502
2503 void wxCSConv::SetName(const wxChar *charset)
2504 {
2505 if (charset)
2506 {
2507 m_name = wxStrdup(charset);
2508 m_deferred = true;
2509 }
2510 }
2511
2512 wxMBConv *wxCSConv::DoCreate() const
2513 {
2514 // check for the special case of ASCII or ISO8859-1 charset: as we have
2515 // special knowledge of it anyhow, we don't need to create a special
2516 // conversion object
2517 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2518 {
2519 // don't convert at all
2520 return NULL;
2521 }
2522
2523 // we trust OS to do conversion better than we can so try external
2524 // conversion methods first
2525 //
2526 // the full order is:
2527 // 1. OS conversion (iconv() under Unix or Win32 API)
2528 // 2. hard coded conversions for UTF
2529 // 3. wxEncodingConverter as fall back
2530
2531 // step (1)
2532 #ifdef HAVE_ICONV
2533 #if !wxUSE_FONTMAP
2534 if ( m_name )
2535 #endif // !wxUSE_FONTMAP
2536 {
2537 wxString name(m_name);
2538
2539 #if wxUSE_FONTMAP
2540 if ( name.empty() )
2541 name = wxFontMapperBase::GetEncodingName(m_encoding);
2542 #endif // wxUSE_FONTMAP
2543
2544 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2545 if ( conv->IsOk() )
2546 return conv;
2547
2548 delete conv;
2549 }
2550 #endif // HAVE_ICONV
2551
2552 #ifdef wxHAVE_WIN32_MB2WC
2553 {
2554 #if wxUSE_FONTMAP
2555 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2556 : new wxMBConv_win32(m_encoding);
2557 if ( conv->IsOk() )
2558 return conv;
2559
2560 delete conv;
2561 #else
2562 return NULL;
2563 #endif
2564 }
2565 #endif // wxHAVE_WIN32_MB2WC
2566 #if defined(__WXMAC__)
2567 {
2568 // leave UTF16 and UTF32 to the built-ins of wx
2569 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2570 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2571 {
2572
2573 #if wxUSE_FONTMAP
2574 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2575 : new wxMBConv_mac(m_encoding);
2576 #else
2577 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2578 #endif
2579 if ( conv->IsOk() )
2580 return conv;
2581
2582 delete conv;
2583 }
2584 }
2585 #endif
2586 #if defined(__WXCOCOA__)
2587 {
2588 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2589 {
2590
2591 #if wxUSE_FONTMAP
2592 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2593 : new wxMBConv_cocoa(m_encoding);
2594 #else
2595 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2596 #endif
2597 if ( conv->IsOk() )
2598 return conv;
2599
2600 delete conv;
2601 }
2602 }
2603 #endif
2604 // step (2)
2605 wxFontEncoding enc = m_encoding;
2606 #if wxUSE_FONTMAP
2607 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2608 {
2609 // use "false" to suppress interactive dialogs -- we can be called from
2610 // anywhere and popping up a dialog from here is the last thing we want to
2611 // do
2612 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2613 }
2614 #endif // wxUSE_FONTMAP
2615
2616 switch ( enc )
2617 {
2618 case wxFONTENCODING_UTF7:
2619 return new wxMBConvUTF7;
2620
2621 case wxFONTENCODING_UTF8:
2622 return new wxMBConvUTF8;
2623
2624 case wxFONTENCODING_UTF16BE:
2625 return new wxMBConvUTF16BE;
2626
2627 case wxFONTENCODING_UTF16LE:
2628 return new wxMBConvUTF16LE;
2629
2630 case wxFONTENCODING_UTF32BE:
2631 return new wxMBConvUTF32BE;
2632
2633 case wxFONTENCODING_UTF32LE:
2634 return new wxMBConvUTF32LE;
2635
2636 default:
2637 // nothing to do but put here to suppress gcc warnings
2638 ;
2639 }
2640
2641 // step (3)
2642 #if wxUSE_FONTMAP
2643 {
2644 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2645 : new wxMBConv_wxwin(m_encoding);
2646 if ( conv->IsOk() )
2647 return conv;
2648
2649 delete conv;
2650 }
2651 #endif // wxUSE_FONTMAP
2652
2653 // NB: This is a hack to prevent deadlock. What could otherwise happen
2654 // in Unicode build: wxConvLocal creation ends up being here
2655 // because of some failure and logs the error. But wxLog will try to
2656 // attach timestamp, for which it will need wxConvLocal (to convert
2657 // time to char* and then wchar_t*), but that fails, tries to log
2658 // error, but wxLog has a (already locked) critical section that
2659 // guards static buffer.
2660 static bool alreadyLoggingError = false;
2661 if (!alreadyLoggingError)
2662 {
2663 alreadyLoggingError = true;
2664 wxLogError(_("Cannot convert from the charset '%s'!"),
2665 m_name ? m_name
2666 :
2667 #if wxUSE_FONTMAP
2668 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2669 #else // !wxUSE_FONTMAP
2670 wxString::Format(_("encoding %s"), m_encoding).c_str()
2671 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2672 );
2673 alreadyLoggingError = false;
2674 }
2675
2676 return NULL;
2677 }
2678
2679 void wxCSConv::CreateConvIfNeeded() const
2680 {
2681 if ( m_deferred )
2682 {
2683 wxCSConv *self = (wxCSConv *)this; // const_cast
2684
2685 #if wxUSE_INTL
2686 // if we don't have neither the name nor the encoding, use the default
2687 // encoding for this system
2688 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2689 {
2690 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2691 }
2692 #endif // wxUSE_INTL
2693
2694 self->m_convReal = DoCreate();
2695 self->m_deferred = false;
2696 }
2697 }
2698
2699 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2700 {
2701 CreateConvIfNeeded();
2702
2703 if (m_convReal)
2704 return m_convReal->MB2WC(buf, psz, n);
2705
2706 // latin-1 (direct)
2707 size_t len = strlen(psz);
2708
2709 if (buf)
2710 {
2711 for (size_t c = 0; c <= len; c++)
2712 buf[c] = (unsigned char)(psz[c]);
2713 }
2714
2715 return len;
2716 }
2717
2718 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2719 {
2720 CreateConvIfNeeded();
2721
2722 if (m_convReal)
2723 return m_convReal->WC2MB(buf, psz, n);
2724
2725 // latin-1 (direct)
2726 const size_t len = wxWcslen(psz);
2727 if (buf)
2728 {
2729 for (size_t c = 0; c <= len; c++)
2730 {
2731 if (psz[c] > 0xFF)
2732 return (size_t)-1;
2733 buf[c] = (char)psz[c];
2734 }
2735 }
2736 else
2737 {
2738 for (size_t c = 0; c <= len; c++)
2739 {
2740 if (psz[c] > 0xFF)
2741 return (size_t)-1;
2742 }
2743 }
2744
2745 return len;
2746 }
2747
2748 // ----------------------------------------------------------------------------
2749 // globals
2750 // ----------------------------------------------------------------------------
2751
2752 #ifdef __WINDOWS__
2753 static wxMBConv_win32 wxConvLibcObj;
2754 #elif defined(__WXMAC__) && !defined(__MACH__)
2755 static wxMBConv_mac wxConvLibcObj ;
2756 #else
2757 static wxMBConvLibc wxConvLibcObj;
2758 #endif
2759
2760 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2761 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2762 static wxMBConvUTF7 wxConvUTF7Obj;
2763 static wxMBConvUTF8 wxConvUTF8Obj;
2764
2765 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2766 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2767 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2768 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2769 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2770 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2771 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2772 #ifdef __WXOSX__
2773 wxConvUTF8Obj;
2774 #else
2775 wxConvLibcObj;
2776 #endif
2777
2778
2779 #else // !wxUSE_WCHAR_T
2780
2781 // stand-ins in absence of wchar_t
2782 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2783 wxConvISO8859_1,
2784 wxConvLocal,
2785 wxConvUTF8;
2786
2787 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2788
2789