]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Submit patch based on Michael W.'s invalid UTF8
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58 #ifdef HAVE_LANGINFO_H
59 #include <langinfo.h>
60 #endif
61
62 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
63 #define wxHAVE_WIN32_MB2WC
64 #endif // __WIN32__ but !__WXMICROWIN__
65
66 // ----------------------------------------------------------------------------
67 // headers
68 // ----------------------------------------------------------------------------
69
70 #ifdef __SALFORDC__
71 #include <clib.h>
72 #endif
73
74 #ifdef HAVE_ICONV
75 #include <iconv.h>
76 #include "wx/thread.h"
77 #endif
78
79 #include "wx/encconv.h"
80 #include "wx/fontmap.h"
81 #include "wx/utils.h"
82
83 #ifdef __WXMAC__
84 #include <ATSUnicode.h>
85 #include <TextCommon.h>
86 #include <TextEncodingConverter.h>
87
88 #include "wx/mac/private.h" // includes mac headers
89 #endif
90 // ----------------------------------------------------------------------------
91 // macros
92 // ----------------------------------------------------------------------------
93
// Byte-swap, in place, the first `len` elements of the array `str`:
// BSWAP_UCS4 swaps each element with wxUINT32_SWAP_ALWAYS, BSWAP_UTF16 with
// wxUINT16_SWAP_ALWAYS.
//
// Both arguments are parenthesized inside the expansion so that compound
// expressions (e.g. `p + 1`, `a - b`) can be passed safely. The expansion is
// deliberately a plain braced block rather than do { } while (0), so the
// macros keep working whether or not the call site adds a trailing semicolon.
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT32_SWAP_ALWAYS((str)[_c]); }
#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT16_SWAP_ALWAYS((str)[_c]); }
96
// Pick, according to SIZEOF_WCHAR_T, the charset names describing wchar_t and
// the matching byte-swapping macro:
//   WC_NAME      - charset name without byte order information
//   WC_NAME_BEST - charset name with explicit byte order (chosen from
//                  WORDS_BIGENDIAN)
//   WC_BSWAP     - BSWAP_UCS4 or BSWAP_UTF16
//   WC_UTF16     - defined only when wchar_t is 2 bytes wide
97 #if SIZEOF_WCHAR_T == 4
98 #define WC_NAME "UCS4"
99 #define WC_BSWAP BSWAP_UCS4
100 #ifdef WORDS_BIGENDIAN
101 #define WC_NAME_BEST "UCS-4BE"
102 #else
103 #define WC_NAME_BEST "UCS-4LE"
104 #endif
105 #elif SIZEOF_WCHAR_T == 2
106 #define WC_NAME "UTF16"
107 #define WC_BSWAP BSWAP_UTF16
108 #define WC_UTF16
109 #ifdef WORDS_BIGENDIAN
110 #define WC_NAME_BEST "UTF-16BE"
111 #else
112 #define WC_NAME_BEST "UTF-16LE"
113 #endif
114 #else // sizeof(wchar_t) != 2 nor 4
115 // does this ever happen?
116 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
117 #endif
118
119 // ============================================================================
120 // implementation
121 // ============================================================================
122
123 // ----------------------------------------------------------------------------
124 // UTF-16 en/decoding to/from UCS-4
125 // ----------------------------------------------------------------------------
126
127
128 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
129 {
130 if (input<=0xffff)
131 {
132 if (output)
133 *output = (wxUint16) input;
134 return 1;
135 }
136 else if (input>=0x110000)
137 {
138 return (size_t)-1;
139 }
140 else
141 {
142 if (output)
143 {
144 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
145 *output = (wxUint16) ((input&0x3ff)+0xdc00);
146 }
147 return 2;
148 }
149 }
150
151 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
152 {
153 if ((*input<0xd800) || (*input>0xdfff))
154 {
155 output = *input;
156 return 1;
157 }
158 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
159 {
160 output = *input;
161 return (size_t)-1;
162 }
163 else
164 {
165 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
166 return 2;
167 }
168 }
169
170
171 // ----------------------------------------------------------------------------
172 // wxMBConv
173 // ----------------------------------------------------------------------------
174
// Out-of-line destructor with an intentionally empty body.
175 wxMBConv::~wxMBConv()
176 {
177 // nothing to do here (necessary for Darwin linking probably)
178 }
179
180 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
181 {
182 if ( psz )
183 {
184 // calculate the length of the buffer needed first
185 size_t nLen = MB2WC(NULL, psz, 0);
186 if ( nLen != (size_t)-1 )
187 {
188 // now do the actual conversion
189 wxWCharBuffer buf(nLen);
190 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
191 if ( nLen != (size_t)-1 )
192 {
193 return buf;
194 }
195 }
196 }
197
198 wxWCharBuffer buf((wchar_t *)NULL);
199
200 return buf;
201 }
202
203 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
204 {
205 if ( pwz )
206 {
207 size_t nLen = WC2MB(NULL, pwz, 0);
208 if ( nLen != (size_t)-1 )
209 {
210 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
211 nLen = WC2MB(buf.data(), pwz, nLen + 4);
212 if ( nLen != (size_t)-1 )
213 {
214 return buf;
215 }
216 }
217 }
218
219 wxCharBuffer buf((char *)NULL);
220
221 return buf;
222 }
223
224 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
225 {
226 wxASSERT(pOutSize != NULL);
227
228 const char* szEnd = szString + nStringLen + 1;
229 const char* szPos = szString;
230 const char* szStart = szPos;
231
232 size_t nActualLength = 0;
233 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
234
235 wxWCharBuffer theBuffer(nCurrentSize);
236
237 //Convert the string until the length() is reached, continuing the
238 //loop every time a null character is reached
239 while(szPos != szEnd)
240 {
241 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
242
243 //Get the length of the current (sub)string
244 size_t nLen = MB2WC(NULL, szPos, 0);
245
246 //Invalid conversion?
247 if( nLen == (size_t)-1 )
248 {
249 *pOutSize = 0;
250 theBuffer.data()[0u] = wxT('\0');
251 return theBuffer;
252 }
253
254
255 //Increase the actual length (+1 for current null character)
256 nActualLength += nLen + 1;
257
258 //if buffer too big, realloc the buffer
259 if (nActualLength > (nCurrentSize+1))
260 {
261 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
262 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
263 theBuffer = theNewBuffer;
264 nCurrentSize <<= 1;
265 }
266
267 //Convert the current (sub)string
268 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
269 {
270 *pOutSize = 0;
271 theBuffer.data()[0u] = wxT('\0');
272 return theBuffer;
273 }
274
275 //Increment to next (sub)string
276 //Note that we have to use strlen here instead of nLen
277 //here because XX2XX gives us the size of the output buffer,
278 //not neccessarly the length of the string
279 szPos += strlen(szPos) + 1;
280 }
281
282 //success - return actual length and the buffer
283 *pOutSize = nActualLength;
284 return theBuffer;
285 }
286
287 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
288 {
289 wxASSERT(pOutSize != NULL);
290
291 const wchar_t* szEnd = szString + nStringLen + 1;
292 const wchar_t* szPos = szString;
293 const wchar_t* szStart = szPos;
294
295 size_t nActualLength = 0;
296 size_t nCurrentSize = nStringLen << 2; //try * 4 first
297
298 wxCharBuffer theBuffer(nCurrentSize);
299
300 //Convert the string until the length() is reached, continuing the
301 //loop every time a null character is reached
302 while(szPos != szEnd)
303 {
304 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
305
306 //Get the length of the current (sub)string
307 size_t nLen = WC2MB(NULL, szPos, 0);
308
309 //Invalid conversion?
310 if( nLen == (size_t)-1 )
311 {
312 *pOutSize = 0;
313 theBuffer.data()[0u] = wxT('\0');
314 return theBuffer;
315 }
316
317 //Increase the actual length (+1 for current null character)
318 nActualLength += nLen + 1;
319
320 //if buffer too big, realloc the buffer
321 if (nActualLength > (nCurrentSize+1))
322 {
323 wxCharBuffer theNewBuffer(nCurrentSize << 1);
324 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
325 theBuffer = theNewBuffer;
326 nCurrentSize <<= 1;
327 }
328
329 //Convert the current (sub)string
330 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
331 {
332 *pOutSize = 0;
333 theBuffer.data()[0u] = wxT('\0');
334 return theBuffer;
335 }
336
337 //Increment to next (sub)string
338 //Note that we have to use wxWcslen here instead of nLen
339 //here because XX2XX gives us the size of the output buffer,
340 //not neccessarly the length of the string
341 szPos += wxWcslen(szPos) + 1;
342 }
343
344 //success - return actual length and the buffer
345 *pOutSize = nActualLength;
346 return theBuffer;
347 }
348
349 // ----------------------------------------------------------------------------
350 // wxMBConvLibc
351 // ----------------------------------------------------------------------------
352
353 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
354 {
355 return wxMB2WC(buf, psz, n);
356 }
357
358 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
359 {
360 return wxWC2MB(buf, psz, n);
361 }
362
363 // ----------------------------------------------------------------------------
364 // wxConvBrokenFileNames is made for GTK2 in Unicode mode when
365 // files are accidentally written in an encoding which is not
366 // the system encoding. Typically, the system encoding will be
367 // UTF8 but there might be files stored in ISO8859-1 on disk.
368 // ----------------------------------------------------------------------------
369
// Converter that uses either its wxMBConvLibc base class or an embedded
// UTF-8 converter, depending on what UseUTF8() returns at the time of the
// call.
370 class wxConvBrokenFileNames: public wxMBConvLibc
371 {
372 public:
// the UTF-8 converter is created with the MAP_INVALID_UTF8_TO_OCTAL option
373 wxConvBrokenFileNames() : m_utf8conv(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL) { }
// conversion functions; both dispatch on UseUTF8()
374 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
375 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
// true if m_utf8conv should be used instead of the base class conversion
376 inline bool UseUTF8() const;
377 private:
// converter used when UseUTF8() returns true
378 wxMBConvUTF8 m_utf8conv;
379 };
380
381 bool wxConvBrokenFileNames::UseUTF8() const
382 {
383 #if defined HAVE_LANGINFO_H && defined CODESET
384 char *codeset = nl_langinfo(CODESET);
385 return strcmp(codeset, "UTF-8") == 0;
386 #else
387 return false;
388 #endif
389 }
390
391 size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const
392 {
393 if (UseUTF8())
394 return m_utf8conv.MB2WC( outputBuf, psz, outputSize );
395 else
396 return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize );
397 }
398
399 size_t wxConvBrokenFileNames::WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const
400 {
401 if (UseUTF8())
402 return m_utf8conv.WC2MB( outputBuf, psz, outputSize );
403 else
404 return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize );
405 }
406
407 // ----------------------------------------------------------------------------
408 // UTF-7
409 // ----------------------------------------------------------------------------
410
411 // Implementation (C) 2004 Fredrik Roubert
412
413 //
414 // BASE64 decoding table
415 //
// Indexed by byte value (256 entries): the 6-bit BASE64 value of the
// character, or 0xff if the byte is not a BASE64 character.
//   'A'..'Z' -> 0x00..0x19, 'a'..'z' -> 0x1a..0x33,
//   '0'..'9' -> 0x34..0x3d, '+' -> 0x3e, '/' -> 0x3f
416 static const unsigned char utf7unb64[] =
417 {
418 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
419 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
420 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
421 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
422 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
423 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
424 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
425 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
426 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
427 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
428 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
429 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
430 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
431 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
432 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
433 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
434 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
435 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
436 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
437 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
438 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
439 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
440 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
441 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
442 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
443 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
444 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
445 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
446 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
447 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
448 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
449 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
450 };
451
// Decode the NUL-terminated UTF-7 string `psz` into wide characters.
//
// If `buf` is NULL nothing is stored and only the number of wide characters
// is counted; otherwise the outer loop stops once `n` characters have been
// produced. A terminating 0 is stored only if fewer than `n` characters were
// produced. Returns the number of wide characters (not counting the
// terminator).
452 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
453 {
454 size_t len = 0;
455
456 while (*psz && ((!buf) || (len < n)))
457 {
458 unsigned char cc = *psz++;
459 if (cc != '+')
460 {
// any byte other than '+' is copied through unchanged
461 // plain ASCII char
462 if (buf)
463 *buf++ = cc;
464 len++;
465 }
466 else if (*psz == '-')
467 {
// the two-character sequence "+-" stands for a single '+'
468 // encoded plus sign
469 if (buf)
470 *buf++ = cc;
471 len++;
472 psz++;
473 }
474 else
475 {
476 // BASE64 encoded string
// d accumulates decoded bits and l counts how many of them are pending;
// lsb tells whether the next decoded byte is the low (true) or the high
// (false) byte of the current 16-bit character. The run ends at the first
// byte which utf7unb64 maps to 0xff.
// NOTE(review): this inner loop stores characters without comparing len
// against n, so it relies on the caller's buffer being large enough.
477 bool lsb;
478 unsigned char c;
479 unsigned int d, l;
480 for (lsb = false, d = 0, l = 0;
481 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
482 {
483 d <<= 6;
484 d += cc;
485 for (l += 6; l >= 8; lsb = !lsb)
486 {
487 c = (unsigned char)((d >> (l -= 8)) % 256);
488 if (lsb)
489 {
// low byte: completes the character started by the high byte
490 if (buf)
491 *buf++ |= c;
492 len ++;
493 }
494 else
// high byte: starts a new character
495 if (buf)
496 *buf = (wchar_t)(c << 8);
497 }
498 }
// a '-' directly after the BASE64 run is consumed and produces no output
499 if (*psz == '-')
500 psz++;
501 }
502 }
503 if (buf && (len < n))
504 *buf = 0;
505 return len;
506 }
507
508 //
509 // BASE64 encoding table
510 //
// Indexed by a 6-bit value (0..63): the BASE64 character representing it.
511 static const unsigned char utf7enb64[] =
512 {
513 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
514 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
515 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
516 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
517 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
518 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
519 'w', 'x', 'y', 'z', '0', '1', '2', '3',
520 '4', '5', '6', '7', '8', '9', '+', '/'
521 };
522
523 //
524 // UTF-7 encoding table
525 //
526 // 0 - Set D (directly encoded characters)
527 // 1 - Set O (optional direct characters)
528 // 2 - whitespace characters (optional)
529 // 3 - special characters
530 //
// Indexed by ASCII code (0..127): the class of the character, using the
// values 0..3 listed in the comment above this table. wxMBConvUTF7::WC2MB()
// copies a character through unchanged only if its entry is 0; every other
// character goes into a BASE64 run.
531 static const unsigned char utf7encode[128] =
532 {
533 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
534 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
535 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
536 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
537 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
538 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
539 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
540 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
541 };
542
// Encode the NUL-terminated wide string `psz` as UTF-7.
//
// If `buf` is NULL nothing is stored and only the number of bytes is
// counted; otherwise the outer loop stops once `n` bytes have been produced.
// A terminating 0 is stored only if fewer than `n` bytes were produced.
// Returns the number of bytes (not counting the terminator), or (size_t)-1
// when, in builds without WC_UTF16, a character above 0xffff is met at the
// start of a run.
543 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
544 {
545
546
547 size_t len = 0;
548
549 while (*psz && ((!buf) || (len < n)))
550 {
551 wchar_t cc = *psz++;
552 if (cc < 0x80 && utf7encode[cc] < 1)
553 {
554 // directly encoded ASCII char (utf7encode class 0)
555 if (buf)
556 *buf++ = (char)cc;
557 len++;
558 }
559 #ifndef WC_UTF16
560 else if (((wxUint32)cc) > 0xffff)
561 {
562 // no surrogate pair generation (yet?)
563 return (size_t)-1;
564 }
565 #endif
566 else
567 {
// everything else is written as '+' <BASE64 data> '-'; a '+' character
// itself has no BASE64 data and therefore becomes "+-"
568 if (buf)
569 *buf++ = '+';
570 len++;
571 if (cc != '+')
572 {
573 // BASE64 encode string
// d accumulates bits and l counts how many of them are still pending; each
// character contributes its high byte (lsb == 0) and then its low byte
// (lsb == 1). The run continues until the next character is NUL or a
// directly encoded one; psz is left pointing at that character.
// NOTE(review): bytes are stored in this loop without comparing len
// against n, so it relies on the caller's buffer being large enough.
574 unsigned int lsb, d, l;
575 for (d = 0, l = 0;; psz++)
576 {
577 for (lsb = 0; lsb < 2; lsb ++)
578 {
579 d <<= 8;
580 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
581
582 for (l += 8; l >= 6; )
583 {
584 l -= 6;
585 if (buf)
586 *buf++ = utf7enb64[(d >> l) % 64];
587 len++;
588 }
589 }
590 cc = *psz;
591 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
592 break;
593 }
// flush the remaining l (< 6) bits, shifted up to fill one 6-bit group
594 if (l != 0)
595 {
596 if (buf)
597 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
598 len++;
599 }
600 }
601 if (buf)
602 *buf++ = '-';
603 len++;
604 }
605 }
606 if (buf && (len < n))
607 *buf = 0;
608 return len;
609 }
610
611 // ----------------------------------------------------------------------------
612 // UTF-8
613 // ----------------------------------------------------------------------------
614
615 static wxUint32 utf8_max[]=
616 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
617
618 const wxUint32 wxUnicodePUA = 0x100000;
619 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
620
// Decode the NUL-terminated UTF-8 string `psz` into wide characters.
//
// If `buf` is NULL nothing is stored and only the number of wide characters
// is counted; otherwise the outer loop stops once `n` characters have been
// produced. A terminating 0 is stored only if fewer than `n` characters were
// produced. Returns the number of wide characters (not counting the
// terminator).
//
// Invalid input is handled according to m_options:
//   MAP_INVALID_UTF8_TO_PUA   - each offending byte b becomes the code point
//                               wxUnicodePUA + b
//   MAP_INVALID_UTF8_TO_OCTAL - each offending byte becomes the four
//                               characters produced by formatting it with
//                               "\\%o"
//   neither                   - (size_t)-1 is returned
621 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
622 {
623 size_t len = 0;
624
625 while (*psz && ((!buf) || (len < n)))
626 {
// opsz remembers where this sequence starts so that its bytes can be
// re-emitted one by one if the sequence turns out to be invalid
627 const char *opsz = psz;
628 bool invalid = false;
629 unsigned char cc = *psz++, fc = cc;
630 unsigned cnt;
// count the leading 1 bits of the first byte
631 for (cnt = 0; fc & 0x80; cnt++)
632 fc <<= 1;
633 if (!cnt)
634 {
635 // plain ASCII char
636 if (buf)
637 *buf++ = cc;
638 len++;
639 }
640 else
641 {
// from here on cnt is the number of continuation bytes expected
642 cnt--;
643 if (!cnt)
644 {
// exactly one leading 1 bit: a continuation byte where a lead byte was
// expected
645 // invalid UTF-8 sequence
646 invalid = true;
647 }
648 else
649 {
650 unsigned ocnt = cnt - 1;
651 wxUint32 res = cc & (0x3f >> cnt);
652 while (cnt--)
653 {
654 cc = *psz;
655 if ((cc & 0xC0) != 0x80)
656 {
657 // invalid UTF-8 sequence
658 invalid = true;
659 break;
660 }
661 psz++;
662 res = (res << 6) | (cc & 0x3f);
663 }
// a value that would also fit in a shorter sequence (overlong form) is
// rejected as well
664 if (invalid || res <= utf8_max[ocnt])
665 {
666 // illegal UTF-8 encoding
667 invalid = true;
668 }
669 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
670 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
671 {
672 // if one of our PUA characters turns up externally
673 // it must also be treated as an illegal sequence
674 // (a bit like you have to escape an escape character)
675 invalid = true;
676 }
677 else
678 {
679 #ifdef WC_UTF16
680 // cast is ok because wchar_t == wxUint16 if WC_UTF16
681 size_t pa = encode_utf16(res, (wxUint16 *)buf);
682 if (pa == (size_t)-1)
683 {
684 invalid = true;
685 }
686 else
687 {
688 if (buf)
689 buf += pa;
690 len += pa;
691 }
692 #else // !WC_UTF16
693 if (buf)
694 *buf++ = res;
695 len++;
696 #endif // WC_UTF16/!WC_UTF16
697 }
698 }
699 if (invalid)
700 {
// re-emit every byte consumed so far for this sequence (opsz up to psz)
701 if (m_options & MAP_INVALID_UTF8_TO_PUA)
702 {
703 while (opsz < psz && (!buf || len < n))
704 {
705 #ifdef WC_UTF16
706 // cast is ok because wchar_t == wxUint16 if WC_UTF16
707 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
708 wxASSERT(pa != (size_t)-1);
709 if (buf)
710 buf += pa;
711 opsz++;
712 len += pa;
713 #else
714 if (buf)
715 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
716 opsz++;
717 len++;
718 #endif
719 }
720 }
721 else
722 if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
723 {
// NOTE(review): four characters are stored per byte while the loop
// condition only checks len < n, so up to three of them can land beyond n
724 while (opsz < psz && (!buf || len < n))
725 {
726 wchar_t str[6];
727 wxSnprintf( str, 5, L"\\%o", (int) (unsigned char) *opsz );
728 if (buf)
729 *buf++ = str[0];
730 if (buf)
731 *buf++ = str[1];
732 if (buf)
733 *buf++ = str[2];
734 if (buf)
735 *buf++ = str[3];
736 opsz++;
737 len += 4;
738 }
739 }
740 else
741 {
742 return (size_t)-1;
743 }
744 }
745 }
746 }
747 if (buf && (len < n))
748 *buf = 0;
749 return len;
750 }
751
752 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
753 {
754 size_t len = 0;
755
756 while (*psz && ((!buf) || (len < n)))
757 {
758 wxUint32 cc;
759 #ifdef WC_UTF16
760 // cast is ok for WC_UTF16
761 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
762 psz += (pa == (size_t)-1) ? 1 : pa;
763 #else
764 cc=(*psz++) & 0x7fffffff;
765 #endif
766 if ((m_options & MAP_INVALID_UTF8_TO_PUA)
767 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd)
768 {
769 if (buf)
770 *buf++ = (char)(cc - wxUnicodePUA);
771 len++;
772 }
773 else
774 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
775 && cc == L'\\')
776 {
777 wchar_t str[4];
778 str[0] = *psz; psz++;
779 str[1] = *psz; psz++;
780 str[2] = *psz; psz++;
781 str[3] = 0;
782 int octal;
783 wxSscanf( str, L"%o", &octal );
784 if (buf)
785 *buf++ = (char) octal;
786 len++;
787 }
788 else
789 {
790 unsigned cnt;
791 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
792 if (!cnt)
793 {
794 // plain ASCII char
795 if (buf)
796 *buf++ = (char) cc;
797 len++;
798 }
799
800 else
801 {
802 len += cnt + 1;
803 if (buf)
804 {
805 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
806 while (cnt--)
807 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
808 }
809 }
810 }
811 }
812
813 if (buf && (len<n)) *buf = 0;
814
815 return len;
816 }
817
818 // ----------------------------------------------------------------------------
819 // UTF-16
820 // ----------------------------------------------------------------------------
821
// The UTF-16 converters below are written once as a "straight" (no byte
// swapping) and a "swap" (byte swapping) variant; WORDS_BIGENDIAN decides
// which of wxMBConvUTF16BE / wxMBConvUTF16LE each of them implements.
822 #ifdef WORDS_BIGENDIAN
823 #define wxMBConvUTF16straight wxMBConvUTF16BE
824 #define wxMBConvUTF16swap wxMBConvUTF16LE
825 #else
826 #define wxMBConvUTF16swap wxMBConvUTF16BE
827 #define wxMBConvUTF16straight wxMBConvUTF16LE
828 #endif
829
830
831 #ifdef WC_UTF16
832
833 // copy 16bit MB to 16bit String
834 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
835 {
836 size_t len=0;
837
838 while (*(wxUint16*)psz && (!buf || len < n))
839 {
840 if (buf)
841 *buf++ = *(wxUint16*)psz;
842 len++;
843
844 psz += sizeof(wxUint16);
845 }
846 if (buf && len<n) *buf=0;
847
848 return len;
849 }
850
851
852 // copy 16bit String to 16bit MB
853 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
854 {
855 size_t len=0;
856
857 while (*psz && (!buf || len < n))
858 {
859 if (buf)
860 {
861 *(wxUint16*)buf = *psz;
862 buf += sizeof(wxUint16);
863 }
864 len += sizeof(wxUint16);
865 psz++;
866 }
867 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
868
869 return len;
870 }
871
872
873 // swap 16bit MB to 16bit String
874 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
875 {
876 size_t len=0;
877
878 while (*(wxUint16*)psz && (!buf || len < n))
879 {
880 if (buf)
881 {
882 ((char *)buf)[0] = psz[1];
883 ((char *)buf)[1] = psz[0];
884 buf++;
885 }
886 len++;
887 psz += sizeof(wxUint16);
888 }
889 if (buf && len<n) *buf=0;
890
891 return len;
892 }
893
894
895 // swap 16bit String to 16bit MB
896 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
897 {
898 size_t len=0;
899
900 while (*psz && (!buf || len < n))
901 {
902 if (buf)
903 {
904 *buf++ = ((char*)psz)[1];
905 *buf++ = ((char*)psz)[0];
906 }
907 len += sizeof(wxUint16);
908 psz++;
909 }
910 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
911
912 return len;
913 }
914
915
916 #else // WC_UTF16
917
918
919 // copy 16bit MB to 32bit String
920 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
921 {
922 size_t len=0;
923
924 while (*(wxUint16*)psz && (!buf || len < n))
925 {
926 wxUint32 cc;
927 size_t pa=decode_utf16((wxUint16*)psz, cc);
928 if (pa == (size_t)-1)
929 return pa;
930
931 if (buf)
932 *buf++ = cc;
933 len++;
934 psz += pa * sizeof(wxUint16);
935 }
936 if (buf && len<n) *buf=0;
937
938 return len;
939 }
940
941
942 // copy 32bit String to 16bit MB
943 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
944 {
945 size_t len=0;
946
947 while (*psz && (!buf || len < n))
948 {
949 wxUint16 cc[2];
950 size_t pa=encode_utf16(*psz, cc);
951
952 if (pa == (size_t)-1)
953 return pa;
954
955 if (buf)
956 {
957 *(wxUint16*)buf = cc[0];
958 buf += sizeof(wxUint16);
959 if (pa > 1)
960 {
961 *(wxUint16*)buf = cc[1];
962 buf += sizeof(wxUint16);
963 }
964 }
965
966 len += pa*sizeof(wxUint16);
967 psz++;
968 }
969 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
970
971 return len;
972 }
973
974
975 // swap 16bit MB to 32bit String
976 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
977 {
978 size_t len=0;
979
980 while (*(wxUint16*)psz && (!buf || len < n))
981 {
982 wxUint32 cc;
983 char tmp[4];
984 tmp[0]=psz[1]; tmp[1]=psz[0];
985 tmp[2]=psz[3]; tmp[3]=psz[2];
986
987 size_t pa=decode_utf16((wxUint16*)tmp, cc);
988 if (pa == (size_t)-1)
989 return pa;
990
991 if (buf)
992 *buf++ = cc;
993
994 len++;
995 psz += pa * sizeof(wxUint16);
996 }
997 if (buf && len<n) *buf=0;
998
999 return len;
1000 }
1001
1002
1003 // swap 32bit String to 16bit MB
1004 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1005 {
1006 size_t len=0;
1007
1008 while (*psz && (!buf || len < n))
1009 {
1010 wxUint16 cc[2];
1011 size_t pa=encode_utf16(*psz, cc);
1012
1013 if (pa == (size_t)-1)
1014 return pa;
1015
1016 if (buf)
1017 {
1018 *buf++ = ((char*)cc)[1];
1019 *buf++ = ((char*)cc)[0];
1020 if (pa > 1)
1021 {
1022 *buf++ = ((char*)cc)[3];
1023 *buf++ = ((char*)cc)[2];
1024 }
1025 }
1026
1027 len += pa*sizeof(wxUint16);
1028 psz++;
1029 }
1030 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1031
1032 return len;
1033 }
1034
1035 #endif // WC_UTF16
1036
1037
1038 // ----------------------------------------------------------------------------
1039 // UTF-32
1040 // ----------------------------------------------------------------------------
1041
// The UTF-32 converters below are written once as a "straight" (no byte
// swapping) and a "swap" (byte swapping) variant; WORDS_BIGENDIAN decides
// which of wxMBConvUTF32BE / wxMBConvUTF32LE each of them implements.
1042 #ifdef WORDS_BIGENDIAN
1043 #define wxMBConvUTF32straight wxMBConvUTF32BE
1044 #define wxMBConvUTF32swap wxMBConvUTF32LE
1045 #else
1046 #define wxMBConvUTF32swap wxMBConvUTF32BE
1047 #define wxMBConvUTF32straight wxMBConvUTF32LE
1048 #endif
1049
1050
// Global UTF-32 converter objects, one per byte order.
// NOTE(review): WXDLLIMPEXP_DATA_BASE is defined elsewhere; presumably it
// adds the DLL import/export decoration -- confirm against its definition.
1051 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1052 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1053
1054
1055 #ifdef WC_UTF16
1056
1057 // copy 32bit MB to 16bit String
1058 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1059 {
1060 size_t len=0;
1061
1062 while (*(wxUint32*)psz && (!buf || len < n))
1063 {
1064 wxUint16 cc[2];
1065
1066 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1067 if (pa == (size_t)-1)
1068 return pa;
1069
1070 if (buf)
1071 {
1072 *buf++ = cc[0];
1073 if (pa > 1)
1074 *buf++ = cc[1];
1075 }
1076 len += pa;
1077 psz += sizeof(wxUint32);
1078 }
1079 if (buf && len<n) *buf=0;
1080
1081 return len;
1082 }
1083
1084
1085 // copy 16bit String to 32bit MB
1086 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1087 {
1088 size_t len=0;
1089
1090 while (*psz && (!buf || len < n))
1091 {
1092 wxUint32 cc;
1093
1094 // cast is ok for WC_UTF16
1095 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1096 if (pa == (size_t)-1)
1097 return pa;
1098
1099 if (buf)
1100 {
1101 *(wxUint32*)buf = cc;
1102 buf += sizeof(wxUint32);
1103 }
1104 len += sizeof(wxUint32);
1105 psz += pa;
1106 }
1107
1108 if (buf && len<=n-sizeof(wxUint32))
1109 *(wxUint32*)buf=0;
1110
1111 return len;
1112 }
1113
1114
1115
1116 // swap 32bit MB to 16bit String
1117 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1118 {
1119 size_t len=0;
1120
1121 while (*(wxUint32*)psz && (!buf || len < n))
1122 {
1123 char tmp[4];
1124 tmp[0] = psz[3]; tmp[1] = psz[2];
1125 tmp[2] = psz[1]; tmp[3] = psz[0];
1126
1127
1128 wxUint16 cc[2];
1129
1130 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1131 if (pa == (size_t)-1)
1132 return pa;
1133
1134 if (buf)
1135 {
1136 *buf++ = cc[0];
1137 if (pa > 1)
1138 *buf++ = cc[1];
1139 }
1140 len += pa;
1141 psz += sizeof(wxUint32);
1142 }
1143
1144 if (buf && len<n)
1145 *buf=0;
1146
1147 return len;
1148 }
1149
1150
1151 // swap 16bit String to 32bit MB
1152 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1153 {
1154 size_t len=0;
1155
1156 while (*psz && (!buf || len < n))
1157 {
1158 char cc[4];
1159
1160 // cast is ok for WC_UTF16
1161 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1162 if (pa == (size_t)-1)
1163 return pa;
1164
1165 if (buf)
1166 {
1167 *buf++ = cc[3];
1168 *buf++ = cc[2];
1169 *buf++ = cc[1];
1170 *buf++ = cc[0];
1171 }
1172 len += sizeof(wxUint32);
1173 psz += pa;
1174 }
1175
1176 if (buf && len<=n-sizeof(wxUint32))
1177 *(wxUint32*)buf=0;
1178
1179 return len;
1180 }
1181
1182 #else // WC_UTF16
1183
1184
1185 // copy 32bit MB to 32bit String
1186 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1187 {
1188 size_t len=0;
1189
1190 while (*(wxUint32*)psz && (!buf || len < n))
1191 {
1192 if (buf)
1193 *buf++ = *(wxUint32*)psz;
1194 len++;
1195 psz += sizeof(wxUint32);
1196 }
1197
1198 if (buf && len<n)
1199 *buf=0;
1200
1201 return len;
1202 }
1203
1204
1205 // copy 32bit String to 32bit MB
1206 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1207 {
1208 size_t len=0;
1209
1210 while (*psz && (!buf || len < n))
1211 {
1212 if (buf)
1213 {
1214 *(wxUint32*)buf = *psz;
1215 buf += sizeof(wxUint32);
1216 }
1217
1218 len += sizeof(wxUint32);
1219 psz++;
1220 }
1221
1222 if (buf && len<=n-sizeof(wxUint32))
1223 *(wxUint32*)buf=0;
1224
1225 return len;
1226 }
1227
1228
1229 // swap 32bit MB to 32bit String
1230 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1231 {
1232 size_t len=0;
1233
1234 while (*(wxUint32*)psz && (!buf || len < n))
1235 {
1236 if (buf)
1237 {
1238 ((char *)buf)[0] = psz[3];
1239 ((char *)buf)[1] = psz[2];
1240 ((char *)buf)[2] = psz[1];
1241 ((char *)buf)[3] = psz[0];
1242 buf++;
1243 }
1244 len++;
1245 psz += sizeof(wxUint32);
1246 }
1247
1248 if (buf && len<n)
1249 *buf=0;
1250
1251 return len;
1252 }
1253
1254
1255 // swap 32bit String to 32bit MB
1256 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1257 {
1258 size_t len=0;
1259
1260 while (*psz && (!buf || len < n))
1261 {
1262 if (buf)
1263 {
1264 *buf++ = ((char *)psz)[3];
1265 *buf++ = ((char *)psz)[2];
1266 *buf++ = ((char *)psz)[1];
1267 *buf++ = ((char *)psz)[0];
1268 }
1269 len += sizeof(wxUint32);
1270 psz++;
1271 }
1272
1273 if (buf && len<=n-sizeof(wxUint32))
1274 *(wxUint32*)buf=0;
1275
1276 return len;
1277 }
1278
1279
1280 #endif // WC_UTF16
1281
1282
1283 // ============================================================================
1284 // The classes doing conversion using the iconv_xxx() functions
1285 // ============================================================================
1286
1287 #ifdef HAVE_ICONV
1288
1289 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1290 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1291 // (unless there's yet another bug in glibc) the only case when iconv()
1292 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1293 // left in the input buffer -- when _real_ error occurs,
1294 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1295 // iconv() failure.
1296 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() result `cres` denotes a
// real failure; for glibc 2.0/2.1 the case described above (E2BIG with
// nothing left in the buffer) is not counted as one.
1297 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1298 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1299 (errno != E2BIG || bufLeft != 0))
1300 #else
1301 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1302 #endif
1303
// ICONV_CHAR_CAST(x): casts `x` to "ICONV_CONST char **".
// NOTE(review): ICONV_CONST is defined elsewhere, presumably by the build
// configuration depending on how iconv() is declared -- confirm.
1304 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1305
1306 // ----------------------------------------------------------------------------
1307 // wxMBConv_iconv: encapsulates an iconv character set
1308 // ----------------------------------------------------------------------------
1309
1310 class wxMBConv_iconv : public wxMBConv
1311 {
1312 public:
1313 wxMBConv_iconv(const wxChar *name);
1314 virtual ~wxMBConv_iconv();
1315
1316 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1317 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1318
1319 bool IsOk() const
1320 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1321
1322 protected:
1323 // the iconv handlers used to translate from multibyte to wide char and in
1324 // the other direction
1325 iconv_t m2w,
1326 w2m;
1327 #if wxUSE_THREADS
1328 // guards access to m2w and w2m objects
1329 wxMutex m_iconvMutex;
1330 #endif
1331
1332 private:
1333 // the name (for iconv_open()) of a wide char charset -- if none is
1334 // available on this machine, it will remain NULL
1335 static const char *ms_wcCharsetName;
1336
1337 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1338 // different endian-ness than the native one
1339 static bool ms_wcNeedsSwap;
1340 };
1341
1342 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1343 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1344
1345 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1346 {
1347 // Do it the hard way
1348 char cname[100];
1349 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1350 cname[i] = (char) name[i];
1351
1352 // check for charset that represents wchar_t:
1353 if (ms_wcCharsetName == NULL)
1354 {
1355 ms_wcNeedsSwap = false;
1356
1357 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1358 ms_wcCharsetName = WC_NAME_BEST;
1359 m2w = iconv_open(ms_wcCharsetName, cname);
1360
1361 if (m2w == (iconv_t)-1)
1362 {
1363 // try charset w/o bytesex info (e.g. "UCS4")
1364 // and check for bytesex ourselves:
1365 ms_wcCharsetName = WC_NAME;
1366 m2w = iconv_open(ms_wcCharsetName, cname);
1367
1368 // last bet, try if it knows WCHAR_T pseudo-charset
1369 if (m2w == (iconv_t)-1)
1370 {
1371 ms_wcCharsetName = "WCHAR_T";
1372 m2w = iconv_open(ms_wcCharsetName, cname);
1373 }
1374
1375 if (m2w != (iconv_t)-1)
1376 {
1377 char buf[2], *bufPtr;
1378 wchar_t wbuf[2], *wbufPtr;
1379 size_t insz, outsz;
1380 size_t res;
1381
1382 buf[0] = 'A';
1383 buf[1] = 0;
1384 wbuf[0] = 0;
1385 insz = 2;
1386 outsz = SIZEOF_WCHAR_T * 2;
1387 wbufPtr = wbuf;
1388 bufPtr = buf;
1389
1390 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1391 (char**)&wbufPtr, &outsz);
1392
1393 if (ICONV_FAILED(res, insz))
1394 {
1395 ms_wcCharsetName = NULL;
1396 wxLogLastError(wxT("iconv"));
1397 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1398 }
1399 else
1400 {
1401 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1402 }
1403 }
1404 else
1405 {
1406 ms_wcCharsetName = NULL;
1407
1408 // VS: we must not output an error here, since wxWidgets will safely
1409 // fall back to using wxEncodingConverter.
1410 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1411 //wxLogError(
1412 }
1413 }
1414 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1415 }
1416 else // we already have ms_wcCharsetName
1417 {
1418 m2w = iconv_open(ms_wcCharsetName, cname);
1419 }
1420
1421 // NB: don't ever pass NULL to iconv_open(), it may crash!
1422 if ( ms_wcCharsetName )
1423 {
1424 w2m = iconv_open( cname, ms_wcCharsetName);
1425 }
1426 else
1427 {
1428 w2m = (iconv_t)-1;
1429 }
1430 }
1431
1432 wxMBConv_iconv::~wxMBConv_iconv()
1433 {
1434 if ( m2w != (iconv_t)-1 )
1435 iconv_close(m2w);
1436 if ( w2m != (iconv_t)-1 )
1437 iconv_close(w2m);
1438 }
1439
1440 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1441 {
1442 #if wxUSE_THREADS
1443 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1444 // Unfortunately there is a couple of global wxCSConv objects such as
1445 // wxConvLocal that are used all over wx code, so we have to make sure
1446 // the handle is used by at most one thread at the time. Otherwise
1447 // only a few wx classes would be safe to use from non-main threads
1448 // as MB<->WC conversion would fail "randomly".
1449 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1450 #endif
1451
1452 size_t inbuf = strlen(psz);
1453 size_t outbuf = n * SIZEOF_WCHAR_T;
1454 size_t res, cres;
1455 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1456 wchar_t *bufPtr = buf;
1457 const char *pszPtr = psz;
1458
1459 if (buf)
1460 {
1461 // have destination buffer, convert there
1462 cres = iconv(m2w,
1463 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1464 (char**)&bufPtr, &outbuf);
1465 res = n - (outbuf / SIZEOF_WCHAR_T);
1466
1467 if (ms_wcNeedsSwap)
1468 {
1469 // convert to native endianness
1470 WC_BSWAP(buf /* _not_ bufPtr */, res)
1471 }
1472
1473 // NB: iconv was given only strlen(psz) characters on input, and so
1474 // it couldn't convert the trailing zero. Let's do it ourselves
1475 // if there's some room left for it in the output buffer.
1476 if (res < n)
1477 buf[res] = 0;
1478 }
1479 else
1480 {
1481 // no destination buffer... convert using temp buffer
1482 // to calculate destination buffer requirement
1483 wchar_t tbuf[8];
1484 res = 0;
1485 do {
1486 bufPtr = tbuf;
1487 outbuf = 8*SIZEOF_WCHAR_T;
1488
1489 cres = iconv(m2w,
1490 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1491 (char**)&bufPtr, &outbuf );
1492
1493 res += 8-(outbuf/SIZEOF_WCHAR_T);
1494 } while ((cres==(size_t)-1) && (errno==E2BIG));
1495 }
1496
1497 if (ICONV_FAILED(cres, inbuf))
1498 {
1499 //VS: it is ok if iconv fails, hence trace only
1500 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1501 return (size_t)-1;
1502 }
1503
1504 return res;
1505 }
1506
1507 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1508 {
1509 #if wxUSE_THREADS
1510 // NB: explained in MB2WC
1511 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1512 #endif
1513
1514 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1515 size_t outbuf = n;
1516 size_t res, cres;
1517
1518 wchar_t *tmpbuf = 0;
1519
1520 if (ms_wcNeedsSwap)
1521 {
1522 // need to copy to temp buffer to switch endianness
1523 // this absolutely doesn't rock!
1524 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1525 // could be in read-only memory, or be accessed in some other thread)
1526 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1527 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1528 WC_BSWAP(tmpbuf, inbuf)
1529 psz=tmpbuf;
1530 }
1531
1532 if (buf)
1533 {
1534 // have destination buffer, convert there
1535 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1536
1537 res = n-outbuf;
1538
1539 // NB: iconv was given only wcslen(psz) characters on input, and so
1540 // it couldn't convert the trailing zero. Let's do it ourselves
1541 // if there's some room left for it in the output buffer.
1542 if (res < n)
1543 buf[0] = 0;
1544 }
1545 else
1546 {
1547 // no destination buffer... convert using temp buffer
1548 // to calculate destination buffer requirement
1549 char tbuf[16];
1550 res = 0;
1551 do {
1552 buf = tbuf; outbuf = 16;
1553
1554 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1555
1556 res += 16 - outbuf;
1557 } while ((cres==(size_t)-1) && (errno==E2BIG));
1558 }
1559
1560 if (ms_wcNeedsSwap)
1561 {
1562 free(tmpbuf);
1563 }
1564
1565 if (ICONV_FAILED(cres, inbuf))
1566 {
1567 //VS: it is ok if iconv fails, hence trace only
1568 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1569 return (size_t)-1;
1570 }
1571
1572 return res;
1573 }
1574
1575 #endif // HAVE_ICONV
1576
1577
1578 // ============================================================================
1579 // Win32 conversion classes
1580 // ============================================================================
1581
1582 #ifdef wxHAVE_WIN32_MB2WC
1583
1584 // from utils.cpp
1585 #if wxUSE_FONTMAP
1586 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1587 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1588 #endif
1589
1590 class wxMBConv_win32 : public wxMBConv
1591 {
1592 public:
1593 wxMBConv_win32()
1594 {
1595 m_CodePage = CP_ACP;
1596 }
1597
1598 #if wxUSE_FONTMAP
1599 wxMBConv_win32(const wxChar* name)
1600 {
1601 m_CodePage = wxCharsetToCodepage(name);
1602 }
1603
1604 wxMBConv_win32(wxFontEncoding encoding)
1605 {
1606 m_CodePage = wxEncodingToCodepage(encoding);
1607 }
1608 #endif
1609
1610 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1611 {
1612 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1613 // the behaviour is not compatible with the Unix version (using iconv)
1614 // and break the library itself, e.g. wxTextInputStream::NextChar()
1615 // wouldn't work if reading an incomplete MB char didn't result in an
1616 // error
1617 //
1618 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1619 // an error (tested under Windows Server 2003) and apparently it is
1620 // done on purpose, i.e. the function accepts any input in this case
1621 // and although I'd prefer to return error on ill-formed output, our
1622 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1623 // explicitly ill-formed according to RFC 2152) neither so we don't
1624 // even have any fallback here...
1625 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1626
1627 const size_t len = ::MultiByteToWideChar
1628 (
1629 m_CodePage, // code page
1630 flags, // flags: fall on error
1631 psz, // input string
1632 -1, // its length (NUL-terminated)
1633 buf, // output string
1634 buf ? n : 0 // size of output buffer
1635 );
1636
1637 // note that it returns count of written chars for buf != NULL and size
1638 // of the needed buffer for buf == NULL so in either case the length of
1639 // the string (which never includes the terminating NUL) is one less
1640 return len ? len - 1 : (size_t)-1;
1641 }
1642
1643 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1644 {
1645 /*
1646 we have a problem here: by default, WideCharToMultiByte() may
1647 replace characters unrepresentable in the target code page with bad
1648 quality approximations such as turning "1/2" symbol (U+00BD) into
1649 "1" for the code pages which don't have it and we, obviously, want
1650 to avoid this at any price
1651
1652 the trouble is that this function does it _silently_, i.e. it won't
1653 even tell us whether it did or not... Win98/2000 and higher provide
1654 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1655 we have to resort to a round trip, i.e. check that converting back
1656 results in the same string -- this is, of course, expensive but
1657 otherwise we simply can't be sure to not garble the data.
1658 */
1659
1660 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1661 // it doesn't work with CJK encodings (which we test for rather roughly
1662 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1663 // supporting it
1664 BOOL usedDef wxDUMMY_INITIALIZE(false);
1665 BOOL *pUsedDef;
1666 int flags;
1667 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1668 {
1669 // it's our lucky day
1670 flags = WC_NO_BEST_FIT_CHARS;
1671 pUsedDef = &usedDef;
1672 }
1673 else // old system or unsupported encoding
1674 {
1675 flags = 0;
1676 pUsedDef = NULL;
1677 }
1678
1679 const size_t len = ::WideCharToMultiByte
1680 (
1681 m_CodePage, // code page
1682 flags, // either none or no best fit
1683 pwz, // input string
1684 -1, // it is (wide) NUL-terminated
1685 buf, // output buffer
1686 buf ? n : 0, // and its size
1687 NULL, // default "replacement" char
1688 pUsedDef // [out] was it used?
1689 );
1690
1691 if ( !len )
1692 {
1693 // function totally failed
1694 return (size_t)-1;
1695 }
1696
1697 // if we were really converting, check if we succeeded
1698 if ( buf )
1699 {
1700 if ( flags )
1701 {
1702 // check if the conversion failed, i.e. if any replacements
1703 // were done
1704 if ( usedDef )
1705 return (size_t)-1;
1706 }
1707 else // we must resort to double tripping...
1708 {
1709 wxWCharBuffer wcBuf(n);
1710 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1711 wcscmp(wcBuf, pwz) != 0 )
1712 {
1713 // we didn't obtain the same thing we started from, hence
1714 // the conversion was lossy and we consider that it failed
1715 return (size_t)-1;
1716 }
1717 }
1718 }
1719
1720 // see the comment above for the reason of "len - 1"
1721 return len - 1;
1722 }
1723
1724 bool IsOk() const { return m_CodePage != -1; }
1725
1726 private:
1727 static bool CanUseNoBestFit()
1728 {
1729 static int s_isWin98Or2k = -1;
1730
1731 if ( s_isWin98Or2k == -1 )
1732 {
1733 int verMaj, verMin;
1734 switch ( wxGetOsVersion(&verMaj, &verMin) )
1735 {
1736 case wxWIN95:
1737 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1738 break;
1739
1740 case wxWINDOWS_NT:
1741 s_isWin98Or2k = verMaj >= 5;
1742 break;
1743
1744 default:
1745 // unknown, be conseravtive by default
1746 s_isWin98Or2k = 0;
1747 }
1748
1749 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1750 }
1751
1752 return s_isWin98Or2k == 1;
1753 }
1754
1755 long m_CodePage;
1756 };
1757
1758 #endif // wxHAVE_WIN32_MB2WC
1759
1760 // ============================================================================
1761 // Cocoa conversion classes
1762 // ============================================================================
1763
1764 #if defined(__WXCOCOA__)
1765
1766 // RN: There is no UTF-32 support in either Core Foundation or
1767 // Cocoa. Strangely enough, internally Core Foundation uses
1768 // UTF 32 internally quite a bit - its just not public (yet).
1769
1770 #include <CoreFoundation/CFString.h>
1771 #include <CoreFoundation/CFStringEncodingExt.h>
1772
1773 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1774 {
1775 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1776 if ( encoding == wxFONTENCODING_DEFAULT )
1777 {
1778 enc = CFStringGetSystemEncoding();
1779 }
1780 else switch( encoding)
1781 {
1782 case wxFONTENCODING_ISO8859_1 :
1783 enc = kCFStringEncodingISOLatin1 ;
1784 break ;
1785 case wxFONTENCODING_ISO8859_2 :
1786 enc = kCFStringEncodingISOLatin2;
1787 break ;
1788 case wxFONTENCODING_ISO8859_3 :
1789 enc = kCFStringEncodingISOLatin3 ;
1790 break ;
1791 case wxFONTENCODING_ISO8859_4 :
1792 enc = kCFStringEncodingISOLatin4;
1793 break ;
1794 case wxFONTENCODING_ISO8859_5 :
1795 enc = kCFStringEncodingISOLatinCyrillic;
1796 break ;
1797 case wxFONTENCODING_ISO8859_6 :
1798 enc = kCFStringEncodingISOLatinArabic;
1799 break ;
1800 case wxFONTENCODING_ISO8859_7 :
1801 enc = kCFStringEncodingISOLatinGreek;
1802 break ;
1803 case wxFONTENCODING_ISO8859_8 :
1804 enc = kCFStringEncodingISOLatinHebrew;
1805 break ;
1806 case wxFONTENCODING_ISO8859_9 :
1807 enc = kCFStringEncodingISOLatin5;
1808 break ;
1809 case wxFONTENCODING_ISO8859_10 :
1810 enc = kCFStringEncodingISOLatin6;
1811 break ;
1812 case wxFONTENCODING_ISO8859_11 :
1813 enc = kCFStringEncodingISOLatinThai;
1814 break ;
1815 case wxFONTENCODING_ISO8859_13 :
1816 enc = kCFStringEncodingISOLatin7;
1817 break ;
1818 case wxFONTENCODING_ISO8859_14 :
1819 enc = kCFStringEncodingISOLatin8;
1820 break ;
1821 case wxFONTENCODING_ISO8859_15 :
1822 enc = kCFStringEncodingISOLatin9;
1823 break ;
1824
1825 case wxFONTENCODING_KOI8 :
1826 enc = kCFStringEncodingKOI8_R;
1827 break ;
1828 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1829 enc = kCFStringEncodingDOSRussian;
1830 break ;
1831
1832 // case wxFONTENCODING_BULGARIAN :
1833 // enc = ;
1834 // break ;
1835
1836 case wxFONTENCODING_CP437 :
1837 enc =kCFStringEncodingDOSLatinUS ;
1838 break ;
1839 case wxFONTENCODING_CP850 :
1840 enc = kCFStringEncodingDOSLatin1;
1841 break ;
1842 case wxFONTENCODING_CP852 :
1843 enc = kCFStringEncodingDOSLatin2;
1844 break ;
1845 case wxFONTENCODING_CP855 :
1846 enc = kCFStringEncodingDOSCyrillic;
1847 break ;
1848 case wxFONTENCODING_CP866 :
1849 enc =kCFStringEncodingDOSRussian ;
1850 break ;
1851 case wxFONTENCODING_CP874 :
1852 enc = kCFStringEncodingDOSThai;
1853 break ;
1854 case wxFONTENCODING_CP932 :
1855 enc = kCFStringEncodingDOSJapanese;
1856 break ;
1857 case wxFONTENCODING_CP936 :
1858 enc =kCFStringEncodingDOSChineseSimplif ;
1859 break ;
1860 case wxFONTENCODING_CP949 :
1861 enc = kCFStringEncodingDOSKorean;
1862 break ;
1863 case wxFONTENCODING_CP950 :
1864 enc = kCFStringEncodingDOSChineseTrad;
1865 break ;
1866 case wxFONTENCODING_CP1250 :
1867 enc = kCFStringEncodingWindowsLatin2;
1868 break ;
1869 case wxFONTENCODING_CP1251 :
1870 enc =kCFStringEncodingWindowsCyrillic ;
1871 break ;
1872 case wxFONTENCODING_CP1252 :
1873 enc =kCFStringEncodingWindowsLatin1 ;
1874 break ;
1875 case wxFONTENCODING_CP1253 :
1876 enc = kCFStringEncodingWindowsGreek;
1877 break ;
1878 case wxFONTENCODING_CP1254 :
1879 enc = kCFStringEncodingWindowsLatin5;
1880 break ;
1881 case wxFONTENCODING_CP1255 :
1882 enc =kCFStringEncodingWindowsHebrew ;
1883 break ;
1884 case wxFONTENCODING_CP1256 :
1885 enc =kCFStringEncodingWindowsArabic ;
1886 break ;
1887 case wxFONTENCODING_CP1257 :
1888 enc = kCFStringEncodingWindowsBalticRim;
1889 break ;
1890 // This only really encodes to UTF7 (if that) evidently
1891 // case wxFONTENCODING_UTF7 :
1892 // enc = kCFStringEncodingNonLossyASCII ;
1893 // break ;
1894 case wxFONTENCODING_UTF8 :
1895 enc = kCFStringEncodingUTF8 ;
1896 break ;
1897 case wxFONTENCODING_EUC_JP :
1898 enc = kCFStringEncodingEUC_JP;
1899 break ;
1900 case wxFONTENCODING_UTF16 :
1901 enc = kCFStringEncodingUnicode ;
1902 break ;
1903 case wxFONTENCODING_MACROMAN :
1904 enc = kCFStringEncodingMacRoman ;
1905 break ;
1906 case wxFONTENCODING_MACJAPANESE :
1907 enc = kCFStringEncodingMacJapanese ;
1908 break ;
1909 case wxFONTENCODING_MACCHINESETRAD :
1910 enc = kCFStringEncodingMacChineseTrad ;
1911 break ;
1912 case wxFONTENCODING_MACKOREAN :
1913 enc = kCFStringEncodingMacKorean ;
1914 break ;
1915 case wxFONTENCODING_MACARABIC :
1916 enc = kCFStringEncodingMacArabic ;
1917 break ;
1918 case wxFONTENCODING_MACHEBREW :
1919 enc = kCFStringEncodingMacHebrew ;
1920 break ;
1921 case wxFONTENCODING_MACGREEK :
1922 enc = kCFStringEncodingMacGreek ;
1923 break ;
1924 case wxFONTENCODING_MACCYRILLIC :
1925 enc = kCFStringEncodingMacCyrillic ;
1926 break ;
1927 case wxFONTENCODING_MACDEVANAGARI :
1928 enc = kCFStringEncodingMacDevanagari ;
1929 break ;
1930 case wxFONTENCODING_MACGURMUKHI :
1931 enc = kCFStringEncodingMacGurmukhi ;
1932 break ;
1933 case wxFONTENCODING_MACGUJARATI :
1934 enc = kCFStringEncodingMacGujarati ;
1935 break ;
1936 case wxFONTENCODING_MACORIYA :
1937 enc = kCFStringEncodingMacOriya ;
1938 break ;
1939 case wxFONTENCODING_MACBENGALI :
1940 enc = kCFStringEncodingMacBengali ;
1941 break ;
1942 case wxFONTENCODING_MACTAMIL :
1943 enc = kCFStringEncodingMacTamil ;
1944 break ;
1945 case wxFONTENCODING_MACTELUGU :
1946 enc = kCFStringEncodingMacTelugu ;
1947 break ;
1948 case wxFONTENCODING_MACKANNADA :
1949 enc = kCFStringEncodingMacKannada ;
1950 break ;
1951 case wxFONTENCODING_MACMALAJALAM :
1952 enc = kCFStringEncodingMacMalayalam ;
1953 break ;
1954 case wxFONTENCODING_MACSINHALESE :
1955 enc = kCFStringEncodingMacSinhalese ;
1956 break ;
1957 case wxFONTENCODING_MACBURMESE :
1958 enc = kCFStringEncodingMacBurmese ;
1959 break ;
1960 case wxFONTENCODING_MACKHMER :
1961 enc = kCFStringEncodingMacKhmer ;
1962 break ;
1963 case wxFONTENCODING_MACTHAI :
1964 enc = kCFStringEncodingMacThai ;
1965 break ;
1966 case wxFONTENCODING_MACLAOTIAN :
1967 enc = kCFStringEncodingMacLaotian ;
1968 break ;
1969 case wxFONTENCODING_MACGEORGIAN :
1970 enc = kCFStringEncodingMacGeorgian ;
1971 break ;
1972 case wxFONTENCODING_MACARMENIAN :
1973 enc = kCFStringEncodingMacArmenian ;
1974 break ;
1975 case wxFONTENCODING_MACCHINESESIMP :
1976 enc = kCFStringEncodingMacChineseSimp ;
1977 break ;
1978 case wxFONTENCODING_MACTIBETAN :
1979 enc = kCFStringEncodingMacTibetan ;
1980 break ;
1981 case wxFONTENCODING_MACMONGOLIAN :
1982 enc = kCFStringEncodingMacMongolian ;
1983 break ;
1984 case wxFONTENCODING_MACETHIOPIC :
1985 enc = kCFStringEncodingMacEthiopic ;
1986 break ;
1987 case wxFONTENCODING_MACCENTRALEUR :
1988 enc = kCFStringEncodingMacCentralEurRoman ;
1989 break ;
1990 case wxFONTENCODING_MACVIATNAMESE :
1991 enc = kCFStringEncodingMacVietnamese ;
1992 break ;
1993 case wxFONTENCODING_MACARABICEXT :
1994 enc = kCFStringEncodingMacExtArabic ;
1995 break ;
1996 case wxFONTENCODING_MACSYMBOL :
1997 enc = kCFStringEncodingMacSymbol ;
1998 break ;
1999 case wxFONTENCODING_MACDINGBATS :
2000 enc = kCFStringEncodingMacDingbats ;
2001 break ;
2002 case wxFONTENCODING_MACTURKISH :
2003 enc = kCFStringEncodingMacTurkish ;
2004 break ;
2005 case wxFONTENCODING_MACCROATIAN :
2006 enc = kCFStringEncodingMacCroatian ;
2007 break ;
2008 case wxFONTENCODING_MACICELANDIC :
2009 enc = kCFStringEncodingMacIcelandic ;
2010 break ;
2011 case wxFONTENCODING_MACROMANIAN :
2012 enc = kCFStringEncodingMacRomanian ;
2013 break ;
2014 case wxFONTENCODING_MACCELTIC :
2015 enc = kCFStringEncodingMacCeltic ;
2016 break ;
2017 case wxFONTENCODING_MACGAELIC :
2018 enc = kCFStringEncodingMacGaelic ;
2019 break ;
2020 // case wxFONTENCODING_MACKEYBOARD :
2021 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2022 // break ;
2023 default :
2024 // because gcc is picky
2025 break ;
2026 } ;
2027 return enc ;
2028 }
2029
2030 class wxMBConv_cocoa : public wxMBConv
2031 {
2032 public:
2033 wxMBConv_cocoa()
2034 {
2035 Init(CFStringGetSystemEncoding()) ;
2036 }
2037
2038 #if wxUSE_FONTMAP
2039 wxMBConv_cocoa(const wxChar* name)
2040 {
2041 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2042 }
2043 #endif
2044
2045 wxMBConv_cocoa(wxFontEncoding encoding)
2046 {
2047 Init( wxCFStringEncFromFontEnc(encoding) );
2048 }
2049
2050 ~wxMBConv_cocoa()
2051 {
2052 }
2053
2054 void Init( CFStringEncoding encoding)
2055 {
2056 m_encoding = encoding ;
2057 }
2058
2059 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2060 {
2061 wxASSERT(szUnConv);
2062
2063 CFStringRef theString = CFStringCreateWithBytes (
2064 NULL, //the allocator
2065 (const UInt8*)szUnConv,
2066 strlen(szUnConv),
2067 m_encoding,
2068 false //no BOM/external representation
2069 );
2070
2071 wxASSERT(theString);
2072
2073 size_t nOutLength = CFStringGetLength(theString);
2074
2075 if (szOut == NULL)
2076 {
2077 CFRelease(theString);
2078 return nOutLength;
2079 }
2080
2081 CFRange theRange = { 0, nOutSize };
2082
2083 #if SIZEOF_WCHAR_T == 4
2084 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2085 #endif
2086
2087 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2088
2089 CFRelease(theString);
2090
2091 szUniCharBuffer[nOutLength] = '\0' ;
2092
2093 #if SIZEOF_WCHAR_T == 4
2094 wxMBConvUTF16 converter ;
2095 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2096 delete[] szUniCharBuffer;
2097 #endif
2098
2099 return nOutLength;
2100 }
2101
2102 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2103 {
2104 wxASSERT(szUnConv);
2105
2106 size_t nRealOutSize;
2107 size_t nBufSize = wxWcslen(szUnConv);
2108 UniChar* szUniBuffer = (UniChar*) szUnConv;
2109
2110 #if SIZEOF_WCHAR_T == 4
2111 wxMBConvUTF16BE converter ;
2112 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2113 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2114 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2115 nBufSize /= sizeof(UniChar);
2116 #endif
2117
2118 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2119 NULL, //allocator
2120 szUniBuffer,
2121 nBufSize,
2122 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2123 );
2124
2125 wxASSERT(theString);
2126
2127 //Note that CER puts a BOM when converting to unicode
2128 //so we check and use getchars instead in that case
2129 if (m_encoding == kCFStringEncodingUnicode)
2130 {
2131 if (szOut != NULL)
2132 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2133
2134 nRealOutSize = CFStringGetLength(theString) + 1;
2135 }
2136 else
2137 {
2138 CFStringGetBytes(
2139 theString,
2140 CFRangeMake(0, CFStringGetLength(theString)),
2141 m_encoding,
2142 0, //what to put in characters that can't be converted -
2143 //0 tells CFString to return NULL if it meets such a character
2144 false, //not an external representation
2145 (UInt8*) szOut,
2146 nOutSize,
2147 (CFIndex*) &nRealOutSize
2148 );
2149 }
2150
2151 CFRelease(theString);
2152
2153 #if SIZEOF_WCHAR_T == 4
2154 delete[] szUniBuffer;
2155 #endif
2156
2157 return nRealOutSize - 1;
2158 }
2159
2160 bool IsOk() const
2161 {
2162 return m_encoding != kCFStringEncodingInvalidId &&
2163 CFStringIsEncodingAvailable(m_encoding);
2164 }
2165
2166 private:
2167 CFStringEncoding m_encoding ;
2168 };
2169
2170 #endif // defined(__WXCOCOA__)
2171
2172 // ============================================================================
2173 // Mac conversion classes
2174 // ============================================================================
2175
2176 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2177
2178 class wxMBConv_mac : public wxMBConv
2179 {
2180 public:
2181 wxMBConv_mac()
2182 {
2183 Init(CFStringGetSystemEncoding()) ;
2184 }
2185
2186 #if wxUSE_FONTMAP
2187 wxMBConv_mac(const wxChar* name)
2188 {
2189 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2190 }
2191 #endif
2192
2193 wxMBConv_mac(wxFontEncoding encoding)
2194 {
2195 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2196 }
2197
2198 ~wxMBConv_mac()
2199 {
2200 OSStatus status = noErr ;
2201 status = TECDisposeConverter(m_MB2WC_converter);
2202 status = TECDisposeConverter(m_WC2MB_converter);
2203 }
2204
2205
2206 void Init( TextEncodingBase encoding)
2207 {
2208 OSStatus status = noErr ;
2209 m_char_encoding = encoding ;
2210 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2211
2212 status = TECCreateConverter(&m_MB2WC_converter,
2213 m_char_encoding,
2214 m_unicode_encoding);
2215 status = TECCreateConverter(&m_WC2MB_converter,
2216 m_unicode_encoding,
2217 m_char_encoding);
2218 }
2219
2220 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2221 {
2222 OSStatus status = noErr ;
2223 ByteCount byteOutLen ;
2224 ByteCount byteInLen = strlen(psz) ;
2225 wchar_t *tbuf = NULL ;
2226 UniChar* ubuf = NULL ;
2227 size_t res = 0 ;
2228
2229 if (buf == NULL)
2230 {
2231 //apple specs say at least 32
2232 n = wxMax( 32 , byteInLen ) ;
2233 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2234 }
2235 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2236 #if SIZEOF_WCHAR_T == 4
2237 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2238 #else
2239 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2240 #endif
2241 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2242 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2243 #if SIZEOF_WCHAR_T == 4
2244 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2245 // is not properly terminated we get random characters at the end
2246 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2247 wxMBConvUTF16BE converter ;
2248 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2249 free( ubuf ) ;
2250 #else
2251 res = byteOutLen / sizeof( UniChar ) ;
2252 #endif
2253 if ( buf == NULL )
2254 free(tbuf) ;
2255
2256 if ( buf && res < n)
2257 buf[res] = 0;
2258
2259 return res ;
2260 }
2261
2262 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2263 {
2264 OSStatus status = noErr ;
2265 ByteCount byteOutLen ;
2266 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2267
2268 char *tbuf = NULL ;
2269
2270 if (buf == NULL)
2271 {
2272 //apple specs say at least 32
2273 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2274 tbuf = (char*) malloc( n ) ;
2275 }
2276
2277 ByteCount byteBufferLen = n ;
2278 UniChar* ubuf = NULL ;
2279 #if SIZEOF_WCHAR_T == 4
2280 wxMBConvUTF16BE converter ;
2281 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2282 byteInLen = unicharlen ;
2283 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2284 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2285 #else
2286 ubuf = (UniChar*) psz ;
2287 #endif
2288 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2289 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2290 #if SIZEOF_WCHAR_T == 4
2291 free( ubuf ) ;
2292 #endif
2293 if ( buf == NULL )
2294 free(tbuf) ;
2295
2296 size_t res = byteOutLen ;
2297 if ( buf && res < n)
2298 {
2299 buf[res] = 0;
2300
2301 //we need to double-trip to verify it didn't insert any ? in place
2302 //of bogus characters
2303 wxWCharBuffer wcBuf(n);
2304 size_t pszlen = wxWcslen(psz);
2305 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2306 wxWcslen(wcBuf) != pszlen ||
2307 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2308 {
2309 // we didn't obtain the same thing we started from, hence
2310 // the conversion was lossy and we consider that it failed
2311 return (size_t)-1;
2312 }
2313 }
2314
2315 return res ;
2316 }
2317
2318 bool IsOk() const
2319 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2320
2321 private:
2322 TECObjectRef m_MB2WC_converter ;
2323 TECObjectRef m_WC2MB_converter ;
2324
2325 TextEncodingBase m_char_encoding ;
2326 TextEncodingBase m_unicode_encoding ;
2327 };
2328
2329 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2330
2331 // ============================================================================
2332 // wxEncodingConverter based conversion classes
2333 // ============================================================================
2334
2335 #if wxUSE_FONTMAP
2336
2337 class wxMBConv_wxwin : public wxMBConv
2338 {
2339 private:
2340 void Init()
2341 {
2342 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2343 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2344 }
2345
2346 public:
2347 // temporarily just use wxEncodingConverter stuff,
2348 // so that it works while a better implementation is built
2349 wxMBConv_wxwin(const wxChar* name)
2350 {
2351 if (name)
2352 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2353 else
2354 m_enc = wxFONTENCODING_SYSTEM;
2355
2356 Init();
2357 }
2358
2359 wxMBConv_wxwin(wxFontEncoding enc)
2360 {
2361 m_enc = enc;
2362
2363 Init();
2364 }
2365
2366 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2367 {
2368 size_t inbuf = strlen(psz);
2369 if (buf)
2370 {
2371 if (!m2w.Convert(psz,buf))
2372 return (size_t)-1;
2373 }
2374 return inbuf;
2375 }
2376
2377 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2378 {
2379 const size_t inbuf = wxWcslen(psz);
2380 if (buf)
2381 {
2382 if (!w2m.Convert(psz,buf))
2383 return (size_t)-1;
2384 }
2385
2386 return inbuf;
2387 }
2388
2389 bool IsOk() const { return m_ok; }
2390
2391 public:
2392 wxFontEncoding m_enc;
2393 wxEncodingConverter m2w, w2m;
2394
2395 // were we initialized successfully?
2396 bool m_ok;
2397
2398 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2399 };
2400
2401 #endif // wxUSE_FONTMAP
2402
2403 // ============================================================================
2404 // wxCSConv implementation
2405 // ============================================================================
2406
2407 void wxCSConv::Init()
2408 {
2409 m_name = NULL;
2410 m_convReal = NULL;
2411 m_deferred = true;
2412 }
2413
2414 wxCSConv::wxCSConv(const wxChar *charset)
2415 {
2416 Init();
2417
2418 if ( charset )
2419 {
2420 SetName(charset);
2421 }
2422
2423 m_encoding = wxFONTENCODING_SYSTEM;
2424 }
2425
2426 wxCSConv::wxCSConv(wxFontEncoding encoding)
2427 {
2428 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2429 {
2430 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2431
2432 encoding = wxFONTENCODING_SYSTEM;
2433 }
2434
2435 Init();
2436
2437 m_encoding = encoding;
2438 }
2439
2440 wxCSConv::~wxCSConv()
2441 {
2442 Clear();
2443 }
2444
2445 wxCSConv::wxCSConv(const wxCSConv& conv)
2446 : wxMBConv()
2447 {
2448 Init();
2449
2450 SetName(conv.m_name);
2451 m_encoding = conv.m_encoding;
2452 }
2453
2454 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2455 {
2456 Clear();
2457
2458 SetName(conv.m_name);
2459 m_encoding = conv.m_encoding;
2460
2461 return *this;
2462 }
2463
2464 void wxCSConv::Clear()
2465 {
2466 free(m_name);
2467 delete m_convReal;
2468
2469 m_name = NULL;
2470 m_convReal = NULL;
2471 }
2472
2473 void wxCSConv::SetName(const wxChar *charset)
2474 {
2475 if (charset)
2476 {
2477 m_name = wxStrdup(charset);
2478 m_deferred = true;
2479 }
2480 }
2481
2482 wxMBConv *wxCSConv::DoCreate() const
2483 {
2484 // check for the special case of ASCII or ISO8859-1 charset: as we have
2485 // special knowledge of it anyhow, we don't need to create a special
2486 // conversion object
2487 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2488 {
2489 // don't convert at all
2490 return NULL;
2491 }
2492
2493 // we trust OS to do conversion better than we can so try external
2494 // conversion methods first
2495 //
2496 // the full order is:
2497 // 1. OS conversion (iconv() under Unix or Win32 API)
2498 // 2. hard coded conversions for UTF
2499 // 3. wxEncodingConverter as fall back
2500
2501 // step (1)
2502 #ifdef HAVE_ICONV
2503 #if !wxUSE_FONTMAP
2504 if ( m_name )
2505 #endif // !wxUSE_FONTMAP
2506 {
2507 wxString name(m_name);
2508
2509 #if wxUSE_FONTMAP
2510 if ( name.empty() )
2511 name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2512 #endif // wxUSE_FONTMAP
2513
2514 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2515 if ( conv->IsOk() )
2516 return conv;
2517
2518 delete conv;
2519 }
2520 #endif // HAVE_ICONV
2521
2522 #ifdef wxHAVE_WIN32_MB2WC
2523 {
2524 #if wxUSE_FONTMAP
2525 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2526 : new wxMBConv_win32(m_encoding);
2527 if ( conv->IsOk() )
2528 return conv;
2529
2530 delete conv;
2531 #else
2532 return NULL;
2533 #endif
2534 }
2535 #endif // wxHAVE_WIN32_MB2WC
2536 #if defined(__WXMAC__)
2537 {
2538 // leave UTF16 and UTF32 to the built-ins of wx
2539 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2540 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2541 {
2542
2543 #if wxUSE_FONTMAP
2544 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2545 : new wxMBConv_mac(m_encoding);
2546 #else
2547 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2548 #endif
2549 if ( conv->IsOk() )
2550 return conv;
2551
2552 delete conv;
2553 }
2554 }
2555 #endif
2556 #if defined(__WXCOCOA__)
2557 {
2558 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2559 {
2560
2561 #if wxUSE_FONTMAP
2562 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2563 : new wxMBConv_cocoa(m_encoding);
2564 #else
2565 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2566 #endif
2567 if ( conv->IsOk() )
2568 return conv;
2569
2570 delete conv;
2571 }
2572 }
2573 #endif
2574 // step (2)
2575 wxFontEncoding enc = m_encoding;
2576 #if wxUSE_FONTMAP
2577 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2578 {
2579 // use "false" to suppress interactive dialogs -- we can be called from
2580 // anywhere and popping up a dialog from here is the last thing we want to
2581 // do
2582 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2583 }
2584 #endif // wxUSE_FONTMAP
2585
2586 switch ( enc )
2587 {
2588 case wxFONTENCODING_UTF7:
2589 return new wxMBConvUTF7;
2590
2591 case wxFONTENCODING_UTF8:
2592 return new wxMBConvUTF8;
2593
2594 case wxFONTENCODING_UTF16BE:
2595 return new wxMBConvUTF16BE;
2596
2597 case wxFONTENCODING_UTF16LE:
2598 return new wxMBConvUTF16LE;
2599
2600 case wxFONTENCODING_UTF32BE:
2601 return new wxMBConvUTF32BE;
2602
2603 case wxFONTENCODING_UTF32LE:
2604 return new wxMBConvUTF32LE;
2605
2606 default:
2607 // nothing to do but put here to suppress gcc warnings
2608 ;
2609 }
2610
2611 // step (3)
2612 #if wxUSE_FONTMAP
2613 {
2614 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2615 : new wxMBConv_wxwin(m_encoding);
2616 if ( conv->IsOk() )
2617 return conv;
2618
2619 delete conv;
2620 }
2621 #endif // wxUSE_FONTMAP
2622
2623 // NB: This is a hack to prevent deadlock. What could otherwise happen
2624 // in Unicode build: wxConvLocal creation ends up being here
2625 // because of some failure and logs the error. But wxLog will try to
2626 // attach timestamp, for which it will need wxConvLocal (to convert
2627 // time to char* and then wchar_t*), but that fails, tries to log
2628 // error, but wxLog has a (already locked) critical section that
2629 // guards static buffer.
2630 static bool alreadyLoggingError = false;
2631 if (!alreadyLoggingError)
2632 {
2633 alreadyLoggingError = true;
2634 wxLogError(_("Cannot convert from the charset '%s'!"),
2635 m_name ? m_name
2636 :
2637 #if wxUSE_FONTMAP
2638 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2639 #else // !wxUSE_FONTMAP
2640 wxString::Format(_("encoding %s"), m_encoding).c_str()
2641 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2642 );
2643 alreadyLoggingError = false;
2644 }
2645
2646 return NULL;
2647 }
2648
2649 void wxCSConv::CreateConvIfNeeded() const
2650 {
2651 if ( m_deferred )
2652 {
2653 wxCSConv *self = (wxCSConv *)this; // const_cast
2654
2655 #if wxUSE_INTL
2656 // if we don't have neither the name nor the encoding, use the default
2657 // encoding for this system
2658 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2659 {
2660 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2661 }
2662 #endif // wxUSE_INTL
2663
2664 self->m_convReal = DoCreate();
2665 self->m_deferred = false;
2666 }
2667 }
2668
2669 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2670 {
2671 CreateConvIfNeeded();
2672
2673 if (m_convReal)
2674 return m_convReal->MB2WC(buf, psz, n);
2675
2676 // latin-1 (direct)
2677 size_t len = strlen(psz);
2678
2679 if (buf)
2680 {
2681 for (size_t c = 0; c <= len; c++)
2682 buf[c] = (unsigned char)(psz[c]);
2683 }
2684
2685 return len;
2686 }
2687
2688 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2689 {
2690 CreateConvIfNeeded();
2691
2692 if (m_convReal)
2693 return m_convReal->WC2MB(buf, psz, n);
2694
2695 // latin-1 (direct)
2696 const size_t len = wxWcslen(psz);
2697 if (buf)
2698 {
2699 for (size_t c = 0; c <= len; c++)
2700 {
2701 if (psz[c] > 0xFF)
2702 return (size_t)-1;
2703 buf[c] = (char)psz[c];
2704 }
2705 }
2706 else
2707 {
2708 for (size_t c = 0; c <= len; c++)
2709 {
2710 if (psz[c] > 0xFF)
2711 return (size_t)-1;
2712 }
2713 }
2714
2715 return len;
2716 }
2717
2718 // ----------------------------------------------------------------------------
2719 // globals
2720 // ----------------------------------------------------------------------------
2721
2722 #ifdef __WINDOWS__
2723 static wxMBConv_win32 wxConvLibcObj;
2724 #elif defined(__WXMAC__) && !defined(__MACH__)
2725 static wxMBConv_mac wxConvLibcObj ;
2726 #else
2727 static wxMBConvLibc wxConvLibcObj;
2728 #endif
2729
2730 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2731 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2732 static wxMBConvUTF7 wxConvUTF7Obj;
2733 static wxMBConvUTF8 wxConvUTF8Obj;
2734 static wxConvBrokenFileNames wxConvBrokenFileNamesObj;
2735
2736 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2737 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2738 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2739 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2740 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2741 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2742 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2743 #ifdef __WXOSX__
2744 wxConvUTF8Obj;
2745 #elif __WXGTK20__
2746 wxConvBrokenFileNamesObj;
2747 #else
2748 wxConvLibcObj;
2749 #endif
2750
2751
2752 #else // !wxUSE_WCHAR_T
2753
2754 // stand-ins in absence of wchar_t
2755 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2756 wxConvISO8859_1,
2757 wxConvLocal,
2758 wxConvUTF8;
2759
2760 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2761
2762