]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
respect G_FILENAME_ENCODING in wxConvBrokenFileName and also use UTF-8 instead ASCII
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58 #ifdef HAVE_LANGINFO_H
59 #include <langinfo.h>
60 #endif
61
62 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
63 #define wxHAVE_WIN32_MB2WC
64 #endif // __WIN32__ but !__WXMICROWIN__
65
66 // ----------------------------------------------------------------------------
67 // headers
68 // ----------------------------------------------------------------------------
69
70 #ifdef __SALFORDC__
71 #include <clib.h>
72 #endif
73
74 #ifdef HAVE_ICONV
75 #include <iconv.h>
76 #include "wx/thread.h"
77 #endif
78
79 #include "wx/encconv.h"
80 #include "wx/fontmap.h"
81 #include "wx/utils.h"
82
83 #ifdef __WXMAC__
84 #include <ATSUnicode.h>
85 #include <TextCommon.h>
86 #include <TextEncodingConverter.h>
87
88 #include "wx/mac/private.h" // includes mac headers
89 #endif
90 // ----------------------------------------------------------------------------
91 // macros
92 // ----------------------------------------------------------------------------
93
// Swap the byte order of the first "len" elements of the array "str" in
// place. Both macros expand to a braced block (not an expression) declaring
// their own counter "_c".
#define BSWAP_UCS4(str, len) \
    { \
        unsigned _c = 0; \
        while (_c < len) \
        { \
            str[_c] = wxUINT32_SWAP_ALWAYS(str[_c]); \
            _c++; \
        } \
    }
#define BSWAP_UTF16(str, len) \
    { \
        unsigned _c = 0; \
        while (_c < len) \
        { \
            str[_c] = wxUINT16_SWAP_ALWAYS(str[_c]); \
            _c++; \
        } \
    }
96
97 #if SIZEOF_WCHAR_T == 4
98 #define WC_NAME "UCS4"
99 #define WC_BSWAP BSWAP_UCS4
100 #ifdef WORDS_BIGENDIAN
101 #define WC_NAME_BEST "UCS-4BE"
102 #else
103 #define WC_NAME_BEST "UCS-4LE"
104 #endif
105 #elif SIZEOF_WCHAR_T == 2
106 #define WC_NAME "UTF16"
107 #define WC_BSWAP BSWAP_UTF16
108 #define WC_UTF16
109 #ifdef WORDS_BIGENDIAN
110 #define WC_NAME_BEST "UTF-16BE"
111 #else
112 #define WC_NAME_BEST "UTF-16LE"
113 #endif
114 #else // sizeof(wchar_t) != 2 nor 4
115 // does this ever happen?
116 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
117 #endif
118
119 // ============================================================================
120 // implementation
121 // ============================================================================
122
123 // ----------------------------------------------------------------------------
124 // UTF-16 en/decoding to/from UCS-4
125 // ----------------------------------------------------------------------------
126
127
128 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
129 {
130 if (input<=0xffff)
131 {
132 if (output)
133 *output = (wxUint16) input;
134 return 1;
135 }
136 else if (input>=0x110000)
137 {
138 return (size_t)-1;
139 }
140 else
141 {
142 if (output)
143 {
144 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
145 *output = (wxUint16) ((input&0x3ff)+0xdc00);
146 }
147 return 2;
148 }
149 }
150
151 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
152 {
153 if ((*input<0xd800) || (*input>0xdfff))
154 {
155 output = *input;
156 return 1;
157 }
158 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
159 {
160 output = *input;
161 return (size_t)-1;
162 }
163 else
164 {
165 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
166 return 2;
167 }
168 }
169
170
171 // ----------------------------------------------------------------------------
172 // wxMBConv
173 // ----------------------------------------------------------------------------
174
175 wxMBConv::~wxMBConv()
176 {
177 // nothing to do here (necessary for Darwin linking probably)
178 }
179
180 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
181 {
182 if ( psz )
183 {
184 // calculate the length of the buffer needed first
185 size_t nLen = MB2WC(NULL, psz, 0);
186 if ( nLen != (size_t)-1 )
187 {
188 // now do the actual conversion
189 wxWCharBuffer buf(nLen);
190 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
191 if ( nLen != (size_t)-1 )
192 {
193 return buf;
194 }
195 }
196 }
197
198 wxWCharBuffer buf((wchar_t *)NULL);
199
200 return buf;
201 }
202
203 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
204 {
205 if ( pwz )
206 {
207 size_t nLen = WC2MB(NULL, pwz, 0);
208 if ( nLen != (size_t)-1 )
209 {
210 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
211 nLen = WC2MB(buf.data(), pwz, nLen + 4);
212 if ( nLen != (size_t)-1 )
213 {
214 return buf;
215 }
216 }
217 }
218
219 wxCharBuffer buf((char *)NULL);
220
221 return buf;
222 }
223
224 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
225 {
226 wxASSERT(pOutSize != NULL);
227
228 const char* szEnd = szString + nStringLen + 1;
229 const char* szPos = szString;
230 const char* szStart = szPos;
231
232 size_t nActualLength = 0;
233 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
234
235 wxWCharBuffer theBuffer(nCurrentSize);
236
237 //Convert the string until the length() is reached, continuing the
238 //loop every time a null character is reached
239 while(szPos != szEnd)
240 {
241 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
242
243 //Get the length of the current (sub)string
244 size_t nLen = MB2WC(NULL, szPos, 0);
245
246 //Invalid conversion?
247 if( nLen == (size_t)-1 )
248 {
249 *pOutSize = 0;
250 theBuffer.data()[0u] = wxT('\0');
251 return theBuffer;
252 }
253
254
255 //Increase the actual length (+1 for current null character)
256 nActualLength += nLen + 1;
257
258 //if buffer too big, realloc the buffer
259 if (nActualLength > (nCurrentSize+1))
260 {
261 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
262 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
263 theBuffer = theNewBuffer;
264 nCurrentSize <<= 1;
265 }
266
267 //Convert the current (sub)string
268 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
269 {
270 *pOutSize = 0;
271 theBuffer.data()[0u] = wxT('\0');
272 return theBuffer;
273 }
274
275 //Increment to next (sub)string
276 //Note that we have to use strlen here instead of nLen
277 //here because XX2XX gives us the size of the output buffer,
278 //not neccessarly the length of the string
279 szPos += strlen(szPos) + 1;
280 }
281
282 //success - return actual length and the buffer
283 *pOutSize = nActualLength;
284 return theBuffer;
285 }
286
287 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
288 {
289 wxASSERT(pOutSize != NULL);
290
291 const wchar_t* szEnd = szString + nStringLen + 1;
292 const wchar_t* szPos = szString;
293 const wchar_t* szStart = szPos;
294
295 size_t nActualLength = 0;
296 size_t nCurrentSize = nStringLen << 2; //try * 4 first
297
298 wxCharBuffer theBuffer(nCurrentSize);
299
300 //Convert the string until the length() is reached, continuing the
301 //loop every time a null character is reached
302 while(szPos != szEnd)
303 {
304 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
305
306 //Get the length of the current (sub)string
307 size_t nLen = WC2MB(NULL, szPos, 0);
308
309 //Invalid conversion?
310 if( nLen == (size_t)-1 )
311 {
312 *pOutSize = 0;
313 theBuffer.data()[0u] = wxT('\0');
314 return theBuffer;
315 }
316
317 //Increase the actual length (+1 for current null character)
318 nActualLength += nLen + 1;
319
320 //if buffer too big, realloc the buffer
321 if (nActualLength > (nCurrentSize+1))
322 {
323 wxCharBuffer theNewBuffer(nCurrentSize << 1);
324 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
325 theBuffer = theNewBuffer;
326 nCurrentSize <<= 1;
327 }
328
329 //Convert the current (sub)string
330 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
331 {
332 *pOutSize = 0;
333 theBuffer.data()[0u] = wxT('\0');
334 return theBuffer;
335 }
336
337 //Increment to next (sub)string
338 //Note that we have to use wxWcslen here instead of nLen
339 //here because XX2XX gives us the size of the output buffer,
340 //not neccessarly the length of the string
341 szPos += wxWcslen(szPos) + 1;
342 }
343
344 //success - return actual length and the buffer
345 *pOutSize = nActualLength;
346 return theBuffer;
347 }
348
349 // ----------------------------------------------------------------------------
350 // wxMBConvLibc
351 // ----------------------------------------------------------------------------
352
353 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
354 {
355 return wxMB2WC(buf, psz, n);
356 }
357
358 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
359 {
360 return wxWC2MB(buf, psz, n);
361 }
362
363 #ifdef __WXGTK20__
364
365 // ----------------------------------------------------------------------------
366 // wxConvBrokenFileNames is made for GTK2 in Unicode mode when
367 // files are accidentally written in an encoding which is not
368 // the system encoding. Typically, the system encoding will be
369 // UTF8 but there might be files stored in ISO8859-1 on disk.
370 // ----------------------------------------------------------------------------
371
372 class wxConvBrokenFileNames : public wxMBConv
373 {
374 public:
375 wxConvBrokenFileNames();
376 virtual ~wxConvBrokenFileNames() { delete m_conv; }
377
378 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
379 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
380
381 private:
382 // the conversion object we forward to
383 wxMBConv *m_conv;
384 };
385
386 wxConvBrokenFileNames::wxConvBrokenFileNames()
387 {
388 // decide which conversion to use for the file names
389
390 // (1) this variable exists for the sole purpose of specifying the encoding
391 // of the filenames for GTK+ programs, so use it if it is set
392 const wxChar *encName = wxGetenv(_T("G_FILENAME_ENCODING"));
393 if ( encName )
394 {
395 m_conv = new wxCSConv(encName);
396 }
397 else // no G_FILENAME_ENCODING
398 {
399 // (2) if a non default locale is set, assume that the user wants his
400 // filenames in this locale too
401 switch ( wxLocale::GetSystemEncoding() )
402 {
403 default:
404 m_conv = new wxMBConvLibc;
405 break;
406
407 // (3) finally use UTF-8 by default
408 case wxFONTENCODING_SYSTEM:
409 case wxFONTENCODING_UTF8:
410 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
411 break;
412 }
413 }
414 }
415
416 size_t
417 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
418 const char *psz,
419 size_t outputSize) const
420 {
421 return m_conv->MB2WC( outputBuf, psz, outputSize );
422 }
423
424 size_t
425 wxConvBrokenFileNames::WC2MB(char *outputBuf,
426 const wchar_t *psz,
427 size_t outputSize) const
428 {
429 return m_conv->WC2MB( outputBuf, psz, outputSize );
430 }
431
432 #endif // __WXGTK20__
433
434 // ----------------------------------------------------------------------------
435 // UTF-7
436 // ----------------------------------------------------------------------------
437
438 // Implementation (C) 2004 Fredrik Roubert
439
//
// BASE64 decoding table: indexed by the character code, gives the 6 bit
// value of the BASE64 digit or 0xff if the character is not a BASE64 digit
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x5f: 'A'..'Z'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x7f: 'a'..'z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
478
479 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
480 {
481 size_t len = 0;
482
483 while (*psz && ((!buf) || (len < n)))
484 {
485 unsigned char cc = *psz++;
486 if (cc != '+')
487 {
488 // plain ASCII char
489 if (buf)
490 *buf++ = cc;
491 len++;
492 }
493 else if (*psz == '-')
494 {
495 // encoded plus sign
496 if (buf)
497 *buf++ = cc;
498 len++;
499 psz++;
500 }
501 else
502 {
503 // BASE64 encoded string
504 bool lsb;
505 unsigned char c;
506 unsigned int d, l;
507 for (lsb = false, d = 0, l = 0;
508 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
509 {
510 d <<= 6;
511 d += cc;
512 for (l += 6; l >= 8; lsb = !lsb)
513 {
514 c = (unsigned char)((d >> (l -= 8)) % 256);
515 if (lsb)
516 {
517 if (buf)
518 *buf++ |= c;
519 len ++;
520 }
521 else
522 if (buf)
523 *buf = (wchar_t)(c << 8);
524 }
525 }
526 if (*psz == '-')
527 psz++;
528 }
529 }
530 if (buf && (len < n))
531 *buf = 0;
532 return len;
533 }
534
//
// BASE64 encoding table: gives the digit for each of the 64 6 bit values
//
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62, 63
    '+', '/'
};
549
//
// UTF-7 encoding table, indexed by the ASCII code of the character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    // 0x00 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3,
    3, 2, 2, 3, 3, 2, 3, 3,
    // 0x10 - 0x1f
    3, 3, 3, 3, 3, 3, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3,
    // 0x20 - 0x2f
    2, 1, 1, 1, 1, 1, 1, 0,
    0, 0, 1, 3, 0, 0, 0, 3,
    // 0x30 - 0x3f
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 1, 1, 1, 0,
    // 0x40 - 0x4f
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x50 - 0x5f
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 3, 1, 1, 1,
    // 0x60 - 0x6f
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x70 - 0x7f
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 1, 1, 3, 3
};
569
570 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
571 {
572
573
574 size_t len = 0;
575
576 while (*psz && ((!buf) || (len < n)))
577 {
578 wchar_t cc = *psz++;
579 if (cc < 0x80 && utf7encode[cc] < 1)
580 {
581 // plain ASCII char
582 if (buf)
583 *buf++ = (char)cc;
584 len++;
585 }
586 #ifndef WC_UTF16
587 else if (((wxUint32)cc) > 0xffff)
588 {
589 // no surrogate pair generation (yet?)
590 return (size_t)-1;
591 }
592 #endif
593 else
594 {
595 if (buf)
596 *buf++ = '+';
597 len++;
598 if (cc != '+')
599 {
600 // BASE64 encode string
601 unsigned int lsb, d, l;
602 for (d = 0, l = 0;; psz++)
603 {
604 for (lsb = 0; lsb < 2; lsb ++)
605 {
606 d <<= 8;
607 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
608
609 for (l += 8; l >= 6; )
610 {
611 l -= 6;
612 if (buf)
613 *buf++ = utf7enb64[(d >> l) % 64];
614 len++;
615 }
616 }
617 cc = *psz;
618 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
619 break;
620 }
621 if (l != 0)
622 {
623 if (buf)
624 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
625 len++;
626 }
627 }
628 if (buf)
629 *buf++ = '-';
630 len++;
631 }
632 }
633 if (buf && (len < n))
634 *buf = 0;
635 return len;
636 }
637
638 // ----------------------------------------------------------------------------
639 // UTF-8
640 // ----------------------------------------------------------------------------
641
642 static wxUint32 utf8_max[]=
643 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
644
645 // boundaries of the private use area we use to (temporarily) remap invalid
646 // characters invalid in a UTF-8 encoded string
647 const wxUint32 wxUnicodePUA = 0x100000;
648 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
649
650 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
651 {
652 size_t len = 0;
653
654 while (*psz && ((!buf) || (len < n)))
655 {
656 const char *opsz = psz;
657 bool invalid = false;
658 unsigned char cc = *psz++, fc = cc;
659 unsigned cnt;
660 for (cnt = 0; fc & 0x80; cnt++)
661 fc <<= 1;
662 if (!cnt)
663 {
664 // plain ASCII char
665 if (buf)
666 *buf++ = cc;
667 len++;
668 }
669 else
670 {
671 cnt--;
672 if (!cnt)
673 {
674 // invalid UTF-8 sequence
675 invalid = true;
676 }
677 else
678 {
679 unsigned ocnt = cnt - 1;
680 wxUint32 res = cc & (0x3f >> cnt);
681 while (cnt--)
682 {
683 cc = *psz;
684 if ((cc & 0xC0) != 0x80)
685 {
686 // invalid UTF-8 sequence
687 invalid = true;
688 break;
689 }
690 psz++;
691 res = (res << 6) | (cc & 0x3f);
692 }
693 if (invalid || res <= utf8_max[ocnt])
694 {
695 // illegal UTF-8 encoding
696 invalid = true;
697 }
698 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
699 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
700 {
701 // if one of our PUA characters turns up externally
702 // it must also be treated as an illegal sequence
703 // (a bit like you have to escape an escape character)
704 invalid = true;
705 }
706 else
707 {
708 #ifdef WC_UTF16
709 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
710 size_t pa = encode_utf16(res, (wxUint16 *)buf);
711 if (pa == (size_t)-1)
712 {
713 invalid = true;
714 }
715 else
716 {
717 if (buf)
718 buf += pa;
719 len += pa;
720 }
721 #else // !WC_UTF16
722 if (buf)
723 *buf++ = res;
724 len++;
725 #endif // WC_UTF16/!WC_UTF16
726 }
727 }
728 if (invalid)
729 {
730 if (m_options & MAP_INVALID_UTF8_TO_PUA)
731 {
732 while (opsz < psz && (!buf || len < n))
733 {
734 #ifdef WC_UTF16
735 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
736 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
737 wxASSERT(pa != (size_t)-1);
738 if (buf)
739 buf += pa;
740 opsz++;
741 len += pa;
742 #else
743 if (buf)
744 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
745 opsz++;
746 len++;
747 #endif
748 }
749 }
750 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
751 {
752 while (opsz < psz && (!buf || len < n))
753 {
754 if ( buf && len + 3 < n )
755 {
756 unsigned char n = *opsz;
757 *buf++ = L'\\';
758 *buf++ = L'0' + n / 0100;
759 *buf++ = L'0' + (n % 0100) / 010;
760 *buf++ = L'0' + n % 010;
761 }
762 opsz++;
763 len += 4;
764 }
765 }
766 else // MAP_INVALID_UTF8_NOT
767 {
768 return (size_t)-1;
769 }
770 }
771 }
772 }
773 if (buf && (len < n))
774 *buf = 0;
775 return len;
776 }
777
// Returns true if the character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
782
783 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
784 {
785 size_t len = 0;
786
787 while (*psz && ((!buf) || (len < n)))
788 {
789 wxUint32 cc;
790 #ifdef WC_UTF16
791 // cast is ok for WC_UTF16
792 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
793 psz += (pa == (size_t)-1) ? 1 : pa;
794 #else
795 cc=(*psz++) & 0x7fffffff;
796 #endif
797
798 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
799 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
800 {
801 if (buf)
802 *buf++ = (char)(cc - wxUnicodePUA);
803 len++;
804 }
805 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
806 cc == L'\\' &&
807 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
808 {
809 if (buf)
810 {
811 *buf++ = (char) (psz[0] - L'0')*0100 +
812 (psz[1] - L'0')*010 +
813 (psz[2] - L'0');
814 }
815
816 psz += 3;
817 len++;
818 }
819 else
820 {
821 unsigned cnt;
822 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
823 if (!cnt)
824 {
825 // plain ASCII char
826 if (buf)
827 *buf++ = (char) cc;
828 len++;
829 }
830
831 else
832 {
833 len += cnt + 1;
834 if (buf)
835 {
836 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
837 while (cnt--)
838 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
839 }
840 }
841 }
842 }
843
844 if (buf && (len<n))
845 *buf = 0;
846
847 return len;
848 }
849
850 // ----------------------------------------------------------------------------
851 // UTF-16
852 // ----------------------------------------------------------------------------
853
854 #ifdef WORDS_BIGENDIAN
855 #define wxMBConvUTF16straight wxMBConvUTF16BE
856 #define wxMBConvUTF16swap wxMBConvUTF16LE
857 #else
858 #define wxMBConvUTF16swap wxMBConvUTF16BE
859 #define wxMBConvUTF16straight wxMBConvUTF16LE
860 #endif
861
862
863 #ifdef WC_UTF16
864
865 // copy 16bit MB to 16bit String
866 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
867 {
868 size_t len=0;
869
870 while (*(wxUint16*)psz && (!buf || len < n))
871 {
872 if (buf)
873 *buf++ = *(wxUint16*)psz;
874 len++;
875
876 psz += sizeof(wxUint16);
877 }
878 if (buf && len<n) *buf=0;
879
880 return len;
881 }
882
883
884 // copy 16bit String to 16bit MB
885 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
886 {
887 size_t len=0;
888
889 while (*psz && (!buf || len < n))
890 {
891 if (buf)
892 {
893 *(wxUint16*)buf = *psz;
894 buf += sizeof(wxUint16);
895 }
896 len += sizeof(wxUint16);
897 psz++;
898 }
899 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
900
901 return len;
902 }
903
904
905 // swap 16bit MB to 16bit String
906 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
907 {
908 size_t len=0;
909
910 while (*(wxUint16*)psz && (!buf || len < n))
911 {
912 if (buf)
913 {
914 ((char *)buf)[0] = psz[1];
915 ((char *)buf)[1] = psz[0];
916 buf++;
917 }
918 len++;
919 psz += sizeof(wxUint16);
920 }
921 if (buf && len<n) *buf=0;
922
923 return len;
924 }
925
926
927 // swap 16bit MB to 16bit String
928 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
929 {
930 size_t len=0;
931
932 while (*psz && (!buf || len < n))
933 {
934 if (buf)
935 {
936 *buf++ = ((char*)psz)[1];
937 *buf++ = ((char*)psz)[0];
938 }
939 len += sizeof(wxUint16);
940 psz++;
941 }
942 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
943
944 return len;
945 }
946
947
948 #else // WC_UTF16
949
950
951 // copy 16bit MB to 32bit String
952 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
953 {
954 size_t len=0;
955
956 while (*(wxUint16*)psz && (!buf || len < n))
957 {
958 wxUint32 cc;
959 size_t pa=decode_utf16((wxUint16*)psz, cc);
960 if (pa == (size_t)-1)
961 return pa;
962
963 if (buf)
964 *buf++ = cc;
965 len++;
966 psz += pa * sizeof(wxUint16);
967 }
968 if (buf && len<n) *buf=0;
969
970 return len;
971 }
972
973
974 // copy 32bit String to 16bit MB
975 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
976 {
977 size_t len=0;
978
979 while (*psz && (!buf || len < n))
980 {
981 wxUint16 cc[2];
982 size_t pa=encode_utf16(*psz, cc);
983
984 if (pa == (size_t)-1)
985 return pa;
986
987 if (buf)
988 {
989 *(wxUint16*)buf = cc[0];
990 buf += sizeof(wxUint16);
991 if (pa > 1)
992 {
993 *(wxUint16*)buf = cc[1];
994 buf += sizeof(wxUint16);
995 }
996 }
997
998 len += pa*sizeof(wxUint16);
999 psz++;
1000 }
1001 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1002
1003 return len;
1004 }
1005
1006
1007 // swap 16bit MB to 32bit String
1008 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1009 {
1010 size_t len=0;
1011
1012 while (*(wxUint16*)psz && (!buf || len < n))
1013 {
1014 wxUint32 cc;
1015 char tmp[4];
1016 tmp[0]=psz[1]; tmp[1]=psz[0];
1017 tmp[2]=psz[3]; tmp[3]=psz[2];
1018
1019 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1020 if (pa == (size_t)-1)
1021 return pa;
1022
1023 if (buf)
1024 *buf++ = cc;
1025
1026 len++;
1027 psz += pa * sizeof(wxUint16);
1028 }
1029 if (buf && len<n) *buf=0;
1030
1031 return len;
1032 }
1033
1034
1035 // swap 32bit String to 16bit MB
1036 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1037 {
1038 size_t len=0;
1039
1040 while (*psz && (!buf || len < n))
1041 {
1042 wxUint16 cc[2];
1043 size_t pa=encode_utf16(*psz, cc);
1044
1045 if (pa == (size_t)-1)
1046 return pa;
1047
1048 if (buf)
1049 {
1050 *buf++ = ((char*)cc)[1];
1051 *buf++ = ((char*)cc)[0];
1052 if (pa > 1)
1053 {
1054 *buf++ = ((char*)cc)[3];
1055 *buf++ = ((char*)cc)[2];
1056 }
1057 }
1058
1059 len += pa*sizeof(wxUint16);
1060 psz++;
1061 }
1062 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1063
1064 return len;
1065 }
1066
1067 #endif // WC_UTF16
1068
1069
1070 // ----------------------------------------------------------------------------
1071 // UTF-32
1072 // ----------------------------------------------------------------------------
1073
1074 #ifdef WORDS_BIGENDIAN
1075 #define wxMBConvUTF32straight wxMBConvUTF32BE
1076 #define wxMBConvUTF32swap wxMBConvUTF32LE
1077 #else
1078 #define wxMBConvUTF32swap wxMBConvUTF32BE
1079 #define wxMBConvUTF32straight wxMBConvUTF32LE
1080 #endif
1081
1082
1083 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1084 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1085
1086
1087 #ifdef WC_UTF16
1088
1089 // copy 32bit MB to 16bit String
1090 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1091 {
1092 size_t len=0;
1093
1094 while (*(wxUint32*)psz && (!buf || len < n))
1095 {
1096 wxUint16 cc[2];
1097
1098 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1099 if (pa == (size_t)-1)
1100 return pa;
1101
1102 if (buf)
1103 {
1104 *buf++ = cc[0];
1105 if (pa > 1)
1106 *buf++ = cc[1];
1107 }
1108 len += pa;
1109 psz += sizeof(wxUint32);
1110 }
1111 if (buf && len<n) *buf=0;
1112
1113 return len;
1114 }
1115
1116
1117 // copy 16bit String to 32bit MB
1118 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1119 {
1120 size_t len=0;
1121
1122 while (*psz && (!buf || len < n))
1123 {
1124 wxUint32 cc;
1125
1126 // cast is ok for WC_UTF16
1127 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1128 if (pa == (size_t)-1)
1129 return pa;
1130
1131 if (buf)
1132 {
1133 *(wxUint32*)buf = cc;
1134 buf += sizeof(wxUint32);
1135 }
1136 len += sizeof(wxUint32);
1137 psz += pa;
1138 }
1139
1140 if (buf && len<=n-sizeof(wxUint32))
1141 *(wxUint32*)buf=0;
1142
1143 return len;
1144 }
1145
1146
1147
1148 // swap 32bit MB to 16bit String
1149 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1150 {
1151 size_t len=0;
1152
1153 while (*(wxUint32*)psz && (!buf || len < n))
1154 {
1155 char tmp[4];
1156 tmp[0] = psz[3]; tmp[1] = psz[2];
1157 tmp[2] = psz[1]; tmp[3] = psz[0];
1158
1159
1160 wxUint16 cc[2];
1161
1162 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1163 if (pa == (size_t)-1)
1164 return pa;
1165
1166 if (buf)
1167 {
1168 *buf++ = cc[0];
1169 if (pa > 1)
1170 *buf++ = cc[1];
1171 }
1172 len += pa;
1173 psz += sizeof(wxUint32);
1174 }
1175
1176 if (buf && len<n)
1177 *buf=0;
1178
1179 return len;
1180 }
1181
1182
1183 // swap 16bit String to 32bit MB
1184 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1185 {
1186 size_t len=0;
1187
1188 while (*psz && (!buf || len < n))
1189 {
1190 char cc[4];
1191
1192 // cast is ok for WC_UTF16
1193 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1194 if (pa == (size_t)-1)
1195 return pa;
1196
1197 if (buf)
1198 {
1199 *buf++ = cc[3];
1200 *buf++ = cc[2];
1201 *buf++ = cc[1];
1202 *buf++ = cc[0];
1203 }
1204 len += sizeof(wxUint32);
1205 psz += pa;
1206 }
1207
1208 if (buf && len<=n-sizeof(wxUint32))
1209 *(wxUint32*)buf=0;
1210
1211 return len;
1212 }
1213
1214 #else // WC_UTF16
1215
1216
1217 // copy 32bit MB to 32bit String
1218 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1219 {
1220 size_t len=0;
1221
1222 while (*(wxUint32*)psz && (!buf || len < n))
1223 {
1224 if (buf)
1225 *buf++ = *(wxUint32*)psz;
1226 len++;
1227 psz += sizeof(wxUint32);
1228 }
1229
1230 if (buf && len<n)
1231 *buf=0;
1232
1233 return len;
1234 }
1235
1236
1237 // copy 32bit String to 32bit MB
1238 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1239 {
1240 size_t len=0;
1241
1242 while (*psz && (!buf || len < n))
1243 {
1244 if (buf)
1245 {
1246 *(wxUint32*)buf = *psz;
1247 buf += sizeof(wxUint32);
1248 }
1249
1250 len += sizeof(wxUint32);
1251 psz++;
1252 }
1253
1254 if (buf && len<=n-sizeof(wxUint32))
1255 *(wxUint32*)buf=0;
1256
1257 return len;
1258 }
1259
1260
1261 // swap 32bit MB to 32bit String
1262 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1263 {
1264 size_t len=0;
1265
1266 while (*(wxUint32*)psz && (!buf || len < n))
1267 {
1268 if (buf)
1269 {
1270 ((char *)buf)[0] = psz[3];
1271 ((char *)buf)[1] = psz[2];
1272 ((char *)buf)[2] = psz[1];
1273 ((char *)buf)[3] = psz[0];
1274 buf++;
1275 }
1276 len++;
1277 psz += sizeof(wxUint32);
1278 }
1279
1280 if (buf && len<n)
1281 *buf=0;
1282
1283 return len;
1284 }
1285
1286
1287 // swap 32bit String to 32bit MB
1288 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1289 {
1290 size_t len=0;
1291
1292 while (*psz && (!buf || len < n))
1293 {
1294 if (buf)
1295 {
1296 *buf++ = ((char *)psz)[3];
1297 *buf++ = ((char *)psz)[2];
1298 *buf++ = ((char *)psz)[1];
1299 *buf++ = ((char *)psz)[0];
1300 }
1301 len += sizeof(wxUint32);
1302 psz++;
1303 }
1304
1305 if (buf && len<=n-sizeof(wxUint32))
1306 *(wxUint32*)buf=0;
1307
1308 return len;
1309 }
1310
1311
1312 #endif // WC_UTF16
1313
1314
1315 // ============================================================================
1316 // The classes doing conversion using the iconv_xxx() functions
1317 // ============================================================================
1318
1319 #ifdef HAVE_ICONV
1320
1321 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1322 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1323 // (unless there's yet another bug in glibc) the only case when iconv()
1324 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1325 // left in the input buffer -- when _real_ error occurs,
1326 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1327 // iconv() failure.
1328 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call really failed.
//   cres     value returned by iconv()
//   bufLeft  number of input bytes iconv() left unconsumed
// The macro parameters are parenthesised so that any expression can safely
// be passed as an argument.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the address of an input pointer to the type iconv() expects for its
// second parameter (whether it is const depends on the platform)
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1337
1338 // ----------------------------------------------------------------------------
1339 // wxMBConv_iconv: encapsulates an iconv character set
1340 // ----------------------------------------------------------------------------
1341
1342 class wxMBConv_iconv : public wxMBConv
1343 {
1344 public:
1345 wxMBConv_iconv(const wxChar *name);
1346 virtual ~wxMBConv_iconv();
1347
1348 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1349 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1350
1351 bool IsOk() const
1352 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1353
1354 protected:
1355 // the iconv handlers used to translate from multibyte to wide char and in
1356 // the other direction
1357 iconv_t m2w,
1358 w2m;
1359 #if wxUSE_THREADS
1360 // guards access to m2w and w2m objects
1361 wxMutex m_iconvMutex;
1362 #endif
1363
1364 private:
1365 // the name (for iconv_open()) of a wide char charset -- if none is
1366 // available on this machine, it will remain NULL
1367 static const char *ms_wcCharsetName;
1368
1369 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1370 // different endian-ness than the native one
1371 static bool ms_wcNeedsSwap;
1372 };
1373
1374 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1375 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1376
1377 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1378 {
1379 // Do it the hard way
1380 char cname[100];
1381 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1382 cname[i] = (char) name[i];
1383
1384 // check for charset that represents wchar_t:
1385 if (ms_wcCharsetName == NULL)
1386 {
1387 ms_wcNeedsSwap = false;
1388
1389 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1390 ms_wcCharsetName = WC_NAME_BEST;
1391 m2w = iconv_open(ms_wcCharsetName, cname);
1392
1393 if (m2w == (iconv_t)-1)
1394 {
1395 // try charset w/o bytesex info (e.g. "UCS4")
1396 // and check for bytesex ourselves:
1397 ms_wcCharsetName = WC_NAME;
1398 m2w = iconv_open(ms_wcCharsetName, cname);
1399
1400 // last bet, try if it knows WCHAR_T pseudo-charset
1401 if (m2w == (iconv_t)-1)
1402 {
1403 ms_wcCharsetName = "WCHAR_T";
1404 m2w = iconv_open(ms_wcCharsetName, cname);
1405 }
1406
1407 if (m2w != (iconv_t)-1)
1408 {
1409 char buf[2], *bufPtr;
1410 wchar_t wbuf[2], *wbufPtr;
1411 size_t insz, outsz;
1412 size_t res;
1413
1414 buf[0] = 'A';
1415 buf[1] = 0;
1416 wbuf[0] = 0;
1417 insz = 2;
1418 outsz = SIZEOF_WCHAR_T * 2;
1419 wbufPtr = wbuf;
1420 bufPtr = buf;
1421
1422 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1423 (char**)&wbufPtr, &outsz);
1424
1425 if (ICONV_FAILED(res, insz))
1426 {
1427 ms_wcCharsetName = NULL;
1428 wxLogLastError(wxT("iconv"));
1429 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1430 }
1431 else
1432 {
1433 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1434 }
1435 }
1436 else
1437 {
1438 ms_wcCharsetName = NULL;
1439
1440 // VS: we must not output an error here, since wxWidgets will safely
1441 // fall back to using wxEncodingConverter.
1442 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1443 //wxLogError(
1444 }
1445 }
1446 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1447 }
1448 else // we already have ms_wcCharsetName
1449 {
1450 m2w = iconv_open(ms_wcCharsetName, cname);
1451 }
1452
1453 // NB: don't ever pass NULL to iconv_open(), it may crash!
1454 if ( ms_wcCharsetName )
1455 {
1456 w2m = iconv_open( cname, ms_wcCharsetName);
1457 }
1458 else
1459 {
1460 w2m = (iconv_t)-1;
1461 }
1462 }
1463
1464 wxMBConv_iconv::~wxMBConv_iconv()
1465 {
1466 if ( m2w != (iconv_t)-1 )
1467 iconv_close(m2w);
1468 if ( w2m != (iconv_t)-1 )
1469 iconv_close(w2m);
1470 }
1471
1472 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1473 {
1474 #if wxUSE_THREADS
1475 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1476 // Unfortunately there is a couple of global wxCSConv objects such as
1477 // wxConvLocal that are used all over wx code, so we have to make sure
1478 // the handle is used by at most one thread at the time. Otherwise
1479 // only a few wx classes would be safe to use from non-main threads
1480 // as MB<->WC conversion would fail "randomly".
1481 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1482 #endif
1483
1484 size_t inbuf = strlen(psz);
1485 size_t outbuf = n * SIZEOF_WCHAR_T;
1486 size_t res, cres;
1487 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1488 wchar_t *bufPtr = buf;
1489 const char *pszPtr = psz;
1490
1491 if (buf)
1492 {
1493 // have destination buffer, convert there
1494 cres = iconv(m2w,
1495 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1496 (char**)&bufPtr, &outbuf);
1497 res = n - (outbuf / SIZEOF_WCHAR_T);
1498
1499 if (ms_wcNeedsSwap)
1500 {
1501 // convert to native endianness
1502 WC_BSWAP(buf /* _not_ bufPtr */, res)
1503 }
1504
1505 // NB: iconv was given only strlen(psz) characters on input, and so
1506 // it couldn't convert the trailing zero. Let's do it ourselves
1507 // if there's some room left for it in the output buffer.
1508 if (res < n)
1509 buf[res] = 0;
1510 }
1511 else
1512 {
1513 // no destination buffer... convert using temp buffer
1514 // to calculate destination buffer requirement
1515 wchar_t tbuf[8];
1516 res = 0;
1517 do {
1518 bufPtr = tbuf;
1519 outbuf = 8*SIZEOF_WCHAR_T;
1520
1521 cres = iconv(m2w,
1522 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1523 (char**)&bufPtr, &outbuf );
1524
1525 res += 8-(outbuf/SIZEOF_WCHAR_T);
1526 } while ((cres==(size_t)-1) && (errno==E2BIG));
1527 }
1528
1529 if (ICONV_FAILED(cres, inbuf))
1530 {
1531 //VS: it is ok if iconv fails, hence trace only
1532 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1533 return (size_t)-1;
1534 }
1535
1536 return res;
1537 }
1538
1539 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1540 {
1541 #if wxUSE_THREADS
1542 // NB: explained in MB2WC
1543 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1544 #endif
1545
1546 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1547 size_t outbuf = n;
1548 size_t res, cres;
1549
1550 wchar_t *tmpbuf = 0;
1551
1552 if (ms_wcNeedsSwap)
1553 {
1554 // need to copy to temp buffer to switch endianness
1555 // this absolutely doesn't rock!
1556 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1557 // could be in read-only memory, or be accessed in some other thread)
1558 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1559 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1560 WC_BSWAP(tmpbuf, inbuf)
1561 psz=tmpbuf;
1562 }
1563
1564 if (buf)
1565 {
1566 // have destination buffer, convert there
1567 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1568
1569 res = n-outbuf;
1570
1571 // NB: iconv was given only wcslen(psz) characters on input, and so
1572 // it couldn't convert the trailing zero. Let's do it ourselves
1573 // if there's some room left for it in the output buffer.
1574 if (res < n)
1575 buf[0] = 0;
1576 }
1577 else
1578 {
1579 // no destination buffer... convert using temp buffer
1580 // to calculate destination buffer requirement
1581 char tbuf[16];
1582 res = 0;
1583 do {
1584 buf = tbuf; outbuf = 16;
1585
1586 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1587
1588 res += 16 - outbuf;
1589 } while ((cres==(size_t)-1) && (errno==E2BIG));
1590 }
1591
1592 if (ms_wcNeedsSwap)
1593 {
1594 free(tmpbuf);
1595 }
1596
1597 if (ICONV_FAILED(cres, inbuf))
1598 {
1599 //VS: it is ok if iconv fails, hence trace only
1600 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1601 return (size_t)-1;
1602 }
1603
1604 return res;
1605 }
1606
1607 #endif // HAVE_ICONV
1608
1609
1610 // ============================================================================
1611 // Win32 conversion classes
1612 // ============================================================================
1613
1614 #ifdef wxHAVE_WIN32_MB2WC
1615
1616 // from utils.cpp
1617 #if wxUSE_FONTMAP
1618 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1619 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1620 #endif
1621
1622 class wxMBConv_win32 : public wxMBConv
1623 {
1624 public:
1625 wxMBConv_win32()
1626 {
1627 m_CodePage = CP_ACP;
1628 }
1629
1630 #if wxUSE_FONTMAP
1631 wxMBConv_win32(const wxChar* name)
1632 {
1633 m_CodePage = wxCharsetToCodepage(name);
1634 }
1635
1636 wxMBConv_win32(wxFontEncoding encoding)
1637 {
1638 m_CodePage = wxEncodingToCodepage(encoding);
1639 }
1640 #endif
1641
1642 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1643 {
1644 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1645 // the behaviour is not compatible with the Unix version (using iconv)
1646 // and break the library itself, e.g. wxTextInputStream::NextChar()
1647 // wouldn't work if reading an incomplete MB char didn't result in an
1648 // error
1649 //
1650 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1651 // an error (tested under Windows Server 2003) and apparently it is
1652 // done on purpose, i.e. the function accepts any input in this case
1653 // and although I'd prefer to return error on ill-formed output, our
1654 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1655 // explicitly ill-formed according to RFC 2152) neither so we don't
1656 // even have any fallback here...
1657 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1658
1659 const size_t len = ::MultiByteToWideChar
1660 (
1661 m_CodePage, // code page
1662 flags, // flags: fall on error
1663 psz, // input string
1664 -1, // its length (NUL-terminated)
1665 buf, // output string
1666 buf ? n : 0 // size of output buffer
1667 );
1668
1669 // note that it returns count of written chars for buf != NULL and size
1670 // of the needed buffer for buf == NULL so in either case the length of
1671 // the string (which never includes the terminating NUL) is one less
1672 return len ? len - 1 : (size_t)-1;
1673 }
1674
1675 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1676 {
1677 /*
1678 we have a problem here: by default, WideCharToMultiByte() may
1679 replace characters unrepresentable in the target code page with bad
1680 quality approximations such as turning "1/2" symbol (U+00BD) into
1681 "1" for the code pages which don't have it and we, obviously, want
1682 to avoid this at any price
1683
1684 the trouble is that this function does it _silently_, i.e. it won't
1685 even tell us whether it did or not... Win98/2000 and higher provide
1686 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1687 we have to resort to a round trip, i.e. check that converting back
1688 results in the same string -- this is, of course, expensive but
1689 otherwise we simply can't be sure to not garble the data.
1690 */
1691
1692 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1693 // it doesn't work with CJK encodings (which we test for rather roughly
1694 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1695 // supporting it
1696 BOOL usedDef wxDUMMY_INITIALIZE(false);
1697 BOOL *pUsedDef;
1698 int flags;
1699 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1700 {
1701 // it's our lucky day
1702 flags = WC_NO_BEST_FIT_CHARS;
1703 pUsedDef = &usedDef;
1704 }
1705 else // old system or unsupported encoding
1706 {
1707 flags = 0;
1708 pUsedDef = NULL;
1709 }
1710
1711 const size_t len = ::WideCharToMultiByte
1712 (
1713 m_CodePage, // code page
1714 flags, // either none or no best fit
1715 pwz, // input string
1716 -1, // it is (wide) NUL-terminated
1717 buf, // output buffer
1718 buf ? n : 0, // and its size
1719 NULL, // default "replacement" char
1720 pUsedDef // [out] was it used?
1721 );
1722
1723 if ( !len )
1724 {
1725 // function totally failed
1726 return (size_t)-1;
1727 }
1728
1729 // if we were really converting, check if we succeeded
1730 if ( buf )
1731 {
1732 if ( flags )
1733 {
1734 // check if the conversion failed, i.e. if any replacements
1735 // were done
1736 if ( usedDef )
1737 return (size_t)-1;
1738 }
1739 else // we must resort to double tripping...
1740 {
1741 wxWCharBuffer wcBuf(n);
1742 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1743 wcscmp(wcBuf, pwz) != 0 )
1744 {
1745 // we didn't obtain the same thing we started from, hence
1746 // the conversion was lossy and we consider that it failed
1747 return (size_t)-1;
1748 }
1749 }
1750 }
1751
1752 // see the comment above for the reason of "len - 1"
1753 return len - 1;
1754 }
1755
1756 bool IsOk() const { return m_CodePage != -1; }
1757
1758 private:
1759 static bool CanUseNoBestFit()
1760 {
1761 static int s_isWin98Or2k = -1;
1762
1763 if ( s_isWin98Or2k == -1 )
1764 {
1765 int verMaj, verMin;
1766 switch ( wxGetOsVersion(&verMaj, &verMin) )
1767 {
1768 case wxWIN95:
1769 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1770 break;
1771
1772 case wxWINDOWS_NT:
1773 s_isWin98Or2k = verMaj >= 5;
1774 break;
1775
1776 default:
1777 // unknown, be conseravtive by default
1778 s_isWin98Or2k = 0;
1779 }
1780
1781 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1782 }
1783
1784 return s_isWin98Or2k == 1;
1785 }
1786
1787 long m_CodePage;
1788 };
1789
1790 #endif // wxHAVE_WIN32_MB2WC
1791
1792 // ============================================================================
1793 // Cocoa conversion classes
1794 // ============================================================================
1795
1796 #if defined(__WXCOCOA__)
1797
1798 // RN: There is no UTF-32 support in either Core Foundation or
1799 // Cocoa. Strangely enough, Core Foundation uses UTF-32
1800 // internally quite a bit - it's just not public (yet).
1801
1802 #include <CoreFoundation/CFString.h>
1803 #include <CoreFoundation/CFStringEncodingExt.h>
1804
1805 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1806 {
1807 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1808 if ( encoding == wxFONTENCODING_DEFAULT )
1809 {
1810 enc = CFStringGetSystemEncoding();
1811 }
1812 else switch( encoding)
1813 {
1814 case wxFONTENCODING_ISO8859_1 :
1815 enc = kCFStringEncodingISOLatin1 ;
1816 break ;
1817 case wxFONTENCODING_ISO8859_2 :
1818 enc = kCFStringEncodingISOLatin2;
1819 break ;
1820 case wxFONTENCODING_ISO8859_3 :
1821 enc = kCFStringEncodingISOLatin3 ;
1822 break ;
1823 case wxFONTENCODING_ISO8859_4 :
1824 enc = kCFStringEncodingISOLatin4;
1825 break ;
1826 case wxFONTENCODING_ISO8859_5 :
1827 enc = kCFStringEncodingISOLatinCyrillic;
1828 break ;
1829 case wxFONTENCODING_ISO8859_6 :
1830 enc = kCFStringEncodingISOLatinArabic;
1831 break ;
1832 case wxFONTENCODING_ISO8859_7 :
1833 enc = kCFStringEncodingISOLatinGreek;
1834 break ;
1835 case wxFONTENCODING_ISO8859_8 :
1836 enc = kCFStringEncodingISOLatinHebrew;
1837 break ;
1838 case wxFONTENCODING_ISO8859_9 :
1839 enc = kCFStringEncodingISOLatin5;
1840 break ;
1841 case wxFONTENCODING_ISO8859_10 :
1842 enc = kCFStringEncodingISOLatin6;
1843 break ;
1844 case wxFONTENCODING_ISO8859_11 :
1845 enc = kCFStringEncodingISOLatinThai;
1846 break ;
1847 case wxFONTENCODING_ISO8859_13 :
1848 enc = kCFStringEncodingISOLatin7;
1849 break ;
1850 case wxFONTENCODING_ISO8859_14 :
1851 enc = kCFStringEncodingISOLatin8;
1852 break ;
1853 case wxFONTENCODING_ISO8859_15 :
1854 enc = kCFStringEncodingISOLatin9;
1855 break ;
1856
1857 case wxFONTENCODING_KOI8 :
1858 enc = kCFStringEncodingKOI8_R;
1859 break ;
1860 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1861 enc = kCFStringEncodingDOSRussian;
1862 break ;
1863
1864 // case wxFONTENCODING_BULGARIAN :
1865 // enc = ;
1866 // break ;
1867
1868 case wxFONTENCODING_CP437 :
1869 enc =kCFStringEncodingDOSLatinUS ;
1870 break ;
1871 case wxFONTENCODING_CP850 :
1872 enc = kCFStringEncodingDOSLatin1;
1873 break ;
1874 case wxFONTENCODING_CP852 :
1875 enc = kCFStringEncodingDOSLatin2;
1876 break ;
1877 case wxFONTENCODING_CP855 :
1878 enc = kCFStringEncodingDOSCyrillic;
1879 break ;
1880 case wxFONTENCODING_CP866 :
1881 enc =kCFStringEncodingDOSRussian ;
1882 break ;
1883 case wxFONTENCODING_CP874 :
1884 enc = kCFStringEncodingDOSThai;
1885 break ;
1886 case wxFONTENCODING_CP932 :
1887 enc = kCFStringEncodingDOSJapanese;
1888 break ;
1889 case wxFONTENCODING_CP936 :
1890 enc =kCFStringEncodingDOSChineseSimplif ;
1891 break ;
1892 case wxFONTENCODING_CP949 :
1893 enc = kCFStringEncodingDOSKorean;
1894 break ;
1895 case wxFONTENCODING_CP950 :
1896 enc = kCFStringEncodingDOSChineseTrad;
1897 break ;
1898 case wxFONTENCODING_CP1250 :
1899 enc = kCFStringEncodingWindowsLatin2;
1900 break ;
1901 case wxFONTENCODING_CP1251 :
1902 enc =kCFStringEncodingWindowsCyrillic ;
1903 break ;
1904 case wxFONTENCODING_CP1252 :
1905 enc =kCFStringEncodingWindowsLatin1 ;
1906 break ;
1907 case wxFONTENCODING_CP1253 :
1908 enc = kCFStringEncodingWindowsGreek;
1909 break ;
1910 case wxFONTENCODING_CP1254 :
1911 enc = kCFStringEncodingWindowsLatin5;
1912 break ;
1913 case wxFONTENCODING_CP1255 :
1914 enc =kCFStringEncodingWindowsHebrew ;
1915 break ;
1916 case wxFONTENCODING_CP1256 :
1917 enc =kCFStringEncodingWindowsArabic ;
1918 break ;
1919 case wxFONTENCODING_CP1257 :
1920 enc = kCFStringEncodingWindowsBalticRim;
1921 break ;
1922 // This only really encodes to UTF7 (if that) evidently
1923 // case wxFONTENCODING_UTF7 :
1924 // enc = kCFStringEncodingNonLossyASCII ;
1925 // break ;
1926 case wxFONTENCODING_UTF8 :
1927 enc = kCFStringEncodingUTF8 ;
1928 break ;
1929 case wxFONTENCODING_EUC_JP :
1930 enc = kCFStringEncodingEUC_JP;
1931 break ;
1932 case wxFONTENCODING_UTF16 :
1933 enc = kCFStringEncodingUnicode ;
1934 break ;
1935 case wxFONTENCODING_MACROMAN :
1936 enc = kCFStringEncodingMacRoman ;
1937 break ;
1938 case wxFONTENCODING_MACJAPANESE :
1939 enc = kCFStringEncodingMacJapanese ;
1940 break ;
1941 case wxFONTENCODING_MACCHINESETRAD :
1942 enc = kCFStringEncodingMacChineseTrad ;
1943 break ;
1944 case wxFONTENCODING_MACKOREAN :
1945 enc = kCFStringEncodingMacKorean ;
1946 break ;
1947 case wxFONTENCODING_MACARABIC :
1948 enc = kCFStringEncodingMacArabic ;
1949 break ;
1950 case wxFONTENCODING_MACHEBREW :
1951 enc = kCFStringEncodingMacHebrew ;
1952 break ;
1953 case wxFONTENCODING_MACGREEK :
1954 enc = kCFStringEncodingMacGreek ;
1955 break ;
1956 case wxFONTENCODING_MACCYRILLIC :
1957 enc = kCFStringEncodingMacCyrillic ;
1958 break ;
1959 case wxFONTENCODING_MACDEVANAGARI :
1960 enc = kCFStringEncodingMacDevanagari ;
1961 break ;
1962 case wxFONTENCODING_MACGURMUKHI :
1963 enc = kCFStringEncodingMacGurmukhi ;
1964 break ;
1965 case wxFONTENCODING_MACGUJARATI :
1966 enc = kCFStringEncodingMacGujarati ;
1967 break ;
1968 case wxFONTENCODING_MACORIYA :
1969 enc = kCFStringEncodingMacOriya ;
1970 break ;
1971 case wxFONTENCODING_MACBENGALI :
1972 enc = kCFStringEncodingMacBengali ;
1973 break ;
1974 case wxFONTENCODING_MACTAMIL :
1975 enc = kCFStringEncodingMacTamil ;
1976 break ;
1977 case wxFONTENCODING_MACTELUGU :
1978 enc = kCFStringEncodingMacTelugu ;
1979 break ;
1980 case wxFONTENCODING_MACKANNADA :
1981 enc = kCFStringEncodingMacKannada ;
1982 break ;
1983 case wxFONTENCODING_MACMALAJALAM :
1984 enc = kCFStringEncodingMacMalayalam ;
1985 break ;
1986 case wxFONTENCODING_MACSINHALESE :
1987 enc = kCFStringEncodingMacSinhalese ;
1988 break ;
1989 case wxFONTENCODING_MACBURMESE :
1990 enc = kCFStringEncodingMacBurmese ;
1991 break ;
1992 case wxFONTENCODING_MACKHMER :
1993 enc = kCFStringEncodingMacKhmer ;
1994 break ;
1995 case wxFONTENCODING_MACTHAI :
1996 enc = kCFStringEncodingMacThai ;
1997 break ;
1998 case wxFONTENCODING_MACLAOTIAN :
1999 enc = kCFStringEncodingMacLaotian ;
2000 break ;
2001 case wxFONTENCODING_MACGEORGIAN :
2002 enc = kCFStringEncodingMacGeorgian ;
2003 break ;
2004 case wxFONTENCODING_MACARMENIAN :
2005 enc = kCFStringEncodingMacArmenian ;
2006 break ;
2007 case wxFONTENCODING_MACCHINESESIMP :
2008 enc = kCFStringEncodingMacChineseSimp ;
2009 break ;
2010 case wxFONTENCODING_MACTIBETAN :
2011 enc = kCFStringEncodingMacTibetan ;
2012 break ;
2013 case wxFONTENCODING_MACMONGOLIAN :
2014 enc = kCFStringEncodingMacMongolian ;
2015 break ;
2016 case wxFONTENCODING_MACETHIOPIC :
2017 enc = kCFStringEncodingMacEthiopic ;
2018 break ;
2019 case wxFONTENCODING_MACCENTRALEUR :
2020 enc = kCFStringEncodingMacCentralEurRoman ;
2021 break ;
2022 case wxFONTENCODING_MACVIATNAMESE :
2023 enc = kCFStringEncodingMacVietnamese ;
2024 break ;
2025 case wxFONTENCODING_MACARABICEXT :
2026 enc = kCFStringEncodingMacExtArabic ;
2027 break ;
2028 case wxFONTENCODING_MACSYMBOL :
2029 enc = kCFStringEncodingMacSymbol ;
2030 break ;
2031 case wxFONTENCODING_MACDINGBATS :
2032 enc = kCFStringEncodingMacDingbats ;
2033 break ;
2034 case wxFONTENCODING_MACTURKISH :
2035 enc = kCFStringEncodingMacTurkish ;
2036 break ;
2037 case wxFONTENCODING_MACCROATIAN :
2038 enc = kCFStringEncodingMacCroatian ;
2039 break ;
2040 case wxFONTENCODING_MACICELANDIC :
2041 enc = kCFStringEncodingMacIcelandic ;
2042 break ;
2043 case wxFONTENCODING_MACROMANIAN :
2044 enc = kCFStringEncodingMacRomanian ;
2045 break ;
2046 case wxFONTENCODING_MACCELTIC :
2047 enc = kCFStringEncodingMacCeltic ;
2048 break ;
2049 case wxFONTENCODING_MACGAELIC :
2050 enc = kCFStringEncodingMacGaelic ;
2051 break ;
2052 // case wxFONTENCODING_MACKEYBOARD :
2053 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2054 // break ;
2055 default :
2056 // because gcc is picky
2057 break ;
2058 } ;
2059 return enc ;
2060 }
2061
2062 class wxMBConv_cocoa : public wxMBConv
2063 {
2064 public:
2065 wxMBConv_cocoa()
2066 {
2067 Init(CFStringGetSystemEncoding()) ;
2068 }
2069
2070 #if wxUSE_FONTMAP
2071 wxMBConv_cocoa(const wxChar* name)
2072 {
2073 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2074 }
2075 #endif
2076
2077 wxMBConv_cocoa(wxFontEncoding encoding)
2078 {
2079 Init( wxCFStringEncFromFontEnc(encoding) );
2080 }
2081
2082 ~wxMBConv_cocoa()
2083 {
2084 }
2085
2086 void Init( CFStringEncoding encoding)
2087 {
2088 m_encoding = encoding ;
2089 }
2090
2091 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2092 {
2093 wxASSERT(szUnConv);
2094
2095 CFStringRef theString = CFStringCreateWithBytes (
2096 NULL, //the allocator
2097 (const UInt8*)szUnConv,
2098 strlen(szUnConv),
2099 m_encoding,
2100 false //no BOM/external representation
2101 );
2102
2103 wxASSERT(theString);
2104
2105 size_t nOutLength = CFStringGetLength(theString);
2106
2107 if (szOut == NULL)
2108 {
2109 CFRelease(theString);
2110 return nOutLength;
2111 }
2112
2113 CFRange theRange = { 0, nOutSize };
2114
2115 #if SIZEOF_WCHAR_T == 4
2116 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2117 #endif
2118
2119 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2120
2121 CFRelease(theString);
2122
2123 szUniCharBuffer[nOutLength] = '\0' ;
2124
2125 #if SIZEOF_WCHAR_T == 4
2126 wxMBConvUTF16 converter ;
2127 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2128 delete[] szUniCharBuffer;
2129 #endif
2130
2131 return nOutLength;
2132 }
2133
2134 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2135 {
2136 wxASSERT(szUnConv);
2137
2138 size_t nRealOutSize;
2139 size_t nBufSize = wxWcslen(szUnConv);
2140 UniChar* szUniBuffer = (UniChar*) szUnConv;
2141
2142 #if SIZEOF_WCHAR_T == 4
2143 wxMBConvUTF16BE converter ;
2144 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2145 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2146 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2147 nBufSize /= sizeof(UniChar);
2148 #endif
2149
2150 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2151 NULL, //allocator
2152 szUniBuffer,
2153 nBufSize,
2154 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2155 );
2156
2157 wxASSERT(theString);
2158
2159 //Note that CER puts a BOM when converting to unicode
2160 //so we check and use getchars instead in that case
2161 if (m_encoding == kCFStringEncodingUnicode)
2162 {
2163 if (szOut != NULL)
2164 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2165
2166 nRealOutSize = CFStringGetLength(theString) + 1;
2167 }
2168 else
2169 {
2170 CFStringGetBytes(
2171 theString,
2172 CFRangeMake(0, CFStringGetLength(theString)),
2173 m_encoding,
2174 0, //what to put in characters that can't be converted -
2175 //0 tells CFString to return NULL if it meets such a character
2176 false, //not an external representation
2177 (UInt8*) szOut,
2178 nOutSize,
2179 (CFIndex*) &nRealOutSize
2180 );
2181 }
2182
2183 CFRelease(theString);
2184
2185 #if SIZEOF_WCHAR_T == 4
2186 delete[] szUniBuffer;
2187 #endif
2188
2189 return nRealOutSize - 1;
2190 }
2191
2192 bool IsOk() const
2193 {
2194 return m_encoding != kCFStringEncodingInvalidId &&
2195 CFStringIsEncodingAvailable(m_encoding);
2196 }
2197
2198 private:
2199 CFStringEncoding m_encoding ;
2200 };
2201
2202 #endif // defined(__WXCOCOA__)
2203
2204 // ============================================================================
2205 // Mac conversion classes
2206 // ============================================================================
2207
2208 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2209
2210 class wxMBConv_mac : public wxMBConv
2211 {
2212 public:
2213 wxMBConv_mac()
2214 {
2215 Init(CFStringGetSystemEncoding()) ;
2216 }
2217
2218 #if wxUSE_FONTMAP
2219 wxMBConv_mac(const wxChar* name)
2220 {
2221 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2222 }
2223 #endif
2224
2225 wxMBConv_mac(wxFontEncoding encoding)
2226 {
2227 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2228 }
2229
2230 ~wxMBConv_mac()
2231 {
2232 OSStatus status = noErr ;
2233 status = TECDisposeConverter(m_MB2WC_converter);
2234 status = TECDisposeConverter(m_WC2MB_converter);
2235 }
2236
2237
2238 void Init( TextEncodingBase encoding)
2239 {
2240 OSStatus status = noErr ;
2241 m_char_encoding = encoding ;
2242 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2243
2244 status = TECCreateConverter(&m_MB2WC_converter,
2245 m_char_encoding,
2246 m_unicode_encoding);
2247 status = TECCreateConverter(&m_WC2MB_converter,
2248 m_unicode_encoding,
2249 m_char_encoding);
2250 }
2251
2252 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2253 {
2254 OSStatus status = noErr ;
2255 ByteCount byteOutLen ;
2256 ByteCount byteInLen = strlen(psz) ;
2257 wchar_t *tbuf = NULL ;
2258 UniChar* ubuf = NULL ;
2259 size_t res = 0 ;
2260
2261 if (buf == NULL)
2262 {
2263 //apple specs say at least 32
2264 n = wxMax( 32 , byteInLen ) ;
2265 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2266 }
2267 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2268 #if SIZEOF_WCHAR_T == 4
2269 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2270 #else
2271 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2272 #endif
2273 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2274 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2275 #if SIZEOF_WCHAR_T == 4
2276 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2277 // is not properly terminated we get random characters at the end
2278 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2279 wxMBConvUTF16BE converter ;
2280 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2281 free( ubuf ) ;
2282 #else
2283 res = byteOutLen / sizeof( UniChar ) ;
2284 #endif
2285 if ( buf == NULL )
2286 free(tbuf) ;
2287
2288 if ( buf && res < n)
2289 buf[res] = 0;
2290
2291 return res ;
2292 }
2293
2294 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2295 {
2296 OSStatus status = noErr ;
2297 ByteCount byteOutLen ;
2298 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2299
2300 char *tbuf = NULL ;
2301
2302 if (buf == NULL)
2303 {
2304 //apple specs say at least 32
2305 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2306 tbuf = (char*) malloc( n ) ;
2307 }
2308
2309 ByteCount byteBufferLen = n ;
2310 UniChar* ubuf = NULL ;
2311 #if SIZEOF_WCHAR_T == 4
2312 wxMBConvUTF16BE converter ;
2313 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2314 byteInLen = unicharlen ;
2315 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2316 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2317 #else
2318 ubuf = (UniChar*) psz ;
2319 #endif
2320 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2321 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2322 #if SIZEOF_WCHAR_T == 4
2323 free( ubuf ) ;
2324 #endif
2325 if ( buf == NULL )
2326 free(tbuf) ;
2327
2328 size_t res = byteOutLen ;
2329 if ( buf && res < n)
2330 {
2331 buf[res] = 0;
2332
2333 //we need to double-trip to verify it didn't insert any ? in place
2334 //of bogus characters
2335 wxWCharBuffer wcBuf(n);
2336 size_t pszlen = wxWcslen(psz);
2337 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2338 wxWcslen(wcBuf) != pszlen ||
2339 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2340 {
2341 // we didn't obtain the same thing we started from, hence
2342 // the conversion was lossy and we consider that it failed
2343 return (size_t)-1;
2344 }
2345 }
2346
2347 return res ;
2348 }
2349
2350 bool IsOk() const
2351 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2352
2353 private:
2354 TECObjectRef m_MB2WC_converter ;
2355 TECObjectRef m_WC2MB_converter ;
2356
2357 TextEncodingBase m_char_encoding ;
2358 TextEncodingBase m_unicode_encoding ;
2359 };
2360
2361 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2362
2363 // ============================================================================
2364 // wxEncodingConverter based conversion classes
2365 // ============================================================================
2366
2367 #if wxUSE_FONTMAP
2368
2369 class wxMBConv_wxwin : public wxMBConv
2370 {
2371 private:
2372 void Init()
2373 {
2374 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2375 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2376 }
2377
2378 public:
2379 // temporarily just use wxEncodingConverter stuff,
2380 // so that it works while a better implementation is built
2381 wxMBConv_wxwin(const wxChar* name)
2382 {
2383 if (name)
2384 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2385 else
2386 m_enc = wxFONTENCODING_SYSTEM;
2387
2388 Init();
2389 }
2390
2391 wxMBConv_wxwin(wxFontEncoding enc)
2392 {
2393 m_enc = enc;
2394
2395 Init();
2396 }
2397
2398 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2399 {
2400 size_t inbuf = strlen(psz);
2401 if (buf)
2402 {
2403 if (!m2w.Convert(psz,buf))
2404 return (size_t)-1;
2405 }
2406 return inbuf;
2407 }
2408
2409 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2410 {
2411 const size_t inbuf = wxWcslen(psz);
2412 if (buf)
2413 {
2414 if (!w2m.Convert(psz,buf))
2415 return (size_t)-1;
2416 }
2417
2418 return inbuf;
2419 }
2420
2421 bool IsOk() const { return m_ok; }
2422
2423 public:
2424 wxFontEncoding m_enc;
2425 wxEncodingConverter m2w, w2m;
2426
2427 // were we initialized successfully?
2428 bool m_ok;
2429
2430 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2431 };
2432
2433 #endif // wxUSE_FONTMAP
2434
2435 // ============================================================================
2436 // wxCSConv implementation
2437 // ============================================================================
2438
2439 void wxCSConv::Init()
2440 {
2441 m_name = NULL;
2442 m_convReal = NULL;
2443 m_deferred = true;
2444 }
2445
2446 wxCSConv::wxCSConv(const wxChar *charset)
2447 {
2448 Init();
2449
2450 if ( charset )
2451 {
2452 SetName(charset);
2453 }
2454
2455 m_encoding = wxFONTENCODING_SYSTEM;
2456 }
2457
2458 wxCSConv::wxCSConv(wxFontEncoding encoding)
2459 {
2460 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2461 {
2462 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2463
2464 encoding = wxFONTENCODING_SYSTEM;
2465 }
2466
2467 Init();
2468
2469 m_encoding = encoding;
2470 }
2471
2472 wxCSConv::~wxCSConv()
2473 {
2474 Clear();
2475 }
2476
2477 wxCSConv::wxCSConv(const wxCSConv& conv)
2478 : wxMBConv()
2479 {
2480 Init();
2481
2482 SetName(conv.m_name);
2483 m_encoding = conv.m_encoding;
2484 }
2485
2486 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2487 {
2488 Clear();
2489
2490 SetName(conv.m_name);
2491 m_encoding = conv.m_encoding;
2492
2493 return *this;
2494 }
2495
2496 void wxCSConv::Clear()
2497 {
2498 free(m_name);
2499 delete m_convReal;
2500
2501 m_name = NULL;
2502 m_convReal = NULL;
2503 }
2504
2505 void wxCSConv::SetName(const wxChar *charset)
2506 {
2507 if (charset)
2508 {
2509 m_name = wxStrdup(charset);
2510 m_deferred = true;
2511 }
2512 }
2513
2514 wxMBConv *wxCSConv::DoCreate() const
2515 {
2516 // check for the special case of ASCII or ISO8859-1 charset: as we have
2517 // special knowledge of it anyhow, we don't need to create a special
2518 // conversion object
2519 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2520 {
2521 // don't convert at all
2522 return NULL;
2523 }
2524
2525 // we trust OS to do conversion better than we can so try external
2526 // conversion methods first
2527 //
2528 // the full order is:
2529 // 1. OS conversion (iconv() under Unix or Win32 API)
2530 // 2. hard coded conversions for UTF
2531 // 3. wxEncodingConverter as fall back
2532
2533 // step (1)
2534 #ifdef HAVE_ICONV
2535 #if !wxUSE_FONTMAP
2536 if ( m_name )
2537 #endif // !wxUSE_FONTMAP
2538 {
2539 wxString name(m_name);
2540
2541 #if wxUSE_FONTMAP
2542 if ( name.empty() )
2543 name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2544 #endif // wxUSE_FONTMAP
2545
2546 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2547 if ( conv->IsOk() )
2548 return conv;
2549
2550 delete conv;
2551 }
2552 #endif // HAVE_ICONV
2553
2554 #ifdef wxHAVE_WIN32_MB2WC
2555 {
2556 #if wxUSE_FONTMAP
2557 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2558 : new wxMBConv_win32(m_encoding);
2559 if ( conv->IsOk() )
2560 return conv;
2561
2562 delete conv;
2563 #else
2564 return NULL;
2565 #endif
2566 }
2567 #endif // wxHAVE_WIN32_MB2WC
2568 #if defined(__WXMAC__)
2569 {
2570 // leave UTF16 and UTF32 to the built-ins of wx
2571 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2572 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2573 {
2574
2575 #if wxUSE_FONTMAP
2576 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2577 : new wxMBConv_mac(m_encoding);
2578 #else
2579 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2580 #endif
2581 if ( conv->IsOk() )
2582 return conv;
2583
2584 delete conv;
2585 }
2586 }
2587 #endif
2588 #if defined(__WXCOCOA__)
2589 {
2590 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2591 {
2592
2593 #if wxUSE_FONTMAP
2594 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2595 : new wxMBConv_cocoa(m_encoding);
2596 #else
2597 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2598 #endif
2599 if ( conv->IsOk() )
2600 return conv;
2601
2602 delete conv;
2603 }
2604 }
2605 #endif
2606 // step (2)
2607 wxFontEncoding enc = m_encoding;
2608 #if wxUSE_FONTMAP
2609 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2610 {
2611 // use "false" to suppress interactive dialogs -- we can be called from
2612 // anywhere and popping up a dialog from here is the last thing we want to
2613 // do
2614 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2615 }
2616 #endif // wxUSE_FONTMAP
2617
2618 switch ( enc )
2619 {
2620 case wxFONTENCODING_UTF7:
2621 return new wxMBConvUTF7;
2622
2623 case wxFONTENCODING_UTF8:
2624 return new wxMBConvUTF8;
2625
2626 case wxFONTENCODING_UTF16BE:
2627 return new wxMBConvUTF16BE;
2628
2629 case wxFONTENCODING_UTF16LE:
2630 return new wxMBConvUTF16LE;
2631
2632 case wxFONTENCODING_UTF32BE:
2633 return new wxMBConvUTF32BE;
2634
2635 case wxFONTENCODING_UTF32LE:
2636 return new wxMBConvUTF32LE;
2637
2638 default:
2639 // nothing to do but put here to suppress gcc warnings
2640 ;
2641 }
2642
2643 // step (3)
2644 #if wxUSE_FONTMAP
2645 {
2646 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2647 : new wxMBConv_wxwin(m_encoding);
2648 if ( conv->IsOk() )
2649 return conv;
2650
2651 delete conv;
2652 }
2653 #endif // wxUSE_FONTMAP
2654
2655 // NB: This is a hack to prevent deadlock. What could otherwise happen
2656 // in Unicode build: wxConvLocal creation ends up being here
2657 // because of some failure and logs the error. But wxLog will try to
2658 // attach timestamp, for which it will need wxConvLocal (to convert
2659 // time to char* and then wchar_t*), but that fails, tries to log
2660 // error, but wxLog has a (already locked) critical section that
2661 // guards static buffer.
2662 static bool alreadyLoggingError = false;
2663 if (!alreadyLoggingError)
2664 {
2665 alreadyLoggingError = true;
2666 wxLogError(_("Cannot convert from the charset '%s'!"),
2667 m_name ? m_name
2668 :
2669 #if wxUSE_FONTMAP
2670 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2671 #else // !wxUSE_FONTMAP
2672 wxString::Format(_("encoding %s"), m_encoding).c_str()
2673 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2674 );
2675 alreadyLoggingError = false;
2676 }
2677
2678 return NULL;
2679 }
2680
2681 void wxCSConv::CreateConvIfNeeded() const
2682 {
2683 if ( m_deferred )
2684 {
2685 wxCSConv *self = (wxCSConv *)this; // const_cast
2686
2687 #if wxUSE_INTL
2688 // if we don't have neither the name nor the encoding, use the default
2689 // encoding for this system
2690 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2691 {
2692 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2693 }
2694 #endif // wxUSE_INTL
2695
2696 self->m_convReal = DoCreate();
2697 self->m_deferred = false;
2698 }
2699 }
2700
2701 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2702 {
2703 CreateConvIfNeeded();
2704
2705 if (m_convReal)
2706 return m_convReal->MB2WC(buf, psz, n);
2707
2708 // latin-1 (direct)
2709 size_t len = strlen(psz);
2710
2711 if (buf)
2712 {
2713 for (size_t c = 0; c <= len; c++)
2714 buf[c] = (unsigned char)(psz[c]);
2715 }
2716
2717 return len;
2718 }
2719
2720 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2721 {
2722 CreateConvIfNeeded();
2723
2724 if (m_convReal)
2725 return m_convReal->WC2MB(buf, psz, n);
2726
2727 // latin-1 (direct)
2728 const size_t len = wxWcslen(psz);
2729 if (buf)
2730 {
2731 for (size_t c = 0; c <= len; c++)
2732 {
2733 if (psz[c] > 0xFF)
2734 return (size_t)-1;
2735 buf[c] = (char)psz[c];
2736 }
2737 }
2738 else
2739 {
2740 for (size_t c = 0; c <= len; c++)
2741 {
2742 if (psz[c] > 0xFF)
2743 return (size_t)-1;
2744 }
2745 }
2746
2747 return len;
2748 }
2749
2750 // ----------------------------------------------------------------------------
2751 // globals
2752 // ----------------------------------------------------------------------------
2753
2754 #ifdef __WINDOWS__
2755 static wxMBConv_win32 wxConvLibcObj;
2756 #elif defined(__WXMAC__) && !defined(__MACH__)
2757 static wxMBConv_mac wxConvLibcObj ;
2758 #else
2759 static wxMBConvLibc wxConvLibcObj;
2760 #endif
2761
2762 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2763 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2764 static wxMBConvUTF7 wxConvUTF7Obj;
2765 static wxMBConvUTF8 wxConvUTF8Obj;
2766
2767 #ifdef __WXGTK20__
2768 static wxConvBrokenFileNames wxConvBrokenFileNamesObj;
2769 #endif
2770
2771 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2772 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2773 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2774 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2775 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2776 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2777 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2778 #ifdef __WXOSX__
2779 wxConvUTF8Obj;
2780 #elif __WXGTK20__
2781 wxConvBrokenFileNamesObj;
2782 #else
2783 wxConvLibcObj;
2784 #endif
2785
2786
2787 #else // !wxUSE_WCHAR_T
2788
2789 // stand-ins in absence of wchar_t
2790 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2791 wxConvISO8859_1,
2792 wxConvLocal,
2793 wxConvUTF8;
2794
2795 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2796
2797