]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Compilo fix for wxUSE_INTL=0 and use wxCSConv for broken filenames instead of wxMBCon...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
62
63 // ----------------------------------------------------------------------------
64 // headers
65 // ----------------------------------------------------------------------------
66
67 #ifdef __SALFORDC__
68 #include <clib.h>
69 #endif
70
71 #ifdef HAVE_ICONV
72 #include <iconv.h>
73 #include "wx/thread.h"
74 #endif
75
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
78 #include "wx/utils.h"
79
80 #ifdef __WXMAC__
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
84
85 #include "wx/mac/private.h" // includes mac headers
86 #endif
87 // ----------------------------------------------------------------------------
88 // macros
89 // ----------------------------------------------------------------------------
90
// Byte-swap, in place, the elements str[0] .. str[len-1], passing each one
// through wxUINT32_SWAP_ALWAYS (BSWAP_UCS4) or wxUINT16_SWAP_ALWAYS
// (BSWAP_UTF16). Both arguments are expanded textually inside the loop, so
// they are evaluated once per iteration.
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }

// Definitions depending on the size of wchar_t:
//  - WC_NAME: charset name matching wchar_t ("UCS4" or "UTF16")
//  - WC_BSWAP: the byte swapping macro for wchar_t sized elements
//  - WC_NAME_BEST: the same charset with the endianness spelt out, chosen
//    according to WORDS_BIGENDIAN
//  - WC_UTF16: defined only if wchar_t is 2 bytes wide
#if SIZEOF_WCHAR_T == 4
    #define WC_NAME "UCS4"
    #define WC_BSWAP BSWAP_UCS4
    #ifdef WORDS_BIGENDIAN
        #define WC_NAME_BEST "UCS-4BE"
    #else
        #define WC_NAME_BEST "UCS-4LE"
    #endif
#elif SIZEOF_WCHAR_T == 2
    #define WC_NAME "UTF16"
    #define WC_BSWAP BSWAP_UTF16
    #define WC_UTF16
    #ifdef WORDS_BIGENDIAN
        #define WC_NAME_BEST "UTF-16BE"
    #else
        #define WC_NAME_BEST "UTF-16LE"
    #endif
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
115
116 // ============================================================================
117 // implementation
118 // ============================================================================
119
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
123
124
125 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
126 {
127 if (input<=0xffff)
128 {
129 if (output)
130 *output = (wxUint16) input;
131 return 1;
132 }
133 else if (input>=0x110000)
134 {
135 return (size_t)-1;
136 }
137 else
138 {
139 if (output)
140 {
141 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
142 *output = (wxUint16) ((input&0x3ff)+0xdc00);
143 }
144 return 2;
145 }
146 }
147
148 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
149 {
150 if ((*input<0xd800) || (*input>0xdfff))
151 {
152 output = *input;
153 return 1;
154 }
155 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
156 {
157 output = *input;
158 return (size_t)-1;
159 }
160 else
161 {
162 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
163 return 2;
164 }
165 }
166
167
168 // ----------------------------------------------------------------------------
169 // wxMBConv
170 // ----------------------------------------------------------------------------
171
// Out-of-line destructor; the body is intentionally empty.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
176
177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
178 {
179 if ( psz )
180 {
181 // calculate the length of the buffer needed first
182 size_t nLen = MB2WC(NULL, psz, 0);
183 if ( nLen != (size_t)-1 )
184 {
185 // now do the actual conversion
186 wxWCharBuffer buf(nLen);
187 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
188 if ( nLen != (size_t)-1 )
189 {
190 return buf;
191 }
192 }
193 }
194
195 wxWCharBuffer buf((wchar_t *)NULL);
196
197 return buf;
198 }
199
200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
201 {
202 if ( pwz )
203 {
204 size_t nLen = WC2MB(NULL, pwz, 0);
205 if ( nLen != (size_t)-1 )
206 {
207 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
208 nLen = WC2MB(buf.data(), pwz, nLen + 4);
209 if ( nLen != (size_t)-1 )
210 {
211 return buf;
212 }
213 }
214 }
215
216 wxCharBuffer buf((char *)NULL);
217
218 return buf;
219 }
220
221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
222 {
223 wxASSERT(pOutSize != NULL);
224
225 const char* szEnd = szString + nStringLen + 1;
226 const char* szPos = szString;
227 const char* szStart = szPos;
228
229 size_t nActualLength = 0;
230 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
231
232 wxWCharBuffer theBuffer(nCurrentSize);
233
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos != szEnd)
237 {
238 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
239
240 //Get the length of the current (sub)string
241 size_t nLen = MB2WC(NULL, szPos, 0);
242
243 //Invalid conversion?
244 if( nLen == (size_t)-1 )
245 {
246 *pOutSize = 0;
247 theBuffer.data()[0u] = wxT('\0');
248 return theBuffer;
249 }
250
251
252 //Increase the actual length (+1 for current null character)
253 nActualLength += nLen + 1;
254
255 //if buffer too big, realloc the buffer
256 if (nActualLength > (nCurrentSize+1))
257 {
258 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
259 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
260 theBuffer = theNewBuffer;
261 nCurrentSize <<= 1;
262 }
263
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
266 {
267 *pOutSize = 0;
268 theBuffer.data()[0u] = wxT('\0');
269 return theBuffer;
270 }
271
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //here because XX2XX gives us the size of the output buffer,
275 //not neccessarly the length of the string
276 szPos += strlen(szPos) + 1;
277 }
278
279 //success - return actual length and the buffer
280 *pOutSize = nActualLength;
281 return theBuffer;
282 }
283
284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
285 {
286 wxASSERT(pOutSize != NULL);
287
288 const wchar_t* szEnd = szString + nStringLen + 1;
289 const wchar_t* szPos = szString;
290 const wchar_t* szStart = szPos;
291
292 size_t nActualLength = 0;
293 size_t nCurrentSize = nStringLen << 2; //try * 4 first
294
295 wxCharBuffer theBuffer(nCurrentSize);
296
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos != szEnd)
300 {
301 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
302
303 //Get the length of the current (sub)string
304 size_t nLen = WC2MB(NULL, szPos, 0);
305
306 //Invalid conversion?
307 if( nLen == (size_t)-1 )
308 {
309 *pOutSize = 0;
310 theBuffer.data()[0u] = wxT('\0');
311 return theBuffer;
312 }
313
314 //Increase the actual length (+1 for current null character)
315 nActualLength += nLen + 1;
316
317 //if buffer too big, realloc the buffer
318 if (nActualLength > (nCurrentSize+1))
319 {
320 wxCharBuffer theNewBuffer(nCurrentSize << 1);
321 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
322 theBuffer = theNewBuffer;
323 nCurrentSize <<= 1;
324 }
325
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
328 {
329 *pOutSize = 0;
330 theBuffer.data()[0u] = wxT('\0');
331 return theBuffer;
332 }
333
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //here because XX2XX gives us the size of the output buffer,
337 //not neccessarly the length of the string
338 szPos += wxWcslen(szPos) + 1;
339 }
340
341 //success - return actual length and the buffer
342 *pOutSize = nActualLength;
343 return theBuffer;
344 }
345
346 // ----------------------------------------------------------------------------
347 // wxMBConvLibc
348 // ----------------------------------------------------------------------------
349
// Multibyte to wide char conversion: forwards all arguments unchanged to
// wxMB2WC() and returns its result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
354
// Wide char to multibyte conversion: forwards all arguments unchanged to
// wxWC2MB() and returns its result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
359
360 #ifdef __UNIX__
361
362 // ----------------------------------------------------------------------------
363 // wxConvBrokenFileNames
364 // ----------------------------------------------------------------------------
365
366 wxConvBrokenFileNames::wxConvBrokenFileNames()
367 {
368 // decide which conversion to use for the file names
369
370 // (1) this variable exists for the sole purpose of specifying the encoding
371 // of the filenames for GTK+ programs, so use it if it is set
372 wxString encName(wxGetenv(_T("G_FILENAME_ENCODING")));
373 encName.MakeUpper();
374 if ( !encName.empty() && encName != _T("UTF-8") && encName != _T("UTF8") )
375 {
376 m_conv = new wxCSConv(encName);
377 }
378 else // no G_FILENAME_ENCODING
379 {
380 #if wxUSE_INTL
381 if ( encName.empty() )
382 encName = wxLocale::GetSystemEncodingName().Upper();
383 #endif
384
385 // (2) if a non default locale is set, assume that the user wants his
386 // filenames in this locale too
387 if ( !encName.empty() && encName != _T("UTF-8") && encName != _T("UTF8") )
388 {
389 wxSetEnv(_T("G_FILENAME_ENCODING"), encName);
390 m_conv = new wxCSConv(encName);
391 }
392 else
393 {
394 // (3) finally use UTF-8 by default
395 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
396 }
397 }
398 }
399
// Multibyte to wide char conversion: delegates to the MB2WC() of the
// converter stored in m_conv, passing all arguments through unchanged.
size_t
wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
                             const char *psz,
                             size_t outputSize) const
{
    return m_conv->MB2WC( outputBuf, psz, outputSize );
}
407
// Wide char to multibyte conversion: delegates to the WC2MB() of the
// converter stored in m_conv, passing all arguments through unchanged.
size_t
wxConvBrokenFileNames::WC2MB(char *outputBuf,
                             const wchar_t *psz,
                             size_t outputSize) const
{
    return m_conv->WC2MB( outputBuf, psz, outputSize );
}
415
416 #endif
417
418 // ----------------------------------------------------------------------------
419 // UTF-7
420 // ----------------------------------------------------------------------------
421
422 // Implementation (C) 2004 Fredrik Roubert
423
424 //
425 // BASE64 decoding table
426 //
// Indexed by byte value (256 entries): gives the 6 bit value of the BASE64
// digits 'A'-'Z' (0x00-0x19), 'a'-'z' (0x1a-0x33), '0'-'9' (0x34-0x3d),
// '+' (0x3e) and '/' (0x3f) and 0xff for every other byte.
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
462
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is NULL nothing is stored and only the length is computed; otherwise
// the decoded characters are stored in buf and a terminating 0 is appended if
// fewer than n characters were produced. Returns the number of wide
// characters produced, not counting the terminating 0.
//
// NOTE(review): inside a BASE64 run the output position is not checked
// against n, only the outer loop tests it -- confirm that callers always
// pass a buffer large enough for the whole run.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else
        {
            // BASE64 encoded string: d accumulates the decoded bits, l is
            // the number of bits in d not yet consumed and lsb tells whether
            // the next byte extracted is the low byte of a 16 bit unit
            bool lsb;
            unsigned char c;
            unsigned int d, l;
            // the run ends at the first byte which is not a BASE64 digit
            for (lsb = false, d = 0, l = 0;
                 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
            {
                d <<= 6;
                d += cc;
                // extract every complete byte available so far
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: completes the unit started below
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                        // high byte: starts a new unit
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                }
            }
            // skip the '-' terminating the run, if there is one
            if (*psz == '-')
                psz++;
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
518
519 //
520 // BASE64 encoding table
521 //
// Maps a 6 bit value (0..63) to the corresponding BASE64 digit.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
533
534 //
535 // UTF-7 encoding table
536 //
537 // 0 - Set D (directly encoded characters)
538 // 1 - Set O (optional direct characters)
539 // 2 - whitespace characters (optional)
540 // 3 - special characters
541 //
// Class (0 to 3, see the legend above) of each of the 128 ASCII characters,
// indexed by character code. wxMBConvUTF7::WC2MB() copies only the
// characters of class 0 unchanged.
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
553
// Encode the NUL-terminated wide string psz as UTF-7.
//
// If buf is NULL nothing is stored and only the length is computed; otherwise
// the output is stored in buf and a terminating 0 is appended if fewer than n
// bytes were produced. Returns the number of bytes produced, not counting the
// terminating 0, or, when WC_UTF16 is not defined, (size_t)-1 if a character
// above 0xffff starts an encoded run.
//
// NOTE(review): while an encoded run is being written the output position is
// not checked against n, only the outer loop tests it -- confirm that callers
// always pass a buffer large enough.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{


    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;
            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return (size_t)-1;
        }
#endif
        else
        {
            // everything else is written as '+', BASE64 digits, '-'; the
            // plus sign itself becomes "+-" without any digits
            if (buf)
                *buf++ = '+';
            len++;
            if (cc != '+')
            {
                // BASE64 encode string: d accumulates the bits to output and
                // l is the number of bits in d not yet written
                unsigned int lsb, d, l;
                for (d = 0, l = 0;; psz++)
                {
                    // feed the high byte of cc first, then the low one
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // write every complete group of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }
                    // the run stops, without consuming it, at the end of the
                    // string or at a character which is copied unchanged
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }
                // write the remaining bits, padded on the right with zeroes
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
                    len++;
                }
            }
            if (buf)
                *buf++ = '-';
            len++;
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
621
622 // ----------------------------------------------------------------------------
623 // UTF-8
624 // ----------------------------------------------------------------------------
625
// utf8_max[i] is the biggest value which can be encoded using i+1 bytes in
// UTF-8 (the last element makes the search in wxMBConvUTF8::WC2MB() stop for
// any 32 bit value)
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area to which the bytes of the sequences
// invalid in a UTF-8 encoded string are (temporarily) remapped: the byte b
// is represented by wxUnicodePUA + b
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
633
// Decode the NUL-terminated UTF-8 string psz.
//
// If buf is NULL nothing is stored and only the length is computed; otherwise
// the decoded characters are stored in buf and a terminating 0 is appended if
// fewer than n characters were produced. Returns the number of wide
// characters produced, not counting the terminating 0.
//
// What happens with the invalid sequences depends on m_options:
//  - MAP_INVALID_UTF8_TO_PUA: each of their bytes is replaced by the
//    character wxUnicodePUA + byte
//  - MAP_INVALID_UTF8_TO_OCTAL: each of their bytes is replaced by a
//    backslash followed by 3 octal digits (and the backslashes present in
//    the input are doubled)
//  - otherwise (size_t)-1 is returned
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember where this sequence starts in case it turns out to be
        // invalid and its bytes have to be escaped
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;
        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;
        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }
                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }
                // a value which would have fit in a shorter sequence is
                // rejected too
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == (size_t)-1)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }
            if (invalid)
            {
                // escape all the bytes consumed for this sequence, i.e. the
                // ones in the range [opsz, psz)
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != (size_t)-1);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = wxUnicodePUA + (unsigned char)*opsz;
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only stored if all 4 of its
                        // characters fit, but len is increased in any case
                        if ( buf && len + 3 < n )
                        {
                            // notice that this n is the byte being escaped:
                            // it hides the parameter n inside this block
                            unsigned char n = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + n / 0100 );
                            *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + n % 010 );
                        }
                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return (size_t)-1;
                }
            }
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
770
// Returns true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch < L'8';
}
775
// Encode the NUL-terminated wide string psz as UTF-8.
//
// If buf is NULL nothing is stored and only the length is computed; otherwise
// the output is stored in buf and a terminating 0 is appended if fewer than n
// bytes were produced. Returns the number of bytes produced, not counting the
// terminating 0.
//
// The escapes produced by MB2WC() are undone according to m_options:
//  - MAP_INVALID_UTF8_TO_PUA: the characters in the range
//    [wxUnicodePUA, wxUnicodePUAEnd) become the single byte they stand for
//  - MAP_INVALID_UTF8_TO_OCTAL: a doubled backslash becomes a single one and
//    a backslash followed by 3 octal digits becomes the byte with this value
//
// NOTE(review): the bytes of a multibyte sequence are stored without checking
// that all of them fit in n -- confirm that callers size the buffer using a
// first call with NULL buf.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;
#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
        // if decoding failed, cc is the unit itself: skip just this unit
        psz += (pa == (size_t)-1) ? 1 : pa;
#else
        cc=(*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // one of the characters standing for a byte of an invalid
            // sequence: restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output one of them and skip the other
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte whose value the 3 digits give
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0')*0100 +
                                 (psz[1] - L'0')*010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // cnt is the number of continuation bytes needed for cc
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }

            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte starts with cnt + 1 bits set followed by
                    // the most significant bits of cc, the other bytes carry
                    // 6 bits each
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len<n))
        *buf = 0;

    return len;
}
850
851 // ----------------------------------------------------------------------------
852 // UTF-16
853 // ----------------------------------------------------------------------------
854
// The functions below are written once as the "straight" converter, which
// copies the 16 bit units unchanged, and once as the "swap" one, which
// exchanges their bytes: map these names to the BE and LE classes according
// to WORDS_BIGENDIAN.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
862
863
864 #ifdef WC_UTF16
865
866 // copy 16bit MB to 16bit String
867 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
868 {
869 size_t len=0;
870
871 while (*(wxUint16*)psz && (!buf || len < n))
872 {
873 if (buf)
874 *buf++ = *(wxUint16*)psz;
875 len++;
876
877 psz += sizeof(wxUint16);
878 }
879 if (buf && len<n) *buf=0;
880
881 return len;
882 }
883
884
885 // copy 16bit String to 16bit MB
886 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
887 {
888 size_t len=0;
889
890 while (*psz && (!buf || len < n))
891 {
892 if (buf)
893 {
894 *(wxUint16*)buf = *psz;
895 buf += sizeof(wxUint16);
896 }
897 len += sizeof(wxUint16);
898 psz++;
899 }
900 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
901
902 return len;
903 }
904
905
906 // swap 16bit MB to 16bit String
907 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
908 {
909 size_t len=0;
910
911 while (*(wxUint16*)psz && (!buf || len < n))
912 {
913 if (buf)
914 {
915 ((char *)buf)[0] = psz[1];
916 ((char *)buf)[1] = psz[0];
917 buf++;
918 }
919 len++;
920 psz += sizeof(wxUint16);
921 }
922 if (buf && len<n) *buf=0;
923
924 return len;
925 }
926
927
928 // swap 16bit MB to 16bit String
929 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
930 {
931 size_t len=0;
932
933 while (*psz && (!buf || len < n))
934 {
935 if (buf)
936 {
937 *buf++ = ((char*)psz)[1];
938 *buf++ = ((char*)psz)[0];
939 }
940 len += sizeof(wxUint16);
941 psz++;
942 }
943 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
944
945 return len;
946 }
947
948
949 #else // WC_UTF16
950
951
952 // copy 16bit MB to 32bit String
953 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
954 {
955 size_t len=0;
956
957 while (*(wxUint16*)psz && (!buf || len < n))
958 {
959 wxUint32 cc;
960 size_t pa=decode_utf16((wxUint16*)psz, cc);
961 if (pa == (size_t)-1)
962 return pa;
963
964 if (buf)
965 *buf++ = cc;
966 len++;
967 psz += pa * sizeof(wxUint16);
968 }
969 if (buf && len<n) *buf=0;
970
971 return len;
972 }
973
974
975 // copy 32bit String to 16bit MB
976 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
977 {
978 size_t len=0;
979
980 while (*psz && (!buf || len < n))
981 {
982 wxUint16 cc[2];
983 size_t pa=encode_utf16(*psz, cc);
984
985 if (pa == (size_t)-1)
986 return pa;
987
988 if (buf)
989 {
990 *(wxUint16*)buf = cc[0];
991 buf += sizeof(wxUint16);
992 if (pa > 1)
993 {
994 *(wxUint16*)buf = cc[1];
995 buf += sizeof(wxUint16);
996 }
997 }
998
999 len += pa*sizeof(wxUint16);
1000 psz++;
1001 }
1002 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1003
1004 return len;
1005 }
1006
1007
1008 // swap 16bit MB to 32bit String
1009 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1010 {
1011 size_t len=0;
1012
1013 while (*(wxUint16*)psz && (!buf || len < n))
1014 {
1015 wxUint32 cc;
1016 char tmp[4];
1017 tmp[0]=psz[1]; tmp[1]=psz[0];
1018 tmp[2]=psz[3]; tmp[3]=psz[2];
1019
1020 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1021 if (pa == (size_t)-1)
1022 return pa;
1023
1024 if (buf)
1025 *buf++ = cc;
1026
1027 len++;
1028 psz += pa * sizeof(wxUint16);
1029 }
1030 if (buf && len<n) *buf=0;
1031
1032 return len;
1033 }
1034
1035
1036 // swap 32bit String to 16bit MB
1037 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1038 {
1039 size_t len=0;
1040
1041 while (*psz && (!buf || len < n))
1042 {
1043 wxUint16 cc[2];
1044 size_t pa=encode_utf16(*psz, cc);
1045
1046 if (pa == (size_t)-1)
1047 return pa;
1048
1049 if (buf)
1050 {
1051 *buf++ = ((char*)cc)[1];
1052 *buf++ = ((char*)cc)[0];
1053 if (pa > 1)
1054 {
1055 *buf++ = ((char*)cc)[3];
1056 *buf++ = ((char*)cc)[2];
1057 }
1058 }
1059
1060 len += pa*sizeof(wxUint16);
1061 psz++;
1062 }
1063 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1064
1065 return len;
1066 }
1067
1068 #endif // WC_UTF16
1069
1070
1071 // ----------------------------------------------------------------------------
1072 // UTF-32
1073 // ----------------------------------------------------------------------------
1074
// The functions below are written once as the "straight" converter, which
// copies the 32 bit values unchanged, and once as the "swap" one, which
// reverses their bytes: map these names to the BE and LE classes according
// to WORDS_BIGENDIAN.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap wxMBConvUTF32BE
    #define wxMBConvUTF32straight wxMBConvUTF32LE
#endif
1082
1083
// global objects of the two UTF-32 converter classes
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1086
1087
1088 #ifdef WC_UTF16
1089
1090 // copy 32bit MB to 16bit String
1091 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1092 {
1093 size_t len=0;
1094
1095 while (*(wxUint32*)psz && (!buf || len < n))
1096 {
1097 wxUint16 cc[2];
1098
1099 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1100 if (pa == (size_t)-1)
1101 return pa;
1102
1103 if (buf)
1104 {
1105 *buf++ = cc[0];
1106 if (pa > 1)
1107 *buf++ = cc[1];
1108 }
1109 len += pa;
1110 psz += sizeof(wxUint32);
1111 }
1112 if (buf && len<n) *buf=0;
1113
1114 return len;
1115 }
1116
1117
1118 // copy 16bit String to 32bit MB
1119 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1120 {
1121 size_t len=0;
1122
1123 while (*psz && (!buf || len < n))
1124 {
1125 wxUint32 cc;
1126
1127 // cast is ok for WC_UTF16
1128 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1129 if (pa == (size_t)-1)
1130 return pa;
1131
1132 if (buf)
1133 {
1134 *(wxUint32*)buf = cc;
1135 buf += sizeof(wxUint32);
1136 }
1137 len += sizeof(wxUint32);
1138 psz += pa;
1139 }
1140
1141 if (buf && len<=n-sizeof(wxUint32))
1142 *(wxUint32*)buf=0;
1143
1144 return len;
1145 }
1146
1147
1148
1149 // swap 32bit MB to 16bit String
1150 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1151 {
1152 size_t len=0;
1153
1154 while (*(wxUint32*)psz && (!buf || len < n))
1155 {
1156 char tmp[4];
1157 tmp[0] = psz[3]; tmp[1] = psz[2];
1158 tmp[2] = psz[1]; tmp[3] = psz[0];
1159
1160
1161 wxUint16 cc[2];
1162
1163 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1164 if (pa == (size_t)-1)
1165 return pa;
1166
1167 if (buf)
1168 {
1169 *buf++ = cc[0];
1170 if (pa > 1)
1171 *buf++ = cc[1];
1172 }
1173 len += pa;
1174 psz += sizeof(wxUint32);
1175 }
1176
1177 if (buf && len<n)
1178 *buf=0;
1179
1180 return len;
1181 }
1182
1183
1184 // swap 16bit String to 32bit MB
1185 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1186 {
1187 size_t len=0;
1188
1189 while (*psz && (!buf || len < n))
1190 {
1191 char cc[4];
1192
1193 // cast is ok for WC_UTF16
1194 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1195 if (pa == (size_t)-1)
1196 return pa;
1197
1198 if (buf)
1199 {
1200 *buf++ = cc[3];
1201 *buf++ = cc[2];
1202 *buf++ = cc[1];
1203 *buf++ = cc[0];
1204 }
1205 len += sizeof(wxUint32);
1206 psz += pa;
1207 }
1208
1209 if (buf && len<=n-sizeof(wxUint32))
1210 *(wxUint32*)buf=0;
1211
1212 return len;
1213 }
1214
1215 #else // WC_UTF16
1216
1217
1218 // copy 32bit MB to 32bit String
1219 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1220 {
1221 size_t len=0;
1222
1223 while (*(wxUint32*)psz && (!buf || len < n))
1224 {
1225 if (buf)
1226 *buf++ = *(wxUint32*)psz;
1227 len++;
1228 psz += sizeof(wxUint32);
1229 }
1230
1231 if (buf && len<n)
1232 *buf=0;
1233
1234 return len;
1235 }
1236
1237
1238 // copy 32bit String to 32bit MB
1239 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1240 {
1241 size_t len=0;
1242
1243 while (*psz && (!buf || len < n))
1244 {
1245 if (buf)
1246 {
1247 *(wxUint32*)buf = *psz;
1248 buf += sizeof(wxUint32);
1249 }
1250
1251 len += sizeof(wxUint32);
1252 psz++;
1253 }
1254
1255 if (buf && len<=n-sizeof(wxUint32))
1256 *(wxUint32*)buf=0;
1257
1258 return len;
1259 }
1260
1261
1262 // swap 32bit MB to 32bit String
1263 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1264 {
1265 size_t len=0;
1266
1267 while (*(wxUint32*)psz && (!buf || len < n))
1268 {
1269 if (buf)
1270 {
1271 ((char *)buf)[0] = psz[3];
1272 ((char *)buf)[1] = psz[2];
1273 ((char *)buf)[2] = psz[1];
1274 ((char *)buf)[3] = psz[0];
1275 buf++;
1276 }
1277 len++;
1278 psz += sizeof(wxUint32);
1279 }
1280
1281 if (buf && len<n)
1282 *buf=0;
1283
1284 return len;
1285 }
1286
1287
1288 // swap 32bit String to 32bit MB
1289 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1290 {
1291 size_t len=0;
1292
1293 while (*psz && (!buf || len < n))
1294 {
1295 if (buf)
1296 {
1297 *buf++ = ((char *)psz)[3];
1298 *buf++ = ((char *)psz)[2];
1299 *buf++ = ((char *)psz)[1];
1300 *buf++ = ((char *)psz)[0];
1301 }
1302 len += sizeof(wxUint32);
1303 psz++;
1304 }
1305
1306 if (buf && len<=n-sizeof(wxUint32))
1307 *(wxUint32*)buf=0;
1308
1309 return len;
1310 }
1311
1312
1313 #endif // WC_UTF16
1314
1315
1316 // ============================================================================
1317 // The classes doing conversion using the iconv_xxx() functions
1318 // ============================================================================
1319
1320 #ifdef HAVE_ICONV
1321
1322 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1323 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1324 // (unless there's yet another bug in glibc) the only case when iconv()
1325 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1326 // left in the input buffer -- when _real_ error occurs,
1327 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1328 // iconv() failure.
1329 // [This bug does not appear in glibc 2.2.]
1330 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1331 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1332 (errno != E2BIG || bufLeft != 0))
1333 #else
1334 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1335 #endif
1336
1337 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1338
1339 // ----------------------------------------------------------------------------
1340 // wxMBConv_iconv: encapsulates an iconv character set
1341 // ----------------------------------------------------------------------------
1342
1343 class wxMBConv_iconv : public wxMBConv
1344 {
1345 public:
1346 wxMBConv_iconv(const wxChar *name);
1347 virtual ~wxMBConv_iconv();
1348
1349 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1350 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1351
1352 bool IsOk() const
1353 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1354
1355 protected:
1356 // the iconv handlers used to translate from multibyte to wide char and in
1357 // the other direction
1358 iconv_t m2w,
1359 w2m;
1360 #if wxUSE_THREADS
1361 // guards access to m2w and w2m objects
1362 wxMutex m_iconvMutex;
1363 #endif
1364
1365 private:
1366 // the name (for iconv_open()) of a wide char charset -- if none is
1367 // available on this machine, it will remain NULL
1368 static const char *ms_wcCharsetName;
1369
1370 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1371 // different endian-ness than the native one
1372 static bool ms_wcNeedsSwap;
1373 };
1374
1375 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1376 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1377
1378 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1379 {
1380 // Do it the hard way
1381 char cname[100];
1382 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1383 cname[i] = (char) name[i];
1384
1385 // check for charset that represents wchar_t:
1386 if (ms_wcCharsetName == NULL)
1387 {
1388 ms_wcNeedsSwap = false;
1389
1390 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1391 ms_wcCharsetName = WC_NAME_BEST;
1392 m2w = iconv_open(ms_wcCharsetName, cname);
1393
1394 if (m2w == (iconv_t)-1)
1395 {
1396 // try charset w/o bytesex info (e.g. "UCS4")
1397 // and check for bytesex ourselves:
1398 ms_wcCharsetName = WC_NAME;
1399 m2w = iconv_open(ms_wcCharsetName, cname);
1400
1401 // last bet, try if it knows WCHAR_T pseudo-charset
1402 if (m2w == (iconv_t)-1)
1403 {
1404 ms_wcCharsetName = "WCHAR_T";
1405 m2w = iconv_open(ms_wcCharsetName, cname);
1406 }
1407
1408 if (m2w != (iconv_t)-1)
1409 {
1410 char buf[2], *bufPtr;
1411 wchar_t wbuf[2], *wbufPtr;
1412 size_t insz, outsz;
1413 size_t res;
1414
1415 buf[0] = 'A';
1416 buf[1] = 0;
1417 wbuf[0] = 0;
1418 insz = 2;
1419 outsz = SIZEOF_WCHAR_T * 2;
1420 wbufPtr = wbuf;
1421 bufPtr = buf;
1422
1423 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1424 (char**)&wbufPtr, &outsz);
1425
1426 if (ICONV_FAILED(res, insz))
1427 {
1428 ms_wcCharsetName = NULL;
1429 wxLogLastError(wxT("iconv"));
1430 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1431 }
1432 else
1433 {
1434 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1435 }
1436 }
1437 else
1438 {
1439 ms_wcCharsetName = NULL;
1440
1441 // VS: we must not output an error here, since wxWidgets will safely
1442 // fall back to using wxEncodingConverter.
1443 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1444 //wxLogError(
1445 }
1446 }
1447 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1448 }
1449 else // we already have ms_wcCharsetName
1450 {
1451 m2w = iconv_open(ms_wcCharsetName, cname);
1452 }
1453
1454 // NB: don't ever pass NULL to iconv_open(), it may crash!
1455 if ( ms_wcCharsetName )
1456 {
1457 w2m = iconv_open( cname, ms_wcCharsetName);
1458 }
1459 else
1460 {
1461 w2m = (iconv_t)-1;
1462 }
1463 }
1464
1465 wxMBConv_iconv::~wxMBConv_iconv()
1466 {
1467 if ( m2w != (iconv_t)-1 )
1468 iconv_close(m2w);
1469 if ( w2m != (iconv_t)-1 )
1470 iconv_close(w2m);
1471 }
1472
1473 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1474 {
1475 #if wxUSE_THREADS
1476 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1477 // Unfortunately there is a couple of global wxCSConv objects such as
1478 // wxConvLocal that are used all over wx code, so we have to make sure
1479 // the handle is used by at most one thread at the time. Otherwise
1480 // only a few wx classes would be safe to use from non-main threads
1481 // as MB<->WC conversion would fail "randomly".
1482 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1483 #endif
1484
1485 size_t inbuf = strlen(psz);
1486 size_t outbuf = n * SIZEOF_WCHAR_T;
1487 size_t res, cres;
1488 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1489 wchar_t *bufPtr = buf;
1490 const char *pszPtr = psz;
1491
1492 if (buf)
1493 {
1494 // have destination buffer, convert there
1495 cres = iconv(m2w,
1496 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1497 (char**)&bufPtr, &outbuf);
1498 res = n - (outbuf / SIZEOF_WCHAR_T);
1499
1500 if (ms_wcNeedsSwap)
1501 {
1502 // convert to native endianness
1503 WC_BSWAP(buf /* _not_ bufPtr */, res)
1504 }
1505
1506 // NB: iconv was given only strlen(psz) characters on input, and so
1507 // it couldn't convert the trailing zero. Let's do it ourselves
1508 // if there's some room left for it in the output buffer.
1509 if (res < n)
1510 buf[res] = 0;
1511 }
1512 else
1513 {
1514 // no destination buffer... convert using temp buffer
1515 // to calculate destination buffer requirement
1516 wchar_t tbuf[8];
1517 res = 0;
1518 do {
1519 bufPtr = tbuf;
1520 outbuf = 8*SIZEOF_WCHAR_T;
1521
1522 cres = iconv(m2w,
1523 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1524 (char**)&bufPtr, &outbuf );
1525
1526 res += 8-(outbuf/SIZEOF_WCHAR_T);
1527 } while ((cres==(size_t)-1) && (errno==E2BIG));
1528 }
1529
1530 if (ICONV_FAILED(cres, inbuf))
1531 {
1532 //VS: it is ok if iconv fails, hence trace only
1533 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1534 return (size_t)-1;
1535 }
1536
1537 return res;
1538 }
1539
1540 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1541 {
1542 #if wxUSE_THREADS
1543 // NB: explained in MB2WC
1544 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1545 #endif
1546
1547 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1548 size_t outbuf = n;
1549 size_t res, cres;
1550
1551 wchar_t *tmpbuf = 0;
1552
1553 if (ms_wcNeedsSwap)
1554 {
1555 // need to copy to temp buffer to switch endianness
1556 // this absolutely doesn't rock!
1557 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1558 // could be in read-only memory, or be accessed in some other thread)
1559 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1560 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1561 WC_BSWAP(tmpbuf, inbuf)
1562 psz=tmpbuf;
1563 }
1564
1565 if (buf)
1566 {
1567 // have destination buffer, convert there
1568 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1569
1570 res = n-outbuf;
1571
1572 // NB: iconv was given only wcslen(psz) characters on input, and so
1573 // it couldn't convert the trailing zero. Let's do it ourselves
1574 // if there's some room left for it in the output buffer.
1575 if (res < n)
1576 buf[0] = 0;
1577 }
1578 else
1579 {
1580 // no destination buffer... convert using temp buffer
1581 // to calculate destination buffer requirement
1582 char tbuf[16];
1583 res = 0;
1584 do {
1585 buf = tbuf; outbuf = 16;
1586
1587 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1588
1589 res += 16 - outbuf;
1590 } while ((cres==(size_t)-1) && (errno==E2BIG));
1591 }
1592
1593 if (ms_wcNeedsSwap)
1594 {
1595 free(tmpbuf);
1596 }
1597
1598 if (ICONV_FAILED(cres, inbuf))
1599 {
1600 //VS: it is ok if iconv fails, hence trace only
1601 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1602 return (size_t)-1;
1603 }
1604
1605 return res;
1606 }
1607
1608 #endif // HAVE_ICONV
1609
1610
1611 // ============================================================================
1612 // Win32 conversion classes
1613 // ============================================================================
1614
1615 #ifdef wxHAVE_WIN32_MB2WC
1616
1617 // from utils.cpp
1618 #if wxUSE_FONTMAP
1619 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1620 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1621 #endif
1622
1623 class wxMBConv_win32 : public wxMBConv
1624 {
1625 public:
1626 wxMBConv_win32()
1627 {
1628 m_CodePage = CP_ACP;
1629 }
1630
1631 #if wxUSE_FONTMAP
1632 wxMBConv_win32(const wxChar* name)
1633 {
1634 m_CodePage = wxCharsetToCodepage(name);
1635 }
1636
1637 wxMBConv_win32(wxFontEncoding encoding)
1638 {
1639 m_CodePage = wxEncodingToCodepage(encoding);
1640 }
1641 #endif
1642
1643 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1644 {
1645 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1646 // the behaviour is not compatible with the Unix version (using iconv)
1647 // and break the library itself, e.g. wxTextInputStream::NextChar()
1648 // wouldn't work if reading an incomplete MB char didn't result in an
1649 // error
1650 //
1651 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1652 // an error (tested under Windows Server 2003) and apparently it is
1653 // done on purpose, i.e. the function accepts any input in this case
1654 // and although I'd prefer to return error on ill-formed output, our
1655 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1656 // explicitly ill-formed according to RFC 2152) neither so we don't
1657 // even have any fallback here...
1658 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1659
1660 const size_t len = ::MultiByteToWideChar
1661 (
1662 m_CodePage, // code page
1663 flags, // flags: fall on error
1664 psz, // input string
1665 -1, // its length (NUL-terminated)
1666 buf, // output string
1667 buf ? n : 0 // size of output buffer
1668 );
1669
1670 // note that it returns count of written chars for buf != NULL and size
1671 // of the needed buffer for buf == NULL so in either case the length of
1672 // the string (which never includes the terminating NUL) is one less
1673 return len ? len - 1 : (size_t)-1;
1674 }
1675
1676 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1677 {
1678 /*
1679 we have a problem here: by default, WideCharToMultiByte() may
1680 replace characters unrepresentable in the target code page with bad
1681 quality approximations such as turning "1/2" symbol (U+00BD) into
1682 "1" for the code pages which don't have it and we, obviously, want
1683 to avoid this at any price
1684
1685 the trouble is that this function does it _silently_, i.e. it won't
1686 even tell us whether it did or not... Win98/2000 and higher provide
1687 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1688 we have to resort to a round trip, i.e. check that converting back
1689 results in the same string -- this is, of course, expensive but
1690 otherwise we simply can't be sure to not garble the data.
1691 */
1692
1693 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1694 // it doesn't work with CJK encodings (which we test for rather roughly
1695 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1696 // supporting it
1697 BOOL usedDef wxDUMMY_INITIALIZE(false);
1698 BOOL *pUsedDef;
1699 int flags;
1700 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1701 {
1702 // it's our lucky day
1703 flags = WC_NO_BEST_FIT_CHARS;
1704 pUsedDef = &usedDef;
1705 }
1706 else // old system or unsupported encoding
1707 {
1708 flags = 0;
1709 pUsedDef = NULL;
1710 }
1711
1712 const size_t len = ::WideCharToMultiByte
1713 (
1714 m_CodePage, // code page
1715 flags, // either none or no best fit
1716 pwz, // input string
1717 -1, // it is (wide) NUL-terminated
1718 buf, // output buffer
1719 buf ? n : 0, // and its size
1720 NULL, // default "replacement" char
1721 pUsedDef // [out] was it used?
1722 );
1723
1724 if ( !len )
1725 {
1726 // function totally failed
1727 return (size_t)-1;
1728 }
1729
1730 // if we were really converting, check if we succeeded
1731 if ( buf )
1732 {
1733 if ( flags )
1734 {
1735 // check if the conversion failed, i.e. if any replacements
1736 // were done
1737 if ( usedDef )
1738 return (size_t)-1;
1739 }
1740 else // we must resort to double tripping...
1741 {
1742 wxWCharBuffer wcBuf(n);
1743 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1744 wcscmp(wcBuf, pwz) != 0 )
1745 {
1746 // we didn't obtain the same thing we started from, hence
1747 // the conversion was lossy and we consider that it failed
1748 return (size_t)-1;
1749 }
1750 }
1751 }
1752
1753 // see the comment above for the reason of "len - 1"
1754 return len - 1;
1755 }
1756
1757 bool IsOk() const { return m_CodePage != -1; }
1758
1759 private:
1760 static bool CanUseNoBestFit()
1761 {
1762 static int s_isWin98Or2k = -1;
1763
1764 if ( s_isWin98Or2k == -1 )
1765 {
1766 int verMaj, verMin;
1767 switch ( wxGetOsVersion(&verMaj, &verMin) )
1768 {
1769 case wxWIN95:
1770 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1771 break;
1772
1773 case wxWINDOWS_NT:
1774 s_isWin98Or2k = verMaj >= 5;
1775 break;
1776
1777 default:
1778 // unknown, be conseravtive by default
1779 s_isWin98Or2k = 0;
1780 }
1781
1782 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1783 }
1784
1785 return s_isWin98Or2k == 1;
1786 }
1787
1788 long m_CodePage;
1789 };
1790
1791 #endif // wxHAVE_WIN32_MB2WC
1792
1793 // ============================================================================
1794 // Cocoa conversion classes
1795 // ============================================================================
1796
1797 #if defined(__WXCOCOA__)
1798
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1802
1803 #include <CoreFoundation/CFString.h>
1804 #include <CoreFoundation/CFStringEncodingExt.h>
1805
1806 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1807 {
1808 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1809 if ( encoding == wxFONTENCODING_DEFAULT )
1810 {
1811 enc = CFStringGetSystemEncoding();
1812 }
1813 else switch( encoding)
1814 {
1815 case wxFONTENCODING_ISO8859_1 :
1816 enc = kCFStringEncodingISOLatin1 ;
1817 break ;
1818 case wxFONTENCODING_ISO8859_2 :
1819 enc = kCFStringEncodingISOLatin2;
1820 break ;
1821 case wxFONTENCODING_ISO8859_3 :
1822 enc = kCFStringEncodingISOLatin3 ;
1823 break ;
1824 case wxFONTENCODING_ISO8859_4 :
1825 enc = kCFStringEncodingISOLatin4;
1826 break ;
1827 case wxFONTENCODING_ISO8859_5 :
1828 enc = kCFStringEncodingISOLatinCyrillic;
1829 break ;
1830 case wxFONTENCODING_ISO8859_6 :
1831 enc = kCFStringEncodingISOLatinArabic;
1832 break ;
1833 case wxFONTENCODING_ISO8859_7 :
1834 enc = kCFStringEncodingISOLatinGreek;
1835 break ;
1836 case wxFONTENCODING_ISO8859_8 :
1837 enc = kCFStringEncodingISOLatinHebrew;
1838 break ;
1839 case wxFONTENCODING_ISO8859_9 :
1840 enc = kCFStringEncodingISOLatin5;
1841 break ;
1842 case wxFONTENCODING_ISO8859_10 :
1843 enc = kCFStringEncodingISOLatin6;
1844 break ;
1845 case wxFONTENCODING_ISO8859_11 :
1846 enc = kCFStringEncodingISOLatinThai;
1847 break ;
1848 case wxFONTENCODING_ISO8859_13 :
1849 enc = kCFStringEncodingISOLatin7;
1850 break ;
1851 case wxFONTENCODING_ISO8859_14 :
1852 enc = kCFStringEncodingISOLatin8;
1853 break ;
1854 case wxFONTENCODING_ISO8859_15 :
1855 enc = kCFStringEncodingISOLatin9;
1856 break ;
1857
1858 case wxFONTENCODING_KOI8 :
1859 enc = kCFStringEncodingKOI8_R;
1860 break ;
1861 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1862 enc = kCFStringEncodingDOSRussian;
1863 break ;
1864
1865 // case wxFONTENCODING_BULGARIAN :
1866 // enc = ;
1867 // break ;
1868
1869 case wxFONTENCODING_CP437 :
1870 enc =kCFStringEncodingDOSLatinUS ;
1871 break ;
1872 case wxFONTENCODING_CP850 :
1873 enc = kCFStringEncodingDOSLatin1;
1874 break ;
1875 case wxFONTENCODING_CP852 :
1876 enc = kCFStringEncodingDOSLatin2;
1877 break ;
1878 case wxFONTENCODING_CP855 :
1879 enc = kCFStringEncodingDOSCyrillic;
1880 break ;
1881 case wxFONTENCODING_CP866 :
1882 enc =kCFStringEncodingDOSRussian ;
1883 break ;
1884 case wxFONTENCODING_CP874 :
1885 enc = kCFStringEncodingDOSThai;
1886 break ;
1887 case wxFONTENCODING_CP932 :
1888 enc = kCFStringEncodingDOSJapanese;
1889 break ;
1890 case wxFONTENCODING_CP936 :
1891 enc =kCFStringEncodingDOSChineseSimplif ;
1892 break ;
1893 case wxFONTENCODING_CP949 :
1894 enc = kCFStringEncodingDOSKorean;
1895 break ;
1896 case wxFONTENCODING_CP950 :
1897 enc = kCFStringEncodingDOSChineseTrad;
1898 break ;
1899 case wxFONTENCODING_CP1250 :
1900 enc = kCFStringEncodingWindowsLatin2;
1901 break ;
1902 case wxFONTENCODING_CP1251 :
1903 enc =kCFStringEncodingWindowsCyrillic ;
1904 break ;
1905 case wxFONTENCODING_CP1252 :
1906 enc =kCFStringEncodingWindowsLatin1 ;
1907 break ;
1908 case wxFONTENCODING_CP1253 :
1909 enc = kCFStringEncodingWindowsGreek;
1910 break ;
1911 case wxFONTENCODING_CP1254 :
1912 enc = kCFStringEncodingWindowsLatin5;
1913 break ;
1914 case wxFONTENCODING_CP1255 :
1915 enc =kCFStringEncodingWindowsHebrew ;
1916 break ;
1917 case wxFONTENCODING_CP1256 :
1918 enc =kCFStringEncodingWindowsArabic ;
1919 break ;
1920 case wxFONTENCODING_CP1257 :
1921 enc = kCFStringEncodingWindowsBalticRim;
1922 break ;
1923 // This only really encodes to UTF7 (if that) evidently
1924 // case wxFONTENCODING_UTF7 :
1925 // enc = kCFStringEncodingNonLossyASCII ;
1926 // break ;
1927 case wxFONTENCODING_UTF8 :
1928 enc = kCFStringEncodingUTF8 ;
1929 break ;
1930 case wxFONTENCODING_EUC_JP :
1931 enc = kCFStringEncodingEUC_JP;
1932 break ;
1933 case wxFONTENCODING_UTF16 :
1934 enc = kCFStringEncodingUnicode ;
1935 break ;
1936 case wxFONTENCODING_MACROMAN :
1937 enc = kCFStringEncodingMacRoman ;
1938 break ;
1939 case wxFONTENCODING_MACJAPANESE :
1940 enc = kCFStringEncodingMacJapanese ;
1941 break ;
1942 case wxFONTENCODING_MACCHINESETRAD :
1943 enc = kCFStringEncodingMacChineseTrad ;
1944 break ;
1945 case wxFONTENCODING_MACKOREAN :
1946 enc = kCFStringEncodingMacKorean ;
1947 break ;
1948 case wxFONTENCODING_MACARABIC :
1949 enc = kCFStringEncodingMacArabic ;
1950 break ;
1951 case wxFONTENCODING_MACHEBREW :
1952 enc = kCFStringEncodingMacHebrew ;
1953 break ;
1954 case wxFONTENCODING_MACGREEK :
1955 enc = kCFStringEncodingMacGreek ;
1956 break ;
1957 case wxFONTENCODING_MACCYRILLIC :
1958 enc = kCFStringEncodingMacCyrillic ;
1959 break ;
1960 case wxFONTENCODING_MACDEVANAGARI :
1961 enc = kCFStringEncodingMacDevanagari ;
1962 break ;
1963 case wxFONTENCODING_MACGURMUKHI :
1964 enc = kCFStringEncodingMacGurmukhi ;
1965 break ;
1966 case wxFONTENCODING_MACGUJARATI :
1967 enc = kCFStringEncodingMacGujarati ;
1968 break ;
1969 case wxFONTENCODING_MACORIYA :
1970 enc = kCFStringEncodingMacOriya ;
1971 break ;
1972 case wxFONTENCODING_MACBENGALI :
1973 enc = kCFStringEncodingMacBengali ;
1974 break ;
1975 case wxFONTENCODING_MACTAMIL :
1976 enc = kCFStringEncodingMacTamil ;
1977 break ;
1978 case wxFONTENCODING_MACTELUGU :
1979 enc = kCFStringEncodingMacTelugu ;
1980 break ;
1981 case wxFONTENCODING_MACKANNADA :
1982 enc = kCFStringEncodingMacKannada ;
1983 break ;
1984 case wxFONTENCODING_MACMALAJALAM :
1985 enc = kCFStringEncodingMacMalayalam ;
1986 break ;
1987 case wxFONTENCODING_MACSINHALESE :
1988 enc = kCFStringEncodingMacSinhalese ;
1989 break ;
1990 case wxFONTENCODING_MACBURMESE :
1991 enc = kCFStringEncodingMacBurmese ;
1992 break ;
1993 case wxFONTENCODING_MACKHMER :
1994 enc = kCFStringEncodingMacKhmer ;
1995 break ;
1996 case wxFONTENCODING_MACTHAI :
1997 enc = kCFStringEncodingMacThai ;
1998 break ;
1999 case wxFONTENCODING_MACLAOTIAN :
2000 enc = kCFStringEncodingMacLaotian ;
2001 break ;
2002 case wxFONTENCODING_MACGEORGIAN :
2003 enc = kCFStringEncodingMacGeorgian ;
2004 break ;
2005 case wxFONTENCODING_MACARMENIAN :
2006 enc = kCFStringEncodingMacArmenian ;
2007 break ;
2008 case wxFONTENCODING_MACCHINESESIMP :
2009 enc = kCFStringEncodingMacChineseSimp ;
2010 break ;
2011 case wxFONTENCODING_MACTIBETAN :
2012 enc = kCFStringEncodingMacTibetan ;
2013 break ;
2014 case wxFONTENCODING_MACMONGOLIAN :
2015 enc = kCFStringEncodingMacMongolian ;
2016 break ;
2017 case wxFONTENCODING_MACETHIOPIC :
2018 enc = kCFStringEncodingMacEthiopic ;
2019 break ;
2020 case wxFONTENCODING_MACCENTRALEUR :
2021 enc = kCFStringEncodingMacCentralEurRoman ;
2022 break ;
2023 case wxFONTENCODING_MACVIATNAMESE :
2024 enc = kCFStringEncodingMacVietnamese ;
2025 break ;
2026 case wxFONTENCODING_MACARABICEXT :
2027 enc = kCFStringEncodingMacExtArabic ;
2028 break ;
2029 case wxFONTENCODING_MACSYMBOL :
2030 enc = kCFStringEncodingMacSymbol ;
2031 break ;
2032 case wxFONTENCODING_MACDINGBATS :
2033 enc = kCFStringEncodingMacDingbats ;
2034 break ;
2035 case wxFONTENCODING_MACTURKISH :
2036 enc = kCFStringEncodingMacTurkish ;
2037 break ;
2038 case wxFONTENCODING_MACCROATIAN :
2039 enc = kCFStringEncodingMacCroatian ;
2040 break ;
2041 case wxFONTENCODING_MACICELANDIC :
2042 enc = kCFStringEncodingMacIcelandic ;
2043 break ;
2044 case wxFONTENCODING_MACROMANIAN :
2045 enc = kCFStringEncodingMacRomanian ;
2046 break ;
2047 case wxFONTENCODING_MACCELTIC :
2048 enc = kCFStringEncodingMacCeltic ;
2049 break ;
2050 case wxFONTENCODING_MACGAELIC :
2051 enc = kCFStringEncodingMacGaelic ;
2052 break ;
2053 // case wxFONTENCODING_MACKEYBOARD :
2054 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2055 // break ;
2056 default :
2057 // because gcc is picky
2058 break ;
2059 } ;
2060 return enc ;
2061 }
2062
2063 class wxMBConv_cocoa : public wxMBConv
2064 {
2065 public:
2066 wxMBConv_cocoa()
2067 {
2068 Init(CFStringGetSystemEncoding()) ;
2069 }
2070
2071 #if wxUSE_FONTMAP
2072 wxMBConv_cocoa(const wxChar* name)
2073 {
2074 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2075 }
2076 #endif
2077
2078 wxMBConv_cocoa(wxFontEncoding encoding)
2079 {
2080 Init( wxCFStringEncFromFontEnc(encoding) );
2081 }
2082
2083 ~wxMBConv_cocoa()
2084 {
2085 }
2086
2087 void Init( CFStringEncoding encoding)
2088 {
2089 m_encoding = encoding ;
2090 }
2091
2092 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2093 {
2094 wxASSERT(szUnConv);
2095
2096 CFStringRef theString = CFStringCreateWithBytes (
2097 NULL, //the allocator
2098 (const UInt8*)szUnConv,
2099 strlen(szUnConv),
2100 m_encoding,
2101 false //no BOM/external representation
2102 );
2103
2104 wxASSERT(theString);
2105
2106 size_t nOutLength = CFStringGetLength(theString);
2107
2108 if (szOut == NULL)
2109 {
2110 CFRelease(theString);
2111 return nOutLength;
2112 }
2113
2114 CFRange theRange = { 0, nOutSize };
2115
2116 #if SIZEOF_WCHAR_T == 4
2117 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2118 #endif
2119
2120 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2121
2122 CFRelease(theString);
2123
2124 szUniCharBuffer[nOutLength] = '\0' ;
2125
2126 #if SIZEOF_WCHAR_T == 4
2127 wxMBConvUTF16 converter ;
2128 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2129 delete[] szUniCharBuffer;
2130 #endif
2131
2132 return nOutLength;
2133 }
2134
2135 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2136 {
2137 wxASSERT(szUnConv);
2138
2139 size_t nRealOutSize;
2140 size_t nBufSize = wxWcslen(szUnConv);
2141 UniChar* szUniBuffer = (UniChar*) szUnConv;
2142
2143 #if SIZEOF_WCHAR_T == 4
2144 wxMBConvUTF16BE converter ;
2145 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2146 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2147 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2148 nBufSize /= sizeof(UniChar);
2149 #endif
2150
2151 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2152 NULL, //allocator
2153 szUniBuffer,
2154 nBufSize,
2155 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2156 );
2157
2158 wxASSERT(theString);
2159
2160 //Note that CER puts a BOM when converting to unicode
2161 //so we check and use getchars instead in that case
2162 if (m_encoding == kCFStringEncodingUnicode)
2163 {
2164 if (szOut != NULL)
2165 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2166
2167 nRealOutSize = CFStringGetLength(theString) + 1;
2168 }
2169 else
2170 {
2171 CFStringGetBytes(
2172 theString,
2173 CFRangeMake(0, CFStringGetLength(theString)),
2174 m_encoding,
2175 0, //what to put in characters that can't be converted -
2176 //0 tells CFString to return NULL if it meets such a character
2177 false, //not an external representation
2178 (UInt8*) szOut,
2179 nOutSize,
2180 (CFIndex*) &nRealOutSize
2181 );
2182 }
2183
2184 CFRelease(theString);
2185
2186 #if SIZEOF_WCHAR_T == 4
2187 delete[] szUniBuffer;
2188 #endif
2189
2190 return nRealOutSize - 1;
2191 }
2192
2193 bool IsOk() const
2194 {
2195 return m_encoding != kCFStringEncodingInvalidId &&
2196 CFStringIsEncodingAvailable(m_encoding);
2197 }
2198
2199 private:
2200 CFStringEncoding m_encoding ;
2201 };
2202
2203 #endif // defined(__WXCOCOA__)
2204
2205 // ============================================================================
2206 // Mac conversion classes
2207 // ============================================================================
2208
2209 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2210
2211 class wxMBConv_mac : public wxMBConv
2212 {
2213 public:
2214 wxMBConv_mac()
2215 {
2216 Init(CFStringGetSystemEncoding()) ;
2217 }
2218
2219 #if wxUSE_FONTMAP
2220 wxMBConv_mac(const wxChar* name)
2221 {
2222 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2223 }
2224 #endif
2225
2226 wxMBConv_mac(wxFontEncoding encoding)
2227 {
2228 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2229 }
2230
2231 ~wxMBConv_mac()
2232 {
2233 OSStatus status = noErr ;
2234 status = TECDisposeConverter(m_MB2WC_converter);
2235 status = TECDisposeConverter(m_WC2MB_converter);
2236 }
2237
2238
2239 void Init( TextEncodingBase encoding)
2240 {
2241 OSStatus status = noErr ;
2242 m_char_encoding = encoding ;
2243 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2244
2245 status = TECCreateConverter(&m_MB2WC_converter,
2246 m_char_encoding,
2247 m_unicode_encoding);
2248 status = TECCreateConverter(&m_WC2MB_converter,
2249 m_unicode_encoding,
2250 m_char_encoding);
2251 }
2252
2253 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2254 {
2255 OSStatus status = noErr ;
2256 ByteCount byteOutLen ;
2257 ByteCount byteInLen = strlen(psz) ;
2258 wchar_t *tbuf = NULL ;
2259 UniChar* ubuf = NULL ;
2260 size_t res = 0 ;
2261
2262 if (buf == NULL)
2263 {
2264 //apple specs say at least 32
2265 n = wxMax( 32 , byteInLen ) ;
2266 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2267 }
2268 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2269 #if SIZEOF_WCHAR_T == 4
2270 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2271 #else
2272 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2273 #endif
2274 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2275 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2276 #if SIZEOF_WCHAR_T == 4
2277 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2278 // is not properly terminated we get random characters at the end
2279 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2280 wxMBConvUTF16BE converter ;
2281 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2282 free( ubuf ) ;
2283 #else
2284 res = byteOutLen / sizeof( UniChar ) ;
2285 #endif
2286 if ( buf == NULL )
2287 free(tbuf) ;
2288
2289 if ( buf && res < n)
2290 buf[res] = 0;
2291
2292 return res ;
2293 }
2294
2295 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2296 {
2297 OSStatus status = noErr ;
2298 ByteCount byteOutLen ;
2299 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2300
2301 char *tbuf = NULL ;
2302
2303 if (buf == NULL)
2304 {
2305 //apple specs say at least 32
2306 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2307 tbuf = (char*) malloc( n ) ;
2308 }
2309
2310 ByteCount byteBufferLen = n ;
2311 UniChar* ubuf = NULL ;
2312 #if SIZEOF_WCHAR_T == 4
2313 wxMBConvUTF16BE converter ;
2314 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2315 byteInLen = unicharlen ;
2316 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2317 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2318 #else
2319 ubuf = (UniChar*) psz ;
2320 #endif
2321 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2322 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2323 #if SIZEOF_WCHAR_T == 4
2324 free( ubuf ) ;
2325 #endif
2326 if ( buf == NULL )
2327 free(tbuf) ;
2328
2329 size_t res = byteOutLen ;
2330 if ( buf && res < n)
2331 {
2332 buf[res] = 0;
2333
2334 //we need to double-trip to verify it didn't insert any ? in place
2335 //of bogus characters
2336 wxWCharBuffer wcBuf(n);
2337 size_t pszlen = wxWcslen(psz);
2338 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2339 wxWcslen(wcBuf) != pszlen ||
2340 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2341 {
2342 // we didn't obtain the same thing we started from, hence
2343 // the conversion was lossy and we consider that it failed
2344 return (size_t)-1;
2345 }
2346 }
2347
2348 return res ;
2349 }
2350
2351 bool IsOk() const
2352 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2353
2354 private:
2355 TECObjectRef m_MB2WC_converter ;
2356 TECObjectRef m_WC2MB_converter ;
2357
2358 TextEncodingBase m_char_encoding ;
2359 TextEncodingBase m_unicode_encoding ;
2360 };
2361
2362 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2363
2364 // ============================================================================
2365 // wxEncodingConverter based conversion classes
2366 // ============================================================================
2367
2368 #if wxUSE_FONTMAP
2369
2370 class wxMBConv_wxwin : public wxMBConv
2371 {
2372 private:
2373 void Init()
2374 {
2375 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2376 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2377 }
2378
2379 public:
2380 // temporarily just use wxEncodingConverter stuff,
2381 // so that it works while a better implementation is built
2382 wxMBConv_wxwin(const wxChar* name)
2383 {
2384 if (name)
2385 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2386 else
2387 m_enc = wxFONTENCODING_SYSTEM;
2388
2389 Init();
2390 }
2391
2392 wxMBConv_wxwin(wxFontEncoding enc)
2393 {
2394 m_enc = enc;
2395
2396 Init();
2397 }
2398
2399 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2400 {
2401 size_t inbuf = strlen(psz);
2402 if (buf)
2403 {
2404 if (!m2w.Convert(psz,buf))
2405 return (size_t)-1;
2406 }
2407 return inbuf;
2408 }
2409
2410 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2411 {
2412 const size_t inbuf = wxWcslen(psz);
2413 if (buf)
2414 {
2415 if (!w2m.Convert(psz,buf))
2416 return (size_t)-1;
2417 }
2418
2419 return inbuf;
2420 }
2421
2422 bool IsOk() const { return m_ok; }
2423
2424 public:
2425 wxFontEncoding m_enc;
2426 wxEncodingConverter m2w, w2m;
2427
2428 // were we initialized successfully?
2429 bool m_ok;
2430
2431 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2432 };
2433
2434 #endif // wxUSE_FONTMAP
2435
2436 // ============================================================================
2437 // wxCSConv implementation
2438 // ============================================================================
2439
2440 void wxCSConv::Init()
2441 {
2442 m_name = NULL;
2443 m_convReal = NULL;
2444 m_deferred = true;
2445 }
2446
2447 wxCSConv::wxCSConv(const wxChar *charset)
2448 {
2449 Init();
2450
2451 if ( charset )
2452 {
2453 SetName(charset);
2454 }
2455
2456 m_encoding = wxFONTENCODING_SYSTEM;
2457 }
2458
2459 wxCSConv::wxCSConv(wxFontEncoding encoding)
2460 {
2461 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2462 {
2463 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2464
2465 encoding = wxFONTENCODING_SYSTEM;
2466 }
2467
2468 Init();
2469
2470 m_encoding = encoding;
2471 }
2472
2473 wxCSConv::~wxCSConv()
2474 {
2475 Clear();
2476 }
2477
2478 wxCSConv::wxCSConv(const wxCSConv& conv)
2479 : wxMBConv()
2480 {
2481 Init();
2482
2483 SetName(conv.m_name);
2484 m_encoding = conv.m_encoding;
2485 }
2486
2487 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2488 {
2489 Clear();
2490
2491 SetName(conv.m_name);
2492 m_encoding = conv.m_encoding;
2493
2494 return *this;
2495 }
2496
2497 void wxCSConv::Clear()
2498 {
2499 free(m_name);
2500 delete m_convReal;
2501
2502 m_name = NULL;
2503 m_convReal = NULL;
2504 }
2505
2506 void wxCSConv::SetName(const wxChar *charset)
2507 {
2508 if (charset)
2509 {
2510 m_name = wxStrdup(charset);
2511 m_deferred = true;
2512 }
2513 }
2514
2515 wxMBConv *wxCSConv::DoCreate() const
2516 {
2517 // check for the special case of ASCII or ISO8859-1 charset: as we have
2518 // special knowledge of it anyhow, we don't need to create a special
2519 // conversion object
2520 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2521 {
2522 // don't convert at all
2523 return NULL;
2524 }
2525
2526 // we trust OS to do conversion better than we can so try external
2527 // conversion methods first
2528 //
2529 // the full order is:
2530 // 1. OS conversion (iconv() under Unix or Win32 API)
2531 // 2. hard coded conversions for UTF
2532 // 3. wxEncodingConverter as fall back
2533
2534 // step (1)
2535 #ifdef HAVE_ICONV
2536 #if !wxUSE_FONTMAP
2537 if ( m_name )
2538 #endif // !wxUSE_FONTMAP
2539 {
2540 wxString name(m_name);
2541
2542 #if wxUSE_FONTMAP
2543 if ( name.empty() )
2544 name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2545 #endif // wxUSE_FONTMAP
2546
2547 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2548 if ( conv->IsOk() )
2549 return conv;
2550
2551 delete conv;
2552 }
2553 #endif // HAVE_ICONV
2554
2555 #ifdef wxHAVE_WIN32_MB2WC
2556 {
2557 #if wxUSE_FONTMAP
2558 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2559 : new wxMBConv_win32(m_encoding);
2560 if ( conv->IsOk() )
2561 return conv;
2562
2563 delete conv;
2564 #else
2565 return NULL;
2566 #endif
2567 }
2568 #endif // wxHAVE_WIN32_MB2WC
2569 #if defined(__WXMAC__)
2570 {
2571 // leave UTF16 and UTF32 to the built-ins of wx
2572 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2573 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2574 {
2575
2576 #if wxUSE_FONTMAP
2577 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2578 : new wxMBConv_mac(m_encoding);
2579 #else
2580 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2581 #endif
2582 if ( conv->IsOk() )
2583 return conv;
2584
2585 delete conv;
2586 }
2587 }
2588 #endif
2589 #if defined(__WXCOCOA__)
2590 {
2591 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2592 {
2593
2594 #if wxUSE_FONTMAP
2595 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2596 : new wxMBConv_cocoa(m_encoding);
2597 #else
2598 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2599 #endif
2600 if ( conv->IsOk() )
2601 return conv;
2602
2603 delete conv;
2604 }
2605 }
2606 #endif
2607 // step (2)
2608 wxFontEncoding enc = m_encoding;
2609 #if wxUSE_FONTMAP
2610 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2611 {
2612 // use "false" to suppress interactive dialogs -- we can be called from
2613 // anywhere and popping up a dialog from here is the last thing we want to
2614 // do
2615 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2616 }
2617 #endif // wxUSE_FONTMAP
2618
2619 switch ( enc )
2620 {
2621 case wxFONTENCODING_UTF7:
2622 return new wxMBConvUTF7;
2623
2624 case wxFONTENCODING_UTF8:
2625 return new wxMBConvUTF8;
2626
2627 case wxFONTENCODING_UTF16BE:
2628 return new wxMBConvUTF16BE;
2629
2630 case wxFONTENCODING_UTF16LE:
2631 return new wxMBConvUTF16LE;
2632
2633 case wxFONTENCODING_UTF32BE:
2634 return new wxMBConvUTF32BE;
2635
2636 case wxFONTENCODING_UTF32LE:
2637 return new wxMBConvUTF32LE;
2638
2639 default:
2640 // nothing to do but put here to suppress gcc warnings
2641 ;
2642 }
2643
2644 // step (3)
2645 #if wxUSE_FONTMAP
2646 {
2647 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2648 : new wxMBConv_wxwin(m_encoding);
2649 if ( conv->IsOk() )
2650 return conv;
2651
2652 delete conv;
2653 }
2654 #endif // wxUSE_FONTMAP
2655
2656 // NB: This is a hack to prevent deadlock. What could otherwise happen
2657 // in Unicode build: wxConvLocal creation ends up being here
2658 // because of some failure and logs the error. But wxLog will try to
2659 // attach timestamp, for which it will need wxConvLocal (to convert
2660 // time to char* and then wchar_t*), but that fails, tries to log
2661 // error, but wxLog has a (already locked) critical section that
2662 // guards static buffer.
2663 static bool alreadyLoggingError = false;
2664 if (!alreadyLoggingError)
2665 {
2666 alreadyLoggingError = true;
2667 wxLogError(_("Cannot convert from the charset '%s'!"),
2668 m_name ? m_name
2669 :
2670 #if wxUSE_FONTMAP
2671 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2672 #else // !wxUSE_FONTMAP
2673 wxString::Format(_("encoding %s"), m_encoding).c_str()
2674 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2675 );
2676 alreadyLoggingError = false;
2677 }
2678
2679 return NULL;
2680 }
2681
2682 void wxCSConv::CreateConvIfNeeded() const
2683 {
2684 if ( m_deferred )
2685 {
2686 wxCSConv *self = (wxCSConv *)this; // const_cast
2687
2688 #if wxUSE_INTL
2689 // if we don't have neither the name nor the encoding, use the default
2690 // encoding for this system
2691 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2692 {
2693 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2694 }
2695 #endif // wxUSE_INTL
2696
2697 self->m_convReal = DoCreate();
2698 self->m_deferred = false;
2699 }
2700 }
2701
2702 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2703 {
2704 CreateConvIfNeeded();
2705
2706 if (m_convReal)
2707 return m_convReal->MB2WC(buf, psz, n);
2708
2709 // latin-1 (direct)
2710 size_t len = strlen(psz);
2711
2712 if (buf)
2713 {
2714 for (size_t c = 0; c <= len; c++)
2715 buf[c] = (unsigned char)(psz[c]);
2716 }
2717
2718 return len;
2719 }
2720
2721 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2722 {
2723 CreateConvIfNeeded();
2724
2725 if (m_convReal)
2726 return m_convReal->WC2MB(buf, psz, n);
2727
2728 // latin-1 (direct)
2729 const size_t len = wxWcslen(psz);
2730 if (buf)
2731 {
2732 for (size_t c = 0; c <= len; c++)
2733 {
2734 if (psz[c] > 0xFF)
2735 return (size_t)-1;
2736 buf[c] = (char)psz[c];
2737 }
2738 }
2739 else
2740 {
2741 for (size_t c = 0; c <= len; c++)
2742 {
2743 if (psz[c] > 0xFF)
2744 return (size_t)-1;
2745 }
2746 }
2747
2748 return len;
2749 }
2750
2751 // ----------------------------------------------------------------------------
2752 // globals
2753 // ----------------------------------------------------------------------------
2754
2755 #ifdef __WINDOWS__
2756 static wxMBConv_win32 wxConvLibcObj;
2757 #elif defined(__WXMAC__) && !defined(__MACH__)
2758 static wxMBConv_mac wxConvLibcObj ;
2759 #else
2760 static wxMBConvLibc wxConvLibcObj;
2761 #endif
2762
2763 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2764 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2765 static wxMBConvUTF7 wxConvUTF7Obj;
2766 static wxMBConvUTF8 wxConvUTF8Obj;
2767
2768 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2769 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2770 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2771 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2772 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2773 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2774 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2775 #ifdef __WXOSX__
2776 wxConvUTF8Obj;
2777 #else
2778 wxConvLibcObj;
2779 #endif
2780
2781
2782 #else // !wxUSE_WCHAR_T
2783
2784 // stand-ins in absence of wchar_t
2785 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2786 wxConvISO8859_1,
2787 wxConvLocal,
2788 wxConvUTF8;
2789
2790 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2791
2792