]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Use wxMBConvUTF8 when G_FILENAME_ENCODING is UTF-8, and set
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
62
63 // ----------------------------------------------------------------------------
64 // headers
65 // ----------------------------------------------------------------------------
66
67 #ifdef __SALFORDC__
68 #include <clib.h>
69 #endif
70
71 #ifdef HAVE_ICONV
72 #include <iconv.h>
73 #include "wx/thread.h"
74 #endif
75
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
78 #include "wx/utils.h"
79
80 #ifdef __WXMAC__
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
84
85 #include "wx/mac/private.h" // includes mac headers
86 #endif
87 // ----------------------------------------------------------------------------
88 // macros
89 // ----------------------------------------------------------------------------
90
// Byte-swap, in place, each of the first `len` elements of the array `str`.
//
// The arguments are parenthesized so that expressions can be passed safely.
// The expansion is deliberately kept as a plain brace block (not a
// do { } while (0)) so that its shape is the same as before.
#define BSWAP_UCS4(str, len) \
    { unsigned _c; for (_c = 0; _c < (len); _c++) (str)[_c] = wxUINT32_SWAP_ALWAYS((str)[_c]); }
#define BSWAP_UTF16(str, len) \
    { unsigned _c; for (_c = 0; _c < (len); _c++) (str)[_c] = wxUINT16_SWAP_ALWAYS((str)[_c]); }
93
94 #if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
99 #else
100 #define WC_NAME_BEST "UCS-4LE"
101 #endif
102 #elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
105 #define WC_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
108 #else
109 #define WC_NAME_BEST "UTF-16LE"
110 #endif
111 #else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
114 #endif
115
116 // ============================================================================
117 // implementation
118 // ============================================================================
119
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
123
124
125 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
126 {
127 if (input<=0xffff)
128 {
129 if (output)
130 *output = (wxUint16) input;
131 return 1;
132 }
133 else if (input>=0x110000)
134 {
135 return (size_t)-1;
136 }
137 else
138 {
139 if (output)
140 {
141 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
142 *output = (wxUint16) ((input&0x3ff)+0xdc00);
143 }
144 return 2;
145 }
146 }
147
148 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
149 {
150 if ((*input<0xd800) || (*input>0xdfff))
151 {
152 output = *input;
153 return 1;
154 }
155 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
156 {
157 output = *input;
158 return (size_t)-1;
159 }
160 else
161 {
162 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
163 return 2;
164 }
165 }
166
167
168 // ----------------------------------------------------------------------------
169 // wxMBConv
170 // ----------------------------------------------------------------------------
171
172 wxMBConv::~wxMBConv()
173 {
174 // nothing to do here (necessary for Darwin linking probably)
175 }
176
177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
178 {
179 if ( psz )
180 {
181 // calculate the length of the buffer needed first
182 size_t nLen = MB2WC(NULL, psz, 0);
183 if ( nLen != (size_t)-1 )
184 {
185 // now do the actual conversion
186 wxWCharBuffer buf(nLen);
187 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
188 if ( nLen != (size_t)-1 )
189 {
190 return buf;
191 }
192 }
193 }
194
195 wxWCharBuffer buf((wchar_t *)NULL);
196
197 return buf;
198 }
199
200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
201 {
202 if ( pwz )
203 {
204 size_t nLen = WC2MB(NULL, pwz, 0);
205 if ( nLen != (size_t)-1 )
206 {
207 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
208 nLen = WC2MB(buf.data(), pwz, nLen + 4);
209 if ( nLen != (size_t)-1 )
210 {
211 return buf;
212 }
213 }
214 }
215
216 wxCharBuffer buf((char *)NULL);
217
218 return buf;
219 }
220
221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
222 {
223 wxASSERT(pOutSize != NULL);
224
225 const char* szEnd = szString + nStringLen + 1;
226 const char* szPos = szString;
227 const char* szStart = szPos;
228
229 size_t nActualLength = 0;
230 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
231
232 wxWCharBuffer theBuffer(nCurrentSize);
233
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos != szEnd)
237 {
238 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
239
240 //Get the length of the current (sub)string
241 size_t nLen = MB2WC(NULL, szPos, 0);
242
243 //Invalid conversion?
244 if( nLen == (size_t)-1 )
245 {
246 *pOutSize = 0;
247 theBuffer.data()[0u] = wxT('\0');
248 return theBuffer;
249 }
250
251
252 //Increase the actual length (+1 for current null character)
253 nActualLength += nLen + 1;
254
255 //if buffer too big, realloc the buffer
256 if (nActualLength > (nCurrentSize+1))
257 {
258 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
259 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
260 theBuffer = theNewBuffer;
261 nCurrentSize <<= 1;
262 }
263
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
266 {
267 *pOutSize = 0;
268 theBuffer.data()[0u] = wxT('\0');
269 return theBuffer;
270 }
271
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //here because XX2XX gives us the size of the output buffer,
275 //not neccessarly the length of the string
276 szPos += strlen(szPos) + 1;
277 }
278
279 //success - return actual length and the buffer
280 *pOutSize = nActualLength;
281 return theBuffer;
282 }
283
284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
285 {
286 wxASSERT(pOutSize != NULL);
287
288 const wchar_t* szEnd = szString + nStringLen + 1;
289 const wchar_t* szPos = szString;
290 const wchar_t* szStart = szPos;
291
292 size_t nActualLength = 0;
293 size_t nCurrentSize = nStringLen << 2; //try * 4 first
294
295 wxCharBuffer theBuffer(nCurrentSize);
296
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos != szEnd)
300 {
301 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
302
303 //Get the length of the current (sub)string
304 size_t nLen = WC2MB(NULL, szPos, 0);
305
306 //Invalid conversion?
307 if( nLen == (size_t)-1 )
308 {
309 *pOutSize = 0;
310 theBuffer.data()[0u] = wxT('\0');
311 return theBuffer;
312 }
313
314 //Increase the actual length (+1 for current null character)
315 nActualLength += nLen + 1;
316
317 //if buffer too big, realloc the buffer
318 if (nActualLength > (nCurrentSize+1))
319 {
320 wxCharBuffer theNewBuffer(nCurrentSize << 1);
321 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
322 theBuffer = theNewBuffer;
323 nCurrentSize <<= 1;
324 }
325
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
328 {
329 *pOutSize = 0;
330 theBuffer.data()[0u] = wxT('\0');
331 return theBuffer;
332 }
333
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //here because XX2XX gives us the size of the output buffer,
337 //not neccessarly the length of the string
338 szPos += wxWcslen(szPos) + 1;
339 }
340
341 //success - return actual length and the buffer
342 *pOutSize = nActualLength;
343 return theBuffer;
344 }
345
346 // ----------------------------------------------------------------------------
347 // wxMBConvLibc
348 // ----------------------------------------------------------------------------
349
350 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
351 {
352 return wxMB2WC(buf, psz, n);
353 }
354
355 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
356 {
357 return wxWC2MB(buf, psz, n);
358 }
359
360 #ifdef __UNIX__
361
362 // ----------------------------------------------------------------------------
363 // wxConvBrokenFileNames
364 // ----------------------------------------------------------------------------
365
366 wxConvBrokenFileNames::wxConvBrokenFileNames()
367 {
368 // decide which conversion to use for the file names
369
370 // (1) this variable exists for the sole purpose of specifying the encoding
371 // of the filenames for GTK+ programs, so use it if it is set
372 wxString encName(wxGetenv(_T("G_FILENAME_ENCODING")));
373 encName.MakeUpper();
374 if ( !encName.empty() && encName != _T("UTF-8") && encName != _T("UTF8") )
375 {
376 m_conv = new wxCSConv(encName);
377 }
378 else // no G_FILENAME_ENCODING
379 {
380 if ( encName.empty() )
381 encName = wxLocale::GetSystemEncodingName().Upper();
382
383 // (2) if a non default locale is set, assume that the user wants his
384 // filenames in this locale too
385 if ( !encName.empty() && encName != _T("UTF-8") && encName != _T("UTF8") )
386 {
387 wxSetEnv(_T("G_FILENAME_ENCODING"), encName);
388 m_conv = new wxMBConvLibc;
389 }
390 else
391 {
392 // (3) finally use UTF-8 by default
393 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
394 }
395 }
396 }
397
398 size_t
399 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
400 const char *psz,
401 size_t outputSize) const
402 {
403 return m_conv->MB2WC( outputBuf, psz, outputSize );
404 }
405
406 size_t
407 wxConvBrokenFileNames::WC2MB(char *outputBuf,
408 const wchar_t *psz,
409 size_t outputSize) const
410 {
411 return m_conv->WC2MB( outputBuf, psz, outputSize );
412 }
413
414 #endif
415
416 // ----------------------------------------------------------------------------
417 // UTF-7
418 // ----------------------------------------------------------------------------
419
420 // Implementation (C) 2004 Fredrik Roubert
421
422 //
423 // BASE64 decoding table
424 //
// Indexed by byte value: the 6-bit BASE64 value of the character, or 0xff
// if the byte is not a BASE64 character. One row per 16 byte values.
static const unsigned char utf7unb64[] =
{
    /* 0x00 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x10 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x20 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    /* 0x30 */ 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x40 */ 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    /* 0x50 */ 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x60 */ 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    /* 0x70 */ 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x80 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x90 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xa0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xb0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xc0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xd0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xe0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xf0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
460
461 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
462 {
463 size_t len = 0;
464
465 while (*psz && ((!buf) || (len < n)))
466 {
467 unsigned char cc = *psz++;
468 if (cc != '+')
469 {
470 // plain ASCII char
471 if (buf)
472 *buf++ = cc;
473 len++;
474 }
475 else if (*psz == '-')
476 {
477 // encoded plus sign
478 if (buf)
479 *buf++ = cc;
480 len++;
481 psz++;
482 }
483 else
484 {
485 // BASE64 encoded string
486 bool lsb;
487 unsigned char c;
488 unsigned int d, l;
489 for (lsb = false, d = 0, l = 0;
490 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
491 {
492 d <<= 6;
493 d += cc;
494 for (l += 6; l >= 8; lsb = !lsb)
495 {
496 c = (unsigned char)((d >> (l -= 8)) % 256);
497 if (lsb)
498 {
499 if (buf)
500 *buf++ |= c;
501 len ++;
502 }
503 else
504 if (buf)
505 *buf = (wchar_t)(c << 8);
506 }
507 }
508 if (*psz == '-')
509 psz++;
510 }
511 }
512 if (buf && (len < n))
513 *buf = 0;
514 return len;
515 }
516
517 //
518 // BASE64 encoding table
519 //
// Indexed by 6-bit value: the corresponding BASE64 character.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
531
532 //
533 // UTF-7 encoding table
534 //
535 // 0 - Set D (directly encoded characters)
536 // 1 - Set O (optional direct characters)
537 // 2 - whitespace characters (optional)
538 // 3 - special characters
539 //
// Class of each 7-bit character (see the legend above): 0 for Set D, 1 for
// Set O, 2 for whitespace, 3 for special characters. 8 entries per row.
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x08 */ 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x18 */ 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0,
    /* 0x28 */ 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 */ 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 */ 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 */ 0, 0, 0, 1, 1, 1, 3, 3
};
551
552 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
553 {
554
555
556 size_t len = 0;
557
558 while (*psz && ((!buf) || (len < n)))
559 {
560 wchar_t cc = *psz++;
561 if (cc < 0x80 && utf7encode[cc] < 1)
562 {
563 // plain ASCII char
564 if (buf)
565 *buf++ = (char)cc;
566 len++;
567 }
568 #ifndef WC_UTF16
569 else if (((wxUint32)cc) > 0xffff)
570 {
571 // no surrogate pair generation (yet?)
572 return (size_t)-1;
573 }
574 #endif
575 else
576 {
577 if (buf)
578 *buf++ = '+';
579 len++;
580 if (cc != '+')
581 {
582 // BASE64 encode string
583 unsigned int lsb, d, l;
584 for (d = 0, l = 0;; psz++)
585 {
586 for (lsb = 0; lsb < 2; lsb ++)
587 {
588 d <<= 8;
589 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
590
591 for (l += 8; l >= 6; )
592 {
593 l -= 6;
594 if (buf)
595 *buf++ = utf7enb64[(d >> l) % 64];
596 len++;
597 }
598 }
599 cc = *psz;
600 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
601 break;
602 }
603 if (l != 0)
604 {
605 if (buf)
606 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
607 len++;
608 }
609 }
610 if (buf)
611 *buf++ = '-';
612 len++;
613 }
614 }
615 if (buf && (len < n))
616 *buf = 0;
617 return len;
618 }
619
620 // ----------------------------------------------------------------------------
621 // UTF-8
622 // ----------------------------------------------------------------------------
623
624 static wxUint32 utf8_max[]=
625 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
626
627 // boundaries of the private use area we use to (temporarily) remap invalid
628 // characters invalid in a UTF-8 encoded string
629 const wxUint32 wxUnicodePUA = 0x100000;
630 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
631
632 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
633 {
634 size_t len = 0;
635
636 while (*psz && ((!buf) || (len < n)))
637 {
638 const char *opsz = psz;
639 bool invalid = false;
640 unsigned char cc = *psz++, fc = cc;
641 unsigned cnt;
642 for (cnt = 0; fc & 0x80; cnt++)
643 fc <<= 1;
644 if (!cnt)
645 {
646 // plain ASCII char
647 if (buf)
648 *buf++ = cc;
649 len++;
650 }
651 else
652 {
653 cnt--;
654 if (!cnt)
655 {
656 // invalid UTF-8 sequence
657 invalid = true;
658 }
659 else
660 {
661 unsigned ocnt = cnt - 1;
662 wxUint32 res = cc & (0x3f >> cnt);
663 while (cnt--)
664 {
665 cc = *psz;
666 if ((cc & 0xC0) != 0x80)
667 {
668 // invalid UTF-8 sequence
669 invalid = true;
670 break;
671 }
672 psz++;
673 res = (res << 6) | (cc & 0x3f);
674 }
675 if (invalid || res <= utf8_max[ocnt])
676 {
677 // illegal UTF-8 encoding
678 invalid = true;
679 }
680 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
681 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
682 {
683 // if one of our PUA characters turns up externally
684 // it must also be treated as an illegal sequence
685 // (a bit like you have to escape an escape character)
686 invalid = true;
687 }
688 else
689 {
690 #ifdef WC_UTF16
691 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
692 size_t pa = encode_utf16(res, (wxUint16 *)buf);
693 if (pa == (size_t)-1)
694 {
695 invalid = true;
696 }
697 else
698 {
699 if (buf)
700 buf += pa;
701 len += pa;
702 }
703 #else // !WC_UTF16
704 if (buf)
705 *buf++ = res;
706 len++;
707 #endif // WC_UTF16/!WC_UTF16
708 }
709 }
710 if (invalid)
711 {
712 if (m_options & MAP_INVALID_UTF8_TO_PUA)
713 {
714 while (opsz < psz && (!buf || len < n))
715 {
716 #ifdef WC_UTF16
717 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
718 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
719 wxASSERT(pa != (size_t)-1);
720 if (buf)
721 buf += pa;
722 opsz++;
723 len += pa;
724 #else
725 if (buf)
726 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
727 opsz++;
728 len++;
729 #endif
730 }
731 }
732 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
733 {
734 while (opsz < psz && (!buf || len < n))
735 {
736 if ( buf && len + 3 < n )
737 {
738 unsigned char n = *opsz;
739 *buf++ = L'\\';
740 *buf++ = (wchar_t)( L'0' + n / 0100 );
741 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
742 *buf++ = (wchar_t)( L'0' + n % 010 );
743 }
744 opsz++;
745 len += 4;
746 }
747 }
748 else // MAP_INVALID_UTF8_NOT
749 {
750 return (size_t)-1;
751 }
752 }
753 }
754 }
755 if (buf && (len < n))
756 *buf = 0;
757 return len;
758 }
759
// Returns true if and only if wch is one of the octal digits '0' to '7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
764
765 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
766 {
767 size_t len = 0;
768
769 while (*psz && ((!buf) || (len < n)))
770 {
771 wxUint32 cc;
772 #ifdef WC_UTF16
773 // cast is ok for WC_UTF16
774 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
775 psz += (pa == (size_t)-1) ? 1 : pa;
776 #else
777 cc=(*psz++) & 0x7fffffff;
778 #endif
779
780 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
781 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
782 {
783 if (buf)
784 *buf++ = (char)(cc - wxUnicodePUA);
785 len++;
786 }
787 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
788 cc == L'\\' &&
789 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
790 {
791 if (buf)
792 {
793 *buf++ = (char) ((psz[0] - L'0')*0100 +
794 (psz[1] - L'0')*010 +
795 (psz[2] - L'0'));
796 }
797
798 psz += 3;
799 len++;
800 }
801 else
802 {
803 unsigned cnt;
804 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
805 if (!cnt)
806 {
807 // plain ASCII char
808 if (buf)
809 *buf++ = (char) cc;
810 len++;
811 }
812
813 else
814 {
815 len += cnt + 1;
816 if (buf)
817 {
818 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
819 while (cnt--)
820 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
821 }
822 }
823 }
824 }
825
826 if (buf && (len<n))
827 *buf = 0;
828
829 return len;
830 }
831
832 // ----------------------------------------------------------------------------
833 // UTF-16
834 // ----------------------------------------------------------------------------
835
836 #ifdef WORDS_BIGENDIAN
837 #define wxMBConvUTF16straight wxMBConvUTF16BE
838 #define wxMBConvUTF16swap wxMBConvUTF16LE
839 #else
840 #define wxMBConvUTF16swap wxMBConvUTF16BE
841 #define wxMBConvUTF16straight wxMBConvUTF16LE
842 #endif
843
844
845 #ifdef WC_UTF16
846
847 // copy 16bit MB to 16bit String
848 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
849 {
850 size_t len=0;
851
852 while (*(wxUint16*)psz && (!buf || len < n))
853 {
854 if (buf)
855 *buf++ = *(wxUint16*)psz;
856 len++;
857
858 psz += sizeof(wxUint16);
859 }
860 if (buf && len<n) *buf=0;
861
862 return len;
863 }
864
865
866 // copy 16bit String to 16bit MB
867 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
868 {
869 size_t len=0;
870
871 while (*psz && (!buf || len < n))
872 {
873 if (buf)
874 {
875 *(wxUint16*)buf = *psz;
876 buf += sizeof(wxUint16);
877 }
878 len += sizeof(wxUint16);
879 psz++;
880 }
881 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
882
883 return len;
884 }
885
886
887 // swap 16bit MB to 16bit String
888 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
889 {
890 size_t len=0;
891
892 while (*(wxUint16*)psz && (!buf || len < n))
893 {
894 if (buf)
895 {
896 ((char *)buf)[0] = psz[1];
897 ((char *)buf)[1] = psz[0];
898 buf++;
899 }
900 len++;
901 psz += sizeof(wxUint16);
902 }
903 if (buf && len<n) *buf=0;
904
905 return len;
906 }
907
908
909 // swap 16bit String to 16bit MB
910 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
911 {
912 size_t len=0;
913
914 while (*psz && (!buf || len < n))
915 {
916 if (buf)
917 {
918 *buf++ = ((char*)psz)[1];
919 *buf++ = ((char*)psz)[0];
920 }
921 len += sizeof(wxUint16);
922 psz++;
923 }
924 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
925
926 return len;
927 }
928
929
930 #else // WC_UTF16
931
932
933 // copy 16bit MB to 32bit String
934 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
935 {
936 size_t len=0;
937
938 while (*(wxUint16*)psz && (!buf || len < n))
939 {
940 wxUint32 cc;
941 size_t pa=decode_utf16((wxUint16*)psz, cc);
942 if (pa == (size_t)-1)
943 return pa;
944
945 if (buf)
946 *buf++ = cc;
947 len++;
948 psz += pa * sizeof(wxUint16);
949 }
950 if (buf && len<n) *buf=0;
951
952 return len;
953 }
954
955
956 // copy 32bit String to 16bit MB
957 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
958 {
959 size_t len=0;
960
961 while (*psz && (!buf || len < n))
962 {
963 wxUint16 cc[2];
964 size_t pa=encode_utf16(*psz, cc);
965
966 if (pa == (size_t)-1)
967 return pa;
968
969 if (buf)
970 {
971 *(wxUint16*)buf = cc[0];
972 buf += sizeof(wxUint16);
973 if (pa > 1)
974 {
975 *(wxUint16*)buf = cc[1];
976 buf += sizeof(wxUint16);
977 }
978 }
979
980 len += pa*sizeof(wxUint16);
981 psz++;
982 }
983 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
984
985 return len;
986 }
987
988
989 // swap 16bit MB to 32bit String
990 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
991 {
992 size_t len=0;
993
994 while (*(wxUint16*)psz && (!buf || len < n))
995 {
996 wxUint32 cc;
997 char tmp[4];
998 tmp[0]=psz[1]; tmp[1]=psz[0];
999 tmp[2]=psz[3]; tmp[3]=psz[2];
1000
1001 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1002 if (pa == (size_t)-1)
1003 return pa;
1004
1005 if (buf)
1006 *buf++ = cc;
1007
1008 len++;
1009 psz += pa * sizeof(wxUint16);
1010 }
1011 if (buf && len<n) *buf=0;
1012
1013 return len;
1014 }
1015
1016
1017 // swap 32bit String to 16bit MB
1018 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1019 {
1020 size_t len=0;
1021
1022 while (*psz && (!buf || len < n))
1023 {
1024 wxUint16 cc[2];
1025 size_t pa=encode_utf16(*psz, cc);
1026
1027 if (pa == (size_t)-1)
1028 return pa;
1029
1030 if (buf)
1031 {
1032 *buf++ = ((char*)cc)[1];
1033 *buf++ = ((char*)cc)[0];
1034 if (pa > 1)
1035 {
1036 *buf++ = ((char*)cc)[3];
1037 *buf++ = ((char*)cc)[2];
1038 }
1039 }
1040
1041 len += pa*sizeof(wxUint16);
1042 psz++;
1043 }
1044 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1045
1046 return len;
1047 }
1048
1049 #endif // WC_UTF16
1050
1051
1052 // ----------------------------------------------------------------------------
1053 // UTF-32
1054 // ----------------------------------------------------------------------------
1055
1056 #ifdef WORDS_BIGENDIAN
1057 #define wxMBConvUTF32straight wxMBConvUTF32BE
1058 #define wxMBConvUTF32swap wxMBConvUTF32LE
1059 #else
1060 #define wxMBConvUTF32swap wxMBConvUTF32BE
1061 #define wxMBConvUTF32straight wxMBConvUTF32LE
1062 #endif
1063
1064
1065 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1066 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1067
1068
1069 #ifdef WC_UTF16
1070
1071 // copy 32bit MB to 16bit String
1072 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1073 {
1074 size_t len=0;
1075
1076 while (*(wxUint32*)psz && (!buf || len < n))
1077 {
1078 wxUint16 cc[2];
1079
1080 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1081 if (pa == (size_t)-1)
1082 return pa;
1083
1084 if (buf)
1085 {
1086 *buf++ = cc[0];
1087 if (pa > 1)
1088 *buf++ = cc[1];
1089 }
1090 len += pa;
1091 psz += sizeof(wxUint32);
1092 }
1093 if (buf && len<n) *buf=0;
1094
1095 return len;
1096 }
1097
1098
1099 // copy 16bit String to 32bit MB
1100 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1101 {
1102 size_t len=0;
1103
1104 while (*psz && (!buf || len < n))
1105 {
1106 wxUint32 cc;
1107
1108 // cast is ok for WC_UTF16
1109 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1110 if (pa == (size_t)-1)
1111 return pa;
1112
1113 if (buf)
1114 {
1115 *(wxUint32*)buf = cc;
1116 buf += sizeof(wxUint32);
1117 }
1118 len += sizeof(wxUint32);
1119 psz += pa;
1120 }
1121
1122 if (buf && len<=n-sizeof(wxUint32))
1123 *(wxUint32*)buf=0;
1124
1125 return len;
1126 }
1127
1128
1129
1130 // swap 32bit MB to 16bit String
1131 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1132 {
1133 size_t len=0;
1134
1135 while (*(wxUint32*)psz && (!buf || len < n))
1136 {
1137 char tmp[4];
1138 tmp[0] = psz[3]; tmp[1] = psz[2];
1139 tmp[2] = psz[1]; tmp[3] = psz[0];
1140
1141
1142 wxUint16 cc[2];
1143
1144 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1145 if (pa == (size_t)-1)
1146 return pa;
1147
1148 if (buf)
1149 {
1150 *buf++ = cc[0];
1151 if (pa > 1)
1152 *buf++ = cc[1];
1153 }
1154 len += pa;
1155 psz += sizeof(wxUint32);
1156 }
1157
1158 if (buf && len<n)
1159 *buf=0;
1160
1161 return len;
1162 }
1163
1164
1165 // swap 16bit String to 32bit MB
1166 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1167 {
1168 size_t len=0;
1169
1170 while (*psz && (!buf || len < n))
1171 {
1172 char cc[4];
1173
1174 // cast is ok for WC_UTF16
1175 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1176 if (pa == (size_t)-1)
1177 return pa;
1178
1179 if (buf)
1180 {
1181 *buf++ = cc[3];
1182 *buf++ = cc[2];
1183 *buf++ = cc[1];
1184 *buf++ = cc[0];
1185 }
1186 len += sizeof(wxUint32);
1187 psz += pa;
1188 }
1189
1190 if (buf && len<=n-sizeof(wxUint32))
1191 *(wxUint32*)buf=0;
1192
1193 return len;
1194 }
1195
1196 #else // WC_UTF16
1197
1198
1199 // copy 32bit MB to 32bit String
1200 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1201 {
1202 size_t len=0;
1203
1204 while (*(wxUint32*)psz && (!buf || len < n))
1205 {
1206 if (buf)
1207 *buf++ = *(wxUint32*)psz;
1208 len++;
1209 psz += sizeof(wxUint32);
1210 }
1211
1212 if (buf && len<n)
1213 *buf=0;
1214
1215 return len;
1216 }
1217
1218
1219 // copy 32bit String to 32bit MB
1220 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1221 {
1222 size_t len=0;
1223
1224 while (*psz && (!buf || len < n))
1225 {
1226 if (buf)
1227 {
1228 *(wxUint32*)buf = *psz;
1229 buf += sizeof(wxUint32);
1230 }
1231
1232 len += sizeof(wxUint32);
1233 psz++;
1234 }
1235
1236 if (buf && len<=n-sizeof(wxUint32))
1237 *(wxUint32*)buf=0;
1238
1239 return len;
1240 }
1241
1242
1243 // swap 32bit MB to 32bit String
1244 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1245 {
1246 size_t len=0;
1247
1248 while (*(wxUint32*)psz && (!buf || len < n))
1249 {
1250 if (buf)
1251 {
1252 ((char *)buf)[0] = psz[3];
1253 ((char *)buf)[1] = psz[2];
1254 ((char *)buf)[2] = psz[1];
1255 ((char *)buf)[3] = psz[0];
1256 buf++;
1257 }
1258 len++;
1259 psz += sizeof(wxUint32);
1260 }
1261
1262 if (buf && len<n)
1263 *buf=0;
1264
1265 return len;
1266 }
1267
1268
1269 // swap 32bit String to 32bit MB
1270 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1271 {
1272 size_t len=0;
1273
1274 while (*psz && (!buf || len < n))
1275 {
1276 if (buf)
1277 {
1278 *buf++ = ((char *)psz)[3];
1279 *buf++ = ((char *)psz)[2];
1280 *buf++ = ((char *)psz)[1];
1281 *buf++ = ((char *)psz)[0];
1282 }
1283 len += sizeof(wxUint32);
1284 psz++;
1285 }
1286
1287 if (buf && len<=n-sizeof(wxUint32))
1288 *(wxUint32*)buf=0;
1289
1290 return len;
1291 }
1292
1293
1294 #endif // WC_UTF16
1295
1296
1297 // ============================================================================
1298 // The classes doing conversion using the iconv_xxx() functions
1299 // ============================================================================
1300
1301 #ifdef HAVE_ICONV
1302
1303 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1304 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1305 // (unless there's yet another bug in glibc) the only case when iconv()
1306 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1307 // left in the input buffer -- when _real_ error occurs,
1308 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1309 // iconv() failure.
1310 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft bytes in its input buffer really failed.
//
// The macro arguments are parenthesized so that arbitrary expressions can be
// passed safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// casts the address of an input pointer to the type expected by iconv()
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1319
1320 // ----------------------------------------------------------------------------
1321 // wxMBConv_iconv: encapsulates an iconv character set
1322 // ----------------------------------------------------------------------------
1323
1324 class wxMBConv_iconv : public wxMBConv
1325 {
1326 public:
1327 wxMBConv_iconv(const wxChar *name);
1328 virtual ~wxMBConv_iconv();
1329
1330 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1331 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1332
1333 bool IsOk() const
1334 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1335
1336 protected:
1337 // the iconv handlers used to translate from multibyte to wide char and in
1338 // the other direction
1339 iconv_t m2w,
1340 w2m;
1341 #if wxUSE_THREADS
1342 // guards access to m2w and w2m objects
1343 wxMutex m_iconvMutex;
1344 #endif
1345
1346 private:
1347 // the name (for iconv_open()) of a wide char charset -- if none is
1348 // available on this machine, it will remain NULL
1349 static const char *ms_wcCharsetName;
1350
1351 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1352 // different endian-ness than the native one
1353 static bool ms_wcNeedsSwap;
1354 };
1355
1356 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1357 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1358
1359 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1360 {
1361 // Do it the hard way
1362 char cname[100];
1363 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1364 cname[i] = (char) name[i];
1365
1366 // check for charset that represents wchar_t:
1367 if (ms_wcCharsetName == NULL)
1368 {
1369 ms_wcNeedsSwap = false;
1370
1371 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1372 ms_wcCharsetName = WC_NAME_BEST;
1373 m2w = iconv_open(ms_wcCharsetName, cname);
1374
1375 if (m2w == (iconv_t)-1)
1376 {
1377 // try charset w/o bytesex info (e.g. "UCS4")
1378 // and check for bytesex ourselves:
1379 ms_wcCharsetName = WC_NAME;
1380 m2w = iconv_open(ms_wcCharsetName, cname);
1381
1382 // last bet, try if it knows WCHAR_T pseudo-charset
1383 if (m2w == (iconv_t)-1)
1384 {
1385 ms_wcCharsetName = "WCHAR_T";
1386 m2w = iconv_open(ms_wcCharsetName, cname);
1387 }
1388
1389 if (m2w != (iconv_t)-1)
1390 {
1391 char buf[2], *bufPtr;
1392 wchar_t wbuf[2], *wbufPtr;
1393 size_t insz, outsz;
1394 size_t res;
1395
1396 buf[0] = 'A';
1397 buf[1] = 0;
1398 wbuf[0] = 0;
1399 insz = 2;
1400 outsz = SIZEOF_WCHAR_T * 2;
1401 wbufPtr = wbuf;
1402 bufPtr = buf;
1403
1404 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1405 (char**)&wbufPtr, &outsz);
1406
1407 if (ICONV_FAILED(res, insz))
1408 {
1409 ms_wcCharsetName = NULL;
1410 wxLogLastError(wxT("iconv"));
1411 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1412 }
1413 else
1414 {
1415 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1416 }
1417 }
1418 else
1419 {
1420 ms_wcCharsetName = NULL;
1421
1422 // VS: we must not output an error here, since wxWidgets will safely
1423 // fall back to using wxEncodingConverter.
1424 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1425 //wxLogError(
1426 }
1427 }
1428 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1429 }
1430 else // we already have ms_wcCharsetName
1431 {
1432 m2w = iconv_open(ms_wcCharsetName, cname);
1433 }
1434
1435 // NB: don't ever pass NULL to iconv_open(), it may crash!
1436 if ( ms_wcCharsetName )
1437 {
1438 w2m = iconv_open( cname, ms_wcCharsetName);
1439 }
1440 else
1441 {
1442 w2m = (iconv_t)-1;
1443 }
1444 }
1445
1446 wxMBConv_iconv::~wxMBConv_iconv()
1447 {
1448 if ( m2w != (iconv_t)-1 )
1449 iconv_close(m2w);
1450 if ( w2m != (iconv_t)-1 )
1451 iconv_close(w2m);
1452 }
1453
1454 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1455 {
1456 #if wxUSE_THREADS
1457 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1458 // Unfortunately there is a couple of global wxCSConv objects such as
1459 // wxConvLocal that are used all over wx code, so we have to make sure
1460 // the handle is used by at most one thread at the time. Otherwise
1461 // only a few wx classes would be safe to use from non-main threads
1462 // as MB<->WC conversion would fail "randomly".
1463 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1464 #endif
1465
1466 size_t inbuf = strlen(psz);
1467 size_t outbuf = n * SIZEOF_WCHAR_T;
1468 size_t res, cres;
1469 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1470 wchar_t *bufPtr = buf;
1471 const char *pszPtr = psz;
1472
1473 if (buf)
1474 {
1475 // have destination buffer, convert there
1476 cres = iconv(m2w,
1477 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1478 (char**)&bufPtr, &outbuf);
1479 res = n - (outbuf / SIZEOF_WCHAR_T);
1480
1481 if (ms_wcNeedsSwap)
1482 {
1483 // convert to native endianness
1484 WC_BSWAP(buf /* _not_ bufPtr */, res)
1485 }
1486
1487 // NB: iconv was given only strlen(psz) characters on input, and so
1488 // it couldn't convert the trailing zero. Let's do it ourselves
1489 // if there's some room left for it in the output buffer.
1490 if (res < n)
1491 buf[res] = 0;
1492 }
1493 else
1494 {
1495 // no destination buffer... convert using temp buffer
1496 // to calculate destination buffer requirement
1497 wchar_t tbuf[8];
1498 res = 0;
1499 do {
1500 bufPtr = tbuf;
1501 outbuf = 8*SIZEOF_WCHAR_T;
1502
1503 cres = iconv(m2w,
1504 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1505 (char**)&bufPtr, &outbuf );
1506
1507 res += 8-(outbuf/SIZEOF_WCHAR_T);
1508 } while ((cres==(size_t)-1) && (errno==E2BIG));
1509 }
1510
1511 if (ICONV_FAILED(cres, inbuf))
1512 {
1513 //VS: it is ok if iconv fails, hence trace only
1514 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1515 return (size_t)-1;
1516 }
1517
1518 return res;
1519 }
1520
1521 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1522 {
1523 #if wxUSE_THREADS
1524 // NB: explained in MB2WC
1525 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1526 #endif
1527
1528 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1529 size_t outbuf = n;
1530 size_t res, cres;
1531
1532 wchar_t *tmpbuf = 0;
1533
1534 if (ms_wcNeedsSwap)
1535 {
1536 // need to copy to temp buffer to switch endianness
1537 // this absolutely doesn't rock!
1538 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1539 // could be in read-only memory, or be accessed in some other thread)
1540 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1541 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1542 WC_BSWAP(tmpbuf, inbuf)
1543 psz=tmpbuf;
1544 }
1545
1546 if (buf)
1547 {
1548 // have destination buffer, convert there
1549 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1550
1551 res = n-outbuf;
1552
1553 // NB: iconv was given only wcslen(psz) characters on input, and so
1554 // it couldn't convert the trailing zero. Let's do it ourselves
1555 // if there's some room left for it in the output buffer.
1556 if (res < n)
1557 buf[0] = 0;
1558 }
1559 else
1560 {
1561 // no destination buffer... convert using temp buffer
1562 // to calculate destination buffer requirement
1563 char tbuf[16];
1564 res = 0;
1565 do {
1566 buf = tbuf; outbuf = 16;
1567
1568 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1569
1570 res += 16 - outbuf;
1571 } while ((cres==(size_t)-1) && (errno==E2BIG));
1572 }
1573
1574 if (ms_wcNeedsSwap)
1575 {
1576 free(tmpbuf);
1577 }
1578
1579 if (ICONV_FAILED(cres, inbuf))
1580 {
1581 //VS: it is ok if iconv fails, hence trace only
1582 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1583 return (size_t)-1;
1584 }
1585
1586 return res;
1587 }
1588
1589 #endif // HAVE_ICONV
1590
1591
1592 // ============================================================================
1593 // Win32 conversion classes
1594 // ============================================================================
1595
1596 #ifdef wxHAVE_WIN32_MB2WC
1597
// helpers mapping charsets and encodings to code pages, from utils.cpp
#if wxUSE_FONTMAP
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
#endif
1603
1604 class wxMBConv_win32 : public wxMBConv
1605 {
1606 public:
1607 wxMBConv_win32()
1608 {
1609 m_CodePage = CP_ACP;
1610 }
1611
1612 #if wxUSE_FONTMAP
1613 wxMBConv_win32(const wxChar* name)
1614 {
1615 m_CodePage = wxCharsetToCodepage(name);
1616 }
1617
1618 wxMBConv_win32(wxFontEncoding encoding)
1619 {
1620 m_CodePage = wxEncodingToCodepage(encoding);
1621 }
1622 #endif
1623
1624 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1625 {
1626 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1627 // the behaviour is not compatible with the Unix version (using iconv)
1628 // and break the library itself, e.g. wxTextInputStream::NextChar()
1629 // wouldn't work if reading an incomplete MB char didn't result in an
1630 // error
1631 //
1632 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1633 // an error (tested under Windows Server 2003) and apparently it is
1634 // done on purpose, i.e. the function accepts any input in this case
1635 // and although I'd prefer to return error on ill-formed output, our
1636 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1637 // explicitly ill-formed according to RFC 2152) neither so we don't
1638 // even have any fallback here...
1639 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1640
1641 const size_t len = ::MultiByteToWideChar
1642 (
1643 m_CodePage, // code page
1644 flags, // flags: fall on error
1645 psz, // input string
1646 -1, // its length (NUL-terminated)
1647 buf, // output string
1648 buf ? n : 0 // size of output buffer
1649 );
1650
1651 // note that it returns count of written chars for buf != NULL and size
1652 // of the needed buffer for buf == NULL so in either case the length of
1653 // the string (which never includes the terminating NUL) is one less
1654 return len ? len - 1 : (size_t)-1;
1655 }
1656
1657 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1658 {
1659 /*
1660 we have a problem here: by default, WideCharToMultiByte() may
1661 replace characters unrepresentable in the target code page with bad
1662 quality approximations such as turning "1/2" symbol (U+00BD) into
1663 "1" for the code pages which don't have it and we, obviously, want
1664 to avoid this at any price
1665
1666 the trouble is that this function does it _silently_, i.e. it won't
1667 even tell us whether it did or not... Win98/2000 and higher provide
1668 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1669 we have to resort to a round trip, i.e. check that converting back
1670 results in the same string -- this is, of course, expensive but
1671 otherwise we simply can't be sure to not garble the data.
1672 */
1673
1674 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1675 // it doesn't work with CJK encodings (which we test for rather roughly
1676 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1677 // supporting it
1678 BOOL usedDef wxDUMMY_INITIALIZE(false);
1679 BOOL *pUsedDef;
1680 int flags;
1681 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1682 {
1683 // it's our lucky day
1684 flags = WC_NO_BEST_FIT_CHARS;
1685 pUsedDef = &usedDef;
1686 }
1687 else // old system or unsupported encoding
1688 {
1689 flags = 0;
1690 pUsedDef = NULL;
1691 }
1692
1693 const size_t len = ::WideCharToMultiByte
1694 (
1695 m_CodePage, // code page
1696 flags, // either none or no best fit
1697 pwz, // input string
1698 -1, // it is (wide) NUL-terminated
1699 buf, // output buffer
1700 buf ? n : 0, // and its size
1701 NULL, // default "replacement" char
1702 pUsedDef // [out] was it used?
1703 );
1704
1705 if ( !len )
1706 {
1707 // function totally failed
1708 return (size_t)-1;
1709 }
1710
1711 // if we were really converting, check if we succeeded
1712 if ( buf )
1713 {
1714 if ( flags )
1715 {
1716 // check if the conversion failed, i.e. if any replacements
1717 // were done
1718 if ( usedDef )
1719 return (size_t)-1;
1720 }
1721 else // we must resort to double tripping...
1722 {
1723 wxWCharBuffer wcBuf(n);
1724 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1725 wcscmp(wcBuf, pwz) != 0 )
1726 {
1727 // we didn't obtain the same thing we started from, hence
1728 // the conversion was lossy and we consider that it failed
1729 return (size_t)-1;
1730 }
1731 }
1732 }
1733
1734 // see the comment above for the reason of "len - 1"
1735 return len - 1;
1736 }
1737
1738 bool IsOk() const { return m_CodePage != -1; }
1739
1740 private:
1741 static bool CanUseNoBestFit()
1742 {
1743 static int s_isWin98Or2k = -1;
1744
1745 if ( s_isWin98Or2k == -1 )
1746 {
1747 int verMaj, verMin;
1748 switch ( wxGetOsVersion(&verMaj, &verMin) )
1749 {
1750 case wxWIN95:
1751 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1752 break;
1753
1754 case wxWINDOWS_NT:
1755 s_isWin98Or2k = verMaj >= 5;
1756 break;
1757
1758 default:
1759 // unknown, be conseravtive by default
1760 s_isWin98Or2k = 0;
1761 }
1762
1763 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1764 }
1765
1766 return s_isWin98Or2k == 1;
1767 }
1768
1769 long m_CodePage;
1770 };
1771
1772 #endif // wxHAVE_WIN32_MB2WC
1773
1774 // ============================================================================
1775 // Cocoa conversion classes
1776 // ============================================================================
1777
1778 #if defined(__WXCOCOA__)
1779
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1783
1784 #include <CoreFoundation/CFString.h>
1785 #include <CoreFoundation/CFStringEncodingExt.h>
1786
1787 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1788 {
1789 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1790 if ( encoding == wxFONTENCODING_DEFAULT )
1791 {
1792 enc = CFStringGetSystemEncoding();
1793 }
1794 else switch( encoding)
1795 {
1796 case wxFONTENCODING_ISO8859_1 :
1797 enc = kCFStringEncodingISOLatin1 ;
1798 break ;
1799 case wxFONTENCODING_ISO8859_2 :
1800 enc = kCFStringEncodingISOLatin2;
1801 break ;
1802 case wxFONTENCODING_ISO8859_3 :
1803 enc = kCFStringEncodingISOLatin3 ;
1804 break ;
1805 case wxFONTENCODING_ISO8859_4 :
1806 enc = kCFStringEncodingISOLatin4;
1807 break ;
1808 case wxFONTENCODING_ISO8859_5 :
1809 enc = kCFStringEncodingISOLatinCyrillic;
1810 break ;
1811 case wxFONTENCODING_ISO8859_6 :
1812 enc = kCFStringEncodingISOLatinArabic;
1813 break ;
1814 case wxFONTENCODING_ISO8859_7 :
1815 enc = kCFStringEncodingISOLatinGreek;
1816 break ;
1817 case wxFONTENCODING_ISO8859_8 :
1818 enc = kCFStringEncodingISOLatinHebrew;
1819 break ;
1820 case wxFONTENCODING_ISO8859_9 :
1821 enc = kCFStringEncodingISOLatin5;
1822 break ;
1823 case wxFONTENCODING_ISO8859_10 :
1824 enc = kCFStringEncodingISOLatin6;
1825 break ;
1826 case wxFONTENCODING_ISO8859_11 :
1827 enc = kCFStringEncodingISOLatinThai;
1828 break ;
1829 case wxFONTENCODING_ISO8859_13 :
1830 enc = kCFStringEncodingISOLatin7;
1831 break ;
1832 case wxFONTENCODING_ISO8859_14 :
1833 enc = kCFStringEncodingISOLatin8;
1834 break ;
1835 case wxFONTENCODING_ISO8859_15 :
1836 enc = kCFStringEncodingISOLatin9;
1837 break ;
1838
1839 case wxFONTENCODING_KOI8 :
1840 enc = kCFStringEncodingKOI8_R;
1841 break ;
1842 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1843 enc = kCFStringEncodingDOSRussian;
1844 break ;
1845
1846 // case wxFONTENCODING_BULGARIAN :
1847 // enc = ;
1848 // break ;
1849
1850 case wxFONTENCODING_CP437 :
1851 enc =kCFStringEncodingDOSLatinUS ;
1852 break ;
1853 case wxFONTENCODING_CP850 :
1854 enc = kCFStringEncodingDOSLatin1;
1855 break ;
1856 case wxFONTENCODING_CP852 :
1857 enc = kCFStringEncodingDOSLatin2;
1858 break ;
1859 case wxFONTENCODING_CP855 :
1860 enc = kCFStringEncodingDOSCyrillic;
1861 break ;
1862 case wxFONTENCODING_CP866 :
1863 enc =kCFStringEncodingDOSRussian ;
1864 break ;
1865 case wxFONTENCODING_CP874 :
1866 enc = kCFStringEncodingDOSThai;
1867 break ;
1868 case wxFONTENCODING_CP932 :
1869 enc = kCFStringEncodingDOSJapanese;
1870 break ;
1871 case wxFONTENCODING_CP936 :
1872 enc =kCFStringEncodingDOSChineseSimplif ;
1873 break ;
1874 case wxFONTENCODING_CP949 :
1875 enc = kCFStringEncodingDOSKorean;
1876 break ;
1877 case wxFONTENCODING_CP950 :
1878 enc = kCFStringEncodingDOSChineseTrad;
1879 break ;
1880 case wxFONTENCODING_CP1250 :
1881 enc = kCFStringEncodingWindowsLatin2;
1882 break ;
1883 case wxFONTENCODING_CP1251 :
1884 enc =kCFStringEncodingWindowsCyrillic ;
1885 break ;
1886 case wxFONTENCODING_CP1252 :
1887 enc =kCFStringEncodingWindowsLatin1 ;
1888 break ;
1889 case wxFONTENCODING_CP1253 :
1890 enc = kCFStringEncodingWindowsGreek;
1891 break ;
1892 case wxFONTENCODING_CP1254 :
1893 enc = kCFStringEncodingWindowsLatin5;
1894 break ;
1895 case wxFONTENCODING_CP1255 :
1896 enc =kCFStringEncodingWindowsHebrew ;
1897 break ;
1898 case wxFONTENCODING_CP1256 :
1899 enc =kCFStringEncodingWindowsArabic ;
1900 break ;
1901 case wxFONTENCODING_CP1257 :
1902 enc = kCFStringEncodingWindowsBalticRim;
1903 break ;
1904 // This only really encodes to UTF7 (if that) evidently
1905 // case wxFONTENCODING_UTF7 :
1906 // enc = kCFStringEncodingNonLossyASCII ;
1907 // break ;
1908 case wxFONTENCODING_UTF8 :
1909 enc = kCFStringEncodingUTF8 ;
1910 break ;
1911 case wxFONTENCODING_EUC_JP :
1912 enc = kCFStringEncodingEUC_JP;
1913 break ;
1914 case wxFONTENCODING_UTF16 :
1915 enc = kCFStringEncodingUnicode ;
1916 break ;
1917 case wxFONTENCODING_MACROMAN :
1918 enc = kCFStringEncodingMacRoman ;
1919 break ;
1920 case wxFONTENCODING_MACJAPANESE :
1921 enc = kCFStringEncodingMacJapanese ;
1922 break ;
1923 case wxFONTENCODING_MACCHINESETRAD :
1924 enc = kCFStringEncodingMacChineseTrad ;
1925 break ;
1926 case wxFONTENCODING_MACKOREAN :
1927 enc = kCFStringEncodingMacKorean ;
1928 break ;
1929 case wxFONTENCODING_MACARABIC :
1930 enc = kCFStringEncodingMacArabic ;
1931 break ;
1932 case wxFONTENCODING_MACHEBREW :
1933 enc = kCFStringEncodingMacHebrew ;
1934 break ;
1935 case wxFONTENCODING_MACGREEK :
1936 enc = kCFStringEncodingMacGreek ;
1937 break ;
1938 case wxFONTENCODING_MACCYRILLIC :
1939 enc = kCFStringEncodingMacCyrillic ;
1940 break ;
1941 case wxFONTENCODING_MACDEVANAGARI :
1942 enc = kCFStringEncodingMacDevanagari ;
1943 break ;
1944 case wxFONTENCODING_MACGURMUKHI :
1945 enc = kCFStringEncodingMacGurmukhi ;
1946 break ;
1947 case wxFONTENCODING_MACGUJARATI :
1948 enc = kCFStringEncodingMacGujarati ;
1949 break ;
1950 case wxFONTENCODING_MACORIYA :
1951 enc = kCFStringEncodingMacOriya ;
1952 break ;
1953 case wxFONTENCODING_MACBENGALI :
1954 enc = kCFStringEncodingMacBengali ;
1955 break ;
1956 case wxFONTENCODING_MACTAMIL :
1957 enc = kCFStringEncodingMacTamil ;
1958 break ;
1959 case wxFONTENCODING_MACTELUGU :
1960 enc = kCFStringEncodingMacTelugu ;
1961 break ;
1962 case wxFONTENCODING_MACKANNADA :
1963 enc = kCFStringEncodingMacKannada ;
1964 break ;
1965 case wxFONTENCODING_MACMALAJALAM :
1966 enc = kCFStringEncodingMacMalayalam ;
1967 break ;
1968 case wxFONTENCODING_MACSINHALESE :
1969 enc = kCFStringEncodingMacSinhalese ;
1970 break ;
1971 case wxFONTENCODING_MACBURMESE :
1972 enc = kCFStringEncodingMacBurmese ;
1973 break ;
1974 case wxFONTENCODING_MACKHMER :
1975 enc = kCFStringEncodingMacKhmer ;
1976 break ;
1977 case wxFONTENCODING_MACTHAI :
1978 enc = kCFStringEncodingMacThai ;
1979 break ;
1980 case wxFONTENCODING_MACLAOTIAN :
1981 enc = kCFStringEncodingMacLaotian ;
1982 break ;
1983 case wxFONTENCODING_MACGEORGIAN :
1984 enc = kCFStringEncodingMacGeorgian ;
1985 break ;
1986 case wxFONTENCODING_MACARMENIAN :
1987 enc = kCFStringEncodingMacArmenian ;
1988 break ;
1989 case wxFONTENCODING_MACCHINESESIMP :
1990 enc = kCFStringEncodingMacChineseSimp ;
1991 break ;
1992 case wxFONTENCODING_MACTIBETAN :
1993 enc = kCFStringEncodingMacTibetan ;
1994 break ;
1995 case wxFONTENCODING_MACMONGOLIAN :
1996 enc = kCFStringEncodingMacMongolian ;
1997 break ;
1998 case wxFONTENCODING_MACETHIOPIC :
1999 enc = kCFStringEncodingMacEthiopic ;
2000 break ;
2001 case wxFONTENCODING_MACCENTRALEUR :
2002 enc = kCFStringEncodingMacCentralEurRoman ;
2003 break ;
2004 case wxFONTENCODING_MACVIATNAMESE :
2005 enc = kCFStringEncodingMacVietnamese ;
2006 break ;
2007 case wxFONTENCODING_MACARABICEXT :
2008 enc = kCFStringEncodingMacExtArabic ;
2009 break ;
2010 case wxFONTENCODING_MACSYMBOL :
2011 enc = kCFStringEncodingMacSymbol ;
2012 break ;
2013 case wxFONTENCODING_MACDINGBATS :
2014 enc = kCFStringEncodingMacDingbats ;
2015 break ;
2016 case wxFONTENCODING_MACTURKISH :
2017 enc = kCFStringEncodingMacTurkish ;
2018 break ;
2019 case wxFONTENCODING_MACCROATIAN :
2020 enc = kCFStringEncodingMacCroatian ;
2021 break ;
2022 case wxFONTENCODING_MACICELANDIC :
2023 enc = kCFStringEncodingMacIcelandic ;
2024 break ;
2025 case wxFONTENCODING_MACROMANIAN :
2026 enc = kCFStringEncodingMacRomanian ;
2027 break ;
2028 case wxFONTENCODING_MACCELTIC :
2029 enc = kCFStringEncodingMacCeltic ;
2030 break ;
2031 case wxFONTENCODING_MACGAELIC :
2032 enc = kCFStringEncodingMacGaelic ;
2033 break ;
2034 // case wxFONTENCODING_MACKEYBOARD :
2035 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2036 // break ;
2037 default :
2038 // because gcc is picky
2039 break ;
2040 } ;
2041 return enc ;
2042 }
2043
2044 class wxMBConv_cocoa : public wxMBConv
2045 {
2046 public:
2047 wxMBConv_cocoa()
2048 {
2049 Init(CFStringGetSystemEncoding()) ;
2050 }
2051
2052 #if wxUSE_FONTMAP
2053 wxMBConv_cocoa(const wxChar* name)
2054 {
2055 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2056 }
2057 #endif
2058
2059 wxMBConv_cocoa(wxFontEncoding encoding)
2060 {
2061 Init( wxCFStringEncFromFontEnc(encoding) );
2062 }
2063
2064 ~wxMBConv_cocoa()
2065 {
2066 }
2067
2068 void Init( CFStringEncoding encoding)
2069 {
2070 m_encoding = encoding ;
2071 }
2072
2073 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2074 {
2075 wxASSERT(szUnConv);
2076
2077 CFStringRef theString = CFStringCreateWithBytes (
2078 NULL, //the allocator
2079 (const UInt8*)szUnConv,
2080 strlen(szUnConv),
2081 m_encoding,
2082 false //no BOM/external representation
2083 );
2084
2085 wxASSERT(theString);
2086
2087 size_t nOutLength = CFStringGetLength(theString);
2088
2089 if (szOut == NULL)
2090 {
2091 CFRelease(theString);
2092 return nOutLength;
2093 }
2094
2095 CFRange theRange = { 0, nOutSize };
2096
2097 #if SIZEOF_WCHAR_T == 4
2098 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2099 #endif
2100
2101 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2102
2103 CFRelease(theString);
2104
2105 szUniCharBuffer[nOutLength] = '\0' ;
2106
2107 #if SIZEOF_WCHAR_T == 4
2108 wxMBConvUTF16 converter ;
2109 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2110 delete[] szUniCharBuffer;
2111 #endif
2112
2113 return nOutLength;
2114 }
2115
2116 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2117 {
2118 wxASSERT(szUnConv);
2119
2120 size_t nRealOutSize;
2121 size_t nBufSize = wxWcslen(szUnConv);
2122 UniChar* szUniBuffer = (UniChar*) szUnConv;
2123
2124 #if SIZEOF_WCHAR_T == 4
2125 wxMBConvUTF16BE converter ;
2126 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2127 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2128 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2129 nBufSize /= sizeof(UniChar);
2130 #endif
2131
2132 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2133 NULL, //allocator
2134 szUniBuffer,
2135 nBufSize,
2136 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2137 );
2138
2139 wxASSERT(theString);
2140
2141 //Note that CER puts a BOM when converting to unicode
2142 //so we check and use getchars instead in that case
2143 if (m_encoding == kCFStringEncodingUnicode)
2144 {
2145 if (szOut != NULL)
2146 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2147
2148 nRealOutSize = CFStringGetLength(theString) + 1;
2149 }
2150 else
2151 {
2152 CFStringGetBytes(
2153 theString,
2154 CFRangeMake(0, CFStringGetLength(theString)),
2155 m_encoding,
2156 0, //what to put in characters that can't be converted -
2157 //0 tells CFString to return NULL if it meets such a character
2158 false, //not an external representation
2159 (UInt8*) szOut,
2160 nOutSize,
2161 (CFIndex*) &nRealOutSize
2162 );
2163 }
2164
2165 CFRelease(theString);
2166
2167 #if SIZEOF_WCHAR_T == 4
2168 delete[] szUniBuffer;
2169 #endif
2170
2171 return nRealOutSize - 1;
2172 }
2173
2174 bool IsOk() const
2175 {
2176 return m_encoding != kCFStringEncodingInvalidId &&
2177 CFStringIsEncodingAvailable(m_encoding);
2178 }
2179
2180 private:
2181 CFStringEncoding m_encoding ;
2182 };
2183
2184 #endif // defined(__WXCOCOA__)
2185
2186 // ============================================================================
2187 // Mac conversion classes
2188 // ============================================================================
2189
2190 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2191
2192 class wxMBConv_mac : public wxMBConv
2193 {
2194 public:
2195 wxMBConv_mac()
2196 {
2197 Init(CFStringGetSystemEncoding()) ;
2198 }
2199
2200 #if wxUSE_FONTMAP
2201 wxMBConv_mac(const wxChar* name)
2202 {
2203 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2204 }
2205 #endif
2206
2207 wxMBConv_mac(wxFontEncoding encoding)
2208 {
2209 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2210 }
2211
2212 ~wxMBConv_mac()
2213 {
2214 OSStatus status = noErr ;
2215 status = TECDisposeConverter(m_MB2WC_converter);
2216 status = TECDisposeConverter(m_WC2MB_converter);
2217 }
2218
2219
2220 void Init( TextEncodingBase encoding)
2221 {
2222 OSStatus status = noErr ;
2223 m_char_encoding = encoding ;
2224 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2225
2226 status = TECCreateConverter(&m_MB2WC_converter,
2227 m_char_encoding,
2228 m_unicode_encoding);
2229 status = TECCreateConverter(&m_WC2MB_converter,
2230 m_unicode_encoding,
2231 m_char_encoding);
2232 }
2233
2234 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2235 {
2236 OSStatus status = noErr ;
2237 ByteCount byteOutLen ;
2238 ByteCount byteInLen = strlen(psz) ;
2239 wchar_t *tbuf = NULL ;
2240 UniChar* ubuf = NULL ;
2241 size_t res = 0 ;
2242
2243 if (buf == NULL)
2244 {
2245 //apple specs say at least 32
2246 n = wxMax( 32 , byteInLen ) ;
2247 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2248 }
2249 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2250 #if SIZEOF_WCHAR_T == 4
2251 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2252 #else
2253 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2254 #endif
2255 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2256 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2257 #if SIZEOF_WCHAR_T == 4
2258 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2259 // is not properly terminated we get random characters at the end
2260 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2261 wxMBConvUTF16BE converter ;
2262 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2263 free( ubuf ) ;
2264 #else
2265 res = byteOutLen / sizeof( UniChar ) ;
2266 #endif
2267 if ( buf == NULL )
2268 free(tbuf) ;
2269
2270 if ( buf && res < n)
2271 buf[res] = 0;
2272
2273 return res ;
2274 }
2275
2276 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2277 {
2278 OSStatus status = noErr ;
2279 ByteCount byteOutLen ;
2280 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2281
2282 char *tbuf = NULL ;
2283
2284 if (buf == NULL)
2285 {
2286 //apple specs say at least 32
2287 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2288 tbuf = (char*) malloc( n ) ;
2289 }
2290
2291 ByteCount byteBufferLen = n ;
2292 UniChar* ubuf = NULL ;
2293 #if SIZEOF_WCHAR_T == 4
2294 wxMBConvUTF16BE converter ;
2295 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2296 byteInLen = unicharlen ;
2297 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2298 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2299 #else
2300 ubuf = (UniChar*) psz ;
2301 #endif
2302 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2303 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2304 #if SIZEOF_WCHAR_T == 4
2305 free( ubuf ) ;
2306 #endif
2307 if ( buf == NULL )
2308 free(tbuf) ;
2309
2310 size_t res = byteOutLen ;
2311 if ( buf && res < n)
2312 {
2313 buf[res] = 0;
2314
2315 //we need to double-trip to verify it didn't insert any ? in place
2316 //of bogus characters
2317 wxWCharBuffer wcBuf(n);
2318 size_t pszlen = wxWcslen(psz);
2319 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2320 wxWcslen(wcBuf) != pszlen ||
2321 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2322 {
2323 // we didn't obtain the same thing we started from, hence
2324 // the conversion was lossy and we consider that it failed
2325 return (size_t)-1;
2326 }
2327 }
2328
2329 return res ;
2330 }
2331
2332 bool IsOk() const
2333 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2334
2335 private:
2336 TECObjectRef m_MB2WC_converter ;
2337 TECObjectRef m_WC2MB_converter ;
2338
2339 TextEncodingBase m_char_encoding ;
2340 TextEncodingBase m_unicode_encoding ;
2341 };
2342
2343 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2344
2345 // ============================================================================
2346 // wxEncodingConverter based conversion classes
2347 // ============================================================================
2348
2349 #if wxUSE_FONTMAP
2350
2351 class wxMBConv_wxwin : public wxMBConv
2352 {
2353 private:
2354 void Init()
2355 {
2356 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2357 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2358 }
2359
2360 public:
2361 // temporarily just use wxEncodingConverter stuff,
2362 // so that it works while a better implementation is built
2363 wxMBConv_wxwin(const wxChar* name)
2364 {
2365 if (name)
2366 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2367 else
2368 m_enc = wxFONTENCODING_SYSTEM;
2369
2370 Init();
2371 }
2372
2373 wxMBConv_wxwin(wxFontEncoding enc)
2374 {
2375 m_enc = enc;
2376
2377 Init();
2378 }
2379
2380 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2381 {
2382 size_t inbuf = strlen(psz);
2383 if (buf)
2384 {
2385 if (!m2w.Convert(psz,buf))
2386 return (size_t)-1;
2387 }
2388 return inbuf;
2389 }
2390
2391 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2392 {
2393 const size_t inbuf = wxWcslen(psz);
2394 if (buf)
2395 {
2396 if (!w2m.Convert(psz,buf))
2397 return (size_t)-1;
2398 }
2399
2400 return inbuf;
2401 }
2402
2403 bool IsOk() const { return m_ok; }
2404
2405 public:
2406 wxFontEncoding m_enc;
2407 wxEncodingConverter m2w, w2m;
2408
2409 // were we initialized successfully?
2410 bool m_ok;
2411
2412 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2413 };
2414
2415 #endif // wxUSE_FONTMAP
2416
2417 // ============================================================================
2418 // wxCSConv implementation
2419 // ============================================================================
2420
2421 void wxCSConv::Init()
2422 {
2423 m_name = NULL;
2424 m_convReal = NULL;
2425 m_deferred = true;
2426 }
2427
2428 wxCSConv::wxCSConv(const wxChar *charset)
2429 {
2430 Init();
2431
2432 if ( charset )
2433 {
2434 SetName(charset);
2435 }
2436
2437 m_encoding = wxFONTENCODING_SYSTEM;
2438 }
2439
2440 wxCSConv::wxCSConv(wxFontEncoding encoding)
2441 {
2442 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2443 {
2444 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2445
2446 encoding = wxFONTENCODING_SYSTEM;
2447 }
2448
2449 Init();
2450
2451 m_encoding = encoding;
2452 }
2453
2454 wxCSConv::~wxCSConv()
2455 {
2456 Clear();
2457 }
2458
2459 wxCSConv::wxCSConv(const wxCSConv& conv)
2460 : wxMBConv()
2461 {
2462 Init();
2463
2464 SetName(conv.m_name);
2465 m_encoding = conv.m_encoding;
2466 }
2467
2468 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2469 {
2470 Clear();
2471
2472 SetName(conv.m_name);
2473 m_encoding = conv.m_encoding;
2474
2475 return *this;
2476 }
2477
2478 void wxCSConv::Clear()
2479 {
2480 free(m_name);
2481 delete m_convReal;
2482
2483 m_name = NULL;
2484 m_convReal = NULL;
2485 }
2486
2487 void wxCSConv::SetName(const wxChar *charset)
2488 {
2489 if (charset)
2490 {
2491 m_name = wxStrdup(charset);
2492 m_deferred = true;
2493 }
2494 }
2495
2496 wxMBConv *wxCSConv::DoCreate() const
2497 {
2498 // check for the special case of ASCII or ISO8859-1 charset: as we have
2499 // special knowledge of it anyhow, we don't need to create a special
2500 // conversion object
2501 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2502 {
2503 // don't convert at all
2504 return NULL;
2505 }
2506
2507 // we trust OS to do conversion better than we can so try external
2508 // conversion methods first
2509 //
2510 // the full order is:
2511 // 1. OS conversion (iconv() under Unix or Win32 API)
2512 // 2. hard coded conversions for UTF
2513 // 3. wxEncodingConverter as fall back
2514
2515 // step (1)
2516 #ifdef HAVE_ICONV
2517 #if !wxUSE_FONTMAP
2518 if ( m_name )
2519 #endif // !wxUSE_FONTMAP
2520 {
2521 wxString name(m_name);
2522
2523 #if wxUSE_FONTMAP
2524 if ( name.empty() )
2525 name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2526 #endif // wxUSE_FONTMAP
2527
2528 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2529 if ( conv->IsOk() )
2530 return conv;
2531
2532 delete conv;
2533 }
2534 #endif // HAVE_ICONV
2535
2536 #ifdef wxHAVE_WIN32_MB2WC
2537 {
2538 #if wxUSE_FONTMAP
2539 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2540 : new wxMBConv_win32(m_encoding);
2541 if ( conv->IsOk() )
2542 return conv;
2543
2544 delete conv;
2545 #else
2546 return NULL;
2547 #endif
2548 }
2549 #endif // wxHAVE_WIN32_MB2WC
2550 #if defined(__WXMAC__)
2551 {
2552 // leave UTF16 and UTF32 to the built-ins of wx
2553 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2554 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2555 {
2556
2557 #if wxUSE_FONTMAP
2558 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2559 : new wxMBConv_mac(m_encoding);
2560 #else
2561 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2562 #endif
2563 if ( conv->IsOk() )
2564 return conv;
2565
2566 delete conv;
2567 }
2568 }
2569 #endif
2570 #if defined(__WXCOCOA__)
2571 {
2572 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2573 {
2574
2575 #if wxUSE_FONTMAP
2576 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2577 : new wxMBConv_cocoa(m_encoding);
2578 #else
2579 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2580 #endif
2581 if ( conv->IsOk() )
2582 return conv;
2583
2584 delete conv;
2585 }
2586 }
2587 #endif
2588 // step (2)
2589 wxFontEncoding enc = m_encoding;
2590 #if wxUSE_FONTMAP
2591 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2592 {
2593 // use "false" to suppress interactive dialogs -- we can be called from
2594 // anywhere and popping up a dialog from here is the last thing we want to
2595 // do
2596 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2597 }
2598 #endif // wxUSE_FONTMAP
2599
2600 switch ( enc )
2601 {
2602 case wxFONTENCODING_UTF7:
2603 return new wxMBConvUTF7;
2604
2605 case wxFONTENCODING_UTF8:
2606 return new wxMBConvUTF8;
2607
2608 case wxFONTENCODING_UTF16BE:
2609 return new wxMBConvUTF16BE;
2610
2611 case wxFONTENCODING_UTF16LE:
2612 return new wxMBConvUTF16LE;
2613
2614 case wxFONTENCODING_UTF32BE:
2615 return new wxMBConvUTF32BE;
2616
2617 case wxFONTENCODING_UTF32LE:
2618 return new wxMBConvUTF32LE;
2619
2620 default:
2621 // nothing to do but put here to suppress gcc warnings
2622 ;
2623 }
2624
2625 // step (3)
2626 #if wxUSE_FONTMAP
2627 {
2628 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2629 : new wxMBConv_wxwin(m_encoding);
2630 if ( conv->IsOk() )
2631 return conv;
2632
2633 delete conv;
2634 }
2635 #endif // wxUSE_FONTMAP
2636
2637 // NB: This is a hack to prevent deadlock. What could otherwise happen
2638 // in Unicode build: wxConvLocal creation ends up being here
2639 // because of some failure and logs the error. But wxLog will try to
2640 // attach timestamp, for which it will need wxConvLocal (to convert
2641 // time to char* and then wchar_t*), but that fails, tries to log
2642 // error, but wxLog has a (already locked) critical section that
2643 // guards static buffer.
2644 static bool alreadyLoggingError = false;
2645 if (!alreadyLoggingError)
2646 {
2647 alreadyLoggingError = true;
2648 wxLogError(_("Cannot convert from the charset '%s'!"),
2649 m_name ? m_name
2650 :
2651 #if wxUSE_FONTMAP
2652 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2653 #else // !wxUSE_FONTMAP
2654 wxString::Format(_("encoding %s"), m_encoding).c_str()
2655 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2656 );
2657 alreadyLoggingError = false;
2658 }
2659
2660 return NULL;
2661 }
2662
2663 void wxCSConv::CreateConvIfNeeded() const
2664 {
2665 if ( m_deferred )
2666 {
2667 wxCSConv *self = (wxCSConv *)this; // const_cast
2668
2669 #if wxUSE_INTL
2670 // if we don't have neither the name nor the encoding, use the default
2671 // encoding for this system
2672 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2673 {
2674 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2675 }
2676 #endif // wxUSE_INTL
2677
2678 self->m_convReal = DoCreate();
2679 self->m_deferred = false;
2680 }
2681 }
2682
2683 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2684 {
2685 CreateConvIfNeeded();
2686
2687 if (m_convReal)
2688 return m_convReal->MB2WC(buf, psz, n);
2689
2690 // latin-1 (direct)
2691 size_t len = strlen(psz);
2692
2693 if (buf)
2694 {
2695 for (size_t c = 0; c <= len; c++)
2696 buf[c] = (unsigned char)(psz[c]);
2697 }
2698
2699 return len;
2700 }
2701
2702 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2703 {
2704 CreateConvIfNeeded();
2705
2706 if (m_convReal)
2707 return m_convReal->WC2MB(buf, psz, n);
2708
2709 // latin-1 (direct)
2710 const size_t len = wxWcslen(psz);
2711 if (buf)
2712 {
2713 for (size_t c = 0; c <= len; c++)
2714 {
2715 if (psz[c] > 0xFF)
2716 return (size_t)-1;
2717 buf[c] = (char)psz[c];
2718 }
2719 }
2720 else
2721 {
2722 for (size_t c = 0; c <= len; c++)
2723 {
2724 if (psz[c] > 0xFF)
2725 return (size_t)-1;
2726 }
2727 }
2728
2729 return len;
2730 }
2731
2732 // ----------------------------------------------------------------------------
2733 // globals
2734 // ----------------------------------------------------------------------------
2735
2736 #ifdef __WINDOWS__
2737 static wxMBConv_win32 wxConvLibcObj;
2738 #elif defined(__WXMAC__) && !defined(__MACH__)
2739 static wxMBConv_mac wxConvLibcObj ;
2740 #else
2741 static wxMBConvLibc wxConvLibcObj;
2742 #endif
2743
2744 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2745 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2746 static wxMBConvUTF7 wxConvUTF7Obj;
2747 static wxMBConvUTF8 wxConvUTF8Obj;
2748
2749 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2750 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2751 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2752 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2753 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2754 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2755 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2756 #ifdef __WXOSX__
2757 wxConvUTF8Obj;
2758 #else
2759 wxConvLibcObj;
2760 #endif
2761
2762
2763 #else // !wxUSE_WCHAR_T
2764
2765 // stand-ins in absence of wchar_t
2766 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2767 wxConvISO8859_1,
2768 wxConvLocal,
2769 wxConvUTF8;
2770
2771 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2772
2773