]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Hopefully fixed library names generated by wx-config for OS/2's PM port.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
62
63 // ----------------------------------------------------------------------------
64 // headers
65 // ----------------------------------------------------------------------------
66
67 #ifdef __SALFORDC__
68 #include <clib.h>
69 #endif
70
71 #ifdef HAVE_ICONV
72 #include <iconv.h>
73 #include "wx/thread.h"
74 #endif
75
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
78 #include "wx/utils.h"
79
80 #ifdef __WXMAC__
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
84
85 #include "wx/mac/private.h" // includes mac headers
86 #endif
87 // ----------------------------------------------------------------------------
88 // macros
89 // ----------------------------------------------------------------------------
90
91 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
92 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
93
94 #if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
99 #else
100 #define WC_NAME_BEST "UCS-4LE"
101 #endif
102 #elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
105 #define WC_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
108 #else
109 #define WC_NAME_BEST "UTF-16LE"
110 #endif
111 #else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
114 #endif
115
116 // ============================================================================
117 // implementation
118 // ============================================================================
119
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
123
124
125 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
126 {
127 if (input<=0xffff)
128 {
129 if (output)
130 *output = (wxUint16) input;
131 return 1;
132 }
133 else if (input>=0x110000)
134 {
135 return (size_t)-1;
136 }
137 else
138 {
139 if (output)
140 {
141 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
142 *output = (wxUint16) ((input&0x3ff)+0xdc00);
143 }
144 return 2;
145 }
146 }
147
148 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
149 {
150 if ((*input<0xd800) || (*input>0xdfff))
151 {
152 output = *input;
153 return 1;
154 }
155 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
156 {
157 output = *input;
158 return (size_t)-1;
159 }
160 else
161 {
162 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
163 return 2;
164 }
165 }
166
167
168 // ----------------------------------------------------------------------------
169 // wxMBConv
170 // ----------------------------------------------------------------------------
171
// Out-of-line destructor with an empty body.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
176
177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
178 {
179 if ( psz )
180 {
181 // calculate the length of the buffer needed first
182 size_t nLen = MB2WC(NULL, psz, 0);
183 if ( nLen != (size_t)-1 )
184 {
185 // now do the actual conversion
186 wxWCharBuffer buf(nLen);
187 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
188 if ( nLen != (size_t)-1 )
189 {
190 return buf;
191 }
192 }
193 }
194
195 wxWCharBuffer buf((wchar_t *)NULL);
196
197 return buf;
198 }
199
200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
201 {
202 if ( pwz )
203 {
204 size_t nLen = WC2MB(NULL, pwz, 0);
205 if ( nLen != (size_t)-1 )
206 {
207 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
208 nLen = WC2MB(buf.data(), pwz, nLen + 4);
209 if ( nLen != (size_t)-1 )
210 {
211 return buf;
212 }
213 }
214 }
215
216 wxCharBuffer buf((char *)NULL);
217
218 return buf;
219 }
220
221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
222 {
223 wxASSERT(pOutSize != NULL);
224
225 const char* szEnd = szString + nStringLen + 1;
226 const char* szPos = szString;
227 const char* szStart = szPos;
228
229 size_t nActualLength = 0;
230 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
231
232 wxWCharBuffer theBuffer(nCurrentSize);
233
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos != szEnd)
237 {
238 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
239
240 //Get the length of the current (sub)string
241 size_t nLen = MB2WC(NULL, szPos, 0);
242
243 //Invalid conversion?
244 if( nLen == (size_t)-1 )
245 {
246 *pOutSize = 0;
247 theBuffer.data()[0u] = wxT('\0');
248 return theBuffer;
249 }
250
251
252 //Increase the actual length (+1 for current null character)
253 nActualLength += nLen + 1;
254
255 //if buffer too big, realloc the buffer
256 if (nActualLength > (nCurrentSize+1))
257 {
258 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
259 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
260 theBuffer = theNewBuffer;
261 nCurrentSize <<= 1;
262 }
263
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
266 {
267 *pOutSize = 0;
268 theBuffer.data()[0u] = wxT('\0');
269 return theBuffer;
270 }
271
272 //Increment to next (sub)string
273 //Note that we have to use strlen instead of nLen here
274 //because XX2XX gives us the size of the output buffer,
275 //which is not necessarily the length of the string
276 szPos += strlen(szPos) + 1;
277 }
278
279 //success - return actual length and the buffer
280 *pOutSize = nActualLength;
281 return theBuffer;
282 }
283
284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
285 {
286 wxASSERT(pOutSize != NULL);
287
288 const wchar_t* szEnd = szString + nStringLen + 1;
289 const wchar_t* szPos = szString;
290 const wchar_t* szStart = szPos;
291
292 size_t nActualLength = 0;
293 size_t nCurrentSize = nStringLen << 2; //try * 4 first
294
295 wxCharBuffer theBuffer(nCurrentSize);
296
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos != szEnd)
300 {
301 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
302
303 //Get the length of the current (sub)string
304 size_t nLen = WC2MB(NULL, szPos, 0);
305
306 //Invalid conversion?
307 if( nLen == (size_t)-1 )
308 {
309 *pOutSize = 0;
310 theBuffer.data()[0u] = wxT('\0');
311 return theBuffer;
312 }
313
314 //Increase the actual length (+1 for current null character)
315 nActualLength += nLen + 1;
316
317 //if buffer too big, realloc the buffer
318 if (nActualLength > (nCurrentSize+1))
319 {
320 wxCharBuffer theNewBuffer(nCurrentSize << 1);
321 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
322 theBuffer = theNewBuffer;
323 nCurrentSize <<= 1;
324 }
325
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
328 {
329 *pOutSize = 0;
330 theBuffer.data()[0u] = wxT('\0');
331 return theBuffer;
332 }
333
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen instead of nLen here
336 //because XX2XX gives us the size of the output buffer,
337 //which is not necessarily the length of the string
338 szPos += wxWcslen(szPos) + 1;
339 }
340
341 //success - return actual length and the buffer
342 *pOutSize = nActualLength;
343 return theBuffer;
344 }
345
346 // ----------------------------------------------------------------------------
347 // wxMBConvLibc
348 // ----------------------------------------------------------------------------
349
350 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
351 {
352 return wxMB2WC(buf, psz, n);
353 }
354
355 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
356 {
357 return wxWC2MB(buf, psz, n);
358 }
359
360 #ifdef __UNIX__
361
362 // ----------------------------------------------------------------------------
363 // wxConvBrokenFileNames
364 // ----------------------------------------------------------------------------
365
366 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
367 {
368 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
369 || wxStricmp(charset, _T("UTF8")) == 0 )
370 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
371 else
372 m_conv = new wxCSConv(charset);
373 }
374
375 size_t
376 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
377 const char *psz,
378 size_t outputSize) const
379 {
380 return m_conv->MB2WC( outputBuf, psz, outputSize );
381 }
382
383 size_t
384 wxConvBrokenFileNames::WC2MB(char *outputBuf,
385 const wchar_t *psz,
386 size_t outputSize) const
387 {
388 return m_conv->WC2MB( outputBuf, psz, outputSize );
389 }
390
391 #endif
392
393 // ----------------------------------------------------------------------------
394 // UTF-7
395 // ----------------------------------------------------------------------------
396
397 // Implementation (C) 2004 Fredrik Roubert
398
//
// BASE64 decoding table: indexed by the (unsigned) input byte, it gives the
// 6-bit value of that BASE64 character or 0xff if the byte is not part of
// the BASE64 alphabet. One row per 16 byte values.
//
static const unsigned char utf7unb64[] =
{
    /* 0x00 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x10 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x20 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    /* 0x30 */ 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x40 */ 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    /* 0x50 */ 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x60 */ 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    /* 0x70 */ 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x80 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x90 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xa0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xb0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xc0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xd0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xe0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xf0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
437
438 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
439 {
440 size_t len = 0;
441
442 while (*psz && ((!buf) || (len < n)))
443 {
444 unsigned char cc = *psz++;
445 if (cc != '+')
446 {
447 // plain ASCII char
448 if (buf)
449 *buf++ = cc;
450 len++;
451 }
452 else if (*psz == '-')
453 {
454 // encoded plus sign
455 if (buf)
456 *buf++ = cc;
457 len++;
458 psz++;
459 }
460 else
461 {
462 // BASE64 encoded string
463 bool lsb;
464 unsigned char c;
465 unsigned int d, l;
466 for (lsb = false, d = 0, l = 0;
467 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
468 {
469 d <<= 6;
470 d += cc;
471 for (l += 6; l >= 8; lsb = !lsb)
472 {
473 c = (unsigned char)((d >> (l -= 8)) % 256);
474 if (lsb)
475 {
476 if (buf)
477 *buf++ |= c;
478 len ++;
479 }
480 else
481 if (buf)
482 *buf = (wchar_t)(c << 8);
483 }
484 }
485 if (*psz == '-')
486 psz++;
487 }
488 }
489 if (buf && (len < n))
490 *buf = 0;
491 return len;
492 }
493
//
// BASE64 encoding table: maps a 6-bit value to its BASE64 character
// (one row per 16 values)
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
508
//
// UTF-7 encoding table, indexed by the ASCII code of a character and giving
// its class:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
528
529 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
530 {
531
532
533 size_t len = 0;
534
535 while (*psz && ((!buf) || (len < n)))
536 {
537 wchar_t cc = *psz++;
538 if (cc < 0x80 && utf7encode[cc] < 1)
539 {
540 // plain ASCII char
541 if (buf)
542 *buf++ = (char)cc;
543 len++;
544 }
545 #ifndef WC_UTF16
546 else if (((wxUint32)cc) > 0xffff)
547 {
548 // no surrogate pair generation (yet?)
549 return (size_t)-1;
550 }
551 #endif
552 else
553 {
554 if (buf)
555 *buf++ = '+';
556 len++;
557 if (cc != '+')
558 {
559 // BASE64 encode string
560 unsigned int lsb, d, l;
561 for (d = 0, l = 0;; psz++)
562 {
563 for (lsb = 0; lsb < 2; lsb ++)
564 {
565 d <<= 8;
566 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
567
568 for (l += 8; l >= 6; )
569 {
570 l -= 6;
571 if (buf)
572 *buf++ = utf7enb64[(d >> l) % 64];
573 len++;
574 }
575 }
576 cc = *psz;
577 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
578 break;
579 }
580 if (l != 0)
581 {
582 if (buf)
583 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
584 len++;
585 }
586 }
587 if (buf)
588 *buf++ = '-';
589 len++;
590 }
591 }
592 if (buf && (len < n))
593 *buf = 0;
594 return len;
595 }
596
597 // ----------------------------------------------------------------------------
598 // UTF-8
599 // ----------------------------------------------------------------------------
600
601 static wxUint32 utf8_max[]=
602 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
603
604 // boundaries of the private use area we use to (temporarily) remap invalid
605 // characters invalid in a UTF-8 encoded string
606 const wxUint32 wxUnicodePUA = 0x100000;
607 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
608
609 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
610 {
611 size_t len = 0;
612
613 while (*psz && ((!buf) || (len < n)))
614 {
615 const char *opsz = psz;
616 bool invalid = false;
617 unsigned char cc = *psz++, fc = cc;
618 unsigned cnt;
619 for (cnt = 0; fc & 0x80; cnt++)
620 fc <<= 1;
621 if (!cnt)
622 {
623 // plain ASCII char
624 if (buf)
625 *buf++ = cc;
626 len++;
627
628 // escape the escape character for octal escapes
629 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
630 && cc == '\\' && (!buf || len < n))
631 {
632 if (buf)
633 *buf++ = cc;
634 len++;
635 }
636 }
637 else
638 {
639 cnt--;
640 if (!cnt)
641 {
642 // invalid UTF-8 sequence
643 invalid = true;
644 }
645 else
646 {
647 unsigned ocnt = cnt - 1;
648 wxUint32 res = cc & (0x3f >> cnt);
649 while (cnt--)
650 {
651 cc = *psz;
652 if ((cc & 0xC0) != 0x80)
653 {
654 // invalid UTF-8 sequence
655 invalid = true;
656 break;
657 }
658 psz++;
659 res = (res << 6) | (cc & 0x3f);
660 }
661 if (invalid || res <= utf8_max[ocnt])
662 {
663 // illegal UTF-8 encoding
664 invalid = true;
665 }
666 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
667 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
668 {
669 // if one of our PUA characters turns up externally
670 // it must also be treated as an illegal sequence
671 // (a bit like you have to escape an escape character)
672 invalid = true;
673 }
674 else
675 {
676 #ifdef WC_UTF16
677 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
678 size_t pa = encode_utf16(res, (wxUint16 *)buf);
679 if (pa == (size_t)-1)
680 {
681 invalid = true;
682 }
683 else
684 {
685 if (buf)
686 buf += pa;
687 len += pa;
688 }
689 #else // !WC_UTF16
690 if (buf)
691 *buf++ = res;
692 len++;
693 #endif // WC_UTF16/!WC_UTF16
694 }
695 }
696 if (invalid)
697 {
698 if (m_options & MAP_INVALID_UTF8_TO_PUA)
699 {
700 while (opsz < psz && (!buf || len < n))
701 {
702 #ifdef WC_UTF16
703 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
704 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
705 wxASSERT(pa != (size_t)-1);
706 if (buf)
707 buf += pa;
708 opsz++;
709 len += pa;
710 #else
711 if (buf)
712 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
713 opsz++;
714 len++;
715 #endif
716 }
717 }
718 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
719 {
720 while (opsz < psz && (!buf || len < n))
721 {
722 if ( buf && len + 3 < n )
723 {
724 unsigned char n = *opsz;
725 *buf++ = L'\\';
726 *buf++ = (wchar_t)( L'0' + n / 0100 );
727 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
728 *buf++ = (wchar_t)( L'0' + n % 010 );
729 }
730 opsz++;
731 len += 4;
732 }
733 }
734 else // MAP_INVALID_UTF8_NOT
735 {
736 return (size_t)-1;
737 }
738 }
739 }
740 }
741 if (buf && (len < n))
742 *buf = 0;
743 return len;
744 }
745
// Return true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
750
751 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
752 {
753 size_t len = 0;
754
755 while (*psz && ((!buf) || (len < n)))
756 {
757 wxUint32 cc;
758 #ifdef WC_UTF16
759 // cast is ok for WC_UTF16
760 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
761 psz += (pa == (size_t)-1) ? 1 : pa;
762 #else
763 cc=(*psz++) & 0x7fffffff;
764 #endif
765
766 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
767 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
768 {
769 if (buf)
770 *buf++ = (char)(cc - wxUnicodePUA);
771 len++;
772 }
773 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
774 && cc == L'\\' && psz[0] == L'\\' )
775 {
776 if (buf)
777 *buf++ = (char)cc;
778 psz++;
779 len++;
780 }
781 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
782 cc == L'\\' &&
783 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
784 {
785 if (buf)
786 {
787 *buf++ = (char) ((psz[0] - L'0')*0100 +
788 (psz[1] - L'0')*010 +
789 (psz[2] - L'0'));
790 }
791
792 psz += 3;
793 len++;
794 }
795 else
796 {
797 unsigned cnt;
798 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
799 if (!cnt)
800 {
801 // plain ASCII char
802 if (buf)
803 *buf++ = (char) cc;
804 len++;
805 }
806
807 else
808 {
809 len += cnt + 1;
810 if (buf)
811 {
812 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
813 while (cnt--)
814 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
815 }
816 }
817 }
818 }
819
820 if (buf && (len<n))
821 *buf = 0;
822
823 return len;
824 }
825
826 // ----------------------------------------------------------------------------
827 // UTF-16
828 // ----------------------------------------------------------------------------
829
// The converters are implemented below under the names "straight" (the
// encoding has the byte order of this machine) and "swap" (the opposite
// byte order); map these names to the LE/BE classes.
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // big endian machine
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
837
838
839 #ifdef WC_UTF16
840
841 // copy 16bit MB to 16bit String
842 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
843 {
844 size_t len=0;
845
846 while (*(wxUint16*)psz && (!buf || len < n))
847 {
848 if (buf)
849 *buf++ = *(wxUint16*)psz;
850 len++;
851
852 psz += sizeof(wxUint16);
853 }
854 if (buf && len<n) *buf=0;
855
856 return len;
857 }
858
859
860 // copy 16bit String to 16bit MB
861 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
862 {
863 size_t len=0;
864
865 while (*psz && (!buf || len < n))
866 {
867 if (buf)
868 {
869 *(wxUint16*)buf = *psz;
870 buf += sizeof(wxUint16);
871 }
872 len += sizeof(wxUint16);
873 psz++;
874 }
875 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
876
877 return len;
878 }
879
880
881 // swap 16bit MB to 16bit String
882 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
883 {
884 size_t len=0;
885
886 while (*(wxUint16*)psz && (!buf || len < n))
887 {
888 if (buf)
889 {
890 ((char *)buf)[0] = psz[1];
891 ((char *)buf)[1] = psz[0];
892 buf++;
893 }
894 len++;
895 psz += sizeof(wxUint16);
896 }
897 if (buf && len<n) *buf=0;
898
899 return len;
900 }
901
902
903 // swap 16bit MB to 16bit String
904 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
905 {
906 size_t len=0;
907
908 while (*psz && (!buf || len < n))
909 {
910 if (buf)
911 {
912 *buf++ = ((char*)psz)[1];
913 *buf++ = ((char*)psz)[0];
914 }
915 len += sizeof(wxUint16);
916 psz++;
917 }
918 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
919
920 return len;
921 }
922
923
924 #else // WC_UTF16
925
926
927 // copy 16bit MB to 32bit String
928 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
929 {
930 size_t len=0;
931
932 while (*(wxUint16*)psz && (!buf || len < n))
933 {
934 wxUint32 cc;
935 size_t pa=decode_utf16((wxUint16*)psz, cc);
936 if (pa == (size_t)-1)
937 return pa;
938
939 if (buf)
940 *buf++ = cc;
941 len++;
942 psz += pa * sizeof(wxUint16);
943 }
944 if (buf && len<n) *buf=0;
945
946 return len;
947 }
948
949
950 // copy 32bit String to 16bit MB
951 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
952 {
953 size_t len=0;
954
955 while (*psz && (!buf || len < n))
956 {
957 wxUint16 cc[2];
958 size_t pa=encode_utf16(*psz, cc);
959
960 if (pa == (size_t)-1)
961 return pa;
962
963 if (buf)
964 {
965 *(wxUint16*)buf = cc[0];
966 buf += sizeof(wxUint16);
967 if (pa > 1)
968 {
969 *(wxUint16*)buf = cc[1];
970 buf += sizeof(wxUint16);
971 }
972 }
973
974 len += pa*sizeof(wxUint16);
975 psz++;
976 }
977 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
978
979 return len;
980 }
981
982
983 // swap 16bit MB to 32bit String
984 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
985 {
986 size_t len=0;
987
988 while (*(wxUint16*)psz && (!buf || len < n))
989 {
990 wxUint32 cc;
991 char tmp[4];
992 tmp[0]=psz[1]; tmp[1]=psz[0];
993 tmp[2]=psz[3]; tmp[3]=psz[2];
994
995 size_t pa=decode_utf16((wxUint16*)tmp, cc);
996 if (pa == (size_t)-1)
997 return pa;
998
999 if (buf)
1000 *buf++ = cc;
1001
1002 len++;
1003 psz += pa * sizeof(wxUint16);
1004 }
1005 if (buf && len<n) *buf=0;
1006
1007 return len;
1008 }
1009
1010
1011 // swap 32bit String to 16bit MB
1012 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1013 {
1014 size_t len=0;
1015
1016 while (*psz && (!buf || len < n))
1017 {
1018 wxUint16 cc[2];
1019 size_t pa=encode_utf16(*psz, cc);
1020
1021 if (pa == (size_t)-1)
1022 return pa;
1023
1024 if (buf)
1025 {
1026 *buf++ = ((char*)cc)[1];
1027 *buf++ = ((char*)cc)[0];
1028 if (pa > 1)
1029 {
1030 *buf++ = ((char*)cc)[3];
1031 *buf++ = ((char*)cc)[2];
1032 }
1033 }
1034
1035 len += pa*sizeof(wxUint16);
1036 psz++;
1037 }
1038 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1039
1040 return len;
1041 }
1042
1043 #endif // WC_UTF16
1044
1045
1046 // ----------------------------------------------------------------------------
1047 // UTF-32
1048 // ----------------------------------------------------------------------------
1049
// The converters are implemented below under the names "straight" (the
// encoding has the byte order of this machine) and "swap" (the opposite
// byte order); map these names to the LE/BE classes.
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32LE
    #define wxMBConvUTF32swap     wxMBConvUTF32BE
#else // big endian machine
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap     wxMBConvUTF32LE
#endif
1057
1058
// global instances of the two UTF-32 converter classes
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1061
1062
1063 #ifdef WC_UTF16
1064
1065 // copy 32bit MB to 16bit String
1066 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1067 {
1068 size_t len=0;
1069
1070 while (*(wxUint32*)psz && (!buf || len < n))
1071 {
1072 wxUint16 cc[2];
1073
1074 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1075 if (pa == (size_t)-1)
1076 return pa;
1077
1078 if (buf)
1079 {
1080 *buf++ = cc[0];
1081 if (pa > 1)
1082 *buf++ = cc[1];
1083 }
1084 len += pa;
1085 psz += sizeof(wxUint32);
1086 }
1087 if (buf && len<n) *buf=0;
1088
1089 return len;
1090 }
1091
1092
1093 // copy 16bit String to 32bit MB
1094 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1095 {
1096 size_t len=0;
1097
1098 while (*psz && (!buf || len < n))
1099 {
1100 wxUint32 cc;
1101
1102 // cast is ok for WC_UTF16
1103 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1104 if (pa == (size_t)-1)
1105 return pa;
1106
1107 if (buf)
1108 {
1109 *(wxUint32*)buf = cc;
1110 buf += sizeof(wxUint32);
1111 }
1112 len += sizeof(wxUint32);
1113 psz += pa;
1114 }
1115
1116 if (buf && len<=n-sizeof(wxUint32))
1117 *(wxUint32*)buf=0;
1118
1119 return len;
1120 }
1121
1122
1123
1124 // swap 32bit MB to 16bit String
1125 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1126 {
1127 size_t len=0;
1128
1129 while (*(wxUint32*)psz && (!buf || len < n))
1130 {
1131 char tmp[4];
1132 tmp[0] = psz[3]; tmp[1] = psz[2];
1133 tmp[2] = psz[1]; tmp[3] = psz[0];
1134
1135
1136 wxUint16 cc[2];
1137
1138 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1139 if (pa == (size_t)-1)
1140 return pa;
1141
1142 if (buf)
1143 {
1144 *buf++ = cc[0];
1145 if (pa > 1)
1146 *buf++ = cc[1];
1147 }
1148 len += pa;
1149 psz += sizeof(wxUint32);
1150 }
1151
1152 if (buf && len<n)
1153 *buf=0;
1154
1155 return len;
1156 }
1157
1158
1159 // swap 16bit String to 32bit MB
1160 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1161 {
1162 size_t len=0;
1163
1164 while (*psz && (!buf || len < n))
1165 {
1166 char cc[4];
1167
1168 // cast is ok for WC_UTF16
1169 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1170 if (pa == (size_t)-1)
1171 return pa;
1172
1173 if (buf)
1174 {
1175 *buf++ = cc[3];
1176 *buf++ = cc[2];
1177 *buf++ = cc[1];
1178 *buf++ = cc[0];
1179 }
1180 len += sizeof(wxUint32);
1181 psz += pa;
1182 }
1183
1184 if (buf && len<=n-sizeof(wxUint32))
1185 *(wxUint32*)buf=0;
1186
1187 return len;
1188 }
1189
1190 #else // WC_UTF16
1191
1192
1193 // copy 32bit MB to 32bit String
1194 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1195 {
1196 size_t len=0;
1197
1198 while (*(wxUint32*)psz && (!buf || len < n))
1199 {
1200 if (buf)
1201 *buf++ = *(wxUint32*)psz;
1202 len++;
1203 psz += sizeof(wxUint32);
1204 }
1205
1206 if (buf && len<n)
1207 *buf=0;
1208
1209 return len;
1210 }
1211
1212
1213 // copy 32bit String to 32bit MB
1214 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1215 {
1216 size_t len=0;
1217
1218 while (*psz && (!buf || len < n))
1219 {
1220 if (buf)
1221 {
1222 *(wxUint32*)buf = *psz;
1223 buf += sizeof(wxUint32);
1224 }
1225
1226 len += sizeof(wxUint32);
1227 psz++;
1228 }
1229
1230 if (buf && len<=n-sizeof(wxUint32))
1231 *(wxUint32*)buf=0;
1232
1233 return len;
1234 }
1235
1236
1237 // swap 32bit MB to 32bit String
1238 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1239 {
1240 size_t len=0;
1241
1242 while (*(wxUint32*)psz && (!buf || len < n))
1243 {
1244 if (buf)
1245 {
1246 ((char *)buf)[0] = psz[3];
1247 ((char *)buf)[1] = psz[2];
1248 ((char *)buf)[2] = psz[1];
1249 ((char *)buf)[3] = psz[0];
1250 buf++;
1251 }
1252 len++;
1253 psz += sizeof(wxUint32);
1254 }
1255
1256 if (buf && len<n)
1257 *buf=0;
1258
1259 return len;
1260 }
1261
1262
1263 // swap 32bit String to 32bit MB
1264 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1265 {
1266 size_t len=0;
1267
1268 while (*psz && (!buf || len < n))
1269 {
1270 if (buf)
1271 {
1272 *buf++ = ((char *)psz)[3];
1273 *buf++ = ((char *)psz)[2];
1274 *buf++ = ((char *)psz)[1];
1275 *buf++ = ((char *)psz)[0];
1276 }
1277 len += sizeof(wxUint32);
1278 psz++;
1279 }
1280
1281 if (buf && len<=n-sizeof(wxUint32))
1282 *(wxUint32*)buf=0;
1283
1284 return len;
1285 }
1286
1287
1288 #endif // WC_UTF16
1289
1290
1291 // ============================================================================
1292 // The classes doing conversion using the iconv_xxx() functions
1293 // ============================================================================
1294
1295 #ifdef HAVE_ICONV
1296
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) is true if the iconv() call which returned
// cres, leaving bufLeft bytes in its input buffer, failed. The arguments are
// parenthesized so that expression arguments are evaluated as a whole.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast x to the type of the input buffer argument of iconv()
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1313
1314 // ----------------------------------------------------------------------------
1315 // wxMBConv_iconv: encapsulates an iconv character set
1316 // ----------------------------------------------------------------------------
1317
1318 class wxMBConv_iconv : public wxMBConv
1319 {
1320 public:
1321 wxMBConv_iconv(const wxChar *name);
1322 virtual ~wxMBConv_iconv();
1323
1324 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1325 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1326
1327 bool IsOk() const
1328 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1329
1330 protected:
1331 // the iconv handlers used to translate from multibyte to wide char and in
1332 // the other direction
1333 iconv_t m2w,
1334 w2m;
1335 #if wxUSE_THREADS
1336 // guards access to m2w and w2m objects
1337 wxMutex m_iconvMutex;
1338 #endif
1339
1340 private:
1341 // the name (for iconv_open()) of a wide char charset -- if none is
1342 // available on this machine, it will remain NULL
1343 static const char *ms_wcCharsetName;
1344
1345 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1346 // different endian-ness than the native one
1347 static bool ms_wcNeedsSwap;
1348 };
1349
1350 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1351 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1352
1353 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1354 {
1355 // Do it the hard way
1356 char cname[100];
1357 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1358 cname[i] = (char) name[i];
1359
1360 // check for charset that represents wchar_t:
1361 if (ms_wcCharsetName == NULL)
1362 {
1363 ms_wcNeedsSwap = false;
1364
1365 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1366 ms_wcCharsetName = WC_NAME_BEST;
1367 m2w = iconv_open(ms_wcCharsetName, cname);
1368
1369 if (m2w == (iconv_t)-1)
1370 {
1371 // try charset w/o bytesex info (e.g. "UCS4")
1372 // and check for bytesex ourselves:
1373 ms_wcCharsetName = WC_NAME;
1374 m2w = iconv_open(ms_wcCharsetName, cname);
1375
1376 // last bet, try if it knows WCHAR_T pseudo-charset
1377 if (m2w == (iconv_t)-1)
1378 {
1379 ms_wcCharsetName = "WCHAR_T";
1380 m2w = iconv_open(ms_wcCharsetName, cname);
1381 }
1382
1383 if (m2w != (iconv_t)-1)
1384 {
1385 char buf[2], *bufPtr;
1386 wchar_t wbuf[2], *wbufPtr;
1387 size_t insz, outsz;
1388 size_t res;
1389
1390 buf[0] = 'A';
1391 buf[1] = 0;
1392 wbuf[0] = 0;
1393 insz = 2;
1394 outsz = SIZEOF_WCHAR_T * 2;
1395 wbufPtr = wbuf;
1396 bufPtr = buf;
1397
1398 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1399 (char**)&wbufPtr, &outsz);
1400
1401 if (ICONV_FAILED(res, insz))
1402 {
1403 ms_wcCharsetName = NULL;
1404 wxLogLastError(wxT("iconv"));
1405 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1406 }
1407 else
1408 {
1409 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1410 }
1411 }
1412 else
1413 {
1414 ms_wcCharsetName = NULL;
1415
1416 // VS: we must not output an error here, since wxWidgets will safely
1417 // fall back to using wxEncodingConverter.
1418 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1419 //wxLogError(
1420 }
1421 }
1422 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1423 }
1424 else // we already have ms_wcCharsetName
1425 {
1426 m2w = iconv_open(ms_wcCharsetName, cname);
1427 }
1428
1429 // NB: don't ever pass NULL to iconv_open(), it may crash!
1430 if ( ms_wcCharsetName )
1431 {
1432 w2m = iconv_open( cname, ms_wcCharsetName);
1433 }
1434 else
1435 {
1436 w2m = (iconv_t)-1;
1437 }
1438 }
1439
1440 wxMBConv_iconv::~wxMBConv_iconv()
1441 {
1442 if ( m2w != (iconv_t)-1 )
1443 iconv_close(m2w);
1444 if ( w2m != (iconv_t)-1 )
1445 iconv_close(w2m);
1446 }
1447
1448 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1449 {
1450 #if wxUSE_THREADS
1451 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1452 // Unfortunately there is a couple of global wxCSConv objects such as
1453 // wxConvLocal that are used all over wx code, so we have to make sure
1454 // the handle is used by at most one thread at the time. Otherwise
1455 // only a few wx classes would be safe to use from non-main threads
1456 // as MB<->WC conversion would fail "randomly".
1457 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1458 #endif
1459
1460 size_t inbuf = strlen(psz);
1461 size_t outbuf = n * SIZEOF_WCHAR_T;
1462 size_t res, cres;
1463 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1464 wchar_t *bufPtr = buf;
1465 const char *pszPtr = psz;
1466
1467 if (buf)
1468 {
1469 // have destination buffer, convert there
1470 cres = iconv(m2w,
1471 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1472 (char**)&bufPtr, &outbuf);
1473 res = n - (outbuf / SIZEOF_WCHAR_T);
1474
1475 if (ms_wcNeedsSwap)
1476 {
1477 // convert to native endianness
1478 WC_BSWAP(buf /* _not_ bufPtr */, res)
1479 }
1480
1481 // NB: iconv was given only strlen(psz) characters on input, and so
1482 // it couldn't convert the trailing zero. Let's do it ourselves
1483 // if there's some room left for it in the output buffer.
1484 if (res < n)
1485 buf[res] = 0;
1486 }
1487 else
1488 {
1489 // no destination buffer... convert using temp buffer
1490 // to calculate destination buffer requirement
1491 wchar_t tbuf[8];
1492 res = 0;
1493 do {
1494 bufPtr = tbuf;
1495 outbuf = 8*SIZEOF_WCHAR_T;
1496
1497 cres = iconv(m2w,
1498 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1499 (char**)&bufPtr, &outbuf );
1500
1501 res += 8-(outbuf/SIZEOF_WCHAR_T);
1502 } while ((cres==(size_t)-1) && (errno==E2BIG));
1503 }
1504
1505 if (ICONV_FAILED(cres, inbuf))
1506 {
1507 //VS: it is ok if iconv fails, hence trace only
1508 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1509 return (size_t)-1;
1510 }
1511
1512 return res;
1513 }
1514
1515 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1516 {
1517 #if wxUSE_THREADS
1518 // NB: explained in MB2WC
1519 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1520 #endif
1521
1522 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1523 size_t outbuf = n;
1524 size_t res, cres;
1525
1526 wchar_t *tmpbuf = 0;
1527
1528 if (ms_wcNeedsSwap)
1529 {
1530 // need to copy to temp buffer to switch endianness
1531 // this absolutely doesn't rock!
1532 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1533 // could be in read-only memory, or be accessed in some other thread)
1534 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1535 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1536 WC_BSWAP(tmpbuf, inbuf)
1537 psz=tmpbuf;
1538 }
1539
1540 if (buf)
1541 {
1542 // have destination buffer, convert there
1543 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1544
1545 res = n-outbuf;
1546
1547 // NB: iconv was given only wcslen(psz) characters on input, and so
1548 // it couldn't convert the trailing zero. Let's do it ourselves
1549 // if there's some room left for it in the output buffer.
1550 if (res < n)
1551 buf[0] = 0;
1552 }
1553 else
1554 {
1555 // no destination buffer... convert using temp buffer
1556 // to calculate destination buffer requirement
1557 char tbuf[16];
1558 res = 0;
1559 do {
1560 buf = tbuf; outbuf = 16;
1561
1562 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1563
1564 res += 16 - outbuf;
1565 } while ((cres==(size_t)-1) && (errno==E2BIG));
1566 }
1567
1568 if (ms_wcNeedsSwap)
1569 {
1570 free(tmpbuf);
1571 }
1572
1573 if (ICONV_FAILED(cres, inbuf))
1574 {
1575 //VS: it is ok if iconv fails, hence trace only
1576 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1577 return (size_t)-1;
1578 }
1579
1580 return res;
1581 }
1582
1583 #endif // HAVE_ICONV
1584
1585
1586 // ============================================================================
1587 // Win32 conversion classes
1588 // ============================================================================
1589
1590 #ifdef wxHAVE_WIN32_MB2WC
1591
1592 // from utils.cpp
1593 #if wxUSE_FONTMAP
1594 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1595 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1596 #endif
1597
1598 class wxMBConv_win32 : public wxMBConv
1599 {
1600 public:
1601 wxMBConv_win32()
1602 {
1603 m_CodePage = CP_ACP;
1604 }
1605
1606 #if wxUSE_FONTMAP
1607 wxMBConv_win32(const wxChar* name)
1608 {
1609 m_CodePage = wxCharsetToCodepage(name);
1610 }
1611
1612 wxMBConv_win32(wxFontEncoding encoding)
1613 {
1614 m_CodePage = wxEncodingToCodepage(encoding);
1615 }
1616 #endif
1617
1618 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1619 {
1620 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1621 // the behaviour is not compatible with the Unix version (using iconv)
1622 // and break the library itself, e.g. wxTextInputStream::NextChar()
1623 // wouldn't work if reading an incomplete MB char didn't result in an
1624 // error
1625 //
1626 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1627 // an error (tested under Windows Server 2003) and apparently it is
1628 // done on purpose, i.e. the function accepts any input in this case
1629 // and although I'd prefer to return error on ill-formed output, our
1630 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1631 // explicitly ill-formed according to RFC 2152) neither so we don't
1632 // even have any fallback here...
1633 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1634
1635 const size_t len = ::MultiByteToWideChar
1636 (
1637 m_CodePage, // code page
1638 flags, // flags: fall on error
1639 psz, // input string
1640 -1, // its length (NUL-terminated)
1641 buf, // output string
1642 buf ? n : 0 // size of output buffer
1643 );
1644
1645 // note that it returns count of written chars for buf != NULL and size
1646 // of the needed buffer for buf == NULL so in either case the length of
1647 // the string (which never includes the terminating NUL) is one less
1648 return len ? len - 1 : (size_t)-1;
1649 }
1650
1651 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1652 {
1653 /*
1654 we have a problem here: by default, WideCharToMultiByte() may
1655 replace characters unrepresentable in the target code page with bad
1656 quality approximations such as turning "1/2" symbol (U+00BD) into
1657 "1" for the code pages which don't have it and we, obviously, want
1658 to avoid this at any price
1659
1660 the trouble is that this function does it _silently_, i.e. it won't
1661 even tell us whether it did or not... Win98/2000 and higher provide
1662 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1663 we have to resort to a round trip, i.e. check that converting back
1664 results in the same string -- this is, of course, expensive but
1665 otherwise we simply can't be sure to not garble the data.
1666 */
1667
1668 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1669 // it doesn't work with CJK encodings (which we test for rather roughly
1670 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1671 // supporting it
1672 BOOL usedDef wxDUMMY_INITIALIZE(false);
1673 BOOL *pUsedDef;
1674 int flags;
1675 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1676 {
1677 // it's our lucky day
1678 flags = WC_NO_BEST_FIT_CHARS;
1679 pUsedDef = &usedDef;
1680 }
1681 else // old system or unsupported encoding
1682 {
1683 flags = 0;
1684 pUsedDef = NULL;
1685 }
1686
1687 const size_t len = ::WideCharToMultiByte
1688 (
1689 m_CodePage, // code page
1690 flags, // either none or no best fit
1691 pwz, // input string
1692 -1, // it is (wide) NUL-terminated
1693 buf, // output buffer
1694 buf ? n : 0, // and its size
1695 NULL, // default "replacement" char
1696 pUsedDef // [out] was it used?
1697 );
1698
1699 if ( !len )
1700 {
1701 // function totally failed
1702 return (size_t)-1;
1703 }
1704
1705 // if we were really converting, check if we succeeded
1706 if ( buf )
1707 {
1708 if ( flags )
1709 {
1710 // check if the conversion failed, i.e. if any replacements
1711 // were done
1712 if ( usedDef )
1713 return (size_t)-1;
1714 }
1715 else // we must resort to double tripping...
1716 {
1717 wxWCharBuffer wcBuf(n);
1718 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1719 wcscmp(wcBuf, pwz) != 0 )
1720 {
1721 // we didn't obtain the same thing we started from, hence
1722 // the conversion was lossy and we consider that it failed
1723 return (size_t)-1;
1724 }
1725 }
1726 }
1727
1728 // see the comment above for the reason of "len - 1"
1729 return len - 1;
1730 }
1731
1732 bool IsOk() const { return m_CodePage != -1; }
1733
1734 private:
1735 static bool CanUseNoBestFit()
1736 {
1737 static int s_isWin98Or2k = -1;
1738
1739 if ( s_isWin98Or2k == -1 )
1740 {
1741 int verMaj, verMin;
1742 switch ( wxGetOsVersion(&verMaj, &verMin) )
1743 {
1744 case wxWIN95:
1745 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1746 break;
1747
1748 case wxWINDOWS_NT:
1749 s_isWin98Or2k = verMaj >= 5;
1750 break;
1751
1752 default:
1753 // unknown, be conseravtive by default
1754 s_isWin98Or2k = 0;
1755 }
1756
1757 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1758 }
1759
1760 return s_isWin98Or2k == 1;
1761 }
1762
1763 long m_CodePage;
1764 };
1765
1766 #endif // wxHAVE_WIN32_MB2WC
1767
1768 // ============================================================================
1769 // Cocoa conversion classes
1770 // ============================================================================
1771
1772 #if defined(__WXCOCOA__)
1773
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1777
1778 #include <CoreFoundation/CFString.h>
1779 #include <CoreFoundation/CFStringEncodingExt.h>
1780
1781 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1782 {
1783 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1784 if ( encoding == wxFONTENCODING_DEFAULT )
1785 {
1786 enc = CFStringGetSystemEncoding();
1787 }
1788 else switch( encoding)
1789 {
1790 case wxFONTENCODING_ISO8859_1 :
1791 enc = kCFStringEncodingISOLatin1 ;
1792 break ;
1793 case wxFONTENCODING_ISO8859_2 :
1794 enc = kCFStringEncodingISOLatin2;
1795 break ;
1796 case wxFONTENCODING_ISO8859_3 :
1797 enc = kCFStringEncodingISOLatin3 ;
1798 break ;
1799 case wxFONTENCODING_ISO8859_4 :
1800 enc = kCFStringEncodingISOLatin4;
1801 break ;
1802 case wxFONTENCODING_ISO8859_5 :
1803 enc = kCFStringEncodingISOLatinCyrillic;
1804 break ;
1805 case wxFONTENCODING_ISO8859_6 :
1806 enc = kCFStringEncodingISOLatinArabic;
1807 break ;
1808 case wxFONTENCODING_ISO8859_7 :
1809 enc = kCFStringEncodingISOLatinGreek;
1810 break ;
1811 case wxFONTENCODING_ISO8859_8 :
1812 enc = kCFStringEncodingISOLatinHebrew;
1813 break ;
1814 case wxFONTENCODING_ISO8859_9 :
1815 enc = kCFStringEncodingISOLatin5;
1816 break ;
1817 case wxFONTENCODING_ISO8859_10 :
1818 enc = kCFStringEncodingISOLatin6;
1819 break ;
1820 case wxFONTENCODING_ISO8859_11 :
1821 enc = kCFStringEncodingISOLatinThai;
1822 break ;
1823 case wxFONTENCODING_ISO8859_13 :
1824 enc = kCFStringEncodingISOLatin7;
1825 break ;
1826 case wxFONTENCODING_ISO8859_14 :
1827 enc = kCFStringEncodingISOLatin8;
1828 break ;
1829 case wxFONTENCODING_ISO8859_15 :
1830 enc = kCFStringEncodingISOLatin9;
1831 break ;
1832
1833 case wxFONTENCODING_KOI8 :
1834 enc = kCFStringEncodingKOI8_R;
1835 break ;
1836 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1837 enc = kCFStringEncodingDOSRussian;
1838 break ;
1839
1840 // case wxFONTENCODING_BULGARIAN :
1841 // enc = ;
1842 // break ;
1843
1844 case wxFONTENCODING_CP437 :
1845 enc =kCFStringEncodingDOSLatinUS ;
1846 break ;
1847 case wxFONTENCODING_CP850 :
1848 enc = kCFStringEncodingDOSLatin1;
1849 break ;
1850 case wxFONTENCODING_CP852 :
1851 enc = kCFStringEncodingDOSLatin2;
1852 break ;
1853 case wxFONTENCODING_CP855 :
1854 enc = kCFStringEncodingDOSCyrillic;
1855 break ;
1856 case wxFONTENCODING_CP866 :
1857 enc =kCFStringEncodingDOSRussian ;
1858 break ;
1859 case wxFONTENCODING_CP874 :
1860 enc = kCFStringEncodingDOSThai;
1861 break ;
1862 case wxFONTENCODING_CP932 :
1863 enc = kCFStringEncodingDOSJapanese;
1864 break ;
1865 case wxFONTENCODING_CP936 :
1866 enc =kCFStringEncodingDOSChineseSimplif ;
1867 break ;
1868 case wxFONTENCODING_CP949 :
1869 enc = kCFStringEncodingDOSKorean;
1870 break ;
1871 case wxFONTENCODING_CP950 :
1872 enc = kCFStringEncodingDOSChineseTrad;
1873 break ;
1874 case wxFONTENCODING_CP1250 :
1875 enc = kCFStringEncodingWindowsLatin2;
1876 break ;
1877 case wxFONTENCODING_CP1251 :
1878 enc =kCFStringEncodingWindowsCyrillic ;
1879 break ;
1880 case wxFONTENCODING_CP1252 :
1881 enc =kCFStringEncodingWindowsLatin1 ;
1882 break ;
1883 case wxFONTENCODING_CP1253 :
1884 enc = kCFStringEncodingWindowsGreek;
1885 break ;
1886 case wxFONTENCODING_CP1254 :
1887 enc = kCFStringEncodingWindowsLatin5;
1888 break ;
1889 case wxFONTENCODING_CP1255 :
1890 enc =kCFStringEncodingWindowsHebrew ;
1891 break ;
1892 case wxFONTENCODING_CP1256 :
1893 enc =kCFStringEncodingWindowsArabic ;
1894 break ;
1895 case wxFONTENCODING_CP1257 :
1896 enc = kCFStringEncodingWindowsBalticRim;
1897 break ;
1898 // This only really encodes to UTF7 (if that) evidently
1899 // case wxFONTENCODING_UTF7 :
1900 // enc = kCFStringEncodingNonLossyASCII ;
1901 // break ;
1902 case wxFONTENCODING_UTF8 :
1903 enc = kCFStringEncodingUTF8 ;
1904 break ;
1905 case wxFONTENCODING_EUC_JP :
1906 enc = kCFStringEncodingEUC_JP;
1907 break ;
1908 case wxFONTENCODING_UTF16 :
1909 enc = kCFStringEncodingUnicode ;
1910 break ;
1911 case wxFONTENCODING_MACROMAN :
1912 enc = kCFStringEncodingMacRoman ;
1913 break ;
1914 case wxFONTENCODING_MACJAPANESE :
1915 enc = kCFStringEncodingMacJapanese ;
1916 break ;
1917 case wxFONTENCODING_MACCHINESETRAD :
1918 enc = kCFStringEncodingMacChineseTrad ;
1919 break ;
1920 case wxFONTENCODING_MACKOREAN :
1921 enc = kCFStringEncodingMacKorean ;
1922 break ;
1923 case wxFONTENCODING_MACARABIC :
1924 enc = kCFStringEncodingMacArabic ;
1925 break ;
1926 case wxFONTENCODING_MACHEBREW :
1927 enc = kCFStringEncodingMacHebrew ;
1928 break ;
1929 case wxFONTENCODING_MACGREEK :
1930 enc = kCFStringEncodingMacGreek ;
1931 break ;
1932 case wxFONTENCODING_MACCYRILLIC :
1933 enc = kCFStringEncodingMacCyrillic ;
1934 break ;
1935 case wxFONTENCODING_MACDEVANAGARI :
1936 enc = kCFStringEncodingMacDevanagari ;
1937 break ;
1938 case wxFONTENCODING_MACGURMUKHI :
1939 enc = kCFStringEncodingMacGurmukhi ;
1940 break ;
1941 case wxFONTENCODING_MACGUJARATI :
1942 enc = kCFStringEncodingMacGujarati ;
1943 break ;
1944 case wxFONTENCODING_MACORIYA :
1945 enc = kCFStringEncodingMacOriya ;
1946 break ;
1947 case wxFONTENCODING_MACBENGALI :
1948 enc = kCFStringEncodingMacBengali ;
1949 break ;
1950 case wxFONTENCODING_MACTAMIL :
1951 enc = kCFStringEncodingMacTamil ;
1952 break ;
1953 case wxFONTENCODING_MACTELUGU :
1954 enc = kCFStringEncodingMacTelugu ;
1955 break ;
1956 case wxFONTENCODING_MACKANNADA :
1957 enc = kCFStringEncodingMacKannada ;
1958 break ;
1959 case wxFONTENCODING_MACMALAJALAM :
1960 enc = kCFStringEncodingMacMalayalam ;
1961 break ;
1962 case wxFONTENCODING_MACSINHALESE :
1963 enc = kCFStringEncodingMacSinhalese ;
1964 break ;
1965 case wxFONTENCODING_MACBURMESE :
1966 enc = kCFStringEncodingMacBurmese ;
1967 break ;
1968 case wxFONTENCODING_MACKHMER :
1969 enc = kCFStringEncodingMacKhmer ;
1970 break ;
1971 case wxFONTENCODING_MACTHAI :
1972 enc = kCFStringEncodingMacThai ;
1973 break ;
1974 case wxFONTENCODING_MACLAOTIAN :
1975 enc = kCFStringEncodingMacLaotian ;
1976 break ;
1977 case wxFONTENCODING_MACGEORGIAN :
1978 enc = kCFStringEncodingMacGeorgian ;
1979 break ;
1980 case wxFONTENCODING_MACARMENIAN :
1981 enc = kCFStringEncodingMacArmenian ;
1982 break ;
1983 case wxFONTENCODING_MACCHINESESIMP :
1984 enc = kCFStringEncodingMacChineseSimp ;
1985 break ;
1986 case wxFONTENCODING_MACTIBETAN :
1987 enc = kCFStringEncodingMacTibetan ;
1988 break ;
1989 case wxFONTENCODING_MACMONGOLIAN :
1990 enc = kCFStringEncodingMacMongolian ;
1991 break ;
1992 case wxFONTENCODING_MACETHIOPIC :
1993 enc = kCFStringEncodingMacEthiopic ;
1994 break ;
1995 case wxFONTENCODING_MACCENTRALEUR :
1996 enc = kCFStringEncodingMacCentralEurRoman ;
1997 break ;
1998 case wxFONTENCODING_MACVIATNAMESE :
1999 enc = kCFStringEncodingMacVietnamese ;
2000 break ;
2001 case wxFONTENCODING_MACARABICEXT :
2002 enc = kCFStringEncodingMacExtArabic ;
2003 break ;
2004 case wxFONTENCODING_MACSYMBOL :
2005 enc = kCFStringEncodingMacSymbol ;
2006 break ;
2007 case wxFONTENCODING_MACDINGBATS :
2008 enc = kCFStringEncodingMacDingbats ;
2009 break ;
2010 case wxFONTENCODING_MACTURKISH :
2011 enc = kCFStringEncodingMacTurkish ;
2012 break ;
2013 case wxFONTENCODING_MACCROATIAN :
2014 enc = kCFStringEncodingMacCroatian ;
2015 break ;
2016 case wxFONTENCODING_MACICELANDIC :
2017 enc = kCFStringEncodingMacIcelandic ;
2018 break ;
2019 case wxFONTENCODING_MACROMANIAN :
2020 enc = kCFStringEncodingMacRomanian ;
2021 break ;
2022 case wxFONTENCODING_MACCELTIC :
2023 enc = kCFStringEncodingMacCeltic ;
2024 break ;
2025 case wxFONTENCODING_MACGAELIC :
2026 enc = kCFStringEncodingMacGaelic ;
2027 break ;
2028 // case wxFONTENCODING_MACKEYBOARD :
2029 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2030 // break ;
2031 default :
2032 // because gcc is picky
2033 break ;
2034 } ;
2035 return enc ;
2036 }
2037
2038 class wxMBConv_cocoa : public wxMBConv
2039 {
2040 public:
2041 wxMBConv_cocoa()
2042 {
2043 Init(CFStringGetSystemEncoding()) ;
2044 }
2045
2046 #if wxUSE_FONTMAP
2047 wxMBConv_cocoa(const wxChar* name)
2048 {
2049 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2050 }
2051 #endif
2052
2053 wxMBConv_cocoa(wxFontEncoding encoding)
2054 {
2055 Init( wxCFStringEncFromFontEnc(encoding) );
2056 }
2057
2058 ~wxMBConv_cocoa()
2059 {
2060 }
2061
2062 void Init( CFStringEncoding encoding)
2063 {
2064 m_encoding = encoding ;
2065 }
2066
2067 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2068 {
2069 wxASSERT(szUnConv);
2070
2071 CFStringRef theString = CFStringCreateWithBytes (
2072 NULL, //the allocator
2073 (const UInt8*)szUnConv,
2074 strlen(szUnConv),
2075 m_encoding,
2076 false //no BOM/external representation
2077 );
2078
2079 wxASSERT(theString);
2080
2081 size_t nOutLength = CFStringGetLength(theString);
2082
2083 if (szOut == NULL)
2084 {
2085 CFRelease(theString);
2086 return nOutLength;
2087 }
2088
2089 CFRange theRange = { 0, nOutSize };
2090
2091 #if SIZEOF_WCHAR_T == 4
2092 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2093 #endif
2094
2095 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2096
2097 CFRelease(theString);
2098
2099 szUniCharBuffer[nOutLength] = '\0' ;
2100
2101 #if SIZEOF_WCHAR_T == 4
2102 wxMBConvUTF16 converter ;
2103 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2104 delete[] szUniCharBuffer;
2105 #endif
2106
2107 return nOutLength;
2108 }
2109
2110 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2111 {
2112 wxASSERT(szUnConv);
2113
2114 size_t nRealOutSize;
2115 size_t nBufSize = wxWcslen(szUnConv);
2116 UniChar* szUniBuffer = (UniChar*) szUnConv;
2117
2118 #if SIZEOF_WCHAR_T == 4
2119 wxMBConvUTF16 converter ;
2120 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2121 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2122 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2123 nBufSize /= sizeof(UniChar);
2124 #endif
2125
2126 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2127 NULL, //allocator
2128 szUniBuffer,
2129 nBufSize,
2130 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2131 );
2132
2133 wxASSERT(theString);
2134
2135 //Note that CER puts a BOM when converting to unicode
2136 //so we check and use getchars instead in that case
2137 if (m_encoding == kCFStringEncodingUnicode)
2138 {
2139 if (szOut != NULL)
2140 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2141
2142 nRealOutSize = CFStringGetLength(theString) + 1;
2143 }
2144 else
2145 {
2146 CFStringGetBytes(
2147 theString,
2148 CFRangeMake(0, CFStringGetLength(theString)),
2149 m_encoding,
2150 0, //what to put in characters that can't be converted -
2151 //0 tells CFString to return NULL if it meets such a character
2152 false, //not an external representation
2153 (UInt8*) szOut,
2154 nOutSize,
2155 (CFIndex*) &nRealOutSize
2156 );
2157 }
2158
2159 CFRelease(theString);
2160
2161 #if SIZEOF_WCHAR_T == 4
2162 delete[] szUniBuffer;
2163 #endif
2164
2165 return nRealOutSize - 1;
2166 }
2167
2168 bool IsOk() const
2169 {
2170 return m_encoding != kCFStringEncodingInvalidId &&
2171 CFStringIsEncodingAvailable(m_encoding);
2172 }
2173
2174 private:
2175 CFStringEncoding m_encoding ;
2176 };
2177
2178 #endif // defined(__WXCOCOA__)
2179
2180 // ============================================================================
2181 // Mac conversion classes
2182 // ============================================================================
2183
2184 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2185
2186 class wxMBConv_mac : public wxMBConv
2187 {
2188 public:
2189 wxMBConv_mac()
2190 {
2191 Init(CFStringGetSystemEncoding()) ;
2192 }
2193
2194 #if wxUSE_FONTMAP
2195 wxMBConv_mac(const wxChar* name)
2196 {
2197 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2198 }
2199 #endif
2200
2201 wxMBConv_mac(wxFontEncoding encoding)
2202 {
2203 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2204 }
2205
2206 ~wxMBConv_mac()
2207 {
2208 OSStatus status = noErr ;
2209 status = TECDisposeConverter(m_MB2WC_converter);
2210 status = TECDisposeConverter(m_WC2MB_converter);
2211 }
2212
2213
2214 void Init( TextEncodingBase encoding)
2215 {
2216 OSStatus status = noErr ;
2217 m_char_encoding = encoding ;
2218 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2219
2220 status = TECCreateConverter(&m_MB2WC_converter,
2221 m_char_encoding,
2222 m_unicode_encoding);
2223 status = TECCreateConverter(&m_WC2MB_converter,
2224 m_unicode_encoding,
2225 m_char_encoding);
2226 }
2227
2228 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2229 {
2230 OSStatus status = noErr ;
2231 ByteCount byteOutLen ;
2232 ByteCount byteInLen = strlen(psz) ;
2233 wchar_t *tbuf = NULL ;
2234 UniChar* ubuf = NULL ;
2235 size_t res = 0 ;
2236
2237 if (buf == NULL)
2238 {
2239 //apple specs say at least 32
2240 n = wxMax( 32 , byteInLen ) ;
2241 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2242 }
2243 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2244 #if SIZEOF_WCHAR_T == 4
2245 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2246 #else
2247 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2248 #endif
2249 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2250 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2251 #if SIZEOF_WCHAR_T == 4
2252 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2253 // is not properly terminated we get random characters at the end
2254 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2255 wxMBConvUTF16 converter ;
2256 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2257 free( ubuf ) ;
2258 #else
2259 res = byteOutLen / sizeof( UniChar ) ;
2260 #endif
2261 if ( buf == NULL )
2262 free(tbuf) ;
2263
2264 if ( buf && res < n)
2265 buf[res] = 0;
2266
2267 return res ;
2268 }
2269
2270 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2271 {
2272 OSStatus status = noErr ;
2273 ByteCount byteOutLen ;
2274 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2275
2276 char *tbuf = NULL ;
2277
2278 if (buf == NULL)
2279 {
2280 //apple specs say at least 32
2281 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2282 tbuf = (char*) malloc( n ) ;
2283 }
2284
2285 ByteCount byteBufferLen = n ;
2286 UniChar* ubuf = NULL ;
2287 #if SIZEOF_WCHAR_T == 4
2288 wxMBConvUTF16 converter ;
2289 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2290 byteInLen = unicharlen ;
2291 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2292 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2293 #else
2294 ubuf = (UniChar*) psz ;
2295 #endif
2296 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2297 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2298 #if SIZEOF_WCHAR_T == 4
2299 free( ubuf ) ;
2300 #endif
2301 if ( buf == NULL )
2302 free(tbuf) ;
2303
2304 size_t res = byteOutLen ;
2305 if ( buf && res < n)
2306 {
2307 buf[res] = 0;
2308
2309 //we need to double-trip to verify it didn't insert any ? in place
2310 //of bogus characters
2311 wxWCharBuffer wcBuf(n);
2312 size_t pszlen = wxWcslen(psz);
2313 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2314 wxWcslen(wcBuf) != pszlen ||
2315 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2316 {
2317 // we didn't obtain the same thing we started from, hence
2318 // the conversion was lossy and we consider that it failed
2319 return (size_t)-1;
2320 }
2321 }
2322
2323 return res ;
2324 }
2325
2326 bool IsOk() const
2327 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2328
2329 private:
2330 TECObjectRef m_MB2WC_converter ;
2331 TECObjectRef m_WC2MB_converter ;
2332
2333 TextEncodingBase m_char_encoding ;
2334 TextEncodingBase m_unicode_encoding ;
2335 };
2336
2337 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2338
2339 // ============================================================================
2340 // wxEncodingConverter based conversion classes
2341 // ============================================================================
2342
2343 #if wxUSE_FONTMAP
2344
2345 class wxMBConv_wxwin : public wxMBConv
2346 {
2347 private:
2348 void Init()
2349 {
2350 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2351 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2352 }
2353
2354 public:
2355 // temporarily just use wxEncodingConverter stuff,
2356 // so that it works while a better implementation is built
2357 wxMBConv_wxwin(const wxChar* name)
2358 {
2359 if (name)
2360 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2361 else
2362 m_enc = wxFONTENCODING_SYSTEM;
2363
2364 Init();
2365 }
2366
2367 wxMBConv_wxwin(wxFontEncoding enc)
2368 {
2369 m_enc = enc;
2370
2371 Init();
2372 }
2373
2374 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2375 {
2376 size_t inbuf = strlen(psz);
2377 if (buf)
2378 {
2379 if (!m2w.Convert(psz,buf))
2380 return (size_t)-1;
2381 }
2382 return inbuf;
2383 }
2384
2385 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2386 {
2387 const size_t inbuf = wxWcslen(psz);
2388 if (buf)
2389 {
2390 if (!w2m.Convert(psz,buf))
2391 return (size_t)-1;
2392 }
2393
2394 return inbuf;
2395 }
2396
2397 bool IsOk() const { return m_ok; }
2398
2399 public:
2400 wxFontEncoding m_enc;
2401 wxEncodingConverter m2w, w2m;
2402
2403 // were we initialized successfully?
2404 bool m_ok;
2405
2406 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2407 };
2408
2409 #endif // wxUSE_FONTMAP
2410
2411 // ============================================================================
2412 // wxCSConv implementation
2413 // ============================================================================
2414
2415 void wxCSConv::Init()
2416 {
2417 m_name = NULL;
2418 m_convReal = NULL;
2419 m_deferred = true;
2420 }
2421
2422 wxCSConv::wxCSConv(const wxChar *charset)
2423 {
2424 Init();
2425
2426 if ( charset )
2427 {
2428 SetName(charset);
2429 }
2430
2431 m_encoding = wxFONTENCODING_SYSTEM;
2432 }
2433
2434 wxCSConv::wxCSConv(wxFontEncoding encoding)
2435 {
2436 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2437 {
2438 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2439
2440 encoding = wxFONTENCODING_SYSTEM;
2441 }
2442
2443 Init();
2444
2445 m_encoding = encoding;
2446 }
2447
2448 wxCSConv::~wxCSConv()
2449 {
2450 Clear();
2451 }
2452
2453 wxCSConv::wxCSConv(const wxCSConv& conv)
2454 : wxMBConv()
2455 {
2456 Init();
2457
2458 SetName(conv.m_name);
2459 m_encoding = conv.m_encoding;
2460 }
2461
2462 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2463 {
2464 Clear();
2465
2466 SetName(conv.m_name);
2467 m_encoding = conv.m_encoding;
2468
2469 return *this;
2470 }
2471
2472 void wxCSConv::Clear()
2473 {
2474 free(m_name);
2475 delete m_convReal;
2476
2477 m_name = NULL;
2478 m_convReal = NULL;
2479 }
2480
2481 void wxCSConv::SetName(const wxChar *charset)
2482 {
2483 if (charset)
2484 {
2485 m_name = wxStrdup(charset);
2486 m_deferred = true;
2487 }
2488 }
2489
2490 wxMBConv *wxCSConv::DoCreate() const
2491 {
2492 // check for the special case of ASCII or ISO8859-1 charset: as we have
2493 // special knowledge of it anyhow, we don't need to create a special
2494 // conversion object
2495 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2496 {
2497 // don't convert at all
2498 return NULL;
2499 }
2500
2501 // we trust OS to do conversion better than we can so try external
2502 // conversion methods first
2503 //
2504 // the full order is:
2505 // 1. OS conversion (iconv() under Unix or Win32 API)
2506 // 2. hard coded conversions for UTF
2507 // 3. wxEncodingConverter as fall back
2508
2509 // step (1)
2510 #ifdef HAVE_ICONV
2511 #if !wxUSE_FONTMAP
2512 if ( m_name )
2513 #endif // !wxUSE_FONTMAP
2514 {
2515 wxString name(m_name);
2516
2517 #if wxUSE_FONTMAP
2518 if ( name.empty() )
2519 name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2520 #endif // wxUSE_FONTMAP
2521
2522 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2523 if ( conv->IsOk() )
2524 return conv;
2525
2526 delete conv;
2527 }
2528 #endif // HAVE_ICONV
2529
2530 #ifdef wxHAVE_WIN32_MB2WC
2531 {
2532 #if wxUSE_FONTMAP
2533 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2534 : new wxMBConv_win32(m_encoding);
2535 if ( conv->IsOk() )
2536 return conv;
2537
2538 delete conv;
2539 #else
2540 return NULL;
2541 #endif
2542 }
2543 #endif // wxHAVE_WIN32_MB2WC
2544 #if defined(__WXMAC__)
2545 {
2546 // leave UTF16 and UTF32 to the built-ins of wx
2547 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2548 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2549 {
2550
2551 #if wxUSE_FONTMAP
2552 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2553 : new wxMBConv_mac(m_encoding);
2554 #else
2555 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2556 #endif
2557 if ( conv->IsOk() )
2558 return conv;
2559
2560 delete conv;
2561 }
2562 }
2563 #endif
2564 #if defined(__WXCOCOA__)
2565 {
2566 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2567 {
2568
2569 #if wxUSE_FONTMAP
2570 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2571 : new wxMBConv_cocoa(m_encoding);
2572 #else
2573 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2574 #endif
2575 if ( conv->IsOk() )
2576 return conv;
2577
2578 delete conv;
2579 }
2580 }
2581 #endif
2582 // step (2)
2583 wxFontEncoding enc = m_encoding;
2584 #if wxUSE_FONTMAP
2585 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2586 {
2587 // use "false" to suppress interactive dialogs -- we can be called from
2588 // anywhere and popping up a dialog from here is the last thing we want to
2589 // do
2590 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2591 }
2592 #endif // wxUSE_FONTMAP
2593
2594 switch ( enc )
2595 {
2596 case wxFONTENCODING_UTF7:
2597 return new wxMBConvUTF7;
2598
2599 case wxFONTENCODING_UTF8:
2600 return new wxMBConvUTF8;
2601
2602 case wxFONTENCODING_UTF16BE:
2603 return new wxMBConvUTF16BE;
2604
2605 case wxFONTENCODING_UTF16LE:
2606 return new wxMBConvUTF16LE;
2607
2608 case wxFONTENCODING_UTF32BE:
2609 return new wxMBConvUTF32BE;
2610
2611 case wxFONTENCODING_UTF32LE:
2612 return new wxMBConvUTF32LE;
2613
2614 default:
2615 // nothing to do but put here to suppress gcc warnings
2616 ;
2617 }
2618
2619 // step (3)
2620 #if wxUSE_FONTMAP
2621 {
2622 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2623 : new wxMBConv_wxwin(m_encoding);
2624 if ( conv->IsOk() )
2625 return conv;
2626
2627 delete conv;
2628 }
2629 #endif // wxUSE_FONTMAP
2630
2631 // NB: This is a hack to prevent deadlock. What could otherwise happen
2632 // in Unicode build: wxConvLocal creation ends up being here
2633 // because of some failure and logs the error. But wxLog will try to
2634 // attach timestamp, for which it will need wxConvLocal (to convert
2635 // time to char* and then wchar_t*), but that fails, tries to log
2636 // error, but wxLog has a (already locked) critical section that
2637 // guards static buffer.
2638 static bool alreadyLoggingError = false;
2639 if (!alreadyLoggingError)
2640 {
2641 alreadyLoggingError = true;
2642 wxLogError(_("Cannot convert from the charset '%s'!"),
2643 m_name ? m_name
2644 :
2645 #if wxUSE_FONTMAP
2646 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2647 #else // !wxUSE_FONTMAP
2648 wxString::Format(_("encoding %s"), m_encoding).c_str()
2649 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2650 );
2651 alreadyLoggingError = false;
2652 }
2653
2654 return NULL;
2655 }
2656
2657 void wxCSConv::CreateConvIfNeeded() const
2658 {
2659 if ( m_deferred )
2660 {
2661 wxCSConv *self = (wxCSConv *)this; // const_cast
2662
2663 #if wxUSE_INTL
2664 // if we don't have neither the name nor the encoding, use the default
2665 // encoding for this system
2666 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2667 {
2668 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2669 }
2670 #endif // wxUSE_INTL
2671
2672 self->m_convReal = DoCreate();
2673 self->m_deferred = false;
2674 }
2675 }
2676
2677 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2678 {
2679 CreateConvIfNeeded();
2680
2681 if (m_convReal)
2682 return m_convReal->MB2WC(buf, psz, n);
2683
2684 // latin-1 (direct)
2685 size_t len = strlen(psz);
2686
2687 if (buf)
2688 {
2689 for (size_t c = 0; c <= len; c++)
2690 buf[c] = (unsigned char)(psz[c]);
2691 }
2692
2693 return len;
2694 }
2695
2696 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2697 {
2698 CreateConvIfNeeded();
2699
2700 if (m_convReal)
2701 return m_convReal->WC2MB(buf, psz, n);
2702
2703 // latin-1 (direct)
2704 const size_t len = wxWcslen(psz);
2705 if (buf)
2706 {
2707 for (size_t c = 0; c <= len; c++)
2708 {
2709 if (psz[c] > 0xFF)
2710 return (size_t)-1;
2711 buf[c] = (char)psz[c];
2712 }
2713 }
2714 else
2715 {
2716 for (size_t c = 0; c <= len; c++)
2717 {
2718 if (psz[c] > 0xFF)
2719 return (size_t)-1;
2720 }
2721 }
2722
2723 return len;
2724 }
2725
2726 // ----------------------------------------------------------------------------
2727 // globals
2728 // ----------------------------------------------------------------------------
2729
2730 #ifdef __WINDOWS__
2731 static wxMBConv_win32 wxConvLibcObj;
2732 #elif defined(__WXMAC__) && !defined(__MACH__)
2733 static wxMBConv_mac wxConvLibcObj ;
2734 #else
2735 static wxMBConvLibc wxConvLibcObj;
2736 #endif
2737
2738 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2739 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2740 static wxMBConvUTF7 wxConvUTF7Obj;
2741 static wxMBConvUTF8 wxConvUTF8Obj;
2742
2743 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2744 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2745 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2746 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2747 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2748 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2749 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2750 #ifdef __WXOSX__
2751 wxConvUTF8Obj;
2752 #else
2753 wxConvLibcObj;
2754 #endif
2755
2756
2757 #else // !wxUSE_WCHAR_T
2758
2759 // stand-ins in absence of wchar_t
2760 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2761 wxConvISO8859_1,
2762 wxConvLocal,
2763 wxConvUTF8;
2764
2765 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2766
2767