]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
assign 0, not NULL, to an int
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WINDOWS__
44 #include "wx/msw/private.h"
45 #include "wx/msw/missing.h"
46 #endif
47
48 #ifndef __WXWINCE__
49 #include <errno.h>
50 #endif
51
52 #include <ctype.h>
53 #include <string.h>
54 #include <stdlib.h>
55
56 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
57 #define wxHAVE_WIN32_MB2WC
58 #endif // __WIN32__ but !__WXMICROWIN__
59
60 // ----------------------------------------------------------------------------
61 // headers
62 // ----------------------------------------------------------------------------
63
64 #ifdef __SALFORDC__
65 #include <clib.h>
66 #endif
67
68 #ifdef HAVE_ICONV
69 #include <iconv.h>
70 #include "wx/thread.h"
71 #endif
72
73 #include "wx/encconv.h"
74 #include "wx/fontmap.h"
75 #include "wx/utils.h"
76
77 #ifdef __WXMAC__
78 #ifndef __DARWIN__
79 #include <ATSUnicode.h>
80 #include <TextCommon.h>
81 #include <TextEncodingConverter.h>
82 #endif
83
84 #include "wx/mac/private.h" // includes mac headers
85 #endif
86 // ----------------------------------------------------------------------------
87 // macros
88 // ----------------------------------------------------------------------------
89
// Byte-swap every element of an array in place.
//   str: array of 32-bit (BSWAP_UCS4) or 16-bit (BSWAP_UTF16) units
//   len: number of elements to swap
// The expansion is a bare { } block, so a use without a trailing semicolon
// still compiles. Both arguments are parenthesized so that expressions such
// as `p + 1` or `a - b` expand as intended; note that they are evaluated
// several times.
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT32_SWAP_ALWAYS((str)[_c]); }
#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT16_SWAP_ALWAYS((str)[_c]); }
92
93 #if SIZEOF_WCHAR_T == 4
94 #define WC_NAME "UCS4"
95 #define WC_BSWAP BSWAP_UCS4
96 #ifdef WORDS_BIGENDIAN
97 #define WC_NAME_BEST "UCS-4BE"
98 #else
99 #define WC_NAME_BEST "UCS-4LE"
100 #endif
101 #elif SIZEOF_WCHAR_T == 2
102 #define WC_NAME "UTF16"
103 #define WC_BSWAP BSWAP_UTF16
104 #define WC_UTF16
105 #ifdef WORDS_BIGENDIAN
106 #define WC_NAME_BEST "UTF-16BE"
107 #else
108 #define WC_NAME_BEST "UTF-16LE"
109 #endif
110 #else // sizeof(wchar_t) != 2 nor 4
111 // does this ever happen?
112 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
113 #endif
114
115 // ============================================================================
116 // implementation
117 // ============================================================================
118
119 // ----------------------------------------------------------------------------
120 // UTF-16 en/decoding to/from UCS-4
121 // ----------------------------------------------------------------------------
122
123
124 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
125 {
126 if (input<=0xffff)
127 {
128 if (output)
129 *output = (wxUint16) input;
130 return 1;
131 }
132 else if (input>=0x110000)
133 {
134 return (size_t)-1;
135 }
136 else
137 {
138 if (output)
139 {
140 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
141 *output = (wxUint16) ((input&0x3ff)+0xdc00);
142 }
143 return 2;
144 }
145 }
146
147 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
148 {
149 if ((*input<0xd800) || (*input>0xdfff))
150 {
151 output = *input;
152 return 1;
153 }
154 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
155 {
156 output = *input;
157 return (size_t)-1;
158 }
159 else
160 {
161 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
162 return 2;
163 }
164 }
165
166
167 // ----------------------------------------------------------------------------
168 // wxMBConv
169 // ----------------------------------------------------------------------------
170
171 wxMBConv::~wxMBConv()
172 {
173 // nothing to do here (necessary for Darwin linking probably)
174 }
175
176 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
177 {
178 if ( psz )
179 {
180 // calculate the length of the buffer needed first
181 size_t nLen = MB2WC(NULL, psz, 0);
182 if ( nLen != (size_t)-1 )
183 {
184 // now do the actual conversion
185 wxWCharBuffer buf(nLen);
186 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
187 if ( nLen != (size_t)-1 )
188 {
189 return buf;
190 }
191 }
192 }
193
194 wxWCharBuffer buf((wchar_t *)NULL);
195
196 return buf;
197 }
198
199 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
200 {
201 if ( pwz )
202 {
203 size_t nLen = WC2MB(NULL, pwz, 0);
204 if ( nLen != (size_t)-1 )
205 {
206 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
207 nLen = WC2MB(buf.data(), pwz, nLen + 4);
208 if ( nLen != (size_t)-1 )
209 {
210 return buf;
211 }
212 }
213 }
214
215 wxCharBuffer buf((char *)NULL);
216
217 return buf;
218 }
219
220 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
221 {
222 wxASSERT(pOutSize != NULL);
223
224 const char* szEnd = szString + nStringLen + 1;
225 const char* szPos = szString;
226 const char* szStart = szPos;
227
228 size_t nActualLength = 0;
229 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
230
231 wxWCharBuffer theBuffer(nCurrentSize);
232
233 //Convert the string until the length() is reached, continuing the
234 //loop every time a null character is reached
235 while(szPos != szEnd)
236 {
237 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
238
239 //Get the length of the current (sub)string
240 size_t nLen = MB2WC(NULL, szPos, 0);
241
242 //Invalid conversion?
243 if( nLen == (size_t)-1 )
244 {
245 *pOutSize = 0;
246 theBuffer.data()[0u] = wxT('\0');
247 return theBuffer;
248 }
249
250
251 //Increase the actual length (+1 for current null character)
252 nActualLength += nLen + 1;
253
254 //if buffer too big, realloc the buffer
255 if (nActualLength > (nCurrentSize+1))
256 {
257 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
258 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
259 theBuffer = theNewBuffer;
260 nCurrentSize <<= 1;
261 }
262
263 //Convert the current (sub)string
264 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
265 {
266 *pOutSize = 0;
267 theBuffer.data()[0u] = wxT('\0');
268 return theBuffer;
269 }
270
271 //Increment to next (sub)string
272 //Note that we have to use strlen instead of nLen here
273 //because XX2XX gives us the size of the output buffer,
274 //which is not necessarily the length of the string
275 szPos += strlen(szPos) + 1;
276 }
277
278 //success - return actual length and the buffer
279 *pOutSize = nActualLength;
280 return theBuffer;
281 }
282
283 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
284 {
285 wxASSERT(pOutSize != NULL);
286
287 const wchar_t* szEnd = szString + nStringLen + 1;
288 const wchar_t* szPos = szString;
289 const wchar_t* szStart = szPos;
290
291 size_t nActualLength = 0;
292 size_t nCurrentSize = nStringLen << 2; //try * 4 first
293
294 wxCharBuffer theBuffer(nCurrentSize);
295
296 //Convert the string until the length() is reached, continuing the
297 //loop every time a null character is reached
298 while(szPos != szEnd)
299 {
300 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
301
302 //Get the length of the current (sub)string
303 size_t nLen = WC2MB(NULL, szPos, 0);
304
305 //Invalid conversion?
306 if( nLen == (size_t)-1 )
307 {
308 *pOutSize = 0;
309 theBuffer.data()[0u] = wxT('\0');
310 return theBuffer;
311 }
312
313 //Increase the actual length (+1 for current null character)
314 nActualLength += nLen + 1;
315
316 //if buffer too big, realloc the buffer
317 if (nActualLength > (nCurrentSize+1))
318 {
319 wxCharBuffer theNewBuffer(nCurrentSize << 1);
320 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
321 theBuffer = theNewBuffer;
322 nCurrentSize <<= 1;
323 }
324
325 //Convert the current (sub)string
326 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
327 {
328 *pOutSize = 0;
329 theBuffer.data()[0u] = wxT('\0');
330 return theBuffer;
331 }
332
333 //Increment to next (sub)string
334 //Note that we have to use wxWcslen instead of nLen here
335 //because XX2XX gives us the size of the output buffer,
336 //which is not necessarily the length of the string
337 szPos += wxWcslen(szPos) + 1;
338 }
339
340 //success - return actual length and the buffer
341 *pOutSize = nActualLength;
342 return theBuffer;
343 }
344
345 // ----------------------------------------------------------------------------
346 // wxMBConvLibc
347 // ----------------------------------------------------------------------------
348
349 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
350 {
351 return wxMB2WC(buf, psz, n);
352 }
353
354 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
355 {
356 return wxWC2MB(buf, psz, n);
357 }
358
359 #ifdef __UNIX__
360
361 // ----------------------------------------------------------------------------
362 // wxConvBrokenFileNames
363 // ----------------------------------------------------------------------------
364
365 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
366 {
367 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
368 || wxStricmp(charset, _T("UTF8")) == 0 )
369 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
370 else
371 m_conv = new wxCSConv(charset);
372 }
373
374 size_t
375 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
376 const char *psz,
377 size_t outputSize) const
378 {
379 return m_conv->MB2WC( outputBuf, psz, outputSize );
380 }
381
382 size_t
383 wxConvBrokenFileNames::WC2MB(char *outputBuf,
384 const wchar_t *psz,
385 size_t outputSize) const
386 {
387 return m_conv->WC2MB( outputBuf, psz, outputSize );
388 }
389
390 #endif
391
392 // ----------------------------------------------------------------------------
393 // UTF-7
394 // ----------------------------------------------------------------------------
395
396 // Implementation (C) 2004 Fredrik Roubert
397
//
// BASE64 decoding table
//
// Indexed by byte value: yields the 6-bit value of a BASE64 digit, or 0xff
// for every byte that is not a BASE64 digit.
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' is 62, '/' is 63
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0'..'9' are 52..61
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x5f: 'A'..'Z' are 0..25
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x7f: 'a'..'z' are 26..51
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
436
437 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
438 {
439 size_t len = 0;
440
441 while (*psz && ((!buf) || (len < n)))
442 {
443 unsigned char cc = *psz++;
444 if (cc != '+')
445 {
446 // plain ASCII char
447 if (buf)
448 *buf++ = cc;
449 len++;
450 }
451 else if (*psz == '-')
452 {
453 // encoded plus sign
454 if (buf)
455 *buf++ = cc;
456 len++;
457 psz++;
458 }
459 else
460 {
461 // BASE64 encoded string
462 bool lsb;
463 unsigned char c;
464 unsigned int d, l;
465 for (lsb = false, d = 0, l = 0;
466 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
467 {
468 d <<= 6;
469 d += cc;
470 for (l += 6; l >= 8; lsb = !lsb)
471 {
472 c = (unsigned char)((d >> (l -= 8)) % 256);
473 if (lsb)
474 {
475 if (buf)
476 *buf++ |= c;
477 len ++;
478 }
479 else
480 if (buf)
481 *buf = (wchar_t)(c << 8);
482 }
483 }
484 if (*psz == '-')
485 psz++;
486 }
487 }
488 if (buf && (len < n))
489 *buf = 0;
490 return len;
491 }
492
//
// BASE64 encoding table
//
// Maps a 6-bit value (0..63) to its BASE64 digit.
//
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62, 63
    '+', '/'
};
507
508 //
509 // UTF-7 encoding table
510 //
511 // 0 - Set D (directly encoded characters)
512 // 1 - Set O (optional direct characters)
513 // 2 - whitespace characters (optional)
514 // 3 - special characters
515 //
516 static const unsigned char utf7encode[128] =
517 {
518 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
519 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
520 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
521 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
522 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
523 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
524 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
525 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
526 };
527
528 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
529 {
530
531
532 size_t len = 0;
533
534 while (*psz && ((!buf) || (len < n)))
535 {
536 wchar_t cc = *psz++;
537 if (cc < 0x80 && utf7encode[cc] < 1)
538 {
539 // plain ASCII char
540 if (buf)
541 *buf++ = (char)cc;
542 len++;
543 }
544 #ifndef WC_UTF16
545 else if (((wxUint32)cc) > 0xffff)
546 {
547 // no surrogate pair generation (yet?)
548 return (size_t)-1;
549 }
550 #endif
551 else
552 {
553 if (buf)
554 *buf++ = '+';
555 len++;
556 if (cc != '+')
557 {
558 // BASE64 encode string
559 unsigned int lsb, d, l;
560 for (d = 0, l = 0;; psz++)
561 {
562 for (lsb = 0; lsb < 2; lsb ++)
563 {
564 d <<= 8;
565 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
566
567 for (l += 8; l >= 6; )
568 {
569 l -= 6;
570 if (buf)
571 *buf++ = utf7enb64[(d >> l) % 64];
572 len++;
573 }
574 }
575 cc = *psz;
576 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
577 break;
578 }
579 if (l != 0)
580 {
581 if (buf)
582 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
583 len++;
584 }
585 }
586 if (buf)
587 *buf++ = '-';
588 len++;
589 }
590 }
591 if (buf && (len < n))
592 *buf = 0;
593 return len;
594 }
595
596 // ----------------------------------------------------------------------------
597 // UTF-8
598 // ----------------------------------------------------------------------------
599
600 static wxUint32 utf8_max[]=
601 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
602
603 // boundaries of the private use area we use to (temporarily) remap invalid
604 // characters invalid in a UTF-8 encoded string
605 const wxUint32 wxUnicodePUA = 0x100000;
606 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
607
608 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
609 {
610 size_t len = 0;
611
612 while (*psz && ((!buf) || (len < n)))
613 {
614 const char *opsz = psz;
615 bool invalid = false;
616 unsigned char cc = *psz++, fc = cc;
617 unsigned cnt;
618 for (cnt = 0; fc & 0x80; cnt++)
619 fc <<= 1;
620 if (!cnt)
621 {
622 // plain ASCII char
623 if (buf)
624 *buf++ = cc;
625 len++;
626
627 // escape the escape character for octal escapes
628 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
629 && cc == '\\' && (!buf || len < n))
630 {
631 if (buf)
632 *buf++ = cc;
633 len++;
634 }
635 }
636 else
637 {
638 cnt--;
639 if (!cnt)
640 {
641 // invalid UTF-8 sequence
642 invalid = true;
643 }
644 else
645 {
646 unsigned ocnt = cnt - 1;
647 wxUint32 res = cc & (0x3f >> cnt);
648 while (cnt--)
649 {
650 cc = *psz;
651 if ((cc & 0xC0) != 0x80)
652 {
653 // invalid UTF-8 sequence
654 invalid = true;
655 break;
656 }
657 psz++;
658 res = (res << 6) | (cc & 0x3f);
659 }
660 if (invalid || res <= utf8_max[ocnt])
661 {
662 // illegal UTF-8 encoding
663 invalid = true;
664 }
665 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
666 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
667 {
668 // if one of our PUA characters turns up externally
669 // it must also be treated as an illegal sequence
670 // (a bit like you have to escape an escape character)
671 invalid = true;
672 }
673 else
674 {
675 #ifdef WC_UTF16
676 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
677 size_t pa = encode_utf16(res, (wxUint16 *)buf);
678 if (pa == (size_t)-1)
679 {
680 invalid = true;
681 }
682 else
683 {
684 if (buf)
685 buf += pa;
686 len += pa;
687 }
688 #else // !WC_UTF16
689 if (buf)
690 *buf++ = res;
691 len++;
692 #endif // WC_UTF16/!WC_UTF16
693 }
694 }
695 if (invalid)
696 {
697 if (m_options & MAP_INVALID_UTF8_TO_PUA)
698 {
699 while (opsz < psz && (!buf || len < n))
700 {
701 #ifdef WC_UTF16
702 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
703 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
704 wxASSERT(pa != (size_t)-1);
705 if (buf)
706 buf += pa;
707 opsz++;
708 len += pa;
709 #else
710 if (buf)
711 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
712 opsz++;
713 len++;
714 #endif
715 }
716 }
717 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
718 {
719 while (opsz < psz && (!buf || len < n))
720 {
721 if ( buf && len + 3 < n )
722 {
723 unsigned char n = *opsz;
724 *buf++ = L'\\';
725 *buf++ = (wchar_t)( L'0' + n / 0100 );
726 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
727 *buf++ = (wchar_t)( L'0' + n % 010 );
728 }
729 opsz++;
730 len += 4;
731 }
732 }
733 else // MAP_INVALID_UTF8_NOT
734 {
735 return (size_t)-1;
736 }
737 }
738 }
739 }
740 if (buf && (len < n))
741 *buf = 0;
742 return len;
743 }
744
// True if wch is one of the octal digits L'0' .. L'7'.
static inline bool isoctal(wchar_t wch)
{
    if (wch < L'0')
        return false;
    return wch <= L'7';
}
749
750 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
751 {
752 size_t len = 0;
753
754 while (*psz && ((!buf) || (len < n)))
755 {
756 wxUint32 cc;
757 #ifdef WC_UTF16
758 // cast is ok for WC_UTF16
759 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
760 psz += (pa == (size_t)-1) ? 1 : pa;
761 #else
762 cc=(*psz++) & 0x7fffffff;
763 #endif
764
765 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
766 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
767 {
768 if (buf)
769 *buf++ = (char)(cc - wxUnicodePUA);
770 len++;
771 }
772 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
773 && cc == L'\\' && psz[0] == L'\\' )
774 {
775 if (buf)
776 *buf++ = (char)cc;
777 psz++;
778 len++;
779 }
780 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
781 cc == L'\\' &&
782 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
783 {
784 if (buf)
785 {
786 *buf++ = (char) ((psz[0] - L'0')*0100 +
787 (psz[1] - L'0')*010 +
788 (psz[2] - L'0'));
789 }
790
791 psz += 3;
792 len++;
793 }
794 else
795 {
796 unsigned cnt;
797 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
798 if (!cnt)
799 {
800 // plain ASCII char
801 if (buf)
802 *buf++ = (char) cc;
803 len++;
804 }
805
806 else
807 {
808 len += cnt + 1;
809 if (buf)
810 {
811 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
812 while (cnt--)
813 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
814 }
815 }
816 }
817 }
818
819 if (buf && (len<n))
820 *buf = 0;
821
822 return len;
823 }
824
825 // ----------------------------------------------------------------------------
826 // UTF-16
827 // ----------------------------------------------------------------------------
828
829 #ifdef WORDS_BIGENDIAN
830 #define wxMBConvUTF16straight wxMBConvUTF16BE
831 #define wxMBConvUTF16swap wxMBConvUTF16LE
832 #else
833 #define wxMBConvUTF16swap wxMBConvUTF16BE
834 #define wxMBConvUTF16straight wxMBConvUTF16LE
835 #endif
836
837
838 #ifdef WC_UTF16
839
840 // copy 16bit MB to 16bit String
841 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
842 {
843 size_t len=0;
844
845 while (*(wxUint16*)psz && (!buf || len < n))
846 {
847 if (buf)
848 *buf++ = *(wxUint16*)psz;
849 len++;
850
851 psz += sizeof(wxUint16);
852 }
853 if (buf && len<n) *buf=0;
854
855 return len;
856 }
857
858
859 // copy 16bit String to 16bit MB
860 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
861 {
862 size_t len=0;
863
864 while (*psz && (!buf || len < n))
865 {
866 if (buf)
867 {
868 *(wxUint16*)buf = *psz;
869 buf += sizeof(wxUint16);
870 }
871 len += sizeof(wxUint16);
872 psz++;
873 }
874 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
875
876 return len;
877 }
878
879
880 // swap 16bit MB to 16bit String
881 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
882 {
883 size_t len=0;
884
885 while (*(wxUint16*)psz && (!buf || len < n))
886 {
887 if (buf)
888 {
889 ((char *)buf)[0] = psz[1];
890 ((char *)buf)[1] = psz[0];
891 buf++;
892 }
893 len++;
894 psz += sizeof(wxUint16);
895 }
896 if (buf && len<n) *buf=0;
897
898 return len;
899 }
900
901
902 // swap 16bit MB to 16bit String
903 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
904 {
905 size_t len=0;
906
907 while (*psz && (!buf || len < n))
908 {
909 if (buf)
910 {
911 *buf++ = ((char*)psz)[1];
912 *buf++ = ((char*)psz)[0];
913 }
914 len += sizeof(wxUint16);
915 psz++;
916 }
917 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
918
919 return len;
920 }
921
922
923 #else // WC_UTF16
924
925
926 // copy 16bit MB to 32bit String
927 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
928 {
929 size_t len=0;
930
931 while (*(wxUint16*)psz && (!buf || len < n))
932 {
933 wxUint32 cc;
934 size_t pa=decode_utf16((wxUint16*)psz, cc);
935 if (pa == (size_t)-1)
936 return pa;
937
938 if (buf)
939 *buf++ = cc;
940 len++;
941 psz += pa * sizeof(wxUint16);
942 }
943 if (buf && len<n) *buf=0;
944
945 return len;
946 }
947
948
949 // copy 32bit String to 16bit MB
950 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
951 {
952 size_t len=0;
953
954 while (*psz && (!buf || len < n))
955 {
956 wxUint16 cc[2];
957 size_t pa=encode_utf16(*psz, cc);
958
959 if (pa == (size_t)-1)
960 return pa;
961
962 if (buf)
963 {
964 *(wxUint16*)buf = cc[0];
965 buf += sizeof(wxUint16);
966 if (pa > 1)
967 {
968 *(wxUint16*)buf = cc[1];
969 buf += sizeof(wxUint16);
970 }
971 }
972
973 len += pa*sizeof(wxUint16);
974 psz++;
975 }
976 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
977
978 return len;
979 }
980
981
982 // swap 16bit MB to 32bit String
983 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
984 {
985 size_t len=0;
986
987 while (*(wxUint16*)psz && (!buf || len < n))
988 {
989 wxUint32 cc;
990 char tmp[4];
991 tmp[0]=psz[1]; tmp[1]=psz[0];
992 tmp[2]=psz[3]; tmp[3]=psz[2];
993
994 size_t pa=decode_utf16((wxUint16*)tmp, cc);
995 if (pa == (size_t)-1)
996 return pa;
997
998 if (buf)
999 *buf++ = cc;
1000
1001 len++;
1002 psz += pa * sizeof(wxUint16);
1003 }
1004 if (buf && len<n) *buf=0;
1005
1006 return len;
1007 }
1008
1009
1010 // swap 32bit String to 16bit MB
1011 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1012 {
1013 size_t len=0;
1014
1015 while (*psz && (!buf || len < n))
1016 {
1017 wxUint16 cc[2];
1018 size_t pa=encode_utf16(*psz, cc);
1019
1020 if (pa == (size_t)-1)
1021 return pa;
1022
1023 if (buf)
1024 {
1025 *buf++ = ((char*)cc)[1];
1026 *buf++ = ((char*)cc)[0];
1027 if (pa > 1)
1028 {
1029 *buf++ = ((char*)cc)[3];
1030 *buf++ = ((char*)cc)[2];
1031 }
1032 }
1033
1034 len += pa*sizeof(wxUint16);
1035 psz++;
1036 }
1037 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1038
1039 return len;
1040 }
1041
1042 #endif // WC_UTF16
1043
1044
1045 // ----------------------------------------------------------------------------
1046 // UTF-32
1047 // ----------------------------------------------------------------------------
1048
1049 #ifdef WORDS_BIGENDIAN
1050 #define wxMBConvUTF32straight wxMBConvUTF32BE
1051 #define wxMBConvUTF32swap wxMBConvUTF32LE
1052 #else
1053 #define wxMBConvUTF32swap wxMBConvUTF32BE
1054 #define wxMBConvUTF32straight wxMBConvUTF32LE
1055 #endif
1056
1057
// File-scope converter objects, one of each UTF-32 byte-order class.
// NOTE(review): WXDLLIMPEXP_DATA_BASE is defined outside this file;
// presumably it adds the DLL import/export decoration -- confirm.
1058 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1059 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1060
1061
1062 #ifdef WC_UTF16
1063
1064 // copy 32bit MB to 16bit String
1065 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1066 {
1067 size_t len=0;
1068
1069 while (*(wxUint32*)psz && (!buf || len < n))
1070 {
1071 wxUint16 cc[2];
1072
1073 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1074 if (pa == (size_t)-1)
1075 return pa;
1076
1077 if (buf)
1078 {
1079 *buf++ = cc[0];
1080 if (pa > 1)
1081 *buf++ = cc[1];
1082 }
1083 len += pa;
1084 psz += sizeof(wxUint32);
1085 }
1086 if (buf && len<n) *buf=0;
1087
1088 return len;
1089 }
1090
1091
1092 // copy 16bit String to 32bit MB
1093 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1094 {
1095 size_t len=0;
1096
1097 while (*psz && (!buf || len < n))
1098 {
1099 wxUint32 cc;
1100
1101 // cast is ok for WC_UTF16
1102 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1103 if (pa == (size_t)-1)
1104 return pa;
1105
1106 if (buf)
1107 {
1108 *(wxUint32*)buf = cc;
1109 buf += sizeof(wxUint32);
1110 }
1111 len += sizeof(wxUint32);
1112 psz += pa;
1113 }
1114
1115 if (buf && len<=n-sizeof(wxUint32))
1116 *(wxUint32*)buf=0;
1117
1118 return len;
1119 }
1120
1121
1122
1123 // swap 32bit MB to 16bit String
1124 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1125 {
1126 size_t len=0;
1127
1128 while (*(wxUint32*)psz && (!buf || len < n))
1129 {
1130 char tmp[4];
1131 tmp[0] = psz[3]; tmp[1] = psz[2];
1132 tmp[2] = psz[1]; tmp[3] = psz[0];
1133
1134
1135 wxUint16 cc[2];
1136
1137 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1138 if (pa == (size_t)-1)
1139 return pa;
1140
1141 if (buf)
1142 {
1143 *buf++ = cc[0];
1144 if (pa > 1)
1145 *buf++ = cc[1];
1146 }
1147 len += pa;
1148 psz += sizeof(wxUint32);
1149 }
1150
1151 if (buf && len<n)
1152 *buf=0;
1153
1154 return len;
1155 }
1156
1157
1158 // swap 16bit String to 32bit MB
1159 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1160 {
1161 size_t len=0;
1162
1163 while (*psz && (!buf || len < n))
1164 {
1165 char cc[4];
1166
1167 // cast is ok for WC_UTF16
1168 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1169 if (pa == (size_t)-1)
1170 return pa;
1171
1172 if (buf)
1173 {
1174 *buf++ = cc[3];
1175 *buf++ = cc[2];
1176 *buf++ = cc[1];
1177 *buf++ = cc[0];
1178 }
1179 len += sizeof(wxUint32);
1180 psz += pa;
1181 }
1182
1183 if (buf && len<=n-sizeof(wxUint32))
1184 *(wxUint32*)buf=0;
1185
1186 return len;
1187 }
1188
1189 #else // WC_UTF16
1190
1191
1192 // copy 32bit MB to 32bit String
1193 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1194 {
1195 size_t len=0;
1196
1197 while (*(wxUint32*)psz && (!buf || len < n))
1198 {
1199 if (buf)
1200 *buf++ = *(wxUint32*)psz;
1201 len++;
1202 psz += sizeof(wxUint32);
1203 }
1204
1205 if (buf && len<n)
1206 *buf=0;
1207
1208 return len;
1209 }
1210
1211
1212 // copy 32bit String to 32bit MB
1213 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1214 {
1215 size_t len=0;
1216
1217 while (*psz && (!buf || len < n))
1218 {
1219 if (buf)
1220 {
1221 *(wxUint32*)buf = *psz;
1222 buf += sizeof(wxUint32);
1223 }
1224
1225 len += sizeof(wxUint32);
1226 psz++;
1227 }
1228
1229 if (buf && len<=n-sizeof(wxUint32))
1230 *(wxUint32*)buf=0;
1231
1232 return len;
1233 }
1234
1235
1236 // swap 32bit MB to 32bit String
1237 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1238 {
1239 size_t len=0;
1240
1241 while (*(wxUint32*)psz && (!buf || len < n))
1242 {
1243 if (buf)
1244 {
1245 ((char *)buf)[0] = psz[3];
1246 ((char *)buf)[1] = psz[2];
1247 ((char *)buf)[2] = psz[1];
1248 ((char *)buf)[3] = psz[0];
1249 buf++;
1250 }
1251 len++;
1252 psz += sizeof(wxUint32);
1253 }
1254
1255 if (buf && len<n)
1256 *buf=0;
1257
1258 return len;
1259 }
1260
1261
1262 // swap 32bit String to 32bit MB
1263 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1264 {
1265 size_t len=0;
1266
1267 while (*psz && (!buf || len < n))
1268 {
1269 if (buf)
1270 {
1271 *buf++ = ((char *)psz)[3];
1272 *buf++ = ((char *)psz)[2];
1273 *buf++ = ((char *)psz)[1];
1274 *buf++ = ((char *)psz)[0];
1275 }
1276 len += sizeof(wxUint32);
1277 psz++;
1278 }
1279
1280 if (buf && len<=n-sizeof(wxUint32))
1281 *(wxUint32*)buf=0;
1282
1283 return len;
1284 }
1285
1286
1287 #endif // WC_UTF16
1288
1289
1290 // ============================================================================
1291 // The classes doing conversion using the iconv_xxx() functions
1292 // ============================================================================
1293
1294 #ifdef HAVE_ICONV
1295
1296 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1297 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1298 // (unless there's yet another bug in glibc) the only case when iconv()
1299 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1300 // left in the input buffer -- when _real_ error occurs,
1301 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1302 // iconv() failure.
1303 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call that returned cres
// and left bufLeft bytes unconverted must be treated as a failure. With
// glibc 2.0/2.1 an E2BIG error with nothing left over is not counted.
// Arguments are parenthesized so that any expression may be passed.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast a buffer pointer to the input-buffer argument type of iconv(), using
// ICONV_CONST for its constness
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1312
1313 // ----------------------------------------------------------------------------
1314 // wxMBConv_iconv: encapsulates an iconv character set
1315 // ----------------------------------------------------------------------------
1316
1317 class wxMBConv_iconv : public wxMBConv
1318 {
1319 public:
1320 wxMBConv_iconv(const wxChar *name);
1321 virtual ~wxMBConv_iconv();
1322
1323 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1324 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1325
1326 bool IsOk() const
1327 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1328
1329 protected:
1330 // the iconv handlers used to translate from multibyte to wide char and in
1331 // the other direction
1332 iconv_t m2w,
1333 w2m;
1334 #if wxUSE_THREADS
1335 // guards access to m2w and w2m objects
1336 wxMutex m_iconvMutex;
1337 #endif
1338
1339 private:
1340 // the name (for iconv_open()) of a wide char charset -- if none is
1341 // available on this machine, it will remain NULL
1342 static const char *ms_wcCharsetName;
1343
1344 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1345 // different endian-ness than the native one
1346 static bool ms_wcNeedsSwap;
1347 };
1348
1349 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1350 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1351
1352 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1353 {
1354 // Do it the hard way
1355 char cname[100];
1356 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1357 cname[i] = (char) name[i];
1358
1359 // check for charset that represents wchar_t:
1360 if (ms_wcCharsetName == NULL)
1361 {
1362 ms_wcNeedsSwap = false;
1363
1364 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1365 ms_wcCharsetName = WC_NAME_BEST;
1366 m2w = iconv_open(ms_wcCharsetName, cname);
1367
1368 if (m2w == (iconv_t)-1)
1369 {
1370 // try charset w/o bytesex info (e.g. "UCS4")
1371 // and check for bytesex ourselves:
1372 ms_wcCharsetName = WC_NAME;
1373 m2w = iconv_open(ms_wcCharsetName, cname);
1374
1375 // last bet, try if it knows WCHAR_T pseudo-charset
1376 if (m2w == (iconv_t)-1)
1377 {
1378 ms_wcCharsetName = "WCHAR_T";
1379 m2w = iconv_open(ms_wcCharsetName, cname);
1380 }
1381
1382 if (m2w != (iconv_t)-1)
1383 {
1384 char buf[2], *bufPtr;
1385 wchar_t wbuf[2], *wbufPtr;
1386 size_t insz, outsz;
1387 size_t res;
1388
1389 buf[0] = 'A';
1390 buf[1] = 0;
1391 wbuf[0] = 0;
1392 insz = 2;
1393 outsz = SIZEOF_WCHAR_T * 2;
1394 wbufPtr = wbuf;
1395 bufPtr = buf;
1396
1397 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1398 (char**)&wbufPtr, &outsz);
1399
1400 if (ICONV_FAILED(res, insz))
1401 {
1402 ms_wcCharsetName = NULL;
1403 wxLogLastError(wxT("iconv"));
1404 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1405 }
1406 else
1407 {
1408 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1409 }
1410 }
1411 else
1412 {
1413 ms_wcCharsetName = NULL;
1414
1415 // VS: we must not output an error here, since wxWidgets will safely
1416 // fall back to using wxEncodingConverter.
1417 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1418 //wxLogError(
1419 }
1420 }
1421 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1422 }
1423 else // we already have ms_wcCharsetName
1424 {
1425 m2w = iconv_open(ms_wcCharsetName, cname);
1426 }
1427
1428 // NB: don't ever pass NULL to iconv_open(), it may crash!
1429 if ( ms_wcCharsetName )
1430 {
1431 w2m = iconv_open( cname, ms_wcCharsetName);
1432 }
1433 else
1434 {
1435 w2m = (iconv_t)-1;
1436 }
1437 }
1438
1439 wxMBConv_iconv::~wxMBConv_iconv()
1440 {
1441 if ( m2w != (iconv_t)-1 )
1442 iconv_close(m2w);
1443 if ( w2m != (iconv_t)-1 )
1444 iconv_close(w2m);
1445 }
1446
1447 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1448 {
1449 #if wxUSE_THREADS
1450 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1451 // Unfortunately there is a couple of global wxCSConv objects such as
1452 // wxConvLocal that are used all over wx code, so we have to make sure
1453 // the handle is used by at most one thread at the time. Otherwise
1454 // only a few wx classes would be safe to use from non-main threads
1455 // as MB<->WC conversion would fail "randomly".
1456 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1457 #endif
1458
1459 size_t inbuf = strlen(psz);
1460 size_t outbuf = n * SIZEOF_WCHAR_T;
1461 size_t res, cres;
1462 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1463 wchar_t *bufPtr = buf;
1464 const char *pszPtr = psz;
1465
1466 if (buf)
1467 {
1468 // have destination buffer, convert there
1469 cres = iconv(m2w,
1470 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1471 (char**)&bufPtr, &outbuf);
1472 res = n - (outbuf / SIZEOF_WCHAR_T);
1473
1474 if (ms_wcNeedsSwap)
1475 {
1476 // convert to native endianness
1477 WC_BSWAP(buf /* _not_ bufPtr */, res)
1478 }
1479
1480 // NB: iconv was given only strlen(psz) characters on input, and so
1481 // it couldn't convert the trailing zero. Let's do it ourselves
1482 // if there's some room left for it in the output buffer.
1483 if (res < n)
1484 buf[res] = 0;
1485 }
1486 else
1487 {
1488 // no destination buffer... convert using temp buffer
1489 // to calculate destination buffer requirement
1490 wchar_t tbuf[8];
1491 res = 0;
1492 do {
1493 bufPtr = tbuf;
1494 outbuf = 8*SIZEOF_WCHAR_T;
1495
1496 cres = iconv(m2w,
1497 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1498 (char**)&bufPtr, &outbuf );
1499
1500 res += 8-(outbuf/SIZEOF_WCHAR_T);
1501 } while ((cres==(size_t)-1) && (errno==E2BIG));
1502 }
1503
1504 if (ICONV_FAILED(cres, inbuf))
1505 {
1506 //VS: it is ok if iconv fails, hence trace only
1507 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1508 return (size_t)-1;
1509 }
1510
1511 return res;
1512 }
1513
1514 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1515 {
1516 #if wxUSE_THREADS
1517 // NB: explained in MB2WC
1518 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1519 #endif
1520
1521 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1522 size_t outbuf = n;
1523 size_t res, cres;
1524
1525 wchar_t *tmpbuf = 0;
1526
1527 if (ms_wcNeedsSwap)
1528 {
1529 // need to copy to temp buffer to switch endianness
1530 // this absolutely doesn't rock!
1531 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1532 // could be in read-only memory, or be accessed in some other thread)
1533 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1534 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1535 WC_BSWAP(tmpbuf, inbuf)
1536 psz=tmpbuf;
1537 }
1538
1539 if (buf)
1540 {
1541 // have destination buffer, convert there
1542 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1543
1544 res = n-outbuf;
1545
1546 // NB: iconv was given only wcslen(psz) characters on input, and so
1547 // it couldn't convert the trailing zero. Let's do it ourselves
1548 // if there's some room left for it in the output buffer.
1549 if (res < n)
1550 buf[0] = 0;
1551 }
1552 else
1553 {
1554 // no destination buffer... convert using temp buffer
1555 // to calculate destination buffer requirement
1556 char tbuf[16];
1557 res = 0;
1558 do {
1559 buf = tbuf; outbuf = 16;
1560
1561 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1562
1563 res += 16 - outbuf;
1564 } while ((cres==(size_t)-1) && (errno==E2BIG));
1565 }
1566
1567 if (ms_wcNeedsSwap)
1568 {
1569 free(tmpbuf);
1570 }
1571
1572 if (ICONV_FAILED(cres, inbuf))
1573 {
1574 //VS: it is ok if iconv fails, hence trace only
1575 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1576 return (size_t)-1;
1577 }
1578
1579 return res;
1580 }
1581
1582 #endif // HAVE_ICONV
1583
1584
1585 // ============================================================================
1586 // Win32 conversion classes
1587 // ============================================================================
1588
1589 #ifdef wxHAVE_WIN32_MB2WC
1590
1591 // from utils.cpp
1592 #if wxUSE_FONTMAP
1593 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1594 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1595 #endif
1596
1597 class wxMBConv_win32 : public wxMBConv
1598 {
1599 public:
1600 wxMBConv_win32()
1601 {
1602 m_CodePage = CP_ACP;
1603 }
1604
1605 #if wxUSE_FONTMAP
1606 wxMBConv_win32(const wxChar* name)
1607 {
1608 m_CodePage = wxCharsetToCodepage(name);
1609 }
1610
1611 wxMBConv_win32(wxFontEncoding encoding)
1612 {
1613 m_CodePage = wxEncodingToCodepage(encoding);
1614 }
1615 #endif
1616
1617 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1618 {
1619 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1620 // the behaviour is not compatible with the Unix version (using iconv)
1621 // and break the library itself, e.g. wxTextInputStream::NextChar()
1622 // wouldn't work if reading an incomplete MB char didn't result in an
1623 // error
1624 //
1625 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1626 // an error (tested under Windows Server 2003) and apparently it is
1627 // done on purpose, i.e. the function accepts any input in this case
1628 // and although I'd prefer to return error on ill-formed output, our
1629 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1630 // explicitly ill-formed according to RFC 2152) neither so we don't
1631 // even have any fallback here...
1632 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1633
1634 const size_t len = ::MultiByteToWideChar
1635 (
1636 m_CodePage, // code page
1637 flags, // flags: fall on error
1638 psz, // input string
1639 -1, // its length (NUL-terminated)
1640 buf, // output string
1641 buf ? n : 0 // size of output buffer
1642 );
1643
1644 // note that it returns count of written chars for buf != NULL and size
1645 // of the needed buffer for buf == NULL so in either case the length of
1646 // the string (which never includes the terminating NUL) is one less
1647 return len ? len - 1 : (size_t)-1;
1648 }
1649
1650 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1651 {
1652 /*
1653 we have a problem here: by default, WideCharToMultiByte() may
1654 replace characters unrepresentable in the target code page with bad
1655 quality approximations such as turning "1/2" symbol (U+00BD) into
1656 "1" for the code pages which don't have it and we, obviously, want
1657 to avoid this at any price
1658
1659 the trouble is that this function does it _silently_, i.e. it won't
1660 even tell us whether it did or not... Win98/2000 and higher provide
1661 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1662 we have to resort to a round trip, i.e. check that converting back
1663 results in the same string -- this is, of course, expensive but
1664 otherwise we simply can't be sure to not garble the data.
1665 */
1666
1667 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1668 // it doesn't work with CJK encodings (which we test for rather roughly
1669 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1670 // supporting it
1671 BOOL usedDef wxDUMMY_INITIALIZE(false);
1672 BOOL *pUsedDef;
1673 int flags;
1674 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1675 {
1676 // it's our lucky day
1677 flags = WC_NO_BEST_FIT_CHARS;
1678 pUsedDef = &usedDef;
1679 }
1680 else // old system or unsupported encoding
1681 {
1682 flags = 0;
1683 pUsedDef = NULL;
1684 }
1685
1686 const size_t len = ::WideCharToMultiByte
1687 (
1688 m_CodePage, // code page
1689 flags, // either none or no best fit
1690 pwz, // input string
1691 -1, // it is (wide) NUL-terminated
1692 buf, // output buffer
1693 buf ? n : 0, // and its size
1694 NULL, // default "replacement" char
1695 pUsedDef // [out] was it used?
1696 );
1697
1698 if ( !len )
1699 {
1700 // function totally failed
1701 return (size_t)-1;
1702 }
1703
1704 // if we were really converting, check if we succeeded
1705 if ( buf )
1706 {
1707 if ( flags )
1708 {
1709 // check if the conversion failed, i.e. if any replacements
1710 // were done
1711 if ( usedDef )
1712 return (size_t)-1;
1713 }
1714 else // we must resort to double tripping...
1715 {
1716 wxWCharBuffer wcBuf(n);
1717 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1718 wcscmp(wcBuf, pwz) != 0 )
1719 {
1720 // we didn't obtain the same thing we started from, hence
1721 // the conversion was lossy and we consider that it failed
1722 return (size_t)-1;
1723 }
1724 }
1725 }
1726
1727 // see the comment above for the reason of "len - 1"
1728 return len - 1;
1729 }
1730
1731 bool IsOk() const { return m_CodePage != -1; }
1732
1733 private:
1734 static bool CanUseNoBestFit()
1735 {
1736 static int s_isWin98Or2k = -1;
1737
1738 if ( s_isWin98Or2k == -1 )
1739 {
1740 int verMaj, verMin;
1741 switch ( wxGetOsVersion(&verMaj, &verMin) )
1742 {
1743 case wxWIN95:
1744 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1745 break;
1746
1747 case wxWINDOWS_NT:
1748 s_isWin98Or2k = verMaj >= 5;
1749 break;
1750
1751 default:
1752 // unknown, be conseravtive by default
1753 s_isWin98Or2k = 0;
1754 }
1755
1756 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1757 }
1758
1759 return s_isWin98Or2k == 1;
1760 }
1761
1762 long m_CodePage;
1763 };
1764
1765 #endif // wxHAVE_WIN32_MB2WC
1766
1767 // ============================================================================
1768 // Cocoa conversion classes
1769 // ============================================================================
1770
1771 #if defined(__WXCOCOA__)
1772
1773 // RN: There is no UTF-32 support in either Core Foundation or
1774 // Cocoa. Strangely enough, internally Core Foundation uses
1775 // UTF 32 internally quite a bit - its just not public (yet).
1776
1777 #include <CoreFoundation/CFString.h>
1778 #include <CoreFoundation/CFStringEncodingExt.h>
1779
1780 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1781 {
1782 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1783 if ( encoding == wxFONTENCODING_DEFAULT )
1784 {
1785 enc = CFStringGetSystemEncoding();
1786 }
1787 else switch( encoding)
1788 {
1789 case wxFONTENCODING_ISO8859_1 :
1790 enc = kCFStringEncodingISOLatin1 ;
1791 break ;
1792 case wxFONTENCODING_ISO8859_2 :
1793 enc = kCFStringEncodingISOLatin2;
1794 break ;
1795 case wxFONTENCODING_ISO8859_3 :
1796 enc = kCFStringEncodingISOLatin3 ;
1797 break ;
1798 case wxFONTENCODING_ISO8859_4 :
1799 enc = kCFStringEncodingISOLatin4;
1800 break ;
1801 case wxFONTENCODING_ISO8859_5 :
1802 enc = kCFStringEncodingISOLatinCyrillic;
1803 break ;
1804 case wxFONTENCODING_ISO8859_6 :
1805 enc = kCFStringEncodingISOLatinArabic;
1806 break ;
1807 case wxFONTENCODING_ISO8859_7 :
1808 enc = kCFStringEncodingISOLatinGreek;
1809 break ;
1810 case wxFONTENCODING_ISO8859_8 :
1811 enc = kCFStringEncodingISOLatinHebrew;
1812 break ;
1813 case wxFONTENCODING_ISO8859_9 :
1814 enc = kCFStringEncodingISOLatin5;
1815 break ;
1816 case wxFONTENCODING_ISO8859_10 :
1817 enc = kCFStringEncodingISOLatin6;
1818 break ;
1819 case wxFONTENCODING_ISO8859_11 :
1820 enc = kCFStringEncodingISOLatinThai;
1821 break ;
1822 case wxFONTENCODING_ISO8859_13 :
1823 enc = kCFStringEncodingISOLatin7;
1824 break ;
1825 case wxFONTENCODING_ISO8859_14 :
1826 enc = kCFStringEncodingISOLatin8;
1827 break ;
1828 case wxFONTENCODING_ISO8859_15 :
1829 enc = kCFStringEncodingISOLatin9;
1830 break ;
1831
1832 case wxFONTENCODING_KOI8 :
1833 enc = kCFStringEncodingKOI8_R;
1834 break ;
1835 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1836 enc = kCFStringEncodingDOSRussian;
1837 break ;
1838
1839 // case wxFONTENCODING_BULGARIAN :
1840 // enc = ;
1841 // break ;
1842
1843 case wxFONTENCODING_CP437 :
1844 enc =kCFStringEncodingDOSLatinUS ;
1845 break ;
1846 case wxFONTENCODING_CP850 :
1847 enc = kCFStringEncodingDOSLatin1;
1848 break ;
1849 case wxFONTENCODING_CP852 :
1850 enc = kCFStringEncodingDOSLatin2;
1851 break ;
1852 case wxFONTENCODING_CP855 :
1853 enc = kCFStringEncodingDOSCyrillic;
1854 break ;
1855 case wxFONTENCODING_CP866 :
1856 enc =kCFStringEncodingDOSRussian ;
1857 break ;
1858 case wxFONTENCODING_CP874 :
1859 enc = kCFStringEncodingDOSThai;
1860 break ;
1861 case wxFONTENCODING_CP932 :
1862 enc = kCFStringEncodingDOSJapanese;
1863 break ;
1864 case wxFONTENCODING_CP936 :
1865 enc =kCFStringEncodingDOSChineseSimplif ;
1866 break ;
1867 case wxFONTENCODING_CP949 :
1868 enc = kCFStringEncodingDOSKorean;
1869 break ;
1870 case wxFONTENCODING_CP950 :
1871 enc = kCFStringEncodingDOSChineseTrad;
1872 break ;
1873 case wxFONTENCODING_CP1250 :
1874 enc = kCFStringEncodingWindowsLatin2;
1875 break ;
1876 case wxFONTENCODING_CP1251 :
1877 enc =kCFStringEncodingWindowsCyrillic ;
1878 break ;
1879 case wxFONTENCODING_CP1252 :
1880 enc =kCFStringEncodingWindowsLatin1 ;
1881 break ;
1882 case wxFONTENCODING_CP1253 :
1883 enc = kCFStringEncodingWindowsGreek;
1884 break ;
1885 case wxFONTENCODING_CP1254 :
1886 enc = kCFStringEncodingWindowsLatin5;
1887 break ;
1888 case wxFONTENCODING_CP1255 :
1889 enc =kCFStringEncodingWindowsHebrew ;
1890 break ;
1891 case wxFONTENCODING_CP1256 :
1892 enc =kCFStringEncodingWindowsArabic ;
1893 break ;
1894 case wxFONTENCODING_CP1257 :
1895 enc = kCFStringEncodingWindowsBalticRim;
1896 break ;
1897 // This only really encodes to UTF7 (if that) evidently
1898 // case wxFONTENCODING_UTF7 :
1899 // enc = kCFStringEncodingNonLossyASCII ;
1900 // break ;
1901 case wxFONTENCODING_UTF8 :
1902 enc = kCFStringEncodingUTF8 ;
1903 break ;
1904 case wxFONTENCODING_EUC_JP :
1905 enc = kCFStringEncodingEUC_JP;
1906 break ;
1907 case wxFONTENCODING_UTF16 :
1908 enc = kCFStringEncodingUnicode ;
1909 break ;
1910 case wxFONTENCODING_MACROMAN :
1911 enc = kCFStringEncodingMacRoman ;
1912 break ;
1913 case wxFONTENCODING_MACJAPANESE :
1914 enc = kCFStringEncodingMacJapanese ;
1915 break ;
1916 case wxFONTENCODING_MACCHINESETRAD :
1917 enc = kCFStringEncodingMacChineseTrad ;
1918 break ;
1919 case wxFONTENCODING_MACKOREAN :
1920 enc = kCFStringEncodingMacKorean ;
1921 break ;
1922 case wxFONTENCODING_MACARABIC :
1923 enc = kCFStringEncodingMacArabic ;
1924 break ;
1925 case wxFONTENCODING_MACHEBREW :
1926 enc = kCFStringEncodingMacHebrew ;
1927 break ;
1928 case wxFONTENCODING_MACGREEK :
1929 enc = kCFStringEncodingMacGreek ;
1930 break ;
1931 case wxFONTENCODING_MACCYRILLIC :
1932 enc = kCFStringEncodingMacCyrillic ;
1933 break ;
1934 case wxFONTENCODING_MACDEVANAGARI :
1935 enc = kCFStringEncodingMacDevanagari ;
1936 break ;
1937 case wxFONTENCODING_MACGURMUKHI :
1938 enc = kCFStringEncodingMacGurmukhi ;
1939 break ;
1940 case wxFONTENCODING_MACGUJARATI :
1941 enc = kCFStringEncodingMacGujarati ;
1942 break ;
1943 case wxFONTENCODING_MACORIYA :
1944 enc = kCFStringEncodingMacOriya ;
1945 break ;
1946 case wxFONTENCODING_MACBENGALI :
1947 enc = kCFStringEncodingMacBengali ;
1948 break ;
1949 case wxFONTENCODING_MACTAMIL :
1950 enc = kCFStringEncodingMacTamil ;
1951 break ;
1952 case wxFONTENCODING_MACTELUGU :
1953 enc = kCFStringEncodingMacTelugu ;
1954 break ;
1955 case wxFONTENCODING_MACKANNADA :
1956 enc = kCFStringEncodingMacKannada ;
1957 break ;
1958 case wxFONTENCODING_MACMALAJALAM :
1959 enc = kCFStringEncodingMacMalayalam ;
1960 break ;
1961 case wxFONTENCODING_MACSINHALESE :
1962 enc = kCFStringEncodingMacSinhalese ;
1963 break ;
1964 case wxFONTENCODING_MACBURMESE :
1965 enc = kCFStringEncodingMacBurmese ;
1966 break ;
1967 case wxFONTENCODING_MACKHMER :
1968 enc = kCFStringEncodingMacKhmer ;
1969 break ;
1970 case wxFONTENCODING_MACTHAI :
1971 enc = kCFStringEncodingMacThai ;
1972 break ;
1973 case wxFONTENCODING_MACLAOTIAN :
1974 enc = kCFStringEncodingMacLaotian ;
1975 break ;
1976 case wxFONTENCODING_MACGEORGIAN :
1977 enc = kCFStringEncodingMacGeorgian ;
1978 break ;
1979 case wxFONTENCODING_MACARMENIAN :
1980 enc = kCFStringEncodingMacArmenian ;
1981 break ;
1982 case wxFONTENCODING_MACCHINESESIMP :
1983 enc = kCFStringEncodingMacChineseSimp ;
1984 break ;
1985 case wxFONTENCODING_MACTIBETAN :
1986 enc = kCFStringEncodingMacTibetan ;
1987 break ;
1988 case wxFONTENCODING_MACMONGOLIAN :
1989 enc = kCFStringEncodingMacMongolian ;
1990 break ;
1991 case wxFONTENCODING_MACETHIOPIC :
1992 enc = kCFStringEncodingMacEthiopic ;
1993 break ;
1994 case wxFONTENCODING_MACCENTRALEUR :
1995 enc = kCFStringEncodingMacCentralEurRoman ;
1996 break ;
1997 case wxFONTENCODING_MACVIATNAMESE :
1998 enc = kCFStringEncodingMacVietnamese ;
1999 break ;
2000 case wxFONTENCODING_MACARABICEXT :
2001 enc = kCFStringEncodingMacExtArabic ;
2002 break ;
2003 case wxFONTENCODING_MACSYMBOL :
2004 enc = kCFStringEncodingMacSymbol ;
2005 break ;
2006 case wxFONTENCODING_MACDINGBATS :
2007 enc = kCFStringEncodingMacDingbats ;
2008 break ;
2009 case wxFONTENCODING_MACTURKISH :
2010 enc = kCFStringEncodingMacTurkish ;
2011 break ;
2012 case wxFONTENCODING_MACCROATIAN :
2013 enc = kCFStringEncodingMacCroatian ;
2014 break ;
2015 case wxFONTENCODING_MACICELANDIC :
2016 enc = kCFStringEncodingMacIcelandic ;
2017 break ;
2018 case wxFONTENCODING_MACROMANIAN :
2019 enc = kCFStringEncodingMacRomanian ;
2020 break ;
2021 case wxFONTENCODING_MACCELTIC :
2022 enc = kCFStringEncodingMacCeltic ;
2023 break ;
2024 case wxFONTENCODING_MACGAELIC :
2025 enc = kCFStringEncodingMacGaelic ;
2026 break ;
2027 // case wxFONTENCODING_MACKEYBOARD :
2028 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2029 // break ;
2030 default :
2031 // because gcc is picky
2032 break ;
2033 } ;
2034 return enc ;
2035 }
2036
2037 class wxMBConv_cocoa : public wxMBConv
2038 {
2039 public:
2040 wxMBConv_cocoa()
2041 {
2042 Init(CFStringGetSystemEncoding()) ;
2043 }
2044
2045 #if wxUSE_FONTMAP
2046 wxMBConv_cocoa(const wxChar* name)
2047 {
2048 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2049 }
2050 #endif
2051
2052 wxMBConv_cocoa(wxFontEncoding encoding)
2053 {
2054 Init( wxCFStringEncFromFontEnc(encoding) );
2055 }
2056
2057 ~wxMBConv_cocoa()
2058 {
2059 }
2060
2061 void Init( CFStringEncoding encoding)
2062 {
2063 m_encoding = encoding ;
2064 }
2065
2066 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2067 {
2068 wxASSERT(szUnConv);
2069
2070 CFStringRef theString = CFStringCreateWithBytes (
2071 NULL, //the allocator
2072 (const UInt8*)szUnConv,
2073 strlen(szUnConv),
2074 m_encoding,
2075 false //no BOM/external representation
2076 );
2077
2078 wxASSERT(theString);
2079
2080 size_t nOutLength = CFStringGetLength(theString);
2081
2082 if (szOut == NULL)
2083 {
2084 CFRelease(theString);
2085 return nOutLength;
2086 }
2087
2088 CFRange theRange = { 0, nOutSize };
2089
2090 #if SIZEOF_WCHAR_T == 4
2091 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2092 #endif
2093
2094 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2095
2096 CFRelease(theString);
2097
2098 szUniCharBuffer[nOutLength] = '\0' ;
2099
2100 #if SIZEOF_WCHAR_T == 4
2101 wxMBConvUTF16 converter ;
2102 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2103 delete[] szUniCharBuffer;
2104 #endif
2105
2106 return nOutLength;
2107 }
2108
2109 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2110 {
2111 wxASSERT(szUnConv);
2112
2113 size_t nRealOutSize;
2114 size_t nBufSize = wxWcslen(szUnConv);
2115 UniChar* szUniBuffer = (UniChar*) szUnConv;
2116
2117 #if SIZEOF_WCHAR_T == 4
2118 wxMBConvUTF16 converter ;
2119 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2120 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2121 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2122 nBufSize /= sizeof(UniChar);
2123 #endif
2124
2125 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2126 NULL, //allocator
2127 szUniBuffer,
2128 nBufSize,
2129 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2130 );
2131
2132 wxASSERT(theString);
2133
2134 //Note that CER puts a BOM when converting to unicode
2135 //so we check and use getchars instead in that case
2136 if (m_encoding == kCFStringEncodingUnicode)
2137 {
2138 if (szOut != NULL)
2139 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2140
2141 nRealOutSize = CFStringGetLength(theString) + 1;
2142 }
2143 else
2144 {
2145 CFStringGetBytes(
2146 theString,
2147 CFRangeMake(0, CFStringGetLength(theString)),
2148 m_encoding,
2149 0, //what to put in characters that can't be converted -
2150 //0 tells CFString to return NULL if it meets such a character
2151 false, //not an external representation
2152 (UInt8*) szOut,
2153 nOutSize,
2154 (CFIndex*) &nRealOutSize
2155 );
2156 }
2157
2158 CFRelease(theString);
2159
2160 #if SIZEOF_WCHAR_T == 4
2161 delete[] szUniBuffer;
2162 #endif
2163
2164 return nRealOutSize - 1;
2165 }
2166
2167 bool IsOk() const
2168 {
2169 return m_encoding != kCFStringEncodingInvalidId &&
2170 CFStringIsEncodingAvailable(m_encoding);
2171 }
2172
2173 private:
2174 CFStringEncoding m_encoding ;
2175 };
2176
2177 #endif // defined(__WXCOCOA__)
2178
2179 // ============================================================================
2180 // Mac conversion classes
2181 // ============================================================================
2182
2183 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2184
2185 class wxMBConv_mac : public wxMBConv
2186 {
2187 public:
2188 wxMBConv_mac()
2189 {
2190 Init(CFStringGetSystemEncoding()) ;
2191 }
2192
2193 #if wxUSE_FONTMAP
2194 wxMBConv_mac(const wxChar* name)
2195 {
2196 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2197 }
2198 #endif
2199
2200 wxMBConv_mac(wxFontEncoding encoding)
2201 {
2202 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2203 }
2204
2205 ~wxMBConv_mac()
2206 {
2207 OSStatus status = noErr ;
2208 status = TECDisposeConverter(m_MB2WC_converter);
2209 status = TECDisposeConverter(m_WC2MB_converter);
2210 }
2211
2212
2213 void Init( TextEncodingBase encoding)
2214 {
2215 OSStatus status = noErr ;
2216 m_char_encoding = encoding ;
2217 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2218
2219 status = TECCreateConverter(&m_MB2WC_converter,
2220 m_char_encoding,
2221 m_unicode_encoding);
2222 status = TECCreateConverter(&m_WC2MB_converter,
2223 m_unicode_encoding,
2224 m_char_encoding);
2225 }
2226
2227 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2228 {
2229 OSStatus status = noErr ;
2230 ByteCount byteOutLen ;
2231 ByteCount byteInLen = strlen(psz) ;
2232 wchar_t *tbuf = NULL ;
2233 UniChar* ubuf = NULL ;
2234 size_t res = 0 ;
2235
2236 if (buf == NULL)
2237 {
2238 //apple specs say at least 32
2239 n = wxMax( 32 , byteInLen ) ;
2240 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2241 }
2242 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2243 #if SIZEOF_WCHAR_T == 4
2244 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2245 #else
2246 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2247 #endif
2248 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2249 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2250 #if SIZEOF_WCHAR_T == 4
2251 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2252 // is not properly terminated we get random characters at the end
2253 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2254 wxMBConvUTF16 converter ;
2255 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2256 free( ubuf ) ;
2257 #else
2258 res = byteOutLen / sizeof( UniChar ) ;
2259 #endif
2260 if ( buf == NULL )
2261 free(tbuf) ;
2262
2263 if ( buf && res < n)
2264 buf[res] = 0;
2265
2266 return res ;
2267 }
2268
2269 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2270 {
2271 OSStatus status = noErr ;
2272 ByteCount byteOutLen ;
2273 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2274
2275 char *tbuf = NULL ;
2276
2277 if (buf == NULL)
2278 {
2279 //apple specs say at least 32
2280 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2281 tbuf = (char*) malloc( n ) ;
2282 }
2283
2284 ByteCount byteBufferLen = n ;
2285 UniChar* ubuf = NULL ;
2286 #if SIZEOF_WCHAR_T == 4
2287 wxMBConvUTF16 converter ;
2288 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2289 byteInLen = unicharlen ;
2290 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2291 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2292 #else
2293 ubuf = (UniChar*) psz ;
2294 #endif
2295 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2296 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2297 #if SIZEOF_WCHAR_T == 4
2298 free( ubuf ) ;
2299 #endif
2300 if ( buf == NULL )
2301 free(tbuf) ;
2302
2303 size_t res = byteOutLen ;
2304 if ( buf && res < n)
2305 {
2306 buf[res] = 0;
2307
2308 //we need to double-trip to verify it didn't insert any ? in place
2309 //of bogus characters
2310 wxWCharBuffer wcBuf(n);
2311 size_t pszlen = wxWcslen(psz);
2312 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2313 wxWcslen(wcBuf) != pszlen ||
2314 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2315 {
2316 // we didn't obtain the same thing we started from, hence
2317 // the conversion was lossy and we consider that it failed
2318 return (size_t)-1;
2319 }
2320 }
2321
2322 return res ;
2323 }
2324
2325 bool IsOk() const
2326 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2327
2328 private:
2329 TECObjectRef m_MB2WC_converter ;
2330 TECObjectRef m_WC2MB_converter ;
2331
2332 TextEncodingBase m_char_encoding ;
2333 TextEncodingBase m_unicode_encoding ;
2334 };
2335
2336 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2337
2338 // ============================================================================
2339 // wxEncodingConverter based conversion classes
2340 // ============================================================================
2341
2342 #if wxUSE_FONTMAP
2343
2344 class wxMBConv_wxwin : public wxMBConv
2345 {
2346 private:
2347 void Init()
2348 {
2349 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2350 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2351 }
2352
2353 public:
2354 // temporarily just use wxEncodingConverter stuff,
2355 // so that it works while a better implementation is built
2356 wxMBConv_wxwin(const wxChar* name)
2357 {
2358 if (name)
2359 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2360 else
2361 m_enc = wxFONTENCODING_SYSTEM;
2362
2363 Init();
2364 }
2365
2366 wxMBConv_wxwin(wxFontEncoding enc)
2367 {
2368 m_enc = enc;
2369
2370 Init();
2371 }
2372
2373 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2374 {
2375 size_t inbuf = strlen(psz);
2376 if (buf)
2377 {
2378 if (!m2w.Convert(psz,buf))
2379 return (size_t)-1;
2380 }
2381 return inbuf;
2382 }
2383
2384 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2385 {
2386 const size_t inbuf = wxWcslen(psz);
2387 if (buf)
2388 {
2389 if (!w2m.Convert(psz,buf))
2390 return (size_t)-1;
2391 }
2392
2393 return inbuf;
2394 }
2395
2396 bool IsOk() const { return m_ok; }
2397
2398 public:
2399 wxFontEncoding m_enc;
2400 wxEncodingConverter m2w, w2m;
2401
2402 // were we initialized successfully?
2403 bool m_ok;
2404
2405 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2406 };
2407
2408 #endif // wxUSE_FONTMAP
2409
2410 // ============================================================================
2411 // wxCSConv implementation
2412 // ============================================================================
2413
2414 void wxCSConv::Init()
2415 {
2416 m_name = NULL;
2417 m_convReal = NULL;
2418 m_deferred = true;
2419 }
2420
2421 wxCSConv::wxCSConv(const wxChar *charset)
2422 {
2423 Init();
2424
2425 if ( charset )
2426 {
2427 SetName(charset);
2428 }
2429
2430 m_encoding = wxFONTENCODING_SYSTEM;
2431 }
2432
2433 wxCSConv::wxCSConv(wxFontEncoding encoding)
2434 {
2435 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2436 {
2437 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2438
2439 encoding = wxFONTENCODING_SYSTEM;
2440 }
2441
2442 Init();
2443
2444 m_encoding = encoding;
2445 }
2446
2447 wxCSConv::~wxCSConv()
2448 {
2449 Clear();
2450 }
2451
2452 wxCSConv::wxCSConv(const wxCSConv& conv)
2453 : wxMBConv()
2454 {
2455 Init();
2456
2457 SetName(conv.m_name);
2458 m_encoding = conv.m_encoding;
2459 }
2460
2461 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2462 {
2463 Clear();
2464
2465 SetName(conv.m_name);
2466 m_encoding = conv.m_encoding;
2467
2468 return *this;
2469 }
2470
2471 void wxCSConv::Clear()
2472 {
2473 free(m_name);
2474 delete m_convReal;
2475
2476 m_name = NULL;
2477 m_convReal = NULL;
2478 }
2479
2480 void wxCSConv::SetName(const wxChar *charset)
2481 {
2482 if (charset)
2483 {
2484 m_name = wxStrdup(charset);
2485 m_deferred = true;
2486 }
2487 }
2488
2489 wxMBConv *wxCSConv::DoCreate() const
2490 {
2491 // check for the special case of ASCII or ISO8859-1 charset: as we have
2492 // special knowledge of it anyhow, we don't need to create a special
2493 // conversion object
2494 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2495 {
2496 // don't convert at all
2497 return NULL;
2498 }
2499
2500 // we trust OS to do conversion better than we can so try external
2501 // conversion methods first
2502 //
2503 // the full order is:
2504 // 1. OS conversion (iconv() under Unix or Win32 API)
2505 // 2. hard coded conversions for UTF
2506 // 3. wxEncodingConverter as fall back
2507
2508 // step (1)
2509 #ifdef HAVE_ICONV
2510 #if !wxUSE_FONTMAP
2511 if ( m_name )
2512 #endif // !wxUSE_FONTMAP
2513 {
2514 wxString name(m_name);
2515
2516 #if wxUSE_FONTMAP
2517 if ( name.empty() )
2518 name = wxFontMapperBase::GetEncodingName(m_encoding);
2519 #endif // wxUSE_FONTMAP
2520
2521 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2522 if ( conv->IsOk() )
2523 return conv;
2524
2525 delete conv;
2526 }
2527 #endif // HAVE_ICONV
2528
2529 #ifdef wxHAVE_WIN32_MB2WC
2530 {
2531 #if wxUSE_FONTMAP
2532 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2533 : new wxMBConv_win32(m_encoding);
2534 if ( conv->IsOk() )
2535 return conv;
2536
2537 delete conv;
2538 #else
2539 return NULL;
2540 #endif
2541 }
2542 #endif // wxHAVE_WIN32_MB2WC
2543 #if defined(__WXMAC__)
2544 {
2545 // leave UTF16 and UTF32 to the built-ins of wx
2546 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2547 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2548 {
2549
2550 #if wxUSE_FONTMAP
2551 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2552 : new wxMBConv_mac(m_encoding);
2553 #else
2554 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2555 #endif
2556 if ( conv->IsOk() )
2557 return conv;
2558
2559 delete conv;
2560 }
2561 }
2562 #endif
2563 #if defined(__WXCOCOA__)
2564 {
2565 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2566 {
2567
2568 #if wxUSE_FONTMAP
2569 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2570 : new wxMBConv_cocoa(m_encoding);
2571 #else
2572 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2573 #endif
2574 if ( conv->IsOk() )
2575 return conv;
2576
2577 delete conv;
2578 }
2579 }
2580 #endif
2581 // step (2)
2582 wxFontEncoding enc = m_encoding;
2583 #if wxUSE_FONTMAP
2584 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2585 {
2586 // use "false" to suppress interactive dialogs -- we can be called from
2587 // anywhere and popping up a dialog from here is the last thing we want to
2588 // do
2589 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2590 }
2591 #endif // wxUSE_FONTMAP
2592
2593 switch ( enc )
2594 {
2595 case wxFONTENCODING_UTF7:
2596 return new wxMBConvUTF7;
2597
2598 case wxFONTENCODING_UTF8:
2599 return new wxMBConvUTF8;
2600
2601 case wxFONTENCODING_UTF16BE:
2602 return new wxMBConvUTF16BE;
2603
2604 case wxFONTENCODING_UTF16LE:
2605 return new wxMBConvUTF16LE;
2606
2607 case wxFONTENCODING_UTF32BE:
2608 return new wxMBConvUTF32BE;
2609
2610 case wxFONTENCODING_UTF32LE:
2611 return new wxMBConvUTF32LE;
2612
2613 default:
2614 // nothing to do but put here to suppress gcc warnings
2615 ;
2616 }
2617
2618 // step (3)
2619 #if wxUSE_FONTMAP
2620 {
2621 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2622 : new wxMBConv_wxwin(m_encoding);
2623 if ( conv->IsOk() )
2624 return conv;
2625
2626 delete conv;
2627 }
2628 #endif // wxUSE_FONTMAP
2629
2630 // NB: This is a hack to prevent deadlock. What could otherwise happen
2631 // in Unicode build: wxConvLocal creation ends up being here
2632 // because of some failure and logs the error. But wxLog will try to
2633 // attach timestamp, for which it will need wxConvLocal (to convert
2634 // time to char* and then wchar_t*), but that fails, tries to log
2635 // error, but wxLog has a (already locked) critical section that
2636 // guards static buffer.
2637 static bool alreadyLoggingError = false;
2638 if (!alreadyLoggingError)
2639 {
2640 alreadyLoggingError = true;
2641 wxLogError(_("Cannot convert from the charset '%s'!"),
2642 m_name ? m_name
2643 :
2644 #if wxUSE_FONTMAP
2645 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2646 #else // !wxUSE_FONTMAP
2647 wxString::Format(_("encoding %s"), m_encoding).c_str()
2648 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2649 );
2650 alreadyLoggingError = false;
2651 }
2652
2653 return NULL;
2654 }
2655
2656 void wxCSConv::CreateConvIfNeeded() const
2657 {
2658 if ( m_deferred )
2659 {
2660 wxCSConv *self = (wxCSConv *)this; // const_cast
2661
2662 #if wxUSE_INTL
2663 // if we don't have neither the name nor the encoding, use the default
2664 // encoding for this system
2665 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2666 {
2667 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2668 }
2669 #endif // wxUSE_INTL
2670
2671 self->m_convReal = DoCreate();
2672 self->m_deferred = false;
2673 }
2674 }
2675
2676 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2677 {
2678 CreateConvIfNeeded();
2679
2680 if (m_convReal)
2681 return m_convReal->MB2WC(buf, psz, n);
2682
2683 // latin-1 (direct)
2684 size_t len = strlen(psz);
2685
2686 if (buf)
2687 {
2688 for (size_t c = 0; c <= len; c++)
2689 buf[c] = (unsigned char)(psz[c]);
2690 }
2691
2692 return len;
2693 }
2694
2695 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2696 {
2697 CreateConvIfNeeded();
2698
2699 if (m_convReal)
2700 return m_convReal->WC2MB(buf, psz, n);
2701
2702 // latin-1 (direct)
2703 const size_t len = wxWcslen(psz);
2704 if (buf)
2705 {
2706 for (size_t c = 0; c <= len; c++)
2707 {
2708 if (psz[c] > 0xFF)
2709 return (size_t)-1;
2710 buf[c] = (char)psz[c];
2711 }
2712 }
2713 else
2714 {
2715 for (size_t c = 0; c <= len; c++)
2716 {
2717 if (psz[c] > 0xFF)
2718 return (size_t)-1;
2719 }
2720 }
2721
2722 return len;
2723 }
2724
2725 // ----------------------------------------------------------------------------
2726 // globals
2727 // ----------------------------------------------------------------------------
2728
2729 #ifdef __WINDOWS__
2730 static wxMBConv_win32 wxConvLibcObj;
2731 #elif defined(__WXMAC__) && !defined(__MACH__)
2732 static wxMBConv_mac wxConvLibcObj ;
2733 #else
2734 static wxMBConvLibc wxConvLibcObj;
2735 #endif
2736
2737 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2738 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2739 static wxMBConvUTF7 wxConvUTF7Obj;
2740 static wxMBConvUTF8 wxConvUTF8Obj;
2741
2742 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2743 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2744 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2745 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2746 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2747 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2748 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2749 #ifdef __WXOSX__
2750 wxConvUTF8Obj;
2751 #else
2752 wxConvLibcObj;
2753 #endif
2754
2755
2756 #else // !wxUSE_WCHAR_T
2757
2758 // stand-ins in absence of wchar_t
2759 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2760 wxConvISO8859_1,
2761 wxConvLocal,
2762 wxConvUTF8;
2763
2764 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2765
2766