]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
update from Martin Srebotnjak
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
62
63 // ----------------------------------------------------------------------------
64 // headers
65 // ----------------------------------------------------------------------------
66
67 #ifdef __SALFORDC__
68 #include <clib.h>
69 #endif
70
71 #ifdef HAVE_ICONV
72 #include <iconv.h>
73 #include "wx/thread.h"
74 #endif
75
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
78 #include "wx/utils.h"
79
80 #ifdef __WXMAC__
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
84
85 #include "wx/mac/private.h" // includes mac headers
86 #endif
87 // ----------------------------------------------------------------------------
88 // macros
89 // ----------------------------------------------------------------------------
90
// In-place byte swapping of the first len elements of str.
//
// The arguments are parenthesised so that expressions such as "p + 1" or
// "cond ? a : b" can be passed safely. The expansion deliberately stays a
// plain brace block (not do/while) because existing uses are not followed
// by a semicolon.
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT32_SWAP_ALWAYS((str)[_c]); }
#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT16_SWAP_ALWAYS((str)[_c]); }
93
94 #if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
99 #else
100 #define WC_NAME_BEST "UCS-4LE"
101 #endif
102 #elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
105 #define WC_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
108 #else
109 #define WC_NAME_BEST "UTF-16LE"
110 #endif
111 #else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
114 #endif
115
116 // ============================================================================
117 // implementation
118 // ============================================================================
119
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
123
124
125 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
126 {
127 if (input<=0xffff)
128 {
129 if (output)
130 *output = (wxUint16) input;
131 return 1;
132 }
133 else if (input>=0x110000)
134 {
135 return (size_t)-1;
136 }
137 else
138 {
139 if (output)
140 {
141 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
142 *output = (wxUint16) ((input&0x3ff)+0xdc00);
143 }
144 return 2;
145 }
146 }
147
148 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
149 {
150 if ((*input<0xd800) || (*input>0xdfff))
151 {
152 output = *input;
153 return 1;
154 }
155 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
156 {
157 output = *input;
158 return (size_t)-1;
159 }
160 else
161 {
162 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
163 return 2;
164 }
165 }
166
167
168 // ----------------------------------------------------------------------------
169 // wxMBConv
170 // ----------------------------------------------------------------------------
171
172 wxMBConv::~wxMBConv()
173 {
174 // nothing to do here (necessary for Darwin linking probably)
175 }
176
177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
178 {
179 if ( psz )
180 {
181 // calculate the length of the buffer needed first
182 size_t nLen = MB2WC(NULL, psz, 0);
183 if ( nLen != (size_t)-1 )
184 {
185 // now do the actual conversion
186 wxWCharBuffer buf(nLen);
187 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
188 if ( nLen != (size_t)-1 )
189 {
190 return buf;
191 }
192 }
193 }
194
195 wxWCharBuffer buf((wchar_t *)NULL);
196
197 return buf;
198 }
199
200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
201 {
202 if ( pwz )
203 {
204 size_t nLen = WC2MB(NULL, pwz, 0);
205 if ( nLen != (size_t)-1 )
206 {
207 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
208 nLen = WC2MB(buf.data(), pwz, nLen + 4);
209 if ( nLen != (size_t)-1 )
210 {
211 return buf;
212 }
213 }
214 }
215
216 wxCharBuffer buf((char *)NULL);
217
218 return buf;
219 }
220
221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
222 {
223 wxASSERT(pOutSize != NULL);
224
225 const char* szEnd = szString + nStringLen + 1;
226 const char* szPos = szString;
227 const char* szStart = szPos;
228
229 size_t nActualLength = 0;
230 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
231
232 wxWCharBuffer theBuffer(nCurrentSize);
233
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos != szEnd)
237 {
238 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
239
240 //Get the length of the current (sub)string
241 size_t nLen = MB2WC(NULL, szPos, 0);
242
243 //Invalid conversion?
244 if( nLen == (size_t)-1 )
245 {
246 *pOutSize = 0;
247 theBuffer.data()[0u] = wxT('\0');
248 return theBuffer;
249 }
250
251
252 //Increase the actual length (+1 for current null character)
253 nActualLength += nLen + 1;
254
255 //if buffer too big, realloc the buffer
256 if (nActualLength > (nCurrentSize+1))
257 {
258 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
259 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
260 theBuffer = theNewBuffer;
261 nCurrentSize <<= 1;
262 }
263
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
266 {
267 *pOutSize = 0;
268 theBuffer.data()[0u] = wxT('\0');
269 return theBuffer;
270 }
271
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //here because XX2XX gives us the size of the output buffer,
275 //not neccessarly the length of the string
276 szPos += strlen(szPos) + 1;
277 }
278
279 //success - return actual length and the buffer
280 *pOutSize = nActualLength;
281 return theBuffer;
282 }
283
284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
285 {
286 wxASSERT(pOutSize != NULL);
287
288 const wchar_t* szEnd = szString + nStringLen + 1;
289 const wchar_t* szPos = szString;
290 const wchar_t* szStart = szPos;
291
292 size_t nActualLength = 0;
293 size_t nCurrentSize = nStringLen << 2; //try * 4 first
294
295 wxCharBuffer theBuffer(nCurrentSize);
296
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos != szEnd)
300 {
301 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
302
303 //Get the length of the current (sub)string
304 size_t nLen = WC2MB(NULL, szPos, 0);
305
306 //Invalid conversion?
307 if( nLen == (size_t)-1 )
308 {
309 *pOutSize = 0;
310 theBuffer.data()[0u] = wxT('\0');
311 return theBuffer;
312 }
313
314 //Increase the actual length (+1 for current null character)
315 nActualLength += nLen + 1;
316
317 //if buffer too big, realloc the buffer
318 if (nActualLength > (nCurrentSize+1))
319 {
320 wxCharBuffer theNewBuffer(nCurrentSize << 1);
321 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
322 theBuffer = theNewBuffer;
323 nCurrentSize <<= 1;
324 }
325
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
328 {
329 *pOutSize = 0;
330 theBuffer.data()[0u] = wxT('\0');
331 return theBuffer;
332 }
333
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //here because XX2XX gives us the size of the output buffer,
337 //not neccessarly the length of the string
338 szPos += wxWcslen(szPos) + 1;
339 }
340
341 //success - return actual length and the buffer
342 *pOutSize = nActualLength;
343 return theBuffer;
344 }
345
346 // ----------------------------------------------------------------------------
347 // wxMBConvLibc
348 // ----------------------------------------------------------------------------
349
350 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
351 {
352 return wxMB2WC(buf, psz, n);
353 }
354
355 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
356 {
357 return wxWC2MB(buf, psz, n);
358 }
359 // ----------------------------------------------------------------------------
360 // UTF-7
361 // ----------------------------------------------------------------------------
362
363 // Implementation (C) 2004 Fredrik Roubert
364
//
// BASE64 decoding table: maps a byte to its 6 bit value, 0xff marks bytes
// which are not part of the BASE64 alphabet (16 entries per row)
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f ('+' and '/')
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f (digits)
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f ('A' - 'O')
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f ('P' - 'Z')
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f ('a' - 'o')
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f ('p' - 'z')
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: nothing valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
403
404 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
405 {
406 size_t len = 0;
407
408 while (*psz && ((!buf) || (len < n)))
409 {
410 unsigned char cc = *psz++;
411 if (cc != '+')
412 {
413 // plain ASCII char
414 if (buf)
415 *buf++ = cc;
416 len++;
417 }
418 else if (*psz == '-')
419 {
420 // encoded plus sign
421 if (buf)
422 *buf++ = cc;
423 len++;
424 psz++;
425 }
426 else
427 {
428 // BASE64 encoded string
429 bool lsb;
430 unsigned char c;
431 unsigned int d, l;
432 for (lsb = false, d = 0, l = 0;
433 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
434 {
435 d <<= 6;
436 d += cc;
437 for (l += 6; l >= 8; lsb = !lsb)
438 {
439 c = (unsigned char)((d >> (l -= 8)) % 256);
440 if (lsb)
441 {
442 if (buf)
443 *buf++ |= c;
444 len ++;
445 }
446 else
447 if (buf)
448 *buf = (wchar_t)(c << 8);
449 }
450 }
451 if (*psz == '-')
452 psz++;
453 }
454 }
455 if (buf && (len < n))
456 *buf = 0;
457 return len;
458 }
459
//
// BASE64 encoding table: the character representing each 6 bit value
// (16 entries per row)
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
474
//
// UTF-7 encoding table, indexed by ASCII code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
494
495 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
496 {
497
498
499 size_t len = 0;
500
501 while (*psz && ((!buf) || (len < n)))
502 {
503 wchar_t cc = *psz++;
504 if (cc < 0x80 && utf7encode[cc] < 1)
505 {
506 // plain ASCII char
507 if (buf)
508 *buf++ = (char)cc;
509 len++;
510 }
511 #ifndef WC_UTF16
512 else if (((wxUint32)cc) > 0xffff)
513 {
514 // no surrogate pair generation (yet?)
515 return (size_t)-1;
516 }
517 #endif
518 else
519 {
520 if (buf)
521 *buf++ = '+';
522 len++;
523 if (cc != '+')
524 {
525 // BASE64 encode string
526 unsigned int lsb, d, l;
527 for (d = 0, l = 0;; psz++)
528 {
529 for (lsb = 0; lsb < 2; lsb ++)
530 {
531 d <<= 8;
532 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
533
534 for (l += 8; l >= 6; )
535 {
536 l -= 6;
537 if (buf)
538 *buf++ = utf7enb64[(d >> l) % 64];
539 len++;
540 }
541 }
542 cc = *psz;
543 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
544 break;
545 }
546 if (l != 0)
547 {
548 if (buf)
549 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
550 len++;
551 }
552 }
553 if (buf)
554 *buf++ = '-';
555 len++;
556 }
557 }
558 if (buf && (len < n))
559 *buf = 0;
560 return len;
561 }
562
563 // ----------------------------------------------------------------------------
564 // UTF-8
565 // ----------------------------------------------------------------------------
566
567 static wxUint32 utf8_max[]=
568 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
569
570 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
571 {
572 size_t len = 0;
573
574 while (*psz && ((!buf) || (len < n)))
575 {
576 unsigned char cc = *psz++, fc = cc;
577 unsigned cnt;
578 for (cnt = 0; fc & 0x80; cnt++)
579 fc <<= 1;
580 if (!cnt)
581 {
582 // plain ASCII char
583 if (buf)
584 *buf++ = cc;
585 len++;
586 }
587 else
588 {
589 cnt--;
590 if (!cnt)
591 {
592 // invalid UTF-8 sequence
593 return (size_t)-1;
594 }
595 else
596 {
597 unsigned ocnt = cnt - 1;
598 wxUint32 res = cc & (0x3f >> cnt);
599 while (cnt--)
600 {
601 cc = *psz++;
602 if ((cc & 0xC0) != 0x80)
603 {
604 // invalid UTF-8 sequence
605 return (size_t)-1;
606 }
607 res = (res << 6) | (cc & 0x3f);
608 }
609 if (res <= utf8_max[ocnt])
610 {
611 // illegal UTF-8 encoding
612 return (size_t)-1;
613 }
614 #ifdef WC_UTF16
615 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
616 size_t pa = encode_utf16(res, (wxUint16 *)buf);
617 if (pa == (size_t)-1)
618 return (size_t)-1;
619 if (buf)
620 buf += pa;
621 len += pa;
622 #else // !WC_UTF16
623 if (buf)
624 *buf++ = res;
625 len++;
626 #endif // WC_UTF16/!WC_UTF16
627 }
628 }
629 }
630 if (buf && (len < n))
631 *buf = 0;
632 return len;
633 }
634
635 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
636 {
637 size_t len = 0;
638
639 while (*psz && ((!buf) || (len < n)))
640 {
641 wxUint32 cc;
642 #ifdef WC_UTF16
643 // cast is ok for WC_UTF16
644 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
645 psz += (pa == (size_t)-1) ? 1 : pa;
646 #else
647 cc=(*psz++) & 0x7fffffff;
648 #endif
649 unsigned cnt;
650 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
651 if (!cnt)
652 {
653 // plain ASCII char
654 if (buf)
655 *buf++ = (char) cc;
656 len++;
657 }
658
659 else
660 {
661 len += cnt + 1;
662 if (buf)
663 {
664 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
665 while (cnt--)
666 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
667 }
668 }
669 }
670
671 if (buf && (len<n)) *buf = 0;
672
673 return len;
674 }
675
676
677
678
679 // ----------------------------------------------------------------------------
680 // UTF-16
681 // ----------------------------------------------------------------------------
682
683 #ifdef WORDS_BIGENDIAN
684 #define wxMBConvUTF16straight wxMBConvUTF16BE
685 #define wxMBConvUTF16swap wxMBConvUTF16LE
686 #else
687 #define wxMBConvUTF16swap wxMBConvUTF16BE
688 #define wxMBConvUTF16straight wxMBConvUTF16LE
689 #endif
690
691
692 #ifdef WC_UTF16
693
694 // copy 16bit MB to 16bit String
695 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
696 {
697 size_t len=0;
698
699 while (*(wxUint16*)psz && (!buf || len < n))
700 {
701 if (buf)
702 *buf++ = *(wxUint16*)psz;
703 len++;
704
705 psz += sizeof(wxUint16);
706 }
707 if (buf && len<n) *buf=0;
708
709 return len;
710 }
711
712
713 // copy 16bit String to 16bit MB
714 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
715 {
716 size_t len=0;
717
718 while (*psz && (!buf || len < n))
719 {
720 if (buf)
721 {
722 *(wxUint16*)buf = *psz;
723 buf += sizeof(wxUint16);
724 }
725 len += sizeof(wxUint16);
726 psz++;
727 }
728 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
729
730 return len;
731 }
732
733
734 // swap 16bit MB to 16bit String
735 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
736 {
737 size_t len=0;
738
739 while (*(wxUint16*)psz && (!buf || len < n))
740 {
741 if (buf)
742 {
743 ((char *)buf)[0] = psz[1];
744 ((char *)buf)[1] = psz[0];
745 buf++;
746 }
747 len++;
748 psz += sizeof(wxUint16);
749 }
750 if (buf && len<n) *buf=0;
751
752 return len;
753 }
754
755
756 // swap 16bit MB to 16bit String
757 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
758 {
759 size_t len=0;
760
761 while (*psz && (!buf || len < n))
762 {
763 if (buf)
764 {
765 *buf++ = ((char*)psz)[1];
766 *buf++ = ((char*)psz)[0];
767 }
768 len += sizeof(wxUint16);
769 psz++;
770 }
771 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
772
773 return len;
774 }
775
776
777 #else // WC_UTF16
778
779
780 // copy 16bit MB to 32bit String
781 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
782 {
783 size_t len=0;
784
785 while (*(wxUint16*)psz && (!buf || len < n))
786 {
787 wxUint32 cc;
788 size_t pa=decode_utf16((wxUint16*)psz, cc);
789 if (pa == (size_t)-1)
790 return pa;
791
792 if (buf)
793 *buf++ = cc;
794 len++;
795 psz += pa * sizeof(wxUint16);
796 }
797 if (buf && len<n) *buf=0;
798
799 return len;
800 }
801
802
803 // copy 32bit String to 16bit MB
804 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
805 {
806 size_t len=0;
807
808 while (*psz && (!buf || len < n))
809 {
810 wxUint16 cc[2];
811 size_t pa=encode_utf16(*psz, cc);
812
813 if (pa == (size_t)-1)
814 return pa;
815
816 if (buf)
817 {
818 *(wxUint16*)buf = cc[0];
819 buf += sizeof(wxUint16);
820 if (pa > 1)
821 {
822 *(wxUint16*)buf = cc[1];
823 buf += sizeof(wxUint16);
824 }
825 }
826
827 len += pa*sizeof(wxUint16);
828 psz++;
829 }
830 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
831
832 return len;
833 }
834
835
836 // swap 16bit MB to 32bit String
837 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
838 {
839 size_t len=0;
840
841 while (*(wxUint16*)psz && (!buf || len < n))
842 {
843 wxUint32 cc;
844 char tmp[4];
845 tmp[0]=psz[1]; tmp[1]=psz[0];
846 tmp[2]=psz[3]; tmp[3]=psz[2];
847
848 size_t pa=decode_utf16((wxUint16*)tmp, cc);
849 if (pa == (size_t)-1)
850 return pa;
851
852 if (buf)
853 *buf++ = cc;
854
855 len++;
856 psz += pa * sizeof(wxUint16);
857 }
858 if (buf && len<n) *buf=0;
859
860 return len;
861 }
862
863
864 // swap 32bit String to 16bit MB
865 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
866 {
867 size_t len=0;
868
869 while (*psz && (!buf || len < n))
870 {
871 wxUint16 cc[2];
872 size_t pa=encode_utf16(*psz, cc);
873
874 if (pa == (size_t)-1)
875 return pa;
876
877 if (buf)
878 {
879 *buf++ = ((char*)cc)[1];
880 *buf++ = ((char*)cc)[0];
881 if (pa > 1)
882 {
883 *buf++ = ((char*)cc)[3];
884 *buf++ = ((char*)cc)[2];
885 }
886 }
887
888 len += pa*sizeof(wxUint16);
889 psz++;
890 }
891 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
892
893 return len;
894 }
895
896 #endif // WC_UTF16
897
898
899 // ----------------------------------------------------------------------------
900 // UTF-32
901 // ----------------------------------------------------------------------------
902
903 #ifdef WORDS_BIGENDIAN
904 #define wxMBConvUTF32straight wxMBConvUTF32BE
905 #define wxMBConvUTF32swap wxMBConvUTF32LE
906 #else
907 #define wxMBConvUTF32swap wxMBConvUTF32BE
908 #define wxMBConvUTF32straight wxMBConvUTF32LE
909 #endif
910
911
912 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
913 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
914
915
916 #ifdef WC_UTF16
917
918 // copy 32bit MB to 16bit String
919 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
920 {
921 size_t len=0;
922
923 while (*(wxUint32*)psz && (!buf || len < n))
924 {
925 wxUint16 cc[2];
926
927 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
928 if (pa == (size_t)-1)
929 return pa;
930
931 if (buf)
932 {
933 *buf++ = cc[0];
934 if (pa > 1)
935 *buf++ = cc[1];
936 }
937 len += pa;
938 psz += sizeof(wxUint32);
939 }
940 if (buf && len<n) *buf=0;
941
942 return len;
943 }
944
945
946 // copy 16bit String to 32bit MB
947 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
948 {
949 size_t len=0;
950
951 while (*psz && (!buf || len < n))
952 {
953 wxUint32 cc;
954
955 // cast is ok for WC_UTF16
956 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
957 if (pa == (size_t)-1)
958 return pa;
959
960 if (buf)
961 {
962 *(wxUint32*)buf = cc;
963 buf += sizeof(wxUint32);
964 }
965 len += sizeof(wxUint32);
966 psz += pa;
967 }
968
969 if (buf && len<=n-sizeof(wxUint32))
970 *(wxUint32*)buf=0;
971
972 return len;
973 }
974
975
976
977 // swap 32bit MB to 16bit String
978 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
979 {
980 size_t len=0;
981
982 while (*(wxUint32*)psz && (!buf || len < n))
983 {
984 char tmp[4];
985 tmp[0] = psz[3]; tmp[1] = psz[2];
986 tmp[2] = psz[1]; tmp[3] = psz[0];
987
988
989 wxUint16 cc[2];
990
991 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
992 if (pa == (size_t)-1)
993 return pa;
994
995 if (buf)
996 {
997 *buf++ = cc[0];
998 if (pa > 1)
999 *buf++ = cc[1];
1000 }
1001 len += pa;
1002 psz += sizeof(wxUint32);
1003 }
1004
1005 if (buf && len<n)
1006 *buf=0;
1007
1008 return len;
1009 }
1010
1011
1012 // swap 16bit String to 32bit MB
1013 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1014 {
1015 size_t len=0;
1016
1017 while (*psz && (!buf || len < n))
1018 {
1019 char cc[4];
1020
1021 // cast is ok for WC_UTF16
1022 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1023 if (pa == (size_t)-1)
1024 return pa;
1025
1026 if (buf)
1027 {
1028 *buf++ = cc[3];
1029 *buf++ = cc[2];
1030 *buf++ = cc[1];
1031 *buf++ = cc[0];
1032 }
1033 len += sizeof(wxUint32);
1034 psz += pa;
1035 }
1036
1037 if (buf && len<=n-sizeof(wxUint32))
1038 *(wxUint32*)buf=0;
1039
1040 return len;
1041 }
1042
1043 #else // WC_UTF16
1044
1045
1046 // copy 32bit MB to 32bit String
1047 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1048 {
1049 size_t len=0;
1050
1051 while (*(wxUint32*)psz && (!buf || len < n))
1052 {
1053 if (buf)
1054 *buf++ = *(wxUint32*)psz;
1055 len++;
1056 psz += sizeof(wxUint32);
1057 }
1058
1059 if (buf && len<n)
1060 *buf=0;
1061
1062 return len;
1063 }
1064
1065
1066 // copy 32bit String to 32bit MB
1067 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1068 {
1069 size_t len=0;
1070
1071 while (*psz && (!buf || len < n))
1072 {
1073 if (buf)
1074 {
1075 *(wxUint32*)buf = *psz;
1076 buf += sizeof(wxUint32);
1077 }
1078
1079 len += sizeof(wxUint32);
1080 psz++;
1081 }
1082
1083 if (buf && len<=n-sizeof(wxUint32))
1084 *(wxUint32*)buf=0;
1085
1086 return len;
1087 }
1088
1089
1090 // swap 32bit MB to 32bit String
1091 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1092 {
1093 size_t len=0;
1094
1095 while (*(wxUint32*)psz && (!buf || len < n))
1096 {
1097 if (buf)
1098 {
1099 ((char *)buf)[0] = psz[3];
1100 ((char *)buf)[1] = psz[2];
1101 ((char *)buf)[2] = psz[1];
1102 ((char *)buf)[3] = psz[0];
1103 buf++;
1104 }
1105 len++;
1106 psz += sizeof(wxUint32);
1107 }
1108
1109 if (buf && len<n)
1110 *buf=0;
1111
1112 return len;
1113 }
1114
1115
1116 // swap 32bit String to 32bit MB
1117 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1118 {
1119 size_t len=0;
1120
1121 while (*psz && (!buf || len < n))
1122 {
1123 if (buf)
1124 {
1125 *buf++ = ((char *)psz)[3];
1126 *buf++ = ((char *)psz)[2];
1127 *buf++ = ((char *)psz)[1];
1128 *buf++ = ((char *)psz)[0];
1129 }
1130 len += sizeof(wxUint32);
1131 psz++;
1132 }
1133
1134 if (buf && len<=n-sizeof(wxUint32))
1135 *(wxUint32*)buf=0;
1136
1137 return len;
1138 }
1139
1140
1141 #endif // WC_UTF16
1142
1143
1144 // ============================================================================
1145 // The classes doing conversion using the iconv_xxx() functions
1146 // ============================================================================
1147
1148 #ifdef HAVE_ICONV
1149
1150 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1151 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1152 // (unless there's yet another bug in glibc) the only case when iconv()
1153 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1154 // left in the input buffer -- when _real_ error occurs,
1155 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1156 // iconv() failure.
1157 // [This bug does not appear in glibc 2.2.]
1158 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1159 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1160 (errno != E2BIG || bufLeft != 0))
1161 #else
1162 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1163 #endif
1164
1165 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1166
1167 // ----------------------------------------------------------------------------
1168 // wxMBConv_iconv: encapsulates an iconv character set
1169 // ----------------------------------------------------------------------------
1170
1171 class wxMBConv_iconv : public wxMBConv
1172 {
1173 public:
1174 wxMBConv_iconv(const wxChar *name);
1175 virtual ~wxMBConv_iconv();
1176
1177 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1178 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1179
1180 bool IsOk() const
1181 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1182
1183 protected:
1184 // the iconv handlers used to translate from multibyte to wide char and in
1185 // the other direction
1186 iconv_t m2w,
1187 w2m;
1188 #if wxUSE_THREADS
1189 // guards access to m2w and w2m objects
1190 wxMutex m_iconvMutex;
1191 #endif
1192
1193 private:
1194 // the name (for iconv_open()) of a wide char charset -- if none is
1195 // available on this machine, it will remain NULL
1196 static const char *ms_wcCharsetName;
1197
1198 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1199 // different endian-ness than the native one
1200 static bool ms_wcNeedsSwap;
1201 };
1202
1203 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1204 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1205
1206 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1207 {
1208 // Do it the hard way
1209 char cname[100];
1210 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1211 cname[i] = (char) name[i];
1212
1213 // check for charset that represents wchar_t:
1214 if (ms_wcCharsetName == NULL)
1215 {
1216 ms_wcNeedsSwap = false;
1217
1218 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1219 ms_wcCharsetName = WC_NAME_BEST;
1220 m2w = iconv_open(ms_wcCharsetName, cname);
1221
1222 if (m2w == (iconv_t)-1)
1223 {
1224 // try charset w/o bytesex info (e.g. "UCS4")
1225 // and check for bytesex ourselves:
1226 ms_wcCharsetName = WC_NAME;
1227 m2w = iconv_open(ms_wcCharsetName, cname);
1228
1229 // last bet, try if it knows WCHAR_T pseudo-charset
1230 if (m2w == (iconv_t)-1)
1231 {
1232 ms_wcCharsetName = "WCHAR_T";
1233 m2w = iconv_open(ms_wcCharsetName, cname);
1234 }
1235
1236 if (m2w != (iconv_t)-1)
1237 {
1238 char buf[2], *bufPtr;
1239 wchar_t wbuf[2], *wbufPtr;
1240 size_t insz, outsz;
1241 size_t res;
1242
1243 buf[0] = 'A';
1244 buf[1] = 0;
1245 wbuf[0] = 0;
1246 insz = 2;
1247 outsz = SIZEOF_WCHAR_T * 2;
1248 wbufPtr = wbuf;
1249 bufPtr = buf;
1250
1251 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1252 (char**)&wbufPtr, &outsz);
1253
1254 if (ICONV_FAILED(res, insz))
1255 {
1256 ms_wcCharsetName = NULL;
1257 wxLogLastError(wxT("iconv"));
1258 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1259 }
1260 else
1261 {
1262 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1263 }
1264 }
1265 else
1266 {
1267 ms_wcCharsetName = NULL;
1268
1269 // VS: we must not output an error here, since wxWidgets will safely
1270 // fall back to using wxEncodingConverter.
1271 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1272 //wxLogError(
1273 }
1274 }
1275 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1276 }
1277 else // we already have ms_wcCharsetName
1278 {
1279 m2w = iconv_open(ms_wcCharsetName, cname);
1280 }
1281
1282 // NB: don't ever pass NULL to iconv_open(), it may crash!
1283 if ( ms_wcCharsetName )
1284 {
1285 w2m = iconv_open( cname, ms_wcCharsetName);
1286 }
1287 else
1288 {
1289 w2m = (iconv_t)-1;
1290 }
1291 }
1292
1293 wxMBConv_iconv::~wxMBConv_iconv()
1294 {
1295 if ( m2w != (iconv_t)-1 )
1296 iconv_close(m2w);
1297 if ( w2m != (iconv_t)-1 )
1298 iconv_close(w2m);
1299 }
1300
1301 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1302 {
1303 #if wxUSE_THREADS
1304 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1305 // Unfortunately there is a couple of global wxCSConv objects such as
1306 // wxConvLocal that are used all over wx code, so we have to make sure
1307 // the handle is used by at most one thread at the time. Otherwise
1308 // only a few wx classes would be safe to use from non-main threads
1309 // as MB<->WC conversion would fail "randomly".
1310 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1311 #endif
1312
1313 size_t inbuf = strlen(psz);
1314 size_t outbuf = n * SIZEOF_WCHAR_T;
1315 size_t res, cres;
1316 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1317 wchar_t *bufPtr = buf;
1318 const char *pszPtr = psz;
1319
1320 if (buf)
1321 {
1322 // have destination buffer, convert there
1323 cres = iconv(m2w,
1324 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1325 (char**)&bufPtr, &outbuf);
1326 res = n - (outbuf / SIZEOF_WCHAR_T);
1327
1328 if (ms_wcNeedsSwap)
1329 {
1330 // convert to native endianness
1331 WC_BSWAP(buf /* _not_ bufPtr */, res)
1332 }
1333
1334 // NB: iconv was given only strlen(psz) characters on input, and so
1335 // it couldn't convert the trailing zero. Let's do it ourselves
1336 // if there's some room left for it in the output buffer.
1337 if (res < n)
1338 buf[res] = 0;
1339 }
1340 else
1341 {
1342 // no destination buffer... convert using temp buffer
1343 // to calculate destination buffer requirement
1344 wchar_t tbuf[8];
1345 res = 0;
1346 do {
1347 bufPtr = tbuf;
1348 outbuf = 8*SIZEOF_WCHAR_T;
1349
1350 cres = iconv(m2w,
1351 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1352 (char**)&bufPtr, &outbuf );
1353
1354 res += 8-(outbuf/SIZEOF_WCHAR_T);
1355 } while ((cres==(size_t)-1) && (errno==E2BIG));
1356 }
1357
1358 if (ICONV_FAILED(cres, inbuf))
1359 {
1360 //VS: it is ok if iconv fails, hence trace only
1361 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1362 return (size_t)-1;
1363 }
1364
1365 return res;
1366 }
1367
1368 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1369 {
1370 #if wxUSE_THREADS
1371 // NB: explained in MB2WC
1372 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1373 #endif
1374
1375 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1376 size_t outbuf = n;
1377 size_t res, cres;
1378
1379 wchar_t *tmpbuf = 0;
1380
1381 if (ms_wcNeedsSwap)
1382 {
1383 // need to copy to temp buffer to switch endianness
1384 // this absolutely doesn't rock!
1385 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1386 // could be in read-only memory, or be accessed in some other thread)
1387 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1388 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1389 WC_BSWAP(tmpbuf, inbuf)
1390 psz=tmpbuf;
1391 }
1392
1393 if (buf)
1394 {
1395 // have destination buffer, convert there
1396 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1397
1398 res = n-outbuf;
1399
1400 // NB: iconv was given only wcslen(psz) characters on input, and so
1401 // it couldn't convert the trailing zero. Let's do it ourselves
1402 // if there's some room left for it in the output buffer.
1403 if (res < n)
1404 buf[0] = 0;
1405 }
1406 else
1407 {
1408 // no destination buffer... convert using temp buffer
1409 // to calculate destination buffer requirement
1410 char tbuf[16];
1411 res = 0;
1412 do {
1413 buf = tbuf; outbuf = 16;
1414
1415 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1416
1417 res += 16 - outbuf;
1418 } while ((cres==(size_t)-1) && (errno==E2BIG));
1419 }
1420
1421 if (ms_wcNeedsSwap)
1422 {
1423 free(tmpbuf);
1424 }
1425
1426 if (ICONV_FAILED(cres, inbuf))
1427 {
1428 //VS: it is ok if iconv fails, hence trace only
1429 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1430 return (size_t)-1;
1431 }
1432
1433 return res;
1434 }
1435
1436 #endif // HAVE_ICONV
1437
1438
1439 // ============================================================================
1440 // Win32 conversion classes
1441 // ============================================================================
1442
1443 #ifdef wxHAVE_WIN32_MB2WC
1444
1445 // from utils.cpp
1446 #if wxUSE_FONTMAP
1447 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1448 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1449 #endif
1450
1451 class wxMBConv_win32 : public wxMBConv
1452 {
1453 public:
1454 wxMBConv_win32()
1455 {
1456 m_CodePage = CP_ACP;
1457 }
1458
1459 #if wxUSE_FONTMAP
1460 wxMBConv_win32(const wxChar* name)
1461 {
1462 m_CodePage = wxCharsetToCodepage(name);
1463 }
1464
1465 wxMBConv_win32(wxFontEncoding encoding)
1466 {
1467 m_CodePage = wxEncodingToCodepage(encoding);
1468 }
1469 #endif
1470
1471 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1472 {
1473 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1474 // the behaviour is not compatible with the Unix version (using iconv)
1475 // and break the library itself, e.g. wxTextInputStream::NextChar()
1476 // wouldn't work if reading an incomplete MB char didn't result in an
1477 // error
1478 //
1479 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1480 // an error (tested under Windows Server 2003) and apparently it is
1481 // done on purpose, i.e. the function accepts any input in this case
1482 // and although I'd prefer to return error on ill-formed output, our
1483 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1484 // explicitly ill-formed according to RFC 2152) neither so we don't
1485 // even have any fallback here...
1486 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1487
1488 const size_t len = ::MultiByteToWideChar
1489 (
1490 m_CodePage, // code page
1491 flags, // flags: fall on error
1492 psz, // input string
1493 -1, // its length (NUL-terminated)
1494 buf, // output string
1495 buf ? n : 0 // size of output buffer
1496 );
1497
1498 // note that it returns count of written chars for buf != NULL and size
1499 // of the needed buffer for buf == NULL so in either case the length of
1500 // the string (which never includes the terminating NUL) is one less
1501 return len ? len - 1 : (size_t)-1;
1502 }
1503
1504 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1505 {
1506 /*
1507 we have a problem here: by default, WideCharToMultiByte() may
1508 replace characters unrepresentable in the target code page with bad
1509 quality approximations such as turning "1/2" symbol (U+00BD) into
1510 "1" for the code pages which don't have it and we, obviously, want
1511 to avoid this at any price
1512
1513 the trouble is that this function does it _silently_, i.e. it won't
1514 even tell us whether it did or not... Win98/2000 and higher provide
1515 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1516 we have to resort to a round trip, i.e. check that converting back
1517 results in the same string -- this is, of course, expensive but
1518 otherwise we simply can't be sure to not garble the data.
1519 */
1520
1521 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1522 // it doesn't work with CJK encodings (which we test for rather roughly
1523 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1524 // supporting it
1525 BOOL usedDef wxDUMMY_INITIALIZE(false);
1526 BOOL *pUsedDef;
1527 int flags;
1528 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1529 {
1530 // it's our lucky day
1531 flags = WC_NO_BEST_FIT_CHARS;
1532 pUsedDef = &usedDef;
1533 }
1534 else // old system or unsupported encoding
1535 {
1536 flags = 0;
1537 pUsedDef = NULL;
1538 }
1539
1540 const size_t len = ::WideCharToMultiByte
1541 (
1542 m_CodePage, // code page
1543 flags, // either none or no best fit
1544 pwz, // input string
1545 -1, // it is (wide) NUL-terminated
1546 buf, // output buffer
1547 buf ? n : 0, // and its size
1548 NULL, // default "replacement" char
1549 pUsedDef // [out] was it used?
1550 );
1551
1552 if ( !len )
1553 {
1554 // function totally failed
1555 return (size_t)-1;
1556 }
1557
1558 // if we were really converting, check if we succeeded
1559 if ( buf )
1560 {
1561 if ( flags )
1562 {
1563 // check if the conversion failed, i.e. if any replacements
1564 // were done
1565 if ( usedDef )
1566 return (size_t)-1;
1567 }
1568 else // we must resort to double tripping...
1569 {
1570 wxWCharBuffer wcBuf(n);
1571 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1572 wcscmp(wcBuf, pwz) != 0 )
1573 {
1574 // we didn't obtain the same thing we started from, hence
1575 // the conversion was lossy and we consider that it failed
1576 return (size_t)-1;
1577 }
1578 }
1579 }
1580
1581 // see the comment above for the reason of "len - 1"
1582 return len - 1;
1583 }
1584
1585 bool IsOk() const { return m_CodePage != -1; }
1586
1587 private:
1588 static bool CanUseNoBestFit()
1589 {
1590 static int s_isWin98Or2k = -1;
1591
1592 if ( s_isWin98Or2k == -1 )
1593 {
1594 int verMaj, verMin;
1595 switch ( wxGetOsVersion(&verMaj, &verMin) )
1596 {
1597 case wxWIN95:
1598 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1599 break;
1600
1601 case wxWINDOWS_NT:
1602 s_isWin98Or2k = verMaj >= 5;
1603 break;
1604
1605 default:
1606 // unknown, be conseravtive by default
1607 s_isWin98Or2k = 0;
1608 }
1609
1610 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1611 }
1612
1613 return s_isWin98Or2k == 1;
1614 }
1615
1616 long m_CodePage;
1617 };
1618
1619 #endif // wxHAVE_WIN32_MB2WC
1620
1621 // ============================================================================
1622 // Cocoa conversion classes
1623 // ============================================================================
1624
1625 #if defined(__WXCOCOA__)
1626
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1630
1631 #include <CoreFoundation/CFString.h>
1632 #include <CoreFoundation/CFStringEncodingExt.h>
1633
1634 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1635 {
1636 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1637 if ( encoding == wxFONTENCODING_DEFAULT )
1638 {
1639 enc = CFStringGetSystemEncoding();
1640 }
1641 else switch( encoding)
1642 {
1643 case wxFONTENCODING_ISO8859_1 :
1644 enc = kCFStringEncodingISOLatin1 ;
1645 break ;
1646 case wxFONTENCODING_ISO8859_2 :
1647 enc = kCFStringEncodingISOLatin2;
1648 break ;
1649 case wxFONTENCODING_ISO8859_3 :
1650 enc = kCFStringEncodingISOLatin3 ;
1651 break ;
1652 case wxFONTENCODING_ISO8859_4 :
1653 enc = kCFStringEncodingISOLatin4;
1654 break ;
1655 case wxFONTENCODING_ISO8859_5 :
1656 enc = kCFStringEncodingISOLatinCyrillic;
1657 break ;
1658 case wxFONTENCODING_ISO8859_6 :
1659 enc = kCFStringEncodingISOLatinArabic;
1660 break ;
1661 case wxFONTENCODING_ISO8859_7 :
1662 enc = kCFStringEncodingISOLatinGreek;
1663 break ;
1664 case wxFONTENCODING_ISO8859_8 :
1665 enc = kCFStringEncodingISOLatinHebrew;
1666 break ;
1667 case wxFONTENCODING_ISO8859_9 :
1668 enc = kCFStringEncodingISOLatin5;
1669 break ;
1670 case wxFONTENCODING_ISO8859_10 :
1671 enc = kCFStringEncodingISOLatin6;
1672 break ;
1673 case wxFONTENCODING_ISO8859_11 :
1674 enc = kCFStringEncodingISOLatinThai;
1675 break ;
1676 case wxFONTENCODING_ISO8859_13 :
1677 enc = kCFStringEncodingISOLatin7;
1678 break ;
1679 case wxFONTENCODING_ISO8859_14 :
1680 enc = kCFStringEncodingISOLatin8;
1681 break ;
1682 case wxFONTENCODING_ISO8859_15 :
1683 enc = kCFStringEncodingISOLatin9;
1684 break ;
1685
1686 case wxFONTENCODING_KOI8 :
1687 enc = kCFStringEncodingKOI8_R;
1688 break ;
1689 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1690 enc = kCFStringEncodingDOSRussian;
1691 break ;
1692
1693 // case wxFONTENCODING_BULGARIAN :
1694 // enc = ;
1695 // break ;
1696
1697 case wxFONTENCODING_CP437 :
1698 enc =kCFStringEncodingDOSLatinUS ;
1699 break ;
1700 case wxFONTENCODING_CP850 :
1701 enc = kCFStringEncodingDOSLatin1;
1702 break ;
1703 case wxFONTENCODING_CP852 :
1704 enc = kCFStringEncodingDOSLatin2;
1705 break ;
1706 case wxFONTENCODING_CP855 :
1707 enc = kCFStringEncodingDOSCyrillic;
1708 break ;
1709 case wxFONTENCODING_CP866 :
1710 enc =kCFStringEncodingDOSRussian ;
1711 break ;
1712 case wxFONTENCODING_CP874 :
1713 enc = kCFStringEncodingDOSThai;
1714 break ;
1715 case wxFONTENCODING_CP932 :
1716 enc = kCFStringEncodingDOSJapanese;
1717 break ;
1718 case wxFONTENCODING_CP936 :
1719 enc =kCFStringEncodingDOSChineseSimplif ;
1720 break ;
1721 case wxFONTENCODING_CP949 :
1722 enc = kCFStringEncodingDOSKorean;
1723 break ;
1724 case wxFONTENCODING_CP950 :
1725 enc = kCFStringEncodingDOSChineseTrad;
1726 break ;
1727 case wxFONTENCODING_CP1250 :
1728 enc = kCFStringEncodingWindowsLatin2;
1729 break ;
1730 case wxFONTENCODING_CP1251 :
1731 enc =kCFStringEncodingWindowsCyrillic ;
1732 break ;
1733 case wxFONTENCODING_CP1252 :
1734 enc =kCFStringEncodingWindowsLatin1 ;
1735 break ;
1736 case wxFONTENCODING_CP1253 :
1737 enc = kCFStringEncodingWindowsGreek;
1738 break ;
1739 case wxFONTENCODING_CP1254 :
1740 enc = kCFStringEncodingWindowsLatin5;
1741 break ;
1742 case wxFONTENCODING_CP1255 :
1743 enc =kCFStringEncodingWindowsHebrew ;
1744 break ;
1745 case wxFONTENCODING_CP1256 :
1746 enc =kCFStringEncodingWindowsArabic ;
1747 break ;
1748 case wxFONTENCODING_CP1257 :
1749 enc = kCFStringEncodingWindowsBalticRim;
1750 break ;
1751 // This only really encodes to UTF7 (if that) evidently
1752 // case wxFONTENCODING_UTF7 :
1753 // enc = kCFStringEncodingNonLossyASCII ;
1754 // break ;
1755 case wxFONTENCODING_UTF8 :
1756 enc = kCFStringEncodingUTF8 ;
1757 break ;
1758 case wxFONTENCODING_EUC_JP :
1759 enc = kCFStringEncodingEUC_JP;
1760 break ;
1761 case wxFONTENCODING_UTF16 :
1762 enc = kCFStringEncodingUnicode ;
1763 break ;
1764 case wxFONTENCODING_MACROMAN :
1765 enc = kCFStringEncodingMacRoman ;
1766 break ;
1767 case wxFONTENCODING_MACJAPANESE :
1768 enc = kCFStringEncodingMacJapanese ;
1769 break ;
1770 case wxFONTENCODING_MACCHINESETRAD :
1771 enc = kCFStringEncodingMacChineseTrad ;
1772 break ;
1773 case wxFONTENCODING_MACKOREAN :
1774 enc = kCFStringEncodingMacKorean ;
1775 break ;
1776 case wxFONTENCODING_MACARABIC :
1777 enc = kCFStringEncodingMacArabic ;
1778 break ;
1779 case wxFONTENCODING_MACHEBREW :
1780 enc = kCFStringEncodingMacHebrew ;
1781 break ;
1782 case wxFONTENCODING_MACGREEK :
1783 enc = kCFStringEncodingMacGreek ;
1784 break ;
1785 case wxFONTENCODING_MACCYRILLIC :
1786 enc = kCFStringEncodingMacCyrillic ;
1787 break ;
1788 case wxFONTENCODING_MACDEVANAGARI :
1789 enc = kCFStringEncodingMacDevanagari ;
1790 break ;
1791 case wxFONTENCODING_MACGURMUKHI :
1792 enc = kCFStringEncodingMacGurmukhi ;
1793 break ;
1794 case wxFONTENCODING_MACGUJARATI :
1795 enc = kCFStringEncodingMacGujarati ;
1796 break ;
1797 case wxFONTENCODING_MACORIYA :
1798 enc = kCFStringEncodingMacOriya ;
1799 break ;
1800 case wxFONTENCODING_MACBENGALI :
1801 enc = kCFStringEncodingMacBengali ;
1802 break ;
1803 case wxFONTENCODING_MACTAMIL :
1804 enc = kCFStringEncodingMacTamil ;
1805 break ;
1806 case wxFONTENCODING_MACTELUGU :
1807 enc = kCFStringEncodingMacTelugu ;
1808 break ;
1809 case wxFONTENCODING_MACKANNADA :
1810 enc = kCFStringEncodingMacKannada ;
1811 break ;
1812 case wxFONTENCODING_MACMALAJALAM :
1813 enc = kCFStringEncodingMacMalayalam ;
1814 break ;
1815 case wxFONTENCODING_MACSINHALESE :
1816 enc = kCFStringEncodingMacSinhalese ;
1817 break ;
1818 case wxFONTENCODING_MACBURMESE :
1819 enc = kCFStringEncodingMacBurmese ;
1820 break ;
1821 case wxFONTENCODING_MACKHMER :
1822 enc = kCFStringEncodingMacKhmer ;
1823 break ;
1824 case wxFONTENCODING_MACTHAI :
1825 enc = kCFStringEncodingMacThai ;
1826 break ;
1827 case wxFONTENCODING_MACLAOTIAN :
1828 enc = kCFStringEncodingMacLaotian ;
1829 break ;
1830 case wxFONTENCODING_MACGEORGIAN :
1831 enc = kCFStringEncodingMacGeorgian ;
1832 break ;
1833 case wxFONTENCODING_MACARMENIAN :
1834 enc = kCFStringEncodingMacArmenian ;
1835 break ;
1836 case wxFONTENCODING_MACCHINESESIMP :
1837 enc = kCFStringEncodingMacChineseSimp ;
1838 break ;
1839 case wxFONTENCODING_MACTIBETAN :
1840 enc = kCFStringEncodingMacTibetan ;
1841 break ;
1842 case wxFONTENCODING_MACMONGOLIAN :
1843 enc = kCFStringEncodingMacMongolian ;
1844 break ;
1845 case wxFONTENCODING_MACETHIOPIC :
1846 enc = kCFStringEncodingMacEthiopic ;
1847 break ;
1848 case wxFONTENCODING_MACCENTRALEUR :
1849 enc = kCFStringEncodingMacCentralEurRoman ;
1850 break ;
1851 case wxFONTENCODING_MACVIATNAMESE :
1852 enc = kCFStringEncodingMacVietnamese ;
1853 break ;
1854 case wxFONTENCODING_MACARABICEXT :
1855 enc = kCFStringEncodingMacExtArabic ;
1856 break ;
1857 case wxFONTENCODING_MACSYMBOL :
1858 enc = kCFStringEncodingMacSymbol ;
1859 break ;
1860 case wxFONTENCODING_MACDINGBATS :
1861 enc = kCFStringEncodingMacDingbats ;
1862 break ;
1863 case wxFONTENCODING_MACTURKISH :
1864 enc = kCFStringEncodingMacTurkish ;
1865 break ;
1866 case wxFONTENCODING_MACCROATIAN :
1867 enc = kCFStringEncodingMacCroatian ;
1868 break ;
1869 case wxFONTENCODING_MACICELANDIC :
1870 enc = kCFStringEncodingMacIcelandic ;
1871 break ;
1872 case wxFONTENCODING_MACROMANIAN :
1873 enc = kCFStringEncodingMacRomanian ;
1874 break ;
1875 case wxFONTENCODING_MACCELTIC :
1876 enc = kCFStringEncodingMacCeltic ;
1877 break ;
1878 case wxFONTENCODING_MACGAELIC :
1879 enc = kCFStringEncodingMacGaelic ;
1880 break ;
1881 // case wxFONTENCODING_MACKEYBOARD :
1882 // enc = kCFStringEncodingMacKeyboardGlyphs ;
1883 // break ;
1884 default :
1885 // because gcc is picky
1886 break ;
1887 } ;
1888 return enc ;
1889 }
1890
1891 class wxMBConv_cocoa : public wxMBConv
1892 {
1893 public:
1894 wxMBConv_cocoa()
1895 {
1896 Init(CFStringGetSystemEncoding()) ;
1897 }
1898
1899 wxMBConv_cocoa(const wxChar* name)
1900 {
1901 Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
1902 }
1903
1904 wxMBConv_cocoa(wxFontEncoding encoding)
1905 {
1906 Init( wxCFStringEncFromFontEnc(encoding) );
1907 }
1908
1909 ~wxMBConv_cocoa()
1910 {
1911 }
1912
1913 void Init( CFStringEncoding encoding)
1914 {
1915 m_encoding = encoding ;
1916 }
1917
1918 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1919 {
1920 wxASSERT(szUnConv);
1921
1922 CFStringRef theString = CFStringCreateWithBytes (
1923 NULL, //the allocator
1924 (const UInt8*)szUnConv,
1925 strlen(szUnConv),
1926 m_encoding,
1927 false //no BOM/external representation
1928 );
1929
1930 wxASSERT(theString);
1931
1932 size_t nOutLength = CFStringGetLength(theString);
1933
1934 if (szOut == NULL)
1935 {
1936 CFRelease(theString);
1937 return nOutLength;
1938 }
1939
1940 CFRange theRange = { 0, nOutSize };
1941
1942 #if SIZEOF_WCHAR_T == 4
1943 UniChar* szUniCharBuffer = new UniChar[nOutSize];
1944 #endif
1945
1946 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
1947
1948 CFRelease(theString);
1949
1950 szUniCharBuffer[nOutLength] = '\0' ;
1951
1952 #if SIZEOF_WCHAR_T == 4
1953 wxMBConvUTF16 converter ;
1954 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
1955 delete[] szUniCharBuffer;
1956 #endif
1957
1958 return nOutLength;
1959 }
1960
1961 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1962 {
1963 wxASSERT(szUnConv);
1964
1965 size_t nRealOutSize;
1966 size_t nBufSize = wxWcslen(szUnConv);
1967 UniChar* szUniBuffer = (UniChar*) szUnConv;
1968
1969 #if SIZEOF_WCHAR_T == 4
1970 wxMBConvUTF16BE converter ;
1971 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
1972 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
1973 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
1974 nBufSize /= sizeof(UniChar);
1975 #endif
1976
1977 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
1978 NULL, //allocator
1979 szUniBuffer,
1980 nBufSize,
1981 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
1982 );
1983
1984 wxASSERT(theString);
1985
1986 //Note that CER puts a BOM when converting to unicode
1987 //so we check and use getchars instead in that case
1988 if (m_encoding == kCFStringEncodingUnicode)
1989 {
1990 if (szOut != NULL)
1991 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
1992
1993 nRealOutSize = CFStringGetLength(theString) + 1;
1994 }
1995 else
1996 {
1997 CFStringGetBytes(
1998 theString,
1999 CFRangeMake(0, CFStringGetLength(theString)),
2000 m_encoding,
2001 0, //what to put in characters that can't be converted -
2002 //0 tells CFString to return NULL if it meets such a character
2003 false, //not an external representation
2004 (UInt8*) szOut,
2005 nOutSize,
2006 (CFIndex*) &nRealOutSize
2007 );
2008 }
2009
2010 CFRelease(theString);
2011
2012 #if SIZEOF_WCHAR_T == 4
2013 delete[] szUniBuffer;
2014 #endif
2015
2016 return nRealOutSize - 1;
2017 }
2018
2019 bool IsOk() const
2020 {
2021 return m_encoding != kCFStringEncodingInvalidId &&
2022 CFStringIsEncodingAvailable(m_encoding);
2023 }
2024
2025 private:
2026 CFStringEncoding m_encoding ;
2027 };
2028
2029 #endif // defined(__WXCOCOA__)
2030
2031 // ============================================================================
2032 // Mac conversion classes
2033 // ============================================================================
2034
2035 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2036
2037 class wxMBConv_mac : public wxMBConv
2038 {
2039 public:
2040 wxMBConv_mac()
2041 {
2042 Init(CFStringGetSystemEncoding()) ;
2043 }
2044
2045 wxMBConv_mac(const wxChar* name)
2046 {
2047 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
2048 }
2049
2050 wxMBConv_mac(wxFontEncoding encoding)
2051 {
2052 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2053 }
2054
2055 ~wxMBConv_mac()
2056 {
2057 OSStatus status = noErr ;
2058 status = TECDisposeConverter(m_MB2WC_converter);
2059 status = TECDisposeConverter(m_WC2MB_converter);
2060 }
2061
2062
2063 void Init( TextEncodingBase encoding)
2064 {
2065 OSStatus status = noErr ;
2066 m_char_encoding = encoding ;
2067 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2068
2069 status = TECCreateConverter(&m_MB2WC_converter,
2070 m_char_encoding,
2071 m_unicode_encoding);
2072 status = TECCreateConverter(&m_WC2MB_converter,
2073 m_unicode_encoding,
2074 m_char_encoding);
2075 }
2076
2077 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2078 {
2079 OSStatus status = noErr ;
2080 ByteCount byteOutLen ;
2081 ByteCount byteInLen = strlen(psz) ;
2082 wchar_t *tbuf = NULL ;
2083 UniChar* ubuf = NULL ;
2084 size_t res = 0 ;
2085
2086 if (buf == NULL)
2087 {
2088 //apple specs say at least 32
2089 n = wxMax( 32 , byteInLen ) ;
2090 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2091 }
2092 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2093 #if SIZEOF_WCHAR_T == 4
2094 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2095 #else
2096 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2097 #endif
2098 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2099 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2100 #if SIZEOF_WCHAR_T == 4
2101 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2102 // is not properly terminated we get random characters at the end
2103 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2104 wxMBConvUTF16BE converter ;
2105 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2106 free( ubuf ) ;
2107 #else
2108 res = byteOutLen / sizeof( UniChar ) ;
2109 #endif
2110 if ( buf == NULL )
2111 free(tbuf) ;
2112
2113 if ( buf && res < n)
2114 buf[res] = 0;
2115
2116 return res ;
2117 }
2118
2119 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2120 {
2121 OSStatus status = noErr ;
2122 ByteCount byteOutLen ;
2123 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2124
2125 char *tbuf = NULL ;
2126
2127 if (buf == NULL)
2128 {
2129 //apple specs say at least 32
2130 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2131 tbuf = (char*) malloc( n ) ;
2132 }
2133
2134 ByteCount byteBufferLen = n ;
2135 UniChar* ubuf = NULL ;
2136 #if SIZEOF_WCHAR_T == 4
2137 wxMBConvUTF16BE converter ;
2138 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2139 byteInLen = unicharlen ;
2140 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2141 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2142 #else
2143 ubuf = (UniChar*) psz ;
2144 #endif
2145 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2146 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2147 #if SIZEOF_WCHAR_T == 4
2148 free( ubuf ) ;
2149 #endif
2150 if ( buf == NULL )
2151 free(tbuf) ;
2152
2153 size_t res = byteOutLen ;
2154 if ( buf && res < n)
2155 {
2156 buf[res] = 0;
2157
2158 //we need to double-trip to verify it didn't insert any ? in place
2159 //of bogus characters
2160 wxWCharBuffer wcBuf(n);
2161 size_t pszlen = wxWcslen(psz);
2162 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2163 wxWcslen(wcBuf) != pszlen ||
2164 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2165 {
2166 // we didn't obtain the same thing we started from, hence
2167 // the conversion was lossy and we consider that it failed
2168 return (size_t)-1;
2169 }
2170 }
2171
2172 return res ;
2173 }
2174
2175 bool IsOk() const
2176 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2177
2178 private:
2179 TECObjectRef m_MB2WC_converter ;
2180 TECObjectRef m_WC2MB_converter ;
2181
2182 TextEncodingBase m_char_encoding ;
2183 TextEncodingBase m_unicode_encoding ;
2184 };
2185
2186 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2187
2188 // ============================================================================
2189 // wxEncodingConverter based conversion classes
2190 // ============================================================================
2191
2192 #if wxUSE_FONTMAP
2193
2194 class wxMBConv_wxwin : public wxMBConv
2195 {
2196 private:
2197 void Init()
2198 {
2199 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2200 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2201 }
2202
2203 public:
2204 // temporarily just use wxEncodingConverter stuff,
2205 // so that it works while a better implementation is built
2206 wxMBConv_wxwin(const wxChar* name)
2207 {
2208 if (name)
2209 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
2210 else
2211 m_enc = wxFONTENCODING_SYSTEM;
2212
2213 Init();
2214 }
2215
2216 wxMBConv_wxwin(wxFontEncoding enc)
2217 {
2218 m_enc = enc;
2219
2220 Init();
2221 }
2222
2223 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2224 {
2225 size_t inbuf = strlen(psz);
2226 if (buf)
2227 {
2228 if (!m2w.Convert(psz,buf))
2229 return (size_t)-1;
2230 }
2231 return inbuf;
2232 }
2233
2234 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2235 {
2236 const size_t inbuf = wxWcslen(psz);
2237 if (buf)
2238 {
2239 if (!w2m.Convert(psz,buf))
2240 return (size_t)-1;
2241 }
2242
2243 return inbuf;
2244 }
2245
2246 bool IsOk() const { return m_ok; }
2247
2248 public:
2249 wxFontEncoding m_enc;
2250 wxEncodingConverter m2w, w2m;
2251
2252 // were we initialized successfully?
2253 bool m_ok;
2254
2255 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2256 };
2257
2258 #endif // wxUSE_FONTMAP
2259
2260 // ============================================================================
2261 // wxCSConv implementation
2262 // ============================================================================
2263
2264 void wxCSConv::Init()
2265 {
2266 m_name = NULL;
2267 m_convReal = NULL;
2268 m_deferred = true;
2269 }
2270
2271 wxCSConv::wxCSConv(const wxChar *charset)
2272 {
2273 Init();
2274
2275 if ( charset )
2276 {
2277 SetName(charset);
2278 }
2279
2280 m_encoding = wxFONTENCODING_SYSTEM;
2281 }
2282
2283 wxCSConv::wxCSConv(wxFontEncoding encoding)
2284 {
2285 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2286 {
2287 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2288
2289 encoding = wxFONTENCODING_SYSTEM;
2290 }
2291
2292 Init();
2293
2294 m_encoding = encoding;
2295 }
2296
2297 wxCSConv::~wxCSConv()
2298 {
2299 Clear();
2300 }
2301
2302 wxCSConv::wxCSConv(const wxCSConv& conv)
2303 : wxMBConv()
2304 {
2305 Init();
2306
2307 SetName(conv.m_name);
2308 m_encoding = conv.m_encoding;
2309 }
2310
2311 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2312 {
2313 Clear();
2314
2315 SetName(conv.m_name);
2316 m_encoding = conv.m_encoding;
2317
2318 return *this;
2319 }
2320
2321 void wxCSConv::Clear()
2322 {
2323 free(m_name);
2324 delete m_convReal;
2325
2326 m_name = NULL;
2327 m_convReal = NULL;
2328 }
2329
2330 void wxCSConv::SetName(const wxChar *charset)
2331 {
2332 if (charset)
2333 {
2334 m_name = wxStrdup(charset);
2335 m_deferred = true;
2336 }
2337 }
2338
2339 wxMBConv *wxCSConv::DoCreate() const
2340 {
2341 // check for the special case of ASCII or ISO8859-1 charset: as we have
2342 // special knowledge of it anyhow, we don't need to create a special
2343 // conversion object
2344 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2345 {
2346 // don't convert at all
2347 return NULL;
2348 }
2349
2350 // we trust OS to do conversion better than we can so try external
2351 // conversion methods first
2352 //
2353 // the full order is:
2354 // 1. OS conversion (iconv() under Unix or Win32 API)
2355 // 2. hard coded conversions for UTF
2356 // 3. wxEncodingConverter as fall back
2357
2358 // step (1)
2359 #ifdef HAVE_ICONV
2360 #if !wxUSE_FONTMAP
2361 if ( m_name )
2362 #endif // !wxUSE_FONTMAP
2363 {
2364 wxString name(m_name);
2365
2366 #if wxUSE_FONTMAP
2367 if ( name.empty() )
2368 name = wxFontMapper::Get()->GetEncodingName(m_encoding);
2369 #endif // wxUSE_FONTMAP
2370
2371 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2372 if ( conv->IsOk() )
2373 return conv;
2374
2375 delete conv;
2376 }
2377 #endif // HAVE_ICONV
2378
2379 #ifdef wxHAVE_WIN32_MB2WC
2380 {
2381 #if wxUSE_FONTMAP
2382 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2383 : new wxMBConv_win32(m_encoding);
2384 if ( conv->IsOk() )
2385 return conv;
2386
2387 delete conv;
2388 #else
2389 return NULL;
2390 #endif
2391 }
2392 #endif // wxHAVE_WIN32_MB2WC
2393 #if defined(__WXMAC__)
2394 {
2395 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
2396 {
2397
2398 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2399 : new wxMBConv_mac(m_encoding);
2400 if ( conv->IsOk() )
2401 return conv;
2402
2403 delete conv;
2404 }
2405 }
2406 #endif
2407 #if defined(__WXCOCOA__)
2408 {
2409 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2410 {
2411
2412 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2413 : new wxMBConv_cocoa(m_encoding);
2414 if ( conv->IsOk() )
2415 return conv;
2416
2417 delete conv;
2418 }
2419 }
2420 #endif
2421 // step (2)
2422 wxFontEncoding enc = m_encoding;
2423 #if wxUSE_FONTMAP
2424 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2425 {
2426 // use "false" to suppress interactive dialogs -- we can be called from
2427 // anywhere and popping up a dialog from here is the last thing we want to
2428 // do
2429 enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
2430 }
2431 #endif // wxUSE_FONTMAP
2432
2433 switch ( enc )
2434 {
2435 case wxFONTENCODING_UTF7:
2436 return new wxMBConvUTF7;
2437
2438 case wxFONTENCODING_UTF8:
2439 return new wxMBConvUTF8;
2440
2441 case wxFONTENCODING_UTF16BE:
2442 return new wxMBConvUTF16BE;
2443
2444 case wxFONTENCODING_UTF16LE:
2445 return new wxMBConvUTF16LE;
2446
2447 case wxFONTENCODING_UTF32BE:
2448 return new wxMBConvUTF32BE;
2449
2450 case wxFONTENCODING_UTF32LE:
2451 return new wxMBConvUTF32LE;
2452
2453 default:
2454 // nothing to do but put here to suppress gcc warnings
2455 ;
2456 }
2457
2458 // step (3)
2459 #if wxUSE_FONTMAP
2460 {
2461 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2462 : new wxMBConv_wxwin(m_encoding);
2463 if ( conv->IsOk() )
2464 return conv;
2465
2466 delete conv;
2467 }
2468 #endif // wxUSE_FONTMAP
2469
2470 // NB: This is a hack to prevent deadlock. What could otherwise happen
2471 // in Unicode build: wxConvLocal creation ends up being here
2472 // because of some failure and logs the error. But wxLog will try to
2473 // attach timestamp, for which it will need wxConvLocal (to convert
2474 // time to char* and then wchar_t*), but that fails, tries to log
2475 // error, but wxLog has a (already locked) critical section that
2476 // guards static buffer.
2477 static bool alreadyLoggingError = false;
2478 if (!alreadyLoggingError)
2479 {
2480 alreadyLoggingError = true;
2481 wxLogError(_("Cannot convert from the charset '%s'!"),
2482 m_name ? m_name
2483 :
2484 #if wxUSE_FONTMAP
2485 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
2486 #else // !wxUSE_FONTMAP
2487 wxString::Format(_("encoding %s"), m_encoding).c_str()
2488 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2489 );
2490 alreadyLoggingError = false;
2491 }
2492
2493 return NULL;
2494 }
2495
2496 void wxCSConv::CreateConvIfNeeded() const
2497 {
2498 if ( m_deferred )
2499 {
2500 wxCSConv *self = (wxCSConv *)this; // const_cast
2501
2502 #if wxUSE_INTL
2503 // if we don't have neither the name nor the encoding, use the default
2504 // encoding for this system
2505 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2506 {
2507 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2508 }
2509 #endif // wxUSE_INTL
2510
2511 self->m_convReal = DoCreate();
2512 self->m_deferred = false;
2513 }
2514 }
2515
2516 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2517 {
2518 CreateConvIfNeeded();
2519
2520 if (m_convReal)
2521 return m_convReal->MB2WC(buf, psz, n);
2522
2523 // latin-1 (direct)
2524 size_t len = strlen(psz);
2525
2526 if (buf)
2527 {
2528 for (size_t c = 0; c <= len; c++)
2529 buf[c] = (unsigned char)(psz[c]);
2530 }
2531
2532 return len;
2533 }
2534
2535 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2536 {
2537 CreateConvIfNeeded();
2538
2539 if (m_convReal)
2540 return m_convReal->WC2MB(buf, psz, n);
2541
2542 // latin-1 (direct)
2543 const size_t len = wxWcslen(psz);
2544 if (buf)
2545 {
2546 for (size_t c = 0; c <= len; c++)
2547 {
2548 if (psz[c] > 0xFF)
2549 return (size_t)-1;
2550 buf[c] = (char)psz[c];
2551 }
2552 }
2553 else
2554 {
2555 for (size_t c = 0; c <= len; c++)
2556 {
2557 if (psz[c] > 0xFF)
2558 return (size_t)-1;
2559 }
2560 }
2561
2562 return len;
2563 }
2564
2565 // ----------------------------------------------------------------------------
2566 // globals
2567 // ----------------------------------------------------------------------------
2568
2569 #ifdef __WINDOWS__
2570 static wxMBConv_win32 wxConvLibcObj;
2571 #elif defined(__WXMAC__) && !defined(__MACH__)
2572 static wxMBConv_mac wxConvLibcObj ;
2573 #else
2574 static wxMBConvLibc wxConvLibcObj;
2575 #endif
2576
2577 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2578 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2579 static wxMBConvUTF7 wxConvUTF7Obj;
2580 static wxMBConvUTF8 wxConvUTF8Obj;
2581
2582
2583 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2584 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2585 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2586 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2587 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2588 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2589
2590 #else // !wxUSE_WCHAR_T
2591
2592 // stand-ins in absence of wchar_t
2593 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2594 wxConvISO8859_1,
2595 wxConvLocal,
2596 wxConvUTF8;
2597
2598 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2599
2600