]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
correct to int32 for range comparison
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
62
63 // ----------------------------------------------------------------------------
64 // headers
65 // ----------------------------------------------------------------------------
66
67 #ifdef __SALFORDC__
68 #include <clib.h>
69 #endif
70
71 #ifdef HAVE_ICONV
72 #include <iconv.h>
73 #endif
74
75 #include "wx/encconv.h"
76 #include "wx/fontmap.h"
77 #include "wx/utils.h"
78
79 #ifdef __WXMAC__
80 #include <ATSUnicode.h>
81 #include <TextCommon.h>
82 #include <TextEncodingConverter.h>
83
84 #include "wx/mac/private.h" // includes mac headers
85 #endif
86 // ----------------------------------------------------------------------------
87 // macros
88 // ----------------------------------------------------------------------------
89
// Byte-swap, in place, the first `len` elements of the array `str` of 32-bit
// values. The arguments are parenthesized so that expressions can be passed
// safely. The macro deliberately expands to a plain braced block (not a
// do/while) because it is invoked without a trailing semicolon in this file.
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT32_SWAP_ALWAYS((str)[_c]); }
// Same as BSWAP_UCS4 but for arrays of 16-bit values.
#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT16_SWAP_ALWAYS((str)[_c]); }
92
93 #if SIZEOF_WCHAR_T == 4
94 #define WC_NAME "UCS4"
95 #define WC_BSWAP BSWAP_UCS4
96 #ifdef WORDS_BIGENDIAN
97 #define WC_NAME_BEST "UCS-4BE"
98 #else
99 #define WC_NAME_BEST "UCS-4LE"
100 #endif
101 #elif SIZEOF_WCHAR_T == 2
102 #define WC_NAME "UTF16"
103 #define WC_BSWAP BSWAP_UTF16
104 #define WC_UTF16
105 #ifdef WORDS_BIGENDIAN
106 #define WC_NAME_BEST "UTF-16BE"
107 #else
108 #define WC_NAME_BEST "UTF-16LE"
109 #endif
110 #else // sizeof(wchar_t) != 2 nor 4
111 // does this ever happen?
112 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
113 #endif
114
115 // ============================================================================
116 // implementation
117 // ============================================================================
118
119 // ----------------------------------------------------------------------------
120 // UTF-16 en/decoding to/from UCS-4
121 // ----------------------------------------------------------------------------
122
123
124 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
125 {
126 if (input<=0xffff)
127 {
128 if (output)
129 *output = (wxUint16) input;
130 return 1;
131 }
132 else if (input>=0x110000)
133 {
134 return (size_t)-1;
135 }
136 else
137 {
138 if (output)
139 {
140 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
141 *output = (wxUint16) ((input&0x3ff)+0xdc00);
142 }
143 return 2;
144 }
145 }
146
147 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
148 {
149 if ((*input<0xd800) || (*input>0xdfff))
150 {
151 output = *input;
152 return 1;
153 }
154 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
155 {
156 output = *input;
157 return (size_t)-1;
158 }
159 else
160 {
161 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
162 return 2;
163 }
164 }
165
166
167 // ----------------------------------------------------------------------------
168 // wxMBConv
169 // ----------------------------------------------------------------------------
170
171 wxMBConv::~wxMBConv()
172 {
173 // nothing to do here (necessary for Darwin linking probably)
174 }
175
176 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
177 {
178 if ( psz )
179 {
180 // calculate the length of the buffer needed first
181 size_t nLen = MB2WC(NULL, psz, 0);
182 if ( nLen != (size_t)-1 )
183 {
184 // now do the actual conversion
185 wxWCharBuffer buf(nLen);
186 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
187 if ( nLen != (size_t)-1 )
188 {
189 return buf;
190 }
191 }
192 }
193
194 wxWCharBuffer buf((wchar_t *)NULL);
195
196 return buf;
197 }
198
199 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
200 {
201 if ( pwz )
202 {
203 size_t nLen = WC2MB(NULL, pwz, 0);
204 if ( nLen != (size_t)-1 )
205 {
206 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
207 nLen = WC2MB(buf.data(), pwz, nLen + 4);
208 if ( nLen != (size_t)-1 )
209 {
210 return buf;
211 }
212 }
213 }
214
215 wxCharBuffer buf((char *)NULL);
216
217 return buf;
218 }
219
220 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
221 {
222 wxASSERT(pOutSize != NULL);
223
224 const char* szEnd = szString + nStringLen + 1;
225 const char* szPos = szString;
226 const char* szStart = szPos;
227
228 size_t nActualLength = 0;
229 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
230
231 wxWCharBuffer theBuffer(nCurrentSize);
232
233 //Convert the string until the length() is reached, continuing the
234 //loop every time a null character is reached
235 while(szPos != szEnd)
236 {
237 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
238
239 //Get the length of the current (sub)string
240 size_t nLen = MB2WC(NULL, szPos, 0);
241
242 //Invalid conversion?
243 if( nLen == (size_t)-1 )
244 {
245 *pOutSize = 0;
246 theBuffer.data()[0u] = wxT('\0');
247 return theBuffer;
248 }
249
250
251 //Increase the actual length (+1 for current null character)
252 nActualLength += nLen + 1;
253
254 //if buffer too big, realloc the buffer
255 if (nActualLength > (nCurrentSize+1))
256 {
257 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
258 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
259 theBuffer = theNewBuffer;
260 nCurrentSize <<= 1;
261 }
262
263 //Convert the current (sub)string
264 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
265 {
266 *pOutSize = 0;
267 theBuffer.data()[0u] = wxT('\0');
268 return theBuffer;
269 }
270
271 //Increment to next (sub)string
272 //Note that we have to use strlen here instead of nLen
273 //here because XX2XX gives us the size of the output buffer,
274 //not neccessarly the length of the string
275 szPos += strlen(szPos) + 1;
276 }
277
278 //success - return actual length and the buffer
279 *pOutSize = nActualLength;
280 return theBuffer;
281 }
282
283 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
284 {
285 wxASSERT(pOutSize != NULL);
286
287 const wchar_t* szEnd = szString + nStringLen + 1;
288 const wchar_t* szPos = szString;
289 const wchar_t* szStart = szPos;
290
291 size_t nActualLength = 0;
292 size_t nCurrentSize = nStringLen << 2; //try * 4 first
293
294 wxCharBuffer theBuffer(nCurrentSize);
295
296 //Convert the string until the length() is reached, continuing the
297 //loop every time a null character is reached
298 while(szPos != szEnd)
299 {
300 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
301
302 //Get the length of the current (sub)string
303 size_t nLen = WC2MB(NULL, szPos, 0);
304
305 //Invalid conversion?
306 if( nLen == (size_t)-1 )
307 {
308 *pOutSize = 0;
309 theBuffer.data()[0u] = wxT('\0');
310 return theBuffer;
311 }
312
313 //Increase the actual length (+1 for current null character)
314 nActualLength += nLen + 1;
315
316 //if buffer too big, realloc the buffer
317 if (nActualLength > (nCurrentSize+1))
318 {
319 wxCharBuffer theNewBuffer(nCurrentSize << 1);
320 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
321 theBuffer = theNewBuffer;
322 nCurrentSize <<= 1;
323 }
324
325 //Convert the current (sub)string
326 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
327 {
328 *pOutSize = 0;
329 theBuffer.data()[0u] = wxT('\0');
330 return theBuffer;
331 }
332
333 //Increment to next (sub)string
334 //Note that we have to use wxWcslen here instead of nLen
335 //here because XX2XX gives us the size of the output buffer,
336 //not neccessarly the length of the string
337 szPos += wxWcslen(szPos) + 1;
338 }
339
340 //success - return actual length and the buffer
341 *pOutSize = nActualLength;
342 return theBuffer;
343 }
344
345 // ----------------------------------------------------------------------------
346 // wxMBConvLibc
347 // ----------------------------------------------------------------------------
348
349 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
350 {
351 return wxMB2WC(buf, psz, n);
352 }
353
354 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
355 {
356 return wxWC2MB(buf, psz, n);
357 }
358 // ----------------------------------------------------------------------------
359 // UTF-7
360 // ----------------------------------------------------------------------------
361
362 // Implementation (C) 2004 Fredrik Roubert
363
//
// BASE64 decoding table: indexed by the byte value, gives the 6-bit value of
// a BASE64 digit ('A'-'Z', 'a'-'z', '0'-'9', '+', '/') or 0xff for any byte
// which is not a BASE64 digit
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: no BASE64 digits here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
402
403 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
404 {
405
406 size_t len = 0;
407
408 while (*psz && ((!buf) || (len < n)))
409 {
410 unsigned char cc = *psz++;
411 if (cc != '+')
412 {
413 // plain ASCII char
414 if (buf)
415 *buf++ = cc;
416 len++;
417 }
418 else if (*psz == '-')
419 {
420 // encoded plus sign
421 if (buf)
422 *buf++ = cc;
423 len++;
424 psz++;
425 }
426 else
427 {
428 // BASE64 encoded string
429 bool lsb;
430 unsigned char c;
431 unsigned int d, l;
432 for (lsb = false, d = 0, l = 0;
433 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
434 {
435 d <<= 6;
436 d += cc;
437 for (l += 6; l >= 8; lsb = !lsb)
438 {
439 c = (d >> (l -= 8)) % 256;
440 if (lsb)
441 {
442 if (buf)
443 *buf++ |= c;
444 len ++;
445 }
446 else
447 if (buf)
448 *buf = c << 8;
449 }
450 }
451 if (*psz == '-')
452 psz++;
453 }
454 }
455 if (buf && (len < n))
456 *buf = 0;
457 return len;
458 }
459
//
// BASE64 encoding table: maps a 6-bit value to the corresponding digit
//
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62 and 63
    '+', '/'
};
474
//
// UTF-7 encoding table, indexed by the (7-bit) character code:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00 - 0x07
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10 - 0x17
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20 - 0x27
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30 - 0x37
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40 - 0x47
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50 - 0x57
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60 - 0x67
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70 - 0x77
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78 - 0x7f
};
494
495 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t
496 *psz, size_t n) const
497 {
498
499
500 size_t len = 0;
501
502 while (*psz && ((!buf) || (len < n)))
503 {
504 wchar_t cc = *psz++;
505 if (cc < 0x80 && utf7encode[cc] < 1)
506 {
507 // plain ASCII char
508 if (buf)
509 *buf++ = (char)cc;
510 len++;
511 }
512 #ifndef WC_UTF16
513 else if (((wxUint32)cc) > 0xffff)
514 {
515 // no surrogate pair generation (yet?)
516 return (size_t)-1;
517 }
518 #endif
519 else
520 {
521 if (buf)
522 *buf++ = '+';
523 len++;
524 if (cc != '+')
525 {
526 // BASE64 encode string
527 unsigned int lsb, d, l;
528 for (d = 0, l = 0;; psz++)
529 {
530 for (lsb = 0; lsb < 2; lsb ++)
531 {
532 d <<= 8;
533 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
534
535 for (l += 8; l >= 6; )
536 {
537 l -= 6;
538 if (buf)
539 *buf++ = utf7enb64[(d >> l) % 64];
540 len++;
541 }
542 }
543 cc = *psz;
544 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
545 break;
546 }
547 if (l != 0)
548 {
549 if (buf)
550 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
551 len++;
552 }
553 }
554 if (buf)
555 *buf++ = '-';
556 len++;
557 }
558 }
559 if (buf && (len < n))
560 *buf = 0;
561 return len;
562 }
563
564 // ----------------------------------------------------------------------------
565 // UTF-8
566 // ----------------------------------------------------------------------------
567
568 static wxUint32 utf8_max[]=
569 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
570
571 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
572 {
573 size_t len = 0;
574
575 while (*psz && ((!buf) || (len < n)))
576 {
577 unsigned char cc = *psz++, fc = cc;
578 unsigned cnt;
579 for (cnt = 0; fc & 0x80; cnt++)
580 fc <<= 1;
581 if (!cnt)
582 {
583 // plain ASCII char
584 if (buf)
585 *buf++ = cc;
586 len++;
587 }
588 else
589 {
590 cnt--;
591 if (!cnt)
592 {
593 // invalid UTF-8 sequence
594 return (size_t)-1;
595 }
596 else
597 {
598 unsigned ocnt = cnt - 1;
599 wxUint32 res = cc & (0x3f >> cnt);
600 while (cnt--)
601 {
602 cc = *psz++;
603 if ((cc & 0xC0) != 0x80)
604 {
605 // invalid UTF-8 sequence
606 return (size_t)-1;
607 }
608 res = (res << 6) | (cc & 0x3f);
609 }
610 if (res <= utf8_max[ocnt])
611 {
612 // illegal UTF-8 encoding
613 return (size_t)-1;
614 }
615 #ifdef WC_UTF16
616 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
617 size_t pa = encode_utf16(res, (wxUint16 *)buf);
618 if (pa == (size_t)-1)
619 return (size_t)-1;
620 if (buf)
621 buf += pa;
622 len += pa;
623 #else // !WC_UTF16
624 if (buf)
625 *buf++ = res;
626 len++;
627 #endif // WC_UTF16/!WC_UTF16
628 }
629 }
630 }
631 if (buf && (len < n))
632 *buf = 0;
633 return len;
634 }
635
636 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
637 {
638 size_t len = 0;
639
640 while (*psz && ((!buf) || (len < n)))
641 {
642 wxUint32 cc;
643 #ifdef WC_UTF16
644 // cast is ok for WC_UTF16
645 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
646 psz += (pa == (size_t)-1) ? 1 : pa;
647 #else
648 cc=(*psz++) & 0x7fffffff;
649 #endif
650 unsigned cnt;
651 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
652 if (!cnt)
653 {
654 // plain ASCII char
655 if (buf)
656 *buf++ = (char) cc;
657 len++;
658 }
659
660 else
661 {
662 len += cnt + 1;
663 if (buf)
664 {
665 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
666 while (cnt--)
667 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
668 }
669 }
670 }
671
672 if (buf && (len<n)) *buf = 0;
673
674 return len;
675 }
676
677
678
679
680 // ----------------------------------------------------------------------------
681 // UTF-16
682 // ----------------------------------------------------------------------------
683
684 #ifdef WORDS_BIGENDIAN
685 #define wxMBConvUTF16straight wxMBConvUTF16BE
686 #define wxMBConvUTF16swap wxMBConvUTF16LE
687 #else
688 #define wxMBConvUTF16swap wxMBConvUTF16BE
689 #define wxMBConvUTF16straight wxMBConvUTF16LE
690 #endif
691
692
693 #ifdef WC_UTF16
694
695 // copy 16bit MB to 16bit String
696 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
697 {
698 size_t len=0;
699
700 while (*(wxUint16*)psz && (!buf || len < n))
701 {
702 if (buf)
703 *buf++ = *(wxUint16*)psz;
704 len++;
705
706 psz += sizeof(wxUint16);
707 }
708 if (buf && len<n) *buf=0;
709
710 return len;
711 }
712
713
714 // copy 16bit String to 16bit MB
715 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
716 {
717 size_t len=0;
718
719 while (*psz && (!buf || len < n))
720 {
721 if (buf)
722 {
723 *(wxUint16*)buf = *psz;
724 buf += sizeof(wxUint16);
725 }
726 len += sizeof(wxUint16);
727 psz++;
728 }
729 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
730
731 return len;
732 }
733
734
735 // swap 16bit MB to 16bit String
736 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
737 {
738 size_t len=0;
739
740 while (*(wxUint16*)psz && (!buf || len < n))
741 {
742 if (buf)
743 {
744 ((char *)buf)[0] = psz[1];
745 ((char *)buf)[1] = psz[0];
746 buf++;
747 }
748 len++;
749 psz += sizeof(wxUint16);
750 }
751 if (buf && len<n) *buf=0;
752
753 return len;
754 }
755
756
757 // swap 16bit MB to 16bit String
758 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
759 {
760 size_t len=0;
761
762 while (*psz && (!buf || len < n))
763 {
764 if (buf)
765 {
766 *buf++ = ((char*)psz)[1];
767 *buf++ = ((char*)psz)[0];
768 }
769 len += sizeof(wxUint16);
770 psz++;
771 }
772 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
773
774 return len;
775 }
776
777
778 #else // WC_UTF16
779
780
781 // copy 16bit MB to 32bit String
782 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
783 {
784 size_t len=0;
785
786 while (*(wxUint16*)psz && (!buf || len < n))
787 {
788 wxUint32 cc;
789 size_t pa=decode_utf16((wxUint16*)psz, cc);
790 if (pa == (size_t)-1)
791 return pa;
792
793 if (buf)
794 *buf++ = cc;
795 len++;
796 psz += pa * sizeof(wxUint16);
797 }
798 if (buf && len<n) *buf=0;
799
800 return len;
801 }
802
803
804 // copy 32bit String to 16bit MB
805 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
806 {
807 size_t len=0;
808
809 while (*psz && (!buf || len < n))
810 {
811 wxUint16 cc[2];
812 size_t pa=encode_utf16(*psz, cc);
813
814 if (pa == (size_t)-1)
815 return pa;
816
817 if (buf)
818 {
819 *(wxUint16*)buf = cc[0];
820 buf += sizeof(wxUint16);
821 if (pa > 1)
822 {
823 *(wxUint16*)buf = cc[1];
824 buf += sizeof(wxUint16);
825 }
826 }
827
828 len += pa*sizeof(wxUint16);
829 psz++;
830 }
831 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
832
833 return len;
834 }
835
836
837 // swap 16bit MB to 32bit String
838 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
839 {
840 size_t len=0;
841
842 while (*(wxUint16*)psz && (!buf || len < n))
843 {
844 wxUint32 cc;
845 char tmp[4];
846 tmp[0]=psz[1]; tmp[1]=psz[0];
847 tmp[2]=psz[3]; tmp[3]=psz[2];
848
849 size_t pa=decode_utf16((wxUint16*)tmp, cc);
850 if (pa == (size_t)-1)
851 return pa;
852
853 if (buf)
854 *buf++ = cc;
855
856 len++;
857 psz += pa * sizeof(wxUint16);
858 }
859 if (buf && len<n) *buf=0;
860
861 return len;
862 }
863
864
865 // swap 32bit String to 16bit MB
866 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
867 {
868 size_t len=0;
869
870 while (*psz && (!buf || len < n))
871 {
872 wxUint16 cc[2];
873 size_t pa=encode_utf16(*psz, cc);
874
875 if (pa == (size_t)-1)
876 return pa;
877
878 if (buf)
879 {
880 *buf++ = ((char*)cc)[1];
881 *buf++ = ((char*)cc)[0];
882 if (pa > 1)
883 {
884 *buf++ = ((char*)cc)[3];
885 *buf++ = ((char*)cc)[2];
886 }
887 }
888
889 len += pa*sizeof(wxUint16);
890 psz++;
891 }
892 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
893
894 return len;
895 }
896
897 #endif // WC_UTF16
898
899
900 // ----------------------------------------------------------------------------
901 // UTF-32
902 // ----------------------------------------------------------------------------
903
904 #ifdef WORDS_BIGENDIAN
905 #define wxMBConvUTF32straight wxMBConvUTF32BE
906 #define wxMBConvUTF32swap wxMBConvUTF32LE
907 #else
908 #define wxMBConvUTF32swap wxMBConvUTF32BE
909 #define wxMBConvUTF32straight wxMBConvUTF32LE
910 #endif
911
912
913 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
914 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
915
916
917 #ifdef WC_UTF16
918
919 // copy 32bit MB to 16bit String
920 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
921 {
922 size_t len=0;
923
924 while (*(wxUint32*)psz && (!buf || len < n))
925 {
926 wxUint16 cc[2];
927
928 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
929 if (pa == (size_t)-1)
930 return pa;
931
932 if (buf)
933 {
934 *buf++ = cc[0];
935 if (pa > 1)
936 *buf++ = cc[1];
937 }
938 len += pa;
939 psz += sizeof(wxUint32);
940 }
941 if (buf && len<n) *buf=0;
942
943 return len;
944 }
945
946
947 // copy 16bit String to 32bit MB
948 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
949 {
950 size_t len=0;
951
952 while (*psz && (!buf || len < n))
953 {
954 wxUint32 cc;
955
956 // cast is ok for WC_UTF16
957 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
958 if (pa == (size_t)-1)
959 return pa;
960
961 if (buf)
962 {
963 *(wxUint32*)buf = cc;
964 buf += sizeof(wxUint32);
965 }
966 len += sizeof(wxUint32);
967 psz += pa;
968 }
969
970 if (buf && len<=n-sizeof(wxUint32))
971 *(wxUint32*)buf=0;
972
973 return len;
974 }
975
976
977
978 // swap 32bit MB to 16bit String
979 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
980 {
981 size_t len=0;
982
983 while (*(wxUint32*)psz && (!buf || len < n))
984 {
985 char tmp[4];
986 tmp[0] = psz[3]; tmp[1] = psz[2];
987 tmp[2] = psz[1]; tmp[3] = psz[0];
988
989
990 wxUint16 cc[2];
991
992 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
993 if (pa == (size_t)-1)
994 return pa;
995
996 if (buf)
997 {
998 *buf++ = cc[0];
999 if (pa > 1)
1000 *buf++ = cc[1];
1001 }
1002 len += pa;
1003 psz += sizeof(wxUint32);
1004 }
1005
1006 if (buf && len<n)
1007 *buf=0;
1008
1009 return len;
1010 }
1011
1012
1013 // swap 16bit String to 32bit MB
1014 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1015 {
1016 size_t len=0;
1017
1018 while (*psz && (!buf || len < n))
1019 {
1020 char cc[4];
1021
1022 // cast is ok for WC_UTF16
1023 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1024 if (pa == (size_t)-1)
1025 return pa;
1026
1027 if (buf)
1028 {
1029 *buf++ = cc[3];
1030 *buf++ = cc[2];
1031 *buf++ = cc[1];
1032 *buf++ = cc[0];
1033 }
1034 len += sizeof(wxUint32);
1035 psz += pa;
1036 }
1037
1038 if (buf && len<=n-sizeof(wxUint32))
1039 *(wxUint32*)buf=0;
1040
1041 return len;
1042 }
1043
1044 #else // WC_UTF16
1045
1046
1047 // copy 32bit MB to 32bit String
1048 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1049 {
1050 size_t len=0;
1051
1052 while (*(wxUint32*)psz && (!buf || len < n))
1053 {
1054 if (buf)
1055 *buf++ = *(wxUint32*)psz;
1056 len++;
1057 psz += sizeof(wxUint32);
1058 }
1059
1060 if (buf && len<n)
1061 *buf=0;
1062
1063 return len;
1064 }
1065
1066
1067 // copy 32bit String to 32bit MB
1068 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1069 {
1070 size_t len=0;
1071
1072 while (*psz && (!buf || len < n))
1073 {
1074 if (buf)
1075 {
1076 *(wxUint32*)buf = *psz;
1077 buf += sizeof(wxUint32);
1078 }
1079
1080 len += sizeof(wxUint32);
1081 psz++;
1082 }
1083
1084 if (buf && len<=n-sizeof(wxUint32))
1085 *(wxUint32*)buf=0;
1086
1087 return len;
1088 }
1089
1090
1091 // swap 32bit MB to 32bit String
1092 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1093 {
1094 size_t len=0;
1095
1096 while (*(wxUint32*)psz && (!buf || len < n))
1097 {
1098 if (buf)
1099 {
1100 ((char *)buf)[0] = psz[3];
1101 ((char *)buf)[1] = psz[2];
1102 ((char *)buf)[2] = psz[1];
1103 ((char *)buf)[3] = psz[0];
1104 buf++;
1105 }
1106 len++;
1107 psz += sizeof(wxUint32);
1108 }
1109
1110 if (buf && len<n)
1111 *buf=0;
1112
1113 return len;
1114 }
1115
1116
1117 // swap 32bit String to 32bit MB
1118 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1119 {
1120 size_t len=0;
1121
1122 while (*psz && (!buf || len < n))
1123 {
1124 if (buf)
1125 {
1126 *buf++ = ((char *)psz)[3];
1127 *buf++ = ((char *)psz)[2];
1128 *buf++ = ((char *)psz)[1];
1129 *buf++ = ((char *)psz)[0];
1130 }
1131 len += sizeof(wxUint32);
1132 psz++;
1133 }
1134
1135 if (buf && len<=n-sizeof(wxUint32))
1136 *(wxUint32*)buf=0;
1137
1138 return len;
1139 }
1140
1141
1142 #endif // WC_UTF16
1143
1144
1145 // ============================================================================
1146 // The classes doing conversion using the iconv_xxx() functions
1147 // ============================================================================
1148
1149 #ifdef HAVE_ICONV
1150
1151 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
1152 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
1153 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
1154 // (which means error) and says there are 0 bytes left in the input buffer --
1155 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
1156 // this alternative test for iconv() failure.
1157 // [This bug does not appear in glibc 2.2.]
1158 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1159 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1160 (errno != E2BIG || bufLeft != 0))
1161 #else
1162 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1163 #endif
1164
1165 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1166
1167 // ----------------------------------------------------------------------------
1168 // wxMBConv_iconv: encapsulates an iconv character set
1169 // ----------------------------------------------------------------------------
1170
1171 class wxMBConv_iconv : public wxMBConv
1172 {
1173 public:
1174 wxMBConv_iconv(const wxChar *name);
1175 virtual ~wxMBConv_iconv();
1176
1177 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1178 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1179
1180 bool IsOk() const
1181 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1182
1183 protected:
1184 // the iconv handlers used to translate from multibyte to wide char and in
1185 // the other direction
1186 iconv_t m2w,
1187 w2m;
1188
1189 private:
1190 // the name (for iconv_open()) of a wide char charset -- if none is
1191 // available on this machine, it will remain NULL
1192 static const char *ms_wcCharsetName;
1193
1194 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1195 // different endian-ness than the native one
1196 static bool ms_wcNeedsSwap;
1197 };
1198
1199 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1200 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1201
1202 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1203 {
1204 // Do it the hard way
1205 char cname[100];
1206 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1207 cname[i] = (char) name[i];
1208
1209 // check for charset that represents wchar_t:
1210 if (ms_wcCharsetName == NULL)
1211 {
1212 ms_wcNeedsSwap = false;
1213
1214 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1215 ms_wcCharsetName = WC_NAME_BEST;
1216 m2w = iconv_open(ms_wcCharsetName, cname);
1217
1218 if (m2w == (iconv_t)-1)
1219 {
1220 // try charset w/o bytesex info (e.g. "UCS4")
1221 // and check for bytesex ourselves:
1222 ms_wcCharsetName = WC_NAME;
1223 m2w = iconv_open(ms_wcCharsetName, cname);
1224
1225 // last bet, try if it knows WCHAR_T pseudo-charset
1226 if (m2w == (iconv_t)-1)
1227 {
1228 ms_wcCharsetName = "WCHAR_T";
1229 m2w = iconv_open(ms_wcCharsetName, cname);
1230 }
1231
1232 if (m2w != (iconv_t)-1)
1233 {
1234 char buf[2], *bufPtr;
1235 wchar_t wbuf[2], *wbufPtr;
1236 size_t insz, outsz;
1237 size_t res;
1238
1239 buf[0] = 'A';
1240 buf[1] = 0;
1241 wbuf[0] = 0;
1242 insz = 2;
1243 outsz = SIZEOF_WCHAR_T * 2;
1244 wbufPtr = wbuf;
1245 bufPtr = buf;
1246
1247 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1248 (char**)&wbufPtr, &outsz);
1249
1250 if (ICONV_FAILED(res, insz))
1251 {
1252 ms_wcCharsetName = NULL;
1253 wxLogLastError(wxT("iconv"));
1254 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1255 }
1256 else
1257 {
1258 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1259 }
1260 }
1261 else
1262 {
1263 ms_wcCharsetName = NULL;
1264
1265 // VS: we must not output an error here, since wxWidgets will safely
1266 // fall back to using wxEncodingConverter.
1267 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1268 //wxLogError(
1269 }
1270 }
1271 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1272 }
1273 else // we already have ms_wcCharsetName
1274 {
1275 m2w = iconv_open(ms_wcCharsetName, cname);
1276 }
1277
1278 // NB: don't ever pass NULL to iconv_open(), it may crash!
1279 if ( ms_wcCharsetName )
1280 {
1281 w2m = iconv_open( cname, ms_wcCharsetName);
1282 }
1283 else
1284 {
1285 w2m = (iconv_t)-1;
1286 }
1287 }
1288
1289 wxMBConv_iconv::~wxMBConv_iconv()
1290 {
1291 if ( m2w != (iconv_t)-1 )
1292 iconv_close(m2w);
1293 if ( w2m != (iconv_t)-1 )
1294 iconv_close(w2m);
1295 }
1296
1297 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1298 {
1299 size_t inbuf = strlen(psz);
1300 size_t outbuf = n * SIZEOF_WCHAR_T;
1301 size_t res, cres;
1302 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1303 wchar_t *bufPtr = buf;
1304 const char *pszPtr = psz;
1305
1306 if (buf)
1307 {
1308 // have destination buffer, convert there
1309 cres = iconv(m2w,
1310 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1311 (char**)&bufPtr, &outbuf);
1312 res = n - (outbuf / SIZEOF_WCHAR_T);
1313
1314 if (ms_wcNeedsSwap)
1315 {
1316 // convert to native endianness
1317 WC_BSWAP(buf /* _not_ bufPtr */, res)
1318 }
1319
1320 // NB: iconv was given only strlen(psz) characters on input, and so
1321 // it couldn't convert the trailing zero. Let's do it ourselves
1322 // if there's some room left for it in the output buffer.
1323 if (res < n)
1324 buf[res] = 0;
1325 }
1326 else
1327 {
1328 // no destination buffer... convert using temp buffer
1329 // to calculate destination buffer requirement
1330 wchar_t tbuf[8];
1331 res = 0;
1332 do {
1333 bufPtr = tbuf;
1334 outbuf = 8*SIZEOF_WCHAR_T;
1335
1336 cres = iconv(m2w,
1337 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1338 (char**)&bufPtr, &outbuf );
1339
1340 res += 8-(outbuf/SIZEOF_WCHAR_T);
1341 } while ((cres==(size_t)-1) && (errno==E2BIG));
1342 }
1343
1344 if (ICONV_FAILED(cres, inbuf))
1345 {
1346 //VS: it is ok if iconv fails, hence trace only
1347 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1348 return (size_t)-1;
1349 }
1350
1351 return res;
1352 }
1353
1354 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1355 {
1356 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1357 size_t outbuf = n;
1358 size_t res, cres;
1359
1360 wchar_t *tmpbuf = 0;
1361
1362 if (ms_wcNeedsSwap)
1363 {
1364 // need to copy to temp buffer to switch endianness
1365 // this absolutely doesn't rock!
1366 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1367 // could be in read-only memory, or be accessed in some other thread)
1368 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1369 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1370 WC_BSWAP(tmpbuf, inbuf)
1371 psz=tmpbuf;
1372 }
1373
1374 if (buf)
1375 {
1376 // have destination buffer, convert there
1377 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1378
1379 res = n-outbuf;
1380
1381 // NB: iconv was given only wcslen(psz) characters on input, and so
1382 // it couldn't convert the trailing zero. Let's do it ourselves
1383 // if there's some room left for it in the output buffer.
1384 if (res < n)
1385 buf[0] = 0;
1386 }
1387 else
1388 {
1389 // no destination buffer... convert using temp buffer
1390 // to calculate destination buffer requirement
1391 char tbuf[16];
1392 res = 0;
1393 do {
1394 buf = tbuf; outbuf = 16;
1395
1396 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1397
1398 res += 16 - outbuf;
1399 } while ((cres==(size_t)-1) && (errno==E2BIG));
1400 }
1401
1402 if (ms_wcNeedsSwap)
1403 {
1404 free(tmpbuf);
1405 }
1406
1407 if (ICONV_FAILED(cres, inbuf))
1408 {
1409 //VS: it is ok if iconv fails, hence trace only
1410 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1411 return (size_t)-1;
1412 }
1413
1414 return res;
1415 }
1416
1417 #endif // HAVE_ICONV
1418
1419
1420 // ============================================================================
1421 // Win32 conversion classes
1422 // ============================================================================
1423
1424 #ifdef wxHAVE_WIN32_MB2WC
1425
1426 // from utils.cpp
1427 #if wxUSE_FONTMAP
1428 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1429 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1430 #endif
1431
1432 class wxMBConv_win32 : public wxMBConv
1433 {
1434 public:
1435 wxMBConv_win32()
1436 {
1437 m_CodePage = CP_ACP;
1438 }
1439
1440 #if wxUSE_FONTMAP
1441 wxMBConv_win32(const wxChar* name)
1442 {
1443 m_CodePage = wxCharsetToCodepage(name);
1444 }
1445
1446 wxMBConv_win32(wxFontEncoding encoding)
1447 {
1448 m_CodePage = wxEncodingToCodepage(encoding);
1449 }
1450 #endif
1451
1452 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1453 {
1454 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1455 // the behaviour is not compatible with the Unix version (using iconv)
1456 // and break the library itself, e.g. wxTextInputStream::NextChar()
1457 // wouldn't work if reading an incomplete MB char didn't result in an
1458 // error
1459 const size_t len = ::MultiByteToWideChar
1460 (
1461 m_CodePage, // code page
1462 MB_ERR_INVALID_CHARS, // flags: fall on error
1463 psz, // input string
1464 -1, // its length (NUL-terminated)
1465 buf, // output string
1466 buf ? n : 0 // size of output buffer
1467 );
1468
1469 // note that it returns count of written chars for buf != NULL and size
1470 // of the needed buffer for buf == NULL so in either case the length of
1471 // the string (which never includes the terminating NUL) is one less
1472 return len ? len - 1 : (size_t)-1;
1473 }
1474
1475 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1476 {
1477 /*
1478 we have a problem here: by default, WideCharToMultiByte() may
1479 replace characters unrepresentable in the target code page with bad
1480 quality approximations such as turning "1/2" symbol (U+00BD) into
1481 "1" for the code pages which don't have it and we, obviously, want
1482 to avoid this at any price
1483
1484 the trouble is that this function does it _silently_, i.e. it won't
1485 even tell us whether it did or not... Win98/2000 and higher provide
1486 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1487 we have to resort to a round trip, i.e. check that converting back
1488 results in the same string -- this is, of course, expensive but
1489 otherwise we simply can't be sure to not garble the data.
1490 */
1491
1492 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1493 // it doesn't work with CJK encodings (which we test for rather roughly
1494 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1495 // supporting it
1496 BOOL usedDef wxDUMMY_INITIALIZE(false);
1497 BOOL *pUsedDef;
1498 int flags;
1499 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1500 {
1501 // it's our lucky day
1502 flags = WC_NO_BEST_FIT_CHARS;
1503 pUsedDef = &usedDef;
1504 }
1505 else // old system or unsupported encoding
1506 {
1507 flags = 0;
1508 pUsedDef = NULL;
1509 }
1510
1511 const size_t len = ::WideCharToMultiByte
1512 (
1513 m_CodePage, // code page
1514 flags, // either none or no best fit
1515 pwz, // input string
1516 -1, // it is (wide) NUL-terminated
1517 buf, // output buffer
1518 buf ? n : 0, // and its size
1519 NULL, // default "replacement" char
1520 pUsedDef // [out] was it used?
1521 );
1522
1523 if ( !len )
1524 {
1525 // function totally failed
1526 return (size_t)-1;
1527 }
1528
1529 // if we were really converting, check if we succeeded
1530 if ( buf )
1531 {
1532 if ( flags )
1533 {
1534 // check if the conversion failed, i.e. if any replacements
1535 // were done
1536 if ( usedDef )
1537 return (size_t)-1;
1538 }
1539 else // we must resort to double tripping...
1540 {
1541 wxWCharBuffer wcBuf(n);
1542 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1543 wcscmp(wcBuf, pwz) != 0 )
1544 {
1545 // we didn't obtain the same thing we started from, hence
1546 // the conversion was lossy and we consider that it failed
1547 return (size_t)-1;
1548 }
1549 }
1550 }
1551
1552 // see the comment above for the reason of "len - 1"
1553 return len - 1;
1554 }
1555
1556 bool IsOk() const { return m_CodePage != -1; }
1557
1558 private:
1559 static bool CanUseNoBestFit()
1560 {
1561 static int s_isWin98Or2k = -1;
1562
1563 if ( s_isWin98Or2k == -1 )
1564 {
1565 int verMaj, verMin;
1566 switch ( wxGetOsVersion(&verMaj, &verMin) )
1567 {
1568 case wxWIN95:
1569 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1570 break;
1571
1572 case wxWINDOWS_NT:
1573 s_isWin98Or2k = verMaj >= 5;
1574 break;
1575
1576 default:
1577 // unknown, be conseravtive by default
1578 s_isWin98Or2k = 0;
1579 }
1580
1581 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1582 }
1583
1584 return s_isWin98Or2k == 1;
1585 }
1586
1587 long m_CodePage;
1588 };
1589
1590 #endif // wxHAVE_WIN32_MB2WC
1591
1592 // ============================================================================
1593 // Cocoa conversion classes
1594 // ============================================================================
1595
1596 #if defined(__WXCOCOA__)
1597
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1601
1602 #include <CoreFoundation/CFString.h>
1603 #include <CoreFoundation/CFStringEncodingExt.h>
1604
1605 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1606 {
1607 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1608 if ( encoding == wxFONTENCODING_DEFAULT )
1609 {
1610 enc = CFStringGetSystemEncoding();
1611 }
1612 else switch( encoding)
1613 {
1614 case wxFONTENCODING_ISO8859_1 :
1615 enc = kCFStringEncodingISOLatin1 ;
1616 break ;
1617 case wxFONTENCODING_ISO8859_2 :
1618 enc = kCFStringEncodingISOLatin2;
1619 break ;
1620 case wxFONTENCODING_ISO8859_3 :
1621 enc = kCFStringEncodingISOLatin3 ;
1622 break ;
1623 case wxFONTENCODING_ISO8859_4 :
1624 enc = kCFStringEncodingISOLatin4;
1625 break ;
1626 case wxFONTENCODING_ISO8859_5 :
1627 enc = kCFStringEncodingISOLatinCyrillic;
1628 break ;
1629 case wxFONTENCODING_ISO8859_6 :
1630 enc = kCFStringEncodingISOLatinArabic;
1631 break ;
1632 case wxFONTENCODING_ISO8859_7 :
1633 enc = kCFStringEncodingISOLatinGreek;
1634 break ;
1635 case wxFONTENCODING_ISO8859_8 :
1636 enc = kCFStringEncodingISOLatinHebrew;
1637 break ;
1638 case wxFONTENCODING_ISO8859_9 :
1639 enc = kCFStringEncodingISOLatin5;
1640 break ;
1641 case wxFONTENCODING_ISO8859_10 :
1642 enc = kCFStringEncodingISOLatin6;
1643 break ;
1644 case wxFONTENCODING_ISO8859_11 :
1645 enc = kCFStringEncodingISOLatinThai;
1646 break ;
1647 case wxFONTENCODING_ISO8859_13 :
1648 enc = kCFStringEncodingISOLatin7;
1649 break ;
1650 case wxFONTENCODING_ISO8859_14 :
1651 enc = kCFStringEncodingISOLatin8;
1652 break ;
1653 case wxFONTENCODING_ISO8859_15 :
1654 enc = kCFStringEncodingISOLatin9;
1655 break ;
1656
1657 case wxFONTENCODING_KOI8 :
1658 enc = kCFStringEncodingKOI8_R;
1659 break ;
1660 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1661 enc = kCFStringEncodingDOSRussian;
1662 break ;
1663
1664 // case wxFONTENCODING_BULGARIAN :
1665 // enc = ;
1666 // break ;
1667
1668 case wxFONTENCODING_CP437 :
1669 enc =kCFStringEncodingDOSLatinUS ;
1670 break ;
1671 case wxFONTENCODING_CP850 :
1672 enc = kCFStringEncodingDOSLatin1;
1673 break ;
1674 case wxFONTENCODING_CP852 :
1675 enc = kCFStringEncodingDOSLatin2;
1676 break ;
1677 case wxFONTENCODING_CP855 :
1678 enc = kCFStringEncodingDOSCyrillic;
1679 break ;
1680 case wxFONTENCODING_CP866 :
1681 enc =kCFStringEncodingDOSRussian ;
1682 break ;
1683 case wxFONTENCODING_CP874 :
1684 enc = kCFStringEncodingDOSThai;
1685 break ;
1686 case wxFONTENCODING_CP932 :
1687 enc = kCFStringEncodingDOSJapanese;
1688 break ;
1689 case wxFONTENCODING_CP936 :
1690 enc =kCFStringEncodingDOSChineseSimplif ;
1691 break ;
1692 case wxFONTENCODING_CP949 :
1693 enc = kCFStringEncodingDOSKorean;
1694 break ;
1695 case wxFONTENCODING_CP950 :
1696 enc = kCFStringEncodingDOSChineseTrad;
1697 break ;
1698 case wxFONTENCODING_CP1250 :
1699 enc = kCFStringEncodingWindowsLatin2;
1700 break ;
1701 case wxFONTENCODING_CP1251 :
1702 enc =kCFStringEncodingWindowsCyrillic ;
1703 break ;
1704 case wxFONTENCODING_CP1252 :
1705 enc =kCFStringEncodingWindowsLatin1 ;
1706 break ;
1707 case wxFONTENCODING_CP1253 :
1708 enc = kCFStringEncodingWindowsGreek;
1709 break ;
1710 case wxFONTENCODING_CP1254 :
1711 enc = kCFStringEncodingWindowsLatin5;
1712 break ;
1713 case wxFONTENCODING_CP1255 :
1714 enc =kCFStringEncodingWindowsHebrew ;
1715 break ;
1716 case wxFONTENCODING_CP1256 :
1717 enc =kCFStringEncodingWindowsArabic ;
1718 break ;
1719 case wxFONTENCODING_CP1257 :
1720 enc = kCFStringEncodingWindowsBalticRim;
1721 break ;
1722 // This only really encodes to UTF7 (if that) evidently
1723 // case wxFONTENCODING_UTF7 :
1724 // enc = kCFStringEncodingNonLossyASCII ;
1725 // break ;
1726 case wxFONTENCODING_UTF8 :
1727 enc = kCFStringEncodingUTF8 ;
1728 break ;
1729 case wxFONTENCODING_EUC_JP :
1730 enc = kCFStringEncodingEUC_JP;
1731 break ;
1732 case wxFONTENCODING_UTF16 :
1733 enc = kCFStringEncodingUnicode ;
1734 break ;
1735 case wxFONTENCODING_MACROMAN :
1736 enc = kCFStringEncodingMacRoman ;
1737 break ;
1738 case wxFONTENCODING_MACJAPANESE :
1739 enc = kCFStringEncodingMacJapanese ;
1740 break ;
1741 case wxFONTENCODING_MACCHINESETRAD :
1742 enc = kCFStringEncodingMacChineseTrad ;
1743 break ;
1744 case wxFONTENCODING_MACKOREAN :
1745 enc = kCFStringEncodingMacKorean ;
1746 break ;
1747 case wxFONTENCODING_MACARABIC :
1748 enc = kCFStringEncodingMacArabic ;
1749 break ;
1750 case wxFONTENCODING_MACHEBREW :
1751 enc = kCFStringEncodingMacHebrew ;
1752 break ;
1753 case wxFONTENCODING_MACGREEK :
1754 enc = kCFStringEncodingMacGreek ;
1755 break ;
1756 case wxFONTENCODING_MACCYRILLIC :
1757 enc = kCFStringEncodingMacCyrillic ;
1758 break ;
1759 case wxFONTENCODING_MACDEVANAGARI :
1760 enc = kCFStringEncodingMacDevanagari ;
1761 break ;
1762 case wxFONTENCODING_MACGURMUKHI :
1763 enc = kCFStringEncodingMacGurmukhi ;
1764 break ;
1765 case wxFONTENCODING_MACGUJARATI :
1766 enc = kCFStringEncodingMacGujarati ;
1767 break ;
1768 case wxFONTENCODING_MACORIYA :
1769 enc = kCFStringEncodingMacOriya ;
1770 break ;
1771 case wxFONTENCODING_MACBENGALI :
1772 enc = kCFStringEncodingMacBengali ;
1773 break ;
1774 case wxFONTENCODING_MACTAMIL :
1775 enc = kCFStringEncodingMacTamil ;
1776 break ;
1777 case wxFONTENCODING_MACTELUGU :
1778 enc = kCFStringEncodingMacTelugu ;
1779 break ;
1780 case wxFONTENCODING_MACKANNADA :
1781 enc = kCFStringEncodingMacKannada ;
1782 break ;
1783 case wxFONTENCODING_MACMALAJALAM :
1784 enc = kCFStringEncodingMacMalayalam ;
1785 break ;
1786 case wxFONTENCODING_MACSINHALESE :
1787 enc = kCFStringEncodingMacSinhalese ;
1788 break ;
1789 case wxFONTENCODING_MACBURMESE :
1790 enc = kCFStringEncodingMacBurmese ;
1791 break ;
1792 case wxFONTENCODING_MACKHMER :
1793 enc = kCFStringEncodingMacKhmer ;
1794 break ;
1795 case wxFONTENCODING_MACTHAI :
1796 enc = kCFStringEncodingMacThai ;
1797 break ;
1798 case wxFONTENCODING_MACLAOTIAN :
1799 enc = kCFStringEncodingMacLaotian ;
1800 break ;
1801 case wxFONTENCODING_MACGEORGIAN :
1802 enc = kCFStringEncodingMacGeorgian ;
1803 break ;
1804 case wxFONTENCODING_MACARMENIAN :
1805 enc = kCFStringEncodingMacArmenian ;
1806 break ;
1807 case wxFONTENCODING_MACCHINESESIMP :
1808 enc = kCFStringEncodingMacChineseSimp ;
1809 break ;
1810 case wxFONTENCODING_MACTIBETAN :
1811 enc = kCFStringEncodingMacTibetan ;
1812 break ;
1813 case wxFONTENCODING_MACMONGOLIAN :
1814 enc = kCFStringEncodingMacMongolian ;
1815 break ;
1816 case wxFONTENCODING_MACETHIOPIC :
1817 enc = kCFStringEncodingMacEthiopic ;
1818 break ;
1819 case wxFONTENCODING_MACCENTRALEUR :
1820 enc = kCFStringEncodingMacCentralEurRoman ;
1821 break ;
1822 case wxFONTENCODING_MACVIATNAMESE :
1823 enc = kCFStringEncodingMacVietnamese ;
1824 break ;
1825 case wxFONTENCODING_MACARABICEXT :
1826 enc = kCFStringEncodingMacExtArabic ;
1827 break ;
1828 case wxFONTENCODING_MACSYMBOL :
1829 enc = kCFStringEncodingMacSymbol ;
1830 break ;
1831 case wxFONTENCODING_MACDINGBATS :
1832 enc = kCFStringEncodingMacDingbats ;
1833 break ;
1834 case wxFONTENCODING_MACTURKISH :
1835 enc = kCFStringEncodingMacTurkish ;
1836 break ;
1837 case wxFONTENCODING_MACCROATIAN :
1838 enc = kCFStringEncodingMacCroatian ;
1839 break ;
1840 case wxFONTENCODING_MACICELANDIC :
1841 enc = kCFStringEncodingMacIcelandic ;
1842 break ;
1843 case wxFONTENCODING_MACROMANIAN :
1844 enc = kCFStringEncodingMacRomanian ;
1845 break ;
1846 case wxFONTENCODING_MACCELTIC :
1847 enc = kCFStringEncodingMacCeltic ;
1848 break ;
1849 case wxFONTENCODING_MACGAELIC :
1850 enc = kCFStringEncodingMacGaelic ;
1851 break ;
1852 // case wxFONTENCODING_MACKEYBOARD :
1853 // enc = kCFStringEncodingMacKeyboardGlyphs ;
1854 // break ;
1855 default :
1856 // because gcc is picky
1857 break ;
1858 } ;
1859 return enc ;
1860 }
1861
1862 class wxMBConv_cocoa : public wxMBConv
1863 {
1864 public:
1865 wxMBConv_cocoa()
1866 {
1867 Init(CFStringGetSystemEncoding()) ;
1868 }
1869
1870 wxMBConv_cocoa(const wxChar* name)
1871 {
1872 Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
1873 }
1874
1875 wxMBConv_cocoa(wxFontEncoding encoding)
1876 {
1877 Init( wxCFStringEncFromFontEnc(encoding) );
1878 }
1879
1880 ~wxMBConv_cocoa()
1881 {
1882 }
1883
1884 void Init( CFStringEncoding encoding)
1885 {
1886 m_encoding = encoding ;
1887 }
1888
1889 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1890 {
1891 wxASSERT(szUnConv);
1892
1893 CFStringRef theString = CFStringCreateWithBytes (
1894 NULL, //the allocator
1895 (const UInt8*)szUnConv,
1896 strlen(szUnConv),
1897 m_encoding,
1898 false //no BOM/external representation
1899 );
1900
1901 wxASSERT(theString);
1902
1903 size_t nOutLength = CFStringGetLength(theString);
1904
1905 if (szOut == NULL)
1906 {
1907 CFRelease(theString);
1908 return nOutLength;
1909 }
1910
1911 CFRange theRange = { 0, nOutSize };
1912
1913 #if SIZEOF_WCHAR_T == 4
1914 UniChar* szUniCharBuffer = new UniChar[nOutSize];
1915 #endif
1916
1917 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
1918
1919 CFRelease(theString);
1920
1921 szUniCharBuffer[nOutLength] = '\0' ;
1922
1923 #if SIZEOF_WCHAR_T == 4
1924 wxMBConvUTF16 converter ;
1925 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
1926 delete[] szUniCharBuffer;
1927 #endif
1928
1929 return nOutLength;
1930 }
1931
1932 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1933 {
1934 wxASSERT(szUnConv);
1935
1936 size_t nRealOutSize;
1937 size_t nBufSize = wxWcslen(szUnConv);
1938 UniChar* szUniBuffer = (UniChar*) szUnConv;
1939
1940 #if SIZEOF_WCHAR_T == 4
1941 wxMBConvUTF16BE converter ;
1942 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
1943 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
1944 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
1945 nBufSize /= sizeof(UniChar);
1946 #endif
1947
1948 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
1949 NULL, //allocator
1950 szUniBuffer,
1951 nBufSize,
1952 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
1953 );
1954
1955 wxASSERT(theString);
1956
1957 //Note that CER puts a BOM when converting to unicode
1958 //so we check and use getchars instead in that case
1959 if (m_encoding == kCFStringEncodingUnicode)
1960 {
1961 if (szOut != NULL)
1962 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
1963
1964 nRealOutSize = CFStringGetLength(theString) + 1;
1965 }
1966 else
1967 {
1968 CFStringGetBytes(
1969 theString,
1970 CFRangeMake(0, CFStringGetLength(theString)),
1971 m_encoding,
1972 0, //what to put in characters that can't be converted -
1973 //0 tells CFString to return NULL if it meets such a character
1974 false, //not an external representation
1975 (UInt8*) szOut,
1976 nOutSize,
1977 (CFIndex*) &nRealOutSize
1978 );
1979 }
1980
1981 CFRelease(theString);
1982
1983 #if SIZEOF_WCHAR_T == 4
1984 delete[] szUniBuffer;
1985 #endif
1986
1987 return nRealOutSize - 1;
1988 }
1989
1990 bool IsOk() const
1991 {
1992 return m_encoding != kCFStringEncodingInvalidId &&
1993 CFStringIsEncodingAvailable(m_encoding);
1994 }
1995
1996 private:
1997 CFStringEncoding m_encoding ;
1998 };
1999
2000 #endif // defined(__WXCOCOA__)
2001
2002 // ============================================================================
2003 // Mac conversion classes
2004 // ============================================================================
2005
2006 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2007
2008 class wxMBConv_mac : public wxMBConv
2009 {
2010 public:
2011 wxMBConv_mac()
2012 {
2013 Init(CFStringGetSystemEncoding()) ;
2014 }
2015
2016 wxMBConv_mac(const wxChar* name)
2017 {
2018 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
2019 }
2020
2021 wxMBConv_mac(wxFontEncoding encoding)
2022 {
2023 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2024 }
2025
2026 ~wxMBConv_mac()
2027 {
2028 OSStatus status = noErr ;
2029 status = TECDisposeConverter(m_MB2WC_converter);
2030 status = TECDisposeConverter(m_WC2MB_converter);
2031 }
2032
2033
2034 void Init( TextEncodingBase encoding)
2035 {
2036 OSStatus status = noErr ;
2037 m_char_encoding = encoding ;
2038 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2039
2040 status = TECCreateConverter(&m_MB2WC_converter,
2041 m_char_encoding,
2042 m_unicode_encoding);
2043 status = TECCreateConverter(&m_WC2MB_converter,
2044 m_unicode_encoding,
2045 m_char_encoding);
2046 }
2047
2048 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2049 {
2050 OSStatus status = noErr ;
2051 ByteCount byteOutLen ;
2052 ByteCount byteInLen = strlen(psz) ;
2053 wchar_t *tbuf = NULL ;
2054 UniChar* ubuf = NULL ;
2055 size_t res = 0 ;
2056
2057 if (buf == NULL)
2058 {
2059 //apple specs say at least 32
2060 n = wxMax( 32 , byteInLen ) ;
2061 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2062 }
2063 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2064 #if SIZEOF_WCHAR_T == 4
2065 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2066 #else
2067 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2068 #endif
2069 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2070 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2071 #if SIZEOF_WCHAR_T == 4
2072 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2073 // is not properly terminated we get random characters at the end
2074 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2075 wxMBConvUTF16BE converter ;
2076 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2077 free( ubuf ) ;
2078 #else
2079 res = byteOutLen / sizeof( UniChar ) ;
2080 #endif
2081 if ( buf == NULL )
2082 free(tbuf) ;
2083
2084 if ( buf && res < n)
2085 buf[res] = 0;
2086
2087 return res ;
2088 }
2089
2090 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2091 {
2092 OSStatus status = noErr ;
2093 ByteCount byteOutLen ;
2094 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2095
2096 char *tbuf = NULL ;
2097
2098 if (buf == NULL)
2099 {
2100 //apple specs say at least 32
2101 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2102 tbuf = (char*) malloc( n ) ;
2103 }
2104
2105 ByteCount byteBufferLen = n ;
2106 UniChar* ubuf = NULL ;
2107 #if SIZEOF_WCHAR_T == 4
2108 wxMBConvUTF16BE converter ;
2109 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2110 byteInLen = unicharlen ;
2111 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2112 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2113 #else
2114 ubuf = (UniChar*) psz ;
2115 #endif
2116 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2117 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2118 #if SIZEOF_WCHAR_T == 4
2119 free( ubuf ) ;
2120 #endif
2121 if ( buf == NULL )
2122 free(tbuf) ;
2123
2124 size_t res = byteOutLen ;
2125 if ( buf && res < n)
2126 {
2127 buf[res] = 0;
2128
2129 //we need to double-trip to verify it didn't insert any ? in place
2130 //of bogus characters
2131 wxWCharBuffer wcBuf(n);
2132 size_t pszlen = wxWcslen(psz);
2133 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2134 wxWcslen(wcBuf) != pszlen ||
2135 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2136 {
2137 // we didn't obtain the same thing we started from, hence
2138 // the conversion was lossy and we consider that it failed
2139 return (size_t)-1;
2140 }
2141 }
2142
2143 return res ;
2144 }
2145
2146 bool IsOk() const
2147 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2148
2149 private:
2150 TECObjectRef m_MB2WC_converter ;
2151 TECObjectRef m_WC2MB_converter ;
2152
2153 TextEncodingBase m_char_encoding ;
2154 TextEncodingBase m_unicode_encoding ;
2155 };
2156
2157 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2158
2159 // ============================================================================
2160 // wxEncodingConverter based conversion classes
2161 // ============================================================================
2162
2163 #if wxUSE_FONTMAP
2164
2165 class wxMBConv_wxwin : public wxMBConv
2166 {
2167 private:
2168 void Init()
2169 {
2170 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2171 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2172 }
2173
2174 public:
2175 // temporarily just use wxEncodingConverter stuff,
2176 // so that it works while a better implementation is built
2177 wxMBConv_wxwin(const wxChar* name)
2178 {
2179 if (name)
2180 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
2181 else
2182 m_enc = wxFONTENCODING_SYSTEM;
2183
2184 Init();
2185 }
2186
2187 wxMBConv_wxwin(wxFontEncoding enc)
2188 {
2189 m_enc = enc;
2190
2191 Init();
2192 }
2193
2194 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2195 {
2196 size_t inbuf = strlen(psz);
2197 if (buf)
2198 m2w.Convert(psz,buf);
2199 return inbuf;
2200 }
2201
2202 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2203 {
2204 const size_t inbuf = wxWcslen(psz);
2205 if (buf)
2206 w2m.Convert(psz,buf);
2207
2208 return inbuf;
2209 }
2210
2211 bool IsOk() const { return m_ok; }
2212
2213 public:
2214 wxFontEncoding m_enc;
2215 wxEncodingConverter m2w, w2m;
2216
2217 // were we initialized successfully?
2218 bool m_ok;
2219
2220 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2221 };
2222
2223 #endif // wxUSE_FONTMAP
2224
2225 // ============================================================================
2226 // wxCSConv implementation
2227 // ============================================================================
2228
2229 void wxCSConv::Init()
2230 {
2231 m_name = NULL;
2232 m_convReal = NULL;
2233 m_deferred = true;
2234 }
2235
2236 wxCSConv::wxCSConv(const wxChar *charset)
2237 {
2238 Init();
2239
2240 if ( charset )
2241 {
2242 SetName(charset);
2243 }
2244
2245 m_encoding = wxFONTENCODING_SYSTEM;
2246 }
2247
2248 wxCSConv::wxCSConv(wxFontEncoding encoding)
2249 {
2250 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2251 {
2252 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2253
2254 encoding = wxFONTENCODING_SYSTEM;
2255 }
2256
2257 Init();
2258
2259 m_encoding = encoding;
2260 }
2261
2262 wxCSConv::~wxCSConv()
2263 {
2264 Clear();
2265 }
2266
2267 wxCSConv::wxCSConv(const wxCSConv& conv)
2268 : wxMBConv()
2269 {
2270 Init();
2271
2272 SetName(conv.m_name);
2273 m_encoding = conv.m_encoding;
2274 }
2275
2276 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2277 {
2278 Clear();
2279
2280 SetName(conv.m_name);
2281 m_encoding = conv.m_encoding;
2282
2283 return *this;
2284 }
2285
2286 void wxCSConv::Clear()
2287 {
2288 free(m_name);
2289 delete m_convReal;
2290
2291 m_name = NULL;
2292 m_convReal = NULL;
2293 }
2294
2295 void wxCSConv::SetName(const wxChar *charset)
2296 {
2297 if (charset)
2298 {
2299 m_name = wxStrdup(charset);
2300 m_deferred = true;
2301 }
2302 }
2303
2304 wxMBConv *wxCSConv::DoCreate() const
2305 {
2306 // check for the special case of ASCII or ISO8859-1 charset: as we have
2307 // special knowledge of it anyhow, we don't need to create a special
2308 // conversion object
2309 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2310 {
2311 // don't convert at all
2312 return NULL;
2313 }
2314
2315 // we trust OS to do conversion better than we can so try external
2316 // conversion methods first
2317 //
2318 // the full order is:
2319 // 1. OS conversion (iconv() under Unix or Win32 API)
2320 // 2. hard coded conversions for UTF
2321 // 3. wxEncodingConverter as fall back
2322
2323 // step (1)
2324 #ifdef HAVE_ICONV
2325 #if !wxUSE_FONTMAP
2326 if ( m_name )
2327 #endif // !wxUSE_FONTMAP
2328 {
2329 wxString name(m_name);
2330
2331 #if wxUSE_FONTMAP
2332 if ( name.empty() )
2333 name = wxFontMapper::Get()->GetEncodingName(m_encoding);
2334 #endif // wxUSE_FONTMAP
2335
2336 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2337 if ( conv->IsOk() )
2338 return conv;
2339
2340 delete conv;
2341 }
2342 #endif // HAVE_ICONV
2343
2344 #ifdef wxHAVE_WIN32_MB2WC
2345 {
2346 #if wxUSE_FONTMAP
2347 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2348 : new wxMBConv_win32(m_encoding);
2349 if ( conv->IsOk() )
2350 return conv;
2351
2352 delete conv;
2353 #else
2354 return NULL;
2355 #endif
2356 }
2357 #endif // wxHAVE_WIN32_MB2WC
2358 #if defined(__WXMAC__)
2359 {
2360 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
2361 {
2362
2363 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2364 : new wxMBConv_mac(m_encoding);
2365 if ( conv->IsOk() )
2366 return conv;
2367
2368 delete conv;
2369 }
2370 }
2371 #endif
2372 #if defined(__WXCOCOA__)
2373 {
2374 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2375 {
2376
2377 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2378 : new wxMBConv_cocoa(m_encoding);
2379 if ( conv->IsOk() )
2380 return conv;
2381
2382 delete conv;
2383 }
2384 }
2385 #endif
2386 // step (2)
2387 wxFontEncoding enc = m_encoding;
2388 #if wxUSE_FONTMAP
2389 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2390 {
2391 // use "false" to suppress interactive dialogs -- we can be called from
2392 // anywhere and popping up a dialog from here is the last thing we want to
2393 // do
2394 enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
2395 }
2396 #endif // wxUSE_FONTMAP
2397
2398 switch ( enc )
2399 {
2400 case wxFONTENCODING_UTF7:
2401 return new wxMBConvUTF7;
2402
2403 case wxFONTENCODING_UTF8:
2404 return new wxMBConvUTF8;
2405
2406 case wxFONTENCODING_UTF16BE:
2407 return new wxMBConvUTF16BE;
2408
2409 case wxFONTENCODING_UTF16LE:
2410 return new wxMBConvUTF16LE;
2411
2412 case wxFONTENCODING_UTF32BE:
2413 return new wxMBConvUTF32BE;
2414
2415 case wxFONTENCODING_UTF32LE:
2416 return new wxMBConvUTF32LE;
2417
2418 default:
2419 // nothing to do but put here to suppress gcc warnings
2420 ;
2421 }
2422
2423 // step (3)
2424 #if wxUSE_FONTMAP
2425 {
2426 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2427 : new wxMBConv_wxwin(m_encoding);
2428 if ( conv->IsOk() )
2429 return conv;
2430
2431 delete conv;
2432 }
2433 #endif // wxUSE_FONTMAP
2434
2435 // NB: This is a hack to prevent deadlock. What could otherwise happen
2436 // in Unicode build: wxConvLocal creation ends up being here
2437 // because of some failure and logs the error. But wxLog will try to
2438 // attach timestamp, for which it will need wxConvLocal (to convert
2439 // time to char* and then wchar_t*), but that fails, tries to log
2440 // error, but wxLog has a (already locked) critical section that
2441 // guards static buffer.
2442 static bool alreadyLoggingError = false;
2443 if (!alreadyLoggingError)
2444 {
2445 alreadyLoggingError = true;
2446 wxLogError(_("Cannot convert from the charset '%s'!"),
2447 m_name ? m_name
2448 :
2449 #if wxUSE_FONTMAP
2450 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
2451 #else // !wxUSE_FONTMAP
2452 wxString::Format(_("encoding %s"), m_encoding).c_str()
2453 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2454 );
2455 alreadyLoggingError = false;
2456 }
2457
2458 return NULL;
2459 }
2460
2461 void wxCSConv::CreateConvIfNeeded() const
2462 {
2463 if ( m_deferred )
2464 {
2465 wxCSConv *self = (wxCSConv *)this; // const_cast
2466
2467 #if wxUSE_INTL
2468 // if we don't have neither the name nor the encoding, use the default
2469 // encoding for this system
2470 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2471 {
2472 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2473 }
2474 #endif // wxUSE_INTL
2475
2476 self->m_convReal = DoCreate();
2477 self->m_deferred = false;
2478 }
2479 }
2480
2481 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2482 {
2483 CreateConvIfNeeded();
2484
2485 if (m_convReal)
2486 return m_convReal->MB2WC(buf, psz, n);
2487
2488 // latin-1 (direct)
2489 size_t len = strlen(psz);
2490
2491 if (buf)
2492 {
2493 for (size_t c = 0; c <= len; c++)
2494 buf[c] = (unsigned char)(psz[c]);
2495 }
2496
2497 return len;
2498 }
2499
2500 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2501 {
2502 CreateConvIfNeeded();
2503
2504 if (m_convReal)
2505 return m_convReal->WC2MB(buf, psz, n);
2506
2507 // latin-1 (direct)
2508 const size_t len = wxWcslen(psz);
2509 if (buf)
2510 {
2511 for (size_t c = 0; c <= len; c++)
2512 {
2513 if (psz[c] > 0xFF)
2514 return (size_t)-1;
2515 buf[c] = (char)psz[c];
2516 }
2517 }
2518 else
2519 {
2520 for (size_t c = 0; c <= len; c++)
2521 {
2522 if (psz[c] > 0xFF)
2523 return (size_t)-1;
2524 }
2525 }
2526
2527 return len;
2528 }
2529
2530 // ----------------------------------------------------------------------------
2531 // globals
2532 // ----------------------------------------------------------------------------
2533
2534 #ifdef __WINDOWS__
2535 static wxMBConv_win32 wxConvLibcObj;
2536 #elif defined(__WXMAC__) && !defined(__MACH__)
2537 static wxMBConv_mac wxConvLibcObj ;
2538 #else
2539 static wxMBConvLibc wxConvLibcObj;
2540 #endif
2541
2542 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2543 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2544 static wxMBConvUTF7 wxConvUTF7Obj;
2545 static wxMBConvUTF8 wxConvUTF8Obj;
2546
2547
2548 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2549 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2550 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2551 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2552 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2553 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2554
2555 #else // !wxUSE_WCHAR_T
2556
2557 // stand-ins in absence of wchar_t
2558 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2559 wxConvISO8859_1,
2560 wxConvLocal,
2561 wxConvUTF8;
2562
2563 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2564
2565