warning fixes for BCC and OW (heavily modified patch 819146)
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
5 // Modified by:
6 // Created: 29/01/98
7 // RCS-ID: $Id$
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
12
13 // ============================================================================
14 // declarations
15 // ============================================================================
16
17 // ----------------------------------------------------------------------------
18 // headers
19 // ----------------------------------------------------------------------------
20
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
23 #endif
24
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
27
28 #ifdef __BORLANDC__
29 #pragma hdrstop
30 #endif
31
32 #ifndef WX_PRECOMP
33 #include "wx/intl.h"
34 #include "wx/log.h"
35 #endif // WX_PRECOMP
36
37 #include "wx/strconv.h"
38
39 #if wxUSE_WCHAR_T
40
41 #ifdef __WXMSW__
42 #include "wx/msw/private.h"
43 #endif
44
45 #ifndef __WXWINCE__
46 #include <errno.h>
47 #endif
48
49 #include <ctype.h>
50 #include <string.h>
51 #include <stdlib.h>
52
53 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
54 #define wxHAVE_WIN32_MB2WC
55 #endif // __WIN32__ but !__WXMICROWIN__
56
57 // ----------------------------------------------------------------------------
58 // headers
59 // ----------------------------------------------------------------------------
60
61 #ifdef __SALFORDC__
62 #include <clib.h>
63 #endif
64
65 #ifdef HAVE_ICONV
66 #include <iconv.h>
67 #endif
68
69 #include "wx/encconv.h"
70 #include "wx/fontmap.h"
71
72 // ----------------------------------------------------------------------------
73 // macros
74 // ----------------------------------------------------------------------------
75
// In-place byte swapping of the first `len` elements of the array `str`.
//
// NB: both macros expand to a bare braced block rather than a
//     do { } while (0) statement, so they can be invoked without a trailing
//     semicolon.
#define BSWAP_UCS4(str, len)                                                  \
    {                                                                         \
        unsigned _c;                                                          \
        for (_c=0; _c<len; _c++)                                              \
            str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]);                            \
    }
#define BSWAP_UTF16(str, len)                                                 \
    {                                                                         \
        unsigned _c;                                                          \
        for (_c=0; _c<len; _c++)                                              \
            str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]);                            \
    }
78
79 #if SIZEOF_WCHAR_T == 4
80 #define WC_NAME "UCS4"
81 #define WC_BSWAP BSWAP_UCS4
82 #ifdef WORDS_BIGENDIAN
83 #define WC_NAME_BEST "UCS-4BE"
84 #else
85 #define WC_NAME_BEST "UCS-4LE"
86 #endif
87 #elif SIZEOF_WCHAR_T == 2
88 #define WC_NAME "UTF16"
89 #define WC_BSWAP BSWAP_UTF16
90 #define WC_UTF16
91 #ifdef WORDS_BIGENDIAN
92 #define WC_NAME_BEST "UTF-16BE"
93 #else
94 #define WC_NAME_BEST "UTF-16LE"
95 #endif
96 #else // sizeof(wchar_t) != 2 nor 4
97 // does this ever happen?
98 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
99 #endif
100
101 // ============================================================================
102 // implementation
103 // ============================================================================
104
105 // ----------------------------------------------------------------------------
106 // UTF-16 en/decoding to/from UCS-4
107 // ----------------------------------------------------------------------------
108
109
// Encode a single UCS-4 value as UTF-16.
//
// Returns the number of 16 bit units the value occupies (1 or 2) or
// (size_t)-1 if it is 0x110000 or above and so can't be encoded at all.
// If output is not NULL the units are stored in it, otherwise only the
// length is computed.
static size_t encode_utf16(wxUint32 input, wxUint16 *output)
{
    // anything beyond the last plane is not representable
    if (input >= 0x110000)
        return (size_t)-1;

    const bool needsPair = input > 0xffff;

    if (output)
    {
        if (needsPair)
        {
            // lead surrogate first, then the trail one
            output[0] = (wxUint16) ((input >> 10)+0xd7c0);
            output[1] = (wxUint16) ((input&0x3ff)+0xdc00);
        }
        else
        {
            output[0] = (wxUint16) input;
        }
    }

    return needsPair ? 2 : 1;
}
132
// Decode a single UCS-4 value from a UTF-16 sequence.
//
// input must point to at least one 16 bit unit; the second unit is only
// read if the first one is a lead surrogate (0xd800..0xdbff).
//
// Returns the number of units consumed (1 or 2) and stores the decoded value
// in output. For an unpaired surrogate (a lead one not followed by a trail
// one, or a trail one in the lead position) returns (size_t)-1 and stores
// the offending unit itself in output so that lenient callers may pass it
// through.
static size_t decode_utf16(const wxUint16* input, wxUint32& output)
{
    const wxUint16 lead = input[0];

    if ((lead < 0xd800) || (lead > 0xdfff))
    {
        // ordinary BMP character
        output = lead;
        return 1;
    }

    if (lead >= 0xdc00)
    {
        // a trail surrogate can't start a pair: decoding it as one would
        // produce values of 0x110000 and above
        output = lead;
        return (size_t)-1;
    }

    const wxUint16 trail = input[1];
    if ((trail < 0xdc00) || (trail > 0xdfff))
    {
        // NB: 0xdfff itself is a valid trail surrogate, the upper bound
        //     is inclusive
        output = lead;
        return (size_t)-1;
    }

    output = ((wxUint32)(lead - 0xd7c0) << 10) + (trail - 0xdc00);
    return 2;
}
151
152
153 // ----------------------------------------------------------------------------
154 // wxMBConv
155 // ----------------------------------------------------------------------------
156
157 wxMBConv::~wxMBConv()
158 {
159 // nothing to do here
160 }
161
162 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
163 {
164 if ( psz )
165 {
166 // calculate the length of the buffer needed first
167 size_t nLen = MB2WC(NULL, psz, 0);
168 if ( nLen != (size_t)-1 )
169 {
170 // now do the actual conversion
171 wxWCharBuffer buf(nLen);
172 MB2WC(buf.data(), psz, nLen + 1); // with the trailing NUL
173
174 return buf;
175 }
176 }
177
178 wxWCharBuffer buf((wchar_t *)NULL);
179
180 return buf;
181 }
182
183 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
184 {
185 if ( pwz )
186 {
187 size_t nLen = WC2MB(NULL, pwz, 0);
188 if ( nLen != (size_t)-1 )
189 {
190 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
191 WC2MB(buf.data(), pwz, nLen + 4);
192
193 return buf;
194 }
195 }
196
197 wxCharBuffer buf((char *)NULL);
198
199 return buf;
200 }
201
202 // ----------------------------------------------------------------------------
203 // wxMBConvLibc
204 // ----------------------------------------------------------------------------
205
206 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
207 {
208 return wxMB2WC(buf, psz, n);
209 }
210
211 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
212 {
213 return wxWC2MB(buf, psz, n);
214 }
215
216 // ----------------------------------------------------------------------------
217 // UTF-7
218 // ----------------------------------------------------------------------------
219
220 #if 0
221 static char utf7_setD[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
222 "abcdefghijklmnopqrstuvwxyz"
223 "0123456789'(),-./:?";
224 static char utf7_setO[]="!\"#$%&*;<=>@[]^_`{|}";
225 static char utf7_setB[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
226 "abcdefghijklmnopqrstuvwxyz"
227 "0123456789+/";
228 #endif
229
230 // TODO: write actual implementations of UTF-7 here
231 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf),
232 const char * WXUNUSED(psz),
233 size_t WXUNUSED(n)) const
234 {
235 return 0;
236 }
237
238 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf),
239 const wchar_t * WXUNUSED(psz),
240 size_t WXUNUSED(n)) const
241 {
242 return 0;
243 }
244
245 // ----------------------------------------------------------------------------
246 // UTF-8
247 // ----------------------------------------------------------------------------
248
// utf8_max[k] is the largest value encodable in a (k + 1) byte UTF-8
// sequence; the last entry is a sentinel which stops any search through the
// table. The table is only ever read, so it is const.
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
251
252 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
253 {
254 size_t len = 0;
255
256 while (*psz && ((!buf) || (len < n)))
257 {
258 unsigned char cc = *psz++, fc = cc;
259 unsigned cnt;
260 for (cnt = 0; fc & 0x80; cnt++)
261 fc <<= 1;
262 if (!cnt)
263 {
264 // plain ASCII char
265 if (buf)
266 *buf++ = cc;
267 len++;
268 }
269 else
270 {
271 cnt--;
272 if (!cnt)
273 {
274 // invalid UTF-8 sequence
275 return (size_t)-1;
276 }
277 else
278 {
279 unsigned ocnt = cnt - 1;
280 wxUint32 res = cc & (0x3f >> cnt);
281 while (cnt--)
282 {
283 cc = *psz++;
284 if ((cc & 0xC0) != 0x80)
285 {
286 // invalid UTF-8 sequence
287 return (size_t)-1;
288 }
289 res = (res << 6) | (cc & 0x3f);
290 }
291 if (res <= utf8_max[ocnt])
292 {
293 // illegal UTF-8 encoding
294 return (size_t)-1;
295 }
296 #ifdef WC_UTF16
297 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
298 size_t pa = encode_utf16(res, (wxUint16 *)buf);
299 if (pa == (size_t)-1)
300 return (size_t)-1;
301 if (buf)
302 buf += pa;
303 len += pa;
304 #else // !WC_UTF16
305 if (buf)
306 *buf++ = res;
307 len++;
308 #endif // WC_UTF16/!WC_UTF16
309 }
310 }
311 }
312 if (buf && (len < n))
313 *buf = 0;
314 return len;
315 }
316
317 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
318 {
319 size_t len = 0;
320
321 while (*psz && ((!buf) || (len < n)))
322 {
323 wxUint32 cc;
324 #ifdef WC_UTF16
325 // cast is ok for WC_UTF16
326 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
327 psz += (pa == (size_t)-1) ? 1 : pa;
328 #else
329 cc=(*psz++) & 0x7fffffff;
330 #endif
331 unsigned cnt;
332 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
333 if (!cnt)
334 {
335 // plain ASCII char
336 if (buf)
337 *buf++ = (char) cc;
338 len++;
339 }
340
341 else
342 {
343 len += cnt + 1;
344 if (buf)
345 {
346 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
347 while (cnt--)
348 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
349 }
350 }
351 }
352
353 if (buf && (len<n)) *buf = 0;
354
355 return len;
356 }
357
358
359
360
361 // ----------------------------------------------------------------------------
362 // UTF-16
363 // ----------------------------------------------------------------------------
364
365 #ifdef WORDS_BIGENDIAN
366 #define wxMBConvUTF16straight wxMBConvUTF16BE
367 #define wxMBConvUTF16swap wxMBConvUTF16LE
368 #else
369 #define wxMBConvUTF16swap wxMBConvUTF16BE
370 #define wxMBConvUTF16straight wxMBConvUTF16LE
371 #endif
372
373
374 #ifdef WC_UTF16
375
376 // copy 16bit MB to 16bit String
377 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
378 {
379 size_t len=0;
380
381 while (*(wxUint16*)psz && (!buf || len < n))
382 {
383 if (buf)
384 *buf++ = *(wxUint16*)psz;
385 len++;
386
387 psz += sizeof(wxUint16);
388 }
389 if (buf && len<n) *buf=0;
390
391 return len;
392 }
393
394
395 // copy 16bit String to 16bit MB
396 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
397 {
398 size_t len=0;
399
400 while (*psz && (!buf || len < n))
401 {
402 if (buf)
403 {
404 *(wxUint16*)buf = *psz;
405 buf += sizeof(wxUint16);
406 }
407 len += sizeof(wxUint16);
408 psz++;
409 }
410 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
411
412 return len;
413 }
414
415
416 // swap 16bit MB to 16bit String
417 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
418 {
419 size_t len=0;
420
421 while (*(wxUint16*)psz && (!buf || len < n))
422 {
423 if (buf)
424 {
425 ((char *)buf)[0] = psz[1];
426 ((char *)buf)[1] = psz[0];
427 buf++;
428 }
429 len++;
430 psz += sizeof(wxUint16);
431 }
432 if (buf && len<n) *buf=0;
433
434 return len;
435 }
436
437
438 // swap 16bit MB to 16bit String
439 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
440 {
441 size_t len=0;
442
443 while (*psz && (!buf || len < n))
444 {
445 if (buf)
446 {
447 *buf++ = ((char*)psz)[1];
448 *buf++ = ((char*)psz)[0];
449 }
450 len += sizeof(wxUint16);
451 psz++;
452 }
453 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
454
455 return len;
456 }
457
458
459 #else // WC_UTF16
460
461
462 // copy 16bit MB to 32bit String
463 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
464 {
465 size_t len=0;
466
467 while (*(wxUint16*)psz && (!buf || len < n))
468 {
469 wxUint32 cc;
470 size_t pa=decode_utf16((wxUint16*)psz, cc);
471 if (pa == (size_t)-1)
472 return pa;
473
474 if (buf)
475 *buf++ = cc;
476 len++;
477 psz += pa * sizeof(wxUint16);
478 }
479 if (buf && len<n) *buf=0;
480
481 return len;
482 }
483
484
485 // copy 32bit String to 16bit MB
486 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
487 {
488 size_t len=0;
489
490 while (*psz && (!buf || len < n))
491 {
492 wxUint16 cc[2];
493 size_t pa=encode_utf16(*psz, cc);
494
495 if (pa == (size_t)-1)
496 return pa;
497
498 if (buf)
499 {
500 *(wxUint16*)buf = cc[0];
501 buf += sizeof(wxUint16);
502 if (pa > 1)
503 {
504 *(wxUint16*)buf = cc[1];
505 buf += sizeof(wxUint16);
506 }
507 }
508
509 len += pa*sizeof(wxUint16);
510 psz++;
511 }
512 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
513
514 return len;
515 }
516
517
518 // swap 16bit MB to 32bit String
519 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
520 {
521 size_t len=0;
522
523 while (*(wxUint16*)psz && (!buf || len < n))
524 {
525 wxUint32 cc;
526 char tmp[4];
527 tmp[0]=psz[1]; tmp[1]=psz[0];
528 tmp[2]=psz[3]; tmp[3]=psz[2];
529
530 size_t pa=decode_utf16((wxUint16*)tmp, cc);
531 if (pa == (size_t)-1)
532 return pa;
533
534 if (buf)
535 *buf++ = cc;
536
537 len++;
538 psz += pa * sizeof(wxUint16);
539 }
540 if (buf && len<n) *buf=0;
541
542 return len;
543 }
544
545
546 // swap 32bit String to 16bit MB
547 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
548 {
549 size_t len=0;
550
551 while (*psz && (!buf || len < n))
552 {
553 wxUint16 cc[2];
554 size_t pa=encode_utf16(*psz, cc);
555
556 if (pa == (size_t)-1)
557 return pa;
558
559 if (buf)
560 {
561 *buf++ = ((char*)cc)[1];
562 *buf++ = ((char*)cc)[0];
563 if (pa > 1)
564 {
565 *buf++ = ((char*)cc)[3];
566 *buf++ = ((char*)cc)[2];
567 }
568 }
569
570 len += pa*sizeof(wxUint16);
571 psz++;
572 }
573 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
574
575 return len;
576 }
577
578 #endif // WC_UTF16
579
580
581 // ----------------------------------------------------------------------------
582 // UTF-32
583 // ----------------------------------------------------------------------------
584
585 #ifdef WORDS_BIGENDIAN
586 #define wxMBConvUTF32straight wxMBConvUTF32BE
587 #define wxMBConvUTF32swap wxMBConvUTF32LE
588 #else
589 #define wxMBConvUTF32swap wxMBConvUTF32BE
590 #define wxMBConvUTF32straight wxMBConvUTF32LE
591 #endif
592
593
594 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
595 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
596
597
598 #ifdef WC_UTF16
599
600 // copy 32bit MB to 16bit String
601 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
602 {
603 size_t len=0;
604
605 while (*(wxUint32*)psz && (!buf || len < n))
606 {
607 wxUint16 cc[2];
608
609 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
610 if (pa == (size_t)-1)
611 return pa;
612
613 if (buf)
614 {
615 *buf++ = cc[0];
616 if (pa > 1)
617 *buf++ = cc[1];
618 }
619 len += pa;
620 psz += sizeof(wxUint32);
621 }
622 if (buf && len<n) *buf=0;
623
624 return len;
625 }
626
627
628 // copy 16bit String to 32bit MB
629 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
630 {
631 size_t len=0;
632
633 while (*psz && (!buf || len < n))
634 {
635 wxUint32 cc;
636
637 // cast is ok for WC_UTF16
638 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
639 if (pa == (size_t)-1)
640 return pa;
641
642 if (buf)
643 {
644 *(wxUint32*)buf = cc;
645 buf += sizeof(wxUint32);
646 }
647 len += sizeof(wxUint32);
648 psz += pa;
649 }
650
651 if (buf && len<=n-sizeof(wxUint32))
652 *(wxUint32*)buf=0;
653
654 return len;
655 }
656
657
658
659 // swap 32bit MB to 16bit String
660 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
661 {
662 size_t len=0;
663
664 while (*(wxUint32*)psz && (!buf || len < n))
665 {
666 char tmp[4];
667 tmp[0] = psz[3]; tmp[1] = psz[2];
668 tmp[2] = psz[1]; tmp[3] = psz[0];
669
670
671 wxUint16 cc[2];
672
673 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
674 if (pa == (size_t)-1)
675 return pa;
676
677 if (buf)
678 {
679 *buf++ = cc[0];
680 if (pa > 1)
681 *buf++ = cc[1];
682 }
683 len += pa;
684 psz += sizeof(wxUint32);
685 }
686
687 if (buf && len<n)
688 *buf=0;
689
690 return len;
691 }
692
693
694 // swap 16bit String to 32bit MB
695 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
696 {
697 size_t len=0;
698
699 while (*psz && (!buf || len < n))
700 {
701 char cc[4];
702
703 // cast is ok for WC_UTF16
704 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
705 if (pa == (size_t)-1)
706 return pa;
707
708 if (buf)
709 {
710 *buf++ = cc[3];
711 *buf++ = cc[2];
712 *buf++ = cc[1];
713 *buf++ = cc[0];
714 }
715 len += sizeof(wxUint32);
716 psz += pa;
717 }
718
719 if (buf && len<=n-sizeof(wxUint32))
720 *(wxUint32*)buf=0;
721
722 return len;
723 }
724
725 #else // WC_UTF16
726
727
728 // copy 32bit MB to 32bit String
729 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
730 {
731 size_t len=0;
732
733 while (*(wxUint32*)psz && (!buf || len < n))
734 {
735 if (buf)
736 *buf++ = *(wxUint32*)psz;
737 len++;
738 psz += sizeof(wxUint32);
739 }
740
741 if (buf && len<n)
742 *buf=0;
743
744 return len;
745 }
746
747
748 // copy 32bit String to 32bit MB
749 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
750 {
751 size_t len=0;
752
753 while (*psz && (!buf || len < n))
754 {
755 if (buf)
756 {
757 *(wxUint32*)buf = *psz;
758 buf += sizeof(wxUint32);
759 }
760
761 len += sizeof(wxUint32);
762 psz++;
763 }
764
765 if (buf && len<=n-sizeof(wxUint32))
766 *(wxUint32*)buf=0;
767
768 return len;
769 }
770
771
772 // swap 32bit MB to 32bit String
773 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
774 {
775 size_t len=0;
776
777 while (*(wxUint32*)psz && (!buf || len < n))
778 {
779 if (buf)
780 {
781 ((char *)buf)[0] = psz[3];
782 ((char *)buf)[1] = psz[2];
783 ((char *)buf)[2] = psz[1];
784 ((char *)buf)[3] = psz[0];
785 buf++;
786 }
787 len++;
788 psz += sizeof(wxUint32);
789 }
790
791 if (buf && len<n)
792 *buf=0;
793
794 return len;
795 }
796
797
798 // swap 32bit String to 32bit MB
799 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
800 {
801 size_t len=0;
802
803 while (*psz && (!buf || len < n))
804 {
805 if (buf)
806 {
807 *buf++ = ((char *)psz)[3];
808 *buf++ = ((char *)psz)[2];
809 *buf++ = ((char *)psz)[1];
810 *buf++ = ((char *)psz)[0];
811 }
812 len += sizeof(wxUint32);
813 psz++;
814 }
815
816 if (buf && len<=n-sizeof(wxUint32))
817 *(wxUint32*)buf=0;
818
819 return len;
820 }
821
822
823 #endif // WC_UTF16
824
825
826 // ============================================================================
827 // The classes doing conversion using the iconv_xxx() functions
828 // ============================================================================
829
830 #ifdef HAVE_ICONV
831
832 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
833 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
834 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
835 // (which means error) and says there are 0 bytes left in the input buffer --
836 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
837 // this alternative test for iconv() failure.
838 // [This bug does not appear in glibc 2.2.]
839 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
840 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
841 (errno != E2BIG || bufLeft != 0))
842 #else
843 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
844 #endif
845
846 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
847
848 // ----------------------------------------------------------------------------
849 // wxMBConv_iconv: encapsulates an iconv character set
850 // ----------------------------------------------------------------------------
851
852 class wxMBConv_iconv : public wxMBConv
853 {
854 public:
855 wxMBConv_iconv(const wxChar *name);
856 virtual ~wxMBConv_iconv();
857
858 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
859 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
860
861 bool IsOk() const
862 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
863
864 protected:
865 // the iconv handlers used to translate from multibyte to wide char and in
866 // the other direction
867 iconv_t m2w,
868 w2m;
869
870 private:
871 // the name (for iconv_open()) of a wide char charset -- if none is
872 // available on this machine, it will remain NULL
873 static const char *ms_wcCharsetName;
874
875 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
876 // different endian-ness than the native one
877 static bool ms_wcNeedsSwap;
878 };
879
880 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
881 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
882
883 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
884 {
885 // Do it the hard way
886 char cname[100];
887 for (size_t i = 0; i < wxStrlen(name)+1; i++)
888 cname[i] = (char) name[i];
889
890 // check for charset that represents wchar_t:
891 if (ms_wcCharsetName == NULL)
892 {
893 ms_wcNeedsSwap = false;
894
895 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
896 ms_wcCharsetName = WC_NAME_BEST;
897 m2w = iconv_open(ms_wcCharsetName, cname);
898
899 if (m2w == (iconv_t)-1)
900 {
901 // try charset w/o bytesex info (e.g. "UCS4")
902 // and check for bytesex ourselves:
903 ms_wcCharsetName = WC_NAME;
904 m2w = iconv_open(ms_wcCharsetName, cname);
905
906 // last bet, try if it knows WCHAR_T pseudo-charset
907 if (m2w == (iconv_t)-1)
908 {
909 ms_wcCharsetName = "WCHAR_T";
910 m2w = iconv_open(ms_wcCharsetName, cname);
911 }
912
913 if (m2w != (iconv_t)-1)
914 {
915 char buf[2], *bufPtr;
916 wchar_t wbuf[2], *wbufPtr;
917 size_t insz, outsz;
918 size_t res;
919
920 buf[0] = 'A';
921 buf[1] = 0;
922 wbuf[0] = 0;
923 insz = 2;
924 outsz = SIZEOF_WCHAR_T * 2;
925 wbufPtr = wbuf;
926 bufPtr = buf;
927
928 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
929 (char**)&wbufPtr, &outsz);
930
931 if (ICONV_FAILED(res, insz))
932 {
933 ms_wcCharsetName = NULL;
934 wxLogLastError(wxT("iconv"));
935 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
936 }
937 else
938 {
939 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
940 }
941 }
942 else
943 {
944 ms_wcCharsetName = NULL;
945
946 // VS: we must not output an error here, since wxWindows will safely
947 // fall back to using wxEncodingConverter.
948 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
949 //wxLogError(
950 }
951 }
952 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
953 }
954 else // we already have ms_wcCharsetName
955 {
956 m2w = iconv_open(ms_wcCharsetName, cname);
957 }
958
959 // NB: don't ever pass NULL to iconv_open(), it may crash!
960 if ( ms_wcCharsetName )
961 {
962 w2m = iconv_open( cname, ms_wcCharsetName);
963 }
964 else
965 {
966 w2m = (iconv_t)-1;
967 }
968 }
969
970 wxMBConv_iconv::~wxMBConv_iconv()
971 {
972 if ( m2w != (iconv_t)-1 )
973 iconv_close(m2w);
974 if ( w2m != (iconv_t)-1 )
975 iconv_close(w2m);
976 }
977
978 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
979 {
980 size_t inbuf = strlen(psz);
981 size_t outbuf = n * SIZEOF_WCHAR_T;
982 size_t res, cres;
983 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
984 wchar_t *bufPtr = buf;
985 const char *pszPtr = psz;
986
987 if (buf)
988 {
989 // have destination buffer, convert there
990 cres = iconv(m2w,
991 ICONV_CHAR_CAST(&pszPtr), &inbuf,
992 (char**)&bufPtr, &outbuf);
993 res = n - (outbuf / SIZEOF_WCHAR_T);
994
995 if (ms_wcNeedsSwap)
996 {
997 // convert to native endianness
998 WC_BSWAP(buf /* _not_ bufPtr */, res)
999 }
1000
1001 // NB: iconv was given only strlen(psz) characters on input, and so
1002 // it couldn't convert the trailing zero. Let's do it ourselves
1003 // if there's some room left for it in the output buffer.
1004 if (res < n)
1005 buf[res] = 0;
1006 }
1007 else
1008 {
1009 // no destination buffer... convert using temp buffer
1010 // to calculate destination buffer requirement
1011 wchar_t tbuf[8];
1012 res = 0;
1013 do {
1014 bufPtr = tbuf;
1015 outbuf = 8*SIZEOF_WCHAR_T;
1016
1017 cres = iconv(m2w,
1018 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1019 (char**)&bufPtr, &outbuf );
1020
1021 res += 8-(outbuf/SIZEOF_WCHAR_T);
1022 } while ((cres==(size_t)-1) && (errno==E2BIG));
1023 }
1024
1025 if (ICONV_FAILED(cres, inbuf))
1026 {
1027 //VS: it is ok if iconv fails, hence trace only
1028 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1029 return (size_t)-1;
1030 }
1031
1032 return res;
1033 }
1034
1035 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1036 {
1037 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1038 size_t outbuf = n;
1039 size_t res, cres;
1040
1041 wchar_t *tmpbuf = 0;
1042
1043 if (ms_wcNeedsSwap)
1044 {
1045 // need to copy to temp buffer to switch endianness
1046 // this absolutely doesn't rock!
1047 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1048 // could be in read-only memory, or be accessed in some other thread)
1049 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1050 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1051 WC_BSWAP(tmpbuf, inbuf)
1052 psz=tmpbuf;
1053 }
1054
1055 if (buf)
1056 {
1057 // have destination buffer, convert there
1058 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1059
1060 res = n-outbuf;
1061
1062 // NB: iconv was given only wcslen(psz) characters on input, and so
1063 // it couldn't convert the trailing zero. Let's do it ourselves
1064 // if there's some room left for it in the output buffer.
1065 if (res < n)
1066 buf[0] = 0;
1067 }
1068 else
1069 {
1070 // no destination buffer... convert using temp buffer
1071 // to calculate destination buffer requirement
1072 char tbuf[16];
1073 res = 0;
1074 do {
1075 buf = tbuf; outbuf = 16;
1076
1077 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1078
1079 res += 16 - outbuf;
1080 } while ((cres==(size_t)-1) && (errno==E2BIG));
1081 }
1082
1083 if (ms_wcNeedsSwap)
1084 {
1085 free(tmpbuf);
1086 }
1087
1088 if (ICONV_FAILED(cres, inbuf))
1089 {
1090 //VS: it is ok if iconv fails, hence trace only
1091 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1092 return (size_t)-1;
1093 }
1094
1095 return res;
1096 }
1097
1098 #endif // HAVE_ICONV
1099
1100
1101 // ============================================================================
1102 // Win32 conversion classes
1103 // ============================================================================
1104
1105 #ifdef wxHAVE_WIN32_MB2WC
1106
1107 // from utils.cpp
1108 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1109 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1110
1111 class wxMBConv_win32 : public wxMBConv
1112 {
1113 public:
1114 wxMBConv_win32()
1115 {
1116 m_CodePage = CP_ACP;
1117 }
1118
1119 wxMBConv_win32(const wxChar* name)
1120 {
1121 m_CodePage = wxCharsetToCodepage(name);
1122 }
1123
1124 wxMBConv_win32(wxFontEncoding encoding)
1125 {
1126 m_CodePage = wxEncodingToCodepage(encoding);
1127 }
1128
1129 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1130 {
1131 const size_t len = ::MultiByteToWideChar
1132 (
1133 m_CodePage, // code page
1134 0, // flags (none)
1135 psz, // input string
1136 -1, // its length (NUL-terminated)
1137 buf, // output string
1138 buf ? n : 0 // size of output buffer
1139 );
1140
1141 // note that it returns count of written chars for buf != NULL and size
1142 // of the needed buffer for buf == NULL so in either case the length of
1143 // the string (which never includes the terminating NUL) is one less
1144 return len ? len - 1 : (size_t)-1;
1145 }
1146
1147 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
1148 {
1149 const size_t len = ::WideCharToMultiByte
1150 (
1151 m_CodePage, // code page
1152 0, // flags (none)
1153 psz, // input string
1154 -1, // it is (wide) NUL-terminated
1155 buf, // output buffer
1156 buf ? n : 0, // and its size
1157 NULL, // default "replacement" char
1158 NULL // [out] was it used?
1159 );
1160
1161 // see the comment above for the reason of "len - 1"
1162 return len ? len - 1 : (size_t)-1;
1163 }
1164
1165 bool IsOk() const
1166 { return m_CodePage != -1; }
1167
1168 public:
1169 long m_CodePage;
1170 };
1171
1172 #endif // wxHAVE_WIN32_MB2WC
1173
1174
1175 // ============================================================================
1176 // wxEncodingConverter based conversion classes
1177 // ============================================================================
1178
1179 #if wxUSE_FONTMAP
1180
1181 class wxMBConv_wxwin : public wxMBConv
1182 {
1183 private:
1184 void Init()
1185 {
1186 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
1187 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
1188 }
1189
1190 public:
1191 // temporarily just use wxEncodingConverter stuff,
1192 // so that it works while a better implementation is built
1193 wxMBConv_wxwin(const wxChar* name)
1194 {
1195 if (name)
1196 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
1197 else
1198 m_enc = wxFONTENCODING_SYSTEM;
1199
1200 Init();
1201 }
1202
1203 wxMBConv_wxwin(wxFontEncoding enc)
1204 {
1205 m_enc = enc;
1206
1207 Init();
1208 }
1209
1210 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
1211 {
1212 size_t inbuf = strlen(psz);
1213 if (buf)
1214 m2w.Convert(psz,buf);
1215 return inbuf;
1216 }
1217
1218 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
1219 {
1220 const size_t inbuf = wxWcslen(psz);
1221 if (buf)
1222 w2m.Convert(psz,buf);
1223
1224 return inbuf;
1225 }
1226
1227 bool IsOk() const { return m_ok; }
1228
1229 public:
1230 wxFontEncoding m_enc;
1231 wxEncodingConverter m2w, w2m;
1232
1233 // were we initialized successfully?
1234 bool m_ok;
1235
1236 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
1237 };
1238
1239 #endif // wxUSE_FONTMAP
1240
1241 // ============================================================================
1242 // wxCSConv implementation
1243 // ============================================================================
1244
1245 void wxCSConv::Init()
1246 {
1247 m_name = NULL;
1248 m_convReal = NULL;
1249 m_deferred = true;
1250 }
1251
1252 wxCSConv::wxCSConv(const wxChar *charset)
1253 {
1254 Init();
1255
1256 if ( charset )
1257 {
1258 SetName(charset);
1259 }
1260
1261 m_encoding = wxFONTENCODING_SYSTEM;
1262 }
1263
1264 wxCSConv::wxCSConv(wxFontEncoding encoding)
1265 {
1266 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
1267 {
1268 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1269
1270 encoding = wxFONTENCODING_SYSTEM;
1271 }
1272
1273 Init();
1274
1275 m_encoding = encoding;
1276 }
1277
1278 wxCSConv::~wxCSConv()
1279 {
1280 Clear();
1281 }
1282
1283 wxCSConv::wxCSConv(const wxCSConv& conv)
1284 : wxMBConv()
1285 {
1286 Init();
1287
1288 SetName(conv.m_name);
1289 m_encoding = conv.m_encoding;
1290 }
1291
1292 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
1293 {
1294 Clear();
1295
1296 SetName(conv.m_name);
1297 m_encoding = conv.m_encoding;
1298
1299 return *this;
1300 }
1301
1302 void wxCSConv::Clear()
1303 {
1304 free(m_name);
1305 delete m_convReal;
1306
1307 m_name = NULL;
1308 m_convReal = NULL;
1309 }
1310
1311 void wxCSConv::SetName(const wxChar *charset)
1312 {
1313 if (charset)
1314 {
1315 m_name = wxStrdup(charset);
1316 m_deferred = true;
1317 }
1318 }
1319
1320 static inline bool DoesntNeedConv(wxFontEncoding enc)
1321 {
1322 return enc == wxFONTENCODING_DEFAULT ||
1323 enc == wxFONTENCODING_SYSTEM ||
1324 enc == wxFONTENCODING_ISO8859_1;
1325 }
1326
1327 wxMBConv *wxCSConv::DoCreate() const
1328 {
1329 #if wxUSE_FONTMAP
1330 wxFontMapper * const fontMapper = wxFontMapper::Get();
1331
1332 wxFontEncoding encFromName = m_name ? fontMapper->CharsetToEncoding(m_name)
1333 : wxFONTENCODING_SYSTEM;
1334 #endif // wxUSE_FONTMAP
1335
1336 // check for the special case of ASCII charset
1337 if ( (!m_name && DoesntNeedConv(m_encoding))
1338 #if wxUSE_FONTMAP
1339 || (m_name && DoesntNeedConv(encFromName))
1340 #endif // wxUSE_FONTMAP
1341 )
1342 {
1343 // don't convert at all
1344 return NULL;
1345 }
1346
1347 // we trust OS to do conversion better than we can so try external
1348 // conversion methods first
1349 //
1350 // the full order is:
1351 // 1. OS conversion (iconv() under Unix or Win32 API)
1352 // 2. hard coded conversions for UTF
1353 // 3. wxEncodingConverter as fall back
1354
1355 // step (1)
1356 #ifdef HAVE_ICONV
1357 if ( m_name )
1358 {
1359 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
1360 if ( conv->IsOk() )
1361 return conv;
1362
1363 delete conv;
1364 }
1365 #endif // HAVE_ICONV
1366
1367 #ifdef wxHAVE_WIN32_MB2WC
1368 {
1369 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
1370 : new wxMBConv_win32(m_encoding);
1371 if ( conv->IsOk() )
1372 return conv;
1373
1374 delete conv;
1375 }
1376 #endif // wxHAVE_WIN32_MB2WC
1377
1378 // step (2)
1379 wxFontEncoding enc = m_encoding;
1380 #if wxUSE_FONTMAP
1381 if ( enc == wxFONTENCODING_SYSTEM )
1382 enc = encFromName;
1383 #endif // wxUSE_FONTMAP
1384
1385 switch ( enc )
1386 {
1387 case wxFONTENCODING_UTF7:
1388 return new wxMBConvUTF7;
1389
1390 case wxFONTENCODING_UTF8:
1391 return new wxMBConvUTF8;
1392
1393 case wxFONTENCODING_UTF16BE:
1394 return new wxMBConvUTF16BE;
1395
1396 case wxFONTENCODING_UTF16LE:
1397 return new wxMBConvUTF16LE;
1398
1399 case wxFONTENCODING_UTF32BE:
1400 return new wxMBConvUTF32BE;
1401
1402 case wxFONTENCODING_UTF32LE:
1403 return new wxMBConvUTF32LE;
1404
1405 default:
1406 // nothing to do but put here to suppress gcc warnings
1407 ;
1408 }
1409
1410 // step (3)
1411 #if wxUSE_FONTMAP
1412 {
1413 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
1414 : new wxMBConv_wxwin(m_encoding);
1415 if ( conv->IsOk() )
1416 return conv;
1417
1418 delete conv;
1419 }
1420 #endif // wxUSE_FONTMAP
1421
1422 wxLogError(_("Cannot convert from the charset '%s'!"),
1423 m_name ? m_name
1424 :
1425 #if wxUSE_FONTMAP
1426 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
1427 #else // !wxUSE_FONTMAP
1428 wxString::Format(_("encoding %s"), m_encoding).c_str()
1429 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1430 );
1431
1432 return NULL;
1433 }
1434
1435 void wxCSConv::CreateConvIfNeeded() const
1436 {
1437 if ( m_deferred )
1438 {
1439 wxCSConv *self = (wxCSConv *)this; // const_cast
1440
1441 #if wxUSE_INTL
1442 // if we don't have neither the name nor the encoding, use the default
1443 // encoding for this system
1444 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
1445 {
1446 self->m_encoding = wxLocale::GetSystemEncoding();
1447 }
1448 #endif // wxUSE_INTL
1449
1450 self->m_convReal = DoCreate();
1451 self->m_deferred = false;
1452 }
1453 }
1454
1455 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1456 {
1457 CreateConvIfNeeded();
1458
1459 if (m_convReal)
1460 return m_convReal->MB2WC(buf, psz, n);
1461
1462 // latin-1 (direct)
1463 size_t len = strlen(psz);
1464
1465 if (buf)
1466 {
1467 for (size_t c = 0; c <= len; c++)
1468 buf[c] = (unsigned char)(psz[c]);
1469 }
1470
1471 return len;
1472 }
1473
1474 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1475 {
1476 CreateConvIfNeeded();
1477
1478 if (m_convReal)
1479 return m_convReal->WC2MB(buf, psz, n);
1480
1481 // latin-1 (direct)
1482 const size_t len = wxWcslen(psz);
1483 if (buf)
1484 {
1485 for (size_t c = 0; c <= len; c++)
1486 buf[c] = (psz[c] > 0xff) ? '?' : psz[c];
1487 }
1488
1489 return len;
1490 }
1491
1492 // ----------------------------------------------------------------------------
1493 // globals
1494 // ----------------------------------------------------------------------------
1495
1496 #ifdef __WINDOWS__
1497 static wxMBConv_win32 wxConvLibcObj;
1498 #else
1499 static wxMBConvLibc wxConvLibcObj;
1500 #endif
1501
1502 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
1503 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
1504 static wxMBConvUTF7 wxConvUTF7Obj;
1505 static wxMBConvUTF8 wxConvUTF8Obj;
1506
1507
1508 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
1509 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
1510 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
1511 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
1512 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
1513 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
1514
1515 #else // !wxUSE_WCHAR_T
1516
1517 // stand-ins in absence of wchar_t
1518 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
1519 wxConvISO8859_1,
1520 wxConvLocal,
1521 wxConvUTF8;
1522
1523 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
1524
1525