]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
remove wxBase files to get rid of RPM's complains about unpackaged files
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
5 // Modified by:
6 // Created: 29/01/98
7 // RCS-ID: $Id$
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
12
13 // ============================================================================
14 // declarations
15 // ============================================================================
16
17 // ----------------------------------------------------------------------------
18 // headers
19 // ----------------------------------------------------------------------------
20
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
23 #endif
24
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
27
28 #ifdef __BORLANDC__
29 #pragma hdrstop
30 #endif
31
32 #ifndef WX_PRECOMP
33 #include "wx/intl.h"
34 #include "wx/log.h"
35 #endif // WX_PRECOMP
36
37 #include "wx/strconv.h"
38
39 #if wxUSE_WCHAR_T
40
41 #ifdef __WXMSW__
42 #include "wx/msw/private.h"
43 #endif
44
45 #ifndef __WXWINCE__
46 #include <errno.h>
47 #endif
48
49 #include <ctype.h>
50 #include <string.h>
51 #include <stdlib.h>
52
53 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
54 #define wxHAVE_WIN32_MB2WC
55 #endif // __WIN32__ but !__WXMICROWIN__
56
57 // ----------------------------------------------------------------------------
58 // headers
59 // ----------------------------------------------------------------------------
60
61 #ifdef __SALFORDC__
62 #include <clib.h>
63 #endif
64
65 #ifdef HAVE_ICONV
66 #include <iconv.h>
67 #endif
68
69 #include "wx/encconv.h"
70 #include "wx/fontmap.h"
71
72 // ----------------------------------------------------------------------------
73 // macros
74 // ----------------------------------------------------------------------------
75
76 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
77 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
78
79 #if SIZEOF_WCHAR_T == 4
80 #define WC_NAME "UCS4"
81 #define WC_BSWAP BSWAP_UCS4
82 #ifdef WORDS_BIGENDIAN
83 #define WC_NAME_BEST "UCS-4BE"
84 #else
85 #define WC_NAME_BEST "UCS-4LE"
86 #endif
87 #elif SIZEOF_WCHAR_T == 2
88 #define WC_NAME "UTF16"
89 #define WC_BSWAP BSWAP_UTF16
90 #define WC_UTF16
91 #ifdef WORDS_BIGENDIAN
92 #define WC_NAME_BEST "UTF-16BE"
93 #else
94 #define WC_NAME_BEST "UTF-16LE"
95 #endif
96 #else // sizeof(wchar_t) != 2 nor 4
97 // does this ever happen?
98 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
99 #endif
100
101 // ============================================================================
102 // implementation
103 // ============================================================================
104
105 // ----------------------------------------------------------------------------
106 // UTF-16 en/decoding to/from UCS-4
107 // ----------------------------------------------------------------------------
108
109
110 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
111 {
112 if (input<=0xffff)
113 {
114 if (output) *output++ = (wxUint16) input;
115 return 1;
116 }
117 else if (input>=0x110000)
118 {
119 return (size_t)-1;
120 }
121 else
122 {
123 if (output)
124 {
125 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
126 *output++ = (wxUint16) ((input&0x3ff)+0xdc00);
127 }
128 return 2;
129 }
130 }
131
132 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
133 {
134 if ((*input<0xd800) || (*input>0xdfff))
135 {
136 output = *input;
137 return 1;
138 }
139 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
140 {
141 output = *input;
142 return (size_t)-1;
143 }
144 else
145 {
146 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
147 return 2;
148 }
149 }
150
151
152 // ----------------------------------------------------------------------------
153 // wxMBConv
154 // ----------------------------------------------------------------------------
155
156 wxMBConv::~wxMBConv()
157 {
158 // nothing to do here
159 }
160
161 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
162 {
163 if ( psz )
164 {
165 // calculate the length of the buffer needed first
166 size_t nLen = MB2WC(NULL, psz, 0);
167 if ( nLen != (size_t)-1 )
168 {
169 // now do the actual conversion
170 wxWCharBuffer buf(nLen);
171 MB2WC(buf.data(), psz, nLen + 1); // with the trailing NUL
172
173 return buf;
174 }
175 }
176
177 wxWCharBuffer buf((wchar_t *)NULL);
178
179 return buf;
180 }
181
182 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
183 {
184 if ( pwz )
185 {
186 size_t nLen = WC2MB(NULL, pwz, 0);
187 if ( nLen != (size_t)-1 )
188 {
189 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
190 WC2MB(buf.data(), pwz, nLen + 4);
191
192 return buf;
193 }
194 }
195
196 wxCharBuffer buf((char *)NULL);
197
198 return buf;
199 }
200
201 // ----------------------------------------------------------------------------
202 // wxMBConvLibc
203 // ----------------------------------------------------------------------------
204
205 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
206 {
207 return wxMB2WC(buf, psz, n);
208 }
209
210 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
211 {
212 return wxWC2MB(buf, psz, n);
213 }
214
215 // ----------------------------------------------------------------------------
216 // UTF-7
217 // ----------------------------------------------------------------------------
218
219 #if 0
220 static char utf7_setD[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
221 "abcdefghijklmnopqrstuvwxyz"
222 "0123456789'(),-./:?";
223 static char utf7_setO[]="!\"#$%&*;<=>@[]^_`{|}";
224 static char utf7_setB[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
225 "abcdefghijklmnopqrstuvwxyz"
226 "0123456789+/";
227 #endif
228
229 // TODO: write actual implementations of UTF-7 here
230 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf),
231 const char * WXUNUSED(psz),
232 size_t WXUNUSED(n)) const
233 {
234 return 0;
235 }
236
237 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf),
238 const wchar_t * WXUNUSED(psz),
239 size_t WXUNUSED(n)) const
240 {
241 return 0;
242 }
243
244 // ----------------------------------------------------------------------------
245 // UTF-8
246 // ----------------------------------------------------------------------------
247
248 static wxUint32 utf8_max[]=
249 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
250
251 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
252 {
253 size_t len = 0;
254
255 while (*psz && ((!buf) || (len < n)))
256 {
257 unsigned char cc = *psz++, fc = cc;
258 unsigned cnt;
259 for (cnt = 0; fc & 0x80; cnt++)
260 fc <<= 1;
261 if (!cnt)
262 {
263 // plain ASCII char
264 if (buf)
265 *buf++ = cc;
266 len++;
267 }
268 else
269 {
270 cnt--;
271 if (!cnt)
272 {
273 // invalid UTF-8 sequence
274 return (size_t)-1;
275 }
276 else
277 {
278 unsigned ocnt = cnt - 1;
279 wxUint32 res = cc & (0x3f >> cnt);
280 while (cnt--)
281 {
282 cc = *psz++;
283 if ((cc & 0xC0) != 0x80)
284 {
285 // invalid UTF-8 sequence
286 return (size_t)-1;
287 }
288 res = (res << 6) | (cc & 0x3f);
289 }
290 if (res <= utf8_max[ocnt])
291 {
292 // illegal UTF-8 encoding
293 return (size_t)-1;
294 }
295 #ifdef WC_UTF16
296 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
297 size_t pa = encode_utf16(res, (wxUint16 *)buf);
298 if (pa == (size_t)-1)
299 return (size_t)-1;
300 if (buf)
301 buf += pa;
302 len += pa;
303 #else // !WC_UTF16
304 if (buf)
305 *buf++ = res;
306 len++;
307 #endif // WC_UTF16/!WC_UTF16
308 }
309 }
310 }
311 if (buf && (len < n))
312 *buf = 0;
313 return len;
314 }
315
316 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
317 {
318 size_t len = 0;
319
320 while (*psz && ((!buf) || (len < n)))
321 {
322 wxUint32 cc;
323 #ifdef WC_UTF16
324 // cast is ok for WC_UTF16
325 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
326 psz += (pa == (size_t)-1) ? 1 : pa;
327 #else
328 cc=(*psz++) & 0x7fffffff;
329 #endif
330 unsigned cnt;
331 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
332 if (!cnt)
333 {
334 // plain ASCII char
335 if (buf)
336 *buf++ = (char) cc;
337 len++;
338 }
339
340 else
341 {
342 len += cnt + 1;
343 if (buf)
344 {
345 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
346 while (cnt--)
347 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
348 }
349 }
350 }
351
352 if (buf && (len<n)) *buf = 0;
353
354 return len;
355 }
356
357
358
359
360 // ----------------------------------------------------------------------------
361 // UTF-16
362 // ----------------------------------------------------------------------------
363
364 #ifdef WORDS_BIGENDIAN
365 #define wxMBConvUTF16straight wxMBConvUTF16BE
366 #define wxMBConvUTF16swap wxMBConvUTF16LE
367 #else
368 #define wxMBConvUTF16swap wxMBConvUTF16BE
369 #define wxMBConvUTF16straight wxMBConvUTF16LE
370 #endif
371
372
373 #ifdef WC_UTF16
374
375 // copy 16bit MB to 16bit String
376 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
377 {
378 size_t len=0;
379
380 while (*(wxUint16*)psz && (!buf || len < n))
381 {
382 if (buf)
383 *buf++ = *(wxUint16*)psz;
384 len++;
385
386 psz += sizeof(wxUint16);
387 }
388 if (buf && len<n) *buf=0;
389
390 return len;
391 }
392
393
394 // copy 16bit String to 16bit MB
395 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
396 {
397 size_t len=0;
398
399 while (*psz && (!buf || len < n))
400 {
401 if (buf)
402 {
403 *(wxUint16*)buf = *psz;
404 buf += sizeof(wxUint16);
405 }
406 len += sizeof(wxUint16);
407 psz++;
408 }
409 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
410
411 return len;
412 }
413
414
415 // swap 16bit MB to 16bit String
416 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
417 {
418 size_t len=0;
419
420 while (*(wxUint16*)psz && (!buf || len < n))
421 {
422 if (buf)
423 {
424 ((char *)buf)[0] = psz[1];
425 ((char *)buf)[1] = psz[0];
426 buf++;
427 }
428 len++;
429 psz += sizeof(wxUint16);
430 }
431 if (buf && len<n) *buf=0;
432
433 return len;
434 }
435
436
437 // swap 16bit MB to 16bit String
438 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
439 {
440 size_t len=0;
441
442 while (*psz && (!buf || len < n))
443 {
444 if (buf)
445 {
446 *buf++ = ((char*)psz)[1];
447 *buf++ = ((char*)psz)[0];
448 }
449 len += sizeof(wxUint16);
450 psz++;
451 }
452 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
453
454 return len;
455 }
456
457
458 #else // WC_UTF16
459
460
461 // copy 16bit MB to 32bit String
462 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
463 {
464 size_t len=0;
465
466 while (*(wxUint16*)psz && (!buf || len < n))
467 {
468 wxUint32 cc;
469 size_t pa=decode_utf16((wxUint16*)psz, cc);
470 if (pa == (size_t)-1)
471 return pa;
472
473 if (buf)
474 *buf++ = cc;
475 len++;
476 psz += pa * sizeof(wxUint16);
477 }
478 if (buf && len<n) *buf=0;
479
480 return len;
481 }
482
483
484 // copy 32bit String to 16bit MB
485 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
486 {
487 size_t len=0;
488
489 while (*psz && (!buf || len < n))
490 {
491 wxUint16 cc[2];
492 size_t pa=encode_utf16(*psz, cc);
493
494 if (pa == (size_t)-1)
495 return pa;
496
497 if (buf)
498 {
499 *(wxUint16*)buf = cc[0];
500 buf += sizeof(wxUint16);
501 if (pa > 1)
502 {
503 *(wxUint16*)buf = cc[1];
504 buf += sizeof(wxUint16);
505 }
506 }
507
508 len += pa*sizeof(wxUint16);
509 psz++;
510 }
511 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
512
513 return len;
514 }
515
516
517 // swap 16bit MB to 32bit String
518 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
519 {
520 size_t len=0;
521
522 while (*(wxUint16*)psz && (!buf || len < n))
523 {
524 wxUint32 cc;
525 char tmp[4];
526 tmp[0]=psz[1]; tmp[1]=psz[0];
527 tmp[2]=psz[3]; tmp[3]=psz[2];
528
529 size_t pa=decode_utf16((wxUint16*)tmp, cc);
530 if (pa == (size_t)-1)
531 return pa;
532
533 if (buf)
534 *buf++ = cc;
535
536 len++;
537 psz += pa * sizeof(wxUint16);
538 }
539 if (buf && len<n) *buf=0;
540
541 return len;
542 }
543
544
545 // swap 32bit String to 16bit MB
546 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
547 {
548 size_t len=0;
549
550 while (*psz && (!buf || len < n))
551 {
552 wxUint16 cc[2];
553 size_t pa=encode_utf16(*psz, cc);
554
555 if (pa == (size_t)-1)
556 return pa;
557
558 if (buf)
559 {
560 *buf++ = ((char*)cc)[1];
561 *buf++ = ((char*)cc)[0];
562 if (pa > 1)
563 {
564 *buf++ = ((char*)cc)[3];
565 *buf++ = ((char*)cc)[2];
566 }
567 }
568
569 len += pa*sizeof(wxUint16);
570 psz++;
571 }
572 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
573
574 return len;
575 }
576
577 #endif // WC_UTF16
578
579
580 // ----------------------------------------------------------------------------
581 // UTF-32
582 // ----------------------------------------------------------------------------
583
584 #ifdef WORDS_BIGENDIAN
585 #define wxMBConvUTF32straight wxMBConvUTF32BE
586 #define wxMBConvUTF32swap wxMBConvUTF32LE
587 #else
588 #define wxMBConvUTF32swap wxMBConvUTF32BE
589 #define wxMBConvUTF32straight wxMBConvUTF32LE
590 #endif
591
592
593 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
594 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
595
596
597 #ifdef WC_UTF16
598
599 // copy 32bit MB to 16bit String
600 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
601 {
602 size_t len=0;
603
604 while (*(wxUint32*)psz && (!buf || len < n))
605 {
606 wxUint16 cc[2];
607
608 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
609 if (pa == (size_t)-1)
610 return pa;
611
612 if (buf)
613 {
614 *buf++ = cc[0];
615 if (pa > 1)
616 *buf++ = cc[1];
617 }
618 len += pa;
619 psz += sizeof(wxUint32);
620 }
621 if (buf && len<n) *buf=0;
622
623 return len;
624 }
625
626
627 // copy 16bit String to 32bit MB
628 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
629 {
630 size_t len=0;
631
632 while (*psz && (!buf || len < n))
633 {
634 wxUint32 cc;
635
636 // cast is ok for WC_UTF16
637 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
638 if (pa == (size_t)-1)
639 return pa;
640
641 if (buf)
642 {
643 *(wxUint32*)buf = cc;
644 buf += sizeof(wxUint32);
645 }
646 len += sizeof(wxUint32);
647 psz += pa;
648 }
649
650 if (buf && len<=n-sizeof(wxUint32))
651 *(wxUint32*)buf=0;
652
653 return len;
654 }
655
656
657
658 // swap 32bit MB to 16bit String
659 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
660 {
661 size_t len=0;
662
663 while (*(wxUint32*)psz && (!buf || len < n))
664 {
665 char tmp[4];
666 tmp[0] = psz[3]; tmp[1] = psz[2];
667 tmp[2] = psz[1]; tmp[3] = psz[0];
668
669
670 wxUint16 cc[2];
671
672 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
673 if (pa == (size_t)-1)
674 return pa;
675
676 if (buf)
677 {
678 *buf++ = cc[0];
679 if (pa > 1)
680 *buf++ = cc[1];
681 }
682 len += pa;
683 psz += sizeof(wxUint32);
684 }
685
686 if (buf && len<n)
687 *buf=0;
688
689 return len;
690 }
691
692
693 // swap 16bit String to 32bit MB
694 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
695 {
696 size_t len=0;
697
698 while (*psz && (!buf || len < n))
699 {
700 char cc[4];
701
702 // cast is ok for WC_UTF16
703 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
704 if (pa == (size_t)-1)
705 return pa;
706
707 if (buf)
708 {
709 *buf++ = cc[3];
710 *buf++ = cc[2];
711 *buf++ = cc[1];
712 *buf++ = cc[0];
713 }
714 len += sizeof(wxUint32);
715 psz += pa;
716 }
717
718 if (buf && len<=n-sizeof(wxUint32))
719 *(wxUint32*)buf=0;
720
721 return len;
722 }
723
724 #else // WC_UTF16
725
726
727 // copy 32bit MB to 32bit String
728 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
729 {
730 size_t len=0;
731
732 while (*(wxUint32*)psz && (!buf || len < n))
733 {
734 if (buf)
735 *buf++ = *(wxUint32*)psz;
736 len++;
737 psz += sizeof(wxUint32);
738 }
739
740 if (buf && len<n)
741 *buf=0;
742
743 return len;
744 }
745
746
747 // copy 32bit String to 32bit MB
748 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
749 {
750 size_t len=0;
751
752 while (*psz && (!buf || len < n))
753 {
754 if (buf)
755 {
756 *(wxUint32*)buf = *psz;
757 buf += sizeof(wxUint32);
758 }
759
760 len += sizeof(wxUint32);
761 psz++;
762 }
763
764 if (buf && len<=n-sizeof(wxUint32))
765 *(wxUint32*)buf=0;
766
767 return len;
768 }
769
770
771 // swap 32bit MB to 32bit String
772 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
773 {
774 size_t len=0;
775
776 while (*(wxUint32*)psz && (!buf || len < n))
777 {
778 if (buf)
779 {
780 ((char *)buf)[0] = psz[3];
781 ((char *)buf)[1] = psz[2];
782 ((char *)buf)[2] = psz[1];
783 ((char *)buf)[3] = psz[0];
784 buf++;
785 }
786 len++;
787 psz += sizeof(wxUint32);
788 }
789
790 if (buf && len<n)
791 *buf=0;
792
793 return len;
794 }
795
796
797 // swap 32bit String to 32bit MB
798 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
799 {
800 size_t len=0;
801
802 while (*psz && (!buf || len < n))
803 {
804 if (buf)
805 {
806 *buf++ = ((char *)psz)[3];
807 *buf++ = ((char *)psz)[2];
808 *buf++ = ((char *)psz)[1];
809 *buf++ = ((char *)psz)[0];
810 }
811 len += sizeof(wxUint32);
812 psz++;
813 }
814
815 if (buf && len<=n-sizeof(wxUint32))
816 *(wxUint32*)buf=0;
817
818 return len;
819 }
820
821
822 #endif // WC_UTF16
823
824
825 // ============================================================================
826 // The classes doing conversion using the iconv_xxx() functions
827 // ============================================================================
828
829 #ifdef HAVE_ICONV
830
831 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
832 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
833 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
834 // (which means error) and says there are 0 bytes left in the input buffer --
835 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
836 // this alternative test for iconv() failure.
837 // [This bug does not appear in glibc 2.2.]
838 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
839 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
840 (errno != E2BIG || bufLeft != 0))
841 #else
842 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
843 #endif
844
845 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
846
847 // ----------------------------------------------------------------------------
848 // wxMBConv_iconv: encapsulates an iconv character set
849 // ----------------------------------------------------------------------------
850
851 class wxMBConv_iconv : public wxMBConv
852 {
853 public:
854 wxMBConv_iconv(const wxChar *name);
855 virtual ~wxMBConv_iconv();
856
857 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
858 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
859
860 bool IsOk() const
861 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
862
863 protected:
864 // the iconv handlers used to translate from multibyte to wide char and in
865 // the other direction
866 iconv_t m2w,
867 w2m;
868
869 private:
870 // the name (for iconv_open()) of a wide char charset -- if none is
871 // available on this machine, it will remain NULL
872 static const char *ms_wcCharsetName;
873
874 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
875 // different endian-ness than the native one
876 static bool ms_wcNeedsSwap;
877 };
878
879 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
880 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
881
882 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
883 {
884 // Do it the hard way
885 char cname[100];
886 for (size_t i = 0; i < wxStrlen(name)+1; i++)
887 cname[i] = (char) name[i];
888
889 // check for charset that represents wchar_t:
890 if (ms_wcCharsetName == NULL)
891 {
892 ms_wcNeedsSwap = false;
893
894 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
895 ms_wcCharsetName = WC_NAME_BEST;
896 m2w = iconv_open(ms_wcCharsetName, cname);
897
898 if (m2w == (iconv_t)-1)
899 {
900 // try charset w/o bytesex info (e.g. "UCS4")
901 // and check for bytesex ourselves:
902 ms_wcCharsetName = WC_NAME;
903 m2w = iconv_open(ms_wcCharsetName, cname);
904
905 // last bet, try if it knows WCHAR_T pseudo-charset
906 if (m2w == (iconv_t)-1)
907 {
908 ms_wcCharsetName = "WCHAR_T";
909 m2w = iconv_open(ms_wcCharsetName, cname);
910 }
911
912 if (m2w != (iconv_t)-1)
913 {
914 char buf[2], *bufPtr;
915 wchar_t wbuf[2], *wbufPtr;
916 size_t insz, outsz;
917 size_t res;
918
919 buf[0] = 'A';
920 buf[1] = 0;
921 wbuf[0] = 0;
922 insz = 2;
923 outsz = SIZEOF_WCHAR_T * 2;
924 wbufPtr = wbuf;
925 bufPtr = buf;
926
927 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
928 (char**)&wbufPtr, &outsz);
929
930 if (ICONV_FAILED(res, insz))
931 {
932 ms_wcCharsetName = NULL;
933 wxLogLastError(wxT("iconv"));
934 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
935 }
936 else
937 {
938 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
939 }
940 }
941 else
942 {
943 ms_wcCharsetName = NULL;
944
945 // VS: we must not output an error here, since wxWindows will safely
946 // fall back to using wxEncodingConverter.
947 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
948 //wxLogError(
949 }
950 }
951 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
952 }
953 else // we already have ms_wcCharsetName
954 {
955 m2w = iconv_open(ms_wcCharsetName, cname);
956 }
957
958 // NB: don't ever pass NULL to iconv_open(), it may crash!
959 if ( ms_wcCharsetName )
960 {
961 w2m = iconv_open( cname, ms_wcCharsetName);
962 }
963 else
964 {
965 w2m = (iconv_t)-1;
966 }
967 }
968
969 wxMBConv_iconv::~wxMBConv_iconv()
970 {
971 if ( m2w != (iconv_t)-1 )
972 iconv_close(m2w);
973 if ( w2m != (iconv_t)-1 )
974 iconv_close(w2m);
975 }
976
977 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
978 {
979 size_t inbuf = strlen(psz);
980 size_t outbuf = n * SIZEOF_WCHAR_T;
981 size_t res, cres;
982 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
983 wchar_t *bufPtr = buf;
984 const char *pszPtr = psz;
985
986 if (buf)
987 {
988 // have destination buffer, convert there
989 cres = iconv(m2w,
990 ICONV_CHAR_CAST(&pszPtr), &inbuf,
991 (char**)&bufPtr, &outbuf);
992 res = n - (outbuf / SIZEOF_WCHAR_T);
993
994 if (ms_wcNeedsSwap)
995 {
996 // convert to native endianness
997 WC_BSWAP(buf /* _not_ bufPtr */, res)
998 }
999
1000 // NB: iconv was given only strlen(psz) characters on input, and so
1001 // it couldn't convert the trailing zero. Let's do it ourselves
1002 // if there's some room left for it in the output buffer.
1003 if (res < n)
1004 buf[res] = 0;
1005 }
1006 else
1007 {
1008 // no destination buffer... convert using temp buffer
1009 // to calculate destination buffer requirement
1010 wchar_t tbuf[8];
1011 res = 0;
1012 do {
1013 bufPtr = tbuf;
1014 outbuf = 8*SIZEOF_WCHAR_T;
1015
1016 cres = iconv(m2w,
1017 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1018 (char**)&bufPtr, &outbuf );
1019
1020 res += 8-(outbuf/SIZEOF_WCHAR_T);
1021 } while ((cres==(size_t)-1) && (errno==E2BIG));
1022 }
1023
1024 if (ICONV_FAILED(cres, inbuf))
1025 {
1026 //VS: it is ok if iconv fails, hence trace only
1027 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1028 return (size_t)-1;
1029 }
1030
1031 return res;
1032 }
1033
1034 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1035 {
1036 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1037 size_t outbuf = n;
1038 size_t res, cres;
1039
1040 wchar_t *tmpbuf = 0;
1041
1042 if (ms_wcNeedsSwap)
1043 {
1044 // need to copy to temp buffer to switch endianness
1045 // this absolutely doesn't rock!
1046 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1047 // could be in read-only memory, or be accessed in some other thread)
1048 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1049 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1050 WC_BSWAP(tmpbuf, inbuf)
1051 psz=tmpbuf;
1052 }
1053
1054 if (buf)
1055 {
1056 // have destination buffer, convert there
1057 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1058
1059 res = n-outbuf;
1060
1061 // NB: iconv was given only wcslen(psz) characters on input, and so
1062 // it couldn't convert the trailing zero. Let's do it ourselves
1063 // if there's some room left for it in the output buffer.
1064 if (res < n)
1065 buf[0] = 0;
1066 }
1067 else
1068 {
1069 // no destination buffer... convert using temp buffer
1070 // to calculate destination buffer requirement
1071 char tbuf[16];
1072 res = 0;
1073 do {
1074 buf = tbuf; outbuf = 16;
1075
1076 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1077
1078 res += 16 - outbuf;
1079 } while ((cres==(size_t)-1) && (errno==E2BIG));
1080 }
1081
1082 if (ms_wcNeedsSwap)
1083 {
1084 free(tmpbuf);
1085 }
1086
1087 if (ICONV_FAILED(cres, inbuf))
1088 {
1089 //VS: it is ok if iconv fails, hence trace only
1090 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1091 return (size_t)-1;
1092 }
1093
1094 return res;
1095 }
1096
1097 #endif // HAVE_ICONV
1098
1099
1100 // ============================================================================
1101 // Win32 conversion classes
1102 // ============================================================================
1103
1104 #ifdef wxHAVE_WIN32_MB2WC
1105
1106 // from utils.cpp
1107 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1108 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1109
1110 class wxMBConv_win32 : public wxMBConv
1111 {
1112 public:
1113 wxMBConv_win32()
1114 {
1115 m_CodePage = CP_ACP;
1116 }
1117
1118 wxMBConv_win32(const wxChar* name)
1119 {
1120 m_CodePage = wxCharsetToCodepage(name);
1121 }
1122
1123 wxMBConv_win32(wxFontEncoding encoding)
1124 {
1125 m_CodePage = wxEncodingToCodepage(encoding);
1126 }
1127
1128 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1129 {
1130 const size_t len = ::MultiByteToWideChar
1131 (
1132 m_CodePage, // code page
1133 0, // flags (none)
1134 psz, // input string
1135 -1, // its length (NUL-terminated)
1136 buf, // output string
1137 buf ? n : 0 // size of output buffer
1138 );
1139
1140 // note that it returns count of written chars for buf != NULL and size
1141 // of the needed buffer for buf == NULL so in either case the length of
1142 // the string (which never includes the terminating NUL) is one less
1143 return len ? len - 1 : (size_t)-1;
1144 }
1145
1146 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
1147 {
1148 const size_t len = ::WideCharToMultiByte
1149 (
1150 m_CodePage, // code page
1151 0, // flags (none)
1152 psz, // input string
1153 -1, // it is (wide) NUL-terminated
1154 buf, // output buffer
1155 buf ? n : 0, // and its size
1156 NULL, // default "replacement" char
1157 NULL // [out] was it used?
1158 );
1159
1160 // see the comment above for the reason of "len - 1"
1161 return len ? len - 1 : (size_t)-1;
1162 }
1163
1164 bool IsOk() const
1165 { return m_CodePage != -1; }
1166
1167 public:
1168 long m_CodePage;
1169 };
1170
1171 #endif // wxHAVE_WIN32_MB2WC
1172
1173
1174 // ============================================================================
1175 // wxEncodingConverter based conversion classes
1176 // ============================================================================
1177
1178 #if wxUSE_FONTMAP
1179
1180 class wxMBConv_wxwin : public wxMBConv
1181 {
1182 private:
1183 void Init()
1184 {
1185 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
1186 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
1187 }
1188
1189 public:
1190 // temporarily just use wxEncodingConverter stuff,
1191 // so that it works while a better implementation is built
1192 wxMBConv_wxwin(const wxChar* name)
1193 {
1194 if (name)
1195 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
1196 else
1197 m_enc = wxFONTENCODING_SYSTEM;
1198
1199 Init();
1200 }
1201
1202 wxMBConv_wxwin(wxFontEncoding enc)
1203 {
1204 m_enc = enc;
1205
1206 Init();
1207 }
1208
1209 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
1210 {
1211 size_t inbuf = strlen(psz);
1212 if (buf)
1213 m2w.Convert(psz,buf);
1214 return inbuf;
1215 }
1216
1217 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
1218 {
1219 const size_t inbuf = wxWcslen(psz);
1220 if (buf)
1221 w2m.Convert(psz,buf);
1222
1223 return inbuf;
1224 }
1225
1226 bool IsOk() const { return m_ok; }
1227
1228 public:
1229 wxFontEncoding m_enc;
1230 wxEncodingConverter m2w, w2m;
1231
1232 // were we initialized successfully?
1233 bool m_ok;
1234
1235 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
1236 };
1237
1238 #endif // wxUSE_FONTMAP
1239
1240 // ============================================================================
1241 // wxCSConv implementation
1242 // ============================================================================
1243
1244 void wxCSConv::Init()
1245 {
1246 m_name = NULL;
1247 m_convReal = NULL;
1248 m_deferred = true;
1249 }
1250
1251 wxCSConv::wxCSConv(const wxChar *charset)
1252 {
1253 Init();
1254
1255 if ( charset )
1256 {
1257 SetName(charset);
1258 }
1259
1260 m_encoding = wxFONTENCODING_SYSTEM;
1261 }
1262
1263 wxCSConv::wxCSConv(wxFontEncoding encoding)
1264 {
1265 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
1266 {
1267 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1268
1269 encoding = wxFONTENCODING_SYSTEM;
1270 }
1271
1272 Init();
1273
1274 m_encoding = encoding;
1275 }
1276
1277 wxCSConv::~wxCSConv()
1278 {
1279 Clear();
1280 }
1281
1282 wxCSConv::wxCSConv(const wxCSConv& conv)
1283 : wxMBConv()
1284 {
1285 Init();
1286
1287 SetName(conv.m_name);
1288 m_encoding = conv.m_encoding;
1289 }
1290
1291 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
1292 {
1293 Clear();
1294
1295 SetName(conv.m_name);
1296 m_encoding = conv.m_encoding;
1297
1298 return *this;
1299 }
1300
1301 void wxCSConv::Clear()
1302 {
1303 free(m_name);
1304 delete m_convReal;
1305
1306 m_name = NULL;
1307 m_convReal = NULL;
1308 }
1309
1310 void wxCSConv::SetName(const wxChar *charset)
1311 {
1312 if (charset)
1313 {
1314 m_name = wxStrdup(charset);
1315 m_deferred = true;
1316 }
1317 }
1318
1319 static inline bool DoesntNeedConv(wxFontEncoding enc)
1320 {
1321 return enc == wxFONTENCODING_DEFAULT ||
1322 enc == wxFONTENCODING_SYSTEM ||
1323 enc == wxFONTENCODING_ISO8859_1;
1324 }
1325
1326 wxMBConv *wxCSConv::DoCreate() const
1327 {
1328 #if wxUSE_FONTMAP
1329 wxFontMapper * const fontMapper = wxFontMapper::Get();
1330
1331 wxFontEncoding encFromName = m_name ? fontMapper->CharsetToEncoding(m_name)
1332 : wxFONTENCODING_SYSTEM;
1333 #endif // wxUSE_FONTMAP
1334
1335 // check for the special case of ASCII charset
1336 if ( (!m_name && DoesntNeedConv(m_encoding))
1337 #if wxUSE_FONTMAP
1338 || (m_name && DoesntNeedConv(encFromName))
1339 #endif // wxUSE_FONTMAP
1340 )
1341 {
1342 // don't convert at all
1343 return NULL;
1344 }
1345
1346 // we trust OS to do conversion better than we can so try external
1347 // conversion methods first
1348 //
1349 // the full order is:
1350 // 1. OS conversion (iconv() under Unix or Win32 API)
1351 // 2. hard coded conversions for UTF
1352 // 3. wxEncodingConverter as fall back
1353
1354 // step (1)
1355 #ifdef HAVE_ICONV
1356 if ( m_name )
1357 {
1358 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
1359 if ( conv->IsOk() )
1360 return conv;
1361
1362 delete conv;
1363 }
1364 #endif // HAVE_ICONV
1365
1366 #ifdef wxHAVE_WIN32_MB2WC
1367 {
1368 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
1369 : new wxMBConv_win32(m_encoding);
1370 if ( conv->IsOk() )
1371 return conv;
1372
1373 delete conv;
1374 }
1375 #endif // wxHAVE_WIN32_MB2WC
1376
1377 // step (2)
1378 wxFontEncoding enc = m_encoding;
1379 #if wxUSE_FONTMAP
1380 if ( enc == wxFONTENCODING_SYSTEM )
1381 enc = encFromName;
1382 #endif // wxUSE_FONTMAP
1383
1384 switch ( enc )
1385 {
1386 case wxFONTENCODING_UTF7:
1387 return new wxMBConvUTF7;
1388
1389 case wxFONTENCODING_UTF8:
1390 return new wxMBConvUTF8;
1391
1392 case wxFONTENCODING_UTF16BE:
1393 return new wxMBConvUTF16BE;
1394
1395 case wxFONTENCODING_UTF16LE:
1396 return new wxMBConvUTF16LE;
1397
1398 case wxFONTENCODING_UTF32BE:
1399 return new wxMBConvUTF32BE;
1400
1401 case wxFONTENCODING_UTF32LE:
1402 return new wxMBConvUTF32LE;
1403
1404 default:
1405 // nothing to do but put here to suppress gcc warnings
1406 ;
1407 }
1408
1409 // step (3)
1410 #if wxUSE_FONTMAP
1411 {
1412 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
1413 : new wxMBConv_wxwin(m_encoding);
1414 if ( conv->IsOk() )
1415 return conv;
1416
1417 delete conv;
1418 }
1419 #endif // wxUSE_FONTMAP
1420
1421 wxLogError(_("Cannot convert from the charset '%s'!"),
1422 m_name ? m_name
1423 :
1424 #if wxUSE_FONTMAP
1425 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
1426 #else // !wxUSE_FONTMAP
1427 wxString::Format(_("encoding %s"), m_encoding).c_str()
1428 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1429 );
1430
1431 return NULL;
1432 }
1433
1434 void wxCSConv::CreateConvIfNeeded() const
1435 {
1436 if ( m_deferred )
1437 {
1438 wxCSConv *self = (wxCSConv *)this; // const_cast
1439
1440 #if wxUSE_INTL
1441 // if we don't have neither the name nor the encoding, use the default
1442 // encoding for this system
1443 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
1444 {
1445 self->m_encoding = wxLocale::GetSystemEncoding();
1446 }
1447 #endif // wxUSE_INTL
1448
1449 self->m_convReal = DoCreate();
1450 self->m_deferred = false;
1451 }
1452 }
1453
1454 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1455 {
1456 CreateConvIfNeeded();
1457
1458 if (m_convReal)
1459 return m_convReal->MB2WC(buf, psz, n);
1460
1461 // latin-1 (direct)
1462 size_t len = strlen(psz);
1463
1464 if (buf)
1465 {
1466 for (size_t c = 0; c <= len; c++)
1467 buf[c] = (unsigned char)(psz[c]);
1468 }
1469
1470 return len;
1471 }
1472
1473 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1474 {
1475 CreateConvIfNeeded();
1476
1477 if (m_convReal)
1478 return m_convReal->WC2MB(buf, psz, n);
1479
1480 // latin-1 (direct)
1481 const size_t len = wxWcslen(psz);
1482 if (buf)
1483 {
1484 for (size_t c = 0; c <= len; c++)
1485 buf[c] = (psz[c] > 0xff) ? '?' : psz[c];
1486 }
1487
1488 return len;
1489 }
1490
1491 // ----------------------------------------------------------------------------
1492 // globals
1493 // ----------------------------------------------------------------------------
1494
1495 #ifdef __WINDOWS__
1496 static wxMBConv_win32 wxConvLibcObj;
1497 #else
1498 static wxMBConvLibc wxConvLibcObj;
1499 #endif
1500
1501 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
1502 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
1503 static wxMBConvUTF7 wxConvUTF7Obj;
1504 static wxMBConvUTF8 wxConvUTF8Obj;
1505
1506
1507 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
1508 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
1509 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
1510 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
1511 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
1512 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
1513
1514 #else // !wxUSE_WCHAR_T
1515
1516 // stand-ins in absence of wchar_t
1517 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
1518 wxConvISO8859_1,
1519 wxConvLocal,
1520 wxConvUTF8;
1521
1522 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
1523
1524