]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
more wxMBConv classes cleanup, define wxConvLibc to use Win32 API under Windows
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
5 // Modified by:
6 // Created: 29/01/98
7 // RCS-ID: $Id$
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
12
13 // ============================================================================
14 // declarations
15 // ============================================================================
16
17 // ----------------------------------------------------------------------------
18 // headers
19 // ----------------------------------------------------------------------------
20
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
23 #endif
24
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
27
28 #ifdef __BORLANDC__
29 #pragma hdrstop
30 #endif
31
32 #ifndef WX_PRECOMP
33 #include "wx/intl.h"
34 #include "wx/log.h"
35 #endif // WX_PRECOMP
36
37 #include "wx/strconv.h"
38
39 #if wxUSE_WCHAR_T
40
41 #ifdef __WXMSW__
42 #include "wx/msw/private.h"
43 #endif
44
45 #ifndef __WXWINCE__
46 #include <errno.h>
47 #endif
48
49 #include <ctype.h>
50 #include <string.h>
51 #include <stdlib.h>
52
53 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
54 #define wxHAVE_WIN32_MB2WC
55 #endif // __WIN32__ but !__WXMICROWIN__
56
57 // ----------------------------------------------------------------------------
58 // headers
59 // ----------------------------------------------------------------------------
60
61 #ifdef __SALFORDC__
62 #include <clib.h>
63 #endif
64
65 #ifdef HAVE_ICONV
66 #include <iconv.h>
67 #endif
68
69 #include "wx/encconv.h"
70 #include "wx/fontmap.h"
71
72 // ----------------------------------------------------------------------------
73 // macros
74 // ----------------------------------------------------------------------------
75
76 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
77 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
78
79 #if SIZEOF_WCHAR_T == 4
80 #define WC_NAME "UCS4"
81 #define WC_BSWAP BSWAP_UCS4
82 #ifdef WORDS_BIGENDIAN
83 #define WC_NAME_BEST "UCS-4BE"
84 #else
85 #define WC_NAME_BEST "UCS-4LE"
86 #endif
87 #elif SIZEOF_WCHAR_T == 2
88 #define WC_NAME "UTF16"
89 #define WC_BSWAP BSWAP_UTF16
90 #define WC_UTF16
91 #ifdef WORDS_BIGENDIAN
92 #define WC_NAME_BEST "UTF-16BE"
93 #else
94 #define WC_NAME_BEST "UTF-16LE"
95 #endif
96 #else // sizeof(wchar_t) != 2 nor 4
97 // does this ever happen?
98 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
99 #endif
100
101 // ============================================================================
102 // implementation
103 // ============================================================================
104
105 // ----------------------------------------------------------------------------
106 // UTF-16 en/decoding to/from UCS-4
107 // ----------------------------------------------------------------------------
108
109
110 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
111 {
112 if (input<=0xffff)
113 {
114 if (output) *output++ = (wxUint16) input;
115 return 1;
116 }
117 else if (input>=0x110000)
118 {
119 return (size_t)-1;
120 }
121 else
122 {
123 if (output)
124 {
125 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
126 *output++ = (wxUint16) ((input&0x3ff)+0xdc00);
127 }
128 return 2;
129 }
130 }
131
132 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
133 {
134 if ((*input<0xd800) || (*input>0xdfff))
135 {
136 output = *input;
137 return 1;
138 }
139 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
140 {
141 output = *input;
142 return (size_t)-1;
143 }
144 else
145 {
146 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
147 return 2;
148 }
149 }
150
151
152 // ----------------------------------------------------------------------------
153 // wxMBConv
154 // ----------------------------------------------------------------------------
155
156 wxMBConv::~wxMBConv()
157 {
158 // nothing to do here
159 }
160
161 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
162 {
163 if ( psz )
164 {
165 // calculate the length of the buffer needed first
166 size_t nLen = MB2WC(NULL, psz, 0);
167 if ( nLen != (size_t)-1 )
168 {
169 // now do the actual conversion
170 wxWCharBuffer buf(nLen);
171 MB2WC(buf.data(), psz, nLen + 1); // with the trailing NUL
172
173 return buf;
174 }
175 }
176
177 wxWCharBuffer buf((wchar_t *)NULL);
178
179 return buf;
180 }
181
182 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
183 {
184 if ( pwz )
185 {
186 size_t nLen = WC2MB(NULL, pwz, 0);
187 if ( nLen != (size_t)-1 )
188 {
189 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
190 WC2MB(buf.data(), pwz, nLen + 4);
191
192 return buf;
193 }
194 }
195
196 wxCharBuffer buf((char *)NULL);
197
198 return buf;
199 }
200
201 // ----------------------------------------------------------------------------
202 // wxMBConvLibc
203 // ----------------------------------------------------------------------------
204
205 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
206 {
207 return wxMB2WC(buf, psz, n);
208 }
209
210 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
211 {
212 return wxWC2MB(buf, psz, n);
213 }
214
215 // ----------------------------------------------------------------------------
216 // UTF-7
217 // ----------------------------------------------------------------------------
218
219 #if 0
220 static char utf7_setD[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
221 "abcdefghijklmnopqrstuvwxyz"
222 "0123456789'(),-./:?";
223 static char utf7_setO[]="!\"#$%&*;<=>@[]^_`{|}";
224 static char utf7_setB[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
225 "abcdefghijklmnopqrstuvwxyz"
226 "0123456789+/";
227 #endif
228
229 // TODO: write actual implementations of UTF-7 here
230 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf),
231 const char * WXUNUSED(psz),
232 size_t WXUNUSED(n)) const
233 {
234 return 0;
235 }
236
237 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf),
238 const wchar_t * WXUNUSED(psz),
239 size_t WXUNUSED(n)) const
240 {
241 return 0;
242 }
243
244 // ----------------------------------------------------------------------------
245 // UTF-8
246 // ----------------------------------------------------------------------------
247
248 static wxUint32 utf8_max[]=
249 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
250
251 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
252 {
253 size_t len = 0;
254
255 while (*psz && ((!buf) || (len < n)))
256 {
257 unsigned char cc = *psz++, fc = cc;
258 unsigned cnt;
259 for (cnt = 0; fc & 0x80; cnt++)
260 fc <<= 1;
261 if (!cnt)
262 {
263 // plain ASCII char
264 if (buf)
265 *buf++ = cc;
266 len++;
267 }
268 else
269 {
270 cnt--;
271 if (!cnt)
272 {
273 // invalid UTF-8 sequence
274 return (size_t)-1;
275 }
276 else
277 {
278 unsigned ocnt = cnt - 1;
279 wxUint32 res = cc & (0x3f >> cnt);
280 while (cnt--)
281 {
282 cc = *psz++;
283 if ((cc & 0xC0) != 0x80)
284 {
285 // invalid UTF-8 sequence
286 return (size_t)-1;
287 }
288 res = (res << 6) | (cc & 0x3f);
289 }
290 if (res <= utf8_max[ocnt])
291 {
292 // illegal UTF-8 encoding
293 return (size_t)-1;
294 }
295 #ifdef WC_UTF16
296 size_t pa = encode_utf16(res, buf);
297 if (pa == (size_t)-1)
298 return (size_t)-1;
299 if (buf)
300 buf += pa;
301 len += pa;
302 #else // !WC_UTF16
303 if (buf)
304 *buf++ = res;
305 len++;
306 #endif // WC_UTF16/!WC_UTF16
307 }
308 }
309 }
310 if (buf && (len < n))
311 *buf = 0;
312 return len;
313 }
314
315 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
316 {
317 size_t len = 0;
318
319 while (*psz && ((!buf) || (len < n)))
320 {
321 wxUint32 cc;
322 #ifdef WC_UTF16
323 size_t pa = decode_utf16(psz, cc);
324 psz += (pa == (size_t)-1) ? 1 : pa;
325 #else
326 cc=(*psz++) & 0x7fffffff;
327 #endif
328 unsigned cnt;
329 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
330 if (!cnt)
331 {
332 // plain ASCII char
333 if (buf)
334 *buf++ = (char) cc;
335 len++;
336 }
337
338 else
339 {
340 len += cnt + 1;
341 if (buf)
342 {
343 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
344 while (cnt--)
345 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
346 }
347 }
348 }
349
350 if (buf && (len<n)) *buf = 0;
351
352 return len;
353 }
354
355
356
357
358 // ----------------------------------------------------------------------------
359 // UTF-16
360 // ----------------------------------------------------------------------------
361
362 #ifdef WORDS_BIGENDIAN
363 #define wxMBConvUTF16straight wxMBConvUTF16BE
364 #define wxMBConvUTF16swap wxMBConvUTF16LE
365 #else
366 #define wxMBConvUTF16swap wxMBConvUTF16BE
367 #define wxMBConvUTF16straight wxMBConvUTF16LE
368 #endif
369
370
371 #ifdef WC_UTF16
372
373 // copy 16bit MB to 16bit String
374 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
375 {
376 size_t len=0;
377
378 while (*(wxUint16*)psz && (!buf || len < n))
379 {
380 if (buf)
381 *buf++ = *(wxUint16*)psz;
382 len++;
383
384 psz += sizeof(wxUint16);
385 }
386 if (buf && len<n) *buf=0;
387
388 return len;
389 }
390
391
392 // copy 16bit String to 16bit MB
393 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
394 {
395 size_t len=0;
396
397 while (*psz && (!buf || len < n))
398 {
399 if (buf)
400 {
401 *(wxUint16*)buf = *psz;
402 buf += sizeof(wxUint16);
403 }
404 len += sizeof(wxUint16);
405 psz++;
406 }
407 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
408
409 return len;
410 }
411
412
413 // swap 16bit MB to 16bit String
414 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
415 {
416 size_t len=0;
417
418 while (*(wxUint16*)psz && (!buf || len < n))
419 {
420 if (buf)
421 {
422 ((char *)buf)[0] = psz[1];
423 ((char *)buf)[1] = psz[0];
424 buf++;
425 }
426 len++;
427 psz += sizeof(wxUint16);
428 }
429 if (buf && len<n) *buf=0;
430
431 return len;
432 }
433
434
435 // swap 16bit MB to 16bit String
436 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
437 {
438 size_t len=0;
439
440 while (*psz && (!buf || len < n))
441 {
442 if (buf)
443 {
444 *buf++ = ((char*)psz)[1];
445 *buf++ = ((char*)psz)[0];
446 }
447 len += sizeof(wxUint16);
448 psz++;
449 }
450 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
451
452 return len;
453 }
454
455
456 #else // WC_UTF16
457
458
459 // copy 16bit MB to 32bit String
460 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
461 {
462 size_t len=0;
463
464 while (*(wxUint16*)psz && (!buf || len < n))
465 {
466 wxUint32 cc;
467 size_t pa=decode_utf16((wxUint16*)psz, cc);
468 if (pa == (size_t)-1)
469 return pa;
470
471 if (buf)
472 *buf++ = cc;
473 len++;
474 psz += pa * sizeof(wxUint16);
475 }
476 if (buf && len<n) *buf=0;
477
478 return len;
479 }
480
481
482 // copy 32bit String to 16bit MB
483 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
484 {
485 size_t len=0;
486
487 while (*psz && (!buf || len < n))
488 {
489 wxUint16 cc[2];
490 size_t pa=encode_utf16(*psz, cc);
491
492 if (pa == (size_t)-1)
493 return pa;
494
495 if (buf)
496 {
497 *(wxUint16*)buf = cc[0];
498 buf += sizeof(wxUint16);
499 if (pa > 1)
500 {
501 *(wxUint16*)buf = cc[1];
502 buf += sizeof(wxUint16);
503 }
504 }
505
506 len += pa*sizeof(wxUint16);
507 psz++;
508 }
509 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
510
511 return len;
512 }
513
514
515 // swap 16bit MB to 32bit String
516 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
517 {
518 size_t len=0;
519
520 while (*(wxUint16*)psz && (!buf || len < n))
521 {
522 wxUint32 cc;
523 char tmp[4];
524 tmp[0]=psz[1]; tmp[1]=psz[0];
525 tmp[2]=psz[3]; tmp[3]=psz[2];
526
527 size_t pa=decode_utf16((wxUint16*)tmp, cc);
528 if (pa == (size_t)-1)
529 return pa;
530
531 if (buf)
532 *buf++ = cc;
533
534 len++;
535 psz += pa * sizeof(wxUint16);
536 }
537 if (buf && len<n) *buf=0;
538
539 return len;
540 }
541
542
543 // swap 32bit String to 16bit MB
544 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
545 {
546 size_t len=0;
547
548 while (*psz && (!buf || len < n))
549 {
550 wxUint16 cc[2];
551 size_t pa=encode_utf16(*psz, cc);
552
553 if (pa == (size_t)-1)
554 return pa;
555
556 if (buf)
557 {
558 *buf++ = ((char*)cc)[1];
559 *buf++ = ((char*)cc)[0];
560 if (pa > 1)
561 {
562 *buf++ = ((char*)cc)[3];
563 *buf++ = ((char*)cc)[2];
564 }
565 }
566
567 len += pa*sizeof(wxUint16);
568 psz++;
569 }
570 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
571
572 return len;
573 }
574
575 #endif // WC_UTF16
576
577
578 // ----------------------------------------------------------------------------
579 // UTF-32
580 // ----------------------------------------------------------------------------
581
582 #ifdef WORDS_BIGENDIAN
583 #define wxMBConvUTF32straight wxMBConvUTF32BE
584 #define wxMBConvUTF32swap wxMBConvUTF32LE
585 #else
586 #define wxMBConvUTF32swap wxMBConvUTF32BE
587 #define wxMBConvUTF32straight wxMBConvUTF32LE
588 #endif
589
590
591 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
592 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
593
594
595 #ifdef WC_UTF16
596
597 // copy 32bit MB to 16bit String
598 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
599 {
600 size_t len=0;
601
602 while (*(wxUint32*)psz && (!buf || len < n))
603 {
604 wxUint16 cc[2];
605
606 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
607 if (pa == (size_t)-1)
608 return pa;
609
610 if (buf)
611 {
612 *buf++ = cc[0];
613 if (pa > 1)
614 *buf++ = cc[1];
615 }
616 len += pa;
617 psz += sizeof(wxUint32);
618 }
619 if (buf && len<n) *buf=0;
620
621 return len;
622 }
623
624
625 // copy 16bit String to 32bit MB
626 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
627 {
628 size_t len=0;
629
630 while (*psz && (!buf || len < n))
631 {
632 wxUint32 cc;
633
634 size_t pa=decode_utf16(psz, cc);
635 if (pa == (size_t)-1)
636 return pa;
637
638 if (buf)
639 {
640 *(wxUint32*)buf = cc;
641 buf += sizeof(wxUint32);
642 }
643 len += sizeof(wxUint32);
644 psz += pa;
645 }
646 if (buf && len<=n-sizeof(wxUint32)) *(wxUint32*)buf=0;
647
648 return len;
649 }
650
651
652
653 // swap 32bit MB to 16bit String
654 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
655 {
656 size_t len=0;
657
658 while (*(wxUint32*)psz && (!buf || len < n))
659 {
660 char tmp[4];
661 tmp[0] = psz[3]; tmp[1] = psz[2];
662 tmp[2] = psz[1]; tmp[3] = psz[0];
663
664
665 wxUint16 cc[2];
666
667 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
668 if (pa == (size_t)-1)
669 return pa;
670
671 if (buf)
672 {
673 *buf++ = cc[0];
674 if (pa > 1)
675 *buf++ = cc[1];
676 }
677 len += pa;
678 psz += sizeof(wxUint32);
679 }
680 if (buf && len<n) *buf=0;
681
682 return len;
683 }
684
685
686 // swap 16bit String to 32bit MB
687 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
688 {
689 size_t len=0;
690
691 while (*psz && (!buf || len < n))
692 {
693 char cc[4];
694
695 size_t pa=decode_utf16(psz, *(wxUint32*)cc);
696 if (pa == (size_t)-1)
697 return pa;
698
699 if (buf)
700 {
701 *buf++ = cc[3];
702 *buf++ = cc[2];
703 *buf++ = cc[1];
704 *buf++ = cc[0];
705 }
706 len += sizeof(wxUint32);
707 psz += pa;
708 }
709 if (buf && len<=n-sizeof(wxUint32)) *(wxUint32*)buf=0;
710
711 return len;
712 }
713
714 #else // WC_UTF16
715
716
717 // copy 32bit MB to 32bit String
718 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
719 {
720 size_t len=0;
721
722 while (*(wxUint32*)psz && (!buf || len < n))
723 {
724 if (buf)
725 *buf++ = *(wxUint32*)psz;
726 len++;
727 psz += sizeof(wxUint32);
728 }
729 if (buf && len<n) *buf=0;
730
731 return len;
732 }
733
734
735 // copy 32bit String to 32bit MB
736 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
737 {
738 size_t len=0;
739
740 while (*psz && (!buf || len < n))
741 {
742 if (buf)
743 {
744 *(wxUint32*)buf = *psz;
745 buf += sizeof(wxUint32);
746 }
747
748 len += sizeof(wxUint32);
749 psz++;
750 }
751
752 if (buf && len<=n-sizeof(wxUint32)) *(wxUint32*)buf=0;
753
754 return len;
755 }
756
757
758 // swap 32bit MB to 32bit String
759 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
760 {
761 size_t len=0;
762
763 while (*(wxUint32*)psz && (!buf || len < n))
764 {
765 if (buf)
766 {
767 ((char *)buf)[0] = psz[3];
768 ((char *)buf)[1] = psz[2];
769 ((char *)buf)[2] = psz[1];
770 ((char *)buf)[3] = psz[0];
771 buf++;
772 }
773 len++;
774 psz += sizeof(wxUint32);
775 }
776 if (buf && len<n) *buf=0;
777
778 return len;
779 }
780
781
782 // swap 32bit String to 32bit MB
783 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
784 {
785 size_t len=0;
786
787 while (*psz && (!buf || len < n))
788 {
789 if (buf)
790 {
791 *buf++ = ((char *)psz)[3];
792 *buf++ = ((char *)psz)[2];
793 *buf++ = ((char *)psz)[1];
794 *buf++ = ((char *)psz)[0];
795 }
796 len += sizeof(wxUint32);
797 psz++;
798 }
799 if (buf && len<=n-sizeof(wxUint32)) *(wxUint32*)buf=0;
800
801 return len;
802 }
803
804
805 #endif // WC_UTF16
806
807
808 // ============================================================================
809 // The classes doing conversion using the iconv_xxx() functions
810 // ============================================================================
811
812 #ifdef HAVE_ICONV
813
814 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
815 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
816 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
817 // (which means error) and says there are 0 bytes left in the input buffer --
818 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
819 // this alternative test for iconv() failure.
820 // [This bug does not appear in glibc 2.2.]
821 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
822 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
823 (errno != E2BIG || bufLeft != 0))
824 #else
825 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
826 #endif
827
828 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
829
830 // ----------------------------------------------------------------------------
831 // wxMBConv_iconv: encapsulates an iconv character set
832 // ----------------------------------------------------------------------------
833
834 class wxMBConv_iconv : public wxMBConv
835 {
836 public:
837 wxMBConv_iconv(const wxChar *name);
838 virtual ~wxMBConv_iconv();
839
840 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
841 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
842
843 bool IsOk() const
844 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
845
846 protected:
847 // the iconv handlers used to translate from multibyte to wide char and in
848 // the other direction
849 iconv_t m2w,
850 w2m;
851
852 private:
853 // the name (for iconv_open()) of a wide char charset -- if none is
854 // available on this machine, it will remain NULL
855 static const char *ms_wcCharsetName;
856
857 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
858 // different endian-ness than the native one
859 static bool ms_wcNeedsSwap;
860 };
861
862 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
863 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
864
865 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
866 {
867 // Do it the hard way
868 char cname[100];
869 for (size_t i = 0; i < wxStrlen(name)+1; i++)
870 cname[i] = (char) name[i];
871
872 // check for charset that represents wchar_t:
873 if (ms_wcCharsetName == NULL)
874 {
875 ms_wcNeedsSwap = false;
876
877 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
878 ms_wcCharsetName = WC_NAME_BEST;
879 m2w = iconv_open(ms_wcCharsetName, cname);
880
881 if (m2w == (iconv_t)-1)
882 {
883 // try charset w/o bytesex info (e.g. "UCS4")
884 // and check for bytesex ourselves:
885 ms_wcCharsetName = WC_NAME;
886 m2w = iconv_open(ms_wcCharsetName, cname);
887
888 // last bet, try if it knows WCHAR_T pseudo-charset
889 if (m2w == (iconv_t)-1)
890 {
891 ms_wcCharsetName = "WCHAR_T";
892 m2w = iconv_open(ms_wcCharsetName, cname);
893 }
894
895 if (m2w != (iconv_t)-1)
896 {
897 char buf[2], *bufPtr;
898 wchar_t wbuf[2], *wbufPtr;
899 size_t insz, outsz;
900 size_t res;
901
902 buf[0] = 'A';
903 buf[1] = 0;
904 wbuf[0] = 0;
905 insz = 2;
906 outsz = SIZEOF_WCHAR_T * 2;
907 wbufPtr = wbuf;
908 bufPtr = buf;
909
910 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
911 (char**)&wbufPtr, &outsz);
912
913 if (ICONV_FAILED(res, insz))
914 {
915 ms_wcCharsetName = NULL;
916 wxLogLastError(wxT("iconv"));
917 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
918 }
919 else
920 {
921 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
922 }
923 }
924 else
925 {
926 ms_wcCharsetName = NULL;
927
928 // VS: we must not output an error here, since wxWindows will safely
929 // fall back to using wxEncodingConverter.
930 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
931 //wxLogError(
932 }
933 }
934 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
935 }
936 else // we already have ms_wcCharsetName
937 {
938 m2w = iconv_open(ms_wcCharsetName, cname);
939 }
940
941 // NB: don't ever pass NULL to iconv_open(), it may crash!
942 if ( ms_wcCharsetName )
943 {
944 w2m = iconv_open( cname, ms_wcCharsetName);
945 }
946 else
947 {
948 w2m = (iconv_t)-1;
949 }
950 }
951
952 wxMBConv_iconv::~wxMBConv_iconv()
953 {
954 if ( m2w != (iconv_t)-1 )
955 iconv_close(m2w);
956 if ( w2m != (iconv_t)-1 )
957 iconv_close(w2m);
958 }
959
960 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
961 {
962 size_t inbuf = strlen(psz);
963 size_t outbuf = n * SIZEOF_WCHAR_T;
964 size_t res, cres;
965 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
966 wchar_t *bufPtr = buf;
967 const char *pszPtr = psz;
968
969 if (buf)
970 {
971 // have destination buffer, convert there
972 cres = iconv(m2w,
973 ICONV_CHAR_CAST(&pszPtr), &inbuf,
974 (char**)&bufPtr, &outbuf);
975 res = n - (outbuf / SIZEOF_WCHAR_T);
976
977 if (ms_wcNeedsSwap)
978 {
979 // convert to native endianness
980 WC_BSWAP(buf /* _not_ bufPtr */, res)
981 }
982
983 // NB: iconv was given only strlen(psz) characters on input, and so
984 // it couldn't convert the trailing zero. Let's do it ourselves
985 // if there's some room left for it in the output buffer.
986 if (res < n)
987 buf[res] = 0;
988 }
989 else
990 {
991 // no destination buffer... convert using temp buffer
992 // to calculate destination buffer requirement
993 wchar_t tbuf[8];
994 res = 0;
995 do {
996 bufPtr = tbuf;
997 outbuf = 8*SIZEOF_WCHAR_T;
998
999 cres = iconv(m2w,
1000 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1001 (char**)&bufPtr, &outbuf );
1002
1003 res += 8-(outbuf/SIZEOF_WCHAR_T);
1004 } while ((cres==(size_t)-1) && (errno==E2BIG));
1005 }
1006
1007 if (ICONV_FAILED(cres, inbuf))
1008 {
1009 //VS: it is ok if iconv fails, hence trace only
1010 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1011 return (size_t)-1;
1012 }
1013
1014 return res;
1015 }
1016
1017 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1018 {
1019 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1020 size_t outbuf = n;
1021 size_t res, cres;
1022
1023 wchar_t *tmpbuf = 0;
1024
1025 if (ms_wcNeedsSwap)
1026 {
1027 // need to copy to temp buffer to switch endianness
1028 // this absolutely doesn't rock!
1029 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1030 // could be in read-only memory, or be accessed in some other thread)
1031 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1032 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1033 WC_BSWAP(tmpbuf, inbuf)
1034 psz=tmpbuf;
1035 }
1036
1037 if (buf)
1038 {
1039 // have destination buffer, convert there
1040 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1041
1042 res = n-outbuf;
1043
1044 // NB: iconv was given only wcslen(psz) characters on input, and so
1045 // it couldn't convert the trailing zero. Let's do it ourselves
1046 // if there's some room left for it in the output buffer.
1047 if (res < n)
1048 buf[0] = 0;
1049 }
1050 else
1051 {
1052 // no destination buffer... convert using temp buffer
1053 // to calculate destination buffer requirement
1054 char tbuf[16];
1055 res = 0;
1056 do {
1057 buf = tbuf; outbuf = 16;
1058
1059 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1060
1061 res += 16 - outbuf;
1062 } while ((cres==(size_t)-1) && (errno==E2BIG));
1063 }
1064
1065 if (ms_wcNeedsSwap)
1066 {
1067 free(tmpbuf);
1068 }
1069
1070 if (ICONV_FAILED(cres, inbuf))
1071 {
1072 //VS: it is ok if iconv fails, hence trace only
1073 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1074 return (size_t)-1;
1075 }
1076
1077 return res;
1078 }
1079
1080 #endif // HAVE_ICONV
1081
1082
1083 // ============================================================================
1084 // Win32 conversion classes
1085 // ============================================================================
1086
1087 #ifdef wxHAVE_WIN32_MB2WC
1088
1089 // from utils.cpp
1090 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1091 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1092
1093 class wxMBConv_win32 : public wxMBConv
1094 {
1095 public:
1096 wxMBConv_win32()
1097 {
1098 m_CodePage = CP_ACP;
1099 }
1100
1101 wxMBConv_win32(const wxChar* name)
1102 {
1103 m_CodePage = wxCharsetToCodepage(name);
1104 }
1105
1106 wxMBConv_win32(wxFontEncoding encoding)
1107 {
1108 m_CodePage = wxEncodingToCodepage(encoding);
1109 }
1110
1111 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1112 {
1113 const size_t len = ::MultiByteToWideChar
1114 (
1115 m_CodePage, // code page
1116 0, // flags (none)
1117 psz, // input string
1118 -1, // its length (NUL-terminated)
1119 buf, // output string
1120 buf ? n : 0 // size of output buffer
1121 );
1122
1123 // note that it returns # of written chars for buf != NULL and *size*
1124 // of the needed buffer for buf == NULL
1125 return len ? (buf ? len : len - 1) : (size_t)-1;
1126 }
1127
1128 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
1129 {
1130 const size_t len = ::WideCharToMultiByte
1131 (
1132 m_CodePage, // code page
1133 0, // flags (none)
1134 psz, // input string
1135 -1, // it is (wide) NUL-terminated
1136 buf, // output buffer
1137 buf ? n : 0, // and its size
1138 NULL, // default "replacement" char
1139 NULL // [out] was it used?
1140 );
1141
1142 // see the comment above!
1143 return len ? (buf ? len : len - 1) : (size_t)-1;
1144 }
1145
1146 bool IsOk() const
1147 { return m_CodePage != -1; }
1148
1149 public:
1150 long m_CodePage;
1151 };
1152
1153 #endif // wxHAVE_WIN32_MB2WC
1154
1155
1156 // ============================================================================
1157 // wxEncodingConverter based conversion classes
1158 // ============================================================================
1159
1160 #if wxUSE_FONTMAP
1161
1162 class wxMBConv_wxwin : public wxMBConv
1163 {
1164 private:
1165 void Init()
1166 {
1167 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
1168 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
1169 }
1170
1171 public:
1172 // temporarily just use wxEncodingConverter stuff,
1173 // so that it works while a better implementation is built
1174 wxMBConv_wxwin(const wxChar* name)
1175 {
1176 if (name)
1177 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
1178 else
1179 m_enc = wxFONTENCODING_SYSTEM;
1180
1181 Init();
1182 }
1183
1184 wxMBConv_wxwin(wxFontEncoding enc)
1185 {
1186 m_enc = enc;
1187
1188 Init();
1189 }
1190
1191 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
1192 {
1193 size_t inbuf = strlen(psz);
1194 if (buf)
1195 m2w.Convert(psz,buf);
1196 return inbuf;
1197 }
1198
1199 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
1200 {
1201 const size_t inbuf = wxWcslen(psz);
1202 if (buf)
1203 w2m.Convert(psz,buf);
1204
1205 return inbuf;
1206 }
1207
1208 bool IsOk() const { return m_ok; }
1209
1210 public:
1211 wxFontEncoding m_enc;
1212 wxEncodingConverter m2w, w2m;
1213
1214 // were we initialized successfully?
1215 bool m_ok;
1216
1217 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
1218 };
1219
1220 #endif // wxUSE_FONTMAP
1221
1222 // ============================================================================
1223 // wxCSConv implementation
1224 // ============================================================================
1225
1226 void wxCSConv::Init()
1227 {
1228 m_name = NULL;
1229 m_convReal = NULL;
1230 m_deferred = true;
1231 }
1232
1233 // find a valid value for the encoding
1234 void wxCSConv::SetEncoding()
1235 {
1236 #if wxUSE_INTL
1237 m_encoding = wxLocale::GetSystemEncoding();
1238 #else
1239 m_encoding = wxFONTENCODING_SYSTEM;
1240 #endif
1241 }
1242
1243 wxCSConv::wxCSConv(const wxChar *charset)
1244 {
1245 Init();
1246
1247 if ( charset )
1248 {
1249 // not used
1250 m_encoding = wxFONTENCODING_SYSTEM;
1251
1252 SetName(charset);
1253 }
1254 else // no charset specified
1255 {
1256 SetEncoding();
1257 }
1258 }
1259
1260 wxCSConv::wxCSConv(wxFontEncoding encoding)
1261 {
1262 if ( encoding == wxFONTENCODING_MAX ||
1263 encoding == wxFONTENCODING_DEFAULT )
1264 {
1265 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1266
1267 encoding = wxFONTENCODING_SYSTEM;
1268 }
1269
1270 Init();
1271
1272 if ( encoding == wxFONTENCODING_SYSTEM )
1273 {
1274 SetEncoding();
1275 }
1276 else // have valid encoding, use it
1277 {
1278 m_encoding = encoding;
1279 }
1280 }
1281
1282 wxCSConv::~wxCSConv()
1283 {
1284 Clear();
1285 }
1286
1287 wxCSConv::wxCSConv(const wxCSConv& conv)
1288 : wxMBConv()
1289 {
1290 Init();
1291
1292 SetName(conv.m_name);
1293 m_encoding = conv.m_encoding;
1294 }
1295
1296 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
1297 {
1298 Clear();
1299
1300 SetName(conv.m_name);
1301 m_encoding = conv.m_encoding;
1302
1303 return *this;
1304 }
1305
1306 void wxCSConv::Clear()
1307 {
1308 free(m_name);
1309 delete m_convReal;
1310
1311 m_name = NULL;
1312 m_convReal = NULL;
1313 }
1314
1315 void wxCSConv::SetName(const wxChar *charset)
1316 {
1317 if (charset)
1318 {
1319 m_name = wxStrdup(charset);
1320 m_deferred = true;
1321 }
1322 }
1323
1324 static inline bool DoesntNeedConv(wxFontEncoding enc)
1325 {
1326 return enc == wxFONTENCODING_DEFAULT ||
1327 enc == wxFONTENCODING_SYSTEM ||
1328 enc == wxFONTENCODING_ISO8859_1;
1329 }
1330
1331 wxMBConv *wxCSConv::DoCreate() const
1332 {
1333 #if wxUSE_FONTMAP
1334 wxFontMapper * const fontMapper = wxFontMapper::Get();
1335
1336 wxFontEncoding encFromName = m_name ? fontMapper->CharsetToEncoding(m_name)
1337 : wxFONTENCODING_SYSTEM;
1338 #endif // wxUSE_FONTMAP
1339
1340 // check for the special case of ASCII charset
1341 if ( (!m_name && DoesntNeedConv(m_encoding))
1342 #if wxUSE_FONTMAP
1343 || (m_name && DoesntNeedConv(encFromName))
1344 #endif // wxUSE_FONTMAP
1345 )
1346 {
1347 // don't convert at all
1348 return NULL;
1349 }
1350
1351 // we trust OS to do conversion better than we can so try external
1352 // conversion methods first
1353 //
1354 // the full order is:
1355 // 1. OS conversion (iconv() under Unix or Win32 API)
1356 // 2. hard coded conversions for UTF
1357 // 3. wxEncodingConverter as fall back
1358
1359 // step (1)
1360 #ifdef HAVE_ICONV
1361 if ( m_name )
1362 {
1363 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
1364 if ( conv->IsOk() )
1365 return conv;
1366
1367 delete conv;
1368 }
1369 #endif // HAVE_ICONV
1370
1371 #ifdef wxHAVE_WIN32_MB2WC
1372 {
1373 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
1374 : new wxMBConv_win32(m_encoding);
1375 if ( conv->IsOk() )
1376 return conv;
1377
1378 delete conv;
1379 }
1380 #endif // wxHAVE_WIN32_MB2WC
1381
1382 // step (2)
1383 wxFontEncoding enc = m_encoding;
1384 #if wxUSE_FONTMAP
1385 if ( enc == wxFONTENCODING_SYSTEM )
1386 enc = encFromName;
1387 #endif // wxUSE_FONTMAP
1388
1389 switch ( enc )
1390 {
1391 case wxFONTENCODING_UTF7:
1392 return new wxMBConvUTF7;
1393
1394 case wxFONTENCODING_UTF8:
1395 return new wxMBConvUTF8;
1396
1397 case wxFONTENCODING_UTF16:
1398 return new wxMBConvUTF16;
1399
1400 case wxFONTENCODING_UTF16BE:
1401 return new wxMBConvUTF16BE;
1402
1403 case wxFONTENCODING_UTF16LE:
1404 return new wxMBConvUTF16LE;
1405
1406 case wxFONTENCODING_UTF32:
1407 return new wxMBConvUTF32;
1408
1409 case wxFONTENCODING_UTF32BE:
1410 return new wxMBConvUTF32BE;
1411
1412 case wxFONTENCODING_UTF32LE:
1413 return new wxMBConvUTF32LE;
1414
1415 default:
1416 // nothing to do but put here to suppress gcc warnings
1417 ;
1418 }
1419
1420 // step (3)
1421 #if wxUSE_FONTMAP
1422 {
1423 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
1424 : new wxMBConv_wxwin(m_encoding);
1425 if ( conv->IsOk() )
1426 return conv;
1427
1428 delete conv;
1429 }
1430 #endif // wxUSE_FONTMAP
1431
1432 wxLogError(_("Cannot convert from the charset '%s'!"),
1433 m_name ? m_name
1434 :
1435 #if wxUSE_FONTMAP
1436 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
1437 #else // !wxUSE_FONTMAP
1438 wxString::Format(_("encoding %s"), m_encoding).c_str()
1439 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1440 );
1441
1442 return NULL;
1443 }
1444
1445 void wxCSConv::CreateConvIfNeeded() const
1446 {
1447 if ( m_deferred )
1448 {
1449 wxCSConv *self = (wxCSConv *)this; // const_cast
1450 self->m_convReal = DoCreate();
1451 self->m_deferred = false;
1452 }
1453 }
1454
1455 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1456 {
1457 CreateConvIfNeeded();
1458
1459 if (m_convReal)
1460 return m_convReal->MB2WC(buf, psz, n);
1461
1462 // latin-1 (direct)
1463 size_t len = strlen(psz);
1464
1465 if (buf)
1466 {
1467 for (size_t c = 0; c <= len; c++)
1468 buf[c] = (unsigned char)(psz[c]);
1469 }
1470
1471 return len;
1472 }
1473
1474 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1475 {
1476 CreateConvIfNeeded();
1477
1478 if (m_convReal)
1479 return m_convReal->WC2MB(buf, psz, n);
1480
1481 // latin-1 (direct)
1482 const size_t len = wxWcslen(psz);
1483 if (buf)
1484 {
1485 for (size_t c = 0; c <= len; c++)
1486 buf[c] = (psz[c] > 0xff) ? '?' : psz[c];
1487 }
1488
1489 return len;
1490 }
1491
1492 // ----------------------------------------------------------------------------
1493 // globals
1494 // ----------------------------------------------------------------------------
1495
1496 #ifdef __WINDOWS__
1497 static wxMBConv_win32 wxConvLibcObj;
1498 #else
1499 static wxMBConvSystem wxConvLibcObj;
1500 #endif
1501
1502 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
1503 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
1504 static wxMBConvUTF7 wxConvUTF7Obj;
1505 static wxMBConvUTF8 wxConvUTF8Obj;
1506
1507
1508 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
1509 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
1510 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
1511 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
1512 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
1513 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
1514
1515 #else // !wxUSE_WCHAR_T
1516
1517 // stand-ins in absence of wchar_t
1518 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
1519 wxConvISO8859_1,
1520 wxConvLocal,
1521 wxConvUTF8;
1522
1523 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
1524
1525