]> git.saurik.com Git - wxWidgets.git/blame_incremental - src/common/strconv.cpp
Applied [ 821234 ] Fix: erroneous assertion failed wxListBox::SetSelection
[wxWidgets.git] / src / common / strconv.cpp
... / ...
CommitLineData
1/////////////////////////////////////////////////////////////////////////////
2// Name: strconv.cpp
3// Purpose: Unicode conversion classes
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
5// Modified by:
6// Created: 29/01/98
7// RCS-ID: $Id$
8// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9// (c) 2000-2003 Vadim Zeitlin
10// Licence: wxWindows licence
11/////////////////////////////////////////////////////////////////////////////
12
13// ============================================================================
14// declarations
15// ============================================================================
16
17// ----------------------------------------------------------------------------
18// headers
19// ----------------------------------------------------------------------------
20
21#if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
23#endif
24
25// For compilers that support precompilation, includes "wx.h".
26#include "wx/wxprec.h"
27
28#ifdef __BORLANDC__
29 #pragma hdrstop
30#endif
31
32#ifndef WX_PRECOMP
33 #include "wx/intl.h"
34 #include "wx/log.h"
35#endif // WX_PRECOMP
36
37#include "wx/strconv.h"
38
39#if wxUSE_WCHAR_T
40
41#ifdef __WXMSW__
42 #include "wx/msw/private.h"
43#endif
44
45#ifndef __WXWINCE__
46#include <errno.h>
47#endif
48
49#include <ctype.h>
50#include <string.h>
51#include <stdlib.h>
52
53#if defined(__WIN32__) && !defined(__WXMICROWIN__)
54 #define wxHAVE_WIN32_MB2WC
55#endif // __WIN32__ but !__WXMICROWIN__
56
57// ----------------------------------------------------------------------------
58// headers
59// ----------------------------------------------------------------------------
60
61#ifdef __SALFORDC__
62 #include <clib.h>
63#endif
64
65#ifdef HAVE_ICONV
66 #include <iconv.h>
67#endif
68
69#include "wx/encconv.h"
70#include "wx/fontmap.h"
71
72// ----------------------------------------------------------------------------
73// macros
74// ----------------------------------------------------------------------------
75
76#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
77#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
78
79#if SIZEOF_WCHAR_T == 4
80 #define WC_NAME "UCS4"
81 #define WC_BSWAP BSWAP_UCS4
82 #ifdef WORDS_BIGENDIAN
83 #define WC_NAME_BEST "UCS-4BE"
84 #else
85 #define WC_NAME_BEST "UCS-4LE"
86 #endif
87#elif SIZEOF_WCHAR_T == 2
88 #define WC_NAME "UTF16"
89 #define WC_BSWAP BSWAP_UTF16
90 #define WC_UTF16
91 #ifdef WORDS_BIGENDIAN
92 #define WC_NAME_BEST "UTF-16BE"
93 #else
94 #define WC_NAME_BEST "UTF-16LE"
95 #endif
96#else // sizeof(wchar_t) != 2 nor 4
97 // does this ever happen?
98 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
99#endif
100
101// ============================================================================
102// implementation
103// ============================================================================
104
105// ----------------------------------------------------------------------------
106// UTF-16 en/decoding to/from UCS-4
107// ----------------------------------------------------------------------------
108
109
// Encode the code point `input` as UTF-16.
//
// `output` may be NULL, in which case only the required length is computed;
// otherwise it must have room for the returned number of 16-bit units.
//
// Returns 1 for values up to 0xFFFF, 2 (a surrogate pair) for values up to
// 0x10FFFF and (size_t)-1 for anything larger.
static size_t encode_utf16(wxUint32 input, wxUint16 *output)
{
    // values from 0x110000 upwards cannot be represented in UTF-16
    if (input >= 0x110000)
        return (size_t)-1;

    const bool needsPair = input > 0xffff;

    if (output)
    {
        if (needsPair)
        {
            // 0xd7c0 == 0xd800 - (0x10000 >> 10): folds the subtraction of
            // 0x10000 into the high surrogate base
            output[0] = (wxUint16) ((input >> 10) + 0xd7c0);
            output[1] = (wxUint16) ((input & 0x3ff) + 0xdc00);
        }
        else
        {
            output[0] = (wxUint16) input;
        }
    }

    return needsPair ? 2 : 1;
}
132
// Decode one code point from the UTF-16 sequence starting at `input`.
//
// On success `output` receives the code point and the number of 16-bit
// units consumed (1 or 2) is returned. If input[0] lies in the surrogate
// range but input[1] is not a low surrogate (0xDC00..0xDFFF), `output` is set
// to input[0] and (size_t)-1 is returned.
//
// input[1] is only read when input[0] is in the surrogate range.
static size_t decode_utf16(const wxUint16* input, wxUint32& output)
{
    if ((*input < 0xd800) || (*input > 0xdfff))
    {
        // ordinary BMP character
        output = *input;
        return 1;
    }
    else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
    {
        // NB: the upper bound is inclusive, 0xdfff is a valid low surrogate
        output = *input;
        return (size_t)-1;
    }
    else
    {
        // 0xd7c0 == 0xd800 - (0x10000 >> 10), see encode_utf16()
        output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
        return 2;
    }
}
151
152
153// ----------------------------------------------------------------------------
154// wxMBConv
155// ----------------------------------------------------------------------------
156
157wxMBConv::~wxMBConv()
158{
159 // nothing to do here
160}
161
162const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
163{
164 if ( psz )
165 {
166 // calculate the length of the buffer needed first
167 size_t nLen = MB2WC(NULL, psz, 0);
168 if ( nLen != (size_t)-1 )
169 {
170 // now do the actual conversion
171 wxWCharBuffer buf(nLen);
172 MB2WC(buf.data(), psz, nLen + 1); // with the trailing NUL
173
174 return buf;
175 }
176 }
177
178 wxWCharBuffer buf((wchar_t *)NULL);
179
180 return buf;
181}
182
183const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
184{
185 if ( pwz )
186 {
187 size_t nLen = WC2MB(NULL, pwz, 0);
188 if ( nLen != (size_t)-1 )
189 {
190 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
191 WC2MB(buf.data(), pwz, nLen + 4);
192
193 return buf;
194 }
195 }
196
197 wxCharBuffer buf((char *)NULL);
198
199 return buf;
200}
201
202// ----------------------------------------------------------------------------
203// wxMBConvLibc
204// ----------------------------------------------------------------------------
205
206size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
207{
208 return wxMB2WC(buf, psz, n);
209}
210
211size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
212{
213 return wxWC2MB(buf, psz, n);
214}
215
216// ----------------------------------------------------------------------------
217// UTF-7
218// ----------------------------------------------------------------------------
219
220#if 0
221static char utf7_setD[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
222 "abcdefghijklmnopqrstuvwxyz"
223 "0123456789'(),-./:?";
224static char utf7_setO[]="!\"#$%&*;<=>@[]^_`{|}";
225static char utf7_setB[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
226 "abcdefghijklmnopqrstuvwxyz"
227 "0123456789+/";
228#endif
229
230// TODO: write actual implementations of UTF-7 here
231size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf),
232 const char * WXUNUSED(psz),
233 size_t WXUNUSED(n)) const
234{
235 return 0;
236}
237
238size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf),
239 const wchar_t * WXUNUSED(psz),
240 size_t WXUNUSED(n)) const
241{
242 return 0;
243}
244
245// ----------------------------------------------------------------------------
246// UTF-8
247// ----------------------------------------------------------------------------
248
249static wxUint32 utf8_max[]=
250 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
251
252size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
253{
254 size_t len = 0;
255
256 while (*psz && ((!buf) || (len < n)))
257 {
258 unsigned char cc = *psz++, fc = cc;
259 unsigned cnt;
260 for (cnt = 0; fc & 0x80; cnt++)
261 fc <<= 1;
262 if (!cnt)
263 {
264 // plain ASCII char
265 if (buf)
266 *buf++ = cc;
267 len++;
268 }
269 else
270 {
271 cnt--;
272 if (!cnt)
273 {
274 // invalid UTF-8 sequence
275 return (size_t)-1;
276 }
277 else
278 {
279 unsigned ocnt = cnt - 1;
280 wxUint32 res = cc & (0x3f >> cnt);
281 while (cnt--)
282 {
283 cc = *psz++;
284 if ((cc & 0xC0) != 0x80)
285 {
286 // invalid UTF-8 sequence
287 return (size_t)-1;
288 }
289 res = (res << 6) | (cc & 0x3f);
290 }
291 if (res <= utf8_max[ocnt])
292 {
293 // illegal UTF-8 encoding
294 return (size_t)-1;
295 }
296#ifdef WC_UTF16
297 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
298 size_t pa = encode_utf16(res, (wxUint16 *)buf);
299 if (pa == (size_t)-1)
300 return (size_t)-1;
301 if (buf)
302 buf += pa;
303 len += pa;
304#else // !WC_UTF16
305 if (buf)
306 *buf++ = res;
307 len++;
308#endif // WC_UTF16/!WC_UTF16
309 }
310 }
311 }
312 if (buf && (len < n))
313 *buf = 0;
314 return len;
315}
316
317size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
318{
319 size_t len = 0;
320
321 while (*psz && ((!buf) || (len < n)))
322 {
323 wxUint32 cc;
324#ifdef WC_UTF16
325 // cast is ok for WC_UTF16
326 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
327 psz += (pa == (size_t)-1) ? 1 : pa;
328#else
329 cc=(*psz++) & 0x7fffffff;
330#endif
331 unsigned cnt;
332 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
333 if (!cnt)
334 {
335 // plain ASCII char
336 if (buf)
337 *buf++ = (char) cc;
338 len++;
339 }
340
341 else
342 {
343 len += cnt + 1;
344 if (buf)
345 {
346 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
347 while (cnt--)
348 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
349 }
350 }
351 }
352
353 if (buf && (len<n)) *buf = 0;
354
355 return len;
356}
357
358
359
360
361// ----------------------------------------------------------------------------
362// UTF-16
363// ----------------------------------------------------------------------------
364
365#ifdef WORDS_BIGENDIAN
366 #define wxMBConvUTF16straight wxMBConvUTF16BE
367 #define wxMBConvUTF16swap wxMBConvUTF16LE
368#else
369 #define wxMBConvUTF16swap wxMBConvUTF16BE
370 #define wxMBConvUTF16straight wxMBConvUTF16LE
371#endif
372
373
374#ifdef WC_UTF16
375
376// copy 16bit MB to 16bit String
377size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
378{
379 size_t len=0;
380
381 while (*(wxUint16*)psz && (!buf || len < n))
382 {
383 if (buf)
384 *buf++ = *(wxUint16*)psz;
385 len++;
386
387 psz += sizeof(wxUint16);
388 }
389 if (buf && len<n) *buf=0;
390
391 return len;
392}
393
394
395// copy 16bit String to 16bit MB
396size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
397{
398 size_t len=0;
399
400 while (*psz && (!buf || len < n))
401 {
402 if (buf)
403 {
404 *(wxUint16*)buf = *psz;
405 buf += sizeof(wxUint16);
406 }
407 len += sizeof(wxUint16);
408 psz++;
409 }
410 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
411
412 return len;
413}
414
415
416// swap 16bit MB to 16bit String
417size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
418{
419 size_t len=0;
420
421 while (*(wxUint16*)psz && (!buf || len < n))
422 {
423 if (buf)
424 {
425 ((char *)buf)[0] = psz[1];
426 ((char *)buf)[1] = psz[0];
427 buf++;
428 }
429 len++;
430 psz += sizeof(wxUint16);
431 }
432 if (buf && len<n) *buf=0;
433
434 return len;
435}
436
437
// swap 16bit String to 16bit MB
439size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
440{
441 size_t len=0;
442
443 while (*psz && (!buf || len < n))
444 {
445 if (buf)
446 {
447 *buf++ = ((char*)psz)[1];
448 *buf++ = ((char*)psz)[0];
449 }
450 len += sizeof(wxUint16);
451 psz++;
452 }
453 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
454
455 return len;
456}
457
458
459#else // WC_UTF16
460
461
462// copy 16bit MB to 32bit String
463size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
464{
465 size_t len=0;
466
467 while (*(wxUint16*)psz && (!buf || len < n))
468 {
469 wxUint32 cc;
470 size_t pa=decode_utf16((wxUint16*)psz, cc);
471 if (pa == (size_t)-1)
472 return pa;
473
474 if (buf)
475 *buf++ = cc;
476 len++;
477 psz += pa * sizeof(wxUint16);
478 }
479 if (buf && len<n) *buf=0;
480
481 return len;
482}
483
484
485// copy 32bit String to 16bit MB
486size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
487{
488 size_t len=0;
489
490 while (*psz && (!buf || len < n))
491 {
492 wxUint16 cc[2];
493 size_t pa=encode_utf16(*psz, cc);
494
495 if (pa == (size_t)-1)
496 return pa;
497
498 if (buf)
499 {
500 *(wxUint16*)buf = cc[0];
501 buf += sizeof(wxUint16);
502 if (pa > 1)
503 {
504 *(wxUint16*)buf = cc[1];
505 buf += sizeof(wxUint16);
506 }
507 }
508
509 len += pa*sizeof(wxUint16);
510 psz++;
511 }
512 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
513
514 return len;
515}
516
517
518// swap 16bit MB to 32bit String
519size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
520{
521 size_t len=0;
522
523 while (*(wxUint16*)psz && (!buf || len < n))
524 {
525 wxUint32 cc;
526 char tmp[4];
527 tmp[0]=psz[1]; tmp[1]=psz[0];
528 tmp[2]=psz[3]; tmp[3]=psz[2];
529
530 size_t pa=decode_utf16((wxUint16*)tmp, cc);
531 if (pa == (size_t)-1)
532 return pa;
533
534 if (buf)
535 *buf++ = cc;
536
537 len++;
538 psz += pa * sizeof(wxUint16);
539 }
540 if (buf && len<n) *buf=0;
541
542 return len;
543}
544
545
546// swap 32bit String to 16bit MB
547size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
548{
549 size_t len=0;
550
551 while (*psz && (!buf || len < n))
552 {
553 wxUint16 cc[2];
554 size_t pa=encode_utf16(*psz, cc);
555
556 if (pa == (size_t)-1)
557 return pa;
558
559 if (buf)
560 {
561 *buf++ = ((char*)cc)[1];
562 *buf++ = ((char*)cc)[0];
563 if (pa > 1)
564 {
565 *buf++ = ((char*)cc)[3];
566 *buf++ = ((char*)cc)[2];
567 }
568 }
569
570 len += pa*sizeof(wxUint16);
571 psz++;
572 }
573 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
574
575 return len;
576}
577
578#endif // WC_UTF16
579
580
581// ----------------------------------------------------------------------------
582// UTF-32
583// ----------------------------------------------------------------------------
584
585#ifdef WORDS_BIGENDIAN
586#define wxMBConvUTF32straight wxMBConvUTF32BE
587#define wxMBConvUTF32swap wxMBConvUTF32LE
588#else
589#define wxMBConvUTF32swap wxMBConvUTF32BE
590#define wxMBConvUTF32straight wxMBConvUTF32LE
591#endif
592
593
594WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
595WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
596
597
598#ifdef WC_UTF16
599
600// copy 32bit MB to 16bit String
601size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
602{
603 size_t len=0;
604
605 while (*(wxUint32*)psz && (!buf || len < n))
606 {
607 wxUint16 cc[2];
608
609 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
610 if (pa == (size_t)-1)
611 return pa;
612
613 if (buf)
614 {
615 *buf++ = cc[0];
616 if (pa > 1)
617 *buf++ = cc[1];
618 }
619 len += pa;
620 psz += sizeof(wxUint32);
621 }
622 if (buf && len<n) *buf=0;
623
624 return len;
625}
626
627
628// copy 16bit String to 32bit MB
629size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
630{
631 size_t len=0;
632
633 while (*psz && (!buf || len < n))
634 {
635 wxUint32 cc;
636
637 // cast is ok for WC_UTF16
638 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
639 if (pa == (size_t)-1)
640 return pa;
641
642 if (buf)
643 {
644 *(wxUint32*)buf = cc;
645 buf += sizeof(wxUint32);
646 }
647 len += sizeof(wxUint32);
648 psz += pa;
649 }
650
651 if (buf && len<=n-sizeof(wxUint32))
652 *(wxUint32*)buf=0;
653
654 return len;
655}
656
657
658
659// swap 32bit MB to 16bit String
660size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
661{
662 size_t len=0;
663
664 while (*(wxUint32*)psz && (!buf || len < n))
665 {
666 char tmp[4];
667 tmp[0] = psz[3]; tmp[1] = psz[2];
668 tmp[2] = psz[1]; tmp[3] = psz[0];
669
670
671 wxUint16 cc[2];
672
673 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
674 if (pa == (size_t)-1)
675 return pa;
676
677 if (buf)
678 {
679 *buf++ = cc[0];
680 if (pa > 1)
681 *buf++ = cc[1];
682 }
683 len += pa;
684 psz += sizeof(wxUint32);
685 }
686
687 if (buf && len<n)
688 *buf=0;
689
690 return len;
691}
692
693
694// swap 16bit String to 32bit MB
695size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
696{
697 size_t len=0;
698
699 while (*psz && (!buf || len < n))
700 {
701 char cc[4];
702
703 // cast is ok for WC_UTF16
704 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
705 if (pa == (size_t)-1)
706 return pa;
707
708 if (buf)
709 {
710 *buf++ = cc[3];
711 *buf++ = cc[2];
712 *buf++ = cc[1];
713 *buf++ = cc[0];
714 }
715 len += sizeof(wxUint32);
716 psz += pa;
717 }
718
719 if (buf && len<=n-sizeof(wxUint32))
720 *(wxUint32*)buf=0;
721
722 return len;
723}
724
725#else // WC_UTF16
726
727
728// copy 32bit MB to 32bit String
729size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
730{
731 size_t len=0;
732
733 while (*(wxUint32*)psz && (!buf || len < n))
734 {
735 if (buf)
736 *buf++ = *(wxUint32*)psz;
737 len++;
738 psz += sizeof(wxUint32);
739 }
740
741 if (buf && len<n)
742 *buf=0;
743
744 return len;
745}
746
747
748// copy 32bit String to 32bit MB
749size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
750{
751 size_t len=0;
752
753 while (*psz && (!buf || len < n))
754 {
755 if (buf)
756 {
757 *(wxUint32*)buf = *psz;
758 buf += sizeof(wxUint32);
759 }
760
761 len += sizeof(wxUint32);
762 psz++;
763 }
764
765 if (buf && len<=n-sizeof(wxUint32))
766 *(wxUint32*)buf=0;
767
768 return len;
769}
770
771
772// swap 32bit MB to 32bit String
773size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
774{
775 size_t len=0;
776
777 while (*(wxUint32*)psz && (!buf || len < n))
778 {
779 if (buf)
780 {
781 ((char *)buf)[0] = psz[3];
782 ((char *)buf)[1] = psz[2];
783 ((char *)buf)[2] = psz[1];
784 ((char *)buf)[3] = psz[0];
785 buf++;
786 }
787 len++;
788 psz += sizeof(wxUint32);
789 }
790
791 if (buf && len<n)
792 *buf=0;
793
794 return len;
795}
796
797
798// swap 32bit String to 32bit MB
799size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
800{
801 size_t len=0;
802
803 while (*psz && (!buf || len < n))
804 {
805 if (buf)
806 {
807 *buf++ = ((char *)psz)[3];
808 *buf++ = ((char *)psz)[2];
809 *buf++ = ((char *)psz)[1];
810 *buf++ = ((char *)psz)[0];
811 }
812 len += sizeof(wxUint32);
813 psz++;
814 }
815
816 if (buf && len<=n-sizeof(wxUint32))
817 *(wxUint32*)buf=0;
818
819 return len;
820}
821
822
823#endif // WC_UTF16
824
825
826// ============================================================================
827// The classes doing conversion using the iconv_xxx() functions
828// ============================================================================
829
830#ifdef HAVE_ICONV
831
832// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
833// if output buffer is _exactly_ as big as needed. Such case is (unless there's
834// yet another bug in glibc) the only case when iconv() returns with (size_t)-1
835// (which means error) and says there are 0 bytes left in the input buffer --
836// when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
837// this alternative test for iconv() failure.
838// [This bug does not appear in glibc 2.2.]
839#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
840#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
841 (errno != E2BIG || bufLeft != 0))
842#else
843#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
844#endif
845
846#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
847
848// ----------------------------------------------------------------------------
849// wxMBConv_iconv: encapsulates an iconv character set
850// ----------------------------------------------------------------------------
851
852class wxMBConv_iconv : public wxMBConv
853{
854public:
855 wxMBConv_iconv(const wxChar *name);
856 virtual ~wxMBConv_iconv();
857
858 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
859 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
860
861 bool IsOk() const
862 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
863
864protected:
865 // the iconv handlers used to translate from multibyte to wide char and in
866 // the other direction
867 iconv_t m2w,
868 w2m;
869
870private:
871 // the name (for iconv_open()) of a wide char charset -- if none is
872 // available on this machine, it will remain NULL
873 static const char *ms_wcCharsetName;
874
875 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
876 // different endian-ness than the native one
877 static bool ms_wcNeedsSwap;
878};
879
880const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
881bool wxMBConv_iconv::ms_wcNeedsSwap = false;
882
883wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
884{
885 // Do it the hard way
886 char cname[100];
887 for (size_t i = 0; i < wxStrlen(name)+1; i++)
888 cname[i] = (char) name[i];
889
890 // check for charset that represents wchar_t:
891 if (ms_wcCharsetName == NULL)
892 {
893 ms_wcNeedsSwap = false;
894
895 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
896 ms_wcCharsetName = WC_NAME_BEST;
897 m2w = iconv_open(ms_wcCharsetName, cname);
898
899 if (m2w == (iconv_t)-1)
900 {
901 // try charset w/o bytesex info (e.g. "UCS4")
902 // and check for bytesex ourselves:
903 ms_wcCharsetName = WC_NAME;
904 m2w = iconv_open(ms_wcCharsetName, cname);
905
906 // last bet, try if it knows WCHAR_T pseudo-charset
907 if (m2w == (iconv_t)-1)
908 {
909 ms_wcCharsetName = "WCHAR_T";
910 m2w = iconv_open(ms_wcCharsetName, cname);
911 }
912
913 if (m2w != (iconv_t)-1)
914 {
915 char buf[2], *bufPtr;
916 wchar_t wbuf[2], *wbufPtr;
917 size_t insz, outsz;
918 size_t res;
919
920 buf[0] = 'A';
921 buf[1] = 0;
922 wbuf[0] = 0;
923 insz = 2;
924 outsz = SIZEOF_WCHAR_T * 2;
925 wbufPtr = wbuf;
926 bufPtr = buf;
927
928 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
929 (char**)&wbufPtr, &outsz);
930
931 if (ICONV_FAILED(res, insz))
932 {
933 ms_wcCharsetName = NULL;
934 wxLogLastError(wxT("iconv"));
935 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
936 }
937 else
938 {
939 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
940 }
941 }
942 else
943 {
944 ms_wcCharsetName = NULL;
945
946 // VS: we must not output an error here, since wxWindows will safely
947 // fall back to using wxEncodingConverter.
948 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
949 //wxLogError(
950 }
951 }
952 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
953 }
954 else // we already have ms_wcCharsetName
955 {
956 m2w = iconv_open(ms_wcCharsetName, cname);
957 }
958
959 // NB: don't ever pass NULL to iconv_open(), it may crash!
960 if ( ms_wcCharsetName )
961 {
962 w2m = iconv_open( cname, ms_wcCharsetName);
963 }
964 else
965 {
966 w2m = (iconv_t)-1;
967 }
968}
969
970wxMBConv_iconv::~wxMBConv_iconv()
971{
972 if ( m2w != (iconv_t)-1 )
973 iconv_close(m2w);
974 if ( w2m != (iconv_t)-1 )
975 iconv_close(w2m);
976}
977
978size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
979{
980 size_t inbuf = strlen(psz);
981 size_t outbuf = n * SIZEOF_WCHAR_T;
982 size_t res, cres;
983 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
984 wchar_t *bufPtr = buf;
985 const char *pszPtr = psz;
986
987 if (buf)
988 {
989 // have destination buffer, convert there
990 cres = iconv(m2w,
991 ICONV_CHAR_CAST(&pszPtr), &inbuf,
992 (char**)&bufPtr, &outbuf);
993 res = n - (outbuf / SIZEOF_WCHAR_T);
994
995 if (ms_wcNeedsSwap)
996 {
997 // convert to native endianness
998 WC_BSWAP(buf /* _not_ bufPtr */, res)
999 }
1000
1001 // NB: iconv was given only strlen(psz) characters on input, and so
1002 // it couldn't convert the trailing zero. Let's do it ourselves
1003 // if there's some room left for it in the output buffer.
1004 if (res < n)
1005 buf[res] = 0;
1006 }
1007 else
1008 {
1009 // no destination buffer... convert using temp buffer
1010 // to calculate destination buffer requirement
1011 wchar_t tbuf[8];
1012 res = 0;
1013 do {
1014 bufPtr = tbuf;
1015 outbuf = 8*SIZEOF_WCHAR_T;
1016
1017 cres = iconv(m2w,
1018 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1019 (char**)&bufPtr, &outbuf );
1020
1021 res += 8-(outbuf/SIZEOF_WCHAR_T);
1022 } while ((cres==(size_t)-1) && (errno==E2BIG));
1023 }
1024
1025 if (ICONV_FAILED(cres, inbuf))
1026 {
1027 //VS: it is ok if iconv fails, hence trace only
1028 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1029 return (size_t)-1;
1030 }
1031
1032 return res;
1033}
1034
1035size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1036{
1037 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1038 size_t outbuf = n;
1039 size_t res, cres;
1040
1041 wchar_t *tmpbuf = 0;
1042
1043 if (ms_wcNeedsSwap)
1044 {
1045 // need to copy to temp buffer to switch endianness
1046 // this absolutely doesn't rock!
1047 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1048 // could be in read-only memory, or be accessed in some other thread)
1049 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1050 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1051 WC_BSWAP(tmpbuf, inbuf)
1052 psz=tmpbuf;
1053 }
1054
1055 if (buf)
1056 {
1057 // have destination buffer, convert there
1058 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1059
1060 res = n-outbuf;
1061
1062 // NB: iconv was given only wcslen(psz) characters on input, and so
1063 // it couldn't convert the trailing zero. Let's do it ourselves
1064 // if there's some room left for it in the output buffer.
1065 if (res < n)
1066 buf[0] = 0;
1067 }
1068 else
1069 {
1070 // no destination buffer... convert using temp buffer
1071 // to calculate destination buffer requirement
1072 char tbuf[16];
1073 res = 0;
1074 do {
1075 buf = tbuf; outbuf = 16;
1076
1077 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1078
1079 res += 16 - outbuf;
1080 } while ((cres==(size_t)-1) && (errno==E2BIG));
1081 }
1082
1083 if (ms_wcNeedsSwap)
1084 {
1085 free(tmpbuf);
1086 }
1087
1088 if (ICONV_FAILED(cres, inbuf))
1089 {
1090 //VS: it is ok if iconv fails, hence trace only
1091 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1092 return (size_t)-1;
1093 }
1094
1095 return res;
1096}
1097
1098#endif // HAVE_ICONV
1099
1100
1101// ============================================================================
1102// Win32 conversion classes
1103// ============================================================================
1104
1105#ifdef wxHAVE_WIN32_MB2WC
1106
1107// from utils.cpp
1108extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1109extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1110
1111class wxMBConv_win32 : public wxMBConv
1112{
1113public:
1114 wxMBConv_win32()
1115 {
1116 m_CodePage = CP_ACP;
1117 }
1118
1119 wxMBConv_win32(const wxChar* name)
1120 {
1121 m_CodePage = wxCharsetToCodepage(name);
1122 }
1123
1124 wxMBConv_win32(wxFontEncoding encoding)
1125 {
1126 m_CodePage = wxEncodingToCodepage(encoding);
1127 }
1128
1129 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1130 {
1131 const size_t len = ::MultiByteToWideChar
1132 (
1133 m_CodePage, // code page
1134 0, // flags (none)
1135 psz, // input string
1136 -1, // its length (NUL-terminated)
1137 buf, // output string
1138 buf ? n : 0 // size of output buffer
1139 );
1140
1141 // note that it returns count of written chars for buf != NULL and size
1142 // of the needed buffer for buf == NULL so in either case the length of
1143 // the string (which never includes the terminating NUL) is one less
1144 return len ? len - 1 : (size_t)-1;
1145 }
1146
1147 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
1148 {
1149 const size_t len = ::WideCharToMultiByte
1150 (
1151 m_CodePage, // code page
1152 0, // flags (none)
1153 psz, // input string
1154 -1, // it is (wide) NUL-terminated
1155 buf, // output buffer
1156 buf ? n : 0, // and its size
1157 NULL, // default "replacement" char
1158 NULL // [out] was it used?
1159 );
1160
1161 // see the comment above for the reason of "len - 1"
1162 return len ? len - 1 : (size_t)-1;
1163 }
1164
1165 bool IsOk() const
1166 { return m_CodePage != -1; }
1167
1168public:
1169 long m_CodePage;
1170};
1171
1172#endif // wxHAVE_WIN32_MB2WC
1173
1174
1175// ============================================================================
1176// wxEncodingConverter based conversion classes
1177// ============================================================================
1178
1179#if wxUSE_FONTMAP
1180
1181class wxMBConv_wxwin : public wxMBConv
1182{
1183private:
1184 void Init()
1185 {
1186 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
1187 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
1188 }
1189
1190public:
1191 // temporarily just use wxEncodingConverter stuff,
1192 // so that it works while a better implementation is built
1193 wxMBConv_wxwin(const wxChar* name)
1194 {
1195 if (name)
1196 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
1197 else
1198 m_enc = wxFONTENCODING_SYSTEM;
1199
1200 Init();
1201 }
1202
1203 wxMBConv_wxwin(wxFontEncoding enc)
1204 {
1205 m_enc = enc;
1206
1207 Init();
1208 }
1209
1210 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
1211 {
1212 size_t inbuf = strlen(psz);
1213 if (buf)
1214 m2w.Convert(psz,buf);
1215 return inbuf;
1216 }
1217
1218 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
1219 {
1220 const size_t inbuf = wxWcslen(psz);
1221 if (buf)
1222 w2m.Convert(psz,buf);
1223
1224 return inbuf;
1225 }
1226
1227 bool IsOk() const { return m_ok; }
1228
1229public:
1230 wxFontEncoding m_enc;
1231 wxEncodingConverter m2w, w2m;
1232
1233 // were we initialized successfully?
1234 bool m_ok;
1235
1236 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
1237};
1238
1239#endif // wxUSE_FONTMAP
1240
1241// ============================================================================
1242// wxCSConv implementation
1243// ============================================================================
1244
1245void wxCSConv::Init()
1246{
1247 m_name = NULL;
1248 m_convReal = NULL;
1249 m_deferred = true;
1250}
1251
1252wxCSConv::wxCSConv(const wxChar *charset)
1253{
1254 Init();
1255
1256 if ( charset )
1257 {
1258 SetName(charset);
1259 }
1260
1261 m_encoding = wxFONTENCODING_SYSTEM;
1262}
1263
1264wxCSConv::wxCSConv(wxFontEncoding encoding)
1265{
1266 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
1267 {
1268 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1269
1270 encoding = wxFONTENCODING_SYSTEM;
1271 }
1272
1273 Init();
1274
1275 m_encoding = encoding;
1276}
1277
1278wxCSConv::~wxCSConv()
1279{
1280 Clear();
1281}
1282
1283wxCSConv::wxCSConv(const wxCSConv& conv)
1284 : wxMBConv()
1285{
1286 Init();
1287
1288 SetName(conv.m_name);
1289 m_encoding = conv.m_encoding;
1290}
1291
1292wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
1293{
1294 Clear();
1295
1296 SetName(conv.m_name);
1297 m_encoding = conv.m_encoding;
1298
1299 return *this;
1300}
1301
1302void wxCSConv::Clear()
1303{
1304 free(m_name);
1305 delete m_convReal;
1306
1307 m_name = NULL;
1308 m_convReal = NULL;
1309}
1310
1311void wxCSConv::SetName(const wxChar *charset)
1312{
1313 if (charset)
1314 {
1315 m_name = wxStrdup(charset);
1316 m_deferred = true;
1317 }
1318}
1319
1320wxMBConv *wxCSConv::DoCreate() const
1321{
1322 // check for the special case of ASCII or ISO8859-1 charset: as we have
1323 // special knowledge of it anyhow, we don't need to create a special
1324 // conversion object
1325 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
1326 {
1327 // don't convert at all
1328 return NULL;
1329 }
1330
1331 // we trust OS to do conversion better than we can so try external
1332 // conversion methods first
1333 //
1334 // the full order is:
1335 // 1. OS conversion (iconv() under Unix or Win32 API)
1336 // 2. hard coded conversions for UTF
1337 // 3. wxEncodingConverter as fall back
1338
1339 // step (1)
1340#ifdef HAVE_ICONV
1341#if !wxUSE_FONTMAP
1342 if ( m_name )
1343#endif // !wxUSE_FONTMAP
1344 {
1345 wxString name(m_name);
1346
1347#if wxUSE_FONTMAP
1348 if ( name.empty() )
1349 name = wxFontMapper::Get()->GetEncodingName(m_encoding);
1350#endif // wxUSE_FONTMAP
1351
1352 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
1353 if ( conv->IsOk() )
1354 return conv;
1355
1356 delete conv;
1357 }
1358#endif // HAVE_ICONV
1359
1360#ifdef wxHAVE_WIN32_MB2WC
1361 {
1362 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
1363 : new wxMBConv_win32(m_encoding);
1364 if ( conv->IsOk() )
1365 return conv;
1366
1367 delete conv;
1368 }
1369#endif // wxHAVE_WIN32_MB2WC
1370
1371 // step (2)
1372 wxFontEncoding enc = m_encoding;
1373#if wxUSE_FONTMAP
1374 if ( enc == wxFONTENCODING_SYSTEM && m_name )
1375 {
1376 // use "false" to suppress interactive dialogs -- we can be called from
1377 // anywhere and popping up a dialog from here is the last thing we want to
1378 // do
1379 enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
1380 }
1381#endif // wxUSE_FONTMAP
1382
1383 switch ( enc )
1384 {
1385 case wxFONTENCODING_UTF7:
1386 return new wxMBConvUTF7;
1387
1388 case wxFONTENCODING_UTF8:
1389 return new wxMBConvUTF8;
1390
1391 case wxFONTENCODING_UTF16BE:
1392 return new wxMBConvUTF16BE;
1393
1394 case wxFONTENCODING_UTF16LE:
1395 return new wxMBConvUTF16LE;
1396
1397 case wxFONTENCODING_UTF32BE:
1398 return new wxMBConvUTF32BE;
1399
1400 case wxFONTENCODING_UTF32LE:
1401 return new wxMBConvUTF32LE;
1402
1403 default:
1404 // nothing to do but put here to suppress gcc warnings
1405 ;
1406 }
1407
1408 // step (3)
1409#if wxUSE_FONTMAP
1410 {
1411 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
1412 : new wxMBConv_wxwin(m_encoding);
1413 if ( conv->IsOk() )
1414 return conv;
1415
1416 delete conv;
1417 }
1418#endif // wxUSE_FONTMAP
1419
1420 // NB: This is a hack to prevent deadlock. What could otherwise happen
1421 // in Unicode build: wxConvLocal creation ends up being here
1422 // because of some failure and logs the error. But wxLog will try to
1423 // attach timestamp, for which it will need wxConvLocal (to convert
1424 // time to char* and then wchar_t*), but that fails, tries to log
1425 // error, but wxLog has a (already locked) critical section that
1426 // guards static buffer.
1427 static bool alreadyLoggingError = false;
1428 if (!alreadyLoggingError)
1429 {
1430 alreadyLoggingError = true;
1431 wxLogError(_("Cannot convert from the charset '%s'!"),
1432 m_name ? m_name
1433 :
1434#if wxUSE_FONTMAP
1435 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
1436#else // !wxUSE_FONTMAP
1437 wxString::Format(_("encoding %s"), m_encoding).c_str()
1438#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1439 );
1440 alreadyLoggingError = false;
1441 }
1442
1443 return NULL;
1444}
1445
1446void wxCSConv::CreateConvIfNeeded() const
1447{
1448 if ( m_deferred )
1449 {
1450 wxCSConv *self = (wxCSConv *)this; // const_cast
1451
1452#if wxUSE_INTL
1453 // if we don't have neither the name nor the encoding, use the default
1454 // encoding for this system
1455 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
1456 {
1457 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
1458 }
1459#endif // wxUSE_INTL
1460
1461 self->m_convReal = DoCreate();
1462 self->m_deferred = false;
1463 }
1464}
1465
1466size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1467{
1468 CreateConvIfNeeded();
1469
1470 if (m_convReal)
1471 return m_convReal->MB2WC(buf, psz, n);
1472
1473 // latin-1 (direct)
1474 size_t len = strlen(psz);
1475
1476 if (buf)
1477 {
1478 for (size_t c = 0; c <= len; c++)
1479 buf[c] = (unsigned char)(psz[c]);
1480 }
1481
1482 return len;
1483}
1484
1485size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1486{
1487 CreateConvIfNeeded();
1488
1489 if (m_convReal)
1490 return m_convReal->WC2MB(buf, psz, n);
1491
1492 // latin-1 (direct)
1493 const size_t len = wxWcslen(psz);
1494 if (buf)
1495 {
1496 for (size_t c = 0; c <= len; c++)
1497 {
1498 if (psz[c] > 0xFF)
1499 return (size_t)-1;
1500 buf[c] = psz[c];
1501 }
1502 }
1503 else
1504 {
1505 for (size_t c = 0; c <= len; c++)
1506 {
1507 if (psz[c] > 0xFF)
1508 return (size_t)-1;
1509 }
1510 }
1511
1512 return len;
1513}
1514
1515// ----------------------------------------------------------------------------
1516// globals
1517// ----------------------------------------------------------------------------
1518
1519#ifdef __WINDOWS__
1520 static wxMBConv_win32 wxConvLibcObj;
1521#else
1522 static wxMBConvLibc wxConvLibcObj;
1523#endif
1524
1525static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
1526static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
1527static wxMBConvUTF7 wxConvUTF7Obj;
1528static wxMBConvUTF8 wxConvUTF8Obj;
1529
1530
1531WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
1532WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
1533WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
1534WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
1535WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
1536WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
1537
1538#else // !wxUSE_WCHAR_T
1539
1540// stand-ins in absence of wchar_t
1541WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
1542 wxConvISO8859_1,
1543 wxConvLocal,
1544 wxConvUTF8;
1545
1546#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
1547
1548