]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
added UTF-16/32-[LB]E conversions; got rid of wxCharacterSet and simplified and fixed...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
5 // Modified by:
6 // Created: 29/01/98
7 // RCS-ID: $Id$
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
12
13 // ============================================================================
14 // declarations
15 // ============================================================================
16
17 // ----------------------------------------------------------------------------
18 // headers
19 // ----------------------------------------------------------------------------
20
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
23 #endif
24
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
27
28 #ifdef __BORLANDC__
29 #pragma hdrstop
30 #endif
31
32 #ifndef WX_PRECOMP
33 #include "wx/intl.h"
34 #include "wx/log.h"
35 #endif // WX_PRECOMP
36
37 #ifdef __WXMSW__
38 #include "wx/msw/private.h"
39 #endif
40
41 #ifndef __WXWINCE__
42 #include <errno.h>
43 #endif
44
45 #include <ctype.h>
46 #include <string.h>
47 #include <stdlib.h>
48
49 #include "wx/module.h"
50 #include "wx/strconv.h"
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 // ----------------------------------------------------------------------------
57 // globals
58 // ----------------------------------------------------------------------------
59
60 #if wxUSE_WCHAR_T
61 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc;
62 WXDLLIMPEXP_DATA_BASE(wxCSConv) wxConvLocal((const wxChar *)NULL);
63 WXDLLIMPEXP_DATA_BASE(wxCSConv) wxConvISO8859_1(_T("iso-8859-1"));
64 #else
65 // stand-ins in absence of wchar_t
66 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
67 wxConvFile,
68 wxConvISO8859_1,
69 wxConvLocal,
70 wxConvUTF8;
71 #endif // wxUSE_WCHAR_T
72
73 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibc;
74
75 class wxStrConvModule: public wxModule
76 {
77 public:
78 wxStrConvModule() : wxModule() { }
79 virtual bool OnInit() { return true; }
80 virtual void OnExit()
81 {
82 #if wxUSE_WCHAR_T
83 wxConvLocal.Clear();
84 wxConvISO8859_1.Clear();
85 #endif
86 }
87
88 DECLARE_DYNAMIC_CLASS(wxStrConvModule)
89 };
90
91 IMPLEMENT_DYNAMIC_CLASS(wxStrConvModule, wxModule)
92
93
94 // ----------------------------------------------------------------------------
95 // headers
96 // ----------------------------------------------------------------------------
97
98 #if wxUSE_WCHAR_T
99
100 #ifdef __SALFORDC__
101 #include <clib.h>
102 #endif
103
104 #ifdef HAVE_ICONV
105 #include <iconv.h>
106 #endif
107
108 #include "wx/encconv.h"
109 #include "wx/fontmap.h"
110
111 // ----------------------------------------------------------------------------
112 // macros
113 // ----------------------------------------------------------------------------
114
// Byte-swap, in place, the first `len` elements of the array `str`.
//
// Both macros expand to a braced block (and not to do/while) because the
// existing call sites invoke them without a trailing semicolon.  The arguments
// are parenthesised so that arbitrary expressions may be passed, and the
// counter is a size_t so that it can't wrap when compared with a size_t
// length.  Note that `len` is evaluated on every iteration.
#define BSWAP_UCS4(str, len) { size_t _c; for (_c=0; _c<(size_t)(len); _c++) (str)[_c]=wxUINT32_SWAP_ALWAYS((str)[_c]); }
#define BSWAP_UTF16(str, len) { size_t _c; for (_c=0; _c<(size_t)(len); _c++) (str)[_c]=wxUINT16_SWAP_ALWAYS((str)[_c]); }
117
// configure defines SIZEOF_WCHAR_T under Unix; elsewhere it may be missing,
// in which case we assume the most common value
#ifndef SIZEOF_WCHAR_T
    #define SIZEOF_WCHAR_T 2
#endif // !defined(SIZEOF_WCHAR_T)

// Select, depending on the width of wchar_t:
//  - WC_NAME:      the charset name without byte order information
//  - WC_NAME_BEST: the charset name with explicit (native) byte order
//  - WC_BSWAP:     the macro byte-swapping an array of wchar_t
//  - WC_UTF16:     defined only if wchar_t holds UTF-16 code units
#if SIZEOF_WCHAR_T == 2
    #define WC_NAME "UTF16"
    #define WC_BSWAP BSWAP_UTF16
    #define WC_UTF16
    #ifdef WORDS_BIGENDIAN
        #define WC_NAME_BEST "UTF-16BE"
    #else
        #define WC_NAME_BEST "UTF-16LE"
    #endif
#elif SIZEOF_WCHAR_T == 4
    #define WC_NAME "UCS4"
    #define WC_BSWAP BSWAP_UCS4
    #ifdef WORDS_BIGENDIAN
        #define WC_NAME_BEST "UCS-4BE"
    #else
        #define WC_NAME_BEST "UCS-4LE"
    #endif
#else // sizeof(wchar_t) is neither 2 nor 4
    // I don't know what to do about this
    #error "Weird sizeof(wchar_t): please report your platform details to wx-users mailing list"
#endif
145
146 // ============================================================================
147 // implementation
148 // ============================================================================
149
150 // ----------------------------------------------------------------------------
151 // UTF-16 en/decoding to/from UCS-4
152 // ----------------------------------------------------------------------------
153
154
155 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
156 {
157 if (input<=0xffff)
158 {
159 if (output) *output++ = (wxUint16) input;
160 return 1;
161 }
162 else if (input>=0x110000)
163 {
164 return (size_t)-1;
165 }
166 else
167 {
168 if (output)
169 {
170 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
171 *output++ = (wxUint16) ((input&0x3ff)+0xdc00);
172 }
173 return 2;
174 }
175 }
176
177 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
178 {
179 if ((*input<0xd800) || (*input>0xdfff))
180 {
181 output = *input;
182 return 1;
183 }
184 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
185 {
186 output = *input;
187 return (size_t)-1;
188 }
189 else
190 {
191 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
192 return 2;
193 }
194 }
195
196
197 // ----------------------------------------------------------------------------
198 // wxMBConv
199 // ----------------------------------------------------------------------------
200
201 #define IGNORE_LIBC 0
202
203 wxMBConv::~wxMBConv()
204 {
205 // nothing to do here
206 }
207
208 size_t wxMBConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
209 {
210 #if IGNORE_LIBC
211 if (buf)
212 {
213 for (size_t i = 0; i < strlen( psz )+1; i++)
214 buf[i] = (wchar_t) psz[i];
215 return strlen( psz );
216 }
217 else
218 {
219 return strlen( psz );
220 }
221 #else
222 return wxMB2WC(buf, psz, n);
223 #endif
224 }
225
226 size_t wxMBConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
227 {
228 #if IGNORE_LIBC
229 if (buf)
230 {
231 for (size_t i = 0; i < wxStrlen( psz )+1; i++)
232 buf[i] = (char) psz[i];
233 return wxStrlen( psz );
234 }
235 else
236 {
237 return wxStrlen( psz );
238 }
239 #else
240 return wxWC2MB(buf, psz, n);
241 #endif
242 }
243
244 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
245 {
246 if ( psz )
247 {
248 // calculate the length of the buffer needed first
249 size_t nLen = MB2WC(NULL, psz, 0);
250 if ( nLen != (size_t)-1 )
251 {
252 // now do the actual conversion
253 wxWCharBuffer buf(nLen);
254 MB2WC(buf.data(), psz, nLen + 1); // with the trailing NUL
255
256 return buf;
257 }
258 }
259
260 wxWCharBuffer buf((wchar_t *)NULL);
261
262 return buf;
263 }
264
265 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
266 {
267 if ( pwz )
268 {
269 size_t nLen = WC2MB(NULL, pwz, 0);
270 if ( nLen != (size_t)-1 )
271 {
272 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
273 WC2MB(buf.data(), pwz, nLen + 4);
274
275 return buf;
276 }
277 }
278
279 wxCharBuffer buf((char *)NULL);
280
281 return buf;
282 }
283
284 // ----------------------------------------------------------------------------
285 // UTF-7
286 // ----------------------------------------------------------------------------
287
288 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7) wxConvUTF7;
289
290 #if 0
291 static char utf7_setD[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
292 "abcdefghijklmnopqrstuvwxyz"
293 "0123456789'(),-./:?";
294 static char utf7_setO[]="!\"#$%&*;<=>@[]^_`{|}";
295 static char utf7_setB[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
296 "abcdefghijklmnopqrstuvwxyz"
297 "0123456789+/";
298 #endif
299
300 // TODO: write actual implementations of UTF-7 here
301 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf),
302 const char * WXUNUSED(psz),
303 size_t WXUNUSED(n)) const
304 {
305 return 0;
306 }
307
308 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf),
309 const wchar_t * WXUNUSED(psz),
310 size_t WXUNUSED(n)) const
311 {
312 return 0;
313 }
314
315 // ----------------------------------------------------------------------------
316 // UTF-8
317 // ----------------------------------------------------------------------------
318
319 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8) wxConvUTF8;
320
321 static wxUint32 utf8_max[]=
322 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
323
324 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
325 {
326 size_t len = 0;
327
328 while (*psz && ((!buf) || (len < n)))
329 {
330 unsigned char cc = *psz++, fc = cc;
331 unsigned cnt;
332 for (cnt = 0; fc & 0x80; cnt++)
333 fc <<= 1;
334 if (!cnt)
335 {
336 // plain ASCII char
337 if (buf)
338 *buf++ = cc;
339 len++;
340 }
341 else
342 {
343 cnt--;
344 if (!cnt)
345 {
346 // invalid UTF-8 sequence
347 return (size_t)-1;
348 }
349 else
350 {
351 unsigned ocnt = cnt - 1;
352 wxUint32 res = cc & (0x3f >> cnt);
353 while (cnt--)
354 {
355 cc = *psz++;
356 if ((cc & 0xC0) != 0x80)
357 {
358 // invalid UTF-8 sequence
359 return (size_t)-1;
360 }
361 res = (res << 6) | (cc & 0x3f);
362 }
363 if (res <= utf8_max[ocnt])
364 {
365 // illegal UTF-8 encoding
366 return (size_t)-1;
367 }
368 #ifdef WC_UTF16
369 size_t pa = encode_utf16(res, buf);
370 if (pa == (size_t)-1)
371 return (size_t)-1;
372 if (buf)
373 buf += pa;
374 len += pa;
375 #else // !WC_UTF16
376 if (buf)
377 *buf++ = res;
378 len++;
379 #endif // WC_UTF16/!WC_UTF16
380 }
381 }
382 }
383 if (buf && (len < n))
384 *buf = 0;
385 return len;
386 }
387
388 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
389 {
390 size_t len = 0;
391
392 while (*psz && ((!buf) || (len < n)))
393 {
394 wxUint32 cc;
395 #ifdef WC_UTF16
396 size_t pa = decode_utf16(psz, cc);
397 psz += (pa == (size_t)-1) ? 1 : pa;
398 #else
399 cc=(*psz++) & 0x7fffffff;
400 #endif
401 unsigned cnt;
402 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
403 if (!cnt)
404 {
405 // plain ASCII char
406 if (buf)
407 *buf++ = (char) cc;
408 len++;
409 }
410
411 else
412 {
413 len += cnt + 1;
414 if (buf)
415 {
416 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
417 while (cnt--)
418 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
419 }
420 }
421 }
422
423 if (buf && (len<n)) *buf = 0;
424
425 return len;
426 }
427
428
429
430
431 // ----------------------------------------------------------------------------
432 // UTF-16
433 // ----------------------------------------------------------------------------
434
435 #ifdef WORDS_BIGENDIAN
436 #define wxMBConvUTF16straight wxMBConvUTF16BE
437 #define wxMBConvUTF16swap wxMBConvUTF16LE
438 #else
439 #define wxMBConvUTF16swap wxMBConvUTF16BE
440 #define wxMBConvUTF16straight wxMBConvUTF16LE
441 #endif
442
443
444 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF16LE) wxConvUTF16LE;
445 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF16BE) wxConvUTF16BE;
446
447
448
449
450
451 #ifdef WC_UTF16
452
453
454 // copy 16bit MB to 16bit String
455 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
456 {
457 size_t len=0;
458
459 while (*(wxUint16*)psz && (!buf || len < n))
460 {
461 if (buf)
462 *buf++ = *(wxUint16*)psz;
463 len++;
464
465 psz += sizeof(wxUint16);
466 }
467 if (buf && len<n) *buf=0;
468
469 return len;
470 }
471
472
473 // copy 16bit String to 16bit MB
474 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
475 {
476 size_t len=0;
477
478 while (*psz && (!buf || len < n))
479 {
480 if (buf)
481 {
482 *(wxUint16*)buf = *psz;
483 buf += sizeof(wxUint16);
484 }
485 len += sizeof(wxUint16);
486 psz++;
487 }
488 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
489
490 return len;
491 }
492
493
494 // swap 16bit MB to 16bit String
495 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
496 {
497 size_t len=0;
498
499 while (*(wxUint16*)psz && (!buf || len < n))
500 {
501 if (buf)
502 {
503 ((char *)buf)[0] = psz[1];
504 ((char *)buf)[1] = psz[0];
505 buf++;
506 }
507 len++;
508 psz += sizeof(wxUint16);
509 }
510 if (buf && len<n) *buf=0;
511
512 return len;
513 }
514
515
516 // swap 16bit MB to 16bit String
517 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
518 {
519 size_t len=0;
520
521 while (*psz && (!buf || len < n))
522 {
523 if (buf)
524 {
525 *buf++ = ((char*)psz)[1];
526 *buf++ = ((char*)psz)[0];
527 }
528 len += sizeof(wxUint16);
529 psz++;
530 }
531 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
532
533 return len;
534 }
535
536
537 #else // WC_UTF16
538
539
540 // copy 16bit MB to 32bit String
541 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
542 {
543 size_t len=0;
544
545 while (*(wxUint16*)psz && (!buf || len < n))
546 {
547 wxUint32 cc;
548 size_t pa=decode_utf16((wxUint16*)psz, cc);
549 if (pa == (size_t)-1)
550 return pa;
551
552 if (buf)
553 *buf++ = cc;
554 len++;
555 psz += pa * sizeof(wxUint16);
556 }
557 if (buf && len<n) *buf=0;
558
559 return len;
560 }
561
562
563 // copy 32bit String to 16bit MB
564 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
565 {
566 size_t len=0;
567
568 while (*psz && (!buf || len < n))
569 {
570 wxUint16 cc[2];
571 size_t pa=encode_utf16(*psz, cc);
572
573 if (pa == (size_t)-1)
574 return pa;
575
576 if (buf)
577 {
578 *(wxUint16*)buf = cc[0];
579 buf += sizeof(wxUint16);
580 if (pa > 1)
581 {
582 *(wxUint16*)buf = cc[1];
583 buf += sizeof(wxUint16);
584 }
585 }
586
587 len += pa*sizeof(wxUint16);
588 psz++;
589 }
590 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
591
592 return len;
593 }
594
595
596 // swap 16bit MB to 32bit String
597 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
598 {
599 size_t len=0;
600
601 while (*(wxUint16*)psz && (!buf || len < n))
602 {
603 wxUint32 cc;
604 char tmp[4];
605 tmp[0]=psz[1]; tmp[1]=psz[0];
606 tmp[2]=psz[3]; tmp[3]=psz[2];
607
608 size_t pa=decode_utf16((wxUint16*)tmp, cc);
609 if (pa == (size_t)-1)
610 return pa;
611
612 if (buf)
613 *buf++ = cc;
614
615 len++;
616 psz += pa * sizeof(wxUint16);
617 }
618 if (buf && len<n) *buf=0;
619
620 return len;
621 }
622
623
624 // swap 32bit String to 16bit MB
625 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
626 {
627 size_t len=0;
628
629 while (*psz && (!buf || len < n))
630 {
631 wxUint16 cc[2];
632 size_t pa=encode_utf16(*psz, cc);
633
634 if (pa == (size_t)-1)
635 return pa;
636
637 if (buf)
638 {
639 *buf++ = ((char*)cc)[1];
640 *buf++ = ((char*)cc)[0];
641 if (pa > 1)
642 {
643 *buf++ = ((char*)cc)[3];
644 *buf++ = ((char*)cc)[2];
645 }
646 }
647
648 len += pa*sizeof(wxUint16);
649 psz++;
650 }
651 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
652
653 return len;
654 }
655
656 #endif // WC_UTF16
657
658
659 // ----------------------------------------------------------------------------
660 // UTF-32
661 // ----------------------------------------------------------------------------
662
663 #ifdef WORDS_BIGENDIAN
664 #define wxMBConvUTF32straight wxMBConvUTF32BE
665 #define wxMBConvUTF32swap wxMBConvUTF32LE
666 #else
667 #define wxMBConvUTF32swap wxMBConvUTF32BE
668 #define wxMBConvUTF32straight wxMBConvUTF32LE
669 #endif
670
671
672 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
673 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
674
675
676 #ifdef WC_UTF16
677
678 // copy 32bit MB to 16bit String
679 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
680 {
681 size_t len=0;
682
683 while (*(wxUint32*)psz && (!buf || len < n))
684 {
685 wxUint16 cc[2];
686
687 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
688 if (pa == (size_t)-1)
689 return pa;
690
691 if (buf)
692 {
693 *buf++ = cc[0];
694 if (pa > 1)
695 *buf++ = cc[1];
696 }
697 len += pa;
698 psz += sizeof(wxUint32);
699 }
700 if (buf && len<n) *buf=0;
701
702 return len;
703 }
704
705
706 // copy 16bit String to 32bit MB
707 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
708 {
709 size_t len=0;
710
711 while (*psz && (!buf || len < n))
712 {
713 wxUint32 cc;
714
715 size_t pa=decode_utf16(psz, cc);
716 if (pa == (size_t)-1)
717 return pa;
718
719 if (buf)
720 {
721 *(wxUint32*)buf = cc;
722 buf += sizeof(wxUint32);
723 }
724 len += sizeof(wxUint32);
725 psz += pa;
726 }
727 if (buf && len<=n-sizeof(wxUint32)) *(wxUint32*)buf=0;
728
729 return len;
730 }
731
732
733
734 // swap 32bit MB to 16bit String
735 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
736 {
737 size_t len=0;
738
739 while (*(wxUint32*)psz && (!buf || len < n))
740 {
741 char tmp[4];
742 tmp[0] = psz[3]; tmp[1] = psz[2];
743 tmp[2] = psz[1]; tmp[3] = psz[0];
744
745
746 wxUint16 cc[2];
747
748 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
749 if (pa == (size_t)-1)
750 return pa;
751
752 if (buf)
753 {
754 *buf++ = cc[0];
755 if (pa > 1)
756 *buf++ = cc[1];
757 }
758 len += pa;
759 psz += sizeof(wxUint32);
760 }
761 if (buf && len<n) *buf=0;
762
763 return len;
764 }
765
766
767 // swap 16bit String to 32bit MB
768 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
769 {
770 size_t len=0;
771
772 while (*psz && (!buf || len < n))
773 {
774 char cc[4];
775
776 size_t pa=decode_utf16(psz, *(wxUint32*)cc);
777 if (pa == (size_t)-1)
778 return pa;
779
780 if (buf)
781 {
782 *buf++ = cc[3];
783 *buf++ = cc[2];
784 *buf++ = cc[1];
785 *buf++ = cc[0];
786 }
787 len += sizeof(wxUint32);
788 psz += pa;
789 }
790 if (buf && len<=n-sizeof(wxUint32)) *(wxUint32*)buf=0;
791
792 return len;
793 }
794
795 #else // WC_UTF16
796
797
798 // copy 32bit MB to 32bit String
799 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
800 {
801 size_t len=0;
802
803 while (*(wxUint32*)psz && (!buf || len < n))
804 {
805 if (buf)
806 *buf++ = *(wxUint32*)psz;
807 len++;
808 psz += sizeof(wxUint32);
809 }
810 if (buf && len<n) *buf=0;
811
812 return len;
813 }
814
815
816 // copy 32bit String to 32bit MB
817 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
818 {
819 size_t len=0;
820
821 while (*psz && (!buf || len < n))
822 {
823 if (buf)
824 {
825 *(wxUint32*)buf = *psz;
826 buf += sizeof(wxUint32);
827 }
828
829 len += sizeof(wxUint32);
830 psz++;
831 }
832
833 if (buf && len<=n-sizeof(wxUint32)) *(wxUint32*)buf=0;
834
835 return len;
836 }
837
838
839 // swap 32bit MB to 32bit String
840 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
841 {
842 size_t len=0;
843
844 while (*(wxUint32*)psz && (!buf || len < n))
845 {
846 if (buf)
847 {
848 ((char *)buf)[0] = psz[3];
849 ((char *)buf)[1] = psz[2];
850 ((char *)buf)[2] = psz[1];
851 ((char *)buf)[3] = psz[0];
852 buf++;
853 }
854 len++;
855 psz += sizeof(wxUint32);
856 }
857 if (buf && len<n) *buf=0;
858
859 return len;
860 }
861
862
863 // swap 32bit String to 32bit MB
864 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
865 {
866 size_t len=0;
867
868 while (*psz && (!buf || len < n))
869 {
870 if (buf)
871 {
872 *buf++ = ((char *)psz)[3];
873 *buf++ = ((char *)psz)[2];
874 *buf++ = ((char *)psz)[1];
875 *buf++ = ((char *)psz)[0];
876 }
877 len += sizeof(wxUint32);
878 psz++;
879 }
880 if (buf && len<=n-sizeof(wxUint32)) *(wxUint32*)buf=0;
881
882 return len;
883 }
884
885
886 #endif // WC_UTF16
887
888
889 // ============================================================================
890 // The classes doing conversion using the iconv_xxx() functions
891 // ============================================================================
892
893 #ifdef HAVE_ICONV
894
// VS: with glibc 2.1.3 iconv() conversion to/from UCS4 wrongly fails with
//     E2BIG when the output buffer is _exactly_ as big as needed. Unless
//     glibc has yet another bug, this is the only case in which iconv()
//     returns (size_t)-1 while leaving 0 bytes in the input buffer -- after a
//     _real_ error the number of input bytes left is non-zero. So for the
//     affected versions failure is detected using this alternative test.
//     [This bug does not appear in glibc 2.2.]
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast a pointer to the input pointer to the type iconv() is declared with
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
910
911 // ----------------------------------------------------------------------------
912 // wxMBConv_iconv: encapsulates an iconv character set
913 // ----------------------------------------------------------------------------
914
915 class wxMBConv_iconv : public wxMBConv
916 {
917 public:
918 wxMBConv_iconv(const wxChar *name);
919 virtual ~wxMBConv_iconv();
920
921 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n);
922 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n);
923
924 bool IsOk() const
925 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
926
927 protected:
928 // the iconv handlers used to translate from multibyte to wide char and in
929 // the other direction
930 iconv_t m2w,
931 w2m;
932
933 private:
934 // the name (for iconv_open()) of a wide char charset -- if none is
935 // available on this machine, it will remain NULL
936 static const char *ms_wcCharsetName;
937
938 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
939 // different endian-ness than the native one
940 static bool ms_wcNeedsSwap;
941 };
942
943 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
944 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
945
946 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
947 {
948 // Do it the hard way
949 char cname[100];
950 for (size_t i = 0; i < wxStrlen(name)+1; i++)
951 cname[i] = (char) name[i];
952
953 // check for charset that represents wchar_t:
954 if (ms_wcCharsetName == NULL)
955 {
956 ms_wcNeedsSwap = false;
957
958 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
959 ms_wcCharsetName = WC_NAME_BEST;
960 m2w = iconv_open(ms_wcCharsetName, cname);
961
962 if (m2w == (iconv_t)-1)
963 {
964 // try charset w/o bytesex info (e.g. "UCS4")
965 // and check for bytesex ourselves:
966 ms_wcCharsetName = WC_NAME;
967 m2w = iconv_open(ms_wcCharsetName, cname);
968
969 // last bet, try if it knows WCHAR_T pseudo-charset
970 if (m2w == (iconv_t)-1)
971 {
972 ms_wcCharsetName = "WCHAR_T";
973 m2w = iconv_open(ms_wcCharsetName, cname);
974 }
975
976 if (m2w != (iconv_t)-1)
977 {
978 char buf[2], *bufPtr;
979 wchar_t wbuf[2], *wbufPtr;
980 size_t insz, outsz;
981 size_t res;
982
983 buf[0] = 'A';
984 buf[1] = 0;
985 wbuf[0] = 0;
986 insz = 2;
987 outsz = SIZEOF_WCHAR_T * 2;
988 wbufPtr = wbuf;
989 bufPtr = buf;
990
991 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
992 (char**)&wbufPtr, &outsz);
993
994 if (ICONV_FAILED(res, insz))
995 {
996 ms_wcCharsetName = NULL;
997 wxLogLastError(wxT("iconv"));
998 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
999 }
1000 else
1001 {
1002 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1003 }
1004 }
1005 else
1006 {
1007 ms_wcCharsetName = NULL;
1008
1009 // VS: we must not output an error here, since wxWindows will safely
1010 // fall back to using wxEncodingConverter.
1011 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1012 //wxLogError(
1013 }
1014 }
1015 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1016 }
1017 else // we already have ms_wcCharsetName
1018 {
1019 m2w = iconv_open(ms_wcCharsetName, cname);
1020 }
1021
1022 // NB: don't ever pass NULL to iconv_open(), it may crash!
1023 if ( ms_wcCharsetName )
1024 {
1025 w2m = iconv_open( cname, ms_wcCharsetName);
1026 }
1027 else
1028 {
1029 w2m = (iconv_t)-1;
1030 }
1031 }
1032
1033 wxMBConv_iconv::~wxMBConv_iconv()
1034 {
1035 if ( m2w != (iconv_t)-1 )
1036 iconv_close(m2w);
1037 if ( w2m != (iconv_t)-1 )
1038 iconv_close(w2m);
1039 }
1040
1041 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n)
1042 {
1043 size_t inbuf = strlen(psz);
1044 size_t outbuf = n * SIZEOF_WCHAR_T;
1045 size_t res, cres;
1046 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1047 wchar_t *bufPtr = buf;
1048 const char *pszPtr = psz;
1049
1050 if (buf)
1051 {
1052 // have destination buffer, convert there
1053 cres = iconv(m2w,
1054 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1055 (char**)&bufPtr, &outbuf);
1056 res = n - (outbuf / SIZEOF_WCHAR_T);
1057
1058 if (ms_wcNeedsSwap)
1059 {
1060 // convert to native endianness
1061 WC_BSWAP(buf /* _not_ bufPtr */, res)
1062 }
1063
1064 // NB: iconv was given only strlen(psz) characters on input, and so
1065 // it couldn't convert the trailing zero. Let's do it ourselves
1066 // if there's some room left for it in the output buffer.
1067 if (res < n)
1068 buf[res] = 0;
1069 }
1070 else
1071 {
1072 // no destination buffer... convert using temp buffer
1073 // to calculate destination buffer requirement
1074 wchar_t tbuf[8];
1075 res = 0;
1076 do {
1077 bufPtr = tbuf;
1078 outbuf = 8*SIZEOF_WCHAR_T;
1079
1080 cres = iconv(m2w,
1081 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1082 (char**)&bufPtr, &outbuf );
1083
1084 res += 8-(outbuf/SIZEOF_WCHAR_T);
1085 } while ((cres==(size_t)-1) && (errno==E2BIG));
1086 }
1087
1088 if (ICONV_FAILED(cres, inbuf))
1089 {
1090 //VS: it is ok if iconv fails, hence trace only
1091 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1092 return (size_t)-1;
1093 }
1094
1095 return res;
1096 }
1097
1098 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n)
1099 {
1100 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1101 size_t outbuf = n;
1102 size_t res, cres;
1103
1104 wchar_t *tmpbuf = 0;
1105
1106 if (ms_wcNeedsSwap)
1107 {
1108 // need to copy to temp buffer to switch endianness
1109 // this absolutely doesn't rock!
1110 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1111 // could be in read-only memory, or be accessed in some other thread)
1112 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1113 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1114 WC_BSWAP(tmpbuf, inbuf)
1115 psz=tmpbuf;
1116 }
1117
1118 if (buf)
1119 {
1120 // have destination buffer, convert there
1121 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1122
1123 res = n-outbuf;
1124
1125 // NB: iconv was given only wcslen(psz) characters on input, and so
1126 // it couldn't convert the trailing zero. Let's do it ourselves
1127 // if there's some room left for it in the output buffer.
1128 if (res < n)
1129 buf[0] = 0;
1130 }
1131 else
1132 {
1133 // no destination buffer... convert using temp buffer
1134 // to calculate destination buffer requirement
1135 char tbuf[16];
1136 res = 0;
1137 do {
1138 buf = tbuf; outbuf = 16;
1139
1140 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1141
1142 res += 16 - outbuf;
1143 } while ((cres==(size_t)-1) && (errno==E2BIG));
1144 }
1145
1146 if (ms_wcNeedsSwap)
1147 {
1148 free(tmpbuf);
1149 }
1150
1151 if (ICONV_FAILED(cres, inbuf))
1152 {
1153 //VS: it is ok if iconv fails, hence trace only
1154 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1155 return (size_t)-1;
1156 }
1157
1158 return res;
1159 }
1160
1161 #endif // HAVE_ICONV
1162
1163
1164 // ============================================================================
1165 // Win32 conversion classes
1166 // ============================================================================
1167
1168 #ifdef wxHAVE_WIN32_MB2WC
1169
1170 // from utils.cpp
1171 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1172 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1173
1174 class wxMBConv_win32 : public wxMBConv
1175 {
1176 public:
1177 wxMBConv_win32(const wxChar* name)
1178 {
1179 m_CodePage = wxCharsetToCodepage(name);
1180 }
1181
1182 wxMBConv_win32(wxFontEncoding encoding)
1183 {
1184 m_CodePage = wxEncodingToCodepage(encoding);
1185 }
1186
1187 size_t MB2WC(wchar_t *buf, const char *psz, size_t n)
1188 {
1189 const size_t len = ::MultiByteToWideChar
1190 (
1191 m_CodePage, // code page
1192 0, // flags (none)
1193 psz, // input string
1194 -1, // its length (NUL-terminated)
1195 buf, // output string
1196 buf ? n : 0 // size of output buffer
1197 );
1198
1199 // note that it returns # of written chars for buf != NULL and *size*
1200 // of the needed buffer for buf == NULL
1201 return len ? (buf ? len : len - 1) : (size_t)-1;
1202 }
1203
1204 size_t WC2MB(char *buf, const wchar_t *psz, size_t n)
1205 {
1206 const size_t len = ::WideCharToMultiByte
1207 (
1208 m_CodePage, // code page
1209 0, // flags (none)
1210 psz, // input string
1211 -1, // it is (wide) NUL-terminated
1212 buf, // output buffer
1213 buf ? n : 0, // and its size
1214 NULL, // default "replacement" char
1215 NULL // [out] was it used?
1216 );
1217
1218 // see the comment above!
1219 return len ? (buf ? len : len - 1) : (size_t)-1;
1220 }
1221
1222 bool IsOk() const
1223 { return m_CodePage != -1; }
1224
1225 public:
1226 long m_CodePage;
1227 };
1228
1229 #endif // wxHAVE_WIN32_MB2WC
1230
1231
1232 // ============================================================================
1233 // wxEncodingConverter based conversion classes
1234 // ============================================================================
1235
1236 #if wxUSE_FONTMAP
1237
1238 class wxMBConv_wxwin : public wxMBConv
1239 {
1240 private:
1241 void Init()
1242 {
1243 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
1244 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
1245 }
1246
1247 public:
1248 // temporarily just use wxEncodingConverter stuff,
1249 // so that it works while a better implementation is built
1250 wxMBConv_wxwin(const wxChar* name)
1251 {
1252 if (name)
1253 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
1254 else
1255 m_enc = wxFONTENCODING_SYSTEM;
1256
1257 Init();
1258 }
1259
1260 wxMBConv_wxwin(wxFontEncoding enc)
1261 {
1262 m_enc = enc;
1263
1264 Init();
1265 }
1266
1267 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n))
1268 {
1269 size_t inbuf = strlen(psz);
1270 if (buf)
1271 m2w.Convert(psz,buf);
1272 return inbuf;
1273 }
1274
1275 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n))
1276 {
1277 const size_t inbuf = wxWcslen(psz);
1278 if (buf)
1279 w2m.Convert(psz,buf);
1280
1281 return inbuf;
1282 }
1283
1284 bool IsOk() const { return m_ok; }
1285
1286 public:
1287 wxFontEncoding m_enc;
1288 wxEncodingConverter m2w, w2m;
1289
1290 // were we initialized successfully?
1291 bool m_ok;
1292
1293 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
1294 };
1295
1296 #endif // wxUSE_FONTMAP
1297
1298 // ============================================================================
1299 // wxCSConv implementation
1300 // ============================================================================
1301
1302 void wxCSConv::Init()
1303 {
1304 m_name = NULL;
1305 m_convReal = NULL;
1306 m_deferred = true;
1307 }
1308
1309 // find a valid value for the encoding
1310 void wxCSConv::SetEncoding()
1311 {
1312 #if wxUSE_INTL
1313 m_encoding = wxLocale::GetSystemEncoding();
1314 #else
1315 m_encoding = wxFONTENCODING_SYSTEM;
1316 #endif
1317 }
1318
1319 wxCSConv::wxCSConv(const wxChar *charset)
1320 {
1321 Init();
1322
1323 if ( charset )
1324 {
1325 // not used
1326 m_encoding = wxFONTENCODING_SYSTEM;
1327
1328 SetName(charset);
1329 }
1330 else // no charset specified
1331 {
1332 SetEncoding();
1333 }
1334 }
1335
1336 wxCSConv::wxCSConv(wxFontEncoding encoding)
1337 {
1338 if ( encoding == wxFONTENCODING_MAX ||
1339 encoding == wxFONTENCODING_DEFAULT )
1340 {
1341 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1342
1343 encoding = wxFONTENCODING_SYSTEM;
1344 }
1345
1346 Init();
1347
1348 if ( encoding == wxFONTENCODING_SYSTEM )
1349 {
1350 SetEncoding();
1351 }
1352 else // have valid encoding, use it
1353 {
1354 m_encoding = encoding;
1355 }
1356 }
1357
1358 wxCSConv::~wxCSConv()
1359 {
1360 Clear();
1361 }
1362
1363 wxCSConv::wxCSConv(const wxCSConv& conv)
1364 : wxMBConv()
1365 {
1366 Init();
1367
1368 SetName(conv.m_name);
1369 m_encoding = conv.m_encoding;
1370 }
1371
1372 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
1373 {
1374 Clear();
1375
1376 SetName(conv.m_name);
1377 m_encoding = conv.m_encoding;
1378
1379 return *this;
1380 }
1381
1382 void wxCSConv::Clear()
1383 {
1384 free(m_name);
1385 delete m_convReal;
1386
1387 m_name = NULL;
1388 m_convReal = NULL;
1389 }
1390
1391 void wxCSConv::SetName(const wxChar *charset)
1392 {
1393 if (charset)
1394 {
1395 m_name = wxStrdup(charset);
1396 m_deferred = true;
1397 }
1398 }
1399
1400 static inline bool DoesntNeedConv(wxFontEncoding enc)
1401 {
1402 return enc == wxFONTENCODING_DEFAULT ||
1403 enc == wxFONTENCODING_SYSTEM ||
1404 enc == wxFONTENCODING_ISO8859_1;
1405 }
1406
1407 wxMBConv *wxCSConv::DoCreate() const
1408 {
1409 #if wxUSE_FONTMAP
1410 wxFontMapper * const fontMapper = wxFontMapper::Get();
1411
1412 wxFontEncoding encFromName = m_name ? fontMapper->CharsetToEncoding(m_name)
1413 : wxFONTENCODING_SYSTEM;
1414 #endif // wxUSE_FONTMAP
1415
1416 // check for the special case of ASCII charset
1417 if ( (!m_name && DoesntNeedConv(m_encoding))
1418 #if wxUSE_FONTMAP
1419 || (m_name && DoesntNeedConv(encFromName))
1420 #endif // wxUSE_FONTMAP
1421 )
1422 {
1423 // don't convert at all
1424 return NULL;
1425 }
1426
1427 // we trust OS to do conversion better than we can so try external
1428 // conversion methods first
1429 //
1430 // the full order is:
1431 // 1. OS conversion (iconv() under Unix or Win32 API)
1432 // 2. hard coded conversions for UTF
1433 // 3. wxEncodingConverter as fall back
1434
1435 // step (1)
1436 #ifdef HAVE_ICONV
1437 if ( m_name )
1438 {
1439 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
1440 if ( conv->IsOk() )
1441 return conv;
1442
1443 delete conv;
1444 }
1445 #endif // HAVE_ICONV
1446
1447 #ifdef wxHAVE_WIN32_MB2WC
1448 {
1449 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
1450 : new wxMBConv_win32(m_encoding);
1451 if ( conv->IsOk() )
1452 return conv;
1453
1454 delete conv;
1455 }
1456 #endif // wxHAVE_WIN32_MB2WC
1457
1458 // step (2)
1459 wxFontEncoding enc = m_encoding;
1460 #if wxUSE_FONTMAP
1461 if ( enc == wxFONTENCODING_SYSTEM )
1462 enc = encFromName;
1463 #endif // wxUSE_FONTMAP
1464
1465 switch ( enc )
1466 {
1467 case wxFONTENCODING_UTF7:
1468 return new wxMBConvUTF7;
1469
1470 case wxFONTENCODING_UTF8:
1471 return new wxMBConvUTF8;
1472
1473 case wxFONTENCODING_UTF16:
1474 return new wxMBConvUTF16;
1475
1476 case wxFONTENCODING_UTF16BE:
1477 return new wxMBConvUTF16BE;
1478
1479 case wxFONTENCODING_UTF16LE:
1480 return new wxMBConvUTF16LE;
1481
1482 case wxFONTENCODING_UTF32:
1483 return new wxMBConvUTF32;
1484
1485 case wxFONTENCODING_UTF32BE:
1486 return new wxMBConvUTF32BE;
1487
1488 case wxFONTENCODING_UTF32LE:
1489 return new wxMBConvUTF32LE;
1490
1491 default:
1492 // nothing to do but put here to suppress gcc warnings
1493 ;
1494 }
1495
1496 // step (3)
1497 #if wxUSE_FONTMAP
1498 {
1499 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
1500 : new wxMBConv_wxwin(m_encoding);
1501 if ( conv->IsOk() )
1502 return conv;
1503
1504 delete conv;
1505 }
1506 #endif // wxUSE_FONTMAP
1507
1508 wxLogError(_("Cannot convert from the charset '%s'!"),
1509 m_name ? m_name
1510 :
1511 #if wxUSE_FONTMAP
1512 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
1513 #else // !wxUSE_FONTMAP
1514 wxString::Format(_("encoding %s"), m_encoding).c_str()
1515 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1516 );
1517
1518 return NULL;
1519 }
1520
1521 void wxCSConv::CreateConvIfNeeded() const
1522 {
1523 if ( m_deferred )
1524 {
1525 wxCSConv *self = (wxCSConv *)this; // const_cast
1526 self->m_convReal = DoCreate();
1527 self->m_deferred = false;
1528 }
1529 }
1530
1531 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1532 {
1533 CreateConvIfNeeded();
1534
1535 if (m_convReal)
1536 return m_convReal->MB2WC(buf, psz, n);
1537
1538 // latin-1 (direct)
1539 size_t len = strlen(psz);
1540
1541 if (buf)
1542 {
1543 for (size_t c = 0; c <= len; c++)
1544 buf[c] = (unsigned char)(psz[c]);
1545 }
1546
1547 return len;
1548 }
1549
1550 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1551 {
1552 CreateConvIfNeeded();
1553
1554 if (m_convReal)
1555 return m_convReal->WC2MB(buf, psz, n);
1556
1557 // latin-1 (direct)
1558 const size_t len = wxWcslen(psz);
1559 if (buf)
1560 {
1561 for (size_t c = 0; c <= len; c++)
1562 buf[c] = (psz[c] > 0xff) ? '?' : psz[c];
1563 }
1564
1565 return len;
1566 }
1567
1568 #endif // wxUSE_WCHAR_T
1569
1570