]>
git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vadim Zeitlin
9 // Licence: wxWindows license
10 /////////////////////////////////////////////////////////////////////////////
12 // ============================================================================
14 // ============================================================================
16 // ----------------------------------------------------------------------------
18 // ----------------------------------------------------------------------------
21 #pragma implementation "strconv.h"
24 // For compilers that support precompilation, includes "wx.h".
25 #include "wx/wxprec.h"
32 #include "wx/msw/private.h"
53 #include "wx/strconv.h"
57 #if defined(WORDS_BIGENDIAN) || defined(__STDC_ISO_10646__)
58 #define BSWAP_UCS4(str, len)
59 #define BSWAP_UCS2(str, len)
61 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
62 #define BSWAP_UCS2(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
65 #define BSWAP_UTF32(str, len) BSWAP_UCS4(str, len)
66 #define BSWAP_UTF16(str, len) BSWAP_UCS2(str, len)
68 // under Unix SIZEOF_WCHAR_T is defined by configure, but under other platforms
69 // it might not be defined - assume the most common value
70 #ifndef SIZEOF_WCHAR_T
71 #define SIZEOF_WCHAR_T 2
72 #endif // !defined(SIZEOF_WCHAR_T)
74 #if SIZEOF_WCHAR_T == 4
75 #define WC_NAME "UCS4"
76 #define WC_BSWAP BSWAP_UCS4
77 #elif SIZEOF_WCHAR_T == 2
78 #define WC_NAME "UTF16"
79 #define WC_BSWAP BSWAP_UTF16
81 #else // sizeof(wchar_t) != 2 nor 4
82 // I don't know what to do about this
83 #error "Weird sizeof(wchar_t): please report your platform details to wx-users mailing list"
86 // ----------------------------------------------------------------------------
88 // ----------------------------------------------------------------------------
90 WXDLLEXPORT_DATA(wxMBConv
*) wxConvCurrent
= &wxConvLibc
;
92 // ============================================================================
94 // ============================================================================
// Encodes the code point `input` as UTF-16 code units stored through `output`.
//
// Visible behaviour:
//  - one branch stores `input` as a single unit, but only when `output` is
//    non-NULL (so a NULL `output` can be used to measure);
//  - inputs >= 0x110000 (above the Unicode range) take a separate branch;
//  - otherwise a surrogate pair is stored.
//
// NOTE(review): the condition selecting the single-unit branch, the return
// values, and whether the surrogate stores are guarded by `output` are not
// visible in this view - confirm against the complete source.
100 static size_t encode_utf16(wxUint32 input
, wchar_t *output
)
104 if (output
) *output
++ = input
;
107 else if (input
>=0x110000)
// High surrogate: (input >> 10) + 0xd7c0 equals ((input - 0x10000) >> 10) + 0xd800.
115 *output
++ = (input
>> 10)+0xd7c0;
// Low surrogate: the low 10 bits of the code point, offset by 0xdc00.
116 *output
++ = (input
&0x3ff)+0xdc00;
// Decodes one code point from the UTF-16 sequence at `input` into `output`.
//
// Visible logic: a unit outside 0xd800..0xdfff is handled as the
// non-surrogate case; otherwise input[1] is validated as the trailing unit
// and, if accepted, the pair is combined as
//   ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00).
// wxMBConvUTF8::WC2MB in this file treats a return of (size_t)-1 as failure
// and otherwise advances by the returned count.
//
// NOTE(review): `input[1]>=0xdfff` rejects 0xdfff itself, although the low
// surrogate range is 0xdc00..0xdfff inclusive and the first test uses
// `>0xdfff`; this looks like an off-by-one (probably meant `>0xdfff`).
// NOTE(review): a leading unit in 0xdc00..0xdfff (i.e. a low surrogate in
// the lead position) is not rejected by the tests visible here; with such a
// lead the combined value is >= 0x110000.
122 static size_t decode_utf16(const wchar_t* input
, wxUint32
& output
)
124 if ((*input
<0xd800) || (*input
>0xdfff))
129 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
136 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
143 // ----------------------------------------------------------------------------
145 // ----------------------------------------------------------------------------
147 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
;
// Base-class multibyte -> wide conversion: forwards buf, psz and n unchanged
// to wxMB2WC() and returns its result.
149 size_t wxMBConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
151 return wxMB2WC(buf
, psz
, n
);
// Base-class wide -> multibyte conversion: forwards buf, psz and n unchanged
// to wxWC2MB() and returns its result.
154 size_t wxMBConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
156 return wxWC2MB(buf
, psz
, n
);
// Converts the multibyte string psz into a newly created wxWCharBuffer.
//
// Two passes through the virtual MB2WC(): the first, with a NULL destination,
// obtains the required length; the second converts into a buffer constructed
// with that length. A first-pass result of (size_t)-1 yields a buffer built
// from a NULL pointer, as does the final fall-through return.
//
// NOTE(review): the second pass passes nLen as the limit; whether the result
// ends up NUL-terminated depends on what wxWCharBuffer(size_t) allocates and
// on the converter - confirm.
// NOTE(review): the condition guarding the conversion (presumably a NULL
// check on psz) and the success return are not visible in this view.
159 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
// First pass: NULL destination, so only the length is computed.
163 size_t nLen
= MB2WC((wchar_t *) NULL
, psz
, 0);
164 if (nLen
== (size_t)-1)
165 return wxWCharBuffer((wchar_t *) NULL
);
166 wxWCharBuffer
buf(nLen
);
// Second pass: the double cast goes through the buffer's const pointer
// conversion and then drops the const to obtain a writable pointer.
167 MB2WC((wchar_t *)(const wchar_t *) buf
, psz
, nLen
);
171 return wxWCharBuffer((wchar_t *) NULL
);
// Converts the wide string psz into a newly created wxCharBuffer.
//
// Two passes through the virtual WC2MB(): the first, with a NULL destination,
// obtains the required length; the second converts into a buffer constructed
// with that length. A first-pass result of (size_t)-1 yields a buffer built
// from a NULL pointer, as does the final fall-through return.
//
// NOTE(review): the second pass passes nLen as the limit; whether the result
// ends up NUL-terminated depends on what wxCharBuffer(size_t) allocates and
// on the converter - confirm.
// NOTE(review): the condition guarding the conversion (presumably a NULL
// check on psz) and the success return are not visible in this view.
174 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *psz
) const
// First pass: NULL destination, so only the length is computed.
178 size_t nLen
= WC2MB((char *) NULL
, psz
, 0);
179 if (nLen
== (size_t)-1)
180 return wxCharBuffer((char *) NULL
);
181 wxCharBuffer
buf(nLen
);
// Second pass: the double cast goes through the buffer's const pointer
// conversion and then drops the const to obtain a writable pointer.
182 WC2MB((char *)(const char *) buf
, psz
, nLen
);
186 return wxCharBuffer((char *) NULL
);
189 // ----------------------------------------------------------------------------
190 // standard file conversion
191 // ----------------------------------------------------------------------------
193 WXDLLEXPORT_DATA(wxMBConvFile
) wxConvFile
;
195 // just use the libc conversion for now
196 size_t wxMBConvFile::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
198 return wxMB2WC(buf
, psz
, n
);
201 size_t wxMBConvFile::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
203 return wxWC2MB(buf
, psz
, n
);
206 // ----------------------------------------------------------------------------
207 // standard gdk conversion
208 // ----------------------------------------------------------------------------
212 WXDLLEXPORT_DATA(wxMBConvGdk
) wxConvGdk
;
// Multibyte -> wide conversion through GDK's gdk_mbstowcs().
//
// Two paths are visible: one converts straight into buf (cast to GdkWChar*)
// with the caller's limit n; the other allocates a temporary GdkWChar array
// and converts into it, which gives the resulting length without touching
// buf.
//
// NOTE(review): the condition selecting between the paths (presumably whether
// buf is NULL) is not visible here, and neither is a delete[] for nbuf -
// confirm the temporary array is released.
216 size_t wxMBConvGdk::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
220 return gdk_mbstowcs((GdkWChar
*)buf
, psz
, n
);
// n is overwritten with strlen(psz) inside the array-size expression, so the
// temporary array has strlen(psz) elements and the call below uses the same
// value as its limit.
224 GdkWChar
*nbuf
= new GdkWChar
[n
=strlen(psz
)];
225 size_t len
= gdk_mbstowcs(nbuf
, psz
, n
);
// Wide -> multibyte conversion through GDK's gdk_wcstombs().
//
// The converted string is obtained in mbstr; len is its strlen(), or 0 when
// gdk_wcstombs() returned NULL.
//
// NOTE(review): the memcpy below copies `len` bytes from psz (the *wide input*)
// into buf rather than from mbstr (the converted string), so the conversion
// result is never written to the caller's buffer. The source should almost
// certainly be mbstr.
// NOTE(review): the guard around the memcpy, any clamping of len to n, and
// the release of mbstr are not visible in this view - confirm.
231 size_t wxMBConvGdk::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
233 char *mbstr
= gdk_wcstombs((GdkWChar
*)psz
);
234 size_t len
= mbstr
? strlen(mbstr
) : 0;
239 memcpy(buf
, psz
, len
);
248 // ----------------------------------------------------------------------------
250 // ----------------------------------------------------------------------------
252 WXDLLEXPORT_DATA(wxMBConvUTF7
) wxConvUTF7
;
255 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
256 "abcdefghijklmnopqrstuvwxyz"
257 "0123456789'(),-./:?";
258 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
259 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
260 "abcdefghijklmnopqrstuvwxyz"
264 // TODO: write actual implementations of UTF-7 here
265 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
266 const char * WXUNUSED(psz
),
267 size_t WXUNUSED(n
)) const
272 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
273 const wchar_t * WXUNUSED(psz
),
274 size_t WXUNUSED(n
)) const
279 // ----------------------------------------------------------------------------
281 // ----------------------------------------------------------------------------
283 WXDLLEXPORT_DATA(wxMBConvUTF8
) wxConvUTF8
;
285 static wxUint32 utf8_max
[]=
286 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
// Decodes the UTF-8 string psz into wide characters.
//
// The main loop runs until the terminating NUL of psz and, when a destination
// buffer is given, only while len < n; with a NULL buf it runs to the end of
// the input, so the function can be used to measure. Each decoded value is
// handed to encode_utf16(), whose (size_t)-1 result is checked as an error.
//
// NOTE(review): several statements between the visible lines (the update of
// fc inside the counting loop, adjustments to cnt, the continuation-byte
// loop, the length bookkeeping and the returns) are not visible in this view;
// the notes below describe only what can be seen.
288 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
292 while (*psz
&& ((!buf
) || (len
< n
)))
// cc keeps the lead byte; fc is a working copy examined by the loop below.
294 unsigned char cc
= *psz
++, fc
= cc
;
// Counts iterations while the top bit of fc is set.
296 for (cnt
= 0; fc
& 0x80; cnt
++)
310 // invalid UTF-8 sequence
// ocnt remembers cnt - 1 for the range check further down.
315 unsigned ocnt
= cnt
- 1;
// Payload bits of the lead byte: the mask narrows as cnt grows.
316 wxUint32 res
= cc
& (0x3f >> cnt
);
// A continuation byte must have the bit pattern 10xxxxxx.
320 if ((cc
& 0xC0) != 0x80)
322 // invalid UTF-8 sequence
// Each continuation byte contributes its low 6 bits.
325 res
= (res
<< 6) | (cc
& 0x3f);
// A value not above utf8_max[ocnt] is rejected here (see the comment below).
327 if (res
<= utf8_max
[ocnt
])
329 // illegal UTF-8 encoding
333 size_t pa
= encode_utf16(res
, buf
);
334 if (pa
== (size_t)-1)
347 if (buf
&& (len
< n
))
// Encodes the wide string psz as UTF-8.
//
// The main loop runs until the terminating NUL of psz and, when a destination
// buffer is given, only while len < n; with a NULL buf it runs to the end of
// the input. After the loop a terminating NUL is stored only if buf is
// non-NULL and there is still room (len < n).
//
// NOTE(review): lines 360-361 and line 363 both obtain the next code point
// (via decode_utf16() and by reading *psz directly); presumably they are
// alternative preprocessor branches whose directives are not visible here.
// NOTE(review): the length bookkeeping, the single-byte case and the returns
// are not visible in this view.
352 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
356 while (*psz
&& ((!buf
) || (len
< n
)))
360 size_t pa
= decode_utf16(psz
, cc
);
// On a decode failure skip a single unit, otherwise the units consumed.
361 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
363 cc
=(*psz
++) & 0x7fffffff;
// Find the first utf8_max entry that can hold cc; the loop body is empty.
366 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
// Lead byte: (-128 >> cnt) supplies the run of leading 1 bits.
// NOTE(review): this relies on right-shifting a negative int being an
// arithmetic shift, which is implementation-defined before C++20.
380 *buf
++ = (-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
));
// Continuation byte: 10xxxxxx carrying 6 bits of cc.
382 *buf
++ = 0x80 | ((cc
>> (cnt
* 6)) & 0x3f);
387 if (buf
&& (len
<n
)) *buf
= 0;
391 // ----------------------------------------------------------------------------
392 // specified character set
393 // ----------------------------------------------------------------------------
395 WXDLLEXPORT_DATA(wxCSConv
) wxConvLocal((const wxChar
*)NULL
);
397 #include "wx/encconv.h"
398 #include "wx/fontmap.h"
400 // TODO: add some tables here
401 // - perhaps common encodings to common codepages (for Win32)
402 // - perhaps common encodings to objects ("UTF8" -> wxConvUTF8)
403 // - move wxEncodingConverter meat in here
405 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
406 #include "wx/msw/registry.h"
407 // this should work if Microsoft Internet Explorer is installed
408 static long CharsetToCodepage(const wxChar
*name
)
417 wxString
path(wxT("MIME\\Database\\Charset\\"));
419 wxRegKey
key(wxRegKey::HKCR
, path
);
421 if (!key
.Exists()) break;
423 // two cases: either there's an AliasForCharset string,
424 // or there are Codepage and InternetEncoding dwords.
425 // The InternetEncoding gives us the actual encoding,
426 // the Codepage just says which Windows character set to
427 // use when displaying the data.
428 if (key
.HasValue(wxT("InternetEncoding")) &&
429 key
.QueryValue(wxT("InternetEncoding"), &CP
)) break;
431 // no encoding, see if it's an alias
432 if (!key
.HasValue(wxT("AliasForCharset")) ||
433 !key
.QueryValue(wxT("AliasForCharset"), cn
)) break;
443 wxCharacterSet(const wxChar
*name
)
445 virtual ~wxCharacterSet()
447 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
)
448 { return (size_t)-1; }
449 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
)
450 { return (size_t)-1; }
451 virtual bool usable()
457 class ID_CharSet
: public wxCharacterSet
460 ID_CharSet(const wxChar
*name
,wxMBConv
*cnv
)
461 : wxCharacterSet(name
), work(cnv
) {}
463 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
)
464 { return work
? work
->MB2WC(buf
,psz
,n
) : (size_t)-1; }
466 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
)
467 { return work
? work
->WC2MB(buf
,psz
,n
) : (size_t)-1; }
470 { return work
!=NULL
; }
478 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
479 // if output buffer is _exactly_ as big as needed. Such a case is (unless there's
480 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
481 // (which means error) and says there are 0 bytes left in the input buffer --
482 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
483 // this alternative test for iconv() failure.
484 // [This bug does not appear in glibc 2.2.]
485 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
486 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
487 (errno != E2BIG || bufLeft != 0))
489 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
492 class IC_CharSet
: public wxCharacterSet
495 IC_CharSet(const wxChar
*name
)
496 : wxCharacterSet(name
)
498 m2w
= iconv_open(WC_NAME
, wxConvLibc
.cWX2MB(cname
));
499 w2m
= iconv_open(wxConvLibc
.cWX2MB(cname
), WC_NAME
);
504 if ( m2w
!= (iconv_t
)-1 )
506 if ( w2m
!= (iconv_t
)-1 )
510 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
)
512 size_t inbuf
= strlen(psz
);
513 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
515 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
516 wchar_t *bufPtr
= buf
;
517 const char *pszPtr
= psz
;
521 // have destination buffer, convert there
522 #ifdef WX_ICONV_TAKES_CHAR
523 cres
= iconv(m2w
, (char**)&pszPtr
, &inbuf
, (char**)&bufPtr
, &outbuf
);
525 cres
= iconv(m2w
, &pszPtr
, &inbuf
, (char**)&bufPtr
, &outbuf
);
527 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
528 // convert to native endianness
530 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
535 // no destination buffer... convert using temp buffer
536 // to calculate destination buffer requirement
540 bufPtr
= tbuf
; outbuf
= 8*SIZEOF_WCHAR_T
;
541 #ifdef WX_ICONV_TAKES_CHAR
542 cres
= iconv( m2w
, (char**)&pszPtr
, &inbuf
, (char**)&bufPtr
, &outbuf
);
544 cres
= iconv( m2w
, &pszPtr
, &inbuf
, (char**)&bufPtr
, &outbuf
);
546 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
547 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
550 if (ICONV_FAILED(cres
, inbuf
))
// Wide -> multibyte conversion through the iconv descriptor w2m.
//
// inbuf is the input size in BYTES (wcslen(psz) * SIZEOF_WCHAR_T), which is
// what iconv() expects. With a destination buffer the text is converted
// straight into it; without one it is converted repeatedly into a scratch
// buffer (16 bytes per round, repeating while iconv() fails with E2BIG) to
// work out the required size. The result is finally checked with
// ICONV_FAILED(cres, inbuf).
//
// NOTE(review): in the byte-swapping path inbuf is already a byte count, yet
// the allocation and the memcpy use (inbuf+1)*SIZEOF_WCHAR_T. The source
// string occupies only inbuf + SIZEOF_WCHAR_T bytes including its terminator,
// so for any non-empty string the memcpy reads past the end of psz. WC_BSWAP
// is likewise given the byte count although the macro indexes elements. These
// sizes look like they were meant to be element counts - confirm and fix.
// NOTE(review): iconv() is passed &psz and &buf, so it advances the
// function's own parameters; the declarations of cres/outbuf/tbuf, the
// release of tmpbuf and the returns are not visible in this view.
556 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
)
558 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
559 size_t inbuf
= std::wcslen(psz
) * SIZEOF_WCHAR_T
;
561 size_t inbuf
= ::wcslen(psz
) * SIZEOF_WCHAR_T
;
567 // need to copy to temp buffer to switch endianness
568 // this absolutely doesn't rock!
569 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
570 // could be in read-only memory, or be accessed in some other thread)
571 wchar_t *tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
572 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
573 WC_BSWAP(tmpbuf
, inbuf
)
578 // have destination buffer, convert there
579 #ifdef WX_ICONV_TAKES_CHAR
580 cres
= iconv( w2m
, (char**)&psz
, &inbuf
, &buf
, &outbuf
);
582 cres
= iconv( w2m
, (const char**)&psz
, &inbuf
, &buf
, &outbuf
);
588 // no destination buffer... convert using temp buffer
589 // to calculate destination buffer requirement
593 buf
= tbuf
; outbuf
= 16;
594 #ifdef WX_ICONV_TAKES_CHAR
595 cres
= iconv( w2m
, (char**)&psz
, &inbuf
, &buf
, &outbuf
);
597 cres
= iconv( w2m
, (const char**)&psz
, &inbuf
, &buf
, &outbuf
);
600 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
605 if (ICONV_FAILED(cres
, inbuf
))
612 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
619 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
620 class CP_CharSet
: public wxCharacterSet
623 CP_CharSet(const wxChar
*name
)
624 : wxCharacterSet(name
), CodePage(CharsetToCodepage(name
)) {}
626 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
)
629 MultiByteToWideChar(CodePage
, 0, psz
, -1, buf
, buf
? n
: 0);
630 //VS: returns # of written chars for buf!=NULL and *size*
631 // needed buffer for buf==NULL
632 return len
? (buf
? len
: len
-1) : (size_t)-1;
635 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
)
637 size_t len
= WideCharToMultiByte(CodePage
, 0, psz
, -1, buf
,
638 buf
? n
: 0, NULL
, NULL
);
639 //VS: returns # of written chars for buf!=NULL and *size*
640 // needed buffer for buf==NULL
641 return len
? (buf
? len
: len
-1) : (size_t)-1;
645 { return CodePage
!= -1; }
654 class EC_CharSet
: public wxCharacterSet
657 // temporarily just use wxEncodingConverter stuff,
658 // so that it works while a better implementation is built
659 EC_CharSet(const wxChar
*name
) : wxCharacterSet(name
),
660 enc(wxFONTENCODING_SYSTEM
)
663 enc
= wxTheFontMapper
->CharsetToEncoding(name
, FALSE
);
664 m2w
.Init(enc
, wxFONTENCODING_UNICODE
);
665 w2m
.Init(wxFONTENCODING_UNICODE
, enc
);
668 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
)
670 size_t inbuf
= strlen(psz
);
672 m2w
.Convert(psz
,buf
);
676 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
)
678 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
679 size_t inbuf
= std::wcslen(psz
);
681 size_t inbuf
= ::wcslen(psz
);
684 w2m
.Convert(psz
,buf
);
690 { return (enc
!=wxFONTENCODING_SYSTEM
) && (enc
!=wxFONTENCODING_DEFAULT
); }
694 wxEncodingConverter m2w
, w2m
;
697 #endif // wxUSE_FONTMAP
699 static wxCharacterSet
*wxGetCharacterSet(const wxChar
*name
)
701 wxCharacterSet
*cset
= NULL
;
704 if (wxStricmp(name
, wxT("UTF8")) == 0 || wxStricmp(name
, wxT("UTF-8")) == 0)
706 cset
= new ID_CharSet(name
, &wxConvUTF8
);
711 cset
= new IC_CharSet(name
); // may not take NULL
716 if (cset
&& cset
->usable())
725 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
726 cset
= new CP_CharSet(name
); // may take NULL
734 cset
= new EC_CharSet(name
);
737 #endif // wxUSE_FONTMAP
740 wxLogError(_("Unknown encoding '%s'!"), name
);
744 wxCSConv::wxCSConv(const wxChar
*charset
)
746 m_name
= (wxChar
*)NULL
;
747 m_cset
= (wxCharacterSet
*) NULL
;
753 wxCSConv::~wxCSConv()
759 void wxCSConv::SetName(const wxChar
*charset
)
763 m_name
= wxStrdup(charset
);
768 void wxCSConv::LoadNow()
774 wxString name
= wxLocale::GetSystemEncodingName();
779 m_cset
= wxGetCharacterSet(m_name
);
// Multibyte -> wide conversion for the configured character set.
//
// LoadNow() is called first (through a cast that removes const from `this`).
// One path delegates to m_cset->MB2WC(); the other is a fallback that widens
// every byte of psz to one wchar_t via unsigned char, so bytes >= 0x80 map to
// 0x80..0xff instead of being sign-extended.
//
// NOTE(review): the fallback loop runs c = 0..len inclusive, so it also copies
// the terminating NUL, and in the lines visible here it does not consult n;
// if nothing hidden limits it, a buf smaller than strlen(psz)+1 would be
// overrun. The conditions choosing between the paths (presumably m_cset and
// buf checks) and the returns are not visible - confirm.
784 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
786 ((wxCSConv
*)this)->LoadNow(); // discard constness
789 return m_cset
->MB2WC(buf
, psz
, n
);
792 size_t len
= strlen(psz
);
796 for (size_t c
= 0; c
<= len
; c
++)
797 buf
[c
] = (unsigned char)(psz
[c
]);
// Wide -> multibyte conversion for the configured character set.
//
// LoadNow() is called first (through a cast that removes const from `this`).
// One path delegates to m_cset->WC2MB(); the other is a fallback that stores
// each wide character as one byte, substituting '?' for any value above 0xff.
//
// NOTE(review): the fallback loop runs c = 0..len inclusive, so it also copies
// the terminating NUL, and in the lines visible here it does not consult n;
// if nothing hidden limits it, a buf smaller than wcslen(psz)+1 would be
// overrun. The conditions choosing between the paths (presumably m_cset and
// buf checks) and the returns are not visible - confirm.
803 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
805 ((wxCSConv
*)this)->LoadNow(); // discard constness
808 return m_cset
->WC2MB(buf
, psz
, n
);
811 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
812 size_t len
=std::wcslen(psz
);
814 size_t len
=::wcslen(psz
);
818 for (size_t c
= 0; c
<= len
; c
++)
819 buf
[c
] = (psz
[c
] > 0xff) ? '?' : psz
[c
];
827 class IC_CharSetConverter
830 IC_CharSetConverter(IC_CharSet
*from
, IC_CharSet
*to
)
832 cnv
= iconv_open(wxConvLibc
.cWX2MB(to
->cname
),
833 wxConvLibc
.cWX2MB(from
->cname
));
836 ~IC_CharSetConverter()
838 if (cnv
!= (iconv_t
)-1)
842 size_t Convert(char *buf
, const char *psz
, size_t n
)
844 size_t inbuf
= strlen(psz
);
846 #ifdef WX_ICONV_TAKES_CHAR
847 size_t res
= iconv( cnv
, (char**)&psz
, &inbuf
, &buf
, &outbuf
);
849 size_t res
= iconv( cnv
, &psz
, &inbuf
, &buf
, &outbuf
);
851 if (res
== (size_t)-1)
860 #endif // HAVE_ICONV_H
862 class EC_CharSetConverter
865 EC_CharSetConverter(EC_CharSet
* from
,EC_CharSet
* to
)
866 { cnv
.Init(from
->enc
,to
->enc
); }
868 size_t Convert(char* buf
, const char* psz
, size_t n
)
870 size_t inbuf
= strlen(psz
);
871 if (buf
) cnv
.Convert(psz
,buf
);
876 wxEncodingConverter cnv
;
879 #else // !wxUSE_WCHAR_T
881 // ----------------------------------------------------------------------------
882 // stand-ins in absence of wchar_t
883 // ----------------------------------------------------------------------------
885 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
, wxConvFile
;
887 #endif // wxUSE_WCHAR_T