]>
git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
fdb714185f8c7b36b5d5cd344de9c3278ec607bc
1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vadim Zeitlin
9 // Licence: wxWindows license
10 /////////////////////////////////////////////////////////////////////////////
12 // ============================================================================
14 // ============================================================================
16 // ----------------------------------------------------------------------------
18 // ----------------------------------------------------------------------------
21 #pragma implementation "strconv.h"
24 // For compilers that support precompilation, includes "wx.h".
25 #include "wx/wxprec.h"
32 #include "wx/msw/private.h"
47 #ifdef HAVE_LANGINFO_H
56 #include "wx/strconv.h"
60 #if defined(WORDS_BIGENDIAN) || defined(__STDC_ISO_10646__)
61 #define BSWAP_UCS4(str, len)
62 #define BSWAP_UCS2(str, len)
64 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
65 #define BSWAP_UCS2(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
68 #define BSWAP_UTF32(str, len) BSWAP_UCS4(str, len)
69 #define BSWAP_UTF16(str, len) BSWAP_UCS2(str, len)
71 #if SIZEOF_WCHAR_T == 4
72 #define WC_NAME "UCS4"
73 #define WC_BSWAP BSWAP_UCS4
74 #elif SIZEOF_WCHAR_T == 2
75 #define WC_NAME "UTF16"
76 #define WC_BSWAP BSWAP_UTF16
80 // ----------------------------------------------------------------------------
82 // ----------------------------------------------------------------------------
84 WXDLLEXPORT_DATA(wxMBConv
*) wxConvCurrent
= &wxConvLibc
;
86 // ============================================================================
88 // ============================================================================
94 static size_t encode_utf16(wxUint32 input
,wxUint16
*output
)
97 if (output
) *output
++ = input
;
100 if (input
>=0x110000) {
104 *output
++ = (input
>> 10)+0xd7c0;
105 *output
++ = (input
&0x3ff)+0xdc00;
111 static size_t decode_utf16(wxUint16
*input
,wxUint32
&output
)
113 if ((*input
<0xd800) || (*input
>0xdfff)) {
117 if ((input
[1]<0xdc00) || (input
[1]>=0xdfff)) {
121 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
128 // ----------------------------------------------------------------------------
130 // ----------------------------------------------------------------------------
132 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
;
134 size_t wxMBConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
136 return wxMB2WC(buf
, psz
, n
);
139 size_t wxMBConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
141 return wxWC2MB(buf
, psz
, n
);
144 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
148 size_t nLen
= MB2WC((wchar_t *) NULL
, psz
, 0);
149 if (nLen
== (size_t)-1)
150 return wxWCharBuffer((wchar_t *) NULL
);
151 wxWCharBuffer
buf(nLen
);
152 MB2WC((wchar_t *)(const wchar_t *) buf
, psz
, nLen
);
156 return wxWCharBuffer((wchar_t *) NULL
);
159 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *psz
) const
163 size_t nLen
= WC2MB((char *) NULL
, psz
, 0);
164 if (nLen
== (size_t)-1)
165 return wxCharBuffer((char *) NULL
);
166 wxCharBuffer
buf(nLen
);
167 WC2MB((char *)(const char *) buf
, psz
, nLen
);
171 return wxCharBuffer((char *) NULL
);
174 // ----------------------------------------------------------------------------
175 // standard file conversion
176 // ----------------------------------------------------------------------------
178 WXDLLEXPORT_DATA(wxMBConvFile
) wxConvFile
;
180 // just use the libc conversion for now
181 size_t wxMBConvFile::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
183 return wxMB2WC(buf
, psz
, n
);
186 size_t wxMBConvFile::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
188 return wxWC2MB(buf
, psz
, n
);
191 // ----------------------------------------------------------------------------
192 // standard gdk conversion
193 // ----------------------------------------------------------------------------
197 WXDLLEXPORT_DATA(wxMBConvGdk
) wxConvGdk
;
201 size_t wxMBConvGdk::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
204 return gdk_mbstowcs((GdkWChar
*)buf
, psz
, n
);
206 GdkWChar
*nbuf
= new GdkWChar
[n
=strlen(psz
)];
207 size_t len
= gdk_mbstowcs(nbuf
, psz
, n
);
213 size_t wxMBConvGdk::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
215 char *mbstr
= gdk_wcstombs((GdkWChar
*)psz
);
216 size_t len
= mbstr
? strlen(mbstr
) : 0;
218 if (len
> n
) len
= n
;
219 memcpy(buf
, psz
, len
);
220 if (len
< n
) buf
[len
] = 0;
227 // ----------------------------------------------------------------------------
229 // ----------------------------------------------------------------------------
231 WXDLLEXPORT_DATA(wxMBConvUTF7
) wxConvUTF7
;
234 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
235 "abcdefghijklmnopqrstuvwxyz"
236 "0123456789'(),-./:?";
237 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
238 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
239 "abcdefghijklmnopqrstuvwxyz"
243 // TODO: write actual implementations of UTF-7 here
244 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
245 const char * WXUNUSED(psz
),
246 size_t WXUNUSED(n
)) const
251 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
252 const wchar_t * WXUNUSED(psz
),
253 size_t WXUNUSED(n
)) const
258 // ----------------------------------------------------------------------------
260 // ----------------------------------------------------------------------------
262 WXDLLEXPORT_DATA(wxMBConvUTF8
) wxConvUTF8
;
264 static wxUint32 utf8_max
[]={0x7f,0x7ff,0xffff,0x1fffff,0x3ffffff,0x7fffffff,0xffffffff};
266 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
270 while (*psz
&& ((!buf
) || (len
<n
))) {
271 unsigned char cc
=*psz
++, fc
=cc
;
273 for (cnt
=0; fc
&0x80; cnt
++) fc
<<=1;
281 // invalid UTF-8 sequence
285 wxUint32 res
=cc
&(0x3f>>cnt
);
288 if ((cc
&0xC0)!=0x80) {
289 // invalid UTF-8 sequence
292 res
=(res
<<6)|(cc
&0x3f);
294 if (res
<=utf8_max
[ocnt
]) {
295 // illegal UTF-8 encoding
299 size_t pa
= encode_utf16(res
, buf
);
300 if (pa
== (size_t)-1)
311 if (buf
&& (len
<n
)) *buf
= 0;
315 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
319 while (*psz
&& ((!buf
) || (len
<n
))) {
322 size_t pa
= decode_utf16(psz
,cc
);
323 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
325 cc
=(*psz
++)&0x7fffffff;
328 for (cnt
=0; cc
>utf8_max
[cnt
]; cnt
++);
336 *buf
++=(-128>>cnt
)|((cc
>>(cnt
*6))&(0x3f>>cnt
));
338 *buf
++=0x80|((cc
>>(cnt
*6))&0x3f);
342 if (buf
&& (len
<n
)) *buf
= 0;
346 // ----------------------------------------------------------------------------
347 // specified character set
348 // ----------------------------------------------------------------------------
350 WXDLLEXPORT_DATA(wxCSConv
) wxConvLocal((const wxChar
*)NULL
);
352 #include "wx/encconv.h"
353 #include "wx/fontmap.h"
355 // TODO: add some tables here
356 // - perhaps common encodings to common codepages (for Win32)
357 // - perhaps common encodings to objects ("UTF8" -> wxConvUTF8)
358 // - move wxEncodingConverter meat in here
361 #include "wx/msw/registry.h"
362 // this should work if Microsoft Internet Explorer is installed
363 static long CharsetToCodepage(const wxChar
*name
)
372 wxString
path(wxT("MIME\\Database\\Charset\\"));
374 wxRegKey
key(wxRegKey::HKCR
, path
);
376 if (!key
.Exists()) continue;
378 // two cases: either there's an AliasForCharset string,
379 // or there are Codepage and InternetEncoding dwords.
380 // The InternetEncoding gives us the actual encoding,
381 // the Codepage just says which Windows character set to
382 // use when displaying the data.
383 if (key
.HasValue(wxT("InternetEncoding")) &&
384 key
.QueryValue(wxT("InternetEncoding"), &CP
)) break;
386 // no encoding, see if it's an alias
387 if (!key
.HasValue(wxT("AliasForCharset")) ||
388 !key
.QueryValue(wxT("AliasForCharset"), cn
)) break;
398 wxCharacterSet(const wxChar
*name
)
400 virtual ~wxCharacterSet()
402 virtual size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
)
403 { return (size_t)-1; }
404 virtual size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
)
405 { return (size_t)-1; }
406 virtual bool usable()
412 class ID_CharSet
: public wxCharacterSet
415 ID_CharSet(const wxChar
*name
,wxMBConv
*cnv
)
416 : wxCharacterSet(name
), work(cnv
) {}
418 size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
)
419 { return work
? work
->MB2WC(buf
,psz
,n
) : (size_t)-1; }
421 size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
)
422 { return work
? work
->WC2MB(buf
,psz
,n
) : (size_t)-1; }
425 { return work
!=NULL
; }
433 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
434 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
435 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
436 // (which means error) and says there are 0 bytes left in the input buffer --
437 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
438 // this alternative test for iconv() failure.
439 // [This bug does not appear in glibc 2.2.]
440 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
441 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
442 (errno != E2BIG || bufLeft != 0))
444 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
447 class IC_CharSet
: public wxCharacterSet
450 IC_CharSet(const wxChar
*name
)
451 : wxCharacterSet(name
)
453 m2w
= iconv_open(WC_NAME
, wxConvLibc
.cWX2MB(cname
));
454 w2m
= iconv_open(wxConvLibc
.cWX2MB(cname
), WC_NAME
);
459 if ( m2w
!= (iconv_t
)-1 )
461 if ( w2m
!= (iconv_t
)-1 )
465 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
)
467 size_t inbuf
= strlen(psz
);
468 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
470 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
471 wchar_t *bufPtr
= buf
;
472 const char *pszPtr
= psz
;
476 // have destination buffer, convert there
477 #ifdef WX_ICONV_TAKES_CHAR
478 cres
= iconv(m2w
, (char**)&pszPtr
, &inbuf
, (char**)&bufPtr
, &outbuf
);
480 cres
= iconv(m2w
, &pszPtr
, &inbuf
, (char**)&bufPtr
, &outbuf
);
482 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
483 // convert to native endianness
485 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
490 // no destination buffer... convert using temp buffer
491 // to calculate destination buffer requirement
495 bufPtr
= tbuf
; outbuf
= 8*SIZEOF_WCHAR_T
;
496 #ifdef WX_ICONV_TAKES_CHAR
497 cres
= iconv( m2w
, (char**)&pszPtr
, &inbuf
, (char**)&bufPtr
, &outbuf
);
499 cres
= iconv( m2w
, &pszPtr
, &inbuf
, (char**)&bufPtr
, &outbuf
);
501 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
502 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
505 if (ICONV_FAILED(cres
, inbuf
))
511 size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
)
513 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
514 size_t inbuf
= std::wcslen(psz
) * SIZEOF_WCHAR_T
;
516 size_t inbuf
= ::wcslen(psz
) * SIZEOF_WCHAR_T
;
522 // need to copy to temp buffer to switch endianness
523 // this absolutely doesn't rock!
524 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
525 // could be in read-only memory, or be accessed in some other thread)
526 wchar_t*tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
527 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
528 WC_BSWAP(tmpbuf
, inbuf
)
533 // have destination buffer, convert there
534 #ifdef WX_ICONV_TAKES_CHAR
535 cres
= iconv( w2m
, (char**)&psz
, &inbuf
, &buf
, &outbuf
);
537 cres
= iconv( w2m
, (const char**)&psz
, &inbuf
, &buf
, &outbuf
);
543 // no destination buffer... convert using temp buffer
544 // to calculate destination buffer requirement
548 buf
= tbuf
; outbuf
= 16;
549 #ifdef WX_ICONV_TAKES_CHAR
550 cres
= iconv( w2m
, (char**)&psz
, &inbuf
, &buf
, &outbuf
);
552 cres
= iconv( w2m
, (const char**)&psz
, &inbuf
, &buf
, &outbuf
);
555 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
560 if (ICONV_FAILED(cres
, inbuf
))
567 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
575 class CP_CharSet
: public wxCharacterSet
578 CP_CharSet(const wxChar
*name
)
579 : wxCharacterSet(name
), CodePage(CharsetToCodepage(name
)) {}
581 size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
)
583 size_t len
= MultiByteToWideChar(CodePage
,0,psz
,-1,buf
,buf
?n
:0);
584 return len
? len
: (size_t)-1;
587 size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
)
589 size_t len
= WideCharToMultiByte(CodePage
,0,psz
,-1,buf
,buf
?n
:0,NULL
,NULL
);
590 return len
? len
: (size_t)-1;
594 { return CodePage
!=-1; }
601 class EC_CharSet
: public wxCharacterSet
604 // temporarily just use wxEncodingConverter stuff,
605 // so that it works while a better implementation is built
606 EC_CharSet(const wxChar
*name
) : wxCharacterSet(name
), enc(wxFONTENCODING_SYSTEM
)
609 enc
= wxTheFontMapper
->CharsetToEncoding(name
, FALSE
);
610 m2w
.Init(enc
, wxFONTENCODING_UNICODE
);
611 w2m
.Init(wxFONTENCODING_UNICODE
, enc
);
614 size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
)
616 size_t inbuf
= strlen(psz
);
617 if (buf
) m2w
.Convert(psz
,buf
);
621 size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
)
623 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
624 size_t inbuf
= std::wcslen(psz
);
626 size_t inbuf
= ::wcslen(psz
);
629 w2m
.Convert(psz
,buf
);
635 { return (enc
!=wxFONTENCODING_SYSTEM
) && (enc
!=wxFONTENCODING_DEFAULT
); }
639 wxEncodingConverter m2w
, w2m
;
642 static wxCharacterSet
*wxGetCharacterSet(const wxChar
*name
)
644 wxCharacterSet
*cset
= NULL
;
647 if (!wxStricmp(name
, wxT("UTF8")) || !wxStricmp(name
, wxT("UTF-8")))
649 cset
= new ID_CharSet(name
, &wxConvUTF8
);
654 cset
= new IC_CharSet(name
); // may not take NULL
659 if (cset
&& cset
->usable()) return cset
;
660 if (cset
) delete cset
;
663 cset
= new CP_CharSet(name
); // may take NULL
664 if (cset
->usable()) return cset
;
666 if (cset
) delete cset
;
667 cset
= new EC_CharSet(name
);
668 if (cset
->usable()) return cset
;
670 wxLogError(_("Unknown encoding '%s'!"), name
);
674 wxCSConv::wxCSConv(const wxChar
*charset
)
676 m_name
= (wxChar
*) NULL
;
677 m_cset
= (wxCharacterSet
*) NULL
;
682 wxCSConv::~wxCSConv()
684 if (m_name
) free(m_name
);
685 if (m_cset
) delete m_cset
;
688 void wxCSConv::SetName(const wxChar
*charset
)
692 m_name
= wxStrdup(charset
);
697 void wxCSConv::LoadNow()
704 #if defined(HAVE_LANGINFO_H) && defined(CODESET)
705 // GNU libc provides current character set this way
706 char*alang
= nl_langinfo(CODESET
);
709 SetName(wxConvLibc
.cMB2WX(alang
));
714 // if we can't get at the character set directly,
715 // try to see if it's in the environment variables
716 // (in most cases this won't work, but I was out of ideas)
717 wxChar
*lang
= wxGetenv(wxT("LC_ALL"));
718 if (!lang
) lang
= wxGetenv(wxT("LC_CTYPE"));
719 if (!lang
) lang
= wxGetenv(wxT("LANG"));
720 wxChar
*dot
= lang
? wxStrchr(lang
, wxT('.')) : (wxChar
*)NULL
;
721 if (dot
) SetName(dot
+1);
725 m_cset
= wxGetCharacterSet(m_name
);
730 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
732 ((wxCSConv
*)this)->LoadNow(); // discard constness
735 return m_cset
->MB2WC(buf
, psz
, n
);
738 size_t len
=strlen(psz
);
742 for (size_t c
=0; c
<=len
; c
++)
743 buf
[c
] = (unsigned char)(psz
[c
]);
749 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
751 ((wxCSConv
*)this)->LoadNow(); // discard constness
754 return m_cset
->WC2MB(buf
, psz
, n
);
757 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
758 size_t len
=std::wcslen(psz
);
760 size_t len
=::wcslen(psz
);
764 for (size_t c
=0; c
<=len
; c
++)
765 buf
[c
] = (psz
[c
]>0xff) ? '?' : psz
[c
];
772 class IC_CharSetConverter
775 IC_CharSetConverter(IC_CharSet
*from
,IC_CharSet
*to
)
776 { cnv
= iconv_open(wxConvLibc
.cWX2MB(to
->cname
),wxConvLibc
.cWX2MB(from
->cname
)); }
778 ~IC_CharSetConverter()
779 { if (cnv
!=(iconv_t
)-1) iconv_close(cnv
); }
781 size_t Convert(char*buf
, const char*psz
, size_t n
)
783 size_t inbuf
= strlen(psz
);
785 #ifdef WX_ICONV_TAKES_CHAR
786 size_t res
= iconv( cnv
, (char**)&psz
, &inbuf
, &buf
, &outbuf
);
788 size_t res
= iconv( cnv
, &psz
, &inbuf
, &buf
, &outbuf
);
790 if (res
==(size_t)-1) return (size_t)-1;
799 class EC_CharSetConverter
802 EC_CharSetConverter(EC_CharSet
*from
,EC_CharSet
*to
)
803 { cnv
.Init(from
->enc
,to
->enc
); }
805 size_t Convert(char*buf
, const char*psz
, size_t n
)
807 size_t inbuf
= strlen(psz
);
808 if (buf
) cnv
.Convert(psz
,buf
);
813 wxEncodingConverter cnv
;
816 #else // !wxUSE_WCHAR_T
818 // ----------------------------------------------------------------------------
819 // stand-ins in absence of wchar_t
820 // ----------------------------------------------------------------------------
822 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
, wxConvFile
;
824 #endif // wxUSE_WCHAR_T