]>
git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
ab68a01e14520725a5a03974b9f880dff2876e7a
1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vadim Zeitlin
9 // Licence: wxWindows license
10 /////////////////////////////////////////////////////////////////////////////
12 // ============================================================================
14 // ============================================================================
16 // ----------------------------------------------------------------------------
18 // ----------------------------------------------------------------------------
21 #pragma implementation "strconv.h"
24 // For compilers that support precompilation, includes "wx.h".
25 #include "wx/wxprec.h"
32 #include "wx/msw/private.h"
47 #ifdef HAVE_LANGINFO_H
52 #include "wx/strconv.h"
// Byte-order helpers and the name of the wchar_t encoding.
// When WORDS_BIGENDIAN is defined the swap macros expand to nothing.
// NOTE(review): the #else/#endif lines of these conditionals are not present
// in this excerpt; the loop-based definitions below presumably belong to the
// branch taken when WORDS_BIGENDIAN is not defined.
54 #ifdef WORDS_BIGENDIAN
55 #define BSWAP_UCS4(str, len)
56 #define BSWAP_UCS2(str, len)
// Swap the bytes of each of the first len elements of str, in place.
// NOTE(review): str and len are used unparenthesised and the expansion is a
// bare { } block, so these macros are only safe with simple arguments and
// where a compound statement is acceptable.
58 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
59 #define BSWAP_UCS2(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
// The UTF-32/UTF-16 variants simply reuse the UCS-4/UCS-2 swaps.
62 #define BSWAP_UTF32(str, len) BSWAP_UCS4(str, len)
63 #define BSWAP_UTF16(str, len) BSWAP_UCS2(str, len)
// WC_NAME is the encoding name handed to iconv_open() for wchar_t data and
// WC_BSWAP the matching swap macro, both selected by sizeof(wchar_t).
65 #if SIZEOF_WCHAR_T == 4
66 #define WC_NAME "UCS4"
67 #define WC_BSWAP BSWAP_UCS4
68 #elif SIZEOF_WCHAR_T == 2
69 #define WC_NAME "UTF16"
70 #define WC_BSWAP BSWAP_UTF16
74 // ----------------------------------------------------------------------------
76 // ----------------------------------------------------------------------------
// Exported global converter pointer, initialised to the address of wxConvLibc.
78 WXDLLEXPORT_DATA(wxMBConv
*) wxConvCurrent
= &wxConvLibc
;
80 // ============================================================================
82 // ============================================================================
// Encodes the code point `input` as UTF-16 code units written through
// `output`.
// NOTE(review): several lines of this function (braces, the first range test
// and every return statement) are not present in this excerpt. Callers compare
// the result against (size_t)-1, so that is presumably the error return, and
// the other returns presumably give the number of units produced - confirm
// against the full source.
86 static size_t encode_utf16(wxUint32 input
,wxUint16
*output
)
// Single-unit case: the store is skipped when output is NULL, so the function
// can be called without a destination.
89 if (output
) *output
++ = input
;
// Values at or above 0x110000 get a branch of their own (body not shown).
92 if (input
>=0x110000) {
// Surrogate pair. 0xd7c0 equals 0xd800 - (0x10000 >> 10), so adding it to
// (input >> 10) subtracts the 0x10000 offset and adds the high-surrogate base
// in one step; the low surrogate carries the bottom ten bits.
// NOTE(review): no NULL check on output is visible around these two stores;
// confirm that a guard exists on one of the missing lines.
96 *output
++ = (input
>> 10)+0xd7c0;
97 *output
++ = (input
&0x3ff)+0xdc00;
// Decodes one code point from the UTF-16 code units at `input` into `output`.
// NOTE(review): braces, the assignments in the first two branches and every
// return statement are not present in this excerpt. The caller treats
// (size_t)-1 as an error and otherwise advances by the returned value.
103 static size_t decode_utf16(wxUint16
*input
,wxUint32
&output
)
// First unit outside 0xd800..0xdfff: not a surrogate.
105 if ((*input
<0xd800) || (*input
>0xdfff)) {
// Second unit must be a low surrogate.
// NOTE(review): the upper bound is tested with >=0xdfff, which also rejects
// 0xdfff itself although 0xdc00..0xdfff is the low-surrogate range; `>0xdfff`
// looks intended.
// NOTE(review): a first unit in 0xdc00..0xdfff (a lone low surrogate) is not
// rejected by the test above and reaches this pair handling as well.
109 if ((input
[1]<0xdc00) || (input
[1]>=0xdfff)) {
// Recombine the pair: 0xd7c0 is 0xd800 - 0x40, so the shifted high part
// already includes the 0x10000 offset.
113 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
118 // ----------------------------------------------------------------------------
120 // ----------------------------------------------------------------------------
// Exported global wxMBConv instance; its MB2WC/WC2MB forward to
// wxMB2WC/wxWC2MB.
122 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
;
// Multibyte -> wide conversion: forwards buf, psz and n unchanged to wxMB2WC
// and returns its result.
124 size_t wxMBConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
126 return wxMB2WC(buf
, psz
, n
);
// Wide -> multibyte conversion: forwards buf, psz and n unchanged to wxWC2MB
// and returns its result.
129 size_t wxMBConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
131 return wxWC2MB(buf
, psz
, n
);
// Converts the multibyte string psz and returns the result in a
// wxWCharBuffer.
// NOTE(review): the lines that return buf on success and the condition
// guarding the final NULL-buffer return are not present in this excerpt.
134 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
// Pass 1: NULL destination, only the required length is computed.
138 size_t nLen
= MB2WC((wchar_t *) NULL
, psz
, 0);
// (size_t)-1 signals a conversion failure: hand back a NULL buffer.
139 if (nLen
== (size_t)-1)
140 return wxWCharBuffer((wchar_t *) NULL
);
// Pass 2: convert into a buffer created from nLen.
// NOTE(review): the limit passed is nLen as well, so whether a terminating
// NUL ends up in the buffer depends on how wxWCharBuffer sizes its storage
// and on the converter's handling of n - confirm.
141 wxWCharBuffer
buf(nLen
);
142 MB2WC((wchar_t *)(const wchar_t *) buf
, psz
, nLen
);
146 return wxWCharBuffer((wchar_t *) NULL
);
// Converts the wide string psz and returns the result in a wxCharBuffer.
// NOTE(review): the lines that return buf on success and the condition
// guarding the final NULL-buffer return are not present in this excerpt.
149 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *psz
) const
// Pass 1: NULL destination, only the required length is computed.
153 size_t nLen
= WC2MB((char *) NULL
, psz
, 0);
// (size_t)-1 signals a conversion failure: hand back a NULL buffer.
154 if (nLen
== (size_t)-1)
155 return wxCharBuffer((char *) NULL
);
// Pass 2: convert into a buffer created from nLen.
// NOTE(review): the limit passed is nLen as well, so whether a terminating
// NUL ends up in the buffer depends on how wxCharBuffer sizes its storage
// and on the converter's handling of n - confirm.
156 wxCharBuffer
buf(nLen
);
157 WC2MB((char *)(const char *) buf
, psz
, nLen
);
161 return wxCharBuffer((char *) NULL
);
164 // ----------------------------------------------------------------------------
165 // standard file conversion
166 // ----------------------------------------------------------------------------
// Exported global wxMBConvFile instance.
168 WXDLLEXPORT_DATA(wxMBConvFile
) wxConvFile
;
170 // just use the libc conversion for now
// Forwards buf, psz and n unchanged to wxMB2WC and returns its result.
171 size_t wxMBConvFile::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
173 return wxMB2WC(buf
, psz
, n
);
// Forwards buf, psz and n unchanged to wxWC2MB and returns its result.
176 size_t wxMBConvFile::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
178 return wxWC2MB(buf
, psz
, n
);
181 // ----------------------------------------------------------------------------
182 // standard gdk conversion
183 // ----------------------------------------------------------------------------
// Exported global wxMBConvGdk instance.
187 WXDLLEXPORT_DATA(wxMBConvGdk
) wxConvGdk
;
// Multibyte -> wide conversion through gdk_mbstowcs.
// NOTE(review): the condition choosing between the two paths below, and the
// end of the second path, are not present in this excerpt.
191 size_t wxMBConvGdk::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// Path 1: convert straight into the caller's buffer.
// NOTE(review): buf is reinterpreted as GdkWChar*, which assumes wchar_t and
// GdkWChar have the same size - confirm for the supported platforms.
194 return gdk_mbstowcs((GdkWChar
*)buf
, psz
, n
);
// Path 2: convert into a scratch array to learn the length. n is overwritten
// with strlen(psz) and used as the capacity of the scratch array.
// NOTE(review): no delete[] for nbuf is visible here; confirm it is released
// on one of the missing lines.
196 GdkWChar
*nbuf
= new GdkWChar
[n
=strlen(psz
)];
197 size_t len
= gdk_mbstowcs(nbuf
, psz
, n
);
// Wide -> multibyte conversion through gdk_wcstombs.
// NOTE(review): the guard around the buffer handling and the return
// statement are not present in this excerpt.
203 size_t wxMBConvGdk::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
// gdk_wcstombs produces the converted string; the cast drops the const
// qualifier of psz.
// NOTE(review): mbstr is never released in the lines shown here. The GDK
// documentation says the result must be freed with g_free() - confirm this is
// not a leak.
205 char *mbstr
= gdk_wcstombs((GdkWChar
*)psz
);
// A NULL result (conversion failure) is treated as length 0.
206 size_t len
= mbstr
? strlen(mbstr
) : 0;
// Clamp to the destination capacity.
208 if (len
> n
) len
= n
;
// NOTE(review): this copies len bytes from psz (the wide-character INPUT),
// not from mbstr (the converted string whose length was just measured).
// Copying from mbstr looks intended.
209 memcpy(buf
, psz
, len
);
// Terminate only when there is room left.
210 if (len
< n
) buf
[len
] = 0;
217 // ----------------------------------------------------------------------------
219 // ----------------------------------------------------------------------------
// Exported global wxMBConvUTF7 instance.
221 WXDLLEXPORT_DATA(wxMBConvUTF7
) wxConvUTF7
;
// Character tables for the UTF-7 converter. Nothing in the visible part of
// the file reads them (the UTF-7 conversions are still TODO stubs).
// NOTE(review): the names presumably follow the "Set D" (directly encoded),
// "Set O" (optional direct) and "Set B" (modified base64) character sets of
// RFC 2152 - confirm.
// NOTE(review): the end of the utf7_setB initialiser is not present in this
// excerpt.
224 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
225 "abcdefghijklmnopqrstuvwxyz"
226 "0123456789'(),-./:?";
227 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
228 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
229 "abcdefghijklmnopqrstuvwxyz"
233 // TODO: write actual implementations of UTF-7 here
// Placeholder: all three parameters are wrapped in WXUNUSED, so nothing is
// converted. The body (and its return value) is not present in this excerpt.
234 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
235 const char * WXUNUSED(psz
),
236 size_t WXUNUSED(n
)) const
// Placeholder: all three parameters are wrapped in WXUNUSED, so nothing is
// converted. The body (and its return value) is not present in this excerpt.
241 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
242 const wchar_t * WXUNUSED(psz
),
243 size_t WXUNUSED(n
)) const
248 // ----------------------------------------------------------------------------
250 // ----------------------------------------------------------------------------
// Exported global wxMBConvUTF8 instance.
252 WXDLLEXPORT_DATA(wxMBConvUTF8
) wxConvUTF8
;
// utf8_max[i] is the largest value that fits in an (i+1)-byte UTF-8 sequence
// (7, 11, 16, 21, 26 and 31 payload bits). WC2MB scans this table to find the
// sequence length; the trailing 0xffffffff entry guarantees the scan stops.
254 static wxUint32 utf8_max
[]={0x7f,0x7ff,0xffff,0x1fffff,0x3ffffff,0x7fffffff,0xffffffff};
// UTF-8 -> wide conversion.
// NOTE(review): many lines of this function (declarations of len, cnt and
// ocnt, the error returns, the stores into buf, preprocessor conditionals and
// all braces) are not present in this excerpt; the comments below describe
// only what the visible lines do.
256 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// Run until the input NUL; when a destination is given, also stop once len
// reaches n. With buf == NULL the loop is not limited by n.
260 while (*psz
&& ((!buf
) || (len
<n
))) {
// cc is the lead byte; fc is a copy that gets shifted to count its leading
// 1 bits.
261 unsigned char cc
=*psz
++, fc
=cc
;
// cnt = number of leading 1 bits in the lead byte.
263 for (cnt
=0; fc
&0x80; cnt
++) fc
<<=1;
271 // invalid UTF-8 sequence
// Payload bits of the lead byte: the mask narrows as cnt grows.
275 wxUint32 res
=cc
&(0x3f>>cnt
);
// Every continuation byte must have the form 10xxxxxx.
278 if ((cc
&0xC0)!=0x80) {
279 // invalid UTF-8 sequence
// Append the six payload bits of the continuation byte.
282 res
=(res
<<6)|(cc
&0x3f);
// A value that would have fit in a shorter sequence is rejected.
// NOTE(review): ocnt is assigned on a line not shown here; presumably it
// indexes the table entry for the next-shorter sequence length - confirm.
284 if (res
<=utf8_max
[ocnt
]) {
285 // illegal UTF-8 encoding
// NOTE(review): buf is a wchar_t* while encode_utf16 takes a wxUint16*;
// presumably this call sits in a branch compiled only when wchar_t is
// 16 bits wide (the preprocessor lines are missing) - confirm.
289 size_t pa
= encode_utf16(res
, buf
);
290 if (pa
== (size_t)-1)
// Terminate the output only when there is a buffer with room left.
301 if (buf
&& (len
<n
)) *buf
= 0;
// Wide -> UTF-8 conversion.
// NOTE(review): many lines of this function (declarations of len, cc and cnt,
// the length bookkeeping, preprocessor conditionals and all braces) are not
// present in this excerpt; the comments below describe only what the visible
// lines do.
305 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
// Run until the input NUL; when a destination is given, also stop once len
// reaches n.
309 while (*psz
&& ((!buf
) || (len
<n
))) {
// Variant 1: read one code point through decode_utf16. On error
// ((size_t)-1) the input advances by a single unit, otherwise by the number
// of units consumed.
// NOTE(review): psz is a const wchar_t* while decode_utf16 takes a
// wxUint16*; presumably this variant is compiled only when wchar_t is
// 16 bits wide - confirm.
312 size_t pa
= decode_utf16(psz
,cc
);
313 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
// Variant 2: take the wide character directly, limited to 31 bits.
// NOTE(review): variants 1 and 2 are presumably alternatives selected by a
// preprocessor conditional that is missing here.
315 cc
=(*psz
++)&0x7fffffff;
// cnt = number of continuation bytes needed: first table entry >= cc.
318 for (cnt
=0; cc
>utf8_max
[cnt
]; cnt
++);
// Lead byte: -128>>cnt produces the run of leading 1 bits (0x80, 0xC0,
// 0xE0, ...), OR-ed with the top payload bits.
// NOTE(review): this relies on right-shifting a negative int being an
// arithmetic shift, which is implementation-defined before C++20.
326 *buf
++=(-128>>cnt
)|((cc
>>(cnt
*6))&(0x3f>>cnt
));
// Continuation byte: 10xxxxxx carrying the next six payload bits.
328 *buf
++=0x80|((cc
>>(cnt
*6))&0x3f);
// Terminate the output only when there is a buffer with room left.
332 if (buf
&& (len
<n
)) *buf
= 0;
336 // ----------------------------------------------------------------------------
337 // specified character set
338 // ----------------------------------------------------------------------------
// Exported global wxCSConv instance constructed with a NULL charset name.
340 WXDLLEXPORT_DATA(wxCSConv
) wxConvLocal((const wxChar
*)NULL
);
342 #include "wx/encconv.h"
343 #include "wx/fontmap.h"
345 // TODO: add some tables here
346 // - perhaps common encodings to common codepages (for Win32)
347 // - perhaps common encodings to objects ("UTF8" -> wxConvUTF8)
348 // - move wxEncodingConverter meat in here
351 #include "wx/msw/registry.h"
352 // this should work if Microsoft Internet Explorer is installed
// Maps a charset name to a Windows code page using the registry key
// HKEY_CLASSES_ROOT\MIME\Database\Charset\<name>.
// A NULL name yields the system ANSI code page (GetACP()).
// The lookup reads the "InternetEncoding" value into CP, or failing that the
// "AliasForCharset" string into cn; both outcomes leave via `break`.
// NOTE(review): the declarations of CP and cn, the enclosing loop, the code
// that appends the charset name to the path, the closing of the block comment
// below and the return statement are not present in this excerpt.
353 static long CharsetToCodepage(const wxChar
*name
)
355 if (!name
) return GetACP();
359 wxString
path(wxT("MIME\\Database\\Charset\\"));
361 wxRegKey
key(wxRegKey::HKCR
,path
);
363 /* two cases: either there's an AliasForCharset string,
364 * or there are Codepage and InternetEncoding dwords.
365 * The InternetEncoding gives us the actual encoding,
366 * the Codepage just says which Windows character set to
367 * use when displaying the data.
369 if (key
.QueryValue(wxT("InternetEncoding"),&CP
)) break;
370 // no encoding, see if it's an alias
371 if (!key
.QueryValue(wxT("AliasForCharset"),cn
)) break;
// Base class of the character-set back ends: keeps the charset name in cname
// and provides conversions that always fail.
// NOTE(review): the class header and the declaration of cname are not present
// in this excerpt.
381 wxCharacterSet(const wxChar
*name
) : cname(name
) {}
382 virtual ~wxCharacterSet() {}
// Default conversions: always report failure with (size_t)-1.
383 virtual size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
) { return (size_t)-1; }
384 virtual size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
) { return (size_t)-1; }
// The base class is never usable; subclasses override this.
385 virtual bool usable() { return FALSE
; }
// Character set that delegates to an existing wxMBConv object (`work`).
// NOTE(review): the braces and the declaration of `work` are not present in
// this excerpt.
388 class ID_CharSet
: public wxCharacterSet
392 ID_CharSet(const wxChar
*name
,wxMBConv
*cnv
) : wxCharacterSet(name
), work(cnv
) {}
// Both conversions forward to `work` and fail with (size_t)-1 when it is
// NULL.
393 size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
)
394 { return work
? work
->MB2WC(buf
,psz
,n
) : (size_t)-1; }
395 size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
)
396 { return work
? work
->WC2MB(buf
,psz
,n
) : (size_t)-1; }
// Usable exactly when a converter was supplied.
397 bool usable() { return work
!=NULL
; }
// Character set implemented on top of iconv. Two descriptors are kept:
// m2w (charset -> wchar_t) and w2m (wchar_t -> charset); (iconv_t)-1 marks a
// descriptor that has not been opened.
// NOTE(review): numerous lines of this class (member declarations, braces,
// preprocessor conditionals, several statements and returns) are not present
// in this excerpt; the comments describe only what is visible.
401 class IC_CharSet
: public wxCharacterSet
405 IC_CharSet(const wxChar
*name
) : wxCharacterSet(name
), m2w((iconv_t
)-1), w2m((iconv_t
)-1) {}
// Destructor body: close whichever descriptors were opened.
407 if (m2w
!=(iconv_t
)-1) iconv_close(m2w
);
408 if (w2m
!=(iconv_t
)-1) iconv_close(w2m
);
// Open a descriptor on first use. iconv_open takes (tocode, fromcode).
// NOTE(review): a failed iconv_open leaves the member at (iconv_t)-1, so the
// open is retried on every call.
410 void LoadM2W() { if (m2w
==(iconv_t
)-1) m2w
=iconv_open(WC_NAME
,wxConvLibc
.cWX2MB(cname
)); }
411 void LoadW2M() { if (w2m
==(iconv_t
)-1) w2m
=iconv_open(wxConvLibc
.cWX2MB(cname
),WC_NAME
); }
// Multibyte -> wide conversion.
412 size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
) {
// iconv works with byte counts: input length in bytes, output capacity in
// bytes.
414 size_t inbuf
= strlen(psz
);
415 size_t outbuf
= n
*SIZEOF_WCHAR_T
;
// NOTE(review): debugging output written to stderr on every call.
417 fprintf(stderr
,"IC Convert to WC using %s\n",(const char*)wxConvLibc
.cWX2MB(cname
));
419 // have destination buffer, convert there
420 cres
= iconv(m2w
,&psz
,&inbuf
,(char**)&buf
,&outbuf
);
// Characters produced = capacity minus the remaining free space.
421 res
= n
-(outbuf
/SIZEOF_WCHAR_T
);
422 // convert to native endianness
425 // no destination buffer... convert using temp buffer
426 // to calculate destination buffer requirement
// An 8-element scratch buffer is refilled and the produced counts are summed
// for as long as iconv stops with E2BIG (output buffer full).
430 buf
= tbuf
; outbuf
= 8*SIZEOF_WCHAR_T
;
431 cres
= iconv(m2w
,&psz
,&inbuf
,(char**)&buf
,&outbuf
);
432 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
433 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
// Any other iconv failure is reported as (size_t)-1.
435 if (cres
==(size_t)-1) return (size_t)-1;
// Wide -> multibyte conversion.
438 size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
) {
// NOTE(review): inbuf is a count of wchar_t elements here while iconv
// expects a byte count; unless it is scaled by SIZEOF_WCHAR_T on a missing
// line, only part of the input would be converted - confirm.
440 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
441 size_t inbuf
= std::wcslen(psz
);
443 size_t inbuf
= ::wcslen(psz
);
// NOTE(review): debugging output written to stderr on every call.
447 fprintf(stderr
,"IC Convert from WC using %s\n",(const char*)wxConvLibc
.cWX2MB(cname
));
449 // need to copy to temp buffer to switch endianness
450 // this absolutely doesn't rock!
451 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
452 // could be in read-only memory, or be accessed in some other thread)
// NOTE(review): the malloc result is used without a NULL check, and the
// iconv calls below read from psz - presumably psz is pointed at tmpbuf, and
// tmpbuf is freed, on lines not shown here - confirm.
453 wchar_t*tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
454 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
455 WC_BSWAP(tmpbuf
, inbuf
)
459 // have destination buffer, convert there
460 cres
= iconv(w2m
,(const char**)&psz
,&inbuf
,&buf
,&outbuf
);
463 // no destination buffer... convert using temp buffer
464 // to calculate destination buffer requirement
// A 16-byte scratch buffer is refilled for as long as iconv stops with
// E2BIG.
468 buf
= tbuf
; outbuf
= 16;
469 cres
= iconv(w2m
,(const char**)&psz
,&inbuf
,&buf
,&outbuf
);
471 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
// Any other iconv failure is reported as (size_t)-1.
476 if (cres
==(size_t)-1) return (size_t)-1;
// NOTE(review): reports usable unconditionally, without opening either
// descriptor first; an unsupported charset name is therefore only detected
// when a conversion is attempted.
479 bool usable() { return TRUE
; }
// Character set implemented with the Win32 code-page conversion functions;
// the code page comes from CharsetToCodepage(name).
// NOTE(review): the braces and the declaration of CodePage are not present in
// this excerpt.
484 class CP_CharSet
: public wxCharacterSet
488 CP_CharSet(const wxChar
*name
) : wxCharacterSet(name
), CodePage(CharsetToCodepage(name
)) {}
// The input length is passed as -1 (NUL-terminated input). The output size
// is n when a buffer is supplied and 0 otherwise, which asks the API for the
// required size only. A zero result is mapped to (size_t)-1.
// NOTE(review): with an input length of -1 the Win32 functions count the
// terminating NUL in their result - confirm callers expect that.
489 size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
) {
490 size_t len
= MultiByteToWideChar(CodePage
,0,psz
,-1,buf
,buf
?n
:0);
491 return len
?len
:(size_t)-1;
493 size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
) {
494 size_t len
= WideCharToMultiByte(CodePage
,0,psz
,-1,buf
,buf
?n
:0,NULL
,NULL
);
495 return len
?len
:(size_t)-1;
// Usable unless the code page is -1.
497 bool usable() { return CodePage
!=-1; }
// Character set implemented with wxEncodingConverter: m2w converts from the
// charset's encoding to wxFONTENCODING_UNICODE and w2m the other way.
// NOTE(review): the braces, the declaration of enc, the return statements of
// both conversions and some preprocessor lines are not present in this
// excerpt.
501 class EC_CharSet
: public wxCharacterSet
504 // temporarily just use wxEncodingConverter stuff,
505 // so that it works while a better implementation is built
507 wxEncodingConverter m2w
, w2m
;
// enc starts as wxFONTENCODING_SYSTEM and is looked up through the font
// mapper only when a name is given; both converters are then initialised.
508 EC_CharSet(const wxChar
*name
) : wxCharacterSet(name
), enc(wxFONTENCODING_SYSTEM
)
510 if (name
) enc
= wxTheFontMapper
->CharsetToEncoding(name
, FALSE
);
511 m2w
.Init(enc
, wxFONTENCODING_UNICODE
);
512 w2m
.Init(wxFONTENCODING_UNICODE
, enc
);
// Multibyte -> wide: converts only when a destination buffer is given.
// NOTE(review): n is not used by any visible line, so the output is not
// bounded here; the fprintf is debugging output on every call.
514 size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
) {
515 size_t inbuf
= strlen(psz
);
516 fprintf(stderr
,"EC Convert to WC using %d\n",enc
);
517 if (buf
) m2w
.Convert(psz
,buf
);
// Wide -> multibyte: converts only when a destination buffer is given.
// NOTE(review): n is not used by any visible line, so the output is not
// bounded here; the fprintf is debugging output on every call.
520 size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
) {
521 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
522 size_t inbuf
= std::wcslen(psz
);
524 size_t inbuf
= ::wcslen(psz
);
526 fprintf(stderr
,"EC Convert from WC using %d\n",enc
);
527 if (buf
) w2m
.Convert(psz
,buf
);
// Usable when the lookup produced a concrete encoding.
530 bool usable() { return (enc
!=wxFONTENCODING_SYSTEM
) && (enc
!=wxFONTENCODING_DEFAULT
); }
// Creates a wxCharacterSet for `name` by trying the back ends in the order
// they appear below and returning the first one that reports usable().
// The caller owns the returned object (wxCSConv deletes it in its
// destructor).
// NOTE(review): braces, preprocessor conditionals selecting the back ends,
// the NULL check that presumably guards the name comparisons, and the final
// return are not present in this excerpt.
533 static wxCharacterSet
*wxGetCharacterSet(const wxChar
*name
)
535 wxCharacterSet
*cset
= NULL
;
// "UTF8" / "UTF-8" (case-insensitive) map straight onto wxConvUTF8.
537 if (!wxStricmp(name
, wxT("UTF8")) || !wxStricmp(name
, wxT("UTF-8"))) {
538 cset
= new ID_CharSet(name
, &wxConvUTF8
);
541 cset
= new IC_CharSet(name
); // may not take NULL
// A candidate that is not usable is discarded before the next one is tried.
545 if (cset
&& cset
->usable()) return cset
;
546 if (cset
) delete cset
;
548 cset
= new CP_CharSet(name
); // may take NULL
549 if (cset
->usable()) return cset
;
551 if (cset
) delete cset
;
552 cset
= new EC_CharSet(name
);
553 if (cset
->usable()) return cset
;
// Constructor: starts with no name and no character-set object.
// NOTE(review): the remaining lines of the body (presumably storing `charset`
// and marking the object for deferred loading) are not present in this
// excerpt.
558 wxCSConv::wxCSConv(const wxChar
*charset
)
560 m_name
= (wxChar
*) NULL
;
561 m_cset
= (wxCharacterSet
*) NULL
;
// Destructor: releases the name with free() (SetName allocates it with
// wxStrdup) and deletes the character-set object. Both are skipped when
// NULL.
566 wxCSConv::~wxCSConv()
568 if (m_name
) free(m_name
);
569 if (m_cset
) delete m_cset
;
// Stores a private copy of `charset` in m_name.
// NOTE(review): no visible line frees a previous m_name before it is
// overwritten, and the guard against a NULL charset is not shown - confirm
// against the full source.
572 void wxCSConv::SetName(const wxChar
*charset
)
575 m_name
= wxStrdup(charset
);
// Resolves the character set: determines a charset name from the environment
// where needed and creates m_cset for m_name via wxGetCharacterSet.
// NOTE(review): the conditions deciding when this work runs (for example a
// "deferred" flag or a check that m_name is still unset), the
// #else/#endif lines and all braces are not present in this excerpt.
580 void wxCSConv::LoadNow()
582 // wxPrintf(wxT("Conversion request\n"));
586 #if defined(HAVE_LANGINFO_H) && defined(CODESET)
587 // GNU libc provides current character set this way
588 char*alang
= nl_langinfo(CODESET
);
589 if (alang
) SetName(wxConvLibc
.cMB2WX(alang
));
592 // if we can't get at the character set directly,
593 // try to see if it's in the environment variables
594 // (in most cases this won't work, but I was out of ideas)
// Precedence: LC_ALL, then LC_CTYPE, then LANG. The charset is taken to be
// everything after the first '.' of the value.
// NOTE(review): locale values may carry an "@modifier" suffix after the
// codeset (e.g. "de_DE.ISO-8859-15@euro"), which would end up in the name -
// confirm whether that needs stripping.
596 wxChar
*lang
= wxGetenv(wxT("LC_ALL"));
597 if (!lang
) lang
= wxGetenv(wxT("LC_CTYPE"));
598 if (!lang
) lang
= wxGetenv(wxT("LANG"));
599 wxChar
*dot
= lang
? wxStrchr(lang
, wxT('.')) : (wxChar
*)NULL
;
600 if (dot
) SetName(dot
+1);
604 m_cset
= wxGetCharacterSet(m_name
);
// Multibyte -> wide conversion through the selected character set, with a
// direct byte-to-wchar_t fallback.
// NOTE(review): the condition separating the delegation from the fallback
// (presumably a check that m_cset is non-NULL), the guard on buf, the return
// statement and all braces are not present in this excerpt.
609 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// LoadNow() is non-const, hence the cast on `this`.
611 ((wxCSConv
*)this)->LoadNow(); // discard constness
613 return m_cset
->MB2WC(buf
, psz
, n
);
// Fallback: widen each byte (as unsigned char) to wchar_t. The loop runs to
// c == len inclusive, so the terminating NUL is copied too.
// NOTE(review): n is never consulted in this loop, so len+1 elements are
// written whatever the capacity of buf - confirm callers size buf
// accordingly.
616 size_t len
=strlen(psz
);
618 for (size_t c
=0; c
<=len
; c
++)
619 buf
[c
] = (unsigned char)(psz
[c
]);
// Wide -> multibyte conversion through the selected character set, with a
// direct wchar_t-to-byte fallback.
// NOTE(review): the condition separating the delegation from the fallback
// (presumably a check that m_cset is non-NULL), the guard on buf, the
// #else/#endif lines, the return statement and all braces are not present in
// this excerpt.
624 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
// LoadNow() is non-const, hence the cast on `this`.
626 ((wxCSConv
*)this)->LoadNow(); // discard constness
628 return m_cset
->WC2MB(buf
, psz
, n
);
631 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
632 size_t len
=std::wcslen(psz
);
634 size_t len
=::wcslen(psz
);
// Fallback: characters above 0xff cannot be represented in one byte and
// become '?'. The loop runs to c == len inclusive, so the terminating NUL is
// copied too.
// NOTE(review): n is never consulted in this loop, so len+1 bytes are
// written whatever the capacity of buf - confirm callers size buf
// accordingly.
637 for (size_t c
=0; c
<=len
; c
++)
638 buf
[c
] = (psz
[c
]>0xff) ? '?' : psz
[c
];
// Converts text directly between two iconv-backed character sets.
// NOTE(review): the braces, the declaration of cnv, the definition of outbuf
// and the success return of Convert are not present in this excerpt.
644 class IC_CharSetConverter
// iconv_open takes (tocode, fromcode), hence `to` before `from`.
648 IC_CharSetConverter(IC_CharSet
*from
,IC_CharSet
*to
) {
649 cnv
=iconv_open(wxConvLibc
.cWX2MB(to
->cname
),wxConvLibc
.cWX2MB(from
->cname
));
// Close the descriptor only if it was opened successfully.
651 ~IC_CharSetConverter() {
652 if (cnv
!=(iconv_t
)-1) iconv_close(cnv
);
// Converts the NUL-terminated string psz into buf; an iconv failure is
// reported as (size_t)-1.
654 size_t Convert(char*buf
, const char*psz
, size_t n
) {
655 size_t inbuf
= strlen(psz
);
657 size_t res
= iconv(cnv
,&psz
,&inbuf
,&buf
,&outbuf
);
658 if (res
==(size_t)-1) return (size_t)-1;
// Converts text directly between two wxEncodingConverter-backed character
// sets.
// NOTE(review): the braces and the return statement of Convert are not
// present in this excerpt.
664 class EC_CharSetConverter
667 wxEncodingConverter cnv
;
// The converter is initialised from the source encoding to the target one.
668 EC_CharSetConverter(EC_CharSet
*from
,EC_CharSet
*to
) {
669 cnv
.Init(from
->enc
,to
->enc
);
// Converts only when a destination buffer is given.
// NOTE(review): n is not used by any visible line, so the output is not
// bounded here.
671 size_t Convert(char*buf
, const char*psz
, size_t n
) {
672 size_t inbuf
= strlen(psz
);
673 if (buf
) cnv
.Convert(psz
,buf
);
678 #else // !wxUSE_WCHAR_T
680 // ----------------------------------------------------------------------------
681 // stand-ins in absence of wchar_t
682 // ----------------------------------------------------------------------------
// Build without wchar_t support (the !wxUSE_WCHAR_T branch): only the two
// global converter objects are defined.
684 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
, wxConvFile
;
686 #endif // wxUSE_WCHAR_T