]>
git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
ab68a01e14520725a5a03974b9f880dff2876e7a
   1 ///////////////////////////////////////////////////////////////////////////// 
   3 // Purpose:     Unicode conversion classes 
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin 
   8 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vadim Zeitlin 
   9 // Licence:     wxWindows license 
  10 ///////////////////////////////////////////////////////////////////////////// 
  12 // ============================================================================ 
  14 // ============================================================================ 
  16 // ---------------------------------------------------------------------------- 
  18 // ---------------------------------------------------------------------------- 
  21   #pragma implementation "strconv.h" 
  24 // For compilers that support precompilation, includes "wx.h". 
  25 #include "wx/wxprec.h" 
  32   #include "wx/msw/private.h" 
  47 #ifdef HAVE_LANGINFO_H 
  52 #include "wx/strconv.h" 
  54 #ifdef WORDS_BIGENDIAN 
  55 #define BSWAP_UCS4(str, len) 
  56 #define BSWAP_UCS2(str, len) 
  58 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); } 
  59 #define BSWAP_UCS2(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); } 
  62 #define BSWAP_UTF32(str, len) BSWAP_UCS4(str, len) 
  63 #define BSWAP_UTF16(str, len) BSWAP_UCS2(str, len) 
  65 #if SIZEOF_WCHAR_T == 4 
  66 #define WC_NAME "UCS4" 
  67 #define WC_BSWAP BSWAP_UCS4 
  68 #elif SIZEOF_WCHAR_T == 2 
  69 #define WC_NAME "UTF16" 
  70 #define WC_BSWAP BSWAP_UTF16 
  74 // ---------------------------------------------------------------------------- 
  76 // ---------------------------------------------------------------------------- 
  // Exported global pointer to a wxMBConv; it starts out pointing at wxConvLibc.
  // NOTE(review): presumably the converter the library uses by default --
  // confirm against the declaration in wx/strconv.h.
  78 WXDLLEXPORT_DATA(wxMBConv 
*) wxConvCurrent 
= &wxConvLibc
; 
  80 // ============================================================================ 
  82 // ============================================================================ 
  // Encodes the code point `input` as UTF-16 into `output`.
  // The visible statements store `input` unchanged as one 16-bit unit (that
  // write is skipped when `output` is NULL), test for values >= 0x110000, and
  // write a surrogate pair: (input >> 10) + 0xd7c0 followed by
  // (input & 0x3ff) + 0xdc00.
  // NOTE(review): the condition selecting the single-unit case, the guard
  // around the surrogate writes and every return statement are not visible
  // in this listing; the UTF-8 decoder compares the result with (size_t)-1.
  86 static size_t encode_utf16(wxUint32 input
,wxUint16
*output
) 
  89     if (output
) *output
++ = input
; 
  92   if (input
>=0x110000) { 
  96       *output
++ = (input 
>> 10)+0xd7c0; 
  97       *output
++ = (input
&0x3ff)+0xdc00; 
 103 static size_t decode_utf16(wxUint16
*input
,wxUint32
&output
) 
 105   if ((*input
<0xd800) || (*input
>0xdfff)) { 
 109   if ((input
[1]<0xdc00) || (input
[1]>=0xdfff)) { 
 113     output 
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00); 
 118 // ---------------------------------------------------------------------------- 
 120 // ---------------------------------------------------------------------------- 
 // Exported global instance of the base wxMBConv class; its MB2WC/WC2MB
 // members forward to wxMB2WC/wxWC2MB.
 122 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
; 
 124 size_t wxMBConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
 126     return wxMB2WC(buf
, psz
, n
); 
 129 size_t wxMBConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
 131     return wxWC2MB(buf
, psz
, n
); 
 134 const wxWCharBuffer 
wxMBConv::cMB2WC(const char *psz
) const 
 138         size_t nLen 
= MB2WC((wchar_t *) NULL
, psz
, 0); 
 139         if (nLen 
== (size_t)-1) 
 140             return wxWCharBuffer((wchar_t *) NULL
); 
 141         wxWCharBuffer 
buf(nLen
); 
 142         MB2WC((wchar_t *)(const wchar_t *) buf
, psz
, nLen
); 
 146         return wxWCharBuffer((wchar_t *) NULL
); 
 149 const wxCharBuffer 
wxMBConv::cWC2MB(const wchar_t *psz
) const 
 153         size_t nLen 
= WC2MB((char *) NULL
, psz
, 0); 
 154         if (nLen 
== (size_t)-1) 
 155             return wxCharBuffer((char *) NULL
); 
 156         wxCharBuffer 
buf(nLen
); 
 157         WC2MB((char *)(const char *) buf
, psz
, nLen
); 
 161         return wxCharBuffer((char *) NULL
); 
 164 // ---------------------------------------------------------------------------- 
 165 // standard file conversion 
 166 // ---------------------------------------------------------------------------- 
 // Exported global instance of wxMBConvFile, whose conversions currently
 // forward to wxMB2WC/wxWC2MB.
 168 WXDLLEXPORT_DATA(wxMBConvFile
) wxConvFile
; 
 170 // just use the libc conversion for now 
 171 size_t wxMBConvFile::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
 173     return wxMB2WC(buf
, psz
, n
); 
 176 size_t wxMBConvFile::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
 178     return wxWC2MB(buf
, psz
, n
); 
 181 // ---------------------------------------------------------------------------- 
 182 // standard gdk conversion 
 183 // ---------------------------------------------------------------------------- 
 // Exported global instance of wxMBConvGdk, the converter built on
 // gdk_mbstowcs/gdk_wcstombs.
 187 WXDLLEXPORT_DATA(wxMBConvGdk
) wxConvGdk
; 
 // Converts the multibyte string psz to wide characters with gdk_mbstowcs.
 // One visible path converts straight into buf (viewed as GdkWChar*) with
 // capacity n. The other allocates a temporary GdkWChar array of strlen(psz)
 // elements -- overwriting n with that length -- and converts into it.
 // NOTE(review): the condition choosing between the two paths (presumably
 // whether buf is NULL), the release of nbuf and the return of len are not
 // visible in this listing; confirm nbuf is freed with delete[].
 191 size_t wxMBConvGdk::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
 194     return gdk_mbstowcs((GdkWChar 
*)buf
, psz
, n
); 
 196     GdkWChar 
*nbuf 
= new GdkWChar
[n
=strlen(psz
)]; 
 197     size_t len 
= gdk_mbstowcs(nbuf
, psz
, n
); 
 203 size_t wxMBConvGdk::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
 205   char *mbstr 
= gdk_wcstombs((GdkWChar 
*)psz
); 
 206   size_t len 
= mbstr 
? strlen(mbstr
) : 0; 
 208     if (len 
> n
) len 
= n
; 
 209     memcpy(buf
, psz
, len
); 
 210     if (len 
< n
) buf
[len
] = 0; 
 217 // ---------------------------------------------------------------------------- 
 219 // ---------------------------------------------------------------------------- 
 // Exported global instance of wxMBConvUTF7 (its conversions are still
 // marked TODO in this file).
 221 WXDLLEXPORT_DATA(wxMBConvUTF7
) wxConvUTF7
; 
 // Character tables for UTF-7.
 // NOTE(review): the names presumably follow RFC 2152 (Set D = directly
 // encoded characters, Set O = optional direct characters, Set B = base64
 // alphabet) -- confirm. The utf7_setB initializer continues past the lines
 // visible in this listing.
 224 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ" 
 225                         "abcdefghijklmnopqrstuvwxyz" 
 226                         "0123456789'(),-./:?"; 
 227 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}"; 
 228 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ" 
 229                         "abcdefghijklmnopqrstuvwxyz" 
 233 // TODO: write actual implementations of UTF-7 here 
 // UTF-7 -> wide conversion placeholder: all three parameters are wrapped in
 // WXUNUSED, so none of them is read. (The body is not visible in this
 // listing.)
 234 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
), 
 235                            const char * WXUNUSED(psz
), 
 236                            size_t WXUNUSED(n
)) const 
 // Wide -> UTF-7 conversion placeholder: all three parameters are wrapped in
 // WXUNUSED, so none of them is read. (The body is not visible in this
 // listing.)
 241 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
), 
 242                            const wchar_t * WXUNUSED(psz
), 
 243                            size_t WXUNUSED(n
)) const 
 248 // ---------------------------------------------------------------------------- 
 250 // ---------------------------------------------------------------------------- 
 // Exported global instance of wxMBConvUTF8; wxGetCharacterSet wraps it for
 // the names "UTF8" and "UTF-8".
 252 WXDLLEXPORT_DATA(wxMBConvUTF8
) wxConvUTF8
; 
 // Upper bounds used by the UTF-8 code: the encoder advances an index while
 // the code point exceeds utf8_max[index], and the decoder rejects results
 // that do not exceed utf8_max[ocnt]. Entries 0..5 are the largest values
 // representable in 1..6 UTF-8 bytes; the last entry (0xffffffff) ends the
 // encoder's search.
 254 static wxUint32 utf8_max
[]={0x7f,0x7ff,0xffff,0x1fffff,0x3ffffff,0x7fffffff,0xffffffff}; 
 // Decodes the UTF-8 string psz into wide characters.
 // The loop runs while input remains and, when buf is given, while fewer
 // than n output units have been counted in len. For each lead byte the
 // number of leading 1 bits is counted into cnt; the payload bits of the
 // lead byte are then merged with 6 bits from each following byte, which
 // must have the form 10xxxxxx. A result that does not exceed
 // utf8_max[ocnt] is treated as an illegal encoding. One visible path hands
 // the code point to encode_utf16 and checks for (size_t)-1. After the loop
 // a terminating 0 is stored if buf is given and len < n.
 // NOTE(review): the declarations of len, cnt and ocnt, the error returns
 // and the final return value are not visible in this listing.
 256 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
 260   while (*psz 
&& ((!buf
) || (len
<n
))) { 
 261     unsigned char cc
=*psz
++, fc
=cc
; 
 263     for (cnt
=0; fc
&0x80; cnt
++) fc
<<=1; 
 271         // invalid UTF-8 sequence 
 275         wxUint32 res
=cc
&(0x3f>>cnt
); 
 278           if ((cc
&0xC0)!=0x80) { 
 279             // invalid UTF-8 sequence 
 282           res
=(res
<<6)|(cc
&0x3f); 
 284         if (res
<=utf8_max
[ocnt
]) { 
 285           // illegal UTF-8 encoding 
 289         size_t pa 
= encode_utf16(res
, buf
); 
 290         if (pa 
== (size_t)-1) 
 301   if (buf 
&& (len
<n
)) *buf 
= 0; 
 // Encodes the wide string psz as UTF-8.
 // The loop runs while input remains and, when buf is given, while len < n.
 // Two ways of fetching the next code point cc are visible: through
 // decode_utf16 (advancing psz by its result, or by 1 on (size_t)-1) and by
 // reading *psz masked with 0x7fffffff; presumably one of them is selected
 // at compile time by the size of wchar_t. cnt is then raised until cc fits
 // under utf8_max[cnt]; the lead byte is (-128>>cnt) combined with the top
 // payload bits, and each further byte is 0x80 plus the next 6 bits. After
 // the loop a terminating 0 is stored if buf is given and len < n.
 // NOTE(review): the declarations, the single-byte case, the updates of len
 // and the return value are not visible in this listing.
 305 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
 309   while (*psz 
&& ((!buf
) || (len
<n
))) { 
 312     size_t pa 
= decode_utf16(psz
,cc
); 
 313     psz 
+= (pa 
== (size_t)-1) ? 1 : pa
; 
 315     cc
=(*psz
++)&0x7fffffff; 
 318     for (cnt
=0; cc
>utf8_max
[cnt
]; cnt
++); 
 326         *buf
++=(-128>>cnt
)|((cc
>>(cnt
*6))&(0x3f>>cnt
)); 
 328           *buf
++=0x80|((cc
>>(cnt
*6))&0x3f); 
 332   if (buf 
&& (len
<n
)) *buf 
= 0; 
 336 // ---------------------------------------------------------------------------- 
 337 // specified character set 
 338 // ---------------------------------------------------------------------------- 
 // Exported global wxCSConv constructed with a NULL character set name.
 // NOTE(review): presumably this makes it resolve the user's locale charset
 // lazily in LoadNow() -- confirm against the constructor body.
 340 WXDLLEXPORT_DATA(wxCSConv
) wxConvLocal((const wxChar 
*)NULL
); 
 342 #include "wx/encconv.h" 
 343 #include "wx/fontmap.h" 
 345 // TODO: add some tables here 
 346 // - perhaps common encodings to common codepages (for Win32) 
 347 // - perhaps common encodings to objects ("UTF8" -> wxConvUTF8) 
 348 // - move wxEncodingConverter meat in here 
 351 #include "wx/msw/registry.h" 
 352 // this should work if Microsoft Internet Explorer is installed 
 // Maps a character set name to a Windows code page number.
 // A NULL name yields the result of GetACP(). Otherwise the visible code
 // opens a registry key under HKCR built from "MIME\Database\Charset\",
 // reads its "InternetEncoding" value into CP and stops on success; failing
 // that it reads "AliasForCharset" into cn and stops if that is absent too.
 // NOTE(review): the loop around these lookups, the way the charset name is
 // appended to the path and the return statement are not visible in this
 // listing; CP_CharSet::usable() treats -1 as "not found".
 353 static long CharsetToCodepage(const wxChar 
*name
) 
 355   if (!name
) return GetACP(); 
 359     wxString 
path(wxT("MIME\\Database\\Charset\\")); 
 361     wxRegKey 
key(wxRegKey::HKCR
,path
); 
 363     /* two cases: either there's an AliasForCharset string, 
 364      * or there are Codepage and InternetEncoding dwords. 
 365      * The InternetEncoding gives us the actual encoding, 
 366      * the Codepage just says which Windows character set to 
 367      * use when displaying the data. 
 369     if (key
.QueryValue(wxT("InternetEncoding"),&CP
)) break; 
 370     // no encoding, see if it's an alias 
 371     if (!key
.QueryValue(wxT("AliasForCharset"),cn
)) break; 
 // Members of the wxCharacterSet base class (its "class" line is not
 // visible in this listing). It stores the character set name in cname and
 // supplies default conversions that always fail with (size_t)-1, plus
 // usable(), which reports FALSE; derived classes replace all three.
 381   wxCharacterSet(const wxChar
*name
) : cname(name
) {} 
 382   virtual ~wxCharacterSet() {} 
 383   virtual size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
) { return (size_t)-1; } 
 384   virtual size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
) { return (size_t)-1; } 
 385   virtual bool usable() { return FALSE
; } 
 388 class ID_CharSet 
: public wxCharacterSet
 
 392   ID_CharSet(const wxChar
*name
,wxMBConv
*cnv
) : wxCharacterSet(name
), work(cnv
) {} 
 393   size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
) 
 394   { return work 
? work
->MB2WC(buf
,psz
,n
) : (size_t)-1; } 
 395   size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
) 
 396   { return work 
? work
->WC2MB(buf
,psz
,n
) : (size_t)-1; } 
 397   bool usable() { return work
!=NULL
; } 
 // Character set implemented with iconv.
 // Two conversion descriptors are kept, both starting as (iconv_t)-1:
 //   m2w  opened by LoadM2W() from the set's name (cname) to WC_NAME
 //   w2m  opened by LoadW2M() from WC_NAME to cname
 // Each is opened only while it is still (iconv_t)-1, and the visible
 // cleanup code closes whichever one has been opened.
 // MB2WC and WC2MB either convert straight into the caller's buffer or, when
 // no buffer is given, repeatedly convert into a small temporary buffer
 // (8 wide characters / 16 bytes) while iconv reports E2BIG, in order to
 // work out the required size. Both return (size_t)-1 when iconv fails.
 // NOTE(review): several points to confirm against the full source, since
 // many lines are missing from this listing:
 //  - the result of iconv_open is never checked in the visible code and
 //    usable() returns TRUE unconditionally;
 //  - both conversions print a debug line to stderr on every call;
 //  - WC2MB builds a byte-swapped copy in tmpbuf, but the visible iconv
 //    calls read from psz; inbuf is a character count from wcslen, whereas
 //    iconv counts bytes;
 //  - the release of tmpbuf is not visible.
 401 class IC_CharSet 
: public wxCharacterSet
 
 405   IC_CharSet(const wxChar
*name
) : wxCharacterSet(name
), m2w((iconv_t
)-1), w2m((iconv_t
)-1) {} 
 407     if (m2w
!=(iconv_t
)-1) iconv_close(m2w
); 
 408     if (w2m
!=(iconv_t
)-1) iconv_close(w2m
); 
 410   void LoadM2W() { if (m2w
==(iconv_t
)-1) m2w
=iconv_open(WC_NAME
,wxConvLibc
.cWX2MB(cname
)); } 
 411   void LoadW2M() { if (w2m
==(iconv_t
)-1) w2m
=iconv_open(wxConvLibc
.cWX2MB(cname
),WC_NAME
); } 
 412   size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
) { 
 414     size_t inbuf 
= strlen(psz
); 
 415     size_t outbuf 
= n
*SIZEOF_WCHAR_T
; 
 417     fprintf(stderr
,"IC Convert to WC using %s\n",(const char*)wxConvLibc
.cWX2MB(cname
)); 
 419       // have destination buffer, convert there 
 420       cres 
= iconv(m2w
,&psz
,&inbuf
,(char**)&buf
,&outbuf
); 
 421       res 
= n
-(outbuf
/SIZEOF_WCHAR_T
); 
 422       // convert to native endianness 
 425       // no destination buffer... convert using temp buffer 
 426       // to calculate destination buffer requirement 
 430         buf 
= tbuf
; outbuf 
= 8*SIZEOF_WCHAR_T
; 
 431         cres 
= iconv(m2w
,&psz
,&inbuf
,(char**)&buf
,&outbuf
); 
 432         res 
+= 8-(outbuf
/SIZEOF_WCHAR_T
); 
 433       } while ((cres
==(size_t)-1) && (errno
==E2BIG
)); 
 435     if (cres
==(size_t)-1) return (size_t)-1; 
 438   size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
) { 
 440 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530) 
 441     size_t inbuf 
= std::wcslen(psz
); 
 443     size_t inbuf 
= ::wcslen(psz
); 
 447     fprintf(stderr
,"IC Convert from WC using %s\n",(const char*)wxConvLibc
.cWX2MB(cname
)); 
 449     // need to copy to temp buffer to switch endianness 
 450     // this absolutely doesn't rock! 
 451     // (no, doing WC_BSWAP twice on the original buffer won't help, as it 
 452     //  could be in read-only memory, or be accessed in some other thread) 
 453     wchar_t*tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
); 
 454     memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
); 
 455     WC_BSWAP(tmpbuf
, inbuf
) 
 459       // have destination buffer, convert there 
 460       cres 
= iconv(w2m
,(const char**)&psz
,&inbuf
,&buf
,&outbuf
); 
 463       // no destination buffer... convert using temp buffer 
 464       // to calculate destination buffer requirement 
 468         buf 
= tbuf
; outbuf 
= 16; 
 469         cres 
= iconv(w2m
,(const char**)&psz
,&inbuf
,&buf
,&outbuf
); 
 471       } while ((cres
==(size_t)-1) && (errno
==E2BIG
)); 
 476     if (cres
==(size_t)-1) return (size_t)-1; 
 479   bool usable() { return TRUE
; } 
 // Character set implemented with the Win32 code page API.
 // The constructor resolves the name to a code page via CharsetToCodepage.
 // MB2WC/WC2MB call MultiByteToWideChar/WideCharToMultiByte with an input
 // length of -1 and an output size of n, or 0 when buf is NULL; a zero
 // result is reported as (size_t)-1. usable() is true unless CodePage is -1.
 // NOTE(review): with an input length of -1 the Win32 functions process the
 // terminating NUL as well, so the returned count presumably includes it --
 // confirm that callers expect that.
 484 class CP_CharSet 
: public wxCharacterSet
 
 488   CP_CharSet(const wxChar
*name
) : wxCharacterSet(name
), CodePage(CharsetToCodepage(name
)) {} 
 489   size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
) { 
 490     size_t len 
= MultiByteToWideChar(CodePage
,0,psz
,-1,buf
,buf
?n
:0); 
 491     return len
?len
:(size_t)-1; 
 493   size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
) { 
 494     size_t len 
= WideCharToMultiByte(CodePage
,0,psz
,-1,buf
,buf
?n
:0,NULL
,NULL
); 
 495     return len
?len
:(size_t)-1; 
 497   bool usable() { return CodePage
!=-1; } 
 // Character set implemented with wxEncodingConverter.
 // enc starts as wxFONTENCODING_SYSTEM and, when a name is given, is looked
 // up with wxTheFontMapper->CharsetToEncoding(name, FALSE). Two converters
 // are initialised: m2w (enc -> wxFONTENCODING_UNICODE) and w2m (the
 // reverse). Each conversion measures its input (strlen / wcslen), prints a
 // debug line to stderr and, when buf is given, calls Convert(psz, buf).
 // usable() is true when enc is neither wxFONTENCODING_SYSTEM nor
 // wxFONTENCODING_DEFAULT.
 // NOTE(review): the return statements are not visible in this listing, and
 // the visible Convert calls do not take n into account -- confirm that the
 // destination is always large enough.
 501 class EC_CharSet 
: public wxCharacterSet
 
 504   // temporarily just use wxEncodingConverter stuff, 
 505   // so that it works while a better implementation is built 
 507   wxEncodingConverter m2w
, w2m
; 
 508   EC_CharSet(const wxChar
*name
) : wxCharacterSet(name
), enc(wxFONTENCODING_SYSTEM
) 
 510     if (name
) enc 
= wxTheFontMapper
->CharsetToEncoding(name
, FALSE
); 
 511     m2w
.Init(enc
, wxFONTENCODING_UNICODE
); 
 512     w2m
.Init(wxFONTENCODING_UNICODE
, enc
); 
 514   size_t MB2WC(wchar_t*buf
, const char*psz
, size_t n
) { 
 515     size_t inbuf 
= strlen(psz
); 
 516     fprintf(stderr
,"EC Convert to WC using %d\n",enc
); 
 517     if (buf
) m2w
.Convert(psz
,buf
); 
 520   size_t WC2MB(char*buf
, const wchar_t*psz
, size_t n
) { 
 521 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530) 
 522     size_t inbuf 
= std::wcslen(psz
); 
 524     size_t inbuf 
= ::wcslen(psz
); 
 526     fprintf(stderr
,"EC Convert from WC using %d\n",enc
); 
 527     if (buf
) w2m
.Convert(psz
,buf
); 
 530   bool usable() { return (enc
!=wxFONTENCODING_SYSTEM
) && (enc
!=wxFONTENCODING_DEFAULT
); } 
 // Creates a character set object for the given name, trying the available
 // implementations in turn and returning the first one whose usable() is
 // true: ID_CharSet around wxConvUTF8 for "UTF8"/"UTF-8" (compared case-
 // insensitively), otherwise IC_CharSet; then CP_CharSet; then EC_CharSet.
 // A candidate that is not usable is deleted before the next one is tried.
 // The caller owns the returned object (wxCSConv deletes it in its
 // destructor).
 // NOTE(review): the conditions (and any #if guards) selecting which
 // implementations are compiled in, and the final return after the last
 // candidate fails, are not visible in this listing.
 533 static wxCharacterSet 
*wxGetCharacterSet(const wxChar 
*name
) 
 535   wxCharacterSet 
*cset 
= NULL
; 
 537     if (!wxStricmp(name
, wxT("UTF8")) || !wxStricmp(name
, wxT("UTF-8"))) { 
 538       cset 
= new ID_CharSet(name
, &wxConvUTF8
); 
 541       cset 
= new IC_CharSet(name
); // may not take NULL 
 545   if (cset 
&& cset
->usable()) return cset
; 
 546   if (cset
) delete cset
; 
 548   cset 
= new CP_CharSet(name
); // may take NULL 
 549   if (cset
->usable()) return cset
; 
 551   if (cset
) delete cset
; 
 552   cset 
= new EC_CharSet(name
); 
 553   if (cset
->usable()) return cset
; 
 // Constructs a converter for the named character set. The visible
 // statements reset m_name and m_cset to NULL.
 // NOTE(review): the statements that use `charset` are not visible in this
 // listing (presumably the name is stored via SetName) -- confirm.
 558 wxCSConv::wxCSConv(const wxChar 
*charset
) 
 560   m_name 
= (wxChar 
*) NULL
; 
 561   m_cset 
= (wxCharacterSet 
*) NULL
; 
 566 wxCSConv::~wxCSConv() 
 568   if (m_name
) free(m_name
); 
 569   if (m_cset
) delete m_cset
; 
 // Stores a copy of the character set name: m_name receives the result of
 // wxStrdup(charset); the destructor releases it with free().
 // NOTE(review): any guard on charset and any other state updated here are
 // not visible in this listing; a previously stored name is not released in
 // the visible code.
 572 void wxCSConv::SetName(const wxChar 
*charset
) 
 575     m_name 
= wxStrdup(charset
); 
 // Resolves the character set object for this converter.
 // Where HAVE_LANGINFO_H and CODESET are available the name is taken from
 // nl_langinfo(CODESET). The other visible path looks at the environment
 // variables LC_ALL, LC_CTYPE and LANG in that order and uses the text after
 // the first '.' as the name. Finally m_cset is set from
 // wxGetCharacterSet(m_name).
 // NOTE(review): the conditions under which the name lookup runs
 // (presumably only when no name was set and only once) are not visible in
 // this listing.
 580 void wxCSConv::LoadNow() 
 582 //  wxPrintf(wxT("Conversion request\n")); 
 586 #if defined(HAVE_LANGINFO_H) && defined(CODESET) 
 587       // GNU libc provides current character set this way 
 588       char*alang 
= nl_langinfo(CODESET
); 
 589       if (alang
) SetName(wxConvLibc
.cMB2WX(alang
)); 
 592       // if we can't get at the character set directly, 
 593       // try to see if it's in the environment variables 
 594       // (in most cases this won't work, but I was out of ideas) 
 596         wxChar 
*lang 
= wxGetenv(wxT("LC_ALL")); 
 597         if (!lang
) lang 
= wxGetenv(wxT("LC_CTYPE")); 
 598         if (!lang
) lang 
= wxGetenv(wxT("LANG")); 
 599         wxChar 
*dot 
= lang 
? wxStrchr(lang
, wxT('.')) : (wxChar 
*)NULL
; 
 600         if (dot
) SetName(dot
+1); 
 604     m_cset 
= wxGetCharacterSet(m_name
); 
 // Multibyte -> wide conversion through the resolved character set.
 // LoadNow() is called first (casting away constness); one visible path
 // returns the result of m_cset->MB2WC. The fallback widens each byte of
 // psz, treated as unsigned char, into buf, including the terminating NUL
 // (the loop runs to c <= len).
 // NOTE(review): the conditions guarding both paths and the return value of
 // the fallback are not visible in this listing; the visible copy loop does
 // not take n into account.
 609 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const 
 611   ((wxCSConv 
*)this)->LoadNow(); // discard constness 
 613     return m_cset
->MB2WC(buf
, psz
, n
); 
 616   size_t len
=strlen(psz
); 
 618     for (size_t c
=0; c
<=len
; c
++) 
 619       buf
[c
] = (unsigned char)(psz
[c
]); 
 // Wide -> multibyte conversion through the resolved character set.
 // LoadNow() is called first (casting away constness); one visible path
 // returns the result of m_cset->WC2MB. The fallback copies each wide
 // character into buf, substituting '?' for values above 0xff, including
 // the terminating NUL (the loop runs to c <= len).
 // NOTE(review): the conditions guarding both paths and the return value of
 // the fallback are not visible in this listing; the visible copy loop does
 // not take n into account.
 624 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const 
 626   ((wxCSConv 
*)this)->LoadNow(); // discard constness 
 628     return m_cset
->WC2MB(buf
, psz
, n
); 
 631 #if defined(__BORLANDC__) && (__BORLANDC__ > 0x530) 
 632   size_t len
=std::wcslen(psz
); 
 634   size_t len
=::wcslen(psz
); 
 637     for (size_t c
=0; c
<=len
; c
++) 
 638       buf
[c
] = (psz
[c
]>0xff) ? '?' : psz
[c
]; 
 // Direct multibyte-to-multibyte converter between two iconv character
 // sets. The constructor opens an iconv descriptor converting from
 // from->cname to to->cname; the destructor closes it unless it is
 // (iconv_t)-1. Convert() runs iconv over the whole of psz and returns
 // (size_t)-1 on failure.
 // NOTE(review): the declaration and initial value of outbuf and the
 // success return value are not visible in this listing; the result of
 // iconv_open is not checked before use in the visible code.
 644 class IC_CharSetConverter
 
 648   IC_CharSetConverter(IC_CharSet
*from
,IC_CharSet
*to
) { 
 649     cnv
=iconv_open(wxConvLibc
.cWX2MB(to
->cname
),wxConvLibc
.cWX2MB(from
->cname
)); 
 651   ~IC_CharSetConverter() { 
 652     if (cnv
!=(iconv_t
)-1) iconv_close(cnv
); 
 654   size_t Convert(char*buf
, const char*psz
, size_t n
) { 
 655     size_t inbuf 
= strlen(psz
); 
 657     size_t res 
= iconv(cnv
,&psz
,&inbuf
,&buf
,&outbuf
); 
 658     if (res
==(size_t)-1) return (size_t)-1; 
 // Direct multibyte-to-multibyte converter between two EC_CharSet objects,
 // built on a wxEncodingConverter initialised from from->enc to to->enc.
 // Convert() measures psz with strlen and, when buf is given, calls
 // cnv.Convert(psz, buf).
 // NOTE(review): the return value is not visible in this listing, and the
 // visible Convert call does not take n into account.
 664 class EC_CharSetConverter
 
 667   wxEncodingConverter cnv
; 
 668   EC_CharSetConverter(EC_CharSet
*from
,EC_CharSet
*to
) { 
 669     cnv
.Init(from
->enc
,to
->enc
); 
 671   size_t Convert(char*buf
, const char*psz
, size_t n
) { 
 672     size_t inbuf 
= strlen(psz
); 
 673     if (buf
) cnv
.Convert(psz
,buf
); 
 678 #else // !wxUSE_WCHAR_T 
 680 // ---------------------------------------------------------------------------- 
 681 // stand-ins in absence of wchar_t 
 682 // ---------------------------------------------------------------------------- 
 // Built when wxUSE_WCHAR_T is off: only the wxConvLibc and wxConvFile
 // globals are defined, both as plain wxMBConv objects.
 684 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
, wxConvFile
; 
 686 #endif // wxUSE_WCHAR_T