1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vadim Zeitlin
9 // Licence: wxWindows license
10 /////////////////////////////////////////////////////////////////////////////
13 #pragma implementation "strconv.h"
16 // For compilers that support precompilation, includes "wx.h".
17 #include "wx/wxprec.h"
32 #include "wx/strconv.h"
34 //----------------------------------------------------------------------------
36 //----------------------------------------------------------------------------
// Exported pointer to a wxMBConv object, initialised to the address of
// wxConvLibc.
38 WXDLLEXPORT_DATA(wxMBConv
*) wxConvCurrent
= &wxConvLibc
;
41 //----------------------------------------------------------------------------
42 // stand-ins in absence of wchar_t
43 //----------------------------------------------------------------------------
// Exported wxMBConv objects wxConvLibc and wxConvFile, both of the base
// wxMBConv type in this (stand-in) section.
45 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
, wxConvFile
;
49 //----------------------------------------------------------------------------
51 //----------------------------------------------------------------------------
// Exported wxMBConv object wxConvLibc.
// NOTE(review): wxConvLibc is also defined in the stand-in section; the two
// definitions presumably sit in alternative preprocessor branches that are
// not visible in this excerpt -- confirm.
53 WXDLLEXPORT_DATA(wxMBConv
) wxConvLibc
;
// Multibyte -> wide conversion: forwards buf, psz and n unchanged to
// wxMB2WC() and returns its result.
55 size_t wxMBConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
57 return wxMB2WC(buf
, psz
, n
);
// Wide -> multibyte conversion: forwards buf, psz and n unchanged to
// wxWC2MB() and returns its result.
60 size_t wxMBConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
62 return wxWC2MB(buf
, psz
, n
);
// Returns psz converted to wide characters, wrapped in a wxWCharBuffer.
// The visible path works in two passes: MB2WC() is first called with a NULL
// buffer and n == 0 to obtain a length, a wxWCharBuffer is constructed from
// that length, and MB2WC() is called again to fill it.
// NOTE(review): the condition that selects between the conversion path and
// the final NULL-buffer return is not visible in this excerpt (presumably a
// NULL check on psz) -- confirm.
// NOTE(review): nLen is used unchecked; wxMBConvUTF8::MB2WC has branches
// marked "invalid UTF-8 sequence", so if an implementation reports failure
// through its return value that value would be used as the buffer size --
// confirm.
65 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
// First pass: NULL destination, length query only.
69 size_t nLen
= MB2WC((wchar_t *) NULL
, psz
, 0);
70 wxWCharBuffer
buf(nLen
);
// Second pass: convert into the buffer's storage (constness cast away).
71 MB2WC((wchar_t *)(const wchar_t *) buf
, psz
, nLen
);
75 return wxWCharBuffer((wchar_t *) NULL
);
// Returns psz converted to a multibyte string, wrapped in a wxCharBuffer.
// The visible path works in two passes: WC2MB() is first called with a NULL
// buffer and n == 0 to obtain a length, a wxCharBuffer is constructed from
// that length, and WC2MB() is called again to fill it.
// NOTE(review): the condition that selects between the conversion path and
// the final NULL-buffer return is not visible in this excerpt (presumably a
// NULL check on psz) -- confirm.
// NOTE(review): nLen is used unchecked as the buffer size -- confirm that
// WC2MB() cannot return an error value here.
78 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *psz
) const
// First pass: NULL destination, length query only.
82 size_t nLen
= WC2MB((char *) NULL
, psz
, 0);
83 wxCharBuffer
buf(nLen
);
// Second pass: convert into the buffer's storage (constness cast away).
84 WC2MB((char *)(const char *) buf
, psz
, nLen
);
88 return wxCharBuffer((char *) NULL
);
91 //----------------------------------------------------------------------------
92 // standard file conversion
93 //----------------------------------------------------------------------------
// Exported wxMBConvFile object wxConvFile.
95 WXDLLEXPORT_DATA(wxMBConvFile
) wxConvFile
;
97 // just use the libc conversion for now
// Multibyte -> wide conversion for wxMBConvFile: identical to the base
// class behaviour, forwarding buf, psz and n to wxMB2WC().
98 size_t wxMBConvFile::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
100 return wxMB2WC(buf
, psz
, n
);
// Wide -> multibyte conversion for wxMBConvFile: forwards buf, psz and n
// to wxWC2MB() and returns its result.
103 size_t wxMBConvFile::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
105 return wxWC2MB(buf
, psz
, n
);
110 //----------------------------------------------------------------------------
111 // standard gdk conversion
112 //----------------------------------------------------------------------------
// Exported wxMBConvGdk object wxConvGdk.
114 WXDLLEXPORT_DATA(wxMBConvGdk
) wxConvGdk
;
// Multibyte -> wide conversion through gdk_mbstowcs().
// Two paths are visible: one converts straight into buf (reinterpreted as
// GdkWChar*) and returns gdk_mbstowcs()'s result; the other allocates a
// temporary GdkWChar array and converts into it to obtain a length.
// NOTE(review): the condition choosing between the paths is not visible in
// this excerpt (presumably whether buf is NULL) -- confirm.
118 size_t wxMBConvGdk::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
121 return gdk_mbstowcs((GdkWChar
*)buf
, psz
, n
);
// The parameter n is overwritten with strlen(psz) and used both as the
// temporary array size and as the conversion limit.
// NOTE(review): no delete[] of nbuf is visible in this excerpt -- confirm
// the array is released before returning.
123 GdkWChar
*nbuf
= new GdkWChar
[n
=strlen(psz
)];
124 size_t len
= gdk_mbstowcs(nbuf
, psz
, n
);
// Wide -> multibyte conversion through gdk_wcstombs().
// gdk_wcstombs() produces the converted string (mbstr); its length, or 0
// when the conversion returned NULL, is clamped to n, that many bytes are
// copied into buf, and a terminating 0 is written when there is room.
// NOTE(review): the guard around the writes to buf and the return
// statement are not visible in this excerpt -- confirm buf is checked for
// NULL before the copy.
// NOTE(review): gdk_wcstombs() returns a newly allocated string; no release
// of mbstr is visible in this excerpt -- confirm it is freed (g_free).
130 size_t wxMBConvGdk::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
132 char *mbstr
= gdk_wcstombs((GdkWChar
*)psz
);
133 size_t len
= mbstr
? strlen(mbstr
) : 0;
135 if (len
> n
) len
= n
;
// Copy the converted multibyte text. The source must be mbstr: copying
// from psz would place the raw bytes of the wide-character input in buf.
136 memcpy(buf
, mbstr
, len
);
137 if (len
< n
) buf
[len
] = 0;
143 // ----------------------------------------------------------------------------
145 // ----------------------------------------------------------------------------
// Exported wxMBConvUTF7 object wxConvUTF7, followed by three static
// character tables.
// NOTE(review): the names and contents suggest the UTF-7 "direct" (D),
// "optional direct" (O) and base64 (B) character sets -- confirm against
// the UTF-7 specification; none of the tables is referenced in the code
// visible here.
147 WXDLLEXPORT_DATA(wxMBConvUTF7
) wxConvUTF7
;
150 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
151 "abcdefghijklmnopqrstuvwxyz"
152 "0123456789'(),-./:?";
153 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
// NOTE(review): the end of the utf7_setB initialiser (remaining characters
// and the terminating ';') is not visible in this excerpt.
154 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
155 "abcdefghijklmnopqrstuvwxyz"
159 // TODO: write actual implementations of UTF-7 here
// UTF-7 -> wide conversion. All three parameters are wrapped in WXUNUSED,
// i.e. none of them is used; per the TODO above, no real implementation
// exists yet. The body (and hence the returned value) is not visible in
// this excerpt.
160 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
161 const char * WXUNUSED(psz
),
162 size_t WXUNUSED(n
)) const
// Wide -> UTF-7 conversion. All three parameters are wrapped in WXUNUSED,
// i.e. none of them is used. The body (and hence the returned value) is
// not visible in this excerpt.
167 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
168 const wchar_t * WXUNUSED(psz
),
169 size_t WXUNUSED(n
)) const
174 //----------------------------------------------------------------------------
176 //----------------------------------------------------------------------------
// Exported wxMBConvUTF8 object wxConvUTF8.
178 WXDLLEXPORT_DATA(wxMBConvUTF8
) wxConvUTF8
;
// Upper bounds used by the UTF-8 converters below: WC2MB scans the table
// for the first entry that is not exceeded by the character value, and
// MB2WC rejects a decoded value that is <= the entry selected for the
// sequence. Entry i is presumably the largest value encodable in i+1
// bytes -- confirm.
180 static unsigned long utf8_max
[]={0x7f,0x7ff,0xffff,0x1fffff,0x3ffffff,0x7fffffff,0xffffffff};
// UTF-8 -> wide conversion.
// Walks psz until its terminating 0 or, when buf is non-NULL, until len
// reaches n. When buf is non-NULL and len < n at the end, a terminating 0
// is stored through buf.
// NOTE(review): several statements of this function (declarations of len,
// cnt and ocnt, the stores into buf, the error returns and the final
// return) are not visible in this excerpt; the comments below describe
// only the visible lines.
182 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
186 while (*psz
&& ((!buf
) || (len
<n
))) {
// cc is the lead byte; fc is a scratch copy shifted left below.
187 unsigned char cc
=*psz
++, fc
=cc
;
// Count the leading 1 bits of the lead byte.
189 for (cnt
=0; fc
&0x80; cnt
++) fc
<<=1;
197 // invalid UTF-8 sequence
// Keep the payload bits of the lead byte (mask narrows as cnt grows).
201 unsigned long res
=cc
&(0x3f>>cnt
);
// A continuation byte must have the bit pattern 10xxxxxx.
204 if ((cc
&0xC0)!=0x80) {
205 // invalid UTF-8 sequence
// Append the 6 payload bits of the continuation byte.
208 res
=(res
<<6)|(cc
&0x3f);
// A value not above the table entry selected by ocnt is rejected
// (presumably an over-long encoding -- confirm).
210 if (res
<=utf8_max
[ocnt
]) {
211 // illegal UTF-8 encoding
219 if (buf
&& (len
<n
)) *buf
= 0;
// Wide -> UTF-8 conversion.
// Walks psz until its terminating 0 or, when buf is non-NULL, until len
// reaches n. When buf is non-NULL and len < n at the end, a terminating 0
// is stored through buf.
// NOTE(review): the declarations of len and cnt, the updates of len, the
// single-byte path and the return are not visible in this excerpt.
// NOTE(review): the loop guard tests only len < n before a lead byte plus
// continuation bytes are written; unless a missing line bounds the write,
// a multi-byte character near the end could run past n -- confirm.
223 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
227 while (*psz
&& ((!buf
) || (len
<n
))) {
// Only the low 31 bits of the character are encoded.
228 unsigned long cc
=(*psz
++)&0x7fffffff;
// cnt = index of the first utf8_max entry that cc does not exceed.
230 for (cnt
=0; cc
>utf8_max
[cnt
]; cnt
++);
// Lead byte: -128>>cnt yields cnt+1 leading 1 bits where right shift of a
// negative value is arithmetic (implementation-defined before C++20),
// OR-ed with the top payload bits.
238 *buf
++=(-128>>cnt
)|((cc
>>(cnt
*6))&(0x3f>>cnt
));
// Continuation byte: 10xxxxxx carrying the next 6 payload bits.
240 *buf
++=0x80|((cc
>>(cnt
*6))&0x3f);
244 if (buf
&& (len
<n
)) *buf
= 0;
248 // ----------------------------------------------------------------------------
249 // specified character set
250 // ----------------------------------------------------------------------------
253 #include "wx/dynarray.h"
254 #include "wx/filefn.h"
255 #include "wx/textfile.h"
256 #include "wx/tokenzr.h"
257 #include "wx/utils.h"
// Declares and defines wxCSArray, an object array of wxCharacterSet (the
// WX_DEFINE_OBJARRAY expansion relies on including wx/arrimpl.cpp first),
// and the file-local array wxCharsets that wxLoadCharacterSets() fills and
// wxFindCharacterSet() searches.
267 WX_DECLARE_OBJARRAY(wxCharacterSet
, wxCSArray
);
268 #include "wx/arrimpl.cpp"
269 WX_DEFINE_OBJARRAY(wxCSArray
);
271 static wxCSArray wxCharsets
;
// Populates wxCharsets from the system charmap files, at most once per
// process (guarded by the static flag already_loaded).
// When __UNIX__ is defined and wxUSE_TEXTFILE is set, every file matching
// /usr/share/i18n/charmaps/* is opened as a wxTextFile and read line by
// line. Each line is tokenised; the first token selects the action:
//   - the comment character followed by "alias": add the next token as a name
//   - <code_set_name>: add the next token as a name
//   - <comment_char> / <escape_char>: remember the character
//   - <mb_cur_min>: multibyte charset, not supported -- abandon this file
//   - CHARMAP: allocate the 256-entry byte -> wide-character table
//   - END (followed by CHARMAP): end of the mapping section
//   - otherwise, inside the mapping section: parse one mapping line
// A character set that was kept is shrunk and appended to wxCharsets.
// NOTE(review): several lines are not visible in this excerpt (the
// declarations of fname and line, the loop conditions, the updates of
// in_charset, and whatever guards the final Add against the NULL cset left
// by the <mb_cur_min> branch) -- confirm.
273 static void wxLoadCharacterSets(void)
275 static bool already_loaded
= FALSE
;
277 if (already_loaded
) return;
279 already_loaded
= TRUE
;
280 #if defined(__UNIX__) && wxUSE_TEXTFILE
281 // search through files in /usr/share/i18n/charmaps
283 for (fname
= ::wxFindFirstFile(wxT("/usr/share/i18n/charmaps/*"));
285 fname
= ::wxFindNextFile()) {
286 wxTextFile
cmap(fname
);
288 wxCharacterSet
*cset
= new wxCharacterSet
;
289 wxString comchar
,escchar
;
290 bool in_charset
= FALSE
;
292 // wxFprintf(stderr,wxT("Loaded: %s\n"),fname.c_str());
295 for (line
= cmap
.GetFirstLine();
297 line
= cmap
.GetNextLine()) {
298 // wxFprintf(stderr,wxT("line contents: %s\n"),line.c_str());
299 wxStringTokenizer
token(line
);
300 wxString cmd
= token
.GetNextToken();
301 if (cmd
== comchar
) {
302 if (token
.GetNextToken() == wxT("alias"))
303 cset
->names
.Add(token
.GetNextToken());
305 else if (cmd
== wxT("<code_set_name>"))
306 cset
->names
.Add(token
.GetNextToken());
307 else if (cmd
== wxT("<comment_char>"))
308 comchar
= token
.GetNextToken();
309 else if (cmd
== wxT("<escape_char>"))
310 escchar
= token
.GetNextToken();
311 else if (cmd
== wxT("<mb_cur_min>")) {
313 cset
= (wxCharacterSet
*) NULL
;
314 break; // we don't support multibyte charsets ourselves (yet)
316 else if (cmd
== wxT("CHARMAP")) {
317 cset
->data
= (wchar_t *)calloc(256, sizeof(wchar_t));
320 else if (cmd
== wxT("END")) {
321 if (token
.GetNextToken() == wxT("CHARMAP"))
324 else if (in_charset
) {
325 // format: <NUL> /x00 <U0000> NULL (NUL)
326 // <A> /x41 <U0041> LATIN CAPITAL LETTER A
327 wxString hex
= token
.GetNextToken();
328 // skip whitespace (why doesn't wxStringTokenizer do this?)
329 while (wxIsEmpty(hex
) && token
.HasMoreTokens()) hex
= token
.GetNextToken();
330 wxString uni
= token
.GetNextToken();
331 // skip whitespace again
332 while (wxIsEmpty(uni
) && token
.HasMoreTokens()) uni
= token
.GetNextToken();
// Accept only "<escchar>xHH" byte values paired with "<Uhhll>" code points.
333 if ((hex
.Len() > 2) && (wxString(hex
.GetChar(0)) == escchar
) && (hex
.GetChar(1) == wxT('x')) &&
334 (uni
.Left(2) == wxT("<U"))) {
335 hex
.MakeUpper(); uni
.MakeUpper();
336 int pos
= ::wxHexToDec(hex
.Mid(2,2));
// uni1 and uni2 are the high and low BYTES of the four-hex-digit code
// point, so the high byte is shifted by 8 bits. A 16-bit shift would turn
// e.g. <U0141> into 0x10041 instead of 0x0141.
338 unsigned long uni1
= ::wxHexToDec(uni
.Mid(2,2));
339 unsigned long uni2
= ::wxHexToDec(uni
.Mid(4,2));
340 cset
->data
[pos
] = (uni1
<< 8) | uni2
;
341 // wxFprintf(stderr,wxT("char %02x mapped to %04x (%c)\n"),pos,cset->data[pos],cset->data[pos]);
347 cset
->names
.Shrink();
348 wxCharsets
.Add(cset
);
// Looks up a loaded character set by name.
// Returns NULL when charset is NULL. Otherwise makes sure the character
// sets are loaded (wxLoadCharacterSets) and returns the address of the
// first entry of wxCharsets whose names list contains charset, or NULL
// when no entry matches.
356 static wxCharacterSet
*wxFindCharacterSet(const wxChar
*charset
)
358 if (!charset
) return (wxCharacterSet
*)NULL
;
359 wxLoadCharacterSets();
360 for (size_t n
=0; n
<wxCharsets
.GetCount(); n
++)
361 if (wxCharsets
[n
].names
.Index(charset
) != wxNOT_FOUND
)
362 return &(wxCharsets
[n
]);
363 return (wxCharacterSet
*)NULL
;
// Exported wxCSConv object wxConvLocal, constructed with a NULL character
// set name.
366 WXDLLEXPORT_DATA(wxCSConv
) wxConvLocal((const wxChar
*)NULL
);
// Constructor: resets m_name and m_cset to NULL.
// NOTE(review): the statements that follow (presumably the ones that
// consume the charset argument) are not visible in this excerpt.
368 wxCSConv::wxCSConv(const wxChar
*charset
)
370 m_name
= (wxChar
*) NULL
;
371 m_cset
= (wxCharacterSet
*) NULL
;
// Destructor: releases m_name with free() when it is non-NULL (SetName
// assigns it from wxStrdup).
376 wxCSConv::~wxCSConv()
378 if (m_name
) free(m_name
);
// Stores a normalised copy of the character set name in m_name.
// A name starting with "ISO" (case-insensitive) is rewritten towards the
// standard form ISO_8859-N: the prefix becomes "ISO_", one '-' or '_'
// separator is skipped, and a following "8859" becomes "8859-" with one
// optional '-' after it skipped as well. The result is duplicated with
// wxStrdup() into m_name.
// NOTE(review): several lines are not visible in this excerpt (the
// declaration of codeset, the step past the "ISO" prefix, the append of
// the remaining text, any guard against a NULL charset, and any release
// of a previous m_name) -- confirm.
381 void wxCSConv::SetName(const wxChar
*charset
)
385 // first, convert the character set name to standard form
387 if (wxString(charset
,3).CmpNoCase(wxT("ISO")) == 0) {
388 // make sure it's represented in the standard form: ISO_8859-1
389 codeset
= wxT("ISO_");
391 if ((*charset
== wxT('-')) || (*charset
== wxT('_'))) charset
++;
392 if (wxStrlen(charset
)>4) {
393 if (wxString(charset
,4) == wxT("8859")) {
394 codeset
<< wxT("8859-");
// Step over the "8859" just matched. Without this, *charset is still '8',
// so the '-' test below can never succeed and the digits would remain in
// the text still to be appended.
charset += 4;
395 if (*charset
== wxT('-')) charset
++;
401 m_name
= wxStrdup(codeset
.c_str());
// Resolves the character set for this converter.
// Visible steps: read the LANG environment variable, locate the first '.'
// in it, and when one is found pass the text after the dot to SetName();
// then look m_name up with wxFindCharacterSet() and store the result in
// m_cset.
// NOTE(review): the conditions under which these steps run (for instance
// whether LANG is consulted only when no name is set, or only once) are
// not visible in this excerpt.
407 void wxCSConv::LoadNow()
409 // wxPrintf(wxT("Conversion request\n"));
413 wxChar
*lang
= wxGetenv(wxT("LANG"));
414 wxChar
*dot
= lang
? wxStrchr(lang
, wxT('.')) : (wxChar
*)NULL
;
415 if (dot
) SetName(dot
+1);
418 m_cset
= wxFindCharacterSet(m_name
);
// Character-set -> wide conversion.
// Calls LoadNow() first (casting away constness), then converts n
// characters: one visible loop maps each input byte through the table
// m_cset->data, the other widens each byte unchanged.
// NOTE(review): the conditions selecting between the loops (presumably
// whether buf and m_cset are non-NULL) and the return value are not
// visible in this excerpt. Both visible loops process exactly n elements
// without testing for a terminating 0.
423 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
425 ((wxCSConv
*)this)->LoadNow(); // discard constness
// Table lookup: the byte, taken as unsigned, indexes m_cset->data.
428 for (size_t c
=0; c
<n
; c
++)
429 buf
[c
] = m_cset
->data
[(unsigned char)(psz
[c
])];
// No table: copy each byte's unsigned value.
432 for (size_t c
=0; c
<n
; c
++)
433 buf
[c
] = (unsigned char)(psz
[c
]);
// Wide -> character-set conversion.
// Calls LoadNow() first (casting away constness), then converts n
// characters: one visible loop searches m_cset->data for each wide
// character and stores the matching index (or '?' when none of the 256
// entries matches); the other stores characters up to 0xff unchanged and
// '?' for anything larger.
// NOTE(review): the conditions selecting between the loops and the return
// statements are not visible in this excerpt.
440 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
442 ((wxCSConv
*)this)->LoadNow(); // discard constness
445 for (size_t c
=0; c
<n
; c
++) {
// Linear reverse lookup over the 256 table entries.
// NOTE(review): this loop assigns to `n`. If that is the function
// parameter rather than a local declared on a line missing from this
// excerpt, the outer loop bound is overwritten -- confirm.
447 for (n
=0; (n
<256) && (m_cset
->data
[n
] != psz
[c
]); n
++);
448 buf
[c
] = (n
>0xff) ? '?' : n
;
452 for (size_t c
=0; c
<n
; c
++)
453 buf
[c
] = (psz
[c
]>0xff) ? '?' : psz
[c
];