Commit | Line | Data |
---|---|---|
6001e347 RR |
1 | ///////////////////////////////////////////////////////////////////////////// |
2 | // Name: strconv.cpp | |
3 | // Purpose: Unicode conversion classes | |
4 | // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin | |
5 | // Modified by: | |
6 | // Created: 29/01/98 | |
7 | // RCS-ID: $Id$ | |
8 | // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vadim Zeitlin | |
9 | // Licence: wxWindows license | |
10 | ///////////////////////////////////////////////////////////////////////////// | |
11 | ||
12 | #ifdef __GNUG__ | |
13 | #pragma implementation "strconv.h" | |
14 | #endif | |
15 | ||
16 | // For compilers that support precompilation, includes "wx.h". | |
17 | #include "wx/wxprec.h" | |
18 | ||
19 | #ifdef __BORLANDC__ | |
20 | #pragma hdrstop | |
21 | #endif | |
22 | ||
23 | #include <ctype.h> | |
24 | #include <string.h> | |
25 | #include <stdlib.h> | |
26 | ||
27 | #ifdef __SALFORDC__ | |
28 | #include <clib.h> | |
29 | #endif | |
30 | ||
31 | #include "wx/debug.h" | |
32 | #include "wx/strconv.h" | |
33 | ||
34 | //---------------------------------------------------------------------------- | |
35 | // wxConvCurrent | |
36 | //---------------------------------------------------------------------------- | |
37 | ||
38 | WXDLLEXPORT_DATA(wxMBConv *) wxConvCurrent = &wxConvLibc; | |
39 | ||
40 | #if !wxUSE_WCHAR_T | |
41 | //---------------------------------------------------------------------------- | |
42 | // stand-ins in absence of wchar_t | |
43 | //---------------------------------------------------------------------------- | |
44 | ||
45 | WXDLLEXPORT_DATA(wxMBConv) wxConvLibc, wxConvFile; | |
46 | ||
47 | #else | |
48 | ||
49 | //---------------------------------------------------------------------------- | |
50 | // wxMBConv | |
51 | //---------------------------------------------------------------------------- | |
52 | ||
53 | WXDLLEXPORT_DATA(wxMBConv) wxConvLibc; | |
54 | ||
55 | size_t wxMBConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const | |
56 | { | |
57 | return wxMB2WC(buf, psz, n); | |
58 | } | |
59 | ||
60 | size_t wxMBConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const | |
61 | { | |
62 | return wxWC2MB(buf, psz, n); | |
63 | } | |
64 | ||
65 | const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const | |
66 | { | |
67 | if (psz) | |
68 | { | |
69 | size_t nLen = MB2WC((wchar_t *) NULL, psz, 0); | |
70 | wxWCharBuffer buf(nLen); | |
71 | MB2WC((wchar_t *)(const wchar_t *) buf, psz, nLen); | |
72 | return buf; | |
73 | } | |
74 | else | |
75 | return wxWCharBuffer((wchar_t *) NULL); | |
76 | } | |
77 | ||
78 | const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *psz) const | |
79 | { | |
80 | if (psz) | |
81 | { | |
82 | size_t nLen = WC2MB((char *) NULL, psz, 0); | |
83 | wxCharBuffer buf(nLen); | |
84 | WC2MB((char *)(const char *) buf, psz, nLen); | |
85 | return buf; | |
86 | } | |
87 | else | |
88 | return wxCharBuffer((char *) NULL); | |
89 | } | |
90 | ||
91 | //---------------------------------------------------------------------------- | |
92 | // standard file conversion | |
93 | //---------------------------------------------------------------------------- | |
94 | ||
95 | WXDLLEXPORT_DATA(wxMBConvFile) wxConvFile; | |
96 | ||
97 | // just use the libc conversion for now | |
98 | size_t wxMBConvFile::MB2WC(wchar_t *buf, const char *psz, size_t n) const | |
99 | { | |
100 | return wxMB2WC(buf, psz, n); | |
101 | } | |
102 | ||
103 | size_t wxMBConvFile::WC2MB(char *buf, const wchar_t *psz, size_t n) const | |
104 | { | |
105 | return wxWC2MB(buf, psz, n); | |
106 | } | |
107 | ||
108 | #ifdef __WXGTK12__ | |
109 | ||
110 | //---------------------------------------------------------------------------- | |
111 | // standard gdk conversion | |
112 | //---------------------------------------------------------------------------- | |
113 | ||
114 | WXDLLEXPORT_DATA(wxMBConvGdk) wxConvGdk; | |
115 | ||
116 | #include <gdk/gdk.h> | |
117 | ||
118 | size_t wxMBConvGdk::MB2WC(wchar_t *buf, const char *psz, size_t n) const | |
119 | { | |
120 | if (buf) { | |
121 | return gdk_mbstowcs((GdkWChar *)buf, psz, n); | |
122 | } else { | |
123 | GdkWChar *nbuf = new GdkWChar[n=strlen(psz)]; | |
124 | size_t len = gdk_mbstowcs(nbuf, psz, n); | |
125 | delete [] nbuf; | |
126 | return len; | |
127 | } | |
128 | } | |
129 | ||
130 | size_t wxMBConvGdk::WC2MB(char *buf, const wchar_t *psz, size_t n) const | |
131 | { | |
132 | char *mbstr = gdk_wcstombs((GdkWChar *)psz); | |
133 | size_t len = mbstr ? strlen(mbstr) : 0; | |
134 | if (buf) { | |
135 | if (len > n) len = n; | |
136 | memcpy(buf, psz, len); | |
137 | if (len < n) buf[len] = 0; | |
138 | } | |
139 | return len; | |
140 | } | |
141 | #endif // GTK > 1.0 | |
142 | ||
143 | // ---------------------------------------------------------------------------- | |
144 | // UTF-7 | |
145 | // ---------------------------------------------------------------------------- | |
146 | ||
147 | WXDLLEXPORT_DATA(wxMBConvUTF7) wxConvUTF7; | |
148 | ||
149 | #if 0 | |
150 | static char utf7_setD[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ" | |
151 | "abcdefghijklmnopqrstuvwxyz" | |
152 | "0123456789'(),-./:?"; | |
153 | static char utf7_setO[]="!\"#$%&*;<=>@[]^_`{|}"; | |
154 | static char utf7_setB[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ" | |
155 | "abcdefghijklmnopqrstuvwxyz" | |
156 | "0123456789+/"; | |
157 | #endif | |
158 | ||
159 | // TODO: write actual implementations of UTF-7 here | |
160 | size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf), | |
161 | const char * WXUNUSED(psz), | |
162 | size_t WXUNUSED(n)) const | |
163 | { | |
164 | return 0; | |
165 | } | |
166 | ||
167 | size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf), | |
168 | const wchar_t * WXUNUSED(psz), | |
169 | size_t WXUNUSED(n)) const | |
170 | { | |
171 | return 0; | |
172 | } | |
173 | ||
174 | //---------------------------------------------------------------------------- | |
175 | // UTF-8 | |
176 | //---------------------------------------------------------------------------- | |
177 | ||
178 | WXDLLEXPORT_DATA(wxMBConvUTF8) wxConvUTF8; | |
179 | ||
180 | static unsigned long utf8_max[]={0x7f,0x7ff,0xffff,0x1fffff,0x3ffffff,0x7fffffff,0xffffffff}; | |
181 | ||
182 | size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const | |
183 | { | |
184 | size_t len = 0; | |
185 | ||
186 | while (*psz && ((!buf) || (len<n))) { | |
187 | unsigned char cc=*psz++, fc=cc; | |
188 | unsigned cnt; | |
189 | for (cnt=0; fc&0x80; cnt++) fc<<=1; | |
190 | if (!cnt) { | |
191 | // plain ASCII char | |
192 | if (buf) *buf++=cc; | |
193 | len++; | |
194 | } else { | |
195 | cnt--; | |
196 | if (!cnt) { | |
197 | // invalid UTF-8 sequence | |
198 | return (size_t)-1; | |
199 | } else { | |
200 | unsigned ocnt=cnt-1; | |
201 | unsigned long res=cc&(0x3f>>cnt); | |
202 | while (cnt--) { | |
203 | cc = *psz++; | |
204 | if ((cc&0xC0)!=0x80) { | |
205 | // invalid UTF-8 sequence | |
206 | return (size_t)-1; | |
207 | } | |
208 | res=(res<<6)|(cc&0x3f); | |
209 | } | |
210 | if (res<=utf8_max[ocnt]) { | |
211 | // illegal UTF-8 encoding | |
212 | return (size_t)-1; | |
213 | } | |
214 | if (buf) *buf++=res; | |
215 | len++; | |
216 | } | |
217 | } | |
218 | } | |
219 | if (buf && (len<n)) *buf = 0; | |
220 | return len; | |
221 | } | |
222 | ||
223 | size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const | |
224 | { | |
225 | size_t len = 0; | |
226 | ||
227 | while (*psz && ((!buf) || (len<n))) { | |
228 | unsigned long cc=(*psz++)&0x7fffffff; | |
229 | unsigned cnt; | |
230 | for (cnt=0; cc>utf8_max[cnt]; cnt++); | |
231 | if (!cnt) { | |
232 | // plain ASCII char | |
233 | if (buf) *buf++=cc; | |
234 | len++; | |
235 | } else { | |
236 | len+=cnt+1; | |
237 | if (buf) { | |
238 | *buf++=(-128>>cnt)|((cc>>(cnt*6))&(0x3f>>cnt)); | |
239 | while (cnt--) | |
240 | *buf++=0x80|((cc>>(cnt*6))&0x3f); | |
241 | } | |
242 | } | |
243 | } | |
244 | if (buf && (len<n)) *buf = 0; | |
245 | return len; | |
246 | } | |
247 | ||
248 | // ---------------------------------------------------------------------------- | |
249 | // specified character set | |
250 | // ---------------------------------------------------------------------------- | |
251 | ||
252 | #ifndef WX_PRECOMP | |
253 | #include "wx/dynarray.h" | |
254 | #include "wx/filefn.h" | |
255 | #include "wx/textfile.h" | |
256 | #include "wx/tokenzr.h" | |
257 | #include "wx/utils.h" | |
258 | #endif | |
259 | ||
260 | class wxCharacterSet | |
261 | { | |
262 | public: | |
263 | wxArrayString names; | |
264 | wchar_t *data; | |
265 | }; | |
266 | ||
267 | WX_DECLARE_OBJARRAY(wxCharacterSet, wxCSArray); | |
268 | #include "wx/arrimpl.cpp" | |
269 | WX_DEFINE_OBJARRAY(wxCSArray); | |
270 | ||
271 | static wxCSArray wxCharsets; | |
272 | ||
273 | static void wxLoadCharacterSets(void) | |
274 | { | |
275 | static bool already_loaded = FALSE; | |
276 | ||
277 | if (already_loaded) return; | |
278 | ||
279 | already_loaded = TRUE; | |
280 | #if defined(__UNIX__) && wxUSE_TEXTFILE | |
281 | // search through files in /usr/share/i18n/charmaps | |
282 | wxString fname; | |
223d09f6 | 283 | for (fname = ::wxFindFirstFile(wxT("/usr/share/i18n/charmaps/*")); |
6001e347 RR |
284 | !fname.IsEmpty(); |
285 | fname = ::wxFindNextFile()) { | |
286 | wxTextFile cmap(fname); | |
287 | if (cmap.Open()) { | |
288 | wxCharacterSet *cset = new wxCharacterSet; | |
289 | wxString comchar,escchar; | |
290 | bool in_charset = FALSE; | |
291 | ||
223d09f6 | 292 | // wxFprintf(stderr,wxT("Loaded: %s\n"),fname.c_str()); |
6001e347 RR |
293 | |
294 | wxString line; | |
295 | for (line = cmap.GetFirstLine(); | |
296 | !cmap.Eof(); | |
297 | line = cmap.GetNextLine()) { | |
223d09f6 | 298 | // wxFprintf(stderr,wxT("line contents: %s\n"),line.c_str()); |
6001e347 RR |
299 | wxStringTokenizer token(line); |
300 | wxString cmd = token.GetNextToken(); | |
301 | if (cmd == comchar) { | |
223d09f6 | 302 | if (token.GetNextToken() == wxT("alias")) |
6001e347 RR |
303 | cset->names.Add(token.GetNextToken()); |
304 | } | |
223d09f6 | 305 | else if (cmd == wxT("<code_set_name>")) |
6001e347 | 306 | cset->names.Add(token.GetNextToken()); |
223d09f6 | 307 | else if (cmd == wxT("<comment_char>")) |
6001e347 | 308 | comchar = token.GetNextToken(); |
223d09f6 | 309 | else if (cmd == wxT("<escape_char>")) |
6001e347 | 310 | escchar = token.GetNextToken(); |
223d09f6 | 311 | else if (cmd == wxT("<mb_cur_min>")) { |
6001e347 RR |
312 | delete cset; |
313 | cset = (wxCharacterSet *) NULL; | |
314 | break; // we don't support multibyte charsets ourselves (yet) | |
315 | } | |
223d09f6 | 316 | else if (cmd == wxT("CHARMAP")) { |
6001e347 RR |
317 | cset->data = (wchar_t *)calloc(256, sizeof(wchar_t)); |
318 | in_charset = TRUE; | |
319 | } | |
223d09f6 KB |
320 | else if (cmd == wxT("END")) { |
321 | if (token.GetNextToken() == wxT("CHARMAP")) | |
6001e347 RR |
322 | in_charset = FALSE; |
323 | } | |
324 | else if (in_charset) { | |
325 | // format: <NUL> /x00 <U0000> NULL (NUL) | |
326 | // <A> /x41 <U0041> LATIN CAPITAL LETTER A | |
327 | wxString hex = token.GetNextToken(); | |
328 | // skip whitespace (why doesn't wxStringTokenizer do this?) | |
329 | while (wxIsEmpty(hex) && token.HasMoreTokens()) hex = token.GetNextToken(); | |
330 | wxString uni = token.GetNextToken(); | |
331 | // skip whitespace again | |
332 | while (wxIsEmpty(uni) && token.HasMoreTokens()) uni = token.GetNextToken(); | |
223d09f6 KB |
333 | if ((hex.Len() > 2) && (wxString(hex.GetChar(0)) == escchar) && (hex.GetChar(1) == wxT('x')) && |
334 | (uni.Left(2) == wxT("<U"))) { | |
6001e347 RR |
335 | hex.MakeUpper(); uni.MakeUpper(); |
336 | int pos = ::wxHexToDec(hex.Mid(2,2)); | |
337 | if (pos>=0) { | |
338 | unsigned long uni1 = ::wxHexToDec(uni.Mid(2,2)); | |
339 | unsigned long uni2 = ::wxHexToDec(uni.Mid(4,2)); | |
340 | cset->data[pos] = (uni1 << 16) | uni2; | |
223d09f6 | 341 | // wxFprintf(stderr,wxT("char %02x mapped to %04x (%c)\n"),pos,cset->data[pos],cset->data[pos]); |
6001e347 RR |
342 | } |
343 | } | |
344 | } | |
345 | } | |
346 | if (cset) { | |
347 | cset->names.Shrink(); | |
348 | wxCharsets.Add(cset); | |
349 | } | |
350 | } | |
351 | } | |
352 | #endif | |
353 | wxCharsets.Shrink(); | |
354 | } | |
355 | ||
356 | static wxCharacterSet *wxFindCharacterSet(const wxChar *charset) | |
357 | { | |
358 | if (!charset) return (wxCharacterSet *)NULL; | |
359 | wxLoadCharacterSets(); | |
360 | for (size_t n=0; n<wxCharsets.GetCount(); n++) | |
361 | if (wxCharsets[n].names.Index(charset) != wxNOT_FOUND) | |
362 | return &(wxCharsets[n]); | |
363 | return (wxCharacterSet *)NULL; | |
364 | } | |
365 | ||
366 | WXDLLEXPORT_DATA(wxCSConv) wxConvLocal((const wxChar *)NULL); | |
367 | ||
368 | wxCSConv::wxCSConv(const wxChar *charset) | |
369 | { | |
370 | m_name = (wxChar *) NULL; | |
371 | m_cset = (wxCharacterSet *) NULL; | |
372 | m_deferred = TRUE; | |
373 | SetName(charset); | |
374 | } | |
375 | ||
376 | wxCSConv::~wxCSConv() | |
377 | { | |
378 | if (m_name) free(m_name); | |
379 | } | |
380 | ||
381 | void wxCSConv::SetName(const wxChar *charset) | |
382 | { | |
383 | if (charset) { | |
384 | #ifdef __UNIX__ | |
385 | // first, convert the character set name to standard form | |
386 | wxString codeset; | |
223d09f6 | 387 | if (wxString(charset,3).CmpNoCase(wxT("ISO")) == 0) { |
6001e347 | 388 | // make sure it's represented in the standard form: ISO_8859-1 |
223d09f6 | 389 | codeset = wxT("ISO_"); |
6001e347 | 390 | charset += 3; |
223d09f6 | 391 | if ((*charset == wxT('-')) || (*charset == wxT('_'))) charset++; |
6001e347 | 392 | if (wxStrlen(charset)>4) { |
223d09f6 KB |
393 | if (wxString(charset,4) == wxT("8859")) { |
394 | codeset << wxT("8859-"); | |
395 | if (*charset == wxT('-')) charset++; | |
6001e347 RR |
396 | } |
397 | } | |
398 | } | |
399 | codeset << charset; | |
400 | codeset.MakeUpper(); | |
401 | m_name = wxStrdup(codeset.c_str()); | |
402 | m_deferred = TRUE; | |
403 | #endif | |
404 | } | |
405 | } | |
406 | ||
407 | void wxCSConv::LoadNow() | |
408 | { | |
223d09f6 | 409 | // wxPrintf(wxT("Conversion request\n")); |
6001e347 RR |
410 | if (m_deferred) { |
411 | if (!m_name) { | |
412 | #ifdef __UNIX__ | |
223d09f6 KB |
413 | wxChar *lang = wxGetenv(wxT("LANG")); |
414 | wxChar *dot = lang ? wxStrchr(lang, wxT('.')) : (wxChar *)NULL; | |
6001e347 RR |
415 | if (dot) SetName(dot+1); |
416 | #endif | |
417 | } | |
418 | m_cset = wxFindCharacterSet(m_name); | |
419 | m_deferred = FALSE; | |
420 | } | |
421 | } | |
422 | ||
423 | size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const | |
424 | { | |
425 | ((wxCSConv *)this)->LoadNow(); // discard constness | |
426 | if (buf) { | |
427 | if (m_cset) { | |
428 | for (size_t c=0; c<n; c++) | |
429 | buf[c] = m_cset->data[(unsigned char)(psz[c])]; | |
430 | } else { | |
431 | // latin-1 (direct) | |
432 | for (size_t c=0; c<n; c++) | |
433 | buf[c] = (unsigned char)(psz[c]); | |
434 | } | |
435 | return n; | |
436 | } | |
437 | return strlen(psz); | |
438 | } | |
439 | ||
440 | size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const | |
441 | { | |
442 | ((wxCSConv *)this)->LoadNow(); // discard constness | |
443 | if (buf) { | |
444 | if (m_cset) { | |
445 | for (size_t c=0; c<n; c++) { | |
446 | size_t n; | |
447 | for (n=0; (n<256) && (m_cset->data[n] != psz[c]); n++); | |
448 | buf[c] = (n>0xff) ? '?' : n; | |
449 | } | |
450 | } else { | |
451 | // latin-1 (direct) | |
452 | for (size_t c=0; c<n; c++) | |
453 | buf[c] = (psz[c]>0xff) ? '?' : psz[c]; | |
454 | } | |
455 | return n; | |
456 | } | |
457 | return wcslen(psz); | |
458 | } | |
459 | ||
460 | #endif | |
461 | //wxUSE_WCHAR_T | |
462 | ||
463 |