]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
new wxStringTokenizer
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin
5 // Modified by:
6 // Created: 29/01/98
7 // RCS-ID: $Id$
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vadim Zeitlin
9 // Licence: wxWindows license
10 /////////////////////////////////////////////////////////////////////////////
11
12 #ifdef __GNUG__
13 #pragma implementation "strconv.h"
14 #endif
15
16 // For compilers that support precompilation, includes "wx.h".
17 #include "wx/wxprec.h"
18
19 #ifdef __BORLANDC__
20 #pragma hdrstop
21 #endif
22
23 #include <ctype.h>
24 #include <string.h>
25 #include <stdlib.h>
26
27 #ifdef __SALFORDC__
28 #include <clib.h>
29 #endif
30
31 #include "wx/debug.h"
32 #include "wx/strconv.h"
33
34 //----------------------------------------------------------------------------
35 // wxConvCurrent
36 //----------------------------------------------------------------------------
37
38 WXDLLEXPORT_DATA(wxMBConv *) wxConvCurrent = &wxConvLibc;
39
40 #if !wxUSE_WCHAR_T
41 //----------------------------------------------------------------------------
42 // stand-ins in absence of wchar_t
43 //----------------------------------------------------------------------------
44
45 WXDLLEXPORT_DATA(wxMBConv) wxConvLibc, wxConvFile;
46
47 #else
48
49 //----------------------------------------------------------------------------
50 // wxMBConv
51 //----------------------------------------------------------------------------
52
53 WXDLLEXPORT_DATA(wxMBConv) wxConvLibc;
54
55 size_t wxMBConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
56 {
57 return wxMB2WC(buf, psz, n);
58 }
59
60 size_t wxMBConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
61 {
62 return wxWC2MB(buf, psz, n);
63 }
64
65 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
66 {
67 if (psz)
68 {
69 size_t nLen = MB2WC((wchar_t *) NULL, psz, 0);
70 wxWCharBuffer buf(nLen);
71 MB2WC((wchar_t *)(const wchar_t *) buf, psz, nLen);
72 return buf;
73 }
74 else
75 return wxWCharBuffer((wchar_t *) NULL);
76 }
77
78 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *psz) const
79 {
80 if (psz)
81 {
82 size_t nLen = WC2MB((char *) NULL, psz, 0);
83 wxCharBuffer buf(nLen);
84 WC2MB((char *)(const char *) buf, psz, nLen);
85 return buf;
86 }
87 else
88 return wxCharBuffer((char *) NULL);
89 }
90
91 //----------------------------------------------------------------------------
92 // standard file conversion
93 //----------------------------------------------------------------------------
94
95 WXDLLEXPORT_DATA(wxMBConvFile) wxConvFile;
96
97 // just use the libc conversion for now
98 size_t wxMBConvFile::MB2WC(wchar_t *buf, const char *psz, size_t n) const
99 {
100 return wxMB2WC(buf, psz, n);
101 }
102
103 size_t wxMBConvFile::WC2MB(char *buf, const wchar_t *psz, size_t n) const
104 {
105 return wxWC2MB(buf, psz, n);
106 }
107
108 #ifdef __WXGTK12__
109
110 //----------------------------------------------------------------------------
111 // standard gdk conversion
112 //----------------------------------------------------------------------------
113
114 WXDLLEXPORT_DATA(wxMBConvGdk) wxConvGdk;
115
116 #include <gdk/gdk.h>
117
118 size_t wxMBConvGdk::MB2WC(wchar_t *buf, const char *psz, size_t n) const
119 {
120 if (buf) {
121 return gdk_mbstowcs((GdkWChar *)buf, psz, n);
122 } else {
123 GdkWChar *nbuf = new GdkWChar[n=strlen(psz)];
124 size_t len = gdk_mbstowcs(nbuf, psz, n);
125 delete [] nbuf;
126 return len;
127 }
128 }
129
130 size_t wxMBConvGdk::WC2MB(char *buf, const wchar_t *psz, size_t n) const
131 {
132 char *mbstr = gdk_wcstombs((GdkWChar *)psz);
133 size_t len = mbstr ? strlen(mbstr) : 0;
134 if (buf) {
135 if (len > n) len = n;
136 memcpy(buf, psz, len);
137 if (len < n) buf[len] = 0;
138 }
139 return len;
140 }
141 #endif // GTK > 1.0
142
143 // ----------------------------------------------------------------------------
144 // UTF-7
145 // ----------------------------------------------------------------------------
146
147 WXDLLEXPORT_DATA(wxMBConvUTF7) wxConvUTF7;
148
149 #if 0
150 static char utf7_setD[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
151 "abcdefghijklmnopqrstuvwxyz"
152 "0123456789'(),-./:?";
153 static char utf7_setO[]="!\"#$%&*;<=>@[]^_`{|}";
154 static char utf7_setB[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
155 "abcdefghijklmnopqrstuvwxyz"
156 "0123456789+/";
157 #endif
158
159 // TODO: write actual implementations of UTF-7 here
160 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf),
161 const char * WXUNUSED(psz),
162 size_t WXUNUSED(n)) const
163 {
164 return 0;
165 }
166
167 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf),
168 const wchar_t * WXUNUSED(psz),
169 size_t WXUNUSED(n)) const
170 {
171 return 0;
172 }
173
174 //----------------------------------------------------------------------------
175 // UTF-8
176 //----------------------------------------------------------------------------
177
178 WXDLLEXPORT_DATA(wxMBConvUTF8) wxConvUTF8;
179
// utf8_max[i] is the largest value that an (i+1)-byte UTF-8 sequence can
// carry. The encoder walks this table to pick a sequence length and the
// decoder uses it to detect overlong encodings. The last entry is a
// catch-all upper bound.
static unsigned long utf8_max[] =
{
    0x7f,           // 1 byte
    0x7ff,          // 2 bytes
    0xffff,         // 3 bytes
    0x1fffff,       // 4 bytes
    0x3ffffff,      // 5 bytes
    0x7fffffff,     // 6 bytes
    0xffffffff      // upper bound
};
181
182 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
183 {
184 size_t len = 0;
185
186 while (*psz && ((!buf) || (len<n))) {
187 unsigned char cc=*psz++, fc=cc;
188 unsigned cnt;
189 for (cnt=0; fc&0x80; cnt++) fc<<=1;
190 if (!cnt) {
191 // plain ASCII char
192 if (buf) *buf++=cc;
193 len++;
194 } else {
195 cnt--;
196 if (!cnt) {
197 // invalid UTF-8 sequence
198 return (size_t)-1;
199 } else {
200 unsigned ocnt=cnt-1;
201 unsigned long res=cc&(0x3f>>cnt);
202 while (cnt--) {
203 cc = *psz++;
204 if ((cc&0xC0)!=0x80) {
205 // invalid UTF-8 sequence
206 return (size_t)-1;
207 }
208 res=(res<<6)|(cc&0x3f);
209 }
210 if (res<=utf8_max[ocnt]) {
211 // illegal UTF-8 encoding
212 return (size_t)-1;
213 }
214 if (buf) *buf++=res;
215 len++;
216 }
217 }
218 }
219 if (buf && (len<n)) *buf = 0;
220 return len;
221 }
222
223 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
224 {
225 size_t len = 0;
226
227 while (*psz && ((!buf) || (len<n))) {
228 unsigned long cc=(*psz++)&0x7fffffff;
229 unsigned cnt;
230 for (cnt=0; cc>utf8_max[cnt]; cnt++);
231 if (!cnt) {
232 // plain ASCII char
233 if (buf) *buf++=cc;
234 len++;
235 } else {
236 len+=cnt+1;
237 if (buf) {
238 *buf++=(-128>>cnt)|((cc>>(cnt*6))&(0x3f>>cnt));
239 while (cnt--)
240 *buf++=0x80|((cc>>(cnt*6))&0x3f);
241 }
242 }
243 }
244 if (buf && (len<n)) *buf = 0;
245 return len;
246 }
247
248 // ----------------------------------------------------------------------------
249 // specified character set
250 // ----------------------------------------------------------------------------
251
252 #ifndef WX_PRECOMP
253 #include "wx/dynarray.h"
254 #include "wx/filefn.h"
255 #include "wx/textfile.h"
256 #include "wx/tokenzr.h"
257 #include "wx/utils.h"
258 #endif
259
260 class wxCharacterSet
261 {
262 public:
263 wxArrayString names;
264 wchar_t *data;
265 };
266
267 WX_DECLARE_OBJARRAY(wxCharacterSet, wxCSArray);
268 #include "wx/arrimpl.cpp"
269 WX_DEFINE_OBJARRAY(wxCSArray);
270
271 static wxCSArray wxCharsets;
272
273 static void wxLoadCharacterSets(void)
274 {
275 static bool already_loaded = FALSE;
276
277 if (already_loaded) return;
278
279 already_loaded = TRUE;
280 #if defined(__UNIX__) && wxUSE_TEXTFILE
281 // search through files in /usr/share/i18n/charmaps
282 wxString fname;
283 for (fname = ::wxFindFirstFile(wxT("/usr/share/i18n/charmaps/*"));
284 !fname.IsEmpty();
285 fname = ::wxFindNextFile()) {
286 wxTextFile cmap(fname);
287 if (cmap.Open()) {
288 wxCharacterSet *cset = new wxCharacterSet;
289 wxString comchar,escchar;
290 bool in_charset = FALSE;
291
292 // wxFprintf(stderr,wxT("Loaded: %s\n"),fname.c_str());
293
294 wxString line;
295 for (line = cmap.GetFirstLine();
296 !cmap.Eof();
297 line = cmap.GetNextLine()) {
298 // wxFprintf(stderr,wxT("line contents: %s\n"),line.c_str());
299 wxStringTokenizer token(line);
300 wxString cmd = token.GetNextToken();
301 if (cmd == comchar) {
302 if (token.GetNextToken() == wxT("alias"))
303 cset->names.Add(token.GetNextToken());
304 }
305 else if (cmd == wxT("<code_set_name>"))
306 cset->names.Add(token.GetNextToken());
307 else if (cmd == wxT("<comment_char>"))
308 comchar = token.GetNextToken();
309 else if (cmd == wxT("<escape_char>"))
310 escchar = token.GetNextToken();
311 else if (cmd == wxT("<mb_cur_min>")) {
312 delete cset;
313 cset = (wxCharacterSet *) NULL;
314 break; // we don't support multibyte charsets ourselves (yet)
315 }
316 else if (cmd == wxT("CHARMAP")) {
317 cset->data = (wchar_t *)calloc(256, sizeof(wchar_t));
318 in_charset = TRUE;
319 }
320 else if (cmd == wxT("END")) {
321 if (token.GetNextToken() == wxT("CHARMAP"))
322 in_charset = FALSE;
323 }
324 else if (in_charset) {
325 // format: <NUL> /x00 <U0000> NULL (NUL)
326 // <A> /x41 <U0041> LATIN CAPITAL LETTER A
327 wxString hex = token.GetNextToken();
328 // skip whitespace (why doesn't wxStringTokenizer do this?)
329 while (wxIsEmpty(hex) && token.HasMoreTokens()) hex = token.GetNextToken();
330 wxString uni = token.GetNextToken();
331 // skip whitespace again
332 while (wxIsEmpty(uni) && token.HasMoreTokens()) uni = token.GetNextToken();
333 if ((hex.Len() > 2) && (wxString(hex.GetChar(0)) == escchar) && (hex.GetChar(1) == wxT('x')) &&
334 (uni.Left(2) == wxT("<U"))) {
335 hex.MakeUpper(); uni.MakeUpper();
336 int pos = ::wxHexToDec(hex.Mid(2,2));
337 if (pos>=0) {
338 unsigned long uni1 = ::wxHexToDec(uni.Mid(2,2));
339 unsigned long uni2 = ::wxHexToDec(uni.Mid(4,2));
340 cset->data[pos] = (uni1 << 16) | uni2;
341 // wxFprintf(stderr,wxT("char %02x mapped to %04x (%c)\n"),pos,cset->data[pos],cset->data[pos]);
342 }
343 }
344 }
345 }
346 if (cset) {
347 cset->names.Shrink();
348 wxCharsets.Add(cset);
349 }
350 }
351 }
352 #endif
353 wxCharsets.Shrink();
354 }
355
356 static wxCharacterSet *wxFindCharacterSet(const wxChar *charset)
357 {
358 if (!charset) return (wxCharacterSet *)NULL;
359 wxLoadCharacterSets();
360 for (size_t n=0; n<wxCharsets.GetCount(); n++)
361 if (wxCharsets[n].names.Index(charset) != wxNOT_FOUND)
362 return &(wxCharsets[n]);
363 return (wxCharacterSet *)NULL;
364 }
365
366 WXDLLEXPORT_DATA(wxCSConv) wxConvLocal((const wxChar *)NULL);
367
368 wxCSConv::wxCSConv(const wxChar *charset)
369 {
370 m_name = (wxChar *) NULL;
371 m_cset = (wxCharacterSet *) NULL;
372 m_deferred = TRUE;
373 SetName(charset);
374 }
375
376 wxCSConv::~wxCSConv()
377 {
378 if (m_name) free(m_name);
379 }
380
381 void wxCSConv::SetName(const wxChar *charset)
382 {
383 if (charset) {
384 #ifdef __UNIX__
385 // first, convert the character set name to standard form
386 wxString codeset;
387 if (wxString(charset,3).CmpNoCase(wxT("ISO")) == 0) {
388 // make sure it's represented in the standard form: ISO_8859-1
389 codeset = wxT("ISO_");
390 charset += 3;
391 if ((*charset == wxT('-')) || (*charset == wxT('_'))) charset++;
392 if (wxStrlen(charset)>4) {
393 if (wxString(charset,4) == wxT("8859")) {
394 codeset << wxT("8859-");
395 if (*charset == wxT('-')) charset++;
396 }
397 }
398 }
399 codeset << charset;
400 codeset.MakeUpper();
401 m_name = wxStrdup(codeset.c_str());
402 m_deferred = TRUE;
403 #endif
404 }
405 }
406
407 void wxCSConv::LoadNow()
408 {
409 // wxPrintf(wxT("Conversion request\n"));
410 if (m_deferred) {
411 if (!m_name) {
412 #ifdef __UNIX__
413 wxChar *lang = wxGetenv(wxT("LANG"));
414 wxChar *dot = lang ? wxStrchr(lang, wxT('.')) : (wxChar *)NULL;
415 if (dot) SetName(dot+1);
416 #endif
417 }
418 m_cset = wxFindCharacterSet(m_name);
419 m_deferred = FALSE;
420 }
421 }
422
423 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
424 {
425 ((wxCSConv *)this)->LoadNow(); // discard constness
426 if (buf) {
427 if (m_cset) {
428 for (size_t c=0; c<n; c++)
429 buf[c] = m_cset->data[(unsigned char)(psz[c])];
430 } else {
431 // latin-1 (direct)
432 for (size_t c=0; c<n; c++)
433 buf[c] = (unsigned char)(psz[c]);
434 }
435 return n;
436 }
437 return strlen(psz);
438 }
439
440 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
441 {
442 ((wxCSConv *)this)->LoadNow(); // discard constness
443 if (buf) {
444 if (m_cset) {
445 for (size_t c=0; c<n; c++) {
446 size_t n;
447 for (n=0; (n<256) && (m_cset->data[n] != psz[c]); n++);
448 buf[c] = (n>0xff) ? '?' : n;
449 }
450 } else {
451 // latin-1 (direct)
452 for (size_t c=0; c<n; c++)
453 buf[c] = (psz[c]>0xff) ? '?' : psz[c];
454 }
455 return n;
456 }
457 return wcslen(psz);
458 }
459
460 #endif
461 //wxUSE_WCHAR_T
462
463