]> git.saurik.com Git - wxWidgets.git/blame - src/common/strconv.cpp
serbase.cpp added to the list of files
[wxWidgets.git] / src / common / strconv.cpp
CommitLineData
6001e347
RR
1/////////////////////////////////////////////////////////////////////////////
2// Name: strconv.cpp
3// Purpose: Unicode conversion classes
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin
5// Modified by:
6// Created: 29/01/98
7// RCS-ID: $Id$
8// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vadim Zeitlin
9// Licence: wxWindows license
10/////////////////////////////////////////////////////////////////////////////
11
12#ifdef __GNUG__
13 #pragma implementation "strconv.h"
14#endif
15
16// For compilers that support precompilation, includes "wx.h".
17#include "wx/wxprec.h"
18
19#ifdef __BORLANDC__
20 #pragma hdrstop
21#endif
22
23#include <ctype.h>
24#include <string.h>
25#include <stdlib.h>
26
27#ifdef __SALFORDC__
28 #include <clib.h>
29#endif
30
31#include "wx/debug.h"
32#include "wx/strconv.h"
33
34//----------------------------------------------------------------------------
35// wxConvCurrent
36//----------------------------------------------------------------------------
37
38WXDLLEXPORT_DATA(wxMBConv *) wxConvCurrent = &wxConvLibc;
39
40#if !wxUSE_WCHAR_T
41//----------------------------------------------------------------------------
42// stand-ins in absence of wchar_t
43//----------------------------------------------------------------------------
44
45WXDLLEXPORT_DATA(wxMBConv) wxConvLibc, wxConvFile;
46
47#else
48
49//----------------------------------------------------------------------------
50// wxMBConv
51//----------------------------------------------------------------------------
52
53WXDLLEXPORT_DATA(wxMBConv) wxConvLibc;
54
55size_t wxMBConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
56{
57 return wxMB2WC(buf, psz, n);
58}
59
60size_t wxMBConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
61{
62 return wxWC2MB(buf, psz, n);
63}
64
65const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
66{
67 if (psz)
68 {
69 size_t nLen = MB2WC((wchar_t *) NULL, psz, 0);
70 wxWCharBuffer buf(nLen);
71 MB2WC((wchar_t *)(const wchar_t *) buf, psz, nLen);
72 return buf;
73 }
74 else
75 return wxWCharBuffer((wchar_t *) NULL);
76}
77
78const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *psz) const
79{
80 if (psz)
81 {
82 size_t nLen = WC2MB((char *) NULL, psz, 0);
83 wxCharBuffer buf(nLen);
84 WC2MB((char *)(const char *) buf, psz, nLen);
85 return buf;
86 }
87 else
88 return wxCharBuffer((char *) NULL);
89}
90
91//----------------------------------------------------------------------------
92// standard file conversion
93//----------------------------------------------------------------------------
94
95WXDLLEXPORT_DATA(wxMBConvFile) wxConvFile;
96
97// just use the libc conversion for now
98size_t wxMBConvFile::MB2WC(wchar_t *buf, const char *psz, size_t n) const
99{
100 return wxMB2WC(buf, psz, n);
101}
102
103size_t wxMBConvFile::WC2MB(char *buf, const wchar_t *psz, size_t n) const
104{
105 return wxWC2MB(buf, psz, n);
106}
107
108#ifdef __WXGTK12__
109
110//----------------------------------------------------------------------------
111// standard gdk conversion
112//----------------------------------------------------------------------------
113
114WXDLLEXPORT_DATA(wxMBConvGdk) wxConvGdk;
115
116#include <gdk/gdk.h>
117
118size_t wxMBConvGdk::MB2WC(wchar_t *buf, const char *psz, size_t n) const
119{
120 if (buf) {
121 return gdk_mbstowcs((GdkWChar *)buf, psz, n);
122 } else {
123 GdkWChar *nbuf = new GdkWChar[n=strlen(psz)];
124 size_t len = gdk_mbstowcs(nbuf, psz, n);
125 delete [] nbuf;
126 return len;
127 }
128}
129
130size_t wxMBConvGdk::WC2MB(char *buf, const wchar_t *psz, size_t n) const
131{
132 char *mbstr = gdk_wcstombs((GdkWChar *)psz);
133 size_t len = mbstr ? strlen(mbstr) : 0;
134 if (buf) {
135 if (len > n) len = n;
136 memcpy(buf, psz, len);
137 if (len < n) buf[len] = 0;
138 }
139 return len;
140}
141#endif // GTK > 1.0
142
143// ----------------------------------------------------------------------------
144// UTF-7
145// ----------------------------------------------------------------------------
146
147WXDLLEXPORT_DATA(wxMBConvUTF7) wxConvUTF7;
148
149#if 0
150static char utf7_setD[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
151 "abcdefghijklmnopqrstuvwxyz"
152 "0123456789'(),-./:?";
153static char utf7_setO[]="!\"#$%&*;<=>@[]^_`{|}";
154static char utf7_setB[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
155 "abcdefghijklmnopqrstuvwxyz"
156 "0123456789+/";
157#endif
158
159// TODO: write actual implementations of UTF-7 here
160size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf),
161 const char * WXUNUSED(psz),
162 size_t WXUNUSED(n)) const
163{
164 return 0;
165}
166
167size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf),
168 const wchar_t * WXUNUSED(psz),
169 size_t WXUNUSED(n)) const
170{
171 return 0;
172}
173
174//----------------------------------------------------------------------------
175// UTF-8
176//----------------------------------------------------------------------------
177
178WXDLLEXPORT_DATA(wxMBConvUTF8) wxConvUTF8;
179
// utf8_max[i] is the largest value that the UTF-8 converters below encode
// in (or accept from) a sequence of i+1 bytes; the last entry is a
// catch-all upper bound. The table is only ever read, hence const.
static const unsigned long utf8_max[] = {
    0x7f,           // 1 byte
    0x7ff,          // 2 bytes
    0xffff,         // 3 bytes
    0x1fffff,       // 4 bytes
    0x3ffffff,      // 5 bytes
    0x7fffffff,     // 6 bytes
    0xffffffff      // upper bound
};
181
182size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
183{
184 size_t len = 0;
185
186 while (*psz && ((!buf) || (len<n))) {
187 unsigned char cc=*psz++, fc=cc;
188 unsigned cnt;
189 for (cnt=0; fc&0x80; cnt++) fc<<=1;
190 if (!cnt) {
191 // plain ASCII char
192 if (buf) *buf++=cc;
193 len++;
194 } else {
195 cnt--;
196 if (!cnt) {
197 // invalid UTF-8 sequence
198 return (size_t)-1;
199 } else {
200 unsigned ocnt=cnt-1;
201 unsigned long res=cc&(0x3f>>cnt);
202 while (cnt--) {
203 cc = *psz++;
204 if ((cc&0xC0)!=0x80) {
205 // invalid UTF-8 sequence
206 return (size_t)-1;
207 }
208 res=(res<<6)|(cc&0x3f);
209 }
210 if (res<=utf8_max[ocnt]) {
211 // illegal UTF-8 encoding
212 return (size_t)-1;
213 }
214 if (buf) *buf++=res;
215 len++;
216 }
217 }
218 }
219 if (buf && (len<n)) *buf = 0;
220 return len;
221}
222
223size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
224{
225 size_t len = 0;
226
227 while (*psz && ((!buf) || (len<n))) {
228 unsigned long cc=(*psz++)&0x7fffffff;
229 unsigned cnt;
230 for (cnt=0; cc>utf8_max[cnt]; cnt++);
231 if (!cnt) {
232 // plain ASCII char
233 if (buf) *buf++=cc;
234 len++;
235 } else {
236 len+=cnt+1;
237 if (buf) {
238 *buf++=(-128>>cnt)|((cc>>(cnt*6))&(0x3f>>cnt));
239 while (cnt--)
240 *buf++=0x80|((cc>>(cnt*6))&0x3f);
241 }
242 }
243 }
244 if (buf && (len<n)) *buf = 0;
245 return len;
246}
247
248// ----------------------------------------------------------------------------
249// specified character set
250// ----------------------------------------------------------------------------
251
252#ifndef WX_PRECOMP
253 #include "wx/dynarray.h"
254 #include "wx/filefn.h"
255 #include "wx/textfile.h"
256 #include "wx/tokenzr.h"
257 #include "wx/utils.h"
258#endif
259
260class wxCharacterSet
261{
262public:
263 wxArrayString names;
264 wchar_t *data;
265};
266
267WX_DECLARE_OBJARRAY(wxCharacterSet, wxCSArray);
268#include "wx/arrimpl.cpp"
269WX_DEFINE_OBJARRAY(wxCSArray);
270
271static wxCSArray wxCharsets;
272
273static void wxLoadCharacterSets(void)
274{
275 static bool already_loaded = FALSE;
276
277 if (already_loaded) return;
278
279 already_loaded = TRUE;
280#if defined(__UNIX__) && wxUSE_TEXTFILE
281 // search through files in /usr/share/i18n/charmaps
282 wxString fname;
283 for (fname = ::wxFindFirstFile(_T("/usr/share/i18n/charmaps/*"));
284 !fname.IsEmpty();
285 fname = ::wxFindNextFile()) {
286 wxTextFile cmap(fname);
287 if (cmap.Open()) {
288 wxCharacterSet *cset = new wxCharacterSet;
289 wxString comchar,escchar;
290 bool in_charset = FALSE;
291
292 // wxFprintf(stderr,_T("Loaded: %s\n"),fname.c_str());
293
294 wxString line;
295 for (line = cmap.GetFirstLine();
296 !cmap.Eof();
297 line = cmap.GetNextLine()) {
298 // wxFprintf(stderr,_T("line contents: %s\n"),line.c_str());
299 wxStringTokenizer token(line);
300 wxString cmd = token.GetNextToken();
301 if (cmd == comchar) {
302 if (token.GetNextToken() == _T("alias"))
303 cset->names.Add(token.GetNextToken());
304 }
305 else if (cmd == _T("<code_set_name>"))
306 cset->names.Add(token.GetNextToken());
307 else if (cmd == _T("<comment_char>"))
308 comchar = token.GetNextToken();
309 else if (cmd == _T("<escape_char>"))
310 escchar = token.GetNextToken();
311 else if (cmd == _T("<mb_cur_min>")) {
312 delete cset;
313 cset = (wxCharacterSet *) NULL;
314 break; // we don't support multibyte charsets ourselves (yet)
315 }
316 else if (cmd == _T("CHARMAP")) {
317 cset->data = (wchar_t *)calloc(256, sizeof(wchar_t));
318 in_charset = TRUE;
319 }
320 else if (cmd == _T("END")) {
321 if (token.GetNextToken() == _T("CHARMAP"))
322 in_charset = FALSE;
323 }
324 else if (in_charset) {
325 // format: <NUL> /x00 <U0000> NULL (NUL)
326 // <A> /x41 <U0041> LATIN CAPITAL LETTER A
327 wxString hex = token.GetNextToken();
328 // skip whitespace (why doesn't wxStringTokenizer do this?)
329 while (wxIsEmpty(hex) && token.HasMoreTokens()) hex = token.GetNextToken();
330 wxString uni = token.GetNextToken();
331 // skip whitespace again
332 while (wxIsEmpty(uni) && token.HasMoreTokens()) uni = token.GetNextToken();
333 if ((hex.Len() > 2) && (wxString(hex.GetChar(0)) == escchar) && (hex.GetChar(1) == _T('x')) &&
334 (uni.Left(2) == _T("<U"))) {
335 hex.MakeUpper(); uni.MakeUpper();
336 int pos = ::wxHexToDec(hex.Mid(2,2));
337 if (pos>=0) {
338 unsigned long uni1 = ::wxHexToDec(uni.Mid(2,2));
339 unsigned long uni2 = ::wxHexToDec(uni.Mid(4,2));
340 cset->data[pos] = (uni1 << 16) | uni2;
341 // wxFprintf(stderr,_T("char %02x mapped to %04x (%c)\n"),pos,cset->data[pos],cset->data[pos]);
342 }
343 }
344 }
345 }
346 if (cset) {
347 cset->names.Shrink();
348 wxCharsets.Add(cset);
349 }
350 }
351 }
352#endif
353 wxCharsets.Shrink();
354}
355
356static wxCharacterSet *wxFindCharacterSet(const wxChar *charset)
357{
358 if (!charset) return (wxCharacterSet *)NULL;
359 wxLoadCharacterSets();
360 for (size_t n=0; n<wxCharsets.GetCount(); n++)
361 if (wxCharsets[n].names.Index(charset) != wxNOT_FOUND)
362 return &(wxCharsets[n]);
363 return (wxCharacterSet *)NULL;
364}
365
366WXDLLEXPORT_DATA(wxCSConv) wxConvLocal((const wxChar *)NULL);
367
368wxCSConv::wxCSConv(const wxChar *charset)
369{
370 m_name = (wxChar *) NULL;
371 m_cset = (wxCharacterSet *) NULL;
372 m_deferred = TRUE;
373 SetName(charset);
374}
375
376wxCSConv::~wxCSConv()
377{
378 if (m_name) free(m_name);
379}
380
381void wxCSConv::SetName(const wxChar *charset)
382{
383 if (charset) {
384#ifdef __UNIX__
385 // first, convert the character set name to standard form
386 wxString codeset;
387 if (wxString(charset,3).CmpNoCase(_T("ISO")) == 0) {
388 // make sure it's represented in the standard form: ISO_8859-1
389 codeset = _T("ISO_");
390 charset += 3;
391 if ((*charset == _T('-')) || (*charset == _T('_'))) charset++;
392 if (wxStrlen(charset)>4) {
393 if (wxString(charset,4) == _T("8859")) {
394 codeset << _T("8859-");
395 if (*charset == _T('-')) charset++;
396 }
397 }
398 }
399 codeset << charset;
400 codeset.MakeUpper();
401 m_name = wxStrdup(codeset.c_str());
402 m_deferred = TRUE;
403#endif
404 }
405}
406
407void wxCSConv::LoadNow()
408{
409// wxPrintf(_T("Conversion request\n"));
410 if (m_deferred) {
411 if (!m_name) {
412#ifdef __UNIX__
413 wxChar *lang = wxGetenv(_T("LANG"));
414 wxChar *dot = lang ? wxStrchr(lang, _T('.')) : (wxChar *)NULL;
415 if (dot) SetName(dot+1);
416#endif
417 }
418 m_cset = wxFindCharacterSet(m_name);
419 m_deferred = FALSE;
420 }
421}
422
423size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
424{
425 ((wxCSConv *)this)->LoadNow(); // discard constness
426 if (buf) {
427 if (m_cset) {
428 for (size_t c=0; c<n; c++)
429 buf[c] = m_cset->data[(unsigned char)(psz[c])];
430 } else {
431 // latin-1 (direct)
432 for (size_t c=0; c<n; c++)
433 buf[c] = (unsigned char)(psz[c]);
434 }
435 return n;
436 }
437 return strlen(psz);
438}
439
440size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
441{
442 ((wxCSConv *)this)->LoadNow(); // discard constness
443 if (buf) {
444 if (m_cset) {
445 for (size_t c=0; c<n; c++) {
446 size_t n;
447 for (n=0; (n<256) && (m_cset->data[n] != psz[c]); n++);
448 buf[c] = (n>0xff) ? '?' : n;
449 }
450 } else {
451 // latin-1 (direct)
452 for (size_t c=0; c<n; c++)
453 buf[c] = (psz[c]>0xff) ? '?' : psz[c];
454 }
455 return n;
456 }
457 return wcslen(psz);
458}
459
460#endif
461 //wxUSE_WCHAR_T
462
463