added new To/FromWChar() API with more reasonable semantics than old MB2WC/WC2MB...
[wxWidgets.git] / include / wx / strconv.h
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.h
3 // Purpose: conversion routines for char sets and Unicode
4 // Author: Robert Roebling, Ove Kaaven
5 // Modified by:
6 // Created: 29/01/98
7 // RCS-ID: $Id$
8 // Copyright: (c) 1998 Ove Kaaven, Robert Roebling, Vadim Zeitlin
9 // Licence: wxWindows licence
10 ///////////////////////////////////////////////////////////////////////////////
11
12 #ifndef _WX_WXSTRCONVH__
13 #define _WX_WXSTRCONVH__
14
15 #include "wx/defs.h"
16 #include "wx/wxchar.h"
17 #include "wx/buffer.h"
18
19 #ifdef __DIGITALMARS__
20 #include "typeinfo.h"
21 #endif
22
23 #if defined(__VISAGECPP__) && __IBMCPP__ >= 400
24 # undef __BSEXCPT__
25 #endif
26
27 #include <stdlib.h>
28
29 #if wxUSE_WCHAR_T
30
31 // the error value returned by wxMBConv methods
32 #define wxCONV_FAILED ((size_t)-1)
33
34 // ----------------------------------------------------------------------------
35 // wxMBConv (abstract base class for conversions)
36 // ----------------------------------------------------------------------------
37
// Base interface shared by all multibyte <-> wide character converters.
// Derived classes must implement the pure virtual MB2WC()/WC2MB() pair;
// ToWChar()/FromWChar() and GetMBNulLen() are virtual and may be overridden.
38 class WXDLLIMPEXP_BASE wxMBConv
39 {
40 public:
41 // The functions doing actual conversion from/to narrow to/from wide
42 // character strings.
43 //
44 // On success, the return value is the length (i.e. the number of
45 // characters, not bytes) of the converted string including any trailing
46 // L'\0' or (possibly multiple) '\0'(s). If the conversion fails or if
47 // there is not enough space for everything, including the trailing NUL
48 // character(s), in the output buffer, wxCONV_FAILED is returned.
49 //
50 // In the special case when dstLen is 0 (dst may be NULL then) the
51 // return value is the length of the needed buffer but nothing happens
52 // otherwise. If srcLen is -1, the entire string, up to and including the
53 // trailing NUL(s), is converted, otherwise exactly srcLen bytes are.
54 //
55 // Typical usage:
56 //
57 // size_t dstLen = conv.ToWChar(NULL, 0, src);
58 // if ( dstLen == wxCONV_FAILED )
59 // ... handle error ...
60 // wchar_t *wbuf = new wchar_t[dstLen];
61 // conv.ToWChar(wbuf, dstLen, src);
62 //
63 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
64 const char *src, size_t srcLen = -1) const;
65
66 virtual size_t FromWChar(char *dst, size_t dstLen,
67 const wchar_t *src, size_t srcLen = -1) const;
68
69
70 // Convenience functions for translating NUL-terminated strings: returns
71 // the buffer containing the converted string or NULL pointer if the
72 // conversion failed.
73 const wxWCharBuffer cMB2WC(const char *in) const;
74 const wxCharBuffer cWC2MB(const wchar_t *in) const;
75
76 // Convenience functions for converting strings which may contain embedded
77 // NULs and don't have to be NUL-terminated.
78 //
79 // inLen is the length of the buffer including trailing NUL if any: if the
80 // last 4 bytes of the buffer are all NULs, these functions are more
81 // efficient as they avoid copying the string, but otherwise a copy is made
82 // internally which could be quite bad for (very) long strings.
83 //
84 // outLen receives, if not NULL, the length of the converted string or 0 if
85 // the conversion failed (returning 0 and not -1 in this case makes it
86 // difficult to distinguish between failed conversion and empty input but
87 // this is done for backwards compatibility)
88 const wxWCharBuffer
89 cMB2WC(const char *in, size_t inLen, size_t *outLen) const;
90 const wxCharBuffer
91 cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const;
92
93 // convenience functions for converting MB or WC to/from the wxWin default
// character type (wchar_t in Unicode builds, char in ANSI ones): whenever
// the source and target types coincide the argument is returned unchanged
94 #if wxUSE_UNICODE
95 const wxWCharBuffer cMB2WX(const char *psz) const { return cMB2WC(psz); }
96 const wxCharBuffer cWX2MB(const wchar_t *psz) const { return cWC2MB(psz); }
97 const wchar_t* cWC2WX(const wchar_t *psz) const { return psz; }
98 const wchar_t* cWX2WC(const wchar_t *psz) const { return psz; }
99 #else // ANSI
100 const char* cMB2WX(const char *psz) const { return psz; }
101 const char* cWX2MB(const char *psz) const { return psz; }
102 const wxCharBuffer cWC2WX(const wchar_t *psz) const { return cWC2MB(psz); }
103 const wxWCharBuffer cWX2WC(const char *psz) const { return cMB2WC(psz); }
104 #endif // Unicode/ANSI
105
106 // this function is used in the implementation of cMB2WC() to distinguish
107 // between the following cases:
108 //
109 // a) var width encoding with strings terminated by a single NUL
110 // (usual multibyte encodings): return 1 in this case
111 // b) fixed width encoding with 2 bytes/char and so terminated by
112 // 2 NULs (UTF-16/UCS-2 and variants): return 2 in this case
113 // c) fixed width encoding with 4 bytes/char and so terminated by
114 // 4 NULs (UTF-32/UCS-4 and variants): return 4 in this case
115 //
116 // anything else is not supported currently and (size_t)-1 should be returned
117 virtual size_t GetMBNulLen() const { return 1; }
118
119 // return the maximal value currently returned by GetMBNulLen() for any
120 // encoding
121 static size_t GetMaxMBNulLen() { return 4 /* for UTF-32 */; }
122
123
124 // The old conversion functions. The existing classes currently mostly
125 // implement these ones but we're in transition to using To/FromWChar()
126 // instead and any new classes should implement just the new functions.
127 // For now, however, we provide default implementation of To/FromWChar() in
128 // this base class in terms of MB2WC/WC2MB() to avoid having to rewrite all
129 // the conversions at once.
130 //
131 // On success, the return value is the length (i.e. the number of
132 // characters, not bytes) not counting the trailing NUL(s) of the converted
133 // string. On failure, (size_t)-1 is returned. In the special case when
134 // out is NULL the return value is the same one but nothing is
135 // written to the buffer.
136 //
137 // Note that outLen is the length of the output buffer, not the length of
138 // the input (which is always supposed to be terminated by one or more
139 // NULs, as appropriate for the encoding)!
140 virtual size_t MB2WC(wchar_t *out, const char *in, size_t outLen) const = 0;
141 virtual size_t WC2MB(char *out, const wchar_t *in, size_t outLen) const = 0;
142
143
144 // virtual dtor for any base class
145 virtual ~wxMBConv();
146 };
147
148 // ----------------------------------------------------------------------------
149 // wxMBConvLibc uses standard mbstowcs() and wcstombs() functions for
150 // conversion (hence it depends on the current locale)
151 // ----------------------------------------------------------------------------
152
// Only the legacy MB2WC()/WC2MB() pair is overridden here; ToWChar() and
// FromWChar() are not redeclared, so the wxMBConv versions are used.
153 class WXDLLIMPEXP_BASE wxMBConvLibc : public wxMBConv
154 {
155 public:
// both return the converted length or (size_t)-1 on failure, as documented
// for wxMBConv::MB2WC()/WC2MB()
156 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
157 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
158 };
159
160 #ifdef __UNIX__
161
162 // ----------------------------------------------------------------------------
163 // wxConvBrokenFileNames is made for Unix in Unicode mode when
164 // files are accidentally written in an encoding which is not
165 // the system encoding. Typically, the system encoding will be
166 // UTF8 but there might be files stored in ISO8859-1 on disk.
167 // ----------------------------------------------------------------------------
168
169 class WXDLLIMPEXP_BASE wxConvBrokenFileNames : public wxMBConv
170 {
171 public:
172 wxConvBrokenFileNames(const wxChar *charset);
173 virtual ~wxConvBrokenFileNames() { delete m_conv; }
174
175 virtual size_t MB2WC(wchar_t *out, const char *in, size_t outLen) const
176 {
177 return m_conv->MB2WC(out, in, outLen);
178 }
179
180 virtual size_t WC2MB(char *out, const wchar_t *in, size_t outLen) const
181 {
182 return m_conv->WC2MB(out, in, outLen);
183 }
184
185 virtual size_t GetMBNulLen() const
186 {
187 // cast needed to call a private function
188 return m_conv->GetMBNulLen();
189 }
190
191 private:
192 // the conversion object we forward to
193 wxMBConv *m_conv;
194 };
195
196 #endif // __UNIX__
197
198 // ----------------------------------------------------------------------------
199 // wxMBConvUTF7 (for conversion using UTF7 encoding)
200 // ----------------------------------------------------------------------------
201
// Overrides only the legacy MB2WC()/WC2MB() pair; GetMBNulLen() is inherited
// from wxMBConv and so reports a single terminating NUL.
202 class WXDLLIMPEXP_BASE wxMBConvUTF7 : public wxMBConv
203 {
204 public:
205 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
206 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
207 };
208
209 // ----------------------------------------------------------------------------
210 // wxMBConvUTF8 (for conversion using UTF8 encoding)
211 // ----------------------------------------------------------------------------
212
// UTF-8 converter whose behaviour can be tuned by the options passed to the
// constructor; only the legacy MB2WC()/WC2MB() pair is overridden.
213 class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv
214 {
215 public:
// values accepted by the constructor
// NOTE(review): judging by the names they select how invalid UTF-8 input is
// handled (not mapped / mapped to a private use area / mapped to octal
// escapes) -- confirm against the implementation
216 enum {
217 MAP_INVALID_UTF8_NOT = 0,
218 MAP_INVALID_UTF8_TO_PUA = 1,
219 MAP_INVALID_UTF8_TO_OCTAL = 2
220 };
221
// the options are only stored here, they default to MAP_INVALID_UTF8_NOT
222 wxMBConvUTF8(int options = MAP_INVALID_UTF8_NOT) : m_options(options) { }
223 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
224 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
225
226 private:
// the value passed to the constructor
227 int m_options;
228 };
229
230 // ----------------------------------------------------------------------------
231 // wxMBConvUTF16Base: for both LE and BE variants
232 // ----------------------------------------------------------------------------
233
// Still abstract: MB2WC()/WC2MB() are left to the LE/BE classes, this base
// only supplies what both byte orders have in common.
234 class WXDLLIMPEXP_BASE wxMBConvUTF16Base : public wxMBConv
235 {
236 public:
// strings in a 2 bytes/char encoding are terminated by 2 NUL bytes
237 virtual size_t GetMBNulLen() const { return 2; }
238 };
239
240 // ----------------------------------------------------------------------------
241 // wxMBConvUTF16LE (for conversion using UTF16 Little Endian encoding)
242 // ----------------------------------------------------------------------------
243
// Implements the legacy conversion pair for the little endian variant;
// GetMBNulLen() (2) comes from wxMBConvUTF16Base.
244 class WXDLLIMPEXP_BASE wxMBConvUTF16LE : public wxMBConvUTF16Base
245 {
246 public:
247 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
248 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
249 };
250
251 // ----------------------------------------------------------------------------
252 // wxMBConvUTF16BE (for conversion using UTF16 Big Endian encoding)
253 // ----------------------------------------------------------------------------
254
// Implements the legacy conversion pair for the big endian variant;
// GetMBNulLen() (2) comes from wxMBConvUTF16Base.
255 class WXDLLIMPEXP_BASE wxMBConvUTF16BE : public wxMBConvUTF16Base
256 {
257 public:
258 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
259 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
260 };
261
262 // ----------------------------------------------------------------------------
263 // wxMBConvUTF32Base: base class for both LE and BE variants
264 // ----------------------------------------------------------------------------
265
// Still abstract: MB2WC()/WC2MB() are left to the LE/BE classes, this base
// only supplies what both byte orders have in common.
266 class WXDLLIMPEXP_BASE wxMBConvUTF32Base : public wxMBConv
267 {
268 public:
// strings in a 4 bytes/char encoding are terminated by 4 NUL bytes
269 virtual size_t GetMBNulLen() const { return 4; }
270 };
271
272 // ----------------------------------------------------------------------------
273 // wxMBConvUTF32LE (for conversion using UTF32 Little Endian encoding)
274 // ----------------------------------------------------------------------------
275
// Implements the legacy conversion pair for the little endian variant;
// GetMBNulLen() (4) comes from wxMBConvUTF32Base.
276 class WXDLLIMPEXP_BASE wxMBConvUTF32LE : public wxMBConvUTF32Base
277 {
278 public:
279 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
280 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
281 };
282
283 // ----------------------------------------------------------------------------
284 // wxMBConvUTF32BE (for conversion using UTF32 Big Endian encoding)
285 // ----------------------------------------------------------------------------
286
// Implements the legacy conversion pair for the big endian variant;
// GetMBNulLen() (4) comes from wxMBConvUTF32Base.
287 class WXDLLIMPEXP_BASE wxMBConvUTF32BE : public wxMBConvUTF32Base
288 {
289 public:
290 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
291 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
292 };
293
294 // ----------------------------------------------------------------------------
295 // wxCSConv (for conversion based on loadable char sets)
296 // ----------------------------------------------------------------------------
297
298 #include "wx/fontenc.h"
299
// Conversion selected either by charset name or by a wxFontEncoding constant.
// Only declarations are visible in this header; the conversion object doing
// the real work (m_convReal) is created on demand by CreateConvIfNeeded().
300 class WXDLLIMPEXP_BASE wxCSConv : public wxMBConv
301 {
302 public:
303 // we can be created either from charset name or from an encoding constant
304 // but we can't have both at once
305 wxCSConv(const wxChar *charset);
306 wxCSConv(wxFontEncoding encoding);
307
// copying and destruction are declared explicitly as the class holds raw
// pointers (m_name, m_convReal)
308 wxCSConv(const wxCSConv& conv);
309 virtual ~wxCSConv();
310
311 wxCSConv& operator=(const wxCSConv& conv);
312
// legacy conversion functions and terminator length, see wxMBConv for the
// meaning of the parameters and of the return values
313 virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
314 virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
315 virtual size_t GetMBNulLen() const;
316
// NOTE(review): not documented in this header; confirm in the implementation
// which part of the state it resets
317 void Clear() ;
318
319 private:
320 // common part of all ctors
321 void Init();
322
323 // creates m_convReal if necessary
324 void CreateConvIfNeeded() const;
325
326 // do create m_convReal (unconditionally)
327 wxMBConv *DoCreate() const;
328
329 // set the name (may be only called when m_name == NULL), makes copy of
330 // charset string
331 void SetName(const wxChar *charset);
332
333
334 // note that we can't use wxString here because of compilation
335 // dependencies: we're included from wx/string.h
336 wxChar *m_name;
337 wxFontEncoding m_encoding;
338
339 // use CreateConvIfNeeded() before accessing m_convReal!
340 wxMBConv *m_convReal;
// NOTE(review): presumably true while the creation of m_convReal is still
// pending -- confirm in the implementation
341 bool m_deferred;
342 };
343
344
345 // ----------------------------------------------------------------------------
346 // declare predefined conversion objects
347 // ----------------------------------------------------------------------------
348
349 // conversion to be used with all standard functions affected by locale, e.g.
350 // strtol(), strftime(), ...
351 extern WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc;
352
353 // conversion ISO-8859-1/UTF-7/UTF-8 <-> wchar_t
354 extern WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1;
355 extern WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7;
356 extern WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8;
357
358 // conversion used for the file names on the systems where they're not Unicode
359 // (basically anything except Windows)
360 //
361 // this is used by all file functions, can be changed by the application
362 //
363 // by default UTF-8 under Mac OS X and wxConvLibc elsewhere (but it's not used
364 // under Windows normally)
365 extern WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName;
366
367 // backwards compatible define: the old name referred to an object, not to a
// pointer, hence the dereference
368 #define wxConvFile (*wxConvFileName)
369
370 // the current conversion object, may be set to any conversion, is used by
371 // default in a couple of places inside wx (initially same as wxConvLibc)
372 extern WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent;
373
374 // NOTE(review): purpose not documented in this header; the name suggests a
// conversion for the charset of the current locale -- confirm. It is used by
// the __WXOSX__ variant of wxFNCONV() to convert wx strings to wchar_t.
375 extern WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal;
376
377
378 // ----------------------------------------------------------------------------
379 // endianness-dependent conversions
380 // ----------------------------------------------------------------------------
381
// wxMBConvUTF16 and wxMBConvUTF32 name the BE variants when WORDS_BIGENDIAN
// is defined and the LE ones otherwise
382 #ifdef WORDS_BIGENDIAN
383 typedef wxMBConvUTF16BE wxMBConvUTF16;
384 typedef wxMBConvUTF32BE wxMBConvUTF32;
385 #else
386 typedef wxMBConvUTF16LE wxMBConvUTF16;
387 typedef wxMBConvUTF32LE wxMBConvUTF32;
388 #endif
389
390 // ----------------------------------------------------------------------------
391 // filename conversion macros
392 // ----------------------------------------------------------------------------
393
394 // filenames are multibyte on Unix and probably widechar on Windows?
// wxMBFILES is 1 if the file names are multibyte (Unix, Borland and Mac
// builds according to the test below) and 0 otherwise
395 #if defined(__UNIX__) || defined(__BORLANDC__) || defined(__WXMAC__ )
396 #define wxMBFILES 1
397 #else
398 #define wxMBFILES 0
399 #endif
400
// wxFNCONV(name) yields the file name in the form expected by the file
// functions:
//  - Unicode build with multibyte file names: convert using wxConvFileName
//  - ANSI build under __WXOSX__ with multibyte file names: go through wchar_t
//    (wxConvLocal) and then to multibyte using wxConvFileName
//  - otherwise: the name is used unchanged
401 #if wxMBFILES && wxUSE_UNICODE
402 #define wxFNCONV(name) wxConvFileName->cWX2MB(name)
403 #define wxFNSTRINGCAST wxMBSTRINGCAST
404 #else
405 #if defined( __WXOSX__ ) && wxMBFILES
406 #define wxFNCONV(name) wxConvFileName->cWC2MB( wxConvLocal.cWX2WC(name) )
407 #else
408 #define wxFNCONV(name) name
409 #endif
410 #define wxFNSTRINGCAST WXSTRINGCAST
411 #endif
412
413 #else // !wxUSE_WCHAR_T
414
415 // ----------------------------------------------------------------------------
416 // stand-ins in absence of wchar_t
417 // ----------------------------------------------------------------------------
418
// Minimal wxMBConv used when wchar_t support is disabled: both helpers
// return their argument unchanged.
419 class WXDLLIMPEXP_BASE wxMBConv
420 {
421 public:
422 const char* cMB2WX(const char *psz) const { return psz; }
423 const char* cWX2MB(const char *psz) const { return psz; }
424 };
425
// wxConvFile is just another name for wxConvLocal in this configuration
426 #define wxConvFile wxConvLocal
427
// all the predefined conversions are plain wxMBConv objects here
428 extern WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
429 wxConvLocal,
430 wxConvISO8859_1,
431 wxConvUTF8;
432 extern WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent;
433
// file names are used unchanged
434 #define wxFNCONV(name) name
435 #define wxFNSTRINGCAST WXSTRINGCAST
436
437 #endif
438 // wxUSE_WCHAR_T
439
440 // ----------------------------------------------------------------------------
441 // macros for the most common conversions
442 // ----------------------------------------------------------------------------
443
444 #if wxUSE_UNICODE
// convert using the current conversion object (wxConvCurrent)
445 #define wxConvertWX2MB(s) wxConvCurrent->cWX2MB(s)
446 #define wxConvertMB2WX(s) wxConvCurrent->cMB2WX(s)
447 #else // ANSI
448 // no conversions to do
449 #define wxConvertWX2MB(s) (s)
450 #define wxConvertMB2WX(s) (s)
451 #endif // Unicode/ANSI
452
453 #endif
454 // _WX_WXSTRCONVH__
455