]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
better docs for Get/SetLabel methods
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV wxT("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
// Helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL and false if all of them are
// (which is trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complication come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
227 for ( ;; )
228 {
229 // try to convert the current chunk
230 size_t lenChunk = MB2WC(NULL, src, 0);
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
233
234 dstWritten += lenChunk;
235 if ( !srcEnd )
236 dstWritten++;
237
238 if ( !lenChunk )
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
254 if ( !srcEnd )
255 dst++;
256 }
257
258 if ( !srcEnd )
259 {
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow (and we don't count the trailing NUL in this case)
262 break;
263 }
264
265 // advance the input pointer past the end of this chunk: notice that we
266 // will always stop before srcEnd because we know that the chunk is
267 // always properly NUL-terminated
268 while ( NotAllNULs(src, nulLen) )
269 {
270 // notice that we must skip over multiple bytes here as we suppose
271 // that if NUL takes 2 or 4 bytes, then all the other characters do
272 // too and so if advanced by a single byte we might erroneously
273 // detect sequences of NUL bytes in the middle of the input
274 src += nulLen;
275 }
276
277 // if the buffer ends before this NUL, we shouldn't count it in our
278 // output so skip the code below
279 if ( src == srcEnd )
280 break;
281
282 // do count this terminator as it's inside the buffer we convert
283 dstWritten++;
284 if ( dst )
285 dst++;
286
287 src += nulLen; // skip the terminator itself
288
289 if ( src >= srcEnd )
290 break;
291 }
292
293 return dstWritten;
294 }
295
296 size_t
297 wxMBConv::FromWChar(char *dst, size_t dstLen,
298 const wchar_t *src, size_t srcLen) const
299 {
300 // the number of chars [which would be] written to dst [if it were not NULL]
301 size_t dstWritten = 0;
302
303 // if we don't know its length we have no choice but to assume that it is
304 // NUL-terminated (notice that it can still be NUL-terminated even if
305 // explicit length is given but it doesn't change our return value)
306 const bool isNulTerminated = srcLen == wxNO_LEN;
307
308 // make a copy of the input string unless it is already properly
309 // NUL-terminated
310 wxWCharBuffer bufTmp;
311 if ( isNulTerminated )
312 {
313 srcLen = wxWcslen(src) + 1;
314 }
315 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
316 {
317 // make a copy in order to properly NUL-terminate the string
318 bufTmp = wxWCharBuffer(srcLen);
319 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
320 src = bufTmp;
321 }
322
323 const size_t lenNul = GetMBNulLen();
324 for ( const wchar_t * const srcEnd = src + srcLen;
325 src < srcEnd;
326 src++ /* skip L'\0' too */ )
327 {
328 // try to convert the current chunk
329 size_t lenChunk = WC2MB(NULL, src, 0);
330 if ( lenChunk == wxCONV_FAILED )
331 return wxCONV_FAILED;
332
333 dstWritten += lenChunk;
334
335 const wchar_t * const
336 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
337
338 // our return value accounts for the trailing NUL(s), unlike that of
339 // WC2MB(), however don't do it for the last NUL we artificially added
340 // ourselves above
341 if ( chunkEnd < srcEnd )
342 dstWritten += lenNul;
343
344 if ( dst )
345 {
346 if ( dstWritten > dstLen )
347 return wxCONV_FAILED;
348
349 // if we know that there is enough space in the destination buffer
350 // (because we accounted for lenNul in dstWritten above), we can
351 // convert directly in place -- but otherwise we need another
352 // temporary buffer to ensure that we don't overwrite the output
353 wxCharBuffer dstBuf;
354 char *dstTmp;
355 if ( chunkEnd == srcEnd )
356 {
357 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
358 dstTmp = dstBuf.data();
359 }
360 else
361 {
362 dstTmp = dst;
363 }
364
365 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
366 return wxCONV_FAILED;
367
368 if ( dstTmp != dst )
369 {
370 // copy everything up to but excluding the terminating NUL(s)
371 // into the real output buffer
372 memcpy(dst, dstTmp, lenChunk);
373
374 // micro-optimization: if dstTmp != dst it means that chunkEnd
375 // == srcEnd and so we're done, no need to update anything below
376 break;
377 }
378
379 dst += lenChunk;
380 if ( chunkEnd < srcEnd )
381 dst += lenNul;
382 }
383
384 src = chunkEnd;
385 }
386
387 return dstWritten;
388 }
389
390 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
391 {
392 size_t rc = ToWChar(outBuff, outLen, inBuff);
393 if ( rc != wxCONV_FAILED )
394 {
395 // ToWChar() returns the buffer length, i.e. including the trailing
396 // NUL, while this method doesn't take it into account
397 rc--;
398 }
399
400 return rc;
401 }
402
403 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
404 {
405 size_t rc = FromWChar(outBuff, outLen, inBuff);
406 if ( rc != wxCONV_FAILED )
407 {
408 rc -= GetMBNulLen();
409 }
410
411 return rc;
412 }
413
414 wxMBConv::~wxMBConv()
415 {
416 // nothing to do here (necessary for Darwin linking probably)
417 }
418
419 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
420 {
421 if ( psz )
422 {
423 // calculate the length of the buffer needed first
424 const size_t nLen = ToWChar(NULL, 0, psz);
425 if ( nLen != wxCONV_FAILED )
426 {
427 // now do the actual conversion
428 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
429
430 // +1 for the trailing NULL
431 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
432 return buf;
433 }
434 }
435
436 return wxWCharBuffer();
437 }
438
439 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
440 {
441 if ( pwz )
442 {
443 const size_t nLen = FromWChar(NULL, 0, pwz);
444 if ( nLen != wxCONV_FAILED )
445 {
446 wxCharBuffer buf(nLen - 1);
447 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
448 return buf;
449 }
450 }
451
452 return wxCharBuffer();
453 }
454
455 const wxWCharBuffer
456 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
457 {
458 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
459 if ( dstLen != wxCONV_FAILED )
460 {
461 // notice that we allocate space for dstLen+1 wide characters here
462 // because we want the buffer to always be NUL-terminated, even if the
463 // input isn't (as otherwise the caller has no way to know its length)
464 wxWCharBuffer wbuf(dstLen);
465 wbuf.data()[dstLen] = L'\0';
466 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
467 {
468 if ( outLen )
469 {
470 *outLen = dstLen;
471
472 // we also need to handle NUL-terminated input strings
473 // specially: for them the output is the length of the string
474 // excluding the trailing NUL, however if we're asked to
475 // convert a specific number of characters we return the length
476 // of the resulting output even if it's NUL-terminated
477 if ( inLen == wxNO_LEN )
478 (*outLen)--;
479 }
480
481 return wbuf;
482 }
483 }
484
485 if ( outLen )
486 *outLen = 0;
487
488 return wxWCharBuffer();
489 }
490
491 const wxCharBuffer
492 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
493 {
494 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
495 if ( dstLen != wxCONV_FAILED )
496 {
497 const size_t nulLen = GetMBNulLen();
498
499 // as above, ensure that the buffer is always NUL-terminated, even if
500 // the input is not
501 wxCharBuffer buf(dstLen + nulLen - 1);
502 memset(buf.data() + dstLen, 0, nulLen);
503 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
504 {
505 if ( outLen )
506 {
507 *outLen = dstLen;
508
509 if ( inLen == wxNO_LEN )
510 {
511 // in this case both input and output are NUL-terminated
512 // and we're not supposed to count NUL
513 *outLen -= nulLen;
514 }
515 }
516
517 return buf;
518 }
519 }
520
521 if ( outLen )
522 *outLen = 0;
523
524 return wxCharBuffer();
525 }
526
527 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
528 {
529 const size_t srcLen = buf.length();
530 if ( srcLen )
531 {
532 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
533 if ( dstLen != wxCONV_FAILED )
534 {
535 wxWCharBuffer wbuf(dstLen);
536 wbuf.data()[dstLen] = L'\0';
537 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
538 return wbuf;
539 }
540 }
541
542 return wxWCharBuffer();
543 }
544
545 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
546 {
547 const size_t srcLen = wbuf.length();
548 if ( srcLen )
549 {
550 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
551 if ( dstLen != wxCONV_FAILED )
552 {
553 wxCharBuffer buf(dstLen);
554 buf.data()[dstLen] = '\0';
555 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
556 return buf;
557 }
558 }
559
560 return wxCharBuffer();
561 }
562
563 // ----------------------------------------------------------------------------
564 // wxMBConvLibc
565 // ----------------------------------------------------------------------------
566
567 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
568 {
569 return wxMB2WC(buf, psz, n);
570 }
571
572 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
573 {
574 return wxWC2MB(buf, psz, n);
575 }
576
577 // ----------------------------------------------------------------------------
578 // wxConvBrokenFileNames
579 // ----------------------------------------------------------------------------
580
581 #ifdef __UNIX__
582
583 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
584 {
585 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
586 wxStricmp(charset, wxT("UTF8")) == 0 )
587 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
588 else
589 m_conv = new wxCSConv(charset);
590 }
591
592 #endif // __UNIX__
593
594 // ----------------------------------------------------------------------------
595 // UTF-7
596 // ----------------------------------------------------------------------------
597
598 // Implementation (C) 2004 Fredrik Roubert
599 //
600 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
601
//
// BASE64 decoding table: maps each byte to the 6 bit value of the
// corresponding base64 digit or to 0xff if the byte is not a base64 digit
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: none of these bytes is a base64 digit
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
640
641 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
642 const char *src, size_t srcLen) const
643 {
644 DecoderState stateOrig,
645 *statePtr;
646 if ( srcLen == wxNO_LEN )
647 {
648 // convert the entire string, up to and including the trailing NUL
649 srcLen = strlen(src) + 1;
650
651 // when working on the entire strings we don't update nor use the shift
652 // state from the previous call
653 statePtr = &stateOrig;
654 }
655 else // when working with partial strings we do use the shift state
656 {
657 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
658
659 // also save the old state to be able to rollback to it on error
660 stateOrig = m_stateDecoder;
661 }
662
663 // but to simplify the code below we use this variable in both cases
664 DecoderState& state = *statePtr;
665
666
667 // number of characters [which would have been] written to dst [if it were
668 // not NULL]
669 size_t len = 0;
670
671 const char * const srcEnd = src + srcLen;
672
673 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
674 {
675 const unsigned char cc = *src++;
676
677 if ( state.IsShifted() )
678 {
679 const unsigned char dc = utf7unb64[cc];
680 if ( dc == 0xff )
681 {
682 // end of encoded part, check that nothing was left: there can
683 // be up to 4 bits of 0 padding but nothing else (we also need
684 // to check isLSB as we count bits modulo 8 while a valid UTF-7
685 // encoded sequence must contain an integral number of UTF-16
686 // characters)
687 if ( state.isLSB || state.bit > 4 ||
688 (state.accum & ((1 << state.bit) - 1)) )
689 {
690 if ( !len )
691 state = stateOrig;
692
693 return wxCONV_FAILED;
694 }
695
696 state.ToDirect();
697
698 // re-parse this character normally below unless it's '-' which
699 // is consumed by the decoder
700 if ( cc == '-' )
701 continue;
702 }
703 else // valid encoded character
704 {
705 // mini base64 decoder: each character is 6 bits
706 state.bit += 6;
707 state.accum <<= 6;
708 state.accum += dc;
709
710 if ( state.bit >= 8 )
711 {
712 // got the full byte, consume it
713 state.bit -= 8;
714 unsigned char b = (state.accum >> state.bit) & 0x00ff;
715
716 if ( state.isLSB )
717 {
718 // we've got the full word, output it
719 if ( dst )
720 *dst++ = (state.msb << 8) | b;
721 len++;
722 state.isLSB = false;
723 }
724 else // MSB
725 {
726 // just store it while we wait for LSB
727 state.msb = b;
728 state.isLSB = true;
729 }
730 }
731 }
732 }
733
734 if ( state.IsDirect() )
735 {
736 // start of an encoded segment?
737 if ( cc == '+' )
738 {
739 if ( *src == '-' )
740 {
741 // just the encoded plus sign, don't switch to shifted mode
742 if ( dst )
743 *dst++ = '+';
744 len++;
745 src++;
746 }
747 else if ( utf7unb64[(unsigned)*src] == 0xff )
748 {
749 // empty encoded chunks are not allowed
750 if ( !len )
751 state = stateOrig;
752
753 return wxCONV_FAILED;
754 }
755 else // base-64 encoded chunk follows
756 {
757 state.ToShifted();
758 }
759 }
760 else // not '+'
761 {
762 // only printable 7 bit ASCII characters (with the exception of
763 // NUL, TAB, CR and LF) can be used directly
764 if ( cc >= 0x7f || (cc < ' ' &&
765 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
766 return wxCONV_FAILED;
767
768 if ( dst )
769 *dst++ = cc;
770 len++;
771 }
772 }
773 }
774
775 if ( !len )
776 {
777 // as we didn't read any characters we should be called with the same
778 // data (followed by some more new data) again later so don't save our
779 // state
780 state = stateOrig;
781
782 return wxCONV_FAILED;
783 }
784
785 return len;
786 }
787
//
// BASE64 encoding table: maps each 6 bit value to its base64 digit
//
static const unsigned char utf7enb64[] =
{
    // 0..15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16..31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32..47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48..63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
802
//
// UTF-7 encoding table, indexed by 7 bit ASCII character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 0x70..0x7f
};

// Returns true if the character can be written as is in UTF-7 output, i.e. if
// it is a 7 bit character belonging to the set D in the table above: the
// characters of all the other classes are never encoded directly.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t may be a signed type, so do the range check on an unsigned
    // value to ensure that a negative character can never be used as the
    // table index
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
827
828 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
829 const wchar_t *src, size_t srcLen) const
830 {
831 EncoderState stateOrig,
832 *statePtr;
833 if ( srcLen == wxNO_LEN )
834 {
835 // we don't apply the stored state when operating on entire strings at
836 // once
837 statePtr = &stateOrig;
838
839 srcLen = wxWcslen(src) + 1;
840 }
841 else // do use the mode we left the output in previously
842 {
843 stateOrig = m_stateEncoder;
844 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
845 }
846
847 EncoderState& state = *statePtr;
848
849
850 size_t len = 0;
851
852 const wchar_t * const srcEnd = src + srcLen;
853 while ( src < srcEnd && (!dst || len < dstLen) )
854 {
855 wchar_t cc = *src++;
856 if ( wxIsUTF7Direct(cc) )
857 {
858 if ( state.IsShifted() )
859 {
860 // pad with zeros the last encoded block if necessary
861 if ( state.bit )
862 {
863 if ( dst )
864 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
865 len++;
866 }
867
868 state.ToDirect();
869
870 if ( dst )
871 *dst++ = '-';
872 len++;
873 }
874
875 if ( dst )
876 *dst++ = (char)cc;
877 len++;
878 }
879 else if ( cc == '+' && state.IsDirect() )
880 {
881 if ( dst )
882 {
883 *dst++ = '+';
884 *dst++ = '-';
885 }
886
887 len += 2;
888 }
889 #ifndef WC_UTF16
890 else if (((wxUint32)cc) > 0xffff)
891 {
892 // no surrogate pair generation (yet?)
893 return wxCONV_FAILED;
894 }
895 #endif
896 else
897 {
898 if ( state.IsDirect() )
899 {
900 state.ToShifted();
901
902 if ( dst )
903 *dst++ = '+';
904 len++;
905 }
906
907 // BASE64 encode string
908 for ( ;; )
909 {
910 for ( unsigned lsb = 0; lsb < 2; lsb++ )
911 {
912 state.accum <<= 8;
913 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
914
915 for (state.bit += 8; state.bit >= 6; )
916 {
917 state.bit -= 6;
918 if ( dst )
919 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
920 len++;
921 }
922 }
923
924 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
925 break;
926
927 src++;
928 }
929 }
930 }
931
932 // we need to restore the original encoder state if we were called just to
933 // calculate the amount of space needed as we will presumably be called
934 // again to really convert the data now
935 if ( !dst )
936 state = stateOrig;
937
938 return len;
939 }
940
941 // ----------------------------------------------------------------------------
942 // UTF-8
943 // ----------------------------------------------------------------------------
944
945 static const wxUint32 utf8_max[]=
946 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
947
948 // boundaries of the private use area we use to (temporarily) remap invalid
949 // characters invalid in a UTF-8 encoded string
950 const wxUint32 wxUnicodePUA = 0x100000;
951 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
952
// length of the UTF-8 sequence starting with the given byte or 0 if the byte
// can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid too, C2..CF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: two-byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: F0..F4 start four-byte sequences, F5..FF are invalid again (5-
    // or 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
987
988 size_t
989 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
990 const char *src, size_t srcLen) const
991 {
992 wchar_t *out = dstLen ? dst : NULL;
993 size_t written = 0;
994
995 if ( srcLen == wxNO_LEN )
996 srcLen = strlen(src) + 1;
997
998 for ( const char *p = src; ; p++ )
999 {
1000 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
1001 {
1002 // all done successfully, just add the trailing NULL if we are not
1003 // using explicit length
1004 if ( srcLen == wxNO_LEN )
1005 {
1006 if ( out )
1007 {
1008 if ( !dstLen )
1009 break;
1010
1011 *out = L'\0';
1012 }
1013
1014 written++;
1015 }
1016
1017 return written;
1018 }
1019
1020 if ( out && !dstLen-- )
1021 break;
1022
1023 wxUint32 code;
1024 unsigned char c = *p;
1025
1026 if ( c < 0x80 )
1027 {
1028 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1029 break;
1030
1031 if ( srcLen != wxNO_LEN )
1032 srcLen--;
1033
1034 code = c;
1035 }
1036 else
1037 {
1038 unsigned len = tableUtf8Lengths[c];
1039 if ( !len )
1040 break;
1041
1042 if ( srcLen < len ) // the test works for wxNO_LEN too
1043 break;
1044
1045 if ( srcLen != wxNO_LEN )
1046 srcLen -= len;
1047
1048 // Char. number range | UTF-8 octet sequence
1049 // (hexadecimal) | (binary)
1050 // ----------------------+----------------------------------------
1051 // 0000 0000 - 0000 007F | 0xxxxxxx
1052 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1053 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1054 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1055 //
1056 // Code point value is stored in bits marked with 'x',
1057 // lowest-order bit of the value on the right side in the diagram
1058 // above. (from RFC 3629)
1059
1060 // mask to extract lead byte's value ('x' bits above), by sequence
1061 // length:
1062 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1063
1064 // mask and value of lead byte's most significant bits, by length:
1065 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1066 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1067
1068 len--; // it's more convenient to work with 0-based length here
1069
1070 // extract the lead byte's value bits:
1071 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1072 break;
1073
1074 code = c & leadValueMask[len];
1075
1076 // all remaining bytes, if any, are handled in the same way
1077 // regardless of sequence's length:
1078 for ( ; len; --len )
1079 {
1080 c = *++p;
1081 if ( (c & 0xC0) != 0x80 )
1082 return wxCONV_FAILED;
1083
1084 code <<= 6;
1085 code |= c & 0x3F;
1086 }
1087 }
1088
1089 #ifdef WC_UTF16
1090 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1091 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1092 {
1093 if ( out )
1094 out++;
1095 written++;
1096 }
1097 #else // !WC_UTF16
1098 if ( out )
1099 *out = code;
1100 #endif // WC_UTF16/!WC_UTF16
1101
1102 if ( out )
1103 out++;
1104
1105 written++;
1106 }
1107
1108 return wxCONV_FAILED;
1109 }
1110
1111 size_t
1112 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1113 const wchar_t *src, size_t srcLen) const
1114 {
1115 char *out = dstLen ? dst : NULL;
1116 size_t written = 0;
1117
1118 for ( const wchar_t *wp = src; ; wp++ )
1119 {
1120 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1121 {
1122 // all done successfully, just add the trailing NULL if we are not
1123 // using explicit length
1124 if ( srcLen == wxNO_LEN )
1125 {
1126 if ( out )
1127 {
1128 if ( !dstLen )
1129 break;
1130
1131 *out = '\0';
1132 }
1133
1134 written++;
1135 }
1136
1137 return written;
1138 }
1139
1140 if ( srcLen != wxNO_LEN )
1141 srcLen--;
1142
1143 wxUint32 code;
1144 #ifdef WC_UTF16
1145 // cast is ok for WC_UTF16
1146 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1147 {
1148 // skip the next char too as we decoded a surrogate
1149 wp++;
1150 }
1151 #else // wchar_t is UTF-32
1152 code = *wp & 0x7fffffff;
1153 #endif
1154
1155 unsigned len;
1156 if ( code <= 0x7F )
1157 {
1158 len = 1;
1159 if ( out )
1160 {
1161 if ( dstLen < len )
1162 break;
1163
1164 out[0] = (char)code;
1165 }
1166 }
1167 else if ( code <= 0x07FF )
1168 {
1169 len = 2;
1170 if ( out )
1171 {
1172 if ( dstLen < len )
1173 break;
1174
1175 // NB: this line takes 6 least significant bits, encodes them as
1176 // 10xxxxxx and discards them so that the next byte can be encoded:
1177 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1178 out[0] = 0xC0 | code;
1179 }
1180 }
1181 else if ( code < 0xFFFF )
1182 {
1183 len = 3;
1184 if ( out )
1185 {
1186 if ( dstLen < len )
1187 break;
1188
1189 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1190 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1191 out[0] = 0xE0 | code;
1192 }
1193 }
1194 else if ( code <= 0x10FFFF )
1195 {
1196 len = 4;
1197 if ( out )
1198 {
1199 if ( dstLen < len )
1200 break;
1201
1202 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1204 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1205 out[0] = 0xF0 | code;
1206 }
1207 }
1208 else
1209 {
1210 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1211 break;
1212 }
1213
1214 if ( out )
1215 {
1216 out += len;
1217 dstLen -= len;
1218 }
1219
1220 written += len;
1221 }
1222
1223 // we only get here if an error occurs during decoding
1224 return wxCONV_FAILED;
1225 }
1226
1227 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1228 const char *psz, size_t srcLen) const
1229 {
1230 if ( m_options == MAP_INVALID_UTF8_NOT )
1231 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1232
1233 size_t len = 0;
1234
1235 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1236 {
1237 const char *opsz = psz;
1238 bool invalid = false;
1239 unsigned char cc = *psz++, fc = cc;
1240 unsigned cnt;
1241 for (cnt = 0; fc & 0x80; cnt++)
1242 fc <<= 1;
1243
1244 if (!cnt)
1245 {
1246 // plain ASCII char
1247 if (buf)
1248 *buf++ = cc;
1249 len++;
1250
1251 // escape the escape character for octal escapes
1252 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1253 && cc == '\\' && (!buf || len < n))
1254 {
1255 if (buf)
1256 *buf++ = cc;
1257 len++;
1258 }
1259 }
1260 else
1261 {
1262 cnt--;
1263 if (!cnt)
1264 {
1265 // invalid UTF-8 sequence
1266 invalid = true;
1267 }
1268 else
1269 {
1270 unsigned ocnt = cnt - 1;
1271 wxUint32 res = cc & (0x3f >> cnt);
1272 while (cnt--)
1273 {
1274 cc = *psz;
1275 if ((cc & 0xC0) != 0x80)
1276 {
1277 // invalid UTF-8 sequence
1278 invalid = true;
1279 break;
1280 }
1281
1282 psz++;
1283 res = (res << 6) | (cc & 0x3f);
1284 }
1285
1286 if (invalid || res <= utf8_max[ocnt])
1287 {
1288 // illegal UTF-8 encoding
1289 invalid = true;
1290 }
1291 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1292 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1293 {
1294 // if one of our PUA characters turns up externally
1295 // it must also be treated as an illegal sequence
1296 // (a bit like you have to escape an escape character)
1297 invalid = true;
1298 }
1299 else
1300 {
1301 #ifdef WC_UTF16
1302 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1303 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1304 if (pa == wxCONV_FAILED)
1305 {
1306 invalid = true;
1307 }
1308 else
1309 {
1310 if (buf)
1311 buf += pa;
1312 len += pa;
1313 }
1314 #else // !WC_UTF16
1315 if (buf)
1316 *buf++ = (wchar_t)res;
1317 len++;
1318 #endif // WC_UTF16/!WC_UTF16
1319 }
1320 }
1321
1322 if (invalid)
1323 {
1324 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1325 {
1326 while (opsz < psz && (!buf || len < n))
1327 {
1328 #ifdef WC_UTF16
1329 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1330 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1331 wxASSERT(pa != wxCONV_FAILED);
1332 if (buf)
1333 buf += pa;
1334 opsz++;
1335 len += pa;
1336 #else
1337 if (buf)
1338 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1339 opsz++;
1340 len++;
1341 #endif
1342 }
1343 }
1344 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1345 {
1346 while (opsz < psz && (!buf || len < n))
1347 {
1348 if ( buf && len + 3 < n )
1349 {
1350 unsigned char on = *opsz;
1351 *buf++ = L'\\';
1352 *buf++ = (wchar_t)( L'0' + on / 0100 );
1353 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1354 *buf++ = (wchar_t)( L'0' + on % 010 );
1355 }
1356
1357 opsz++;
1358 len += 4;
1359 }
1360 }
1361 else // MAP_INVALID_UTF8_NOT
1362 {
1363 return wxCONV_FAILED;
1364 }
1365 }
1366 }
1367 }
1368
1369 if (srcLen == wxNO_LEN && buf && (len < n))
1370 *buf = 0;
1371
1372 return len + 1;
1373 }
1374
// Return true if the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1379
1380 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1381 const wchar_t *psz, size_t srcLen) const
1382 {
1383 if ( m_options == MAP_INVALID_UTF8_NOT )
1384 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1385
1386 size_t len = 0;
1387
1388 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1389 {
1390 wxUint32 cc;
1391
1392 #ifdef WC_UTF16
1393 // cast is ok for WC_UTF16
1394 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1395 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1396 #else
1397 cc = (*psz++) & 0x7fffffff;
1398 #endif
1399
1400 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1401 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1402 {
1403 if (buf)
1404 *buf++ = (char)(cc - wxUnicodePUA);
1405 len++;
1406 }
1407 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1408 && cc == L'\\' && psz[0] == L'\\' )
1409 {
1410 if (buf)
1411 *buf++ = (char)cc;
1412 psz++;
1413 len++;
1414 }
1415 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1416 cc == L'\\' &&
1417 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1418 {
1419 if (buf)
1420 {
1421 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1422 (psz[1] - L'0') * 010 +
1423 (psz[2] - L'0'));
1424 }
1425
1426 psz += 3;
1427 len++;
1428 }
1429 else
1430 {
1431 unsigned cnt;
1432 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1433 {
1434 }
1435
1436 if (!cnt)
1437 {
1438 // plain ASCII char
1439 if (buf)
1440 *buf++ = (char) cc;
1441 len++;
1442 }
1443 else
1444 {
1445 len += cnt + 1;
1446 if (buf)
1447 {
1448 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1449 while (cnt--)
1450 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1451 }
1452 }
1453 }
1454 }
1455
1456 if (srcLen == wxNO_LEN && buf && (len < n))
1457 *buf = 0;
1458
1459 return len + 1;
1460 }
1461
1462 // ============================================================================
1463 // UTF-16
1464 // ============================================================================
1465
1466 #ifdef WORDS_BIGENDIAN
1467 #define wxMBConvUTF16straight wxMBConvUTF16BE
1468 #define wxMBConvUTF16swap wxMBConvUTF16LE
1469 #else
1470 #define wxMBConvUTF16swap wxMBConvUTF16BE
1471 #define wxMBConvUTF16straight wxMBConvUTF16LE
1472 #endif
1473
1474 /* static */
1475 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1476 {
1477 if ( srcLen == wxNO_LEN )
1478 {
1479 // count the number of bytes in input, including the trailing NULs
1480 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1481 for ( srcLen = 1; *inBuff++; srcLen++ )
1482 ;
1483
1484 srcLen *= BYTES_PER_CHAR;
1485 }
1486 else // we already have the length
1487 {
1488 // we can only convert an entire number of UTF-16 characters
1489 if ( srcLen % BYTES_PER_CHAR )
1490 return wxCONV_FAILED;
1491 }
1492
1493 return srcLen;
1494 }
1495
1496 // case when in-memory representation is UTF-16 too
1497 #ifdef WC_UTF16
1498
1499 // ----------------------------------------------------------------------------
1500 // conversions without endianness change
1501 // ----------------------------------------------------------------------------
1502
1503 size_t
1504 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1505 const char *src, size_t srcLen) const
1506 {
1507 // set up the scene for using memcpy() (which is presumably more efficient
1508 // than copying the bytes one by one)
1509 srcLen = GetLength(src, srcLen);
1510 if ( srcLen == wxNO_LEN )
1511 return wxCONV_FAILED;
1512
1513 const size_t inLen = srcLen / BYTES_PER_CHAR;
1514 if ( dst )
1515 {
1516 if ( dstLen < inLen )
1517 return wxCONV_FAILED;
1518
1519 memcpy(dst, src, srcLen);
1520 }
1521
1522 return inLen;
1523 }
1524
1525 size_t
1526 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1527 const wchar_t *src, size_t srcLen) const
1528 {
1529 if ( srcLen == wxNO_LEN )
1530 srcLen = wxWcslen(src) + 1;
1531
1532 srcLen *= BYTES_PER_CHAR;
1533
1534 if ( dst )
1535 {
1536 if ( dstLen < srcLen )
1537 return wxCONV_FAILED;
1538
1539 memcpy(dst, src, srcLen);
1540 }
1541
1542 return srcLen;
1543 }
1544
1545 // ----------------------------------------------------------------------------
1546 // endian-reversing conversions
1547 // ----------------------------------------------------------------------------
1548
1549 size_t
1550 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1551 const char *src, size_t srcLen) const
1552 {
1553 srcLen = GetLength(src, srcLen);
1554 if ( srcLen == wxNO_LEN )
1555 return wxCONV_FAILED;
1556
1557 srcLen /= BYTES_PER_CHAR;
1558
1559 if ( dst )
1560 {
1561 if ( dstLen < srcLen )
1562 return wxCONV_FAILED;
1563
1564 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1565 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1566 {
1567 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1568 }
1569 }
1570
1571 return srcLen;
1572 }
1573
1574 size_t
1575 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1576 const wchar_t *src, size_t srcLen) const
1577 {
1578 if ( srcLen == wxNO_LEN )
1579 srcLen = wxWcslen(src) + 1;
1580
1581 srcLen *= BYTES_PER_CHAR;
1582
1583 if ( dst )
1584 {
1585 if ( dstLen < srcLen )
1586 return wxCONV_FAILED;
1587
1588 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1589 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1590 {
1591 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1592 }
1593 }
1594
1595 return srcLen;
1596 }
1597
1598 #else // !WC_UTF16: wchar_t is UTF-32
1599
1600 // ----------------------------------------------------------------------------
1601 // conversions without endianness change
1602 // ----------------------------------------------------------------------------
1603
1604 size_t
1605 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1606 const char *src, size_t srcLen) const
1607 {
1608 srcLen = GetLength(src, srcLen);
1609 if ( srcLen == wxNO_LEN )
1610 return wxCONV_FAILED;
1611
1612 const size_t inLen = srcLen / BYTES_PER_CHAR;
1613 if ( !dst )
1614 {
1615 // optimization: return maximal space which could be needed for this
1616 // string even if the real size could be smaller if the buffer contains
1617 // any surrogates
1618 return inLen;
1619 }
1620
1621 size_t outLen = 0;
1622 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1623 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1624 {
1625 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1626 if ( !inBuff )
1627 return wxCONV_FAILED;
1628
1629 if ( ++outLen > dstLen )
1630 return wxCONV_FAILED;
1631
1632 *dst++ = ch;
1633 }
1634
1635
1636 return outLen;
1637 }
1638
1639 size_t
1640 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1641 const wchar_t *src, size_t srcLen) const
1642 {
1643 if ( srcLen == wxNO_LEN )
1644 srcLen = wxWcslen(src) + 1;
1645
1646 size_t outLen = 0;
1647 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1648 for ( size_t n = 0; n < srcLen; n++ )
1649 {
1650 wxUint16 cc[2];
1651 const size_t numChars = encode_utf16(*src++, cc);
1652 if ( numChars == wxCONV_FAILED )
1653 return wxCONV_FAILED;
1654
1655 outLen += numChars * BYTES_PER_CHAR;
1656 if ( outBuff )
1657 {
1658 if ( outLen > dstLen )
1659 return wxCONV_FAILED;
1660
1661 *outBuff++ = cc[0];
1662 if ( numChars == 2 )
1663 {
1664 // second character of a surrogate
1665 *outBuff++ = cc[1];
1666 }
1667 }
1668 }
1669
1670 return outLen;
1671 }
1672
1673 // ----------------------------------------------------------------------------
1674 // endian-reversing conversions
1675 // ----------------------------------------------------------------------------
1676
1677 size_t
1678 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1679 const char *src, size_t srcLen) const
1680 {
1681 srcLen = GetLength(src, srcLen);
1682 if ( srcLen == wxNO_LEN )
1683 return wxCONV_FAILED;
1684
1685 const size_t inLen = srcLen / BYTES_PER_CHAR;
1686 if ( !dst )
1687 {
1688 // optimization: return maximal space which could be needed for this
1689 // string even if the real size could be smaller if the buffer contains
1690 // any surrogates
1691 return inLen;
1692 }
1693
1694 size_t outLen = 0;
1695 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1696 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1697 {
1698 wxUint32 ch;
1699 wxUint16 tmp[2];
1700
1701 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1702 inBuff++;
1703 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1704
1705 const size_t numChars = decode_utf16(tmp, ch);
1706 if ( numChars == wxCONV_FAILED )
1707 return wxCONV_FAILED;
1708
1709 if ( numChars == 2 )
1710 inBuff++;
1711
1712 if ( ++outLen > dstLen )
1713 return wxCONV_FAILED;
1714
1715 *dst++ = ch;
1716 }
1717
1718
1719 return outLen;
1720 }
1721
1722 size_t
1723 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1724 const wchar_t *src, size_t srcLen) const
1725 {
1726 if ( srcLen == wxNO_LEN )
1727 srcLen = wxWcslen(src) + 1;
1728
1729 size_t outLen = 0;
1730 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1731 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1732 {
1733 wxUint16 cc[2];
1734 const size_t numChars = encode_utf16(*src, cc);
1735 if ( numChars == wxCONV_FAILED )
1736 return wxCONV_FAILED;
1737
1738 outLen += numChars * BYTES_PER_CHAR;
1739 if ( outBuff )
1740 {
1741 if ( outLen > dstLen )
1742 return wxCONV_FAILED;
1743
1744 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1745 if ( numChars == 2 )
1746 {
1747 // second character of a surrogate
1748 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1749 }
1750 }
1751 }
1752
1753 return outLen;
1754 }
1755
1756 #endif // WC_UTF16/!WC_UTF16
1757
1758
1759 // ============================================================================
1760 // UTF-32
1761 // ============================================================================
1762
1763 #ifdef WORDS_BIGENDIAN
1764 #define wxMBConvUTF32straight wxMBConvUTF32BE
1765 #define wxMBConvUTF32swap wxMBConvUTF32LE
1766 #else
1767 #define wxMBConvUTF32swap wxMBConvUTF32BE
1768 #define wxMBConvUTF32straight wxMBConvUTF32LE
1769 #endif
1770
1771
1772 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1773 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1774
1775 /* static */
1776 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1777 {
1778 if ( srcLen == wxNO_LEN )
1779 {
1780 // count the number of bytes in input, including the trailing NULs
1781 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1782 for ( srcLen = 1; *inBuff++; srcLen++ )
1783 ;
1784
1785 srcLen *= BYTES_PER_CHAR;
1786 }
1787 else // we already have the length
1788 {
1789 // we can only convert an entire number of UTF-32 characters
1790 if ( srcLen % BYTES_PER_CHAR )
1791 return wxCONV_FAILED;
1792 }
1793
1794 return srcLen;
1795 }
1796
1797 // case when in-memory representation is UTF-16
1798 #ifdef WC_UTF16
1799
1800 // ----------------------------------------------------------------------------
1801 // conversions without endianness change
1802 // ----------------------------------------------------------------------------
1803
1804 size_t
1805 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1806 const char *src, size_t srcLen) const
1807 {
1808 srcLen = GetLength(src, srcLen);
1809 if ( srcLen == wxNO_LEN )
1810 return wxCONV_FAILED;
1811
1812 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1813 const size_t inLen = srcLen / BYTES_PER_CHAR;
1814 size_t outLen = 0;
1815 for ( size_t n = 0; n < inLen; n++ )
1816 {
1817 wxUint16 cc[2];
1818 const size_t numChars = encode_utf16(*inBuff++, cc);
1819 if ( numChars == wxCONV_FAILED )
1820 return wxCONV_FAILED;
1821
1822 outLen += numChars;
1823 if ( dst )
1824 {
1825 if ( outLen > dstLen )
1826 return wxCONV_FAILED;
1827
1828 *dst++ = cc[0];
1829 if ( numChars == 2 )
1830 {
1831 // second character of a surrogate
1832 *dst++ = cc[1];
1833 }
1834 }
1835 }
1836
1837 return outLen;
1838 }
1839
1840 size_t
1841 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1842 const wchar_t *src, size_t srcLen) const
1843 {
1844 if ( srcLen == wxNO_LEN )
1845 srcLen = wxWcslen(src) + 1;
1846
1847 if ( !dst )
1848 {
1849 // optimization: return maximal space which could be needed for this
1850 // string instead of the exact amount which could be less if there are
1851 // any surrogates in the input
1852 //
1853 // we consider that surrogates are rare enough to make it worthwhile to
1854 // avoid running the loop below at the cost of slightly extra memory
1855 // consumption
1856 return srcLen * BYTES_PER_CHAR;
1857 }
1858
1859 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1860 size_t outLen = 0;
1861 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1862 {
1863 const wxUint32 ch = wxDecodeSurrogate(&src);
1864 if ( !src )
1865 return wxCONV_FAILED;
1866
1867 outLen += BYTES_PER_CHAR;
1868
1869 if ( outLen > dstLen )
1870 return wxCONV_FAILED;
1871
1872 *outBuff++ = ch;
1873 }
1874
1875 return outLen;
1876 }
1877
1878 // ----------------------------------------------------------------------------
1879 // endian-reversing conversions
1880 // ----------------------------------------------------------------------------
1881
1882 size_t
1883 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1884 const char *src, size_t srcLen) const
1885 {
1886 srcLen = GetLength(src, srcLen);
1887 if ( srcLen == wxNO_LEN )
1888 return wxCONV_FAILED;
1889
1890 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1891 const size_t inLen = srcLen / BYTES_PER_CHAR;
1892 size_t outLen = 0;
1893 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1894 {
1895 wxUint16 cc[2];
1896 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1897 if ( numChars == wxCONV_FAILED )
1898 return wxCONV_FAILED;
1899
1900 outLen += numChars;
1901 if ( dst )
1902 {
1903 if ( outLen > dstLen )
1904 return wxCONV_FAILED;
1905
1906 *dst++ = cc[0];
1907 if ( numChars == 2 )
1908 {
1909 // second character of a surrogate
1910 *dst++ = cc[1];
1911 }
1912 }
1913 }
1914
1915 return outLen;
1916 }
1917
1918 size_t
1919 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1920 const wchar_t *src, size_t srcLen) const
1921 {
1922 if ( srcLen == wxNO_LEN )
1923 srcLen = wxWcslen(src) + 1;
1924
1925 if ( !dst )
1926 {
1927 // optimization: return maximal space which could be needed for this
1928 // string instead of the exact amount which could be less if there are
1929 // any surrogates in the input
1930 //
1931 // we consider that surrogates are rare enough to make it worthwhile to
1932 // avoid running the loop below at the cost of slightly extra memory
1933 // consumption
1934 return srcLen*BYTES_PER_CHAR;
1935 }
1936
1937 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1938 size_t outLen = 0;
1939 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1940 {
1941 const wxUint32 ch = wxDecodeSurrogate(&src);
1942 if ( !src )
1943 return wxCONV_FAILED;
1944
1945 outLen += BYTES_PER_CHAR;
1946
1947 if ( outLen > dstLen )
1948 return wxCONV_FAILED;
1949
1950 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1951 }
1952
1953 return outLen;
1954 }
1955
1956 #else // !WC_UTF16: wchar_t is UTF-32
1957
1958 // ----------------------------------------------------------------------------
1959 // conversions without endianness change
1960 // ----------------------------------------------------------------------------
1961
1962 size_t
1963 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1964 const char *src, size_t srcLen) const
1965 {
1966 // use memcpy() as it should be much faster than hand-written loop
1967 srcLen = GetLength(src, srcLen);
1968 if ( srcLen == wxNO_LEN )
1969 return wxCONV_FAILED;
1970
1971 const size_t inLen = srcLen/BYTES_PER_CHAR;
1972 if ( dst )
1973 {
1974 if ( dstLen < inLen )
1975 return wxCONV_FAILED;
1976
1977 memcpy(dst, src, srcLen);
1978 }
1979
1980 return inLen;
1981 }
1982
1983 size_t
1984 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1985 const wchar_t *src, size_t srcLen) const
1986 {
1987 if ( srcLen == wxNO_LEN )
1988 srcLen = wxWcslen(src) + 1;
1989
1990 srcLen *= BYTES_PER_CHAR;
1991
1992 if ( dst )
1993 {
1994 if ( dstLen < srcLen )
1995 return wxCONV_FAILED;
1996
1997 memcpy(dst, src, srcLen);
1998 }
1999
2000 return srcLen;
2001 }
2002
2003 // ----------------------------------------------------------------------------
2004 // endian-reversing conversions
2005 // ----------------------------------------------------------------------------
2006
2007 size_t
2008 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2009 const char *src, size_t srcLen) const
2010 {
2011 srcLen = GetLength(src, srcLen);
2012 if ( srcLen == wxNO_LEN )
2013 return wxCONV_FAILED;
2014
2015 srcLen /= BYTES_PER_CHAR;
2016
2017 if ( dst )
2018 {
2019 if ( dstLen < srcLen )
2020 return wxCONV_FAILED;
2021
2022 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2023 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2024 {
2025 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2026 }
2027 }
2028
2029 return srcLen;
2030 }
2031
2032 size_t
2033 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2034 const wchar_t *src, size_t srcLen) const
2035 {
2036 if ( srcLen == wxNO_LEN )
2037 srcLen = wxWcslen(src) + 1;
2038
2039 srcLen *= BYTES_PER_CHAR;
2040
2041 if ( dst )
2042 {
2043 if ( dstLen < srcLen )
2044 return wxCONV_FAILED;
2045
2046 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2047 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2048 {
2049 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2050 }
2051 }
2052
2053 return srcLen;
2054 }
2055
2056 #endif // WC_UTF16/!WC_UTF16
2057
2058
2059 // ============================================================================
2060 // The classes doing conversion using the iconv_xxx() functions
2061 // ============================================================================
2062
2063 #ifdef HAVE_ICONV
2064
2065 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2066 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2067 // (unless there's yet another bug in glibc) the only case when iconv()
2068 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2069 // left in the input buffer -- when _real_ error occurs,
2070 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2071 // iconv() failure.
2072 // [This bug does not appear in glibc 2.2.]
2073 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2074 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2075 (errno != E2BIG || bufLeft != 0))
2076 #else
2077 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2078 #endif
2079
2080 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2081
2082 #define ICONV_T_INVALID ((iconv_t)-1)
2083
2084 #if SIZEOF_WCHAR_T == 4
2085 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2086 #define WC_ENC wxFONTENCODING_UTF32
2087 #elif SIZEOF_WCHAR_T == 2
2088 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2089 #define WC_ENC wxFONTENCODING_UTF16
2090 #else // sizeof(wchar_t) != 2 nor 4
2091 // does this ever happen?
2092 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2093 #endif
2094
2095 // ----------------------------------------------------------------------------
2096 // wxMBConv_iconv: encapsulates an iconv character set
2097 // ----------------------------------------------------------------------------
2098
2099 class wxMBConv_iconv : public wxMBConv
2100 {
2101 public:
2102 wxMBConv_iconv(const char *name);
2103 virtual ~wxMBConv_iconv();
2104
2105 // implement base class virtual methods
2106 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2107 const char *src, size_t srcLen = wxNO_LEN) const;
2108 virtual size_t FromWChar(char *dst, size_t dstLen,
2109 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2110 virtual size_t GetMBNulLen() const;
2111
2112 #if wxUSE_UNICODE_UTF8
2113 virtual bool IsUTF8() const;
2114 #endif
2115
2116 virtual wxMBConv *Clone() const
2117 {
2118 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2119 p->m_minMBCharWidth = m_minMBCharWidth;
2120 return p;
2121 }
2122
2123 bool IsOk() const
2124 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2125
2126 protected:
2127 // the iconv handlers used to translate from multibyte
2128 // to wide char and in the other direction
2129 iconv_t m2w,
2130 w2m;
2131
2132 #if wxUSE_THREADS
2133 // guards access to m2w and w2m objects
2134 wxMutex m_iconvMutex;
2135 #endif
2136
2137 private:
2138 // the name (for iconv_open()) of a wide char charset -- if none is
2139 // available on this machine, it will remain NULL
2140 static wxString ms_wcCharsetName;
2141
2142 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2143 // different endian-ness than the native one
2144 static bool ms_wcNeedsSwap;
2145
2146
2147 // name of the encoding handled by this conversion
2148 wxString m_name;
2149
2150 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2151 // initially
2152 size_t m_minMBCharWidth;
2153 };
2154
2155 // make the constructor available for unit testing
2156 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2157 {
2158 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2159 if ( !result->IsOk() )
2160 {
2161 delete result;
2162 return 0;
2163 }
2164
2165 return result;
2166 }
2167
// Definitions of the static members of wxMBConv_iconv: the wide char charset
// name is empty until the constructor finds a usable one, and no byte
// swapping is assumed by default.
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2170
2171 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2172 : m_name(name)
2173 {
2174 m_minMBCharWidth = 0;
2175
2176 // check for charset that represents wchar_t:
2177 if ( ms_wcCharsetName.empty() )
2178 {
2179 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2180
2181 #if wxUSE_FONTMAP
2182 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2183 #else // !wxUSE_FONTMAP
2184 static const wxChar *const names_static[] =
2185 {
2186 #if SIZEOF_WCHAR_T == 4
2187 wxT("UCS-4"),
2188 #elif SIZEOF_WCHAR_T = 2
2189 wxT("UCS-2"),
2190 #endif
2191 NULL
2192 };
2193 const wxChar *const *names = names_static;
2194 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2195
2196 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2197 {
2198 const wxString nameCS(*names);
2199
2200 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2201 wxString nameXE(nameCS);
2202
2203 #ifdef WORDS_BIGENDIAN
2204 nameXE += wxT("BE");
2205 #else // little endian
2206 nameXE += wxT("LE");
2207 #endif
2208
2209 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2210 nameXE.c_str());
2211
2212 m2w = iconv_open(nameXE.ToAscii(), name);
2213 if ( m2w == ICONV_T_INVALID )
2214 {
2215 // try charset w/o bytesex info (e.g. "UCS4")
2216 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2217 nameCS.c_str());
2218 m2w = iconv_open(nameCS.ToAscii(), name);
2219
2220 // and check for bytesex ourselves:
2221 if ( m2w != ICONV_T_INVALID )
2222 {
2223 char buf[2], *bufPtr;
2224 wchar_t wbuf[2];
2225 size_t insz, outsz;
2226 size_t res;
2227
2228 buf[0] = 'A';
2229 buf[1] = 0;
2230 wbuf[0] = 0;
2231 insz = 2;
2232 outsz = SIZEOF_WCHAR_T * 2;
2233 char* wbufPtr = (char*)wbuf;
2234 bufPtr = buf;
2235
2236 res = iconv(
2237 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2238 &wbufPtr, &outsz);
2239
2240 if (ICONV_FAILED(res, insz))
2241 {
2242 wxLogLastError(wxT("iconv"));
2243 wxLogError(_("Conversion to charset '%s' doesn't work."),
2244 nameCS.c_str());
2245 }
2246 else // ok, can convert to this encoding, remember it
2247 {
2248 ms_wcCharsetName = nameCS;
2249 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2250 }
2251 }
2252 }
2253 else // use charset not requiring byte swapping
2254 {
2255 ms_wcCharsetName = nameXE;
2256 }
2257 }
2258
2259 wxLogTrace(TRACE_STRCONV,
2260 wxT("iconv wchar_t charset is \"%s\"%s"),
2261 ms_wcCharsetName.empty() ? wxString("<none>")
2262 : ms_wcCharsetName,
2263 ms_wcNeedsSwap ? wxT(" (needs swap)")
2264 : wxT(""));
2265 }
2266 else // we already have ms_wcCharsetName
2267 {
2268 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2269 }
2270
2271 if ( ms_wcCharsetName.empty() )
2272 {
2273 w2m = ICONV_T_INVALID;
2274 }
2275 else
2276 {
2277 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2278 if ( w2m == ICONV_T_INVALID )
2279 {
2280 wxLogTrace(TRACE_STRCONV,
2281 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2282 ms_wcCharsetName.c_str(), name);
2283 }
2284 }
2285 }
2286
2287 wxMBConv_iconv::~wxMBConv_iconv()
2288 {
2289 if ( m2w != ICONV_T_INVALID )
2290 iconv_close(m2w);
2291 if ( w2m != ICONV_T_INVALID )
2292 iconv_close(w2m);
2293 }
2294
2295 size_t
2296 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2297 const char *src, size_t srcLen) const
2298 {
2299 if ( srcLen == wxNO_LEN )
2300 {
2301 // find the string length: notice that must be done differently for
2302 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2303 // consecutive NULs
2304 const size_t nulLen = GetMBNulLen();
2305 switch ( nulLen )
2306 {
2307 default:
2308 return wxCONV_FAILED;
2309
2310 case 1:
2311 srcLen = strlen(src); // arguably more optimized than our version
2312 break;
2313
2314 case 2:
2315 case 4:
2316 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2317 // but they also have to start at character boundary and not
2318 // span two adjacent characters
2319 const char *p;
2320 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2321 ;
2322 srcLen = p - src;
2323 break;
2324 }
2325
2326 // when we're determining the length of the string ourselves we count
2327 // the terminating NUL(s) as part of it and always NUL-terminate the
2328 // output
2329 srcLen += nulLen;
2330 }
2331
2332 // we express length in the number of (wide) characters but iconv always
2333 // counts buffer sizes it in bytes
2334 dstLen *= SIZEOF_WCHAR_T;
2335
2336 #if wxUSE_THREADS
2337 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2338 // Unfortunately there are a couple of global wxCSConv objects such as
2339 // wxConvLocal that are used all over wx code, so we have to make sure
2340 // the handle is used by at most one thread at the time. Otherwise
2341 // only a few wx classes would be safe to use from non-main threads
2342 // as MB<->WC conversion would fail "randomly".
2343 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2344 #endif // wxUSE_THREADS
2345
2346 size_t res, cres;
2347 const char *pszPtr = src;
2348
2349 if ( dst )
2350 {
2351 char* bufPtr = (char*)dst;
2352
2353 // have destination buffer, convert there
2354 size_t dstLenOrig = dstLen;
2355 cres = iconv(m2w,
2356 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2357 &bufPtr, &dstLen);
2358
2359 // convert the number of bytes converted as returned by iconv to the
2360 // number of (wide) characters converted that we need
2361 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2362
2363 if (ms_wcNeedsSwap)
2364 {
2365 // convert to native endianness
2366 for ( unsigned i = 0; i < res; i++ )
2367 dst[i] = WC_BSWAP(dst[i]);
2368 }
2369 }
2370 else // no destination buffer
2371 {
2372 // convert using temp buffer to calculate the size of the buffer needed
2373 wchar_t tbuf[256];
2374 res = 0;
2375
2376 do
2377 {
2378 char* bufPtr = (char*)tbuf;
2379 dstLen = 8 * SIZEOF_WCHAR_T;
2380
2381 cres = iconv(m2w,
2382 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2383 &bufPtr, &dstLen );
2384
2385 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2386 }
2387 while ((cres == (size_t)-1) && (errno == E2BIG));
2388 }
2389
2390 if (ICONV_FAILED(cres, srcLen))
2391 {
2392 //VS: it is ok if iconv fails, hence trace only
2393 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2394 return wxCONV_FAILED;
2395 }
2396
2397 return res;
2398 }
2399
2400 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2401 const wchar_t *src, size_t srcLen) const
2402 {
2403 #if wxUSE_THREADS
2404 // NB: explained in MB2WC
2405 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2406 #endif
2407
2408 if ( srcLen == wxNO_LEN )
2409 srcLen = wxWcslen(src) + 1;
2410
2411 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2412 size_t outbuflen = dstLen;
2413 size_t res, cres;
2414
2415 wchar_t *tmpbuf = 0;
2416
2417 if (ms_wcNeedsSwap)
2418 {
2419 // need to copy to temp buffer to switch endianness
2420 // (doing WC_BSWAP twice on the original buffer won't work, as it
2421 // could be in read-only memory, or be accessed in some other thread)
2422 tmpbuf = (wchar_t *)malloc(inbuflen);
2423 for ( size_t i = 0; i < srcLen; i++ )
2424 tmpbuf[i] = WC_BSWAP(src[i]);
2425
2426 src = tmpbuf;
2427 }
2428
2429 char* inbuf = (char*)src;
2430 if ( dst )
2431 {
2432 // have destination buffer, convert there
2433 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2434
2435 res = dstLen - outbuflen;
2436 }
2437 else // no destination buffer
2438 {
2439 // convert using temp buffer to calculate the size of the buffer needed
2440 char tbuf[256];
2441 res = 0;
2442 do
2443 {
2444 dst = tbuf;
2445 outbuflen = WXSIZEOF(tbuf);
2446
2447 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2448
2449 res += WXSIZEOF(tbuf) - outbuflen;
2450 }
2451 while ((cres == (size_t)-1) && (errno == E2BIG));
2452 }
2453
2454 if (ms_wcNeedsSwap)
2455 {
2456 free(tmpbuf);
2457 }
2458
2459 if (ICONV_FAILED(cres, inbuflen))
2460 {
2461 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2462 return wxCONV_FAILED;
2463 }
2464
2465 return res;
2466 }
2467
2468 size_t wxMBConv_iconv::GetMBNulLen() const
2469 {
2470 if ( m_minMBCharWidth == 0 )
2471 {
2472 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2473
2474 #if wxUSE_THREADS
2475 // NB: explained in MB2WC
2476 wxMutexLocker lock(self->m_iconvMutex);
2477 #endif
2478
2479 const wchar_t *wnul = L"";
2480 char buf[8]; // should be enough for NUL in any encoding
2481 size_t inLen = sizeof(wchar_t),
2482 outLen = WXSIZEOF(buf);
2483 char *inBuff = (char *)wnul;
2484 char *outBuff = buf;
2485 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2486 {
2487 self->m_minMBCharWidth = (size_t)-1;
2488 }
2489 else // ok
2490 {
2491 self->m_minMBCharWidth = outBuff - buf;
2492 }
2493 }
2494
2495 return m_minMBCharWidth;
2496 }
2497
2498 #if wxUSE_UNICODE_UTF8
2499 bool wxMBConv_iconv::IsUTF8() const
2500 {
2501 return wxStricmp(m_name, "UTF-8") == 0 ||
2502 wxStricmp(m_name, "UTF8") == 0;
2503 }
2504 #endif
2505
2506 #endif // HAVE_ICONV
2507
2508
2509 // ============================================================================
2510 // Win32 conversion classes
2511 // ============================================================================
2512
2513 #ifdef wxHAVE_WIN32_MB2WC
2514
2515 // from utils.cpp
2516 #if wxUSE_FONTMAP
2517 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2518 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2519 #endif
2520
2521 class wxMBConv_win32 : public wxMBConv
2522 {
2523 public:
2524 wxMBConv_win32()
2525 {
2526 m_CodePage = CP_ACP;
2527 m_minMBCharWidth = 0;
2528 }
2529
2530 wxMBConv_win32(const wxMBConv_win32& conv)
2531 : wxMBConv()
2532 {
2533 m_CodePage = conv.m_CodePage;
2534 m_minMBCharWidth = conv.m_minMBCharWidth;
2535 }
2536
2537 #if wxUSE_FONTMAP
2538 wxMBConv_win32(const char* name)
2539 {
2540 m_CodePage = wxCharsetToCodepage(name);
2541 m_minMBCharWidth = 0;
2542 }
2543
2544 wxMBConv_win32(wxFontEncoding encoding)
2545 {
2546 m_CodePage = wxEncodingToCodepage(encoding);
2547 m_minMBCharWidth = 0;
2548 }
2549 #endif // wxUSE_FONTMAP
2550
2551 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2552 {
2553 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2554 // the behaviour is not compatible with the Unix version (using iconv)
2555 // and break the library itself, e.g. wxTextInputStream::NextChar()
2556 // wouldn't work if reading an incomplete MB char didn't result in an
2557 // error
2558 //
2559 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2560 // Win XP or newer and it is not supported for UTF-[78] so we always
2561 // use our own conversions in this case. See
2562 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2563 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2564 if ( m_CodePage == CP_UTF8 )
2565 {
2566 return wxMBConvUTF8().MB2WC(buf, psz, n);
2567 }
2568
2569 if ( m_CodePage == CP_UTF7 )
2570 {
2571 return wxMBConvUTF7().MB2WC(buf, psz, n);
2572 }
2573
2574 int flags = 0;
2575 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2576 IsAtLeastWin2kSP4() )
2577 {
2578 flags = MB_ERR_INVALID_CHARS;
2579 }
2580
2581 const size_t len = ::MultiByteToWideChar
2582 (
2583 m_CodePage, // code page
2584 flags, // flags: fall on error
2585 psz, // input string
2586 -1, // its length (NUL-terminated)
2587 buf, // output string
2588 buf ? n : 0 // size of output buffer
2589 );
2590 if ( !len )
2591 {
2592 // function totally failed
2593 return wxCONV_FAILED;
2594 }
2595
2596 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2597 // check if we succeeded, by doing a double trip:
2598 if ( !flags && buf )
2599 {
2600 const size_t mbLen = strlen(psz);
2601 wxCharBuffer mbBuf(mbLen);
2602 if ( ::WideCharToMultiByte
2603 (
2604 m_CodePage,
2605 0,
2606 buf,
2607 -1,
2608 mbBuf.data(),
2609 mbLen + 1, // size in bytes, not length
2610 NULL,
2611 NULL
2612 ) == 0 ||
2613 strcmp(mbBuf, psz) != 0 )
2614 {
2615 // we didn't obtain the same thing we started from, hence
2616 // the conversion was lossy and we consider that it failed
2617 return wxCONV_FAILED;
2618 }
2619 }
2620
2621 // note that it returns count of written chars for buf != NULL and size
2622 // of the needed buffer for buf == NULL so in either case the length of
2623 // the string (which never includes the terminating NUL) is one less
2624 return len - 1;
2625 }
2626
2627 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2628 {
2629 /*
2630 we have a problem here: by default, WideCharToMultiByte() may
2631 replace characters unrepresentable in the target code page with bad
2632 quality approximations such as turning "1/2" symbol (U+00BD) into
2633 "1" for the code pages which don't have it and we, obviously, want
2634 to avoid this at any price
2635
2636 the trouble is that this function does it _silently_, i.e. it won't
2637 even tell us whether it did or not... Win98/2000 and higher provide
2638 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2639 we have to resort to a round trip, i.e. check that converting back
2640 results in the same string -- this is, of course, expensive but
2641 otherwise we simply can't be sure to not garble the data.
2642 */
2643
2644 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2645 // it doesn't work with CJK encodings (which we test for rather roughly
2646 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2647 // supporting it
2648 BOOL usedDef wxDUMMY_INITIALIZE(false);
2649 BOOL *pUsedDef;
2650 int flags;
2651 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2652 {
2653 // it's our lucky day
2654 flags = WC_NO_BEST_FIT_CHARS;
2655 pUsedDef = &usedDef;
2656 }
2657 else // old system or unsupported encoding
2658 {
2659 flags = 0;
2660 pUsedDef = NULL;
2661 }
2662
2663 const size_t len = ::WideCharToMultiByte
2664 (
2665 m_CodePage, // code page
2666 flags, // either none or no best fit
2667 pwz, // input string
2668 -1, // it is (wide) NUL-terminated
2669 buf, // output buffer
2670 buf ? n : 0, // and its size
2671 NULL, // default "replacement" char
2672 pUsedDef // [out] was it used?
2673 );
2674
2675 if ( !len )
2676 {
2677 // function totally failed
2678 return wxCONV_FAILED;
2679 }
2680
2681 // we did something, check if we really succeeded
2682 if ( flags )
2683 {
2684 // check if the conversion failed, i.e. if any replacements
2685 // were done
2686 if ( usedDef )
2687 return wxCONV_FAILED;
2688 }
2689 else // we must resort to double tripping...
2690 {
2691 // first we need to ensure that we really have the MB data: this is
2692 // not the case if we're called with NULL buffer, in which case we
2693 // need to do the conversion yet again
2694 wxCharBuffer bufDef;
2695 if ( !buf )
2696 {
2697 bufDef = wxCharBuffer(len);
2698 buf = bufDef.data();
2699 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2700 buf, len, NULL, NULL) )
2701 return wxCONV_FAILED;
2702 }
2703
2704 if ( !n )
2705 n = wcslen(pwz);
2706 wxWCharBuffer wcBuf(n);
2707 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2708 wcscmp(wcBuf, pwz) != 0 )
2709 {
2710 // we didn't obtain the same thing we started from, hence
2711 // the conversion was lossy and we consider that it failed
2712 return wxCONV_FAILED;
2713 }
2714 }
2715
2716 // see the comment above for the reason of "len - 1"
2717 return len - 1;
2718 }
2719
2720 virtual size_t GetMBNulLen() const
2721 {
2722 if ( m_minMBCharWidth == 0 )
2723 {
2724 int len = ::WideCharToMultiByte
2725 (
2726 m_CodePage, // code page
2727 0, // no flags
2728 L"", // input string
2729 1, // translate just the NUL
2730 NULL, // output buffer
2731 0, // and its size
2732 NULL, // no replacement char
2733 NULL // [out] don't care if it was used
2734 );
2735
2736 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2737 switch ( len )
2738 {
2739 default:
2740 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2741 self->m_minMBCharWidth = (size_t)-1;
2742 break;
2743
2744 case 0:
2745 self->m_minMBCharWidth = (size_t)-1;
2746 break;
2747
2748 case 1:
2749 case 2:
2750 case 4:
2751 self->m_minMBCharWidth = len;
2752 break;
2753 }
2754 }
2755
2756 return m_minMBCharWidth;
2757 }
2758
2759 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2760
2761 bool IsOk() const { return m_CodePage != -1; }
2762
2763 private:
2764 static bool CanUseNoBestFit()
2765 {
2766 static int s_isWin98Or2k = -1;
2767
2768 if ( s_isWin98Or2k == -1 )
2769 {
2770 int verMaj, verMin;
2771 switch ( wxGetOsVersion(&verMaj, &verMin) )
2772 {
2773 case wxOS_WINDOWS_9X:
2774 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2775 break;
2776
2777 case wxOS_WINDOWS_NT:
2778 s_isWin98Or2k = verMaj >= 5;
2779 break;
2780
2781 default:
2782 // unknown: be conservative by default
2783 s_isWin98Or2k = 0;
2784 break;
2785 }
2786
2787 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2788 }
2789
2790 return s_isWin98Or2k == 1;
2791 }
2792
2793 static bool IsAtLeastWin2kSP4()
2794 {
2795 #ifdef __WXWINCE__
2796 return false;
2797 #else
2798 static int s_isAtLeastWin2kSP4 = -1;
2799
2800 if ( s_isAtLeastWin2kSP4 == -1 )
2801 {
2802 OSVERSIONINFOEX ver;
2803
2804 memset(&ver, 0, sizeof(ver));
2805 ver.dwOSVersionInfoSize = sizeof(ver);
2806 GetVersionEx((OSVERSIONINFO*)&ver);
2807
2808 s_isAtLeastWin2kSP4 =
2809 ((ver.dwMajorVersion > 5) || // Vista+
2810 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2811 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2812 ver.wServicePackMajor >= 4)) // 2000 SP4+
2813 ? 1 : 0;
2814 }
2815
2816 return s_isAtLeastWin2kSP4 == 1;
2817 #endif
2818 }
2819
2820
2821 // the code page we're working with
2822 long m_CodePage;
2823
2824 // cached result of GetMBNulLen(), set to 0 initially meaning
2825 // "unknown"
2826 size_t m_minMBCharWidth;
2827 };
2828
2829 #endif // wxHAVE_WIN32_MB2WC
2830
2831
2832 // ============================================================================
2833 // wxEncodingConverter based conversion classes
2834 // ============================================================================
2835
2836 #if wxUSE_FONTMAP
2837
2838 class wxMBConv_wxwin : public wxMBConv
2839 {
2840 private:
2841 void Init()
2842 {
2843 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2844 // The wxMBConv_cf class does a better job.
2845 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2846 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2847 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2848 }
2849
2850 public:
2851 // temporarily just use wxEncodingConverter stuff,
2852 // so that it works while a better implementation is built
2853 wxMBConv_wxwin(const char* name)
2854 {
2855 if (name)
2856 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2857 else
2858 m_enc = wxFONTENCODING_SYSTEM;
2859
2860 Init();
2861 }
2862
2863 wxMBConv_wxwin(wxFontEncoding enc)
2864 {
2865 m_enc = enc;
2866
2867 Init();
2868 }
2869
2870 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2871 {
2872 size_t inbuf = strlen(psz);
2873 if (buf)
2874 {
2875 if (!m2w.Convert(psz, buf))
2876 return wxCONV_FAILED;
2877 }
2878 return inbuf;
2879 }
2880
2881 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2882 {
2883 const size_t inbuf = wxWcslen(psz);
2884 if (buf)
2885 {
2886 if (!w2m.Convert(psz, buf))
2887 return wxCONV_FAILED;
2888 }
2889
2890 return inbuf;
2891 }
2892
2893 virtual size_t GetMBNulLen() const
2894 {
2895 switch ( m_enc )
2896 {
2897 case wxFONTENCODING_UTF16BE:
2898 case wxFONTENCODING_UTF16LE:
2899 return 2;
2900
2901 case wxFONTENCODING_UTF32BE:
2902 case wxFONTENCODING_UTF32LE:
2903 return 4;
2904
2905 default:
2906 return 1;
2907 }
2908 }
2909
2910 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2911
2912 bool IsOk() const { return m_ok; }
2913
2914 public:
2915 wxFontEncoding m_enc;
2916 wxEncodingConverter m2w, w2m;
2917
2918 private:
2919 // were we initialized successfully?
2920 bool m_ok;
2921
2922 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2923 };
2924
2925 // make the constructors available for unit testing
2926 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2927 {
2928 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2929 if ( !result->IsOk() )
2930 {
2931 delete result;
2932 return 0;
2933 }
2934
2935 return result;
2936 }
2937
2938 #endif // wxUSE_FONTMAP
2939
2940 // ============================================================================
2941 // wxCSConv implementation
2942 // ============================================================================
2943
2944 void wxCSConv::Init()
2945 {
2946 m_name = NULL;
2947 m_convReal = NULL;
2948 m_deferred = true;
2949 }
2950
2951 wxCSConv::wxCSConv(const wxString& charset)
2952 {
2953 Init();
2954
2955 if ( !charset.empty() )
2956 {
2957 SetName(charset.ToAscii());
2958 }
2959
2960 #if wxUSE_FONTMAP
2961 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2962 if ( m_encoding == wxFONTENCODING_MAX )
2963 {
2964 // set to unknown/invalid value
2965 m_encoding = wxFONTENCODING_SYSTEM;
2966 }
2967 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2968 {
2969 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2970 m_encoding = wxFONTENCODING_ISO8859_1;
2971 }
2972 #else
2973 m_encoding = wxFONTENCODING_SYSTEM;
2974 #endif
2975 }
2976
2977 wxCSConv::wxCSConv(wxFontEncoding encoding)
2978 {
2979 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2980 {
2981 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2982
2983 encoding = wxFONTENCODING_SYSTEM;
2984 }
2985
2986 Init();
2987
2988 m_encoding = encoding;
2989 }
2990
// Destructor: Clear() frees the charset name and deletes the real converter,
// if any.
wxCSConv::~wxCSConv()
{
    Clear();
}
2995
2996 wxCSConv::wxCSConv(const wxCSConv& conv)
2997 : wxMBConv()
2998 {
2999 Init();
3000
3001 SetName(conv.m_name);
3002 m_encoding = conv.m_encoding;
3003 }
3004
3005 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3006 {
3007 Clear();
3008
3009 SetName(conv.m_name);
3010 m_encoding = conv.m_encoding;
3011
3012 return *this;
3013 }
3014
3015 void wxCSConv::Clear()
3016 {
3017 free(m_name);
3018 delete m_convReal;
3019
3020 m_name = NULL;
3021 m_convReal = NULL;
3022 }
3023
3024 void wxCSConv::SetName(const char *charset)
3025 {
3026 if (charset)
3027 {
3028 m_name = wxStrdup(charset);
3029 m_deferred = true;
3030 }
3031 }
3032
3033 #if wxUSE_FONTMAP
3034
// Cache used by wxCSConv::DoCreate() when trying to create iconv-based
// converters: it maps an encoding to the charset name for which creating
// wxMBConv_iconv succeeded or to an empty string if it failed for all the
// names of this encoding.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
3039 #endif
3040
3041 wxMBConv *wxCSConv::DoCreate() const
3042 {
3043 #if wxUSE_FONTMAP
3044 wxLogTrace(TRACE_STRCONV,
3045 wxT("creating conversion for %s"),
3046 (m_name ? m_name
3047 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3048 #endif // wxUSE_FONTMAP
3049
3050 // check for the special case of ASCII or ISO8859-1 charset: as we have
3051 // special knowledge of it anyhow, we don't need to create a special
3052 // conversion object
3053 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3054 m_encoding == wxFONTENCODING_DEFAULT )
3055 {
3056 // don't convert at all
3057 return NULL;
3058 }
3059
3060 // we trust OS to do conversion better than we can so try external
3061 // conversion methods first
3062 //
3063 // the full order is:
3064 // 1. OS conversion (iconv() under Unix or Win32 API)
3065 // 2. hard coded conversions for UTF
3066 // 3. wxEncodingConverter as fall back
3067
3068 // step (1)
3069 #ifdef HAVE_ICONV
3070 #if !wxUSE_FONTMAP
3071 if ( m_name )
3072 #endif // !wxUSE_FONTMAP
3073 {
3074 #if wxUSE_FONTMAP
3075 wxFontEncoding encoding(m_encoding);
3076 #endif
3077
3078 if ( m_name )
3079 {
3080 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3081 if ( conv->IsOk() )
3082 return conv;
3083
3084 delete conv;
3085
3086 #if wxUSE_FONTMAP
3087 encoding =
3088 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3089 #endif // wxUSE_FONTMAP
3090 }
3091 #if wxUSE_FONTMAP
3092 {
3093 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3094 if ( it != gs_nameCache.end() )
3095 {
3096 if ( it->second.empty() )
3097 return NULL;
3098
3099 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3100 if ( conv->IsOk() )
3101 return conv;
3102
3103 delete conv;
3104 }
3105
3106 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3107 // CS : in case this does not return valid names (eg for MacRoman)
3108 // encoding got a 'failure' entry in the cache all the same,
3109 // although it just has to be created using a different method, so
3110 // only store failed iconv creation attempts (or perhaps we
3111 // shoulnd't do this at all ?)
3112 if ( names[0] != NULL )
3113 {
3114 for ( ; *names; ++names )
3115 {
3116 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3117 // will need changes that will obsolete this
3118 wxString name(*names);
3119 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3120 if ( conv->IsOk() )
3121 {
3122 gs_nameCache[encoding] = *names;
3123 return conv;
3124 }
3125
3126 delete conv;
3127 }
3128
3129 gs_nameCache[encoding] = wxT(""); // cache the failure
3130 }
3131 }
3132 #endif // wxUSE_FONTMAP
3133 }
3134 #endif // HAVE_ICONV
3135
3136 #ifdef wxHAVE_WIN32_MB2WC
3137 {
3138 #if wxUSE_FONTMAP
3139 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3140 : new wxMBConv_win32(m_encoding);
3141 if ( conv->IsOk() )
3142 return conv;
3143
3144 delete conv;
3145 #else
3146 return NULL;
3147 #endif
3148 }
3149 #endif // wxHAVE_WIN32_MB2WC
3150
3151 #ifdef __DARWIN__
3152 {
3153 // leave UTF16 and UTF32 to the built-ins of wx
3154 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3155 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3156 {
3157 #if wxUSE_FONTMAP
3158 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3159 : new wxMBConv_cf(m_encoding);
3160 #else
3161 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3162 #endif
3163
3164 if ( conv->IsOk() )
3165 return conv;
3166
3167 delete conv;
3168 }
3169 }
3170 #endif // __DARWIN__
3171
3172 // step (2)
3173 wxFontEncoding enc = m_encoding;
3174 #if wxUSE_FONTMAP
3175 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3176 {
3177 // use "false" to suppress interactive dialogs -- we can be called from
3178 // anywhere and popping up a dialog from here is the last thing we want to
3179 // do
3180 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3181 }
3182 #endif // wxUSE_FONTMAP
3183
3184 switch ( enc )
3185 {
3186 case wxFONTENCODING_UTF7:
3187 return new wxMBConvUTF7;
3188
3189 case wxFONTENCODING_UTF8:
3190 return new wxMBConvUTF8;
3191
3192 case wxFONTENCODING_UTF16BE:
3193 return new wxMBConvUTF16BE;
3194
3195 case wxFONTENCODING_UTF16LE:
3196 return new wxMBConvUTF16LE;
3197
3198 case wxFONTENCODING_UTF32BE:
3199 return new wxMBConvUTF32BE;
3200
3201 case wxFONTENCODING_UTF32LE:
3202 return new wxMBConvUTF32LE;
3203
3204 default:
3205 // nothing to do but put here to suppress gcc warnings
3206 break;
3207 }
3208
3209 // step (3)
3210 #if wxUSE_FONTMAP
3211 {
3212 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3213 : new wxMBConv_wxwin(m_encoding);
3214 if ( conv->IsOk() )
3215 return conv;
3216
3217 delete conv;
3218 }
3219
3220 wxLogTrace(TRACE_STRCONV,
3221 wxT("encoding \"%s\" is not supported by this system"),
3222 (m_name ? wxString(m_name)
3223 : wxFontMapperBase::GetEncodingName(m_encoding)));
3224 #endif // wxUSE_FONTMAP
3225
3226 return NULL;
3227 }
3228
3229 void wxCSConv::CreateConvIfNeeded() const
3230 {
3231 if ( m_deferred )
3232 {
3233 wxCSConv *self = (wxCSConv *)this; // const_cast
3234
3235 // if we don't have neither the name nor the encoding, use the default
3236 // encoding for this system
3237 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3238 {
3239 #if wxUSE_INTL
3240 self->m_encoding = wxLocale::GetSystemEncoding();
3241 #else
3242 // fallback to some reasonable default:
3243 self->m_encoding = wxFONTENCODING_ISO8859_1;
3244 #endif // wxUSE_INTL
3245 }
3246
3247 self->m_convReal = DoCreate();
3248 self->m_deferred = false;
3249 }
3250 }
3251
3252 bool wxCSConv::IsOk() const
3253 {
3254 CreateConvIfNeeded();
3255
3256 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3257 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3258 return true; // always ok as we do it ourselves
3259
3260 // m_convReal->IsOk() is called at its own creation, so we know it must
3261 // be ok if m_convReal is non-NULL
3262 return m_convReal != NULL;
3263 }
3264
3265 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3266 const char *src, size_t srcLen) const
3267 {
3268 CreateConvIfNeeded();
3269
3270 if (m_convReal)
3271 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3272
3273 // latin-1 (direct)
3274 if ( srcLen == wxNO_LEN )
3275 srcLen = strlen(src) + 1; // take trailing NUL too
3276
3277 if ( dst )
3278 {
3279 if ( dstLen < srcLen )
3280 return wxCONV_FAILED;
3281
3282 for ( size_t n = 0; n < srcLen; n++ )
3283 dst[n] = (unsigned char)(src[n]);
3284 }
3285
3286 return srcLen;
3287 }
3288
3289 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3290 const wchar_t *src, size_t srcLen) const
3291 {
3292 CreateConvIfNeeded();
3293
3294 if (m_convReal)
3295 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3296
3297 // latin-1 (direct)
3298 if ( srcLen == wxNO_LEN )
3299 srcLen = wxWcslen(src) + 1;
3300
3301 if ( dst )
3302 {
3303 if ( dstLen < srcLen )
3304 return wxCONV_FAILED;
3305
3306 for ( size_t n = 0; n < srcLen; n++ )
3307 {
3308 if ( src[n] > 0xFF )
3309 return wxCONV_FAILED;
3310
3311 dst[n] = (char)src[n];
3312 }
3313
3314 }
3315 else // still need to check the input validity
3316 {
3317 for ( size_t n = 0; n < srcLen; n++ )
3318 {
3319 if ( src[n] > 0xFF )
3320 return wxCONV_FAILED;
3321 }
3322 }
3323
3324 return srcLen;
3325 }
3326
3327 size_t wxCSConv::GetMBNulLen() const
3328 {
3329 CreateConvIfNeeded();
3330
3331 if ( m_convReal )
3332 {
3333 return m_convReal->GetMBNulLen();
3334 }
3335
3336 // otherwise, we are ISO-8859-1
3337 return 1;
3338 }
3339
3340 #if wxUSE_UNICODE_UTF8
3341 bool wxCSConv::IsUTF8() const
3342 {
3343 CreateConvIfNeeded();
3344
3345 if ( m_convReal )
3346 {
3347 return m_convReal->IsUTF8();
3348 }
3349
3350 // otherwise, we are ISO-8859-1
3351 return false;
3352 }
3353 #endif
3354
3355
3356 #if wxUSE_UNICODE
3357
3358 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3359 {
3360 if ( !s )
3361 return wxWCharBuffer();
3362
3363 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3364 if ( !wbuf )
3365 wbuf = wxMBConvUTF8().cMB2WX(s);
3366 if ( !wbuf )
3367 wbuf = wxConvISO8859_1.cMB2WX(s);
3368
3369 return wbuf;
3370 }
3371
3372 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3373 {
3374 if ( !ws )
3375 return wxCharBuffer();
3376
3377 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3378 if ( !buf )
3379 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3380
3381 return buf;
3382 }
3383
3384 #endif // wxUSE_UNICODE
3385
3386 // ----------------------------------------------------------------------------
3387 // globals
3388 // ----------------------------------------------------------------------------
3389
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object), in other
//     words, possibly _before_ the global converter object itself would be
//     initialized.
3396
3397 #undef wxConvLibc
3398 #undef wxConvUTF8
3399 #undef wxConvUTF7
3400 #undef wxConvLocal
3401 #undef wxConvISO8859_1
3402
// Define the global converter called "name": this defines the name##Ptr
// variable, initially NULL, and the wxGet_##name##Ptr() function returning
// the address of a function-local static object of type impl_klass
// constructed with ctor_args, as a pointer to klass.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()
3414
3415 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3416 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3417
3418 #ifdef __INTELC__
3419 // disable warning "variable 'xxx' was declared but never referenced"
3420 #pragma warning(disable: 177)
3421 #endif // Intel C++
3422
3423 #ifdef __WINDOWS__
3424 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3425 #elif 0 // defined(__WXOSX__)
3426 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
3427 #else
3428 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3429 #endif
3430
// NB: we can't use wxEMPTY_PARAMETER_VALUE as the final argument here because
//     it's passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so
//     still provokes an error message about "not enough macro parameters";
//     and we can't use "()" here as the name##Obj declaration would be parsed
//     as a function declaration then, so use a semicolon and live with an
//     extra empty statement (and hope that no compiler warns about this)
3437 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3438 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3439
3440 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3441 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3442
3443 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3444 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3445
3446 #ifdef __DARWIN__
3447 // The xnu kernel always communicates file paths in decomposed UTF-8.
3448 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3449 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3450 #endif
3451
3452 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3453 #ifdef __DARWIN__
3454 &wxConvMacUTF8DObj;
3455 #else // !__DARWIN__
3456 wxGet_wxConvLibcPtr();
3457 #endif // __DARWIN__/!__DARWIN__
3458
3459 #else // !wxUSE_WCHAR_T
3460
3461 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3462 // stand-ins in absence of wchar_t
3463 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3464 wxConvISO8859_1,
3465 wxConvLocal,
3466 wxConvUTF8;
3467
3468 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T