]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Explicitly unregister custom wxWebViewIE namespaces when we are done with them. Also...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #ifndef __WXWINCE__
32 #include <errno.h>
33 #endif
34
35 #include <ctype.h>
36 #include <string.h>
37 #include <stdlib.h>
38
39 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #define wxHAVE_WIN32_MB2WC
43 #endif
44
45 #ifdef HAVE_ICONV
46 #include <iconv.h>
47 #include "wx/thread.h"
48 #endif
49
50 #include "wx/encconv.h"
51 #include "wx/fontmap.h"
52
53 #ifdef __DARWIN__
54 #include "wx/osx/core/private/strconv_cf.h"
55 #endif //def __DARWIN__
56
57
58 #define TRACE_STRCONV wxT("strconv")
59
60 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61 // be 4 bytes
62 #if SIZEOF_WCHAR_T == 2
63 #define WC_UTF16
64 #endif
65
66
67 // ============================================================================
68 // implementation
69 // ============================================================================
70
71 // helper function of cMB2WC(): check if n bytes at this location are all NUL
// Returns true if at least one of the n bytes starting at p is not NUL and
// false if all of them are NUL (which is trivially the case when n is 0).
//
// The scan stops at the first non-NUL byte, so nothing after it is read.
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
79
80 // ----------------------------------------------------------------------------
81 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
82 // ----------------------------------------------------------------------------
83
84 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
85 {
86 if (input <= 0xffff)
87 {
88 if (output)
89 *output = (wxUint16) input;
90
91 return 1;
92 }
93 else if (input >= 0x110000)
94 {
95 return wxCONV_FAILED;
96 }
97 else
98 {
99 if (output)
100 {
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
103 }
104
105 return 2;
106 }
107 }
108
109 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
110 {
111 if ((*input < 0xd800) || (*input > 0xdfff))
112 {
113 output = *input;
114 return 1;
115 }
116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
117 {
118 output = *input;
119 return wxCONV_FAILED;
120 }
121 else
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
126 }
127
128 #ifdef WC_UTF16
129 typedef wchar_t wxDecodeSurrogate_t;
130 #else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132 #endif // WC_UTF16/!WC_UTF16
133
134 // returns the next UTF-32 character from the wchar_t buffer and advances the
135 // pointer to the character after this one
136 //
137 // if an invalid character is found, *pSrc is set to NULL, the caller must
138 // check for this
139 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
140 {
141 wxUint32 out;
142 const size_t
143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150 }
151
152 // ----------------------------------------------------------------------------
153 // wxMBConv
154 // ----------------------------------------------------------------------------
155
// Default implementation of ToWChar() written in terms of MB2WC().
//
// dst/dstLen: output buffer and its size in wide characters; if dst is NULL
// nothing is written and only the required length is computed.
// src/srcLen: input bytes and their number; if srcLen is wxNO_LEN the input
// is treated as a single NUL-terminated string.
//
// The input is converted one NUL-terminated chunk at a time. When an explicit
// srcLen is given and the input doesn't end with GetMBNulLen() NUL bytes, a
// NUL-terminated temporary copy is made first.
//
// Returns the number of wide characters written (or needed), or wxCONV_FAILED
// if GetMBNulLen() or MB2WC() fails or if dst is too small. When srcLen is
// wxNO_LEN the returned count is one more than the chunk length reported by
// MB2WC().
156 size_t
157 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
158 const char *src, size_t srcLen) const
159 {
160 // although new conversion classes are supposed to implement this function
161 // directly, the existing ones only implement the old MB2WC() and so, to
162 // avoid having to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
166 //
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
170
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten = 0;
173
174 // the number of NULs terminating this string
175 size_t nulLen = 0; // not really needed, but just to avoid warnings
176
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
180 // NULs at the end
181 wxCharBuffer bufTmp;
182 const char *srcEnd;
183 if ( srcLen != wxNO_LEN )
184 {
185 // we need to know how to find the end of this string
186 nulLen = GetMBNulLen();
187 if ( nulLen == wxCONV_FAILED )
188 return wxCONV_FAILED;
189
190 // if there are enough NULs we can avoid the copy
191 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
192 {
193 // make a copy in order to properly NUL-terminate the string
194 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
195 char * const p = bufTmp.data();
196 memcpy(p, src, srcLen);
197 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
198 *s = '\0';
199
200 src = bufTmp;
201 }
202
203 srcEnd = src + srcLen;
204 }
205 else // quit after the first loop iteration
206 {
207 srcEnd = NULL;
208 }
209
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
212 // with the result
213 //
214 // all the complications come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
220 // string
221 //
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
//
// NOTE(review): the code below increments dstWritten for the terminator
// unconditionally when srcEnd is NULL and, when srcEnd is not NULL, only for
// the terminators found before srcEnd, which reads as the opposite of the
// former/latter wording above -- confirm which of the comment and the code
// reflects the intended contract before relying on the comment.
225 for ( ;; )
226 {
227 // try to convert the current chunk
228 size_t lenChunk = MB2WC(NULL, src, 0);
229 if ( lenChunk == wxCONV_FAILED )
230 return wxCONV_FAILED;
231
232 dstWritten += lenChunk;
233 if ( !srcEnd )
234 dstWritten++;
235
236 if ( !lenChunk )
237 {
238 // nothing left in the input string, conversion succeeded
239 break;
240 }
241
242 if ( dst )
243 {
244 if ( dstWritten > dstLen )
245 return wxCONV_FAILED;
246
247 // +1 is for trailing NUL
248 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
252 if ( !srcEnd )
253 dst++;
254 }
255
256 if ( !srcEnd )
257 {
258 // we convert just one chunk in this case as this is the entire
259 // string anyhow (and we don't count the trailing NUL in this case)
260 break;
261 }
262
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
266 while ( NotAllNULs(src, nulLen) )
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
272 src += nulLen;
273 }
274
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
277 if ( src == srcEnd )
278 break;
279
280 // do count this terminator as it's inside the buffer we convert
281 dstWritten++;
282 if ( dst )
283 dst++;
284
285 src += nulLen; // skip the terminator itself
286
287 if ( src >= srcEnd )
288 break;
289 }
290
291 return dstWritten;
292 }
293
294 size_t
295 wxMBConv::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
297 {
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten = 0;
300
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated = srcLen == wxNO_LEN;
305
306 // make a copy of the input string unless it is already properly
307 // NUL-terminated
308 wxWCharBuffer bufTmp;
309 if ( isNulTerminated )
310 {
311 srcLen = wxWcslen(src) + 1;
312 }
313 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
314 {
315 // make a copy in order to properly NUL-terminate the string
316 bufTmp = wxWCharBuffer(srcLen);
317 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
318 src = bufTmp;
319 }
320
321 const size_t lenNul = GetMBNulLen();
322 for ( const wchar_t * const srcEnd = src + srcLen;
323 src < srcEnd;
324 src++ /* skip L'\0' too */ )
325 {
326 // try to convert the current chunk
327 size_t lenChunk = WC2MB(NULL, src, 0);
328 if ( lenChunk == wxCONV_FAILED )
329 return wxCONV_FAILED;
330
331 dstWritten += lenChunk;
332
333 const wchar_t * const
334 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
335
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
338 // ourselves above
339 if ( chunkEnd < srcEnd )
340 dstWritten += lenNul;
341
342 if ( dst )
343 {
344 if ( dstWritten > dstLen )
345 return wxCONV_FAILED;
346
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
351 wxCharBuffer dstBuf;
352 char *dstTmp;
353 if ( chunkEnd == srcEnd )
354 {
355 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
356 dstTmp = dstBuf.data();
357 }
358 else
359 {
360 dstTmp = dst;
361 }
362
363 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
364 return wxCONV_FAILED;
365
366 if ( dstTmp != dst )
367 {
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst, dstTmp, lenChunk);
371
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
374 break;
375 }
376
377 dst += lenChunk;
378 if ( chunkEnd < srcEnd )
379 dst += lenNul;
380 }
381
382 src = chunkEnd;
383 }
384
385 return dstWritten;
386 }
387
388 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
389 {
390 size_t rc = ToWChar(outBuff, outLen, inBuff);
391 if ( rc != wxCONV_FAILED )
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399 }
400
401 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
402 {
403 size_t rc = FromWChar(outBuff, outLen, inBuff);
404 if ( rc != wxCONV_FAILED )
405 {
406 rc -= GetMBNulLen();
407 }
408
409 return rc;
410 }
411
412 wxMBConv::~wxMBConv()
413 {
414 // nothing to do here (necessary for Darwin linking probably)
415 }
416
417 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418 {
419 if ( psz )
420 {
421 // calculate the length of the buffer needed first
422 const size_t nLen = ToWChar(NULL, 0, psz);
423 if ( nLen != wxCONV_FAILED )
424 {
425 // now do the actual conversion
426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
427
428 // +1 for the trailing NULL
429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
430 return buf;
431 }
432 }
433
434 return wxWCharBuffer();
435 }
436
437 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438 {
439 if ( pwz )
440 {
441 const size_t nLen = FromWChar(NULL, 0, pwz);
442 if ( nLen != wxCONV_FAILED )
443 {
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451 }
452
453 const wxWCharBuffer
454 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
455 {
456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
457 if ( dstLen != wxCONV_FAILED )
458 {
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
463 wbuf.data()[dstLen] = L'\0';
464 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
465 {
466 if ( outLen )
467 {
468 *outLen = dstLen;
469
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen == wxNO_LEN )
476 (*outLen)--;
477 }
478
479 return wbuf;
480 }
481 }
482
483 if ( outLen )
484 *outLen = 0;
485
486 return wxWCharBuffer();
487 }
488
489 const wxCharBuffer
490 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
491 {
492 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
493 if ( dstLen != wxCONV_FAILED )
494 {
495 const size_t nulLen = GetMBNulLen();
496
497 // as above, ensure that the buffer is always NUL-terminated, even if
498 // the input is not
499 wxCharBuffer buf(dstLen + nulLen - 1);
500 memset(buf.data() + dstLen, 0, nulLen);
501 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
502 {
503 if ( outLen )
504 {
505 *outLen = dstLen;
506
507 if ( inLen == wxNO_LEN )
508 {
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
511 *outLen -= nulLen;
512 }
513 }
514
515 return buf;
516 }
517 }
518
519 if ( outLen )
520 *outLen = 0;
521
522 return wxCharBuffer();
523 }
524
525 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
526 {
527 const size_t srcLen = buf.length();
528 if ( srcLen )
529 {
530 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
531 if ( dstLen != wxCONV_FAILED )
532 {
533 wxWCharBuffer wbuf(dstLen);
534 wbuf.data()[dstLen] = L'\0';
535 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
536 return wbuf;
537 }
538 }
539
540 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
541 }
542
543 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
544 {
545 const size_t srcLen = wbuf.length();
546 if ( srcLen )
547 {
548 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
549 if ( dstLen != wxCONV_FAILED )
550 {
551 wxCharBuffer buf(dstLen);
552 buf.data()[dstLen] = '\0';
553 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
554 return buf;
555 }
556 }
557
558 return wxScopedCharBuffer::CreateNonOwned("", 0);
559 }
560
561 // ----------------------------------------------------------------------------
562 // wxMBConvLibc
563 // ----------------------------------------------------------------------------
564
565 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
566 {
567 return wxMB2WC(buf, psz, n);
568 }
569
570 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
571 {
572 return wxWC2MB(buf, psz, n);
573 }
574
575 // ----------------------------------------------------------------------------
576 // wxConvBrokenFileNames
577 // ----------------------------------------------------------------------------
578
579 #ifdef __UNIX__
580
581 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
582 {
583 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
584 wxStricmp(charset, wxT("UTF8")) == 0 )
585 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
586 else
587 m_conv = new wxCSConv(charset);
588 }
589
590 #endif // __UNIX__
591
592 // ----------------------------------------------------------------------------
593 // UTF-7
594 // ----------------------------------------------------------------------------
595
596 // Implementation (C) 2004 Fredrik Roubert
597 //
598 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
599
600 //
601 // BASE64 decoding table
602 //
// Indexed by the byte value (0..255), gives the 6 bit value of the base64
// digit: 'A'..'Z' map to 0x00..0x19, 'a'..'z' to 0x1a..0x33, '0'..'9' to
// 0x34..0x3d, '+' to 0x3e and '/' to 0x3f.
//
// All the other entries are 0xff which marks the bytes that are not base64
// digits.
603 static const unsigned char utf7unb64[] =
604 {
605 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
606 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
607 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
608 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
609 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
610 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
611 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
612 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
613 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
614 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
615 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
616 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
617 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
618 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
619 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
620 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
621 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
622 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
623 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
624 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
625 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
626 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
627 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
628 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
629 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
630 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
631 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
632 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
633 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
634 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
635 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
636 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
637 };
638
639 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
640 const char *src, size_t srcLen) const
641 {
642 DecoderState stateOrig,
643 *statePtr;
644 if ( srcLen == wxNO_LEN )
645 {
646 // convert the entire string, up to and including the trailing NUL
647 srcLen = strlen(src) + 1;
648
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr = &stateOrig;
652 }
653 else // when working with partial strings we do use the shift state
654 {
655 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
656
657 // also save the old state to be able to rollback to it on error
658 stateOrig = m_stateDecoder;
659 }
660
661 // but to simplify the code below we use this variable in both cases
662 DecoderState& state = *statePtr;
663
664
665 // number of characters [which would have been] written to dst [if it were
666 // not NULL]
667 size_t len = 0;
668
669 const char * const srcEnd = src + srcLen;
670
671 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
672 {
673 const unsigned char cc = *src++;
674
675 if ( state.IsShifted() )
676 {
677 const unsigned char dc = utf7unb64[cc];
678 if ( dc == 0xff )
679 {
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
684 // characters)
685 if ( state.isLSB || state.bit > 4 ||
686 (state.accum & ((1 << state.bit) - 1)) )
687 {
688 if ( !len )
689 state = stateOrig;
690
691 return wxCONV_FAILED;
692 }
693
694 state.ToDirect();
695
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
698 if ( cc == '-' )
699 continue;
700 }
701 else // valid encoded character
702 {
703 // mini base64 decoder: each character is 6 bits
704 state.bit += 6;
705 state.accum <<= 6;
706 state.accum += dc;
707
708 if ( state.bit >= 8 )
709 {
710 // got the full byte, consume it
711 state.bit -= 8;
712 unsigned char b = (state.accum >> state.bit) & 0x00ff;
713
714 if ( state.isLSB )
715 {
716 // we've got the full word, output it
717 if ( dst )
718 *dst++ = (state.msb << 8) | b;
719 len++;
720 state.isLSB = false;
721 }
722 else // MSB
723 {
724 // just store it while we wait for LSB
725 state.msb = b;
726 state.isLSB = true;
727 }
728 }
729 }
730 }
731
732 if ( state.IsDirect() )
733 {
734 // start of an encoded segment?
735 if ( cc == '+' )
736 {
737 if ( *src == '-' )
738 {
739 // just the encoded plus sign, don't switch to shifted mode
740 if ( dst )
741 *dst++ = '+';
742 len++;
743 src++;
744 }
745 else if ( utf7unb64[(unsigned)*src] == 0xff )
746 {
747 // empty encoded chunks are not allowed
748 if ( !len )
749 state = stateOrig;
750
751 return wxCONV_FAILED;
752 }
753 else // base-64 encoded chunk follows
754 {
755 state.ToShifted();
756 }
757 }
758 else // not '+'
759 {
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc >= 0x7f || (cc < ' ' &&
763 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
764 return wxCONV_FAILED;
765
766 if ( dst )
767 *dst++ = cc;
768 len++;
769 }
770 }
771 }
772
773 if ( !len )
774 {
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
777 // state
778 state = stateOrig;
779
780 return wxCONV_FAILED;
781 }
782
783 return len;
784 }
785
786 //
787 // BASE64 encoding table
788 //
// Indexed by a 6 bit value (0..63), gives the corresponding base64 digit:
// 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/' in this order.
789 static const unsigned char utf7enb64[] =
790 {
791 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
792 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
793 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
794 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
795 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
796 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
797 'w', 'x', 'y', 'z', '0', '1', '2', '3',
798 '4', '5', '6', '7', '8', '9', '+', '/'
799 };
800
801 //
802 // UTF-7 encoding table
803 //
804 // 0 - Set D (directly encoded characters)
805 // 1 - Set O (optional direct characters)
806 // 2 - whitespace characters (optional)
807 // 3 - special characters
808 //
// class (0..3) of each of the 128 7-bit characters, indexed by its code
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if wc is a 7-bit character of class 0 in the table above,
// i.e. one which is copied to UTF-7 output as is, and false for any other
// character.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // do the range check on an unsigned value: wchar_t is a signed type on
    // some platforms and a negative character would otherwise pass the
    // "< 0x80" test and be used as a negative index into the table
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
825
// Encodes wide characters as UTF-7.
//
// dst/dstLen: output buffer and its size in bytes; if dst is NULL nothing is
// written and only the number of bytes is computed.
// src/srcLen: input wide characters and their number; if srcLen is wxNO_LEN
// the entire NUL-terminated string, including the NUL itself, is converted.
//
// When an explicit srcLen is given, the encoder state stored in this object is
// used and updated; for NUL-terminated strings a fresh local state is used.
// In both cases the state is restored to its initial value if dst is NULL.
//
// Returns the number of bytes produced, or wxCONV_FAILED if, when WC_UTF16 is
// not defined, a character greater than 0xffff is found at the start of a
// loop iteration.
//
// NOTE(review): the space remaining in dst is only checked by the "len <
// dstLen" condition of the outer loop while a single iteration can write
// several bytes (2 for "+-", any number in the base64 loop), so it looks like
// dst could be overrun if dstLen is smaller than the value returned by the
// call with NULL dst -- confirm that all callers size the buffer that way.
826 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
827 const wchar_t *src, size_t srcLen) const
828 {
829 EncoderState stateOrig,
830 *statePtr;
831 if ( srcLen == wxNO_LEN )
832 {
833 // we don't apply the stored state when operating on entire strings at
834 // once
835 statePtr = &stateOrig;
836
837 srcLen = wxWcslen(src) + 1;
838 }
839 else // do use the mode we left the output in previously
840 {
841 stateOrig = m_stateEncoder;
842 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
843 }
844
845 EncoderState& state = *statePtr;
846
847
848 size_t len = 0;
849
850 const wchar_t * const srcEnd = src + srcLen;
851 while ( src < srcEnd && (!dst || len < dstLen) )
852 {
853 wchar_t cc = *src++;
854 if ( wxIsUTF7Direct(cc) )
855 {
856 if ( state.IsShifted() )
857 {
858 // pad with zeros the last encoded block if necessary
859 if ( state.bit )
860 {
861 if ( dst )
862 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
863 len++;
864 }
865
866 state.ToDirect();
867
// explicitly terminate the encoded block before the direct character
868 if ( dst )
869 *dst++ = '-';
870 len++;
871 }
872
873 if ( dst )
874 *dst++ = (char)cc;
875 len++;
876 }
877 else if ( cc == '+' && state.IsDirect() )
878 {
// in direct mode the plus sign itself is written as "+-"
879 if ( dst )
880 {
881 *dst++ = '+';
882 *dst++ = '-';
883 }
884
885 len += 2;
886 }
887 #ifndef WC_UTF16
888 else if (((wxUint32)cc) > 0xffff)
889 {
890 // no surrogate pair generation (yet?)
891 return wxCONV_FAILED;
892 }
893 #endif
894 else
895 {
896 if ( state.IsDirect() )
897 {
898 state.ToShifted();
899
900 if ( dst )
901 *dst++ = '+';
902 len++;
903 }
904
905 // BASE64 encode string
906 for ( ;; )
907 {
// feed the 2 bytes of this character, most significant one first, into
// the accumulator and output a base64 digit for each 6 bits available
908 for ( unsigned lsb = 0; lsb < 2; lsb++ )
909 {
910 state.accum <<= 8;
911 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
912
913 for (state.bit += 8; state.bit >= 6; )
914 {
915 state.bit -= 6;
916 if ( dst )
917 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
918 len++;
919 }
920 }
921
// stay in this loop while the following characters need encoding too: a
// direct character is left in the input for the outer loop to process
922 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
923 break;
924
925 src++;
926 }
927 }
928 }
929
930 // we need to restore the original encoder state if we were called just to
931 // calculate the amount of space needed as we will presumably be called
932 // again to really convert the data now
933 if ( !dst )
934 state = stateOrig;
935
936 return len;
937 }
938
939 // ----------------------------------------------------------------------------
940 // UTF-8
941 // ----------------------------------------------------------------------------
942
// NOTE(review): presumably the largest value which can be encoded using 1, 2,
// ..., 6 bytes in the original (pre RFC 3629) UTF-8 scheme, followed by the
// maximal 32 bit value -- this table is not used in the part of the file seen
// here, confirm against its users
943 static const wxUint32 utf8_max[]=
944 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
945
946 // boundaries of the private use area we use to (temporarily) remap the
947 // characters which are invalid in a UTF-8 encoded string: the range starts
// at wxUnicodePUA and wxUnicodePUAEnd is 256 positions after its start
948 const wxUint32 wxUnicodePUA = 0x100000;
949 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
950
951 // this table gives the length of the UTF-8 encoding from its first character:
// indexed by the value of the first byte of a sequence (0..255); the entries
// equal to 0 correspond to the bytes which can't start a valid sequence
952 const unsigned char tableUtf8Lengths[256] = {
953 // single-byte sequences (ASCII):
954 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
955 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
956 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
957 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
958 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
959 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
960 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
961 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
962
963 // these are invalid (80..BF can only be continuation bytes, C0 and C1
// could only start 2 byte encodings of values below 0x80):
964 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
965 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
966 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
967 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
968 0, 0, // C0,C1
969
970 // two-byte sequences:
971 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
972 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
973
974 // three-byte sequences:
975 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
976
977 // four-byte sequences:
978 4, 4, 4, 4, 4, // F0..F4
979
980 // these are invalid again (5- or 6-byte
981 // sequences and sequences for code points
982 // above U+10FFFF, as restricted by RFC 3629):
983 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
984 };
985
986 size_t
987 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989 {
990 wchar_t *out = dstLen ? dst : NULL;
991 size_t written = 0;
992
993 if ( srcLen == wxNO_LEN )
994 srcLen = strlen(src) + 1;
995
996 for ( const char *p = src; ; p++ )
997 {
998 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
999 {
1000 // all done successfully, just add the trailing NULL if we are not
1001 // using explicit length
1002 if ( srcLen == wxNO_LEN )
1003 {
1004 if ( out )
1005 {
1006 if ( !dstLen )
1007 break;
1008
1009 *out = L'\0';
1010 }
1011
1012 written++;
1013 }
1014
1015 return written;
1016 }
1017
1018 if ( out && !dstLen-- )
1019 break;
1020
1021 wxUint32 code;
1022 unsigned char c = *p;
1023
1024 if ( c < 0x80 )
1025 {
1026 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1027 break;
1028
1029 if ( srcLen != wxNO_LEN )
1030 srcLen--;
1031
1032 code = c;
1033 }
1034 else
1035 {
1036 unsigned len = tableUtf8Lengths[c];
1037 if ( !len )
1038 break;
1039
1040 if ( srcLen < len ) // the test works for wxNO_LEN too
1041 break;
1042
1043 if ( srcLen != wxNO_LEN )
1044 srcLen -= len;
1045
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1053 //
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1057
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1059 // length:
1060 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1061
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1065
1066 len--; // it's more convenient to work with 0-based length here
1067
1068 // extract the lead byte's value bits:
1069 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1070 break;
1071
1072 code = c & leadValueMask[len];
1073
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len; --len )
1077 {
1078 c = *++p;
1079 if ( (c & 0xC0) != 0x80 )
1080 return wxCONV_FAILED;
1081
1082 code <<= 6;
1083 code |= c & 0x3F;
1084 }
1085 }
1086
1087 #ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1090 {
1091 if ( out )
1092 out++;
1093 written++;
1094 }
1095 #else // !WC_UTF16
1096 if ( out )
1097 *out = code;
1098 #endif // WC_UTF16/!WC_UTF16
1099
1100 if ( out )
1101 out++;
1102
1103 written++;
1104 }
1105
1106 return wxCONV_FAILED;
1107 }
1108
1109 size_t
1110 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111 const wchar_t *src, size_t srcLen) const
1112 {
1113 char *out = dstLen ? dst : NULL;
1114 size_t written = 0;
1115
1116 for ( const wchar_t *wp = src; ; wp++ )
1117 {
1118 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
1119 {
1120 // all done successfully, just add the trailing NULL if we are not
1121 // using explicit length
1122 if ( srcLen == wxNO_LEN )
1123 {
1124 if ( out )
1125 {
1126 if ( !dstLen )
1127 break;
1128
1129 *out = '\0';
1130 }
1131
1132 written++;
1133 }
1134
1135 return written;
1136 }
1137
1138 if ( srcLen != wxNO_LEN )
1139 srcLen--;
1140
1141 wxUint32 code;
1142 #ifdef WC_UTF16
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145 {
1146 // skip the next char too as we decoded a surrogate
1147 wp++;
1148 if ( srcLen != wxNO_LEN )
1149 srcLen--;
1150 }
1151 #else // wchar_t is UTF-32
1152 code = *wp & 0x7fffffff;
1153 #endif
1154
1155 unsigned len;
1156 if ( code <= 0x7F )
1157 {
1158 len = 1;
1159 if ( out )
1160 {
1161 if ( dstLen < len )
1162 break;
1163
1164 out[0] = (char)code;
1165 }
1166 }
1167 else if ( code <= 0x07FF )
1168 {
1169 len = 2;
1170 if ( out )
1171 {
1172 if ( dstLen < len )
1173 break;
1174
1175 // NB: this line takes 6 least significant bits, encodes them as
1176 // 10xxxxxx and discards them so that the next byte can be encoded:
1177 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1178 out[0] = 0xC0 | code;
1179 }
1180 }
1181 else if ( code < 0xFFFF )
1182 {
1183 len = 3;
1184 if ( out )
1185 {
1186 if ( dstLen < len )
1187 break;
1188
1189 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1190 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1191 out[0] = 0xE0 | code;
1192 }
1193 }
1194 else if ( code <= 0x10FFFF )
1195 {
1196 len = 4;
1197 if ( out )
1198 {
1199 if ( dstLen < len )
1200 break;
1201
1202 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1204 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1205 out[0] = 0xF0 | code;
1206 }
1207 }
1208 else
1209 {
1210 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1211 break;
1212 }
1213
1214 if ( out )
1215 {
1216 out += len;
1217 dstLen -= len;
1218 }
1219
1220 written += len;
1221 }
1222
1223 // we only get here if an error occurs during decoding
1224 return wxCONV_FAILED;
1225 }
1226
1227 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1228 const char *psz, size_t srcLen) const
1229 {
1230 if ( m_options == MAP_INVALID_UTF8_NOT )
1231 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1232
1233 size_t len = 0;
1234
1235 // The length can be either given explicitly or computed implicitly for the
1236 // NUL-terminated strings.
1237 const bool isNulTerminated = srcLen == wxNO_LEN;
1238 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1239 {
1240 const char *opsz = psz;
1241 bool invalid = false;
1242 unsigned char cc = *psz++, fc = cc;
1243 unsigned cnt;
1244 for (cnt = 0; fc & 0x80; cnt++)
1245 fc <<= 1;
1246
1247 if (!cnt)
1248 {
1249 // plain ASCII char
1250 if (buf)
1251 *buf++ = cc;
1252 len++;
1253
1254 // escape the escape character for octal escapes
1255 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1256 && cc == '\\' && (!buf || len < n))
1257 {
1258 if (buf)
1259 *buf++ = cc;
1260 len++;
1261 }
1262 }
1263 else
1264 {
1265 cnt--;
1266 if (!cnt)
1267 {
1268 // invalid UTF-8 sequence
1269 invalid = true;
1270 }
1271 else
1272 {
1273 unsigned ocnt = cnt - 1;
1274 wxUint32 res = cc & (0x3f >> cnt);
1275 while (cnt--)
1276 {
1277 cc = *psz;
1278 if ((cc & 0xC0) != 0x80)
1279 {
1280 // invalid UTF-8 sequence
1281 invalid = true;
1282 break;
1283 }
1284
1285 psz++;
1286 res = (res << 6) | (cc & 0x3f);
1287 }
1288
1289 if (invalid || res <= utf8_max[ocnt])
1290 {
1291 // illegal UTF-8 encoding
1292 invalid = true;
1293 }
1294 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1295 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1296 {
1297 // if one of our PUA characters turns up externally
1298 // it must also be treated as an illegal sequence
1299 // (a bit like you have to escape an escape character)
1300 invalid = true;
1301 }
1302 else
1303 {
1304 #ifdef WC_UTF16
1305 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1306 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1307 if (pa == wxCONV_FAILED)
1308 {
1309 invalid = true;
1310 }
1311 else
1312 {
1313 if (buf)
1314 buf += pa;
1315 len += pa;
1316 }
1317 #else // !WC_UTF16
1318 if (buf)
1319 *buf++ = (wchar_t)res;
1320 len++;
1321 #endif // WC_UTF16/!WC_UTF16
1322 }
1323 }
1324
1325 if (invalid)
1326 {
1327 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1328 {
1329 while (opsz < psz && (!buf || len < n))
1330 {
1331 #ifdef WC_UTF16
1332 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1333 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1334 wxASSERT(pa != wxCONV_FAILED);
1335 if (buf)
1336 buf += pa;
1337 opsz++;
1338 len += pa;
1339 #else
1340 if (buf)
1341 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1342 opsz++;
1343 len++;
1344 #endif
1345 }
1346 }
1347 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1348 {
1349 while (opsz < psz && (!buf || len < n))
1350 {
1351 if ( buf && len + 3 < n )
1352 {
1353 unsigned char on = *opsz;
1354 *buf++ = L'\\';
1355 *buf++ = (wchar_t)( L'0' + on / 0100 );
1356 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1357 *buf++ = (wchar_t)( L'0' + on % 010 );
1358 }
1359
1360 opsz++;
1361 len += 4;
1362 }
1363 }
1364 else // MAP_INVALID_UTF8_NOT
1365 {
1366 return wxCONV_FAILED;
1367 }
1368 }
1369 }
1370 }
1371
1372 if ( isNulTerminated )
1373 {
1374 // Add the trailing NUL in this case if we have a large enough buffer.
1375 if ( buf && (len < n) )
1376 *buf = 0;
1377
1378 // And count it in any case.
1379 len++;
1380 }
1381
1382 return len;
1383 }
1384
// Check whether the given wide character is an octal digit, i.e. one of
// L'0' through L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1389
1390 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1391 const wchar_t *psz, size_t srcLen) const
1392 {
1393 if ( m_options == MAP_INVALID_UTF8_NOT )
1394 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1395
1396 size_t len = 0;
1397
1398 // The length can be either given explicitly or computed implicitly for the
1399 // NUL-terminated strings.
1400 const bool isNulTerminated = srcLen == wxNO_LEN;
1401 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1402 {
1403 wxUint32 cc;
1404
1405 #ifdef WC_UTF16
1406 // cast is ok for WC_UTF16
1407 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1408 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1409 #else
1410 cc = (*psz++) & 0x7fffffff;
1411 #endif
1412
1413 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1414 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1415 {
1416 if (buf)
1417 *buf++ = (char)(cc - wxUnicodePUA);
1418 len++;
1419 }
1420 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1421 && cc == L'\\' && psz[0] == L'\\' )
1422 {
1423 if (buf)
1424 *buf++ = (char)cc;
1425 psz++;
1426 len++;
1427 }
1428 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1429 cc == L'\\' &&
1430 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1431 {
1432 if (buf)
1433 {
1434 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1435 (psz[1] - L'0') * 010 +
1436 (psz[2] - L'0'));
1437 }
1438
1439 psz += 3;
1440 len++;
1441 }
1442 else
1443 {
1444 unsigned cnt;
1445 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1446 {
1447 }
1448
1449 if (!cnt)
1450 {
1451 // plain ASCII char
1452 if (buf)
1453 *buf++ = (char) cc;
1454 len++;
1455 }
1456 else
1457 {
1458 len += cnt + 1;
1459 if (buf)
1460 {
1461 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1462 while (cnt--)
1463 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1464 }
1465 }
1466 }
1467 }
1468
1469 if ( isNulTerminated )
1470 {
1471 // Add the trailing NUL in this case if we have a large enough buffer.
1472 if ( buf && (len < n) )
1473 *buf = 0;
1474
1475 // And count it in any case.
1476 len++;
1477 }
1478
1479 return len;
1480 }
1481
1482 // ============================================================================
1483 // UTF-16
1484 // ============================================================================
1485
1486 #ifdef WORDS_BIGENDIAN
1487 #define wxMBConvUTF16straight wxMBConvUTF16BE
1488 #define wxMBConvUTF16swap wxMBConvUTF16LE
1489 #else
1490 #define wxMBConvUTF16swap wxMBConvUTF16BE
1491 #define wxMBConvUTF16straight wxMBConvUTF16LE
1492 #endif
1493
1494 /* static */
1495 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1496 {
1497 if ( srcLen == wxNO_LEN )
1498 {
1499 // count the number of bytes in input, including the trailing NULs
1500 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1501 for ( srcLen = 1; *inBuff++; srcLen++ )
1502 ;
1503
1504 srcLen *= BYTES_PER_CHAR;
1505 }
1506 else // we already have the length
1507 {
1508 // we can only convert an entire number of UTF-16 characters
1509 if ( srcLen % BYTES_PER_CHAR )
1510 return wxCONV_FAILED;
1511 }
1512
1513 return srcLen;
1514 }
1515
1516 // case when in-memory representation is UTF-16 too
1517 #ifdef WC_UTF16
1518
1519 // ----------------------------------------------------------------------------
1520 // conversions without endianness change
1521 // ----------------------------------------------------------------------------
1522
1523 size_t
1524 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1525 const char *src, size_t srcLen) const
1526 {
1527 // set up the scene for using memcpy() (which is presumably more efficient
1528 // than copying the bytes one by one)
1529 srcLen = GetLength(src, srcLen);
1530 if ( srcLen == wxNO_LEN )
1531 return wxCONV_FAILED;
1532
1533 const size_t inLen = srcLen / BYTES_PER_CHAR;
1534 if ( dst )
1535 {
1536 if ( dstLen < inLen )
1537 return wxCONV_FAILED;
1538
1539 memcpy(dst, src, srcLen);
1540 }
1541
1542 return inLen;
1543 }
1544
1545 size_t
1546 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1547 const wchar_t *src, size_t srcLen) const
1548 {
1549 if ( srcLen == wxNO_LEN )
1550 srcLen = wxWcslen(src) + 1;
1551
1552 srcLen *= BYTES_PER_CHAR;
1553
1554 if ( dst )
1555 {
1556 if ( dstLen < srcLen )
1557 return wxCONV_FAILED;
1558
1559 memcpy(dst, src, srcLen);
1560 }
1561
1562 return srcLen;
1563 }
1564
1565 // ----------------------------------------------------------------------------
1566 // endian-reversing conversions
1567 // ----------------------------------------------------------------------------
1568
1569 size_t
1570 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1571 const char *src, size_t srcLen) const
1572 {
1573 srcLen = GetLength(src, srcLen);
1574 if ( srcLen == wxNO_LEN )
1575 return wxCONV_FAILED;
1576
1577 srcLen /= BYTES_PER_CHAR;
1578
1579 if ( dst )
1580 {
1581 if ( dstLen < srcLen )
1582 return wxCONV_FAILED;
1583
1584 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1585 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1586 {
1587 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1588 }
1589 }
1590
1591 return srcLen;
1592 }
1593
1594 size_t
1595 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1596 const wchar_t *src, size_t srcLen) const
1597 {
1598 if ( srcLen == wxNO_LEN )
1599 srcLen = wxWcslen(src) + 1;
1600
1601 srcLen *= BYTES_PER_CHAR;
1602
1603 if ( dst )
1604 {
1605 if ( dstLen < srcLen )
1606 return wxCONV_FAILED;
1607
1608 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1609 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1610 {
1611 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1612 }
1613 }
1614
1615 return srcLen;
1616 }
1617
1618 #else // !WC_UTF16: wchar_t is UTF-32
1619
1620 // ----------------------------------------------------------------------------
1621 // conversions without endianness change
1622 // ----------------------------------------------------------------------------
1623
1624 size_t
1625 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1626 const char *src, size_t srcLen) const
1627 {
1628 srcLen = GetLength(src, srcLen);
1629 if ( srcLen == wxNO_LEN )
1630 return wxCONV_FAILED;
1631
1632 const size_t inLen = srcLen / BYTES_PER_CHAR;
1633 if ( !dst )
1634 {
1635 // optimization: return maximal space which could be needed for this
1636 // string even if the real size could be smaller if the buffer contains
1637 // any surrogates
1638 return inLen;
1639 }
1640
1641 size_t outLen = 0;
1642 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1643 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1644 {
1645 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1646 if ( !inBuff )
1647 return wxCONV_FAILED;
1648
1649 if ( ++outLen > dstLen )
1650 return wxCONV_FAILED;
1651
1652 *dst++ = ch;
1653 }
1654
1655
1656 return outLen;
1657 }
1658
1659 size_t
1660 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1661 const wchar_t *src, size_t srcLen) const
1662 {
1663 if ( srcLen == wxNO_LEN )
1664 srcLen = wxWcslen(src) + 1;
1665
1666 size_t outLen = 0;
1667 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1668 for ( size_t n = 0; n < srcLen; n++ )
1669 {
1670 wxUint16 cc[2] = { 0 };
1671 const size_t numChars = encode_utf16(*src++, cc);
1672 if ( numChars == wxCONV_FAILED )
1673 return wxCONV_FAILED;
1674
1675 outLen += numChars * BYTES_PER_CHAR;
1676 if ( outBuff )
1677 {
1678 if ( outLen > dstLen )
1679 return wxCONV_FAILED;
1680
1681 *outBuff++ = cc[0];
1682 if ( numChars == 2 )
1683 {
1684 // second character of a surrogate
1685 *outBuff++ = cc[1];
1686 }
1687 }
1688 }
1689
1690 return outLen;
1691 }
1692
1693 // ----------------------------------------------------------------------------
1694 // endian-reversing conversions
1695 // ----------------------------------------------------------------------------
1696
1697 size_t
1698 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1699 const char *src, size_t srcLen) const
1700 {
1701 srcLen = GetLength(src, srcLen);
1702 if ( srcLen == wxNO_LEN )
1703 return wxCONV_FAILED;
1704
1705 const size_t inLen = srcLen / BYTES_PER_CHAR;
1706 if ( !dst )
1707 {
1708 // optimization: return maximal space which could be needed for this
1709 // string even if the real size could be smaller if the buffer contains
1710 // any surrogates
1711 return inLen;
1712 }
1713
1714 size_t outLen = 0;
1715 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1716 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1717 {
1718 wxUint32 ch;
1719 wxUint16 tmp[2];
1720
1721 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1722 inBuff++;
1723 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1724
1725 const size_t numChars = decode_utf16(tmp, ch);
1726 if ( numChars == wxCONV_FAILED )
1727 return wxCONV_FAILED;
1728
1729 if ( numChars == 2 )
1730 inBuff++;
1731
1732 if ( ++outLen > dstLen )
1733 return wxCONV_FAILED;
1734
1735 *dst++ = ch;
1736 }
1737
1738
1739 return outLen;
1740 }
1741
1742 size_t
1743 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1744 const wchar_t *src, size_t srcLen) const
1745 {
1746 if ( srcLen == wxNO_LEN )
1747 srcLen = wxWcslen(src) + 1;
1748
1749 size_t outLen = 0;
1750 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1751 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1752 {
1753 wxUint16 cc[2] = { 0 };
1754 const size_t numChars = encode_utf16(*src, cc);
1755 if ( numChars == wxCONV_FAILED )
1756 return wxCONV_FAILED;
1757
1758 outLen += numChars * BYTES_PER_CHAR;
1759 if ( outBuff )
1760 {
1761 if ( outLen > dstLen )
1762 return wxCONV_FAILED;
1763
1764 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1765 if ( numChars == 2 )
1766 {
1767 // second character of a surrogate
1768 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1769 }
1770 }
1771 }
1772
1773 return outLen;
1774 }
1775
1776 #endif // WC_UTF16/!WC_UTF16
1777
1778
1779 // ============================================================================
1780 // UTF-32
1781 // ============================================================================
1782
1783 #ifdef WORDS_BIGENDIAN
1784 #define wxMBConvUTF32straight wxMBConvUTF32BE
1785 #define wxMBConvUTF32swap wxMBConvUTF32LE
1786 #else
1787 #define wxMBConvUTF32swap wxMBConvUTF32BE
1788 #define wxMBConvUTF32straight wxMBConvUTF32LE
1789 #endif
1790
1791
1792 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1793 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1794
1795 /* static */
1796 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1797 {
1798 if ( srcLen == wxNO_LEN )
1799 {
1800 // count the number of bytes in input, including the trailing NULs
1801 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1802 for ( srcLen = 1; *inBuff++; srcLen++ )
1803 ;
1804
1805 srcLen *= BYTES_PER_CHAR;
1806 }
1807 else // we already have the length
1808 {
1809 // we can only convert an entire number of UTF-32 characters
1810 if ( srcLen % BYTES_PER_CHAR )
1811 return wxCONV_FAILED;
1812 }
1813
1814 return srcLen;
1815 }
1816
1817 // case when in-memory representation is UTF-16
1818 #ifdef WC_UTF16
1819
1820 // ----------------------------------------------------------------------------
1821 // conversions without endianness change
1822 // ----------------------------------------------------------------------------
1823
1824 size_t
1825 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1826 const char *src, size_t srcLen) const
1827 {
1828 srcLen = GetLength(src, srcLen);
1829 if ( srcLen == wxNO_LEN )
1830 return wxCONV_FAILED;
1831
1832 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1833 const size_t inLen = srcLen / BYTES_PER_CHAR;
1834 size_t outLen = 0;
1835 for ( size_t n = 0; n < inLen; n++ )
1836 {
1837 wxUint16 cc[2] = { 0 };
1838 const size_t numChars = encode_utf16(*inBuff++, cc);
1839 if ( numChars == wxCONV_FAILED )
1840 return wxCONV_FAILED;
1841
1842 outLen += numChars;
1843 if ( dst )
1844 {
1845 if ( outLen > dstLen )
1846 return wxCONV_FAILED;
1847
1848 *dst++ = cc[0];
1849 if ( numChars == 2 )
1850 {
1851 // second character of a surrogate
1852 *dst++ = cc[1];
1853 }
1854 }
1855 }
1856
1857 return outLen;
1858 }
1859
1860 size_t
1861 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1862 const wchar_t *src, size_t srcLen) const
1863 {
1864 if ( srcLen == wxNO_LEN )
1865 srcLen = wxWcslen(src) + 1;
1866
1867 if ( !dst )
1868 {
1869 // optimization: return maximal space which could be needed for this
1870 // string instead of the exact amount which could be less if there are
1871 // any surrogates in the input
1872 //
1873 // we consider that surrogates are rare enough to make it worthwhile to
1874 // avoid running the loop below at the cost of slightly extra memory
1875 // consumption
1876 return srcLen * BYTES_PER_CHAR;
1877 }
1878
1879 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1880 size_t outLen = 0;
1881 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1882 {
1883 const wxUint32 ch = wxDecodeSurrogate(&src);
1884 if ( !src )
1885 return wxCONV_FAILED;
1886
1887 outLen += BYTES_PER_CHAR;
1888
1889 if ( outLen > dstLen )
1890 return wxCONV_FAILED;
1891
1892 *outBuff++ = ch;
1893 }
1894
1895 return outLen;
1896 }
1897
1898 // ----------------------------------------------------------------------------
1899 // endian-reversing conversions
1900 // ----------------------------------------------------------------------------
1901
1902 size_t
1903 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1904 const char *src, size_t srcLen) const
1905 {
1906 srcLen = GetLength(src, srcLen);
1907 if ( srcLen == wxNO_LEN )
1908 return wxCONV_FAILED;
1909
1910 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1911 const size_t inLen = srcLen / BYTES_PER_CHAR;
1912 size_t outLen = 0;
1913 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1914 {
1915 wxUint16 cc[2] = { 0 };
1916 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1917 if ( numChars == wxCONV_FAILED )
1918 return wxCONV_FAILED;
1919
1920 outLen += numChars;
1921 if ( dst )
1922 {
1923 if ( outLen > dstLen )
1924 return wxCONV_FAILED;
1925
1926 *dst++ = cc[0];
1927 if ( numChars == 2 )
1928 {
1929 // second character of a surrogate
1930 *dst++ = cc[1];
1931 }
1932 }
1933 }
1934
1935 return outLen;
1936 }
1937
1938 size_t
1939 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1940 const wchar_t *src, size_t srcLen) const
1941 {
1942 if ( srcLen == wxNO_LEN )
1943 srcLen = wxWcslen(src) + 1;
1944
1945 if ( !dst )
1946 {
1947 // optimization: return maximal space which could be needed for this
1948 // string instead of the exact amount which could be less if there are
1949 // any surrogates in the input
1950 //
1951 // we consider that surrogates are rare enough to make it worthwhile to
1952 // avoid running the loop below at the cost of slightly extra memory
1953 // consumption
1954 return srcLen*BYTES_PER_CHAR;
1955 }
1956
1957 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1958 size_t outLen = 0;
1959 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1960 {
1961 const wxUint32 ch = wxDecodeSurrogate(&src);
1962 if ( !src )
1963 return wxCONV_FAILED;
1964
1965 outLen += BYTES_PER_CHAR;
1966
1967 if ( outLen > dstLen )
1968 return wxCONV_FAILED;
1969
1970 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1971 }
1972
1973 return outLen;
1974 }
1975
1976 #else // !WC_UTF16: wchar_t is UTF-32
1977
1978 // ----------------------------------------------------------------------------
1979 // conversions without endianness change
1980 // ----------------------------------------------------------------------------
1981
1982 size_t
1983 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1984 const char *src, size_t srcLen) const
1985 {
1986 // use memcpy() as it should be much faster than hand-written loop
1987 srcLen = GetLength(src, srcLen);
1988 if ( srcLen == wxNO_LEN )
1989 return wxCONV_FAILED;
1990
1991 const size_t inLen = srcLen/BYTES_PER_CHAR;
1992 if ( dst )
1993 {
1994 if ( dstLen < inLen )
1995 return wxCONV_FAILED;
1996
1997 memcpy(dst, src, srcLen);
1998 }
1999
2000 return inLen;
2001 }
2002
2003 size_t
2004 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
2005 const wchar_t *src, size_t srcLen) const
2006 {
2007 if ( srcLen == wxNO_LEN )
2008 srcLen = wxWcslen(src) + 1;
2009
2010 srcLen *= BYTES_PER_CHAR;
2011
2012 if ( dst )
2013 {
2014 if ( dstLen < srcLen )
2015 return wxCONV_FAILED;
2016
2017 memcpy(dst, src, srcLen);
2018 }
2019
2020 return srcLen;
2021 }
2022
2023 // ----------------------------------------------------------------------------
2024 // endian-reversing conversions
2025 // ----------------------------------------------------------------------------
2026
2027 size_t
2028 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2029 const char *src, size_t srcLen) const
2030 {
2031 srcLen = GetLength(src, srcLen);
2032 if ( srcLen == wxNO_LEN )
2033 return wxCONV_FAILED;
2034
2035 srcLen /= BYTES_PER_CHAR;
2036
2037 if ( dst )
2038 {
2039 if ( dstLen < srcLen )
2040 return wxCONV_FAILED;
2041
2042 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2043 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2044 {
2045 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2046 }
2047 }
2048
2049 return srcLen;
2050 }
2051
2052 size_t
2053 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2054 const wchar_t *src, size_t srcLen) const
2055 {
2056 if ( srcLen == wxNO_LEN )
2057 srcLen = wxWcslen(src) + 1;
2058
2059 srcLen *= BYTES_PER_CHAR;
2060
2061 if ( dst )
2062 {
2063 if ( dstLen < srcLen )
2064 return wxCONV_FAILED;
2065
2066 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2067 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2068 {
2069 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2070 }
2071 }
2072
2073 return srcLen;
2074 }
2075
2076 #endif // WC_UTF16/!WC_UTF16
2077
2078
2079 // ============================================================================
2080 // The classes doing conversion using the iconv_xxx() functions
2081 // ============================================================================
2082
2083 #ifdef HAVE_ICONV
2084
2085 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2086 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2087 // (unless there's yet another bug in glibc) the only case when iconv()
2088 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2089 // left in the input buffer -- when _real_ error occurs,
2090 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2091 // iconv() failure.
2092 // [This bug does not appear in glibc 2.2.]
2093 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2094 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2095 (errno != E2BIG || bufLeft != 0))
2096 #else
2097 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2098 #endif
2099
2100 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2101
2102 #define ICONV_T_INVALID ((iconv_t)-1)
2103
2104 #if SIZEOF_WCHAR_T == 4
2105 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2106 #define WC_ENC wxFONTENCODING_UTF32
2107 #elif SIZEOF_WCHAR_T == 2
2108 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2109 #define WC_ENC wxFONTENCODING_UTF16
2110 #else // sizeof(wchar_t) != 2 nor 4
2111 // does this ever happen?
2112 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2113 #endif
2114
2115 // ----------------------------------------------------------------------------
2116 // wxMBConv_iconv: encapsulates an iconv character set
2117 // ----------------------------------------------------------------------------
2118
2119 class wxMBConv_iconv : public wxMBConv
2120 {
2121 public:
2122 wxMBConv_iconv(const char *name);
2123 virtual ~wxMBConv_iconv();
2124
2125 // implement base class virtual methods
2126 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2127 const char *src, size_t srcLen = wxNO_LEN) const;
2128 virtual size_t FromWChar(char *dst, size_t dstLen,
2129 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2130 virtual size_t GetMBNulLen() const;
2131
2132 #if wxUSE_UNICODE_UTF8
2133 virtual bool IsUTF8() const;
2134 #endif
2135
2136 virtual wxMBConv *Clone() const
2137 {
2138 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
2139 p->m_minMBCharWidth = m_minMBCharWidth;
2140 return p;
2141 }
2142
2143 bool IsOk() const
2144 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2145
2146 protected:
2147 // the iconv handlers used to translate from multibyte
2148 // to wide char and in the other direction
2149 iconv_t m2w,
2150 w2m;
2151
2152 #if wxUSE_THREADS
2153 // guards access to m2w and w2m objects
2154 wxMutex m_iconvMutex;
2155 #endif
2156
2157 private:
2158 // the name (for iconv_open()) of a wide char charset -- if none is
2159 // available on this machine, it will remain NULL
2160 static wxString ms_wcCharsetName;
2161
2162 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2163 // different endian-ness than the native one
2164 static bool ms_wcNeedsSwap;
2165
2166
2167 // name of the encoding handled by this conversion
2168 const char *m_name;
2169
2170 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2171 // initially
2172 size_t m_minMBCharWidth;
2173 };
2174
2175 // make the constructor available for unit testing
2176 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2177 {
2178 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2179 if ( !result->IsOk() )
2180 {
2181 delete result;
2182 return 0;
2183 }
2184
2185 return result;
2186 }
2187
2188 wxString wxMBConv_iconv::ms_wcCharsetName;
2189 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2190
2191 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2192 : m_name(wxStrdup(name))
2193 {
2194 m_minMBCharWidth = 0;
2195
2196 // check for charset that represents wchar_t:
2197 if ( ms_wcCharsetName.empty() )
2198 {
2199 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2200
2201 #if wxUSE_FONTMAP
2202 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2203 #else // !wxUSE_FONTMAP
2204 static const wxChar *const names_static[] =
2205 {
2206 #if SIZEOF_WCHAR_T == 4
2207 wxT("UCS-4"),
2208 #elif SIZEOF_WCHAR_T == 2
2209 wxT("UCS-2"),
2210 #endif
2211 NULL
2212 };
2213 const wxChar *const *names = names_static;
2214 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2215
2216 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2217 {
2218 const wxString nameCS(*names);
2219
2220 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2221 wxString nameXE(nameCS);
2222
2223 #ifdef WORDS_BIGENDIAN
2224 nameXE += wxT("BE");
2225 #else // little endian
2226 nameXE += wxT("LE");
2227 #endif
2228
2229 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2230 nameXE.c_str());
2231
2232 m2w = iconv_open(nameXE.ToAscii(), name);
2233 if ( m2w == ICONV_T_INVALID )
2234 {
2235 // try charset w/o bytesex info (e.g. "UCS4")
2236 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2237 nameCS.c_str());
2238 m2w = iconv_open(nameCS.ToAscii(), name);
2239
2240 // and check for bytesex ourselves:
2241 if ( m2w != ICONV_T_INVALID )
2242 {
2243 char buf[2], *bufPtr;
2244 wchar_t wbuf[2];
2245 size_t insz, outsz;
2246 size_t res;
2247
2248 buf[0] = 'A';
2249 buf[1] = 0;
2250 wbuf[0] = 0;
2251 insz = 2;
2252 outsz = SIZEOF_WCHAR_T * 2;
2253 char* wbufPtr = (char*)wbuf;
2254 bufPtr = buf;
2255
2256 res = iconv(
2257 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2258 &wbufPtr, &outsz);
2259
2260 if (ICONV_FAILED(res, insz))
2261 {
2262 wxLogLastError(wxT("iconv"));
2263 wxLogError(_("Conversion to charset '%s' doesn't work."),
2264 nameCS.c_str());
2265 }
2266 else // ok, can convert to this encoding, remember it
2267 {
2268 ms_wcCharsetName = nameCS;
2269 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2270 }
2271 }
2272 }
2273 else // use charset not requiring byte swapping
2274 {
2275 ms_wcCharsetName = nameXE;
2276 }
2277 }
2278
2279 wxLogTrace(TRACE_STRCONV,
2280 wxT("iconv wchar_t charset is \"%s\"%s"),
2281 ms_wcCharsetName.empty() ? wxString("<none>")
2282 : ms_wcCharsetName,
2283 ms_wcNeedsSwap ? wxT(" (needs swap)")
2284 : wxT(""));
2285 }
2286 else // we already have ms_wcCharsetName
2287 {
2288 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2289 }
2290
2291 if ( ms_wcCharsetName.empty() )
2292 {
2293 w2m = ICONV_T_INVALID;
2294 }
2295 else
2296 {
2297 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2298 if ( w2m == ICONV_T_INVALID )
2299 {
2300 wxLogTrace(TRACE_STRCONV,
2301 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2302 ms_wcCharsetName.c_str(), name);
2303 }
2304 }
2305 }
2306
2307 wxMBConv_iconv::~wxMBConv_iconv()
2308 {
2309 free(const_cast<char *>(m_name));
2310
2311 if ( m2w != ICONV_T_INVALID )
2312 iconv_close(m2w);
2313 if ( w2m != ICONV_T_INVALID )
2314 iconv_close(w2m);
2315 }
2316
2317 size_t
2318 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2319 const char *src, size_t srcLen) const
2320 {
2321 if ( srcLen == wxNO_LEN )
2322 {
2323 // find the string length: notice that must be done differently for
2324 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2325 // consecutive NULs
2326 const size_t nulLen = GetMBNulLen();
2327 switch ( nulLen )
2328 {
2329 default:
2330 return wxCONV_FAILED;
2331
2332 case 1:
2333 srcLen = strlen(src); // arguably more optimized than our version
2334 break;
2335
2336 case 2:
2337 case 4:
2338 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2339 // but they also have to start at character boundary and not
2340 // span two adjacent characters
2341 const char *p;
2342 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2343 ;
2344 srcLen = p - src;
2345 break;
2346 }
2347
2348 // when we're determining the length of the string ourselves we count
2349 // the terminating NUL(s) as part of it and always NUL-terminate the
2350 // output
2351 srcLen += nulLen;
2352 }
2353
2354 // we express length in the number of (wide) characters but iconv always
2355 // counts buffer sizes it in bytes
2356 dstLen *= SIZEOF_WCHAR_T;
2357
2358 #if wxUSE_THREADS
2359 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2360 // Unfortunately there are a couple of global wxCSConv objects such as
2361 // wxConvLocal that are used all over wx code, so we have to make sure
2362 // the handle is used by at most one thread at the time. Otherwise
2363 // only a few wx classes would be safe to use from non-main threads
2364 // as MB<->WC conversion would fail "randomly".
2365 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2366 #endif // wxUSE_THREADS
2367
2368 size_t res, cres;
2369 const char *pszPtr = src;
2370
2371 if ( dst )
2372 {
2373 char* bufPtr = (char*)dst;
2374
2375 // have destination buffer, convert there
2376 size_t dstLenOrig = dstLen;
2377 cres = iconv(m2w,
2378 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2379 &bufPtr, &dstLen);
2380
2381 // convert the number of bytes converted as returned by iconv to the
2382 // number of (wide) characters converted that we need
2383 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2384
2385 if (ms_wcNeedsSwap)
2386 {
2387 // convert to native endianness
2388 for ( unsigned i = 0; i < res; i++ )
2389 dst[i] = WC_BSWAP(dst[i]);
2390 }
2391 }
2392 else // no destination buffer
2393 {
2394 // convert using temp buffer to calculate the size of the buffer needed
2395 wchar_t tbuf[256];
2396 res = 0;
2397
2398 do
2399 {
2400 char* bufPtr = (char*)tbuf;
2401 dstLen = 8 * SIZEOF_WCHAR_T;
2402
2403 cres = iconv(m2w,
2404 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2405 &bufPtr, &dstLen );
2406
2407 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2408 }
2409 while ((cres == (size_t)-1) && (errno == E2BIG));
2410 }
2411
2412 if (ICONV_FAILED(cres, srcLen))
2413 {
2414 //VS: it is ok if iconv fails, hence trace only
2415 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2416 return wxCONV_FAILED;
2417 }
2418
2419 return res;
2420 }
2421
2422 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2423 const wchar_t *src, size_t srcLen) const
2424 {
2425 #if wxUSE_THREADS
2426 // NB: explained in MB2WC
2427 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2428 #endif
2429
2430 if ( srcLen == wxNO_LEN )
2431 srcLen = wxWcslen(src) + 1;
2432
2433 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2434 size_t outbuflen = dstLen;
2435 size_t res, cres;
2436
2437 wchar_t *tmpbuf = 0;
2438
2439 if (ms_wcNeedsSwap)
2440 {
2441 // need to copy to temp buffer to switch endianness
2442 // (doing WC_BSWAP twice on the original buffer won't work, as it
2443 // could be in read-only memory, or be accessed in some other thread)
2444 tmpbuf = (wchar_t *)malloc(inbuflen);
2445 for ( size_t i = 0; i < srcLen; i++ )
2446 tmpbuf[i] = WC_BSWAP(src[i]);
2447
2448 src = tmpbuf;
2449 }
2450
2451 char* inbuf = (char*)src;
2452 if ( dst )
2453 {
2454 // have destination buffer, convert there
2455 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2456
2457 res = dstLen - outbuflen;
2458 }
2459 else // no destination buffer
2460 {
2461 // convert using temp buffer to calculate the size of the buffer needed
2462 char tbuf[256];
2463 res = 0;
2464 do
2465 {
2466 dst = tbuf;
2467 outbuflen = WXSIZEOF(tbuf);
2468
2469 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2470
2471 res += WXSIZEOF(tbuf) - outbuflen;
2472 }
2473 while ((cres == (size_t)-1) && (errno == E2BIG));
2474 }
2475
2476 if (ms_wcNeedsSwap)
2477 {
2478 free(tmpbuf);
2479 }
2480
2481 if (ICONV_FAILED(cres, inbuflen))
2482 {
2483 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2484 return wxCONV_FAILED;
2485 }
2486
2487 return res;
2488 }
2489
2490 size_t wxMBConv_iconv::GetMBNulLen() const
2491 {
2492 if ( m_minMBCharWidth == 0 )
2493 {
2494 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2495
2496 #if wxUSE_THREADS
2497 // NB: explained in MB2WC
2498 wxMutexLocker lock(self->m_iconvMutex);
2499 #endif
2500
2501 const wchar_t *wnul = L"";
2502 char buf[8]; // should be enough for NUL in any encoding
2503 size_t inLen = sizeof(wchar_t),
2504 outLen = WXSIZEOF(buf);
2505 char *inBuff = (char *)wnul;
2506 char *outBuff = buf;
2507 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2508 {
2509 self->m_minMBCharWidth = (size_t)-1;
2510 }
2511 else // ok
2512 {
2513 self->m_minMBCharWidth = outBuff - buf;
2514 }
2515 }
2516
2517 return m_minMBCharWidth;
2518 }
2519
2520 #if wxUSE_UNICODE_UTF8
2521 bool wxMBConv_iconv::IsUTF8() const
2522 {
2523 return wxStricmp(m_name, "UTF-8") == 0 ||
2524 wxStricmp(m_name, "UTF8") == 0;
2525 }
2526 #endif
2527
2528 #endif // HAVE_ICONV
2529
2530
2531 // ============================================================================
2532 // Win32 conversion classes
2533 // ============================================================================
2534
2535 #ifdef wxHAVE_WIN32_MB2WC
2536
2537 // from utils.cpp
2538 #if wxUSE_FONTMAP
2539 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2540 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2541 #endif
2542
2543 class wxMBConv_win32 : public wxMBConv
2544 {
2545 public:
2546 wxMBConv_win32()
2547 {
2548 m_CodePage = CP_ACP;
2549 m_minMBCharWidth = 0;
2550 }
2551
2552 wxMBConv_win32(const wxMBConv_win32& conv)
2553 : wxMBConv()
2554 {
2555 m_CodePage = conv.m_CodePage;
2556 m_minMBCharWidth = conv.m_minMBCharWidth;
2557 }
2558
2559 #if wxUSE_FONTMAP
2560 wxMBConv_win32(const char* name)
2561 {
2562 m_CodePage = wxCharsetToCodepage(name);
2563 m_minMBCharWidth = 0;
2564 }
2565
2566 wxMBConv_win32(wxFontEncoding encoding)
2567 {
2568 m_CodePage = wxEncodingToCodepage(encoding);
2569 m_minMBCharWidth = 0;
2570 }
2571 #endif // wxUSE_FONTMAP
2572
2573 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2574 {
2575 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2576 // the behaviour is not compatible with the Unix version (using iconv)
2577 // and break the library itself, e.g. wxTextInputStream::NextChar()
2578 // wouldn't work if reading an incomplete MB char didn't result in an
2579 // error
2580 //
2581 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2582 // Win XP or newer and it is not supported for UTF-[78] so we always
2583 // use our own conversions in this case. See
2584 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2585 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2586 if ( m_CodePage == CP_UTF8 )
2587 {
2588 return wxMBConvUTF8().MB2WC(buf, psz, n);
2589 }
2590
2591 if ( m_CodePage == CP_UTF7 )
2592 {
2593 return wxMBConvUTF7().MB2WC(buf, psz, n);
2594 }
2595
2596 int flags = 0;
2597 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2598 IsAtLeastWin2kSP4() )
2599 {
2600 flags = MB_ERR_INVALID_CHARS;
2601 }
2602
2603 const size_t len = ::MultiByteToWideChar
2604 (
2605 m_CodePage, // code page
2606 flags, // flags: fall on error
2607 psz, // input string
2608 -1, // its length (NUL-terminated)
2609 buf, // output string
2610 buf ? n : 0 // size of output buffer
2611 );
2612 if ( !len )
2613 {
2614 // function totally failed
2615 return wxCONV_FAILED;
2616 }
2617
2618 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2619 // check if we succeeded, by doing a double trip:
2620 if ( !flags && buf )
2621 {
2622 const size_t mbLen = strlen(psz);
2623 wxCharBuffer mbBuf(mbLen);
2624 if ( ::WideCharToMultiByte
2625 (
2626 m_CodePage,
2627 0,
2628 buf,
2629 -1,
2630 mbBuf.data(),
2631 mbLen + 1, // size in bytes, not length
2632 NULL,
2633 NULL
2634 ) == 0 ||
2635 strcmp(mbBuf, psz) != 0 )
2636 {
2637 // we didn't obtain the same thing we started from, hence
2638 // the conversion was lossy and we consider that it failed
2639 return wxCONV_FAILED;
2640 }
2641 }
2642
2643 // note that it returns count of written chars for buf != NULL and size
2644 // of the needed buffer for buf == NULL so in either case the length of
2645 // the string (which never includes the terminating NUL) is one less
2646 return len - 1;
2647 }
2648
2649 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2650 {
2651 /*
2652 we have a problem here: by default, WideCharToMultiByte() may
2653 replace characters unrepresentable in the target code page with bad
2654 quality approximations such as turning "1/2" symbol (U+00BD) into
2655 "1" for the code pages which don't have it and we, obviously, want
2656 to avoid this at any price
2657
2658 the trouble is that this function does it _silently_, i.e. it won't
2659 even tell us whether it did or not... Win98/2000 and higher provide
2660 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2661 we have to resort to a round trip, i.e. check that converting back
2662 results in the same string -- this is, of course, expensive but
2663 otherwise we simply can't be sure to not garble the data.
2664 */
2665
2666 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2667 // it doesn't work with CJK encodings (which we test for rather roughly
2668 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2669 // supporting it
2670 BOOL usedDef wxDUMMY_INITIALIZE(false);
2671 BOOL *pUsedDef;
2672 int flags;
2673 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2674 {
2675 // it's our lucky day
2676 flags = WC_NO_BEST_FIT_CHARS;
2677 pUsedDef = &usedDef;
2678 }
2679 else // old system or unsupported encoding
2680 {
2681 flags = 0;
2682 pUsedDef = NULL;
2683 }
2684
2685 const size_t len = ::WideCharToMultiByte
2686 (
2687 m_CodePage, // code page
2688 flags, // either none or no best fit
2689 pwz, // input string
2690 -1, // it is (wide) NUL-terminated
2691 buf, // output buffer
2692 buf ? n : 0, // and its size
2693 NULL, // default "replacement" char
2694 pUsedDef // [out] was it used?
2695 );
2696
2697 if ( !len )
2698 {
2699 // function totally failed
2700 return wxCONV_FAILED;
2701 }
2702
2703 // we did something, check if we really succeeded
2704 if ( flags )
2705 {
2706 // check if the conversion failed, i.e. if any replacements
2707 // were done
2708 if ( usedDef )
2709 return wxCONV_FAILED;
2710 }
2711 else // we must resort to double tripping...
2712 {
2713 // first we need to ensure that we really have the MB data: this is
2714 // not the case if we're called with NULL buffer, in which case we
2715 // need to do the conversion yet again
2716 wxCharBuffer bufDef;
2717 if ( !buf )
2718 {
2719 bufDef = wxCharBuffer(len);
2720 buf = bufDef.data();
2721 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2722 buf, len, NULL, NULL) )
2723 return wxCONV_FAILED;
2724 }
2725
2726 if ( !n )
2727 n = wcslen(pwz);
2728 wxWCharBuffer wcBuf(n);
2729 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2730 wcscmp(wcBuf, pwz) != 0 )
2731 {
2732 // we didn't obtain the same thing we started from, hence
2733 // the conversion was lossy and we consider that it failed
2734 return wxCONV_FAILED;
2735 }
2736 }
2737
2738 // see the comment above for the reason of "len - 1"
2739 return len - 1;
2740 }
2741
2742 virtual size_t GetMBNulLen() const
2743 {
2744 if ( m_minMBCharWidth == 0 )
2745 {
2746 int len = ::WideCharToMultiByte
2747 (
2748 m_CodePage, // code page
2749 0, // no flags
2750 L"", // input string
2751 1, // translate just the NUL
2752 NULL, // output buffer
2753 0, // and its size
2754 NULL, // no replacement char
2755 NULL // [out] don't care if it was used
2756 );
2757
2758 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2759 switch ( len )
2760 {
2761 default:
2762 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2763 self->m_minMBCharWidth = (size_t)-1;
2764 break;
2765
2766 case 0:
2767 self->m_minMBCharWidth = (size_t)-1;
2768 break;
2769
2770 case 1:
2771 case 2:
2772 case 4:
2773 self->m_minMBCharWidth = len;
2774 break;
2775 }
2776 }
2777
2778 return m_minMBCharWidth;
2779 }
2780
2781 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2782
2783 bool IsOk() const { return m_CodePage != -1; }
2784
2785 private:
2786 static bool CanUseNoBestFit()
2787 {
2788 static int s_isWin98Or2k = -1;
2789
2790 if ( s_isWin98Or2k == -1 )
2791 {
2792 int verMaj, verMin;
2793 switch ( wxGetOsVersion(&verMaj, &verMin) )
2794 {
2795 case wxOS_WINDOWS_9X:
2796 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2797 break;
2798
2799 case wxOS_WINDOWS_NT:
2800 s_isWin98Or2k = verMaj >= 5;
2801 break;
2802
2803 default:
2804 // unknown: be conservative by default
2805 s_isWin98Or2k = 0;
2806 break;
2807 }
2808
2809 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2810 }
2811
2812 return s_isWin98Or2k == 1;
2813 }
2814
2815 static bool IsAtLeastWin2kSP4()
2816 {
2817 #ifdef __WXWINCE__
2818 return false;
2819 #else
2820 static int s_isAtLeastWin2kSP4 = -1;
2821
2822 if ( s_isAtLeastWin2kSP4 == -1 )
2823 {
2824 OSVERSIONINFOEX ver;
2825
2826 memset(&ver, 0, sizeof(ver));
2827 ver.dwOSVersionInfoSize = sizeof(ver);
2828 GetVersionEx((OSVERSIONINFO*)&ver);
2829
2830 s_isAtLeastWin2kSP4 =
2831 ((ver.dwMajorVersion > 5) || // Vista+
2832 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2833 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2834 ver.wServicePackMajor >= 4)) // 2000 SP4+
2835 ? 1 : 0;
2836 }
2837
2838 return s_isAtLeastWin2kSP4 == 1;
2839 #endif
2840 }
2841
2842
2843 // the code page we're working with
2844 long m_CodePage;
2845
2846 // cached result of GetMBNulLen(), set to 0 initially meaning
2847 // "unknown"
2848 size_t m_minMBCharWidth;
2849 };
2850
2851 #endif // wxHAVE_WIN32_MB2WC
2852
2853
2854 // ============================================================================
2855 // wxEncodingConverter based conversion classes
2856 // ============================================================================
2857
2858 #if wxUSE_FONTMAP
2859
2860 class wxMBConv_wxwin : public wxMBConv
2861 {
2862 private:
2863 void Init()
2864 {
2865 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2866 // The wxMBConv_cf class does a better job.
2867 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2868 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2869 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2870 }
2871
2872 public:
2873 // temporarily just use wxEncodingConverter stuff,
2874 // so that it works while a better implementation is built
2875 wxMBConv_wxwin(const char* name)
2876 {
2877 if (name)
2878 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2879 else
2880 m_enc = wxFONTENCODING_SYSTEM;
2881
2882 Init();
2883 }
2884
2885 wxMBConv_wxwin(wxFontEncoding enc)
2886 {
2887 m_enc = enc;
2888
2889 Init();
2890 }
2891
2892 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2893 {
2894 size_t inbuf = strlen(psz);
2895 if (buf)
2896 {
2897 if (!m2w.Convert(psz, buf))
2898 return wxCONV_FAILED;
2899 }
2900 return inbuf;
2901 }
2902
2903 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2904 {
2905 const size_t inbuf = wxWcslen(psz);
2906 if (buf)
2907 {
2908 if (!w2m.Convert(psz, buf))
2909 return wxCONV_FAILED;
2910 }
2911
2912 return inbuf;
2913 }
2914
2915 virtual size_t GetMBNulLen() const
2916 {
2917 switch ( m_enc )
2918 {
2919 case wxFONTENCODING_UTF16BE:
2920 case wxFONTENCODING_UTF16LE:
2921 return 2;
2922
2923 case wxFONTENCODING_UTF32BE:
2924 case wxFONTENCODING_UTF32LE:
2925 return 4;
2926
2927 default:
2928 return 1;
2929 }
2930 }
2931
2932 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2933
2934 bool IsOk() const { return m_ok; }
2935
2936 public:
2937 wxFontEncoding m_enc;
2938 wxEncodingConverter m2w, w2m;
2939
2940 private:
2941 // were we initialized successfully?
2942 bool m_ok;
2943
2944 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2945 };
2946
2947 // make the constructors available for unit testing
2948 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2949 {
2950 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2951 if ( !result->IsOk() )
2952 {
2953 delete result;
2954 return 0;
2955 }
2956
2957 return result;
2958 }
2959
2960 #endif // wxUSE_FONTMAP
2961
2962 // ============================================================================
2963 // wxCSConv implementation
2964 // ============================================================================
2965
2966 void wxCSConv::Init()
2967 {
2968 m_name = NULL;
2969 m_convReal = NULL;
2970 }
2971
2972 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2973 {
2974 switch ( encoding )
2975 {
2976 case wxFONTENCODING_MAX:
2977 case wxFONTENCODING_SYSTEM:
2978 if ( m_name )
2979 {
2980 // It's ok to not have encoding value if we have a name for it.
2981 m_encoding = wxFONTENCODING_SYSTEM;
2982 }
2983 else // No name neither.
2984 {
2985 // Fall back to the system default encoding in this case (not
2986 // sure how much sense does this make but this is how the old
2987 // code used to behave).
2988 #if wxUSE_INTL
2989 m_encoding = wxLocale::GetSystemEncoding();
2990 if ( m_encoding == wxFONTENCODING_SYSTEM )
2991 #endif // wxUSE_INTL
2992 m_encoding = wxFONTENCODING_ISO8859_1;
2993 }
2994 break;
2995
2996 case wxFONTENCODING_DEFAULT:
2997 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2998 m_encoding = wxFONTENCODING_ISO8859_1;
2999 break;
3000
3001 default:
3002 // Just use the provided encoding.
3003 m_encoding = encoding;
3004 }
3005 }
3006
3007 wxCSConv::wxCSConv(const wxString& charset)
3008 {
3009 Init();
3010
3011 if ( !charset.empty() )
3012 {
3013 SetName(charset.ToAscii());
3014 }
3015
3016 #if wxUSE_FONTMAP
3017 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
3018 #else
3019 SetEncoding(wxFONTENCODING_SYSTEM);
3020 #endif
3021
3022 m_convReal = DoCreate();
3023 }
3024
3025 wxCSConv::wxCSConv(wxFontEncoding encoding)
3026 {
3027 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3028 {
3029 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3030
3031 encoding = wxFONTENCODING_SYSTEM;
3032 }
3033
3034 Init();
3035
3036 SetEncoding(encoding);
3037
3038 m_convReal = DoCreate();
3039 }
3040
3041 wxCSConv::~wxCSConv()
3042 {
3043 Clear();
3044 }
3045
3046 wxCSConv::wxCSConv(const wxCSConv& conv)
3047 : wxMBConv()
3048 {
3049 Init();
3050
3051 SetName(conv.m_name);
3052 SetEncoding(conv.m_encoding);
3053
3054 m_convReal = DoCreate();
3055 }
3056
3057 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3058 {
3059 Clear();
3060
3061 SetName(conv.m_name);
3062 SetEncoding(conv.m_encoding);
3063
3064 m_convReal = DoCreate();
3065
3066 return *this;
3067 }
3068
3069 void wxCSConv::Clear()
3070 {
3071 free(m_name);
3072 m_name = NULL;
3073
3074 wxDELETE(m_convReal);
3075 }
3076
3077 void wxCSConv::SetName(const char *charset)
3078 {
3079 if ( charset )
3080 m_name = wxStrdup(charset);
3081 }
3082
3083 #if wxUSE_FONTMAP
3084
3085 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3086 wxEncodingNameCache );
3087
3088 static wxEncodingNameCache gs_nameCache;
3089 #endif
3090
3091 wxMBConv *wxCSConv::DoCreate() const
3092 {
3093 #if wxUSE_FONTMAP
3094 wxLogTrace(TRACE_STRCONV,
3095 wxT("creating conversion for %s"),
3096 (m_name ? m_name
3097 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3098 #endif // wxUSE_FONTMAP
3099
3100 // check for the special case of ASCII or ISO8859-1 charset: as we have
3101 // special knowledge of it anyhow, we don't need to create a special
3102 // conversion object
3103 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3104 {
3105 // don't convert at all
3106 return NULL;
3107 }
3108
3109 // we trust OS to do conversion better than we can so try external
3110 // conversion methods first
3111 //
3112 // the full order is:
3113 // 1. OS conversion (iconv() under Unix or Win32 API)
3114 // 2. hard coded conversions for UTF
3115 // 3. wxEncodingConverter as fall back
3116
3117 // step (1)
3118 #ifdef HAVE_ICONV
3119 #if !wxUSE_FONTMAP
3120 if ( m_name )
3121 #endif // !wxUSE_FONTMAP
3122 {
3123 #if wxUSE_FONTMAP
3124 wxFontEncoding encoding(m_encoding);
3125 #endif
3126
3127 if ( m_name )
3128 {
3129 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3130 if ( conv->IsOk() )
3131 return conv;
3132
3133 delete conv;
3134
3135 #if wxUSE_FONTMAP
3136 encoding =
3137 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3138 #endif // wxUSE_FONTMAP
3139 }
3140 #if wxUSE_FONTMAP
3141 {
3142 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3143 if ( it != gs_nameCache.end() )
3144 {
3145 if ( it->second.empty() )
3146 return NULL;
3147
3148 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3149 if ( conv->IsOk() )
3150 return conv;
3151
3152 delete conv;
3153 }
3154
3155 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3156 // CS : in case this does not return valid names (eg for MacRoman)
3157 // encoding got a 'failure' entry in the cache all the same,
3158 // although it just has to be created using a different method, so
3159 // only store failed iconv creation attempts (or perhaps we
3160 // shoulnd't do this at all ?)
3161 if ( names[0] != NULL )
3162 {
3163 for ( ; *names; ++names )
3164 {
3165 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3166 // will need changes that will obsolete this
3167 wxString name(*names);
3168 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3169 if ( conv->IsOk() )
3170 {
3171 gs_nameCache[encoding] = *names;
3172 return conv;
3173 }
3174
3175 delete conv;
3176 }
3177
3178 gs_nameCache[encoding] = wxT(""); // cache the failure
3179 }
3180 }
3181 #endif // wxUSE_FONTMAP
3182 }
3183 #endif // HAVE_ICONV
3184
3185 #ifdef wxHAVE_WIN32_MB2WC
3186 {
3187 #if wxUSE_FONTMAP
3188 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3189 : new wxMBConv_win32(m_encoding);
3190 if ( conv->IsOk() )
3191 return conv;
3192
3193 delete conv;
3194 #else
3195 return NULL;
3196 #endif
3197 }
3198 #endif // wxHAVE_WIN32_MB2WC
3199
3200 #ifdef __DARWIN__
3201 {
3202 // leave UTF16 and UTF32 to the built-ins of wx
3203 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3204 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3205 {
3206 #if wxUSE_FONTMAP
3207 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3208 : new wxMBConv_cf(m_encoding);
3209 #else
3210 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3211 #endif
3212
3213 if ( conv->IsOk() )
3214 return conv;
3215
3216 delete conv;
3217 }
3218 }
3219 #endif // __DARWIN__
3220
3221 // step (2)
3222 wxFontEncoding enc = m_encoding;
3223 #if wxUSE_FONTMAP
3224 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3225 {
3226 // use "false" to suppress interactive dialogs -- we can be called from
3227 // anywhere and popping up a dialog from here is the last thing we want to
3228 // do
3229 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3230 }
3231 #endif // wxUSE_FONTMAP
3232
3233 switch ( enc )
3234 {
3235 case wxFONTENCODING_UTF7:
3236 return new wxMBConvUTF7;
3237
3238 case wxFONTENCODING_UTF8:
3239 return new wxMBConvUTF8;
3240
3241 case wxFONTENCODING_UTF16BE:
3242 return new wxMBConvUTF16BE;
3243
3244 case wxFONTENCODING_UTF16LE:
3245 return new wxMBConvUTF16LE;
3246
3247 case wxFONTENCODING_UTF32BE:
3248 return new wxMBConvUTF32BE;
3249
3250 case wxFONTENCODING_UTF32LE:
3251 return new wxMBConvUTF32LE;
3252
3253 default:
3254 // nothing to do but put here to suppress gcc warnings
3255 break;
3256 }
3257
3258 // step (3)
3259 #if wxUSE_FONTMAP
3260 {
3261 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3262 : new wxMBConv_wxwin(m_encoding);
3263 if ( conv->IsOk() )
3264 return conv;
3265
3266 delete conv;
3267 }
3268
3269 wxLogTrace(TRACE_STRCONV,
3270 wxT("encoding \"%s\" is not supported by this system"),
3271 (m_name ? wxString(m_name)
3272 : wxFontMapperBase::GetEncodingName(m_encoding)));
3273 #endif // wxUSE_FONTMAP
3274
3275 return NULL;
3276 }
3277
3278 bool wxCSConv::IsOk() const
3279 {
3280 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3281 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3282 return true; // always ok as we do it ourselves
3283
3284 // m_convReal->IsOk() is called at its own creation, so we know it must
3285 // be ok if m_convReal is non-NULL
3286 return m_convReal != NULL;
3287 }
3288
3289 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3290 const char *src, size_t srcLen) const
3291 {
3292 if (m_convReal)
3293 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3294
3295 // latin-1 (direct)
3296 if ( srcLen == wxNO_LEN )
3297 srcLen = strlen(src) + 1; // take trailing NUL too
3298
3299 if ( dst )
3300 {
3301 if ( dstLen < srcLen )
3302 return wxCONV_FAILED;
3303
3304 for ( size_t n = 0; n < srcLen; n++ )
3305 dst[n] = (unsigned char)(src[n]);
3306 }
3307
3308 return srcLen;
3309 }
3310
3311 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3312 const wchar_t *src, size_t srcLen) const
3313 {
3314 if (m_convReal)
3315 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3316
3317 // latin-1 (direct)
3318 if ( srcLen == wxNO_LEN )
3319 srcLen = wxWcslen(src) + 1;
3320
3321 if ( dst )
3322 {
3323 if ( dstLen < srcLen )
3324 return wxCONV_FAILED;
3325
3326 for ( size_t n = 0; n < srcLen; n++ )
3327 {
3328 if ( src[n] > 0xFF )
3329 return wxCONV_FAILED;
3330
3331 dst[n] = (char)src[n];
3332 }
3333
3334 }
3335 else // still need to check the input validity
3336 {
3337 for ( size_t n = 0; n < srcLen; n++ )
3338 {
3339 if ( src[n] > 0xFF )
3340 return wxCONV_FAILED;
3341 }
3342 }
3343
3344 return srcLen;
3345 }
3346
3347 size_t wxCSConv::GetMBNulLen() const
3348 {
3349 if ( m_convReal )
3350 return m_convReal->GetMBNulLen();
3351
3352 // otherwise, we are ISO-8859-1
3353 return 1;
3354 }
3355
3356 #if wxUSE_UNICODE_UTF8
3357 bool wxCSConv::IsUTF8() const
3358 {
3359 if ( m_convReal )
3360 return m_convReal->IsUTF8();
3361
3362 // otherwise, we are ISO-8859-1
3363 return false;
3364 }
3365 #endif
3366
3367
3368 #if wxUSE_UNICODE
3369
3370 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3371 {
3372 if ( !s )
3373 return wxWCharBuffer();
3374
3375 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3376 if ( !wbuf )
3377 wbuf = wxMBConvUTF8().cMB2WX(s);
3378 if ( !wbuf )
3379 wbuf = wxConvISO8859_1.cMB2WX(s);
3380
3381 return wbuf;
3382 }
3383
3384 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3385 {
3386 if ( !ws )
3387 return wxCharBuffer();
3388
3389 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3390 if ( !buf )
3391 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3392
3393 return buf;
3394 }
3395
3396 #endif // wxUSE_UNICODE
3397
3398 // ----------------------------------------------------------------------------
3399 // globals
3400 // ----------------------------------------------------------------------------
3401
3402 // NB: The reason why we create converter objects in this convoluted way,
3403 // using a factory function instead of global variable, is that they
3404 // may be used at static initialization time (some of them are used by
3405 // wxString ctors and there may be a global wxString object). In other
3406 // words, possibly _before_ the converter global object would be
3407 // initialized.
3408
// These names are macros at this point: undefine them so that they can be
// used as plain identifiers in the definitions generated below.
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define everything needed for the global converter called "name":
//  - the exported pointer name##Ptr, initially NULL
//  - the exported accessor wxGet_##name##Ptr() which returns the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args, as a pointer to klass
//  - a file-local static pointer initialized by calling the accessor
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object itself is of type klass.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3429
3430 #ifdef __INTELC__
3431 // disable warning "variable 'xxx' was declared but never referenced"
3432 #pragma warning(disable: 177)
3433 #endif // Intel C++
3434
3435 #ifdef __WINDOWS__
3436 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3437 #elif 0 // defined(__WXOSX__)
3438 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
3439 #else
3440 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3441 #endif
3442
3443 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3444 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3445 // provokes an error message about "not enough macro parameters"; and we
3446 // can't use "()" here as the name##Obj declaration would be parsed as a
3447 // function declaration then, so use a semicolon and live with an extra
3448 // empty statement (and hope that no compilers warns about this)
3449 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3450 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3451
3452 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3453 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3454
3455 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3456 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3457
3458 #ifdef __DARWIN__
3459 // It is important to use this conversion object under Darwin as it ensures
3460 // that Unicode strings are (re)composed correctly even though xnu kernel uses
3461 // decomposed form internally (at least for the file names).
3462 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3463 #endif
3464
3465 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3466 #ifdef __DARWIN__
3467 &wxConvMacUTF8DObj;
3468 #else // !__DARWIN__
3469 wxGet_wxConvLibcPtr();
3470 #endif // __DARWIN__/!__DARWIN__