Return valid buffer from wxMBConv::c{MB,WC}2{WC,MB} for empty input.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #ifndef __WXWINCE__
32 #include <errno.h>
33 #endif
34
35 #include <ctype.h>
36 #include <string.h>
37 #include <stdlib.h>
38
39 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #define wxHAVE_WIN32_MB2WC
43 #endif
44
45 #ifdef HAVE_ICONV
46 #include <iconv.h>
47 #include "wx/thread.h"
48 #endif
49
50 #include "wx/encconv.h"
51 #include "wx/fontmap.h"
52
53 #ifdef __DARWIN__
54 #include "wx/osx/core/private/strconv_cf.h"
55 #endif //def __DARWIN__
56
57
58 #define TRACE_STRCONV wxT("strconv")
59
60 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61 // be 4 bytes
62 #if SIZEOF_WCHAR_T == 2
63 #define WC_UTF16
64 #endif
65
66
67 // ============================================================================
68 // implementation
69 // ============================================================================
70
// Helper of the conversion functions below: returns true if at least one of
// the n bytes starting at p is not NUL and false if all of them are NUL (which
// is trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
79
80 // ----------------------------------------------------------------------------
81 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
82 // ----------------------------------------------------------------------------
83
84 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
85 {
86 if (input <= 0xffff)
87 {
88 if (output)
89 *output = (wxUint16) input;
90
91 return 1;
92 }
93 else if (input >= 0x110000)
94 {
95 return wxCONV_FAILED;
96 }
97 else
98 {
99 if (output)
100 {
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
103 }
104
105 return 2;
106 }
107 }
108
109 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
110 {
111 if ((*input < 0xd800) || (*input > 0xdfff))
112 {
113 output = *input;
114 return 1;
115 }
116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
117 {
118 output = *input;
119 return wxCONV_FAILED;
120 }
121 else
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
126 }
127
128 #ifdef WC_UTF16
129 typedef wchar_t wxDecodeSurrogate_t;
130 #else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132 #endif // WC_UTF16/!WC_UTF16
133
134 // returns the next UTF-32 character from the wchar_t buffer and advances the
135 // pointer to the character after this one
136 //
137 // if an invalid character is found, *pSrc is set to NULL, the caller must
138 // check for this
139 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
140 {
141 wxUint32 out;
142 const size_t
143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150 }
151
152 // ----------------------------------------------------------------------------
153 // wxMBConv
154 // ----------------------------------------------------------------------------
155
156 size_t
157 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
158 const char *src, size_t srcLen) const
159 {
160 // although new conversion classes are supposed to implement this function
161 // directly, the existing ones only implement the old MB2WC() and so, to
162 // avoid to have to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
166 //
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
170
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten = 0;
173
174 // the number of NULs terminating this string
175 size_t nulLen = 0; // not really needed, but just to avoid warnings
176
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
180 // NULs at the end
181 wxCharBuffer bufTmp;
182 const char *srcEnd;
183 if ( srcLen != wxNO_LEN )
184 {
185 // we need to know how to find the end of this string
186 nulLen = GetMBNulLen();
187 if ( nulLen == wxCONV_FAILED )
188 return wxCONV_FAILED;
189
190 // if there are enough NULs we can avoid the copy
191 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
192 {
193 // make a copy in order to properly NUL-terminate the string
194 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
195 char * const p = bufTmp.data();
196 memcpy(p, src, srcLen);
197 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
198 *s = '\0';
199
200 src = bufTmp;
201 }
202
203 srcEnd = src + srcLen;
204 }
205 else // quit after the first loop iteration
206 {
207 srcEnd = NULL;
208 }
209
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
212 // with the result
213 //
214 // all the complication come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
220 // string
221 //
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
225 for ( ;; )
226 {
227 // try to convert the current chunk
228 size_t lenChunk = MB2WC(NULL, src, 0);
229 if ( lenChunk == wxCONV_FAILED )
230 return wxCONV_FAILED;
231
232 dstWritten += lenChunk;
233 if ( !srcEnd )
234 dstWritten++;
235
236 if ( !lenChunk )
237 {
238 // nothing left in the input string, conversion succeeded
239 break;
240 }
241
242 if ( dst )
243 {
244 if ( dstWritten > dstLen )
245 return wxCONV_FAILED;
246
247 // +1 is for trailing NUL
248 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
252 if ( !srcEnd )
253 dst++;
254 }
255
256 if ( !srcEnd )
257 {
258 // we convert just one chunk in this case as this is the entire
259 // string anyhow (and we don't count the trailing NUL in this case)
260 break;
261 }
262
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
266 while ( NotAllNULs(src, nulLen) )
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
272 src += nulLen;
273 }
274
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
277 if ( src == srcEnd )
278 break;
279
280 // do count this terminator as it's inside the buffer we convert
281 dstWritten++;
282 if ( dst )
283 dst++;
284
285 src += nulLen; // skip the terminator itself
286
287 if ( src >= srcEnd )
288 break;
289 }
290
291 return dstWritten;
292 }
293
294 size_t
295 wxMBConv::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
297 {
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten = 0;
300
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated = srcLen == wxNO_LEN;
305
306 // make a copy of the input string unless it is already properly
307 // NUL-terminated
308 wxWCharBuffer bufTmp;
309 if ( isNulTerminated )
310 {
311 srcLen = wxWcslen(src) + 1;
312 }
313 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
314 {
315 // make a copy in order to properly NUL-terminate the string
316 bufTmp = wxWCharBuffer(srcLen);
317 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
318 src = bufTmp;
319 }
320
321 const size_t lenNul = GetMBNulLen();
322 for ( const wchar_t * const srcEnd = src + srcLen;
323 src < srcEnd;
324 src++ /* skip L'\0' too */ )
325 {
326 // try to convert the current chunk
327 size_t lenChunk = WC2MB(NULL, src, 0);
328 if ( lenChunk == wxCONV_FAILED )
329 return wxCONV_FAILED;
330
331 dstWritten += lenChunk;
332
333 const wchar_t * const
334 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
335
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
338 // ourselves above
339 if ( chunkEnd < srcEnd )
340 dstWritten += lenNul;
341
342 if ( dst )
343 {
344 if ( dstWritten > dstLen )
345 return wxCONV_FAILED;
346
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
351 wxCharBuffer dstBuf;
352 char *dstTmp;
353 if ( chunkEnd == srcEnd )
354 {
355 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
356 dstTmp = dstBuf.data();
357 }
358 else
359 {
360 dstTmp = dst;
361 }
362
363 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
364 return wxCONV_FAILED;
365
366 if ( dstTmp != dst )
367 {
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst, dstTmp, lenChunk);
371
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
374 break;
375 }
376
377 dst += lenChunk;
378 if ( chunkEnd < srcEnd )
379 dst += lenNul;
380 }
381
382 src = chunkEnd;
383 }
384
385 return dstWritten;
386 }
387
388 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
389 {
390 size_t rc = ToWChar(outBuff, outLen, inBuff);
391 if ( rc != wxCONV_FAILED )
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399 }
400
401 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
402 {
403 size_t rc = FromWChar(outBuff, outLen, inBuff);
404 if ( rc != wxCONV_FAILED )
405 {
406 rc -= GetMBNulLen();
407 }
408
409 return rc;
410 }
411
// Out of line definition of the destructor: it has no work to do.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
416
417 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418 {
419 if ( psz )
420 {
421 // calculate the length of the buffer needed first
422 const size_t nLen = ToWChar(NULL, 0, psz);
423 if ( nLen != wxCONV_FAILED )
424 {
425 // now do the actual conversion
426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
427
428 // +1 for the trailing NULL
429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
430 return buf;
431 }
432 }
433
434 return wxWCharBuffer();
435 }
436
437 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438 {
439 if ( pwz )
440 {
441 const size_t nLen = FromWChar(NULL, 0, pwz);
442 if ( nLen != wxCONV_FAILED )
443 {
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451 }
452
453 const wxWCharBuffer
454 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
455 {
456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
457 if ( dstLen != wxCONV_FAILED )
458 {
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
463 wbuf.data()[dstLen] = L'\0';
464 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
465 {
466 if ( outLen )
467 {
468 *outLen = dstLen;
469
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen == wxNO_LEN )
476 (*outLen)--;
477 }
478
479 return wbuf;
480 }
481 }
482
483 if ( outLen )
484 *outLen = 0;
485
486 return wxWCharBuffer();
487 }
488
489 const wxCharBuffer
490 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
491 {
492 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
493 if ( dstLen != wxCONV_FAILED )
494 {
495 const size_t nulLen = GetMBNulLen();
496
497 // as above, ensure that the buffer is always NUL-terminated, even if
498 // the input is not
499 wxCharBuffer buf(dstLen + nulLen - 1);
500 memset(buf.data() + dstLen, 0, nulLen);
501 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
502 {
503 if ( outLen )
504 {
505 *outLen = dstLen;
506
507 if ( inLen == wxNO_LEN )
508 {
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
511 *outLen -= nulLen;
512 }
513 }
514
515 return buf;
516 }
517 }
518
519 if ( outLen )
520 *outLen = 0;
521
522 return wxCharBuffer();
523 }
524
525 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
526 {
527 const size_t srcLen = buf.length();
528 if ( srcLen )
529 {
530 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
531 if ( dstLen != wxCONV_FAILED )
532 {
533 wxWCharBuffer wbuf(dstLen);
534 wbuf.data()[dstLen] = L'\0';
535 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
536 return wbuf;
537 }
538 }
539
540 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
541 }
542
543 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
544 {
545 const size_t srcLen = wbuf.length();
546 if ( srcLen )
547 {
548 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
549 if ( dstLen != wxCONV_FAILED )
550 {
551 wxCharBuffer buf(dstLen);
552 buf.data()[dstLen] = '\0';
553 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
554 return buf;
555 }
556 }
557
558 return wxScopedCharBuffer::CreateNonOwned("", 0);
559 }
560
561 // ----------------------------------------------------------------------------
562 // wxMBConvLibc
563 // ----------------------------------------------------------------------------
564
565 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
566 {
567 return wxMB2WC(buf, psz, n);
568 }
569
570 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
571 {
572 return wxWC2MB(buf, psz, n);
573 }
574
575 // ----------------------------------------------------------------------------
576 // wxConvBrokenFileNames
577 // ----------------------------------------------------------------------------
578
#ifdef __UNIX__

// Creates the conversion used for the given charset: for UTF-8 (spelled with
// or without the hyphen, in any case) a wxMBConvUTF8 mapping invalid
// sequences to the PUA is used and a wxCSConv for this charset for anything
// else.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    const bool isUTF8 = wxStricmp(charset, wxT("UTF-8")) == 0 ||
                        wxStricmp(charset, wxT("UTF8")) == 0;

    if ( isUTF8 )
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
    else
        m_conv = new wxCSConv(charset);
}

#endif // __UNIX__
591
592 // ----------------------------------------------------------------------------
593 // UTF-7
594 // ----------------------------------------------------------------------------
595
596 // Implementation (C) 2004 Fredrik Roubert
597 //
598 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
599
//
// BASE64 decoding table
//
// It is indexed by the byte value and contains the 6 bit value (0..0x3f) of
// the bytes which are characters of the base64 alphabet ('A'..'Z', 'a'..'z',
// '0'..'9', '+' and '/') and 0xff for all the other ones, which therefore end
// the encoded part of the string in wxMBConvUTF7::ToWChar().
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
638
639 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
640 const char *src, size_t srcLen) const
641 {
642 DecoderState stateOrig,
643 *statePtr;
644 if ( srcLen == wxNO_LEN )
645 {
646 // convert the entire string, up to and including the trailing NUL
647 srcLen = strlen(src) + 1;
648
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr = &stateOrig;
652 }
653 else // when working with partial strings we do use the shift state
654 {
655 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
656
657 // also save the old state to be able to rollback to it on error
658 stateOrig = m_stateDecoder;
659 }
660
661 // but to simplify the code below we use this variable in both cases
662 DecoderState& state = *statePtr;
663
664
665 // number of characters [which would have been] written to dst [if it were
666 // not NULL]
667 size_t len = 0;
668
669 const char * const srcEnd = src + srcLen;
670
671 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
672 {
673 const unsigned char cc = *src++;
674
675 if ( state.IsShifted() )
676 {
677 const unsigned char dc = utf7unb64[cc];
678 if ( dc == 0xff )
679 {
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
684 // characters)
685 if ( state.isLSB || state.bit > 4 ||
686 (state.accum & ((1 << state.bit) - 1)) )
687 {
688 if ( !len )
689 state = stateOrig;
690
691 return wxCONV_FAILED;
692 }
693
694 state.ToDirect();
695
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
698 if ( cc == '-' )
699 continue;
700 }
701 else // valid encoded character
702 {
703 // mini base64 decoder: each character is 6 bits
704 state.bit += 6;
705 state.accum <<= 6;
706 state.accum += dc;
707
708 if ( state.bit >= 8 )
709 {
710 // got the full byte, consume it
711 state.bit -= 8;
712 unsigned char b = (state.accum >> state.bit) & 0x00ff;
713
714 if ( state.isLSB )
715 {
716 // we've got the full word, output it
717 if ( dst )
718 *dst++ = (state.msb << 8) | b;
719 len++;
720 state.isLSB = false;
721 }
722 else // MSB
723 {
724 // just store it while we wait for LSB
725 state.msb = b;
726 state.isLSB = true;
727 }
728 }
729 }
730 }
731
732 if ( state.IsDirect() )
733 {
734 // start of an encoded segment?
735 if ( cc == '+' )
736 {
737 if ( *src == '-' )
738 {
739 // just the encoded plus sign, don't switch to shifted mode
740 if ( dst )
741 *dst++ = '+';
742 len++;
743 src++;
744 }
745 else if ( utf7unb64[(unsigned)*src] == 0xff )
746 {
747 // empty encoded chunks are not allowed
748 if ( !len )
749 state = stateOrig;
750
751 return wxCONV_FAILED;
752 }
753 else // base-64 encoded chunk follows
754 {
755 state.ToShifted();
756 }
757 }
758 else // not '+'
759 {
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc >= 0x7f || (cc < ' ' &&
763 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
764 return wxCONV_FAILED;
765
766 if ( dst )
767 *dst++ = cc;
768 len++;
769 }
770 }
771 }
772
773 if ( !len )
774 {
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
777 // state
778 state = stateOrig;
779
780 return wxCONV_FAILED;
781 }
782
783 return len;
784 }
785
//
// BASE64 encoding table
//
// Indexed by a 6 bit value (0..63), it gives the character of the base64
// alphabet used to represent this value in the encoded output.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
800
//
// UTF-7 encoding table
//
// Indexed by the 7 bit ASCII character code, each entry is one of:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if the character can be written to UTF-7 output as is, i.e. if
// it is a 7 bit character with the entry 0 in the table above.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms and a negative value would
    // pass a plain "wc < 0x80" test and then be used as a negative index into
    // the table, so do the range check on the unsigned value instead
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
825
826 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
827 const wchar_t *src, size_t srcLen) const
828 {
829 EncoderState stateOrig,
830 *statePtr;
831 if ( srcLen == wxNO_LEN )
832 {
833 // we don't apply the stored state when operating on entire strings at
834 // once
835 statePtr = &stateOrig;
836
837 srcLen = wxWcslen(src) + 1;
838 }
839 else // do use the mode we left the output in previously
840 {
841 stateOrig = m_stateEncoder;
842 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
843 }
844
845 EncoderState& state = *statePtr;
846
847
848 size_t len = 0;
849
850 const wchar_t * const srcEnd = src + srcLen;
851 while ( src < srcEnd && (!dst || len < dstLen) )
852 {
853 wchar_t cc = *src++;
854 if ( wxIsUTF7Direct(cc) )
855 {
856 if ( state.IsShifted() )
857 {
858 // pad with zeros the last encoded block if necessary
859 if ( state.bit )
860 {
861 if ( dst )
862 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
863 len++;
864 }
865
866 state.ToDirect();
867
868 if ( dst )
869 *dst++ = '-';
870 len++;
871 }
872
873 if ( dst )
874 *dst++ = (char)cc;
875 len++;
876 }
877 else if ( cc == '+' && state.IsDirect() )
878 {
879 if ( dst )
880 {
881 *dst++ = '+';
882 *dst++ = '-';
883 }
884
885 len += 2;
886 }
887 #ifndef WC_UTF16
888 else if (((wxUint32)cc) > 0xffff)
889 {
890 // no surrogate pair generation (yet?)
891 return wxCONV_FAILED;
892 }
893 #endif
894 else
895 {
896 if ( state.IsDirect() )
897 {
898 state.ToShifted();
899
900 if ( dst )
901 *dst++ = '+';
902 len++;
903 }
904
905 // BASE64 encode string
906 for ( ;; )
907 {
908 for ( unsigned lsb = 0; lsb < 2; lsb++ )
909 {
910 state.accum <<= 8;
911 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
912
913 for (state.bit += 8; state.bit >= 6; )
914 {
915 state.bit -= 6;
916 if ( dst )
917 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
918 len++;
919 }
920 }
921
922 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
923 break;
924
925 src++;
926 }
927 }
928 }
929
930 // we need to restore the original encoder state if we were called just to
931 // calculate the amount of space needed as we will presumably be called
932 // again to really convert the data now
933 if ( !dst )
934 state = stateOrig;
935
936 return len;
937 }
938
939 // ----------------------------------------------------------------------------
940 // UTF-8
941 // ----------------------------------------------------------------------------
942
// NOTE(review): presumably the largest value which can be encoded using a
// UTF-8 sequence of (index + 1) bytes; the table is not used in the part of
// the file visible here, confirm against its users
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: it has 256
// elements
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
950
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte and contains the total number
// of bytes in the sequence, or 0 if this byte can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F

    // these are invalid (continuation bytes and the lead bytes of overlong
    // 2-byte sequences):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
    0, 0,                                           // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                  // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F5..FF
};
985
986 size_t
987 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989 {
990 wchar_t *out = dstLen ? dst : NULL;
991 size_t written = 0;
992
993 if ( srcLen == wxNO_LEN )
994 srcLen = strlen(src) + 1;
995
996 for ( const char *p = src; ; p++ )
997 {
998 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
999 {
1000 // all done successfully, just add the trailing NULL if we are not
1001 // using explicit length
1002 if ( srcLen == wxNO_LEN )
1003 {
1004 if ( out )
1005 {
1006 if ( !dstLen )
1007 break;
1008
1009 *out = L'\0';
1010 }
1011
1012 written++;
1013 }
1014
1015 return written;
1016 }
1017
1018 if ( out && !dstLen-- )
1019 break;
1020
1021 wxUint32 code;
1022 unsigned char c = *p;
1023
1024 if ( c < 0x80 )
1025 {
1026 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1027 break;
1028
1029 if ( srcLen != wxNO_LEN )
1030 srcLen--;
1031
1032 code = c;
1033 }
1034 else
1035 {
1036 unsigned len = tableUtf8Lengths[c];
1037 if ( !len )
1038 break;
1039
1040 if ( srcLen < len ) // the test works for wxNO_LEN too
1041 break;
1042
1043 if ( srcLen != wxNO_LEN )
1044 srcLen -= len;
1045
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1053 //
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1057
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1059 // length:
1060 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1061
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1065
1066 len--; // it's more convenient to work with 0-based length here
1067
1068 // extract the lead byte's value bits:
1069 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1070 break;
1071
1072 code = c & leadValueMask[len];
1073
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len; --len )
1077 {
1078 c = *++p;
1079 if ( (c & 0xC0) != 0x80 )
1080 return wxCONV_FAILED;
1081
1082 code <<= 6;
1083 code |= c & 0x3F;
1084 }
1085 }
1086
1087 #ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1090 {
1091 if ( out )
1092 out++;
1093 written++;
1094 }
1095 #else // !WC_UTF16
1096 if ( out )
1097 *out = code;
1098 #endif // WC_UTF16/!WC_UTF16
1099
1100 if ( out )
1101 out++;
1102
1103 written++;
1104 }
1105
1106 return wxCONV_FAILED;
1107 }
1108
1109 size_t
1110 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111 const wchar_t *src, size_t srcLen) const
1112 {
1113 char *out = dstLen ? dst : NULL;
1114 size_t written = 0;
1115
1116 for ( const wchar_t *wp = src; ; wp++ )
1117 {
1118 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1119 {
1120 // all done successfully, just add the trailing NULL if we are not
1121 // using explicit length
1122 if ( srcLen == wxNO_LEN )
1123 {
1124 if ( out )
1125 {
1126 if ( !dstLen )
1127 break;
1128
1129 *out = '\0';
1130 }
1131
1132 written++;
1133 }
1134
1135 return written;
1136 }
1137
1138 if ( srcLen != wxNO_LEN )
1139 srcLen--;
1140
1141 wxUint32 code;
1142 #ifdef WC_UTF16
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145 {
1146 // skip the next char too as we decoded a surrogate
1147 wp++;
1148 }
1149 #else // wchar_t is UTF-32
1150 code = *wp & 0x7fffffff;
1151 #endif
1152
1153 unsigned len;
1154 if ( code <= 0x7F )
1155 {
1156 len = 1;
1157 if ( out )
1158 {
1159 if ( dstLen < len )
1160 break;
1161
1162 out[0] = (char)code;
1163 }
1164 }
1165 else if ( code <= 0x07FF )
1166 {
1167 len = 2;
1168 if ( out )
1169 {
1170 if ( dstLen < len )
1171 break;
1172
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1176 out[0] = 0xC0 | code;
1177 }
1178 }
1179 else if ( code < 0xFFFF )
1180 {
1181 len = 3;
1182 if ( out )
1183 {
1184 if ( dstLen < len )
1185 break;
1186
1187 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1188 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[0] = 0xE0 | code;
1190 }
1191 }
1192 else if ( code <= 0x10FFFF )
1193 {
1194 len = 4;
1195 if ( out )
1196 {
1197 if ( dstLen < len )
1198 break;
1199
1200 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1201 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[0] = 0xF0 | code;
1204 }
1205 }
1206 else
1207 {
1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1209 break;
1210 }
1211
1212 if ( out )
1213 {
1214 out += len;
1215 dstLen -= len;
1216 }
1217
1218 written += len;
1219 }
1220
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED;
1223 }
1224
1225 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1226 const char *psz, size_t srcLen) const
1227 {
1228 if ( m_options == MAP_INVALID_UTF8_NOT )
1229 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1230
1231 size_t len = 0;
1232
1233 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1234 {
1235 const char *opsz = psz;
1236 bool invalid = false;
1237 unsigned char cc = *psz++, fc = cc;
1238 unsigned cnt;
1239 for (cnt = 0; fc & 0x80; cnt++)
1240 fc <<= 1;
1241
1242 if (!cnt)
1243 {
1244 // plain ASCII char
1245 if (buf)
1246 *buf++ = cc;
1247 len++;
1248
1249 // escape the escape character for octal escapes
1250 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1251 && cc == '\\' && (!buf || len < n))
1252 {
1253 if (buf)
1254 *buf++ = cc;
1255 len++;
1256 }
1257 }
1258 else
1259 {
1260 cnt--;
1261 if (!cnt)
1262 {
1263 // invalid UTF-8 sequence
1264 invalid = true;
1265 }
1266 else
1267 {
1268 unsigned ocnt = cnt - 1;
1269 wxUint32 res = cc & (0x3f >> cnt);
1270 while (cnt--)
1271 {
1272 cc = *psz;
1273 if ((cc & 0xC0) != 0x80)
1274 {
1275 // invalid UTF-8 sequence
1276 invalid = true;
1277 break;
1278 }
1279
1280 psz++;
1281 res = (res << 6) | (cc & 0x3f);
1282 }
1283
1284 if (invalid || res <= utf8_max[ocnt])
1285 {
1286 // illegal UTF-8 encoding
1287 invalid = true;
1288 }
1289 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1290 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1291 {
1292 // if one of our PUA characters turns up externally
1293 // it must also be treated as an illegal sequence
1294 // (a bit like you have to escape an escape character)
1295 invalid = true;
1296 }
1297 else
1298 {
1299 #ifdef WC_UTF16
1300 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1301 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1302 if (pa == wxCONV_FAILED)
1303 {
1304 invalid = true;
1305 }
1306 else
1307 {
1308 if (buf)
1309 buf += pa;
1310 len += pa;
1311 }
1312 #else // !WC_UTF16
1313 if (buf)
1314 *buf++ = (wchar_t)res;
1315 len++;
1316 #endif // WC_UTF16/!WC_UTF16
1317 }
1318 }
1319
1320 if (invalid)
1321 {
1322 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1323 {
1324 while (opsz < psz && (!buf || len < n))
1325 {
1326 #ifdef WC_UTF16
1327 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1328 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1329 wxASSERT(pa != wxCONV_FAILED);
1330 if (buf)
1331 buf += pa;
1332 opsz++;
1333 len += pa;
1334 #else
1335 if (buf)
1336 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1337 opsz++;
1338 len++;
1339 #endif
1340 }
1341 }
1342 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1343 {
1344 while (opsz < psz && (!buf || len < n))
1345 {
1346 if ( buf && len + 3 < n )
1347 {
1348 unsigned char on = *opsz;
1349 *buf++ = L'\\';
1350 *buf++ = (wchar_t)( L'0' + on / 0100 );
1351 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1352 *buf++ = (wchar_t)( L'0' + on % 010 );
1353 }
1354
1355 opsz++;
1356 len += 4;
1357 }
1358 }
1359 else // MAP_INVALID_UTF8_NOT
1360 {
1361 return wxCONV_FAILED;
1362 }
1363 }
1364 }
1365 }
1366
1367 if (srcLen == wxNO_LEN && buf && (len < n))
1368 *buf = 0;
1369
1370 return len + 1;
1371 }
1372
// Return true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1377
1378 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1379 const wchar_t *psz, size_t srcLen) const
1380 {
1381 if ( m_options == MAP_INVALID_UTF8_NOT )
1382 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1383
1384 size_t len = 0;
1385
1386 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1387 {
1388 wxUint32 cc;
1389
1390 #ifdef WC_UTF16
1391 // cast is ok for WC_UTF16
1392 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1393 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1394 #else
1395 cc = (*psz++) & 0x7fffffff;
1396 #endif
1397
1398 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1399 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1400 {
1401 if (buf)
1402 *buf++ = (char)(cc - wxUnicodePUA);
1403 len++;
1404 }
1405 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1406 && cc == L'\\' && psz[0] == L'\\' )
1407 {
1408 if (buf)
1409 *buf++ = (char)cc;
1410 psz++;
1411 len++;
1412 }
1413 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1414 cc == L'\\' &&
1415 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1416 {
1417 if (buf)
1418 {
1419 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1420 (psz[1] - L'0') * 010 +
1421 (psz[2] - L'0'));
1422 }
1423
1424 psz += 3;
1425 len++;
1426 }
1427 else
1428 {
1429 unsigned cnt;
1430 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1431 {
1432 }
1433
1434 if (!cnt)
1435 {
1436 // plain ASCII char
1437 if (buf)
1438 *buf++ = (char) cc;
1439 len++;
1440 }
1441 else
1442 {
1443 len += cnt + 1;
1444 if (buf)
1445 {
1446 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1447 while (cnt--)
1448 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1449 }
1450 }
1451 }
1452 }
1453
1454 if (srcLen == wxNO_LEN && buf && (len < n))
1455 *buf = 0;
1456
1457 return len + 1;
1458 }
1459
1460 // ============================================================================
1461 // UTF-16
1462 // ============================================================================
1463
// "straight" is the UTF-16 converter which doesn't change the byte order and
// "swap" the one which does: which of the LE/BE classes they correspond to
// depends on WORDS_BIGENDIAN
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1471
1472 /* static */
1473 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1474 {
1475 if ( srcLen == wxNO_LEN )
1476 {
1477 // count the number of bytes in input, including the trailing NULs
1478 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1479 for ( srcLen = 1; *inBuff++; srcLen++ )
1480 ;
1481
1482 srcLen *= BYTES_PER_CHAR;
1483 }
1484 else // we already have the length
1485 {
1486 // we can only convert an entire number of UTF-16 characters
1487 if ( srcLen % BYTES_PER_CHAR )
1488 return wxCONV_FAILED;
1489 }
1490
1491 return srcLen;
1492 }
1493
1494 // case when in-memory representation is UTF-16 too
1495 #ifdef WC_UTF16
1496
1497 // ----------------------------------------------------------------------------
1498 // conversions without endianness change
1499 // ----------------------------------------------------------------------------
1500
1501 size_t
1502 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1503 const char *src, size_t srcLen) const
1504 {
1505 // set up the scene for using memcpy() (which is presumably more efficient
1506 // than copying the bytes one by one)
1507 srcLen = GetLength(src, srcLen);
1508 if ( srcLen == wxNO_LEN )
1509 return wxCONV_FAILED;
1510
1511 const size_t inLen = srcLen / BYTES_PER_CHAR;
1512 if ( dst )
1513 {
1514 if ( dstLen < inLen )
1515 return wxCONV_FAILED;
1516
1517 memcpy(dst, src, srcLen);
1518 }
1519
1520 return inLen;
1521 }
1522
1523 size_t
1524 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1525 const wchar_t *src, size_t srcLen) const
1526 {
1527 if ( srcLen == wxNO_LEN )
1528 srcLen = wxWcslen(src) + 1;
1529
1530 srcLen *= BYTES_PER_CHAR;
1531
1532 if ( dst )
1533 {
1534 if ( dstLen < srcLen )
1535 return wxCONV_FAILED;
1536
1537 memcpy(dst, src, srcLen);
1538 }
1539
1540 return srcLen;
1541 }
1542
1543 // ----------------------------------------------------------------------------
1544 // endian-reversing conversions
1545 // ----------------------------------------------------------------------------
1546
1547 size_t
1548 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1549 const char *src, size_t srcLen) const
1550 {
1551 srcLen = GetLength(src, srcLen);
1552 if ( srcLen == wxNO_LEN )
1553 return wxCONV_FAILED;
1554
1555 srcLen /= BYTES_PER_CHAR;
1556
1557 if ( dst )
1558 {
1559 if ( dstLen < srcLen )
1560 return wxCONV_FAILED;
1561
1562 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1563 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1564 {
1565 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1566 }
1567 }
1568
1569 return srcLen;
1570 }
1571
1572 size_t
1573 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1574 const wchar_t *src, size_t srcLen) const
1575 {
1576 if ( srcLen == wxNO_LEN )
1577 srcLen = wxWcslen(src) + 1;
1578
1579 srcLen *= BYTES_PER_CHAR;
1580
1581 if ( dst )
1582 {
1583 if ( dstLen < srcLen )
1584 return wxCONV_FAILED;
1585
1586 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1587 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1588 {
1589 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1590 }
1591 }
1592
1593 return srcLen;
1594 }
1595
1596 #else // !WC_UTF16: wchar_t is UTF-32
1597
1598 // ----------------------------------------------------------------------------
1599 // conversions without endianness change
1600 // ----------------------------------------------------------------------------
1601
1602 size_t
1603 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1604 const char *src, size_t srcLen) const
1605 {
1606 srcLen = GetLength(src, srcLen);
1607 if ( srcLen == wxNO_LEN )
1608 return wxCONV_FAILED;
1609
1610 const size_t inLen = srcLen / BYTES_PER_CHAR;
1611 if ( !dst )
1612 {
1613 // optimization: return maximal space which could be needed for this
1614 // string even if the real size could be smaller if the buffer contains
1615 // any surrogates
1616 return inLen;
1617 }
1618
1619 size_t outLen = 0;
1620 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1621 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1622 {
1623 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1624 if ( !inBuff )
1625 return wxCONV_FAILED;
1626
1627 if ( ++outLen > dstLen )
1628 return wxCONV_FAILED;
1629
1630 *dst++ = ch;
1631 }
1632
1633
1634 return outLen;
1635 }
1636
1637 size_t
1638 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1639 const wchar_t *src, size_t srcLen) const
1640 {
1641 if ( srcLen == wxNO_LEN )
1642 srcLen = wxWcslen(src) + 1;
1643
1644 size_t outLen = 0;
1645 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1646 for ( size_t n = 0; n < srcLen; n++ )
1647 {
1648 wxUint16 cc[2];
1649 const size_t numChars = encode_utf16(*src++, cc);
1650 if ( numChars == wxCONV_FAILED )
1651 return wxCONV_FAILED;
1652
1653 outLen += numChars * BYTES_PER_CHAR;
1654 if ( outBuff )
1655 {
1656 if ( outLen > dstLen )
1657 return wxCONV_FAILED;
1658
1659 *outBuff++ = cc[0];
1660 if ( numChars == 2 )
1661 {
1662 // second character of a surrogate
1663 *outBuff++ = cc[1];
1664 }
1665 }
1666 }
1667
1668 return outLen;
1669 }
1670
1671 // ----------------------------------------------------------------------------
1672 // endian-reversing conversions
1673 // ----------------------------------------------------------------------------
1674
1675 size_t
1676 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1677 const char *src, size_t srcLen) const
1678 {
1679 srcLen = GetLength(src, srcLen);
1680 if ( srcLen == wxNO_LEN )
1681 return wxCONV_FAILED;
1682
1683 const size_t inLen = srcLen / BYTES_PER_CHAR;
1684 if ( !dst )
1685 {
1686 // optimization: return maximal space which could be needed for this
1687 // string even if the real size could be smaller if the buffer contains
1688 // any surrogates
1689 return inLen;
1690 }
1691
1692 size_t outLen = 0;
1693 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1694 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1695 {
1696 wxUint32 ch;
1697 wxUint16 tmp[2];
1698
1699 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1700 inBuff++;
1701 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1702
1703 const size_t numChars = decode_utf16(tmp, ch);
1704 if ( numChars == wxCONV_FAILED )
1705 return wxCONV_FAILED;
1706
1707 if ( numChars == 2 )
1708 inBuff++;
1709
1710 if ( ++outLen > dstLen )
1711 return wxCONV_FAILED;
1712
1713 *dst++ = ch;
1714 }
1715
1716
1717 return outLen;
1718 }
1719
1720 size_t
1721 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1722 const wchar_t *src, size_t srcLen) const
1723 {
1724 if ( srcLen == wxNO_LEN )
1725 srcLen = wxWcslen(src) + 1;
1726
1727 size_t outLen = 0;
1728 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1729 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1730 {
1731 wxUint16 cc[2];
1732 const size_t numChars = encode_utf16(*src, cc);
1733 if ( numChars == wxCONV_FAILED )
1734 return wxCONV_FAILED;
1735
1736 outLen += numChars * BYTES_PER_CHAR;
1737 if ( outBuff )
1738 {
1739 if ( outLen > dstLen )
1740 return wxCONV_FAILED;
1741
1742 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1743 if ( numChars == 2 )
1744 {
1745 // second character of a surrogate
1746 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1747 }
1748 }
1749 }
1750
1751 return outLen;
1752 }
1753
1754 #endif // WC_UTF16/!WC_UTF16
1755
1756
1757 // ============================================================================
1758 // UTF-32
1759 // ============================================================================
1760
1761 #ifdef WORDS_BIGENDIAN
1762 #define wxMBConvUTF32straight wxMBConvUTF32BE
1763 #define wxMBConvUTF32swap wxMBConvUTF32LE
1764 #else
1765 #define wxMBConvUTF32swap wxMBConvUTF32BE
1766 #define wxMBConvUTF32straight wxMBConvUTF32LE
1767 #endif
1768
1769
1770 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1771 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1772
1773 /* static */
1774 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1775 {
1776 if ( srcLen == wxNO_LEN )
1777 {
1778 // count the number of bytes in input, including the trailing NULs
1779 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1780 for ( srcLen = 1; *inBuff++; srcLen++ )
1781 ;
1782
1783 srcLen *= BYTES_PER_CHAR;
1784 }
1785 else // we already have the length
1786 {
1787 // we can only convert an entire number of UTF-32 characters
1788 if ( srcLen % BYTES_PER_CHAR )
1789 return wxCONV_FAILED;
1790 }
1791
1792 return srcLen;
1793 }
1794
1795 // case when in-memory representation is UTF-16
1796 #ifdef WC_UTF16
1797
1798 // ----------------------------------------------------------------------------
1799 // conversions without endianness change
1800 // ----------------------------------------------------------------------------
1801
1802 size_t
1803 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1804 const char *src, size_t srcLen) const
1805 {
1806 srcLen = GetLength(src, srcLen);
1807 if ( srcLen == wxNO_LEN )
1808 return wxCONV_FAILED;
1809
1810 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1811 const size_t inLen = srcLen / BYTES_PER_CHAR;
1812 size_t outLen = 0;
1813 for ( size_t n = 0; n < inLen; n++ )
1814 {
1815 wxUint16 cc[2];
1816 const size_t numChars = encode_utf16(*inBuff++, cc);
1817 if ( numChars == wxCONV_FAILED )
1818 return wxCONV_FAILED;
1819
1820 outLen += numChars;
1821 if ( dst )
1822 {
1823 if ( outLen > dstLen )
1824 return wxCONV_FAILED;
1825
1826 *dst++ = cc[0];
1827 if ( numChars == 2 )
1828 {
1829 // second character of a surrogate
1830 *dst++ = cc[1];
1831 }
1832 }
1833 }
1834
1835 return outLen;
1836 }
1837
1838 size_t
1839 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1840 const wchar_t *src, size_t srcLen) const
1841 {
1842 if ( srcLen == wxNO_LEN )
1843 srcLen = wxWcslen(src) + 1;
1844
1845 if ( !dst )
1846 {
1847 // optimization: return maximal space which could be needed for this
1848 // string instead of the exact amount which could be less if there are
1849 // any surrogates in the input
1850 //
1851 // we consider that surrogates are rare enough to make it worthwhile to
1852 // avoid running the loop below at the cost of slightly extra memory
1853 // consumption
1854 return srcLen * BYTES_PER_CHAR;
1855 }
1856
1857 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1858 size_t outLen = 0;
1859 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1860 {
1861 const wxUint32 ch = wxDecodeSurrogate(&src);
1862 if ( !src )
1863 return wxCONV_FAILED;
1864
1865 outLen += BYTES_PER_CHAR;
1866
1867 if ( outLen > dstLen )
1868 return wxCONV_FAILED;
1869
1870 *outBuff++ = ch;
1871 }
1872
1873 return outLen;
1874 }
1875
1876 // ----------------------------------------------------------------------------
1877 // endian-reversing conversions
1878 // ----------------------------------------------------------------------------
1879
1880 size_t
1881 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1882 const char *src, size_t srcLen) const
1883 {
1884 srcLen = GetLength(src, srcLen);
1885 if ( srcLen == wxNO_LEN )
1886 return wxCONV_FAILED;
1887
1888 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1889 const size_t inLen = srcLen / BYTES_PER_CHAR;
1890 size_t outLen = 0;
1891 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1892 {
1893 wxUint16 cc[2];
1894 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1895 if ( numChars == wxCONV_FAILED )
1896 return wxCONV_FAILED;
1897
1898 outLen += numChars;
1899 if ( dst )
1900 {
1901 if ( outLen > dstLen )
1902 return wxCONV_FAILED;
1903
1904 *dst++ = cc[0];
1905 if ( numChars == 2 )
1906 {
1907 // second character of a surrogate
1908 *dst++ = cc[1];
1909 }
1910 }
1911 }
1912
1913 return outLen;
1914 }
1915
1916 size_t
1917 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1918 const wchar_t *src, size_t srcLen) const
1919 {
1920 if ( srcLen == wxNO_LEN )
1921 srcLen = wxWcslen(src) + 1;
1922
1923 if ( !dst )
1924 {
1925 // optimization: return maximal space which could be needed for this
1926 // string instead of the exact amount which could be less if there are
1927 // any surrogates in the input
1928 //
1929 // we consider that surrogates are rare enough to make it worthwhile to
1930 // avoid running the loop below at the cost of slightly extra memory
1931 // consumption
1932 return srcLen*BYTES_PER_CHAR;
1933 }
1934
1935 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1936 size_t outLen = 0;
1937 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1938 {
1939 const wxUint32 ch = wxDecodeSurrogate(&src);
1940 if ( !src )
1941 return wxCONV_FAILED;
1942
1943 outLen += BYTES_PER_CHAR;
1944
1945 if ( outLen > dstLen )
1946 return wxCONV_FAILED;
1947
1948 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1949 }
1950
1951 return outLen;
1952 }
1953
1954 #else // !WC_UTF16: wchar_t is UTF-32
1955
1956 // ----------------------------------------------------------------------------
1957 // conversions without endianness change
1958 // ----------------------------------------------------------------------------
1959
1960 size_t
1961 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1962 const char *src, size_t srcLen) const
1963 {
1964 // use memcpy() as it should be much faster than hand-written loop
1965 srcLen = GetLength(src, srcLen);
1966 if ( srcLen == wxNO_LEN )
1967 return wxCONV_FAILED;
1968
1969 const size_t inLen = srcLen/BYTES_PER_CHAR;
1970 if ( dst )
1971 {
1972 if ( dstLen < inLen )
1973 return wxCONV_FAILED;
1974
1975 memcpy(dst, src, srcLen);
1976 }
1977
1978 return inLen;
1979 }
1980
1981 size_t
1982 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1983 const wchar_t *src, size_t srcLen) const
1984 {
1985 if ( srcLen == wxNO_LEN )
1986 srcLen = wxWcslen(src) + 1;
1987
1988 srcLen *= BYTES_PER_CHAR;
1989
1990 if ( dst )
1991 {
1992 if ( dstLen < srcLen )
1993 return wxCONV_FAILED;
1994
1995 memcpy(dst, src, srcLen);
1996 }
1997
1998 return srcLen;
1999 }
2000
2001 // ----------------------------------------------------------------------------
2002 // endian-reversing conversions
2003 // ----------------------------------------------------------------------------
2004
2005 size_t
2006 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2007 const char *src, size_t srcLen) const
2008 {
2009 srcLen = GetLength(src, srcLen);
2010 if ( srcLen == wxNO_LEN )
2011 return wxCONV_FAILED;
2012
2013 srcLen /= BYTES_PER_CHAR;
2014
2015 if ( dst )
2016 {
2017 if ( dstLen < srcLen )
2018 return wxCONV_FAILED;
2019
2020 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2021 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2022 {
2023 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2024 }
2025 }
2026
2027 return srcLen;
2028 }
2029
2030 size_t
2031 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2032 const wchar_t *src, size_t srcLen) const
2033 {
2034 if ( srcLen == wxNO_LEN )
2035 srcLen = wxWcslen(src) + 1;
2036
2037 srcLen *= BYTES_PER_CHAR;
2038
2039 if ( dst )
2040 {
2041 if ( dstLen < srcLen )
2042 return wxCONV_FAILED;
2043
2044 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2045 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2046 {
2047 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2048 }
2049 }
2050
2051 return srcLen;
2052 }
2053
2054 #endif // WC_UTF16/!WC_UTF16
2055
2056
2057 // ============================================================================
2058 // The classes doing conversion using the iconv_xxx() functions
2059 // ============================================================================
2060
2061 #ifdef HAVE_ICONV
2062
2063 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2064 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2065 // (unless there's yet another bug in glibc) the only case when iconv()
2066 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2067 // left in the input buffer -- when _real_ error occurs,
2068 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2069 // iconv() failure.
2070 // [This bug does not appear in glibc 2.2.]
2071 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2072 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2073 (errno != E2BIG || bufLeft != 0))
2074 #else
2075 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2076 #endif
2077
2078 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2079
2080 #define ICONV_T_INVALID ((iconv_t)-1)
2081
2082 #if SIZEOF_WCHAR_T == 4
2083 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2084 #define WC_ENC wxFONTENCODING_UTF32
2085 #elif SIZEOF_WCHAR_T == 2
2086 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2087 #define WC_ENC wxFONTENCODING_UTF16
2088 #else // sizeof(wchar_t) != 2 nor 4
2089 // does this ever happen?
2090 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2091 #endif
2092
2093 // ----------------------------------------------------------------------------
2094 // wxMBConv_iconv: encapsulates an iconv character set
2095 // ----------------------------------------------------------------------------
2096
2097 class wxMBConv_iconv : public wxMBConv
2098 {
2099 public:
2100 wxMBConv_iconv(const char *name);
2101 virtual ~wxMBConv_iconv();
2102
2103 // implement base class virtual methods
2104 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2105 const char *src, size_t srcLen = wxNO_LEN) const;
2106 virtual size_t FromWChar(char *dst, size_t dstLen,
2107 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2108 virtual size_t GetMBNulLen() const;
2109
2110 #if wxUSE_UNICODE_UTF8
2111 virtual bool IsUTF8() const;
2112 #endif
2113
2114 virtual wxMBConv *Clone() const
2115 {
2116 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2117 p->m_minMBCharWidth = m_minMBCharWidth;
2118 return p;
2119 }
2120
2121 bool IsOk() const
2122 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2123
2124 protected:
2125 // the iconv handlers used to translate from multibyte
2126 // to wide char and in the other direction
2127 iconv_t m2w,
2128 w2m;
2129
2130 #if wxUSE_THREADS
2131 // guards access to m2w and w2m objects
2132 wxMutex m_iconvMutex;
2133 #endif
2134
2135 private:
2136 // the name (for iconv_open()) of a wide char charset -- if none is
2137 // available on this machine, it will remain NULL
2138 static wxString ms_wcCharsetName;
2139
2140 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2141 // different endian-ness than the native one
2142 static bool ms_wcNeedsSwap;
2143
2144
2145 // name of the encoding handled by this conversion
2146 wxString m_name;
2147
2148 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2149 // initially
2150 size_t m_minMBCharWidth;
2151 };
2152
2153 // make the constructor available for unit testing
2154 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2155 {
2156 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2157 if ( !result->IsOk() )
2158 {
2159 delete result;
2160 return 0;
2161 }
2162
2163 return result;
2164 }
2165
2166 wxString wxMBConv_iconv::ms_wcCharsetName;
2167 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2168
2169 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2170 : m_name(name)
2171 {
2172 m_minMBCharWidth = 0;
2173
2174 // check for charset that represents wchar_t:
2175 if ( ms_wcCharsetName.empty() )
2176 {
2177 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2178
2179 #if wxUSE_FONTMAP
2180 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2181 #else // !wxUSE_FONTMAP
2182 static const wxChar *const names_static[] =
2183 {
2184 #if SIZEOF_WCHAR_T == 4
2185 wxT("UCS-4"),
2186 #elif SIZEOF_WCHAR_T = 2
2187 wxT("UCS-2"),
2188 #endif
2189 NULL
2190 };
2191 const wxChar *const *names = names_static;
2192 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2193
2194 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2195 {
2196 const wxString nameCS(*names);
2197
2198 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2199 wxString nameXE(nameCS);
2200
2201 #ifdef WORDS_BIGENDIAN
2202 nameXE += wxT("BE");
2203 #else // little endian
2204 nameXE += wxT("LE");
2205 #endif
2206
2207 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2208 nameXE.c_str());
2209
2210 m2w = iconv_open(nameXE.ToAscii(), name);
2211 if ( m2w == ICONV_T_INVALID )
2212 {
2213 // try charset w/o bytesex info (e.g. "UCS4")
2214 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2215 nameCS.c_str());
2216 m2w = iconv_open(nameCS.ToAscii(), name);
2217
2218 // and check for bytesex ourselves:
2219 if ( m2w != ICONV_T_INVALID )
2220 {
2221 char buf[2], *bufPtr;
2222 wchar_t wbuf[2];
2223 size_t insz, outsz;
2224 size_t res;
2225
2226 buf[0] = 'A';
2227 buf[1] = 0;
2228 wbuf[0] = 0;
2229 insz = 2;
2230 outsz = SIZEOF_WCHAR_T * 2;
2231 char* wbufPtr = (char*)wbuf;
2232 bufPtr = buf;
2233
2234 res = iconv(
2235 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2236 &wbufPtr, &outsz);
2237
2238 if (ICONV_FAILED(res, insz))
2239 {
2240 wxLogLastError(wxT("iconv"));
2241 wxLogError(_("Conversion to charset '%s' doesn't work."),
2242 nameCS.c_str());
2243 }
2244 else // ok, can convert to this encoding, remember it
2245 {
2246 ms_wcCharsetName = nameCS;
2247 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2248 }
2249 }
2250 }
2251 else // use charset not requiring byte swapping
2252 {
2253 ms_wcCharsetName = nameXE;
2254 }
2255 }
2256
2257 wxLogTrace(TRACE_STRCONV,
2258 wxT("iconv wchar_t charset is \"%s\"%s"),
2259 ms_wcCharsetName.empty() ? wxString("<none>")
2260 : ms_wcCharsetName,
2261 ms_wcNeedsSwap ? wxT(" (needs swap)")
2262 : wxT(""));
2263 }
2264 else // we already have ms_wcCharsetName
2265 {
2266 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2267 }
2268
2269 if ( ms_wcCharsetName.empty() )
2270 {
2271 w2m = ICONV_T_INVALID;
2272 }
2273 else
2274 {
2275 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2276 if ( w2m == ICONV_T_INVALID )
2277 {
2278 wxLogTrace(TRACE_STRCONV,
2279 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2280 ms_wcCharsetName.c_str(), name);
2281 }
2282 }
2283 }
2284
2285 wxMBConv_iconv::~wxMBConv_iconv()
2286 {
2287 if ( m2w != ICONV_T_INVALID )
2288 iconv_close(m2w);
2289 if ( w2m != ICONV_T_INVALID )
2290 iconv_close(w2m);
2291 }
2292
2293 size_t
2294 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2295 const char *src, size_t srcLen) const
2296 {
2297 if ( srcLen == wxNO_LEN )
2298 {
2299 // find the string length: notice that must be done differently for
2300 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2301 // consecutive NULs
2302 const size_t nulLen = GetMBNulLen();
2303 switch ( nulLen )
2304 {
2305 default:
2306 return wxCONV_FAILED;
2307
2308 case 1:
2309 srcLen = strlen(src); // arguably more optimized than our version
2310 break;
2311
2312 case 2:
2313 case 4:
2314 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2315 // but they also have to start at character boundary and not
2316 // span two adjacent characters
2317 const char *p;
2318 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2319 ;
2320 srcLen = p - src;
2321 break;
2322 }
2323
2324 // when we're determining the length of the string ourselves we count
2325 // the terminating NUL(s) as part of it and always NUL-terminate the
2326 // output
2327 srcLen += nulLen;
2328 }
2329
2330 // we express length in the number of (wide) characters but iconv always
2331 // counts buffer sizes it in bytes
2332 dstLen *= SIZEOF_WCHAR_T;
2333
2334 #if wxUSE_THREADS
2335 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2336 // Unfortunately there are a couple of global wxCSConv objects such as
2337 // wxConvLocal that are used all over wx code, so we have to make sure
2338 // the handle is used by at most one thread at the time. Otherwise
2339 // only a few wx classes would be safe to use from non-main threads
2340 // as MB<->WC conversion would fail "randomly".
2341 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2342 #endif // wxUSE_THREADS
2343
2344 size_t res, cres;
2345 const char *pszPtr = src;
2346
2347 if ( dst )
2348 {
2349 char* bufPtr = (char*)dst;
2350
2351 // have destination buffer, convert there
2352 size_t dstLenOrig = dstLen;
2353 cres = iconv(m2w,
2354 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2355 &bufPtr, &dstLen);
2356
2357 // convert the number of bytes converted as returned by iconv to the
2358 // number of (wide) characters converted that we need
2359 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2360
2361 if (ms_wcNeedsSwap)
2362 {
2363 // convert to native endianness
2364 for ( unsigned i = 0; i < res; i++ )
2365 dst[i] = WC_BSWAP(dst[i]);
2366 }
2367 }
2368 else // no destination buffer
2369 {
2370 // convert using temp buffer to calculate the size of the buffer needed
2371 wchar_t tbuf[256];
2372 res = 0;
2373
2374 do
2375 {
2376 char* bufPtr = (char*)tbuf;
2377 dstLen = 8 * SIZEOF_WCHAR_T;
2378
2379 cres = iconv(m2w,
2380 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2381 &bufPtr, &dstLen );
2382
2383 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2384 }
2385 while ((cres == (size_t)-1) && (errno == E2BIG));
2386 }
2387
2388 if (ICONV_FAILED(cres, srcLen))
2389 {
2390 //VS: it is ok if iconv fails, hence trace only
2391 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2392 return wxCONV_FAILED;
2393 }
2394
2395 return res;
2396 }
2397
2398 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2399 const wchar_t *src, size_t srcLen) const
2400 {
2401 #if wxUSE_THREADS
2402 // NB: explained in MB2WC
2403 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2404 #endif
2405
2406 if ( srcLen == wxNO_LEN )
2407 srcLen = wxWcslen(src) + 1;
2408
2409 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2410 size_t outbuflen = dstLen;
2411 size_t res, cres;
2412
2413 wchar_t *tmpbuf = 0;
2414
2415 if (ms_wcNeedsSwap)
2416 {
2417 // need to copy to temp buffer to switch endianness
2418 // (doing WC_BSWAP twice on the original buffer won't work, as it
2419 // could be in read-only memory, or be accessed in some other thread)
2420 tmpbuf = (wchar_t *)malloc(inbuflen);
2421 for ( size_t i = 0; i < srcLen; i++ )
2422 tmpbuf[i] = WC_BSWAP(src[i]);
2423
2424 src = tmpbuf;
2425 }
2426
2427 char* inbuf = (char*)src;
2428 if ( dst )
2429 {
2430 // have destination buffer, convert there
2431 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2432
2433 res = dstLen - outbuflen;
2434 }
2435 else // no destination buffer
2436 {
2437 // convert using temp buffer to calculate the size of the buffer needed
2438 char tbuf[256];
2439 res = 0;
2440 do
2441 {
2442 dst = tbuf;
2443 outbuflen = WXSIZEOF(tbuf);
2444
2445 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2446
2447 res += WXSIZEOF(tbuf) - outbuflen;
2448 }
2449 while ((cres == (size_t)-1) && (errno == E2BIG));
2450 }
2451
2452 if (ms_wcNeedsSwap)
2453 {
2454 free(tmpbuf);
2455 }
2456
2457 if (ICONV_FAILED(cres, inbuflen))
2458 {
2459 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2460 return wxCONV_FAILED;
2461 }
2462
2463 return res;
2464 }
2465
2466 size_t wxMBConv_iconv::GetMBNulLen() const
2467 {
2468 if ( m_minMBCharWidth == 0 )
2469 {
2470 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2471
2472 #if wxUSE_THREADS
2473 // NB: explained in MB2WC
2474 wxMutexLocker lock(self->m_iconvMutex);
2475 #endif
2476
2477 const wchar_t *wnul = L"";
2478 char buf[8]; // should be enough for NUL in any encoding
2479 size_t inLen = sizeof(wchar_t),
2480 outLen = WXSIZEOF(buf);
2481 char *inBuff = (char *)wnul;
2482 char *outBuff = buf;
2483 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2484 {
2485 self->m_minMBCharWidth = (size_t)-1;
2486 }
2487 else // ok
2488 {
2489 self->m_minMBCharWidth = outBuff - buf;
2490 }
2491 }
2492
2493 return m_minMBCharWidth;
2494 }
2495
2496 #if wxUSE_UNICODE_UTF8
2497 bool wxMBConv_iconv::IsUTF8() const
2498 {
2499 return wxStricmp(m_name, "UTF-8") == 0 ||
2500 wxStricmp(m_name, "UTF8") == 0;
2501 }
2502 #endif
2503
2504 #endif // HAVE_ICONV
2505
2506
2507 // ============================================================================
2508 // Win32 conversion classes
2509 // ============================================================================
2510
2511 #ifdef wxHAVE_WIN32_MB2WC
2512
2513 // from utils.cpp
2514 #if wxUSE_FONTMAP
2515 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2516 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2517 #endif
2518
2519 class wxMBConv_win32 : public wxMBConv
2520 {
2521 public:
2522 wxMBConv_win32()
2523 {
2524 m_CodePage = CP_ACP;
2525 m_minMBCharWidth = 0;
2526 }
2527
2528 wxMBConv_win32(const wxMBConv_win32& conv)
2529 : wxMBConv()
2530 {
2531 m_CodePage = conv.m_CodePage;
2532 m_minMBCharWidth = conv.m_minMBCharWidth;
2533 }
2534
2535 #if wxUSE_FONTMAP
2536 wxMBConv_win32(const char* name)
2537 {
2538 m_CodePage = wxCharsetToCodepage(name);
2539 m_minMBCharWidth = 0;
2540 }
2541
2542 wxMBConv_win32(wxFontEncoding encoding)
2543 {
2544 m_CodePage = wxEncodingToCodepage(encoding);
2545 m_minMBCharWidth = 0;
2546 }
2547 #endif // wxUSE_FONTMAP
2548
2549 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2550 {
2551 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2552 // the behaviour is not compatible with the Unix version (using iconv)
2553 // and break the library itself, e.g. wxTextInputStream::NextChar()
2554 // wouldn't work if reading an incomplete MB char didn't result in an
2555 // error
2556 //
2557 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2558 // Win XP or newer and it is not supported for UTF-[78] so we always
2559 // use our own conversions in this case. See
2560 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2561 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2562 if ( m_CodePage == CP_UTF8 )
2563 {
2564 return wxMBConvUTF8().MB2WC(buf, psz, n);
2565 }
2566
2567 if ( m_CodePage == CP_UTF7 )
2568 {
2569 return wxMBConvUTF7().MB2WC(buf, psz, n);
2570 }
2571
2572 int flags = 0;
2573 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2574 IsAtLeastWin2kSP4() )
2575 {
2576 flags = MB_ERR_INVALID_CHARS;
2577 }
2578
2579 const size_t len = ::MultiByteToWideChar
2580 (
2581 m_CodePage, // code page
2582 flags, // flags: fall on error
2583 psz, // input string
2584 -1, // its length (NUL-terminated)
2585 buf, // output string
2586 buf ? n : 0 // size of output buffer
2587 );
2588 if ( !len )
2589 {
2590 // function totally failed
2591 return wxCONV_FAILED;
2592 }
2593
2594 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2595 // check if we succeeded, by doing a double trip:
2596 if ( !flags && buf )
2597 {
2598 const size_t mbLen = strlen(psz);
2599 wxCharBuffer mbBuf(mbLen);
2600 if ( ::WideCharToMultiByte
2601 (
2602 m_CodePage,
2603 0,
2604 buf,
2605 -1,
2606 mbBuf.data(),
2607 mbLen + 1, // size in bytes, not length
2608 NULL,
2609 NULL
2610 ) == 0 ||
2611 strcmp(mbBuf, psz) != 0 )
2612 {
2613 // we didn't obtain the same thing we started from, hence
2614 // the conversion was lossy and we consider that it failed
2615 return wxCONV_FAILED;
2616 }
2617 }
2618
2619 // note that it returns count of written chars for buf != NULL and size
2620 // of the needed buffer for buf == NULL so in either case the length of
2621 // the string (which never includes the terminating NUL) is one less
2622 return len - 1;
2623 }
2624
2625 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2626 {
2627 /*
2628 we have a problem here: by default, WideCharToMultiByte() may
2629 replace characters unrepresentable in the target code page with bad
2630 quality approximations such as turning "1/2" symbol (U+00BD) into
2631 "1" for the code pages which don't have it and we, obviously, want
2632 to avoid this at any price
2633
2634 the trouble is that this function does it _silently_, i.e. it won't
2635 even tell us whether it did or not... Win98/2000 and higher provide
2636 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2637 we have to resort to a round trip, i.e. check that converting back
2638 results in the same string -- this is, of course, expensive but
2639 otherwise we simply can't be sure to not garble the data.
2640 */
2641
2642 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2643 // it doesn't work with CJK encodings (which we test for rather roughly
2644 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2645 // supporting it
2646 BOOL usedDef wxDUMMY_INITIALIZE(false);
2647 BOOL *pUsedDef;
2648 int flags;
2649 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2650 {
2651 // it's our lucky day
2652 flags = WC_NO_BEST_FIT_CHARS;
2653 pUsedDef = &usedDef;
2654 }
2655 else // old system or unsupported encoding
2656 {
2657 flags = 0;
2658 pUsedDef = NULL;
2659 }
2660
2661 const size_t len = ::WideCharToMultiByte
2662 (
2663 m_CodePage, // code page
2664 flags, // either none or no best fit
2665 pwz, // input string
2666 -1, // it is (wide) NUL-terminated
2667 buf, // output buffer
2668 buf ? n : 0, // and its size
2669 NULL, // default "replacement" char
2670 pUsedDef // [out] was it used?
2671 );
2672
2673 if ( !len )
2674 {
2675 // function totally failed
2676 return wxCONV_FAILED;
2677 }
2678
2679 // we did something, check if we really succeeded
2680 if ( flags )
2681 {
2682 // check if the conversion failed, i.e. if any replacements
2683 // were done
2684 if ( usedDef )
2685 return wxCONV_FAILED;
2686 }
2687 else // we must resort to double tripping...
2688 {
2689 // first we need to ensure that we really have the MB data: this is
2690 // not the case if we're called with NULL buffer, in which case we
2691 // need to do the conversion yet again
2692 wxCharBuffer bufDef;
2693 if ( !buf )
2694 {
2695 bufDef = wxCharBuffer(len);
2696 buf = bufDef.data();
2697 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2698 buf, len, NULL, NULL) )
2699 return wxCONV_FAILED;
2700 }
2701
2702 if ( !n )
2703 n = wcslen(pwz);
2704 wxWCharBuffer wcBuf(n);
2705 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2706 wcscmp(wcBuf, pwz) != 0 )
2707 {
2708 // we didn't obtain the same thing we started from, hence
2709 // the conversion was lossy and we consider that it failed
2710 return wxCONV_FAILED;
2711 }
2712 }
2713
2714 // see the comment above for the reason of "len - 1"
2715 return len - 1;
2716 }
2717
2718 virtual size_t GetMBNulLen() const
2719 {
2720 if ( m_minMBCharWidth == 0 )
2721 {
2722 int len = ::WideCharToMultiByte
2723 (
2724 m_CodePage, // code page
2725 0, // no flags
2726 L"", // input string
2727 1, // translate just the NUL
2728 NULL, // output buffer
2729 0, // and its size
2730 NULL, // no replacement char
2731 NULL // [out] don't care if it was used
2732 );
2733
2734 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2735 switch ( len )
2736 {
2737 default:
2738 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2739 self->m_minMBCharWidth = (size_t)-1;
2740 break;
2741
2742 case 0:
2743 self->m_minMBCharWidth = (size_t)-1;
2744 break;
2745
2746 case 1:
2747 case 2:
2748 case 4:
2749 self->m_minMBCharWidth = len;
2750 break;
2751 }
2752 }
2753
2754 return m_minMBCharWidth;
2755 }
2756
2757 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2758
2759 bool IsOk() const { return m_CodePage != -1; }
2760
2761 private:
2762 static bool CanUseNoBestFit()
2763 {
2764 static int s_isWin98Or2k = -1;
2765
2766 if ( s_isWin98Or2k == -1 )
2767 {
2768 int verMaj, verMin;
2769 switch ( wxGetOsVersion(&verMaj, &verMin) )
2770 {
2771 case wxOS_WINDOWS_9X:
2772 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2773 break;
2774
2775 case wxOS_WINDOWS_NT:
2776 s_isWin98Or2k = verMaj >= 5;
2777 break;
2778
2779 default:
2780 // unknown: be conservative by default
2781 s_isWin98Or2k = 0;
2782 break;
2783 }
2784
2785 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2786 }
2787
2788 return s_isWin98Or2k == 1;
2789 }
2790
2791 static bool IsAtLeastWin2kSP4()
2792 {
2793 #ifdef __WXWINCE__
2794 return false;
2795 #else
2796 static int s_isAtLeastWin2kSP4 = -1;
2797
2798 if ( s_isAtLeastWin2kSP4 == -1 )
2799 {
2800 OSVERSIONINFOEX ver;
2801
2802 memset(&ver, 0, sizeof(ver));
2803 ver.dwOSVersionInfoSize = sizeof(ver);
2804 GetVersionEx((OSVERSIONINFO*)&ver);
2805
2806 s_isAtLeastWin2kSP4 =
2807 ((ver.dwMajorVersion > 5) || // Vista+
2808 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2809 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2810 ver.wServicePackMajor >= 4)) // 2000 SP4+
2811 ? 1 : 0;
2812 }
2813
2814 return s_isAtLeastWin2kSP4 == 1;
2815 #endif
2816 }
2817
2818
2819 // the code page we're working with
2820 long m_CodePage;
2821
2822 // cached result of GetMBNulLen(), set to 0 initially meaning
2823 // "unknown"
2824 size_t m_minMBCharWidth;
2825 };
2826
2827 #endif // wxHAVE_WIN32_MB2WC
2828
2829
2830 // ============================================================================
2831 // wxEncodingConverter based conversion classes
2832 // ============================================================================
2833
2834 #if wxUSE_FONTMAP
2835
2836 class wxMBConv_wxwin : public wxMBConv
2837 {
2838 private:
2839 void Init()
2840 {
2841 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2842 // The wxMBConv_cf class does a better job.
2843 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2844 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2845 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2846 }
2847
2848 public:
2849 // temporarily just use wxEncodingConverter stuff,
2850 // so that it works while a better implementation is built
2851 wxMBConv_wxwin(const char* name)
2852 {
2853 if (name)
2854 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2855 else
2856 m_enc = wxFONTENCODING_SYSTEM;
2857
2858 Init();
2859 }
2860
2861 wxMBConv_wxwin(wxFontEncoding enc)
2862 {
2863 m_enc = enc;
2864
2865 Init();
2866 }
2867
2868 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2869 {
2870 size_t inbuf = strlen(psz);
2871 if (buf)
2872 {
2873 if (!m2w.Convert(psz, buf))
2874 return wxCONV_FAILED;
2875 }
2876 return inbuf;
2877 }
2878
2879 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2880 {
2881 const size_t inbuf = wxWcslen(psz);
2882 if (buf)
2883 {
2884 if (!w2m.Convert(psz, buf))
2885 return wxCONV_FAILED;
2886 }
2887
2888 return inbuf;
2889 }
2890
2891 virtual size_t GetMBNulLen() const
2892 {
2893 switch ( m_enc )
2894 {
2895 case wxFONTENCODING_UTF16BE:
2896 case wxFONTENCODING_UTF16LE:
2897 return 2;
2898
2899 case wxFONTENCODING_UTF32BE:
2900 case wxFONTENCODING_UTF32LE:
2901 return 4;
2902
2903 default:
2904 return 1;
2905 }
2906 }
2907
2908 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2909
2910 bool IsOk() const { return m_ok; }
2911
2912 public:
2913 wxFontEncoding m_enc;
2914 wxEncodingConverter m2w, w2m;
2915
2916 private:
2917 // were we initialized successfully?
2918 bool m_ok;
2919
2920 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2921 };
2922
2923 // make the constructors available for unit testing
2924 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2925 {
2926 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2927 if ( !result->IsOk() )
2928 {
2929 delete result;
2930 return 0;
2931 }
2932
2933 return result;
2934 }
2935
2936 #endif // wxUSE_FONTMAP
2937
2938 // ============================================================================
2939 // wxCSConv implementation
2940 // ============================================================================
2941
2942 void wxCSConv::Init()
2943 {
2944 m_name = NULL;
2945 m_convReal = NULL;
2946 m_deferred = true;
2947 }
2948
2949 wxCSConv::wxCSConv(const wxString& charset)
2950 {
2951 Init();
2952
2953 if ( !charset.empty() )
2954 {
2955 SetName(charset.ToAscii());
2956 }
2957
2958 #if wxUSE_FONTMAP
2959 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2960 if ( m_encoding == wxFONTENCODING_MAX )
2961 {
2962 // set to unknown/invalid value
2963 m_encoding = wxFONTENCODING_SYSTEM;
2964 }
2965 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2966 {
2967 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2968 m_encoding = wxFONTENCODING_ISO8859_1;
2969 }
2970 #else
2971 m_encoding = wxFONTENCODING_SYSTEM;
2972 #endif
2973 }
2974
2975 wxCSConv::wxCSConv(wxFontEncoding encoding)
2976 {
2977 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2978 {
2979 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2980
2981 encoding = wxFONTENCODING_SYSTEM;
2982 }
2983
2984 Init();
2985
2986 m_encoding = encoding;
2987 }
2988
2989 wxCSConv::~wxCSConv()
2990 {
2991 Clear();
2992 }
2993
2994 wxCSConv::wxCSConv(const wxCSConv& conv)
2995 : wxMBConv()
2996 {
2997 Init();
2998
2999 SetName(conv.m_name);
3000 m_encoding = conv.m_encoding;
3001 }
3002
3003 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3004 {
3005 Clear();
3006
3007 SetName(conv.m_name);
3008 m_encoding = conv.m_encoding;
3009
3010 return *this;
3011 }
3012
3013 void wxCSConv::Clear()
3014 {
3015 free(m_name);
3016 wxDELETE(m_convReal);
3017
3018 m_name = NULL;
3019 }
3020
3021 void wxCSConv::SetName(const char *charset)
3022 {
3023 if (charset)
3024 {
3025 m_name = wxStrdup(charset);
3026 m_deferred = true;
3027 }
3028 }
3029
3030 #if wxUSE_FONTMAP
3031
// hash map type associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): it maps the encoding to the charset
// name which iconv accepted for it or to an empty string if none of the known
// names worked
static wxEncodingNameCache gs_nameCache;
3036 #endif
3037
3038 wxMBConv *wxCSConv::DoCreate() const
3039 {
3040 #if wxUSE_FONTMAP
3041 wxLogTrace(TRACE_STRCONV,
3042 wxT("creating conversion for %s"),
3043 (m_name ? m_name
3044 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3045 #endif // wxUSE_FONTMAP
3046
3047 // check for the special case of ASCII or ISO8859-1 charset: as we have
3048 // special knowledge of it anyhow, we don't need to create a special
3049 // conversion object
3050 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3051 m_encoding == wxFONTENCODING_DEFAULT )
3052 {
3053 // don't convert at all
3054 return NULL;
3055 }
3056
3057 // we trust OS to do conversion better than we can so try external
3058 // conversion methods first
3059 //
3060 // the full order is:
3061 // 1. OS conversion (iconv() under Unix or Win32 API)
3062 // 2. hard coded conversions for UTF
3063 // 3. wxEncodingConverter as fall back
3064
3065 // step (1)
3066 #ifdef HAVE_ICONV
3067 #if !wxUSE_FONTMAP
3068 if ( m_name )
3069 #endif // !wxUSE_FONTMAP
3070 {
3071 #if wxUSE_FONTMAP
3072 wxFontEncoding encoding(m_encoding);
3073 #endif
3074
3075 if ( m_name )
3076 {
3077 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3078 if ( conv->IsOk() )
3079 return conv;
3080
3081 delete conv;
3082
3083 #if wxUSE_FONTMAP
3084 encoding =
3085 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3086 #endif // wxUSE_FONTMAP
3087 }
3088 #if wxUSE_FONTMAP
3089 {
3090 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3091 if ( it != gs_nameCache.end() )
3092 {
3093 if ( it->second.empty() )
3094 return NULL;
3095
3096 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3097 if ( conv->IsOk() )
3098 return conv;
3099
3100 delete conv;
3101 }
3102
3103 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3104 // CS : in case this does not return valid names (eg for MacRoman)
3105 // encoding got a 'failure' entry in the cache all the same,
3106 // although it just has to be created using a different method, so
3107 // only store failed iconv creation attempts (or perhaps we
3108 // shoulnd't do this at all ?)
3109 if ( names[0] != NULL )
3110 {
3111 for ( ; *names; ++names )
3112 {
3113 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3114 // will need changes that will obsolete this
3115 wxString name(*names);
3116 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3117 if ( conv->IsOk() )
3118 {
3119 gs_nameCache[encoding] = *names;
3120 return conv;
3121 }
3122
3123 delete conv;
3124 }
3125
3126 gs_nameCache[encoding] = wxT(""); // cache the failure
3127 }
3128 }
3129 #endif // wxUSE_FONTMAP
3130 }
3131 #endif // HAVE_ICONV
3132
3133 #ifdef wxHAVE_WIN32_MB2WC
3134 {
3135 #if wxUSE_FONTMAP
3136 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3137 : new wxMBConv_win32(m_encoding);
3138 if ( conv->IsOk() )
3139 return conv;
3140
3141 delete conv;
3142 #else
3143 return NULL;
3144 #endif
3145 }
3146 #endif // wxHAVE_WIN32_MB2WC
3147
3148 #ifdef __DARWIN__
3149 {
3150 // leave UTF16 and UTF32 to the built-ins of wx
3151 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3152 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3153 {
3154 #if wxUSE_FONTMAP
3155 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3156 : new wxMBConv_cf(m_encoding);
3157 #else
3158 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3159 #endif
3160
3161 if ( conv->IsOk() )
3162 return conv;
3163
3164 delete conv;
3165 }
3166 }
3167 #endif // __DARWIN__
3168
3169 // step (2)
3170 wxFontEncoding enc = m_encoding;
3171 #if wxUSE_FONTMAP
3172 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3173 {
3174 // use "false" to suppress interactive dialogs -- we can be called from
3175 // anywhere and popping up a dialog from here is the last thing we want to
3176 // do
3177 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3178 }
3179 #endif // wxUSE_FONTMAP
3180
3181 switch ( enc )
3182 {
3183 case wxFONTENCODING_UTF7:
3184 return new wxMBConvUTF7;
3185
3186 case wxFONTENCODING_UTF8:
3187 return new wxMBConvUTF8;
3188
3189 case wxFONTENCODING_UTF16BE:
3190 return new wxMBConvUTF16BE;
3191
3192 case wxFONTENCODING_UTF16LE:
3193 return new wxMBConvUTF16LE;
3194
3195 case wxFONTENCODING_UTF32BE:
3196 return new wxMBConvUTF32BE;
3197
3198 case wxFONTENCODING_UTF32LE:
3199 return new wxMBConvUTF32LE;
3200
3201 default:
3202 // nothing to do but put here to suppress gcc warnings
3203 break;
3204 }
3205
3206 // step (3)
3207 #if wxUSE_FONTMAP
3208 {
3209 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3210 : new wxMBConv_wxwin(m_encoding);
3211 if ( conv->IsOk() )
3212 return conv;
3213
3214 delete conv;
3215 }
3216
3217 wxLogTrace(TRACE_STRCONV,
3218 wxT("encoding \"%s\" is not supported by this system"),
3219 (m_name ? wxString(m_name)
3220 : wxFontMapperBase::GetEncodingName(m_encoding)));
3221 #endif // wxUSE_FONTMAP
3222
3223 return NULL;
3224 }
3225
3226 void wxCSConv::CreateConvIfNeeded() const
3227 {
3228 if ( m_deferred )
3229 {
3230 wxCSConv *self = (wxCSConv *)this; // const_cast
3231
3232 // if we don't have neither the name nor the encoding, use the default
3233 // encoding for this system
3234 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3235 {
3236 #if wxUSE_INTL
3237 self->m_encoding = wxLocale::GetSystemEncoding();
3238 #else
3239 // fallback to some reasonable default:
3240 self->m_encoding = wxFONTENCODING_ISO8859_1;
3241 #endif // wxUSE_INTL
3242 }
3243
3244 self->m_convReal = DoCreate();
3245 self->m_deferred = false;
3246 }
3247 }
3248
3249 bool wxCSConv::IsOk() const
3250 {
3251 CreateConvIfNeeded();
3252
3253 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3254 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3255 return true; // always ok as we do it ourselves
3256
3257 // m_convReal->IsOk() is called at its own creation, so we know it must
3258 // be ok if m_convReal is non-NULL
3259 return m_convReal != NULL;
3260 }
3261
3262 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3263 const char *src, size_t srcLen) const
3264 {
3265 CreateConvIfNeeded();
3266
3267 if (m_convReal)
3268 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3269
3270 // latin-1 (direct)
3271 if ( srcLen == wxNO_LEN )
3272 srcLen = strlen(src) + 1; // take trailing NUL too
3273
3274 if ( dst )
3275 {
3276 if ( dstLen < srcLen )
3277 return wxCONV_FAILED;
3278
3279 for ( size_t n = 0; n < srcLen; n++ )
3280 dst[n] = (unsigned char)(src[n]);
3281 }
3282
3283 return srcLen;
3284 }
3285
3286 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3287 const wchar_t *src, size_t srcLen) const
3288 {
3289 CreateConvIfNeeded();
3290
3291 if (m_convReal)
3292 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3293
3294 // latin-1 (direct)
3295 if ( srcLen == wxNO_LEN )
3296 srcLen = wxWcslen(src) + 1;
3297
3298 if ( dst )
3299 {
3300 if ( dstLen < srcLen )
3301 return wxCONV_FAILED;
3302
3303 for ( size_t n = 0; n < srcLen; n++ )
3304 {
3305 if ( src[n] > 0xFF )
3306 return wxCONV_FAILED;
3307
3308 dst[n] = (char)src[n];
3309 }
3310
3311 }
3312 else // still need to check the input validity
3313 {
3314 for ( size_t n = 0; n < srcLen; n++ )
3315 {
3316 if ( src[n] > 0xFF )
3317 return wxCONV_FAILED;
3318 }
3319 }
3320
3321 return srcLen;
3322 }
3323
3324 size_t wxCSConv::GetMBNulLen() const
3325 {
3326 CreateConvIfNeeded();
3327
3328 if ( m_convReal )
3329 {
3330 return m_convReal->GetMBNulLen();
3331 }
3332
3333 // otherwise, we are ISO-8859-1
3334 return 1;
3335 }
3336
3337 #if wxUSE_UNICODE_UTF8
3338 bool wxCSConv::IsUTF8() const
3339 {
3340 CreateConvIfNeeded();
3341
3342 if ( m_convReal )
3343 {
3344 return m_convReal->IsUTF8();
3345 }
3346
3347 // otherwise, we are ISO-8859-1
3348 return false;
3349 }
3350 #endif
3351
3352
3353 #if wxUSE_UNICODE
3354
3355 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3356 {
3357 if ( !s )
3358 return wxWCharBuffer();
3359
3360 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3361 if ( !wbuf )
3362 wbuf = wxMBConvUTF8().cMB2WX(s);
3363 if ( !wbuf )
3364 wbuf = wxConvISO8859_1.cMB2WX(s);
3365
3366 return wbuf;
3367 }
3368
3369 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3370 {
3371 if ( !ws )
3372 return wxCharBuffer();
3373
3374 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3375 if ( !buf )
3376 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3377
3378 return buf;
3379 }
3380
3381 #endif // wxUSE_UNICODE
3382
3383 // ----------------------------------------------------------------------------
3384 // globals
3385 // ----------------------------------------------------------------------------
3386
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object), in other
//     words, possibly _before_ the converter global object would be
//     initialized.
3393
3394 #undef wxConvLibc
3395 #undef wxConvUTF8
3396 #undef wxConvUTF7
3397 #undef wxConvLocal
3398 #undef wxConvISO8859_1
3399
// Define a global converter called "name":
//
//  - name##Ptr is an exported pointer of type klass*, initially NULL
//  - wxGet_##name##Ptr() is an exported function returning the address of a
//    function-local static object of type impl_klass which is constructed
//    with ctor_args the first time the function is called
//  - gs_##name##instance is a file-static variable initialized by calling
//    this function
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the class of the
// object is the same as the type of the pointer to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3414
3415 #ifdef __INTELC__
3416 // disable warning "variable 'xxx' was declared but never referenced"
3417 #pragma warning(disable: 177)
3418 #endif // Intel C++
3419
// wxConvLibc is implemented by wxMBConv_win32 using its default ctor under
// Windows and by wxMBConvLibc elsewhere (the wxMBConv_cf branch is disabled)
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// these two are wxCSConv objects for the system encoding and for ISO8859-1
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to wxConvLibc and wxConvUI to wxConvLocal
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3442
#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// the converter used for the file names: the CoreFoundation-based UTF-8
// converter defined above under Darwin and wxConvLibc everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__