Fix return value of wxMBConvUTF8::ToWChar() when not using MAP_INVALID_UTF8_NOT.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #ifndef __WXWINCE__
32 #include <errno.h>
33 #endif
34
35 #include <ctype.h>
36 #include <string.h>
37 #include <stdlib.h>
38
39 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #define wxHAVE_WIN32_MB2WC
43 #endif
44
45 #ifdef HAVE_ICONV
46 #include <iconv.h>
47 #include "wx/thread.h"
48 #endif
49
50 #include "wx/encconv.h"
51 #include "wx/fontmap.h"
52
53 #ifdef __DARWIN__
54 #include "wx/osx/core/private/strconv_cf.h"
55 #endif //def __DARWIN__
56
57
58 #define TRACE_STRCONV wxT("strconv")
59
60 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61 // be 4 bytes
62 #if SIZEOF_WCHAR_T == 2
63 #define WC_UTF16
64 #endif
65
66
67 // ============================================================================
68 // implementation
69 // ============================================================================
70
// helper used by wxMBConv::ToWChar(): returns true if at least one of the n
// bytes starting at p is not NUL and false if all of them are NULs (which is
// trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; ++i )
    {
        // stop at the first non-NUL byte, the bytes after it are not examined
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
79
80 // ----------------------------------------------------------------------------
81 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
82 // ----------------------------------------------------------------------------
83
84 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
85 {
86 if (input <= 0xffff)
87 {
88 if (output)
89 *output = (wxUint16) input;
90
91 return 1;
92 }
93 else if (input >= 0x110000)
94 {
95 return wxCONV_FAILED;
96 }
97 else
98 {
99 if (output)
100 {
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
103 }
104
105 return 2;
106 }
107 }
108
109 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
110 {
111 if ((*input < 0xd800) || (*input > 0xdfff))
112 {
113 output = *input;
114 return 1;
115 }
116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
117 {
118 output = *input;
119 return wxCONV_FAILED;
120 }
121 else
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
126 }
127
128 #ifdef WC_UTF16
129 typedef wchar_t wxDecodeSurrogate_t;
130 #else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132 #endif // WC_UTF16/!WC_UTF16
133
134 // returns the next UTF-32 character from the wchar_t buffer and advances the
135 // pointer to the character after this one
136 //
137 // if an invalid character is found, *pSrc is set to NULL, the caller must
138 // check for this
139 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
140 {
141 wxUint32 out;
142 const size_t
143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150 }
151
152 // ----------------------------------------------------------------------------
153 // wxMBConv
154 // ----------------------------------------------------------------------------
155
// Default implementation of the multibyte to wide character conversion in
// terms of the legacy MB2WC().
//
// Parameters:
//  - dst: the output buffer or NULL if only the required size is wanted
//  - dstLen: the size of dst in wide characters, not checked if dst is NULL
//  - src: the input, either NUL-terminated or srcLen bytes long
//  - srcLen: the input length in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters written to dst (or which would have
// been written if it were not NULL) or wxCONV_FAILED if GetMBNulLen() or
// MB2WC() fails or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            // from now on we read from our NUL-terminated copy
            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        // NULL srcEnd is used below as the "input is NUL-terminated" flag
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complications come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways when it's
    // called with a fixed number of characters and when it's called for the
    // entire NUL-terminated string: in the former case (srcEnd != NULL) we
    // must count all characters we convert, NUL or not; but in the latter we
    // do not count the trailing NUL -- but still count all the NULs inside the
    // string
    //
    // so for the (simple) former case we just always count the trailing NUL,
    // but for the latter we need to wait until we see if there is going to be
    // another loop iteration and only count it then
    //
    // NOTE(review): the code below increments dstWritten for the terminator
    // when srcEnd is NULL and counts embedded terminators separately when it
    // is not, which doesn't seem to match the wording above -- confirm which
    // of the two is intended before relying on this comment
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow (and we don't count the trailing NUL in this case)
            break;
        }

        // advance the input pointer past the end of this chunk: notice that we
        // will always stop before srcEnd because we know that the chunk is
        // always properly NUL-terminated
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        // if the buffer ends before this NUL, we shouldn't count it in our
        // output so skip the code below
        if ( src == srcEnd )
            break;

        // do count this terminator as it's inside the buffer we convert
        dstWritten++;
        if ( dst )
            dst++;

        src += nulLen; // skip the terminator itself

        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
293
// Default implementation of the wide character to multibyte conversion in
// terms of the legacy WC2MB().
//
// Parameters:
//  - dst: the output buffer or NULL if only the required size is wanted
//  - dstLen: the size of dst in bytes, not checked if dst is NULL
//  - src: the input, either NUL-terminated or srcLen wide characters long
//  - srcLen: the input length or wxNO_LEN if src is NUL-terminated
//
// Returns the number of bytes written to dst (or which would have been
// written if it were not NULL) or wxCONV_FAILED if WC2MB() fails or if dst is
// too small.
size_t
wxMBConv::FromWChar(char *dst, size_t dstLen,
                    const wchar_t *src, size_t srcLen) const
{
    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // if we don't know its length we have no choice but to assume that it is
    // NUL-terminated (notice that it can still be NUL-terminated even if
    // explicit length is given but it doesn't change our return value)
    const bool isNulTerminated = srcLen == wxNO_LEN;

    // make a copy of the input string unless it is already properly
    // NUL-terminated
    wxWCharBuffer bufTmp;
    if ( isNulTerminated )
    {
        // include the terminating NUL in the range we iterate over
        srcLen = wxWcslen(src) + 1;
    }
    else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
    {
        // make a copy in order to properly NUL-terminate the string
        // NOTE(review): this relies on wxWCharBuffer(srcLen) providing a
        // terminating NUL after the srcLen characters copied -- confirm
        bufTmp = wxWCharBuffer(srcLen);
        memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
        src = bufTmp;
    }

    // NOTE(review): unlike in ToWChar(), the result of GetMBNulLen() is not
    // compared with wxCONV_FAILED here
    const size_t lenNul = GetMBNulLen();
    for ( const wchar_t * const srcEnd = src + srcLen;
          src < srcEnd;
          src++ /* skip L'\0' too */ )
    {
        // try to convert the current chunk
        size_t lenChunk = WC2MB(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;

        // position of the NUL ending the current chunk
        const wchar_t * const
            chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);

        // our return value accounts for the trailing NUL(s), unlike that of
        // WC2MB(), however don't do it for the last NUL we artificially added
        // ourselves above
        if ( chunkEnd < srcEnd )
            dstWritten += lenNul;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // if we know that there is enough space in the destination buffer
            // (because we accounted for lenNul in dstWritten above), we can
            // convert directly in place -- but otherwise we need another
            // temporary buffer to ensure that we don't overwrite the output
            wxCharBuffer dstBuf;
            char *dstTmp;
            if ( chunkEnd == srcEnd )
            {
                dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
                dstTmp = dstBuf.data();
            }
            else
            {
                dstTmp = dst;
            }

            if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
                return wxCONV_FAILED;

            if ( dstTmp != dst )
            {
                // copy everything up to but excluding the terminating NUL(s)
                // into the real output buffer
                memcpy(dst, dstTmp, lenChunk);

                // micro-optimization: if dstTmp != dst it means that chunkEnd
                // == srcEnd and so we're done, no need to update anything below
                break;
            }

            dst += lenChunk;
            if ( chunkEnd < srcEnd )
                dst += lenNul;
        }

        // continue from the end of this chunk, the loop increment then skips
        // the NUL itself
        src = chunkEnd;
    }

    return dstWritten;
}
387
388 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
389 {
390 size_t rc = ToWChar(outBuff, outLen, inBuff);
391 if ( rc != wxCONV_FAILED )
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399 }
400
401 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
402 {
403 size_t rc = FromWChar(outBuff, outLen, inBuff);
404 if ( rc != wxCONV_FAILED )
405 {
406 rc -= GetMBNulLen();
407 }
408
409 return rc;
410 }
411
// The destructor is intentionally empty but is defined out of line.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
416
417 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418 {
419 if ( psz )
420 {
421 // calculate the length of the buffer needed first
422 const size_t nLen = ToWChar(NULL, 0, psz);
423 if ( nLen != wxCONV_FAILED )
424 {
425 // now do the actual conversion
426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
427
428 // +1 for the trailing NULL
429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
430 return buf;
431 }
432 }
433
434 return wxWCharBuffer();
435 }
436
437 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438 {
439 if ( pwz )
440 {
441 const size_t nLen = FromWChar(NULL, 0, pwz);
442 if ( nLen != wxCONV_FAILED )
443 {
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451 }
452
453 const wxWCharBuffer
454 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
455 {
456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
457 if ( dstLen != wxCONV_FAILED )
458 {
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
463 wbuf.data()[dstLen] = L'\0';
464 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
465 {
466 if ( outLen )
467 {
468 *outLen = dstLen;
469
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen == wxNO_LEN )
476 (*outLen)--;
477 }
478
479 return wbuf;
480 }
481 }
482
483 if ( outLen )
484 *outLen = 0;
485
486 return wxWCharBuffer();
487 }
488
489 const wxCharBuffer
490 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
491 {
492 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
493 if ( dstLen != wxCONV_FAILED )
494 {
495 const size_t nulLen = GetMBNulLen();
496
497 // as above, ensure that the buffer is always NUL-terminated, even if
498 // the input is not
499 wxCharBuffer buf(dstLen + nulLen - 1);
500 memset(buf.data() + dstLen, 0, nulLen);
501 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
502 {
503 if ( outLen )
504 {
505 *outLen = dstLen;
506
507 if ( inLen == wxNO_LEN )
508 {
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
511 *outLen -= nulLen;
512 }
513 }
514
515 return buf;
516 }
517 }
518
519 if ( outLen )
520 *outLen = 0;
521
522 return wxCharBuffer();
523 }
524
525 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
526 {
527 const size_t srcLen = buf.length();
528 if ( srcLen )
529 {
530 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
531 if ( dstLen != wxCONV_FAILED )
532 {
533 wxWCharBuffer wbuf(dstLen);
534 wbuf.data()[dstLen] = L'\0';
535 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
536 return wbuf;
537 }
538 }
539
540 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
541 }
542
543 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
544 {
545 const size_t srcLen = wbuf.length();
546 if ( srcLen )
547 {
548 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
549 if ( dstLen != wxCONV_FAILED )
550 {
551 wxCharBuffer buf(dstLen);
552 buf.data()[dstLen] = '\0';
553 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
554 return buf;
555 }
556 }
557
558 return wxScopedCharBuffer::CreateNonOwned("", 0);
559 }
560
561 // ----------------------------------------------------------------------------
562 // wxMBConvLibc
563 // ----------------------------------------------------------------------------
564
// Convert the multibyte string psz to wide characters by simply forwarding
// all the arguments to wxMB2WC() and returning its result unchanged.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
569
// Convert the wide character string psz to multibyte by simply forwarding all
// the arguments to wxWC2MB() and returning its result unchanged.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
574
575 // ----------------------------------------------------------------------------
576 // wxConvBrokenFileNames
577 // ----------------------------------------------------------------------------
578
579 #ifdef __UNIX__
580
581 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
582 {
583 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
584 wxStricmp(charset, wxT("UTF8")) == 0 )
585 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
586 else
587 m_conv = new wxCSConv(charset);
588 }
589
590 #endif // __UNIX__
591
592 // ----------------------------------------------------------------------------
593 // UTF-7
594 // ----------------------------------------------------------------------------
595
596 // Implementation (C) 2004 Fredrik Roubert
597 //
598 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
599
//
// BASE64 decoding table
//
// This table is indexed by the byte value and gives the 6 bit value encoded by
// this byte in the modified base64 used by UTF-7 or 0xff if the byte is not
// one of the base64 characters.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 00..07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 08..0F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 10..17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 18..1F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 20..27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 28..2F: '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // 30..37: '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 38..3F: '8' and '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 40..47: 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 48..4F: 'H'..'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 50..57: 'P'..'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 58..5F: 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 60..67: 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 68..6F: 'h'..'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 70..77: 'p'..'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 78..7F: 'x'..'z'

    // none of the bytes with the high bit set is valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 80..87
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 88..8F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 90..97
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 98..9F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // A0..A7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // A8..AF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // B0..B7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // B8..BF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // C0..C7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // C8..CF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // D0..D7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // D8..DF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // E0..E7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // E8..EF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // F0..F7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // F8..FF
};
638
639 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
640 const char *src, size_t srcLen) const
641 {
642 DecoderState stateOrig,
643 *statePtr;
644 if ( srcLen == wxNO_LEN )
645 {
646 // convert the entire string, up to and including the trailing NUL
647 srcLen = strlen(src) + 1;
648
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr = &stateOrig;
652 }
653 else // when working with partial strings we do use the shift state
654 {
655 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
656
657 // also save the old state to be able to rollback to it on error
658 stateOrig = m_stateDecoder;
659 }
660
661 // but to simplify the code below we use this variable in both cases
662 DecoderState& state = *statePtr;
663
664
665 // number of characters [which would have been] written to dst [if it were
666 // not NULL]
667 size_t len = 0;
668
669 const char * const srcEnd = src + srcLen;
670
671 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
672 {
673 const unsigned char cc = *src++;
674
675 if ( state.IsShifted() )
676 {
677 const unsigned char dc = utf7unb64[cc];
678 if ( dc == 0xff )
679 {
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
684 // characters)
685 if ( state.isLSB || state.bit > 4 ||
686 (state.accum & ((1 << state.bit) - 1)) )
687 {
688 if ( !len )
689 state = stateOrig;
690
691 return wxCONV_FAILED;
692 }
693
694 state.ToDirect();
695
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
698 if ( cc == '-' )
699 continue;
700 }
701 else // valid encoded character
702 {
703 // mini base64 decoder: each character is 6 bits
704 state.bit += 6;
705 state.accum <<= 6;
706 state.accum += dc;
707
708 if ( state.bit >= 8 )
709 {
710 // got the full byte, consume it
711 state.bit -= 8;
712 unsigned char b = (state.accum >> state.bit) & 0x00ff;
713
714 if ( state.isLSB )
715 {
716 // we've got the full word, output it
717 if ( dst )
718 *dst++ = (state.msb << 8) | b;
719 len++;
720 state.isLSB = false;
721 }
722 else // MSB
723 {
724 // just store it while we wait for LSB
725 state.msb = b;
726 state.isLSB = true;
727 }
728 }
729 }
730 }
731
732 if ( state.IsDirect() )
733 {
734 // start of an encoded segment?
735 if ( cc == '+' )
736 {
737 if ( *src == '-' )
738 {
739 // just the encoded plus sign, don't switch to shifted mode
740 if ( dst )
741 *dst++ = '+';
742 len++;
743 src++;
744 }
745 else if ( utf7unb64[(unsigned)*src] == 0xff )
746 {
747 // empty encoded chunks are not allowed
748 if ( !len )
749 state = stateOrig;
750
751 return wxCONV_FAILED;
752 }
753 else // base-64 encoded chunk follows
754 {
755 state.ToShifted();
756 }
757 }
758 else // not '+'
759 {
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc >= 0x7f || (cc < ' ' &&
763 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
764 return wxCONV_FAILED;
765
766 if ( dst )
767 *dst++ = cc;
768 len++;
769 }
770 }
771 }
772
773 if ( !len )
774 {
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
777 // state
778 state = stateOrig;
779
780 return wxCONV_FAILED;
781 }
782
783 return len;
784 }
785
//
// BASE64 encoding table
//
// This table is indexed by the 6 bit value and gives the character encoding
// it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', //  0..7
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', //  8..15
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', // 16..23
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', // 24..31
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', // 32..39
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v', // 40..47
    'w', 'x', 'y', 'z', '0', '1', '2', '3', // 48..55
    '4', '5', '6', '7', '8', '9', '+', '/'  // 56..63
};
800
//
// UTF-7 encoding table
//
// This table is indexed by the 7 bit character code and gives its class:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// returns true if the given character is in the class 0 of the table above,
// i.e. is one of the characters which are always represented by themselves
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // the range check must be done on an unsigned value: wchar_t is a signed
    // type on some platforms and a negative value would pass a signed
    // comparison with 0x80 and then index the table before its beginning
    return static_cast<unsigned long>(wc) < 0x80 && utf7encode[wc] < 1;
}
825
// Encode wide characters as UTF-7.
//
// Parameters:
//  - dst: the output buffer or NULL if only the required size is wanted
//  - dstLen: the size of dst in bytes, not checked if dst is NULL
//  - src: the input, either NUL-terminated or srcLen wide characters long
//  - srcLen: the input length or wxNO_LEN if src is NUL-terminated
//
// When an explicit length is given, the encoder state stored in m_stateEncoder
// by the previous call is used and, unless dst is NULL, updated; otherwise a
// fresh local state is used and the member is left alone.
//
// Returns the number of bytes produced or wxCONV_FAILED if a character
// greater than 0xffff is encountered in the build without WC_UTF16.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = const_cast<EncoderState *>(&m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    // number of bytes [which would have been] written to dst
    size_t len = 0;

    const wchar_t * const srcEnd = src + srcLen;
    // NOTE(review): the available space is only checked here, once per
    // iteration, while a single iteration may write several bytes below, so
    // the output doesn't look to be strictly limited to dstLen -- confirm
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                // explicitly terminate the encoded block
                if ( dst )
                    *dst++ = '-';
                len++;
            }

            // the character itself is output unchanged
            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // plus sign in direct mode is represented by the "+-" sequence
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            if ( state.IsDirect() )
            {
                state.ToShifted();

                // '+' starts the encoded block
                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // feed the character to the encoder one byte at a time, most
                // significant byte first
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // output a base64 character for each full group of 6 bits
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // stay in this loop until the end of input or the next
                // character which is output directly: it is not consumed here
                // but left for the next iteration of the outer loop
                //
                // NOTE(review): the characters consumed here are not checked
                // for being greater than 0xffff as is done above for the
                // first character of the encoded block
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
938
939 // ----------------------------------------------------------------------------
940 // UTF-8
941 // ----------------------------------------------------------------------------
942
// NOTE(review): presumably the maximal values which can be encoded using the
// UTF-8 sequences of 1, 2, ... bytes; it is not used in the part of the file
// preceding this declaration -- confirm against its users
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: there are 256 values in it,
// wxUnicodePUAEnd being the first value after the end of the range
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
950
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte of the sequence and contains
// the total number of bytes in it, or 0 if this byte can't start a sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid (continuation bytes 80..BF can't start a sequence
    // and C0 and C1 could only start the 2 byte encodings of the values
    // below 0x80):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
985
986 size_t
987 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989 {
990 wchar_t *out = dstLen ? dst : NULL;
991 size_t written = 0;
992
993 if ( srcLen == wxNO_LEN )
994 srcLen = strlen(src) + 1;
995
996 for ( const char *p = src; ; p++ )
997 {
998 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
999 {
1000 // all done successfully, just add the trailing NULL if we are not
1001 // using explicit length
1002 if ( srcLen == wxNO_LEN )
1003 {
1004 if ( out )
1005 {
1006 if ( !dstLen )
1007 break;
1008
1009 *out = L'\0';
1010 }
1011
1012 written++;
1013 }
1014
1015 return written;
1016 }
1017
1018 if ( out && !dstLen-- )
1019 break;
1020
1021 wxUint32 code;
1022 unsigned char c = *p;
1023
1024 if ( c < 0x80 )
1025 {
1026 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1027 break;
1028
1029 if ( srcLen != wxNO_LEN )
1030 srcLen--;
1031
1032 code = c;
1033 }
1034 else
1035 {
1036 unsigned len = tableUtf8Lengths[c];
1037 if ( !len )
1038 break;
1039
1040 if ( srcLen < len ) // the test works for wxNO_LEN too
1041 break;
1042
1043 if ( srcLen != wxNO_LEN )
1044 srcLen -= len;
1045
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1053 //
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1057
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1059 // length:
1060 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1061
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1065
1066 len--; // it's more convenient to work with 0-based length here
1067
1068 // extract the lead byte's value bits:
1069 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1070 break;
1071
1072 code = c & leadValueMask[len];
1073
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len; --len )
1077 {
1078 c = *++p;
1079 if ( (c & 0xC0) != 0x80 )
1080 return wxCONV_FAILED;
1081
1082 code <<= 6;
1083 code |= c & 0x3F;
1084 }
1085 }
1086
1087 #ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1090 {
1091 if ( out )
1092 out++;
1093 written++;
1094 }
1095 #else // !WC_UTF16
1096 if ( out )
1097 *out = code;
1098 #endif // WC_UTF16/!WC_UTF16
1099
1100 if ( out )
1101 out++;
1102
1103 written++;
1104 }
1105
1106 return wxCONV_FAILED;
1107 }
1108
1109 size_t
1110 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111 const wchar_t *src, size_t srcLen) const
1112 {
1113 char *out = dstLen ? dst : NULL;
1114 size_t written = 0;
1115
1116 for ( const wchar_t *wp = src; ; wp++ )
1117 {
1118 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
1119 {
1120 // all done successfully, just add the trailing NULL if we are not
1121 // using explicit length
1122 if ( srcLen == wxNO_LEN )
1123 {
1124 if ( out )
1125 {
1126 if ( !dstLen )
1127 break;
1128
1129 *out = '\0';
1130 }
1131
1132 written++;
1133 }
1134
1135 return written;
1136 }
1137
1138 if ( srcLen != wxNO_LEN )
1139 srcLen--;
1140
1141 wxUint32 code;
1142 #ifdef WC_UTF16
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145 {
1146 // skip the next char too as we decoded a surrogate
1147 wp++;
1148 }
1149 #else // wchar_t is UTF-32
1150 code = *wp & 0x7fffffff;
1151 #endif
1152
1153 unsigned len;
1154 if ( code <= 0x7F )
1155 {
1156 len = 1;
1157 if ( out )
1158 {
1159 if ( dstLen < len )
1160 break;
1161
1162 out[0] = (char)code;
1163 }
1164 }
1165 else if ( code <= 0x07FF )
1166 {
1167 len = 2;
1168 if ( out )
1169 {
1170 if ( dstLen < len )
1171 break;
1172
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1176 out[0] = 0xC0 | code;
1177 }
1178 }
1179 else if ( code < 0xFFFF )
1180 {
1181 len = 3;
1182 if ( out )
1183 {
1184 if ( dstLen < len )
1185 break;
1186
1187 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1188 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[0] = 0xE0 | code;
1190 }
1191 }
1192 else if ( code <= 0x10FFFF )
1193 {
1194 len = 4;
1195 if ( out )
1196 {
1197 if ( dstLen < len )
1198 break;
1199
1200 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1201 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[0] = 0xF0 | code;
1204 }
1205 }
1206 else
1207 {
1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1209 break;
1210 }
1211
1212 if ( out )
1213 {
1214 out += len;
1215 dstLen -= len;
1216 }
1217
1218 written += len;
1219 }
1220
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED;
1223 }
1224
1225 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1226 const char *psz, size_t srcLen) const
1227 {
1228 if ( m_options == MAP_INVALID_UTF8_NOT )
1229 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1230
1231 size_t len = 0;
1232
1233 // The length can be either given explicitly or computed implicitly for the
1234 // NUL-terminated strings.
1235 const bool isNulTerminated = srcLen == wxNO_LEN;
1236 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1237 {
1238 const char *opsz = psz;
1239 bool invalid = false;
1240 unsigned char cc = *psz++, fc = cc;
1241 unsigned cnt;
1242 for (cnt = 0; fc & 0x80; cnt++)
1243 fc <<= 1;
1244
1245 if (!cnt)
1246 {
1247 // plain ASCII char
1248 if (buf)
1249 *buf++ = cc;
1250 len++;
1251
1252 // escape the escape character for octal escapes
1253 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1254 && cc == '\\' && (!buf || len < n))
1255 {
1256 if (buf)
1257 *buf++ = cc;
1258 len++;
1259 }
1260 }
1261 else
1262 {
1263 cnt--;
1264 if (!cnt)
1265 {
1266 // invalid UTF-8 sequence
1267 invalid = true;
1268 }
1269 else
1270 {
1271 unsigned ocnt = cnt - 1;
1272 wxUint32 res = cc & (0x3f >> cnt);
1273 while (cnt--)
1274 {
1275 cc = *psz;
1276 if ((cc & 0xC0) != 0x80)
1277 {
1278 // invalid UTF-8 sequence
1279 invalid = true;
1280 break;
1281 }
1282
1283 psz++;
1284 res = (res << 6) | (cc & 0x3f);
1285 }
1286
1287 if (invalid || res <= utf8_max[ocnt])
1288 {
1289 // illegal UTF-8 encoding
1290 invalid = true;
1291 }
1292 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1293 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1294 {
1295 // if one of our PUA characters turns up externally
1296 // it must also be treated as an illegal sequence
1297 // (a bit like you have to escape an escape character)
1298 invalid = true;
1299 }
1300 else
1301 {
1302 #ifdef WC_UTF16
1303 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1304 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1305 if (pa == wxCONV_FAILED)
1306 {
1307 invalid = true;
1308 }
1309 else
1310 {
1311 if (buf)
1312 buf += pa;
1313 len += pa;
1314 }
1315 #else // !WC_UTF16
1316 if (buf)
1317 *buf++ = (wchar_t)res;
1318 len++;
1319 #endif // WC_UTF16/!WC_UTF16
1320 }
1321 }
1322
1323 if (invalid)
1324 {
1325 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1326 {
1327 while (opsz < psz && (!buf || len < n))
1328 {
1329 #ifdef WC_UTF16
1330 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1331 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1332 wxASSERT(pa != wxCONV_FAILED);
1333 if (buf)
1334 buf += pa;
1335 opsz++;
1336 len += pa;
1337 #else
1338 if (buf)
1339 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1340 opsz++;
1341 len++;
1342 #endif
1343 }
1344 }
1345 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1346 {
1347 while (opsz < psz && (!buf || len < n))
1348 {
1349 if ( buf && len + 3 < n )
1350 {
1351 unsigned char on = *opsz;
1352 *buf++ = L'\\';
1353 *buf++ = (wchar_t)( L'0' + on / 0100 );
1354 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1355 *buf++ = (wchar_t)( L'0' + on % 010 );
1356 }
1357
1358 opsz++;
1359 len += 4;
1360 }
1361 }
1362 else // MAP_INVALID_UTF8_NOT
1363 {
1364 return wxCONV_FAILED;
1365 }
1366 }
1367 }
1368 }
1369
1370 if ( isNulTerminated )
1371 {
1372 // Add the trailing NUL in this case if we have a large enough buffer.
1373 if ( buf && (len < n) )
1374 *buf = 0;
1375
1376 // And count it in any case.
1377 len++;
1378 }
1379
1380 return len;
1381 }
1382
// Tell whether the given wide character is an octal digit, i.e. is in the
// '0'..'7' range.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1387
1388 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1389 const wchar_t *psz, size_t srcLen) const
1390 {
1391 if ( m_options == MAP_INVALID_UTF8_NOT )
1392 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1393
1394 size_t len = 0;
1395
1396 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1397 {
1398 wxUint32 cc;
1399
1400 #ifdef WC_UTF16
1401 // cast is ok for WC_UTF16
1402 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1403 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1404 #else
1405 cc = (*psz++) & 0x7fffffff;
1406 #endif
1407
1408 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1409 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1410 {
1411 if (buf)
1412 *buf++ = (char)(cc - wxUnicodePUA);
1413 len++;
1414 }
1415 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1416 && cc == L'\\' && psz[0] == L'\\' )
1417 {
1418 if (buf)
1419 *buf++ = (char)cc;
1420 psz++;
1421 len++;
1422 }
1423 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1424 cc == L'\\' &&
1425 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1426 {
1427 if (buf)
1428 {
1429 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1430 (psz[1] - L'0') * 010 +
1431 (psz[2] - L'0'));
1432 }
1433
1434 psz += 3;
1435 len++;
1436 }
1437 else
1438 {
1439 unsigned cnt;
1440 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1441 {
1442 }
1443
1444 if (!cnt)
1445 {
1446 // plain ASCII char
1447 if (buf)
1448 *buf++ = (char) cc;
1449 len++;
1450 }
1451 else
1452 {
1453 len += cnt + 1;
1454 if (buf)
1455 {
1456 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1457 while (cnt--)
1458 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1459 }
1460 }
1461 }
1462 }
1463
1464 if (srcLen == wxNO_LEN && buf && (len < n))
1465 *buf = 0;
1466
1467 return len + 1;
1468 }
1469
1470 // ============================================================================
1471 // UTF-16
1472 // ============================================================================
1473
1474 #ifdef WORDS_BIGENDIAN
1475 #define wxMBConvUTF16straight wxMBConvUTF16BE
1476 #define wxMBConvUTF16swap wxMBConvUTF16LE
1477 #else
1478 #define wxMBConvUTF16swap wxMBConvUTF16BE
1479 #define wxMBConvUTF16straight wxMBConvUTF16LE
1480 #endif
1481
1482 /* static */
1483 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1484 {
1485 if ( srcLen == wxNO_LEN )
1486 {
1487 // count the number of bytes in input, including the trailing NULs
1488 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1489 for ( srcLen = 1; *inBuff++; srcLen++ )
1490 ;
1491
1492 srcLen *= BYTES_PER_CHAR;
1493 }
1494 else // we already have the length
1495 {
1496 // we can only convert an entire number of UTF-16 characters
1497 if ( srcLen % BYTES_PER_CHAR )
1498 return wxCONV_FAILED;
1499 }
1500
1501 return srcLen;
1502 }
1503
1504 // case when in-memory representation is UTF-16 too
1505 #ifdef WC_UTF16
1506
1507 // ----------------------------------------------------------------------------
1508 // conversions without endianness change
1509 // ----------------------------------------------------------------------------
1510
1511 size_t
1512 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1513 const char *src, size_t srcLen) const
1514 {
1515 // set up the scene for using memcpy() (which is presumably more efficient
1516 // than copying the bytes one by one)
1517 srcLen = GetLength(src, srcLen);
1518 if ( srcLen == wxNO_LEN )
1519 return wxCONV_FAILED;
1520
1521 const size_t inLen = srcLen / BYTES_PER_CHAR;
1522 if ( dst )
1523 {
1524 if ( dstLen < inLen )
1525 return wxCONV_FAILED;
1526
1527 memcpy(dst, src, srcLen);
1528 }
1529
1530 return inLen;
1531 }
1532
1533 size_t
1534 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1535 const wchar_t *src, size_t srcLen) const
1536 {
1537 if ( srcLen == wxNO_LEN )
1538 srcLen = wxWcslen(src) + 1;
1539
1540 srcLen *= BYTES_PER_CHAR;
1541
1542 if ( dst )
1543 {
1544 if ( dstLen < srcLen )
1545 return wxCONV_FAILED;
1546
1547 memcpy(dst, src, srcLen);
1548 }
1549
1550 return srcLen;
1551 }
1552
1553 // ----------------------------------------------------------------------------
1554 // endian-reversing conversions
1555 // ----------------------------------------------------------------------------
1556
1557 size_t
1558 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1559 const char *src, size_t srcLen) const
1560 {
1561 srcLen = GetLength(src, srcLen);
1562 if ( srcLen == wxNO_LEN )
1563 return wxCONV_FAILED;
1564
1565 srcLen /= BYTES_PER_CHAR;
1566
1567 if ( dst )
1568 {
1569 if ( dstLen < srcLen )
1570 return wxCONV_FAILED;
1571
1572 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1573 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1574 {
1575 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1576 }
1577 }
1578
1579 return srcLen;
1580 }
1581
1582 size_t
1583 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1584 const wchar_t *src, size_t srcLen) const
1585 {
1586 if ( srcLen == wxNO_LEN )
1587 srcLen = wxWcslen(src) + 1;
1588
1589 srcLen *= BYTES_PER_CHAR;
1590
1591 if ( dst )
1592 {
1593 if ( dstLen < srcLen )
1594 return wxCONV_FAILED;
1595
1596 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1597 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1598 {
1599 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1600 }
1601 }
1602
1603 return srcLen;
1604 }
1605
1606 #else // !WC_UTF16: wchar_t is UTF-32
1607
1608 // ----------------------------------------------------------------------------
1609 // conversions without endianness change
1610 // ----------------------------------------------------------------------------
1611
1612 size_t
1613 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1614 const char *src, size_t srcLen) const
1615 {
1616 srcLen = GetLength(src, srcLen);
1617 if ( srcLen == wxNO_LEN )
1618 return wxCONV_FAILED;
1619
1620 const size_t inLen = srcLen / BYTES_PER_CHAR;
1621 if ( !dst )
1622 {
1623 // optimization: return maximal space which could be needed for this
1624 // string even if the real size could be smaller if the buffer contains
1625 // any surrogates
1626 return inLen;
1627 }
1628
1629 size_t outLen = 0;
1630 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1631 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1632 {
1633 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1634 if ( !inBuff )
1635 return wxCONV_FAILED;
1636
1637 if ( ++outLen > dstLen )
1638 return wxCONV_FAILED;
1639
1640 *dst++ = ch;
1641 }
1642
1643
1644 return outLen;
1645 }
1646
1647 size_t
1648 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1649 const wchar_t *src, size_t srcLen) const
1650 {
1651 if ( srcLen == wxNO_LEN )
1652 srcLen = wxWcslen(src) + 1;
1653
1654 size_t outLen = 0;
1655 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1656 for ( size_t n = 0; n < srcLen; n++ )
1657 {
1658 wxUint16 cc[2] = { 0 };
1659 const size_t numChars = encode_utf16(*src++, cc);
1660 if ( numChars == wxCONV_FAILED )
1661 return wxCONV_FAILED;
1662
1663 outLen += numChars * BYTES_PER_CHAR;
1664 if ( outBuff )
1665 {
1666 if ( outLen > dstLen )
1667 return wxCONV_FAILED;
1668
1669 *outBuff++ = cc[0];
1670 if ( numChars == 2 )
1671 {
1672 // second character of a surrogate
1673 *outBuff++ = cc[1];
1674 }
1675 }
1676 }
1677
1678 return outLen;
1679 }
1680
1681 // ----------------------------------------------------------------------------
1682 // endian-reversing conversions
1683 // ----------------------------------------------------------------------------
1684
1685 size_t
1686 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1687 const char *src, size_t srcLen) const
1688 {
1689 srcLen = GetLength(src, srcLen);
1690 if ( srcLen == wxNO_LEN )
1691 return wxCONV_FAILED;
1692
1693 const size_t inLen = srcLen / BYTES_PER_CHAR;
1694 if ( !dst )
1695 {
1696 // optimization: return maximal space which could be needed for this
1697 // string even if the real size could be smaller if the buffer contains
1698 // any surrogates
1699 return inLen;
1700 }
1701
1702 size_t outLen = 0;
1703 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1704 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1705 {
1706 wxUint32 ch;
1707 wxUint16 tmp[2];
1708
1709 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1710 inBuff++;
1711 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1712
1713 const size_t numChars = decode_utf16(tmp, ch);
1714 if ( numChars == wxCONV_FAILED )
1715 return wxCONV_FAILED;
1716
1717 if ( numChars == 2 )
1718 inBuff++;
1719
1720 if ( ++outLen > dstLen )
1721 return wxCONV_FAILED;
1722
1723 *dst++ = ch;
1724 }
1725
1726
1727 return outLen;
1728 }
1729
1730 size_t
1731 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1732 const wchar_t *src, size_t srcLen) const
1733 {
1734 if ( srcLen == wxNO_LEN )
1735 srcLen = wxWcslen(src) + 1;
1736
1737 size_t outLen = 0;
1738 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1739 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1740 {
1741 wxUint16 cc[2] = { 0 };
1742 const size_t numChars = encode_utf16(*src, cc);
1743 if ( numChars == wxCONV_FAILED )
1744 return wxCONV_FAILED;
1745
1746 outLen += numChars * BYTES_PER_CHAR;
1747 if ( outBuff )
1748 {
1749 if ( outLen > dstLen )
1750 return wxCONV_FAILED;
1751
1752 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1753 if ( numChars == 2 )
1754 {
1755 // second character of a surrogate
1756 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1757 }
1758 }
1759 }
1760
1761 return outLen;
1762 }
1763
1764 #endif // WC_UTF16/!WC_UTF16
1765
1766
1767 // ============================================================================
1768 // UTF-32
1769 // ============================================================================
1770
1771 #ifdef WORDS_BIGENDIAN
1772 #define wxMBConvUTF32straight wxMBConvUTF32BE
1773 #define wxMBConvUTF32swap wxMBConvUTF32LE
1774 #else
1775 #define wxMBConvUTF32swap wxMBConvUTF32BE
1776 #define wxMBConvUTF32straight wxMBConvUTF32LE
1777 #endif
1778
1779
// global instances of the little and big endian UTF-32 converters
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1782
1783 /* static */
1784 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1785 {
1786 if ( srcLen == wxNO_LEN )
1787 {
1788 // count the number of bytes in input, including the trailing NULs
1789 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1790 for ( srcLen = 1; *inBuff++; srcLen++ )
1791 ;
1792
1793 srcLen *= BYTES_PER_CHAR;
1794 }
1795 else // we already have the length
1796 {
1797 // we can only convert an entire number of UTF-32 characters
1798 if ( srcLen % BYTES_PER_CHAR )
1799 return wxCONV_FAILED;
1800 }
1801
1802 return srcLen;
1803 }
1804
1805 // case when in-memory representation is UTF-16
1806 #ifdef WC_UTF16
1807
1808 // ----------------------------------------------------------------------------
1809 // conversions without endianness change
1810 // ----------------------------------------------------------------------------
1811
1812 size_t
1813 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1814 const char *src, size_t srcLen) const
1815 {
1816 srcLen = GetLength(src, srcLen);
1817 if ( srcLen == wxNO_LEN )
1818 return wxCONV_FAILED;
1819
1820 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1821 const size_t inLen = srcLen / BYTES_PER_CHAR;
1822 size_t outLen = 0;
1823 for ( size_t n = 0; n < inLen; n++ )
1824 {
1825 wxUint16 cc[2] = { 0 };
1826 const size_t numChars = encode_utf16(*inBuff++, cc);
1827 if ( numChars == wxCONV_FAILED )
1828 return wxCONV_FAILED;
1829
1830 outLen += numChars;
1831 if ( dst )
1832 {
1833 if ( outLen > dstLen )
1834 return wxCONV_FAILED;
1835
1836 *dst++ = cc[0];
1837 if ( numChars == 2 )
1838 {
1839 // second character of a surrogate
1840 *dst++ = cc[1];
1841 }
1842 }
1843 }
1844
1845 return outLen;
1846 }
1847
1848 size_t
1849 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1850 const wchar_t *src, size_t srcLen) const
1851 {
1852 if ( srcLen == wxNO_LEN )
1853 srcLen = wxWcslen(src) + 1;
1854
1855 if ( !dst )
1856 {
1857 // optimization: return maximal space which could be needed for this
1858 // string instead of the exact amount which could be less if there are
1859 // any surrogates in the input
1860 //
1861 // we consider that surrogates are rare enough to make it worthwhile to
1862 // avoid running the loop below at the cost of slightly extra memory
1863 // consumption
1864 return srcLen * BYTES_PER_CHAR;
1865 }
1866
1867 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1868 size_t outLen = 0;
1869 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1870 {
1871 const wxUint32 ch = wxDecodeSurrogate(&src);
1872 if ( !src )
1873 return wxCONV_FAILED;
1874
1875 outLen += BYTES_PER_CHAR;
1876
1877 if ( outLen > dstLen )
1878 return wxCONV_FAILED;
1879
1880 *outBuff++ = ch;
1881 }
1882
1883 return outLen;
1884 }
1885
1886 // ----------------------------------------------------------------------------
1887 // endian-reversing conversions
1888 // ----------------------------------------------------------------------------
1889
1890 size_t
1891 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1892 const char *src, size_t srcLen) const
1893 {
1894 srcLen = GetLength(src, srcLen);
1895 if ( srcLen == wxNO_LEN )
1896 return wxCONV_FAILED;
1897
1898 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1899 const size_t inLen = srcLen / BYTES_PER_CHAR;
1900 size_t outLen = 0;
1901 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1902 {
1903 wxUint16 cc[2] = { 0 };
1904 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1905 if ( numChars == wxCONV_FAILED )
1906 return wxCONV_FAILED;
1907
1908 outLen += numChars;
1909 if ( dst )
1910 {
1911 if ( outLen > dstLen )
1912 return wxCONV_FAILED;
1913
1914 *dst++ = cc[0];
1915 if ( numChars == 2 )
1916 {
1917 // second character of a surrogate
1918 *dst++ = cc[1];
1919 }
1920 }
1921 }
1922
1923 return outLen;
1924 }
1925
1926 size_t
1927 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1928 const wchar_t *src, size_t srcLen) const
1929 {
1930 if ( srcLen == wxNO_LEN )
1931 srcLen = wxWcslen(src) + 1;
1932
1933 if ( !dst )
1934 {
1935 // optimization: return maximal space which could be needed for this
1936 // string instead of the exact amount which could be less if there are
1937 // any surrogates in the input
1938 //
1939 // we consider that surrogates are rare enough to make it worthwhile to
1940 // avoid running the loop below at the cost of slightly extra memory
1941 // consumption
1942 return srcLen*BYTES_PER_CHAR;
1943 }
1944
1945 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1946 size_t outLen = 0;
1947 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1948 {
1949 const wxUint32 ch = wxDecodeSurrogate(&src);
1950 if ( !src )
1951 return wxCONV_FAILED;
1952
1953 outLen += BYTES_PER_CHAR;
1954
1955 if ( outLen > dstLen )
1956 return wxCONV_FAILED;
1957
1958 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1959 }
1960
1961 return outLen;
1962 }
1963
1964 #else // !WC_UTF16: wchar_t is UTF-32
1965
1966 // ----------------------------------------------------------------------------
1967 // conversions without endianness change
1968 // ----------------------------------------------------------------------------
1969
1970 size_t
1971 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1972 const char *src, size_t srcLen) const
1973 {
1974 // use memcpy() as it should be much faster than hand-written loop
1975 srcLen = GetLength(src, srcLen);
1976 if ( srcLen == wxNO_LEN )
1977 return wxCONV_FAILED;
1978
1979 const size_t inLen = srcLen/BYTES_PER_CHAR;
1980 if ( dst )
1981 {
1982 if ( dstLen < inLen )
1983 return wxCONV_FAILED;
1984
1985 memcpy(dst, src, srcLen);
1986 }
1987
1988 return inLen;
1989 }
1990
1991 size_t
1992 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1993 const wchar_t *src, size_t srcLen) const
1994 {
1995 if ( srcLen == wxNO_LEN )
1996 srcLen = wxWcslen(src) + 1;
1997
1998 srcLen *= BYTES_PER_CHAR;
1999
2000 if ( dst )
2001 {
2002 if ( dstLen < srcLen )
2003 return wxCONV_FAILED;
2004
2005 memcpy(dst, src, srcLen);
2006 }
2007
2008 return srcLen;
2009 }
2010
2011 // ----------------------------------------------------------------------------
2012 // endian-reversing conversions
2013 // ----------------------------------------------------------------------------
2014
2015 size_t
2016 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2017 const char *src, size_t srcLen) const
2018 {
2019 srcLen = GetLength(src, srcLen);
2020 if ( srcLen == wxNO_LEN )
2021 return wxCONV_FAILED;
2022
2023 srcLen /= BYTES_PER_CHAR;
2024
2025 if ( dst )
2026 {
2027 if ( dstLen < srcLen )
2028 return wxCONV_FAILED;
2029
2030 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2031 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2032 {
2033 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2034 }
2035 }
2036
2037 return srcLen;
2038 }
2039
2040 size_t
2041 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2042 const wchar_t *src, size_t srcLen) const
2043 {
2044 if ( srcLen == wxNO_LEN )
2045 srcLen = wxWcslen(src) + 1;
2046
2047 srcLen *= BYTES_PER_CHAR;
2048
2049 if ( dst )
2050 {
2051 if ( dstLen < srcLen )
2052 return wxCONV_FAILED;
2053
2054 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2055 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2056 {
2057 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2058 }
2059 }
2060
2061 return srcLen;
2062 }
2063
2064 #endif // WC_UTF16/!WC_UTF16
2065
2066
2067 // ============================================================================
2068 // The classes doing conversion using the iconv_xxx() functions
2069 // ============================================================================
2070
2071 #ifdef HAVE_ICONV
2072
2073 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2074 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2075 // (unless there's yet another bug in glibc) the only case when iconv()
2076 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2077 // left in the input buffer -- when _real_ error occurs,
2078 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2079 // iconv() failure.
2080 // [This bug does not appear in glibc 2.2.]
2081 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2082 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2083 (errno != E2BIG || bufLeft != 0))
2084 #else
2085 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2086 #endif
2087
2088 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2089
2090 #define ICONV_T_INVALID ((iconv_t)-1)
2091
2092 #if SIZEOF_WCHAR_T == 4
2093 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2094 #define WC_ENC wxFONTENCODING_UTF32
2095 #elif SIZEOF_WCHAR_T == 2
2096 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2097 #define WC_ENC wxFONTENCODING_UTF16
2098 #else // sizeof(wchar_t) != 2 nor 4
2099 // does this ever happen?
2100 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2101 #endif
2102
2103 // ----------------------------------------------------------------------------
2104 // wxMBConv_iconv: encapsulates an iconv character set
2105 // ----------------------------------------------------------------------------
2106
// Conversion class keeping two iconv descriptors, one for each direction of
// the conversion between the named multibyte encoding and wchar_t.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the name of the multibyte encoding; a copy of it is kept in
    // m_name
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // implement base class virtual methods
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t GetMBNulLen() const;

#if wxUSE_UNICODE_UTF8
    virtual bool IsUTF8() const;
#endif

    // create a new converter for the same encoding name: the new object gets
    // its own descriptors but inherits the cached m_minMBCharWidth value
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // the converter is usable only if both descriptors are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain NULL
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion
    const char *m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
2162
2163 // make the constructor available for unit testing
2164 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2165 {
2166 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2167 if ( !result->IsOk() )
2168 {
2169 delete result;
2170 return 0;
2171 }
2172
2173 return result;
2174 }
2175
// definitions of the static members: the charset name starts out empty, which
// the constructor takes to mean that the wide char charset still has to be
// looked for
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2178
2179 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2180 : m_name(wxStrdup(name))
2181 {
2182 m_minMBCharWidth = 0;
2183
2184 // check for charset that represents wchar_t:
2185 if ( ms_wcCharsetName.empty() )
2186 {
2187 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2188
2189 #if wxUSE_FONTMAP
2190 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2191 #else // !wxUSE_FONTMAP
2192 static const wxChar *const names_static[] =
2193 {
2194 #if SIZEOF_WCHAR_T == 4
2195 wxT("UCS-4"),
2196 #elif SIZEOF_WCHAR_T == 2
2197 wxT("UCS-2"),
2198 #endif
2199 NULL
2200 };
2201 const wxChar *const *names = names_static;
2202 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2203
2204 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2205 {
2206 const wxString nameCS(*names);
2207
2208 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2209 wxString nameXE(nameCS);
2210
2211 #ifdef WORDS_BIGENDIAN
2212 nameXE += wxT("BE");
2213 #else // little endian
2214 nameXE += wxT("LE");
2215 #endif
2216
2217 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2218 nameXE.c_str());
2219
2220 m2w = iconv_open(nameXE.ToAscii(), name);
2221 if ( m2w == ICONV_T_INVALID )
2222 {
2223 // try charset w/o bytesex info (e.g. "UCS4")
2224 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2225 nameCS.c_str());
2226 m2w = iconv_open(nameCS.ToAscii(), name);
2227
2228 // and check for bytesex ourselves:
2229 if ( m2w != ICONV_T_INVALID )
2230 {
2231 char buf[2], *bufPtr;
2232 wchar_t wbuf[2];
2233 size_t insz, outsz;
2234 size_t res;
2235
2236 buf[0] = 'A';
2237 buf[1] = 0;
2238 wbuf[0] = 0;
2239 insz = 2;
2240 outsz = SIZEOF_WCHAR_T * 2;
2241 char* wbufPtr = (char*)wbuf;
2242 bufPtr = buf;
2243
2244 res = iconv(
2245 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2246 &wbufPtr, &outsz);
2247
2248 if (ICONV_FAILED(res, insz))
2249 {
2250 wxLogLastError(wxT("iconv"));
2251 wxLogError(_("Conversion to charset '%s' doesn't work."),
2252 nameCS.c_str());
2253 }
2254 else // ok, can convert to this encoding, remember it
2255 {
2256 ms_wcCharsetName = nameCS;
2257 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2258 }
2259 }
2260 }
2261 else // use charset not requiring byte swapping
2262 {
2263 ms_wcCharsetName = nameXE;
2264 }
2265 }
2266
2267 wxLogTrace(TRACE_STRCONV,
2268 wxT("iconv wchar_t charset is \"%s\"%s"),
2269 ms_wcCharsetName.empty() ? wxString("<none>")
2270 : ms_wcCharsetName,
2271 ms_wcNeedsSwap ? wxT(" (needs swap)")
2272 : wxT(""));
2273 }
2274 else // we already have ms_wcCharsetName
2275 {
2276 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2277 }
2278
2279 if ( ms_wcCharsetName.empty() )
2280 {
2281 w2m = ICONV_T_INVALID;
2282 }
2283 else
2284 {
2285 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2286 if ( w2m == ICONV_T_INVALID )
2287 {
2288 wxLogTrace(TRACE_STRCONV,
2289 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2290 ms_wcCharsetName.c_str(), name);
2291 }
2292 }
2293 }
2294
2295 wxMBConv_iconv::~wxMBConv_iconv()
2296 {
2297 free(const_cast<char *>(m_name));
2298
2299 if ( m2w != ICONV_T_INVALID )
2300 iconv_close(m2w);
2301 if ( w2m != ICONV_T_INVALID )
2302 iconv_close(w2m);
2303 }
2304
2305 size_t
2306 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2307 const char *src, size_t srcLen) const
2308 {
2309 if ( srcLen == wxNO_LEN )
2310 {
2311 // find the string length: notice that must be done differently for
2312 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2313 // consecutive NULs
2314 const size_t nulLen = GetMBNulLen();
2315 switch ( nulLen )
2316 {
2317 default:
2318 return wxCONV_FAILED;
2319
2320 case 1:
2321 srcLen = strlen(src); // arguably more optimized than our version
2322 break;
2323
2324 case 2:
2325 case 4:
2326 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2327 // but they also have to start at character boundary and not
2328 // span two adjacent characters
2329 const char *p;
2330 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2331 ;
2332 srcLen = p - src;
2333 break;
2334 }
2335
2336 // when we're determining the length of the string ourselves we count
2337 // the terminating NUL(s) as part of it and always NUL-terminate the
2338 // output
2339 srcLen += nulLen;
2340 }
2341
2342 // we express length in the number of (wide) characters but iconv always
2343 // counts buffer sizes it in bytes
2344 dstLen *= SIZEOF_WCHAR_T;
2345
2346 #if wxUSE_THREADS
2347 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2348 // Unfortunately there are a couple of global wxCSConv objects such as
2349 // wxConvLocal that are used all over wx code, so we have to make sure
2350 // the handle is used by at most one thread at the time. Otherwise
2351 // only a few wx classes would be safe to use from non-main threads
2352 // as MB<->WC conversion would fail "randomly".
2353 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2354 #endif // wxUSE_THREADS
2355
2356 size_t res, cres;
2357 const char *pszPtr = src;
2358
2359 if ( dst )
2360 {
2361 char* bufPtr = (char*)dst;
2362
2363 // have destination buffer, convert there
2364 size_t dstLenOrig = dstLen;
2365 cres = iconv(m2w,
2366 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2367 &bufPtr, &dstLen);
2368
2369 // convert the number of bytes converted as returned by iconv to the
2370 // number of (wide) characters converted that we need
2371 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2372
2373 if (ms_wcNeedsSwap)
2374 {
2375 // convert to native endianness
2376 for ( unsigned i = 0; i < res; i++ )
2377 dst[i] = WC_BSWAP(dst[i]);
2378 }
2379 }
2380 else // no destination buffer
2381 {
2382 // convert using temp buffer to calculate the size of the buffer needed
2383 wchar_t tbuf[256];
2384 res = 0;
2385
2386 do
2387 {
2388 char* bufPtr = (char*)tbuf;
2389 dstLen = 8 * SIZEOF_WCHAR_T;
2390
2391 cres = iconv(m2w,
2392 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2393 &bufPtr, &dstLen );
2394
2395 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2396 }
2397 while ((cres == (size_t)-1) && (errno == E2BIG));
2398 }
2399
2400 if (ICONV_FAILED(cres, srcLen))
2401 {
2402 //VS: it is ok if iconv fails, hence trace only
2403 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2404 return wxCONV_FAILED;
2405 }
2406
2407 return res;
2408 }
2409
2410 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2411 const wchar_t *src, size_t srcLen) const
2412 {
2413 #if wxUSE_THREADS
2414 // NB: explained in MB2WC
2415 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2416 #endif
2417
2418 if ( srcLen == wxNO_LEN )
2419 srcLen = wxWcslen(src) + 1;
2420
2421 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2422 size_t outbuflen = dstLen;
2423 size_t res, cres;
2424
2425 wchar_t *tmpbuf = 0;
2426
2427 if (ms_wcNeedsSwap)
2428 {
2429 // need to copy to temp buffer to switch endianness
2430 // (doing WC_BSWAP twice on the original buffer won't work, as it
2431 // could be in read-only memory, or be accessed in some other thread)
2432 tmpbuf = (wchar_t *)malloc(inbuflen);
2433 for ( size_t i = 0; i < srcLen; i++ )
2434 tmpbuf[i] = WC_BSWAP(src[i]);
2435
2436 src = tmpbuf;
2437 }
2438
2439 char* inbuf = (char*)src;
2440 if ( dst )
2441 {
2442 // have destination buffer, convert there
2443 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2444
2445 res = dstLen - outbuflen;
2446 }
2447 else // no destination buffer
2448 {
2449 // convert using temp buffer to calculate the size of the buffer needed
2450 char tbuf[256];
2451 res = 0;
2452 do
2453 {
2454 dst = tbuf;
2455 outbuflen = WXSIZEOF(tbuf);
2456
2457 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2458
2459 res += WXSIZEOF(tbuf) - outbuflen;
2460 }
2461 while ((cres == (size_t)-1) && (errno == E2BIG));
2462 }
2463
2464 if (ms_wcNeedsSwap)
2465 {
2466 free(tmpbuf);
2467 }
2468
2469 if (ICONV_FAILED(cres, inbuflen))
2470 {
2471 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2472 return wxCONV_FAILED;
2473 }
2474
2475 return res;
2476 }
2477
2478 size_t wxMBConv_iconv::GetMBNulLen() const
2479 {
2480 if ( m_minMBCharWidth == 0 )
2481 {
2482 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2483
2484 #if wxUSE_THREADS
2485 // NB: explained in MB2WC
2486 wxMutexLocker lock(self->m_iconvMutex);
2487 #endif
2488
2489 const wchar_t *wnul = L"";
2490 char buf[8]; // should be enough for NUL in any encoding
2491 size_t inLen = sizeof(wchar_t),
2492 outLen = WXSIZEOF(buf);
2493 char *inBuff = (char *)wnul;
2494 char *outBuff = buf;
2495 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2496 {
2497 self->m_minMBCharWidth = (size_t)-1;
2498 }
2499 else // ok
2500 {
2501 self->m_minMBCharWidth = outBuff - buf;
2502 }
2503 }
2504
2505 return m_minMBCharWidth;
2506 }
2507
2508 #if wxUSE_UNICODE_UTF8
2509 bool wxMBConv_iconv::IsUTF8() const
2510 {
2511 return wxStricmp(m_name, "UTF-8") == 0 ||
2512 wxStricmp(m_name, "UTF8") == 0;
2513 }
2514 #endif
2515
2516 #endif // HAVE_ICONV
2517
2518
2519 // ============================================================================
2520 // Win32 conversion classes
2521 // ============================================================================
2522
2523 #ifdef wxHAVE_WIN32_MB2WC
2524
2525 // from utils.cpp
2526 #if wxUSE_FONTMAP
2527 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2528 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2529 #endif
2530
2531 class wxMBConv_win32 : public wxMBConv
2532 {
2533 public:
2534 wxMBConv_win32()
2535 {
2536 m_CodePage = CP_ACP;
2537 m_minMBCharWidth = 0;
2538 }
2539
2540 wxMBConv_win32(const wxMBConv_win32& conv)
2541 : wxMBConv()
2542 {
2543 m_CodePage = conv.m_CodePage;
2544 m_minMBCharWidth = conv.m_minMBCharWidth;
2545 }
2546
2547 #if wxUSE_FONTMAP
2548 wxMBConv_win32(const char* name)
2549 {
2550 m_CodePage = wxCharsetToCodepage(name);
2551 m_minMBCharWidth = 0;
2552 }
2553
2554 wxMBConv_win32(wxFontEncoding encoding)
2555 {
2556 m_CodePage = wxEncodingToCodepage(encoding);
2557 m_minMBCharWidth = 0;
2558 }
2559 #endif // wxUSE_FONTMAP
2560
2561 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2562 {
2563 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2564 // the behaviour is not compatible with the Unix version (using iconv)
2565 // and break the library itself, e.g. wxTextInputStream::NextChar()
2566 // wouldn't work if reading an incomplete MB char didn't result in an
2567 // error
2568 //
2569 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2570 // Win XP or newer and it is not supported for UTF-[78] so we always
2571 // use our own conversions in this case. See
2572 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2573 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2574 if ( m_CodePage == CP_UTF8 )
2575 {
2576 return wxMBConvUTF8().MB2WC(buf, psz, n);
2577 }
2578
2579 if ( m_CodePage == CP_UTF7 )
2580 {
2581 return wxMBConvUTF7().MB2WC(buf, psz, n);
2582 }
2583
2584 int flags = 0;
2585 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2586 IsAtLeastWin2kSP4() )
2587 {
2588 flags = MB_ERR_INVALID_CHARS;
2589 }
2590
2591 const size_t len = ::MultiByteToWideChar
2592 (
2593 m_CodePage, // code page
2594 flags, // flags: fall on error
2595 psz, // input string
2596 -1, // its length (NUL-terminated)
2597 buf, // output string
2598 buf ? n : 0 // size of output buffer
2599 );
2600 if ( !len )
2601 {
2602 // function totally failed
2603 return wxCONV_FAILED;
2604 }
2605
2606 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2607 // check if we succeeded, by doing a double trip:
2608 if ( !flags && buf )
2609 {
2610 const size_t mbLen = strlen(psz);
2611 wxCharBuffer mbBuf(mbLen);
2612 if ( ::WideCharToMultiByte
2613 (
2614 m_CodePage,
2615 0,
2616 buf,
2617 -1,
2618 mbBuf.data(),
2619 mbLen + 1, // size in bytes, not length
2620 NULL,
2621 NULL
2622 ) == 0 ||
2623 strcmp(mbBuf, psz) != 0 )
2624 {
2625 // we didn't obtain the same thing we started from, hence
2626 // the conversion was lossy and we consider that it failed
2627 return wxCONV_FAILED;
2628 }
2629 }
2630
2631 // note that it returns count of written chars for buf != NULL and size
2632 // of the needed buffer for buf == NULL so in either case the length of
2633 // the string (which never includes the terminating NUL) is one less
2634 return len - 1;
2635 }
2636
2637 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2638 {
2639 /*
2640 we have a problem here: by default, WideCharToMultiByte() may
2641 replace characters unrepresentable in the target code page with bad
2642 quality approximations such as turning "1/2" symbol (U+00BD) into
2643 "1" for the code pages which don't have it and we, obviously, want
2644 to avoid this at any price
2645
2646 the trouble is that this function does it _silently_, i.e. it won't
2647 even tell us whether it did or not... Win98/2000 and higher provide
2648 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2649 we have to resort to a round trip, i.e. check that converting back
2650 results in the same string -- this is, of course, expensive but
2651 otherwise we simply can't be sure to not garble the data.
2652 */
2653
2654 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2655 // it doesn't work with CJK encodings (which we test for rather roughly
2656 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2657 // supporting it
2658 BOOL usedDef wxDUMMY_INITIALIZE(false);
2659 BOOL *pUsedDef;
2660 int flags;
2661 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2662 {
2663 // it's our lucky day
2664 flags = WC_NO_BEST_FIT_CHARS;
2665 pUsedDef = &usedDef;
2666 }
2667 else // old system or unsupported encoding
2668 {
2669 flags = 0;
2670 pUsedDef = NULL;
2671 }
2672
2673 const size_t len = ::WideCharToMultiByte
2674 (
2675 m_CodePage, // code page
2676 flags, // either none or no best fit
2677 pwz, // input string
2678 -1, // it is (wide) NUL-terminated
2679 buf, // output buffer
2680 buf ? n : 0, // and its size
2681 NULL, // default "replacement" char
2682 pUsedDef // [out] was it used?
2683 );
2684
2685 if ( !len )
2686 {
2687 // function totally failed
2688 return wxCONV_FAILED;
2689 }
2690
2691 // we did something, check if we really succeeded
2692 if ( flags )
2693 {
2694 // check if the conversion failed, i.e. if any replacements
2695 // were done
2696 if ( usedDef )
2697 return wxCONV_FAILED;
2698 }
2699 else // we must resort to double tripping...
2700 {
2701 // first we need to ensure that we really have the MB data: this is
2702 // not the case if we're called with NULL buffer, in which case we
2703 // need to do the conversion yet again
2704 wxCharBuffer bufDef;
2705 if ( !buf )
2706 {
2707 bufDef = wxCharBuffer(len);
2708 buf = bufDef.data();
2709 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2710 buf, len, NULL, NULL) )
2711 return wxCONV_FAILED;
2712 }
2713
2714 if ( !n )
2715 n = wcslen(pwz);
2716 wxWCharBuffer wcBuf(n);
2717 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2718 wcscmp(wcBuf, pwz) != 0 )
2719 {
2720 // we didn't obtain the same thing we started from, hence
2721 // the conversion was lossy and we consider that it failed
2722 return wxCONV_FAILED;
2723 }
2724 }
2725
2726 // see the comment above for the reason of "len - 1"
2727 return len - 1;
2728 }
2729
2730 virtual size_t GetMBNulLen() const
2731 {
2732 if ( m_minMBCharWidth == 0 )
2733 {
2734 int len = ::WideCharToMultiByte
2735 (
2736 m_CodePage, // code page
2737 0, // no flags
2738 L"", // input string
2739 1, // translate just the NUL
2740 NULL, // output buffer
2741 0, // and its size
2742 NULL, // no replacement char
2743 NULL // [out] don't care if it was used
2744 );
2745
2746 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2747 switch ( len )
2748 {
2749 default:
2750 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2751 self->m_minMBCharWidth = (size_t)-1;
2752 break;
2753
2754 case 0:
2755 self->m_minMBCharWidth = (size_t)-1;
2756 break;
2757
2758 case 1:
2759 case 2:
2760 case 4:
2761 self->m_minMBCharWidth = len;
2762 break;
2763 }
2764 }
2765
2766 return m_minMBCharWidth;
2767 }
2768
2769 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2770
2771 bool IsOk() const { return m_CodePage != -1; }
2772
2773 private:
2774 static bool CanUseNoBestFit()
2775 {
2776 static int s_isWin98Or2k = -1;
2777
2778 if ( s_isWin98Or2k == -1 )
2779 {
2780 int verMaj, verMin;
2781 switch ( wxGetOsVersion(&verMaj, &verMin) )
2782 {
2783 case wxOS_WINDOWS_9X:
2784 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2785 break;
2786
2787 case wxOS_WINDOWS_NT:
2788 s_isWin98Or2k = verMaj >= 5;
2789 break;
2790
2791 default:
2792 // unknown: be conservative by default
2793 s_isWin98Or2k = 0;
2794 break;
2795 }
2796
2797 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2798 }
2799
2800 return s_isWin98Or2k == 1;
2801 }
2802
2803 static bool IsAtLeastWin2kSP4()
2804 {
2805 #ifdef __WXWINCE__
2806 return false;
2807 #else
2808 static int s_isAtLeastWin2kSP4 = -1;
2809
2810 if ( s_isAtLeastWin2kSP4 == -1 )
2811 {
2812 OSVERSIONINFOEX ver;
2813
2814 memset(&ver, 0, sizeof(ver));
2815 ver.dwOSVersionInfoSize = sizeof(ver);
2816 GetVersionEx((OSVERSIONINFO*)&ver);
2817
2818 s_isAtLeastWin2kSP4 =
2819 ((ver.dwMajorVersion > 5) || // Vista+
2820 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2821 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2822 ver.wServicePackMajor >= 4)) // 2000 SP4+
2823 ? 1 : 0;
2824 }
2825
2826 return s_isAtLeastWin2kSP4 == 1;
2827 #endif
2828 }
2829
2830
2831 // the code page we're working with
2832 long m_CodePage;
2833
2834 // cached result of GetMBNulLen(), set to 0 initially meaning
2835 // "unknown"
2836 size_t m_minMBCharWidth;
2837 };
2838
2839 #endif // wxHAVE_WIN32_MB2WC
2840
2841
2842 // ============================================================================
2843 // wxEncodingConverter based conversion classes
2844 // ============================================================================
2845
2846 #if wxUSE_FONTMAP
2847
2848 class wxMBConv_wxwin : public wxMBConv
2849 {
2850 private:
2851 void Init()
2852 {
2853 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2854 // The wxMBConv_cf class does a better job.
2855 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2856 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2857 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2858 }
2859
2860 public:
2861 // temporarily just use wxEncodingConverter stuff,
2862 // so that it works while a better implementation is built
2863 wxMBConv_wxwin(const char* name)
2864 {
2865 if (name)
2866 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2867 else
2868 m_enc = wxFONTENCODING_SYSTEM;
2869
2870 Init();
2871 }
2872
2873 wxMBConv_wxwin(wxFontEncoding enc)
2874 {
2875 m_enc = enc;
2876
2877 Init();
2878 }
2879
2880 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2881 {
2882 size_t inbuf = strlen(psz);
2883 if (buf)
2884 {
2885 if (!m2w.Convert(psz, buf))
2886 return wxCONV_FAILED;
2887 }
2888 return inbuf;
2889 }
2890
2891 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2892 {
2893 const size_t inbuf = wxWcslen(psz);
2894 if (buf)
2895 {
2896 if (!w2m.Convert(psz, buf))
2897 return wxCONV_FAILED;
2898 }
2899
2900 return inbuf;
2901 }
2902
2903 virtual size_t GetMBNulLen() const
2904 {
2905 switch ( m_enc )
2906 {
2907 case wxFONTENCODING_UTF16BE:
2908 case wxFONTENCODING_UTF16LE:
2909 return 2;
2910
2911 case wxFONTENCODING_UTF32BE:
2912 case wxFONTENCODING_UTF32LE:
2913 return 4;
2914
2915 default:
2916 return 1;
2917 }
2918 }
2919
2920 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2921
2922 bool IsOk() const { return m_ok; }
2923
2924 public:
2925 wxFontEncoding m_enc;
2926 wxEncodingConverter m2w, w2m;
2927
2928 private:
2929 // were we initialized successfully?
2930 bool m_ok;
2931
2932 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2933 };
2934
2935 // make the constructors available for unit testing
2936 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2937 {
2938 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2939 if ( !result->IsOk() )
2940 {
2941 delete result;
2942 return 0;
2943 }
2944
2945 return result;
2946 }
2947
2948 #endif // wxUSE_FONTMAP
2949
2950 // ============================================================================
2951 // wxCSConv implementation
2952 // ============================================================================
2953
2954 void wxCSConv::Init()
2955 {
2956 m_name = NULL;
2957 m_convReal = NULL;
2958 }
2959
2960 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2961 {
2962 switch ( encoding )
2963 {
2964 case wxFONTENCODING_MAX:
2965 case wxFONTENCODING_SYSTEM:
2966 if ( m_name )
2967 {
2968 // It's ok to not have encoding value if we have a name for it.
2969 m_encoding = wxFONTENCODING_SYSTEM;
2970 }
2971 else // No name neither.
2972 {
2973 // Fall back to the system default encoding in this case (not
2974 // sure how much sense does this make but this is how the old
2975 // code used to behave).
2976 #if wxUSE_INTL
2977 m_encoding = wxLocale::GetSystemEncoding();
2978 if ( m_encoding == wxFONTENCODING_SYSTEM )
2979 #endif // wxUSE_INTL
2980 m_encoding = wxFONTENCODING_ISO8859_1;
2981 }
2982 break;
2983
2984 case wxFONTENCODING_DEFAULT:
2985 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2986 m_encoding = wxFONTENCODING_ISO8859_1;
2987 break;
2988
2989 default:
2990 // Just use the provided encoding.
2991 m_encoding = encoding;
2992 }
2993 }
2994
2995 wxCSConv::wxCSConv(const wxString& charset)
2996 {
2997 Init();
2998
2999 if ( !charset.empty() )
3000 {
3001 SetName(charset.ToAscii());
3002 }
3003
3004 #if wxUSE_FONTMAP
3005 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
3006 #else
3007 SetEncoding(wxFONTENCODING_SYSTEM);
3008 #endif
3009
3010 m_convReal = DoCreate();
3011 }
3012
3013 wxCSConv::wxCSConv(wxFontEncoding encoding)
3014 {
3015 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3016 {
3017 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3018
3019 encoding = wxFONTENCODING_SYSTEM;
3020 }
3021
3022 Init();
3023
3024 SetEncoding(encoding);
3025
3026 m_convReal = DoCreate();
3027 }
3028
3029 wxCSConv::~wxCSConv()
3030 {
3031 Clear();
3032 }
3033
3034 wxCSConv::wxCSConv(const wxCSConv& conv)
3035 : wxMBConv()
3036 {
3037 Init();
3038
3039 SetName(conv.m_name);
3040 SetEncoding(conv.m_encoding);
3041
3042 m_convReal = DoCreate();
3043 }
3044
3045 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3046 {
3047 Clear();
3048
3049 SetName(conv.m_name);
3050 SetEncoding(conv.m_encoding);
3051
3052 m_convReal = DoCreate();
3053
3054 return *this;
3055 }
3056
3057 void wxCSConv::Clear()
3058 {
3059 free(m_name);
3060 m_name = NULL;
3061
3062 wxDELETE(m_convReal);
3063 }
3064
3065 void wxCSConv::SetName(const char *charset)
3066 {
3067 if ( charset )
3068 m_name = wxStrdup(charset);
3069 }
3070
3071 #if wxUSE_FONTMAP
3072
3073 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3074 wxEncodingNameCache );
3075
3076 static wxEncodingNameCache gs_nameCache;
3077 #endif
3078
3079 wxMBConv *wxCSConv::DoCreate() const
3080 {
3081 #if wxUSE_FONTMAP
3082 wxLogTrace(TRACE_STRCONV,
3083 wxT("creating conversion for %s"),
3084 (m_name ? m_name
3085 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3086 #endif // wxUSE_FONTMAP
3087
3088 // check for the special case of ASCII or ISO8859-1 charset: as we have
3089 // special knowledge of it anyhow, we don't need to create a special
3090 // conversion object
3091 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3092 {
3093 // don't convert at all
3094 return NULL;
3095 }
3096
3097 // we trust OS to do conversion better than we can so try external
3098 // conversion methods first
3099 //
3100 // the full order is:
3101 // 1. OS conversion (iconv() under Unix or Win32 API)
3102 // 2. hard coded conversions for UTF
3103 // 3. wxEncodingConverter as fall back
3104
3105 // step (1)
3106 #ifdef HAVE_ICONV
3107 #if !wxUSE_FONTMAP
3108 if ( m_name )
3109 #endif // !wxUSE_FONTMAP
3110 {
3111 #if wxUSE_FONTMAP
3112 wxFontEncoding encoding(m_encoding);
3113 #endif
3114
3115 if ( m_name )
3116 {
3117 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3118 if ( conv->IsOk() )
3119 return conv;
3120
3121 delete conv;
3122
3123 #if wxUSE_FONTMAP
3124 encoding =
3125 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3126 #endif // wxUSE_FONTMAP
3127 }
3128 #if wxUSE_FONTMAP
3129 {
3130 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3131 if ( it != gs_nameCache.end() )
3132 {
3133 if ( it->second.empty() )
3134 return NULL;
3135
3136 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3137 if ( conv->IsOk() )
3138 return conv;
3139
3140 delete conv;
3141 }
3142
3143 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3144 // CS : in case this does not return valid names (eg for MacRoman)
3145 // encoding got a 'failure' entry in the cache all the same,
3146 // although it just has to be created using a different method, so
3147 // only store failed iconv creation attempts (or perhaps we
3148 // shoulnd't do this at all ?)
3149 if ( names[0] != NULL )
3150 {
3151 for ( ; *names; ++names )
3152 {
3153 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3154 // will need changes that will obsolete this
3155 wxString name(*names);
3156 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3157 if ( conv->IsOk() )
3158 {
3159 gs_nameCache[encoding] = *names;
3160 return conv;
3161 }
3162
3163 delete conv;
3164 }
3165
3166 gs_nameCache[encoding] = wxT(""); // cache the failure
3167 }
3168 }
3169 #endif // wxUSE_FONTMAP
3170 }
3171 #endif // HAVE_ICONV
3172
3173 #ifdef wxHAVE_WIN32_MB2WC
3174 {
3175 #if wxUSE_FONTMAP
3176 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3177 : new wxMBConv_win32(m_encoding);
3178 if ( conv->IsOk() )
3179 return conv;
3180
3181 delete conv;
3182 #else
3183 return NULL;
3184 #endif
3185 }
3186 #endif // wxHAVE_WIN32_MB2WC
3187
3188 #ifdef __DARWIN__
3189 {
3190 // leave UTF16 and UTF32 to the built-ins of wx
3191 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3192 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3193 {
3194 #if wxUSE_FONTMAP
3195 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3196 : new wxMBConv_cf(m_encoding);
3197 #else
3198 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3199 #endif
3200
3201 if ( conv->IsOk() )
3202 return conv;
3203
3204 delete conv;
3205 }
3206 }
3207 #endif // __DARWIN__
3208
3209 // step (2)
3210 wxFontEncoding enc = m_encoding;
3211 #if wxUSE_FONTMAP
3212 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3213 {
3214 // use "false" to suppress interactive dialogs -- we can be called from
3215 // anywhere and popping up a dialog from here is the last thing we want to
3216 // do
3217 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3218 }
3219 #endif // wxUSE_FONTMAP
3220
3221 switch ( enc )
3222 {
3223 case wxFONTENCODING_UTF7:
3224 return new wxMBConvUTF7;
3225
3226 case wxFONTENCODING_UTF8:
3227 return new wxMBConvUTF8;
3228
3229 case wxFONTENCODING_UTF16BE:
3230 return new wxMBConvUTF16BE;
3231
3232 case wxFONTENCODING_UTF16LE:
3233 return new wxMBConvUTF16LE;
3234
3235 case wxFONTENCODING_UTF32BE:
3236 return new wxMBConvUTF32BE;
3237
3238 case wxFONTENCODING_UTF32LE:
3239 return new wxMBConvUTF32LE;
3240
3241 default:
3242 // nothing to do but put here to suppress gcc warnings
3243 break;
3244 }
3245
3246 // step (3)
3247 #if wxUSE_FONTMAP
3248 {
3249 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3250 : new wxMBConv_wxwin(m_encoding);
3251 if ( conv->IsOk() )
3252 return conv;
3253
3254 delete conv;
3255 }
3256
3257 wxLogTrace(TRACE_STRCONV,
3258 wxT("encoding \"%s\" is not supported by this system"),
3259 (m_name ? wxString(m_name)
3260 : wxFontMapperBase::GetEncodingName(m_encoding)));
3261 #endif // wxUSE_FONTMAP
3262
3263 return NULL;
3264 }
3265
3266 bool wxCSConv::IsOk() const
3267 {
3268 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3269 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3270 return true; // always ok as we do it ourselves
3271
3272 // m_convReal->IsOk() is called at its own creation, so we know it must
3273 // be ok if m_convReal is non-NULL
3274 return m_convReal != NULL;
3275 }
3276
3277 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3278 const char *src, size_t srcLen) const
3279 {
3280 if (m_convReal)
3281 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3282
3283 // latin-1 (direct)
3284 if ( srcLen == wxNO_LEN )
3285 srcLen = strlen(src) + 1; // take trailing NUL too
3286
3287 if ( dst )
3288 {
3289 if ( dstLen < srcLen )
3290 return wxCONV_FAILED;
3291
3292 for ( size_t n = 0; n < srcLen; n++ )
3293 dst[n] = (unsigned char)(src[n]);
3294 }
3295
3296 return srcLen;
3297 }
3298
3299 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3300 const wchar_t *src, size_t srcLen) const
3301 {
3302 if (m_convReal)
3303 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3304
3305 // latin-1 (direct)
3306 if ( srcLen == wxNO_LEN )
3307 srcLen = wxWcslen(src) + 1;
3308
3309 if ( dst )
3310 {
3311 if ( dstLen < srcLen )
3312 return wxCONV_FAILED;
3313
3314 for ( size_t n = 0; n < srcLen; n++ )
3315 {
3316 if ( src[n] > 0xFF )
3317 return wxCONV_FAILED;
3318
3319 dst[n] = (char)src[n];
3320 }
3321
3322 }
3323 else // still need to check the input validity
3324 {
3325 for ( size_t n = 0; n < srcLen; n++ )
3326 {
3327 if ( src[n] > 0xFF )
3328 return wxCONV_FAILED;
3329 }
3330 }
3331
3332 return srcLen;
3333 }
3334
3335 size_t wxCSConv::GetMBNulLen() const
3336 {
3337 if ( m_convReal )
3338 return m_convReal->GetMBNulLen();
3339
3340 // otherwise, we are ISO-8859-1
3341 return 1;
3342 }
3343
3344 #if wxUSE_UNICODE_UTF8
3345 bool wxCSConv::IsUTF8() const
3346 {
3347 if ( m_convReal )
3348 return m_convReal->IsUTF8();
3349
3350 // otherwise, we are ISO-8859-1
3351 return false;
3352 }
3353 #endif
3354
3355
3356 #if wxUSE_UNICODE
3357
3358 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3359 {
3360 if ( !s )
3361 return wxWCharBuffer();
3362
3363 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3364 if ( !wbuf )
3365 wbuf = wxMBConvUTF8().cMB2WX(s);
3366 if ( !wbuf )
3367 wbuf = wxConvISO8859_1.cMB2WX(s);
3368
3369 return wbuf;
3370 }
3371
3372 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3373 {
3374 if ( !ws )
3375 return wxCharBuffer();
3376
3377 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3378 if ( !buf )
3379 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3380
3381 return buf;
3382 }
3383
3384 #endif // wxUSE_UNICODE
3385
3386 // ----------------------------------------------------------------------------
3387 // globals
3388 // ----------------------------------------------------------------------------
3389
3390 // NB: The reason why we create converter objects in this convoluted way,
3391 // using a factory function instead of a global variable, is that they
3392 // may be used at static initialization time (some of them are used by
3393 // wxString ctors and there may be a global wxString object). In other
3394 // words, possibly _before_ the converter global object would be
3395 // initialized.
3396
3397 #undef wxConvLibc
3398 #undef wxConvUTF8
3399 #undef wxConvUTF7
3400 #undef wxConvLocal
3401 #undef wxConvISO8859_1
3402
// Define everything needed for the global converter called "name":
//  - the exported pointer name##Ptr, initially NULL
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args (the returned pointer has type klass*)
//  - a file-static pointer initialized by calling the accessor
//
// The macro expansion doesn't end with a semicolon, it must be provided at
// the point of use.
3403 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3404 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3405 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3406 { \
3407 static impl_klass name##Obj ctor_args; \
3408 return &name##Obj; \
3409 } \
3410 /* this ensures that all global converter objects are created */ \
3411 /* by the time static initialization is done, i.e. before any */ \
3412 /* thread is launched: */ \
3413 static klass* gs_##name##instance = wxGet_##name##Ptr()
3414
// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the class of the
// object and the class of the pointer returned by the accessor are the same.
3415 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3416 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3417
3418 #ifdef __INTELC__
3419 // disable warning "variable 'xxx' was declared but never referenced"
3420 #pragma warning(disable: 177)
3421 #endif // Intel C++
3422
// wxConvLibc is implemented by wxMBConv_win32 using the default (ANSI) code
// page under Windows and by wxMBConvLibc elsewhere; the wxMBConv_cf branch is
// disabled by the "#elif 0"
3423 #ifdef __WINDOWS__
3424 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3425 #elif 0 // defined(__WXOSX__)
3426 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
3427 #else
3428 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3429 #endif
3430
3431 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3432 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3433 // provokes an error message about "not enough macro parameters"; and we
3434 // can't use "()" here as the name##Obj declaration would be parsed as a
3435 // function declaration then, so use a semicolon and live with an extra
3436 // empty statement (and hope that no compiler warns about this)
3437 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3438 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3439
// these converters are wxCSConv objects created for the given encodings
3440 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3441 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3442
// wxConvCurrent initially points to wxConvLibc and wxConvUI to wxConvLocal
3443 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3444 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3445
3446 #ifdef __DARWIN__
3447 // It is important to use this conversion object under Darwin as it ensures
3448 // that Unicode strings are (re)composed correctly even though xnu kernel uses
3449 // decomposed form internally (at least for the file names).
3450 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3451 #endif
3452
// wxConvFileName initially points to the converter defined above under Darwin
// and to wxConvLibc everywhere else
3453 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3454 #ifdef __DARWIN__
3455 &wxConvMacUTF8DObj;
3456 #else // !__DARWIN__
3457 wxGet_wxConvLibcPtr();
3458 #endif // __DARWIN__/!__DARWIN__