]> git.saurik.com Git - wxWidgets.git/blame_incremental - src/common/strconv.cpp
add very simple (but already exposing many problems) wxIPC benchmark
[wxWidgets.git] / src / common / strconv.cpp
... / ...
CommitLineData
1/////////////////////////////////////////////////////////////////////////////
2// Name: src/common/strconv.cpp
3// Purpose: Unicode conversion classes
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
11// (c) 2004 Ryan Norton, Fredrik Roubert
12// Licence: wxWindows licence
13/////////////////////////////////////////////////////////////////////////////
14
15// For compilers that support precompilation, includes "wx.h".
16#include "wx/wxprec.h"
17
18#ifdef __BORLANDC__
19 #pragma hdrstop
20#endif //__BORLANDC__
21
22#ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27#endif
28
29#include "wx/strconv.h"
30
31#if wxUSE_WCHAR_T
32
33#ifndef __WXWINCE__
34#include <errno.h>
35#endif
36
37#include <ctype.h>
38#include <string.h>
39#include <stdlib.h>
40
41#if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45#endif
46
47#ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50#endif
51
52#include "wx/encconv.h"
53#include "wx/fontmap.h"
54
55#ifdef __DARWIN__
56#include "wx/osx/core/private/strconv_cf.h"
57#endif //def __DARWIN__
58
59
60#define TRACE_STRCONV _T("strconv")
61
62// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63// be 4 bytes
64#if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66#endif
67
68
69// ============================================================================
70// implementation
71// ============================================================================
72
// helper function of ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
82// ----------------------------------------------------------------------------
83// UTF-16 en/decoding to/from UCS-4 with surrogates handling
84// ----------------------------------------------------------------------------
85
86static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87{
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109}
110
111static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112{
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128}
129
130#ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132#else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134#endif // WC_UTF16/!WC_UTF16
135
136// returns the next UTF-32 character from the wchar_t buffer and advances the
137// pointer to the character after this one
138//
139// if an invalid character is found, *pSrc is set to NULL, the caller must
140// check for this
141static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142{
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152}
153
154// ----------------------------------------------------------------------------
155// wxMBConv
156// ----------------------------------------------------------------------------
157
158size_t
159wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161{
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complication come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
227 for ( ;; )
228 {
229 // try to convert the current chunk
230 size_t lenChunk = MB2WC(NULL, src, 0);
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
233
234 dstWritten += lenChunk;
235 if ( !srcEnd )
236 dstWritten++;
237
238 if ( !lenChunk )
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
254 if ( !srcEnd )
255 dst++;
256 }
257
258 if ( !srcEnd )
259 {
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow
262 break;
263 }
264
265 // advance the input pointer past the end of this chunk
266 while ( NotAllNULs(src, nulLen) )
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
272 src += nulLen;
273 }
274
275 src += nulLen; // skipping over its terminator as well
276
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
279 // delimited by srcEnd
280 if ( src >= srcEnd )
281 break;
282
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
286 if ( srcEnd )
287 {
288 dstWritten++;
289 if ( dst )
290 dst++;
291 }
292 }
293
294 return dstWritten;
295}
296
297size_t
298wxMBConv::FromWChar(char *dst, size_t dstLen,
299 const wchar_t *src, size_t srcLen) const
300{
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten = 0;
303
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated = srcLen == wxNO_LEN;
308
309 // make a copy of the input string unless it is already properly
310 // NUL-terminated
311 wxWCharBuffer bufTmp;
312 if ( isNulTerminated )
313 {
314 srcLen = wxWcslen(src) + 1;
315 }
316 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
317 {
318 // make a copy in order to properly NUL-terminate the string
319 bufTmp = wxWCharBuffer(srcLen);
320 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
321 src = bufTmp;
322 }
323
324 const size_t lenNul = GetMBNulLen();
325 for ( const wchar_t * const srcEnd = src + srcLen;
326 src < srcEnd;
327 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
328 {
329 // try to convert the current chunk
330 size_t lenChunk = WC2MB(NULL, src, 0);
331
332 if ( lenChunk == wxCONV_FAILED )
333 return wxCONV_FAILED;
334
335 dstWritten += lenChunk;
336 if ( isNulTerminated )
337 dstWritten += lenNul;
338
339 if ( dst )
340 {
341 if ( dstWritten > dstLen )
342 return wxCONV_FAILED;
343
344 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
345 return wxCONV_FAILED;
346
347 dst += lenChunk;
348 if ( isNulTerminated )
349 dst += lenNul;
350 }
351 }
352
353 return dstWritten;
354}
355
356size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
357{
358 // add 1 to available buffer length because MB2WC() parameter counts the
359 // number of non-NUL characters while ToWChar() counts everything
360 size_t rc = ToWChar(outBuff, outLen + 1, inBuff);
361 if ( rc != wxCONV_FAILED )
362 {
363 // ToWChar() returns the buffer length, i.e. including the trailing
364 // NUL, while this method doesn't take it into account
365 rc--;
366 }
367
368 return rc;
369}
370
371size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
372{
373 const size_t nulLen = GetMBNulLen();
374
375 size_t rc = FromWChar(outBuff, outLen + nulLen, inBuff);
376 if ( rc != wxCONV_FAILED )
377 {
378 rc -= nulLen;
379 }
380
381 return rc;
382}
383
384wxMBConv::~wxMBConv()
385{
386 // nothing to do here (necessary for Darwin linking probably)
387}
388
389const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
390{
391 if ( psz )
392 {
393 // calculate the length of the buffer needed first
394 const size_t nLen = ToWChar(NULL, 0, psz);
395 if ( nLen != wxCONV_FAILED )
396 {
397 // now do the actual conversion
398 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
399
400 // +1 for the trailing NULL
401 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
402 return buf;
403 }
404 }
405
406 return wxWCharBuffer();
407}
408
409const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
410{
411 if ( pwz )
412 {
413 const size_t nLen = FromWChar(NULL, 0, pwz);
414 if ( nLen != wxCONV_FAILED )
415 {
416 wxCharBuffer buf(nLen - 1);
417 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
418 return buf;
419 }
420 }
421
422 return wxCharBuffer();
423}
424
425const wxWCharBuffer
426wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
427{
428 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
429 if ( dstLen != wxCONV_FAILED )
430 {
431 // notice that we allocate space for dstLen+1 wide characters here
432 // because we want the buffer to always be NUL-terminated, even if the
433 // input isn't (as otherwise the caller has no way to know its length)
434 wxWCharBuffer wbuf(dstLen);
435 wbuf.data()[dstLen] = L'\0';
436 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
437 {
438 if ( outLen )
439 {
440 *outLen = dstLen;
441
442 // we also need to handle NUL-terminated input strings
443 // specially: for them the output is the length of the string
444 // excluding the trailing NUL, however if we're asked to
445 // convert a specific number of characters we return the length
446 // of the resulting output even if it's NUL-terminated
447 if ( inLen == wxNO_LEN )
448 (*outLen)--;
449 }
450
451 return wbuf;
452 }
453 }
454
455 if ( outLen )
456 *outLen = 0;
457
458 return wxWCharBuffer();
459}
460
461const wxCharBuffer
462wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
463{
464 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
465 if ( dstLen != wxCONV_FAILED )
466 {
467 const size_t nulLen = GetMBNulLen();
468
469 // as above, ensure that the buffer is always NUL-terminated, even if
470 // the input is not
471 wxCharBuffer buf(dstLen + nulLen - 1);
472 memset(buf.data() + dstLen, 0, nulLen);
473 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
474 {
475 if ( outLen )
476 {
477 *outLen = dstLen;
478
479 if ( inLen == wxNO_LEN )
480 {
481 // in this case both input and output are NUL-terminated
482 // and we're not supposed to count NUL
483 *outLen -= nulLen;
484 }
485 }
486
487 return buf;
488 }
489 }
490
491 if ( outLen )
492 *outLen = 0;
493
494 return wxCharBuffer();
495}
496
497// ----------------------------------------------------------------------------
498// wxMBConvLibc
499// ----------------------------------------------------------------------------
500
501size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
502{
503 return wxMB2WC(buf, psz, n);
504}
505
506size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
507{
508 return wxWC2MB(buf, psz, n);
509}
510
511// ----------------------------------------------------------------------------
512// wxConvBrokenFileNames
513// ----------------------------------------------------------------------------
514
#ifdef __UNIX__

wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    // both spellings of the UTF-8 charset name are recognized, ignoring case
    const bool isUTF8 = wxStricmp(charset, _T("UTF-8")) == 0 ||
                            wxStricmp(charset, _T("UTF8")) == 0;

    // UTF-8 gets the converter mapping invalid sequences to the PUA, any
    // other charset is handled by the generic wxCSConv
    if ( isUTF8 )
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
    else
        m_conv = new wxCSConv(charset);
}

#endif // __UNIX__
527
528// ----------------------------------------------------------------------------
529// UTF-7
530// ----------------------------------------------------------------------------
531
532// Implementation (C) 2004 Fredrik Roubert
533//
534// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
535
536//
537// BASE64 decoding table
538//
// indexed by the byte value: gives the 6 bit value of the BASE64 digits and
// 0xff for all the bytes which are not BASE64 digits
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x00..0x07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x08..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x10..0x17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x18..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x20..0x27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 0x28..0x2f: '+', '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // 0x30..0x37: '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x38..0x3f: '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 0x40..0x47: 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 0x48..0x4f: 'H'..'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 0x50..0x57: 'P'..'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x58..0x5f: 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 0x60..0x67: 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 0x68..0x6f: 'h'..'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 0x70..0x77: 'p'..'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x78..0x7f: 'x'..'z'

    // none of the bytes with the high bit set is a BASE64 digit
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x80..0x87
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x88..0x8f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x90..0x97
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x98..0x9f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa0..0xa7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa8..0xaf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb0..0xb7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb8..0xbf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc0..0xc7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc8..0xcf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd0..0xd7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd8..0xdf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe0..0xe7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe8..0xef
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xf0..0xf7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // 0xf8..0xff
};
574
575size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
576 const char *src, size_t srcLen) const
577{
578 DecoderState stateOrig,
579 *statePtr;
580 if ( srcLen == wxNO_LEN )
581 {
582 // convert the entire string, up to and including the trailing NUL
583 srcLen = strlen(src) + 1;
584
585 // when working on the entire strings we don't update nor use the shift
586 // state from the previous call
587 statePtr = &stateOrig;
588 }
589 else // when working with partial strings we do use the shift state
590 {
591 statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
592
593 // also save the old state to be able to rollback to it on error
594 stateOrig = m_stateDecoder;
595 }
596
597 // but to simplify the code below we use this variable in both cases
598 DecoderState& state = *statePtr;
599
600
601 // number of characters [which would have been] written to dst [if it were
602 // not NULL]
603 size_t len = 0;
604
605 const char * const srcEnd = src + srcLen;
606
607 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
608 {
609 const unsigned char cc = *src++;
610
611 if ( state.IsShifted() )
612 {
613 const unsigned char dc = utf7unb64[cc];
614 if ( dc == 0xff )
615 {
616 // end of encoded part, check that nothing was left: there can
617 // be up to 4 bits of 0 padding but nothing else (we also need
618 // to check isLSB as we count bits modulo 8 while a valid UTF-7
619 // encoded sequence must contain an integral number of UTF-16
620 // characters)
621 if ( state.isLSB || state.bit > 4 ||
622 (state.accum & ((1 << state.bit) - 1)) )
623 {
624 if ( !len )
625 state = stateOrig;
626
627 return wxCONV_FAILED;
628 }
629
630 state.ToDirect();
631
632 // re-parse this character normally below unless it's '-' which
633 // is consumed by the decoder
634 if ( cc == '-' )
635 continue;
636 }
637 else // valid encoded character
638 {
639 // mini base64 decoder: each character is 6 bits
640 state.bit += 6;
641 state.accum <<= 6;
642 state.accum += dc;
643
644 if ( state.bit >= 8 )
645 {
646 // got the full byte, consume it
647 state.bit -= 8;
648 unsigned char b = (state.accum >> state.bit) & 0x00ff;
649
650 if ( state.isLSB )
651 {
652 // we've got the full word, output it
653 if ( dst )
654 *dst++ = (state.msb << 8) | b;
655 len++;
656 state.isLSB = false;
657 }
658 else // MSB
659 {
660 // just store it while we wait for LSB
661 state.msb = b;
662 state.isLSB = true;
663 }
664 }
665 }
666 }
667
668 if ( state.IsDirect() )
669 {
670 // start of an encoded segment?
671 if ( cc == '+' )
672 {
673 if ( *src == '-' )
674 {
675 // just the encoded plus sign, don't switch to shifted mode
676 if ( dst )
677 *dst++ = '+';
678 len++;
679 src++;
680 }
681 else if ( utf7unb64[(unsigned)*src] == 0xff )
682 {
683 // empty encoded chunks are not allowed
684 if ( !len )
685 state = stateOrig;
686
687 return wxCONV_FAILED;
688 }
689 else // base-64 encoded chunk follows
690 {
691 state.ToShifted();
692 }
693 }
694 else // not '+'
695 {
696 // only printable 7 bit ASCII characters (with the exception of
697 // NUL, TAB, CR and LF) can be used directly
698 if ( cc >= 0x7f || (cc < ' ' &&
699 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
700 return wxCONV_FAILED;
701
702 if ( dst )
703 *dst++ = cc;
704 len++;
705 }
706 }
707 }
708
709 if ( !len )
710 {
711 // as we didn't read any characters we should be called with the same
712 // data (followed by some more new data) again later so don't save our
713 // state
714 state = stateOrig;
715
716 return wxCONV_FAILED;
717 }
718
719 return len;
720}
721
722//
723// BASE64 encoding table
724//
// gives the BASE64 digit used to represent each of the 64 possible values of
// a group of 6 bits
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',     //  0..7
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',     //  8..15
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',     // 16..23
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',     // 24..31
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',     // 32..39
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',     // 40..47
    'w', 'x', 'y', 'z', '0', '1', '2', '3',     // 48..55
    '4', '5', '6', '7', '8', '9', '+', '/'      // 56..63
};
736
737//
738// UTF-7 encoding table
739//
740// 0 - Set D (directly encoded characters)
741// 1 - Set O (optional direct characters)
742// 2 - whitespace characters (optional)
743// 3 - special characters
744//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 0x70..0x7f
};

// returns true if wc is one of the characters with the class 0 in the table
// above, i.e. one which is always written directly in UTF-7
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms and a negative value must
    // not be used as the table index, so compare it as unsigned: negative
    // values become huge ones and are rejected together with all the other
    // values outside of the table
    if ( static_cast<unsigned long>(wc) >= 0x80 )
        return false;

    return utf7encode[wc] < 1;
}
761
762size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
763 const wchar_t *src, size_t srcLen) const
764{
765 EncoderState stateOrig,
766 *statePtr;
767 if ( srcLen == wxNO_LEN )
768 {
769 // we don't apply the stored state when operating on entire strings at
770 // once
771 statePtr = &stateOrig;
772
773 srcLen = wxWcslen(src) + 1;
774 }
775 else // do use the mode we left the output in previously
776 {
777 stateOrig = m_stateEncoder;
778 statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
779 }
780
781 EncoderState& state = *statePtr;
782
783
784 size_t len = 0;
785
786 const wchar_t * const srcEnd = src + srcLen;
787 while ( src < srcEnd && (!dst || len < dstLen) )
788 {
789 wchar_t cc = *src++;
790 if ( wxIsUTF7Direct(cc) )
791 {
792 if ( state.IsShifted() )
793 {
794 // pad with zeros the last encoded block if necessary
795 if ( state.bit )
796 {
797 if ( dst )
798 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
799 len++;
800 }
801
802 state.ToDirect();
803
804 if ( dst )
805 *dst++ = '-';
806 len++;
807 }
808
809 if ( dst )
810 *dst++ = (char)cc;
811 len++;
812 }
813 else if ( cc == '+' && state.IsDirect() )
814 {
815 if ( dst )
816 {
817 *dst++ = '+';
818 *dst++ = '-';
819 }
820
821 len += 2;
822 }
823#ifndef WC_UTF16
824 else if (((wxUint32)cc) > 0xffff)
825 {
826 // no surrogate pair generation (yet?)
827 return wxCONV_FAILED;
828 }
829#endif
830 else
831 {
832 if ( state.IsDirect() )
833 {
834 state.ToShifted();
835
836 if ( dst )
837 *dst++ = '+';
838 len++;
839 }
840
841 // BASE64 encode string
842 for ( ;; )
843 {
844 for ( unsigned lsb = 0; lsb < 2; lsb++ )
845 {
846 state.accum <<= 8;
847 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
848
849 for (state.bit += 8; state.bit >= 6; )
850 {
851 state.bit -= 6;
852 if ( dst )
853 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
854 len++;
855 }
856 }
857
858 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
859 break;
860
861 src++;
862 }
863 }
864 }
865
866 // we need to restore the original encoder state if we were called just to
867 // calculate the amount of space needed as we will presumably be called
868 // again to really convert the data now
869 if ( !dst )
870 state = stateOrig;
871
872 return len;
873}
874
875// ----------------------------------------------------------------------------
876// UTF-8
877// ----------------------------------------------------------------------------
878
879static const wxUint32 utf8_max[]=
880 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
881
882// boundaries of the private use area we use to (temporarily) remap invalid
883// characters invalid in a UTF-8 encoded string
884const wxUint32 wxUnicodePUA = 0x100000;
885const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
886
887// this table gives the length of the UTF-8 encoding from its first character:
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded as themselves, using one byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: can't be the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0 and C1 are invalid as well
    0, 0,

    // C2..DF: first byte of a two-byte sequence
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: first byte of a three-byte sequence
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4: first byte of a four-byte sequence
    4, 4, 4, 4, 4,

    // F5..FF: invalid again, these bytes would start either 5- or 6-byte
    // sequences or sequences for code points above U+10FFFF, none of which
    // is allowed by RFC 3629
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
921
922size_t
923wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
924 const char *src, size_t srcLen) const
925{
926 wchar_t *out = dstLen ? dst : NULL;
927 size_t written = 0;
928
929 if ( srcLen == wxNO_LEN )
930 srcLen = strlen(src) + 1;
931
932 for ( const char *p = src; ; p++ )
933 {
934 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
935 {
936 // all done successfully, just add the trailing NULL if we are not
937 // using explicit length
938 if ( srcLen == wxNO_LEN )
939 {
940 if ( out )
941 {
942 if ( !dstLen )
943 break;
944
945 *out = L'\0';
946 }
947
948 written++;
949 }
950
951 return written;
952 }
953
954 if ( out && !dstLen-- )
955 break;
956
957 wxUint32 code;
958 unsigned char c = *p;
959
960 if ( c < 0x80 )
961 {
962 if ( srcLen == 0 ) // the test works for wxNO_LEN too
963 break;
964
965 if ( srcLen != wxNO_LEN )
966 srcLen--;
967
968 code = c;
969 }
970 else
971 {
972 unsigned len = tableUtf8Lengths[c];
973 if ( !len )
974 break;
975
976 if ( srcLen < len ) // the test works for wxNO_LEN too
977 break;
978
979 if ( srcLen != wxNO_LEN )
980 srcLen -= len;
981
982 // Char. number range | UTF-8 octet sequence
983 // (hexadecimal) | (binary)
984 // ----------------------+----------------------------------------
985 // 0000 0000 - 0000 007F | 0xxxxxxx
986 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
987 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
988 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
989 //
990 // Code point value is stored in bits marked with 'x',
991 // lowest-order bit of the value on the right side in the diagram
992 // above. (from RFC 3629)
993
994 // mask to extract lead byte's value ('x' bits above), by sequence
995 // length:
996 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
997
998 // mask and value of lead byte's most significant bits, by length:
999 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1000 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1001
1002 len--; // it's more convenient to work with 0-based length here
1003
1004 // extract the lead byte's value bits:
1005 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1006 break;
1007
1008 code = c & leadValueMask[len];
1009
1010 // all remaining bytes, if any, are handled in the same way
1011 // regardless of sequence's length:
1012 for ( ; len; --len )
1013 {
1014 c = *++p;
1015 if ( (c & 0xC0) != 0x80 )
1016 return wxCONV_FAILED;
1017
1018 code <<= 6;
1019 code |= c & 0x3F;
1020 }
1021 }
1022
1023#ifdef WC_UTF16
1024 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1025 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1026 {
1027 if ( out )
1028 out++;
1029 written++;
1030 }
1031#else // !WC_UTF16
1032 if ( out )
1033 *out = code;
1034#endif // WC_UTF16/!WC_UTF16
1035
1036 if ( out )
1037 out++;
1038
1039 written++;
1040 }
1041
1042 return wxCONV_FAILED;
1043}
1044
1045size_t
1046wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1047 const wchar_t *src, size_t srcLen) const
1048{
1049 char *out = dstLen ? dst : NULL;
1050 size_t written = 0;
1051
1052 for ( const wchar_t *wp = src; ; wp++ )
1053 {
1054 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1055 {
1056 // all done successfully, just add the trailing NULL if we are not
1057 // using explicit length
1058 if ( srcLen == wxNO_LEN )
1059 {
1060 if ( out )
1061 {
1062 if ( !dstLen )
1063 break;
1064
1065 *out = '\0';
1066 }
1067
1068 written++;
1069 }
1070
1071 return written;
1072 }
1073
1074 if ( srcLen != wxNO_LEN )
1075 srcLen--;
1076
1077 wxUint32 code;
1078#ifdef WC_UTF16
1079 // cast is ok for WC_UTF16
1080 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1081 {
1082 // skip the next char too as we decoded a surrogate
1083 wp++;
1084 }
1085#else // wchar_t is UTF-32
1086 code = *wp & 0x7fffffff;
1087#endif
1088
1089 unsigned len;
1090 if ( code <= 0x7F )
1091 {
1092 len = 1;
1093 if ( out )
1094 {
1095 if ( dstLen < len )
1096 break;
1097
1098 out[0] = (char)code;
1099 }
1100 }
1101 else if ( code <= 0x07FF )
1102 {
1103 len = 2;
1104 if ( out )
1105 {
1106 if ( dstLen < len )
1107 break;
1108
1109 // NB: this line takes 6 least significant bits, encodes them as
1110 // 10xxxxxx and discards them so that the next byte can be encoded:
1111 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1112 out[0] = 0xC0 | code;
1113 }
1114 }
1115 else if ( code < 0xFFFF )
1116 {
1117 len = 3;
1118 if ( out )
1119 {
1120 if ( dstLen < len )
1121 break;
1122
1123 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1124 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1125 out[0] = 0xE0 | code;
1126 }
1127 }
1128 else if ( code <= 0x10FFFF )
1129 {
1130 len = 4;
1131 if ( out )
1132 {
1133 if ( dstLen < len )
1134 break;
1135
1136 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1137 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1138 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1139 out[0] = 0xF0 | code;
1140 }
1141 }
1142 else
1143 {
1144 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1145 break;
1146 }
1147
1148 if ( out )
1149 {
1150 out += len;
1151 dstLen -= len;
1152 }
1153
1154 written += len;
1155 }
1156
1157 // we only get here if an error occurs during decoding
1158 return wxCONV_FAILED;
1159}
1160
1161size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1162 const char *psz, size_t srcLen) const
1163{
1164 if ( m_options == MAP_INVALID_UTF8_NOT )
1165 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1166
1167 size_t len = 0;
1168
1169 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1170 {
1171 const char *opsz = psz;
1172 bool invalid = false;
1173 unsigned char cc = *psz++, fc = cc;
1174 unsigned cnt;
1175 for (cnt = 0; fc & 0x80; cnt++)
1176 fc <<= 1;
1177
1178 if (!cnt)
1179 {
1180 // plain ASCII char
1181 if (buf)
1182 *buf++ = cc;
1183 len++;
1184
1185 // escape the escape character for octal escapes
1186 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1187 && cc == '\\' && (!buf || len < n))
1188 {
1189 if (buf)
1190 *buf++ = cc;
1191 len++;
1192 }
1193 }
1194 else
1195 {
1196 cnt--;
1197 if (!cnt)
1198 {
1199 // invalid UTF-8 sequence
1200 invalid = true;
1201 }
1202 else
1203 {
1204 unsigned ocnt = cnt - 1;
1205 wxUint32 res = cc & (0x3f >> cnt);
1206 while (cnt--)
1207 {
1208 cc = *psz;
1209 if ((cc & 0xC0) != 0x80)
1210 {
1211 // invalid UTF-8 sequence
1212 invalid = true;
1213 break;
1214 }
1215
1216 psz++;
1217 res = (res << 6) | (cc & 0x3f);
1218 }
1219
1220 if (invalid || res <= utf8_max[ocnt])
1221 {
1222 // illegal UTF-8 encoding
1223 invalid = true;
1224 }
1225 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1226 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1227 {
1228 // if one of our PUA characters turns up externally
1229 // it must also be treated as an illegal sequence
1230 // (a bit like you have to escape an escape character)
1231 invalid = true;
1232 }
1233 else
1234 {
1235#ifdef WC_UTF16
1236 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1237 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1238 if (pa == wxCONV_FAILED)
1239 {
1240 invalid = true;
1241 }
1242 else
1243 {
1244 if (buf)
1245 buf += pa;
1246 len += pa;
1247 }
1248#else // !WC_UTF16
1249 if (buf)
1250 *buf++ = (wchar_t)res;
1251 len++;
1252#endif // WC_UTF16/!WC_UTF16
1253 }
1254 }
1255
1256 if (invalid)
1257 {
1258 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1259 {
1260 while (opsz < psz && (!buf || len < n))
1261 {
1262#ifdef WC_UTF16
1263 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1264 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1265 wxASSERT(pa != wxCONV_FAILED);
1266 if (buf)
1267 buf += pa;
1268 opsz++;
1269 len += pa;
1270#else
1271 if (buf)
1272 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1273 opsz++;
1274 len++;
1275#endif
1276 }
1277 }
1278 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1279 {
1280 while (opsz < psz && (!buf || len < n))
1281 {
1282 if ( buf && len + 3 < n )
1283 {
1284 unsigned char on = *opsz;
1285 *buf++ = L'\\';
1286 *buf++ = (wchar_t)( L'0' + on / 0100 );
1287 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1288 *buf++ = (wchar_t)( L'0' + on % 010 );
1289 }
1290
1291 opsz++;
1292 len += 4;
1293 }
1294 }
1295 else // MAP_INVALID_UTF8_NOT
1296 {
1297 return wxCONV_FAILED;
1298 }
1299 }
1300 }
1301 }
1302
1303 if (srcLen == wxNO_LEN && buf && (len < n))
1304 *buf = 0;
1305
1306 return len + 1;
1307}
1308
// Return true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1313
1314size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1315 const wchar_t *psz, size_t srcLen) const
1316{
1317 if ( m_options == MAP_INVALID_UTF8_NOT )
1318 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1319
1320 size_t len = 0;
1321
1322 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1323 {
1324 wxUint32 cc;
1325
1326#ifdef WC_UTF16
1327 // cast is ok for WC_UTF16
1328 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1329 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1330#else
1331 cc = (*psz++) & 0x7fffffff;
1332#endif
1333
1334 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1335 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1336 {
1337 if (buf)
1338 *buf++ = (char)(cc - wxUnicodePUA);
1339 len++;
1340 }
1341 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1342 && cc == L'\\' && psz[0] == L'\\' )
1343 {
1344 if (buf)
1345 *buf++ = (char)cc;
1346 psz++;
1347 len++;
1348 }
1349 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1350 cc == L'\\' &&
1351 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1352 {
1353 if (buf)
1354 {
1355 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1356 (psz[1] - L'0') * 010 +
1357 (psz[2] - L'0'));
1358 }
1359
1360 psz += 3;
1361 len++;
1362 }
1363 else
1364 {
1365 unsigned cnt;
1366 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1367 {
1368 }
1369
1370 if (!cnt)
1371 {
1372 // plain ASCII char
1373 if (buf)
1374 *buf++ = (char) cc;
1375 len++;
1376 }
1377 else
1378 {
1379 len += cnt + 1;
1380 if (buf)
1381 {
1382 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1383 while (cnt--)
1384 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1385 }
1386 }
1387 }
1388 }
1389
1390 if (srcLen == wxNO_LEN && buf && (len < n))
1391 *buf = 0;
1392
1393 return len + 1;
1394}
1395
1396// ============================================================================
1397// UTF-16
1398// ============================================================================
1399
// Give endianness-relative names to the two UTF-16 converters: "straight" is
// the one whose byte order is the same as this machine's and "swap" the other
// one, which needs to exchange the bytes of each 16 bit unit.
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // big endian machine
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1407
1408/* static */
1409size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1410{
1411 if ( srcLen == wxNO_LEN )
1412 {
1413 // count the number of bytes in input, including the trailing NULs
1414 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1415 for ( srcLen = 1; *inBuff++; srcLen++ )
1416 ;
1417
1418 srcLen *= BYTES_PER_CHAR;
1419 }
1420 else // we already have the length
1421 {
1422 // we can only convert an entire number of UTF-16 characters
1423 if ( srcLen % BYTES_PER_CHAR )
1424 return wxCONV_FAILED;
1425 }
1426
1427 return srcLen;
1428}
1429
1430// case when in-memory representation is UTF-16 too
1431#ifdef WC_UTF16
1432
1433// ----------------------------------------------------------------------------
1434// conversions without endianness change
1435// ----------------------------------------------------------------------------
1436
1437size_t
1438wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1439 const char *src, size_t srcLen) const
1440{
1441 // set up the scene for using memcpy() (which is presumably more efficient
1442 // than copying the bytes one by one)
1443 srcLen = GetLength(src, srcLen);
1444 if ( srcLen == wxNO_LEN )
1445 return wxCONV_FAILED;
1446
1447 const size_t inLen = srcLen / BYTES_PER_CHAR;
1448 if ( dst )
1449 {
1450 if ( dstLen < inLen )
1451 return wxCONV_FAILED;
1452
1453 memcpy(dst, src, srcLen);
1454 }
1455
1456 return inLen;
1457}
1458
1459size_t
1460wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1461 const wchar_t *src, size_t srcLen) const
1462{
1463 if ( srcLen == wxNO_LEN )
1464 srcLen = wxWcslen(src) + 1;
1465
1466 srcLen *= BYTES_PER_CHAR;
1467
1468 if ( dst )
1469 {
1470 if ( dstLen < srcLen )
1471 return wxCONV_FAILED;
1472
1473 memcpy(dst, src, srcLen);
1474 }
1475
1476 return srcLen;
1477}
1478
1479// ----------------------------------------------------------------------------
1480// endian-reversing conversions
1481// ----------------------------------------------------------------------------
1482
1483size_t
1484wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1485 const char *src, size_t srcLen) const
1486{
1487 srcLen = GetLength(src, srcLen);
1488 if ( srcLen == wxNO_LEN )
1489 return wxCONV_FAILED;
1490
1491 srcLen /= BYTES_PER_CHAR;
1492
1493 if ( dst )
1494 {
1495 if ( dstLen < srcLen )
1496 return wxCONV_FAILED;
1497
1498 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1499 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1500 {
1501 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1502 }
1503 }
1504
1505 return srcLen;
1506}
1507
1508size_t
1509wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1510 const wchar_t *src, size_t srcLen) const
1511{
1512 if ( srcLen == wxNO_LEN )
1513 srcLen = wxWcslen(src) + 1;
1514
1515 srcLen *= BYTES_PER_CHAR;
1516
1517 if ( dst )
1518 {
1519 if ( dstLen < srcLen )
1520 return wxCONV_FAILED;
1521
1522 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1523 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1524 {
1525 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1526 }
1527 }
1528
1529 return srcLen;
1530}
1531
1532#else // !WC_UTF16: wchar_t is UTF-32
1533
1534// ----------------------------------------------------------------------------
1535// conversions without endianness change
1536// ----------------------------------------------------------------------------
1537
1538size_t
1539wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1540 const char *src, size_t srcLen) const
1541{
1542 srcLen = GetLength(src, srcLen);
1543 if ( srcLen == wxNO_LEN )
1544 return wxCONV_FAILED;
1545
1546 const size_t inLen = srcLen / BYTES_PER_CHAR;
1547 if ( !dst )
1548 {
1549 // optimization: return maximal space which could be needed for this
1550 // string even if the real size could be smaller if the buffer contains
1551 // any surrogates
1552 return inLen;
1553 }
1554
1555 size_t outLen = 0;
1556 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1557 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1558 {
1559 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1560 if ( !inBuff )
1561 return wxCONV_FAILED;
1562
1563 if ( ++outLen > dstLen )
1564 return wxCONV_FAILED;
1565
1566 *dst++ = ch;
1567 }
1568
1569
1570 return outLen;
1571}
1572
1573size_t
1574wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1575 const wchar_t *src, size_t srcLen) const
1576{
1577 if ( srcLen == wxNO_LEN )
1578 srcLen = wxWcslen(src) + 1;
1579
1580 size_t outLen = 0;
1581 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1582 for ( size_t n = 0; n < srcLen; n++ )
1583 {
1584 wxUint16 cc[2];
1585 const size_t numChars = encode_utf16(*src++, cc);
1586 if ( numChars == wxCONV_FAILED )
1587 return wxCONV_FAILED;
1588
1589 outLen += numChars * BYTES_PER_CHAR;
1590 if ( outBuff )
1591 {
1592 if ( outLen > dstLen )
1593 return wxCONV_FAILED;
1594
1595 *outBuff++ = cc[0];
1596 if ( numChars == 2 )
1597 {
1598 // second character of a surrogate
1599 *outBuff++ = cc[1];
1600 }
1601 }
1602 }
1603
1604 return outLen;
1605}
1606
1607// ----------------------------------------------------------------------------
1608// endian-reversing conversions
1609// ----------------------------------------------------------------------------
1610
1611size_t
1612wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1613 const char *src, size_t srcLen) const
1614{
1615 srcLen = GetLength(src, srcLen);
1616 if ( srcLen == wxNO_LEN )
1617 return wxCONV_FAILED;
1618
1619 const size_t inLen = srcLen / BYTES_PER_CHAR;
1620 if ( !dst )
1621 {
1622 // optimization: return maximal space which could be needed for this
1623 // string even if the real size could be smaller if the buffer contains
1624 // any surrogates
1625 return inLen;
1626 }
1627
1628 size_t outLen = 0;
1629 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1630 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1631 {
1632 wxUint32 ch;
1633 wxUint16 tmp[2];
1634
1635 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1636 inBuff++;
1637 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1638
1639 const size_t numChars = decode_utf16(tmp, ch);
1640 if ( numChars == wxCONV_FAILED )
1641 return wxCONV_FAILED;
1642
1643 if ( numChars == 2 )
1644 inBuff++;
1645
1646 if ( ++outLen > dstLen )
1647 return wxCONV_FAILED;
1648
1649 *dst++ = ch;
1650 }
1651
1652
1653 return outLen;
1654}
1655
1656size_t
1657wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1658 const wchar_t *src, size_t srcLen) const
1659{
1660 if ( srcLen == wxNO_LEN )
1661 srcLen = wxWcslen(src) + 1;
1662
1663 size_t outLen = 0;
1664 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1665 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1666 {
1667 wxUint16 cc[2];
1668 const size_t numChars = encode_utf16(*src, cc);
1669 if ( numChars == wxCONV_FAILED )
1670 return wxCONV_FAILED;
1671
1672 outLen += numChars * BYTES_PER_CHAR;
1673 if ( outBuff )
1674 {
1675 if ( outLen > dstLen )
1676 return wxCONV_FAILED;
1677
1678 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1679 if ( numChars == 2 )
1680 {
1681 // second character of a surrogate
1682 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1683 }
1684 }
1685 }
1686
1687 return outLen;
1688}
1689
1690#endif // WC_UTF16/!WC_UTF16
1691
1692
1693// ============================================================================
1694// UTF-32
1695// ============================================================================
1696
1697#ifdef WORDS_BIGENDIAN
1698 #define wxMBConvUTF32straight wxMBConvUTF32BE
1699 #define wxMBConvUTF32swap wxMBConvUTF32LE
1700#else
1701 #define wxMBConvUTF32swap wxMBConvUTF32BE
1702 #define wxMBConvUTF32straight wxMBConvUTF32LE
1703#endif
1704
1705
1706WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1707WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1708
1709/* static */
1710size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1711{
1712 if ( srcLen == wxNO_LEN )
1713 {
1714 // count the number of bytes in input, including the trailing NULs
1715 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1716 for ( srcLen = 1; *inBuff++; srcLen++ )
1717 ;
1718
1719 srcLen *= BYTES_PER_CHAR;
1720 }
1721 else // we already have the length
1722 {
1723 // we can only convert an entire number of UTF-32 characters
1724 if ( srcLen % BYTES_PER_CHAR )
1725 return wxCONV_FAILED;
1726 }
1727
1728 return srcLen;
1729}
1730
1731// case when in-memory representation is UTF-16
1732#ifdef WC_UTF16
1733
1734// ----------------------------------------------------------------------------
1735// conversions without endianness change
1736// ----------------------------------------------------------------------------
1737
1738size_t
1739wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1740 const char *src, size_t srcLen) const
1741{
1742 srcLen = GetLength(src, srcLen);
1743 if ( srcLen == wxNO_LEN )
1744 return wxCONV_FAILED;
1745
1746 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1747 const size_t inLen = srcLen / BYTES_PER_CHAR;
1748 size_t outLen = 0;
1749 for ( size_t n = 0; n < inLen; n++ )
1750 {
1751 wxUint16 cc[2];
1752 const size_t numChars = encode_utf16(*inBuff++, cc);
1753 if ( numChars == wxCONV_FAILED )
1754 return wxCONV_FAILED;
1755
1756 outLen += numChars;
1757 if ( dst )
1758 {
1759 if ( outLen > dstLen )
1760 return wxCONV_FAILED;
1761
1762 *dst++ = cc[0];
1763 if ( numChars == 2 )
1764 {
1765 // second character of a surrogate
1766 *dst++ = cc[1];
1767 }
1768 }
1769 }
1770
1771 return outLen;
1772}
1773
1774size_t
1775wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1776 const wchar_t *src, size_t srcLen) const
1777{
1778 if ( srcLen == wxNO_LEN )
1779 srcLen = wxWcslen(src) + 1;
1780
1781 if ( !dst )
1782 {
1783 // optimization: return maximal space which could be needed for this
1784 // string instead of the exact amount which could be less if there are
1785 // any surrogates in the input
1786 //
1787 // we consider that surrogates are rare enough to make it worthwhile to
1788 // avoid running the loop below at the cost of slightly extra memory
1789 // consumption
1790 return srcLen * BYTES_PER_CHAR;
1791 }
1792
1793 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1794 size_t outLen = 0;
1795 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1796 {
1797 const wxUint32 ch = wxDecodeSurrogate(&src);
1798 if ( !src )
1799 return wxCONV_FAILED;
1800
1801 outLen += BYTES_PER_CHAR;
1802
1803 if ( outLen > dstLen )
1804 return wxCONV_FAILED;
1805
1806 *outBuff++ = ch;
1807 }
1808
1809 return outLen;
1810}
1811
1812// ----------------------------------------------------------------------------
1813// endian-reversing conversions
1814// ----------------------------------------------------------------------------
1815
1816size_t
1817wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1818 const char *src, size_t srcLen) const
1819{
1820 srcLen = GetLength(src, srcLen);
1821 if ( srcLen == wxNO_LEN )
1822 return wxCONV_FAILED;
1823
1824 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1825 const size_t inLen = srcLen / BYTES_PER_CHAR;
1826 size_t outLen = 0;
1827 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1828 {
1829 wxUint16 cc[2];
1830 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1831 if ( numChars == wxCONV_FAILED )
1832 return wxCONV_FAILED;
1833
1834 outLen += numChars;
1835 if ( dst )
1836 {
1837 if ( outLen > dstLen )
1838 return wxCONV_FAILED;
1839
1840 *dst++ = cc[0];
1841 if ( numChars == 2 )
1842 {
1843 // second character of a surrogate
1844 *dst++ = cc[1];
1845 }
1846 }
1847 }
1848
1849 return outLen;
1850}
1851
1852size_t
1853wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1854 const wchar_t *src, size_t srcLen) const
1855{
1856 if ( srcLen == wxNO_LEN )
1857 srcLen = wxWcslen(src) + 1;
1858
1859 if ( !dst )
1860 {
1861 // optimization: return maximal space which could be needed for this
1862 // string instead of the exact amount which could be less if there are
1863 // any surrogates in the input
1864 //
1865 // we consider that surrogates are rare enough to make it worthwhile to
1866 // avoid running the loop below at the cost of slightly extra memory
1867 // consumption
1868 return srcLen*BYTES_PER_CHAR;
1869 }
1870
1871 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1872 size_t outLen = 0;
1873 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1874 {
1875 const wxUint32 ch = wxDecodeSurrogate(&src);
1876 if ( !src )
1877 return wxCONV_FAILED;
1878
1879 outLen += BYTES_PER_CHAR;
1880
1881 if ( outLen > dstLen )
1882 return wxCONV_FAILED;
1883
1884 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1885 }
1886
1887 return outLen;
1888}
1889
1890#else // !WC_UTF16: wchar_t is UTF-32
1891
1892// ----------------------------------------------------------------------------
1893// conversions without endianness change
1894// ----------------------------------------------------------------------------
1895
1896size_t
1897wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1898 const char *src, size_t srcLen) const
1899{
1900 // use memcpy() as it should be much faster than hand-written loop
1901 srcLen = GetLength(src, srcLen);
1902 if ( srcLen == wxNO_LEN )
1903 return wxCONV_FAILED;
1904
1905 const size_t inLen = srcLen/BYTES_PER_CHAR;
1906 if ( dst )
1907 {
1908 if ( dstLen < inLen )
1909 return wxCONV_FAILED;
1910
1911 memcpy(dst, src, srcLen);
1912 }
1913
1914 return inLen;
1915}
1916
1917size_t
1918wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1919 const wchar_t *src, size_t srcLen) const
1920{
1921 if ( srcLen == wxNO_LEN )
1922 srcLen = wxWcslen(src) + 1;
1923
1924 srcLen *= BYTES_PER_CHAR;
1925
1926 if ( dst )
1927 {
1928 if ( dstLen < srcLen )
1929 return wxCONV_FAILED;
1930
1931 memcpy(dst, src, srcLen);
1932 }
1933
1934 return srcLen;
1935}
1936
1937// ----------------------------------------------------------------------------
1938// endian-reversing conversions
1939// ----------------------------------------------------------------------------
1940
1941size_t
1942wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1943 const char *src, size_t srcLen) const
1944{
1945 srcLen = GetLength(src, srcLen);
1946 if ( srcLen == wxNO_LEN )
1947 return wxCONV_FAILED;
1948
1949 srcLen /= BYTES_PER_CHAR;
1950
1951 if ( dst )
1952 {
1953 if ( dstLen < srcLen )
1954 return wxCONV_FAILED;
1955
1956 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1957 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1958 {
1959 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1960 }
1961 }
1962
1963 return srcLen;
1964}
1965
1966size_t
1967wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1968 const wchar_t *src, size_t srcLen) const
1969{
1970 if ( srcLen == wxNO_LEN )
1971 srcLen = wxWcslen(src) + 1;
1972
1973 srcLen *= BYTES_PER_CHAR;
1974
1975 if ( dst )
1976 {
1977 if ( dstLen < srcLen )
1978 return wxCONV_FAILED;
1979
1980 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1981 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1982 {
1983 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1984 }
1985 }
1986
1987 return srcLen;
1988}
1989
1990#endif // WC_UTF16/!WC_UTF16
1991
1992
1993// ============================================================================
1994// The classes doing conversion using the iconv_xxx() functions
1995// ============================================================================
1996
1997#ifdef HAVE_ICONV
1998
// ICONV_FAILED() tests whether a call to iconv() failed, given its return
// value and the number of bytes left in the input buffer after the call.
//
// Normally checking the return value for (size_t)-1 is enough, but iconv()
// from glibc 2.1.3 is broken: conversion to/from UCS4 fails with E2BIG if the
// output buffer is _exactly_ as big as needed. Unless there is yet another
// bug in glibc, this is the only case in which iconv() returns an error while
// leaving 0 bytes in the input buffer (after a real error the input is not
// entirely consumed), so for these versions an E2BIG error with nothing left
// in the input is not considered to be a failure. [This bug does not appear
// in glibc 2.2.]
#if !defined(__GLIBC__) || __GLIBC__ != 2 || __GLIBC_MINOR__ > 1
    #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
#else // glibc 2.0 or 2.1
    #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                         (errno != E2BIG || bufLeft != 0))
#endif

// cast a pointer to the input buffer pointer to the type expected by iconv()
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value returned by iconv_open() on failure
#define ICONV_T_INVALID ((iconv_t)-1)
2017
2018#if SIZEOF_WCHAR_T == 4
2019 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2020 #define WC_ENC wxFONTENCODING_UTF32
2021#elif SIZEOF_WCHAR_T == 2
2022 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2023 #define WC_ENC wxFONTENCODING_UTF16
2024#else // sizeof(wchar_t) != 2 nor 4
2025 // does this ever happen?
2026 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2027#endif
2028
2029// ----------------------------------------------------------------------------
2030// wxMBConv_iconv: encapsulates an iconv character set
2031// ----------------------------------------------------------------------------
2032
2033class wxMBConv_iconv : public wxMBConv
2034{
2035public:
2036 wxMBConv_iconv(const char *name);
2037 virtual ~wxMBConv_iconv();
2038
2039 // implement base class virtual methods
2040 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2041 const char *src, size_t srcLen = wxNO_LEN) const;
2042 virtual size_t FromWChar(char *dst, size_t dstLen,
2043 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2044 virtual size_t GetMBNulLen() const;
2045
2046#if wxUSE_UNICODE_UTF8
2047 virtual bool IsUTF8() const;
2048#endif
2049
2050 virtual wxMBConv *Clone() const
2051 {
2052 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2053 p->m_minMBCharWidth = m_minMBCharWidth;
2054 return p;
2055 }
2056
2057 bool IsOk() const
2058 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2059
2060protected:
2061 // the iconv handlers used to translate from multibyte
2062 // to wide char and in the other direction
2063 iconv_t m2w,
2064 w2m;
2065
2066#if wxUSE_THREADS
2067 // guards access to m2w and w2m objects
2068 wxMutex m_iconvMutex;
2069#endif
2070
2071private:
2072 // the name (for iconv_open()) of a wide char charset -- if none is
2073 // available on this machine, it will remain NULL
2074 static wxString ms_wcCharsetName;
2075
2076 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2077 // different endian-ness than the native one
2078 static bool ms_wcNeedsSwap;
2079
2080
2081 // name of the encoding handled by this conversion
2082 wxString m_name;
2083
2084 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2085 // initially
2086 size_t m_minMBCharWidth;
2087};
2088
2089// make the constructor available for unit testing
2090WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2091{
2092 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2093 if ( !result->IsOk() )
2094 {
2095 delete result;
2096 return 0;
2097 }
2098
2099 return result;
2100}
2101
2102wxString wxMBConv_iconv::ms_wcCharsetName;
2103bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2104
2105wxMBConv_iconv::wxMBConv_iconv(const char *name)
2106 : m_name(name)
2107{
2108 m_minMBCharWidth = 0;
2109
2110 // check for charset that represents wchar_t:
2111 if ( ms_wcCharsetName.empty() )
2112 {
2113 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2114
2115#if wxUSE_FONTMAP
2116 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2117#else // !wxUSE_FONTMAP
2118 static const wxChar *names_static[] =
2119 {
2120#if SIZEOF_WCHAR_T == 4
2121 _T("UCS-4"),
2122#elif SIZEOF_WCHAR_T = 2
2123 _T("UCS-2"),
2124#endif
2125 NULL
2126 };
2127 const wxChar **names = names_static;
2128#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2129
2130 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2131 {
2132 const wxString nameCS(*names);
2133
2134 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2135 wxString nameXE(nameCS);
2136
2137#ifdef WORDS_BIGENDIAN
2138 nameXE += _T("BE");
2139#else // little endian
2140 nameXE += _T("LE");
2141#endif
2142
2143 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2144 nameXE.c_str());
2145
2146 m2w = iconv_open(nameXE.ToAscii(), name);
2147 if ( m2w == ICONV_T_INVALID )
2148 {
2149 // try charset w/o bytesex info (e.g. "UCS4")
2150 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2151 nameCS.c_str());
2152 m2w = iconv_open(nameCS.ToAscii(), name);
2153
2154 // and check for bytesex ourselves:
2155 if ( m2w != ICONV_T_INVALID )
2156 {
2157 char buf[2], *bufPtr;
2158 wchar_t wbuf[2];
2159 size_t insz, outsz;
2160 size_t res;
2161
2162 buf[0] = 'A';
2163 buf[1] = 0;
2164 wbuf[0] = 0;
2165 insz = 2;
2166 outsz = SIZEOF_WCHAR_T * 2;
2167 char* wbufPtr = (char*)wbuf;
2168 bufPtr = buf;
2169
2170 res = iconv(
2171 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2172 &wbufPtr, &outsz);
2173
2174 if (ICONV_FAILED(res, insz))
2175 {
2176 wxLogLastError(wxT("iconv"));
2177 wxLogError(_("Conversion to charset '%s' doesn't work."),
2178 nameCS.c_str());
2179 }
2180 else // ok, can convert to this encoding, remember it
2181 {
2182 ms_wcCharsetName = nameCS;
2183 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2184 }
2185 }
2186 }
2187 else // use charset not requiring byte swapping
2188 {
2189 ms_wcCharsetName = nameXE;
2190 }
2191 }
2192
2193 wxLogTrace(TRACE_STRCONV,
2194 wxT("iconv wchar_t charset is \"%s\"%s"),
2195 ms_wcCharsetName.empty() ? wxString("<none>")
2196 : ms_wcCharsetName,
2197 ms_wcNeedsSwap ? _T(" (needs swap)")
2198 : _T(""));
2199 }
2200 else // we already have ms_wcCharsetName
2201 {
2202 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2203 }
2204
2205 if ( ms_wcCharsetName.empty() )
2206 {
2207 w2m = ICONV_T_INVALID;
2208 }
2209 else
2210 {
2211 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2212 if ( w2m == ICONV_T_INVALID )
2213 {
2214 wxLogTrace(TRACE_STRCONV,
2215 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2216 ms_wcCharsetName.c_str(), name);
2217 }
2218 }
2219}
2220
2221wxMBConv_iconv::~wxMBConv_iconv()
2222{
2223 if ( m2w != ICONV_T_INVALID )
2224 iconv_close(m2w);
2225 if ( w2m != ICONV_T_INVALID )
2226 iconv_close(w2m);
2227}
2228
2229size_t
2230wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2231 const char *src, size_t srcLen) const
2232{
2233 if ( srcLen == wxNO_LEN )
2234 {
2235 // find the string length: notice that must be done differently for
2236 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2237 // consecutive NULs
2238 const size_t nulLen = GetMBNulLen();
2239 switch ( nulLen )
2240 {
2241 default:
2242 return wxCONV_FAILED;
2243
2244 case 1:
2245 srcLen = strlen(src); // arguably more optimized than our version
2246 break;
2247
2248 case 2:
2249 case 4:
2250 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2251 // but they also have to start at character boundary and not
2252 // span two adjacent characters
2253 const char *p;
2254 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2255 ;
2256 srcLen = p - src;
2257 break;
2258 }
2259
2260 // when we're determining the length of the string ourselves we count
2261 // the terminating NUL(s) as part of it and always NUL-terminate the
2262 // output
2263 srcLen += nulLen;
2264 }
2265
2266 // we express length in the number of (wide) characters but iconv always
2267 // counts buffer sizes it in bytes
2268 dstLen *= SIZEOF_WCHAR_T;
2269
2270#if wxUSE_THREADS
2271 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2272 // Unfortunately there are a couple of global wxCSConv objects such as
2273 // wxConvLocal that are used all over wx code, so we have to make sure
2274 // the handle is used by at most one thread at the time. Otherwise
2275 // only a few wx classes would be safe to use from non-main threads
2276 // as MB<->WC conversion would fail "randomly".
2277 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2278#endif // wxUSE_THREADS
2279
2280 size_t res, cres;
2281 const char *pszPtr = src;
2282
2283 if ( dst )
2284 {
2285 char* bufPtr = (char*)dst;
2286
2287 // have destination buffer, convert there
2288 size_t dstLenOrig = dstLen;
2289 cres = iconv(m2w,
2290 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2291 &bufPtr, &dstLen);
2292
2293 // convert the number of bytes converted as returned by iconv to the
2294 // number of (wide) characters converted that we need
2295 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2296
2297 if (ms_wcNeedsSwap)
2298 {
2299 // convert to native endianness
2300 for ( unsigned i = 0; i < res; i++ )
2301 dst[i] = WC_BSWAP(dst[i]);
2302 }
2303 }
2304 else // no destination buffer
2305 {
2306 // convert using temp buffer to calculate the size of the buffer needed
2307 wchar_t tbuf[8];
2308 res = 0;
2309
2310 do
2311 {
2312 char* bufPtr = (char*)tbuf;
2313 dstLen = 8 * SIZEOF_WCHAR_T;
2314
2315 cres = iconv(m2w,
2316 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2317 &bufPtr, &dstLen );
2318
2319 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2320 }
2321 while ((cres == (size_t)-1) && (errno == E2BIG));
2322 }
2323
2324 if (ICONV_FAILED(cres, srcLen))
2325 {
2326 //VS: it is ok if iconv fails, hence trace only
2327 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2328 return wxCONV_FAILED;
2329 }
2330
2331 return res;
2332}
2333
2334size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2335 const wchar_t *src, size_t srcLen) const
2336{
2337#if wxUSE_THREADS
2338 // NB: explained in MB2WC
2339 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2340#endif
2341
2342 if ( srcLen == wxNO_LEN )
2343 srcLen = wxWcslen(src) + 1;
2344
2345 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2346 size_t outbuflen = dstLen;
2347 size_t res, cres;
2348
2349 wchar_t *tmpbuf = 0;
2350
2351 if (ms_wcNeedsSwap)
2352 {
2353 // need to copy to temp buffer to switch endianness
2354 // (doing WC_BSWAP twice on the original buffer won't help, as it
2355 // could be in read-only memory, or be accessed in some other thread)
2356 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2357 for ( size_t i = 0; i < srcLen; i++ )
2358 tmpbuf[i] = WC_BSWAP(src[i]);
2359
2360 tmpbuf[srcLen] = L'\0';
2361 src = tmpbuf;
2362 }
2363
2364 char* inbuf = (char*)src;
2365 if ( dst )
2366 {
2367 // have destination buffer, convert there
2368 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2369
2370 res = dstLen - outbuflen;
2371 }
2372 else // no destination buffer
2373 {
2374 // convert using temp buffer to calculate the size of the buffer needed
2375 char tbuf[16];
2376 res = 0;
2377 do
2378 {
2379 dst = tbuf;
2380 outbuflen = 16;
2381
2382 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2383
2384 res += 16 - outbuflen;
2385 }
2386 while ((cres == (size_t)-1) && (errno == E2BIG));
2387 }
2388
2389 if (ms_wcNeedsSwap)
2390 {
2391 free(tmpbuf);
2392 }
2393
2394 if (ICONV_FAILED(cres, inbuflen))
2395 {
2396 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2397 return wxCONV_FAILED;
2398 }
2399
2400 return res;
2401}
2402
2403size_t wxMBConv_iconv::GetMBNulLen() const
2404{
2405 if ( m_minMBCharWidth == 0 )
2406 {
2407 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2408
2409#if wxUSE_THREADS
2410 // NB: explained in MB2WC
2411 wxMutexLocker lock(self->m_iconvMutex);
2412#endif
2413
2414 const wchar_t *wnul = L"";
2415 char buf[8]; // should be enough for NUL in any encoding
2416 size_t inLen = sizeof(wchar_t),
2417 outLen = WXSIZEOF(buf);
2418 char *inBuff = (char *)wnul;
2419 char *outBuff = buf;
2420 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2421 {
2422 self->m_minMBCharWidth = (size_t)-1;
2423 }
2424 else // ok
2425 {
2426 self->m_minMBCharWidth = outBuff - buf;
2427 }
2428 }
2429
2430 return m_minMBCharWidth;
2431}
2432
2433#if wxUSE_UNICODE_UTF8
2434bool wxMBConv_iconv::IsUTF8() const
2435{
2436 return wxStricmp(m_name, "UTF-8") == 0 ||
2437 wxStricmp(m_name, "UTF8") == 0;
2438}
2439#endif
2440
2441#endif // HAVE_ICONV
2442
2443
2444// ============================================================================
2445// Win32 conversion classes
2446// ============================================================================
2447
2448#ifdef wxHAVE_WIN32_MB2WC
2449
2450// from utils.cpp
2451#if wxUSE_FONTMAP
2452extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2453extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2454#endif
2455
2456class wxMBConv_win32 : public wxMBConv
2457{
2458public:
2459 wxMBConv_win32()
2460 {
2461 m_CodePage = CP_ACP;
2462 m_minMBCharWidth = 0;
2463 }
2464
2465 wxMBConv_win32(const wxMBConv_win32& conv)
2466 : wxMBConv()
2467 {
2468 m_CodePage = conv.m_CodePage;
2469 m_minMBCharWidth = conv.m_minMBCharWidth;
2470 }
2471
2472#if wxUSE_FONTMAP
2473 wxMBConv_win32(const char* name)
2474 {
2475 m_CodePage = wxCharsetToCodepage(name);
2476 m_minMBCharWidth = 0;
2477 }
2478
2479 wxMBConv_win32(wxFontEncoding encoding)
2480 {
2481 m_CodePage = wxEncodingToCodepage(encoding);
2482 m_minMBCharWidth = 0;
2483 }
2484#endif // wxUSE_FONTMAP
2485
2486 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2487 {
2488 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2489 // the behaviour is not compatible with the Unix version (using iconv)
2490 // and break the library itself, e.g. wxTextInputStream::NextChar()
2491 // wouldn't work if reading an incomplete MB char didn't result in an
2492 // error
2493 //
2494 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2495 // Win XP or newer and it is not supported for UTF-[78] so we always
2496 // use our own conversions in this case. See
2497 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2498 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2499 if ( m_CodePage == CP_UTF8 )
2500 {
2501 return wxMBConvUTF8().MB2WC(buf, psz, n);
2502 }
2503
2504 if ( m_CodePage == CP_UTF7 )
2505 {
2506 return wxMBConvUTF7().MB2WC(buf, psz, n);
2507 }
2508
2509 int flags = 0;
2510 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2511 IsAtLeastWin2kSP4() )
2512 {
2513 flags = MB_ERR_INVALID_CHARS;
2514 }
2515
2516 const size_t len = ::MultiByteToWideChar
2517 (
2518 m_CodePage, // code page
2519 flags, // flags: fall on error
2520 psz, // input string
2521 -1, // its length (NUL-terminated)
2522 buf, // output string
2523 buf ? n : 0 // size of output buffer
2524 );
2525 if ( !len )
2526 {
2527 // function totally failed
2528 return wxCONV_FAILED;
2529 }
2530
2531 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2532 // check if we succeeded, by doing a double trip:
2533 if ( !flags && buf )
2534 {
2535 const size_t mbLen = strlen(psz);
2536 wxCharBuffer mbBuf(mbLen);
2537 if ( ::WideCharToMultiByte
2538 (
2539 m_CodePage,
2540 0,
2541 buf,
2542 -1,
2543 mbBuf.data(),
2544 mbLen + 1, // size in bytes, not length
2545 NULL,
2546 NULL
2547 ) == 0 ||
2548 strcmp(mbBuf, psz) != 0 )
2549 {
2550 // we didn't obtain the same thing we started from, hence
2551 // the conversion was lossy and we consider that it failed
2552 return wxCONV_FAILED;
2553 }
2554 }
2555
2556 // note that it returns count of written chars for buf != NULL and size
2557 // of the needed buffer for buf == NULL so in either case the length of
2558 // the string (which never includes the terminating NUL) is one less
2559 return len - 1;
2560 }
2561
2562 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2563 {
2564 /*
2565 we have a problem here: by default, WideCharToMultiByte() may
2566 replace characters unrepresentable in the target code page with bad
2567 quality approximations such as turning "1/2" symbol (U+00BD) into
2568 "1" for the code pages which don't have it and we, obviously, want
2569 to avoid this at any price
2570
2571 the trouble is that this function does it _silently_, i.e. it won't
2572 even tell us whether it did or not... Win98/2000 and higher provide
2573 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2574 we have to resort to a round trip, i.e. check that converting back
2575 results in the same string -- this is, of course, expensive but
2576 otherwise we simply can't be sure to not garble the data.
2577 */
2578
2579 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2580 // it doesn't work with CJK encodings (which we test for rather roughly
2581 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2582 // supporting it
2583 BOOL usedDef wxDUMMY_INITIALIZE(false);
2584 BOOL *pUsedDef;
2585 int flags;
2586 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2587 {
2588 // it's our lucky day
2589 flags = WC_NO_BEST_FIT_CHARS;
2590 pUsedDef = &usedDef;
2591 }
2592 else // old system or unsupported encoding
2593 {
2594 flags = 0;
2595 pUsedDef = NULL;
2596 }
2597
2598 const size_t len = ::WideCharToMultiByte
2599 (
2600 m_CodePage, // code page
2601 flags, // either none or no best fit
2602 pwz, // input string
2603 -1, // it is (wide) NUL-terminated
2604 buf, // output buffer
2605 buf ? n : 0, // and its size
2606 NULL, // default "replacement" char
2607 pUsedDef // [out] was it used?
2608 );
2609
2610 if ( !len )
2611 {
2612 // function totally failed
2613 return wxCONV_FAILED;
2614 }
2615
2616 // we did something, check if we really succeeded
2617 if ( flags )
2618 {
2619 // check if the conversion failed, i.e. if any replacements
2620 // were done
2621 if ( usedDef )
2622 return wxCONV_FAILED;
2623 }
2624 else // we must resort to double tripping...
2625 {
2626 // first we need to ensure that we really have the MB data: this is
2627 // not the case if we're called with NULL buffer, in which case we
2628 // need to do the conversion yet again
2629 wxCharBuffer bufDef;
2630 if ( !buf )
2631 {
2632 bufDef = wxCharBuffer(len);
2633 buf = bufDef.data();
2634 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2635 buf, len, NULL, NULL) )
2636 return wxCONV_FAILED;
2637 }
2638
2639 if ( !n )
2640 n = wcslen(pwz);
2641 wxWCharBuffer wcBuf(n);
2642 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2643 wcscmp(wcBuf, pwz) != 0 )
2644 {
2645 // we didn't obtain the same thing we started from, hence
2646 // the conversion was lossy and we consider that it failed
2647 return wxCONV_FAILED;
2648 }
2649 }
2650
2651 // see the comment above for the reason of "len - 1"
2652 return len - 1;
2653 }
2654
2655 virtual size_t GetMBNulLen() const
2656 {
2657 if ( m_minMBCharWidth == 0 )
2658 {
2659 int len = ::WideCharToMultiByte
2660 (
2661 m_CodePage, // code page
2662 0, // no flags
2663 L"", // input string
2664 1, // translate just the NUL
2665 NULL, // output buffer
2666 0, // and its size
2667 NULL, // no replacement char
2668 NULL // [out] don't care if it was used
2669 );
2670
2671 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2672 switch ( len )
2673 {
2674 default:
2675 wxLogDebug(_T("Unexpected NUL length %d"), len);
2676 self->m_minMBCharWidth = (size_t)-1;
2677 break;
2678
2679 case 0:
2680 self->m_minMBCharWidth = (size_t)-1;
2681 break;
2682
2683 case 1:
2684 case 2:
2685 case 4:
2686 self->m_minMBCharWidth = len;
2687 break;
2688 }
2689 }
2690
2691 return m_minMBCharWidth;
2692 }
2693
2694 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2695
2696 bool IsOk() const { return m_CodePage != -1; }
2697
2698private:
2699 static bool CanUseNoBestFit()
2700 {
2701 static int s_isWin98Or2k = -1;
2702
2703 if ( s_isWin98Or2k == -1 )
2704 {
2705 int verMaj, verMin;
2706 switch ( wxGetOsVersion(&verMaj, &verMin) )
2707 {
2708 case wxOS_WINDOWS_9X:
2709 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2710 break;
2711
2712 case wxOS_WINDOWS_NT:
2713 s_isWin98Or2k = verMaj >= 5;
2714 break;
2715
2716 default:
2717 // unknown: be conservative by default
2718 s_isWin98Or2k = 0;
2719 break;
2720 }
2721
2722 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2723 }
2724
2725 return s_isWin98Or2k == 1;
2726 }
2727
2728 static bool IsAtLeastWin2kSP4()
2729 {
2730#ifdef __WXWINCE__
2731 return false;
2732#else
2733 static int s_isAtLeastWin2kSP4 = -1;
2734
2735 if ( s_isAtLeastWin2kSP4 == -1 )
2736 {
2737 OSVERSIONINFOEX ver;
2738
2739 memset(&ver, 0, sizeof(ver));
2740 ver.dwOSVersionInfoSize = sizeof(ver);
2741 GetVersionEx((OSVERSIONINFO*)&ver);
2742
2743 s_isAtLeastWin2kSP4 =
2744 ((ver.dwMajorVersion > 5) || // Vista+
2745 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2746 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2747 ver.wServicePackMajor >= 4)) // 2000 SP4+
2748 ? 1 : 0;
2749 }
2750
2751 return s_isAtLeastWin2kSP4 == 1;
2752#endif
2753 }
2754
2755
2756 // the code page we're working with
2757 long m_CodePage;
2758
2759 // cached result of GetMBNulLen(), set to 0 initially meaning
2760 // "unknown"
2761 size_t m_minMBCharWidth;
2762};
2763
2764#endif // wxHAVE_WIN32_MB2WC
2765
2766
2767// ============================================================================
2768// wxEncodingConverter based conversion classes
2769// ============================================================================
2770
2771#if wxUSE_FONTMAP
2772
2773class wxMBConv_wxwin : public wxMBConv
2774{
2775private:
2776 void Init()
2777 {
2778 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2779 // The wxMBConv_cf class does a better job.
2780 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2781 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2782 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2783 }
2784
2785public:
2786 // temporarily just use wxEncodingConverter stuff,
2787 // so that it works while a better implementation is built
2788 wxMBConv_wxwin(const char* name)
2789 {
2790 if (name)
2791 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2792 else
2793 m_enc = wxFONTENCODING_SYSTEM;
2794
2795 Init();
2796 }
2797
2798 wxMBConv_wxwin(wxFontEncoding enc)
2799 {
2800 m_enc = enc;
2801
2802 Init();
2803 }
2804
2805 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2806 {
2807 size_t inbuf = strlen(psz);
2808 if (buf)
2809 {
2810 if (!m2w.Convert(psz, buf))
2811 return wxCONV_FAILED;
2812 }
2813 return inbuf;
2814 }
2815
2816 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2817 {
2818 const size_t inbuf = wxWcslen(psz);
2819 if (buf)
2820 {
2821 if (!w2m.Convert(psz, buf))
2822 return wxCONV_FAILED;
2823 }
2824
2825 return inbuf;
2826 }
2827
2828 virtual size_t GetMBNulLen() const
2829 {
2830 switch ( m_enc )
2831 {
2832 case wxFONTENCODING_UTF16BE:
2833 case wxFONTENCODING_UTF16LE:
2834 return 2;
2835
2836 case wxFONTENCODING_UTF32BE:
2837 case wxFONTENCODING_UTF32LE:
2838 return 4;
2839
2840 default:
2841 return 1;
2842 }
2843 }
2844
2845 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2846
2847 bool IsOk() const { return m_ok; }
2848
2849public:
2850 wxFontEncoding m_enc;
2851 wxEncodingConverter m2w, w2m;
2852
2853private:
2854 // were we initialized successfully?
2855 bool m_ok;
2856
2857 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2858};
2859
2860// make the constructors available for unit testing
2861WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2862{
2863 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2864 if ( !result->IsOk() )
2865 {
2866 delete result;
2867 return 0;
2868 }
2869
2870 return result;
2871}
2872
2873#endif // wxUSE_FONTMAP
2874
2875// ============================================================================
2876// wxCSConv implementation
2877// ============================================================================
2878
2879void wxCSConv::Init()
2880{
2881 m_name = NULL;
2882 m_convReal = NULL;
2883 m_deferred = true;
2884}
2885
2886wxCSConv::wxCSConv(const wxString& charset)
2887{
2888 Init();
2889
2890 if ( !charset.empty() )
2891 {
2892 SetName(charset.ToAscii());
2893 }
2894
2895#if wxUSE_FONTMAP
2896 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2897 if ( m_encoding == wxFONTENCODING_MAX )
2898 {
2899 // set to unknown/invalid value
2900 m_encoding = wxFONTENCODING_SYSTEM;
2901 }
2902 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2903 {
2904 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2905 m_encoding = wxFONTENCODING_ISO8859_1;
2906 }
2907#else
2908 m_encoding = wxFONTENCODING_SYSTEM;
2909#endif
2910}
2911
2912wxCSConv::wxCSConv(wxFontEncoding encoding)
2913{
2914 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2915 {
2916 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2917
2918 encoding = wxFONTENCODING_SYSTEM;
2919 }
2920
2921 Init();
2922
2923 m_encoding = encoding;
2924}
2925
2926wxCSConv::~wxCSConv()
2927{
2928 Clear();
2929}
2930
2931wxCSConv::wxCSConv(const wxCSConv& conv)
2932 : wxMBConv()
2933{
2934 Init();
2935
2936 SetName(conv.m_name);
2937 m_encoding = conv.m_encoding;
2938}
2939
2940wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2941{
2942 Clear();
2943
2944 SetName(conv.m_name);
2945 m_encoding = conv.m_encoding;
2946
2947 return *this;
2948}
2949
2950void wxCSConv::Clear()
2951{
2952 free(m_name);
2953 delete m_convReal;
2954
2955 m_name = NULL;
2956 m_convReal = NULL;
2957}
2958
2959void wxCSConv::SetName(const char *charset)
2960{
2961 if (charset)
2962 {
2963 m_name = wxStrdup(charset);
2964 m_deferred = true;
2965 }
2966}
2967
2968#if wxUSE_FONTMAP
2969
// Hash map type associating a charset name with a font encoding.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate(): for each encoding it stores the charset
// name with which an iconv converter could be created for it, or an empty
// string if none of the names of this encoding worked.
static wxEncodingNameCache gs_nameCache;
2974#endif
2975
2976wxMBConv *wxCSConv::DoCreate() const
2977{
2978#if wxUSE_FONTMAP
2979 wxLogTrace(TRACE_STRCONV,
2980 wxT("creating conversion for %s"),
2981 (m_name ? m_name
2982 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2983#endif // wxUSE_FONTMAP
2984
2985 // check for the special case of ASCII or ISO8859-1 charset: as we have
2986 // special knowledge of it anyhow, we don't need to create a special
2987 // conversion object
2988 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2989 m_encoding == wxFONTENCODING_DEFAULT )
2990 {
2991 // don't convert at all
2992 return NULL;
2993 }
2994
2995 // we trust OS to do conversion better than we can so try external
2996 // conversion methods first
2997 //
2998 // the full order is:
2999 // 1. OS conversion (iconv() under Unix or Win32 API)
3000 // 2. hard coded conversions for UTF
3001 // 3. wxEncodingConverter as fall back
3002
3003 // step (1)
3004#ifdef HAVE_ICONV
3005#if !wxUSE_FONTMAP
3006 if ( m_name )
3007#endif // !wxUSE_FONTMAP
3008 {
3009#if wxUSE_FONTMAP
3010 wxFontEncoding encoding(m_encoding);
3011#endif
3012
3013 if ( m_name )
3014 {
3015 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3016 if ( conv->IsOk() )
3017 return conv;
3018
3019 delete conv;
3020
3021#if wxUSE_FONTMAP
3022 encoding =
3023 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3024#endif // wxUSE_FONTMAP
3025 }
3026#if wxUSE_FONTMAP
3027 {
3028 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3029 if ( it != gs_nameCache.end() )
3030 {
3031 if ( it->second.empty() )
3032 return NULL;
3033
3034 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3035 if ( conv->IsOk() )
3036 return conv;
3037
3038 delete conv;
3039 }
3040
3041 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3042 // CS : in case this does not return valid names (eg for MacRoman)
3043 // encoding got a 'failure' entry in the cache all the same,
3044 // although it just has to be created using a different method, so
3045 // only store failed iconv creation attempts (or perhaps we
3046 // shoulnd't do this at all ?)
3047 if ( names[0] != NULL )
3048 {
3049 for ( ; *names; ++names )
3050 {
3051 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3052 // will need changes that will obsolete this
3053 wxString name(*names);
3054 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3055 if ( conv->IsOk() )
3056 {
3057 gs_nameCache[encoding] = *names;
3058 return conv;
3059 }
3060
3061 delete conv;
3062 }
3063
3064 gs_nameCache[encoding] = _T(""); // cache the failure
3065 }
3066 }
3067#endif // wxUSE_FONTMAP
3068 }
3069#endif // HAVE_ICONV
3070
3071#ifdef wxHAVE_WIN32_MB2WC
3072 {
3073#if wxUSE_FONTMAP
3074 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3075 : new wxMBConv_win32(m_encoding);
3076 if ( conv->IsOk() )
3077 return conv;
3078
3079 delete conv;
3080#else
3081 return NULL;
3082#endif
3083 }
3084#endif // wxHAVE_WIN32_MB2WC
3085
3086#ifdef __DARWIN__
3087 {
3088 // leave UTF16 and UTF32 to the built-ins of wx
3089 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3090 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3091 {
3092#if wxUSE_FONTMAP
3093 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3094 : new wxMBConv_cf(m_encoding);
3095#else
3096 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3097#endif
3098
3099 if ( conv->IsOk() )
3100 return conv;
3101
3102 delete conv;
3103 }
3104 }
3105#endif // __DARWIN__
3106
3107 // step (2)
3108 wxFontEncoding enc = m_encoding;
3109#if wxUSE_FONTMAP
3110 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3111 {
3112 // use "false" to suppress interactive dialogs -- we can be called from
3113 // anywhere and popping up a dialog from here is the last thing we want to
3114 // do
3115 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3116 }
3117#endif // wxUSE_FONTMAP
3118
3119 switch ( enc )
3120 {
3121 case wxFONTENCODING_UTF7:
3122 return new wxMBConvUTF7;
3123
3124 case wxFONTENCODING_UTF8:
3125 return new wxMBConvUTF8;
3126
3127 case wxFONTENCODING_UTF16BE:
3128 return new wxMBConvUTF16BE;
3129
3130 case wxFONTENCODING_UTF16LE:
3131 return new wxMBConvUTF16LE;
3132
3133 case wxFONTENCODING_UTF32BE:
3134 return new wxMBConvUTF32BE;
3135
3136 case wxFONTENCODING_UTF32LE:
3137 return new wxMBConvUTF32LE;
3138
3139 default:
3140 // nothing to do but put here to suppress gcc warnings
3141 break;
3142 }
3143
3144 // step (3)
3145#if wxUSE_FONTMAP
3146 {
3147 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3148 : new wxMBConv_wxwin(m_encoding);
3149 if ( conv->IsOk() )
3150 return conv;
3151
3152 delete conv;
3153 }
3154#endif // wxUSE_FONTMAP
3155
3156 // NB: This is a hack to prevent deadlock. What could otherwise happen
3157 // in Unicode build: wxConvLocal creation ends up being here
3158 // because of some failure and logs the error. But wxLog will try to
3159 // attach a timestamp, for which it will need wxConvLocal (to convert
3160 // time to char* and then wchar_t*), but that fails, tries to log the
3161 // error, but wxLog has an (already locked) critical section that
3162 // guards the static buffer.
3163 static bool alreadyLoggingError = false;
3164 if (!alreadyLoggingError)
3165 {
3166 alreadyLoggingError = true;
3167 wxLogError(_("Cannot convert from the charset '%s'!"),
3168 m_name ? m_name
3169 :
3170#if wxUSE_FONTMAP
3171 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3172#else // !wxUSE_FONTMAP
3173 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3174#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3175 );
3176
3177 alreadyLoggingError = false;
3178 }
3179
3180 return NULL;
3181}
3182
3183void wxCSConv::CreateConvIfNeeded() const
3184{
3185 if ( m_deferred )
3186 {
3187 wxCSConv *self = (wxCSConv *)this; // const_cast
3188
3189 // if we don't have neither the name nor the encoding, use the default
3190 // encoding for this system
3191 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3192 {
3193#if wxUSE_INTL
3194 self->m_encoding = wxLocale::GetSystemEncoding();
3195#else
3196 // fallback to some reasonable default:
3197 self->m_encoding = wxFONTENCODING_ISO8859_1;
3198#endif // wxUSE_INTL
3199 }
3200
3201 self->m_convReal = DoCreate();
3202 self->m_deferred = false;
3203 }
3204}
3205
3206bool wxCSConv::IsOk() const
3207{
3208 CreateConvIfNeeded();
3209
3210 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3211 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3212 return true; // always ok as we do it ourselves
3213
3214 // m_convReal->IsOk() is called at its own creation, so we know it must
3215 // be ok if m_convReal is non-NULL
3216 return m_convReal != NULL;
3217}
3218
3219size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3220 const char *src, size_t srcLen) const
3221{
3222 CreateConvIfNeeded();
3223
3224 if (m_convReal)
3225 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3226
3227 // latin-1 (direct)
3228 if ( srcLen == wxNO_LEN )
3229 srcLen = strlen(src) + 1; // take trailing NUL too
3230
3231 if ( dst )
3232 {
3233 if ( dstLen < srcLen )
3234 return wxCONV_FAILED;
3235
3236 for ( size_t n = 0; n < srcLen; n++ )
3237 dst[n] = (unsigned char)(src[n]);
3238 }
3239
3240 return srcLen;
3241}
3242
3243size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3244 const wchar_t *src, size_t srcLen) const
3245{
3246 CreateConvIfNeeded();
3247
3248 if (m_convReal)
3249 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3250
3251 // latin-1 (direct)
3252 if ( srcLen == wxNO_LEN )
3253 srcLen = wxWcslen(src) + 1;
3254
3255 if ( dst )
3256 {
3257 if ( dstLen < srcLen )
3258 return wxCONV_FAILED;
3259
3260 for ( size_t n = 0; n < srcLen; n++ )
3261 {
3262 if ( src[n] > 0xFF )
3263 return wxCONV_FAILED;
3264
3265 dst[n] = (char)src[n];
3266 }
3267
3268 }
3269 else // still need to check the input validity
3270 {
3271 for ( size_t n = 0; n < srcLen; n++ )
3272 {
3273 if ( src[n] > 0xFF )
3274 return wxCONV_FAILED;
3275 }
3276 }
3277
3278 return srcLen;
3279}
3280
3281size_t wxCSConv::GetMBNulLen() const
3282{
3283 CreateConvIfNeeded();
3284
3285 if ( m_convReal )
3286 {
3287 return m_convReal->GetMBNulLen();
3288 }
3289
3290 // otherwise, we are ISO-8859-1
3291 return 1;
3292}
3293
3294#if wxUSE_UNICODE_UTF8
3295bool wxCSConv::IsUTF8() const
3296{
3297 CreateConvIfNeeded();
3298
3299 if ( m_convReal )
3300 {
3301 return m_convReal->IsUTF8();
3302 }
3303
3304 // otherwise, we are ISO-8859-1
3305 return false;
3306}
3307#endif
3308
3309
3310#if wxUSE_UNICODE
3311
3312wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3313{
3314 if ( !s )
3315 return wxWCharBuffer();
3316
3317 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3318 if ( !wbuf )
3319 wbuf = wxMBConvUTF8().cMB2WX(s);
3320 if ( !wbuf )
3321 wbuf = wxConvISO8859_1.cMB2WX(s);
3322
3323 return wbuf;
3324}
3325
3326wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3327{
3328 if ( !ws )
3329 return wxCharBuffer();
3330
3331 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3332 if ( !buf )
3333 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3334
3335 return buf;
3336}
3337
3338#endif // wxUSE_UNICODE
3339
3340// ----------------------------------------------------------------------------
3341// globals
3342// ----------------------------------------------------------------------------
3343
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3350
3351#undef wxConvLibc
3352#undef wxConvUTF8
3353#undef wxConvUTF7
3354#undef wxConvLocal
3355#undef wxConvISO8859_1
3356
// Define everything needed for the global converter "name":
//  - the exported pointer name##Ptr of type klass*, initially NULL
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass which is constructed
//    with ctor_args
//  - a file-static pointer initialized by calling the accessor
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the common case in which the type of
// the object is the same as the type of the pointer to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3371
3372#ifdef __WINDOWS__
3373 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3374#else
3375 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3376#endif
3377
// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
3384WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3385WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3386
3387WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3388WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3389
3390WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3391WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3392
3393#ifdef __DARWIN__
3394// The xnu kernel always communicates file paths in decomposed UTF-8.
3395// WARNING: Are we sure that CFString's conversion will cause decomposition?
3396static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3397#endif
3398
3399WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3400#ifdef __DARWIN__
3401 &wxConvMacUTF8DObj;
3402#else // !__DARWIN__
3403 wxGet_wxConvLibcPtr();
3404#endif // __DARWIN__/!__DARWIN__
3405
3406#else // !wxUSE_WCHAR_T
3407
3408// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3409// stand-ins in absence of wchar_t
3410WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3411 wxConvISO8859_1,
3412 wxConvLocal,
3413 wxConvUTF8;
3414
3415#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T