fix bug introduced in r54646: we still need to count the embedded NULs when convertin...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV _T("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
// helper of wxMBConv::ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are NUL (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complication come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
227 for ( ;; )
228 {
229 // try to convert the current chunk
230 size_t lenChunk = MB2WC(NULL, src, 0);
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
233
234 dstWritten += lenChunk;
235 if ( !srcEnd )
236 dstWritten++;
237
238 if ( !lenChunk )
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
254 if ( !srcEnd )
255 dst++;
256 }
257
258 if ( !srcEnd )
259 {
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow
262 break;
263 }
264
265 // advance the input pointer past the end of this chunk
266 while ( NotAllNULs(src, nulLen) )
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
272 src += nulLen;
273 }
274
275 src += nulLen; // skipping over its terminator as well
276
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
279 // delimited by srcEnd
280 if ( src >= srcEnd )
281 break;
282
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
286 if ( srcEnd )
287 {
288 dstWritten++;
289 if ( dst )
290 dst++;
291 }
292 }
293
294 return dstWritten;
295 }
296
297 size_t
298 wxMBConv::FromWChar(char *dst, size_t dstLen,
299 const wchar_t *src, size_t srcLen) const
300 {
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten = 0;
303
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated = srcLen == wxNO_LEN;
308
309 // make a copy of the input string unless it is already properly
310 // NUL-terminated
311 wxWCharBuffer bufTmp;
312 if ( isNulTerminated )
313 {
314 srcLen = wxWcslen(src) + 1;
315 }
316 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
317 {
318 // make a copy in order to properly NUL-terminate the string
319 bufTmp = wxWCharBuffer(srcLen);
320 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
321 src = bufTmp;
322 }
323
324 const size_t lenNul = GetMBNulLen();
325 for ( const wchar_t * const srcEnd = src + srcLen;
326 src < srcEnd;
327 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
328 {
329 // try to convert the current chunk
330 size_t lenChunk = WC2MB(NULL, src, 0);
331
332 if ( lenChunk == wxCONV_FAILED )
333 return wxCONV_FAILED;
334
335 dstWritten += lenChunk;
336 if ( isNulTerminated )
337 dstWritten += lenNul;
338
339 if ( dst )
340 {
341 if ( dstWritten > dstLen )
342 return wxCONV_FAILED;
343
344 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
345 return wxCONV_FAILED;
346
347 dst += lenChunk;
348 if ( isNulTerminated )
349 dst += lenNul;
350 }
351 }
352
353 return dstWritten;
354 }
355
356 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
357 {
358 size_t rc = ToWChar(outBuff, outLen, inBuff);
359 if ( rc != wxCONV_FAILED )
360 {
361 // ToWChar() returns the buffer length, i.e. including the trailing
362 // NUL, while this method doesn't take it into account
363 rc--;
364 }
365
366 return rc;
367 }
368
369 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
370 {
371 size_t rc = FromWChar(outBuff, outLen, inBuff);
372 if ( rc != wxCONV_FAILED )
373 {
374 rc -= GetMBNulLen();
375 }
376
377 return rc;
378 }
379
380 wxMBConv::~wxMBConv()
381 {
382 // nothing to do here (necessary for Darwin linking probably)
383 }
384
385 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
386 {
387 if ( psz )
388 {
389 // calculate the length of the buffer needed first
390 const size_t nLen = ToWChar(NULL, 0, psz);
391 if ( nLen != wxCONV_FAILED )
392 {
393 // now do the actual conversion
394 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
395
396 // +1 for the trailing NULL
397 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
398 return buf;
399 }
400 }
401
402 return wxWCharBuffer();
403 }
404
405 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
406 {
407 if ( pwz )
408 {
409 const size_t nLen = FromWChar(NULL, 0, pwz);
410 if ( nLen != wxCONV_FAILED )
411 {
412 wxCharBuffer buf(nLen - 1);
413 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
414 return buf;
415 }
416 }
417
418 return wxCharBuffer();
419 }
420
421 const wxWCharBuffer
422 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
423 {
424 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
425 if ( dstLen != wxCONV_FAILED )
426 {
427 // notice that we allocate space for dstLen+1 wide characters here
428 // because we want the buffer to always be NUL-terminated, even if the
429 // input isn't (as otherwise the caller has no way to know its length)
430 wxWCharBuffer wbuf(dstLen);
431 wbuf.data()[dstLen] = L'\0';
432 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
433 {
434 if ( outLen )
435 {
436 *outLen = dstLen;
437
438 // we also need to handle NUL-terminated input strings
439 // specially: for them the output is the length of the string
440 // excluding the trailing NUL, however if we're asked to
441 // convert a specific number of characters we return the length
442 // of the resulting output even if it's NUL-terminated
443 if ( inLen == wxNO_LEN )
444 (*outLen)--;
445 }
446
447 return wbuf;
448 }
449 }
450
451 if ( outLen )
452 *outLen = 0;
453
454 return wxWCharBuffer();
455 }
456
457 const wxCharBuffer
458 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
459 {
460 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
461 if ( dstLen != wxCONV_FAILED )
462 {
463 const size_t nulLen = GetMBNulLen();
464
465 // as above, ensure that the buffer is always NUL-terminated, even if
466 // the input is not
467 wxCharBuffer buf(dstLen + nulLen - 1);
468 memset(buf.data() + dstLen, 0, nulLen);
469 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
470 {
471 if ( outLen )
472 {
473 *outLen = dstLen;
474
475 if ( inLen == wxNO_LEN )
476 {
477 // in this case both input and output are NUL-terminated
478 // and we're not supposed to count NUL
479 *outLen -= nulLen;
480 }
481 }
482
483 return buf;
484 }
485 }
486
487 if ( outLen )
488 *outLen = 0;
489
490 return wxCharBuffer();
491 }
492
493 // ----------------------------------------------------------------------------
494 // wxMBConvLibc
495 // ----------------------------------------------------------------------------
496
497 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
498 {
499 return wxMB2WC(buf, psz, n);
500 }
501
502 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
503 {
504 return wxWC2MB(buf, psz, n);
505 }
506
507 // ----------------------------------------------------------------------------
508 // wxConvBrokenFileNames
509 // ----------------------------------------------------------------------------
510
511 #ifdef __UNIX__
512
513 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
514 {
515 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
516 wxStricmp(charset, _T("UTF8")) == 0 )
517 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
518 else
519 m_conv = new wxCSConv(charset);
520 }
521
522 #endif // __UNIX__
523
524 // ----------------------------------------------------------------------------
525 // UTF-7
526 // ----------------------------------------------------------------------------
527
528 // Implementation (C) 2004 Fredrik Roubert
529 //
530 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
531
//
// BASE64 decoding table: maps each byte to its 6 bit value in the modified
// BASE64 alphabet used by UTF-7 or to 0xff if the byte isn't part of it
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
570
571 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
572 const char *src, size_t srcLen) const
573 {
574 DecoderState stateOrig,
575 *statePtr;
576 if ( srcLen == wxNO_LEN )
577 {
578 // convert the entire string, up to and including the trailing NUL
579 srcLen = strlen(src) + 1;
580
581 // when working on the entire strings we don't update nor use the shift
582 // state from the previous call
583 statePtr = &stateOrig;
584 }
585 else // when working with partial strings we do use the shift state
586 {
587 statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
588
589 // also save the old state to be able to rollback to it on error
590 stateOrig = m_stateDecoder;
591 }
592
593 // but to simplify the code below we use this variable in both cases
594 DecoderState& state = *statePtr;
595
596
597 // number of characters [which would have been] written to dst [if it were
598 // not NULL]
599 size_t len = 0;
600
601 const char * const srcEnd = src + srcLen;
602
603 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
604 {
605 const unsigned char cc = *src++;
606
607 if ( state.IsShifted() )
608 {
609 const unsigned char dc = utf7unb64[cc];
610 if ( dc == 0xff )
611 {
612 // end of encoded part, check that nothing was left: the bit
613 // field cycles through 0,6,4,2 sequence so check that we're at
614 // the end of it
615 if ( state.bit != 2 )
616 return wxCONV_FAILED;
617
618 state.ToDirect();
619
620 // re-parse this character normally below unless it's '-' which
621 // is consumed by the decoder
622 if ( cc == '-' )
623 continue;
624 }
625 else // valid encoded character
626 {
627 // mini base64 decoder: each character is 6 bits
628 state.bit += 6;
629 state.accum <<= 6;
630 state.accum += dc;
631
632 if ( state.bit >= 8 )
633 {
634 // got the full byte, consume it
635 state.bit -= 8;
636 unsigned char b = (state.accum >> state.bit) & 0x00ff;
637
638 if ( state.isLSB )
639 {
640 // we've got the full word, output it
641 if ( dst )
642 *dst++ = (state.msb << 8) | b;
643 len++;
644 state.isLSB = false;
645 }
646 else // MSB
647 {
648 // just store it while we wait for LSB
649 state.msb = b;
650 state.isLSB = true;
651 }
652 }
653 }
654 }
655
656 if ( state.IsDirect() )
657 {
658 // start of an encoded segment?
659 if ( cc == '+' )
660 {
661 if ( *src == '-' )
662 {
663 // just the encoded plus sign, don't switch to shifted mode
664 if ( dst )
665 *dst++ = '+';
666 len++;
667 src++;
668 }
669 else
670 {
671 state.ToShifted();
672 }
673 }
674 else // not '+'
675 {
676 // only printable 7 bit ASCII characters (with the exception of
677 // NUL, TAB, CR and LF) can be used directly
678 if ( cc >= 0x7f || (cc < ' ' &&
679 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
680 return wxCONV_FAILED;
681
682 if ( dst )
683 *dst++ = cc;
684 len++;
685 }
686 }
687 }
688
689 if ( !len )
690 {
691 // as we didn't read any characters we should be called with the same
692 // data (followed by some more new data) again later so don't save our
693 // state
694 state = stateOrig;
695
696 return wxCONV_FAILED;
697 }
698
699 return len;
700 }
701
//
// BASE64 encoding table: maps a 6 bit value to the corresponding character
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
716
//
// UTF-7 encoding table, indexed by the 7 bit character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// returns true if the character is marked as directly encoded (class 0) in
// the table above, i.e. can be copied to UTF-7 output as is
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t may be a signed type and a negative value must not be used as
    // the table index, so do the range check on the unsigned value
    return static_cast<unsigned long>(wc) < 0x80 && utf7encode[wc] < 1;
}
741
742 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
743 const wchar_t *src, size_t srcLen) const
744 {
745 EncoderState stateOrig,
746 *statePtr;
747 if ( srcLen == wxNO_LEN )
748 {
749 // we don't apply the stored state when operating on entire strings at
750 // once
751 statePtr = &stateOrig;
752
753 srcLen = wxWcslen(src) + 1;
754 }
755 else // do use the mode we left the output in previously
756 {
757 stateOrig = m_stateEncoder;
758 statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
759 }
760
761 EncoderState& state = *statePtr;
762
763
764 size_t len = 0;
765
766 const wchar_t * const srcEnd = src + srcLen;
767 while ( src < srcEnd && (!dst || len < dstLen) )
768 {
769 wchar_t cc = *src++;
770 if ( wxIsUTF7Direct(cc) )
771 {
772 if ( state.IsShifted() )
773 {
774 // pad with zeros the last encoded block if necessary
775 if ( state.bit )
776 {
777 if ( dst )
778 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
779 len++;
780 }
781
782 state.ToDirect();
783
784 if ( dst )
785 *dst++ = '-';
786 len++;
787 }
788
789 if ( dst )
790 *dst++ = (char)cc;
791 len++;
792 }
793 else if ( cc == '+' && state.IsDirect() )
794 {
795 if ( dst )
796 {
797 *dst++ = '+';
798 *dst++ = '-';
799 }
800
801 len += 2;
802 }
803 #ifndef WC_UTF16
804 else if (((wxUint32)cc) > 0xffff)
805 {
806 // no surrogate pair generation (yet?)
807 return wxCONV_FAILED;
808 }
809 #endif
810 else
811 {
812 if ( state.IsDirect() )
813 {
814 state.ToShifted();
815
816 if ( dst )
817 *dst++ = '+';
818 len++;
819 }
820
821 // BASE64 encode string
822 for ( ;; )
823 {
824 for ( unsigned lsb = 0; lsb < 2; lsb++ )
825 {
826 state.accum <<= 8;
827 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
828
829 for (state.bit += 8; state.bit >= 6; )
830 {
831 state.bit -= 6;
832 if ( dst )
833 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
834 len++;
835 }
836 }
837
838 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
839 break;
840
841 src++;
842 }
843 }
844 }
845
846 // we need to restore the original encoder state if we were called just to
847 // calculate the amount of space needed as we will presumably be called
848 // again to really convert the data now
849 if ( !dst )
850 state = stateOrig;
851
852 return len;
853 }
854
855 // ----------------------------------------------------------------------------
856 // UTF-8
857 // ----------------------------------------------------------------------------
858
859 static const wxUint32 utf8_max[]=
860 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
861
862 // boundaries of the private use area we use to (temporarily) remap invalid
863 // characters invalid in a UTF-8 encoded string
864 const wxUint32 wxUnicodePUA = 0x100000;
865 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
866
// this table gives the length of the UTF-8 encoding from its first character
// with 0 being used for the bytes which can't start a sequence at all:
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..DF: C0 and C1 are invalid too, the rest starts two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: four-byte sequences for F0..F4, the rest is invalid again (5-
    // or 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
901
902 size_t
903 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
904 const char *src, size_t srcLen) const
905 {
906 wchar_t *out = dstLen ? dst : NULL;
907 size_t written = 0;
908
909 if ( srcLen == wxNO_LEN )
910 srcLen = strlen(src) + 1;
911
912 for ( const char *p = src; ; p++ )
913 {
914 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
915 {
916 // all done successfully, just add the trailing NULL if we are not
917 // using explicit length
918 if ( srcLen == wxNO_LEN )
919 {
920 if ( out )
921 {
922 if ( !dstLen )
923 break;
924
925 *out = L'\0';
926 }
927
928 written++;
929 }
930
931 return written;
932 }
933
934 if ( out && !dstLen-- )
935 break;
936
937 wxUint32 code;
938 unsigned char c = *p;
939
940 if ( c < 0x80 )
941 {
942 if ( srcLen == 0 ) // the test works for wxNO_LEN too
943 break;
944
945 if ( srcLen != wxNO_LEN )
946 srcLen--;
947
948 code = c;
949 }
950 else
951 {
952 unsigned len = tableUtf8Lengths[c];
953 if ( !len )
954 break;
955
956 if ( srcLen < len ) // the test works for wxNO_LEN too
957 break;
958
959 if ( srcLen != wxNO_LEN )
960 srcLen -= len;
961
962 // Char. number range | UTF-8 octet sequence
963 // (hexadecimal) | (binary)
964 // ----------------------+----------------------------------------
965 // 0000 0000 - 0000 007F | 0xxxxxxx
966 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
967 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
968 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
969 //
970 // Code point value is stored in bits marked with 'x',
971 // lowest-order bit of the value on the right side in the diagram
972 // above. (from RFC 3629)
973
974 // mask to extract lead byte's value ('x' bits above), by sequence
975 // length:
976 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
977
978 // mask and value of lead byte's most significant bits, by length:
979 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
980 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
981
982 len--; // it's more convenient to work with 0-based length here
983
984 // extract the lead byte's value bits:
985 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
986 break;
987
988 code = c & leadValueMask[len];
989
990 // all remaining bytes, if any, are handled in the same way
991 // regardless of sequence's length:
992 for ( ; len; --len )
993 {
994 c = *++p;
995 if ( (c & 0xC0) != 0x80 )
996 return wxCONV_FAILED;
997
998 code <<= 6;
999 code |= c & 0x3F;
1000 }
1001 }
1002
1003 #ifdef WC_UTF16
1004 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1005 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1006 {
1007 if ( out )
1008 out++;
1009 written++;
1010 }
1011 #else // !WC_UTF16
1012 if ( out )
1013 *out = code;
1014 #endif // WC_UTF16/!WC_UTF16
1015
1016 if ( out )
1017 out++;
1018
1019 written++;
1020 }
1021
1022 return wxCONV_FAILED;
1023 }
1024
1025 size_t
1026 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1027 const wchar_t *src, size_t srcLen) const
1028 {
1029 char *out = dstLen ? dst : NULL;
1030 size_t written = 0;
1031
1032 for ( const wchar_t *wp = src; ; wp++ )
1033 {
1034 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1035 {
1036 // all done successfully, just add the trailing NULL if we are not
1037 // using explicit length
1038 if ( srcLen == wxNO_LEN )
1039 {
1040 if ( out )
1041 {
1042 if ( !dstLen )
1043 break;
1044
1045 *out = '\0';
1046 }
1047
1048 written++;
1049 }
1050
1051 return written;
1052 }
1053
1054 if ( srcLen != wxNO_LEN )
1055 srcLen--;
1056
1057 wxUint32 code;
1058 #ifdef WC_UTF16
1059 // cast is ok for WC_UTF16
1060 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1061 {
1062 // skip the next char too as we decoded a surrogate
1063 wp++;
1064 }
1065 #else // wchar_t is UTF-32
1066 code = *wp & 0x7fffffff;
1067 #endif
1068
1069 unsigned len;
1070 if ( code <= 0x7F )
1071 {
1072 len = 1;
1073 if ( out )
1074 {
1075 if ( dstLen < len )
1076 break;
1077
1078 out[0] = (char)code;
1079 }
1080 }
1081 else if ( code <= 0x07FF )
1082 {
1083 len = 2;
1084 if ( out )
1085 {
1086 if ( dstLen < len )
1087 break;
1088
1089 // NB: this line takes 6 least significant bits, encodes them as
1090 // 10xxxxxx and discards them so that the next byte can be encoded:
1091 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1092 out[0] = 0xC0 | code;
1093 }
1094 }
1095 else if ( code < 0xFFFF )
1096 {
1097 len = 3;
1098 if ( out )
1099 {
1100 if ( dstLen < len )
1101 break;
1102
1103 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1104 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1105 out[0] = 0xE0 | code;
1106 }
1107 }
1108 else if ( code <= 0x10FFFF )
1109 {
1110 len = 4;
1111 if ( out )
1112 {
1113 if ( dstLen < len )
1114 break;
1115
1116 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1117 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1118 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1119 out[0] = 0xF0 | code;
1120 }
1121 }
1122 else
1123 {
1124 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1125 break;
1126 }
1127
1128 if ( out )
1129 {
1130 out += len;
1131 dstLen -= len;
1132 }
1133
1134 written += len;
1135 }
1136
1137 // we only get here if an error occurs during decoding
1138 return wxCONV_FAILED;
1139 }
1140
1141 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1142 const char *psz, size_t srcLen) const
1143 {
1144 if ( m_options == MAP_INVALID_UTF8_NOT )
1145 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1146
1147 size_t len = 0;
1148
1149 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1150 {
1151 const char *opsz = psz;
1152 bool invalid = false;
1153 unsigned char cc = *psz++, fc = cc;
1154 unsigned cnt;
1155 for (cnt = 0; fc & 0x80; cnt++)
1156 fc <<= 1;
1157
1158 if (!cnt)
1159 {
1160 // plain ASCII char
1161 if (buf)
1162 *buf++ = cc;
1163 len++;
1164
1165 // escape the escape character for octal escapes
1166 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1167 && cc == '\\' && (!buf || len < n))
1168 {
1169 if (buf)
1170 *buf++ = cc;
1171 len++;
1172 }
1173 }
1174 else
1175 {
1176 cnt--;
1177 if (!cnt)
1178 {
1179 // invalid UTF-8 sequence
1180 invalid = true;
1181 }
1182 else
1183 {
1184 unsigned ocnt = cnt - 1;
1185 wxUint32 res = cc & (0x3f >> cnt);
1186 while (cnt--)
1187 {
1188 cc = *psz;
1189 if ((cc & 0xC0) != 0x80)
1190 {
1191 // invalid UTF-8 sequence
1192 invalid = true;
1193 break;
1194 }
1195
1196 psz++;
1197 res = (res << 6) | (cc & 0x3f);
1198 }
1199
1200 if (invalid || res <= utf8_max[ocnt])
1201 {
1202 // illegal UTF-8 encoding
1203 invalid = true;
1204 }
1205 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1206 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1207 {
1208 // if one of our PUA characters turns up externally
1209 // it must also be treated as an illegal sequence
1210 // (a bit like you have to escape an escape character)
1211 invalid = true;
1212 }
1213 else
1214 {
1215 #ifdef WC_UTF16
1216 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1217 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1218 if (pa == wxCONV_FAILED)
1219 {
1220 invalid = true;
1221 }
1222 else
1223 {
1224 if (buf)
1225 buf += pa;
1226 len += pa;
1227 }
1228 #else // !WC_UTF16
1229 if (buf)
1230 *buf++ = (wchar_t)res;
1231 len++;
1232 #endif // WC_UTF16/!WC_UTF16
1233 }
1234 }
1235
1236 if (invalid)
1237 {
1238 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1239 {
1240 while (opsz < psz && (!buf || len < n))
1241 {
1242 #ifdef WC_UTF16
1243 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1244 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1245 wxASSERT(pa != wxCONV_FAILED);
1246 if (buf)
1247 buf += pa;
1248 opsz++;
1249 len += pa;
1250 #else
1251 if (buf)
1252 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1253 opsz++;
1254 len++;
1255 #endif
1256 }
1257 }
1258 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1259 {
1260 while (opsz < psz && (!buf || len < n))
1261 {
1262 if ( buf && len + 3 < n )
1263 {
1264 unsigned char on = *opsz;
1265 *buf++ = L'\\';
1266 *buf++ = (wchar_t)( L'0' + on / 0100 );
1267 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1268 *buf++ = (wchar_t)( L'0' + on % 010 );
1269 }
1270
1271 opsz++;
1272 len += 4;
1273 }
1274 }
1275 else // MAP_INVALID_UTF8_NOT
1276 {
1277 return wxCONV_FAILED;
1278 }
1279 }
1280 }
1281 }
1282
1283 if (srcLen == wxNO_LEN && buf && (len < n))
1284 *buf = 0;
1285
1286 return len + 1;
1287 }
1288
// Tell whether wch is one of the octal digits, i.e. lies in L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch < L'8';
}
1293
1294 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1295 const wchar_t *psz, size_t srcLen) const
1296 {
1297 if ( m_options == MAP_INVALID_UTF8_NOT )
1298 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1299
1300 size_t len = 0;
1301
1302 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1303 {
1304 wxUint32 cc;
1305
1306 #ifdef WC_UTF16
1307 // cast is ok for WC_UTF16
1308 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1309 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1310 #else
1311 cc = (*psz++) & 0x7fffffff;
1312 #endif
1313
1314 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1315 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1316 {
1317 if (buf)
1318 *buf++ = (char)(cc - wxUnicodePUA);
1319 len++;
1320 }
1321 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1322 && cc == L'\\' && psz[0] == L'\\' )
1323 {
1324 if (buf)
1325 *buf++ = (char)cc;
1326 psz++;
1327 len++;
1328 }
1329 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1330 cc == L'\\' &&
1331 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1332 {
1333 if (buf)
1334 {
1335 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1336 (psz[1] - L'0') * 010 +
1337 (psz[2] - L'0'));
1338 }
1339
1340 psz += 3;
1341 len++;
1342 }
1343 else
1344 {
1345 unsigned cnt;
1346 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1347 {
1348 }
1349
1350 if (!cnt)
1351 {
1352 // plain ASCII char
1353 if (buf)
1354 *buf++ = (char) cc;
1355 len++;
1356 }
1357 else
1358 {
1359 len += cnt + 1;
1360 if (buf)
1361 {
1362 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1363 while (cnt--)
1364 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1365 }
1366 }
1367 }
1368 }
1369
1370 if (srcLen == wxNO_LEN && buf && (len < n))
1371 *buf = 0;
1372
1373 return len + 1;
1374 }
1375
1376 // ============================================================================
1377 // UTF-16
1378 // ============================================================================
1379
// "straight" names the UTF-16 converter selected for this platform's byte
// order and "swap" the other one; WORDS_BIGENDIAN decides which is which
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1387
1388 /* static */
1389 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1390 {
1391 if ( srcLen == wxNO_LEN )
1392 {
1393 // count the number of bytes in input, including the trailing NULs
1394 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1395 for ( srcLen = 1; *inBuff++; srcLen++ )
1396 ;
1397
1398 srcLen *= BYTES_PER_CHAR;
1399 }
1400 else // we already have the length
1401 {
1402 // we can only convert an entire number of UTF-16 characters
1403 if ( srcLen % BYTES_PER_CHAR )
1404 return wxCONV_FAILED;
1405 }
1406
1407 return srcLen;
1408 }
1409
1410 // case when in-memory representation is UTF-16 too
1411 #ifdef WC_UTF16
1412
1413 // ----------------------------------------------------------------------------
1414 // conversions without endianness change
1415 // ----------------------------------------------------------------------------
1416
1417 size_t
1418 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1419 const char *src, size_t srcLen) const
1420 {
1421 // set up the scene for using memcpy() (which is presumably more efficient
1422 // than copying the bytes one by one)
1423 srcLen = GetLength(src, srcLen);
1424 if ( srcLen == wxNO_LEN )
1425 return wxCONV_FAILED;
1426
1427 const size_t inLen = srcLen / BYTES_PER_CHAR;
1428 if ( dst )
1429 {
1430 if ( dstLen < inLen )
1431 return wxCONV_FAILED;
1432
1433 memcpy(dst, src, srcLen);
1434 }
1435
1436 return inLen;
1437 }
1438
1439 size_t
1440 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1441 const wchar_t *src, size_t srcLen) const
1442 {
1443 if ( srcLen == wxNO_LEN )
1444 srcLen = wxWcslen(src) + 1;
1445
1446 srcLen *= BYTES_PER_CHAR;
1447
1448 if ( dst )
1449 {
1450 if ( dstLen < srcLen )
1451 return wxCONV_FAILED;
1452
1453 memcpy(dst, src, srcLen);
1454 }
1455
1456 return srcLen;
1457 }
1458
1459 // ----------------------------------------------------------------------------
1460 // endian-reversing conversions
1461 // ----------------------------------------------------------------------------
1462
1463 size_t
1464 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1465 const char *src, size_t srcLen) const
1466 {
1467 srcLen = GetLength(src, srcLen);
1468 if ( srcLen == wxNO_LEN )
1469 return wxCONV_FAILED;
1470
1471 srcLen /= BYTES_PER_CHAR;
1472
1473 if ( dst )
1474 {
1475 if ( dstLen < srcLen )
1476 return wxCONV_FAILED;
1477
1478 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1479 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1480 {
1481 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1482 }
1483 }
1484
1485 return srcLen;
1486 }
1487
1488 size_t
1489 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1490 const wchar_t *src, size_t srcLen) const
1491 {
1492 if ( srcLen == wxNO_LEN )
1493 srcLen = wxWcslen(src) + 1;
1494
1495 srcLen *= BYTES_PER_CHAR;
1496
1497 if ( dst )
1498 {
1499 if ( dstLen < srcLen )
1500 return wxCONV_FAILED;
1501
1502 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1503 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1504 {
1505 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1506 }
1507 }
1508
1509 return srcLen;
1510 }
1511
1512 #else // !WC_UTF16: wchar_t is UTF-32
1513
1514 // ----------------------------------------------------------------------------
1515 // conversions without endianness change
1516 // ----------------------------------------------------------------------------
1517
1518 size_t
1519 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1520 const char *src, size_t srcLen) const
1521 {
1522 srcLen = GetLength(src, srcLen);
1523 if ( srcLen == wxNO_LEN )
1524 return wxCONV_FAILED;
1525
1526 const size_t inLen = srcLen / BYTES_PER_CHAR;
1527 if ( !dst )
1528 {
1529 // optimization: return maximal space which could be needed for this
1530 // string even if the real size could be smaller if the buffer contains
1531 // any surrogates
1532 return inLen;
1533 }
1534
1535 size_t outLen = 0;
1536 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1537 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1538 {
1539 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1540 if ( !inBuff )
1541 return wxCONV_FAILED;
1542
1543 if ( ++outLen > dstLen )
1544 return wxCONV_FAILED;
1545
1546 *dst++ = ch;
1547 }
1548
1549
1550 return outLen;
1551 }
1552
1553 size_t
1554 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1555 const wchar_t *src, size_t srcLen) const
1556 {
1557 if ( srcLen == wxNO_LEN )
1558 srcLen = wxWcslen(src) + 1;
1559
1560 size_t outLen = 0;
1561 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1562 for ( size_t n = 0; n < srcLen; n++ )
1563 {
1564 wxUint16 cc[2];
1565 const size_t numChars = encode_utf16(*src++, cc);
1566 if ( numChars == wxCONV_FAILED )
1567 return wxCONV_FAILED;
1568
1569 outLen += numChars * BYTES_PER_CHAR;
1570 if ( outBuff )
1571 {
1572 if ( outLen > dstLen )
1573 return wxCONV_FAILED;
1574
1575 *outBuff++ = cc[0];
1576 if ( numChars == 2 )
1577 {
1578 // second character of a surrogate
1579 *outBuff++ = cc[1];
1580 }
1581 }
1582 }
1583
1584 return outLen;
1585 }
1586
1587 // ----------------------------------------------------------------------------
1588 // endian-reversing conversions
1589 // ----------------------------------------------------------------------------
1590
1591 size_t
1592 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1593 const char *src, size_t srcLen) const
1594 {
1595 srcLen = GetLength(src, srcLen);
1596 if ( srcLen == wxNO_LEN )
1597 return wxCONV_FAILED;
1598
1599 const size_t inLen = srcLen / BYTES_PER_CHAR;
1600 if ( !dst )
1601 {
1602 // optimization: return maximal space which could be needed for this
1603 // string even if the real size could be smaller if the buffer contains
1604 // any surrogates
1605 return inLen;
1606 }
1607
1608 size_t outLen = 0;
1609 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1610 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1611 {
1612 wxUint32 ch;
1613 wxUint16 tmp[2];
1614
1615 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1616 inBuff++;
1617 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1618
1619 const size_t numChars = decode_utf16(tmp, ch);
1620 if ( numChars == wxCONV_FAILED )
1621 return wxCONV_FAILED;
1622
1623 if ( numChars == 2 )
1624 inBuff++;
1625
1626 if ( ++outLen > dstLen )
1627 return wxCONV_FAILED;
1628
1629 *dst++ = ch;
1630 }
1631
1632
1633 return outLen;
1634 }
1635
1636 size_t
1637 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1638 const wchar_t *src, size_t srcLen) const
1639 {
1640 if ( srcLen == wxNO_LEN )
1641 srcLen = wxWcslen(src) + 1;
1642
1643 size_t outLen = 0;
1644 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1645 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1646 {
1647 wxUint16 cc[2];
1648 const size_t numChars = encode_utf16(*src, cc);
1649 if ( numChars == wxCONV_FAILED )
1650 return wxCONV_FAILED;
1651
1652 outLen += numChars * BYTES_PER_CHAR;
1653 if ( outBuff )
1654 {
1655 if ( outLen > dstLen )
1656 return wxCONV_FAILED;
1657
1658 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1659 if ( numChars == 2 )
1660 {
1661 // second character of a surrogate
1662 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1663 }
1664 }
1665 }
1666
1667 return outLen;
1668 }
1669
1670 #endif // WC_UTF16/!WC_UTF16
1671
1672
1673 // ============================================================================
1674 // UTF-32
1675 // ============================================================================
1676
1677 #ifdef WORDS_BIGENDIAN
1678 #define wxMBConvUTF32straight wxMBConvUTF32BE
1679 #define wxMBConvUTF32swap wxMBConvUTF32LE
1680 #else
1681 #define wxMBConvUTF32swap wxMBConvUTF32BE
1682 #define wxMBConvUTF32straight wxMBConvUTF32LE
1683 #endif
1684
1685
1686 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1687 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1688
1689 /* static */
1690 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1691 {
1692 if ( srcLen == wxNO_LEN )
1693 {
1694 // count the number of bytes in input, including the trailing NULs
1695 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1696 for ( srcLen = 1; *inBuff++; srcLen++ )
1697 ;
1698
1699 srcLen *= BYTES_PER_CHAR;
1700 }
1701 else // we already have the length
1702 {
1703 // we can only convert an entire number of UTF-32 characters
1704 if ( srcLen % BYTES_PER_CHAR )
1705 return wxCONV_FAILED;
1706 }
1707
1708 return srcLen;
1709 }
1710
1711 // case when in-memory representation is UTF-16
1712 #ifdef WC_UTF16
1713
1714 // ----------------------------------------------------------------------------
1715 // conversions without endianness change
1716 // ----------------------------------------------------------------------------
1717
1718 size_t
1719 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1720 const char *src, size_t srcLen) const
1721 {
1722 srcLen = GetLength(src, srcLen);
1723 if ( srcLen == wxNO_LEN )
1724 return wxCONV_FAILED;
1725
1726 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1727 const size_t inLen = srcLen / BYTES_PER_CHAR;
1728 size_t outLen = 0;
1729 for ( size_t n = 0; n < inLen; n++ )
1730 {
1731 wxUint16 cc[2];
1732 const size_t numChars = encode_utf16(*inBuff++, cc);
1733 if ( numChars == wxCONV_FAILED )
1734 return wxCONV_FAILED;
1735
1736 outLen += numChars;
1737 if ( dst )
1738 {
1739 if ( outLen > dstLen )
1740 return wxCONV_FAILED;
1741
1742 *dst++ = cc[0];
1743 if ( numChars == 2 )
1744 {
1745 // second character of a surrogate
1746 *dst++ = cc[1];
1747 }
1748 }
1749 }
1750
1751 return outLen;
1752 }
1753
1754 size_t
1755 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1756 const wchar_t *src, size_t srcLen) const
1757 {
1758 if ( srcLen == wxNO_LEN )
1759 srcLen = wxWcslen(src) + 1;
1760
1761 if ( !dst )
1762 {
1763 // optimization: return maximal space which could be needed for this
1764 // string instead of the exact amount which could be less if there are
1765 // any surrogates in the input
1766 //
1767 // we consider that surrogates are rare enough to make it worthwhile to
1768 // avoid running the loop below at the cost of slightly extra memory
1769 // consumption
1770 return srcLen * BYTES_PER_CHAR;
1771 }
1772
1773 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1774 size_t outLen = 0;
1775 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1776 {
1777 const wxUint32 ch = wxDecodeSurrogate(&src);
1778 if ( !src )
1779 return wxCONV_FAILED;
1780
1781 outLen += BYTES_PER_CHAR;
1782
1783 if ( outLen > dstLen )
1784 return wxCONV_FAILED;
1785
1786 *outBuff++ = ch;
1787 }
1788
1789 return outLen;
1790 }
1791
1792 // ----------------------------------------------------------------------------
1793 // endian-reversing conversions
1794 // ----------------------------------------------------------------------------
1795
1796 size_t
1797 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1798 const char *src, size_t srcLen) const
1799 {
1800 srcLen = GetLength(src, srcLen);
1801 if ( srcLen == wxNO_LEN )
1802 return wxCONV_FAILED;
1803
1804 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1805 const size_t inLen = srcLen / BYTES_PER_CHAR;
1806 size_t outLen = 0;
1807 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1808 {
1809 wxUint16 cc[2];
1810 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1811 if ( numChars == wxCONV_FAILED )
1812 return wxCONV_FAILED;
1813
1814 outLen += numChars;
1815 if ( dst )
1816 {
1817 if ( outLen > dstLen )
1818 return wxCONV_FAILED;
1819
1820 *dst++ = cc[0];
1821 if ( numChars == 2 )
1822 {
1823 // second character of a surrogate
1824 *dst++ = cc[1];
1825 }
1826 }
1827 }
1828
1829 return outLen;
1830 }
1831
1832 size_t
1833 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1834 const wchar_t *src, size_t srcLen) const
1835 {
1836 if ( srcLen == wxNO_LEN )
1837 srcLen = wxWcslen(src) + 1;
1838
1839 if ( !dst )
1840 {
1841 // optimization: return maximal space which could be needed for this
1842 // string instead of the exact amount which could be less if there are
1843 // any surrogates in the input
1844 //
1845 // we consider that surrogates are rare enough to make it worthwhile to
1846 // avoid running the loop below at the cost of slightly extra memory
1847 // consumption
1848 return srcLen*BYTES_PER_CHAR;
1849 }
1850
1851 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1852 size_t outLen = 0;
1853 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1854 {
1855 const wxUint32 ch = wxDecodeSurrogate(&src);
1856 if ( !src )
1857 return wxCONV_FAILED;
1858
1859 outLen += BYTES_PER_CHAR;
1860
1861 if ( outLen > dstLen )
1862 return wxCONV_FAILED;
1863
1864 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1865 }
1866
1867 return outLen;
1868 }
1869
1870 #else // !WC_UTF16: wchar_t is UTF-32
1871
1872 // ----------------------------------------------------------------------------
1873 // conversions without endianness change
1874 // ----------------------------------------------------------------------------
1875
1876 size_t
1877 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1878 const char *src, size_t srcLen) const
1879 {
1880 // use memcpy() as it should be much faster than hand-written loop
1881 srcLen = GetLength(src, srcLen);
1882 if ( srcLen == wxNO_LEN )
1883 return wxCONV_FAILED;
1884
1885 const size_t inLen = srcLen/BYTES_PER_CHAR;
1886 if ( dst )
1887 {
1888 if ( dstLen < inLen )
1889 return wxCONV_FAILED;
1890
1891 memcpy(dst, src, srcLen);
1892 }
1893
1894 return inLen;
1895 }
1896
1897 size_t
1898 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1899 const wchar_t *src, size_t srcLen) const
1900 {
1901 if ( srcLen == wxNO_LEN )
1902 srcLen = wxWcslen(src) + 1;
1903
1904 srcLen *= BYTES_PER_CHAR;
1905
1906 if ( dst )
1907 {
1908 if ( dstLen < srcLen )
1909 return wxCONV_FAILED;
1910
1911 memcpy(dst, src, srcLen);
1912 }
1913
1914 return srcLen;
1915 }
1916
1917 // ----------------------------------------------------------------------------
1918 // endian-reversing conversions
1919 // ----------------------------------------------------------------------------
1920
1921 size_t
1922 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1923 const char *src, size_t srcLen) const
1924 {
1925 srcLen = GetLength(src, srcLen);
1926 if ( srcLen == wxNO_LEN )
1927 return wxCONV_FAILED;
1928
1929 srcLen /= BYTES_PER_CHAR;
1930
1931 if ( dst )
1932 {
1933 if ( dstLen < srcLen )
1934 return wxCONV_FAILED;
1935
1936 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1937 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1938 {
1939 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1940 }
1941 }
1942
1943 return srcLen;
1944 }
1945
1946 size_t
1947 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1948 const wchar_t *src, size_t srcLen) const
1949 {
1950 if ( srcLen == wxNO_LEN )
1951 srcLen = wxWcslen(src) + 1;
1952
1953 srcLen *= BYTES_PER_CHAR;
1954
1955 if ( dst )
1956 {
1957 if ( dstLen < srcLen )
1958 return wxCONV_FAILED;
1959
1960 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1961 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1962 {
1963 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1964 }
1965 }
1966
1967 return srcLen;
1968 }
1969
1970 #endif // WC_UTF16/!WC_UTF16
1971
1972
1973 // ============================================================================
1974 // The classes doing conversion using the iconv_xxx() functions
1975 // ============================================================================
1976
1977 #ifdef HAVE_ICONV
1978
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the return value of iconv() and the
// number of input bytes it left unconverted and is true if the call failed.
// The arguments are parenthesized so that arbitrary expressions can be passed.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type which the second
// parameter of iconv() is declared with (ICONV_CONST is defined elsewhere)
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value denoting an invalid conversion descriptor
#define ICONV_T_INVALID ((iconv_t)-1)
1997
1998 #if SIZEOF_WCHAR_T == 4
1999 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2000 #define WC_ENC wxFONTENCODING_UTF32
2001 #elif SIZEOF_WCHAR_T == 2
2002 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2003 #define WC_ENC wxFONTENCODING_UTF16
2004 #else // sizeof(wchar_t) != 2 nor 4
2005 // does this ever happen?
2006 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2007 #endif
2008
2009 // ----------------------------------------------------------------------------
2010 // wxMBConv_iconv: encapsulates an iconv character set
2011 // ----------------------------------------------------------------------------
2012
2013 class wxMBConv_iconv : public wxMBConv
2014 {
2015 public:
2016 wxMBConv_iconv(const char *name);
2017 virtual ~wxMBConv_iconv();
2018
2019 // implement base class virtual methods
2020 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2021 const char *src, size_t srcLen = wxNO_LEN) const;
2022 virtual size_t FromWChar(char *dst, size_t dstLen,
2023 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2024 virtual size_t GetMBNulLen() const;
2025
2026 #if wxUSE_UNICODE_UTF8
2027 virtual bool IsUTF8() const;
2028 #endif
2029
2030 virtual wxMBConv *Clone() const
2031 {
2032 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2033 p->m_minMBCharWidth = m_minMBCharWidth;
2034 return p;
2035 }
2036
2037 bool IsOk() const
2038 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2039
2040 protected:
2041 // the iconv handlers used to translate from multibyte
2042 // to wide char and in the other direction
2043 iconv_t m2w,
2044 w2m;
2045
2046 #if wxUSE_THREADS
2047 // guards access to m2w and w2m objects
2048 wxMutex m_iconvMutex;
2049 #endif
2050
2051 private:
2052 // the name (for iconv_open()) of a wide char charset -- if none is
2053 // available on this machine, it will remain NULL
2054 static wxString ms_wcCharsetName;
2055
2056 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2057 // different endian-ness than the native one
2058 static bool ms_wcNeedsSwap;
2059
2060
2061 // name of the encoding handled by this conversion
2062 wxString m_name;
2063
2064 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2065 // initially
2066 size_t m_minMBCharWidth;
2067 };
2068
2069 // make the constructor available for unit testing
2070 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2071 {
2072 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2073 if ( !result->IsOk() )
2074 {
2075 delete result;
2076 return 0;
2077 }
2078
2079 return result;
2080 }
2081
2082 wxString wxMBConv_iconv::ms_wcCharsetName;
2083 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2084
2085 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2086 : m_name(name)
2087 {
2088 m_minMBCharWidth = 0;
2089
2090 // check for charset that represents wchar_t:
2091 if ( ms_wcCharsetName.empty() )
2092 {
2093 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2094
2095 #if wxUSE_FONTMAP
2096 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2097 #else // !wxUSE_FONTMAP
2098 static const wxChar *names_static[] =
2099 {
2100 #if SIZEOF_WCHAR_T == 4
2101 _T("UCS-4"),
2102 #elif SIZEOF_WCHAR_T = 2
2103 _T("UCS-2"),
2104 #endif
2105 NULL
2106 };
2107 const wxChar **names = names_static;
2108 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2109
2110 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2111 {
2112 const wxString nameCS(*names);
2113
2114 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2115 wxString nameXE(nameCS);
2116
2117 #ifdef WORDS_BIGENDIAN
2118 nameXE += _T("BE");
2119 #else // little endian
2120 nameXE += _T("LE");
2121 #endif
2122
2123 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2124 nameXE.c_str());
2125
2126 m2w = iconv_open(nameXE.ToAscii(), name);
2127 if ( m2w == ICONV_T_INVALID )
2128 {
2129 // try charset w/o bytesex info (e.g. "UCS4")
2130 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2131 nameCS.c_str());
2132 m2w = iconv_open(nameCS.ToAscii(), name);
2133
2134 // and check for bytesex ourselves:
2135 if ( m2w != ICONV_T_INVALID )
2136 {
2137 char buf[2], *bufPtr;
2138 wchar_t wbuf[2];
2139 size_t insz, outsz;
2140 size_t res;
2141
2142 buf[0] = 'A';
2143 buf[1] = 0;
2144 wbuf[0] = 0;
2145 insz = 2;
2146 outsz = SIZEOF_WCHAR_T * 2;
2147 char* wbufPtr = (char*)wbuf;
2148 bufPtr = buf;
2149
2150 res = iconv(
2151 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2152 &wbufPtr, &outsz);
2153
2154 if (ICONV_FAILED(res, insz))
2155 {
2156 wxLogLastError(wxT("iconv"));
2157 wxLogError(_("Conversion to charset '%s' doesn't work."),
2158 nameCS.c_str());
2159 }
2160 else // ok, can convert to this encoding, remember it
2161 {
2162 ms_wcCharsetName = nameCS;
2163 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2164 }
2165 }
2166 }
2167 else // use charset not requiring byte swapping
2168 {
2169 ms_wcCharsetName = nameXE;
2170 }
2171 }
2172
2173 wxLogTrace(TRACE_STRCONV,
2174 wxT("iconv wchar_t charset is \"%s\"%s"),
2175 ms_wcCharsetName.empty() ? wxString("<none>")
2176 : ms_wcCharsetName,
2177 ms_wcNeedsSwap ? _T(" (needs swap)")
2178 : _T(""));
2179 }
2180 else // we already have ms_wcCharsetName
2181 {
2182 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2183 }
2184
2185 if ( ms_wcCharsetName.empty() )
2186 {
2187 w2m = ICONV_T_INVALID;
2188 }
2189 else
2190 {
2191 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2192 if ( w2m == ICONV_T_INVALID )
2193 {
2194 wxLogTrace(TRACE_STRCONV,
2195 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2196 ms_wcCharsetName.c_str(), name);
2197 }
2198 }
2199 }
2200
2201 wxMBConv_iconv::~wxMBConv_iconv()
2202 {
2203 if ( m2w != ICONV_T_INVALID )
2204 iconv_close(m2w);
2205 if ( w2m != ICONV_T_INVALID )
2206 iconv_close(w2m);
2207 }
2208
2209 size_t
2210 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2211 const char *src, size_t srcLen) const
2212 {
2213 if ( srcLen == wxNO_LEN )
2214 {
2215 // find the string length: notice that must be done differently for
2216 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2217 // consecutive NULs
2218 const size_t nulLen = GetMBNulLen();
2219 switch ( nulLen )
2220 {
2221 default:
2222 return wxCONV_FAILED;
2223
2224 case 1:
2225 srcLen = strlen(src); // arguably more optimized than our version
2226 break;
2227
2228 case 2:
2229 case 4:
2230 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2231 // but they also have to start at character boundary and not
2232 // span two adjacent characters
2233 const char *p;
2234 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2235 ;
2236 srcLen = p - src;
2237 break;
2238 }
2239
2240 // when we're determining the length of the string ourselves we count
2241 // the terminating NUL(s) as part of it and always NUL-terminate the
2242 // output
2243 srcLen += nulLen;
2244 }
2245
2246 // we express length in the number of (wide) characters but iconv always
2247 // counts buffer sizes it in bytes
2248 dstLen *= SIZEOF_WCHAR_T;
2249
2250 #if wxUSE_THREADS
2251 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2252 // Unfortunately there are a couple of global wxCSConv objects such as
2253 // wxConvLocal that are used all over wx code, so we have to make sure
2254 // the handle is used by at most one thread at the time. Otherwise
2255 // only a few wx classes would be safe to use from non-main threads
2256 // as MB<->WC conversion would fail "randomly".
2257 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2258 #endif // wxUSE_THREADS
2259
2260 size_t res, cres;
2261 const char *pszPtr = src;
2262
2263 if ( dst )
2264 {
2265 char* bufPtr = (char*)dst;
2266
2267 // have destination buffer, convert there
2268 size_t dstLenOrig = dstLen;
2269 cres = iconv(m2w,
2270 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2271 &bufPtr, &dstLen);
2272
2273 // convert the number of bytes converted as returned by iconv to the
2274 // number of (wide) characters converted that we need
2275 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2276
2277 if (ms_wcNeedsSwap)
2278 {
2279 // convert to native endianness
2280 for ( unsigned i = 0; i < res; i++ )
2281 dst[i] = WC_BSWAP(dst[i]);
2282 }
2283 }
2284 else // no destination buffer
2285 {
2286 // convert using temp buffer to calculate the size of the buffer needed
2287 wchar_t tbuf[8];
2288 res = 0;
2289
2290 do
2291 {
2292 char* bufPtr = (char*)tbuf;
2293 dstLen = 8 * SIZEOF_WCHAR_T;
2294
2295 cres = iconv(m2w,
2296 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2297 &bufPtr, &dstLen );
2298
2299 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2300 }
2301 while ((cres == (size_t)-1) && (errno == E2BIG));
2302 }
2303
2304 if (ICONV_FAILED(cres, srcLen))
2305 {
2306 //VS: it is ok if iconv fails, hence trace only
2307 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2308 return wxCONV_FAILED;
2309 }
2310
2311 return res;
2312 }
2313
2314 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2315 const wchar_t *src, size_t srcLen) const
2316 {
2317 #if wxUSE_THREADS
2318 // NB: explained in MB2WC
2319 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2320 #endif
2321
2322 if ( srcLen == wxNO_LEN )
2323 srcLen = wxWcslen(src) + 1;
2324
2325 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2326 size_t outbuflen = dstLen;
2327 size_t res, cres;
2328
2329 wchar_t *tmpbuf = 0;
2330
2331 if (ms_wcNeedsSwap)
2332 {
2333 // need to copy to temp buffer to switch endianness
2334 // (doing WC_BSWAP twice on the original buffer won't help, as it
2335 // could be in read-only memory, or be accessed in some other thread)
2336 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2337 for ( size_t i = 0; i < srcLen; i++ )
2338 tmpbuf[i] = WC_BSWAP(src[i]);
2339
2340 tmpbuf[srcLen] = L'\0';
2341 src = tmpbuf;
2342 }
2343
2344 char* inbuf = (char*)src;
2345 if ( dst )
2346 {
2347 // have destination buffer, convert there
2348 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2349
2350 res = dstLen - outbuflen;
2351 }
2352 else // no destination buffer
2353 {
2354 // convert using temp buffer to calculate the size of the buffer needed
2355 char tbuf[16];
2356 res = 0;
2357 do
2358 {
2359 dst = tbuf;
2360 outbuflen = 16;
2361
2362 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2363
2364 res += 16 - outbuflen;
2365 }
2366 while ((cres == (size_t)-1) && (errno == E2BIG));
2367 }
2368
2369 if (ms_wcNeedsSwap)
2370 {
2371 free(tmpbuf);
2372 }
2373
2374 if (ICONV_FAILED(cres, inbuflen))
2375 {
2376 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2377 return wxCONV_FAILED;
2378 }
2379
2380 return res;
2381 }
2382
2383 size_t wxMBConv_iconv::GetMBNulLen() const
2384 {
2385 if ( m_minMBCharWidth == 0 )
2386 {
2387 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2388
2389 #if wxUSE_THREADS
2390 // NB: explained in MB2WC
2391 wxMutexLocker lock(self->m_iconvMutex);
2392 #endif
2393
2394 const wchar_t *wnul = L"";
2395 char buf[8]; // should be enough for NUL in any encoding
2396 size_t inLen = sizeof(wchar_t),
2397 outLen = WXSIZEOF(buf);
2398 char *inBuff = (char *)wnul;
2399 char *outBuff = buf;
2400 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2401 {
2402 self->m_minMBCharWidth = (size_t)-1;
2403 }
2404 else // ok
2405 {
2406 self->m_minMBCharWidth = outBuff - buf;
2407 }
2408 }
2409
2410 return m_minMBCharWidth;
2411 }
2412
2413 #if wxUSE_UNICODE_UTF8
2414 bool wxMBConv_iconv::IsUTF8() const
2415 {
2416 return wxStricmp(m_name, "UTF-8") == 0 ||
2417 wxStricmp(m_name, "UTF8") == 0;
2418 }
2419 #endif
2420
2421 #endif // HAVE_ICONV
2422
2423
2424 // ============================================================================
2425 // Win32 conversion classes
2426 // ============================================================================
2427
2428 #ifdef wxHAVE_WIN32_MB2WC
2429
2430 // from utils.cpp
// these functions are used by the wxMBConv_win32 constructors below to obtain
// the code page from a charset name or a wxFontEncoding value
// NOTE(review): presumably they return -1 for an unknown charset/encoding, as
// this is what wxMBConv_win32::IsOk() tests for -- confirm in utils.cpp
2431 #if wxUSE_FONTMAP
2432 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2433 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2434 #endif
2435
// wxMBConv implementation based on the Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions, called with the code page stored in
// m_CodePage
2436 class wxMBConv_win32 : public wxMBConv
2437 {
2438 public:
// default ctor uses CP_ACP as the code page
2439 wxMBConv_win32()
2440 {
2441 m_CodePage = CP_ACP;
2442 m_minMBCharWidth = 0;
2443 }
2444
// copy ctor: copies the code page and the cached NUL length
2445 wxMBConv_win32(const wxMBConv_win32& conv)
2446 : wxMBConv()
2447 {
2448 m_CodePage = conv.m_CodePage;
2449 m_minMBCharWidth = conv.m_minMBCharWidth;
2450 }
2451
2452 #if wxUSE_FONTMAP
// ctor from the charset name: use IsOk() to check if a code page was found
2453 wxMBConv_win32(const char* name)
2454 {
2455 m_CodePage = wxCharsetToCodepage(name);
2456 m_minMBCharWidth = 0;
2457 }
2458
// ctor from the encoding: use IsOk() to check if a code page was found
2459 wxMBConv_win32(wxFontEncoding encoding)
2460 {
2461 m_CodePage = wxEncodingToCodepage(encoding);
2462 m_minMBCharWidth = 0;
2463 }
2464 #endif // wxUSE_FONTMAP
2465
// Convert the NUL-terminated multibyte string psz to wide characters.
//
// buf is the output buffer of n wide characters or NULL to only compute
// the length. Returns the length of the converted string not counting the
// terminating NUL or wxCONV_FAILED if the conversion failed or was lossy.
2466 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2467 {
2468 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2469 // the behaviour is not compatible with the Unix version (using iconv)
2470 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2471 // wouldn't work if reading an incomplete MB char didn't result in an
2472 // error
2473 //
2474 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2475 // Win XP or newer and it is not supported for UTF-[78] so we always
2476 // use our own conversions in this case. See
2477 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2478 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2479 if ( m_CodePage == CP_UTF8 )
2480 {
2481 return wxMBConvUTF8().MB2WC(buf, psz, n);
2482 }
2483
2484 if ( m_CodePage == CP_UTF7 )
2485 {
2486 return wxMBConvUTF7().MB2WC(buf, psz, n);
2487 }
2488
// only ask for strict error checking for the code pages and systems
// for which this flag can be used, see the comment above
2489 int flags = 0;
2490 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2491 IsAtLeastWin2kSP4() )
2492 {
2493 flags = MB_ERR_INVALID_CHARS;
2494 }
2495
2496 const size_t len = ::MultiByteToWideChar
2497 (
2498 m_CodePage, // code page
2499 flags, // flags: fail on error
2500 psz, // input string
2501 -1, // its length (NUL-terminated)
2502 buf, // output string
2503 buf ? n : 0 // size of output buffer
2504 );
2505 if ( !len )
2506 {
2507 // function totally failed
2508 return wxCONV_FAILED;
2509 }
2510
2511 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2512 // check if we succeeded, by doing a double trip:
2513 if ( !flags && buf )
2514 {
2515 const size_t mbLen = strlen(psz);
2516 wxCharBuffer mbBuf(mbLen);
2517 if ( ::WideCharToMultiByte
2518 (
2519 m_CodePage,
2520 0,
2521 buf,
2522 -1,
2523 mbBuf.data(),
2524 mbLen + 1, // size in bytes, not length
2525 NULL,
2526 NULL
2527 ) == 0 ||
2528 strcmp(mbBuf, psz) != 0 )
2529 {
2530 // we didn't obtain the same thing we started from, hence
2531 // the conversion was lossy and we consider that it failed
2532 return wxCONV_FAILED;
2533 }
2534 }
2535
2536 // note that it returns count of written chars for buf != NULL and size
2537 // of the needed buffer for buf == NULL so in either case the length of
2538 // the string (which never includes the terminating NUL) is one less
2539 return len - 1;
2540 }
2541
// Convert the NUL-terminated wide string pwz to the multibyte encoding.
//
// buf is the output buffer of n bytes or NULL to only compute the length.
// Returns the length of the converted string not counting the terminating
// NUL or wxCONV_FAILED if the conversion failed or was lossy.
2542 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2543 {
2544 /*
2545 we have a problem here: by default, WideCharToMultiByte() may
2546 replace characters unrepresentable in the target code page with bad
2547 quality approximations such as turning "1/2" symbol (U+00BD) into
2548 "1" for the code pages which don't have it and we, obviously, want
2549 to avoid this at any price
2550
2551 the trouble is that this function does it _silently_, i.e. it won't
2552 even tell us whether it did or not... Win98/2000 and higher provide
2553 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2554 we have to resort to a round trip, i.e. check that converting back
2555 results in the same string -- this is, of course, expensive but
2556 otherwise we simply can't be sure to not garble the data.
2557 */
2558
2559 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2560 // it doesn't work with CJK encodings (which we test for rather roughly
2561 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2562 // supporting it
2563 BOOL usedDef wxDUMMY_INITIALIZE(false);
2564 BOOL *pUsedDef;
2565 int flags;
2566 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2567 {
2568 // it's our lucky day
2569 flags = WC_NO_BEST_FIT_CHARS;
2570 pUsedDef = &usedDef;
2571 }
2572 else // old system or unsupported encoding
2573 {
2574 flags = 0;
2575 pUsedDef = NULL;
2576 }
2577
2578 const size_t len = ::WideCharToMultiByte
2579 (
2580 m_CodePage, // code page
2581 flags, // either none or no best fit
2582 pwz, // input string
2583 -1, // it is (wide) NUL-terminated
2584 buf, // output buffer
2585 buf ? n : 0, // and its size
2586 NULL, // default "replacement" char
2587 pUsedDef // [out] was it used?
2588 );
2589
2590 if ( !len )
2591 {
2592 // function totally failed
2593 return wxCONV_FAILED;
2594 }
2595
2596 // we did something, check if we really succeeded
2597 if ( flags )
2598 {
2599 // check if the conversion failed, i.e. if any replacements
2600 // were done
2601 if ( usedDef )
2602 return wxCONV_FAILED;
2603 }
2604 else // we must resort to double tripping...
2605 {
2606 // first we need to ensure that we really have the MB data: this is
2607 // not the case if we're called with NULL buffer, in which case we
2608 // need to do the conversion yet again
2609 wxCharBuffer bufDef;
2610 if ( !buf )
2611 {
2612 bufDef = wxCharBuffer(len);
2613 buf = bufDef.data();
2614 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2615 buf, len, NULL, NULL) )
2616 return wxCONV_FAILED;
2617 }
2618
// n is reused here as the size of the wide buffer for the conversion
// back: if it wasn't specified, use the length of the original string
2619 if ( !n )
2620 n = wcslen(pwz);
2621 wxWCharBuffer wcBuf(n);
2622 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2623 wcscmp(wcBuf, pwz) != 0 )
2624 {
2625 // we didn't obtain the same thing we started from, hence
2626 // the conversion was lossy and we consider that it failed
2627 return wxCONV_FAILED;
2628 }
2629 }
2630
2631 // see the comment above for the reason of "len - 1"
2632 return len - 1;
2633 }
2634
// Return the number of bytes used for NUL in this code page: it is
// computed on first call by asking WideCharToMultiByte() for the size
// needed to convert a single wide NUL and stored in m_minMBCharWidth;
// (size_t)-1 is stored and returned if the result is not 1, 2 or 4
2635 virtual size_t GetMBNulLen() const
2636 {
2637 if ( m_minMBCharWidth == 0 )
2638 {
2639 int len = ::WideCharToMultiByte
2640 (
2641 m_CodePage, // code page
2642 0, // no flags
2643 L"", // input string
2644 1, // translate just the NUL
2645 NULL, // output buffer
2646 0, // and its size
2647 NULL, // no replacement char
2648 NULL // [out] don't care if it was used
2649 );
2650
2651 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2652 switch ( len )
2653 {
2654 default:
2655 wxLogDebug(_T("Unexpected NUL length %d"), len);
2656 self->m_minMBCharWidth = (size_t)-1;
2657 break;
2658
// the conversion failed: same result as above but without the debug message
2659 case 0:
2660 self->m_minMBCharWidth = (size_t)-1;
2661 break;
2662
2663 case 1:
2664 case 2:
2665 case 4:
2666 self->m_minMBCharWidth = len;
2667 break;
2668 }
2669 }
2670
2671 return m_minMBCharWidth;
2672 }
2673
2674 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2675
// the object can't be used if its code page is -1
2676 bool IsOk() const { return m_CodePage != -1; }
2677
2678 private:
// Return true if WC_NO_BEST_FIT_CHARS can be used with
// WideCharToMultiByte() under the system we're running on; the OS version
// is checked only once and the result is kept in a static variable
2679 static bool CanUseNoBestFit()
2680 {
2681 static int s_isWin98Or2k = -1;
2682
2683 if ( s_isWin98Or2k == -1 )
2684 {
2685 int verMaj, verMin;
2686 switch ( wxGetOsVersion(&verMaj, &verMin) )
2687 {
2688 case wxOS_WINDOWS_9X:
2689 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2690 break;
2691
2692 case wxOS_WINDOWS_NT:
2693 s_isWin98Or2k = verMaj >= 5;
2694 break;
2695
2696 default:
2697 // unknown: be conservative by default
2698 s_isWin98Or2k = 0;
2699 break;
2700 }
2701
2702 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2703 }
2704
2705 return s_isWin98Or2k == 1;
2706 }
2707
// Return true if we're running under Windows 2000 SP4 or later, i.e. if
// MB_ERR_INVALID_CHARS can be used in MB2WC(); always false under
// Windows CE, otherwise the version is checked only once and the result
// is kept in a static variable
2708 static bool IsAtLeastWin2kSP4()
2709 {
2710 #ifdef __WXWINCE__
2711 return false;
2712 #else
2713 static int s_isAtLeastWin2kSP4 = -1;
2714
2715 if ( s_isAtLeastWin2kSP4 == -1 )
2716 {
2717 OSVERSIONINFOEX ver;
2718
// NB: the return value of GetVersionEx() is not checked, if it fails
// the zeroed out version fields make the test below evaluate to false
2719 memset(&ver, 0, sizeof(ver));
2720 ver.dwOSVersionInfoSize = sizeof(ver);
2721 GetVersionEx((OSVERSIONINFO*)&ver);
2722
2723 s_isAtLeastWin2kSP4 =
2724 ((ver.dwMajorVersion > 5) || // Vista+
2725 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2726 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2727 ver.wServicePackMajor >= 4)) // 2000 SP4+
2728 ? 1 : 0;
2729 }
2730
2731 return s_isAtLeastWin2kSP4 == 1;
2732 #endif
2733 }
2734
2735
2736 // the code page we're working with
2737 long m_CodePage;
2738
2739 // cached result of GetMBNulLen(), set to 0 initially meaning
2740 // "unknown"
2741 size_t m_minMBCharWidth;
2742 };
2743
2744 #endif // wxHAVE_WIN32_MB2WC
2745
2746
2747 // ============================================================================
2748 // wxEncodingConverter based conversion classes
2749 // ============================================================================
2750
2751 #if wxUSE_FONTMAP
2752
2753 class wxMBConv_wxwin : public wxMBConv
2754 {
2755 private:
2756 void Init()
2757 {
2758 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2759 // The wxMBConv_cf class does a better job.
2760 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2761 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2762 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2763 }
2764
2765 public:
2766 // temporarily just use wxEncodingConverter stuff,
2767 // so that it works while a better implementation is built
2768 wxMBConv_wxwin(const char* name)
2769 {
2770 if (name)
2771 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2772 else
2773 m_enc = wxFONTENCODING_SYSTEM;
2774
2775 Init();
2776 }
2777
2778 wxMBConv_wxwin(wxFontEncoding enc)
2779 {
2780 m_enc = enc;
2781
2782 Init();
2783 }
2784
2785 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2786 {
2787 size_t inbuf = strlen(psz);
2788 if (buf)
2789 {
2790 if (!m2w.Convert(psz, buf))
2791 return wxCONV_FAILED;
2792 }
2793 return inbuf;
2794 }
2795
2796 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2797 {
2798 const size_t inbuf = wxWcslen(psz);
2799 if (buf)
2800 {
2801 if (!w2m.Convert(psz, buf))
2802 return wxCONV_FAILED;
2803 }
2804
2805 return inbuf;
2806 }
2807
2808 virtual size_t GetMBNulLen() const
2809 {
2810 switch ( m_enc )
2811 {
2812 case wxFONTENCODING_UTF16BE:
2813 case wxFONTENCODING_UTF16LE:
2814 return 2;
2815
2816 case wxFONTENCODING_UTF32BE:
2817 case wxFONTENCODING_UTF32LE:
2818 return 4;
2819
2820 default:
2821 return 1;
2822 }
2823 }
2824
2825 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2826
2827 bool IsOk() const { return m_ok; }
2828
2829 public:
2830 wxFontEncoding m_enc;
2831 wxEncodingConverter m2w, w2m;
2832
2833 private:
2834 // were we initialized successfully?
2835 bool m_ok;
2836
2837 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2838 };
2839
2840 // make the constructors available for unit testing
2841 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2842 {
2843 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2844 if ( !result->IsOk() )
2845 {
2846 delete result;
2847 return 0;
2848 }
2849
2850 return result;
2851 }
2852
2853 #endif // wxUSE_FONTMAP
2854
2855 // ============================================================================
2856 // wxCSConv implementation
2857 // ============================================================================
2858
2859 void wxCSConv::Init()
2860 {
2861 m_name = NULL;
2862 m_convReal = NULL;
2863 m_deferred = true;
2864 }
2865
2866 wxCSConv::wxCSConv(const wxString& charset)
2867 {
2868 Init();
2869
2870 if ( !charset.empty() )
2871 {
2872 SetName(charset.ToAscii());
2873 }
2874
2875 #if wxUSE_FONTMAP
2876 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2877 #else
2878 m_encoding = wxFONTENCODING_SYSTEM;
2879 #endif
2880 }
2881
2882 wxCSConv::wxCSConv(wxFontEncoding encoding)
2883 {
2884 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2885 {
2886 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2887
2888 encoding = wxFONTENCODING_SYSTEM;
2889 }
2890
2891 Init();
2892
2893 m_encoding = encoding;
2894 }
2895
2896 wxCSConv::~wxCSConv()
2897 {
2898 Clear();
2899 }
2900
2901 wxCSConv::wxCSConv(const wxCSConv& conv)
2902 : wxMBConv()
2903 {
2904 Init();
2905
2906 SetName(conv.m_name);
2907 m_encoding = conv.m_encoding;
2908 }
2909
2910 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2911 {
2912 Clear();
2913
2914 SetName(conv.m_name);
2915 m_encoding = conv.m_encoding;
2916
2917 return *this;
2918 }
2919
2920 void wxCSConv::Clear()
2921 {
2922 free(m_name);
2923 delete m_convReal;
2924
2925 m_name = NULL;
2926 m_convReal = NULL;
2927 }
2928
2929 void wxCSConv::SetName(const char *charset)
2930 {
2931 if (charset)
2932 {
2933 m_name = wxStrdup(charset);
2934 m_deferred = true;
2935 }
2936 }
2937
2938 #if wxUSE_FONTMAP
2939
// map from the encoding to a charset name (declared by the macro below as
// the wxEncodingNameCache class)
2940 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2941 wxEncodingNameCache );
2942
// cache used by wxCSConv::DoCreate(): for each encoding it stores the name
// with which an iconv converter was successfully created or an empty string
// if iconv failed for all the names of this encoding
2943 static wxEncodingNameCache gs_nameCache;
2944 #endif
2945
// Create the conversion object really doing the work for the charset name
// and/or encoding of this object.
//
// Returns a new object which must be deleted by the caller or NULL if no
// conversion object is needed (ISO8859-1) or if none could be created.
2946 wxMBConv *wxCSConv::DoCreate() const
2947 {
2948 #if wxUSE_FONTMAP
2949 wxLogTrace(TRACE_STRCONV,
2950 wxT("creating conversion for %s"),
2951 (m_name ? m_name
2952 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2953 #endif // wxUSE_FONTMAP
2954
2955 // check for the special case of ASCII or ISO8859-1 charset: as we have
2956 // special knowledge of it anyhow, we don't need to create a special
2957 // conversion object
2958 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2959 m_encoding == wxFONTENCODING_DEFAULT )
2960 {
2961 // don't convert at all
2962 return NULL;
2963 }
2964
2965 // we trust OS to do conversion better than we can so try external
2966 // conversion methods first
2967 //
2968 // the full order is:
2969 // 1. OS conversion (iconv() under Unix or Win32 API)
2970 // 2. hard coded conversions for UTF
2971 // 3. wxEncodingConverter as fall back
2972
2973 // step (1)
2974 #ifdef HAVE_ICONV
// without the font mapper we can only use iconv if we have the charset name
2975 #if !wxUSE_FONTMAP
2976 if ( m_name )
2977 #endif // !wxUSE_FONTMAP
2978 {
2979 #if wxUSE_FONTMAP
2980 wxFontEncoding encoding(m_encoding);
2981 #endif
2982
// try the charset name as is first
2983 if ( m_name )
2984 {
2985 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2986 if ( conv->IsOk() )
2987 return conv;
2988
2989 delete conv;
2990
2991 #if wxUSE_FONTMAP
2992 encoding =
2993 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2994 #endif // wxUSE_FONTMAP
2995 }
2996 #if wxUSE_FONTMAP
2997 {
// then try the name which worked for this encoding the last time
2998 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2999 if ( it != gs_nameCache.end() )
3000 {
// NOTE(review): an empty cached name means that iconv had failed for all
// names of this encoding before; returning NULL here also skips the
// Win32, built-in UTF and wxEncodingConverter fallbacks below which are
// tried the first time, before the failure is cached -- confirm that
// this is intended
3001 if ( it->second.empty() )
3002 return NULL;
3003
3004 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3005 if ( conv->IsOk() )
3006 return conv;
3007
3008 delete conv;
3009 }
3010
// finally try all the names of this encoding in turn
3011 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3012 // CS : in case this does not return valid names (eg for MacRoman)
3013 // encoding got a 'failure' entry in the cache all the same,
3014 // although it just has to be created using a different method, so
3015 // only store failed iconv creation attempts (or perhaps we
3016 // shouldn't do this at all ?)
3017 if ( names[0] != NULL )
3018 {
3019 for ( ; *names; ++names )
3020 {
3021 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3022 // will need changes that will obsolete this
3023 wxString name(*names);
3024 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3025 if ( conv->IsOk() )
3026 {
3027 gs_nameCache[encoding] = *names;
3028 return conv;
3029 }
3030
3031 delete conv;
3032 }
3033
3034 gs_nameCache[encoding] = _T(""); // cache the failure
3035 }
3036 }
3037 #endif // wxUSE_FONTMAP
3038 }
3039 #endif // HAVE_ICONV
3040
3041 #ifdef wxHAVE_WIN32_MB2WC
3042 {
3043 #if wxUSE_FONTMAP
3044 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3045 : new wxMBConv_win32(m_encoding);
3046 if ( conv->IsOk() )
3047 return conv;
3048
3049 delete conv;
3050 #else
// the wxMBConv_win32 ctors taking the name or the encoding are only
// defined if wxUSE_FONTMAP
3051 return NULL;
3052 #endif
3053 }
3054 #endif // wxHAVE_WIN32_MB2WC
3055
3056 #ifdef __DARWIN__
3057 {
3058 // leave UTF16 and UTF32 to the built-ins of wx
3059 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3060 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3061 {
3062 #if wxUSE_FONTMAP
3063 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3064 : new wxMBConv_cf(m_encoding);
3065 #else
3066 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3067 #endif
3068
3069 if ( conv->IsOk() )
3070 return conv;
3071
3072 delete conv;
3073 }
3074 }
3075 #endif // __DARWIN__
3076
3077 // step (2)
3078 wxFontEncoding enc = m_encoding;
3079 #if wxUSE_FONTMAP
3080 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3081 {
3082 // use "false" to suppress interactive dialogs -- we can be called from
3083 // anywhere and popping up a dialog from here is the last thing we want to
3084 // do
3085 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3086 }
3087 #endif // wxUSE_FONTMAP
3088
3089 switch ( enc )
3090 {
3091 case wxFONTENCODING_UTF7:
3092 return new wxMBConvUTF7;
3093
3094 case wxFONTENCODING_UTF8:
3095 return new wxMBConvUTF8;
3096
3097 case wxFONTENCODING_UTF16BE:
3098 return new wxMBConvUTF16BE;
3099
3100 case wxFONTENCODING_UTF16LE:
3101 return new wxMBConvUTF16LE;
3102
3103 case wxFONTENCODING_UTF32BE:
3104 return new wxMBConvUTF32BE;
3105
3106 case wxFONTENCODING_UTF32LE:
3107 return new wxMBConvUTF32LE;
3108
3109 default:
3110 // nothing to do but put here to suppress gcc warnings
3111 break;
3112 }
3113
3114 // step (3)
3115 #if wxUSE_FONTMAP
3116 {
3117 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3118 : new wxMBConv_wxwin(m_encoding);
3119 if ( conv->IsOk() )
3120 return conv;
3121
3122 delete conv;
3123 }
3124 #endif // wxUSE_FONTMAP
3125
3126 // NB: This is a hack to prevent deadlock. What could otherwise happen
3127 // in Unicode build: wxConvLocal creation ends up being here
3128 // because of some failure and logs the error. But wxLog will try to
3129 // attach a timestamp, for which it will need wxConvLocal (to convert
3130 // time to char* and then wchar_t*), but that fails, tries to log the
3131 // error, but wxLog has an (already locked) critical section that
3132 // guards the static buffer.
3133 static bool alreadyLoggingError = false;
3134 if (!alreadyLoggingError)
3135 {
3136 alreadyLoggingError = true;
3137 wxLogError(_("Cannot convert from the charset '%s'!"),
3138 m_name ? m_name
3139 :
3140 #if wxUSE_FONTMAP
3141 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3142 #else // !wxUSE_FONTMAP
3143 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3144 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3145 );
3146
3147 alreadyLoggingError = false;
3148 }
3149
3150 return NULL;
3151 }
3152
3153 void wxCSConv::CreateConvIfNeeded() const
3154 {
3155 if ( m_deferred )
3156 {
3157 wxCSConv *self = (wxCSConv *)this; // const_cast
3158
3159 // if we don't have neither the name nor the encoding, use the default
3160 // encoding for this system
3161 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3162 {
3163 #if wxUSE_INTL
3164 self->m_encoding = wxLocale::GetSystemEncoding();
3165 #else
3166 // fallback to some reasonable default:
3167 self->m_encoding = wxFONTENCODING_ISO8859_1;
3168 #endif // wxUSE_INTL
3169 }
3170
3171 self->m_convReal = DoCreate();
3172 self->m_deferred = false;
3173 }
3174 }
3175
3176 bool wxCSConv::IsOk() const
3177 {
3178 CreateConvIfNeeded();
3179
3180 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3181 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3182 return true; // always ok as we do it ourselves
3183
3184 // m_convReal->IsOk() is called at its own creation, so we know it must
3185 // be ok if m_convReal is non-NULL
3186 return m_convReal != NULL;
3187 }
3188
3189 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3190 const char *src, size_t srcLen) const
3191 {
3192 CreateConvIfNeeded();
3193
3194 if (m_convReal)
3195 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3196
3197 // latin-1 (direct)
3198 if ( srcLen == wxNO_LEN )
3199 srcLen = strlen(src) + 1; // take trailing NUL too
3200
3201 if ( dst )
3202 {
3203 if ( dstLen < srcLen )
3204 return wxCONV_FAILED;
3205
3206 for ( size_t n = 0; n < srcLen; n++ )
3207 dst[n] = (unsigned char)(src[n]);
3208 }
3209
3210 return srcLen;
3211 }
3212
3213 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3214 const wchar_t *src, size_t srcLen) const
3215 {
3216 CreateConvIfNeeded();
3217
3218 if (m_convReal)
3219 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3220
3221 // latin-1 (direct)
3222 if ( srcLen == wxNO_LEN )
3223 srcLen = wxWcslen(src) + 1;
3224
3225 if ( dst )
3226 {
3227 if ( dstLen < srcLen )
3228 return wxCONV_FAILED;
3229
3230 for ( size_t n = 0; n < srcLen; n++ )
3231 {
3232 if ( src[n] > 0xFF )
3233 return wxCONV_FAILED;
3234
3235 dst[n] = (char)src[n];
3236 }
3237
3238 }
3239 else // still need to check the input validity
3240 {
3241 for ( size_t n = 0; n < srcLen; n++ )
3242 {
3243 if ( src[n] > 0xFF )
3244 return wxCONV_FAILED;
3245 }
3246 }
3247
3248 return srcLen;
3249 }
3250
3251 size_t wxCSConv::GetMBNulLen() const
3252 {
3253 CreateConvIfNeeded();
3254
3255 if ( m_convReal )
3256 {
3257 return m_convReal->GetMBNulLen();
3258 }
3259
3260 // otherwise, we are ISO-8859-1
3261 return 1;
3262 }
3263
3264 #if wxUSE_UNICODE_UTF8
3265 bool wxCSConv::IsUTF8() const
3266 {
3267 CreateConvIfNeeded();
3268
3269 if ( m_convReal )
3270 {
3271 return m_convReal->IsUTF8();
3272 }
3273
3274 // otherwise, we are ISO-8859-1
3275 return false;
3276 }
3277 #endif
3278
3279
3280 #if wxUSE_UNICODE
3281
3282 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3283 {
3284 if ( !s )
3285 return wxWCharBuffer();
3286
3287 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3288 if ( !wbuf )
3289 wbuf = wxMBConvUTF8().cMB2WX(s);
3290 if ( !wbuf )
3291 wbuf = wxConvISO8859_1.cMB2WX(s);
3292
3293 return wbuf;
3294 }
3295
3296 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3297 {
3298 if ( !ws )
3299 return wxCharBuffer();
3300
3301 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3302 if ( !buf )
3303 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3304
3305 return buf;
3306 }
3307
3308 #endif // wxUSE_UNICODE
3309
3310 // ----------------------------------------------------------------------------
3311 // globals
3312 // ----------------------------------------------------------------------------
3313
3314 // NB: The reason why we create converter objects in this convoluted way,
3315 // using a factory function instead of a global variable, is that they
3316 // may be used at static initialization time (some of them are used by
3317 // wxString ctors and there may be a global wxString object). In other
3318 // words, possibly _before_ the converter global object would be
3319 // initialized.
3320
// NOTE(review): these names are presumably defined as macros in
// wx/strconv.h (TODO confirm), they are undefined here so that they can be
// used as plain identifiers in the definitions below
3321 #undef wxConvLibc
3322 #undef wxConvUTF8
3323 #undef wxConvUTF7
3324 #undef wxConvLocal
3325 #undef wxConvISO8859_1
3326
// Define everything needed for the global converter "name":
//
//  - name##Ptr is an exported pointer to klass, initially NULL
//  - wxGet_##name##Ptr() is an exported function returning the address of
//    its local static object of type impl_klass constructed with ctor_args
//  - gs_##name##instance is a static pointer initialized by calling this
//    function
3327 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3328 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3329 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3330 { \
3331 static impl_klass name##Obj ctor_args; \
3332 return &name##Obj; \
3333 } \
3334 /* this ensures that all global converter objects are created */ \
3335 /* by the time static initialization is done, i.e. before any */ \
3336 /* thread is launched: */ \
3337 static klass* gs_##name##instance = wxGet_##name##Ptr()
3338
// same as above for the common case when the object has the same type as the
// pointer used to access it
3339 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3340 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3341
// wxConvLibc uses Win32 API with the default (CP_ACP) code page under Windows
// and wxMBConvLibc elsewhere
3342 #ifdef __WINDOWS__
3343 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3344 #else
3345 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3346 #endif
3347
3348 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3349 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3350 // provokes an error message about "not enough macro parameters"; and we
3351 // can't use "()" here as the name##Obj declaration would be parsed as a
3352 // function declaration then, so use a semicolon and live with an extra
3353 // empty statement (and hope that no compilers warns about this)
3354 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3355 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3356
3357 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3358 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3359
// these pointers are initialized with the wxConvLibc and wxConvLocal objects
// defined above respectively
3360 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3361 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3362
3363 #ifdef __DARWIN__
3364 // The xnu kernel always communicates file paths in decomposed UTF-8.
3365 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3366 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3367 #endif
3368
// the converter for the file names: the UTF-8 object above under Darwin and
// wxConvLibc elsewhere
3369 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3370 #ifdef __DARWIN__
3371 &wxConvMacUTF8DObj;
3372 #else // !__DARWIN__
3373 wxGet_wxConvLibcPtr();
3374 #endif // __DARWIN__/!__DARWIN__
3375
3376 #else // !wxUSE_WCHAR_T
3377
3378 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3379 // stand-ins in absence of wchar_t
// (all the converters are plain wxMBConv objects in this case)
3380 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3381 wxConvISO8859_1,
3382 wxConvLocal,
3383 wxConvUTF8;
3384
3385 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T