properly detect missing data at the end of UTF-7-encoded segment and fail the convers...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV _T("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
// helper used by wxMBConv::ToWChar(): returns true if at least one of the n
// bytes starting at p is not NUL and false if all of them are NUL (which is
// trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; i++ )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existins ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten = 0;
171
172 // the number of NULs terminating this string
173 size_t nulLen = 0; // not really needed, but just to avoid warnings
174
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
178 // NULs at the end
179 wxCharBuffer bufTmp;
180 const char *srcEnd;
181 if ( srcLen != wxNO_LEN )
182 {
183 // we need to know how to find the end of this string
184 nulLen = GetMBNulLen();
185 if ( nulLen == wxCONV_FAILED )
186 return wxCONV_FAILED;
187
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
190 {
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
193 char * const p = bufTmp.data();
194 memcpy(p, src, srcLen);
195 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
196 *s = '\0';
197
198 src = bufTmp;
199 }
200
201 srcEnd = src + srcLen;
202 }
203 else // quit after the first loop iteration
204 {
205 srcEnd = NULL;
206 }
207
208 for ( ;; )
209 {
210 // try to convert the current chunk
211 size_t lenChunk = MB2WC(NULL, src, 0);
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
214
215 dstWritten += lenChunk;
216 if ( !srcEnd )
217 dstWritten++;
218
219 if ( !lenChunk )
220 {
221 // nothing left in the input string, conversion succeeded
222 break;
223 }
224
225 if ( dst )
226 {
227 if ( dstWritten > dstLen )
228 return wxCONV_FAILED;
229
230 // +1 is for trailing NUL
231 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
232 return wxCONV_FAILED;
233
234 dst += lenChunk;
235 if ( !srcEnd )
236 dst++;
237 }
238
239 if ( !srcEnd )
240 {
241 // we convert just one chunk in this case as this is the entire
242 // string anyhow
243 break;
244 }
245
246 // advance the input pointer past the end of this chunk
247 while ( NotAllNULs(src, nulLen) )
248 {
249 // notice that we must skip over multiple bytes here as we suppose
250 // that if NUL takes 2 or 4 bytes, then all the other characters do
251 // too and so if advanced by a single byte we might erroneously
252 // detect sequences of NUL bytes in the middle of the input
253 src += nulLen;
254 }
255
256 src += nulLen; // skipping over its terminator as well
257
258 // note that ">=" (and not just "==") is needed here as the terminator
259 // we skipped just above could be inside or just after the buffer
260 // delimited by inEnd
261 if ( src >= srcEnd )
262 break;
263 }
264
265 return dstWritten;
266 }
267
268 size_t
269 wxMBConv::FromWChar(char *dst, size_t dstLen,
270 const wchar_t *src, size_t srcLen) const
271 {
272 // the number of chars [which would be] written to dst [if it were not NULL]
273 size_t dstWritten = 0;
274
275 // if we don't know its length we have no choice but to assume that it is
276 // NUL-terminated (notice that it can still be NUL-terminated even if
277 // explicit length is given but it doesn't change our return value)
278 const bool isNulTerminated = srcLen == wxNO_LEN;
279
280 // make a copy of the input string unless it is already properly
281 // NUL-terminated
282 wxWCharBuffer bufTmp;
283 if ( isNulTerminated )
284 {
285 srcLen = wxWcslen(src) + 1;
286 }
287 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
288 {
289 // make a copy in order to properly NUL-terminate the string
290 bufTmp = wxWCharBuffer(srcLen);
291 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
292 src = bufTmp;
293 }
294
295 const size_t lenNul = GetMBNulLen();
296 for ( const wchar_t * const srcEnd = src + srcLen;
297 src < srcEnd;
298 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
299 {
300 // try to convert the current chunk
301 size_t lenChunk = WC2MB(NULL, src, 0);
302
303 if ( lenChunk == wxCONV_FAILED )
304 return wxCONV_FAILED;
305
306 dstWritten += lenChunk;
307 if ( isNulTerminated )
308 dstWritten += lenNul;
309
310 if ( dst )
311 {
312 if ( dstWritten > dstLen )
313 return wxCONV_FAILED;
314
315 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
316 return wxCONV_FAILED;
317
318 dst += lenChunk;
319 if ( isNulTerminated )
320 dst += lenNul;
321 }
322 }
323
324 return dstWritten;
325 }
326
327 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
328 {
329 size_t rc = ToWChar(outBuff, outLen, inBuff);
330 if ( rc != wxCONV_FAILED )
331 {
332 // ToWChar() returns the buffer length, i.e. including the trailing
333 // NUL, while this method doesn't take it into account
334 rc--;
335 }
336
337 return rc;
338 }
339
340 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
341 {
342 size_t rc = FromWChar(outBuff, outLen, inBuff);
343 if ( rc != wxCONV_FAILED )
344 {
345 rc -= GetMBNulLen();
346 }
347
348 return rc;
349 }
350
351 wxMBConv::~wxMBConv()
352 {
353 // nothing to do here (necessary for Darwin linking probably)
354 }
355
356 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
357 {
358 if ( psz )
359 {
360 // calculate the length of the buffer needed first
361 const size_t nLen = ToWChar(NULL, 0, psz);
362 if ( nLen != wxCONV_FAILED )
363 {
364 // now do the actual conversion
365 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
366
367 // +1 for the trailing NULL
368 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
369 return buf;
370 }
371 }
372
373 return wxWCharBuffer();
374 }
375
376 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
377 {
378 if ( pwz )
379 {
380 const size_t nLen = FromWChar(NULL, 0, pwz);
381 if ( nLen != wxCONV_FAILED )
382 {
383 wxCharBuffer buf(nLen - 1);
384 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
385 return buf;
386 }
387 }
388
389 return wxCharBuffer();
390 }
391
392 const wxWCharBuffer
393 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
394 {
395 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
396 if ( dstLen != wxCONV_FAILED )
397 {
398 // notice that we allocate space for dstLen+1 wide characters here
399 // because we want the buffer to always be NUL-terminated, even if the
400 // input isn't (as otherwise the caller has no way to know its length)
401 wxWCharBuffer wbuf(dstLen);
402 wbuf.data()[dstLen] = L'\0';
403 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
404 {
405 if ( outLen )
406 {
407 *outLen = dstLen;
408
409 // we also need to handle NUL-terminated input strings
410 // specially: for them the output is the length of the string
411 // excluding the trailing NUL, however if we're asked to
412 // convert a specific number of characters we return the length
413 // of the resulting output even if it's NUL-terminated
414 if ( inLen == wxNO_LEN )
415 (*outLen)--;
416 }
417
418 return wbuf;
419 }
420 }
421
422 if ( outLen )
423 *outLen = 0;
424
425 return wxWCharBuffer();
426 }
427
428 const wxCharBuffer
429 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
430 {
431 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
432 if ( dstLen != wxCONV_FAILED )
433 {
434 const size_t nulLen = GetMBNulLen();
435
436 // as above, ensure that the buffer is always NUL-terminated, even if
437 // the input is not
438 wxCharBuffer buf(dstLen + nulLen - 1);
439 memset(buf.data() + dstLen, 0, nulLen);
440 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
441 {
442 if ( outLen )
443 {
444 *outLen = dstLen;
445
446 if ( inLen == wxNO_LEN )
447 {
448 // in this case both input and output are NUL-terminated
449 // and we're not supposed to count NUL
450 *outLen -= nulLen;
451 }
452 }
453
454 return buf;
455 }
456 }
457
458 if ( outLen )
459 *outLen = 0;
460
461 return wxCharBuffer();
462 }
463
464 // ----------------------------------------------------------------------------
465 // wxMBConvLibc
466 // ----------------------------------------------------------------------------
467
468 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
469 {
470 return wxMB2WC(buf, psz, n);
471 }
472
473 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
474 {
475 return wxWC2MB(buf, psz, n);
476 }
477
// ----------------------------------------------------------------------------
// wxConvBrokenFileNames
// ----------------------------------------------------------------------------

#ifdef __UNIX__

// Creates the conversion object used for the given charset: for UTF-8 (the
// name is compared case-insensitively, with or without the dash) this is a
// wxMBConvUTF8 mapping invalid sequences to the private use area, for any
// other charset it is a wxCSConv.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    const bool isUTF8 = wxStricmp(charset, _T("UTF-8")) == 0 ||
                        wxStricmp(charset, _T("UTF8")) == 0;

    if ( isUTF8 )
    {
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
    }
    else
    {
        m_conv = new wxCSConv(charset);
    }
}

#endif // __UNIX__
494
// ----------------------------------------------------------------------------
// UTF-7
// ----------------------------------------------------------------------------

// Implementation (C) 2004 Fredrik Roubert
//
// Changes to work in streaming mode (C) 2008 Vadim Zeitlin

//
// BASE64 decoding table: maps each byte to the 6 bit value it encodes or to
// 0xff if it is not one of the base64 characters (A-Z, a-z, 0-9, '+', '/')
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: nothing valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
541
542 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
543 const char *src, size_t srcLen) const
544 {
545 DecoderState stateOrig,
546 *statePtr;
547 if ( srcLen == wxNO_LEN )
548 {
549 // convert the entire string, up to and including the trailing NUL
550 srcLen = strlen(src) + 1;
551
552 // when working on the entire strings we don't update nor use the shift
553 // state from the previous call
554 statePtr = &stateOrig;
555 }
556 else // when working with partial strings we do use the shift state
557 {
558 statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
559
560 // also save the old state to be able to rollback to it on error
561 stateOrig = m_stateDecoder;
562 }
563
564 // but to simplify the code below we use this variable in both cases
565 DecoderState& state = *statePtr;
566
567
568 // number of characters [which would have been] written to dst [if it were
569 // not NULL]
570 size_t len = 0;
571
572 const char * const srcEnd = src + srcLen;
573
574 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
575 {
576 const unsigned char cc = *src++;
577
578 if ( state.IsShifted() )
579 {
580 const unsigned char dc = utf7unb64[cc];
581 if ( dc == 0xff )
582 {
583 // end of encoded part, check that nothing was left: the bit
584 // field cycles through 0,6,4,2 sequence so check that we're at
585 // the end of it
586 if ( state.bit != 2 )
587 return wxCONV_FAILED;
588
589 state.ToDirect();
590
591 // re-parse this character normally below unless it's '-' which
592 // is consumed by the decoder
593 if ( cc == '-' )
594 continue;
595 }
596 else // valid encoded character
597 {
598 // mini base64 decoder: each character is 6 bits
599 state.bit += 6;
600 state.accum <<= 6;
601 state.accum += dc;
602
603 if ( state.bit >= 8 )
604 {
605 // got the full byte, consume it
606 state.bit -= 8;
607 unsigned char b = (state.accum >> state.bit) & 0x00ff;
608
609 if ( state.isLSB )
610 {
611 // we've got the full word, output it
612 if ( dst )
613 *dst++ = (state.msb << 8) | b;
614 len++;
615 state.isLSB = false;
616 }
617 else // MSB
618 {
619 // just store it while we wait for LSB
620 state.msb = b;
621 state.isLSB = true;
622 }
623 }
624 }
625 }
626
627 if ( state.IsDirect() )
628 {
629 // start of an encoded segment?
630 if ( cc == '+' )
631 {
632 if ( *src == '-' )
633 {
634 // just the encoded plus sign, don't switch to shifted mode
635 if ( dst )
636 *dst++ = '+';
637 len++;
638 src++;
639 }
640 else
641 {
642 state.ToShifted();
643 }
644 }
645 else // not '+'
646 {
647 // only printable 7 bit ASCII characters (with the exception of
648 // NUL, TAB, CR and LF) can be used directly
649 if ( cc >= 0x7f || (cc < ' ' &&
650 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
651 return wxCONV_FAILED;
652
653 if ( dst )
654 *dst++ = cc;
655 len++;
656 }
657 }
658 }
659
660 if ( !len )
661 {
662 // as we didn't read any characters we should be called with the same
663 // data (followed by some more new data) again later so don't save our
664 // state
665 state = stateOrig;
666
667 return wxCONV_FAILED;
668 }
669
670 return len;
671 }
672
//
// BASE64 encoding table: the character used for each of the 64 possible
// 6 bit values
//
static const unsigned char utf7enb64[] =
{
    // 0..15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16..31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32..47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48..63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
687
//
// UTF-7 encoding table giving the class of each 7 bit ASCII character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 0x70..0x7f
};
707
708 static inline bool wxIsUTF7Direct(wchar_t wc)
709 {
710 return wc < 0x80 && utf7encode[wc] < 1;
711 }
712
713 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
714 const wchar_t *src, size_t srcLen) const
715 {
716 EncoderState stateOrig,
717 *statePtr;
718 if ( srcLen == wxNO_LEN )
719 {
720 // we don't apply the stored state when operating on entire strings at
721 // once
722 statePtr = &stateOrig;
723
724 srcLen = wxWcslen(src) + 1;
725 }
726 else // do use the mode we left the output in previously
727 {
728 stateOrig = m_stateEncoder;
729 statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
730 }
731
732 EncoderState& state = *statePtr;
733
734
735 size_t len = 0;
736
737 const wchar_t * const srcEnd = src + srcLen;
738 while ( src < srcEnd && (!dst || len < dstLen) )
739 {
740 wchar_t cc = *src++;
741 if ( wxIsUTF7Direct(cc) )
742 {
743 if ( state.IsShifted() )
744 {
745 // pad with zeros the last encoded block if necessary
746 if ( state.bit )
747 {
748 if ( dst )
749 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
750 len++;
751 }
752
753 state.ToDirect();
754
755 if ( dst )
756 *dst++ = '-';
757 len++;
758 }
759
760 if ( dst )
761 *dst++ = (char)cc;
762 len++;
763 }
764 else if ( cc == '+' && state.IsDirect() )
765 {
766 if ( dst )
767 {
768 *dst++ = '+';
769 *dst++ = '-';
770 }
771
772 len += 2;
773 }
774 #ifndef WC_UTF16
775 else if (((wxUint32)cc) > 0xffff)
776 {
777 // no surrogate pair generation (yet?)
778 return wxCONV_FAILED;
779 }
780 #endif
781 else
782 {
783 if ( state.IsDirect() )
784 {
785 state.ToShifted();
786
787 if ( dst )
788 *dst++ = '+';
789 len++;
790 }
791
792 // BASE64 encode string
793 for ( ;; )
794 {
795 for ( unsigned lsb = 0; lsb < 2; lsb++ )
796 {
797 state.accum <<= 8;
798 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
799
800 for (state.bit += 8; state.bit >= 6; )
801 {
802 state.bit -= 6;
803 if ( dst )
804 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
805 len++;
806 }
807 }
808
809 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
810 break;
811
812 src++;
813 }
814 }
815 }
816
817 // we need to restore the original encoder state if we were called just to
818 // calculate the amount of space needed as we will presumably be called
819 // again to really convert the data now
820 if ( !dst )
821 state = stateOrig;
822
823 return len;
824 }
825
826 // ----------------------------------------------------------------------------
827 // UTF-8
828 // ----------------------------------------------------------------------------
829
830 static const wxUint32 utf8_max[]=
831 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
832
833 // boundaries of the private use area we use to (temporarily) remap invalid
834 // characters invalid in a UTF-8 encoded string
835 const wxUint32 wxUnicodePUA = 0x100000;
836 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
837
// this table gives the length of the UTF-8 encoding from its first character,
// 0 meaning that the byte can't start a valid sequence:
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    // 80..BF: these are invalid as the first byte
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

    // C0, C1: invalid too
    0,0,

    // C2..DF: two-byte sequences
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

    // E0..EF: three-byte sequences
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    // F0..F4: four-byte sequences
    4,4,4,4,4,

    // F5..FF: these are invalid again (5- or 6-byte sequences and sequences
    // for code points above U+10FFFF, as restricted by RFC 3629)
              0,0,0,0,0,0,0,0,0,0,0
};
872
873 size_t
874 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
875 const char *src, size_t srcLen) const
876 {
877 wchar_t *out = dstLen ? dst : NULL;
878 size_t written = 0;
879
880 if ( srcLen == wxNO_LEN )
881 srcLen = strlen(src) + 1;
882
883 for ( const char *p = src; ; p++ )
884 {
885 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
886 {
887 // all done successfully, just add the trailing NULL if we are not
888 // using explicit length
889 if ( srcLen == wxNO_LEN )
890 {
891 if ( out )
892 {
893 if ( !dstLen )
894 break;
895
896 *out = L'\0';
897 }
898
899 written++;
900 }
901
902 return written;
903 }
904
905 if ( out && !dstLen-- )
906 break;
907
908 wxUint32 code;
909 unsigned char c = *p;
910
911 if ( c < 0x80 )
912 {
913 if ( srcLen == 0 ) // the test works for wxNO_LEN too
914 break;
915
916 if ( srcLen != wxNO_LEN )
917 srcLen--;
918
919 code = c;
920 }
921 else
922 {
923 unsigned len = tableUtf8Lengths[c];
924 if ( !len )
925 break;
926
927 if ( srcLen < len ) // the test works for wxNO_LEN too
928 break;
929
930 if ( srcLen != wxNO_LEN )
931 srcLen -= len;
932
933 // Char. number range | UTF-8 octet sequence
934 // (hexadecimal) | (binary)
935 // ----------------------+----------------------------------------
936 // 0000 0000 - 0000 007F | 0xxxxxxx
937 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
938 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
939 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
940 //
941 // Code point value is stored in bits marked with 'x',
942 // lowest-order bit of the value on the right side in the diagram
943 // above. (from RFC 3629)
944
945 // mask to extract lead byte's value ('x' bits above), by sequence
946 // length:
947 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
948
949 // mask and value of lead byte's most significant bits, by length:
950 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
951 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
952
953 len--; // it's more convenient to work with 0-based length here
954
955 // extract the lead byte's value bits:
956 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
957 break;
958
959 code = c & leadValueMask[len];
960
961 // all remaining bytes, if any, are handled in the same way
962 // regardless of sequence's length:
963 for ( ; len; --len )
964 {
965 c = *++p;
966 if ( (c & 0xC0) != 0x80 )
967 return wxCONV_FAILED;
968
969 code <<= 6;
970 code |= c & 0x3F;
971 }
972 }
973
974 #ifdef WC_UTF16
975 // cast is ok because wchar_t == wxUint16 if WC_UTF16
976 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
977 {
978 if ( out )
979 out++;
980 written++;
981 }
982 #else // !WC_UTF16
983 if ( out )
984 *out = code;
985 #endif // WC_UTF16/!WC_UTF16
986
987 if ( out )
988 out++;
989
990 written++;
991 }
992
993 return wxCONV_FAILED;
994 }
995
996 size_t
997 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
998 const wchar_t *src, size_t srcLen) const
999 {
1000 char *out = dstLen ? dst : NULL;
1001 size_t written = 0;
1002
1003 for ( const wchar_t *wp = src; ; wp++ )
1004 {
1005 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1006 {
1007 // all done successfully, just add the trailing NULL if we are not
1008 // using explicit length
1009 if ( srcLen == wxNO_LEN )
1010 {
1011 if ( out )
1012 {
1013 if ( !dstLen )
1014 break;
1015
1016 *out = '\0';
1017 }
1018
1019 written++;
1020 }
1021
1022 return written;
1023 }
1024
1025 if ( srcLen != wxNO_LEN )
1026 srcLen--;
1027
1028 wxUint32 code;
1029 #ifdef WC_UTF16
1030 // cast is ok for WC_UTF16
1031 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1032 {
1033 // skip the next char too as we decoded a surrogate
1034 wp++;
1035 }
1036 #else // wchar_t is UTF-32
1037 code = *wp & 0x7fffffff;
1038 #endif
1039
1040 unsigned len;
1041 if ( code <= 0x7F )
1042 {
1043 len = 1;
1044 if ( out )
1045 {
1046 if ( dstLen < len )
1047 break;
1048
1049 out[0] = (char)code;
1050 }
1051 }
1052 else if ( code <= 0x07FF )
1053 {
1054 len = 2;
1055 if ( out )
1056 {
1057 if ( dstLen < len )
1058 break;
1059
1060 // NB: this line takes 6 least significant bits, encodes them as
1061 // 10xxxxxx and discards them so that the next byte can be encoded:
1062 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1063 out[0] = 0xC0 | code;
1064 }
1065 }
1066 else if ( code < 0xFFFF )
1067 {
1068 len = 3;
1069 if ( out )
1070 {
1071 if ( dstLen < len )
1072 break;
1073
1074 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1075 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1076 out[0] = 0xE0 | code;
1077 }
1078 }
1079 else if ( code <= 0x10FFFF )
1080 {
1081 len = 4;
1082 if ( out )
1083 {
1084 if ( dstLen < len )
1085 break;
1086
1087 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1088 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1089 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1090 out[0] = 0xF0 | code;
1091 }
1092 }
1093 else
1094 {
1095 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1096 break;
1097 }
1098
1099 if ( out )
1100 {
1101 out += len;
1102 dstLen -= len;
1103 }
1104
1105 written += len;
1106 }
1107
1108 // we only get here if an error occurs during decoding
1109 return wxCONV_FAILED;
1110 }
1111
1112 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1113 const char *psz, size_t srcLen) const
1114 {
1115 if ( m_options == MAP_INVALID_UTF8_NOT )
1116 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1117
1118 size_t len = 0;
1119
1120 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1121 {
1122 const char *opsz = psz;
1123 bool invalid = false;
1124 unsigned char cc = *psz++, fc = cc;
1125 unsigned cnt;
1126 for (cnt = 0; fc & 0x80; cnt++)
1127 fc <<= 1;
1128
1129 if (!cnt)
1130 {
1131 // plain ASCII char
1132 if (buf)
1133 *buf++ = cc;
1134 len++;
1135
1136 // escape the escape character for octal escapes
1137 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1138 && cc == '\\' && (!buf || len < n))
1139 {
1140 if (buf)
1141 *buf++ = cc;
1142 len++;
1143 }
1144 }
1145 else
1146 {
1147 cnt--;
1148 if (!cnt)
1149 {
1150 // invalid UTF-8 sequence
1151 invalid = true;
1152 }
1153 else
1154 {
1155 unsigned ocnt = cnt - 1;
1156 wxUint32 res = cc & (0x3f >> cnt);
1157 while (cnt--)
1158 {
1159 cc = *psz;
1160 if ((cc & 0xC0) != 0x80)
1161 {
1162 // invalid UTF-8 sequence
1163 invalid = true;
1164 break;
1165 }
1166
1167 psz++;
1168 res = (res << 6) | (cc & 0x3f);
1169 }
1170
1171 if (invalid || res <= utf8_max[ocnt])
1172 {
1173 // illegal UTF-8 encoding
1174 invalid = true;
1175 }
1176 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1177 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1178 {
1179 // if one of our PUA characters turns up externally
1180 // it must also be treated as an illegal sequence
1181 // (a bit like you have to escape an escape character)
1182 invalid = true;
1183 }
1184 else
1185 {
1186 #ifdef WC_UTF16
1187 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1188 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1189 if (pa == wxCONV_FAILED)
1190 {
1191 invalid = true;
1192 }
1193 else
1194 {
1195 if (buf)
1196 buf += pa;
1197 len += pa;
1198 }
1199 #else // !WC_UTF16
1200 if (buf)
1201 *buf++ = (wchar_t)res;
1202 len++;
1203 #endif // WC_UTF16/!WC_UTF16
1204 }
1205 }
1206
1207 if (invalid)
1208 {
1209 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1210 {
1211 while (opsz < psz && (!buf || len < n))
1212 {
1213 #ifdef WC_UTF16
1214 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1215 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1216 wxASSERT(pa != wxCONV_FAILED);
1217 if (buf)
1218 buf += pa;
1219 opsz++;
1220 len += pa;
1221 #else
1222 if (buf)
1223 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1224 opsz++;
1225 len++;
1226 #endif
1227 }
1228 }
1229 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1230 {
1231 while (opsz < psz && (!buf || len < n))
1232 {
1233 if ( buf && len + 3 < n )
1234 {
1235 unsigned char on = *opsz;
1236 *buf++ = L'\\';
1237 *buf++ = (wchar_t)( L'0' + on / 0100 );
1238 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1239 *buf++ = (wchar_t)( L'0' + on % 010 );
1240 }
1241
1242 opsz++;
1243 len += 4;
1244 }
1245 }
1246 else // MAP_INVALID_UTF8_NOT
1247 {
1248 return wxCONV_FAILED;
1249 }
1250 }
1251 }
1252 }
1253
1254 if (srcLen == wxNO_LEN && buf && (len < n))
1255 *buf = 0;
1256
1257 return len + 1;
1258 }
1259
// Check whether the given wide character is an octal digit, i.e. one of the
// characters L'0' to L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1264
1265 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1266 const wchar_t *psz, size_t srcLen) const
1267 {
1268 if ( m_options == MAP_INVALID_UTF8_NOT )
1269 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1270
1271 size_t len = 0;
1272
1273 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1274 {
1275 wxUint32 cc;
1276
1277 #ifdef WC_UTF16
1278 // cast is ok for WC_UTF16
1279 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1280 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1281 #else
1282 cc = (*psz++) & 0x7fffffff;
1283 #endif
1284
1285 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1286 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1287 {
1288 if (buf)
1289 *buf++ = (char)(cc - wxUnicodePUA);
1290 len++;
1291 }
1292 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1293 && cc == L'\\' && psz[0] == L'\\' )
1294 {
1295 if (buf)
1296 *buf++ = (char)cc;
1297 psz++;
1298 len++;
1299 }
1300 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1301 cc == L'\\' &&
1302 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1303 {
1304 if (buf)
1305 {
1306 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1307 (psz[1] - L'0') * 010 +
1308 (psz[2] - L'0'));
1309 }
1310
1311 psz += 3;
1312 len++;
1313 }
1314 else
1315 {
1316 unsigned cnt;
1317 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1318 {
1319 }
1320
1321 if (!cnt)
1322 {
1323 // plain ASCII char
1324 if (buf)
1325 *buf++ = (char) cc;
1326 len++;
1327 }
1328 else
1329 {
1330 len += cnt + 1;
1331 if (buf)
1332 {
1333 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1334 while (cnt--)
1335 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1336 }
1337 }
1338 }
1339 }
1340
1341 if (srcLen == wxNO_LEN && buf && (len < n))
1342 *buf = 0;
1343
1344 return len + 1;
1345 }
1346
1347 // ============================================================================
1348 // UTF-16
1349 // ============================================================================
1350
// Map the "straight" and "swap" names used by the implementations below to
// the concrete classes: on machines where WORDS_BIGENDIAN is defined the
// "straight" conversion is the big endian one and the "swap" conversion the
// little endian one, otherwise it is the other way round.
1351 #ifdef WORDS_BIGENDIAN
1352 #define wxMBConvUTF16straight wxMBConvUTF16BE
1353 #define wxMBConvUTF16swap wxMBConvUTF16LE
1354 #else
1355 #define wxMBConvUTF16swap wxMBConvUTF16BE
1356 #define wxMBConvUTF16straight wxMBConvUTF16LE
1357 #endif
1358
1359 /* static */
1360 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1361 {
1362 if ( srcLen == wxNO_LEN )
1363 {
1364 // count the number of bytes in input, including the trailing NULs
1365 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1366 for ( srcLen = 1; *inBuff++; srcLen++ )
1367 ;
1368
1369 srcLen *= BYTES_PER_CHAR;
1370 }
1371 else // we already have the length
1372 {
1373 // we can only convert an entire number of UTF-16 characters
1374 if ( srcLen % BYTES_PER_CHAR )
1375 return wxCONV_FAILED;
1376 }
1377
1378 return srcLen;
1379 }
1380
1381 // case when in-memory representation is UTF-16 too
1382 #ifdef WC_UTF16
1383
1384 // ----------------------------------------------------------------------------
1385 // conversions without endianness change
1386 // ----------------------------------------------------------------------------
1387
1388 size_t
1389 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1390 const char *src, size_t srcLen) const
1391 {
1392 // set up the scene for using memcpy() (which is presumably more efficient
1393 // than copying the bytes one by one)
1394 srcLen = GetLength(src, srcLen);
1395 if ( srcLen == wxNO_LEN )
1396 return wxCONV_FAILED;
1397
1398 const size_t inLen = srcLen / BYTES_PER_CHAR;
1399 if ( dst )
1400 {
1401 if ( dstLen < inLen )
1402 return wxCONV_FAILED;
1403
1404 memcpy(dst, src, srcLen);
1405 }
1406
1407 return inLen;
1408 }
1409
1410 size_t
1411 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1412 const wchar_t *src, size_t srcLen) const
1413 {
1414 if ( srcLen == wxNO_LEN )
1415 srcLen = wxWcslen(src) + 1;
1416
1417 srcLen *= BYTES_PER_CHAR;
1418
1419 if ( dst )
1420 {
1421 if ( dstLen < srcLen )
1422 return wxCONV_FAILED;
1423
1424 memcpy(dst, src, srcLen);
1425 }
1426
1427 return srcLen;
1428 }
1429
1430 // ----------------------------------------------------------------------------
1431 // endian-reversing conversions
1432 // ----------------------------------------------------------------------------
1433
1434 size_t
1435 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1436 const char *src, size_t srcLen) const
1437 {
1438 srcLen = GetLength(src, srcLen);
1439 if ( srcLen == wxNO_LEN )
1440 return wxCONV_FAILED;
1441
1442 srcLen /= BYTES_PER_CHAR;
1443
1444 if ( dst )
1445 {
1446 if ( dstLen < srcLen )
1447 return wxCONV_FAILED;
1448
1449 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1450 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1451 {
1452 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1453 }
1454 }
1455
1456 return srcLen;
1457 }
1458
1459 size_t
1460 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1461 const wchar_t *src, size_t srcLen) const
1462 {
1463 if ( srcLen == wxNO_LEN )
1464 srcLen = wxWcslen(src) + 1;
1465
1466 srcLen *= BYTES_PER_CHAR;
1467
1468 if ( dst )
1469 {
1470 if ( dstLen < srcLen )
1471 return wxCONV_FAILED;
1472
1473 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1474 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1475 {
1476 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1477 }
1478 }
1479
1480 return srcLen;
1481 }
1482
1483 #else // !WC_UTF16: wchar_t is UTF-32
1484
1485 // ----------------------------------------------------------------------------
1486 // conversions without endianness change
1487 // ----------------------------------------------------------------------------
1488
1489 size_t
1490 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1491 const char *src, size_t srcLen) const
1492 {
1493 srcLen = GetLength(src, srcLen);
1494 if ( srcLen == wxNO_LEN )
1495 return wxCONV_FAILED;
1496
1497 const size_t inLen = srcLen / BYTES_PER_CHAR;
1498 if ( !dst )
1499 {
1500 // optimization: return maximal space which could be needed for this
1501 // string even if the real size could be smaller if the buffer contains
1502 // any surrogates
1503 return inLen;
1504 }
1505
1506 size_t outLen = 0;
1507 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1508 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1509 {
1510 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1511 if ( !inBuff )
1512 return wxCONV_FAILED;
1513
1514 if ( ++outLen > dstLen )
1515 return wxCONV_FAILED;
1516
1517 *dst++ = ch;
1518 }
1519
1520
1521 return outLen;
1522 }
1523
1524 size_t
1525 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1526 const wchar_t *src, size_t srcLen) const
1527 {
1528 if ( srcLen == wxNO_LEN )
1529 srcLen = wxWcslen(src) + 1;
1530
1531 size_t outLen = 0;
1532 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1533 for ( size_t n = 0; n < srcLen; n++ )
1534 {
1535 wxUint16 cc[2];
1536 const size_t numChars = encode_utf16(*src++, cc);
1537 if ( numChars == wxCONV_FAILED )
1538 return wxCONV_FAILED;
1539
1540 outLen += numChars * BYTES_PER_CHAR;
1541 if ( outBuff )
1542 {
1543 if ( outLen > dstLen )
1544 return wxCONV_FAILED;
1545
1546 *outBuff++ = cc[0];
1547 if ( numChars == 2 )
1548 {
1549 // second character of a surrogate
1550 *outBuff++ = cc[1];
1551 }
1552 }
1553 }
1554
1555 return outLen;
1556 }
1557
1558 // ----------------------------------------------------------------------------
1559 // endian-reversing conversions
1560 // ----------------------------------------------------------------------------
1561
1562 size_t
1563 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1564 const char *src, size_t srcLen) const
1565 {
1566 srcLen = GetLength(src, srcLen);
1567 if ( srcLen == wxNO_LEN )
1568 return wxCONV_FAILED;
1569
1570 const size_t inLen = srcLen / BYTES_PER_CHAR;
1571 if ( !dst )
1572 {
1573 // optimization: return maximal space which could be needed for this
1574 // string even if the real size could be smaller if the buffer contains
1575 // any surrogates
1576 return inLen;
1577 }
1578
1579 size_t outLen = 0;
1580 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1581 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1582 {
1583 wxUint32 ch;
1584 wxUint16 tmp[2];
1585
1586 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1587 inBuff++;
1588 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1589
1590 const size_t numChars = decode_utf16(tmp, ch);
1591 if ( numChars == wxCONV_FAILED )
1592 return wxCONV_FAILED;
1593
1594 if ( numChars == 2 )
1595 inBuff++;
1596
1597 if ( ++outLen > dstLen )
1598 return wxCONV_FAILED;
1599
1600 *dst++ = ch;
1601 }
1602
1603
1604 return outLen;
1605 }
1606
1607 size_t
1608 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1609 const wchar_t *src, size_t srcLen) const
1610 {
1611 if ( srcLen == wxNO_LEN )
1612 srcLen = wxWcslen(src) + 1;
1613
1614 size_t outLen = 0;
1615 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1616 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1617 {
1618 wxUint16 cc[2];
1619 const size_t numChars = encode_utf16(*src, cc);
1620 if ( numChars == wxCONV_FAILED )
1621 return wxCONV_FAILED;
1622
1623 outLen += numChars * BYTES_PER_CHAR;
1624 if ( outBuff )
1625 {
1626 if ( outLen > dstLen )
1627 return wxCONV_FAILED;
1628
1629 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1630 if ( numChars == 2 )
1631 {
1632 // second character of a surrogate
1633 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1634 }
1635 }
1636 }
1637
1638 return outLen;
1639 }
1640
1641 #endif // WC_UTF16/!WC_UTF16
1642
1643
1644 // ============================================================================
1645 // UTF-32
1646 // ============================================================================
1647
// Map the "straight" and "swap" names used by the implementations below to
// the concrete classes: on machines where WORDS_BIGENDIAN is defined the
// "straight" conversion is the big endian one and the "swap" conversion the
// little endian one, otherwise it is the other way round.
1648 #ifdef WORDS_BIGENDIAN
1649 #define wxMBConvUTF32straight wxMBConvUTF32BE
1650 #define wxMBConvUTF32swap wxMBConvUTF32LE
1651 #else
1652 #define wxMBConvUTF32swap wxMBConvUTF32BE
1653 #define wxMBConvUTF32straight wxMBConvUTF32LE
1654 #endif
1655
1656
// definitions of the global UTF-32 converter objects for both byte orders
1657 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1658 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1659
1660 /* static */
1661 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1662 {
1663 if ( srcLen == wxNO_LEN )
1664 {
1665 // count the number of bytes in input, including the trailing NULs
1666 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1667 for ( srcLen = 1; *inBuff++; srcLen++ )
1668 ;
1669
1670 srcLen *= BYTES_PER_CHAR;
1671 }
1672 else // we already have the length
1673 {
1674 // we can only convert an entire number of UTF-32 characters
1675 if ( srcLen % BYTES_PER_CHAR )
1676 return wxCONV_FAILED;
1677 }
1678
1679 return srcLen;
1680 }
1681
1682 // case when in-memory representation is UTF-16
1683 #ifdef WC_UTF16
1684
1685 // ----------------------------------------------------------------------------
1686 // conversions without endianness change
1687 // ----------------------------------------------------------------------------
1688
1689 size_t
1690 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1691 const char *src, size_t srcLen) const
1692 {
1693 srcLen = GetLength(src, srcLen);
1694 if ( srcLen == wxNO_LEN )
1695 return wxCONV_FAILED;
1696
1697 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1698 const size_t inLen = srcLen / BYTES_PER_CHAR;
1699 size_t outLen = 0;
1700 for ( size_t n = 0; n < inLen; n++ )
1701 {
1702 wxUint16 cc[2];
1703 const size_t numChars = encode_utf16(*inBuff++, cc);
1704 if ( numChars == wxCONV_FAILED )
1705 return wxCONV_FAILED;
1706
1707 outLen += numChars;
1708 if ( dst )
1709 {
1710 if ( outLen > dstLen )
1711 return wxCONV_FAILED;
1712
1713 *dst++ = cc[0];
1714 if ( numChars == 2 )
1715 {
1716 // second character of a surrogate
1717 *dst++ = cc[1];
1718 }
1719 }
1720 }
1721
1722 return outLen;
1723 }
1724
1725 size_t
1726 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1727 const wchar_t *src, size_t srcLen) const
1728 {
1729 if ( srcLen == wxNO_LEN )
1730 srcLen = wxWcslen(src) + 1;
1731
1732 if ( !dst )
1733 {
1734 // optimization: return maximal space which could be needed for this
1735 // string instead of the exact amount which could be less if there are
1736 // any surrogates in the input
1737 //
1738 // we consider that surrogates are rare enough to make it worthwhile to
1739 // avoid running the loop below at the cost of slightly extra memory
1740 // consumption
1741 return srcLen * BYTES_PER_CHAR;
1742 }
1743
1744 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1745 size_t outLen = 0;
1746 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1747 {
1748 const wxUint32 ch = wxDecodeSurrogate(&src);
1749 if ( !src )
1750 return wxCONV_FAILED;
1751
1752 outLen += BYTES_PER_CHAR;
1753
1754 if ( outLen > dstLen )
1755 return wxCONV_FAILED;
1756
1757 *outBuff++ = ch;
1758 }
1759
1760 return outLen;
1761 }
1762
1763 // ----------------------------------------------------------------------------
1764 // endian-reversing conversions
1765 // ----------------------------------------------------------------------------
1766
1767 size_t
1768 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1769 const char *src, size_t srcLen) const
1770 {
1771 srcLen = GetLength(src, srcLen);
1772 if ( srcLen == wxNO_LEN )
1773 return wxCONV_FAILED;
1774
1775 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1776 const size_t inLen = srcLen / BYTES_PER_CHAR;
1777 size_t outLen = 0;
1778 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1779 {
1780 wxUint16 cc[2];
1781 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1782 if ( numChars == wxCONV_FAILED )
1783 return wxCONV_FAILED;
1784
1785 outLen += numChars;
1786 if ( dst )
1787 {
1788 if ( outLen > dstLen )
1789 return wxCONV_FAILED;
1790
1791 *dst++ = cc[0];
1792 if ( numChars == 2 )
1793 {
1794 // second character of a surrogate
1795 *dst++ = cc[1];
1796 }
1797 }
1798 }
1799
1800 return outLen;
1801 }
1802
1803 size_t
1804 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1805 const wchar_t *src, size_t srcLen) const
1806 {
1807 if ( srcLen == wxNO_LEN )
1808 srcLen = wxWcslen(src) + 1;
1809
1810 if ( !dst )
1811 {
1812 // optimization: return maximal space which could be needed for this
1813 // string instead of the exact amount which could be less if there are
1814 // any surrogates in the input
1815 //
1816 // we consider that surrogates are rare enough to make it worthwhile to
1817 // avoid running the loop below at the cost of slightly extra memory
1818 // consumption
1819 return srcLen*BYTES_PER_CHAR;
1820 }
1821
1822 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1823 size_t outLen = 0;
1824 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1825 {
1826 const wxUint32 ch = wxDecodeSurrogate(&src);
1827 if ( !src )
1828 return wxCONV_FAILED;
1829
1830 outLen += BYTES_PER_CHAR;
1831
1832 if ( outLen > dstLen )
1833 return wxCONV_FAILED;
1834
1835 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1836 }
1837
1838 return outLen;
1839 }
1840
1841 #else // !WC_UTF16: wchar_t is UTF-32
1842
1843 // ----------------------------------------------------------------------------
1844 // conversions without endianness change
1845 // ----------------------------------------------------------------------------
1846
1847 size_t
1848 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1849 const char *src, size_t srcLen) const
1850 {
1851 // use memcpy() as it should be much faster than hand-written loop
1852 srcLen = GetLength(src, srcLen);
1853 if ( srcLen == wxNO_LEN )
1854 return wxCONV_FAILED;
1855
1856 const size_t inLen = srcLen/BYTES_PER_CHAR;
1857 if ( dst )
1858 {
1859 if ( dstLen < inLen )
1860 return wxCONV_FAILED;
1861
1862 memcpy(dst, src, srcLen);
1863 }
1864
1865 return inLen;
1866 }
1867
1868 size_t
1869 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1870 const wchar_t *src, size_t srcLen) const
1871 {
1872 if ( srcLen == wxNO_LEN )
1873 srcLen = wxWcslen(src) + 1;
1874
1875 srcLen *= BYTES_PER_CHAR;
1876
1877 if ( dst )
1878 {
1879 if ( dstLen < srcLen )
1880 return wxCONV_FAILED;
1881
1882 memcpy(dst, src, srcLen);
1883 }
1884
1885 return srcLen;
1886 }
1887
1888 // ----------------------------------------------------------------------------
1889 // endian-reversing conversions
1890 // ----------------------------------------------------------------------------
1891
1892 size_t
1893 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1894 const char *src, size_t srcLen) const
1895 {
1896 srcLen = GetLength(src, srcLen);
1897 if ( srcLen == wxNO_LEN )
1898 return wxCONV_FAILED;
1899
1900 srcLen /= BYTES_PER_CHAR;
1901
1902 if ( dst )
1903 {
1904 if ( dstLen < srcLen )
1905 return wxCONV_FAILED;
1906
1907 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1908 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1909 {
1910 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1911 }
1912 }
1913
1914 return srcLen;
1915 }
1916
1917 size_t
1918 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1919 const wchar_t *src, size_t srcLen) const
1920 {
1921 if ( srcLen == wxNO_LEN )
1922 srcLen = wxWcslen(src) + 1;
1923
1924 srcLen *= BYTES_PER_CHAR;
1925
1926 if ( dst )
1927 {
1928 if ( dstLen < srcLen )
1929 return wxCONV_FAILED;
1930
1931 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1932 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1933 {
1934 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1935 }
1936 }
1937
1938 return srcLen;
1939 }
1940
1941 #endif // WC_UTF16/!WC_UTF16
1942
1943
1944 // ============================================================================
1945 // The classes doing conversion using the iconv_xxx() functions
1946 // ============================================================================
1947
1948 #ifdef HAVE_ICONV
1949
// ICONV_FAILED(cres, bufLeft) tests whether an iconv() call returning cres
// and leaving bufLeft bytes unconverted in the input buffer has failed.
//
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// The macro arguments are parenthesized to ensure that arbitrary expressions
// (and not only plain variable names) can be safely used as them.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif
1964
// Cast the address of the input pointer to the type used for the iconv()
// calls in this file; ICONV_CONST is not defined in this section (presumably
// it comes from the build configuration -- TODO confirm).
1965 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1966
// The value compared against to detect an invalid iconv conversion
// descriptor.
1967 #define ICONV_T_INVALID ((iconv_t)-1)
1968
// WC_BSWAP swaps the bytes of one wchar_t and WC_ENC is the wxFontEncoding
// value corresponding to the size of wchar_t (UTF-32 for 4 byte wchar_t,
// UTF-16 for 2 byte one).
1969 #if SIZEOF_WCHAR_T == 4
1970 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1971 #define WC_ENC wxFONTENCODING_UTF32
1972 #elif SIZEOF_WCHAR_T == 2
1973 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1974 #define WC_ENC wxFONTENCODING_UTF16
1975 #else // sizeof(wchar_t) != 2 nor 4
1976 // does this ever happen?
1977 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1978 #endif
1979
1980 // ----------------------------------------------------------------------------
1981 // wxMBConv_iconv: encapsulates an iconv character set
1982 // ----------------------------------------------------------------------------
1983
1984 class wxMBConv_iconv : public wxMBConv
1985 {
1986 public:
1987 wxMBConv_iconv(const char *name);
1988 virtual ~wxMBConv_iconv();
1989
1990 // implement base class virtual methods
1991 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1992 const char *src, size_t srcLen = wxNO_LEN) const;
1993 virtual size_t FromWChar(char *dst, size_t dstLen,
1994 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1995 virtual size_t GetMBNulLen() const;
1996
1997 #if wxUSE_UNICODE_UTF8
1998 virtual bool IsUTF8() const;
1999 #endif
2000
2001 virtual wxMBConv *Clone() const
2002 {
2003 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2004 p->m_minMBCharWidth = m_minMBCharWidth;
2005 return p;
2006 }
2007
2008 bool IsOk() const
2009 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2010
2011 protected:
2012 // the iconv handlers used to translate from multibyte
2013 // to wide char and in the other direction
2014 iconv_t m2w,
2015 w2m;
2016
2017 #if wxUSE_THREADS
2018 // guards access to m2w and w2m objects
2019 wxMutex m_iconvMutex;
2020 #endif
2021
2022 private:
2023 // the name (for iconv_open()) of a wide char charset -- if none is
2024 // available on this machine, it will remain NULL
2025 static wxString ms_wcCharsetName;
2026
2027 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2028 // different endian-ness than the native one
2029 static bool ms_wcNeedsSwap;
2030
2031
2032 // name of the encoding handled by this conversion
2033 wxString m_name;
2034
2035 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2036 // initially
2037 size_t m_minMBCharWidth;
2038 };
2039
2040 // make the constructor available for unit testing
2041 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2042 {
2043 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2044 if ( !result->IsOk() )
2045 {
2046 delete result;
2047 return 0;
2048 }
2049
2050 return result;
2051 }
2052
// Definitions of the static members of wxMBConv_iconv: the wide char charset
// name is default-constructed here (the constructor only looks for a charset
// while it is empty) and no byte swapping is done by default.
2053 wxString wxMBConv_iconv::ms_wcCharsetName;
2054 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2055
2056 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2057 : m_name(name)
2058 {
2059 m_minMBCharWidth = 0;
2060
2061 // check for charset that represents wchar_t:
2062 if ( ms_wcCharsetName.empty() )
2063 {
2064 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2065
2066 #if wxUSE_FONTMAP
2067 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2068 #else // !wxUSE_FONTMAP
2069 static const wxChar *names_static[] =
2070 {
2071 #if SIZEOF_WCHAR_T == 4
2072 _T("UCS-4"),
2073 #elif SIZEOF_WCHAR_T = 2
2074 _T("UCS-2"),
2075 #endif
2076 NULL
2077 };
2078 const wxChar **names = names_static;
2079 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2080
2081 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2082 {
2083 const wxString nameCS(*names);
2084
2085 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2086 wxString nameXE(nameCS);
2087
2088 #ifdef WORDS_BIGENDIAN
2089 nameXE += _T("BE");
2090 #else // little endian
2091 nameXE += _T("LE");
2092 #endif
2093
2094 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2095 nameXE.c_str());
2096
2097 m2w = iconv_open(nameXE.ToAscii(), name);
2098 if ( m2w == ICONV_T_INVALID )
2099 {
2100 // try charset w/o bytesex info (e.g. "UCS4")
2101 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2102 nameCS.c_str());
2103 m2w = iconv_open(nameCS.ToAscii(), name);
2104
2105 // and check for bytesex ourselves:
2106 if ( m2w != ICONV_T_INVALID )
2107 {
2108 char buf[2], *bufPtr;
2109 wchar_t wbuf[2];
2110 size_t insz, outsz;
2111 size_t res;
2112
2113 buf[0] = 'A';
2114 buf[1] = 0;
2115 wbuf[0] = 0;
2116 insz = 2;
2117 outsz = SIZEOF_WCHAR_T * 2;
2118 char* wbufPtr = (char*)wbuf;
2119 bufPtr = buf;
2120
2121 res = iconv(
2122 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2123 &wbufPtr, &outsz);
2124
2125 if (ICONV_FAILED(res, insz))
2126 {
2127 wxLogLastError(wxT("iconv"));
2128 wxLogError(_("Conversion to charset '%s' doesn't work."),
2129 nameCS.c_str());
2130 }
2131 else // ok, can convert to this encoding, remember it
2132 {
2133 ms_wcCharsetName = nameCS;
2134 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2135 }
2136 }
2137 }
2138 else // use charset not requiring byte swapping
2139 {
2140 ms_wcCharsetName = nameXE;
2141 }
2142 }
2143
2144 wxLogTrace(TRACE_STRCONV,
2145 wxT("iconv wchar_t charset is \"%s\"%s"),
2146 ms_wcCharsetName.empty() ? wxString("<none>")
2147 : ms_wcCharsetName,
2148 ms_wcNeedsSwap ? _T(" (needs swap)")
2149 : _T(""));
2150 }
2151 else // we already have ms_wcCharsetName
2152 {
2153 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2154 }
2155
2156 if ( ms_wcCharsetName.empty() )
2157 {
2158 w2m = ICONV_T_INVALID;
2159 }
2160 else
2161 {
2162 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2163 if ( w2m == ICONV_T_INVALID )
2164 {
2165 wxLogTrace(TRACE_STRCONV,
2166 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2167 ms_wcCharsetName.c_str(), name);
2168 }
2169 }
2170 }
2171
2172 wxMBConv_iconv::~wxMBConv_iconv()
2173 {
2174 if ( m2w != ICONV_T_INVALID )
2175 iconv_close(m2w);
2176 if ( w2m != ICONV_T_INVALID )
2177 iconv_close(w2m);
2178 }
2179
2180 size_t
2181 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2182 const char *src, size_t srcLen) const
2183 {
2184 if ( srcLen == wxNO_LEN )
2185 {
2186 // find the string length: notice that must be done differently for
2187 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2188 // consecutive NULs
2189 const size_t nulLen = GetMBNulLen();
2190 switch ( nulLen )
2191 {
2192 default:
2193 return wxCONV_FAILED;
2194
2195 case 1:
2196 srcLen = strlen(src); // arguably more optimized than our version
2197 break;
2198
2199 case 2:
2200 case 4:
2201 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2202 // but they also have to start at character boundary and not
2203 // span two adjacent characters
2204 const char *p;
2205 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2206 ;
2207 srcLen = p - src;
2208 break;
2209 }
2210
2211 // when we're determining the length of the string ourselves we count
2212 // the terminating NUL(s) as part of it and always NUL-terminate the
2213 // output
2214 srcLen += nulLen;
2215 }
2216
2217 // we express length in the number of (wide) characters but iconv always
2218 // counts buffer sizes it in bytes
2219 dstLen *= SIZEOF_WCHAR_T;
2220
2221 #if wxUSE_THREADS
2222 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2223 // Unfortunately there are a couple of global wxCSConv objects such as
2224 // wxConvLocal that are used all over wx code, so we have to make sure
2225 // the handle is used by at most one thread at the time. Otherwise
2226 // only a few wx classes would be safe to use from non-main threads
2227 // as MB<->WC conversion would fail "randomly".
2228 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2229 #endif // wxUSE_THREADS
2230
2231 size_t res, cres;
2232 const char *pszPtr = src;
2233
2234 if ( dst )
2235 {
2236 char* bufPtr = (char*)dst;
2237
2238 // have destination buffer, convert there
2239 size_t dstLenOrig = dstLen;
2240 cres = iconv(m2w,
2241 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2242 &bufPtr, &dstLen);
2243
2244 // convert the number of bytes converted as returned by iconv to the
2245 // number of (wide) characters converted that we need
2246 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2247
2248 if (ms_wcNeedsSwap)
2249 {
2250 // convert to native endianness
2251 for ( unsigned i = 0; i < res; i++ )
2252 dst[i] = WC_BSWAP(dst[i]);
2253 }
2254 }
2255 else // no destination buffer
2256 {
2257 // convert using temp buffer to calculate the size of the buffer needed
2258 wchar_t tbuf[8];
2259 res = 0;
2260
2261 do
2262 {
2263 char* bufPtr = (char*)tbuf;
2264 dstLen = 8 * SIZEOF_WCHAR_T;
2265
2266 cres = iconv(m2w,
2267 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2268 &bufPtr, &dstLen );
2269
2270 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2271 }
2272 while ((cres == (size_t)-1) && (errno == E2BIG));
2273 }
2274
2275 if (ICONV_FAILED(cres, srcLen))
2276 {
2277 //VS: it is ok if iconv fails, hence trace only
2278 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2279 return wxCONV_FAILED;
2280 }
2281
2282 return res;
2283 }
2284
2285 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2286 const wchar_t *src, size_t srcLen) const
2287 {
2288 #if wxUSE_THREADS
2289 // NB: explained in MB2WC
2290 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2291 #endif
2292
2293 if ( srcLen == wxNO_LEN )
2294 srcLen = wxWcslen(src) + 1;
2295
2296 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2297 size_t outbuflen = dstLen;
2298 size_t res, cres;
2299
2300 wchar_t *tmpbuf = 0;
2301
2302 if (ms_wcNeedsSwap)
2303 {
2304 // need to copy to temp buffer to switch endianness
2305 // (doing WC_BSWAP twice on the original buffer won't help, as it
2306 // could be in read-only memory, or be accessed in some other thread)
2307 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2308 for ( size_t i = 0; i < srcLen; i++ )
2309 tmpbuf[i] = WC_BSWAP(src[i]);
2310
2311 tmpbuf[srcLen] = L'\0';
2312 src = tmpbuf;
2313 }
2314
2315 char* inbuf = (char*)src;
2316 if ( dst )
2317 {
2318 // have destination buffer, convert there
2319 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2320
2321 res = dstLen - outbuflen;
2322 }
2323 else // no destination buffer
2324 {
2325 // convert using temp buffer to calculate the size of the buffer needed
2326 char tbuf[16];
2327 res = 0;
2328 do
2329 {
2330 dst = tbuf;
2331 outbuflen = 16;
2332
2333 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2334
2335 res += 16 - outbuflen;
2336 }
2337 while ((cres == (size_t)-1) && (errno == E2BIG));
2338 }
2339
2340 if (ms_wcNeedsSwap)
2341 {
2342 free(tmpbuf);
2343 }
2344
2345 if (ICONV_FAILED(cres, inbuflen))
2346 {
2347 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2348 return wxCONV_FAILED;
2349 }
2350
2351 return res;
2352 }
2353
2354 size_t wxMBConv_iconv::GetMBNulLen() const
2355 {
2356 if ( m_minMBCharWidth == 0 )
2357 {
2358 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2359
2360 #if wxUSE_THREADS
2361 // NB: explained in MB2WC
2362 wxMutexLocker lock(self->m_iconvMutex);
2363 #endif
2364
2365 const wchar_t *wnul = L"";
2366 char buf[8]; // should be enough for NUL in any encoding
2367 size_t inLen = sizeof(wchar_t),
2368 outLen = WXSIZEOF(buf);
2369 char *inBuff = (char *)wnul;
2370 char *outBuff = buf;
2371 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2372 {
2373 self->m_minMBCharWidth = (size_t)-1;
2374 }
2375 else // ok
2376 {
2377 self->m_minMBCharWidth = outBuff - buf;
2378 }
2379 }
2380
2381 return m_minMBCharWidth;
2382 }
2383
2384 #if wxUSE_UNICODE_UTF8
2385 bool wxMBConv_iconv::IsUTF8() const
2386 {
2387 return wxStricmp(m_name, "UTF-8") == 0 ||
2388 wxStricmp(m_name, "UTF8") == 0;
2389 }
2390 #endif
2391
2392 #endif // HAVE_ICONV
2393
2394
2395 // ============================================================================
2396 // Win32 conversion classes
2397 // ============================================================================
2398
2399 #ifdef wxHAVE_WIN32_MB2WC
2400
2401 // from utils.cpp
2402 #if wxUSE_FONTMAP
2403 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2404 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2405 #endif
2406
2407 class wxMBConv_win32 : public wxMBConv
2408 {
2409 public:
2410 wxMBConv_win32()
2411 {
2412 m_CodePage = CP_ACP;
2413 m_minMBCharWidth = 0;
2414 }
2415
2416 wxMBConv_win32(const wxMBConv_win32& conv)
2417 : wxMBConv()
2418 {
2419 m_CodePage = conv.m_CodePage;
2420 m_minMBCharWidth = conv.m_minMBCharWidth;
2421 }
2422
2423 #if wxUSE_FONTMAP
2424 wxMBConv_win32(const char* name)
2425 {
2426 m_CodePage = wxCharsetToCodepage(name);
2427 m_minMBCharWidth = 0;
2428 }
2429
2430 wxMBConv_win32(wxFontEncoding encoding)
2431 {
2432 m_CodePage = wxEncodingToCodepage(encoding);
2433 m_minMBCharWidth = 0;
2434 }
2435 #endif // wxUSE_FONTMAP
2436
2437 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2438 {
2439 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2440 // the behaviour is not compatible with the Unix version (using iconv)
2441 // and break the library itself, e.g. wxTextInputStream::NextChar()
2442 // wouldn't work if reading an incomplete MB char didn't result in an
2443 // error
2444 //
2445 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2446 // Win XP or newer and it is not supported for UTF-[78] so we always
2447 // use our own conversions in this case. See
2448 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2449 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2450 if ( m_CodePage == CP_UTF8 )
2451 {
2452 return wxMBConvUTF8().MB2WC(buf, psz, n);
2453 }
2454
2455 if ( m_CodePage == CP_UTF7 )
2456 {
2457 return wxMBConvUTF7().MB2WC(buf, psz, n);
2458 }
2459
2460 int flags = 0;
2461 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2462 IsAtLeastWin2kSP4() )
2463 {
2464 flags = MB_ERR_INVALID_CHARS;
2465 }
2466
2467 const size_t len = ::MultiByteToWideChar
2468 (
2469 m_CodePage, // code page
2470 flags, // flags: fall on error
2471 psz, // input string
2472 -1, // its length (NUL-terminated)
2473 buf, // output string
2474 buf ? n : 0 // size of output buffer
2475 );
2476 if ( !len )
2477 {
2478 // function totally failed
2479 return wxCONV_FAILED;
2480 }
2481
2482 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2483 // check if we succeeded, by doing a double trip:
2484 if ( !flags && buf )
2485 {
2486 const size_t mbLen = strlen(psz);
2487 wxCharBuffer mbBuf(mbLen);
2488 if ( ::WideCharToMultiByte
2489 (
2490 m_CodePage,
2491 0,
2492 buf,
2493 -1,
2494 mbBuf.data(),
2495 mbLen + 1, // size in bytes, not length
2496 NULL,
2497 NULL
2498 ) == 0 ||
2499 strcmp(mbBuf, psz) != 0 )
2500 {
2501 // we didn't obtain the same thing we started from, hence
2502 // the conversion was lossy and we consider that it failed
2503 return wxCONV_FAILED;
2504 }
2505 }
2506
2507 // note that it returns count of written chars for buf != NULL and size
2508 // of the needed buffer for buf == NULL so in either case the length of
2509 // the string (which never includes the terminating NUL) is one less
2510 return len - 1;
2511 }
2512
2513 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2514 {
2515 /*
2516 we have a problem here: by default, WideCharToMultiByte() may
2517 replace characters unrepresentable in the target code page with bad
2518 quality approximations such as turning "1/2" symbol (U+00BD) into
2519 "1" for the code pages which don't have it and we, obviously, want
2520 to avoid this at any price
2521
2522 the trouble is that this function does it _silently_, i.e. it won't
2523 even tell us whether it did or not... Win98/2000 and higher provide
2524 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2525 we have to resort to a round trip, i.e. check that converting back
2526 results in the same string -- this is, of course, expensive but
2527 otherwise we simply can't be sure to not garble the data.
2528 */
2529
2530 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2531 // it doesn't work with CJK encodings (which we test for rather roughly
2532 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2533 // supporting it
2534 BOOL usedDef wxDUMMY_INITIALIZE(false);
2535 BOOL *pUsedDef;
2536 int flags;
2537 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2538 {
2539 // it's our lucky day
2540 flags = WC_NO_BEST_FIT_CHARS;
2541 pUsedDef = &usedDef;
2542 }
2543 else // old system or unsupported encoding
2544 {
2545 flags = 0;
2546 pUsedDef = NULL;
2547 }
2548
2549 const size_t len = ::WideCharToMultiByte
2550 (
2551 m_CodePage, // code page
2552 flags, // either none or no best fit
2553 pwz, // input string
2554 -1, // it is (wide) NUL-terminated
2555 buf, // output buffer
2556 buf ? n : 0, // and its size
2557 NULL, // default "replacement" char
2558 pUsedDef // [out] was it used?
2559 );
2560
2561 if ( !len )
2562 {
2563 // function totally failed
2564 return wxCONV_FAILED;
2565 }
2566
2567 // we did something, check if we really succeeded
2568 if ( flags )
2569 {
2570 // check if the conversion failed, i.e. if any replacements
2571 // were done
2572 if ( usedDef )
2573 return wxCONV_FAILED;
2574 }
2575 else // we must resort to double tripping...
2576 {
2577 // first we need to ensure that we really have the MB data: this is
2578 // not the case if we're called with NULL buffer, in which case we
2579 // need to do the conversion yet again
2580 wxCharBuffer bufDef;
2581 if ( !buf )
2582 {
2583 bufDef = wxCharBuffer(len);
2584 buf = bufDef.data();
2585 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2586 buf, len, NULL, NULL) )
2587 return wxCONV_FAILED;
2588 }
2589
2590 if ( !n )
2591 n = wcslen(pwz);
2592 wxWCharBuffer wcBuf(n);
2593 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2594 wcscmp(wcBuf, pwz) != 0 )
2595 {
2596 // we didn't obtain the same thing we started from, hence
2597 // the conversion was lossy and we consider that it failed
2598 return wxCONV_FAILED;
2599 }
2600 }
2601
2602 // see the comment above for the reason of "len - 1"
2603 return len - 1;
2604 }
2605
2606 virtual size_t GetMBNulLen() const
2607 {
2608 if ( m_minMBCharWidth == 0 )
2609 {
2610 int len = ::WideCharToMultiByte
2611 (
2612 m_CodePage, // code page
2613 0, // no flags
2614 L"", // input string
2615 1, // translate just the NUL
2616 NULL, // output buffer
2617 0, // and its size
2618 NULL, // no replacement char
2619 NULL // [out] don't care if it was used
2620 );
2621
2622 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2623 switch ( len )
2624 {
2625 default:
2626 wxLogDebug(_T("Unexpected NUL length %d"), len);
2627 self->m_minMBCharWidth = (size_t)-1;
2628 break;
2629
2630 case 0:
2631 self->m_minMBCharWidth = (size_t)-1;
2632 break;
2633
2634 case 1:
2635 case 2:
2636 case 4:
2637 self->m_minMBCharWidth = len;
2638 break;
2639 }
2640 }
2641
2642 return m_minMBCharWidth;
2643 }
2644
2645 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2646
2647 bool IsOk() const { return m_CodePage != -1; }
2648
2649 private:
2650 static bool CanUseNoBestFit()
2651 {
2652 static int s_isWin98Or2k = -1;
2653
2654 if ( s_isWin98Or2k == -1 )
2655 {
2656 int verMaj, verMin;
2657 switch ( wxGetOsVersion(&verMaj, &verMin) )
2658 {
2659 case wxOS_WINDOWS_9X:
2660 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2661 break;
2662
2663 case wxOS_WINDOWS_NT:
2664 s_isWin98Or2k = verMaj >= 5;
2665 break;
2666
2667 default:
2668 // unknown: be conservative by default
2669 s_isWin98Or2k = 0;
2670 break;
2671 }
2672
2673 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2674 }
2675
2676 return s_isWin98Or2k == 1;
2677 }
2678
2679 static bool IsAtLeastWin2kSP4()
2680 {
2681 #ifdef __WXWINCE__
2682 return false;
2683 #else
2684 static int s_isAtLeastWin2kSP4 = -1;
2685
2686 if ( s_isAtLeastWin2kSP4 == -1 )
2687 {
2688 OSVERSIONINFOEX ver;
2689
2690 memset(&ver, 0, sizeof(ver));
2691 ver.dwOSVersionInfoSize = sizeof(ver);
2692 GetVersionEx((OSVERSIONINFO*)&ver);
2693
2694 s_isAtLeastWin2kSP4 =
2695 ((ver.dwMajorVersion > 5) || // Vista+
2696 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2697 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2698 ver.wServicePackMajor >= 4)) // 2000 SP4+
2699 ? 1 : 0;
2700 }
2701
2702 return s_isAtLeastWin2kSP4 == 1;
2703 #endif
2704 }
2705
2706
2707 // the code page we're working with
2708 long m_CodePage;
2709
2710 // cached result of GetMBNulLen(), set to 0 initially meaning
2711 // "unknown"
2712 size_t m_minMBCharWidth;
2713 };
2714
2715 #endif // wxHAVE_WIN32_MB2WC
2716
2717
2718 // ============================================================================
2719 // wxEncodingConverter based conversion classes
2720 // ============================================================================
2721
2722 #if wxUSE_FONTMAP
2723
2724 class wxMBConv_wxwin : public wxMBConv
2725 {
2726 private:
2727 void Init()
2728 {
2729 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2730 // The wxMBConv_cf class does a better job.
2731 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2732 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2733 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2734 }
2735
2736 public:
2737 // temporarily just use wxEncodingConverter stuff,
2738 // so that it works while a better implementation is built
2739 wxMBConv_wxwin(const char* name)
2740 {
2741 if (name)
2742 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2743 else
2744 m_enc = wxFONTENCODING_SYSTEM;
2745
2746 Init();
2747 }
2748
2749 wxMBConv_wxwin(wxFontEncoding enc)
2750 {
2751 m_enc = enc;
2752
2753 Init();
2754 }
2755
2756 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2757 {
2758 size_t inbuf = strlen(psz);
2759 if (buf)
2760 {
2761 if (!m2w.Convert(psz, buf))
2762 return wxCONV_FAILED;
2763 }
2764 return inbuf;
2765 }
2766
2767 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2768 {
2769 const size_t inbuf = wxWcslen(psz);
2770 if (buf)
2771 {
2772 if (!w2m.Convert(psz, buf))
2773 return wxCONV_FAILED;
2774 }
2775
2776 return inbuf;
2777 }
2778
2779 virtual size_t GetMBNulLen() const
2780 {
2781 switch ( m_enc )
2782 {
2783 case wxFONTENCODING_UTF16BE:
2784 case wxFONTENCODING_UTF16LE:
2785 return 2;
2786
2787 case wxFONTENCODING_UTF32BE:
2788 case wxFONTENCODING_UTF32LE:
2789 return 4;
2790
2791 default:
2792 return 1;
2793 }
2794 }
2795
2796 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2797
2798 bool IsOk() const { return m_ok; }
2799
2800 public:
2801 wxFontEncoding m_enc;
2802 wxEncodingConverter m2w, w2m;
2803
2804 private:
2805 // were we initialized successfully?
2806 bool m_ok;
2807
2808 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2809 };
2810
2811 // make the constructors available for unit testing
2812 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2813 {
2814 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2815 if ( !result->IsOk() )
2816 {
2817 delete result;
2818 return 0;
2819 }
2820
2821 return result;
2822 }
2823
2824 #endif // wxUSE_FONTMAP
2825
2826 // ============================================================================
2827 // wxCSConv implementation
2828 // ============================================================================
2829
2830 void wxCSConv::Init()
2831 {
2832 m_name = NULL;
2833 m_convReal = NULL;
2834 m_deferred = true;
2835 }
2836
2837 wxCSConv::wxCSConv(const wxString& charset)
2838 {
2839 Init();
2840
2841 if ( !charset.empty() )
2842 {
2843 SetName(charset.ToAscii());
2844 }
2845
2846 #if wxUSE_FONTMAP
2847 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2848 #else
2849 m_encoding = wxFONTENCODING_SYSTEM;
2850 #endif
2851 }
2852
2853 wxCSConv::wxCSConv(wxFontEncoding encoding)
2854 {
2855 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2856 {
2857 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2858
2859 encoding = wxFONTENCODING_SYSTEM;
2860 }
2861
2862 Init();
2863
2864 m_encoding = encoding;
2865 }
2866
2867 wxCSConv::~wxCSConv()
2868 {
2869 Clear();
2870 }
2871
2872 wxCSConv::wxCSConv(const wxCSConv& conv)
2873 : wxMBConv()
2874 {
2875 Init();
2876
2877 SetName(conv.m_name);
2878 m_encoding = conv.m_encoding;
2879 }
2880
2881 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2882 {
2883 Clear();
2884
2885 SetName(conv.m_name);
2886 m_encoding = conv.m_encoding;
2887
2888 return *this;
2889 }
2890
2891 void wxCSConv::Clear()
2892 {
2893 free(m_name);
2894 delete m_convReal;
2895
2896 m_name = NULL;
2897 m_convReal = NULL;
2898 }
2899
2900 void wxCSConv::SetName(const char *charset)
2901 {
2902 if (charset)
2903 {
2904 m_name = wxStrdup(charset);
2905 m_deferred = true;
2906 }
2907 }
2908
2909 #if wxUSE_FONTMAP
2910
2911 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2912 wxEncodingNameCache );
2913
2914 static wxEncodingNameCache gs_nameCache;
2915 #endif
2916
2917 wxMBConv *wxCSConv::DoCreate() const
2918 {
2919 #if wxUSE_FONTMAP
2920 wxLogTrace(TRACE_STRCONV,
2921 wxT("creating conversion for %s"),
2922 (m_name ? m_name
2923 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2924 #endif // wxUSE_FONTMAP
2925
2926 // check for the special case of ASCII or ISO8859-1 charset: as we have
2927 // special knowledge of it anyhow, we don't need to create a special
2928 // conversion object
2929 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2930 m_encoding == wxFONTENCODING_DEFAULT )
2931 {
2932 // don't convert at all
2933 return NULL;
2934 }
2935
2936 // we trust OS to do conversion better than we can so try external
2937 // conversion methods first
2938 //
2939 // the full order is:
2940 // 1. OS conversion (iconv() under Unix or Win32 API)
2941 // 2. hard coded conversions for UTF
2942 // 3. wxEncodingConverter as fall back
2943
2944 // step (1)
2945 #ifdef HAVE_ICONV
2946 #if !wxUSE_FONTMAP
2947 if ( m_name )
2948 #endif // !wxUSE_FONTMAP
2949 {
2950 #if wxUSE_FONTMAP
2951 wxFontEncoding encoding(m_encoding);
2952 #endif
2953
2954 if ( m_name )
2955 {
2956 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2957 if ( conv->IsOk() )
2958 return conv;
2959
2960 delete conv;
2961
2962 #if wxUSE_FONTMAP
2963 encoding =
2964 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2965 #endif // wxUSE_FONTMAP
2966 }
2967 #if wxUSE_FONTMAP
2968 {
2969 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2970 if ( it != gs_nameCache.end() )
2971 {
2972 if ( it->second.empty() )
2973 return NULL;
2974
2975 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2976 if ( conv->IsOk() )
2977 return conv;
2978
2979 delete conv;
2980 }
2981
2982 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2983 // CS : in case this does not return valid names (eg for MacRoman)
2984 // encoding got a 'failure' entry in the cache all the same,
2985 // although it just has to be created using a different method, so
2986 // only store failed iconv creation attempts (or perhaps we
2987 // shoulnd't do this at all ?)
2988 if ( names[0] != NULL )
2989 {
2990 for ( ; *names; ++names )
2991 {
2992 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2993 // will need changes that will obsolete this
2994 wxString name(*names);
2995 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2996 if ( conv->IsOk() )
2997 {
2998 gs_nameCache[encoding] = *names;
2999 return conv;
3000 }
3001
3002 delete conv;
3003 }
3004
3005 gs_nameCache[encoding] = _T(""); // cache the failure
3006 }
3007 }
3008 #endif // wxUSE_FONTMAP
3009 }
3010 #endif // HAVE_ICONV
3011
3012 #ifdef wxHAVE_WIN32_MB2WC
3013 {
3014 #if wxUSE_FONTMAP
3015 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3016 : new wxMBConv_win32(m_encoding);
3017 if ( conv->IsOk() )
3018 return conv;
3019
3020 delete conv;
3021 #else
3022 return NULL;
3023 #endif
3024 }
3025 #endif // wxHAVE_WIN32_MB2WC
3026
3027 #ifdef __DARWIN__
3028 {
3029 // leave UTF16 and UTF32 to the built-ins of wx
3030 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3031 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3032 {
3033 #if wxUSE_FONTMAP
3034 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3035 : new wxMBConv_cf(m_encoding);
3036 #else
3037 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3038 #endif
3039
3040 if ( conv->IsOk() )
3041 return conv;
3042
3043 delete conv;
3044 }
3045 }
3046 #endif // __DARWIN__
3047
3048 // step (2)
3049 wxFontEncoding enc = m_encoding;
3050 #if wxUSE_FONTMAP
3051 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3052 {
3053 // use "false" to suppress interactive dialogs -- we can be called from
3054 // anywhere and popping up a dialog from here is the last thing we want to
3055 // do
3056 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3057 }
3058 #endif // wxUSE_FONTMAP
3059
3060 switch ( enc )
3061 {
3062 case wxFONTENCODING_UTF7:
3063 return new wxMBConvUTF7;
3064
3065 case wxFONTENCODING_UTF8:
3066 return new wxMBConvUTF8;
3067
3068 case wxFONTENCODING_UTF16BE:
3069 return new wxMBConvUTF16BE;
3070
3071 case wxFONTENCODING_UTF16LE:
3072 return new wxMBConvUTF16LE;
3073
3074 case wxFONTENCODING_UTF32BE:
3075 return new wxMBConvUTF32BE;
3076
3077 case wxFONTENCODING_UTF32LE:
3078 return new wxMBConvUTF32LE;
3079
3080 default:
3081 // nothing to do but put here to suppress gcc warnings
3082 break;
3083 }
3084
3085 // step (3)
3086 #if wxUSE_FONTMAP
3087 {
3088 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3089 : new wxMBConv_wxwin(m_encoding);
3090 if ( conv->IsOk() )
3091 return conv;
3092
3093 delete conv;
3094 }
3095 #endif // wxUSE_FONTMAP
3096
3097 // NB: This is a hack to prevent deadlock. What could otherwise happen
3098 // in Unicode build: wxConvLocal creation ends up being here
3099 // because of some failure and logs the error. But wxLog will try to
3100 // attach a timestamp, for which it will need wxConvLocal (to convert
3101 // time to char* and then wchar_t*), but that fails, tries to log the
3102 // error, but wxLog has an (already locked) critical section that
3103 // guards the static buffer.
3104 static bool alreadyLoggingError = false;
3105 if (!alreadyLoggingError)
3106 {
3107 alreadyLoggingError = true;
3108 wxLogError(_("Cannot convert from the charset '%s'!"),
3109 m_name ? m_name
3110 :
3111 #if wxUSE_FONTMAP
3112 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3113 #else // !wxUSE_FONTMAP
3114 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3115 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3116 );
3117
3118 alreadyLoggingError = false;
3119 }
3120
3121 return NULL;
3122 }
3123
3124 void wxCSConv::CreateConvIfNeeded() const
3125 {
3126 if ( m_deferred )
3127 {
3128 wxCSConv *self = (wxCSConv *)this; // const_cast
3129
3130 // if we don't have neither the name nor the encoding, use the default
3131 // encoding for this system
3132 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3133 {
3134 #if wxUSE_INTL
3135 self->m_encoding = wxLocale::GetSystemEncoding();
3136 #else
3137 // fallback to some reasonable default:
3138 self->m_encoding = wxFONTENCODING_ISO8859_1;
3139 #endif // wxUSE_INTL
3140 }
3141
3142 self->m_convReal = DoCreate();
3143 self->m_deferred = false;
3144 }
3145 }
3146
3147 bool wxCSConv::IsOk() const
3148 {
3149 CreateConvIfNeeded();
3150
3151 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3152 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3153 return true; // always ok as we do it ourselves
3154
3155 // m_convReal->IsOk() is called at its own creation, so we know it must
3156 // be ok if m_convReal is non-NULL
3157 return m_convReal != NULL;
3158 }
3159
3160 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3161 const char *src, size_t srcLen) const
3162 {
3163 CreateConvIfNeeded();
3164
3165 if (m_convReal)
3166 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3167
3168 // latin-1 (direct)
3169 if ( srcLen == wxNO_LEN )
3170 srcLen = strlen(src) + 1; // take trailing NUL too
3171
3172 if ( dst )
3173 {
3174 if ( dstLen < srcLen )
3175 return wxCONV_FAILED;
3176
3177 for ( size_t n = 0; n < srcLen; n++ )
3178 dst[n] = (unsigned char)(src[n]);
3179 }
3180
3181 return srcLen;
3182 }
3183
3184 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3185 const wchar_t *src, size_t srcLen) const
3186 {
3187 CreateConvIfNeeded();
3188
3189 if (m_convReal)
3190 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3191
3192 // latin-1 (direct)
3193 if ( srcLen == wxNO_LEN )
3194 srcLen = wxWcslen(src) + 1;
3195
3196 if ( dst )
3197 {
3198 if ( dstLen < srcLen )
3199 return wxCONV_FAILED;
3200
3201 for ( size_t n = 0; n < srcLen; n++ )
3202 {
3203 if ( src[n] > 0xFF )
3204 return wxCONV_FAILED;
3205
3206 dst[n] = (char)src[n];
3207 }
3208
3209 }
3210 else // still need to check the input validity
3211 {
3212 for ( size_t n = 0; n < srcLen; n++ )
3213 {
3214 if ( src[n] > 0xFF )
3215 return wxCONV_FAILED;
3216 }
3217 }
3218
3219 return srcLen;
3220 }
3221
3222 size_t wxCSConv::GetMBNulLen() const
3223 {
3224 CreateConvIfNeeded();
3225
3226 if ( m_convReal )
3227 {
3228 return m_convReal->GetMBNulLen();
3229 }
3230
3231 // otherwise, we are ISO-8859-1
3232 return 1;
3233 }
3234
3235 #if wxUSE_UNICODE_UTF8
3236 bool wxCSConv::IsUTF8() const
3237 {
3238 CreateConvIfNeeded();
3239
3240 if ( m_convReal )
3241 {
3242 return m_convReal->IsUTF8();
3243 }
3244
3245 // otherwise, we are ISO-8859-1
3246 return false;
3247 }
3248 #endif
3249
3250
3251 #if wxUSE_UNICODE
3252
3253 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3254 {
3255 if ( !s )
3256 return wxWCharBuffer();
3257
3258 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3259 if ( !wbuf )
3260 wbuf = wxMBConvUTF8().cMB2WX(s);
3261 if ( !wbuf )
3262 wbuf = wxConvISO8859_1.cMB2WX(s);
3263
3264 return wbuf;
3265 }
3266
3267 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3268 {
3269 if ( !ws )
3270 return wxCharBuffer();
3271
3272 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3273 if ( !buf )
3274 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3275
3276 return buf;
3277 }
3278
3279 #endif // wxUSE_UNICODE
3280
3281 // ----------------------------------------------------------------------------
3282 // globals
3283 // ----------------------------------------------------------------------------
3284
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3291
3292 #undef wxConvLibc
3293 #undef wxConvUTF8
3294 #undef wxConvUTF7
3295 #undef wxConvLocal
3296 #undef wxConvISO8859_1
3297
3298 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3299 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3300 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3301 { \
3302 static impl_klass name##Obj ctor_args; \
3303 return &name##Obj; \
3304 } \
3305 /* this ensures that all global converter objects are created */ \
3306 /* by the time static initialization is done, i.e. before any */ \
3307 /* thread is launched: */ \
3308 static klass* gs_##name##instance = wxGet_##name##Ptr()
3309
3310 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3311 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3312
3313 #ifdef __WINDOWS__
3314 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3315 #else
3316 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3317 #endif
3318
3319 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3320 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3321 // provokes an error message about "not enough macro parameters"; and we
3322 // can't use "()" here as the name##Obj declaration would be parsed as a
3323 // function declaration then, so use a semicolon and live with an extra
3324 // empty statement (and hope that no compilers warns about this)
3325 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3326 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3327
3328 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3329 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3330
3331 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3332 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3333
3334 #ifdef __DARWIN__
3335 // The xnu kernel always communicates file paths in decomposed UTF-8.
3336 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3337 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3338 #endif
3339
3340 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3341 #ifdef __DARWIN__
3342 &wxConvMacUTF8DObj;
3343 #else // !__DARWIN__
3344 wxGet_wxConvLibcPtr();
3345 #endif // __DARWIN__/!__DARWIN__
3346
3347 #else // !wxUSE_WCHAR_T
3348
3349 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3350 // stand-ins in absence of wchar_t
3351 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3352 wxConvISO8859_1,
3353 wxConvLocal,
3354 wxConvUTF8;
3355
3356 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T