fix the output length returned from cWC2MB/MB2WC() to be consistent with From/ToWChar...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV _T("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
static bool NotAllNULs(const char *p, size_t n)
{
    // look at the n bytes starting at p, stopping at the first non-NUL one
    for ( size_t i = 0; i < n; i++ )
    {
        if ( p[i] != '\0' )
            return true;
    }

    // all the bytes were NUL (trivially so if n is 0)
    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existins ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten = 0;
171
172 // the number of NULs terminating this string
173 size_t nulLen = 0; // not really needed, but just to avoid warnings
174
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
178 // NULs at the end
179 wxCharBuffer bufTmp;
180 const char *srcEnd;
181 if ( srcLen != wxNO_LEN )
182 {
183 // we need to know how to find the end of this string
184 nulLen = GetMBNulLen();
185 if ( nulLen == wxCONV_FAILED )
186 return wxCONV_FAILED;
187
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
190 {
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
193 char * const p = bufTmp.data();
194 memcpy(p, src, srcLen);
195 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
196 *s = '\0';
197
198 src = bufTmp;
199 }
200
201 srcEnd = src + srcLen;
202 }
203 else // quit after the first loop iteration
204 {
205 srcEnd = NULL;
206 }
207
208 for ( ;; )
209 {
210 // try to convert the current chunk
211 size_t lenChunk = MB2WC(NULL, src, 0);
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
214
215 dstWritten += lenChunk;
216 if ( !srcEnd )
217 dstWritten++;
218
219 if ( !lenChunk )
220 {
221 // nothing left in the input string, conversion succeeded
222 break;
223 }
224
225 if ( dst )
226 {
227 if ( dstWritten > dstLen )
228 return wxCONV_FAILED;
229
230 // +1 is for trailing NUL
231 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
232 return wxCONV_FAILED;
233
234 dst += lenChunk;
235 if ( !srcEnd )
236 dst++;
237 }
238
239 if ( !srcEnd )
240 {
241 // we convert just one chunk in this case as this is the entire
242 // string anyhow
243 break;
244 }
245
246 // advance the input pointer past the end of this chunk
247 while ( NotAllNULs(src, nulLen) )
248 {
249 // notice that we must skip over multiple bytes here as we suppose
250 // that if NUL takes 2 or 4 bytes, then all the other characters do
251 // too and so if advanced by a single byte we might erroneously
252 // detect sequences of NUL bytes in the middle of the input
253 src += nulLen;
254 }
255
256 src += nulLen; // skipping over its terminator as well
257
258 // note that ">=" (and not just "==") is needed here as the terminator
259 // we skipped just above could be inside or just after the buffer
260 // delimited by inEnd
261 if ( src >= srcEnd )
262 break;
263 }
264
265 return dstWritten;
266 }
267
268 size_t
269 wxMBConv::FromWChar(char *dst, size_t dstLen,
270 const wchar_t *src, size_t srcLen) const
271 {
272 // the number of chars [which would be] written to dst [if it were not NULL]
273 size_t dstWritten = 0;
274
275 // if we don't know its length we have no choice but to assume that it is
276 // NUL-terminated (notice that it can still be NUL-terminated even if
277 // explicit length is given but it doesn't change our return value)
278 const bool isNulTerminated = srcLen == wxNO_LEN;
279
280 // make a copy of the input string unless it is already properly
281 // NUL-terminated
282 wxWCharBuffer bufTmp;
283 if ( isNulTerminated )
284 {
285 srcLen = wxWcslen(src) + 1;
286 }
287 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
288 {
289 // make a copy in order to properly NUL-terminate the string
290 bufTmp = wxWCharBuffer(srcLen);
291 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
292 src = bufTmp;
293 }
294
295 const size_t lenNul = GetMBNulLen();
296 for ( const wchar_t * const srcEnd = src + srcLen;
297 src < srcEnd;
298 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
299 {
300 // try to convert the current chunk
301 size_t lenChunk = WC2MB(NULL, src, 0);
302
303 if ( lenChunk == wxCONV_FAILED )
304 return wxCONV_FAILED;
305
306 dstWritten += lenChunk;
307 if ( isNulTerminated )
308 dstWritten += lenNul;
309
310 if ( dst )
311 {
312 if ( dstWritten > dstLen )
313 return wxCONV_FAILED;
314
315 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
316 return wxCONV_FAILED;
317
318 dst += lenChunk;
319 if ( isNulTerminated )
320 dst += lenNul;
321 }
322 }
323
324 return dstWritten;
325 }
326
327 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
328 {
329 size_t rc = ToWChar(outBuff, outLen, inBuff);
330 if ( rc != wxCONV_FAILED )
331 {
332 // ToWChar() returns the buffer length, i.e. including the trailing
333 // NUL, while this method doesn't take it into account
334 rc--;
335 }
336
337 return rc;
338 }
339
340 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
341 {
342 size_t rc = FromWChar(outBuff, outLen, inBuff);
343 if ( rc != wxCONV_FAILED )
344 {
345 rc -= GetMBNulLen();
346 }
347
348 return rc;
349 }
350
351 wxMBConv::~wxMBConv()
352 {
353 // nothing to do here (necessary for Darwin linking probably)
354 }
355
356 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
357 {
358 if ( psz )
359 {
360 // calculate the length of the buffer needed first
361 const size_t nLen = ToWChar(NULL, 0, psz);
362 if ( nLen != wxCONV_FAILED )
363 {
364 // now do the actual conversion
365 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
366
367 // +1 for the trailing NULL
368 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
369 return buf;
370 }
371 }
372
373 return wxWCharBuffer();
374 }
375
376 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
377 {
378 if ( pwz )
379 {
380 const size_t nLen = FromWChar(NULL, 0, pwz);
381 if ( nLen != wxCONV_FAILED )
382 {
383 wxCharBuffer buf(nLen - 1);
384 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
385 return buf;
386 }
387 }
388
389 return wxCharBuffer();
390 }
391
392 const wxWCharBuffer
393 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
394 {
395 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
396 if ( dstLen != wxCONV_FAILED )
397 {
398 // notice that we allocate space for dstLen+1 wide characters here
399 // because we want the buffer to always be NUL-terminated, even if the
400 // input isn't (as otherwise the caller has no way to know its length)
401 wxWCharBuffer wbuf(dstLen);
402 wbuf.data()[dstLen] = L'\0';
403 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
404 {
405 if ( outLen )
406 {
407 *outLen = dstLen;
408
409 // we also need to handle NUL-terminated input strings
410 // specially: for them the output is the length of the string
411 // excluding the trailing NUL, however if we're asked to
412 // convert a specific number of characters we return the length
413 // of the resulting output even if it's NUL-terminated
414 if ( inLen == wxNO_LEN )
415 (*outLen)--;
416 }
417
418 return wbuf;
419 }
420 }
421
422 if ( outLen )
423 *outLen = 0;
424
425 return wxWCharBuffer();
426 }
427
428 const wxCharBuffer
429 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
430 {
431 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
432 if ( dstLen != wxCONV_FAILED )
433 {
434 const size_t nulLen = GetMBNulLen();
435
436 // as above, ensure that the buffer is always NUL-terminated, even if
437 // the input is not
438 wxCharBuffer buf(dstLen + nulLen - 1);
439 memset(buf.data() + dstLen, 0, nulLen);
440 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
441 {
442 if ( outLen )
443 {
444 *outLen = dstLen;
445
446 if ( inLen == wxNO_LEN )
447 {
448 // in this case both input and output are NUL-terminated
449 // and we're not supposed to count NUL
450 *outLen -= nulLen;
451 }
452 }
453
454 return buf;
455 }
456 }
457
458 if ( outLen )
459 *outLen = 0;
460
461 return wxCharBuffer();
462 }
463
464 // ----------------------------------------------------------------------------
465 // wxMBConvLibc
466 // ----------------------------------------------------------------------------
467
468 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
469 {
470 return wxMB2WC(buf, psz, n);
471 }
472
473 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
474 {
475 return wxWC2MB(buf, psz, n);
476 }
477
478 // ----------------------------------------------------------------------------
479 // wxConvBrokenFileNames
480 // ----------------------------------------------------------------------------
481
482 #ifdef __UNIX__
483
484 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
485 {
486 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
487 wxStricmp(charset, _T("UTF8")) == 0 )
488 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
489 else
490 m_conv = new wxCSConv(charset);
491 }
492
493 #endif // __UNIX__
494
495 // ----------------------------------------------------------------------------
496 // UTF-7
497 // ----------------------------------------------------------------------------
498
499 // Implementation (C) 2004 Fredrik Roubert
500 //
501 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
502
503 //
504 // BASE64 decoding table
505 //
// maps each byte value to the 6 bit value of the corresponding base64 digit
// or to 0xff if the byte is not a base64 digit
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' is 62 and '/' is 63
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9' are 52..61
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O' are 0..14
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z' are 15..25
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o' are 26..40
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z' are 41..51
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: no base64 digits here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
541
542 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
543 const char *src, size_t srcLen) const
544 {
545 DecoderState stateOrig,
546 *statePtr;
547 if ( srcLen == wxNO_LEN )
548 {
549 // convert the entire string, up to and including the trailing NUL
550 srcLen = strlen(src) + 1;
551
552 // when working on the entire strings we don't update nor use the shift
553 // state from the previous call
554 statePtr = &stateOrig;
555 }
556 else // when working with partial strings we do use the shift state
557 {
558 statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
559
560 // also save the old state to be able to rollback to it on error
561 stateOrig = m_stateDecoder;
562 }
563
564 // but to simplify the code below we use this variable in both cases
565 DecoderState& state = *statePtr;
566
567
568 // number of characters [which would have been] written to dst [if it were
569 // not NULL]
570 size_t len = 0;
571
572 const char * const srcEnd = src + srcLen;
573
574 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
575 {
576 const unsigned char cc = *src++;
577
578 if ( state.IsShifted() )
579 {
580 const unsigned char dc = utf7unb64[cc];
581 if ( dc == 0xff )
582 {
583 // end of encoded part
584 state.ToDirect();
585
586 // re-parse this character normally below unless it's '-' which
587 // is consumed by the decoder
588 if ( cc == '-' )
589 continue;
590 }
591 else // valid encoded character
592 {
593 // mini base64 decoder: each character is 6 bits
594 state.bit += 6;
595 state.accum <<= 6;
596 state.accum += dc;
597
598 if ( state.bit >= 8 )
599 {
600 // got the full byte, consume it
601 state.bit -= 8;
602 unsigned char b = (state.accum >> state.bit) & 0x00ff;
603
604 if ( state.isLSB )
605 {
606 // we've got the full word, output it
607 if ( dst )
608 *dst++ = (state.msb << 8) | b;
609 len++;
610 state.isLSB = false;
611 }
612 else // MSB
613 {
614 // just store it while we wait for LSB
615 state.msb = b;
616 state.isLSB = true;
617 }
618 }
619 }
620 }
621
622 if ( state.IsDirect() )
623 {
624 // start of an encoded segment?
625 if ( cc == '+' )
626 {
627 if ( src == srcEnd )
628 return wxCONV_FAILED; // can't have '+' at the end
629
630 if ( *src == '-' )
631 {
632 // just the encoded plus sign, don't switch to shifted mode
633 if ( dst )
634 *dst++ = '+';
635 len++;
636 src++;
637 }
638 else
639 {
640 state.ToShifted();
641 }
642 }
643 else // not '+'
644 {
645 // only printable 7 bit ASCII characters (with the exception of
646 // NUL, TAB, CR and LF) can be used directly
647 if ( cc >= 0x7f || (cc < ' ' &&
648 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
649 return wxCONV_FAILED;
650
651 if ( dst )
652 *dst++ = cc;
653 len++;
654 }
655 }
656 }
657
658 if ( !len )
659 {
660 // as we didn't read any characters we should be called with the same
661 // data (followed by some more new data) again later so don't save our
662 // state
663 state = stateOrig;
664
665 return wxCONV_FAILED;
666 }
667
668 return len;
669 }
670
671 //
672 // BASE64 encoding table
673 //
// the base64 digits in the order of the 6 bit values they represent
static const unsigned char utf7enb64[] =
{
    // 0..25: upper case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // 26..51: lower case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // 52..61: digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // 62 and 63
    '+', '/'
};
685
686 //
687 // UTF-7 encoding table
688 //
689 // 0 - Set D (directly encoded characters)
690 // 1 - Set O (optional direct characters)
691 // 2 - whitespace characters (optional)
692 // 3 - special characters
693 //
// the UTF-7 class (0..3) of each of the 128 ASCII characters
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 0x70..0x7f
};

// returns true if the character has class 0 in the table above, i.e. is
// always encoded as itself, and false for all the other ones, including all
// the characters outside of the table
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t can be a signed type, so do the range check using an unsigned
    // one: otherwise a negative value would pass it and be used to index
    // memory before the start of the table
    return (unsigned long)wc < 0x80 && utf7encode[wc] < 1;
}
710
711 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
712 const wchar_t *src, size_t srcLen) const
713 {
714 EncoderState stateOrig,
715 *statePtr;
716 if ( srcLen == wxNO_LEN )
717 {
718 // we don't apply the stored state when operating on entire strings at
719 // once
720 statePtr = &stateOrig;
721
722 srcLen = wxWcslen(src) + 1;
723 }
724 else // do use the mode we left the output in previously
725 {
726 stateOrig = m_stateEncoder;
727 statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
728 }
729
730 EncoderState& state = *statePtr;
731
732
733 size_t len = 0;
734
735 const wchar_t * const srcEnd = src + srcLen;
736 while ( src < srcEnd && (!dst || len < dstLen) )
737 {
738 wchar_t cc = *src++;
739 if ( wxIsUTF7Direct(cc) )
740 {
741 if ( state.IsShifted() )
742 {
743 // pad with zeros the last encoded block if necessary
744 if ( state.bit )
745 {
746 if ( dst )
747 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
748 len++;
749 }
750
751 state.ToDirect();
752
753 if ( dst )
754 *dst++ = '-';
755 len++;
756 }
757
758 if ( dst )
759 *dst++ = (char)cc;
760 len++;
761 }
762 else if ( cc == '+' && state.IsDirect() )
763 {
764 if ( dst )
765 {
766 *dst++ = '+';
767 *dst++ = '-';
768 }
769
770 len += 2;
771 }
772 #ifndef WC_UTF16
773 else if (((wxUint32)cc) > 0xffff)
774 {
775 // no surrogate pair generation (yet?)
776 return wxCONV_FAILED;
777 }
778 #endif
779 else
780 {
781 if ( state.IsDirect() )
782 {
783 state.ToShifted();
784
785 if ( dst )
786 *dst++ = '+';
787 len++;
788 }
789
790 // BASE64 encode string
791 for ( ;; )
792 {
793 for ( unsigned lsb = 0; lsb < 2; lsb++ )
794 {
795 state.accum <<= 8;
796 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
797
798 for (state.bit += 8; state.bit >= 6; )
799 {
800 state.bit -= 6;
801 if ( dst )
802 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
803 len++;
804 }
805 }
806
807 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
808 break;
809
810 src++;
811 }
812 }
813 }
814
815 // we need to restore the original encoder state if we were called just to
816 // calculate the amount of space needed as we will presumably be called
817 // again to really convert the data now
818 if ( !dst )
819 state = stateOrig;
820
821 return len;
822 }
823
824 // ----------------------------------------------------------------------------
825 // UTF-8
826 // ----------------------------------------------------------------------------
827
828 static const wxUint32 utf8_max[]=
829 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
830
831 // boundaries of the private use area we use to (temporarily) remap invalid
832 // characters invalid in a UTF-8 encoded string
833 const wxUint32 wxUnicodePUA = 0x100000;
834 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
835
836 // this table gives the length of the UTF-8 encoding from its first character:
const unsigned char tableUtf8Lengths[256] = {
    // 0x00..0x7F: ASCII characters are encoded as single bytes
    /* 00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 0x80..0xBF: these bytes can't start a sequence
    /* 80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0xC0 and 0xC1 are invalid too, 0xC2..0xDF start two-byte sequences
    /* C0 */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // 0xE0..0xEF start three-byte sequences
    /* E0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // 0xF0..0xF4 start four-byte sequences, the rest is invalid again (5- or
    // 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    /* F0 */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
870
871 size_t
872 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
873 const char *src, size_t srcLen) const
874 {
875 wchar_t *out = dstLen ? dst : NULL;
876 size_t written = 0;
877
878 if ( srcLen == wxNO_LEN )
879 srcLen = strlen(src) + 1;
880
881 for ( const char *p = src; ; p++ )
882 {
883 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
884 {
885 // all done successfully, just add the trailing NULL if we are not
886 // using explicit length
887 if ( srcLen == wxNO_LEN )
888 {
889 if ( out )
890 {
891 if ( !dstLen )
892 break;
893
894 *out = L'\0';
895 }
896
897 written++;
898 }
899
900 return written;
901 }
902
903 if ( out && !dstLen-- )
904 break;
905
906 wxUint32 code;
907 unsigned char c = *p;
908
909 if ( c < 0x80 )
910 {
911 if ( srcLen == 0 ) // the test works for wxNO_LEN too
912 break;
913
914 if ( srcLen != wxNO_LEN )
915 srcLen--;
916
917 code = c;
918 }
919 else
920 {
921 unsigned len = tableUtf8Lengths[c];
922 if ( !len )
923 break;
924
925 if ( srcLen < len ) // the test works for wxNO_LEN too
926 break;
927
928 if ( srcLen != wxNO_LEN )
929 srcLen -= len;
930
931 // Char. number range | UTF-8 octet sequence
932 // (hexadecimal) | (binary)
933 // ----------------------+----------------------------------------
934 // 0000 0000 - 0000 007F | 0xxxxxxx
935 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
936 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
937 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
938 //
939 // Code point value is stored in bits marked with 'x',
940 // lowest-order bit of the value on the right side in the diagram
941 // above. (from RFC 3629)
942
943 // mask to extract lead byte's value ('x' bits above), by sequence
944 // length:
945 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
946
947 // mask and value of lead byte's most significant bits, by length:
948 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
949 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
950
951 len--; // it's more convenient to work with 0-based length here
952
953 // extract the lead byte's value bits:
954 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
955 break;
956
957 code = c & leadValueMask[len];
958
959 // all remaining bytes, if any, are handled in the same way
960 // regardless of sequence's length:
961 for ( ; len; --len )
962 {
963 c = *++p;
964 if ( (c & 0xC0) != 0x80 )
965 return wxCONV_FAILED;
966
967 code <<= 6;
968 code |= c & 0x3F;
969 }
970 }
971
972 #ifdef WC_UTF16
973 // cast is ok because wchar_t == wxUint16 if WC_UTF16
974 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
975 {
976 if ( out )
977 out++;
978 written++;
979 }
980 #else // !WC_UTF16
981 if ( out )
982 *out = code;
983 #endif // WC_UTF16/!WC_UTF16
984
985 if ( out )
986 out++;
987
988 written++;
989 }
990
991 return wxCONV_FAILED;
992 }
993
994 size_t
995 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
996 const wchar_t *src, size_t srcLen) const
997 {
998 char *out = dstLen ? dst : NULL;
999 size_t written = 0;
1000
1001 for ( const wchar_t *wp = src; ; wp++ )
1002 {
1003 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1004 {
1005 // all done successfully, just add the trailing NULL if we are not
1006 // using explicit length
1007 if ( srcLen == wxNO_LEN )
1008 {
1009 if ( out )
1010 {
1011 if ( !dstLen )
1012 break;
1013
1014 *out = '\0';
1015 }
1016
1017 written++;
1018 }
1019
1020 return written;
1021 }
1022
1023 if ( srcLen != wxNO_LEN )
1024 srcLen--;
1025
1026 wxUint32 code;
1027 #ifdef WC_UTF16
1028 // cast is ok for WC_UTF16
1029 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1030 {
1031 // skip the next char too as we decoded a surrogate
1032 wp++;
1033 }
1034 #else // wchar_t is UTF-32
1035 code = *wp & 0x7fffffff;
1036 #endif
1037
1038 unsigned len;
1039 if ( code <= 0x7F )
1040 {
1041 len = 1;
1042 if ( out )
1043 {
1044 if ( dstLen < len )
1045 break;
1046
1047 out[0] = (char)code;
1048 }
1049 }
1050 else if ( code <= 0x07FF )
1051 {
1052 len = 2;
1053 if ( out )
1054 {
1055 if ( dstLen < len )
1056 break;
1057
1058 // NB: this line takes 6 least significant bits, encodes them as
1059 // 10xxxxxx and discards them so that the next byte can be encoded:
1060 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1061 out[0] = 0xC0 | code;
1062 }
1063 }
1064 else if ( code < 0xFFFF )
1065 {
1066 len = 3;
1067 if ( out )
1068 {
1069 if ( dstLen < len )
1070 break;
1071
1072 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1073 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1074 out[0] = 0xE0 | code;
1075 }
1076 }
1077 else if ( code <= 0x10FFFF )
1078 {
1079 len = 4;
1080 if ( out )
1081 {
1082 if ( dstLen < len )
1083 break;
1084
1085 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1086 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1087 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1088 out[0] = 0xF0 | code;
1089 }
1090 }
1091 else
1092 {
1093 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1094 break;
1095 }
1096
1097 if ( out )
1098 {
1099 out += len;
1100 dstLen -= len;
1101 }
1102
1103 written += len;
1104 }
1105
1106 // we only get here if an error occurs during decoding
1107 return wxCONV_FAILED;
1108 }
1109
1110 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1111 const char *psz, size_t srcLen) const
1112 {
1113 if ( m_options == MAP_INVALID_UTF8_NOT )
1114 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1115
1116 size_t len = 0;
1117
1118 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1119 {
1120 const char *opsz = psz;
1121 bool invalid = false;
1122 unsigned char cc = *psz++, fc = cc;
1123 unsigned cnt;
1124 for (cnt = 0; fc & 0x80; cnt++)
1125 fc <<= 1;
1126
1127 if (!cnt)
1128 {
1129 // plain ASCII char
1130 if (buf)
1131 *buf++ = cc;
1132 len++;
1133
1134 // escape the escape character for octal escapes
1135 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1136 && cc == '\\' && (!buf || len < n))
1137 {
1138 if (buf)
1139 *buf++ = cc;
1140 len++;
1141 }
1142 }
1143 else
1144 {
1145 cnt--;
1146 if (!cnt)
1147 {
1148 // invalid UTF-8 sequence
1149 invalid = true;
1150 }
1151 else
1152 {
1153 unsigned ocnt = cnt - 1;
1154 wxUint32 res = cc & (0x3f >> cnt);
1155 while (cnt--)
1156 {
1157 cc = *psz;
1158 if ((cc & 0xC0) != 0x80)
1159 {
1160 // invalid UTF-8 sequence
1161 invalid = true;
1162 break;
1163 }
1164
1165 psz++;
1166 res = (res << 6) | (cc & 0x3f);
1167 }
1168
1169 if (invalid || res <= utf8_max[ocnt])
1170 {
1171 // illegal UTF-8 encoding
1172 invalid = true;
1173 }
1174 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1175 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1176 {
1177 // if one of our PUA characters turns up externally
1178 // it must also be treated as an illegal sequence
1179 // (a bit like you have to escape an escape character)
1180 invalid = true;
1181 }
1182 else
1183 {
1184 #ifdef WC_UTF16
1185 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1186 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1187 if (pa == wxCONV_FAILED)
1188 {
1189 invalid = true;
1190 }
1191 else
1192 {
1193 if (buf)
1194 buf += pa;
1195 len += pa;
1196 }
1197 #else // !WC_UTF16
1198 if (buf)
1199 *buf++ = (wchar_t)res;
1200 len++;
1201 #endif // WC_UTF16/!WC_UTF16
1202 }
1203 }
1204
1205 if (invalid)
1206 {
1207 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1208 {
1209 while (opsz < psz && (!buf || len < n))
1210 {
1211 #ifdef WC_UTF16
1212 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1213 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1214 wxASSERT(pa != wxCONV_FAILED);
1215 if (buf)
1216 buf += pa;
1217 opsz++;
1218 len += pa;
1219 #else
1220 if (buf)
1221 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1222 opsz++;
1223 len++;
1224 #endif
1225 }
1226 }
1227 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1228 {
1229 while (opsz < psz && (!buf || len < n))
1230 {
1231 if ( buf && len + 3 < n )
1232 {
1233 unsigned char on = *opsz;
1234 *buf++ = L'\\';
1235 *buf++ = (wchar_t)( L'0' + on / 0100 );
1236 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1237 *buf++ = (wchar_t)( L'0' + on % 010 );
1238 }
1239
1240 opsz++;
1241 len += 4;
1242 }
1243 }
1244 else // MAP_INVALID_UTF8_NOT
1245 {
1246 return wxCONV_FAILED;
1247 }
1248 }
1249 }
1250 }
1251
1252 if (srcLen == wxNO_LEN && buf && (len < n))
1253 *buf = 0;
1254
1255 return len + 1;
1256 }
1257
// Tell whether wch is one of the octal digits L'0' .. L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1262
1263 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1264 const wchar_t *psz, size_t srcLen) const
1265 {
1266 if ( m_options == MAP_INVALID_UTF8_NOT )
1267 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1268
1269 size_t len = 0;
1270
1271 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1272 {
1273 wxUint32 cc;
1274
1275 #ifdef WC_UTF16
1276 // cast is ok for WC_UTF16
1277 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1278 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1279 #else
1280 cc = (*psz++) & 0x7fffffff;
1281 #endif
1282
1283 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1284 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1285 {
1286 if (buf)
1287 *buf++ = (char)(cc - wxUnicodePUA);
1288 len++;
1289 }
1290 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1291 && cc == L'\\' && psz[0] == L'\\' )
1292 {
1293 if (buf)
1294 *buf++ = (char)cc;
1295 psz++;
1296 len++;
1297 }
1298 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1299 cc == L'\\' &&
1300 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1301 {
1302 if (buf)
1303 {
1304 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1305 (psz[1] - L'0') * 010 +
1306 (psz[2] - L'0'));
1307 }
1308
1309 psz += 3;
1310 len++;
1311 }
1312 else
1313 {
1314 unsigned cnt;
1315 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1316 {
1317 }
1318
1319 if (!cnt)
1320 {
1321 // plain ASCII char
1322 if (buf)
1323 *buf++ = (char) cc;
1324 len++;
1325 }
1326 else
1327 {
1328 len += cnt + 1;
1329 if (buf)
1330 {
1331 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1332 while (cnt--)
1333 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1334 }
1335 }
1336 }
1337 }
1338
1339 if (srcLen == wxNO_LEN && buf && (len < n))
1340 *buf = 0;
1341
1342 return len + 1;
1343 }
1344
1345 // ============================================================================
1346 // UTF-16
1347 // ============================================================================
1348
1349 #ifdef WORDS_BIGENDIAN
1350 #define wxMBConvUTF16straight wxMBConvUTF16BE
1351 #define wxMBConvUTF16swap wxMBConvUTF16LE
1352 #else
1353 #define wxMBConvUTF16swap wxMBConvUTF16BE
1354 #define wxMBConvUTF16straight wxMBConvUTF16LE
1355 #endif
1356
1357 /* static */
1358 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1359 {
1360 if ( srcLen == wxNO_LEN )
1361 {
1362 // count the number of bytes in input, including the trailing NULs
1363 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1364 for ( srcLen = 1; *inBuff++; srcLen++ )
1365 ;
1366
1367 srcLen *= BYTES_PER_CHAR;
1368 }
1369 else // we already have the length
1370 {
1371 // we can only convert an entire number of UTF-16 characters
1372 if ( srcLen % BYTES_PER_CHAR )
1373 return wxCONV_FAILED;
1374 }
1375
1376 return srcLen;
1377 }
1378
1379 // case when in-memory representation is UTF-16 too
1380 #ifdef WC_UTF16
1381
1382 // ----------------------------------------------------------------------------
1383 // conversions without endianness change
1384 // ----------------------------------------------------------------------------
1385
1386 size_t
1387 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1388 const char *src, size_t srcLen) const
1389 {
1390 // set up the scene for using memcpy() (which is presumably more efficient
1391 // than copying the bytes one by one)
1392 srcLen = GetLength(src, srcLen);
1393 if ( srcLen == wxNO_LEN )
1394 return wxCONV_FAILED;
1395
1396 const size_t inLen = srcLen / BYTES_PER_CHAR;
1397 if ( dst )
1398 {
1399 if ( dstLen < inLen )
1400 return wxCONV_FAILED;
1401
1402 memcpy(dst, src, srcLen);
1403 }
1404
1405 return inLen;
1406 }
1407
1408 size_t
1409 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1410 const wchar_t *src, size_t srcLen) const
1411 {
1412 if ( srcLen == wxNO_LEN )
1413 srcLen = wxWcslen(src) + 1;
1414
1415 srcLen *= BYTES_PER_CHAR;
1416
1417 if ( dst )
1418 {
1419 if ( dstLen < srcLen )
1420 return wxCONV_FAILED;
1421
1422 memcpy(dst, src, srcLen);
1423 }
1424
1425 return srcLen;
1426 }
1427
1428 // ----------------------------------------------------------------------------
1429 // endian-reversing conversions
1430 // ----------------------------------------------------------------------------
1431
1432 size_t
1433 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1434 const char *src, size_t srcLen) const
1435 {
1436 srcLen = GetLength(src, srcLen);
1437 if ( srcLen == wxNO_LEN )
1438 return wxCONV_FAILED;
1439
1440 srcLen /= BYTES_PER_CHAR;
1441
1442 if ( dst )
1443 {
1444 if ( dstLen < srcLen )
1445 return wxCONV_FAILED;
1446
1447 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1448 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1449 {
1450 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1451 }
1452 }
1453
1454 return srcLen;
1455 }
1456
1457 size_t
1458 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1459 const wchar_t *src, size_t srcLen) const
1460 {
1461 if ( srcLen == wxNO_LEN )
1462 srcLen = wxWcslen(src) + 1;
1463
1464 srcLen *= BYTES_PER_CHAR;
1465
1466 if ( dst )
1467 {
1468 if ( dstLen < srcLen )
1469 return wxCONV_FAILED;
1470
1471 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1472 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1473 {
1474 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1475 }
1476 }
1477
1478 return srcLen;
1479 }
1480
1481 #else // !WC_UTF16: wchar_t is UTF-32
1482
1483 // ----------------------------------------------------------------------------
1484 // conversions without endianness change
1485 // ----------------------------------------------------------------------------
1486
1487 size_t
1488 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1489 const char *src, size_t srcLen) const
1490 {
1491 srcLen = GetLength(src, srcLen);
1492 if ( srcLen == wxNO_LEN )
1493 return wxCONV_FAILED;
1494
1495 const size_t inLen = srcLen / BYTES_PER_CHAR;
1496 if ( !dst )
1497 {
1498 // optimization: return maximal space which could be needed for this
1499 // string even if the real size could be smaller if the buffer contains
1500 // any surrogates
1501 return inLen;
1502 }
1503
1504 size_t outLen = 0;
1505 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1506 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1507 {
1508 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1509 if ( !inBuff )
1510 return wxCONV_FAILED;
1511
1512 if ( ++outLen > dstLen )
1513 return wxCONV_FAILED;
1514
1515 *dst++ = ch;
1516 }
1517
1518
1519 return outLen;
1520 }
1521
1522 size_t
1523 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1524 const wchar_t *src, size_t srcLen) const
1525 {
1526 if ( srcLen == wxNO_LEN )
1527 srcLen = wxWcslen(src) + 1;
1528
1529 size_t outLen = 0;
1530 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1531 for ( size_t n = 0; n < srcLen; n++ )
1532 {
1533 wxUint16 cc[2];
1534 const size_t numChars = encode_utf16(*src++, cc);
1535 if ( numChars == wxCONV_FAILED )
1536 return wxCONV_FAILED;
1537
1538 outLen += numChars * BYTES_PER_CHAR;
1539 if ( outBuff )
1540 {
1541 if ( outLen > dstLen )
1542 return wxCONV_FAILED;
1543
1544 *outBuff++ = cc[0];
1545 if ( numChars == 2 )
1546 {
1547 // second character of a surrogate
1548 *outBuff++ = cc[1];
1549 }
1550 }
1551 }
1552
1553 return outLen;
1554 }
1555
1556 // ----------------------------------------------------------------------------
1557 // endian-reversing conversions
1558 // ----------------------------------------------------------------------------
1559
1560 size_t
1561 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1562 const char *src, size_t srcLen) const
1563 {
1564 srcLen = GetLength(src, srcLen);
1565 if ( srcLen == wxNO_LEN )
1566 return wxCONV_FAILED;
1567
1568 const size_t inLen = srcLen / BYTES_PER_CHAR;
1569 if ( !dst )
1570 {
1571 // optimization: return maximal space which could be needed for this
1572 // string even if the real size could be smaller if the buffer contains
1573 // any surrogates
1574 return inLen;
1575 }
1576
1577 size_t outLen = 0;
1578 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1579 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1580 {
1581 wxUint32 ch;
1582 wxUint16 tmp[2];
1583
1584 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1585 inBuff++;
1586 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1587
1588 const size_t numChars = decode_utf16(tmp, ch);
1589 if ( numChars == wxCONV_FAILED )
1590 return wxCONV_FAILED;
1591
1592 if ( numChars == 2 )
1593 inBuff++;
1594
1595 if ( ++outLen > dstLen )
1596 return wxCONV_FAILED;
1597
1598 *dst++ = ch;
1599 }
1600
1601
1602 return outLen;
1603 }
1604
1605 size_t
1606 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1607 const wchar_t *src, size_t srcLen) const
1608 {
1609 if ( srcLen == wxNO_LEN )
1610 srcLen = wxWcslen(src) + 1;
1611
1612 size_t outLen = 0;
1613 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1614 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1615 {
1616 wxUint16 cc[2];
1617 const size_t numChars = encode_utf16(*src, cc);
1618 if ( numChars == wxCONV_FAILED )
1619 return wxCONV_FAILED;
1620
1621 outLen += numChars * BYTES_PER_CHAR;
1622 if ( outBuff )
1623 {
1624 if ( outLen > dstLen )
1625 return wxCONV_FAILED;
1626
1627 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1628 if ( numChars == 2 )
1629 {
1630 // second character of a surrogate
1631 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1632 }
1633 }
1634 }
1635
1636 return outLen;
1637 }
1638
1639 #endif // WC_UTF16/!WC_UTF16
1640
1641
1642 // ============================================================================
1643 // UTF-32
1644 // ============================================================================
1645
1646 #ifdef WORDS_BIGENDIAN
1647 #define wxMBConvUTF32straight wxMBConvUTF32BE
1648 #define wxMBConvUTF32swap wxMBConvUTF32LE
1649 #else
1650 #define wxMBConvUTF32swap wxMBConvUTF32BE
1651 #define wxMBConvUTF32straight wxMBConvUTF32LE
1652 #endif
1653
1654
1655 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1656 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1657
1658 /* static */
1659 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1660 {
1661 if ( srcLen == wxNO_LEN )
1662 {
1663 // count the number of bytes in input, including the trailing NULs
1664 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1665 for ( srcLen = 1; *inBuff++; srcLen++ )
1666 ;
1667
1668 srcLen *= BYTES_PER_CHAR;
1669 }
1670 else // we already have the length
1671 {
1672 // we can only convert an entire number of UTF-32 characters
1673 if ( srcLen % BYTES_PER_CHAR )
1674 return wxCONV_FAILED;
1675 }
1676
1677 return srcLen;
1678 }
1679
1680 // case when in-memory representation is UTF-16
1681 #ifdef WC_UTF16
1682
1683 // ----------------------------------------------------------------------------
1684 // conversions without endianness change
1685 // ----------------------------------------------------------------------------
1686
1687 size_t
1688 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1689 const char *src, size_t srcLen) const
1690 {
1691 srcLen = GetLength(src, srcLen);
1692 if ( srcLen == wxNO_LEN )
1693 return wxCONV_FAILED;
1694
1695 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1696 const size_t inLen = srcLen / BYTES_PER_CHAR;
1697 size_t outLen = 0;
1698 for ( size_t n = 0; n < inLen; n++ )
1699 {
1700 wxUint16 cc[2];
1701 const size_t numChars = encode_utf16(*inBuff++, cc);
1702 if ( numChars == wxCONV_FAILED )
1703 return wxCONV_FAILED;
1704
1705 outLen += numChars;
1706 if ( dst )
1707 {
1708 if ( outLen > dstLen )
1709 return wxCONV_FAILED;
1710
1711 *dst++ = cc[0];
1712 if ( numChars == 2 )
1713 {
1714 // second character of a surrogate
1715 *dst++ = cc[1];
1716 }
1717 }
1718 }
1719
1720 return outLen;
1721 }
1722
1723 size_t
1724 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1725 const wchar_t *src, size_t srcLen) const
1726 {
1727 if ( srcLen == wxNO_LEN )
1728 srcLen = wxWcslen(src) + 1;
1729
1730 if ( !dst )
1731 {
1732 // optimization: return maximal space which could be needed for this
1733 // string instead of the exact amount which could be less if there are
1734 // any surrogates in the input
1735 //
1736 // we consider that surrogates are rare enough to make it worthwhile to
1737 // avoid running the loop below at the cost of slightly extra memory
1738 // consumption
1739 return srcLen * BYTES_PER_CHAR;
1740 }
1741
1742 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1743 size_t outLen = 0;
1744 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1745 {
1746 const wxUint32 ch = wxDecodeSurrogate(&src);
1747 if ( !src )
1748 return wxCONV_FAILED;
1749
1750 outLen += BYTES_PER_CHAR;
1751
1752 if ( outLen > dstLen )
1753 return wxCONV_FAILED;
1754
1755 *outBuff++ = ch;
1756 }
1757
1758 return outLen;
1759 }
1760
1761 // ----------------------------------------------------------------------------
1762 // endian-reversing conversions
1763 // ----------------------------------------------------------------------------
1764
1765 size_t
1766 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1767 const char *src, size_t srcLen) const
1768 {
1769 srcLen = GetLength(src, srcLen);
1770 if ( srcLen == wxNO_LEN )
1771 return wxCONV_FAILED;
1772
1773 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1774 const size_t inLen = srcLen / BYTES_PER_CHAR;
1775 size_t outLen = 0;
1776 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1777 {
1778 wxUint16 cc[2];
1779 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1780 if ( numChars == wxCONV_FAILED )
1781 return wxCONV_FAILED;
1782
1783 outLen += numChars;
1784 if ( dst )
1785 {
1786 if ( outLen > dstLen )
1787 return wxCONV_FAILED;
1788
1789 *dst++ = cc[0];
1790 if ( numChars == 2 )
1791 {
1792 // second character of a surrogate
1793 *dst++ = cc[1];
1794 }
1795 }
1796 }
1797
1798 return outLen;
1799 }
1800
1801 size_t
1802 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1803 const wchar_t *src, size_t srcLen) const
1804 {
1805 if ( srcLen == wxNO_LEN )
1806 srcLen = wxWcslen(src) + 1;
1807
1808 if ( !dst )
1809 {
1810 // optimization: return maximal space which could be needed for this
1811 // string instead of the exact amount which could be less if there are
1812 // any surrogates in the input
1813 //
1814 // we consider that surrogates are rare enough to make it worthwhile to
1815 // avoid running the loop below at the cost of slightly extra memory
1816 // consumption
1817 return srcLen*BYTES_PER_CHAR;
1818 }
1819
1820 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1821 size_t outLen = 0;
1822 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1823 {
1824 const wxUint32 ch = wxDecodeSurrogate(&src);
1825 if ( !src )
1826 return wxCONV_FAILED;
1827
1828 outLen += BYTES_PER_CHAR;
1829
1830 if ( outLen > dstLen )
1831 return wxCONV_FAILED;
1832
1833 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1834 }
1835
1836 return outLen;
1837 }
1838
1839 #else // !WC_UTF16: wchar_t is UTF-32
1840
1841 // ----------------------------------------------------------------------------
1842 // conversions without endianness change
1843 // ----------------------------------------------------------------------------
1844
1845 size_t
1846 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1847 const char *src, size_t srcLen) const
1848 {
1849 // use memcpy() as it should be much faster than hand-written loop
1850 srcLen = GetLength(src, srcLen);
1851 if ( srcLen == wxNO_LEN )
1852 return wxCONV_FAILED;
1853
1854 const size_t inLen = srcLen/BYTES_PER_CHAR;
1855 if ( dst )
1856 {
1857 if ( dstLen < inLen )
1858 return wxCONV_FAILED;
1859
1860 memcpy(dst, src, srcLen);
1861 }
1862
1863 return inLen;
1864 }
1865
1866 size_t
1867 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1868 const wchar_t *src, size_t srcLen) const
1869 {
1870 if ( srcLen == wxNO_LEN )
1871 srcLen = wxWcslen(src) + 1;
1872
1873 srcLen *= BYTES_PER_CHAR;
1874
1875 if ( dst )
1876 {
1877 if ( dstLen < srcLen )
1878 return wxCONV_FAILED;
1879
1880 memcpy(dst, src, srcLen);
1881 }
1882
1883 return srcLen;
1884 }
1885
1886 // ----------------------------------------------------------------------------
1887 // endian-reversing conversions
1888 // ----------------------------------------------------------------------------
1889
1890 size_t
1891 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1892 const char *src, size_t srcLen) const
1893 {
1894 srcLen = GetLength(src, srcLen);
1895 if ( srcLen == wxNO_LEN )
1896 return wxCONV_FAILED;
1897
1898 srcLen /= BYTES_PER_CHAR;
1899
1900 if ( dst )
1901 {
1902 if ( dstLen < srcLen )
1903 return wxCONV_FAILED;
1904
1905 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1906 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1907 {
1908 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1909 }
1910 }
1911
1912 return srcLen;
1913 }
1914
1915 size_t
1916 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1917 const wchar_t *src, size_t srcLen) const
1918 {
1919 if ( srcLen == wxNO_LEN )
1920 srcLen = wxWcslen(src) + 1;
1921
1922 srcLen *= BYTES_PER_CHAR;
1923
1924 if ( dst )
1925 {
1926 if ( dstLen < srcLen )
1927 return wxCONV_FAILED;
1928
1929 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1930 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1931 {
1932 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1933 }
1934 }
1935
1936 return srcLen;
1937 }
1938
1939 #endif // WC_UTF16/!WC_UTF16
1940
1941
1942 // ============================================================================
1943 // The classes doing conversion using the iconv_xxx() functions
1944 // ============================================================================
1945
1946 #ifdef HAVE_ICONV
1947
1948 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1949 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1950 // (unless there's yet another bug in glibc) the only case when iconv()
1951 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1952 // left in the input buffer -- when _real_ error occurs,
1953 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1954 // iconv() failure.
1955 // [This bug does not appear in glibc 2.2.]
1956 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1957 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1958 (errno != E2BIG || bufLeft != 0))
1959 #else
1960 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1961 #endif
1962
1963 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1964
1965 #define ICONV_T_INVALID ((iconv_t)-1)
1966
1967 #if SIZEOF_WCHAR_T == 4
1968 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1969 #define WC_ENC wxFONTENCODING_UTF32
1970 #elif SIZEOF_WCHAR_T == 2
1971 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1972 #define WC_ENC wxFONTENCODING_UTF16
1973 #else // sizeof(wchar_t) != 2 nor 4
1974 // does this ever happen?
1975 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1976 #endif
1977
1978 // ----------------------------------------------------------------------------
1979 // wxMBConv_iconv: encapsulates an iconv character set
1980 // ----------------------------------------------------------------------------
1981
1982 class wxMBConv_iconv : public wxMBConv
1983 {
1984 public:
1985 wxMBConv_iconv(const char *name);
1986 virtual ~wxMBConv_iconv();
1987
1988 // implement base class virtual methods
1989 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1990 const char *src, size_t srcLen = wxNO_LEN) const;
1991 virtual size_t FromWChar(char *dst, size_t dstLen,
1992 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1993 virtual size_t GetMBNulLen() const;
1994
1995 #if wxUSE_UNICODE_UTF8
1996 virtual bool IsUTF8() const;
1997 #endif
1998
1999 virtual wxMBConv *Clone() const
2000 {
2001 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2002 p->m_minMBCharWidth = m_minMBCharWidth;
2003 return p;
2004 }
2005
2006 bool IsOk() const
2007 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2008
2009 protected:
2010 // the iconv handlers used to translate from multibyte
2011 // to wide char and in the other direction
2012 iconv_t m2w,
2013 w2m;
2014
2015 #if wxUSE_THREADS
2016 // guards access to m2w and w2m objects
2017 wxMutex m_iconvMutex;
2018 #endif
2019
2020 private:
2021 // the name (for iconv_open()) of a wide char charset -- if none is
2022 // available on this machine, it will remain NULL
2023 static wxString ms_wcCharsetName;
2024
2025 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2026 // different endian-ness than the native one
2027 static bool ms_wcNeedsSwap;
2028
2029
2030 // name of the encoding handled by this conversion
2031 wxString m_name;
2032
2033 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2034 // initially
2035 size_t m_minMBCharWidth;
2036 };
2037
2038 // make the constructor available for unit testing
2039 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2040 {
2041 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2042 if ( !result->IsOk() )
2043 {
2044 delete result;
2045 return 0;
2046 }
2047
2048 return result;
2049 }
2050
2051 wxString wxMBConv_iconv::ms_wcCharsetName;
2052 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2053
2054 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2055 : m_name(name)
2056 {
2057 m_minMBCharWidth = 0;
2058
2059 // check for charset that represents wchar_t:
2060 if ( ms_wcCharsetName.empty() )
2061 {
2062 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2063
2064 #if wxUSE_FONTMAP
2065 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2066 #else // !wxUSE_FONTMAP
2067 static const wxChar *names_static[] =
2068 {
2069 #if SIZEOF_WCHAR_T == 4
2070 _T("UCS-4"),
2071 #elif SIZEOF_WCHAR_T = 2
2072 _T("UCS-2"),
2073 #endif
2074 NULL
2075 };
2076 const wxChar **names = names_static;
2077 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2078
2079 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2080 {
2081 const wxString nameCS(*names);
2082
2083 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2084 wxString nameXE(nameCS);
2085
2086 #ifdef WORDS_BIGENDIAN
2087 nameXE += _T("BE");
2088 #else // little endian
2089 nameXE += _T("LE");
2090 #endif
2091
2092 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2093 nameXE.c_str());
2094
2095 m2w = iconv_open(nameXE.ToAscii(), name);
2096 if ( m2w == ICONV_T_INVALID )
2097 {
2098 // try charset w/o bytesex info (e.g. "UCS4")
2099 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2100 nameCS.c_str());
2101 m2w = iconv_open(nameCS.ToAscii(), name);
2102
2103 // and check for bytesex ourselves:
2104 if ( m2w != ICONV_T_INVALID )
2105 {
2106 char buf[2], *bufPtr;
2107 wchar_t wbuf[2];
2108 size_t insz, outsz;
2109 size_t res;
2110
2111 buf[0] = 'A';
2112 buf[1] = 0;
2113 wbuf[0] = 0;
2114 insz = 2;
2115 outsz = SIZEOF_WCHAR_T * 2;
2116 char* wbufPtr = (char*)wbuf;
2117 bufPtr = buf;
2118
2119 res = iconv(
2120 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2121 &wbufPtr, &outsz);
2122
2123 if (ICONV_FAILED(res, insz))
2124 {
2125 wxLogLastError(wxT("iconv"));
2126 wxLogError(_("Conversion to charset '%s' doesn't work."),
2127 nameCS.c_str());
2128 }
2129 else // ok, can convert to this encoding, remember it
2130 {
2131 ms_wcCharsetName = nameCS;
2132 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2133 }
2134 }
2135 }
2136 else // use charset not requiring byte swapping
2137 {
2138 ms_wcCharsetName = nameXE;
2139 }
2140 }
2141
2142 wxLogTrace(TRACE_STRCONV,
2143 wxT("iconv wchar_t charset is \"%s\"%s"),
2144 ms_wcCharsetName.empty() ? wxString("<none>")
2145 : ms_wcCharsetName,
2146 ms_wcNeedsSwap ? _T(" (needs swap)")
2147 : _T(""));
2148 }
2149 else // we already have ms_wcCharsetName
2150 {
2151 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2152 }
2153
2154 if ( ms_wcCharsetName.empty() )
2155 {
2156 w2m = ICONV_T_INVALID;
2157 }
2158 else
2159 {
2160 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2161 if ( w2m == ICONV_T_INVALID )
2162 {
2163 wxLogTrace(TRACE_STRCONV,
2164 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2165 ms_wcCharsetName.c_str(), name);
2166 }
2167 }
2168 }
2169
2170 wxMBConv_iconv::~wxMBConv_iconv()
2171 {
2172 if ( m2w != ICONV_T_INVALID )
2173 iconv_close(m2w);
2174 if ( w2m != ICONV_T_INVALID )
2175 iconv_close(w2m);
2176 }
2177
2178 size_t
2179 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2180 const char *src, size_t srcLen) const
2181 {
2182 if ( srcLen == wxNO_LEN )
2183 {
2184 // find the string length: notice that must be done differently for
2185 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2186 // consecutive NULs
2187 const size_t nulLen = GetMBNulLen();
2188 switch ( nulLen )
2189 {
2190 default:
2191 return wxCONV_FAILED;
2192
2193 case 1:
2194 srcLen = strlen(src); // arguably more optimized than our version
2195 break;
2196
2197 case 2:
2198 case 4:
2199 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2200 // but they also have to start at character boundary and not
2201 // span two adjacent characters
2202 const char *p;
2203 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2204 ;
2205 srcLen = p - src;
2206 break;
2207 }
2208
2209 // when we're determining the length of the string ourselves we count
2210 // the terminating NUL(s) as part of it and always NUL-terminate the
2211 // output
2212 srcLen += nulLen;
2213 }
2214
2215 // we express length in the number of (wide) characters but iconv always
2216 // counts buffer sizes it in bytes
2217 dstLen *= SIZEOF_WCHAR_T;
2218
2219 #if wxUSE_THREADS
2220 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2221 // Unfortunately there are a couple of global wxCSConv objects such as
2222 // wxConvLocal that are used all over wx code, so we have to make sure
2223 // the handle is used by at most one thread at the time. Otherwise
2224 // only a few wx classes would be safe to use from non-main threads
2225 // as MB<->WC conversion would fail "randomly".
2226 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2227 #endif // wxUSE_THREADS
2228
2229 size_t res, cres;
2230 const char *pszPtr = src;
2231
2232 if ( dst )
2233 {
2234 char* bufPtr = (char*)dst;
2235
2236 // have destination buffer, convert there
2237 size_t dstLenOrig = dstLen;
2238 cres = iconv(m2w,
2239 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2240 &bufPtr, &dstLen);
2241
2242 // convert the number of bytes converted as returned by iconv to the
2243 // number of (wide) characters converted that we need
2244 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2245
2246 if (ms_wcNeedsSwap)
2247 {
2248 // convert to native endianness
2249 for ( unsigned i = 0; i < res; i++ )
2250 dst[i] = WC_BSWAP(dst[i]);
2251 }
2252 }
2253 else // no destination buffer
2254 {
2255 // convert using temp buffer to calculate the size of the buffer needed
2256 wchar_t tbuf[8];
2257 res = 0;
2258
2259 do
2260 {
2261 char* bufPtr = (char*)tbuf;
2262 dstLen = 8 * SIZEOF_WCHAR_T;
2263
2264 cres = iconv(m2w,
2265 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2266 &bufPtr, &dstLen );
2267
2268 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2269 }
2270 while ((cres == (size_t)-1) && (errno == E2BIG));
2271 }
2272
2273 if (ICONV_FAILED(cres, srcLen))
2274 {
2275 //VS: it is ok if iconv fails, hence trace only
2276 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2277 return wxCONV_FAILED;
2278 }
2279
2280 return res;
2281 }
2282
2283 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2284 const wchar_t *src, size_t srcLen) const
2285 {
2286 #if wxUSE_THREADS
2287 // NB: explained in MB2WC
2288 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2289 #endif
2290
2291 if ( srcLen == wxNO_LEN )
2292 srcLen = wxWcslen(src) + 1;
2293
2294 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2295 size_t outbuflen = dstLen;
2296 size_t res, cres;
2297
2298 wchar_t *tmpbuf = 0;
2299
2300 if (ms_wcNeedsSwap)
2301 {
2302 // need to copy to temp buffer to switch endianness
2303 // (doing WC_BSWAP twice on the original buffer won't help, as it
2304 // could be in read-only memory, or be accessed in some other thread)
2305 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2306 for ( size_t i = 0; i < srcLen; i++ )
2307 tmpbuf[i] = WC_BSWAP(src[i]);
2308
2309 tmpbuf[srcLen] = L'\0';
2310 src = tmpbuf;
2311 }
2312
2313 char* inbuf = (char*)src;
2314 if ( dst )
2315 {
2316 // have destination buffer, convert there
2317 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2318
2319 res = dstLen - outbuflen;
2320 }
2321 else // no destination buffer
2322 {
2323 // convert using temp buffer to calculate the size of the buffer needed
2324 char tbuf[16];
2325 res = 0;
2326 do
2327 {
2328 dst = tbuf;
2329 outbuflen = 16;
2330
2331 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2332
2333 res += 16 - outbuflen;
2334 }
2335 while ((cres == (size_t)-1) && (errno == E2BIG));
2336 }
2337
2338 if (ms_wcNeedsSwap)
2339 {
2340 free(tmpbuf);
2341 }
2342
2343 if (ICONV_FAILED(cres, inbuflen))
2344 {
2345 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2346 return wxCONV_FAILED;
2347 }
2348
2349 return res;
2350 }
2351
2352 size_t wxMBConv_iconv::GetMBNulLen() const
2353 {
2354 if ( m_minMBCharWidth == 0 )
2355 {
2356 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2357
2358 #if wxUSE_THREADS
2359 // NB: explained in MB2WC
2360 wxMutexLocker lock(self->m_iconvMutex);
2361 #endif
2362
2363 const wchar_t *wnul = L"";
2364 char buf[8]; // should be enough for NUL in any encoding
2365 size_t inLen = sizeof(wchar_t),
2366 outLen = WXSIZEOF(buf);
2367 char *inBuff = (char *)wnul;
2368 char *outBuff = buf;
2369 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2370 {
2371 self->m_minMBCharWidth = (size_t)-1;
2372 }
2373 else // ok
2374 {
2375 self->m_minMBCharWidth = outBuff - buf;
2376 }
2377 }
2378
2379 return m_minMBCharWidth;
2380 }
2381
2382 #if wxUSE_UNICODE_UTF8
2383 bool wxMBConv_iconv::IsUTF8() const
2384 {
2385 return wxStricmp(m_name, "UTF-8") == 0 ||
2386 wxStricmp(m_name, "UTF8") == 0;
2387 }
2388 #endif
2389
2390 #endif // HAVE_ICONV
2391
2392
2393 // ============================================================================
2394 // Win32 conversion classes
2395 // ============================================================================
2396
2397 #ifdef wxHAVE_WIN32_MB2WC
2398
2399 // from utils.cpp
2400 #if wxUSE_FONTMAP
2401 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2402 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2403 #endif
2404
2405 class wxMBConv_win32 : public wxMBConv
2406 {
2407 public:
2408 wxMBConv_win32()
2409 {
2410 m_CodePage = CP_ACP;
2411 m_minMBCharWidth = 0;
2412 }
2413
2414 wxMBConv_win32(const wxMBConv_win32& conv)
2415 : wxMBConv()
2416 {
2417 m_CodePage = conv.m_CodePage;
2418 m_minMBCharWidth = conv.m_minMBCharWidth;
2419 }
2420
2421 #if wxUSE_FONTMAP
2422 wxMBConv_win32(const char* name)
2423 {
2424 m_CodePage = wxCharsetToCodepage(name);
2425 m_minMBCharWidth = 0;
2426 }
2427
2428 wxMBConv_win32(wxFontEncoding encoding)
2429 {
2430 m_CodePage = wxEncodingToCodepage(encoding);
2431 m_minMBCharWidth = 0;
2432 }
2433 #endif // wxUSE_FONTMAP
2434
2435 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2436 {
2437 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2438 // the behaviour is not compatible with the Unix version (using iconv)
2439 // and break the library itself, e.g. wxTextInputStream::NextChar()
2440 // wouldn't work if reading an incomplete MB char didn't result in an
2441 // error
2442 //
2443 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2444 // Win XP or newer and it is not supported for UTF-[78] so we always
2445 // use our own conversions in this case. See
2446 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2447 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2448 if ( m_CodePage == CP_UTF8 )
2449 {
2450 return wxMBConvUTF8().MB2WC(buf, psz, n);
2451 }
2452
2453 if ( m_CodePage == CP_UTF7 )
2454 {
2455 return wxMBConvUTF7().MB2WC(buf, psz, n);
2456 }
2457
2458 int flags = 0;
2459 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2460 IsAtLeastWin2kSP4() )
2461 {
2462 flags = MB_ERR_INVALID_CHARS;
2463 }
2464
2465 const size_t len = ::MultiByteToWideChar
2466 (
2467 m_CodePage, // code page
2468 flags, // flags: fall on error
2469 psz, // input string
2470 -1, // its length (NUL-terminated)
2471 buf, // output string
2472 buf ? n : 0 // size of output buffer
2473 );
2474 if ( !len )
2475 {
2476 // function totally failed
2477 return wxCONV_FAILED;
2478 }
2479
2480 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2481 // check if we succeeded, by doing a double trip:
2482 if ( !flags && buf )
2483 {
2484 const size_t mbLen = strlen(psz);
2485 wxCharBuffer mbBuf(mbLen);
2486 if ( ::WideCharToMultiByte
2487 (
2488 m_CodePage,
2489 0,
2490 buf,
2491 -1,
2492 mbBuf.data(),
2493 mbLen + 1, // size in bytes, not length
2494 NULL,
2495 NULL
2496 ) == 0 ||
2497 strcmp(mbBuf, psz) != 0 )
2498 {
2499 // we didn't obtain the same thing we started from, hence
2500 // the conversion was lossy and we consider that it failed
2501 return wxCONV_FAILED;
2502 }
2503 }
2504
2505 // note that it returns count of written chars for buf != NULL and size
2506 // of the needed buffer for buf == NULL so in either case the length of
2507 // the string (which never includes the terminating NUL) is one less
2508 return len - 1;
2509 }
2510
2511 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2512 {
2513 /*
2514 we have a problem here: by default, WideCharToMultiByte() may
2515 replace characters unrepresentable in the target code page with bad
2516 quality approximations such as turning "1/2" symbol (U+00BD) into
2517 "1" for the code pages which don't have it and we, obviously, want
2518 to avoid this at any price
2519
2520 the trouble is that this function does it _silently_, i.e. it won't
2521 even tell us whether it did or not... Win98/2000 and higher provide
2522 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2523 we have to resort to a round trip, i.e. check that converting back
2524 results in the same string -- this is, of course, expensive but
2525 otherwise we simply can't be sure to not garble the data.
2526 */
2527
2528 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2529 // it doesn't work with CJK encodings (which we test for rather roughly
2530 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2531 // supporting it
2532 BOOL usedDef wxDUMMY_INITIALIZE(false);
2533 BOOL *pUsedDef;
2534 int flags;
2535 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2536 {
2537 // it's our lucky day
2538 flags = WC_NO_BEST_FIT_CHARS;
2539 pUsedDef = &usedDef;
2540 }
2541 else // old system or unsupported encoding
2542 {
2543 flags = 0;
2544 pUsedDef = NULL;
2545 }
2546
2547 const size_t len = ::WideCharToMultiByte
2548 (
2549 m_CodePage, // code page
2550 flags, // either none or no best fit
2551 pwz, // input string
2552 -1, // it is (wide) NUL-terminated
2553 buf, // output buffer
2554 buf ? n : 0, // and its size
2555 NULL, // default "replacement" char
2556 pUsedDef // [out] was it used?
2557 );
2558
2559 if ( !len )
2560 {
2561 // function totally failed
2562 return wxCONV_FAILED;
2563 }
2564
2565 // we did something, check if we really succeeded
2566 if ( flags )
2567 {
2568 // check if the conversion failed, i.e. if any replacements
2569 // were done
2570 if ( usedDef )
2571 return wxCONV_FAILED;
2572 }
2573 else // we must resort to double tripping...
2574 {
2575 // first we need to ensure that we really have the MB data: this is
2576 // not the case if we're called with NULL buffer, in which case we
2577 // need to do the conversion yet again
2578 wxCharBuffer bufDef;
2579 if ( !buf )
2580 {
2581 bufDef = wxCharBuffer(len);
2582 buf = bufDef.data();
2583 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2584 buf, len, NULL, NULL) )
2585 return wxCONV_FAILED;
2586 }
2587
2588 if ( !n )
2589 n = wcslen(pwz);
2590 wxWCharBuffer wcBuf(n);
2591 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2592 wcscmp(wcBuf, pwz) != 0 )
2593 {
2594 // we didn't obtain the same thing we started from, hence
2595 // the conversion was lossy and we consider that it failed
2596 return wxCONV_FAILED;
2597 }
2598 }
2599
2600 // see the comment above for the reason of "len - 1"
2601 return len - 1;
2602 }
2603
2604 virtual size_t GetMBNulLen() const
2605 {
2606 if ( m_minMBCharWidth == 0 )
2607 {
2608 int len = ::WideCharToMultiByte
2609 (
2610 m_CodePage, // code page
2611 0, // no flags
2612 L"", // input string
2613 1, // translate just the NUL
2614 NULL, // output buffer
2615 0, // and its size
2616 NULL, // no replacement char
2617 NULL // [out] don't care if it was used
2618 );
2619
2620 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2621 switch ( len )
2622 {
2623 default:
2624 wxLogDebug(_T("Unexpected NUL length %d"), len);
2625 self->m_minMBCharWidth = (size_t)-1;
2626 break;
2627
2628 case 0:
2629 self->m_minMBCharWidth = (size_t)-1;
2630 break;
2631
2632 case 1:
2633 case 2:
2634 case 4:
2635 self->m_minMBCharWidth = len;
2636 break;
2637 }
2638 }
2639
2640 return m_minMBCharWidth;
2641 }
2642
2643 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2644
2645 bool IsOk() const { return m_CodePage != -1; }
2646
2647 private:
2648 static bool CanUseNoBestFit()
2649 {
2650 static int s_isWin98Or2k = -1;
2651
2652 if ( s_isWin98Or2k == -1 )
2653 {
2654 int verMaj, verMin;
2655 switch ( wxGetOsVersion(&verMaj, &verMin) )
2656 {
2657 case wxOS_WINDOWS_9X:
2658 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2659 break;
2660
2661 case wxOS_WINDOWS_NT:
2662 s_isWin98Or2k = verMaj >= 5;
2663 break;
2664
2665 default:
2666 // unknown: be conservative by default
2667 s_isWin98Or2k = 0;
2668 break;
2669 }
2670
2671 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2672 }
2673
2674 return s_isWin98Or2k == 1;
2675 }
2676
2677 static bool IsAtLeastWin2kSP4()
2678 {
2679 #ifdef __WXWINCE__
2680 return false;
2681 #else
2682 static int s_isAtLeastWin2kSP4 = -1;
2683
2684 if ( s_isAtLeastWin2kSP4 == -1 )
2685 {
2686 OSVERSIONINFOEX ver;
2687
2688 memset(&ver, 0, sizeof(ver));
2689 ver.dwOSVersionInfoSize = sizeof(ver);
2690 GetVersionEx((OSVERSIONINFO*)&ver);
2691
2692 s_isAtLeastWin2kSP4 =
2693 ((ver.dwMajorVersion > 5) || // Vista+
2694 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2695 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2696 ver.wServicePackMajor >= 4)) // 2000 SP4+
2697 ? 1 : 0;
2698 }
2699
2700 return s_isAtLeastWin2kSP4 == 1;
2701 #endif
2702 }
2703
2704
2705 // the code page we're working with
2706 long m_CodePage;
2707
2708 // cached result of GetMBNulLen(), set to 0 initially meaning
2709 // "unknown"
2710 size_t m_minMBCharWidth;
2711 };
2712
2713 #endif // wxHAVE_WIN32_MB2WC
2714
2715
2716 // ============================================================================
2717 // wxEncodingConverter based conversion classes
2718 // ============================================================================
2719
2720 #if wxUSE_FONTMAP
2721
2722 class wxMBConv_wxwin : public wxMBConv
2723 {
2724 private:
2725 void Init()
2726 {
2727 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2728 // The wxMBConv_cf class does a better job.
2729 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2730 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2731 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2732 }
2733
2734 public:
2735 // temporarily just use wxEncodingConverter stuff,
2736 // so that it works while a better implementation is built
2737 wxMBConv_wxwin(const char* name)
2738 {
2739 if (name)
2740 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2741 else
2742 m_enc = wxFONTENCODING_SYSTEM;
2743
2744 Init();
2745 }
2746
2747 wxMBConv_wxwin(wxFontEncoding enc)
2748 {
2749 m_enc = enc;
2750
2751 Init();
2752 }
2753
2754 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2755 {
2756 size_t inbuf = strlen(psz);
2757 if (buf)
2758 {
2759 if (!m2w.Convert(psz, buf))
2760 return wxCONV_FAILED;
2761 }
2762 return inbuf;
2763 }
2764
2765 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2766 {
2767 const size_t inbuf = wxWcslen(psz);
2768 if (buf)
2769 {
2770 if (!w2m.Convert(psz, buf))
2771 return wxCONV_FAILED;
2772 }
2773
2774 return inbuf;
2775 }
2776
2777 virtual size_t GetMBNulLen() const
2778 {
2779 switch ( m_enc )
2780 {
2781 case wxFONTENCODING_UTF16BE:
2782 case wxFONTENCODING_UTF16LE:
2783 return 2;
2784
2785 case wxFONTENCODING_UTF32BE:
2786 case wxFONTENCODING_UTF32LE:
2787 return 4;
2788
2789 default:
2790 return 1;
2791 }
2792 }
2793
2794 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2795
2796 bool IsOk() const { return m_ok; }
2797
2798 public:
2799 wxFontEncoding m_enc;
2800 wxEncodingConverter m2w, w2m;
2801
2802 private:
2803 // were we initialized successfully?
2804 bool m_ok;
2805
2806 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2807 };
2808
2809 // make the constructors available for unit testing
2810 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2811 {
2812 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2813 if ( !result->IsOk() )
2814 {
2815 delete result;
2816 return 0;
2817 }
2818
2819 return result;
2820 }
2821
2822 #endif // wxUSE_FONTMAP
2823
2824 // ============================================================================
2825 // wxCSConv implementation
2826 // ============================================================================
2827
2828 void wxCSConv::Init()
2829 {
2830 m_name = NULL;
2831 m_convReal = NULL;
2832 m_deferred = true;
2833 }
2834
2835 wxCSConv::wxCSConv(const wxString& charset)
2836 {
2837 Init();
2838
2839 if ( !charset.empty() )
2840 {
2841 SetName(charset.ToAscii());
2842 }
2843
2844 #if wxUSE_FONTMAP
2845 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2846 #else
2847 m_encoding = wxFONTENCODING_SYSTEM;
2848 #endif
2849 }
2850
2851 wxCSConv::wxCSConv(wxFontEncoding encoding)
2852 {
2853 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2854 {
2855 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2856
2857 encoding = wxFONTENCODING_SYSTEM;
2858 }
2859
2860 Init();
2861
2862 m_encoding = encoding;
2863 }
2864
2865 wxCSConv::~wxCSConv()
2866 {
2867 Clear();
2868 }
2869
2870 wxCSConv::wxCSConv(const wxCSConv& conv)
2871 : wxMBConv()
2872 {
2873 Init();
2874
2875 SetName(conv.m_name);
2876 m_encoding = conv.m_encoding;
2877 }
2878
2879 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2880 {
2881 Clear();
2882
2883 SetName(conv.m_name);
2884 m_encoding = conv.m_encoding;
2885
2886 return *this;
2887 }
2888
2889 void wxCSConv::Clear()
2890 {
2891 free(m_name);
2892 delete m_convReal;
2893
2894 m_name = NULL;
2895 m_convReal = NULL;
2896 }
2897
2898 void wxCSConv::SetName(const char *charset)
2899 {
2900 if (charset)
2901 {
2902 m_name = wxStrdup(charset);
2903 m_deferred = true;
2904 }
2905 }
2906
2907 #if wxUSE_FONTMAP
2908
// hash map type associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding it stores the name
// for which a wxMBConv_iconv could be created or an empty string if none of
// the names of this encoding worked
static wxEncodingNameCache gs_nameCache;
2913 #endif
2914
// Create the object doing the real conversion for m_name or m_encoding.
//
// Returns a new converter object or NULL if no conversion is needed
// (ISO8859-1) or if no converter could be created.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
#if !wxUSE_FONTMAP
    // without the font mapper we can only use iconv if we have a name
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        if ( m_name )
        {
            // first try the name exactly as it was given to us
            wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            // the name didn't work directly, find the encoding corresponding
            // to it to try its other names below
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // check if we already know a working name for this encoding
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                // an empty name means that a previous attempt has failed
                //
                // NOTE(review): this returns without trying the steps (2)
                // and (3) below, although they are tried when the failure
                // is cached for the first time -- confirm this is intended
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (eg for MacRoman)
            // encoding got a 'failure' entry in the cache all the same,
            // although it just has to be created using a different method, so
            // only store failed iconv creation attempts (or perhaps we
            // shouldn't do this at all ?)
            if ( names[0] != NULL )
            {
                // try all the names of this encoding in turn and remember
                // the first one which works
                for ( ; *names; ++names )
                {
                    // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
                    //             will need changes that will obsolete this
                    wxString name(*names);
                    wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
                    if ( conv->IsOk() )
                    {
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        // the only wxMBConv_win32 ctors which could be used here need
        // wxUSE_FONTMAP
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#ifdef __DARWIN__
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
                                       : new wxMBConv_cf(m_encoding);
#else
            wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
#endif

            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif // __DARWIN__

    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
#else // !wxUSE_FONTMAP
                         (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    // all the methods have failed
    return NULL;
}
3121
3122 void wxCSConv::CreateConvIfNeeded() const
3123 {
3124 if ( m_deferred )
3125 {
3126 wxCSConv *self = (wxCSConv *)this; // const_cast
3127
3128 // if we don't have neither the name nor the encoding, use the default
3129 // encoding for this system
3130 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3131 {
3132 #if wxUSE_INTL
3133 self->m_encoding = wxLocale::GetSystemEncoding();
3134 #else
3135 // fallback to some reasonable default:
3136 self->m_encoding = wxFONTENCODING_ISO8859_1;
3137 #endif // wxUSE_INTL
3138 }
3139
3140 self->m_convReal = DoCreate();
3141 self->m_deferred = false;
3142 }
3143 }
3144
3145 bool wxCSConv::IsOk() const
3146 {
3147 CreateConvIfNeeded();
3148
3149 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3150 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3151 return true; // always ok as we do it ourselves
3152
3153 // m_convReal->IsOk() is called at its own creation, so we know it must
3154 // be ok if m_convReal is non-NULL
3155 return m_convReal != NULL;
3156 }
3157
3158 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3159 const char *src, size_t srcLen) const
3160 {
3161 CreateConvIfNeeded();
3162
3163 if (m_convReal)
3164 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3165
3166 // latin-1 (direct)
3167 if ( srcLen == wxNO_LEN )
3168 srcLen = strlen(src) + 1; // take trailing NUL too
3169
3170 if ( dst )
3171 {
3172 if ( dstLen < srcLen )
3173 return wxCONV_FAILED;
3174
3175 for ( size_t n = 0; n < srcLen; n++ )
3176 dst[n] = (unsigned char)(src[n]);
3177 }
3178
3179 return srcLen;
3180 }
3181
3182 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3183 const wchar_t *src, size_t srcLen) const
3184 {
3185 CreateConvIfNeeded();
3186
3187 if (m_convReal)
3188 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3189
3190 // latin-1 (direct)
3191 if ( srcLen == wxNO_LEN )
3192 srcLen = wxWcslen(src) + 1;
3193
3194 if ( dst )
3195 {
3196 if ( dstLen < srcLen )
3197 return wxCONV_FAILED;
3198
3199 for ( size_t n = 0; n < srcLen; n++ )
3200 {
3201 if ( src[n] > 0xFF )
3202 return wxCONV_FAILED;
3203
3204 dst[n] = (char)src[n];
3205 }
3206
3207 }
3208 else // still need to check the input validity
3209 {
3210 for ( size_t n = 0; n < srcLen; n++ )
3211 {
3212 if ( src[n] > 0xFF )
3213 return wxCONV_FAILED;
3214 }
3215 }
3216
3217 return srcLen;
3218 }
3219
3220 size_t wxCSConv::GetMBNulLen() const
3221 {
3222 CreateConvIfNeeded();
3223
3224 if ( m_convReal )
3225 {
3226 return m_convReal->GetMBNulLen();
3227 }
3228
3229 // otherwise, we are ISO-8859-1
3230 return 1;
3231 }
3232
3233 #if wxUSE_UNICODE_UTF8
3234 bool wxCSConv::IsUTF8() const
3235 {
3236 CreateConvIfNeeded();
3237
3238 if ( m_convReal )
3239 {
3240 return m_convReal->IsUTF8();
3241 }
3242
3243 // otherwise, we are ISO-8859-1
3244 return false;
3245 }
3246 #endif
3247
3248
3249 #if wxUSE_UNICODE
3250
3251 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3252 {
3253 if ( !s )
3254 return wxWCharBuffer();
3255
3256 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3257 if ( !wbuf )
3258 wbuf = wxMBConvUTF8().cMB2WX(s);
3259 if ( !wbuf )
3260 wbuf = wxConvISO8859_1.cMB2WX(s);
3261
3262 return wbuf;
3263 }
3264
3265 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3266 {
3267 if ( !ws )
3268 return wxCharBuffer();
3269
3270 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3271 if ( !buf )
3272 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3273
3274 return buf;
3275 }
3276
3277 #endif // wxUSE_UNICODE
3278
3279 // ----------------------------------------------------------------------------
3280 // globals
3281 // ----------------------------------------------------------------------------
3282
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3289
3290 #undef wxConvLibc
3291 #undef wxConvUTF8
3292 #undef wxConvUTF7
3293 #undef wxConvLocal
3294 #undef wxConvISO8859_1
3295
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines for the
// converter called "name":
//
//  - the variable name##Ptr of type klass*, initially NULL
//  - the function wxGet_##name##Ptr() returning the address of its local
//    static object of type impl_klass constructed with ctor_args
//  - a static variable initialized by calling this function
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// same as WX_DEFINE_GLOBAL_CONV2() for the common case when the object is of
// the same class as the pointer to it
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3310
// wxConvLibc is implemented by wxMBConv_win32 using the default code page
// under Windows and by wxMBConvLibc elsewhere
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to wxConvLibc and wxConvUI to wxConvLocal
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// the converter used for the file names: wxConvMacUTF8DObj under Darwin,
// wxConvLibc elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3344
3345 #else // !wxUSE_WCHAR_T
3346
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: the global converters are defined as
// plain wxMBConv objects in this case
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3353
3354 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T