remove wxCSConv::MB2WC/WC2MB, implement Latin-1 fallback conversion in To/FromWChar...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV _T("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
// helper function of cMB2WC(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are NUL (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existins ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten = 0;
171
172 // the number of NULs terminating this string
173 size_t nulLen = 0; // not really needed, but just to avoid warnings
174
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
178 // NULs at the end
179 wxCharBuffer bufTmp;
180 const char *srcEnd;
181 if ( srcLen != wxNO_LEN )
182 {
183 // we need to know how to find the end of this string
184 nulLen = GetMBNulLen();
185 if ( nulLen == wxCONV_FAILED )
186 return wxCONV_FAILED;
187
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
190 {
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
193 char * const p = bufTmp.data();
194 memcpy(p, src, srcLen);
195 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
196 *s = '\0';
197
198 src = bufTmp;
199 }
200
201 srcEnd = src + srcLen;
202 }
203 else // quit after the first loop iteration
204 {
205 srcEnd = NULL;
206 }
207
208 for ( ;; )
209 {
210 // try to convert the current chunk
211 size_t lenChunk = MB2WC(NULL, src, 0);
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
214
215 lenChunk++; // for the L'\0' at the end of this chunk
216
217 dstWritten += lenChunk;
218
219 if ( lenChunk == 1 )
220 {
221 // nothing left in the input string, conversion succeeded
222 break;
223 }
224
225 if ( dst )
226 {
227 if ( dstWritten > dstLen )
228 return wxCONV_FAILED;
229
230 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
231 return wxCONV_FAILED;
232
233 dst += lenChunk;
234 }
235
236 if ( !srcEnd )
237 {
238 // we convert just one chunk in this case as this is the entire
239 // string anyhow
240 break;
241 }
242
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src, nulLen) )
245 {
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
250 src += nulLen;
251 }
252
253 src += nulLen; // skipping over its terminator as well
254
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by inEnd
258 if ( src >= srcEnd )
259 break;
260 }
261
262 return dstWritten;
263 }
264
265 size_t
266 wxMBConv::FromWChar(char *dst, size_t dstLen,
267 const wchar_t *src, size_t srcLen) const
268 {
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten = 0;
271
272 // make a copy of the input string unless it is already properly
273 // NUL-terminated
274 //
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp;
278 if ( srcLen == wxNO_LEN )
279 {
280 srcLen = wxWcslen(src) + 1;
281 }
282 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
283 {
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp = wxWCharBuffer(srcLen);
286 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
287 src = bufTmp;
288 }
289
290 const size_t lenNul = GetMBNulLen();
291 for ( const wchar_t * const srcEnd = src + srcLen;
292 src < srcEnd;
293 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
294 {
295 // try to convert the current chunk
296 size_t lenChunk = WC2MB(NULL, src, 0);
297
298 if ( lenChunk == wxCONV_FAILED )
299 return wxCONV_FAILED;
300
301 lenChunk += lenNul;
302 dstWritten += lenChunk;
303
304 if ( dst )
305 {
306 if ( dstWritten > dstLen )
307 return wxCONV_FAILED;
308
309 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 dst += lenChunk;
313 }
314 }
315
316 return dstWritten;
317 }
318
319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
320 {
321 size_t rc = ToWChar(outBuff, outLen, inBuff);
322 if ( rc != wxCONV_FAILED )
323 {
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
326 rc--;
327 }
328
329 return rc;
330 }
331
332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
333 {
334 size_t rc = FromWChar(outBuff, outLen, inBuff);
335 if ( rc != wxCONV_FAILED )
336 {
337 rc -= GetMBNulLen();
338 }
339
340 return rc;
341 }
342
343 wxMBConv::~wxMBConv()
344 {
345 // nothing to do here (necessary for Darwin linking probably)
346 }
347
348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
349 {
350 if ( psz )
351 {
352 // calculate the length of the buffer needed first
353 const size_t nLen = ToWChar(NULL, 0, psz);
354 if ( nLen != wxCONV_FAILED )
355 {
356 // now do the actual conversion
357 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
358
359 // +1 for the trailing NULL
360 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
361 return buf;
362 }
363 }
364
365 return wxWCharBuffer();
366 }
367
368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
369 {
370 if ( pwz )
371 {
372 const size_t nLen = FromWChar(NULL, 0, pwz);
373 if ( nLen != wxCONV_FAILED )
374 {
375 wxCharBuffer buf(nLen - 1);
376 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
377 return buf;
378 }
379 }
380
381 return wxCharBuffer();
382 }
383
384 const wxWCharBuffer
385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
386 {
387 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
388 if ( dstLen != wxCONV_FAILED )
389 {
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer wbuf(dstLen);
394 wbuf.data()[dstLen - 1] = L'\0';
395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
396 {
397 if ( outLen )
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412 }
413
414 const wxCharBuffer
415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
416 {
417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
418 if ( dstLen != wxCONV_FAILED )
419 {
420 const size_t nulLen = GetMBNulLen();
421
422 // as above, ensure that the buffer is always NUL-terminated, even if
423 // the input is not
424 wxCharBuffer buf(dstLen + nulLen - 1);
425 memset(buf.data() + dstLen, 0, nulLen);
426 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
427 {
428 if ( outLen )
429 {
430 *outLen = dstLen;
431
432 if ( dstLen >= nulLen &&
433 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
434 {
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
437 *outLen -= nulLen;
438 }
439 }
440
441 return buf;
442 }
443 }
444
445 if ( outLen )
446 *outLen = 0;
447
448 return wxCharBuffer();
449 }
450
451 // ----------------------------------------------------------------------------
452 // wxMBConvLibc
453 // ----------------------------------------------------------------------------
454
455 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
456 {
457 return wxMB2WC(buf, psz, n);
458 }
459
460 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
461 {
462 return wxWC2MB(buf, psz, n);
463 }
464
465 // ----------------------------------------------------------------------------
466 // wxConvBrokenFileNames
467 // ----------------------------------------------------------------------------
468
469 #ifdef __UNIX__
470
471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
472 {
473 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
474 wxStricmp(charset, _T("UTF8")) == 0 )
475 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
476 else
477 m_conv = new wxCSConv(charset);
478 }
479
480 #endif // __UNIX__
481
482 // ----------------------------------------------------------------------------
483 // UTF-7
484 // ----------------------------------------------------------------------------
485
486 // Implementation (C) 2004 Fredrik Roubert
487 //
488 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
489
//
// BASE64 decoding table: maps a byte to the 6 bit value it encodes or to 0xff
// if the byte is not one of the BASE64 characters
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: no valid characters at all
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
528
529 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
530 const char *src, size_t srcLen) const
531 {
532 DecoderState stateOrig,
533 *statePtr;
534 if ( srcLen == wxNO_LEN )
535 {
536 // convert the entire string, up to and including the trailing NUL
537 srcLen = strlen(src) + 1;
538
539 // when working on the entire strings we don't update nor use the shift
540 // state from the previous call
541 statePtr = &stateOrig;
542 }
543 else // when working with partial strings we do use the shift state
544 {
545 statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
546
547 // also save the old state to be able to rollback to it on error
548 stateOrig = m_stateDecoder;
549 }
550
551 // but to simplify the code below we use this variable in both cases
552 DecoderState& state = *statePtr;
553
554
555 // number of characters [which would have been] written to dst [if it were
556 // not NULL]
557 size_t len = 0;
558
559 const char * const srcEnd = src + srcLen;
560
561 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
562 {
563 const unsigned char cc = *src++;
564
565 if ( state.IsShifted() )
566 {
567 const unsigned char dc = utf7unb64[cc];
568 if ( dc == 0xff )
569 {
570 // end of encoded part
571 state.ToDirect();
572
573 // re-parse this character normally below unless it's '-' which
574 // is consumed by the decoder
575 if ( cc == '-' )
576 continue;
577 }
578 else // valid encoded character
579 {
580 // mini base64 decoder: each character is 6 bits
581 state.bit += 6;
582 state.accum <<= 6;
583 state.accum += dc;
584
585 if ( state.bit >= 8 )
586 {
587 // got the full byte, consume it
588 state.bit -= 8;
589 unsigned char b = (state.accum >> state.bit) & 0x00ff;
590
591 if ( state.isLSB )
592 {
593 // we've got the full word, output it
594 if ( dst )
595 *dst++ = (state.msb << 8) | b;
596 len++;
597 state.isLSB = false;
598 }
599 else // MSB
600 {
601 // just store it while we wait for LSB
602 state.msb = b;
603 state.isLSB = true;
604 }
605 }
606 }
607 }
608
609 if ( state.IsDirect() )
610 {
611 // start of an encoded segment?
612 if ( cc == '+' )
613 {
614 if ( src == srcEnd )
615 return wxCONV_FAILED; // can't have '+' at the end
616
617 if ( *src == '-' )
618 {
619 // just the encoded plus sign, don't switch to shifted mode
620 if ( dst )
621 *dst++ = '+';
622 len++;
623 src++;
624 }
625 else
626 {
627 state.ToShifted();
628 }
629 }
630 else // not '+'
631 {
632 // only printable 7 bit ASCII characters (with the exception of
633 // NUL, TAB, CR and LF) can be used directly
634 if ( cc >= 0x7f || (cc < ' ' &&
635 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
636 return wxCONV_FAILED;
637
638 if ( dst )
639 *dst++ = cc;
640 len++;
641 }
642 }
643 }
644
645 if ( !len )
646 {
647 // as we didn't read any characters we should be called with the same
648 // data (followed by some more new data) again later so don't save our
649 // state
650 state = stateOrig;
651
652 return wxCONV_FAILED;
653 }
654
655 return len;
656 }
657
//
// BASE64 encoding table: maps a 6 bit value to the character encoding it
//
static const unsigned char utf7enb64[] =
{
    // values 0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // values 26..51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // values 52..61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // values 62 and 63
    '+', '/'
};
672
//
// UTF-7 encoding table, indexed by the 7 bit character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 70..7F
};

// returns true if the character belongs to Set D, i.e. can be output as is
// in UTF-7
//
// the character is converted to an unsigned type before the range check
// because wchar_t may be signed and a negative value would otherwise pass
// the "< 0x80" test and be used as a (negative) index into the table
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    const unsigned long uc = static_cast<unsigned long>(wc);

    return uc < 0x80 && utf7encode[uc] < 1;
}
697
698 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
699 const wchar_t *src, size_t srcLen) const
700 {
701 EncoderState stateOrig,
702 *statePtr;
703 if ( srcLen == wxNO_LEN )
704 {
705 // we don't apply the stored state when operating on entire strings at
706 // once
707 statePtr = &stateOrig;
708
709 srcLen = wxWcslen(src) + 1;
710 }
711 else // do use the mode we left the output in previously
712 {
713 stateOrig = m_stateEncoder;
714 statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
715 }
716
717 EncoderState& state = *statePtr;
718
719
720 size_t len = 0;
721
722 const wchar_t * const srcEnd = src + srcLen;
723 while ( src < srcEnd && (!dst || len < dstLen) )
724 {
725 wchar_t cc = *src++;
726 if ( wxIsUTF7Direct(cc) )
727 {
728 if ( state.IsShifted() )
729 {
730 // pad with zeros the last encoded block if necessary
731 if ( state.bit )
732 {
733 if ( dst )
734 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
735 len++;
736 }
737
738 state.ToDirect();
739
740 if ( dst )
741 *dst++ = '-';
742 len++;
743 }
744
745 if ( dst )
746 *dst++ = (char)cc;
747 len++;
748 }
749 else if ( cc == '+' && state.IsDirect() )
750 {
751 if ( dst )
752 {
753 *dst++ = '+';
754 *dst++ = '-';
755 }
756
757 len += 2;
758 }
759 #ifndef WC_UTF16
760 else if (((wxUint32)cc) > 0xffff)
761 {
762 // no surrogate pair generation (yet?)
763 return wxCONV_FAILED;
764 }
765 #endif
766 else
767 {
768 if ( state.IsDirect() )
769 {
770 state.ToShifted();
771
772 if ( dst )
773 *dst++ = '+';
774 len++;
775 }
776
777 // BASE64 encode string
778 for ( ;; )
779 {
780 for ( unsigned lsb = 0; lsb < 2; lsb++ )
781 {
782 state.accum <<= 8;
783 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
784
785 for (state.bit += 8; state.bit >= 6; )
786 {
787 state.bit -= 6;
788 if ( dst )
789 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
790 len++;
791 }
792 }
793
794 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
795 break;
796
797 src++;
798 }
799 }
800 }
801
802 // we need to restore the original encoder state if we were called just to
803 // calculate the amount of space needed as we will presumably be called
804 // again to really convert the data now
805 if ( !dst )
806 state = stateOrig;
807
808 return len;
809 }
810
811 // ----------------------------------------------------------------------------
812 // UTF-8
813 // ----------------------------------------------------------------------------
814
815 static const wxUint32 utf8_max[]=
816 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
817
818 // boundaries of the private use area we use to (temporarily) remap invalid
819 // characters invalid in a UTF-8 encoded string
820 const wxUint32 wxUnicodePUA = 0x100000;
821 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
822
// length of the UTF-8 sequence starting with the given byte or 0 if the byte
// can't start a valid sequence:
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded as themselves
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: not valid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid, the rest start 2 byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: 2 byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: 3 byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: F0..F4 start 4 byte sequences, F5 and above are invalid (they
    //         would start 5 or 6 byte sequences or encode code points above
    //         U+10FFFF, which is not allowed by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
857
858 size_t
859 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
860 const char *src, size_t srcLen) const
861 {
862 wchar_t *out = dstLen ? dst : NULL;
863 size_t written = 0;
864
865 if ( srcLen == wxNO_LEN )
866 srcLen = strlen(src) + 1;
867
868 for ( const char *p = src; ; p++ )
869 {
870 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
871 {
872 // all done successfully, just add the trailing NULL if we are not
873 // using explicit length
874 if ( srcLen == wxNO_LEN )
875 {
876 if ( out )
877 {
878 if ( !dstLen )
879 break;
880
881 *out = L'\0';
882 }
883
884 written++;
885 }
886
887 return written;
888 }
889
890 if ( out && !dstLen-- )
891 break;
892
893 wxUint32 code;
894 unsigned char c = *p;
895
896 if ( c < 0x80 )
897 {
898 if ( srcLen == 0 ) // the test works for wxNO_LEN too
899 break;
900
901 if ( srcLen != wxNO_LEN )
902 srcLen--;
903
904 code = c;
905 }
906 else
907 {
908 unsigned len = tableUtf8Lengths[c];
909 if ( !len )
910 break;
911
912 if ( srcLen < len ) // the test works for wxNO_LEN too
913 break;
914
915 if ( srcLen != wxNO_LEN )
916 srcLen -= len;
917
918 // Char. number range | UTF-8 octet sequence
919 // (hexadecimal) | (binary)
920 // ----------------------+----------------------------------------
921 // 0000 0000 - 0000 007F | 0xxxxxxx
922 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
923 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
924 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
925 //
926 // Code point value is stored in bits marked with 'x',
927 // lowest-order bit of the value on the right side in the diagram
928 // above. (from RFC 3629)
929
930 // mask to extract lead byte's value ('x' bits above), by sequence
931 // length:
932 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
933
934 // mask and value of lead byte's most significant bits, by length:
935 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
936 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
937
938 len--; // it's more convenient to work with 0-based length here
939
940 // extract the lead byte's value bits:
941 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
942 break;
943
944 code = c & leadValueMask[len];
945
946 // all remaining bytes, if any, are handled in the same way
947 // regardless of sequence's length:
948 for ( ; len; --len )
949 {
950 c = *++p;
951 if ( (c & 0xC0) != 0x80 )
952 return wxCONV_FAILED;
953
954 code <<= 6;
955 code |= c & 0x3F;
956 }
957 }
958
959 #ifdef WC_UTF16
960 // cast is ok because wchar_t == wxUint16 if WC_UTF16
961 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
962 {
963 if ( out )
964 out++;
965 written++;
966 }
967 #else // !WC_UTF16
968 if ( out )
969 *out = code;
970 #endif // WC_UTF16/!WC_UTF16
971
972 if ( out )
973 out++;
974
975 written++;
976 }
977
978 return wxCONV_FAILED;
979 }
980
981 size_t
982 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
983 const wchar_t *src, size_t srcLen) const
984 {
985 char *out = dstLen ? dst : NULL;
986 size_t written = 0;
987
988 for ( const wchar_t *wp = src; ; wp++ )
989 {
990 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
991 {
992 // all done successfully, just add the trailing NULL if we are not
993 // using explicit length
994 if ( srcLen == wxNO_LEN )
995 {
996 if ( out )
997 {
998 if ( !dstLen )
999 break;
1000
1001 *out = '\0';
1002 }
1003
1004 written++;
1005 }
1006
1007 return written;
1008 }
1009
1010 if ( srcLen != wxNO_LEN )
1011 srcLen--;
1012
1013 wxUint32 code;
1014 #ifdef WC_UTF16
1015 // cast is ok for WC_UTF16
1016 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1017 {
1018 // skip the next char too as we decoded a surrogate
1019 wp++;
1020 }
1021 #else // wchar_t is UTF-32
1022 code = *wp & 0x7fffffff;
1023 #endif
1024
1025 unsigned len;
1026 if ( code <= 0x7F )
1027 {
1028 len = 1;
1029 if ( out )
1030 {
1031 if ( dstLen < len )
1032 break;
1033
1034 out[0] = (char)code;
1035 }
1036 }
1037 else if ( code <= 0x07FF )
1038 {
1039 len = 2;
1040 if ( out )
1041 {
1042 if ( dstLen < len )
1043 break;
1044
1045 // NB: this line takes 6 least significant bits, encodes them as
1046 // 10xxxxxx and discards them so that the next byte can be encoded:
1047 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1048 out[0] = 0xC0 | code;
1049 }
1050 }
1051 else if ( code < 0xFFFF )
1052 {
1053 len = 3;
1054 if ( out )
1055 {
1056 if ( dstLen < len )
1057 break;
1058
1059 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1060 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1061 out[0] = 0xE0 | code;
1062 }
1063 }
1064 else if ( code <= 0x10FFFF )
1065 {
1066 len = 4;
1067 if ( out )
1068 {
1069 if ( dstLen < len )
1070 break;
1071
1072 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1073 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1074 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1075 out[0] = 0xF0 | code;
1076 }
1077 }
1078 else
1079 {
1080 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1081 break;
1082 }
1083
1084 if ( out )
1085 {
1086 out += len;
1087 dstLen -= len;
1088 }
1089
1090 written += len;
1091 }
1092
1093 // we only get here if an error occurs during decoding
1094 return wxCONV_FAILED;
1095 }
1096
1097 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1098 const char *psz, size_t srcLen) const
1099 {
1100 if ( m_options == MAP_INVALID_UTF8_NOT )
1101 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1102
1103 size_t len = 0;
1104
1105 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1106 {
1107 const char *opsz = psz;
1108 bool invalid = false;
1109 unsigned char cc = *psz++, fc = cc;
1110 unsigned cnt;
1111 for (cnt = 0; fc & 0x80; cnt++)
1112 fc <<= 1;
1113
1114 if (!cnt)
1115 {
1116 // plain ASCII char
1117 if (buf)
1118 *buf++ = cc;
1119 len++;
1120
1121 // escape the escape character for octal escapes
1122 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1123 && cc == '\\' && (!buf || len < n))
1124 {
1125 if (buf)
1126 *buf++ = cc;
1127 len++;
1128 }
1129 }
1130 else
1131 {
1132 cnt--;
1133 if (!cnt)
1134 {
1135 // invalid UTF-8 sequence
1136 invalid = true;
1137 }
1138 else
1139 {
1140 unsigned ocnt = cnt - 1;
1141 wxUint32 res = cc & (0x3f >> cnt);
1142 while (cnt--)
1143 {
1144 cc = *psz;
1145 if ((cc & 0xC0) != 0x80)
1146 {
1147 // invalid UTF-8 sequence
1148 invalid = true;
1149 break;
1150 }
1151
1152 psz++;
1153 res = (res << 6) | (cc & 0x3f);
1154 }
1155
1156 if (invalid || res <= utf8_max[ocnt])
1157 {
1158 // illegal UTF-8 encoding
1159 invalid = true;
1160 }
1161 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1162 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1163 {
1164 // if one of our PUA characters turns up externally
1165 // it must also be treated as an illegal sequence
1166 // (a bit like you have to escape an escape character)
1167 invalid = true;
1168 }
1169 else
1170 {
1171 #ifdef WC_UTF16
1172 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1173 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1174 if (pa == wxCONV_FAILED)
1175 {
1176 invalid = true;
1177 }
1178 else
1179 {
1180 if (buf)
1181 buf += pa;
1182 len += pa;
1183 }
1184 #else // !WC_UTF16
1185 if (buf)
1186 *buf++ = (wchar_t)res;
1187 len++;
1188 #endif // WC_UTF16/!WC_UTF16
1189 }
1190 }
1191
1192 if (invalid)
1193 {
1194 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1195 {
1196 while (opsz < psz && (!buf || len < n))
1197 {
1198 #ifdef WC_UTF16
1199 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1200 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1201 wxASSERT(pa != wxCONV_FAILED);
1202 if (buf)
1203 buf += pa;
1204 opsz++;
1205 len += pa;
1206 #else
1207 if (buf)
1208 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1209 opsz++;
1210 len++;
1211 #endif
1212 }
1213 }
1214 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1215 {
1216 while (opsz < psz && (!buf || len < n))
1217 {
1218 if ( buf && len + 3 < n )
1219 {
1220 unsigned char on = *opsz;
1221 *buf++ = L'\\';
1222 *buf++ = (wchar_t)( L'0' + on / 0100 );
1223 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1224 *buf++ = (wchar_t)( L'0' + on % 010 );
1225 }
1226
1227 opsz++;
1228 len += 4;
1229 }
1230 }
1231 else // MAP_INVALID_UTF8_NOT
1232 {
1233 return wxCONV_FAILED;
1234 }
1235 }
1236 }
1237 }
1238
1239 if (srcLen == wxNO_LEN && buf && (len < n))
1240 *buf = 0;
1241
1242 return len + 1;
1243 }
1244
// Tell whether the given wide character is an octal digit, i.e. one of
// '0' through '7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1249
1250 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1251 const wchar_t *psz, size_t srcLen) const
1252 {
1253 if ( m_options == MAP_INVALID_UTF8_NOT )
1254 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1255
1256 size_t len = 0;
1257
1258 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1259 {
1260 wxUint32 cc;
1261
1262 #ifdef WC_UTF16
1263 // cast is ok for WC_UTF16
1264 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1265 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1266 #else
1267 cc = (*psz++) & 0x7fffffff;
1268 #endif
1269
1270 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1271 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1272 {
1273 if (buf)
1274 *buf++ = (char)(cc - wxUnicodePUA);
1275 len++;
1276 }
1277 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1278 && cc == L'\\' && psz[0] == L'\\' )
1279 {
1280 if (buf)
1281 *buf++ = (char)cc;
1282 psz++;
1283 len++;
1284 }
1285 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1286 cc == L'\\' &&
1287 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1288 {
1289 if (buf)
1290 {
1291 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1292 (psz[1] - L'0') * 010 +
1293 (psz[2] - L'0'));
1294 }
1295
1296 psz += 3;
1297 len++;
1298 }
1299 else
1300 {
1301 unsigned cnt;
1302 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1303 {
1304 }
1305
1306 if (!cnt)
1307 {
1308 // plain ASCII char
1309 if (buf)
1310 *buf++ = (char) cc;
1311 len++;
1312 }
1313 else
1314 {
1315 len += cnt + 1;
1316 if (buf)
1317 {
1318 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1319 while (cnt--)
1320 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1321 }
1322 }
1323 }
1324 }
1325
1326 if (srcLen == wxNO_LEN && buf && (len < n))
1327 *buf = 0;
1328
1329 return len + 1;
1330 }
1331
1332 // ============================================================================
1333 // UTF-16
1334 // ============================================================================
1335
// "straight" is the UTF-16 converter using the byte order selected by
// WORDS_BIGENDIAN, "swap" the one using the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1343
1344 /* static */
1345 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1346 {
1347 if ( srcLen == wxNO_LEN )
1348 {
1349 // count the number of bytes in input, including the trailing NULs
1350 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1351 for ( srcLen = 1; *inBuff++; srcLen++ )
1352 ;
1353
1354 srcLen *= BYTES_PER_CHAR;
1355 }
1356 else // we already have the length
1357 {
1358 // we can only convert an entire number of UTF-16 characters
1359 if ( srcLen % BYTES_PER_CHAR )
1360 return wxCONV_FAILED;
1361 }
1362
1363 return srcLen;
1364 }
1365
1366 // case when in-memory representation is UTF-16 too
1367 #ifdef WC_UTF16
1368
1369 // ----------------------------------------------------------------------------
1370 // conversions without endianness change
1371 // ----------------------------------------------------------------------------
1372
1373 size_t
1374 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1375 const char *src, size_t srcLen) const
1376 {
1377 // set up the scene for using memcpy() (which is presumably more efficient
1378 // than copying the bytes one by one)
1379 srcLen = GetLength(src, srcLen);
1380 if ( srcLen == wxNO_LEN )
1381 return wxCONV_FAILED;
1382
1383 const size_t inLen = srcLen / BYTES_PER_CHAR;
1384 if ( dst )
1385 {
1386 if ( dstLen < inLen )
1387 return wxCONV_FAILED;
1388
1389 memcpy(dst, src, srcLen);
1390 }
1391
1392 return inLen;
1393 }
1394
1395 size_t
1396 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1397 const wchar_t *src, size_t srcLen) const
1398 {
1399 if ( srcLen == wxNO_LEN )
1400 srcLen = wxWcslen(src) + 1;
1401
1402 srcLen *= BYTES_PER_CHAR;
1403
1404 if ( dst )
1405 {
1406 if ( dstLen < srcLen )
1407 return wxCONV_FAILED;
1408
1409 memcpy(dst, src, srcLen);
1410 }
1411
1412 return srcLen;
1413 }
1414
1415 // ----------------------------------------------------------------------------
1416 // endian-reversing conversions
1417 // ----------------------------------------------------------------------------
1418
1419 size_t
1420 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1421 const char *src, size_t srcLen) const
1422 {
1423 srcLen = GetLength(src, srcLen);
1424 if ( srcLen == wxNO_LEN )
1425 return wxCONV_FAILED;
1426
1427 srcLen /= BYTES_PER_CHAR;
1428
1429 if ( dst )
1430 {
1431 if ( dstLen < srcLen )
1432 return wxCONV_FAILED;
1433
1434 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1435 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1436 {
1437 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1438 }
1439 }
1440
1441 return srcLen;
1442 }
1443
1444 size_t
1445 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1446 const wchar_t *src, size_t srcLen) const
1447 {
1448 if ( srcLen == wxNO_LEN )
1449 srcLen = wxWcslen(src) + 1;
1450
1451 srcLen *= BYTES_PER_CHAR;
1452
1453 if ( dst )
1454 {
1455 if ( dstLen < srcLen )
1456 return wxCONV_FAILED;
1457
1458 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1459 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1460 {
1461 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1462 }
1463 }
1464
1465 return srcLen;
1466 }
1467
1468 #else // !WC_UTF16: wchar_t is UTF-32
1469
1470 // ----------------------------------------------------------------------------
1471 // conversions without endianness change
1472 // ----------------------------------------------------------------------------
1473
1474 size_t
1475 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1476 const char *src, size_t srcLen) const
1477 {
1478 srcLen = GetLength(src, srcLen);
1479 if ( srcLen == wxNO_LEN )
1480 return wxCONV_FAILED;
1481
1482 const size_t inLen = srcLen / BYTES_PER_CHAR;
1483 if ( !dst )
1484 {
1485 // optimization: return maximal space which could be needed for this
1486 // string even if the real size could be smaller if the buffer contains
1487 // any surrogates
1488 return inLen;
1489 }
1490
1491 size_t outLen = 0;
1492 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1493 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1494 {
1495 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1496 if ( !inBuff )
1497 return wxCONV_FAILED;
1498
1499 if ( ++outLen > dstLen )
1500 return wxCONV_FAILED;
1501
1502 *dst++ = ch;
1503 }
1504
1505
1506 return outLen;
1507 }
1508
1509 size_t
1510 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1511 const wchar_t *src, size_t srcLen) const
1512 {
1513 if ( srcLen == wxNO_LEN )
1514 srcLen = wxWcslen(src) + 1;
1515
1516 size_t outLen = 0;
1517 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1518 for ( size_t n = 0; n < srcLen; n++ )
1519 {
1520 wxUint16 cc[2];
1521 const size_t numChars = encode_utf16(*src++, cc);
1522 if ( numChars == wxCONV_FAILED )
1523 return wxCONV_FAILED;
1524
1525 outLen += numChars * BYTES_PER_CHAR;
1526 if ( outBuff )
1527 {
1528 if ( outLen > dstLen )
1529 return wxCONV_FAILED;
1530
1531 *outBuff++ = cc[0];
1532 if ( numChars == 2 )
1533 {
1534 // second character of a surrogate
1535 *outBuff++ = cc[1];
1536 }
1537 }
1538 }
1539
1540 return outLen;
1541 }
1542
1543 // ----------------------------------------------------------------------------
1544 // endian-reversing conversions
1545 // ----------------------------------------------------------------------------
1546
1547 size_t
1548 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1549 const char *src, size_t srcLen) const
1550 {
1551 srcLen = GetLength(src, srcLen);
1552 if ( srcLen == wxNO_LEN )
1553 return wxCONV_FAILED;
1554
1555 const size_t inLen = srcLen / BYTES_PER_CHAR;
1556 if ( !dst )
1557 {
1558 // optimization: return maximal space which could be needed for this
1559 // string even if the real size could be smaller if the buffer contains
1560 // any surrogates
1561 return inLen;
1562 }
1563
1564 size_t outLen = 0;
1565 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1566 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1567 {
1568 wxUint32 ch;
1569 wxUint16 tmp[2];
1570
1571 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1572 inBuff++;
1573 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1574
1575 const size_t numChars = decode_utf16(tmp, ch);
1576 if ( numChars == wxCONV_FAILED )
1577 return wxCONV_FAILED;
1578
1579 if ( numChars == 2 )
1580 inBuff++;
1581
1582 if ( ++outLen > dstLen )
1583 return wxCONV_FAILED;
1584
1585 *dst++ = ch;
1586 }
1587
1588
1589 return outLen;
1590 }
1591
1592 size_t
1593 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1594 const wchar_t *src, size_t srcLen) const
1595 {
1596 if ( srcLen == wxNO_LEN )
1597 srcLen = wxWcslen(src) + 1;
1598
1599 size_t outLen = 0;
1600 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1601 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1602 {
1603 wxUint16 cc[2];
1604 const size_t numChars = encode_utf16(*src, cc);
1605 if ( numChars == wxCONV_FAILED )
1606 return wxCONV_FAILED;
1607
1608 outLen += numChars * BYTES_PER_CHAR;
1609 if ( outBuff )
1610 {
1611 if ( outLen > dstLen )
1612 return wxCONV_FAILED;
1613
1614 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1615 if ( numChars == 2 )
1616 {
1617 // second character of a surrogate
1618 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1619 }
1620 }
1621 }
1622
1623 return outLen;
1624 }
1625
1626 #endif // WC_UTF16/!WC_UTF16
1627
1628
1629 // ============================================================================
1630 // UTF-32
1631 // ============================================================================
1632
1633 #ifdef WORDS_BIGENDIAN
1634 #define wxMBConvUTF32straight wxMBConvUTF32BE
1635 #define wxMBConvUTF32swap wxMBConvUTF32LE
1636 #else
1637 #define wxMBConvUTF32swap wxMBConvUTF32BE
1638 #define wxMBConvUTF32straight wxMBConvUTF32LE
1639 #endif
1640
1641
1642 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1643 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1644
1645 /* static */
1646 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1647 {
1648 if ( srcLen == wxNO_LEN )
1649 {
1650 // count the number of bytes in input, including the trailing NULs
1651 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1652 for ( srcLen = 1; *inBuff++; srcLen++ )
1653 ;
1654
1655 srcLen *= BYTES_PER_CHAR;
1656 }
1657 else // we already have the length
1658 {
1659 // we can only convert an entire number of UTF-32 characters
1660 if ( srcLen % BYTES_PER_CHAR )
1661 return wxCONV_FAILED;
1662 }
1663
1664 return srcLen;
1665 }
1666
1667 // case when in-memory representation is UTF-16
1668 #ifdef WC_UTF16
1669
1670 // ----------------------------------------------------------------------------
1671 // conversions without endianness change
1672 // ----------------------------------------------------------------------------
1673
1674 size_t
1675 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1676 const char *src, size_t srcLen) const
1677 {
1678 srcLen = GetLength(src, srcLen);
1679 if ( srcLen == wxNO_LEN )
1680 return wxCONV_FAILED;
1681
1682 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1683 const size_t inLen = srcLen / BYTES_PER_CHAR;
1684 size_t outLen = 0;
1685 for ( size_t n = 0; n < inLen; n++ )
1686 {
1687 wxUint16 cc[2];
1688 const size_t numChars = encode_utf16(*inBuff++, cc);
1689 if ( numChars == wxCONV_FAILED )
1690 return wxCONV_FAILED;
1691
1692 outLen += numChars;
1693 if ( dst )
1694 {
1695 if ( outLen > dstLen )
1696 return wxCONV_FAILED;
1697
1698 *dst++ = cc[0];
1699 if ( numChars == 2 )
1700 {
1701 // second character of a surrogate
1702 *dst++ = cc[1];
1703 }
1704 }
1705 }
1706
1707 return outLen;
1708 }
1709
1710 size_t
1711 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1712 const wchar_t *src, size_t srcLen) const
1713 {
1714 if ( srcLen == wxNO_LEN )
1715 srcLen = wxWcslen(src) + 1;
1716
1717 if ( !dst )
1718 {
1719 // optimization: return maximal space which could be needed for this
1720 // string instead of the exact amount which could be less if there are
1721 // any surrogates in the input
1722 //
1723 // we consider that surrogates are rare enough to make it worthwhile to
1724 // avoid running the loop below at the cost of slightly extra memory
1725 // consumption
1726 return srcLen * BYTES_PER_CHAR;
1727 }
1728
1729 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1730 size_t outLen = 0;
1731 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1732 {
1733 const wxUint32 ch = wxDecodeSurrogate(&src);
1734 if ( !src )
1735 return wxCONV_FAILED;
1736
1737 outLen += BYTES_PER_CHAR;
1738
1739 if ( outLen > dstLen )
1740 return wxCONV_FAILED;
1741
1742 *outBuff++ = ch;
1743 }
1744
1745 return outLen;
1746 }
1747
1748 // ----------------------------------------------------------------------------
1749 // endian-reversing conversions
1750 // ----------------------------------------------------------------------------
1751
1752 size_t
1753 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1754 const char *src, size_t srcLen) const
1755 {
1756 srcLen = GetLength(src, srcLen);
1757 if ( srcLen == wxNO_LEN )
1758 return wxCONV_FAILED;
1759
1760 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1761 const size_t inLen = srcLen / BYTES_PER_CHAR;
1762 size_t outLen = 0;
1763 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1764 {
1765 wxUint16 cc[2];
1766 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1767 if ( numChars == wxCONV_FAILED )
1768 return wxCONV_FAILED;
1769
1770 outLen += numChars;
1771 if ( dst )
1772 {
1773 if ( outLen > dstLen )
1774 return wxCONV_FAILED;
1775
1776 *dst++ = cc[0];
1777 if ( numChars == 2 )
1778 {
1779 // second character of a surrogate
1780 *dst++ = cc[1];
1781 }
1782 }
1783 }
1784
1785 return outLen;
1786 }
1787
1788 size_t
1789 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1790 const wchar_t *src, size_t srcLen) const
1791 {
1792 if ( srcLen == wxNO_LEN )
1793 srcLen = wxWcslen(src) + 1;
1794
1795 if ( !dst )
1796 {
1797 // optimization: return maximal space which could be needed for this
1798 // string instead of the exact amount which could be less if there are
1799 // any surrogates in the input
1800 //
1801 // we consider that surrogates are rare enough to make it worthwhile to
1802 // avoid running the loop below at the cost of slightly extra memory
1803 // consumption
1804 return srcLen*BYTES_PER_CHAR;
1805 }
1806
1807 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1808 size_t outLen = 0;
1809 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1810 {
1811 const wxUint32 ch = wxDecodeSurrogate(&src);
1812 if ( !src )
1813 return wxCONV_FAILED;
1814
1815 outLen += BYTES_PER_CHAR;
1816
1817 if ( outLen > dstLen )
1818 return wxCONV_FAILED;
1819
1820 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1821 }
1822
1823 return outLen;
1824 }
1825
1826 #else // !WC_UTF16: wchar_t is UTF-32
1827
1828 // ----------------------------------------------------------------------------
1829 // conversions without endianness change
1830 // ----------------------------------------------------------------------------
1831
1832 size_t
1833 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1834 const char *src, size_t srcLen) const
1835 {
1836 // use memcpy() as it should be much faster than hand-written loop
1837 srcLen = GetLength(src, srcLen);
1838 if ( srcLen == wxNO_LEN )
1839 return wxCONV_FAILED;
1840
1841 const size_t inLen = srcLen/BYTES_PER_CHAR;
1842 if ( dst )
1843 {
1844 if ( dstLen < inLen )
1845 return wxCONV_FAILED;
1846
1847 memcpy(dst, src, srcLen);
1848 }
1849
1850 return inLen;
1851 }
1852
1853 size_t
1854 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1855 const wchar_t *src, size_t srcLen) const
1856 {
1857 if ( srcLen == wxNO_LEN )
1858 srcLen = wxWcslen(src) + 1;
1859
1860 srcLen *= BYTES_PER_CHAR;
1861
1862 if ( dst )
1863 {
1864 if ( dstLen < srcLen )
1865 return wxCONV_FAILED;
1866
1867 memcpy(dst, src, srcLen);
1868 }
1869
1870 return srcLen;
1871 }
1872
1873 // ----------------------------------------------------------------------------
1874 // endian-reversing conversions
1875 // ----------------------------------------------------------------------------
1876
1877 size_t
1878 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1879 const char *src, size_t srcLen) const
1880 {
1881 srcLen = GetLength(src, srcLen);
1882 if ( srcLen == wxNO_LEN )
1883 return wxCONV_FAILED;
1884
1885 srcLen /= BYTES_PER_CHAR;
1886
1887 if ( dst )
1888 {
1889 if ( dstLen < srcLen )
1890 return wxCONV_FAILED;
1891
1892 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1893 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1894 {
1895 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1896 }
1897 }
1898
1899 return srcLen;
1900 }
1901
1902 size_t
1903 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1904 const wchar_t *src, size_t srcLen) const
1905 {
1906 if ( srcLen == wxNO_LEN )
1907 srcLen = wxWcslen(src) + 1;
1908
1909 srcLen *= BYTES_PER_CHAR;
1910
1911 if ( dst )
1912 {
1913 if ( dstLen < srcLen )
1914 return wxCONV_FAILED;
1915
1916 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1917 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1918 {
1919 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1920 }
1921 }
1922
1923 return srcLen;
1924 }
1925
1926 #endif // WC_UTF16/!WC_UTF16
1927
1928
1929 // ============================================================================
1930 // The classes doing conversion using the iconv_xxx() functions
1931 // ============================================================================
1932
1933 #ifdef HAVE_ICONV
1934
// ICONV_FAILED(cres, bufLeft): test whether an iconv() call returning cres
// and leaving bufLeft bytes of input unconverted has failed.
//
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// The macro arguments are parenthesized so that arbitrary expressions can be
// passed for them.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type used by iconv() for
// its second argument (ICONV_CONST is defined outside of this file section)
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value returned by iconv_open() on failure
#define ICONV_T_INVALID ((iconv_t)-1)
1953
1954 #if SIZEOF_WCHAR_T == 4
1955 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1956 #define WC_ENC wxFONTENCODING_UTF32
1957 #elif SIZEOF_WCHAR_T == 2
1958 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1959 #define WC_ENC wxFONTENCODING_UTF16
1960 #else // sizeof(wchar_t) != 2 nor 4
1961 // does this ever happen?
1962 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1963 #endif
1964
1965 // ----------------------------------------------------------------------------
1966 // wxMBConv_iconv: encapsulates an iconv character set
1967 // ----------------------------------------------------------------------------
1968
1969 class wxMBConv_iconv : public wxMBConv
1970 {
1971 public:
1972 wxMBConv_iconv(const char *name);
1973 virtual ~wxMBConv_iconv();
1974
1975 // implement base class virtual methods
1976 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1977 const char *src, size_t srcLen = wxNO_LEN) const;
1978 virtual size_t FromWChar(char *dst, size_t dstLen,
1979 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1980 virtual size_t GetMBNulLen() const;
1981
1982 #if wxUSE_UNICODE_UTF8
1983 virtual bool IsUTF8() const;
1984 #endif
1985
1986 virtual wxMBConv *Clone() const
1987 {
1988 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1989 p->m_minMBCharWidth = m_minMBCharWidth;
1990 return p;
1991 }
1992
1993 bool IsOk() const
1994 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1995
1996 protected:
1997 // the iconv handlers used to translate from multibyte
1998 // to wide char and in the other direction
1999 iconv_t m2w,
2000 w2m;
2001
2002 #if wxUSE_THREADS
2003 // guards access to m2w and w2m objects
2004 wxMutex m_iconvMutex;
2005 #endif
2006
2007 private:
2008 // the name (for iconv_open()) of a wide char charset -- if none is
2009 // available on this machine, it will remain NULL
2010 static wxString ms_wcCharsetName;
2011
2012 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2013 // different endian-ness than the native one
2014 static bool ms_wcNeedsSwap;
2015
2016
2017 // name of the encoding handled by this conversion
2018 wxString m_name;
2019
2020 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2021 // initially
2022 size_t m_minMBCharWidth;
2023 };
2024
2025 // make the constructor available for unit testing
2026 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2027 {
2028 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2029 if ( !result->IsOk() )
2030 {
2031 delete result;
2032 return 0;
2033 }
2034
2035 return result;
2036 }
2037
2038 wxString wxMBConv_iconv::ms_wcCharsetName;
2039 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2040
2041 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2042 : m_name(name)
2043 {
2044 m_minMBCharWidth = 0;
2045
2046 // check for charset that represents wchar_t:
2047 if ( ms_wcCharsetName.empty() )
2048 {
2049 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2050
2051 #if wxUSE_FONTMAP
2052 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2053 #else // !wxUSE_FONTMAP
2054 static const wxChar *names_static[] =
2055 {
2056 #if SIZEOF_WCHAR_T == 4
2057 _T("UCS-4"),
2058 #elif SIZEOF_WCHAR_T = 2
2059 _T("UCS-2"),
2060 #endif
2061 NULL
2062 };
2063 const wxChar **names = names_static;
2064 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2065
2066 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2067 {
2068 const wxString nameCS(*names);
2069
2070 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2071 wxString nameXE(nameCS);
2072
2073 #ifdef WORDS_BIGENDIAN
2074 nameXE += _T("BE");
2075 #else // little endian
2076 nameXE += _T("LE");
2077 #endif
2078
2079 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2080 nameXE.c_str());
2081
2082 m2w = iconv_open(nameXE.ToAscii(), name);
2083 if ( m2w == ICONV_T_INVALID )
2084 {
2085 // try charset w/o bytesex info (e.g. "UCS4")
2086 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2087 nameCS.c_str());
2088 m2w = iconv_open(nameCS.ToAscii(), name);
2089
2090 // and check for bytesex ourselves:
2091 if ( m2w != ICONV_T_INVALID )
2092 {
2093 char buf[2], *bufPtr;
2094 wchar_t wbuf[2];
2095 size_t insz, outsz;
2096 size_t res;
2097
2098 buf[0] = 'A';
2099 buf[1] = 0;
2100 wbuf[0] = 0;
2101 insz = 2;
2102 outsz = SIZEOF_WCHAR_T * 2;
2103 char* wbufPtr = (char*)wbuf;
2104 bufPtr = buf;
2105
2106 res = iconv(
2107 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2108 &wbufPtr, &outsz);
2109
2110 if (ICONV_FAILED(res, insz))
2111 {
2112 wxLogLastError(wxT("iconv"));
2113 wxLogError(_("Conversion to charset '%s' doesn't work."),
2114 nameCS.c_str());
2115 }
2116 else // ok, can convert to this encoding, remember it
2117 {
2118 ms_wcCharsetName = nameCS;
2119 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2120 }
2121 }
2122 }
2123 else // use charset not requiring byte swapping
2124 {
2125 ms_wcCharsetName = nameXE;
2126 }
2127 }
2128
2129 wxLogTrace(TRACE_STRCONV,
2130 wxT("iconv wchar_t charset is \"%s\"%s"),
2131 ms_wcCharsetName.empty() ? wxString("<none>")
2132 : ms_wcCharsetName,
2133 ms_wcNeedsSwap ? _T(" (needs swap)")
2134 : _T(""));
2135 }
2136 else // we already have ms_wcCharsetName
2137 {
2138 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2139 }
2140
2141 if ( ms_wcCharsetName.empty() )
2142 {
2143 w2m = ICONV_T_INVALID;
2144 }
2145 else
2146 {
2147 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2148 if ( w2m == ICONV_T_INVALID )
2149 {
2150 wxLogTrace(TRACE_STRCONV,
2151 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2152 ms_wcCharsetName.c_str(), name);
2153 }
2154 }
2155 }
2156
2157 wxMBConv_iconv::~wxMBConv_iconv()
2158 {
2159 if ( m2w != ICONV_T_INVALID )
2160 iconv_close(m2w);
2161 if ( w2m != ICONV_T_INVALID )
2162 iconv_close(w2m);
2163 }
2164
2165 size_t
2166 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2167 const char *src, size_t srcLen) const
2168 {
2169 if ( srcLen == wxNO_LEN )
2170 {
2171 // find the string length: notice that must be done differently for
2172 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2173 // consecutive NULs
2174 const size_t nulLen = GetMBNulLen();
2175 switch ( nulLen )
2176 {
2177 default:
2178 return wxCONV_FAILED;
2179
2180 case 1:
2181 srcLen = strlen(src); // arguably more optimized than our version
2182 break;
2183
2184 case 2:
2185 case 4:
2186 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2187 // but they also have to start at character boundary and not
2188 // span two adjacent characters
2189 const char *p;
2190 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2191 ;
2192 srcLen = p - src;
2193 break;
2194 }
2195
2196 // when we're determining the length of the string ourselves we count
2197 // the terminating NUL(s) as part of it and always NUL-terminate the
2198 // output
2199 srcLen += nulLen;
2200 }
2201
2202 // we express length in the number of (wide) characters but iconv always
2203 // counts buffer sizes it in bytes
2204 dstLen *= SIZEOF_WCHAR_T;
2205
2206 #if wxUSE_THREADS
2207 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2208 // Unfortunately there are a couple of global wxCSConv objects such as
2209 // wxConvLocal that are used all over wx code, so we have to make sure
2210 // the handle is used by at most one thread at the time. Otherwise
2211 // only a few wx classes would be safe to use from non-main threads
2212 // as MB<->WC conversion would fail "randomly".
2213 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2214 #endif // wxUSE_THREADS
2215
2216 size_t res, cres;
2217 const char *pszPtr = src;
2218
2219 if ( dst )
2220 {
2221 char* bufPtr = (char*)dst;
2222
2223 // have destination buffer, convert there
2224 size_t dstLenOrig = dstLen;
2225 cres = iconv(m2w,
2226 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2227 &bufPtr, &dstLen);
2228
2229 // convert the number of bytes converted as returned by iconv to the
2230 // number of (wide) characters converted that we need
2231 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2232
2233 if (ms_wcNeedsSwap)
2234 {
2235 // convert to native endianness
2236 for ( unsigned i = 0; i < res; i++ )
2237 dst[i] = WC_BSWAP(dst[i]);
2238 }
2239 }
2240 else // no destination buffer
2241 {
2242 // convert using temp buffer to calculate the size of the buffer needed
2243 wchar_t tbuf[8];
2244 res = 0;
2245
2246 do
2247 {
2248 char* bufPtr = (char*)tbuf;
2249 dstLen = 8 * SIZEOF_WCHAR_T;
2250
2251 cres = iconv(m2w,
2252 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2253 &bufPtr, &dstLen );
2254
2255 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2256 }
2257 while ((cres == (size_t)-1) && (errno == E2BIG));
2258 }
2259
2260 if (ICONV_FAILED(cres, srcLen))
2261 {
2262 //VS: it is ok if iconv fails, hence trace only
2263 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2264 return wxCONV_FAILED;
2265 }
2266
2267 return res;
2268 }
2269
2270 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2271 const wchar_t *src, size_t srcLen) const
2272 {
2273 #if wxUSE_THREADS
2274 // NB: explained in MB2WC
2275 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2276 #endif
2277
2278 if ( srcLen == wxNO_LEN )
2279 srcLen = wxWcslen(src) + 1;
2280
2281 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2282 size_t outbuflen = dstLen;
2283 size_t res, cres;
2284
2285 wchar_t *tmpbuf = 0;
2286
2287 if (ms_wcNeedsSwap)
2288 {
2289 // need to copy to temp buffer to switch endianness
2290 // (doing WC_BSWAP twice on the original buffer won't help, as it
2291 // could be in read-only memory, or be accessed in some other thread)
2292 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2293 for ( size_t i = 0; i < srcLen; i++ )
2294 tmpbuf[i] = WC_BSWAP(src[i]);
2295
2296 tmpbuf[srcLen] = L'\0';
2297 src = tmpbuf;
2298 }
2299
2300 char* inbuf = (char*)src;
2301 if ( dst )
2302 {
2303 // have destination buffer, convert there
2304 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2305
2306 res = dstLen - outbuflen;
2307 }
2308 else // no destination buffer
2309 {
2310 // convert using temp buffer to calculate the size of the buffer needed
2311 char tbuf[16];
2312 res = 0;
2313 do
2314 {
2315 dst = tbuf;
2316 outbuflen = 16;
2317
2318 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2319
2320 res += 16 - outbuflen;
2321 }
2322 while ((cres == (size_t)-1) && (errno == E2BIG));
2323 }
2324
2325 if (ms_wcNeedsSwap)
2326 {
2327 free(tmpbuf);
2328 }
2329
2330 if (ICONV_FAILED(cres, inbuflen))
2331 {
2332 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2333 return wxCONV_FAILED;
2334 }
2335
2336 return res;
2337 }
2338
2339 size_t wxMBConv_iconv::GetMBNulLen() const
2340 {
2341 if ( m_minMBCharWidth == 0 )
2342 {
2343 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2344
2345 #if wxUSE_THREADS
2346 // NB: explained in MB2WC
2347 wxMutexLocker lock(self->m_iconvMutex);
2348 #endif
2349
2350 const wchar_t *wnul = L"";
2351 char buf[8]; // should be enough for NUL in any encoding
2352 size_t inLen = sizeof(wchar_t),
2353 outLen = WXSIZEOF(buf);
2354 char *inBuff = (char *)wnul;
2355 char *outBuff = buf;
2356 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2357 {
2358 self->m_minMBCharWidth = (size_t)-1;
2359 }
2360 else // ok
2361 {
2362 self->m_minMBCharWidth = outBuff - buf;
2363 }
2364 }
2365
2366 return m_minMBCharWidth;
2367 }
2368
#if wxUSE_UNICODE_UTF8
// Return true if the name of the charset handled by this converter is
// "UTF-8" or "UTF8", compared case-insensitively.
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2376
2377 #endif // HAVE_ICONV
2378
2379
2380 // ============================================================================
2381 // Win32 conversion classes
2382 // ============================================================================
2383
2384 #ifdef wxHAVE_WIN32_MB2WC
2385
2386 // from utils.cpp
2387 #if wxUSE_FONTMAP
2388 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2389 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2390 #endif
2391
2392 class wxMBConv_win32 : public wxMBConv
2393 {
2394 public:
2395 wxMBConv_win32()
2396 {
2397 m_CodePage = CP_ACP;
2398 m_minMBCharWidth = 0;
2399 }
2400
2401 wxMBConv_win32(const wxMBConv_win32& conv)
2402 : wxMBConv()
2403 {
2404 m_CodePage = conv.m_CodePage;
2405 m_minMBCharWidth = conv.m_minMBCharWidth;
2406 }
2407
2408 #if wxUSE_FONTMAP
2409 wxMBConv_win32(const char* name)
2410 {
2411 m_CodePage = wxCharsetToCodepage(name);
2412 m_minMBCharWidth = 0;
2413 }
2414
2415 wxMBConv_win32(wxFontEncoding encoding)
2416 {
2417 m_CodePage = wxEncodingToCodepage(encoding);
2418 m_minMBCharWidth = 0;
2419 }
2420 #endif // wxUSE_FONTMAP
2421
2422 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2423 {
2424 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2425 // the behaviour is not compatible with the Unix version (using iconv)
2426 // and break the library itself, e.g. wxTextInputStream::NextChar()
2427 // wouldn't work if reading an incomplete MB char didn't result in an
2428 // error
2429 //
2430 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2431 // Win XP or newer and it is not supported for UTF-[78] so we always
2432 // use our own conversions in this case. See
2433 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2434 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2435 if ( m_CodePage == CP_UTF8 )
2436 {
2437 return wxMBConvUTF8().MB2WC(buf, psz, n);
2438 }
2439
2440 if ( m_CodePage == CP_UTF7 )
2441 {
2442 return wxMBConvUTF7().MB2WC(buf, psz, n);
2443 }
2444
2445 int flags = 0;
2446 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2447 IsAtLeastWin2kSP4() )
2448 {
2449 flags = MB_ERR_INVALID_CHARS;
2450 }
2451
2452 const size_t len = ::MultiByteToWideChar
2453 (
2454 m_CodePage, // code page
2455 flags, // flags: fall on error
2456 psz, // input string
2457 -1, // its length (NUL-terminated)
2458 buf, // output string
2459 buf ? n : 0 // size of output buffer
2460 );
2461 if ( !len )
2462 {
2463 // function totally failed
2464 return wxCONV_FAILED;
2465 }
2466
2467 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2468 // check if we succeeded, by doing a double trip:
2469 if ( !flags && buf )
2470 {
2471 const size_t mbLen = strlen(psz);
2472 wxCharBuffer mbBuf(mbLen);
2473 if ( ::WideCharToMultiByte
2474 (
2475 m_CodePage,
2476 0,
2477 buf,
2478 -1,
2479 mbBuf.data(),
2480 mbLen + 1, // size in bytes, not length
2481 NULL,
2482 NULL
2483 ) == 0 ||
2484 strcmp(mbBuf, psz) != 0 )
2485 {
2486 // we didn't obtain the same thing we started from, hence
2487 // the conversion was lossy and we consider that it failed
2488 return wxCONV_FAILED;
2489 }
2490 }
2491
2492 // note that it returns count of written chars for buf != NULL and size
2493 // of the needed buffer for buf == NULL so in either case the length of
2494 // the string (which never includes the terminating NUL) is one less
2495 return len - 1;
2496 }
2497
2498 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2499 {
2500 /*
2501 we have a problem here: by default, WideCharToMultiByte() may
2502 replace characters unrepresentable in the target code page with bad
2503 quality approximations such as turning "1/2" symbol (U+00BD) into
2504 "1" for the code pages which don't have it and we, obviously, want
2505 to avoid this at any price
2506
2507 the trouble is that this function does it _silently_, i.e. it won't
2508 even tell us whether it did or not... Win98/2000 and higher provide
2509 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2510 we have to resort to a round trip, i.e. check that converting back
2511 results in the same string -- this is, of course, expensive but
2512 otherwise we simply can't be sure to not garble the data.
2513 */
2514
2515 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2516 // it doesn't work with CJK encodings (which we test for rather roughly
2517 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2518 // supporting it
2519 BOOL usedDef wxDUMMY_INITIALIZE(false);
2520 BOOL *pUsedDef;
2521 int flags;
2522 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2523 {
2524 // it's our lucky day
2525 flags = WC_NO_BEST_FIT_CHARS;
2526 pUsedDef = &usedDef;
2527 }
2528 else // old system or unsupported encoding
2529 {
2530 flags = 0;
2531 pUsedDef = NULL;
2532 }
2533
2534 const size_t len = ::WideCharToMultiByte
2535 (
2536 m_CodePage, // code page
2537 flags, // either none or no best fit
2538 pwz, // input string
2539 -1, // it is (wide) NUL-terminated
2540 buf, // output buffer
2541 buf ? n : 0, // and its size
2542 NULL, // default "replacement" char
2543 pUsedDef // [out] was it used?
2544 );
2545
2546 if ( !len )
2547 {
2548 // function totally failed
2549 return wxCONV_FAILED;
2550 }
2551
2552 // we did something, check if we really succeeded
2553 if ( flags )
2554 {
2555 // check if the conversion failed, i.e. if any replacements
2556 // were done
2557 if ( usedDef )
2558 return wxCONV_FAILED;
2559 }
2560 else // we must resort to double tripping...
2561 {
2562 // first we need to ensure that we really have the MB data: this is
2563 // not the case if we're called with NULL buffer, in which case we
2564 // need to do the conversion yet again
2565 wxCharBuffer bufDef;
2566 if ( !buf )
2567 {
2568 bufDef = wxCharBuffer(len);
2569 buf = bufDef.data();
2570 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2571 buf, len, NULL, NULL) )
2572 return wxCONV_FAILED;
2573 }
2574
2575 if ( !n )
2576 n = wcslen(pwz);
2577 wxWCharBuffer wcBuf(n);
2578 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2579 wcscmp(wcBuf, pwz) != 0 )
2580 {
2581 // we didn't obtain the same thing we started from, hence
2582 // the conversion was lossy and we consider that it failed
2583 return wxCONV_FAILED;
2584 }
2585 }
2586
2587 // see the comment above for the reason of "len - 1"
2588 return len - 1;
2589 }
2590
2591 virtual size_t GetMBNulLen() const
2592 {
2593 if ( m_minMBCharWidth == 0 )
2594 {
2595 int len = ::WideCharToMultiByte
2596 (
2597 m_CodePage, // code page
2598 0, // no flags
2599 L"", // input string
2600 1, // translate just the NUL
2601 NULL, // output buffer
2602 0, // and its size
2603 NULL, // no replacement char
2604 NULL // [out] don't care if it was used
2605 );
2606
2607 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2608 switch ( len )
2609 {
2610 default:
2611 wxLogDebug(_T("Unexpected NUL length %d"), len);
2612 self->m_minMBCharWidth = (size_t)-1;
2613 break;
2614
2615 case 0:
2616 self->m_minMBCharWidth = (size_t)-1;
2617 break;
2618
2619 case 1:
2620 case 2:
2621 case 4:
2622 self->m_minMBCharWidth = len;
2623 break;
2624 }
2625 }
2626
2627 return m_minMBCharWidth;
2628 }
2629
2630 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2631
2632 bool IsOk() const { return m_CodePage != -1; }
2633
2634 private:
2635 static bool CanUseNoBestFit()
2636 {
2637 static int s_isWin98Or2k = -1;
2638
2639 if ( s_isWin98Or2k == -1 )
2640 {
2641 int verMaj, verMin;
2642 switch ( wxGetOsVersion(&verMaj, &verMin) )
2643 {
2644 case wxOS_WINDOWS_9X:
2645 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2646 break;
2647
2648 case wxOS_WINDOWS_NT:
2649 s_isWin98Or2k = verMaj >= 5;
2650 break;
2651
2652 default:
2653 // unknown: be conservative by default
2654 s_isWin98Or2k = 0;
2655 break;
2656 }
2657
2658 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2659 }
2660
2661 return s_isWin98Or2k == 1;
2662 }
2663
2664 static bool IsAtLeastWin2kSP4()
2665 {
2666 #ifdef __WXWINCE__
2667 return false;
2668 #else
2669 static int s_isAtLeastWin2kSP4 = -1;
2670
2671 if ( s_isAtLeastWin2kSP4 == -1 )
2672 {
2673 OSVERSIONINFOEX ver;
2674
2675 memset(&ver, 0, sizeof(ver));
2676 ver.dwOSVersionInfoSize = sizeof(ver);
2677 GetVersionEx((OSVERSIONINFO*)&ver);
2678
2679 s_isAtLeastWin2kSP4 =
2680 ((ver.dwMajorVersion > 5) || // Vista+
2681 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2682 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2683 ver.wServicePackMajor >= 4)) // 2000 SP4+
2684 ? 1 : 0;
2685 }
2686
2687 return s_isAtLeastWin2kSP4 == 1;
2688 #endif
2689 }
2690
2691
2692 // the code page we're working with
2693 long m_CodePage;
2694
2695 // cached result of GetMBNulLen(), set to 0 initially meaning
2696 // "unknown"
2697 size_t m_minMBCharWidth;
2698 };
2699
2700 #endif // wxHAVE_WIN32_MB2WC
2701
2702
2703 // ============================================================================
2704 // wxEncodingConverter based conversion classes
2705 // ============================================================================
2706
2707 #if wxUSE_FONTMAP
2708
2709 class wxMBConv_wxwin : public wxMBConv
2710 {
2711 private:
2712 void Init()
2713 {
2714 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2715 // The wxMBConv_cf class does a better job.
2716 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2717 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2718 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2719 }
2720
2721 public:
2722 // temporarily just use wxEncodingConverter stuff,
2723 // so that it works while a better implementation is built
2724 wxMBConv_wxwin(const char* name)
2725 {
2726 if (name)
2727 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2728 else
2729 m_enc = wxFONTENCODING_SYSTEM;
2730
2731 Init();
2732 }
2733
2734 wxMBConv_wxwin(wxFontEncoding enc)
2735 {
2736 m_enc = enc;
2737
2738 Init();
2739 }
2740
2741 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2742 {
2743 size_t inbuf = strlen(psz);
2744 if (buf)
2745 {
2746 if (!m2w.Convert(psz, buf))
2747 return wxCONV_FAILED;
2748 }
2749 return inbuf;
2750 }
2751
2752 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2753 {
2754 const size_t inbuf = wxWcslen(psz);
2755 if (buf)
2756 {
2757 if (!w2m.Convert(psz, buf))
2758 return wxCONV_FAILED;
2759 }
2760
2761 return inbuf;
2762 }
2763
2764 virtual size_t GetMBNulLen() const
2765 {
2766 switch ( m_enc )
2767 {
2768 case wxFONTENCODING_UTF16BE:
2769 case wxFONTENCODING_UTF16LE:
2770 return 2;
2771
2772 case wxFONTENCODING_UTF32BE:
2773 case wxFONTENCODING_UTF32LE:
2774 return 4;
2775
2776 default:
2777 return 1;
2778 }
2779 }
2780
2781 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2782
2783 bool IsOk() const { return m_ok; }
2784
2785 public:
2786 wxFontEncoding m_enc;
2787 wxEncodingConverter m2w, w2m;
2788
2789 private:
2790 // were we initialized successfully?
2791 bool m_ok;
2792
2793 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2794 };
2795
2796 // make the constructors available for unit testing
2797 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2798 {
2799 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2800 if ( !result->IsOk() )
2801 {
2802 delete result;
2803 return 0;
2804 }
2805
2806 return result;
2807 }
2808
2809 #endif // wxUSE_FONTMAP
2810
2811 // ============================================================================
2812 // wxCSConv implementation
2813 // ============================================================================
2814
2815 void wxCSConv::Init()
2816 {
2817 m_name = NULL;
2818 m_convReal = NULL;
2819 m_deferred = true;
2820 }
2821
2822 wxCSConv::wxCSConv(const wxString& charset)
2823 {
2824 Init();
2825
2826 if ( !charset.empty() )
2827 {
2828 SetName(charset.ToAscii());
2829 }
2830
2831 #if wxUSE_FONTMAP
2832 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2833 #else
2834 m_encoding = wxFONTENCODING_SYSTEM;
2835 #endif
2836 }
2837
2838 wxCSConv::wxCSConv(wxFontEncoding encoding)
2839 {
2840 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2841 {
2842 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2843
2844 encoding = wxFONTENCODING_SYSTEM;
2845 }
2846
2847 Init();
2848
2849 m_encoding = encoding;
2850 }
2851
2852 wxCSConv::~wxCSConv()
2853 {
2854 Clear();
2855 }
2856
2857 wxCSConv::wxCSConv(const wxCSConv& conv)
2858 : wxMBConv()
2859 {
2860 Init();
2861
2862 SetName(conv.m_name);
2863 m_encoding = conv.m_encoding;
2864 }
2865
2866 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2867 {
2868 Clear();
2869
2870 SetName(conv.m_name);
2871 m_encoding = conv.m_encoding;
2872
2873 return *this;
2874 }
2875
2876 void wxCSConv::Clear()
2877 {
2878 free(m_name);
2879 delete m_convReal;
2880
2881 m_name = NULL;
2882 m_convReal = NULL;
2883 }
2884
2885 void wxCSConv::SetName(const char *charset)
2886 {
2887 if (charset)
2888 {
2889 m_name = wxStrdup(charset);
2890 m_deferred = true;
2891 }
2892 }
2893
2894 #if wxUSE_FONTMAP
2895
// Hash map type associating a charset name (wxString) with a wxFontEncoding.
2896 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2897 wxEncodingNameCache );
2898
// Cache filled by wxCSConv::DoCreate() when using iconv: for an encoding it
// holds the charset name iconv accepted or an empty string if all the names
// tried for it were rejected.
2899 static wxEncodingNameCache gs_nameCache;
2900 #endif
2901
2902 wxMBConv *wxCSConv::DoCreate() const
2903 {
2904 #if wxUSE_FONTMAP
2905 wxLogTrace(TRACE_STRCONV,
2906 wxT("creating conversion for %s"),
2907 (m_name ? m_name
2908 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2909 #endif // wxUSE_FONTMAP
2910
2911 // check for the special case of ASCII or ISO8859-1 charset: as we have
2912 // special knowledge of it anyhow, we don't need to create a special
2913 // conversion object
2914 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2915 m_encoding == wxFONTENCODING_DEFAULT )
2916 {
2917 // don't convert at all
2918 return NULL;
2919 }
2920
2921 // we trust OS to do conversion better than we can so try external
2922 // conversion methods first
2923 //
2924 // the full order is:
2925 // 1. OS conversion (iconv() under Unix or Win32 API)
2926 // 2. hard coded conversions for UTF
2927 // 3. wxEncodingConverter as fall back
2928
2929 // step (1)
2930 #ifdef HAVE_ICONV
2931 #if !wxUSE_FONTMAP
2932 if ( m_name )
2933 #endif // !wxUSE_FONTMAP
2934 {
2935 #if wxUSE_FONTMAP
2936 wxFontEncoding encoding(m_encoding);
2937 #endif
2938
2939 if ( m_name )
2940 {
2941 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2942 if ( conv->IsOk() )
2943 return conv;
2944
2945 delete conv;
2946
2947 #if wxUSE_FONTMAP
2948 encoding =
2949 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2950 #endif // wxUSE_FONTMAP
2951 }
2952 #if wxUSE_FONTMAP
2953 {
2954 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2955 if ( it != gs_nameCache.end() )
2956 {
2957 if ( it->second.empty() )
2958 return NULL;
2959
2960 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2961 if ( conv->IsOk() )
2962 return conv;
2963
2964 delete conv;
2965 }
2966
2967 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2968 // CS : in case this does not return valid names (eg for MacRoman)
2969 // encoding got a 'failure' entry in the cache all the same,
2970 // although it just has to be created using a different method, so
2971 // only store failed iconv creation attempts (or perhaps we
2972 // shoulnd't do this at all ?)
2973 if ( names[0] != NULL )
2974 {
2975 for ( ; *names; ++names )
2976 {
2977 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2978 // will need changes that will obsolete this
2979 wxString name(*names);
2980 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2981 if ( conv->IsOk() )
2982 {
2983 gs_nameCache[encoding] = *names;
2984 return conv;
2985 }
2986
2987 delete conv;
2988 }
2989
2990 gs_nameCache[encoding] = _T(""); // cache the failure
2991 }
2992 }
2993 #endif // wxUSE_FONTMAP
2994 }
2995 #endif // HAVE_ICONV
2996
2997 #ifdef wxHAVE_WIN32_MB2WC
2998 {
2999 #if wxUSE_FONTMAP
3000 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3001 : new wxMBConv_win32(m_encoding);
3002 if ( conv->IsOk() )
3003 return conv;
3004
3005 delete conv;
3006 #else
3007 return NULL;
3008 #endif
3009 }
3010 #endif // wxHAVE_WIN32_MB2WC
3011
3012 #ifdef __DARWIN__
3013 {
3014 // leave UTF16 and UTF32 to the built-ins of wx
3015 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3016 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3017 {
3018 #if wxUSE_FONTMAP
3019 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3020 : new wxMBConv_cf(m_encoding);
3021 #else
3022 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3023 #endif
3024
3025 if ( conv->IsOk() )
3026 return conv;
3027
3028 delete conv;
3029 }
3030 }
3031 #endif // __DARWIN__
3032
3033 // step (2)
3034 wxFontEncoding enc = m_encoding;
3035 #if wxUSE_FONTMAP
3036 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3037 {
3038 // use "false" to suppress interactive dialogs -- we can be called from
3039 // anywhere and popping up a dialog from here is the last thing we want to
3040 // do
3041 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3042 }
3043 #endif // wxUSE_FONTMAP
3044
3045 switch ( enc )
3046 {
3047 case wxFONTENCODING_UTF7:
3048 return new wxMBConvUTF7;
3049
3050 case wxFONTENCODING_UTF8:
3051 return new wxMBConvUTF8;
3052
3053 case wxFONTENCODING_UTF16BE:
3054 return new wxMBConvUTF16BE;
3055
3056 case wxFONTENCODING_UTF16LE:
3057 return new wxMBConvUTF16LE;
3058
3059 case wxFONTENCODING_UTF32BE:
3060 return new wxMBConvUTF32BE;
3061
3062 case wxFONTENCODING_UTF32LE:
3063 return new wxMBConvUTF32LE;
3064
3065 default:
3066 // nothing to do but put here to suppress gcc warnings
3067 break;
3068 }
3069
3070 // step (3)
3071 #if wxUSE_FONTMAP
3072 {
3073 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3074 : new wxMBConv_wxwin(m_encoding);
3075 if ( conv->IsOk() )
3076 return conv;
3077
3078 delete conv;
3079 }
3080 #endif // wxUSE_FONTMAP
3081
3082 // NB: This is a hack to prevent deadlock. What could otherwise happen
3083 // in Unicode build: wxConvLocal creation ends up being here
3084 // because of some failure and logs the error. But wxLog will try to
3085 // attach a timestamp, for which it will need wxConvLocal (to convert
3086 // time to char* and then wchar_t*), but that fails, tries to log the
3087 // error, but wxLog has an (already locked) critical section that
3088 // guards the static buffer.
3089 static bool alreadyLoggingError = false;
3090 if (!alreadyLoggingError)
3091 {
3092 alreadyLoggingError = true;
3093 wxLogError(_("Cannot convert from the charset '%s'!"),
3094 m_name ? m_name
3095 :
3096 #if wxUSE_FONTMAP
3097 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3098 #else // !wxUSE_FONTMAP
3099 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3100 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3101 );
3102
3103 alreadyLoggingError = false;
3104 }
3105
3106 return NULL;
3107 }
3108
3109 void wxCSConv::CreateConvIfNeeded() const
3110 {
3111 if ( m_deferred )
3112 {
3113 wxCSConv *self = (wxCSConv *)this; // const_cast
3114
3115 // if we don't have neither the name nor the encoding, use the default
3116 // encoding for this system
3117 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3118 {
3119 #if wxUSE_INTL
3120 self->m_encoding = wxLocale::GetSystemEncoding();
3121 #else
3122 // fallback to some reasonable default:
3123 self->m_encoding = wxFONTENCODING_ISO8859_1;
3124 #endif // wxUSE_INTL
3125 }
3126
3127 self->m_convReal = DoCreate();
3128 self->m_deferred = false;
3129 }
3130 }
3131
3132 bool wxCSConv::IsOk() const
3133 {
3134 CreateConvIfNeeded();
3135
3136 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3137 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3138 return true; // always ok as we do it ourselves
3139
3140 // m_convReal->IsOk() is called at its own creation, so we know it must
3141 // be ok if m_convReal is non-NULL
3142 return m_convReal != NULL;
3143 }
3144
3145 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3146 const char *src, size_t srcLen) const
3147 {
3148 CreateConvIfNeeded();
3149
3150 if (m_convReal)
3151 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3152
3153 // latin-1 (direct)
3154 if ( srcLen == wxNO_LEN )
3155 srcLen = strlen(src) + 1; // take trailing NUL too
3156
3157 if ( dst )
3158 {
3159 if ( dstLen < srcLen )
3160 return wxCONV_FAILED;
3161
3162 for ( size_t n = 0; n < srcLen; n++ )
3163 dst[n] = (unsigned char)(src[n]);
3164 }
3165
3166 return srcLen;
3167 }
3168
3169 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3170 const wchar_t *src, size_t srcLen) const
3171 {
3172 CreateConvIfNeeded();
3173
3174 if (m_convReal)
3175 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3176
3177 // latin-1 (direct)
3178 if ( srcLen == wxNO_LEN )
3179 srcLen = wxWcslen(src) + 1;
3180
3181 if ( dst )
3182 {
3183 if ( dstLen < srcLen )
3184 return wxCONV_FAILED;
3185
3186 for ( size_t n = 0; n < srcLen; n++ )
3187 {
3188 if ( src[n] > 0xFF )
3189 return wxCONV_FAILED;
3190
3191 dst[n] = (char)src[n];
3192 }
3193
3194 }
3195 else // still need to check the input validity
3196 {
3197 for ( size_t n = 0; n < srcLen; n++ )
3198 {
3199 if ( src[n] > 0xFF )
3200 return wxCONV_FAILED;
3201 }
3202 }
3203
3204 return srcLen;
3205 }
3206
3207 size_t wxCSConv::GetMBNulLen() const
3208 {
3209 CreateConvIfNeeded();
3210
3211 if ( m_convReal )
3212 {
3213 return m_convReal->GetMBNulLen();
3214 }
3215
3216 // otherwise, we are ISO-8859-1
3217 return 1;
3218 }
3219
3220 #if wxUSE_UNICODE_UTF8
3221 bool wxCSConv::IsUTF8() const
3222 {
3223 CreateConvIfNeeded();
3224
3225 if ( m_convReal )
3226 {
3227 return m_convReal->IsUTF8();
3228 }
3229
3230 // otherwise, we are ISO-8859-1
3231 return false;
3232 }
3233 #endif
3234
3235
3236 #if wxUSE_UNICODE
3237
3238 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3239 {
3240 if ( !s )
3241 return wxWCharBuffer();
3242
3243 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3244 if ( !wbuf )
3245 wbuf = wxMBConvUTF8().cMB2WX(s);
3246 if ( !wbuf )
3247 wbuf = wxConvISO8859_1.cMB2WX(s);
3248
3249 return wbuf;
3250 }
3251
3252 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3253 {
3254 if ( !ws )
3255 return wxCharBuffer();
3256
3257 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3258 if ( !buf )
3259 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3260
3261 return buf;
3262 }
3263
3264 #endif // wxUSE_UNICODE
3265
3266 // ----------------------------------------------------------------------------
3267 // globals
3268 // ----------------------------------------------------------------------------
3269
3270 // NB: The reason why we create converter objects in this convoluted way,
3271 // using a factory function instead of a global variable, is that they
3272 // may be used at static initialization time (some of them are used by
3273 // wxString ctors and there may be a global wxString object). In other
3274 // words, possibly _before_ the converter global object would be
3275 // initialized.
3276
3277 #undef wxConvLibc
3278 #undef wxConvUTF8
3279 #undef wxConvUTF7
3280 #undef wxConvLocal
3281 #undef wxConvISO8859_1
3282
// Define a global converter: this expands into
//  - the name##Ptr pointer variable, initially NULL
//  - the wxGet_##name##Ptr() function returning the address of its local
//    static object of class impl_klass constructed with ctor_args
//  - a file-static pointer initialized by calling this function
//
// klass is the type of the pointers and impl_klass is the type of the object.
3283 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3284 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3285 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3286 { \
3287 static impl_klass name##Obj ctor_args; \
3288 return &name##Obj; \
3289 } \
3290 /* this ensures that all global converter objects are created */ \
3291 /* by the time static initialization is done, i.e. before any */ \
3292 /* thread is launched: */ \
3293 static klass* gs_##name##instance = wxGet_##name##Ptr()
3294
// Simpler version of the macro above for the case when the object is of the
// same class as the pointers to it.
3295 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3296 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3297
// wxConvLibc is a default-constructed wxMBConv_win32 under Windows and a
// wxMBConvLibc object elsewhere.
3298 #ifdef __WINDOWS__
3299 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3300 #else
3301 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3302 #endif
3303
3304 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3305 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3306 // provokes an error message about "not enough macro parameters"; and we
3307 // can't use "()" here as the name##Obj declaration would be parsed as a
3308 // function declaration then, so use a semicolon and live with an extra
3309 // empty statement (and hope that no compiler warns about this)
// the UTF-8 and UTF-7 converters are default-constructed
3310 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3311 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3312
// wxConvLocal is a wxCSConv for wxFONTENCODING_SYSTEM and wxConvISO8859_1 one
// for wxFONTENCODING_ISO8859_1
3313 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3314 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3315
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to the
// wxConvLocal one
3316 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3317 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3318
3319 #ifdef __DARWIN__
3320 // The xnu kernel always communicates file paths in decomposed UTF-8.
3321 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3322 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3323 #endif
3324
// wxConvFileName points to the UTF-8 wxMBConv_cf object defined above under
// Darwin and to the wxConvLibc object everywhere else.
3325 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3326 #ifdef __DARWIN__
3327 &wxConvMacUTF8DObj;
3328 #else // !__DARWIN__
3329 wxGet_wxConvLibcPtr();
3330 #endif // __DARWIN__/!__DARWIN__
3331
3332 #else // !wxUSE_WCHAR_T
3333
3334 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3335 // stand-ins in absence of wchar_t
// In this branch all the converters are plain wxMBConv objects rather than
// being defined with the WX_DEFINE_GLOBAL_CONV macros.
3336 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3337 wxConvISO8859_1,
3338 wxConvLocal,
3339 wxConvUTF8;
3340
3341 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T