]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
102f0d82db071dc811646c92a95273d2a2f3db3e
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/mac/corefoundation/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV _T("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
// helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL and false if all of them are NUL
// (which is trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( ; n != 0; --n, ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
// Default implementation of ToWChar() in terms of the old MB2WC().
//
// Converts srcLen bytes of src, or just the first NUL-terminated chunk of it
// if srcLen is wxNO_LEN, to wide characters. The input is processed chunk by
// chunk, the chunks being separated by the terminators of GetMBNulLen() bytes.
// If dst is NULL nothing is stored and only the length is computed.
//
// Returns the number of wide characters [which would be] written to dst,
// counting one L'\0' for each chunk, or wxCONV_FAILED if MB2WC() or
// GetMBNulLen() failed or if dstLen is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            // from now on we work with the terminated copy, bufTmp keeps it
            // alive until we return
            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes the
        // length needed for it
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        // NOTE(review): for an empty chunk we leave the loop before the dst
        // size check and without storing its L'\0' in dst even though it is
        // counted in dstWritten -- confirm that this is intended
        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
264
265 size_t
266 wxMBConv::FromWChar(char *dst, size_t dstLen,
267 const wchar_t *src, size_t srcLen) const
268 {
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten = 0;
271
272 // make a copy of the input string unless it is already properly
273 // NUL-terminated
274 //
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp;
278 if ( srcLen == wxNO_LEN )
279 {
280 srcLen = wxWcslen(src) + 1;
281 }
282 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
283 {
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp = wxWCharBuffer(srcLen);
286 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
287 src = bufTmp;
288 }
289
290 const size_t lenNul = GetMBNulLen();
291 for ( const wchar_t * const srcEnd = src + srcLen;
292 src < srcEnd;
293 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
294 {
295 // try to convert the current chunk
296 size_t lenChunk = WC2MB(NULL, src, 0);
297
298 if ( lenChunk == wxCONV_FAILED )
299 return wxCONV_FAILED;
300
301 lenChunk += lenNul;
302 dstWritten += lenChunk;
303
304 if ( dst )
305 {
306 if ( dstWritten > dstLen )
307 return wxCONV_FAILED;
308
309 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 dst += lenChunk;
313 }
314 }
315
316 return dstWritten;
317 }
318
319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
320 {
321 size_t rc = ToWChar(outBuff, outLen, inBuff);
322 if ( rc != wxCONV_FAILED )
323 {
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
326 rc--;
327 }
328
329 return rc;
330 }
331
332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
333 {
334 size_t rc = FromWChar(outBuff, outLen, inBuff);
335 if ( rc != wxCONV_FAILED )
336 {
337 rc -= GetMBNulLen();
338 }
339
340 return rc;
341 }
342
// Destructor is defined out of line here and is intentionally empty.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
347
348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
349 {
350 if ( psz )
351 {
352 // calculate the length of the buffer needed first
353 const size_t nLen = ToWChar(NULL, 0, psz);
354 if ( nLen != wxCONV_FAILED )
355 {
356 // now do the actual conversion
357 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
358
359 // +1 for the trailing NULL
360 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
361 return buf;
362 }
363 }
364
365 return wxWCharBuffer();
366 }
367
368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
369 {
370 if ( pwz )
371 {
372 const size_t nLen = FromWChar(NULL, 0, pwz);
373 if ( nLen != wxCONV_FAILED )
374 {
375 wxCharBuffer buf(nLen - 1);
376 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
377 return buf;
378 }
379 }
380
381 return wxCharBuffer();
382 }
383
384 const wxWCharBuffer
385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
386 {
387 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
388 if ( dstLen != wxCONV_FAILED )
389 {
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer wbuf(dstLen);
394 wbuf.data()[dstLen - 1] = L'\0';
395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
396 {
397 if ( outLen )
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412 }
413
414 const wxCharBuffer
415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
416 {
417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
418 if ( dstLen != wxCONV_FAILED )
419 {
420 const size_t nulLen = GetMBNulLen();
421
422 // as above, ensure that the buffer is always NUL-terminated, even if
423 // the input is not
424 wxCharBuffer buf(dstLen + nulLen - 1);
425 memset(buf.data() + dstLen, 0, nulLen);
426 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
427 {
428 if ( outLen )
429 {
430 *outLen = dstLen;
431
432 if ( dstLen >= nulLen &&
433 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
434 {
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
437 *outLen -= nulLen;
438 }
439 }
440
441 return buf;
442 }
443 }
444
445 if ( outLen )
446 *outLen = 0;
447
448 return wxCharBuffer();
449 }
450
451 // ----------------------------------------------------------------------------
452 // wxMBConvLibc
453 // ----------------------------------------------------------------------------
454
// Multibyte to wide character conversion: simply forwards all the arguments
// to wxMB2WC() and returns its result.
// NOTE(review): wxMB2WC() is defined elsewhere, presumably it wraps the
// standard C library conversion function -- confirm there.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
459
// Wide character to multibyte conversion: simply forwards all the arguments
// to wxWC2MB() and returns its result.
// NOTE(review): wxWC2MB() is defined elsewhere, presumably it wraps the
// standard C library conversion function -- confirm there.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
464
465 // ----------------------------------------------------------------------------
466 // wxConvBrokenFileNames
467 // ----------------------------------------------------------------------------
468
469 #ifdef __UNIX__
470
471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
472 {
473 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
474 wxStricmp(charset, _T("UTF8")) == 0 )
475 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
476 else
477 m_conv = new wxCSConv(charset);
478 }
479
480 #endif // __UNIX__
481
482 // ----------------------------------------------------------------------------
483 // UTF-7
484 // ----------------------------------------------------------------------------
485
486 // Implementation (C) 2004 Fredrik Roubert
487 //
488 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
489
//
// BASE64 decoding table
//
// It is indexed by the byte value and gives the 6 bit value (0x00..0x3f) of
// the base64 digits 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'. All the other
// entries are 0xff, which the decoder below takes as the end of the encoded
// part.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
528
// Decodes UTF-7 input to wide characters.
//
// If srcLen is wxNO_LEN the entire NUL-terminated string (including its NUL)
// is decoded using a temporary decoder state, otherwise srcLen bytes are
// decoded starting from, and updating, the state stored in m_stateDecoder.
// If dst is NULL nothing is stored and only the length is computed, otherwise
// at most dstLen characters are written.
//
// Returns the number of wide characters [which would be] written to dst or
// wxCONV_FAILED if the input is invalid or if no characters at all were
// produced.
size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
                             const char *src, size_t srcLen) const
{
    DecoderState stateOrig,
                 *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // convert the entire string, up to and including the trailing NUL
        srcLen = strlen(src) + 1;

        // when working on the entire strings we don't update nor use the shift
        // state from the previous call
        statePtr = &stateOrig;
    }
    else // when working with partial strings we do use the shift state
    {
        statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);

        // also save the old state to be able to rollback to it on error
        stateOrig = m_stateDecoder;
    }

    // but to simplify the code below we use this variable in both cases
    DecoderState& state = *statePtr;


    // number of characters [which would have been] written to dst [if it were
    // not NULL]
    size_t len = 0;

    const char * const srcEnd = src + srcLen;

    while ( (src < srcEnd) && (!dst || (len < dstLen)) )
    {
        const unsigned char cc = *src++;

        if ( state.IsShifted() )
        {
            const unsigned char dc = utf7unb64[cc];
            if ( dc == 0xff )
            {
                // end of encoded part
                state.ToDirect();

                // re-parse this character normally below unless it's '-' which
                // is consumed by the decoder
                if ( cc == '-' )
                    continue;
            }
            else // valid encoded character
            {
                // mini base64 decoder: each character is 6 bits
                state.bit += 6;
                state.accum <<= 6;
                state.accum += dc;

                if ( state.bit >= 8 )
                {
                    // got the full byte, consume it
                    state.bit -= 8;
                    unsigned char b = (state.accum >> state.bit) & 0x00ff;

                    if ( state.isLSB )
                    {
                        // we've got the full word, output it
                        if ( dst )
                            *dst++ = (state.msb << 8) | b;
                        len++;
                        state.isLSB = false;
                    }
                    else // MSB
                    {
                        // just store it while we wait for LSB
                        state.msb = b;
                        state.isLSB = true;
                    }
                }
            }
        }

        // this is not an "else" of the test above because the state may have
        // just been switched back to direct there, in which case the current
        // character is processed here
        if ( state.IsDirect() )
        {
            // start of an encoded segment?
            if ( cc == '+' )
            {
                // NOTE(review): this error return, as the other one below,
                // doesn't restore stateOrig even though the state could have
                // been modified above -- confirm that this is intended
                if ( src == srcEnd )
                    return wxCONV_FAILED; // can't have '+' at the end

                if ( *src == '-' )
                {
                    // just the encoded plus sign, don't switch to shifted mode
                    if ( dst )
                        *dst++ = '+';
                    len++;
                    src++;
                }
                else
                {
                    state.ToShifted();
                }
            }
            else // not '+'
            {
                // only printable 7 bit ASCII characters (with the exception of
                // NUL, TAB, CR and LF) can be used directly
                if ( cc >= 0x7f || (cc < ' ' &&
                      !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
                    return wxCONV_FAILED;

                if ( dst )
                    *dst++ = cc;
                len++;
            }
        }
    }

    if ( !len )
    {
        // as we didn't read any characters we should be called with the same
        // data (followed by some more new data) again later so don't save our
        // state
        state = stateOrig;

        return wxCONV_FAILED;
    }

    return len;
}
657
//
// BASE64 encoding table
//
// It is indexed by the 6 bit value (0..63) and gives the corresponding base64
// digit.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
672
//
// UTF-7 encoding table
//
// It is indexed by the 7 bit ASCII code of the character and gives its class:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// Only the characters of the class 0 are considered to be direct by
// wxIsUTF7Direct() below.
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
692
693 static inline bool wxIsUTF7Direct(wchar_t wc)
694 {
695 return wc < 0x80 && utf7encode[wc] < 1;
696 }
697
// Encodes wide characters as UTF-7.
//
// If srcLen is wxNO_LEN the entire NUL-terminated string (including its NUL)
// is encoded using a temporary encoder state, otherwise srcLen characters are
// encoded starting from, and updating, the state stored in m_stateEncoder.
// If dst is NULL nothing is stored and only the length is computed.
//
// Returns the number of bytes [which would be] written to dst or, if
// WC_UTF16 is not defined, wxCONV_FAILED if a character above 0xffff is
// encountered.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                 *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    // number of bytes [which would have been] written to dst
    size_t len = 0;

    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                // explicitly terminate the encoded part
                if ( dst )
                    *dst++ = '-';
                len++;
            }

            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // the plus sign itself is encoded as "+-"
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            if ( state.IsDirect() )
            {
                state.ToShifted();

                // start of the encoded part
                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // each character is encoded as 2 bytes, the most significant
                // one first
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // output a base64 digit for each full group of 6 bits
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // stop at the end of input or at the first direct character,
                // which is left to be handled by the outer loop
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
810
811 // ----------------------------------------------------------------------------
812 // UTF-8
813 // ----------------------------------------------------------------------------
814
// utf8_max[i] is the maximal value which is encoded using i+1 bytes by the
// (non strict) UTF-8 conversion code below
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: byte b is mapped to
// wxUnicodePUA + b
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
822
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the lead byte and contains 0 for the bytes
// which can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
857
// Decodes UTF-8 input to wide characters, failing on the first invalid byte.
//
// srcLen is the number of bytes to decode or wxNO_LEN to decode the entire
// NUL-terminated string, including its NUL. If dstLen is 0 (dst is ignored
// then) nothing is stored and only the length is computed.
//
// Returns the number of wide characters [which would be] written to dst or
// wxCONV_FAILED if the input is not valid or dst is too small.
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
{
    // out is NULL if we only compute the length
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;

    // NOTE(review): after this normalization srcLen is presumably never equal
    // to wxNO_LEN, so the checks for it below look like dead code -- confirm
    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;

    for ( const char *p = src; ; p++ )
    {
        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = L'\0';
                }

                written++;
            }

            return written;
        }

        // check that there is enough space for (at least) one more character
        // in the output
        if ( out && !dstLen-- )
            break;

        wxUint32 code;
        unsigned char c = *p;

        if ( c < 0x80 )
        {
            if ( srcLen == 0 ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen--;

            code = c;
        }
        else
        {
            unsigned len = tableUtf8Lengths[c];
            if ( !len )
                break;

            if ( srcLen < len ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen -= len;

            //   Char. number range   |        UTF-8 octet sequence
            //      (hexadecimal)     |              (binary)
            //  ----------------------+----------------------------------------
            //  0000 0000 - 0000 007F | 0xxxxxxx
            //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
            //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
            //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            //
            //  Code point value is stored in bits marked with 'x',
            //  lowest-order bit of the value on the right side in the diagram
            //  above.                                         (from RFC 3629)

            // mask to extract lead byte's value ('x' bits above), by sequence
            // length:
            static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

            // mask and value of lead byte's most significant bits, by length:
            static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
            static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };

            len--; // it's more convenient to work with 0-based length here

            // extract the lead byte's value bits:
            if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
                break;

            code = c & leadValueMask[len];

            // all remaining bytes, if any, are handled in the same way
            // regardless of sequence's length:
            for ( ; len; --len )
            {
                c = *++p;
                if ( (c & 0xC0) != 0x80 )
                    return wxCONV_FAILED;

                code <<= 6;
                code |= c & 0x3F;
            }
        }

#ifdef WC_UTF16
        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            // NOTE(review): dstLen was decremented only once above although
            // two code units are output in this case -- confirm
            if ( out )
                out++;
            written++;
        }
#else // !WC_UTF16
        if ( out )
            *out = code;
#endif // WC_UTF16/!WC_UTF16

        if ( out )
            out++;

        written++;
    }

    // we only get here if an error occurred (all the "break"s above)
    return wxCONV_FAILED;
}
980
981 size_t
982 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
983 const wchar_t *src, size_t srcLen) const
984 {
985 char *out = dstLen ? dst : NULL;
986 size_t written = 0;
987
988 for ( const wchar_t *wp = src; ; wp++ )
989 {
990 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
991 {
992 // all done successfully, just add the trailing NULL if we are not
993 // using explicit length
994 if ( srcLen == wxNO_LEN )
995 {
996 if ( out )
997 {
998 if ( !dstLen )
999 break;
1000
1001 *out = '\0';
1002 }
1003
1004 written++;
1005 }
1006
1007 return written;
1008 }
1009
1010
1011 wxUint32 code;
1012 #ifdef WC_UTF16
1013 // cast is ok for WC_UTF16
1014 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1015 {
1016 // skip the next char too as we decoded a surrogate
1017 wp++;
1018 }
1019 #else // wchar_t is UTF-32
1020 code = *wp & 0x7fffffff;
1021 #endif
1022
1023 unsigned len;
1024 if ( code <= 0x7F )
1025 {
1026 len = 1;
1027 if ( out )
1028 {
1029 if ( dstLen < len )
1030 break;
1031
1032 out[0] = (char)code;
1033 }
1034 }
1035 else if ( code <= 0x07FF )
1036 {
1037 len = 2;
1038 if ( out )
1039 {
1040 if ( dstLen < len )
1041 break;
1042
1043 // NB: this line takes 6 least significant bits, encodes them as
1044 // 10xxxxxx and discards them so that the next byte can be encoded:
1045 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1046 out[0] = 0xC0 | code;
1047 }
1048 }
1049 else if ( code < 0xFFFF )
1050 {
1051 len = 3;
1052 if ( out )
1053 {
1054 if ( dstLen < len )
1055 break;
1056
1057 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1058 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1059 out[0] = 0xE0 | code;
1060 }
1061 }
1062 else if ( code <= 0x10FFFF )
1063 {
1064 len = 4;
1065 if ( out )
1066 {
1067 if ( dstLen < len )
1068 break;
1069
1070 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1071 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1072 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1073 out[0] = 0xF0 | code;
1074 }
1075 }
1076 else
1077 {
1078 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1079 break;
1080 }
1081
1082 if ( out )
1083 {
1084 out += len;
1085 dstLen -= len;
1086 }
1087
1088 written += len;
1089 }
1090
1091 // we only get here if an error occurs during decoding
1092 return wxCONV_FAILED;
1093 }
1094
// Decodes UTF-8 input to wide characters, optionally mapping invalid bytes.
//
// If m_options is MAP_INVALID_UTF8_NOT the strict base class version is used.
// Otherwise each byte of an invalid sequence is either mapped to the character
// wxUnicodePUA + byte (MAP_INVALID_UTF8_TO_PUA) or output as a backslash
// followed by 3 octal digits (MAP_INVALID_UTF8_TO_OCTAL), backslashes in the
// input being doubled in the latter case.
//
// buf may be NULL to just compute the length, n is its size otherwise.
//
// Returns the number of characters decoded plus 1 or wxCONV_FAILED.
size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
                             const char *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);

    size_t len = 0;

    // NOTE(review): srcLen is decremented once per loop iteration here, i.e.
    // once per decoded character and not once per input byte consumed --
    // confirm whether this is intended for multi-byte sequences
    while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
    {
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the number of leading 1 bits in the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // the value must not be representable by a shorter sequence
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // map all the bytes of the invalid sequence, from its first
                // byte opsz up to the current position psz
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only written if it fits entirely but
                        // is always counted
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (srcLen == wxNO_LEN && buf && (len < n))
        *buf = 0;

    return len + 1;
}
1242
// returns true if wch is one of the octal digits L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1247
1248 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1249 const wchar_t *psz, size_t srcLen) const
1250 {
1251 if ( m_options == MAP_INVALID_UTF8_NOT )
1252 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1253
1254 size_t len = 0;
1255
1256 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1257 {
1258 wxUint32 cc;
1259
1260 #ifdef WC_UTF16
1261 // cast is ok for WC_UTF16
1262 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1263 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1264 #else
1265 cc = (*psz++) & 0x7fffffff;
1266 #endif
1267
1268 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1269 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1270 {
1271 if (buf)
1272 *buf++ = (char)(cc - wxUnicodePUA);
1273 len++;
1274 }
1275 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1276 && cc == L'\\' && psz[0] == L'\\' )
1277 {
1278 if (buf)
1279 *buf++ = (char)cc;
1280 psz++;
1281 len++;
1282 }
1283 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1284 cc == L'\\' &&
1285 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1286 {
1287 if (buf)
1288 {
1289 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1290 (psz[1] - L'0') * 010 +
1291 (psz[2] - L'0'));
1292 }
1293
1294 psz += 3;
1295 len++;
1296 }
1297 else
1298 {
1299 unsigned cnt;
1300 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1301 {
1302 }
1303
1304 if (!cnt)
1305 {
1306 // plain ASCII char
1307 if (buf)
1308 *buf++ = (char) cc;
1309 len++;
1310 }
1311 else
1312 {
1313 len += cnt + 1;
1314 if (buf)
1315 {
1316 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1317 while (cnt--)
1318 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1319 }
1320 }
1321 }
1322 }
1323
1324 if (srcLen == wxNO_LEN && buf && (len < n))
1325 *buf = 0;
1326
1327 return len + 1;
1328 }
1329
1330 // ============================================================================
1331 // UTF-16
1332 // ============================================================================
1333
// wxMBConvUTF16straight and wxMBConvUTF16swap are aliases for the two UTF-16
// converter classes: "straight" maps to the BE class when WORDS_BIGENDIAN is
// defined and to the LE one otherwise, "swap" maps to the remaining class.
// This allows the conversions without and with endianness change to be
// written only once below.
1334 #ifdef WORDS_BIGENDIAN
1335 #define wxMBConvUTF16straight wxMBConvUTF16BE
1336 #define wxMBConvUTF16swap wxMBConvUTF16LE
1337 #else
1338 #define wxMBConvUTF16swap wxMBConvUTF16BE
1339 #define wxMBConvUTF16straight wxMBConvUTF16LE
1340 #endif
1341
1342 /* static */
1343 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1344 {
1345 if ( srcLen == wxNO_LEN )
1346 {
1347 // count the number of bytes in input, including the trailing NULs
1348 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1349 for ( srcLen = 1; *inBuff++; srcLen++ )
1350 ;
1351
1352 srcLen *= BYTES_PER_CHAR;
1353 }
1354 else // we already have the length
1355 {
1356 // we can only convert an entire number of UTF-16 characters
1357 if ( srcLen % BYTES_PER_CHAR )
1358 return wxCONV_FAILED;
1359 }
1360
1361 return srcLen;
1362 }
1363
1364 // case when in-memory representation is UTF-16 too
1365 #ifdef WC_UTF16
1366
1367 // ----------------------------------------------------------------------------
1368 // conversions without endianness change
1369 // ----------------------------------------------------------------------------
1370
1371 size_t
1372 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1373 const char *src, size_t srcLen) const
1374 {
1375 // set up the scene for using memcpy() (which is presumably more efficient
1376 // than copying the bytes one by one)
1377 srcLen = GetLength(src, srcLen);
1378 if ( srcLen == wxNO_LEN )
1379 return wxCONV_FAILED;
1380
1381 const size_t inLen = srcLen / BYTES_PER_CHAR;
1382 if ( dst )
1383 {
1384 if ( dstLen < inLen )
1385 return wxCONV_FAILED;
1386
1387 memcpy(dst, src, srcLen);
1388 }
1389
1390 return inLen;
1391 }
1392
1393 size_t
1394 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1395 const wchar_t *src, size_t srcLen) const
1396 {
1397 if ( srcLen == wxNO_LEN )
1398 srcLen = wxWcslen(src) + 1;
1399
1400 srcLen *= BYTES_PER_CHAR;
1401
1402 if ( dst )
1403 {
1404 if ( dstLen < srcLen )
1405 return wxCONV_FAILED;
1406
1407 memcpy(dst, src, srcLen);
1408 }
1409
1410 return srcLen;
1411 }
1412
1413 // ----------------------------------------------------------------------------
1414 // endian-reversing conversions
1415 // ----------------------------------------------------------------------------
1416
1417 size_t
1418 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1419 const char *src, size_t srcLen) const
1420 {
1421 srcLen = GetLength(src, srcLen);
1422 if ( srcLen == wxNO_LEN )
1423 return wxCONV_FAILED;
1424
1425 srcLen /= BYTES_PER_CHAR;
1426
1427 if ( dst )
1428 {
1429 if ( dstLen < srcLen )
1430 return wxCONV_FAILED;
1431
1432 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1433 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1434 {
1435 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1436 }
1437 }
1438
1439 return srcLen;
1440 }
1441
1442 size_t
1443 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1444 const wchar_t *src, size_t srcLen) const
1445 {
1446 if ( srcLen == wxNO_LEN )
1447 srcLen = wxWcslen(src) + 1;
1448
1449 srcLen *= BYTES_PER_CHAR;
1450
1451 if ( dst )
1452 {
1453 if ( dstLen < srcLen )
1454 return wxCONV_FAILED;
1455
1456 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1457 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1458 {
1459 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1460 }
1461 }
1462
1463 return srcLen;
1464 }
1465
1466 #else // !WC_UTF16: wchar_t is UTF-32
1467
1468 // ----------------------------------------------------------------------------
1469 // conversions without endianness change
1470 // ----------------------------------------------------------------------------
1471
1472 size_t
1473 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1474 const char *src, size_t srcLen) const
1475 {
1476 srcLen = GetLength(src, srcLen);
1477 if ( srcLen == wxNO_LEN )
1478 return wxCONV_FAILED;
1479
1480 const size_t inLen = srcLen / BYTES_PER_CHAR;
1481 if ( !dst )
1482 {
1483 // optimization: return maximal space which could be needed for this
1484 // string even if the real size could be smaller if the buffer contains
1485 // any surrogates
1486 return inLen;
1487 }
1488
1489 size_t outLen = 0;
1490 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1491 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1492 {
1493 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1494 if ( !inBuff )
1495 return wxCONV_FAILED;
1496
1497 if ( ++outLen > dstLen )
1498 return wxCONV_FAILED;
1499
1500 *dst++ = ch;
1501 }
1502
1503
1504 return outLen;
1505 }
1506
1507 size_t
1508 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1509 const wchar_t *src, size_t srcLen) const
1510 {
1511 if ( srcLen == wxNO_LEN )
1512 srcLen = wxWcslen(src) + 1;
1513
1514 size_t outLen = 0;
1515 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1516 for ( size_t n = 0; n < srcLen; n++ )
1517 {
1518 wxUint16 cc[2];
1519 const size_t numChars = encode_utf16(*src++, cc);
1520 if ( numChars == wxCONV_FAILED )
1521 return wxCONV_FAILED;
1522
1523 outLen += numChars * BYTES_PER_CHAR;
1524 if ( outBuff )
1525 {
1526 if ( outLen > dstLen )
1527 return wxCONV_FAILED;
1528
1529 *outBuff++ = cc[0];
1530 if ( numChars == 2 )
1531 {
1532 // second character of a surrogate
1533 *outBuff++ = cc[1];
1534 }
1535 }
1536 }
1537
1538 return outLen;
1539 }
1540
1541 // ----------------------------------------------------------------------------
1542 // endian-reversing conversions
1543 // ----------------------------------------------------------------------------
1544
1545 size_t
1546 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1547 const char *src, size_t srcLen) const
1548 {
1549 srcLen = GetLength(src, srcLen);
1550 if ( srcLen == wxNO_LEN )
1551 return wxCONV_FAILED;
1552
1553 const size_t inLen = srcLen / BYTES_PER_CHAR;
1554 if ( !dst )
1555 {
1556 // optimization: return maximal space which could be needed for this
1557 // string even if the real size could be smaller if the buffer contains
1558 // any surrogates
1559 return inLen;
1560 }
1561
1562 size_t outLen = 0;
1563 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1564 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1565 {
1566 wxUint32 ch;
1567 wxUint16 tmp[2];
1568
1569 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1570 inBuff++;
1571 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1572
1573 const size_t numChars = decode_utf16(tmp, ch);
1574 if ( numChars == wxCONV_FAILED )
1575 return wxCONV_FAILED;
1576
1577 if ( numChars == 2 )
1578 inBuff++;
1579
1580 if ( ++outLen > dstLen )
1581 return wxCONV_FAILED;
1582
1583 *dst++ = ch;
1584 }
1585
1586
1587 return outLen;
1588 }
1589
1590 size_t
1591 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1592 const wchar_t *src, size_t srcLen) const
1593 {
1594 if ( srcLen == wxNO_LEN )
1595 srcLen = wxWcslen(src) + 1;
1596
1597 size_t outLen = 0;
1598 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1599 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1600 {
1601 wxUint16 cc[2];
1602 const size_t numChars = encode_utf16(*src, cc);
1603 if ( numChars == wxCONV_FAILED )
1604 return wxCONV_FAILED;
1605
1606 outLen += numChars * BYTES_PER_CHAR;
1607 if ( outBuff )
1608 {
1609 if ( outLen > dstLen )
1610 return wxCONV_FAILED;
1611
1612 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1613 if ( numChars == 2 )
1614 {
1615 // second character of a surrogate
1616 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1617 }
1618 }
1619 }
1620
1621 return outLen;
1622 }
1623
1624 #endif // WC_UTF16/!WC_UTF16
1625
1626
1627 // ============================================================================
1628 // UTF-32
1629 // ============================================================================
1630
// wxMBConvUTF32straight and wxMBConvUTF32swap are aliases for the two UTF-32
// converter classes: "straight" maps to the BE class when WORDS_BIGENDIAN is
// defined and to the LE one otherwise, "swap" maps to the remaining class.
// This allows the conversions without and with endianness change to be
// written only once below.
1631 #ifdef WORDS_BIGENDIAN
1632 #define wxMBConvUTF32straight wxMBConvUTF32BE
1633 #define wxMBConvUTF32swap wxMBConvUTF32LE
1634 #else
1635 #define wxMBConvUTF32swap wxMBConvUTF32BE
1636 #define wxMBConvUTF32straight wxMBConvUTF32LE
1637 #endif
1638
1639
// global UTF-32 converter objects, one for each of the two converter classes
1640 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1641 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1642
1643 /* static */
1644 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1645 {
1646 if ( srcLen == wxNO_LEN )
1647 {
1648 // count the number of bytes in input, including the trailing NULs
1649 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1650 for ( srcLen = 1; *inBuff++; srcLen++ )
1651 ;
1652
1653 srcLen *= BYTES_PER_CHAR;
1654 }
1655 else // we already have the length
1656 {
1657 // we can only convert an entire number of UTF-32 characters
1658 if ( srcLen % BYTES_PER_CHAR )
1659 return wxCONV_FAILED;
1660 }
1661
1662 return srcLen;
1663 }
1664
1665 // case when in-memory representation is UTF-16
1666 #ifdef WC_UTF16
1667
1668 // ----------------------------------------------------------------------------
1669 // conversions without endianness change
1670 // ----------------------------------------------------------------------------
1671
1672 size_t
1673 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1674 const char *src, size_t srcLen) const
1675 {
1676 srcLen = GetLength(src, srcLen);
1677 if ( srcLen == wxNO_LEN )
1678 return wxCONV_FAILED;
1679
1680 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1681 const size_t inLen = srcLen / BYTES_PER_CHAR;
1682 size_t outLen = 0;
1683 for ( size_t n = 0; n < inLen; n++ )
1684 {
1685 wxUint16 cc[2];
1686 const size_t numChars = encode_utf16(*inBuff++, cc);
1687 if ( numChars == wxCONV_FAILED )
1688 return wxCONV_FAILED;
1689
1690 outLen += numChars;
1691 if ( dst )
1692 {
1693 if ( outLen > dstLen )
1694 return wxCONV_FAILED;
1695
1696 *dst++ = cc[0];
1697 if ( numChars == 2 )
1698 {
1699 // second character of a surrogate
1700 *dst++ = cc[1];
1701 }
1702 }
1703 }
1704
1705 return outLen;
1706 }
1707
1708 size_t
1709 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1710 const wchar_t *src, size_t srcLen) const
1711 {
1712 if ( srcLen == wxNO_LEN )
1713 srcLen = wxWcslen(src) + 1;
1714
1715 if ( !dst )
1716 {
1717 // optimization: return maximal space which could be needed for this
1718 // string instead of the exact amount which could be less if there are
1719 // any surrogates in the input
1720 //
1721 // we consider that surrogates are rare enough to make it worthwhile to
1722 // avoid running the loop below at the cost of slightly extra memory
1723 // consumption
1724 return srcLen * BYTES_PER_CHAR;
1725 }
1726
1727 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1728 size_t outLen = 0;
1729 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1730 {
1731 const wxUint32 ch = wxDecodeSurrogate(&src);
1732 if ( !src )
1733 return wxCONV_FAILED;
1734
1735 outLen += BYTES_PER_CHAR;
1736
1737 if ( outLen > dstLen )
1738 return wxCONV_FAILED;
1739
1740 *outBuff++ = ch;
1741 }
1742
1743 return outLen;
1744 }
1745
1746 // ----------------------------------------------------------------------------
1747 // endian-reversing conversions
1748 // ----------------------------------------------------------------------------
1749
1750 size_t
1751 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1752 const char *src, size_t srcLen) const
1753 {
1754 srcLen = GetLength(src, srcLen);
1755 if ( srcLen == wxNO_LEN )
1756 return wxCONV_FAILED;
1757
1758 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1759 const size_t inLen = srcLen / BYTES_PER_CHAR;
1760 size_t outLen = 0;
1761 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1762 {
1763 wxUint16 cc[2];
1764 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1765 if ( numChars == wxCONV_FAILED )
1766 return wxCONV_FAILED;
1767
1768 outLen += numChars;
1769 if ( dst )
1770 {
1771 if ( outLen > dstLen )
1772 return wxCONV_FAILED;
1773
1774 *dst++ = cc[0];
1775 if ( numChars == 2 )
1776 {
1777 // second character of a surrogate
1778 *dst++ = cc[1];
1779 }
1780 }
1781 }
1782
1783 return outLen;
1784 }
1785
1786 size_t
1787 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1788 const wchar_t *src, size_t srcLen) const
1789 {
1790 if ( srcLen == wxNO_LEN )
1791 srcLen = wxWcslen(src) + 1;
1792
1793 if ( !dst )
1794 {
1795 // optimization: return maximal space which could be needed for this
1796 // string instead of the exact amount which could be less if there are
1797 // any surrogates in the input
1798 //
1799 // we consider that surrogates are rare enough to make it worthwhile to
1800 // avoid running the loop below at the cost of slightly extra memory
1801 // consumption
1802 return srcLen*BYTES_PER_CHAR;
1803 }
1804
1805 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1806 size_t outLen = 0;
1807 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1808 {
1809 const wxUint32 ch = wxDecodeSurrogate(&src);
1810 if ( !src )
1811 return wxCONV_FAILED;
1812
1813 outLen += BYTES_PER_CHAR;
1814
1815 if ( outLen > dstLen )
1816 return wxCONV_FAILED;
1817
1818 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1819 }
1820
1821 return outLen;
1822 }
1823
1824 #else // !WC_UTF16: wchar_t is UTF-32
1825
1826 // ----------------------------------------------------------------------------
1827 // conversions without endianness change
1828 // ----------------------------------------------------------------------------
1829
1830 size_t
1831 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1832 const char *src, size_t srcLen) const
1833 {
1834 // use memcpy() as it should be much faster than hand-written loop
1835 srcLen = GetLength(src, srcLen);
1836 if ( srcLen == wxNO_LEN )
1837 return wxCONV_FAILED;
1838
1839 const size_t inLen = srcLen/BYTES_PER_CHAR;
1840 if ( dst )
1841 {
1842 if ( dstLen < inLen )
1843 return wxCONV_FAILED;
1844
1845 memcpy(dst, src, srcLen);
1846 }
1847
1848 return inLen;
1849 }
1850
1851 size_t
1852 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1853 const wchar_t *src, size_t srcLen) const
1854 {
1855 if ( srcLen == wxNO_LEN )
1856 srcLen = wxWcslen(src) + 1;
1857
1858 srcLen *= BYTES_PER_CHAR;
1859
1860 if ( dst )
1861 {
1862 if ( dstLen < srcLen )
1863 return wxCONV_FAILED;
1864
1865 memcpy(dst, src, srcLen);
1866 }
1867
1868 return srcLen;
1869 }
1870
1871 // ----------------------------------------------------------------------------
1872 // endian-reversing conversions
1873 // ----------------------------------------------------------------------------
1874
1875 size_t
1876 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1877 const char *src, size_t srcLen) const
1878 {
1879 srcLen = GetLength(src, srcLen);
1880 if ( srcLen == wxNO_LEN )
1881 return wxCONV_FAILED;
1882
1883 srcLen /= BYTES_PER_CHAR;
1884
1885 if ( dst )
1886 {
1887 if ( dstLen < srcLen )
1888 return wxCONV_FAILED;
1889
1890 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1891 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1892 {
1893 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1894 }
1895 }
1896
1897 return srcLen;
1898 }
1899
1900 size_t
1901 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1902 const wchar_t *src, size_t srcLen) const
1903 {
1904 if ( srcLen == wxNO_LEN )
1905 srcLen = wxWcslen(src) + 1;
1906
1907 srcLen *= BYTES_PER_CHAR;
1908
1909 if ( dst )
1910 {
1911 if ( dstLen < srcLen )
1912 return wxCONV_FAILED;
1913
1914 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1915 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1916 {
1917 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1918 }
1919 }
1920
1921 return srcLen;
1922 }
1923
1924 #endif // WC_UTF16/!WC_UTF16
1925
1926
1927 // ============================================================================
1928 // The classes doing conversion using the iconv_xxx() functions
1929 // ============================================================================
1930
1931 #ifdef HAVE_ICONV
1932
1933 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1934 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1935 // (unless there's yet another bug in glibc) the only case when iconv()
1936 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1937 // left in the input buffer -- when _real_ error occurs,
1938 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1939 // iconv() failure.
1940 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tests the result cres of an iconv() call: for
// glibc 2.0/2.1 an error with errno == E2BIG and no input left (bufLeft == 0)
// is not counted as a failure, elsewhere any (size_t)-1 result is
1941 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1942 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1943 (errno != E2BIG || bufLeft != 0))
1944 #else
1945 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1946 #endif
1947
// cast used for the input buffer argument in the iconv() calls below;
// ICONV_CONST is presumably supplied by the build configuration -- TODO confirm
1948 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1949
// value the handles returned by iconv_open() are compared with to detect
// that opening the conversion failed
1950 #define ICONV_T_INVALID ((iconv_t)-1)
1951
// WC_BSWAP is the byte swapping macro and WC_ENC the wx encoding matching
// the size of wchar_t
1952 #if SIZEOF_WCHAR_T == 4
1953 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1954 #define WC_ENC wxFONTENCODING_UTF32
1955 #elif SIZEOF_WCHAR_T == 2
1956 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1957 #define WC_ENC wxFONTENCODING_UTF16
1958 #else // sizeof(wchar_t) != 2 nor 4
1959 // does this ever happen?
1960 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1961 #endif
1962
1963 // ----------------------------------------------------------------------------
1964 // wxMBConv_iconv: encapsulates an iconv character set
1965 // ----------------------------------------------------------------------------
1966
1967 class wxMBConv_iconv : public wxMBConv
1968 {
1969 public:
1970 wxMBConv_iconv(const char *name);
1971 virtual ~wxMBConv_iconv();
1972
1973 // implement base class virtual methods
1974 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1975 const char *src, size_t srcLen = wxNO_LEN) const;
1976 virtual size_t FromWChar(char *dst, size_t dstLen,
1977 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1978 virtual size_t GetMBNulLen() const;
1979
1980 #if wxUSE_UNICODE_UTF8
1981 virtual bool IsUTF8() const;
1982 #endif
1983
1984 virtual wxMBConv *Clone() const
1985 {
1986 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1987 p->m_minMBCharWidth = m_minMBCharWidth;
1988 return p;
1989 }
1990
1991 bool IsOk() const
1992 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1993
1994 protected:
1995 // the iconv handlers used to translate from multibyte
1996 // to wide char and in the other direction
1997 iconv_t m2w,
1998 w2m;
1999
2000 #if wxUSE_THREADS
2001 // guards access to m2w and w2m objects
2002 wxMutex m_iconvMutex;
2003 #endif
2004
2005 private:
2006 // the name (for iconv_open()) of a wide char charset -- if none is
2007 // available on this machine, it will remain NULL
2008 static wxString ms_wcCharsetName;
2009
2010 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2011 // different endian-ness than the native one
2012 static bool ms_wcNeedsSwap;
2013
2014
2015 // name of the encoding handled by this conversion
2016 wxString m_name;
2017
2018 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2019 // initially
2020 size_t m_minMBCharWidth;
2021 };
2022
2023 // make the constructor available for unit testing
2024 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2025 {
2026 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2027 if ( !result->IsOk() )
2028 {
2029 delete result;
2030 return 0;
2031 }
2032
2033 return result;
2034 }
2035
// definitions of the static members shared by all wxMBConv_iconv objects: the
// charset name is filled in (and the swap flag updated) by the constructor
// when it looks for the charset to use for wchar_t
2036 wxString wxMBConv_iconv::ms_wcCharsetName;
2037 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2038
2039 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2040 : m_name(name)
2041 {
2042 m_minMBCharWidth = 0;
2043
2044 // check for charset that represents wchar_t:
2045 if ( ms_wcCharsetName.empty() )
2046 {
2047 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2048
2049 #if wxUSE_FONTMAP
2050 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2051 #else // !wxUSE_FONTMAP
2052 static const wxChar *names_static[] =
2053 {
2054 #if SIZEOF_WCHAR_T == 4
2055 _T("UCS-4"),
2056 #elif SIZEOF_WCHAR_T = 2
2057 _T("UCS-2"),
2058 #endif
2059 NULL
2060 };
2061 const wxChar **names = names_static;
2062 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2063
2064 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2065 {
2066 const wxString nameCS(*names);
2067
2068 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2069 wxString nameXE(nameCS);
2070
2071 #ifdef WORDS_BIGENDIAN
2072 nameXE += _T("BE");
2073 #else // little endian
2074 nameXE += _T("LE");
2075 #endif
2076
2077 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2078 nameXE.c_str());
2079
2080 m2w = iconv_open(nameXE.ToAscii(), name);
2081 if ( m2w == ICONV_T_INVALID )
2082 {
2083 // try charset w/o bytesex info (e.g. "UCS4")
2084 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2085 nameCS.c_str());
2086 m2w = iconv_open(nameCS.ToAscii(), name);
2087
2088 // and check for bytesex ourselves:
2089 if ( m2w != ICONV_T_INVALID )
2090 {
2091 char buf[2], *bufPtr;
2092 wchar_t wbuf[2];
2093 size_t insz, outsz;
2094 size_t res;
2095
2096 buf[0] = 'A';
2097 buf[1] = 0;
2098 wbuf[0] = 0;
2099 insz = 2;
2100 outsz = SIZEOF_WCHAR_T * 2;
2101 char* wbufPtr = (char*)wbuf;
2102 bufPtr = buf;
2103
2104 res = iconv(
2105 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2106 &wbufPtr, &outsz);
2107
2108 if (ICONV_FAILED(res, insz))
2109 {
2110 wxLogLastError(wxT("iconv"));
2111 wxLogError(_("Conversion to charset '%s' doesn't work."),
2112 nameCS.c_str());
2113 }
2114 else // ok, can convert to this encoding, remember it
2115 {
2116 ms_wcCharsetName = nameCS;
2117 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2118 }
2119 }
2120 }
2121 else // use charset not requiring byte swapping
2122 {
2123 ms_wcCharsetName = nameXE;
2124 }
2125 }
2126
2127 wxLogTrace(TRACE_STRCONV,
2128 wxT("iconv wchar_t charset is \"%s\"%s"),
2129 ms_wcCharsetName.empty() ? wxString("<none>")
2130 : ms_wcCharsetName,
2131 ms_wcNeedsSwap ? _T(" (needs swap)")
2132 : _T(""));
2133 }
2134 else // we already have ms_wcCharsetName
2135 {
2136 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2137 }
2138
2139 if ( ms_wcCharsetName.empty() )
2140 {
2141 w2m = ICONV_T_INVALID;
2142 }
2143 else
2144 {
2145 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2146 if ( w2m == ICONV_T_INVALID )
2147 {
2148 wxLogTrace(TRACE_STRCONV,
2149 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2150 ms_wcCharsetName.c_str(), name);
2151 }
2152 }
2153 }
2154
2155 wxMBConv_iconv::~wxMBConv_iconv()
2156 {
2157 if ( m2w != ICONV_T_INVALID )
2158 iconv_close(m2w);
2159 if ( w2m != ICONV_T_INVALID )
2160 iconv_close(w2m);
2161 }
2162
2163 size_t
2164 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2165 const char *src, size_t srcLen) const
2166 {
2167 if ( srcLen == wxNO_LEN )
2168 {
2169 // find the string length: notice that must be done differently for
2170 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2171 // consecutive NULs
2172 const size_t nulLen = GetMBNulLen();
2173 switch ( nulLen )
2174 {
2175 default:
2176 return wxCONV_FAILED;
2177
2178 case 1:
2179 srcLen = strlen(src); // arguably more optimized than our version
2180 break;
2181
2182 case 2:
2183 case 4:
2184 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2185 // but they also have to start at character boundary and not
2186 // span two adjacent characters
2187 const char *p;
2188 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2189 ;
2190 srcLen = p - src;
2191 break;
2192 }
2193
2194 // when we're determining the length of the string ourselves we count
2195 // the terminating NUL(s) as part of it and always NUL-terminate the
2196 // output
2197 srcLen += nulLen;
2198 }
2199
2200 // we express length in the number of (wide) characters but iconv always
2201 // counts buffer sizes it in bytes
2202 dstLen *= SIZEOF_WCHAR_T;
2203
2204 #if wxUSE_THREADS
2205 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2206 // Unfortunately there are a couple of global wxCSConv objects such as
2207 // wxConvLocal that are used all over wx code, so we have to make sure
2208 // the handle is used by at most one thread at the time. Otherwise
2209 // only a few wx classes would be safe to use from non-main threads
2210 // as MB<->WC conversion would fail "randomly".
2211 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2212 #endif // wxUSE_THREADS
2213
2214 size_t res, cres;
2215 const char *pszPtr = src;
2216
2217 if ( dst )
2218 {
2219 char* bufPtr = (char*)dst;
2220
2221 // have destination buffer, convert there
2222 size_t dstLenOrig = dstLen;
2223 cres = iconv(m2w,
2224 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2225 &bufPtr, &dstLen);
2226
2227 // convert the number of bytes converted as returned by iconv to the
2228 // number of (wide) characters converted that we need
2229 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2230
2231 if (ms_wcNeedsSwap)
2232 {
2233 // convert to native endianness
2234 for ( unsigned i = 0; i < res; i++ )
2235 dst[i] = WC_BSWAP(dst[i]);
2236 }
2237 }
2238 else // no destination buffer
2239 {
2240 // convert using temp buffer to calculate the size of the buffer needed
2241 wchar_t tbuf[8];
2242 res = 0;
2243
2244 do
2245 {
2246 char* bufPtr = (char*)tbuf;
2247 dstLen = 8 * SIZEOF_WCHAR_T;
2248
2249 cres = iconv(m2w,
2250 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2251 &bufPtr, &dstLen );
2252
2253 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2254 }
2255 while ((cres == (size_t)-1) && (errno == E2BIG));
2256 }
2257
2258 if (ICONV_FAILED(cres, srcLen))
2259 {
2260 //VS: it is ok if iconv fails, hence trace only
2261 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2262 return wxCONV_FAILED;
2263 }
2264
2265 return res;
2266 }
2267
2268 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2269 const wchar_t *src, size_t srcLen) const
2270 {
2271 #if wxUSE_THREADS
2272 // NB: explained in MB2WC
2273 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2274 #endif
2275
2276 if ( srcLen == wxNO_LEN )
2277 srcLen = wxWcslen(src) + 1;
2278
2279 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2280 size_t outbuflen = dstLen;
2281 size_t res, cres;
2282
2283 wchar_t *tmpbuf = 0;
2284
2285 if (ms_wcNeedsSwap)
2286 {
2287 // need to copy to temp buffer to switch endianness
2288 // (doing WC_BSWAP twice on the original buffer won't help, as it
2289 // could be in read-only memory, or be accessed in some other thread)
2290 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2291 for ( size_t i = 0; i < srcLen; i++ )
2292 tmpbuf[i] = WC_BSWAP(src[i]);
2293
2294 tmpbuf[srcLen] = L'\0';
2295 src = tmpbuf;
2296 }
2297
2298 char* inbuf = (char*)src;
2299 if ( dst )
2300 {
2301 // have destination buffer, convert there
2302 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2303
2304 res = dstLen - outbuflen;
2305 }
2306 else // no destination buffer
2307 {
2308 // convert using temp buffer to calculate the size of the buffer needed
2309 char tbuf[16];
2310 res = 0;
2311 do
2312 {
2313 dst = tbuf;
2314 outbuflen = 16;
2315
2316 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2317
2318 res += 16 - outbuflen;
2319 }
2320 while ((cres == (size_t)-1) && (errno == E2BIG));
2321 }
2322
2323 if (ms_wcNeedsSwap)
2324 {
2325 free(tmpbuf);
2326 }
2327
2328 if (ICONV_FAILED(cres, inbuflen))
2329 {
2330 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2331 return wxCONV_FAILED;
2332 }
2333
2334 return res;
2335 }
2336
2337 size_t wxMBConv_iconv::GetMBNulLen() const
2338 {
2339 if ( m_minMBCharWidth == 0 )
2340 {
2341 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2342
2343 #if wxUSE_THREADS
2344 // NB: explained in MB2WC
2345 wxMutexLocker lock(self->m_iconvMutex);
2346 #endif
2347
2348 const wchar_t *wnul = L"";
2349 char buf[8]; // should be enough for NUL in any encoding
2350 size_t inLen = sizeof(wchar_t),
2351 outLen = WXSIZEOF(buf);
2352 char *inBuff = (char *)wnul;
2353 char *outBuff = buf;
2354 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2355 {
2356 self->m_minMBCharWidth = (size_t)-1;
2357 }
2358 else // ok
2359 {
2360 self->m_minMBCharWidth = outBuff - buf;
2361 }
2362 }
2363
2364 return m_minMBCharWidth;
2365 }
2366
2367 #if wxUSE_UNICODE_UTF8
2368 bool wxMBConv_iconv::IsUTF8() const
2369 {
2370 return wxStricmp(m_name, "UTF-8") == 0 ||
2371 wxStricmp(m_name, "UTF8") == 0;
2372 }
2373 #endif
2374
2375 #endif // HAVE_ICONV
2376
2377
2378 // ============================================================================
2379 // Win32 conversion classes
2380 // ============================================================================
2381
2382 #ifdef wxHAVE_WIN32_MB2WC
2383
2384 // from utils.cpp
2385 #if wxUSE_FONTMAP
2386 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2387 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2388 #endif
2389
2390 class wxMBConv_win32 : public wxMBConv
2391 {
2392 public:
2393 wxMBConv_win32()
2394 {
2395 m_CodePage = CP_ACP;
2396 m_minMBCharWidth = 0;
2397 }
2398
2399 wxMBConv_win32(const wxMBConv_win32& conv)
2400 : wxMBConv()
2401 {
2402 m_CodePage = conv.m_CodePage;
2403 m_minMBCharWidth = conv.m_minMBCharWidth;
2404 }
2405
2406 #if wxUSE_FONTMAP
2407 wxMBConv_win32(const char* name)
2408 {
2409 m_CodePage = wxCharsetToCodepage(name);
2410 m_minMBCharWidth = 0;
2411 }
2412
2413 wxMBConv_win32(wxFontEncoding encoding)
2414 {
2415 m_CodePage = wxEncodingToCodepage(encoding);
2416 m_minMBCharWidth = 0;
2417 }
2418 #endif // wxUSE_FONTMAP
2419
2420 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2421 {
2422 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2423 // the behaviour is not compatible with the Unix version (using iconv)
2424 // and break the library itself, e.g. wxTextInputStream::NextChar()
2425 // wouldn't work if reading an incomplete MB char didn't result in an
2426 // error
2427 //
2428 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2429 // Win XP or newer and it is not supported for UTF-[78] so we always
2430 // use our own conversions in this case. See
2431 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2432 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2433 if ( m_CodePage == CP_UTF8 )
2434 {
2435 return wxMBConvUTF8().MB2WC(buf, psz, n);
2436 }
2437
2438 if ( m_CodePage == CP_UTF7 )
2439 {
2440 return wxMBConvUTF7().MB2WC(buf, psz, n);
2441 }
2442
2443 int flags = 0;
2444 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2445 IsAtLeastWin2kSP4() )
2446 {
2447 flags = MB_ERR_INVALID_CHARS;
2448 }
2449
2450 const size_t len = ::MultiByteToWideChar
2451 (
2452 m_CodePage, // code page
2453 flags, // flags: fall on error
2454 psz, // input string
2455 -1, // its length (NUL-terminated)
2456 buf, // output string
2457 buf ? n : 0 // size of output buffer
2458 );
2459 if ( !len )
2460 {
2461 // function totally failed
2462 return wxCONV_FAILED;
2463 }
2464
2465 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2466 // check if we succeeded, by doing a double trip:
2467 if ( !flags && buf )
2468 {
2469 const size_t mbLen = strlen(psz);
2470 wxCharBuffer mbBuf(mbLen);
2471 if ( ::WideCharToMultiByte
2472 (
2473 m_CodePage,
2474 0,
2475 buf,
2476 -1,
2477 mbBuf.data(),
2478 mbLen + 1, // size in bytes, not length
2479 NULL,
2480 NULL
2481 ) == 0 ||
2482 strcmp(mbBuf, psz) != 0 )
2483 {
2484 // we didn't obtain the same thing we started from, hence
2485 // the conversion was lossy and we consider that it failed
2486 return wxCONV_FAILED;
2487 }
2488 }
2489
2490 // note that it returns count of written chars for buf != NULL and size
2491 // of the needed buffer for buf == NULL so in either case the length of
2492 // the string (which never includes the terminating NUL) is one less
2493 return len - 1;
2494 }
2495
2496 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2497 {
2498 /*
2499 we have a problem here: by default, WideCharToMultiByte() may
2500 replace characters unrepresentable in the target code page with bad
2501 quality approximations such as turning "1/2" symbol (U+00BD) into
2502 "1" for the code pages which don't have it and we, obviously, want
2503 to avoid this at any price
2504
2505 the trouble is that this function does it _silently_, i.e. it won't
2506 even tell us whether it did or not... Win98/2000 and higher provide
2507 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2508 we have to resort to a round trip, i.e. check that converting back
2509 results in the same string -- this is, of course, expensive but
2510 otherwise we simply can't be sure to not garble the data.
2511 */
2512
2513 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2514 // it doesn't work with CJK encodings (which we test for rather roughly
2515 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2516 // supporting it
2517 BOOL usedDef wxDUMMY_INITIALIZE(false);
2518 BOOL *pUsedDef;
2519 int flags;
2520 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2521 {
2522 // it's our lucky day
2523 flags = WC_NO_BEST_FIT_CHARS;
2524 pUsedDef = &usedDef;
2525 }
2526 else // old system or unsupported encoding
2527 {
2528 flags = 0;
2529 pUsedDef = NULL;
2530 }
2531
2532 const size_t len = ::WideCharToMultiByte
2533 (
2534 m_CodePage, // code page
2535 flags, // either none or no best fit
2536 pwz, // input string
2537 -1, // it is (wide) NUL-terminated
2538 buf, // output buffer
2539 buf ? n : 0, // and its size
2540 NULL, // default "replacement" char
2541 pUsedDef // [out] was it used?
2542 );
2543
2544 if ( !len )
2545 {
2546 // function totally failed
2547 return wxCONV_FAILED;
2548 }
2549
2550 // we did something, check if we really succeeded
2551 if ( flags )
2552 {
2553 // check if the conversion failed, i.e. if any replacements
2554 // were done
2555 if ( usedDef )
2556 return wxCONV_FAILED;
2557 }
2558 else // we must resort to double tripping...
2559 {
2560 // first we need to ensure that we really have the MB data: this is
2561 // not the case if we're called with NULL buffer, in which case we
2562 // need to do the conversion yet again
2563 wxCharBuffer bufDef;
2564 if ( !buf )
2565 {
2566 bufDef = wxCharBuffer(len);
2567 buf = bufDef.data();
2568 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2569 buf, len, NULL, NULL) )
2570 return wxCONV_FAILED;
2571 }
2572
2573 if ( !n )
2574 n = wcslen(pwz);
2575 wxWCharBuffer wcBuf(n);
2576 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2577 wcscmp(wcBuf, pwz) != 0 )
2578 {
2579 // we didn't obtain the same thing we started from, hence
2580 // the conversion was lossy and we consider that it failed
2581 return wxCONV_FAILED;
2582 }
2583 }
2584
2585 // see the comment above for the reason of "len - 1"
2586 return len - 1;
2587 }
2588
2589 virtual size_t GetMBNulLen() const
2590 {
2591 if ( m_minMBCharWidth == 0 )
2592 {
2593 int len = ::WideCharToMultiByte
2594 (
2595 m_CodePage, // code page
2596 0, // no flags
2597 L"", // input string
2598 1, // translate just the NUL
2599 NULL, // output buffer
2600 0, // and its size
2601 NULL, // no replacement char
2602 NULL // [out] don't care if it was used
2603 );
2604
2605 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2606 switch ( len )
2607 {
2608 default:
2609 wxLogDebug(_T("Unexpected NUL length %d"), len);
2610 self->m_minMBCharWidth = (size_t)-1;
2611 break;
2612
2613 case 0:
2614 self->m_minMBCharWidth = (size_t)-1;
2615 break;
2616
2617 case 1:
2618 case 2:
2619 case 4:
2620 self->m_minMBCharWidth = len;
2621 break;
2622 }
2623 }
2624
2625 return m_minMBCharWidth;
2626 }
2627
2628 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2629
2630 bool IsOk() const { return m_CodePage != -1; }
2631
2632 private:
2633 static bool CanUseNoBestFit()
2634 {
2635 static int s_isWin98Or2k = -1;
2636
2637 if ( s_isWin98Or2k == -1 )
2638 {
2639 int verMaj, verMin;
2640 switch ( wxGetOsVersion(&verMaj, &verMin) )
2641 {
2642 case wxOS_WINDOWS_9X:
2643 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2644 break;
2645
2646 case wxOS_WINDOWS_NT:
2647 s_isWin98Or2k = verMaj >= 5;
2648 break;
2649
2650 default:
2651 // unknown: be conservative by default
2652 s_isWin98Or2k = 0;
2653 break;
2654 }
2655
2656 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2657 }
2658
2659 return s_isWin98Or2k == 1;
2660 }
2661
2662 static bool IsAtLeastWin2kSP4()
2663 {
2664 #ifdef __WXWINCE__
2665 return false;
2666 #else
2667 static int s_isAtLeastWin2kSP4 = -1;
2668
2669 if ( s_isAtLeastWin2kSP4 == -1 )
2670 {
2671 OSVERSIONINFOEX ver;
2672
2673 memset(&ver, 0, sizeof(ver));
2674 ver.dwOSVersionInfoSize = sizeof(ver);
2675 GetVersionEx((OSVERSIONINFO*)&ver);
2676
2677 s_isAtLeastWin2kSP4 =
2678 ((ver.dwMajorVersion > 5) || // Vista+
2679 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2680 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2681 ver.wServicePackMajor >= 4)) // 2000 SP4+
2682 ? 1 : 0;
2683 }
2684
2685 return s_isAtLeastWin2kSP4 == 1;
2686 #endif
2687 }
2688
2689
2690 // the code page we're working with
2691 long m_CodePage;
2692
2693 // cached result of GetMBNulLen(), set to 0 initially meaning
2694 // "unknown"
2695 size_t m_minMBCharWidth;
2696 };
2697
2698 #endif // wxHAVE_WIN32_MB2WC
2699
2700
2701 // ============================================================================
2702 // wxEncodingConverter based conversion classes
2703 // ============================================================================
2704
2705 #if wxUSE_FONTMAP
2706
2707 class wxMBConv_wxwin : public wxMBConv
2708 {
2709 private:
2710 void Init()
2711 {
2712 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2713 // The wxMBConv_cf class does a better job.
2714 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2715 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2716 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2717 }
2718
2719 public:
2720 // temporarily just use wxEncodingConverter stuff,
2721 // so that it works while a better implementation is built
2722 wxMBConv_wxwin(const char* name)
2723 {
2724 if (name)
2725 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2726 else
2727 m_enc = wxFONTENCODING_SYSTEM;
2728
2729 Init();
2730 }
2731
2732 wxMBConv_wxwin(wxFontEncoding enc)
2733 {
2734 m_enc = enc;
2735
2736 Init();
2737 }
2738
2739 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2740 {
2741 size_t inbuf = strlen(psz);
2742 if (buf)
2743 {
2744 if (!m2w.Convert(psz, buf))
2745 return wxCONV_FAILED;
2746 }
2747 return inbuf;
2748 }
2749
2750 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2751 {
2752 const size_t inbuf = wxWcslen(psz);
2753 if (buf)
2754 {
2755 if (!w2m.Convert(psz, buf))
2756 return wxCONV_FAILED;
2757 }
2758
2759 return inbuf;
2760 }
2761
2762 virtual size_t GetMBNulLen() const
2763 {
2764 switch ( m_enc )
2765 {
2766 case wxFONTENCODING_UTF16BE:
2767 case wxFONTENCODING_UTF16LE:
2768 return 2;
2769
2770 case wxFONTENCODING_UTF32BE:
2771 case wxFONTENCODING_UTF32LE:
2772 return 4;
2773
2774 default:
2775 return 1;
2776 }
2777 }
2778
2779 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2780
2781 bool IsOk() const { return m_ok; }
2782
2783 public:
2784 wxFontEncoding m_enc;
2785 wxEncodingConverter m2w, w2m;
2786
2787 private:
2788 // were we initialized successfully?
2789 bool m_ok;
2790
2791 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2792 };
2793
2794 // make the constructors available for unit testing
2795 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2796 {
2797 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2798 if ( !result->IsOk() )
2799 {
2800 delete result;
2801 return 0;
2802 }
2803
2804 return result;
2805 }
2806
2807 #endif // wxUSE_FONTMAP
2808
2809 // ============================================================================
2810 // wxCSConv implementation
2811 // ============================================================================
2812
2813 void wxCSConv::Init()
2814 {
2815 m_name = NULL;
2816 m_convReal = NULL;
2817 m_deferred = true;
2818 }
2819
2820 wxCSConv::wxCSConv(const wxString& charset)
2821 {
2822 Init();
2823
2824 if ( !charset.empty() )
2825 {
2826 SetName(charset.ToAscii());
2827 }
2828
2829 #if wxUSE_FONTMAP
2830 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2831 #else
2832 m_encoding = wxFONTENCODING_SYSTEM;
2833 #endif
2834 }
2835
2836 wxCSConv::wxCSConv(wxFontEncoding encoding)
2837 {
2838 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2839 {
2840 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2841
2842 encoding = wxFONTENCODING_SYSTEM;
2843 }
2844
2845 Init();
2846
2847 m_encoding = encoding;
2848 }
2849
2850 wxCSConv::~wxCSConv()
2851 {
2852 Clear();
2853 }
2854
2855 wxCSConv::wxCSConv(const wxCSConv& conv)
2856 : wxMBConv()
2857 {
2858 Init();
2859
2860 SetName(conv.m_name);
2861 m_encoding = conv.m_encoding;
2862 }
2863
2864 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2865 {
2866 Clear();
2867
2868 SetName(conv.m_name);
2869 m_encoding = conv.m_encoding;
2870
2871 return *this;
2872 }
2873
2874 void wxCSConv::Clear()
2875 {
2876 free(m_name);
2877 delete m_convReal;
2878
2879 m_name = NULL;
2880 m_convReal = NULL;
2881 }
2882
2883 void wxCSConv::SetName(const char *charset)
2884 {
2885 if (charset)
2886 {
2887 m_name = wxStrdup(charset);
2888 m_deferred = true;
2889 }
2890 }
2891
#if wxUSE_FONTMAP

// Cache used by wxCSConv::DoCreate(): for an encoding, it stores the name
// which was successfully used to create the iconv-based conversion for it or
// an empty string if none of the names of this encoding worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString,
                     wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2899
2900 wxMBConv *wxCSConv::DoCreate() const
2901 {
2902 #if wxUSE_FONTMAP
2903 wxLogTrace(TRACE_STRCONV,
2904 wxT("creating conversion for %s"),
2905 (m_name ? m_name
2906 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2907 #endif // wxUSE_FONTMAP
2908
2909 // check for the special case of ASCII or ISO8859-1 charset: as we have
2910 // special knowledge of it anyhow, we don't need to create a special
2911 // conversion object
2912 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2913 m_encoding == wxFONTENCODING_DEFAULT )
2914 {
2915 // don't convert at all
2916 return NULL;
2917 }
2918
2919 // we trust OS to do conversion better than we can so try external
2920 // conversion methods first
2921 //
2922 // the full order is:
2923 // 1. OS conversion (iconv() under Unix or Win32 API)
2924 // 2. hard coded conversions for UTF
2925 // 3. wxEncodingConverter as fall back
2926
2927 // step (1)
2928 #ifdef HAVE_ICONV
2929 #if !wxUSE_FONTMAP
2930 if ( m_name )
2931 #endif // !wxUSE_FONTMAP
2932 {
2933 #if wxUSE_FONTMAP
2934 wxFontEncoding encoding(m_encoding);
2935 #endif
2936
2937 if ( m_name )
2938 {
2939 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2940 if ( conv->IsOk() )
2941 return conv;
2942
2943 delete conv;
2944
2945 #if wxUSE_FONTMAP
2946 encoding =
2947 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2948 #endif // wxUSE_FONTMAP
2949 }
2950 #if wxUSE_FONTMAP
2951 {
2952 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2953 if ( it != gs_nameCache.end() )
2954 {
2955 if ( it->second.empty() )
2956 return NULL;
2957
2958 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2959 if ( conv->IsOk() )
2960 return conv;
2961
2962 delete conv;
2963 }
2964
2965 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2966 // CS : in case this does not return valid names (eg for MacRoman)
2967 // encoding got a 'failure' entry in the cache all the same,
2968 // although it just has to be created using a different method, so
2969 // only store failed iconv creation attempts (or perhaps we
2970 // shoulnd't do this at all ?)
2971 if ( names[0] != NULL )
2972 {
2973 for ( ; *names; ++names )
2974 {
2975 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2976 // will need changes that will obsolete this
2977 wxString name(*names);
2978 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2979 if ( conv->IsOk() )
2980 {
2981 gs_nameCache[encoding] = *names;
2982 return conv;
2983 }
2984
2985 delete conv;
2986 }
2987
2988 gs_nameCache[encoding] = _T(""); // cache the failure
2989 }
2990 }
2991 #endif // wxUSE_FONTMAP
2992 }
2993 #endif // HAVE_ICONV
2994
2995 #ifdef wxHAVE_WIN32_MB2WC
2996 {
2997 #if wxUSE_FONTMAP
2998 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2999 : new wxMBConv_win32(m_encoding);
3000 if ( conv->IsOk() )
3001 return conv;
3002
3003 delete conv;
3004 #else
3005 return NULL;
3006 #endif
3007 }
3008 #endif // wxHAVE_WIN32_MB2WC
3009
3010 #ifdef __DARWIN__
3011 {
3012 // leave UTF16 and UTF32 to the built-ins of wx
3013 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3014 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3015 {
3016 #if wxUSE_FONTMAP
3017 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3018 : new wxMBConv_cf(m_encoding);
3019 #else
3020 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3021 #endif
3022
3023 if ( conv->IsOk() )
3024 return conv;
3025
3026 delete conv;
3027 }
3028 }
3029 #endif // __DARWIN__
3030
3031 // step (2)
3032 wxFontEncoding enc = m_encoding;
3033 #if wxUSE_FONTMAP
3034 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3035 {
3036 // use "false" to suppress interactive dialogs -- we can be called from
3037 // anywhere and popping up a dialog from here is the last thing we want to
3038 // do
3039 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3040 }
3041 #endif // wxUSE_FONTMAP
3042
3043 switch ( enc )
3044 {
3045 case wxFONTENCODING_UTF7:
3046 return new wxMBConvUTF7;
3047
3048 case wxFONTENCODING_UTF8:
3049 return new wxMBConvUTF8;
3050
3051 case wxFONTENCODING_UTF16BE:
3052 return new wxMBConvUTF16BE;
3053
3054 case wxFONTENCODING_UTF16LE:
3055 return new wxMBConvUTF16LE;
3056
3057 case wxFONTENCODING_UTF32BE:
3058 return new wxMBConvUTF32BE;
3059
3060 case wxFONTENCODING_UTF32LE:
3061 return new wxMBConvUTF32LE;
3062
3063 default:
3064 // nothing to do but put here to suppress gcc warnings
3065 break;
3066 }
3067
3068 // step (3)
3069 #if wxUSE_FONTMAP
3070 {
3071 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3072 : new wxMBConv_wxwin(m_encoding);
3073 if ( conv->IsOk() )
3074 return conv;
3075
3076 delete conv;
3077 }
3078 #endif // wxUSE_FONTMAP
3079
3080 // NB: This is a hack to prevent deadlock. What could otherwise happen
3081 // in Unicode build: wxConvLocal creation ends up being here
3082 // because of some failure and logs the error. But wxLog will try to
3083 // attach a timestamp, for which it will need wxConvLocal (to convert
3084 // time to char* and then wchar_t*), but that fails, tries to log the
3085 // error, but wxLog has an (already locked) critical section that
3086 // guards the static buffer.
3087 static bool alreadyLoggingError = false;
3088 if (!alreadyLoggingError)
3089 {
3090 alreadyLoggingError = true;
3091 wxLogError(_("Cannot convert from the charset '%s'!"),
3092 m_name ? m_name
3093 :
3094 #if wxUSE_FONTMAP
3095 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3096 #else // !wxUSE_FONTMAP
3097 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3098 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3099 );
3100
3101 alreadyLoggingError = false;
3102 }
3103
3104 return NULL;
3105 }
3106
3107 void wxCSConv::CreateConvIfNeeded() const
3108 {
3109 if ( m_deferred )
3110 {
3111 wxCSConv *self = (wxCSConv *)this; // const_cast
3112
3113 // if we don't have neither the name nor the encoding, use the default
3114 // encoding for this system
3115 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3116 {
3117 #if wxUSE_INTL
3118 self->m_encoding = wxLocale::GetSystemEncoding();
3119 #else
3120 // fallback to some reasonable default:
3121 self->m_encoding = wxFONTENCODING_ISO8859_1;
3122 #endif // wxUSE_INTL
3123 }
3124
3125 self->m_convReal = DoCreate();
3126 self->m_deferred = false;
3127 }
3128 }
3129
3130 bool wxCSConv::IsOk() const
3131 {
3132 CreateConvIfNeeded();
3133
3134 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3135 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3136 return true; // always ok as we do it ourselves
3137
3138 // m_convReal->IsOk() is called at its own creation, so we know it must
3139 // be ok if m_convReal is non-NULL
3140 return m_convReal != NULL;
3141 }
3142
3143 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3144 const char *src, size_t srcLen) const
3145 {
3146 CreateConvIfNeeded();
3147
3148 if (m_convReal)
3149 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3150
3151 // latin-1 (direct)
3152 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3153 }
3154
3155 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3156 const wchar_t *src, size_t srcLen) const
3157 {
3158 CreateConvIfNeeded();
3159
3160 if (m_convReal)
3161 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3162
3163 // latin-1 (direct)
3164 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3165 }
3166
3167 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3168 {
3169 CreateConvIfNeeded();
3170
3171 if (m_convReal)
3172 return m_convReal->MB2WC(buf, psz, n);
3173
3174 // latin-1 (direct)
3175 size_t len = strlen(psz);
3176
3177 if (buf)
3178 {
3179 for (size_t c = 0; c <= len; c++)
3180 buf[c] = (unsigned char)(psz[c]);
3181 }
3182
3183 return len;
3184 }
3185
3186 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3187 {
3188 CreateConvIfNeeded();
3189
3190 if (m_convReal)
3191 return m_convReal->WC2MB(buf, psz, n);
3192
3193 // latin-1 (direct)
3194 const size_t len = wxWcslen(psz);
3195 if (buf)
3196 {
3197 for (size_t c = 0; c <= len; c++)
3198 {
3199 if (psz[c] > 0xFF)
3200 return wxCONV_FAILED;
3201
3202 buf[c] = (char)psz[c];
3203 }
3204 }
3205 else
3206 {
3207 for (size_t c = 0; c <= len; c++)
3208 {
3209 if (psz[c] > 0xFF)
3210 return wxCONV_FAILED;
3211 }
3212 }
3213
3214 return len;
3215 }
3216
3217 size_t wxCSConv::GetMBNulLen() const
3218 {
3219 CreateConvIfNeeded();
3220
3221 if ( m_convReal )
3222 {
3223 return m_convReal->GetMBNulLen();
3224 }
3225
3226 // otherwise, we are ISO-8859-1
3227 return 1;
3228 }
3229
3230 #if wxUSE_UNICODE_UTF8
3231 bool wxCSConv::IsUTF8() const
3232 {
3233 CreateConvIfNeeded();
3234
3235 if ( m_convReal )
3236 {
3237 return m_convReal->IsUTF8();
3238 }
3239
3240 // otherwise, we are ISO-8859-1
3241 return false;
3242 }
3243 #endif
3244
3245
3246 #if wxUSE_UNICODE
3247
3248 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3249 {
3250 if ( !s )
3251 return wxWCharBuffer();
3252
3253 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3254 if ( !wbuf )
3255 wbuf = wxMBConvUTF8().cMB2WX(s);
3256 if ( !wbuf )
3257 wbuf = wxConvISO8859_1.cMB2WX(s);
3258
3259 return wbuf;
3260 }
3261
3262 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3263 {
3264 if ( !ws )
3265 return wxCharBuffer();
3266
3267 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3268 if ( !buf )
3269 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3270
3271 return buf;
3272 }
3273
3274 #endif // wxUSE_UNICODE
3275
3276 // ----------------------------------------------------------------------------
3277 // globals
3278 // ----------------------------------------------------------------------------
3279
3280 // NB: The reason why we create converted objects in this convoluted way,
3281 // using a factory function instead of global variable, is that they
3282 // may be used at static initialization time (some of them are used by
3283 // wxString ctors and there may be a global wxString object). In other
3284 // words, possibly _before_ the converter global object would be
3285 // initialized.
3286
3287 #undef wxConvLibc
3288 #undef wxConvUTF8
3289 #undef wxConvUTF7
3290 #undef wxConvLocal
3291 #undef wxConvISO8859_1
3292
3293 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3294 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3295 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3296 { \
3297 static impl_klass name##Obj ctor_args; \
3298 return &name##Obj; \
3299 } \
3300 /* this ensures that all global converter objects are created */ \
3301 /* by the time static initialization is done, i.e. before any */ \
3302 /* thread is launched: */ \
3303 static klass* gs_##name##instance = wxGet_##name##Ptr()
3304
3305 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3306 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3307
3308 #ifdef __WINDOWS__
3309 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3310 #else
3311 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3312 #endif
3313
3314 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3315 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3316 // provokes an error message about "not enough macro parameters"; and we
3317 // can't use "()" here as the name##Obj declaration would be parsed as a
3318 // function declaration then, so use a semicolon and live with an extra
3319 // empty statement (and hope that no compilers warns about this)
3320 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3321 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3322
3323 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3324 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3325
3326 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3327 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3328
3329 #ifdef __DARWIN__
3330 // The xnu kernel always communicates file paths in decomposed UTF-8.
3331 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3332 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3333 #endif
3334
3335 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3336 #ifdef __DARWIN__
3337 &wxConvMacUTF8DObj;
3338 #else // !__DARWIN__
3339 wxGet_wxConvLibcPtr();
3340 #endif // __DARWIN__/!__DARWIN__
3341
3342 #else // !wxUSE_WCHAR_T
3343
3344 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3345 // stand-ins in absence of wchar_t
3346 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3347 wxConvISO8859_1,
3348 wxConvLocal,
3349 wxConvUTF8;
3350
3351 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T