]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Add ribbon libraries sources to the monolithic build.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV wxT("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
// helper used by wxMBConv::ToWChar(): returns true if at least one of the n
// bytes starting at p is not NUL (and hence false if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
// Default implementation of ToWChar() in terms of the legacy MB2WC().
//
// Converts srcLen bytes from src (or the entire NUL-terminated string if
// srcLen is wxNO_LEN) one NUL-terminated chunk at a time. Returns the number
// of wide characters written to dst, or which would be written to it if dst
// is NULL, or wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if dstLen is
// too small for the result.
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called for the entire NUL-terminated string and when it's called with a
219 // fixed number of characters: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, including the trailing NUL; but in
221 // the latter we do not count the trailing NUL -- but still count all the
222 // NULs inside the string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
227 for ( ;; )
228 {
229 // try to convert the current chunk
230 size_t lenChunk = MB2WC(NULL, src, 0);
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
233
234 dstWritten += lenChunk;
235 if ( !srcEnd )
236 dstWritten++;
237
238 if ( !lenChunk )
239 {
240 // nothing left in the input string, conversion succeeded
// NOTE(review): this also stops at the first empty chunk of a fixed
// length input, i.e. at 2 consecutive terminators -- confirm that
// this is intended
241 break;
242 }
243
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
254 if ( !srcEnd )
255 dst++;
256 }
257
258 if ( !srcEnd )
259 {
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow
262 break;
263 }
264
265 // advance the input pointer past the end of this chunk
266 while ( NotAllNULs(src, nulLen) )
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
272 src += nulLen;
273 }
274
275 src += nulLen; // skipping over its terminator as well
276
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
279 // delimited by srcEnd
280 if ( src >= srcEnd )
281 break;
282
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
// (srcEnd is always non-NULL here as the loop is left above otherwise)
286 if ( srcEnd )
287 {
288 dstWritten++;
289 if ( dst )
290 dst++;
291 }
292 }
293
294 return dstWritten;
295 }
296
297 size_t
298 wxMBConv::FromWChar(char *dst, size_t dstLen,
299 const wchar_t *src, size_t srcLen) const
300 {
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten = 0;
303
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated = srcLen == wxNO_LEN;
308
309 // make a copy of the input string unless it is already properly
310 // NUL-terminated
311 wxWCharBuffer bufTmp;
312 if ( isNulTerminated )
313 {
314 srcLen = wxWcslen(src) + 1;
315 }
316 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
317 {
318 // make a copy in order to properly NUL-terminate the string
319 bufTmp = wxWCharBuffer(srcLen);
320 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
321 src = bufTmp;
322 }
323
324 const size_t lenNul = GetMBNulLen();
325 for ( const wchar_t * const srcEnd = src + srcLen;
326 src < srcEnd;
327 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
328 {
329 // try to convert the current chunk
330 size_t lenChunk = WC2MB(NULL, src, 0);
331
332 if ( lenChunk == wxCONV_FAILED )
333 return wxCONV_FAILED;
334
335 dstWritten += lenChunk;
336 if ( src+lenChunk < srcEnd || isNulTerminated )
337 dstWritten += lenNul;
338
339 if ( dst )
340 {
341 if ( dstWritten > dstLen )
342 return wxCONV_FAILED;
343
344 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
345 return wxCONV_FAILED;
346
347 dst += lenChunk;
348 if ( src+lenChunk < srcEnd || isNulTerminated )
349 dst += lenNul;
350 }
351 }
352
353 return dstWritten;
354 }
355
356 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
357 {
358 size_t rc = ToWChar(outBuff, outLen, inBuff);
359 if ( rc != wxCONV_FAILED )
360 {
361 // ToWChar() returns the buffer length, i.e. including the trailing
362 // NUL, while this method doesn't take it into account
363 rc--;
364 }
365
366 return rc;
367 }
368
369 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
370 {
371 size_t rc = FromWChar(outBuff, outLen, inBuff);
372 if ( rc != wxCONV_FAILED )
373 {
374 rc -= GetMBNulLen();
375 }
376
377 return rc;
378 }
379
// Destructor: intentionally empty.
380 wxMBConv::~wxMBConv()
381 {
382 // nothing to do here (defining it out of line is probably needed for Darwin linking)
383 }
384
385 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
386 {
387 if ( psz )
388 {
389 // calculate the length of the buffer needed first
390 const size_t nLen = ToWChar(NULL, 0, psz);
391 if ( nLen != wxCONV_FAILED )
392 {
393 // now do the actual conversion
394 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
395
396 // +1 for the trailing NULL
397 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
398 return buf;
399 }
400 }
401
402 return wxWCharBuffer();
403 }
404
405 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
406 {
407 if ( pwz )
408 {
409 const size_t nLen = FromWChar(NULL, 0, pwz);
410 if ( nLen != wxCONV_FAILED )
411 {
412 wxCharBuffer buf(nLen - 1);
413 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
414 return buf;
415 }
416 }
417
418 return wxCharBuffer();
419 }
420
421 const wxWCharBuffer
422 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
423 {
424 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
425 if ( dstLen != wxCONV_FAILED )
426 {
427 // notice that we allocate space for dstLen+1 wide characters here
428 // because we want the buffer to always be NUL-terminated, even if the
429 // input isn't (as otherwise the caller has no way to know its length)
430 wxWCharBuffer wbuf(dstLen);
431 wbuf.data()[dstLen] = L'\0';
432 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
433 {
434 if ( outLen )
435 {
436 *outLen = dstLen;
437
438 // we also need to handle NUL-terminated input strings
439 // specially: for them the output is the length of the string
440 // excluding the trailing NUL, however if we're asked to
441 // convert a specific number of characters we return the length
442 // of the resulting output even if it's NUL-terminated
443 if ( inLen == wxNO_LEN )
444 (*outLen)--;
445 }
446
447 return wbuf;
448 }
449 }
450
451 if ( outLen )
452 *outLen = 0;
453
454 return wxWCharBuffer();
455 }
456
457 const wxCharBuffer
458 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
459 {
460 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
461 if ( dstLen != wxCONV_FAILED )
462 {
463 const size_t nulLen = GetMBNulLen();
464
465 // as above, ensure that the buffer is always NUL-terminated, even if
466 // the input is not
467 wxCharBuffer buf(dstLen + nulLen - 1);
468 memset(buf.data() + dstLen, 0, nulLen);
469 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
470 {
471 if ( outLen )
472 {
473 *outLen = dstLen;
474
475 if ( inLen == wxNO_LEN )
476 {
477 // in this case both input and output are NUL-terminated
478 // and we're not supposed to count NUL
479 *outLen -= nulLen;
480 }
481 }
482
483 return buf;
484 }
485 }
486
487 if ( outLen )
488 *outLen = 0;
489
490 return wxCharBuffer();
491 }
492
493 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
494 {
495 const size_t srcLen = buf.length();
496 if ( srcLen )
497 {
498 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
499 if ( dstLen != wxCONV_FAILED )
500 {
501 wxWCharBuffer wbuf(dstLen);
502 wbuf.data()[dstLen] = L'\0';
503 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
504 return wbuf;
505 }
506 }
507
508 return wxWCharBuffer();
509 }
510
511 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
512 {
513 const size_t srcLen = wbuf.length();
514 if ( srcLen )
515 {
516 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
517 if ( dstLen != wxCONV_FAILED )
518 {
519 wxCharBuffer buf(dstLen);
520 buf.data()[dstLen] = '\0';
521 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
522 return buf;
523 }
524 }
525
526 return wxCharBuffer();
527 }
528
529 // ----------------------------------------------------------------------------
530 // wxMBConvLibc
531 // ----------------------------------------------------------------------------
532
533 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
534 {
535 return wxMB2WC(buf, psz, n);
536 }
537
538 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
539 {
540 return wxWC2MB(buf, psz, n);
541 }
542
543 // ----------------------------------------------------------------------------
544 // wxConvBrokenFileNames
545 // ----------------------------------------------------------------------------
546
547 #ifdef __UNIX__
548
549 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
550 {
551 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
552 wxStricmp(charset, wxT("UTF8")) == 0 )
553 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
554 else
555 m_conv = new wxCSConv(charset);
556 }
557
558 #endif // __UNIX__
559
560 // ----------------------------------------------------------------------------
561 // UTF-7
562 // ----------------------------------------------------------------------------
563
564 // Implementation (C) 2004 Fredrik Roubert
565 //
566 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
567
568 //
569 // BASE64 decoding table
570 //
// It is indexed by the byte value and gives the 6 bit value encoded by this
// character ('A'..'Z' are 0x00..0x19, 'a'..'z' are 0x1a..0x33, '0'..'9' are
// 0x34..0x3d, '+' is 0x3e and '/' is 0x3f) or 0xff for all the bytes which
// are not part of the base64 alphabet.
571 static const unsigned char utf7unb64[] =
572 {
573 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
574 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
575 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
576 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
577 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
578 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
579 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
580 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
581 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
582 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
583 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
584 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
585 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
586 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
587 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
588 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
589 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
590 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
591 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
592 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
593 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
594 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
595 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
596 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
597 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
598 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
599 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
600 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
601 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
602 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
603 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
604 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
605 };
606
607 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
608 const char *src, size_t srcLen) const
609 {
610 DecoderState stateOrig,
611 *statePtr;
612 if ( srcLen == wxNO_LEN )
613 {
614 // convert the entire string, up to and including the trailing NUL
615 srcLen = strlen(src) + 1;
616
617 // when working on the entire strings we don't update nor use the shift
618 // state from the previous call
619 statePtr = &stateOrig;
620 }
621 else // when working with partial strings we do use the shift state
622 {
623 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
624
625 // also save the old state to be able to rollback to it on error
626 stateOrig = m_stateDecoder;
627 }
628
629 // but to simplify the code below we use this variable in both cases
630 DecoderState& state = *statePtr;
631
632
633 // number of characters [which would have been] written to dst [if it were
634 // not NULL]
635 size_t len = 0;
636
637 const char * const srcEnd = src + srcLen;
638
639 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
640 {
641 const unsigned char cc = *src++;
642
643 if ( state.IsShifted() )
644 {
645 const unsigned char dc = utf7unb64[cc];
646 if ( dc == 0xff )
647 {
648 // end of encoded part, check that nothing was left: there can
649 // be up to 4 bits of 0 padding but nothing else (we also need
650 // to check isLSB as we count bits modulo 8 while a valid UTF-7
651 // encoded sequence must contain an integral number of UTF-16
652 // characters)
653 if ( state.isLSB || state.bit > 4 ||
654 (state.accum & ((1 << state.bit) - 1)) )
655 {
656 if ( !len )
657 state = stateOrig;
658
659 return wxCONV_FAILED;
660 }
661
662 state.ToDirect();
663
664 // re-parse this character normally below unless it's '-' which
665 // is consumed by the decoder
666 if ( cc == '-' )
667 continue;
668 }
669 else // valid encoded character
670 {
671 // mini base64 decoder: each character is 6 bits
672 state.bit += 6;
673 state.accum <<= 6;
674 state.accum += dc;
675
676 if ( state.bit >= 8 )
677 {
678 // got the full byte, consume it
679 state.bit -= 8;
680 unsigned char b = (state.accum >> state.bit) & 0x00ff;
681
682 if ( state.isLSB )
683 {
684 // we've got the full word, output it
685 if ( dst )
686 *dst++ = (state.msb << 8) | b;
687 len++;
688 state.isLSB = false;
689 }
690 else // MSB
691 {
692 // just store it while we wait for LSB
693 state.msb = b;
694 state.isLSB = true;
695 }
696 }
697 }
698 }
699
700 if ( state.IsDirect() )
701 {
702 // start of an encoded segment?
703 if ( cc == '+' )
704 {
705 if ( *src == '-' )
706 {
707 // just the encoded plus sign, don't switch to shifted mode
708 if ( dst )
709 *dst++ = '+';
710 len++;
711 src++;
712 }
713 else if ( utf7unb64[(unsigned)*src] == 0xff )
714 {
715 // empty encoded chunks are not allowed
716 if ( !len )
717 state = stateOrig;
718
719 return wxCONV_FAILED;
720 }
721 else // base-64 encoded chunk follows
722 {
723 state.ToShifted();
724 }
725 }
726 else // not '+'
727 {
728 // only printable 7 bit ASCII characters (with the exception of
729 // NUL, TAB, CR and LF) can be used directly
730 if ( cc >= 0x7f || (cc < ' ' &&
731 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
732 return wxCONV_FAILED;
733
734 if ( dst )
735 *dst++ = cc;
736 len++;
737 }
738 }
739 }
740
741 if ( !len )
742 {
743 // as we didn't read any characters we should be called with the same
744 // data (followed by some more new data) again later so don't save our
745 // state
746 state = stateOrig;
747
748 return wxCONV_FAILED;
749 }
750
751 return len;
752 }
753
754 //
755 // BASE64 encoding table
756 //
// It is indexed by the 6 bit value (0..63) and gives the character used to
// represent it.
757 static const unsigned char utf7enb64[] =
758 {
759 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
760 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
761 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
762 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
763 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
764 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
765 'w', 'x', 'y', 'z', '0', '1', '2', '3',
766 '4', '5', '6', '7', '8', '9', '+', '/'
767 };
768
//
// UTF-7 encoding table
//
// It is indexed by the 7 bit character code and gives its class:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// returns true if the character belongs to the class 0 of the table above,
// i.e. is output as is by the encoder, and false for all the other ones,
// including any characters outside of the 7 bit range
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t can be a signed type, so convert it to unsigned before the
    // range check to ensure that negative values are rejected by it instead
    // of being used as (negative) indices into the table
    const unsigned long code = (unsigned long)wc;

    return code < 0x80 && utf7encode[code] == 0;
}
793
// Encodes wide characters in UTF-7.
//
// If srcLen is wxNO_LEN the entire NUL-terminated string, including its
// trailing NUL, is converted using a local encoder state. Otherwise exactly
// srcLen characters are processed and the shift state is read from and saved
// to m_stateEncoder.
//
// Returns the number of bytes written to dst, or which would be written to
// it if dst is NULL (the encoder state is restored to its original value in
// this case), or wxCONV_FAILED if a character greater than 0xffff is
// encountered in non-UTF-16 builds.
//
// NOTE(review): the loop condition only checks that there is space for one
// more byte in dst but several bytes can be written during one iteration, so
// dstLen should presumably be the exact size computed by a previous call with
// NULL dst -- confirm with the callers.
794 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
795 const wchar_t *src, size_t srcLen) const
796 {
797 EncoderState stateOrig,
798 *statePtr;
799 if ( srcLen == wxNO_LEN )
800 {
801 // we don't apply the stored state when operating on entire strings at
802 // once
803 statePtr = &stateOrig;
804
805 srcLen = wxWcslen(src) + 1;
806 }
807 else // do use the mode we left the output in previously
808 {
809 stateOrig = m_stateEncoder;
810 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
811 }
812
813 EncoderState& state = *statePtr;
814
815
// number of bytes [which would be] written to dst [if it were not NULL]
816 size_t len = 0;
817
818 const wchar_t * const srcEnd = src + srcLen;
819 while ( src < srcEnd && (!dst || len < dstLen) )
820 {
821 wchar_t cc = *src++;
822 if ( wxIsUTF7Direct(cc) )
823 {
824 if ( state.IsShifted() )
825 {
826 // pad with zeros the last encoded block if necessary
827 if ( state.bit )
828 {
829 if ( dst )
830 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
831 len++;
832 }
833
834 state.ToDirect();
835
// explicitly terminate the encoded sequence
836 if ( dst )
837 *dst++ = '-';
838 len++;
839 }
840
841 if ( dst )
842 *dst++ = (char)cc;
843 len++;
844 }
845 else if ( cc == '+' && state.IsDirect() )
846 {
// the plus sign itself is represented as "+-" in the direct mode
847 if ( dst )
848 {
849 *dst++ = '+';
850 *dst++ = '-';
851 }
852
853 len += 2;
854 }
855 #ifndef WC_UTF16
856 else if (((wxUint32)cc) > 0xffff)
857 {
858 // no surrogate pair generation (yet?)
859 return wxCONV_FAILED;
860 }
861 #endif
862 else
863 {
864 if ( state.IsDirect() )
865 {
866 state.ToShifted();
867
868 if ( dst )
869 *dst++ = '+';
870 len++;
871 }
872
873 // BASE64 encode string
874 for ( ;; )
875 {
// feed the character to the encoder one byte at a time, the most
// significant one first
876 for ( unsigned lsb = 0; lsb < 2; lsb++ )
877 {
878 state.accum <<= 8;
879 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
880
// and output all the complete groups of 6 bits we have so far
881 for (state.bit += 8; state.bit >= 6; )
882 {
883 state.bit -= 6;
884 if ( dst )
885 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
886 len++;
887 }
888 }
889
// continue encoding until the end of input or the next direct
// character, which is left to be handled by the outer loop
890 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
891 break;
892
893 src++;
894 }
895 }
896 }
897
898 // we need to restore the original encoder state if we were called just to
899 // calculate the amount of space needed as we will presumably be called
900 // again to really convert the data now
901 if ( !dst )
902 state = stateOrig;
903
904 return len;
905 }
906
907 // ----------------------------------------------------------------------------
908 // UTF-8
909 // ----------------------------------------------------------------------------
910
// utf8_max[n] is the maximal value with 7, 11, 16, 21, 26 or 31 significant
// bits for n from 0 to 5: wxMBConvUTF8::ToWChar() compares the decoded values
// with the elements of this table to detect illegal encodings
911 static const wxUint32 utf8_max[]=
912 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
913
914 // boundaries of the private use area we use to (temporarily) remap the
915 // characters invalid in a UTF-8 encoded string: 256 values starting from 0x100000
916 const wxUint32 wxUnicodePUA = 0x100000;
917 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
918
919 // this table gives the length of the UTF-8 encoding from its first character:
// (0 is used for the bytes which can't start a valid UTF-8 sequence)
920 const unsigned char tableUtf8Lengths[256] = {
921 // single-byte sequences (ASCII):
922 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
923 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
924 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
925 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
926 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
927 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
928 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
929 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
930
931 // these are invalid (continuation bytes and the leads of overlong forms):
932 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
933 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
934 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
935 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
936 0, 0, // C0,C1
937
938 // two-byte sequences:
939 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
940 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
941
942 // three-byte sequences:
943 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
944
945 // four-byte sequences:
946 4, 4, 4, 4, 4, // F0..F4
947
948 // these are invalid again (5- or 6-byte
949 // sequences and sequences for code points
950 // above U+10FFFF, as restricted by RFC 3629):
951 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
952 };
953
954 size_t
955 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
956 const char *src, size_t srcLen) const
957 {
958 wchar_t *out = dstLen ? dst : NULL;
959 size_t written = 0;
960
961 if ( srcLen == wxNO_LEN )
962 srcLen = strlen(src) + 1;
963
964 for ( const char *p = src; ; p++ )
965 {
966 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
967 {
968 // all done successfully, just add the trailing NULL if we are not
969 // using explicit length
970 if ( srcLen == wxNO_LEN )
971 {
972 if ( out )
973 {
974 if ( !dstLen )
975 break;
976
977 *out = L'\0';
978 }
979
980 written++;
981 }
982
983 return written;
984 }
985
986 if ( out && !dstLen-- )
987 break;
988
989 wxUint32 code;
990 unsigned char c = *p;
991
992 if ( c < 0x80 )
993 {
994 if ( srcLen == 0 ) // the test works for wxNO_LEN too
995 break;
996
997 if ( srcLen != wxNO_LEN )
998 srcLen--;
999
1000 code = c;
1001 }
1002 else
1003 {
1004 unsigned len = tableUtf8Lengths[c];
1005 if ( !len )
1006 break;
1007
1008 if ( srcLen < len ) // the test works for wxNO_LEN too
1009 break;
1010
1011 if ( srcLen != wxNO_LEN )
1012 srcLen -= len;
1013
1014 // Char. number range | UTF-8 octet sequence
1015 // (hexadecimal) | (binary)
1016 // ----------------------+----------------------------------------
1017 // 0000 0000 - 0000 007F | 0xxxxxxx
1018 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1019 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1020 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1021 //
1022 // Code point value is stored in bits marked with 'x',
1023 // lowest-order bit of the value on the right side in the diagram
1024 // above. (from RFC 3629)
1025
1026 // mask to extract lead byte's value ('x' bits above), by sequence
1027 // length:
1028 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1029
1030 // mask and value of lead byte's most significant bits, by length:
1031 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1032 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1033
1034 len--; // it's more convenient to work with 0-based length here
1035
1036 // extract the lead byte's value bits:
1037 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1038 break;
1039
1040 code = c & leadValueMask[len];
1041
1042 // all remaining bytes, if any, are handled in the same way
1043 // regardless of sequence's length:
1044 for ( ; len; --len )
1045 {
1046 c = *++p;
1047 if ( (c & 0xC0) != 0x80 )
1048 return wxCONV_FAILED;
1049
1050 code <<= 6;
1051 code |= c & 0x3F;
1052 }
1053 }
1054
1055 #ifdef WC_UTF16
1056 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1057 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1058 {
1059 if ( out )
1060 out++;
1061 written++;
1062 }
1063 #else // !WC_UTF16
1064 if ( out )
1065 *out = code;
1066 #endif // WC_UTF16/!WC_UTF16
1067
1068 if ( out )
1069 out++;
1070
1071 written++;
1072 }
1073
1074 return wxCONV_FAILED;
1075 }
1076
1077 size_t
1078 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1079 const wchar_t *src, size_t srcLen) const
1080 {
1081 char *out = dstLen ? dst : NULL;
1082 size_t written = 0;
1083
1084 for ( const wchar_t *wp = src; ; wp++ )
1085 {
1086 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1087 {
1088 // all done successfully, just add the trailing NULL if we are not
1089 // using explicit length
1090 if ( srcLen == wxNO_LEN )
1091 {
1092 if ( out )
1093 {
1094 if ( !dstLen )
1095 break;
1096
1097 *out = '\0';
1098 }
1099
1100 written++;
1101 }
1102
1103 return written;
1104 }
1105
1106 if ( srcLen != wxNO_LEN )
1107 srcLen--;
1108
1109 wxUint32 code;
1110 #ifdef WC_UTF16
1111 // cast is ok for WC_UTF16
1112 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1113 {
1114 // skip the next char too as we decoded a surrogate
1115 wp++;
1116 }
1117 #else // wchar_t is UTF-32
1118 code = *wp & 0x7fffffff;
1119 #endif
1120
1121 unsigned len;
1122 if ( code <= 0x7F )
1123 {
1124 len = 1;
1125 if ( out )
1126 {
1127 if ( dstLen < len )
1128 break;
1129
1130 out[0] = (char)code;
1131 }
1132 }
1133 else if ( code <= 0x07FF )
1134 {
1135 len = 2;
1136 if ( out )
1137 {
1138 if ( dstLen < len )
1139 break;
1140
1141 // NB: this line takes 6 least significant bits, encodes them as
1142 // 10xxxxxx and discards them so that the next byte can be encoded:
1143 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1144 out[0] = 0xC0 | code;
1145 }
1146 }
1147 else if ( code < 0xFFFF )
1148 {
1149 len = 3;
1150 if ( out )
1151 {
1152 if ( dstLen < len )
1153 break;
1154
1155 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1156 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1157 out[0] = 0xE0 | code;
1158 }
1159 }
1160 else if ( code <= 0x10FFFF )
1161 {
1162 len = 4;
1163 if ( out )
1164 {
1165 if ( dstLen < len )
1166 break;
1167
1168 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1169 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1170 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1171 out[0] = 0xF0 | code;
1172 }
1173 }
1174 else
1175 {
1176 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1177 break;
1178 }
1179
1180 if ( out )
1181 {
1182 out += len;
1183 dstLen -= len;
1184 }
1185
1186 written += len;
1187 }
1188
1189 // we only get here if an error occurs during decoding
1190 return wxCONV_FAILED;
1191 }
1192
1193 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1194 const char *psz, size_t srcLen) const
1195 {
1196 if ( m_options == MAP_INVALID_UTF8_NOT )
1197 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1198
1199 size_t len = 0;
1200
1201 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1202 {
1203 const char *opsz = psz;
1204 bool invalid = false;
1205 unsigned char cc = *psz++, fc = cc;
1206 unsigned cnt;
1207 for (cnt = 0; fc & 0x80; cnt++)
1208 fc <<= 1;
1209
1210 if (!cnt)
1211 {
1212 // plain ASCII char
1213 if (buf)
1214 *buf++ = cc;
1215 len++;
1216
1217 // escape the escape character for octal escapes
1218 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1219 && cc == '\\' && (!buf || len < n))
1220 {
1221 if (buf)
1222 *buf++ = cc;
1223 len++;
1224 }
1225 }
1226 else
1227 {
1228 cnt--;
1229 if (!cnt)
1230 {
1231 // invalid UTF-8 sequence
1232 invalid = true;
1233 }
1234 else
1235 {
1236 unsigned ocnt = cnt - 1;
1237 wxUint32 res = cc & (0x3f >> cnt);
1238 while (cnt--)
1239 {
1240 cc = *psz;
1241 if ((cc & 0xC0) != 0x80)
1242 {
1243 // invalid UTF-8 sequence
1244 invalid = true;
1245 break;
1246 }
1247
1248 psz++;
1249 res = (res << 6) | (cc & 0x3f);
1250 }
1251
1252 if (invalid || res <= utf8_max[ocnt])
1253 {
1254 // illegal UTF-8 encoding
1255 invalid = true;
1256 }
1257 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1258 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1259 {
1260 // if one of our PUA characters turns up externally
1261 // it must also be treated as an illegal sequence
1262 // (a bit like you have to escape an escape character)
1263 invalid = true;
1264 }
1265 else
1266 {
1267 #ifdef WC_UTF16
1268 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1269 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1270 if (pa == wxCONV_FAILED)
1271 {
1272 invalid = true;
1273 }
1274 else
1275 {
1276 if (buf)
1277 buf += pa;
1278 len += pa;
1279 }
1280 #else // !WC_UTF16
1281 if (buf)
1282 *buf++ = (wchar_t)res;
1283 len++;
1284 #endif // WC_UTF16/!WC_UTF16
1285 }
1286 }
1287
1288 if (invalid)
1289 {
1290 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1291 {
1292 while (opsz < psz && (!buf || len < n))
1293 {
1294 #ifdef WC_UTF16
1295 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1296 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1297 wxASSERT(pa != wxCONV_FAILED);
1298 if (buf)
1299 buf += pa;
1300 opsz++;
1301 len += pa;
1302 #else
1303 if (buf)
1304 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1305 opsz++;
1306 len++;
1307 #endif
1308 }
1309 }
1310 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1311 {
1312 while (opsz < psz && (!buf || len < n))
1313 {
1314 if ( buf && len + 3 < n )
1315 {
1316 unsigned char on = *opsz;
1317 *buf++ = L'\\';
1318 *buf++ = (wchar_t)( L'0' + on / 0100 );
1319 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1320 *buf++ = (wchar_t)( L'0' + on % 010 );
1321 }
1322
1323 opsz++;
1324 len += 4;
1325 }
1326 }
1327 else // MAP_INVALID_UTF8_NOT
1328 {
1329 return wxCONV_FAILED;
1330 }
1331 }
1332 }
1333 }
1334
1335 if (srcLen == wxNO_LEN && buf && (len < n))
1336 *buf = 0;
1337
1338 return len + 1;
1339 }
1340
// Return true if the given wide character is one of the octal digits 0..7.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1345
1346 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1347 const wchar_t *psz, size_t srcLen) const
1348 {
1349 if ( m_options == MAP_INVALID_UTF8_NOT )
1350 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1351
1352 size_t len = 0;
1353
1354 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1355 {
1356 wxUint32 cc;
1357
1358 #ifdef WC_UTF16
1359 // cast is ok for WC_UTF16
1360 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1361 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1362 #else
1363 cc = (*psz++) & 0x7fffffff;
1364 #endif
1365
1366 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1367 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1368 {
1369 if (buf)
1370 *buf++ = (char)(cc - wxUnicodePUA);
1371 len++;
1372 }
1373 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1374 && cc == L'\\' && psz[0] == L'\\' )
1375 {
1376 if (buf)
1377 *buf++ = (char)cc;
1378 psz++;
1379 len++;
1380 }
1381 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1382 cc == L'\\' &&
1383 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1384 {
1385 if (buf)
1386 {
1387 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1388 (psz[1] - L'0') * 010 +
1389 (psz[2] - L'0'));
1390 }
1391
1392 psz += 3;
1393 len++;
1394 }
1395 else
1396 {
1397 unsigned cnt;
1398 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1399 {
1400 }
1401
1402 if (!cnt)
1403 {
1404 // plain ASCII char
1405 if (buf)
1406 *buf++ = (char) cc;
1407 len++;
1408 }
1409 else
1410 {
1411 len += cnt + 1;
1412 if (buf)
1413 {
1414 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1415 while (cnt--)
1416 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1417 }
1418 }
1419 }
1420 }
1421
1422 if (srcLen == wxNO_LEN && buf && (len < n))
1423 *buf = 0;
1424
1425 return len + 1;
1426 }
1427
1428 // ============================================================================
1429 // UTF-16
1430 // ============================================================================
1431
// "straight" is the converter for UTF-16 in the byte order of this platform
// and "swap" the one for the opposite byte order: map them to the concrete
// big/little endian classes
#if defined(WORDS_BIGENDIAN)
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
    #define wxMBConvUTF16straight wxMBConvUTF16BE
#else // little endian
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#endif
1439
1440 /* static */
1441 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1442 {
1443 if ( srcLen == wxNO_LEN )
1444 {
1445 // count the number of bytes in input, including the trailing NULs
1446 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1447 for ( srcLen = 1; *inBuff++; srcLen++ )
1448 ;
1449
1450 srcLen *= BYTES_PER_CHAR;
1451 }
1452 else // we already have the length
1453 {
1454 // we can only convert an entire number of UTF-16 characters
1455 if ( srcLen % BYTES_PER_CHAR )
1456 return wxCONV_FAILED;
1457 }
1458
1459 return srcLen;
1460 }
1461
1462 // case when in-memory representation is UTF-16 too
1463 #ifdef WC_UTF16
1464
1465 // ----------------------------------------------------------------------------
1466 // conversions without endianness change
1467 // ----------------------------------------------------------------------------
1468
1469 size_t
1470 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1471 const char *src, size_t srcLen) const
1472 {
1473 // set up the scene for using memcpy() (which is presumably more efficient
1474 // than copying the bytes one by one)
1475 srcLen = GetLength(src, srcLen);
1476 if ( srcLen == wxNO_LEN )
1477 return wxCONV_FAILED;
1478
1479 const size_t inLen = srcLen / BYTES_PER_CHAR;
1480 if ( dst )
1481 {
1482 if ( dstLen < inLen )
1483 return wxCONV_FAILED;
1484
1485 memcpy(dst, src, srcLen);
1486 }
1487
1488 return inLen;
1489 }
1490
1491 size_t
1492 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1493 const wchar_t *src, size_t srcLen) const
1494 {
1495 if ( srcLen == wxNO_LEN )
1496 srcLen = wxWcslen(src) + 1;
1497
1498 srcLen *= BYTES_PER_CHAR;
1499
1500 if ( dst )
1501 {
1502 if ( dstLen < srcLen )
1503 return wxCONV_FAILED;
1504
1505 memcpy(dst, src, srcLen);
1506 }
1507
1508 return srcLen;
1509 }
1510
1511 // ----------------------------------------------------------------------------
1512 // endian-reversing conversions
1513 // ----------------------------------------------------------------------------
1514
1515 size_t
1516 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1517 const char *src, size_t srcLen) const
1518 {
1519 srcLen = GetLength(src, srcLen);
1520 if ( srcLen == wxNO_LEN )
1521 return wxCONV_FAILED;
1522
1523 srcLen /= BYTES_PER_CHAR;
1524
1525 if ( dst )
1526 {
1527 if ( dstLen < srcLen )
1528 return wxCONV_FAILED;
1529
1530 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1531 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1532 {
1533 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1534 }
1535 }
1536
1537 return srcLen;
1538 }
1539
1540 size_t
1541 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1542 const wchar_t *src, size_t srcLen) const
1543 {
1544 if ( srcLen == wxNO_LEN )
1545 srcLen = wxWcslen(src) + 1;
1546
1547 srcLen *= BYTES_PER_CHAR;
1548
1549 if ( dst )
1550 {
1551 if ( dstLen < srcLen )
1552 return wxCONV_FAILED;
1553
1554 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1555 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1556 {
1557 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1558 }
1559 }
1560
1561 return srcLen;
1562 }
1563
1564 #else // !WC_UTF16: wchar_t is UTF-32
1565
1566 // ----------------------------------------------------------------------------
1567 // conversions without endianness change
1568 // ----------------------------------------------------------------------------
1569
1570 size_t
1571 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1572 const char *src, size_t srcLen) const
1573 {
1574 srcLen = GetLength(src, srcLen);
1575 if ( srcLen == wxNO_LEN )
1576 return wxCONV_FAILED;
1577
1578 const size_t inLen = srcLen / BYTES_PER_CHAR;
1579 if ( !dst )
1580 {
1581 // optimization: return maximal space which could be needed for this
1582 // string even if the real size could be smaller if the buffer contains
1583 // any surrogates
1584 return inLen;
1585 }
1586
1587 size_t outLen = 0;
1588 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1589 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1590 {
1591 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1592 if ( !inBuff )
1593 return wxCONV_FAILED;
1594
1595 if ( ++outLen > dstLen )
1596 return wxCONV_FAILED;
1597
1598 *dst++ = ch;
1599 }
1600
1601
1602 return outLen;
1603 }
1604
1605 size_t
1606 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1607 const wchar_t *src, size_t srcLen) const
1608 {
1609 if ( srcLen == wxNO_LEN )
1610 srcLen = wxWcslen(src) + 1;
1611
1612 size_t outLen = 0;
1613 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1614 for ( size_t n = 0; n < srcLen; n++ )
1615 {
1616 wxUint16 cc[2];
1617 const size_t numChars = encode_utf16(*src++, cc);
1618 if ( numChars == wxCONV_FAILED )
1619 return wxCONV_FAILED;
1620
1621 outLen += numChars * BYTES_PER_CHAR;
1622 if ( outBuff )
1623 {
1624 if ( outLen > dstLen )
1625 return wxCONV_FAILED;
1626
1627 *outBuff++ = cc[0];
1628 if ( numChars == 2 )
1629 {
1630 // second character of a surrogate
1631 *outBuff++ = cc[1];
1632 }
1633 }
1634 }
1635
1636 return outLen;
1637 }
1638
1639 // ----------------------------------------------------------------------------
1640 // endian-reversing conversions
1641 // ----------------------------------------------------------------------------
1642
1643 size_t
1644 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1645 const char *src, size_t srcLen) const
1646 {
1647 srcLen = GetLength(src, srcLen);
1648 if ( srcLen == wxNO_LEN )
1649 return wxCONV_FAILED;
1650
1651 const size_t inLen = srcLen / BYTES_PER_CHAR;
1652 if ( !dst )
1653 {
1654 // optimization: return maximal space which could be needed for this
1655 // string even if the real size could be smaller if the buffer contains
1656 // any surrogates
1657 return inLen;
1658 }
1659
1660 size_t outLen = 0;
1661 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1662 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1663 {
1664 wxUint32 ch;
1665 wxUint16 tmp[2];
1666
1667 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1668 inBuff++;
1669 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1670
1671 const size_t numChars = decode_utf16(tmp, ch);
1672 if ( numChars == wxCONV_FAILED )
1673 return wxCONV_FAILED;
1674
1675 if ( numChars == 2 )
1676 inBuff++;
1677
1678 if ( ++outLen > dstLen )
1679 return wxCONV_FAILED;
1680
1681 *dst++ = ch;
1682 }
1683
1684
1685 return outLen;
1686 }
1687
1688 size_t
1689 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1690 const wchar_t *src, size_t srcLen) const
1691 {
1692 if ( srcLen == wxNO_LEN )
1693 srcLen = wxWcslen(src) + 1;
1694
1695 size_t outLen = 0;
1696 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1697 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1698 {
1699 wxUint16 cc[2];
1700 const size_t numChars = encode_utf16(*src, cc);
1701 if ( numChars == wxCONV_FAILED )
1702 return wxCONV_FAILED;
1703
1704 outLen += numChars * BYTES_PER_CHAR;
1705 if ( outBuff )
1706 {
1707 if ( outLen > dstLen )
1708 return wxCONV_FAILED;
1709
1710 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1711 if ( numChars == 2 )
1712 {
1713 // second character of a surrogate
1714 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1715 }
1716 }
1717 }
1718
1719 return outLen;
1720 }
1721
1722 #endif // WC_UTF16/!WC_UTF16
1723
1724
1725 // ============================================================================
1726 // UTF-32
1727 // ============================================================================
1728
1729 #ifdef WORDS_BIGENDIAN
1730 #define wxMBConvUTF32straight wxMBConvUTF32BE
1731 #define wxMBConvUTF32swap wxMBConvUTF32LE
1732 #else
1733 #define wxMBConvUTF32swap wxMBConvUTF32BE
1734 #define wxMBConvUTF32straight wxMBConvUTF32LE
1735 #endif
1736
1737
1738 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1739 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1740
1741 /* static */
1742 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1743 {
1744 if ( srcLen == wxNO_LEN )
1745 {
1746 // count the number of bytes in input, including the trailing NULs
1747 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1748 for ( srcLen = 1; *inBuff++; srcLen++ )
1749 ;
1750
1751 srcLen *= BYTES_PER_CHAR;
1752 }
1753 else // we already have the length
1754 {
1755 // we can only convert an entire number of UTF-32 characters
1756 if ( srcLen % BYTES_PER_CHAR )
1757 return wxCONV_FAILED;
1758 }
1759
1760 return srcLen;
1761 }
1762
1763 // case when in-memory representation is UTF-16
1764 #ifdef WC_UTF16
1765
1766 // ----------------------------------------------------------------------------
1767 // conversions without endianness change
1768 // ----------------------------------------------------------------------------
1769
1770 size_t
1771 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1772 const char *src, size_t srcLen) const
1773 {
1774 srcLen = GetLength(src, srcLen);
1775 if ( srcLen == wxNO_LEN )
1776 return wxCONV_FAILED;
1777
1778 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1779 const size_t inLen = srcLen / BYTES_PER_CHAR;
1780 size_t outLen = 0;
1781 for ( size_t n = 0; n < inLen; n++ )
1782 {
1783 wxUint16 cc[2];
1784 const size_t numChars = encode_utf16(*inBuff++, cc);
1785 if ( numChars == wxCONV_FAILED )
1786 return wxCONV_FAILED;
1787
1788 outLen += numChars;
1789 if ( dst )
1790 {
1791 if ( outLen > dstLen )
1792 return wxCONV_FAILED;
1793
1794 *dst++ = cc[0];
1795 if ( numChars == 2 )
1796 {
1797 // second character of a surrogate
1798 *dst++ = cc[1];
1799 }
1800 }
1801 }
1802
1803 return outLen;
1804 }
1805
1806 size_t
1807 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1808 const wchar_t *src, size_t srcLen) const
1809 {
1810 if ( srcLen == wxNO_LEN )
1811 srcLen = wxWcslen(src) + 1;
1812
1813 if ( !dst )
1814 {
1815 // optimization: return maximal space which could be needed for this
1816 // string instead of the exact amount which could be less if there are
1817 // any surrogates in the input
1818 //
1819 // we consider that surrogates are rare enough to make it worthwhile to
1820 // avoid running the loop below at the cost of slightly extra memory
1821 // consumption
1822 return srcLen * BYTES_PER_CHAR;
1823 }
1824
1825 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1826 size_t outLen = 0;
1827 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1828 {
1829 const wxUint32 ch = wxDecodeSurrogate(&src);
1830 if ( !src )
1831 return wxCONV_FAILED;
1832
1833 outLen += BYTES_PER_CHAR;
1834
1835 if ( outLen > dstLen )
1836 return wxCONV_FAILED;
1837
1838 *outBuff++ = ch;
1839 }
1840
1841 return outLen;
1842 }
1843
1844 // ----------------------------------------------------------------------------
1845 // endian-reversing conversions
1846 // ----------------------------------------------------------------------------
1847
1848 size_t
1849 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1850 const char *src, size_t srcLen) const
1851 {
1852 srcLen = GetLength(src, srcLen);
1853 if ( srcLen == wxNO_LEN )
1854 return wxCONV_FAILED;
1855
1856 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1857 const size_t inLen = srcLen / BYTES_PER_CHAR;
1858 size_t outLen = 0;
1859 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1860 {
1861 wxUint16 cc[2];
1862 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1863 if ( numChars == wxCONV_FAILED )
1864 return wxCONV_FAILED;
1865
1866 outLen += numChars;
1867 if ( dst )
1868 {
1869 if ( outLen > dstLen )
1870 return wxCONV_FAILED;
1871
1872 *dst++ = cc[0];
1873 if ( numChars == 2 )
1874 {
1875 // second character of a surrogate
1876 *dst++ = cc[1];
1877 }
1878 }
1879 }
1880
1881 return outLen;
1882 }
1883
1884 size_t
1885 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1886 const wchar_t *src, size_t srcLen) const
1887 {
1888 if ( srcLen == wxNO_LEN )
1889 srcLen = wxWcslen(src) + 1;
1890
1891 if ( !dst )
1892 {
1893 // optimization: return maximal space which could be needed for this
1894 // string instead of the exact amount which could be less if there are
1895 // any surrogates in the input
1896 //
1897 // we consider that surrogates are rare enough to make it worthwhile to
1898 // avoid running the loop below at the cost of slightly extra memory
1899 // consumption
1900 return srcLen*BYTES_PER_CHAR;
1901 }
1902
1903 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1904 size_t outLen = 0;
1905 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1906 {
1907 const wxUint32 ch = wxDecodeSurrogate(&src);
1908 if ( !src )
1909 return wxCONV_FAILED;
1910
1911 outLen += BYTES_PER_CHAR;
1912
1913 if ( outLen > dstLen )
1914 return wxCONV_FAILED;
1915
1916 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1917 }
1918
1919 return outLen;
1920 }
1921
1922 #else // !WC_UTF16: wchar_t is UTF-32
1923
1924 // ----------------------------------------------------------------------------
1925 // conversions without endianness change
1926 // ----------------------------------------------------------------------------
1927
1928 size_t
1929 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1930 const char *src, size_t srcLen) const
1931 {
1932 // use memcpy() as it should be much faster than hand-written loop
1933 srcLen = GetLength(src, srcLen);
1934 if ( srcLen == wxNO_LEN )
1935 return wxCONV_FAILED;
1936
1937 const size_t inLen = srcLen/BYTES_PER_CHAR;
1938 if ( dst )
1939 {
1940 if ( dstLen < inLen )
1941 return wxCONV_FAILED;
1942
1943 memcpy(dst, src, srcLen);
1944 }
1945
1946 return inLen;
1947 }
1948
1949 size_t
1950 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1951 const wchar_t *src, size_t srcLen) const
1952 {
1953 if ( srcLen == wxNO_LEN )
1954 srcLen = wxWcslen(src) + 1;
1955
1956 srcLen *= BYTES_PER_CHAR;
1957
1958 if ( dst )
1959 {
1960 if ( dstLen < srcLen )
1961 return wxCONV_FAILED;
1962
1963 memcpy(dst, src, srcLen);
1964 }
1965
1966 return srcLen;
1967 }
1968
1969 // ----------------------------------------------------------------------------
1970 // endian-reversing conversions
1971 // ----------------------------------------------------------------------------
1972
1973 size_t
1974 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1975 const char *src, size_t srcLen) const
1976 {
1977 srcLen = GetLength(src, srcLen);
1978 if ( srcLen == wxNO_LEN )
1979 return wxCONV_FAILED;
1980
1981 srcLen /= BYTES_PER_CHAR;
1982
1983 if ( dst )
1984 {
1985 if ( dstLen < srcLen )
1986 return wxCONV_FAILED;
1987
1988 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1989 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1990 {
1991 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1992 }
1993 }
1994
1995 return srcLen;
1996 }
1997
1998 size_t
1999 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2000 const wchar_t *src, size_t srcLen) const
2001 {
2002 if ( srcLen == wxNO_LEN )
2003 srcLen = wxWcslen(src) + 1;
2004
2005 srcLen *= BYTES_PER_CHAR;
2006
2007 if ( dst )
2008 {
2009 if ( dstLen < srcLen )
2010 return wxCONV_FAILED;
2011
2012 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2013 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2014 {
2015 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2016 }
2017 }
2018
2019 return srcLen;
2020 }
2021
2022 #endif // WC_UTF16/!WC_UTF16
2023
2024
2025 // ============================================================================
2026 // The classes doing conversion using the iconv_xxx() functions
2027 // ============================================================================
2028
2029 #ifdef HAVE_ICONV
2030
2031 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2032 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2033 // (unless there's yet another bug in glibc) the only case when iconv()
2034 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2035 // left in the input buffer -- when _real_ error occurs,
2036 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2037 // iconv() failure.
2038 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if iconv() returning cres and leaving
// bufLeft bytes in its input buffer really failed (see above for why the
// return value alone is not enough with the old glibc versions)
//
// NB: the macro parameters are parenthesized so that arbitrary expressions
//     can be safely passed to it
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type expected by iconv(),
// which may or may not be const depending on the platform
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// value returned by iconv_open() on failure
#define ICONV_T_INVALID ((iconv_t)-1)
2049
2050 #if SIZEOF_WCHAR_T == 4
2051 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2052 #define WC_ENC wxFONTENCODING_UTF32
2053 #elif SIZEOF_WCHAR_T == 2
2054 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2055 #define WC_ENC wxFONTENCODING_UTF16
2056 #else // sizeof(wchar_t) != 2 nor 4
2057 // does this ever happen?
2058 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2059 #endif
2060
2061 // ----------------------------------------------------------------------------
2062 // wxMBConv_iconv: encapsulates an iconv character set
2063 // ----------------------------------------------------------------------------
2064
2065 class wxMBConv_iconv : public wxMBConv
2066 {
2067 public:
2068 wxMBConv_iconv(const char *name);
2069 virtual ~wxMBConv_iconv();
2070
2071 // implement base class virtual methods
2072 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2073 const char *src, size_t srcLen = wxNO_LEN) const;
2074 virtual size_t FromWChar(char *dst, size_t dstLen,
2075 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2076 virtual size_t GetMBNulLen() const;
2077
2078 #if wxUSE_UNICODE_UTF8
2079 virtual bool IsUTF8() const;
2080 #endif
2081
2082 virtual wxMBConv *Clone() const
2083 {
2084 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2085 p->m_minMBCharWidth = m_minMBCharWidth;
2086 return p;
2087 }
2088
2089 bool IsOk() const
2090 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2091
2092 protected:
2093 // the iconv handlers used to translate from multibyte
2094 // to wide char and in the other direction
2095 iconv_t m2w,
2096 w2m;
2097
2098 #if wxUSE_THREADS
2099 // guards access to m2w and w2m objects
2100 wxMutex m_iconvMutex;
2101 #endif
2102
2103 private:
2104 // the name (for iconv_open()) of a wide char charset -- if none is
2105 // available on this machine, it will remain NULL
2106 static wxString ms_wcCharsetName;
2107
2108 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2109 // different endian-ness than the native one
2110 static bool ms_wcNeedsSwap;
2111
2112
2113 // name of the encoding handled by this conversion
2114 wxString m_name;
2115
2116 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2117 // initially
2118 size_t m_minMBCharWidth;
2119 };
2120
2121 // make the constructor available for unit testing
2122 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2123 {
2124 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2125 if ( !result->IsOk() )
2126 {
2127 delete result;
2128 return 0;
2129 }
2130
2131 return result;
2132 }
2133
2134 wxString wxMBConv_iconv::ms_wcCharsetName;
2135 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2136
2137 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2138 : m_name(name)
2139 {
2140 m_minMBCharWidth = 0;
2141
2142 // check for charset that represents wchar_t:
2143 if ( ms_wcCharsetName.empty() )
2144 {
2145 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2146
2147 #if wxUSE_FONTMAP
2148 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2149 #else // !wxUSE_FONTMAP
2150 static const wxChar *names_static[] =
2151 {
2152 #if SIZEOF_WCHAR_T == 4
2153 wxT("UCS-4"),
2154 #elif SIZEOF_WCHAR_T = 2
2155 wxT("UCS-2"),
2156 #endif
2157 NULL
2158 };
2159 const wxChar **names = names_static;
2160 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2161
2162 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2163 {
2164 const wxString nameCS(*names);
2165
2166 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2167 wxString nameXE(nameCS);
2168
2169 #ifdef WORDS_BIGENDIAN
2170 nameXE += wxT("BE");
2171 #else // little endian
2172 nameXE += wxT("LE");
2173 #endif
2174
2175 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2176 nameXE.c_str());
2177
2178 m2w = iconv_open(nameXE.ToAscii(), name);
2179 if ( m2w == ICONV_T_INVALID )
2180 {
2181 // try charset w/o bytesex info (e.g. "UCS4")
2182 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2183 nameCS.c_str());
2184 m2w = iconv_open(nameCS.ToAscii(), name);
2185
2186 // and check for bytesex ourselves:
2187 if ( m2w != ICONV_T_INVALID )
2188 {
2189 char buf[2], *bufPtr;
2190 wchar_t wbuf[2];
2191 size_t insz, outsz;
2192 size_t res;
2193
2194 buf[0] = 'A';
2195 buf[1] = 0;
2196 wbuf[0] = 0;
2197 insz = 2;
2198 outsz = SIZEOF_WCHAR_T * 2;
2199 char* wbufPtr = (char*)wbuf;
2200 bufPtr = buf;
2201
2202 res = iconv(
2203 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2204 &wbufPtr, &outsz);
2205
2206 if (ICONV_FAILED(res, insz))
2207 {
2208 wxLogLastError(wxT("iconv"));
2209 wxLogError(_("Conversion to charset '%s' doesn't work."),
2210 nameCS.c_str());
2211 }
2212 else // ok, can convert to this encoding, remember it
2213 {
2214 ms_wcCharsetName = nameCS;
2215 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2216 }
2217 }
2218 }
2219 else // use charset not requiring byte swapping
2220 {
2221 ms_wcCharsetName = nameXE;
2222 }
2223 }
2224
2225 wxLogTrace(TRACE_STRCONV,
2226 wxT("iconv wchar_t charset is \"%s\"%s"),
2227 ms_wcCharsetName.empty() ? wxString("<none>")
2228 : ms_wcCharsetName,
2229 ms_wcNeedsSwap ? wxT(" (needs swap)")
2230 : wxT(""));
2231 }
2232 else // we already have ms_wcCharsetName
2233 {
2234 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2235 }
2236
2237 if ( ms_wcCharsetName.empty() )
2238 {
2239 w2m = ICONV_T_INVALID;
2240 }
2241 else
2242 {
2243 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2244 if ( w2m == ICONV_T_INVALID )
2245 {
2246 wxLogTrace(TRACE_STRCONV,
2247 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2248 ms_wcCharsetName.c_str(), name);
2249 }
2250 }
2251 }
2252
2253 wxMBConv_iconv::~wxMBConv_iconv()
2254 {
2255 if ( m2w != ICONV_T_INVALID )
2256 iconv_close(m2w);
2257 if ( w2m != ICONV_T_INVALID )
2258 iconv_close(w2m);
2259 }
2260
2261 size_t
2262 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2263 const char *src, size_t srcLen) const
2264 {
2265 if ( srcLen == wxNO_LEN )
2266 {
2267 // find the string length: notice that must be done differently for
2268 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2269 // consecutive NULs
2270 const size_t nulLen = GetMBNulLen();
2271 switch ( nulLen )
2272 {
2273 default:
2274 return wxCONV_FAILED;
2275
2276 case 1:
2277 srcLen = strlen(src); // arguably more optimized than our version
2278 break;
2279
2280 case 2:
2281 case 4:
2282 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2283 // but they also have to start at character boundary and not
2284 // span two adjacent characters
2285 const char *p;
2286 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2287 ;
2288 srcLen = p - src;
2289 break;
2290 }
2291
2292 // when we're determining the length of the string ourselves we count
2293 // the terminating NUL(s) as part of it and always NUL-terminate the
2294 // output
2295 srcLen += nulLen;
2296 }
2297
2298 // we express length in the number of (wide) characters but iconv always
2299 // counts buffer sizes it in bytes
2300 dstLen *= SIZEOF_WCHAR_T;
2301
2302 #if wxUSE_THREADS
2303 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2304 // Unfortunately there are a couple of global wxCSConv objects such as
2305 // wxConvLocal that are used all over wx code, so we have to make sure
2306 // the handle is used by at most one thread at the time. Otherwise
2307 // only a few wx classes would be safe to use from non-main threads
2308 // as MB<->WC conversion would fail "randomly".
2309 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2310 #endif // wxUSE_THREADS
2311
2312 size_t res, cres;
2313 const char *pszPtr = src;
2314
2315 if ( dst )
2316 {
2317 char* bufPtr = (char*)dst;
2318
2319 // have destination buffer, convert there
2320 size_t dstLenOrig = dstLen;
2321 cres = iconv(m2w,
2322 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2323 &bufPtr, &dstLen);
2324
2325 // convert the number of bytes converted as returned by iconv to the
2326 // number of (wide) characters converted that we need
2327 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2328
2329 if (ms_wcNeedsSwap)
2330 {
2331 // convert to native endianness
2332 for ( unsigned i = 0; i < res; i++ )
2333 dst[i] = WC_BSWAP(dst[i]);
2334 }
2335 }
2336 else // no destination buffer
2337 {
2338 // convert using temp buffer to calculate the size of the buffer needed
2339 wchar_t tbuf[256];
2340 res = 0;
2341
2342 do
2343 {
2344 char* bufPtr = (char*)tbuf;
2345 dstLen = 8 * SIZEOF_WCHAR_T;
2346
2347 cres = iconv(m2w,
2348 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2349 &bufPtr, &dstLen );
2350
2351 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2352 }
2353 while ((cres == (size_t)-1) && (errno == E2BIG));
2354 }
2355
2356 if (ICONV_FAILED(cres, srcLen))
2357 {
2358 //VS: it is ok if iconv fails, hence trace only
2359 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2360 return wxCONV_FAILED;
2361 }
2362
2363 return res;
2364 }
2365
2366 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2367 const wchar_t *src, size_t srcLen) const
2368 {
2369 #if wxUSE_THREADS
2370 // NB: explained in MB2WC
2371 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2372 #endif
2373
2374 if ( srcLen == wxNO_LEN )
2375 srcLen = wxWcslen(src) + 1;
2376
2377 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2378 size_t outbuflen = dstLen;
2379 size_t res, cres;
2380
2381 wchar_t *tmpbuf = 0;
2382
2383 if (ms_wcNeedsSwap)
2384 {
2385 // need to copy to temp buffer to switch endianness
2386 // (doing WC_BSWAP twice on the original buffer won't work, as it
2387 // could be in read-only memory, or be accessed in some other thread)
2388 tmpbuf = (wchar_t *)malloc(inbuflen);
2389 for ( size_t i = 0; i < srcLen; i++ )
2390 tmpbuf[i] = WC_BSWAP(src[i]);
2391
2392 src = tmpbuf;
2393 }
2394
2395 char* inbuf = (char*)src;
2396 if ( dst )
2397 {
2398 // have destination buffer, convert there
2399 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2400
2401 res = dstLen - outbuflen;
2402 }
2403 else // no destination buffer
2404 {
2405 // convert using temp buffer to calculate the size of the buffer needed
2406 char tbuf[256];
2407 res = 0;
2408 do
2409 {
2410 dst = tbuf;
2411 outbuflen = WXSIZEOF(tbuf);
2412
2413 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2414
2415 res += WXSIZEOF(tbuf) - outbuflen;
2416 }
2417 while ((cres == (size_t)-1) && (errno == E2BIG));
2418 }
2419
2420 if (ms_wcNeedsSwap)
2421 {
2422 free(tmpbuf);
2423 }
2424
2425 if (ICONV_FAILED(cres, inbuflen))
2426 {
2427 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2428 return wxCONV_FAILED;
2429 }
2430
2431 return res;
2432 }
2433
2434 size_t wxMBConv_iconv::GetMBNulLen() const
2435 {
2436 if ( m_minMBCharWidth == 0 )
2437 {
2438 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2439
2440 #if wxUSE_THREADS
2441 // NB: explained in MB2WC
2442 wxMutexLocker lock(self->m_iconvMutex);
2443 #endif
2444
2445 const wchar_t *wnul = L"";
2446 char buf[8]; // should be enough for NUL in any encoding
2447 size_t inLen = sizeof(wchar_t),
2448 outLen = WXSIZEOF(buf);
2449 char *inBuff = (char *)wnul;
2450 char *outBuff = buf;
2451 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2452 {
2453 self->m_minMBCharWidth = (size_t)-1;
2454 }
2455 else // ok
2456 {
2457 self->m_minMBCharWidth = outBuff - buf;
2458 }
2459 }
2460
2461 return m_minMBCharWidth;
2462 }
2463
2464 #if wxUSE_UNICODE_UTF8
2465 bool wxMBConv_iconv::IsUTF8() const
2466 {
2467 return wxStricmp(m_name, "UTF-8") == 0 ||
2468 wxStricmp(m_name, "UTF8") == 0;
2469 }
2470 #endif
2471
2472 #endif // HAVE_ICONV
2473
2474
2475 // ============================================================================
2476 // Win32 conversion classes
2477 // ============================================================================
2478
2479 #ifdef wxHAVE_WIN32_MB2WC
2480
2481 // from utils.cpp
2482 #if wxUSE_FONTMAP
2483 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2484 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2485 #endif
2486
2487 class wxMBConv_win32 : public wxMBConv
2488 {
2489 public:
2490 wxMBConv_win32()
2491 {
2492 m_CodePage = CP_ACP;
2493 m_minMBCharWidth = 0;
2494 }
2495
2496 wxMBConv_win32(const wxMBConv_win32& conv)
2497 : wxMBConv()
2498 {
2499 m_CodePage = conv.m_CodePage;
2500 m_minMBCharWidth = conv.m_minMBCharWidth;
2501 }
2502
2503 #if wxUSE_FONTMAP
2504 wxMBConv_win32(const char* name)
2505 {
2506 m_CodePage = wxCharsetToCodepage(name);
2507 m_minMBCharWidth = 0;
2508 }
2509
2510 wxMBConv_win32(wxFontEncoding encoding)
2511 {
2512 m_CodePage = wxEncodingToCodepage(encoding);
2513 m_minMBCharWidth = 0;
2514 }
2515 #endif // wxUSE_FONTMAP
2516
2517 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2518 {
2519 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2520 // the behaviour is not compatible with the Unix version (using iconv)
2521 // and break the library itself, e.g. wxTextInputStream::NextChar()
2522 // wouldn't work if reading an incomplete MB char didn't result in an
2523 // error
2524 //
2525 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2526 // Win XP or newer and it is not supported for UTF-[78] so we always
2527 // use our own conversions in this case. See
2528 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2529 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2530 if ( m_CodePage == CP_UTF8 )
2531 {
2532 return wxMBConvUTF8().MB2WC(buf, psz, n);
2533 }
2534
2535 if ( m_CodePage == CP_UTF7 )
2536 {
2537 return wxMBConvUTF7().MB2WC(buf, psz, n);
2538 }
2539
2540 int flags = 0;
2541 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2542 IsAtLeastWin2kSP4() )
2543 {
2544 flags = MB_ERR_INVALID_CHARS;
2545 }
2546
2547 const size_t len = ::MultiByteToWideChar
2548 (
2549 m_CodePage, // code page
2550 flags, // flags: fall on error
2551 psz, // input string
2552 -1, // its length (NUL-terminated)
2553 buf, // output string
2554 buf ? n : 0 // size of output buffer
2555 );
2556 if ( !len )
2557 {
2558 // function totally failed
2559 return wxCONV_FAILED;
2560 }
2561
2562 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2563 // check if we succeeded, by doing a double trip:
2564 if ( !flags && buf )
2565 {
2566 const size_t mbLen = strlen(psz);
2567 wxCharBuffer mbBuf(mbLen);
2568 if ( ::WideCharToMultiByte
2569 (
2570 m_CodePage,
2571 0,
2572 buf,
2573 -1,
2574 mbBuf.data(),
2575 mbLen + 1, // size in bytes, not length
2576 NULL,
2577 NULL
2578 ) == 0 ||
2579 strcmp(mbBuf, psz) != 0 )
2580 {
2581 // we didn't obtain the same thing we started from, hence
2582 // the conversion was lossy and we consider that it failed
2583 return wxCONV_FAILED;
2584 }
2585 }
2586
2587 // note that it returns count of written chars for buf != NULL and size
2588 // of the needed buffer for buf == NULL so in either case the length of
2589 // the string (which never includes the terminating NUL) is one less
2590 return len - 1;
2591 }
2592
2593 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2594 {
2595 /*
2596 we have a problem here: by default, WideCharToMultiByte() may
2597 replace characters unrepresentable in the target code page with bad
2598 quality approximations such as turning "1/2" symbol (U+00BD) into
2599 "1" for the code pages which don't have it and we, obviously, want
2600 to avoid this at any price
2601
2602 the trouble is that this function does it _silently_, i.e. it won't
2603 even tell us whether it did or not... Win98/2000 and higher provide
2604 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2605 we have to resort to a round trip, i.e. check that converting back
2606 results in the same string -- this is, of course, expensive but
2607 otherwise we simply can't be sure to not garble the data.
2608 */
2609
2610 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2611 // it doesn't work with CJK encodings (which we test for rather roughly
2612 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2613 // supporting it
2614 BOOL usedDef wxDUMMY_INITIALIZE(false);
2615 BOOL *pUsedDef;
2616 int flags;
2617 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2618 {
2619 // it's our lucky day
2620 flags = WC_NO_BEST_FIT_CHARS;
2621 pUsedDef = &usedDef;
2622 }
2623 else // old system or unsupported encoding
2624 {
2625 flags = 0;
2626 pUsedDef = NULL;
2627 }
2628
2629 const size_t len = ::WideCharToMultiByte
2630 (
2631 m_CodePage, // code page
2632 flags, // either none or no best fit
2633 pwz, // input string
2634 -1, // it is (wide) NUL-terminated
2635 buf, // output buffer
2636 buf ? n : 0, // and its size
2637 NULL, // default "replacement" char
2638 pUsedDef // [out] was it used?
2639 );
2640
2641 if ( !len )
2642 {
2643 // function totally failed
2644 return wxCONV_FAILED;
2645 }
2646
2647 // we did something, check if we really succeeded
2648 if ( flags )
2649 {
2650 // check if the conversion failed, i.e. if any replacements
2651 // were done
2652 if ( usedDef )
2653 return wxCONV_FAILED;
2654 }
2655 else // we must resort to double tripping...
2656 {
2657 // first we need to ensure that we really have the MB data: this is
2658 // not the case if we're called with NULL buffer, in which case we
2659 // need to do the conversion yet again
2660 wxCharBuffer bufDef;
2661 if ( !buf )
2662 {
2663 bufDef = wxCharBuffer(len);
2664 buf = bufDef.data();
2665 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2666 buf, len, NULL, NULL) )
2667 return wxCONV_FAILED;
2668 }
2669
2670 if ( !n )
2671 n = wcslen(pwz);
2672 wxWCharBuffer wcBuf(n);
2673 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2674 wcscmp(wcBuf, pwz) != 0 )
2675 {
2676 // we didn't obtain the same thing we started from, hence
2677 // the conversion was lossy and we consider that it failed
2678 return wxCONV_FAILED;
2679 }
2680 }
2681
2682 // see the comment above for the reason of "len - 1"
2683 return len - 1;
2684 }
2685
2686 virtual size_t GetMBNulLen() const
2687 {
2688 if ( m_minMBCharWidth == 0 )
2689 {
2690 int len = ::WideCharToMultiByte
2691 (
2692 m_CodePage, // code page
2693 0, // no flags
2694 L"", // input string
2695 1, // translate just the NUL
2696 NULL, // output buffer
2697 0, // and its size
2698 NULL, // no replacement char
2699 NULL // [out] don't care if it was used
2700 );
2701
2702 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2703 switch ( len )
2704 {
2705 default:
2706 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2707 self->m_minMBCharWidth = (size_t)-1;
2708 break;
2709
2710 case 0:
2711 self->m_minMBCharWidth = (size_t)-1;
2712 break;
2713
2714 case 1:
2715 case 2:
2716 case 4:
2717 self->m_minMBCharWidth = len;
2718 break;
2719 }
2720 }
2721
2722 return m_minMBCharWidth;
2723 }
2724
2725 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2726
2727 bool IsOk() const { return m_CodePage != -1; }
2728
2729 private:
2730 static bool CanUseNoBestFit()
2731 {
2732 static int s_isWin98Or2k = -1;
2733
2734 if ( s_isWin98Or2k == -1 )
2735 {
2736 int verMaj, verMin;
2737 switch ( wxGetOsVersion(&verMaj, &verMin) )
2738 {
2739 case wxOS_WINDOWS_9X:
2740 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2741 break;
2742
2743 case wxOS_WINDOWS_NT:
2744 s_isWin98Or2k = verMaj >= 5;
2745 break;
2746
2747 default:
2748 // unknown: be conservative by default
2749 s_isWin98Or2k = 0;
2750 break;
2751 }
2752
2753 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2754 }
2755
2756 return s_isWin98Or2k == 1;
2757 }
2758
2759 static bool IsAtLeastWin2kSP4()
2760 {
2761 #ifdef __WXWINCE__
2762 return false;
2763 #else
2764 static int s_isAtLeastWin2kSP4 = -1;
2765
2766 if ( s_isAtLeastWin2kSP4 == -1 )
2767 {
2768 OSVERSIONINFOEX ver;
2769
2770 memset(&ver, 0, sizeof(ver));
2771 ver.dwOSVersionInfoSize = sizeof(ver);
2772 GetVersionEx((OSVERSIONINFO*)&ver);
2773
2774 s_isAtLeastWin2kSP4 =
2775 ((ver.dwMajorVersion > 5) || // Vista+
2776 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2777 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2778 ver.wServicePackMajor >= 4)) // 2000 SP4+
2779 ? 1 : 0;
2780 }
2781
2782 return s_isAtLeastWin2kSP4 == 1;
2783 #endif
2784 }
2785
2786
2787 // the code page we're working with
2788 long m_CodePage;
2789
2790 // cached result of GetMBNulLen(), set to 0 initially meaning
2791 // "unknown"
2792 size_t m_minMBCharWidth;
2793 };
2794
2795 #endif // wxHAVE_WIN32_MB2WC
2796
2797
2798 // ============================================================================
2799 // wxEncodingConverter based conversion classes
2800 // ============================================================================
2801
2802 #if wxUSE_FONTMAP
2803
2804 class wxMBConv_wxwin : public wxMBConv
2805 {
2806 private:
2807 void Init()
2808 {
2809 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2810 // The wxMBConv_cf class does a better job.
2811 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2812 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2813 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2814 }
2815
2816 public:
2817 // temporarily just use wxEncodingConverter stuff,
2818 // so that it works while a better implementation is built
2819 wxMBConv_wxwin(const char* name)
2820 {
2821 if (name)
2822 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2823 else
2824 m_enc = wxFONTENCODING_SYSTEM;
2825
2826 Init();
2827 }
2828
2829 wxMBConv_wxwin(wxFontEncoding enc)
2830 {
2831 m_enc = enc;
2832
2833 Init();
2834 }
2835
2836 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2837 {
2838 size_t inbuf = strlen(psz);
2839 if (buf)
2840 {
2841 if (!m2w.Convert(psz, buf))
2842 return wxCONV_FAILED;
2843 }
2844 return inbuf;
2845 }
2846
2847 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2848 {
2849 const size_t inbuf = wxWcslen(psz);
2850 if (buf)
2851 {
2852 if (!w2m.Convert(psz, buf))
2853 return wxCONV_FAILED;
2854 }
2855
2856 return inbuf;
2857 }
2858
2859 virtual size_t GetMBNulLen() const
2860 {
2861 switch ( m_enc )
2862 {
2863 case wxFONTENCODING_UTF16BE:
2864 case wxFONTENCODING_UTF16LE:
2865 return 2;
2866
2867 case wxFONTENCODING_UTF32BE:
2868 case wxFONTENCODING_UTF32LE:
2869 return 4;
2870
2871 default:
2872 return 1;
2873 }
2874 }
2875
2876 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2877
2878 bool IsOk() const { return m_ok; }
2879
2880 public:
2881 wxFontEncoding m_enc;
2882 wxEncodingConverter m2w, w2m;
2883
2884 private:
2885 // were we initialized successfully?
2886 bool m_ok;
2887
2888 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2889 };
2890
2891 // make the constructors available for unit testing
2892 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2893 {
2894 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2895 if ( !result->IsOk() )
2896 {
2897 delete result;
2898 return 0;
2899 }
2900
2901 return result;
2902 }
2903
2904 #endif // wxUSE_FONTMAP
2905
2906 // ============================================================================
2907 // wxCSConv implementation
2908 // ============================================================================
2909
2910 void wxCSConv::Init()
2911 {
2912 m_name = NULL;
2913 m_convReal = NULL;
2914 m_deferred = true;
2915 }
2916
2917 wxCSConv::wxCSConv(const wxString& charset)
2918 {
2919 Init();
2920
2921 if ( !charset.empty() )
2922 {
2923 SetName(charset.ToAscii());
2924 }
2925
2926 #if wxUSE_FONTMAP
2927 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2928 if ( m_encoding == wxFONTENCODING_MAX )
2929 {
2930 // set to unknown/invalid value
2931 m_encoding = wxFONTENCODING_SYSTEM;
2932 }
2933 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2934 {
2935 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2936 m_encoding = wxFONTENCODING_ISO8859_1;
2937 }
2938 #else
2939 m_encoding = wxFONTENCODING_SYSTEM;
2940 #endif
2941 }
2942
2943 wxCSConv::wxCSConv(wxFontEncoding encoding)
2944 {
2945 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2946 {
2947 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2948
2949 encoding = wxFONTENCODING_SYSTEM;
2950 }
2951
2952 Init();
2953
2954 m_encoding = encoding;
2955 }
2956
2957 wxCSConv::~wxCSConv()
2958 {
2959 Clear();
2960 }
2961
2962 wxCSConv::wxCSConv(const wxCSConv& conv)
2963 : wxMBConv()
2964 {
2965 Init();
2966
2967 SetName(conv.m_name);
2968 m_encoding = conv.m_encoding;
2969 }
2970
2971 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2972 {
2973 Clear();
2974
2975 SetName(conv.m_name);
2976 m_encoding = conv.m_encoding;
2977
2978 return *this;
2979 }
2980
2981 void wxCSConv::Clear()
2982 {
2983 free(m_name);
2984 delete m_convReal;
2985
2986 m_name = NULL;
2987 m_convReal = NULL;
2988 }
2989
2990 void wxCSConv::SetName(const char *charset)
2991 {
2992 if (charset)
2993 {
2994 m_name = wxStrdup(charset);
2995 m_deferred = true;
2996 }
2997 }
2998
2999 #if wxUSE_FONTMAP
3000
3001 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3002 wxEncodingNameCache );
3003
3004 static wxEncodingNameCache gs_nameCache;
3005 #endif
3006
3007 wxMBConv *wxCSConv::DoCreate() const
3008 {
3009 #if wxUSE_FONTMAP
3010 wxLogTrace(TRACE_STRCONV,
3011 wxT("creating conversion for %s"),
3012 (m_name ? m_name
3013 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3014 #endif // wxUSE_FONTMAP
3015
3016 // check for the special case of ASCII or ISO8859-1 charset: as we have
3017 // special knowledge of it anyhow, we don't need to create a special
3018 // conversion object
3019 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3020 m_encoding == wxFONTENCODING_DEFAULT )
3021 {
3022 // don't convert at all
3023 return NULL;
3024 }
3025
3026 // we trust OS to do conversion better than we can so try external
3027 // conversion methods first
3028 //
3029 // the full order is:
3030 // 1. OS conversion (iconv() under Unix or Win32 API)
3031 // 2. hard coded conversions for UTF
3032 // 3. wxEncodingConverter as fall back
3033
3034 // step (1)
3035 #ifdef HAVE_ICONV
3036 #if !wxUSE_FONTMAP
3037 if ( m_name )
3038 #endif // !wxUSE_FONTMAP
3039 {
3040 #if wxUSE_FONTMAP
3041 wxFontEncoding encoding(m_encoding);
3042 #endif
3043
3044 if ( m_name )
3045 {
3046 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3047 if ( conv->IsOk() )
3048 return conv;
3049
3050 delete conv;
3051
3052 #if wxUSE_FONTMAP
3053 encoding =
3054 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3055 #endif // wxUSE_FONTMAP
3056 }
3057 #if wxUSE_FONTMAP
3058 {
3059 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3060 if ( it != gs_nameCache.end() )
3061 {
3062 if ( it->second.empty() )
3063 return NULL;
3064
3065 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3066 if ( conv->IsOk() )
3067 return conv;
3068
3069 delete conv;
3070 }
3071
3072 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3073 // CS : in case this does not return valid names (eg for MacRoman)
3074 // encoding got a 'failure' entry in the cache all the same,
3075 // although it just has to be created using a different method, so
3076 // only store failed iconv creation attempts (or perhaps we
3077 // shoulnd't do this at all ?)
3078 if ( names[0] != NULL )
3079 {
3080 for ( ; *names; ++names )
3081 {
3082 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3083 // will need changes that will obsolete this
3084 wxString name(*names);
3085 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3086 if ( conv->IsOk() )
3087 {
3088 gs_nameCache[encoding] = *names;
3089 return conv;
3090 }
3091
3092 delete conv;
3093 }
3094
3095 gs_nameCache[encoding] = wxT(""); // cache the failure
3096 }
3097 }
3098 #endif // wxUSE_FONTMAP
3099 }
3100 #endif // HAVE_ICONV
3101
3102 #ifdef wxHAVE_WIN32_MB2WC
3103 {
3104 #if wxUSE_FONTMAP
3105 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3106 : new wxMBConv_win32(m_encoding);
3107 if ( conv->IsOk() )
3108 return conv;
3109
3110 delete conv;
3111 #else
3112 return NULL;
3113 #endif
3114 }
3115 #endif // wxHAVE_WIN32_MB2WC
3116
3117 #ifdef __DARWIN__
3118 {
3119 // leave UTF16 and UTF32 to the built-ins of wx
3120 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3121 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3122 {
3123 #if wxUSE_FONTMAP
3124 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3125 : new wxMBConv_cf(m_encoding);
3126 #else
3127 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3128 #endif
3129
3130 if ( conv->IsOk() )
3131 return conv;
3132
3133 delete conv;
3134 }
3135 }
3136 #endif // __DARWIN__
3137
3138 // step (2)
3139 wxFontEncoding enc = m_encoding;
3140 #if wxUSE_FONTMAP
3141 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3142 {
3143 // use "false" to suppress interactive dialogs -- we can be called from
3144 // anywhere and popping up a dialog from here is the last thing we want to
3145 // do
3146 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3147 }
3148 #endif // wxUSE_FONTMAP
3149
3150 switch ( enc )
3151 {
3152 case wxFONTENCODING_UTF7:
3153 return new wxMBConvUTF7;
3154
3155 case wxFONTENCODING_UTF8:
3156 return new wxMBConvUTF8;
3157
3158 case wxFONTENCODING_UTF16BE:
3159 return new wxMBConvUTF16BE;
3160
3161 case wxFONTENCODING_UTF16LE:
3162 return new wxMBConvUTF16LE;
3163
3164 case wxFONTENCODING_UTF32BE:
3165 return new wxMBConvUTF32BE;
3166
3167 case wxFONTENCODING_UTF32LE:
3168 return new wxMBConvUTF32LE;
3169
3170 default:
3171 // nothing to do but put here to suppress gcc warnings
3172 break;
3173 }
3174
3175 // step (3)
3176 #if wxUSE_FONTMAP
3177 {
3178 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3179 : new wxMBConv_wxwin(m_encoding);
3180 if ( conv->IsOk() )
3181 return conv;
3182
3183 delete conv;
3184 }
3185
3186 wxLogTrace(TRACE_STRCONV,
3187 wxT("encoding \"%s\" is not supported by this system"),
3188 (m_name ? wxString(m_name)
3189 : wxFontMapperBase::GetEncodingName(m_encoding)));
3190 #endif // wxUSE_FONTMAP
3191
3192 return NULL;
3193 }
3194
3195 void wxCSConv::CreateConvIfNeeded() const
3196 {
3197 if ( m_deferred )
3198 {
3199 wxCSConv *self = (wxCSConv *)this; // const_cast
3200
3201 // if we don't have neither the name nor the encoding, use the default
3202 // encoding for this system
3203 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3204 {
3205 #if wxUSE_INTL
3206 self->m_encoding = wxLocale::GetSystemEncoding();
3207 #else
3208 // fallback to some reasonable default:
3209 self->m_encoding = wxFONTENCODING_ISO8859_1;
3210 #endif // wxUSE_INTL
3211 }
3212
3213 self->m_convReal = DoCreate();
3214 self->m_deferred = false;
3215 }
3216 }
3217
3218 bool wxCSConv::IsOk() const
3219 {
3220 CreateConvIfNeeded();
3221
3222 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3223 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3224 return true; // always ok as we do it ourselves
3225
3226 // m_convReal->IsOk() is called at its own creation, so we know it must
3227 // be ok if m_convReal is non-NULL
3228 return m_convReal != NULL;
3229 }
3230
3231 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3232 const char *src, size_t srcLen) const
3233 {
3234 CreateConvIfNeeded();
3235
3236 if (m_convReal)
3237 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3238
3239 // latin-1 (direct)
3240 if ( srcLen == wxNO_LEN )
3241 srcLen = strlen(src) + 1; // take trailing NUL too
3242
3243 if ( dst )
3244 {
3245 if ( dstLen < srcLen )
3246 return wxCONV_FAILED;
3247
3248 for ( size_t n = 0; n < srcLen; n++ )
3249 dst[n] = (unsigned char)(src[n]);
3250 }
3251
3252 return srcLen;
3253 }
3254
3255 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3256 const wchar_t *src, size_t srcLen) const
3257 {
3258 CreateConvIfNeeded();
3259
3260 if (m_convReal)
3261 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3262
3263 // latin-1 (direct)
3264 if ( srcLen == wxNO_LEN )
3265 srcLen = wxWcslen(src) + 1;
3266
3267 if ( dst )
3268 {
3269 if ( dstLen < srcLen )
3270 return wxCONV_FAILED;
3271
3272 for ( size_t n = 0; n < srcLen; n++ )
3273 {
3274 if ( src[n] > 0xFF )
3275 return wxCONV_FAILED;
3276
3277 dst[n] = (char)src[n];
3278 }
3279
3280 }
3281 else // still need to check the input validity
3282 {
3283 for ( size_t n = 0; n < srcLen; n++ )
3284 {
3285 if ( src[n] > 0xFF )
3286 return wxCONV_FAILED;
3287 }
3288 }
3289
3290 return srcLen;
3291 }
3292
3293 size_t wxCSConv::GetMBNulLen() const
3294 {
3295 CreateConvIfNeeded();
3296
3297 if ( m_convReal )
3298 {
3299 return m_convReal->GetMBNulLen();
3300 }
3301
3302 // otherwise, we are ISO-8859-1
3303 return 1;
3304 }
3305
3306 #if wxUSE_UNICODE_UTF8
3307 bool wxCSConv::IsUTF8() const
3308 {
3309 CreateConvIfNeeded();
3310
3311 if ( m_convReal )
3312 {
3313 return m_convReal->IsUTF8();
3314 }
3315
3316 // otherwise, we are ISO-8859-1
3317 return false;
3318 }
3319 #endif
3320
3321
3322 #if wxUSE_UNICODE
3323
3324 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3325 {
3326 if ( !s )
3327 return wxWCharBuffer();
3328
3329 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3330 if ( !wbuf )
3331 wbuf = wxMBConvUTF8().cMB2WX(s);
3332 if ( !wbuf )
3333 wbuf = wxConvISO8859_1.cMB2WX(s);
3334
3335 return wbuf;
3336 }
3337
3338 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3339 {
3340 if ( !ws )
3341 return wxCharBuffer();
3342
3343 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3344 if ( !buf )
3345 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3346
3347 return buf;
3348 }
3349
3350 #endif // wxUSE_UNICODE
3351
3352 // ----------------------------------------------------------------------------
3353 // globals
3354 // ----------------------------------------------------------------------------
3355
// NB: The converter objects are not simply defined as global variables but
//     are created by the factory functions below because they may be used at
//     static initialization time (some of them are used by wxString ctors
//     and there may be a global wxString object), i.e. possibly _before_ a
//     global converter object would be initialized.

#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define the pointer to the global converter "conv" of type "iface" and the
// function returning the object of class "impl" which is created, using
// "init" as its initializer, when this function is called for the first time.
#define WX_DEFINE_GLOBAL_CONV2(iface, impl, conv, init)                     \
    WXDLLIMPEXP_DATA_BASE(iface*) conv##Ptr = NULL;                         \
    WXDLLIMPEXP_BASE iface* wxGet_##conv##Ptr()                             \
    {                                                                       \
        static impl conv##Obj init;                                         \
        return &conv##Obj;                                                  \
    }                                                                       \
    /* calling the function here ensures that all global converter */      \
    /* objects are created by the time static initialization is done, */   \
    /* i.e. before any thread is launched: */                               \
    static iface* gs_##conv##instance = wxGet_##conv##Ptr()

// Simpler version for the converters whose object is of the same class as
// the one used for the pointer.
#define WX_DEFINE_GLOBAL_CONV(type, conv, init) \
    WX_DEFINE_GLOBAL_CONV2(type, type, conv, init)
3383
3384 #ifdef __INTELC__
3385 // disable warning "variable 'xxx' was declared but never referenced"
3386 #pragma warning(disable: 177)
3387 #endif // Intel C++
3388
3389 #ifdef __WINDOWS__
3390 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3391 #elif 0 // defined(__WXOSX__)
3392 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
3393 #else
3394 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3395 #endif
3396
3397 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3398 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3399 // provokes an error message about "not enough macro parameters"; and we
3400 // can't use "()" here as the name##Obj declaration would be parsed as a
3401 // function declaration then, so use a semicolon and live with an extra
3402 // empty statement (and hope that no compilers warns about this)
3403 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3404 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3405
3406 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3407 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3408
3409 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3410 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3411
3412 #ifdef __DARWIN__
3413 // The xnu kernel always communicates file paths in decomposed UTF-8.
3414 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3415 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3416 #endif
3417
3418 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3419 #ifdef __DARWIN__
3420 &wxConvMacUTF8DObj;
3421 #else // !__DARWIN__
3422 wxGet_wxConvLibcPtr();
3423 #endif // __DARWIN__/!__DARWIN__
3424
3425 #else // !wxUSE_WCHAR_T
3426
3427 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3428 // stand-ins in absence of wchar_t
3429 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3430 wxConvISO8859_1,
3431 wxConvLocal,
3432 wxConvUTF8;
3433
3434 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T