]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
osx-cocoa updates
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV _T("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
// helper function of ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are NUL (which is
// trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
// Default implementation of the conversion to wide characters, written in
// terms of the old MB2WC(): converts srcLen bytes of src (or the whole
// NUL-terminated string if srcLen is wxNO_LEN) to wide characters.
//
// If dst is NULL nothing is written and only the length is computed,
// otherwise at most dstLen wide characters are stored in dst.
//
// Returns the number of wide characters [which would be] written or
// wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if dst is too small.
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
// append nulLen NUL bytes after the copied data
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called for the entire NUL-terminated string and when it's called with
219 // a fixed number of characters: in the former case (srcEnd == NULL) the
220 // trailing NUL is counted in the returned length; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
227 for ( ;; )
228 {
229 // try to convert the current chunk
230 size_t lenChunk = MB2WC(NULL, src, 0);
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
233
234 dstWritten += lenChunk;
235 if ( !srcEnd )
236 dstWritten++;
237
238 if ( !lenChunk )
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
254 if ( !srcEnd )
255 dst++;
256 }
257
258 if ( !srcEnd )
259 {
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow
262 break;
263 }
264
265 // advance the input pointer past the end of this chunk
266 while ( NotAllNULs(src, nulLen) )
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
272 src += nulLen;
273 }
274
275 src += nulLen; // skipping over its terminator as well
276
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
279 // delimited by srcEnd
280 if ( src >= srcEnd )
281 break;
282
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
// (srcEnd is always non-NULL here as the loop was already left above
// if it was NULL, so this test can't fail)
286 if ( srcEnd )
287 {
288 dstWritten++;
289 if ( dst )
290 dst++;
291 }
292 }
293
294 return dstWritten;
295 }
296
297 size_t
298 wxMBConv::FromWChar(char *dst, size_t dstLen,
299 const wchar_t *src, size_t srcLen) const
300 {
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten = 0;
303
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated = srcLen == wxNO_LEN;
308
309 // make a copy of the input string unless it is already properly
310 // NUL-terminated
311 wxWCharBuffer bufTmp;
312 if ( isNulTerminated )
313 {
314 srcLen = wxWcslen(src) + 1;
315 }
316 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
317 {
318 // make a copy in order to properly NUL-terminate the string
319 bufTmp = wxWCharBuffer(srcLen);
320 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
321 src = bufTmp;
322 }
323
324 const size_t lenNul = GetMBNulLen();
325 for ( const wchar_t * const srcEnd = src + srcLen;
326 src < srcEnd;
327 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
328 {
329 // try to convert the current chunk
330 size_t lenChunk = WC2MB(NULL, src, 0);
331
332 if ( lenChunk == wxCONV_FAILED )
333 return wxCONV_FAILED;
334
335 dstWritten += lenChunk;
336 if ( isNulTerminated )
337 dstWritten += lenNul;
338
339 if ( dst )
340 {
341 if ( dstWritten > dstLen )
342 return wxCONV_FAILED;
343
344 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
345 return wxCONV_FAILED;
346
347 dst += lenChunk;
348 if ( isNulTerminated )
349 dst += lenNul;
350 }
351 }
352
353 return dstWritten;
354 }
355
356 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
357 {
358 size_t rc = ToWChar(outBuff, outLen, inBuff);
359 if ( rc != wxCONV_FAILED )
360 {
361 // ToWChar() returns the buffer length, i.e. including the trailing
362 // NUL, while this method doesn't take it into account
363 rc--;
364 }
365
366 return rc;
367 }
368
369 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
370 {
371 size_t rc = FromWChar(outBuff, outLen, inBuff);
372 if ( rc != wxCONV_FAILED )
373 {
374 rc -= GetMBNulLen();
375 }
376
377 return rc;
378 }
379
// Out-of-line destructor with an intentionally empty body.
380 wxMBConv::~wxMBConv()
381 {
382 // nothing to do here (necessary for Darwin linking probably)
383 }
384
385 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
386 {
387 if ( psz )
388 {
389 // calculate the length of the buffer needed first
390 const size_t nLen = ToWChar(NULL, 0, psz);
391 if ( nLen != wxCONV_FAILED )
392 {
393 // now do the actual conversion
394 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
395
396 // +1 for the trailing NULL
397 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
398 return buf;
399 }
400 }
401
402 return wxWCharBuffer();
403 }
404
405 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
406 {
407 if ( pwz )
408 {
409 const size_t nLen = FromWChar(NULL, 0, pwz);
410 if ( nLen != wxCONV_FAILED )
411 {
412 wxCharBuffer buf(nLen - 1);
413 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
414 return buf;
415 }
416 }
417
418 return wxCharBuffer();
419 }
420
421 const wxWCharBuffer
422 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
423 {
424 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
425 if ( dstLen != wxCONV_FAILED )
426 {
427 // notice that we allocate space for dstLen+1 wide characters here
428 // because we want the buffer to always be NUL-terminated, even if the
429 // input isn't (as otherwise the caller has no way to know its length)
430 wxWCharBuffer wbuf(dstLen);
431 wbuf.data()[dstLen] = L'\0';
432 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
433 {
434 if ( outLen )
435 {
436 *outLen = dstLen;
437
438 // we also need to handle NUL-terminated input strings
439 // specially: for them the output is the length of the string
440 // excluding the trailing NUL, however if we're asked to
441 // convert a specific number of characters we return the length
442 // of the resulting output even if it's NUL-terminated
443 if ( inLen == wxNO_LEN )
444 (*outLen)--;
445 }
446
447 return wbuf;
448 }
449 }
450
451 if ( outLen )
452 *outLen = 0;
453
454 return wxWCharBuffer();
455 }
456
457 const wxCharBuffer
458 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
459 {
460 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
461 if ( dstLen != wxCONV_FAILED )
462 {
463 const size_t nulLen = GetMBNulLen();
464
465 // as above, ensure that the buffer is always NUL-terminated, even if
466 // the input is not
467 wxCharBuffer buf(dstLen + nulLen - 1);
468 memset(buf.data() + dstLen, 0, nulLen);
469 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
470 {
471 if ( outLen )
472 {
473 *outLen = dstLen;
474
475 if ( inLen == wxNO_LEN )
476 {
477 // in this case both input and output are NUL-terminated
478 // and we're not supposed to count NUL
479 *outLen -= nulLen;
480 }
481 }
482
483 return buf;
484 }
485 }
486
487 if ( outLen )
488 *outLen = 0;
489
490 return wxCharBuffer();
491 }
492
493 // ----------------------------------------------------------------------------
494 // wxMBConvLibc
495 // ----------------------------------------------------------------------------
496
497 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
498 {
499 return wxMB2WC(buf, psz, n);
500 }
501
502 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
503 {
504 return wxWC2MB(buf, psz, n);
505 }
506
507 // ----------------------------------------------------------------------------
508 // wxConvBrokenFileNames
509 // ----------------------------------------------------------------------------
510
#ifdef __UNIX__

// Creates the conversion used for the given charset: a wxMBConvUTF8 using
// MAP_INVALID_UTF8_TO_PUA if the charset is "UTF-8" or "UTF8" (compared
// case-insensitively) and a wxCSConv for the charset otherwise.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    const bool isUTF8 = wxStricmp(charset, _T("UTF-8")) == 0 ||
                        wxStricmp(charset, _T("UTF8")) == 0;

    if ( !isUTF8 )
    {
        m_conv = new wxCSConv(charset);
        return;
    }

    m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
}

#endif // __UNIX__
523
524 // ----------------------------------------------------------------------------
525 // UTF-7
526 // ----------------------------------------------------------------------------
527
528 // Implementation (C) 2004 Fredrik Roubert
529 //
530 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
531
532 //
533 // BASE64 decoding table
534 //
// It is indexed by the byte value and has one entry for each of the 256
// possible values: the entries for 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/'
// contain the 6 bit value of the character (0x00..0x3f) and all the others
// contain 0xff.
535 static const unsigned char utf7unb64[] =
536 {
537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
539 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
540 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
541 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
542 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
543 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
544 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
545 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
546 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
547 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
548 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
549 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
550 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
551 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
552 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
553 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
554 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
555 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
556 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
557 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
558 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
559 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
560 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
561 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
562 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
563 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
564 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
565 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
566 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
567 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
568 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
569 };
570
571 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
572 const char *src, size_t srcLen) const
573 {
574 DecoderState stateOrig,
575 *statePtr;
576 if ( srcLen == wxNO_LEN )
577 {
578 // convert the entire string, up to and including the trailing NUL
579 srcLen = strlen(src) + 1;
580
581 // when working on the entire strings we don't update nor use the shift
582 // state from the previous call
583 statePtr = &stateOrig;
584 }
585 else // when working with partial strings we do use the shift state
586 {
587 statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
588
589 // also save the old state to be able to rollback to it on error
590 stateOrig = m_stateDecoder;
591 }
592
593 // but to simplify the code below we use this variable in both cases
594 DecoderState& state = *statePtr;
595
596
597 // number of characters [which would have been] written to dst [if it were
598 // not NULL]
599 size_t len = 0;
600
601 const char * const srcEnd = src + srcLen;
602
603 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
604 {
605 const unsigned char cc = *src++;
606
607 if ( state.IsShifted() )
608 {
609 const unsigned char dc = utf7unb64[cc];
610 if ( dc == 0xff )
611 {
612 // end of encoded part, check that nothing was left: there can
613 // be up to 4 bits of 0 padding but nothing else (we also need
614 // to check isLSB as we count bits modulo 8 while a valid UTF-7
615 // encoded sequence must contain an integral number of UTF-16
616 // characters)
617 if ( state.isLSB || state.bit > 4 ||
618 (state.accum & ((1 << state.bit) - 1)) )
619 {
620 if ( !len )
621 state = stateOrig;
622
623 return wxCONV_FAILED;
624 }
625
626 state.ToDirect();
627
628 // re-parse this character normally below unless it's '-' which
629 // is consumed by the decoder
630 if ( cc == '-' )
631 continue;
632 }
633 else // valid encoded character
634 {
635 // mini base64 decoder: each character is 6 bits
636 state.bit += 6;
637 state.accum <<= 6;
638 state.accum += dc;
639
640 if ( state.bit >= 8 )
641 {
642 // got the full byte, consume it
643 state.bit -= 8;
644 unsigned char b = (state.accum >> state.bit) & 0x00ff;
645
646 if ( state.isLSB )
647 {
648 // we've got the full word, output it
649 if ( dst )
650 *dst++ = (state.msb << 8) | b;
651 len++;
652 state.isLSB = false;
653 }
654 else // MSB
655 {
656 // just store it while we wait for LSB
657 state.msb = b;
658 state.isLSB = true;
659 }
660 }
661 }
662 }
663
664 if ( state.IsDirect() )
665 {
666 // start of an encoded segment?
667 if ( cc == '+' )
668 {
669 if ( *src == '-' )
670 {
671 // just the encoded plus sign, don't switch to shifted mode
672 if ( dst )
673 *dst++ = '+';
674 len++;
675 src++;
676 }
677 else if ( utf7unb64[(unsigned)*src] == 0xff )
678 {
679 // empty encoded chunks are not allowed
680 if ( !len )
681 state = stateOrig;
682
683 return wxCONV_FAILED;
684 }
685 else // base-64 encoded chunk follows
686 {
687 state.ToShifted();
688 }
689 }
690 else // not '+'
691 {
692 // only printable 7 bit ASCII characters (with the exception of
693 // NUL, TAB, CR and LF) can be used directly
694 if ( cc >= 0x7f || (cc < ' ' &&
695 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
696 return wxCONV_FAILED;
697
698 if ( dst )
699 *dst++ = cc;
700 len++;
701 }
702 }
703 }
704
705 if ( !len )
706 {
707 // as we didn't read any characters we should be called with the same
708 // data (followed by some more new data) again later so don't save our
709 // state
710 state = stateOrig;
711
712 return wxCONV_FAILED;
713 }
714
715 return len;
716 }
717
718 //
719 // BASE64 encoding table
720 //
// It is indexed by a 6 bit value (0..63) and contains the character used to
// represent it: 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/', in this order.
721 static const unsigned char utf7enb64[] =
722 {
723 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
724 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
725 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
726 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
727 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
728 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
729 'w', 'x', 'y', 'z', '0', '1', '2', '3',
730 '4', '5', '6', '7', '8', '9', '+', '/'
731 };
732
733 //
734 // UTF-7 encoding table
735 //
736 // 0 - Set D (directly encoded characters)
737 // 1 - Set O (optional direct characters)
738 // 2 - whitespace characters (optional)
739 // 3 - special characters
740 //
// one entry for each of the 128 ASCII characters, indexed by character code
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// returns true if wc is an ASCII character whose utf7encode entry is 0, i.e.
// if it can be written to UTF-7 output as is, and false for all the other
// characters
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t can be a signed type, so convert the character to an unsigned
    // value before using it: otherwise a negative value would pass the range
    // check below and then be used as a negative index into the table
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
757
// Encodes srcLen wide characters of src (or the whole NUL-terminated string,
// including its NUL, if srcLen is wxNO_LEN) as UTF-7.
//
// If dst is NULL nothing is written, only the length is computed and the
// encoder state is restored to its original value before returning.
//
// When an explicit length is given the encoder state left by the previous
// call is used and updated, otherwise a temporary state is used.
//
// Returns the number of bytes [which would be] written or wxCONV_FAILED if a
// character greater than 0xffff is encountered when wchar_t is not UTF-16.
//
// NOTE(review): len is only compared with dstLen at the top of the outer
// loop while a single iteration can output several bytes, so it looks like
// dst could be overrun if it is too small -- to be confirmed.
758 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
759 const wchar_t *src, size_t srcLen) const
760 {
761 EncoderState stateOrig,
762 *statePtr;
763 if ( srcLen == wxNO_LEN )
764 {
765 // we don't apply the stored state when operating on entire strings at
766 // once
767 statePtr = &stateOrig;
768
769 srcLen = wxWcslen(src) + 1;
770 }
771 else // do use the mode we left the output in previously
772 {
773 stateOrig = m_stateEncoder;
774 statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
775 }
776
777 EncoderState& state = *statePtr;
778
779
// number of bytes [which would have been] written to dst [if it were not
// NULL]
780 size_t len = 0;
781
782 const wchar_t * const srcEnd = src + srcLen;
783 while ( src < srcEnd && (!dst || len < dstLen) )
784 {
785 wchar_t cc = *src++;
786 if ( wxIsUTF7Direct(cc) )
787 {
788 if ( state.IsShifted() )
789 {
790 // pad with zeros the last encoded block if necessary
791 if ( state.bit )
792 {
793 if ( dst )
794 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
795 len++;
796 }
797
798 state.ToDirect();
799
// terminate the encoded block
800 if ( dst )
801 *dst++ = '-';
802 len++;
803 }
804
// the character itself is output unchanged
805 if ( dst )
806 *dst++ = (char)cc;
807 len++;
808 }
809 else if ( cc == '+' && state.IsDirect() )
810 {
// a plus sign outside of an encoded block is written as "+-"
811 if ( dst )
812 {
813 *dst++ = '+';
814 *dst++ = '-';
815 }
816
817 len += 2;
818 }
819 #ifndef WC_UTF16
820 else if (((wxUint32)cc) > 0xffff)
821 {
822 // no surrogate pair generation (yet?)
823 return wxCONV_FAILED;
824 }
825 #endif
826 else
827 {
828 if ( state.IsDirect() )
829 {
830 state.ToShifted();
831
// start a new encoded block
832 if ( dst )
833 *dst++ = '+';
834 len++;
835 }
836
837 // BASE64 encode string
838 for ( ;; )
839 {
// feed the character to the encoder as 2 bytes, the most significant
// one first
840 for ( unsigned lsb = 0; lsb < 2; lsb++ )
841 {
842 state.accum <<= 8;
843 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
844
// output a character for each complete group of 6 bits accumulated
845 for (state.bit += 8; state.bit >= 6; )
846 {
847 state.bit -= 6;
848 if ( dst )
849 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
850 len++;
851 }
852 }
853
// stop at the end of input or at the first character which can be
// output directly: it is not consumed here but left for the next
// iteration of the outer loop, which also ends the encoded block
854 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
855 break;
856
857 src++;
858 }
859 }
860 }
861
862 // we need to restore the original encoder state if we were called just to
863 // calculate the amount of space needed as we will presumably be called
864 // again to really convert the data now
865 if ( !dst )
866 state = stateOrig;
867
868 return len;
869 }
870
871 // ----------------------------------------------------------------------------
872 // UTF-8
873 // ----------------------------------------------------------------------------
874
// utf8_max[n] is the greatest value which can be encoded using a UTF-8
// sequence of n+1 bytes (for n from 0 to 5), the last element has all bits set
875 static const wxUint32 utf8_max[]=
876 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
877
878 // boundaries of the private use area we use to (temporarily) remap
879 // characters invalid in a UTF-8 encoded string
// (the range is half-open: it contains 256 values starting at wxUnicodePUA
// and wxUnicodePUAEnd itself is not part of it)
880 const wxUint32 wxUnicodePUA = 0x100000;
881 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
882
883 // this table gives the length of the UTF-8 encoding from its first character:
// (it is indexed by the value of the first byte and contains 0 for the bytes
// which can't start a sequence)
884 const unsigned char tableUtf8Lengths[256] = {
885 // single-byte sequences (ASCII):
886 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
887 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
888 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
889 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
890 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
891 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
892 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
893 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
894
895 // these are invalid:
896 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
897 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
898 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
900 0, 0, // C0,C1
901
902 // two-byte sequences:
903 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
904 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
905
906 // three-byte sequences:
907 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
908
909 // four-byte sequences:
910 4, 4, 4, 4, 4, // F0..F4
911
912 // these are invalid again (5- or 6-byte
913 // sequences and sequences for code points
914 // above U+10FFFF, as restricted by RFC 3629):
915 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
916 };
917
918 size_t
919 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
920 const char *src, size_t srcLen) const
921 {
922 wchar_t *out = dstLen ? dst : NULL;
923 size_t written = 0;
924
925 if ( srcLen == wxNO_LEN )
926 srcLen = strlen(src) + 1;
927
928 for ( const char *p = src; ; p++ )
929 {
930 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
931 {
932 // all done successfully, just add the trailing NULL if we are not
933 // using explicit length
934 if ( srcLen == wxNO_LEN )
935 {
936 if ( out )
937 {
938 if ( !dstLen )
939 break;
940
941 *out = L'\0';
942 }
943
944 written++;
945 }
946
947 return written;
948 }
949
950 if ( out && !dstLen-- )
951 break;
952
953 wxUint32 code;
954 unsigned char c = *p;
955
956 if ( c < 0x80 )
957 {
958 if ( srcLen == 0 ) // the test works for wxNO_LEN too
959 break;
960
961 if ( srcLen != wxNO_LEN )
962 srcLen--;
963
964 code = c;
965 }
966 else
967 {
968 unsigned len = tableUtf8Lengths[c];
969 if ( !len )
970 break;
971
972 if ( srcLen < len ) // the test works for wxNO_LEN too
973 break;
974
975 if ( srcLen != wxNO_LEN )
976 srcLen -= len;
977
978 // Char. number range | UTF-8 octet sequence
979 // (hexadecimal) | (binary)
980 // ----------------------+----------------------------------------
981 // 0000 0000 - 0000 007F | 0xxxxxxx
982 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
983 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
984 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
985 //
986 // Code point value is stored in bits marked with 'x',
987 // lowest-order bit of the value on the right side in the diagram
988 // above. (from RFC 3629)
989
990 // mask to extract lead byte's value ('x' bits above), by sequence
991 // length:
992 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
993
994 // mask and value of lead byte's most significant bits, by length:
995 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
996 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
997
998 len--; // it's more convenient to work with 0-based length here
999
1000 // extract the lead byte's value bits:
1001 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1002 break;
1003
1004 code = c & leadValueMask[len];
1005
1006 // all remaining bytes, if any, are handled in the same way
1007 // regardless of sequence's length:
1008 for ( ; len; --len )
1009 {
1010 c = *++p;
1011 if ( (c & 0xC0) != 0x80 )
1012 return wxCONV_FAILED;
1013
1014 code <<= 6;
1015 code |= c & 0x3F;
1016 }
1017 }
1018
1019 #ifdef WC_UTF16
1020 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1021 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1022 {
1023 if ( out )
1024 out++;
1025 written++;
1026 }
1027 #else // !WC_UTF16
1028 if ( out )
1029 *out = code;
1030 #endif // WC_UTF16/!WC_UTF16
1031
1032 if ( out )
1033 out++;
1034
1035 written++;
1036 }
1037
1038 return wxCONV_FAILED;
1039 }
1040
1041 size_t
1042 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1043 const wchar_t *src, size_t srcLen) const
1044 {
1045 char *out = dstLen ? dst : NULL;
1046 size_t written = 0;
1047
1048 for ( const wchar_t *wp = src; ; wp++ )
1049 {
1050 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1051 {
1052 // all done successfully, just add the trailing NULL if we are not
1053 // using explicit length
1054 if ( srcLen == wxNO_LEN )
1055 {
1056 if ( out )
1057 {
1058 if ( !dstLen )
1059 break;
1060
1061 *out = '\0';
1062 }
1063
1064 written++;
1065 }
1066
1067 return written;
1068 }
1069
1070 if ( srcLen != wxNO_LEN )
1071 srcLen--;
1072
1073 wxUint32 code;
1074 #ifdef WC_UTF16
1075 // cast is ok for WC_UTF16
1076 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1077 {
1078 // skip the next char too as we decoded a surrogate
1079 wp++;
1080 }
1081 #else // wchar_t is UTF-32
1082 code = *wp & 0x7fffffff;
1083 #endif
1084
1085 unsigned len;
1086 if ( code <= 0x7F )
1087 {
1088 len = 1;
1089 if ( out )
1090 {
1091 if ( dstLen < len )
1092 break;
1093
1094 out[0] = (char)code;
1095 }
1096 }
1097 else if ( code <= 0x07FF )
1098 {
1099 len = 2;
1100 if ( out )
1101 {
1102 if ( dstLen < len )
1103 break;
1104
1105 // NB: this line takes 6 least significant bits, encodes them as
1106 // 10xxxxxx and discards them so that the next byte can be encoded:
1107 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1108 out[0] = 0xC0 | code;
1109 }
1110 }
1111 else if ( code < 0xFFFF )
1112 {
1113 len = 3;
1114 if ( out )
1115 {
1116 if ( dstLen < len )
1117 break;
1118
1119 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1120 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1121 out[0] = 0xE0 | code;
1122 }
1123 }
1124 else if ( code <= 0x10FFFF )
1125 {
1126 len = 4;
1127 if ( out )
1128 {
1129 if ( dstLen < len )
1130 break;
1131
1132 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1133 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1134 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1135 out[0] = 0xF0 | code;
1136 }
1137 }
1138 else
1139 {
1140 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1141 break;
1142 }
1143
1144 if ( out )
1145 {
1146 out += len;
1147 dstLen -= len;
1148 }
1149
1150 written += len;
1151 }
1152
1153 // we only get here if an error occurs during decoding
1154 return wxCONV_FAILED;
1155 }
1156
1157 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1158 const char *psz, size_t srcLen) const
1159 {
1160 if ( m_options == MAP_INVALID_UTF8_NOT )
1161 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1162
1163 size_t len = 0;
1164
1165 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1166 {
1167 const char *opsz = psz;
1168 bool invalid = false;
1169 unsigned char cc = *psz++, fc = cc;
1170 unsigned cnt;
1171 for (cnt = 0; fc & 0x80; cnt++)
1172 fc <<= 1;
1173
1174 if (!cnt)
1175 {
1176 // plain ASCII char
1177 if (buf)
1178 *buf++ = cc;
1179 len++;
1180
1181 // escape the escape character for octal escapes
1182 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1183 && cc == '\\' && (!buf || len < n))
1184 {
1185 if (buf)
1186 *buf++ = cc;
1187 len++;
1188 }
1189 }
1190 else
1191 {
1192 cnt--;
1193 if (!cnt)
1194 {
1195 // invalid UTF-8 sequence
1196 invalid = true;
1197 }
1198 else
1199 {
1200 unsigned ocnt = cnt - 1;
1201 wxUint32 res = cc & (0x3f >> cnt);
1202 while (cnt--)
1203 {
1204 cc = *psz;
1205 if ((cc & 0xC0) != 0x80)
1206 {
1207 // invalid UTF-8 sequence
1208 invalid = true;
1209 break;
1210 }
1211
1212 psz++;
1213 res = (res << 6) | (cc & 0x3f);
1214 }
1215
1216 if (invalid || res <= utf8_max[ocnt])
1217 {
1218 // illegal UTF-8 encoding
1219 invalid = true;
1220 }
1221 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1222 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1223 {
1224 // if one of our PUA characters turns up externally
1225 // it must also be treated as an illegal sequence
1226 // (a bit like you have to escape an escape character)
1227 invalid = true;
1228 }
1229 else
1230 {
1231 #ifdef WC_UTF16
1232 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1233 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1234 if (pa == wxCONV_FAILED)
1235 {
1236 invalid = true;
1237 }
1238 else
1239 {
1240 if (buf)
1241 buf += pa;
1242 len += pa;
1243 }
1244 #else // !WC_UTF16
1245 if (buf)
1246 *buf++ = (wchar_t)res;
1247 len++;
1248 #endif // WC_UTF16/!WC_UTF16
1249 }
1250 }
1251
1252 if (invalid)
1253 {
1254 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1255 {
1256 while (opsz < psz && (!buf || len < n))
1257 {
1258 #ifdef WC_UTF16
1259 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1260 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1261 wxASSERT(pa != wxCONV_FAILED);
1262 if (buf)
1263 buf += pa;
1264 opsz++;
1265 len += pa;
1266 #else
1267 if (buf)
1268 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1269 opsz++;
1270 len++;
1271 #endif
1272 }
1273 }
1274 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1275 {
1276 while (opsz < psz && (!buf || len < n))
1277 {
1278 if ( buf && len + 3 < n )
1279 {
1280 unsigned char on = *opsz;
1281 *buf++ = L'\\';
1282 *buf++ = (wchar_t)( L'0' + on / 0100 );
1283 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1284 *buf++ = (wchar_t)( L'0' + on % 010 );
1285 }
1286
1287 opsz++;
1288 len += 4;
1289 }
1290 }
1291 else // MAP_INVALID_UTF8_NOT
1292 {
1293 return wxCONV_FAILED;
1294 }
1295 }
1296 }
1297 }
1298
1299 if (srcLen == wxNO_LEN && buf && (len < n))
1300 *buf = 0;
1301
1302 return len + 1;
1303 }
1304
// Returns true if the wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1309
1310 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1311 const wchar_t *psz, size_t srcLen) const
1312 {
1313 if ( m_options == MAP_INVALID_UTF8_NOT )
1314 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1315
1316 size_t len = 0;
1317
1318 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1319 {
1320 wxUint32 cc;
1321
1322 #ifdef WC_UTF16
1323 // cast is ok for WC_UTF16
1324 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1325 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1326 #else
1327 cc = (*psz++) & 0x7fffffff;
1328 #endif
1329
1330 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1331 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1332 {
1333 if (buf)
1334 *buf++ = (char)(cc - wxUnicodePUA);
1335 len++;
1336 }
1337 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1338 && cc == L'\\' && psz[0] == L'\\' )
1339 {
1340 if (buf)
1341 *buf++ = (char)cc;
1342 psz++;
1343 len++;
1344 }
1345 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1346 cc == L'\\' &&
1347 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1348 {
1349 if (buf)
1350 {
1351 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1352 (psz[1] - L'0') * 010 +
1353 (psz[2] - L'0'));
1354 }
1355
1356 psz += 3;
1357 len++;
1358 }
1359 else
1360 {
1361 unsigned cnt;
1362 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1363 {
1364 }
1365
1366 if (!cnt)
1367 {
1368 // plain ASCII char
1369 if (buf)
1370 *buf++ = (char) cc;
1371 len++;
1372 }
1373 else
1374 {
1375 len += cnt + 1;
1376 if (buf)
1377 {
1378 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1379 while (cnt--)
1380 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1381 }
1382 }
1383 }
1384 }
1385
1386 if (srcLen == wxNO_LEN && buf && (len < n))
1387 *buf = 0;
1388
1389 return len + 1;
1390 }
1391
1392 // ============================================================================
1393 // UTF-16
1394 // ============================================================================
1395
// "straight" is the UTF-16 flavour using the byte order of this machine and
// "swap" the one using the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // big endian
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1403
1404 /* static */
1405 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1406 {
1407 if ( srcLen == wxNO_LEN )
1408 {
1409 // count the number of bytes in input, including the trailing NULs
1410 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1411 for ( srcLen = 1; *inBuff++; srcLen++ )
1412 ;
1413
1414 srcLen *= BYTES_PER_CHAR;
1415 }
1416 else // we already have the length
1417 {
1418 // we can only convert an entire number of UTF-16 characters
1419 if ( srcLen % BYTES_PER_CHAR )
1420 return wxCONV_FAILED;
1421 }
1422
1423 return srcLen;
1424 }
1425
1426 // case when in-memory representation is UTF-16 too
1427 #ifdef WC_UTF16
1428
1429 // ----------------------------------------------------------------------------
1430 // conversions without endianness change
1431 // ----------------------------------------------------------------------------
1432
1433 size_t
1434 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1435 const char *src, size_t srcLen) const
1436 {
1437 // set up the scene for using memcpy() (which is presumably more efficient
1438 // than copying the bytes one by one)
1439 srcLen = GetLength(src, srcLen);
1440 if ( srcLen == wxNO_LEN )
1441 return wxCONV_FAILED;
1442
1443 const size_t inLen = srcLen / BYTES_PER_CHAR;
1444 if ( dst )
1445 {
1446 if ( dstLen < inLen )
1447 return wxCONV_FAILED;
1448
1449 memcpy(dst, src, srcLen);
1450 }
1451
1452 return inLen;
1453 }
1454
1455 size_t
1456 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1457 const wchar_t *src, size_t srcLen) const
1458 {
1459 if ( srcLen == wxNO_LEN )
1460 srcLen = wxWcslen(src) + 1;
1461
1462 srcLen *= BYTES_PER_CHAR;
1463
1464 if ( dst )
1465 {
1466 if ( dstLen < srcLen )
1467 return wxCONV_FAILED;
1468
1469 memcpy(dst, src, srcLen);
1470 }
1471
1472 return srcLen;
1473 }
1474
1475 // ----------------------------------------------------------------------------
1476 // endian-reversing conversions
1477 // ----------------------------------------------------------------------------
1478
1479 size_t
1480 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1481 const char *src, size_t srcLen) const
1482 {
1483 srcLen = GetLength(src, srcLen);
1484 if ( srcLen == wxNO_LEN )
1485 return wxCONV_FAILED;
1486
1487 srcLen /= BYTES_PER_CHAR;
1488
1489 if ( dst )
1490 {
1491 if ( dstLen < srcLen )
1492 return wxCONV_FAILED;
1493
1494 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1495 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1496 {
1497 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1498 }
1499 }
1500
1501 return srcLen;
1502 }
1503
1504 size_t
1505 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1506 const wchar_t *src, size_t srcLen) const
1507 {
1508 if ( srcLen == wxNO_LEN )
1509 srcLen = wxWcslen(src) + 1;
1510
1511 srcLen *= BYTES_PER_CHAR;
1512
1513 if ( dst )
1514 {
1515 if ( dstLen < srcLen )
1516 return wxCONV_FAILED;
1517
1518 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1519 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1520 {
1521 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1522 }
1523 }
1524
1525 return srcLen;
1526 }
1527
1528 #else // !WC_UTF16: wchar_t is UTF-32
1529
1530 // ----------------------------------------------------------------------------
1531 // conversions without endianness change
1532 // ----------------------------------------------------------------------------
1533
1534 size_t
1535 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1536 const char *src, size_t srcLen) const
1537 {
1538 srcLen = GetLength(src, srcLen);
1539 if ( srcLen == wxNO_LEN )
1540 return wxCONV_FAILED;
1541
1542 const size_t inLen = srcLen / BYTES_PER_CHAR;
1543 if ( !dst )
1544 {
1545 // optimization: return maximal space which could be needed for this
1546 // string even if the real size could be smaller if the buffer contains
1547 // any surrogates
1548 return inLen;
1549 }
1550
1551 size_t outLen = 0;
1552 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1553 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1554 {
1555 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1556 if ( !inBuff )
1557 return wxCONV_FAILED;
1558
1559 if ( ++outLen > dstLen )
1560 return wxCONV_FAILED;
1561
1562 *dst++ = ch;
1563 }
1564
1565
1566 return outLen;
1567 }
1568
1569 size_t
1570 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1571 const wchar_t *src, size_t srcLen) const
1572 {
1573 if ( srcLen == wxNO_LEN )
1574 srcLen = wxWcslen(src) + 1;
1575
1576 size_t outLen = 0;
1577 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1578 for ( size_t n = 0; n < srcLen; n++ )
1579 {
1580 wxUint16 cc[2];
1581 const size_t numChars = encode_utf16(*src++, cc);
1582 if ( numChars == wxCONV_FAILED )
1583 return wxCONV_FAILED;
1584
1585 outLen += numChars * BYTES_PER_CHAR;
1586 if ( outBuff )
1587 {
1588 if ( outLen > dstLen )
1589 return wxCONV_FAILED;
1590
1591 *outBuff++ = cc[0];
1592 if ( numChars == 2 )
1593 {
1594 // second character of a surrogate
1595 *outBuff++ = cc[1];
1596 }
1597 }
1598 }
1599
1600 return outLen;
1601 }
1602
1603 // ----------------------------------------------------------------------------
1604 // endian-reversing conversions
1605 // ----------------------------------------------------------------------------
1606
1607 size_t
1608 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1609 const char *src, size_t srcLen) const
1610 {
1611 srcLen = GetLength(src, srcLen);
1612 if ( srcLen == wxNO_LEN )
1613 return wxCONV_FAILED;
1614
1615 const size_t inLen = srcLen / BYTES_PER_CHAR;
1616 if ( !dst )
1617 {
1618 // optimization: return maximal space which could be needed for this
1619 // string even if the real size could be smaller if the buffer contains
1620 // any surrogates
1621 return inLen;
1622 }
1623
1624 size_t outLen = 0;
1625 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1626 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1627 {
1628 wxUint32 ch;
1629 wxUint16 tmp[2];
1630
1631 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1632 inBuff++;
1633 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1634
1635 const size_t numChars = decode_utf16(tmp, ch);
1636 if ( numChars == wxCONV_FAILED )
1637 return wxCONV_FAILED;
1638
1639 if ( numChars == 2 )
1640 inBuff++;
1641
1642 if ( ++outLen > dstLen )
1643 return wxCONV_FAILED;
1644
1645 *dst++ = ch;
1646 }
1647
1648
1649 return outLen;
1650 }
1651
1652 size_t
1653 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1654 const wchar_t *src, size_t srcLen) const
1655 {
1656 if ( srcLen == wxNO_LEN )
1657 srcLen = wxWcslen(src) + 1;
1658
1659 size_t outLen = 0;
1660 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1661 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1662 {
1663 wxUint16 cc[2];
1664 const size_t numChars = encode_utf16(*src, cc);
1665 if ( numChars == wxCONV_FAILED )
1666 return wxCONV_FAILED;
1667
1668 outLen += numChars * BYTES_PER_CHAR;
1669 if ( outBuff )
1670 {
1671 if ( outLen > dstLen )
1672 return wxCONV_FAILED;
1673
1674 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1675 if ( numChars == 2 )
1676 {
1677 // second character of a surrogate
1678 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1679 }
1680 }
1681 }
1682
1683 return outLen;
1684 }
1685
1686 #endif // WC_UTF16/!WC_UTF16
1687
1688
1689 // ============================================================================
1690 // UTF-32
1691 // ============================================================================
1692
1693 #ifdef WORDS_BIGENDIAN
1694 #define wxMBConvUTF32straight wxMBConvUTF32BE
1695 #define wxMBConvUTF32swap wxMBConvUTF32LE
1696 #else
1697 #define wxMBConvUTF32swap wxMBConvUTF32BE
1698 #define wxMBConvUTF32straight wxMBConvUTF32LE
1699 #endif
1700
1701
1702 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1703 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1704
1705 /* static */
1706 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1707 {
1708 if ( srcLen == wxNO_LEN )
1709 {
1710 // count the number of bytes in input, including the trailing NULs
1711 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1712 for ( srcLen = 1; *inBuff++; srcLen++ )
1713 ;
1714
1715 srcLen *= BYTES_PER_CHAR;
1716 }
1717 else // we already have the length
1718 {
1719 // we can only convert an entire number of UTF-32 characters
1720 if ( srcLen % BYTES_PER_CHAR )
1721 return wxCONV_FAILED;
1722 }
1723
1724 return srcLen;
1725 }
1726
1727 // case when in-memory representation is UTF-16
1728 #ifdef WC_UTF16
1729
1730 // ----------------------------------------------------------------------------
1731 // conversions without endianness change
1732 // ----------------------------------------------------------------------------
1733
1734 size_t
1735 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1736 const char *src, size_t srcLen) const
1737 {
1738 srcLen = GetLength(src, srcLen);
1739 if ( srcLen == wxNO_LEN )
1740 return wxCONV_FAILED;
1741
1742 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1743 const size_t inLen = srcLen / BYTES_PER_CHAR;
1744 size_t outLen = 0;
1745 for ( size_t n = 0; n < inLen; n++ )
1746 {
1747 wxUint16 cc[2];
1748 const size_t numChars = encode_utf16(*inBuff++, cc);
1749 if ( numChars == wxCONV_FAILED )
1750 return wxCONV_FAILED;
1751
1752 outLen += numChars;
1753 if ( dst )
1754 {
1755 if ( outLen > dstLen )
1756 return wxCONV_FAILED;
1757
1758 *dst++ = cc[0];
1759 if ( numChars == 2 )
1760 {
1761 // second character of a surrogate
1762 *dst++ = cc[1];
1763 }
1764 }
1765 }
1766
1767 return outLen;
1768 }
1769
1770 size_t
1771 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1772 const wchar_t *src, size_t srcLen) const
1773 {
1774 if ( srcLen == wxNO_LEN )
1775 srcLen = wxWcslen(src) + 1;
1776
1777 if ( !dst )
1778 {
1779 // optimization: return maximal space which could be needed for this
1780 // string instead of the exact amount which could be less if there are
1781 // any surrogates in the input
1782 //
1783 // we consider that surrogates are rare enough to make it worthwhile to
1784 // avoid running the loop below at the cost of slightly extra memory
1785 // consumption
1786 return srcLen * BYTES_PER_CHAR;
1787 }
1788
1789 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1790 size_t outLen = 0;
1791 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1792 {
1793 const wxUint32 ch = wxDecodeSurrogate(&src);
1794 if ( !src )
1795 return wxCONV_FAILED;
1796
1797 outLen += BYTES_PER_CHAR;
1798
1799 if ( outLen > dstLen )
1800 return wxCONV_FAILED;
1801
1802 *outBuff++ = ch;
1803 }
1804
1805 return outLen;
1806 }
1807
1808 // ----------------------------------------------------------------------------
1809 // endian-reversing conversions
1810 // ----------------------------------------------------------------------------
1811
1812 size_t
1813 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1814 const char *src, size_t srcLen) const
1815 {
1816 srcLen = GetLength(src, srcLen);
1817 if ( srcLen == wxNO_LEN )
1818 return wxCONV_FAILED;
1819
1820 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1821 const size_t inLen = srcLen / BYTES_PER_CHAR;
1822 size_t outLen = 0;
1823 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1824 {
1825 wxUint16 cc[2];
1826 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1827 if ( numChars == wxCONV_FAILED )
1828 return wxCONV_FAILED;
1829
1830 outLen += numChars;
1831 if ( dst )
1832 {
1833 if ( outLen > dstLen )
1834 return wxCONV_FAILED;
1835
1836 *dst++ = cc[0];
1837 if ( numChars == 2 )
1838 {
1839 // second character of a surrogate
1840 *dst++ = cc[1];
1841 }
1842 }
1843 }
1844
1845 return outLen;
1846 }
1847
1848 size_t
1849 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1850 const wchar_t *src, size_t srcLen) const
1851 {
1852 if ( srcLen == wxNO_LEN )
1853 srcLen = wxWcslen(src) + 1;
1854
1855 if ( !dst )
1856 {
1857 // optimization: return maximal space which could be needed for this
1858 // string instead of the exact amount which could be less if there are
1859 // any surrogates in the input
1860 //
1861 // we consider that surrogates are rare enough to make it worthwhile to
1862 // avoid running the loop below at the cost of slightly extra memory
1863 // consumption
1864 return srcLen*BYTES_PER_CHAR;
1865 }
1866
1867 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1868 size_t outLen = 0;
1869 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1870 {
1871 const wxUint32 ch = wxDecodeSurrogate(&src);
1872 if ( !src )
1873 return wxCONV_FAILED;
1874
1875 outLen += BYTES_PER_CHAR;
1876
1877 if ( outLen > dstLen )
1878 return wxCONV_FAILED;
1879
1880 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1881 }
1882
1883 return outLen;
1884 }
1885
1886 #else // !WC_UTF16: wchar_t is UTF-32
1887
1888 // ----------------------------------------------------------------------------
1889 // conversions without endianness change
1890 // ----------------------------------------------------------------------------
1891
1892 size_t
1893 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1894 const char *src, size_t srcLen) const
1895 {
1896 // use memcpy() as it should be much faster than hand-written loop
1897 srcLen = GetLength(src, srcLen);
1898 if ( srcLen == wxNO_LEN )
1899 return wxCONV_FAILED;
1900
1901 const size_t inLen = srcLen/BYTES_PER_CHAR;
1902 if ( dst )
1903 {
1904 if ( dstLen < inLen )
1905 return wxCONV_FAILED;
1906
1907 memcpy(dst, src, srcLen);
1908 }
1909
1910 return inLen;
1911 }
1912
1913 size_t
1914 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1915 const wchar_t *src, size_t srcLen) const
1916 {
1917 if ( srcLen == wxNO_LEN )
1918 srcLen = wxWcslen(src) + 1;
1919
1920 srcLen *= BYTES_PER_CHAR;
1921
1922 if ( dst )
1923 {
1924 if ( dstLen < srcLen )
1925 return wxCONV_FAILED;
1926
1927 memcpy(dst, src, srcLen);
1928 }
1929
1930 return srcLen;
1931 }
1932
1933 // ----------------------------------------------------------------------------
1934 // endian-reversing conversions
1935 // ----------------------------------------------------------------------------
1936
1937 size_t
1938 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1939 const char *src, size_t srcLen) const
1940 {
1941 srcLen = GetLength(src, srcLen);
1942 if ( srcLen == wxNO_LEN )
1943 return wxCONV_FAILED;
1944
1945 srcLen /= BYTES_PER_CHAR;
1946
1947 if ( dst )
1948 {
1949 if ( dstLen < srcLen )
1950 return wxCONV_FAILED;
1951
1952 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1953 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1954 {
1955 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1956 }
1957 }
1958
1959 return srcLen;
1960 }
1961
1962 size_t
1963 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1964 const wchar_t *src, size_t srcLen) const
1965 {
1966 if ( srcLen == wxNO_LEN )
1967 srcLen = wxWcslen(src) + 1;
1968
1969 srcLen *= BYTES_PER_CHAR;
1970
1971 if ( dst )
1972 {
1973 if ( dstLen < srcLen )
1974 return wxCONV_FAILED;
1975
1976 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1977 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1978 {
1979 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1980 }
1981 }
1982
1983 return srcLen;
1984 }
1985
1986 #endif // WC_UTF16/!WC_UTF16
1987
1988
1989 // ============================================================================
1990 // The classes doing conversion using the iconv_xxx() functions
1991 // ============================================================================
1992
1993 #ifdef HAVE_ICONV
1994
1995 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1996 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1997 // (unless there's yet another bug in glibc) the only case when iconv()
1998 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1999 // left in the input buffer -- when _real_ error occurs,
2000 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2001 // iconv() failure.
2002 // [This bug does not appear in glibc 2.2.]
2003 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2004 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2005 (errno != E2BIG || bufLeft != 0))
2006 #else
2007 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2008 #endif
2009
2010 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2011
2012 #define ICONV_T_INVALID ((iconv_t)-1)
2013
2014 #if SIZEOF_WCHAR_T == 4
2015 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2016 #define WC_ENC wxFONTENCODING_UTF32
2017 #elif SIZEOF_WCHAR_T == 2
2018 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2019 #define WC_ENC wxFONTENCODING_UTF16
2020 #else // sizeof(wchar_t) != 2 nor 4
2021 // does this ever happen?
2022 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2023 #endif
2024
2025 // ----------------------------------------------------------------------------
2026 // wxMBConv_iconv: encapsulates an iconv character set
2027 // ----------------------------------------------------------------------------
2028
2029 class wxMBConv_iconv : public wxMBConv
2030 {
2031 public:
2032 wxMBConv_iconv(const char *name);
2033 virtual ~wxMBConv_iconv();
2034
2035 // implement base class virtual methods
2036 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2037 const char *src, size_t srcLen = wxNO_LEN) const;
2038 virtual size_t FromWChar(char *dst, size_t dstLen,
2039 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2040 virtual size_t GetMBNulLen() const;
2041
2042 #if wxUSE_UNICODE_UTF8
2043 virtual bool IsUTF8() const;
2044 #endif
2045
2046 virtual wxMBConv *Clone() const
2047 {
2048 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2049 p->m_minMBCharWidth = m_minMBCharWidth;
2050 return p;
2051 }
2052
2053 bool IsOk() const
2054 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2055
2056 protected:
2057 // the iconv handlers used to translate from multibyte
2058 // to wide char and in the other direction
2059 iconv_t m2w,
2060 w2m;
2061
2062 #if wxUSE_THREADS
2063 // guards access to m2w and w2m objects
2064 wxMutex m_iconvMutex;
2065 #endif
2066
2067 private:
2068 // the name (for iconv_open()) of a wide char charset -- if none is
2069 // available on this machine, it will remain NULL
2070 static wxString ms_wcCharsetName;
2071
2072 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2073 // different endian-ness than the native one
2074 static bool ms_wcNeedsSwap;
2075
2076
2077 // name of the encoding handled by this conversion
2078 wxString m_name;
2079
2080 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2081 // initially
2082 size_t m_minMBCharWidth;
2083 };
2084
2085 // make the constructor available for unit testing
2086 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2087 {
2088 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2089 if ( !result->IsOk() )
2090 {
2091 delete result;
2092 return 0;
2093 }
2094
2095 return result;
2096 }
2097
2098 wxString wxMBConv_iconv::ms_wcCharsetName;
2099 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2100
2101 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2102 : m_name(name)
2103 {
2104 m_minMBCharWidth = 0;
2105
2106 // check for charset that represents wchar_t:
2107 if ( ms_wcCharsetName.empty() )
2108 {
2109 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2110
2111 #if wxUSE_FONTMAP
2112 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2113 #else // !wxUSE_FONTMAP
2114 static const wxChar *names_static[] =
2115 {
2116 #if SIZEOF_WCHAR_T == 4
2117 _T("UCS-4"),
2118 #elif SIZEOF_WCHAR_T = 2
2119 _T("UCS-2"),
2120 #endif
2121 NULL
2122 };
2123 const wxChar **names = names_static;
2124 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2125
2126 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2127 {
2128 const wxString nameCS(*names);
2129
2130 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2131 wxString nameXE(nameCS);
2132
2133 #ifdef WORDS_BIGENDIAN
2134 nameXE += _T("BE");
2135 #else // little endian
2136 nameXE += _T("LE");
2137 #endif
2138
2139 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2140 nameXE.c_str());
2141
2142 m2w = iconv_open(nameXE.ToAscii(), name);
2143 if ( m2w == ICONV_T_INVALID )
2144 {
2145 // try charset w/o bytesex info (e.g. "UCS4")
2146 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
2147 nameCS.c_str());
2148 m2w = iconv_open(nameCS.ToAscii(), name);
2149
2150 // and check for bytesex ourselves:
2151 if ( m2w != ICONV_T_INVALID )
2152 {
2153 char buf[2], *bufPtr;
2154 wchar_t wbuf[2];
2155 size_t insz, outsz;
2156 size_t res;
2157
2158 buf[0] = 'A';
2159 buf[1] = 0;
2160 wbuf[0] = 0;
2161 insz = 2;
2162 outsz = SIZEOF_WCHAR_T * 2;
2163 char* wbufPtr = (char*)wbuf;
2164 bufPtr = buf;
2165
2166 res = iconv(
2167 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2168 &wbufPtr, &outsz);
2169
2170 if (ICONV_FAILED(res, insz))
2171 {
2172 wxLogLastError(wxT("iconv"));
2173 wxLogError(_("Conversion to charset '%s' doesn't work."),
2174 nameCS.c_str());
2175 }
2176 else // ok, can convert to this encoding, remember it
2177 {
2178 ms_wcCharsetName = nameCS;
2179 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2180 }
2181 }
2182 }
2183 else // use charset not requiring byte swapping
2184 {
2185 ms_wcCharsetName = nameXE;
2186 }
2187 }
2188
2189 wxLogTrace(TRACE_STRCONV,
2190 wxT("iconv wchar_t charset is \"%s\"%s"),
2191 ms_wcCharsetName.empty() ? wxString("<none>")
2192 : ms_wcCharsetName,
2193 ms_wcNeedsSwap ? _T(" (needs swap)")
2194 : _T(""));
2195 }
2196 else // we already have ms_wcCharsetName
2197 {
2198 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2199 }
2200
2201 if ( ms_wcCharsetName.empty() )
2202 {
2203 w2m = ICONV_T_INVALID;
2204 }
2205 else
2206 {
2207 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2208 if ( w2m == ICONV_T_INVALID )
2209 {
2210 wxLogTrace(TRACE_STRCONV,
2211 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2212 ms_wcCharsetName.c_str(), name);
2213 }
2214 }
2215 }
2216
2217 wxMBConv_iconv::~wxMBConv_iconv()
2218 {
2219 if ( m2w != ICONV_T_INVALID )
2220 iconv_close(m2w);
2221 if ( w2m != ICONV_T_INVALID )
2222 iconv_close(w2m);
2223 }
2224
2225 size_t
2226 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2227 const char *src, size_t srcLen) const
2228 {
2229 if ( srcLen == wxNO_LEN )
2230 {
2231 // find the string length: notice that must be done differently for
2232 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2233 // consecutive NULs
2234 const size_t nulLen = GetMBNulLen();
2235 switch ( nulLen )
2236 {
2237 default:
2238 return wxCONV_FAILED;
2239
2240 case 1:
2241 srcLen = strlen(src); // arguably more optimized than our version
2242 break;
2243
2244 case 2:
2245 case 4:
2246 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2247 // but they also have to start at character boundary and not
2248 // span two adjacent characters
2249 const char *p;
2250 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2251 ;
2252 srcLen = p - src;
2253 break;
2254 }
2255
2256 // when we're determining the length of the string ourselves we count
2257 // the terminating NUL(s) as part of it and always NUL-terminate the
2258 // output
2259 srcLen += nulLen;
2260 }
2261
2262 // we express length in the number of (wide) characters but iconv always
2263 // counts buffer sizes it in bytes
2264 dstLen *= SIZEOF_WCHAR_T;
2265
2266 #if wxUSE_THREADS
2267 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2268 // Unfortunately there are a couple of global wxCSConv objects such as
2269 // wxConvLocal that are used all over wx code, so we have to make sure
2270 // the handle is used by at most one thread at the time. Otherwise
2271 // only a few wx classes would be safe to use from non-main threads
2272 // as MB<->WC conversion would fail "randomly".
2273 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2274 #endif // wxUSE_THREADS
2275
2276 size_t res, cres;
2277 const char *pszPtr = src;
2278
2279 if ( dst )
2280 {
2281 char* bufPtr = (char*)dst;
2282
2283 // have destination buffer, convert there
2284 size_t dstLenOrig = dstLen;
2285 cres = iconv(m2w,
2286 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2287 &bufPtr, &dstLen);
2288
2289 // convert the number of bytes converted as returned by iconv to the
2290 // number of (wide) characters converted that we need
2291 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2292
2293 if (ms_wcNeedsSwap)
2294 {
2295 // convert to native endianness
2296 for ( unsigned i = 0; i < res; i++ )
2297 dst[i] = WC_BSWAP(dst[i]);
2298 }
2299 }
2300 else // no destination buffer
2301 {
2302 // convert using temp buffer to calculate the size of the buffer needed
2303 wchar_t tbuf[8];
2304 res = 0;
2305
2306 do
2307 {
2308 char* bufPtr = (char*)tbuf;
2309 dstLen = 8 * SIZEOF_WCHAR_T;
2310
2311 cres = iconv(m2w,
2312 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2313 &bufPtr, &dstLen );
2314
2315 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2316 }
2317 while ((cres == (size_t)-1) && (errno == E2BIG));
2318 }
2319
2320 if (ICONV_FAILED(cres, srcLen))
2321 {
2322 //VS: it is ok if iconv fails, hence trace only
2323 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2324 return wxCONV_FAILED;
2325 }
2326
2327 return res;
2328 }
2329
2330 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2331 const wchar_t *src, size_t srcLen) const
2332 {
2333 #if wxUSE_THREADS
2334 // NB: explained in MB2WC
2335 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2336 #endif
2337
2338 if ( srcLen == wxNO_LEN )
2339 srcLen = wxWcslen(src) + 1;
2340
2341 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2342 size_t outbuflen = dstLen;
2343 size_t res, cres;
2344
2345 wchar_t *tmpbuf = 0;
2346
2347 if (ms_wcNeedsSwap)
2348 {
2349 // need to copy to temp buffer to switch endianness
2350 // (doing WC_BSWAP twice on the original buffer won't help, as it
2351 // could be in read-only memory, or be accessed in some other thread)
2352 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2353 for ( size_t i = 0; i < srcLen; i++ )
2354 tmpbuf[i] = WC_BSWAP(src[i]);
2355
2356 tmpbuf[srcLen] = L'\0';
2357 src = tmpbuf;
2358 }
2359
2360 char* inbuf = (char*)src;
2361 if ( dst )
2362 {
2363 // have destination buffer, convert there
2364 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2365
2366 res = dstLen - outbuflen;
2367 }
2368 else // no destination buffer
2369 {
2370 // convert using temp buffer to calculate the size of the buffer needed
2371 char tbuf[16];
2372 res = 0;
2373 do
2374 {
2375 dst = tbuf;
2376 outbuflen = 16;
2377
2378 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2379
2380 res += 16 - outbuflen;
2381 }
2382 while ((cres == (size_t)-1) && (errno == E2BIG));
2383 }
2384
2385 if (ms_wcNeedsSwap)
2386 {
2387 free(tmpbuf);
2388 }
2389
2390 if (ICONV_FAILED(cres, inbuflen))
2391 {
2392 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2393 return wxCONV_FAILED;
2394 }
2395
2396 return res;
2397 }
2398
2399 size_t wxMBConv_iconv::GetMBNulLen() const
2400 {
2401 if ( m_minMBCharWidth == 0 )
2402 {
2403 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2404
2405 #if wxUSE_THREADS
2406 // NB: explained in MB2WC
2407 wxMutexLocker lock(self->m_iconvMutex);
2408 #endif
2409
2410 const wchar_t *wnul = L"";
2411 char buf[8]; // should be enough for NUL in any encoding
2412 size_t inLen = sizeof(wchar_t),
2413 outLen = WXSIZEOF(buf);
2414 char *inBuff = (char *)wnul;
2415 char *outBuff = buf;
2416 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2417 {
2418 self->m_minMBCharWidth = (size_t)-1;
2419 }
2420 else // ok
2421 {
2422 self->m_minMBCharWidth = outBuff - buf;
2423 }
2424 }
2425
2426 return m_minMBCharWidth;
2427 }
2428
2429 #if wxUSE_UNICODE_UTF8
2430 bool wxMBConv_iconv::IsUTF8() const
2431 {
2432 return wxStricmp(m_name, "UTF-8") == 0 ||
2433 wxStricmp(m_name, "UTF8") == 0;
2434 }
2435 #endif
2436
2437 #endif // HAVE_ICONV
2438
2439
2440 // ============================================================================
2441 // Win32 conversion classes
2442 // ============================================================================
2443
2444 #ifdef wxHAVE_WIN32_MB2WC
2445
2446 // from utils.cpp
2447 #if wxUSE_FONTMAP
2448 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2449 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2450 #endif
2451
2452 class wxMBConv_win32 : public wxMBConv
2453 {
2454 public:
2455 wxMBConv_win32()
2456 {
2457 m_CodePage = CP_ACP;
2458 m_minMBCharWidth = 0;
2459 }
2460
2461 wxMBConv_win32(const wxMBConv_win32& conv)
2462 : wxMBConv()
2463 {
2464 m_CodePage = conv.m_CodePage;
2465 m_minMBCharWidth = conv.m_minMBCharWidth;
2466 }
2467
2468 #if wxUSE_FONTMAP
2469 wxMBConv_win32(const char* name)
2470 {
2471 m_CodePage = wxCharsetToCodepage(name);
2472 m_minMBCharWidth = 0;
2473 }
2474
2475 wxMBConv_win32(wxFontEncoding encoding)
2476 {
2477 m_CodePage = wxEncodingToCodepage(encoding);
2478 m_minMBCharWidth = 0;
2479 }
2480 #endif // wxUSE_FONTMAP
2481
2482 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2483 {
2484 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2485 // the behaviour is not compatible with the Unix version (using iconv)
2486 // and break the library itself, e.g. wxTextInputStream::NextChar()
2487 // wouldn't work if reading an incomplete MB char didn't result in an
2488 // error
2489 //
2490 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2491 // Win XP or newer and it is not supported for UTF-[78] so we always
2492 // use our own conversions in this case. See
2493 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2494 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2495 if ( m_CodePage == CP_UTF8 )
2496 {
2497 return wxMBConvUTF8().MB2WC(buf, psz, n);
2498 }
2499
2500 if ( m_CodePage == CP_UTF7 )
2501 {
2502 return wxMBConvUTF7().MB2WC(buf, psz, n);
2503 }
2504
2505 int flags = 0;
2506 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2507 IsAtLeastWin2kSP4() )
2508 {
2509 flags = MB_ERR_INVALID_CHARS;
2510 }
2511
2512 const size_t len = ::MultiByteToWideChar
2513 (
2514 m_CodePage, // code page
2515 flags, // flags: fall on error
2516 psz, // input string
2517 -1, // its length (NUL-terminated)
2518 buf, // output string
2519 buf ? n : 0 // size of output buffer
2520 );
2521 if ( !len )
2522 {
2523 // function totally failed
2524 return wxCONV_FAILED;
2525 }
2526
2527 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2528 // check if we succeeded, by doing a double trip:
2529 if ( !flags && buf )
2530 {
2531 const size_t mbLen = strlen(psz);
2532 wxCharBuffer mbBuf(mbLen);
2533 if ( ::WideCharToMultiByte
2534 (
2535 m_CodePage,
2536 0,
2537 buf,
2538 -1,
2539 mbBuf.data(),
2540 mbLen + 1, // size in bytes, not length
2541 NULL,
2542 NULL
2543 ) == 0 ||
2544 strcmp(mbBuf, psz) != 0 )
2545 {
2546 // we didn't obtain the same thing we started from, hence
2547 // the conversion was lossy and we consider that it failed
2548 return wxCONV_FAILED;
2549 }
2550 }
2551
2552 // note that it returns count of written chars for buf != NULL and size
2553 // of the needed buffer for buf == NULL so in either case the length of
2554 // the string (which never includes the terminating NUL) is one less
2555 return len - 1;
2556 }
2557
2558 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2559 {
2560 /*
2561 we have a problem here: by default, WideCharToMultiByte() may
2562 replace characters unrepresentable in the target code page with bad
2563 quality approximations such as turning "1/2" symbol (U+00BD) into
2564 "1" for the code pages which don't have it and we, obviously, want
2565 to avoid this at any price
2566
2567 the trouble is that this function does it _silently_, i.e. it won't
2568 even tell us whether it did or not... Win98/2000 and higher provide
2569 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2570 we have to resort to a round trip, i.e. check that converting back
2571 results in the same string -- this is, of course, expensive but
2572 otherwise we simply can't be sure to not garble the data.
2573 */
2574
2575 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2576 // it doesn't work with CJK encodings (which we test for rather roughly
2577 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2578 // supporting it
2579 BOOL usedDef wxDUMMY_INITIALIZE(false);
2580 BOOL *pUsedDef;
2581 int flags;
2582 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2583 {
2584 // it's our lucky day
2585 flags = WC_NO_BEST_FIT_CHARS;
2586 pUsedDef = &usedDef;
2587 }
2588 else // old system or unsupported encoding
2589 {
2590 flags = 0;
2591 pUsedDef = NULL;
2592 }
2593
2594 const size_t len = ::WideCharToMultiByte
2595 (
2596 m_CodePage, // code page
2597 flags, // either none or no best fit
2598 pwz, // input string
2599 -1, // it is (wide) NUL-terminated
2600 buf, // output buffer
2601 buf ? n : 0, // and its size
2602 NULL, // default "replacement" char
2603 pUsedDef // [out] was it used?
2604 );
2605
2606 if ( !len )
2607 {
2608 // function totally failed
2609 return wxCONV_FAILED;
2610 }
2611
2612 // we did something, check if we really succeeded
2613 if ( flags )
2614 {
2615 // check if the conversion failed, i.e. if any replacements
2616 // were done
2617 if ( usedDef )
2618 return wxCONV_FAILED;
2619 }
2620 else // we must resort to double tripping...
2621 {
2622 // first we need to ensure that we really have the MB data: this is
2623 // not the case if we're called with NULL buffer, in which case we
2624 // need to do the conversion yet again
2625 wxCharBuffer bufDef;
2626 if ( !buf )
2627 {
2628 bufDef = wxCharBuffer(len);
2629 buf = bufDef.data();
2630 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2631 buf, len, NULL, NULL) )
2632 return wxCONV_FAILED;
2633 }
2634
2635 if ( !n )
2636 n = wcslen(pwz);
2637 wxWCharBuffer wcBuf(n);
2638 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2639 wcscmp(wcBuf, pwz) != 0 )
2640 {
2641 // we didn't obtain the same thing we started from, hence
2642 // the conversion was lossy and we consider that it failed
2643 return wxCONV_FAILED;
2644 }
2645 }
2646
2647 // see the comment above for the reason of "len - 1"
2648 return len - 1;
2649 }
2650
2651 virtual size_t GetMBNulLen() const
2652 {
2653 if ( m_minMBCharWidth == 0 )
2654 {
2655 int len = ::WideCharToMultiByte
2656 (
2657 m_CodePage, // code page
2658 0, // no flags
2659 L"", // input string
2660 1, // translate just the NUL
2661 NULL, // output buffer
2662 0, // and its size
2663 NULL, // no replacement char
2664 NULL // [out] don't care if it was used
2665 );
2666
2667 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2668 switch ( len )
2669 {
2670 default:
2671 wxLogDebug(_T("Unexpected NUL length %d"), len);
2672 self->m_minMBCharWidth = (size_t)-1;
2673 break;
2674
2675 case 0:
2676 self->m_minMBCharWidth = (size_t)-1;
2677 break;
2678
2679 case 1:
2680 case 2:
2681 case 4:
2682 self->m_minMBCharWidth = len;
2683 break;
2684 }
2685 }
2686
2687 return m_minMBCharWidth;
2688 }
2689
2690 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2691
2692 bool IsOk() const { return m_CodePage != -1; }
2693
2694 private:
2695 static bool CanUseNoBestFit()
2696 {
2697 static int s_isWin98Or2k = -1;
2698
2699 if ( s_isWin98Or2k == -1 )
2700 {
2701 int verMaj, verMin;
2702 switch ( wxGetOsVersion(&verMaj, &verMin) )
2703 {
2704 case wxOS_WINDOWS_9X:
2705 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2706 break;
2707
2708 case wxOS_WINDOWS_NT:
2709 s_isWin98Or2k = verMaj >= 5;
2710 break;
2711
2712 default:
2713 // unknown: be conservative by default
2714 s_isWin98Or2k = 0;
2715 break;
2716 }
2717
2718 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2719 }
2720
2721 return s_isWin98Or2k == 1;
2722 }
2723
2724 static bool IsAtLeastWin2kSP4()
2725 {
2726 #ifdef __WXWINCE__
2727 return false;
2728 #else
2729 static int s_isAtLeastWin2kSP4 = -1;
2730
2731 if ( s_isAtLeastWin2kSP4 == -1 )
2732 {
2733 OSVERSIONINFOEX ver;
2734
2735 memset(&ver, 0, sizeof(ver));
2736 ver.dwOSVersionInfoSize = sizeof(ver);
2737 GetVersionEx((OSVERSIONINFO*)&ver);
2738
2739 s_isAtLeastWin2kSP4 =
2740 ((ver.dwMajorVersion > 5) || // Vista+
2741 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2742 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2743 ver.wServicePackMajor >= 4)) // 2000 SP4+
2744 ? 1 : 0;
2745 }
2746
2747 return s_isAtLeastWin2kSP4 == 1;
2748 #endif
2749 }
2750
2751
2752 // the code page we're working with
2753 long m_CodePage;
2754
2755 // cached result of GetMBNulLen(), set to 0 initially meaning
2756 // "unknown"
2757 size_t m_minMBCharWidth;
2758 };
2759
2760 #endif // wxHAVE_WIN32_MB2WC
2761
2762
2763 // ============================================================================
2764 // wxEncodingConverter based conversion classes
2765 // ============================================================================
2766
2767 #if wxUSE_FONTMAP
2768
2769 class wxMBConv_wxwin : public wxMBConv
2770 {
2771 private:
2772 void Init()
2773 {
2774 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2775 // The wxMBConv_cf class does a better job.
2776 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2777 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2778 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2779 }
2780
2781 public:
2782 // temporarily just use wxEncodingConverter stuff,
2783 // so that it works while a better implementation is built
2784 wxMBConv_wxwin(const char* name)
2785 {
2786 if (name)
2787 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2788 else
2789 m_enc = wxFONTENCODING_SYSTEM;
2790
2791 Init();
2792 }
2793
2794 wxMBConv_wxwin(wxFontEncoding enc)
2795 {
2796 m_enc = enc;
2797
2798 Init();
2799 }
2800
2801 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2802 {
2803 size_t inbuf = strlen(psz);
2804 if (buf)
2805 {
2806 if (!m2w.Convert(psz, buf))
2807 return wxCONV_FAILED;
2808 }
2809 return inbuf;
2810 }
2811
2812 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2813 {
2814 const size_t inbuf = wxWcslen(psz);
2815 if (buf)
2816 {
2817 if (!w2m.Convert(psz, buf))
2818 return wxCONV_FAILED;
2819 }
2820
2821 return inbuf;
2822 }
2823
2824 virtual size_t GetMBNulLen() const
2825 {
2826 switch ( m_enc )
2827 {
2828 case wxFONTENCODING_UTF16BE:
2829 case wxFONTENCODING_UTF16LE:
2830 return 2;
2831
2832 case wxFONTENCODING_UTF32BE:
2833 case wxFONTENCODING_UTF32LE:
2834 return 4;
2835
2836 default:
2837 return 1;
2838 }
2839 }
2840
2841 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2842
2843 bool IsOk() const { return m_ok; }
2844
2845 public:
2846 wxFontEncoding m_enc;
2847 wxEncodingConverter m2w, w2m;
2848
2849 private:
2850 // were we initialized successfully?
2851 bool m_ok;
2852
2853 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2854 };
2855
2856 // make the constructors available for unit testing
2857 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2858 {
2859 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2860 if ( !result->IsOk() )
2861 {
2862 delete result;
2863 return 0;
2864 }
2865
2866 return result;
2867 }
2868
2869 #endif // wxUSE_FONTMAP
2870
2871 // ============================================================================
2872 // wxCSConv implementation
2873 // ============================================================================
2874
2875 void wxCSConv::Init()
2876 {
2877 m_name = NULL;
2878 m_convReal = NULL;
2879 m_deferred = true;
2880 }
2881
2882 wxCSConv::wxCSConv(const wxString& charset)
2883 {
2884 Init();
2885
2886 if ( !charset.empty() )
2887 {
2888 SetName(charset.ToAscii());
2889 }
2890
2891 #if wxUSE_FONTMAP
2892 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2893 if ( m_encoding == wxFONTENCODING_MAX )
2894 {
2895 // set to unknown/invalid value
2896 m_encoding = wxFONTENCODING_SYSTEM;
2897 }
2898 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2899 {
2900 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2901 m_encoding = wxFONTENCODING_ISO8859_1;
2902 }
2903 #else
2904 m_encoding = wxFONTENCODING_SYSTEM;
2905 #endif
2906 }
2907
2908 wxCSConv::wxCSConv(wxFontEncoding encoding)
2909 {
2910 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2911 {
2912 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2913
2914 encoding = wxFONTENCODING_SYSTEM;
2915 }
2916
2917 Init();
2918
2919 m_encoding = encoding;
2920 }
2921
2922 wxCSConv::~wxCSConv()
2923 {
2924 Clear();
2925 }
2926
2927 wxCSConv::wxCSConv(const wxCSConv& conv)
2928 : wxMBConv()
2929 {
2930 Init();
2931
2932 SetName(conv.m_name);
2933 m_encoding = conv.m_encoding;
2934 }
2935
2936 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2937 {
2938 Clear();
2939
2940 SetName(conv.m_name);
2941 m_encoding = conv.m_encoding;
2942
2943 return *this;
2944 }
2945
2946 void wxCSConv::Clear()
2947 {
2948 free(m_name);
2949 delete m_convReal;
2950
2951 m_name = NULL;
2952 m_convReal = NULL;
2953 }
2954
2955 void wxCSConv::SetName(const char *charset)
2956 {
2957 if (charset)
2958 {
2959 m_name = wxStrdup(charset);
2960 m_deferred = true;
2961 }
2962 }
2963
2964 #if wxUSE_FONTMAP
2965
2966 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2967 wxEncodingNameCache );
2968
2969 static wxEncodingNameCache gs_nameCache;
2970 #endif
2971
2972 wxMBConv *wxCSConv::DoCreate() const
2973 {
2974 #if wxUSE_FONTMAP
2975 wxLogTrace(TRACE_STRCONV,
2976 wxT("creating conversion for %s"),
2977 (m_name ? m_name
2978 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2979 #endif // wxUSE_FONTMAP
2980
2981 // check for the special case of ASCII or ISO8859-1 charset: as we have
2982 // special knowledge of it anyhow, we don't need to create a special
2983 // conversion object
2984 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2985 m_encoding == wxFONTENCODING_DEFAULT )
2986 {
2987 // don't convert at all
2988 return NULL;
2989 }
2990
2991 // we trust OS to do conversion better than we can so try external
2992 // conversion methods first
2993 //
2994 // the full order is:
2995 // 1. OS conversion (iconv() under Unix or Win32 API)
2996 // 2. hard coded conversions for UTF
2997 // 3. wxEncodingConverter as fall back
2998
2999 // step (1)
3000 #ifdef HAVE_ICONV
3001 #if !wxUSE_FONTMAP
3002 if ( m_name )
3003 #endif // !wxUSE_FONTMAP
3004 {
3005 #if wxUSE_FONTMAP
3006 wxFontEncoding encoding(m_encoding);
3007 #endif
3008
3009 if ( m_name )
3010 {
3011 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3012 if ( conv->IsOk() )
3013 return conv;
3014
3015 delete conv;
3016
3017 #if wxUSE_FONTMAP
3018 encoding =
3019 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3020 #endif // wxUSE_FONTMAP
3021 }
3022 #if wxUSE_FONTMAP
3023 {
3024 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3025 if ( it != gs_nameCache.end() )
3026 {
3027 if ( it->second.empty() )
3028 return NULL;
3029
3030 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3031 if ( conv->IsOk() )
3032 return conv;
3033
3034 delete conv;
3035 }
3036
3037 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3038 // CS : in case this does not return valid names (eg for MacRoman)
3039 // encoding got a 'failure' entry in the cache all the same,
3040 // although it just has to be created using a different method, so
3041 // only store failed iconv creation attempts (or perhaps we
3042 // shoulnd't do this at all ?)
3043 if ( names[0] != NULL )
3044 {
3045 for ( ; *names; ++names )
3046 {
3047 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3048 // will need changes that will obsolete this
3049 wxString name(*names);
3050 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3051 if ( conv->IsOk() )
3052 {
3053 gs_nameCache[encoding] = *names;
3054 return conv;
3055 }
3056
3057 delete conv;
3058 }
3059
3060 gs_nameCache[encoding] = _T(""); // cache the failure
3061 }
3062 }
3063 #endif // wxUSE_FONTMAP
3064 }
3065 #endif // HAVE_ICONV
3066
3067 #ifdef wxHAVE_WIN32_MB2WC
3068 {
3069 #if wxUSE_FONTMAP
3070 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3071 : new wxMBConv_win32(m_encoding);
3072 if ( conv->IsOk() )
3073 return conv;
3074
3075 delete conv;
3076 #else
3077 return NULL;
3078 #endif
3079 }
3080 #endif // wxHAVE_WIN32_MB2WC
3081
3082 #ifdef __DARWIN__
3083 {
3084 // leave UTF16 and UTF32 to the built-ins of wx
3085 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3086 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3087 {
3088 #if wxUSE_FONTMAP
3089 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3090 : new wxMBConv_cf(m_encoding);
3091 #else
3092 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3093 #endif
3094
3095 if ( conv->IsOk() )
3096 return conv;
3097
3098 delete conv;
3099 }
3100 }
3101 #endif // __DARWIN__
3102
3103 // step (2)
3104 wxFontEncoding enc = m_encoding;
3105 #if wxUSE_FONTMAP
3106 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3107 {
3108 // use "false" to suppress interactive dialogs -- we can be called from
3109 // anywhere and popping up a dialog from here is the last thing we want to
3110 // do
3111 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3112 }
3113 #endif // wxUSE_FONTMAP
3114
3115 switch ( enc )
3116 {
3117 case wxFONTENCODING_UTF7:
3118 return new wxMBConvUTF7;
3119
3120 case wxFONTENCODING_UTF8:
3121 return new wxMBConvUTF8;
3122
3123 case wxFONTENCODING_UTF16BE:
3124 return new wxMBConvUTF16BE;
3125
3126 case wxFONTENCODING_UTF16LE:
3127 return new wxMBConvUTF16LE;
3128
3129 case wxFONTENCODING_UTF32BE:
3130 return new wxMBConvUTF32BE;
3131
3132 case wxFONTENCODING_UTF32LE:
3133 return new wxMBConvUTF32LE;
3134
3135 default:
3136 // nothing to do but put here to suppress gcc warnings
3137 break;
3138 }
3139
3140 // step (3)
3141 #if wxUSE_FONTMAP
3142 {
3143 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3144 : new wxMBConv_wxwin(m_encoding);
3145 if ( conv->IsOk() )
3146 return conv;
3147
3148 delete conv;
3149 }
3150 #endif // wxUSE_FONTMAP
3151
3152 // NB: This is a hack to prevent deadlock. What could otherwise happen
3153 // in Unicode build: wxConvLocal creation ends up being here
3154 // because of some failure and logs the error. But wxLog will try to
3155 // attach a timestamp, for which it will need wxConvLocal (to convert
3156 // time to char* and then wchar_t*), but that fails, tries to log the
3157 // error, but wxLog has an (already locked) critical section that
3158 // guards the static buffer.
3159 static bool alreadyLoggingError = false;
3160 if (!alreadyLoggingError)
3161 {
3162 alreadyLoggingError = true;
3163 wxLogError(_("Cannot convert from the charset '%s'!"),
3164 m_name ? m_name
3165 :
3166 #if wxUSE_FONTMAP
3167 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3168 #else // !wxUSE_FONTMAP
3169 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3170 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3171 );
3172
3173 alreadyLoggingError = false;
3174 }
3175
3176 return NULL;
3177 }
3178
3179 void wxCSConv::CreateConvIfNeeded() const
3180 {
3181 if ( m_deferred )
3182 {
3183 wxCSConv *self = (wxCSConv *)this; // const_cast
3184
3185 // if we don't have neither the name nor the encoding, use the default
3186 // encoding for this system
3187 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3188 {
3189 #if wxUSE_INTL
3190 self->m_encoding = wxLocale::GetSystemEncoding();
3191 #else
3192 // fallback to some reasonable default:
3193 self->m_encoding = wxFONTENCODING_ISO8859_1;
3194 #endif // wxUSE_INTL
3195 }
3196
3197 self->m_convReal = DoCreate();
3198 self->m_deferred = false;
3199 }
3200 }
3201
3202 bool wxCSConv::IsOk() const
3203 {
3204 CreateConvIfNeeded();
3205
3206 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3207 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3208 return true; // always ok as we do it ourselves
3209
3210 // m_convReal->IsOk() is called at its own creation, so we know it must
3211 // be ok if m_convReal is non-NULL
3212 return m_convReal != NULL;
3213 }
3214
3215 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3216 const char *src, size_t srcLen) const
3217 {
3218 CreateConvIfNeeded();
3219
3220 if (m_convReal)
3221 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3222
3223 // latin-1 (direct)
3224 if ( srcLen == wxNO_LEN )
3225 srcLen = strlen(src) + 1; // take trailing NUL too
3226
3227 if ( dst )
3228 {
3229 if ( dstLen < srcLen )
3230 return wxCONV_FAILED;
3231
3232 for ( size_t n = 0; n < srcLen; n++ )
3233 dst[n] = (unsigned char)(src[n]);
3234 }
3235
3236 return srcLen;
3237 }
3238
3239 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3240 const wchar_t *src, size_t srcLen) const
3241 {
3242 CreateConvIfNeeded();
3243
3244 if (m_convReal)
3245 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3246
3247 // latin-1 (direct)
3248 if ( srcLen == wxNO_LEN )
3249 srcLen = wxWcslen(src) + 1;
3250
3251 if ( dst )
3252 {
3253 if ( dstLen < srcLen )
3254 return wxCONV_FAILED;
3255
3256 for ( size_t n = 0; n < srcLen; n++ )
3257 {
3258 if ( src[n] > 0xFF )
3259 return wxCONV_FAILED;
3260
3261 dst[n] = (char)src[n];
3262 }
3263
3264 }
3265 else // still need to check the input validity
3266 {
3267 for ( size_t n = 0; n < srcLen; n++ )
3268 {
3269 if ( src[n] > 0xFF )
3270 return wxCONV_FAILED;
3271 }
3272 }
3273
3274 return srcLen;
3275 }
3276
3277 size_t wxCSConv::GetMBNulLen() const
3278 {
3279 CreateConvIfNeeded();
3280
3281 if ( m_convReal )
3282 {
3283 return m_convReal->GetMBNulLen();
3284 }
3285
3286 // otherwise, we are ISO-8859-1
3287 return 1;
3288 }
3289
3290 #if wxUSE_UNICODE_UTF8
3291 bool wxCSConv::IsUTF8() const
3292 {
3293 CreateConvIfNeeded();
3294
3295 if ( m_convReal )
3296 {
3297 return m_convReal->IsUTF8();
3298 }
3299
3300 // otherwise, we are ISO-8859-1
3301 return false;
3302 }
3303 #endif
3304
3305
3306 #if wxUSE_UNICODE
3307
3308 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3309 {
3310 if ( !s )
3311 return wxWCharBuffer();
3312
3313 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3314 if ( !wbuf )
3315 wbuf = wxMBConvUTF8().cMB2WX(s);
3316 if ( !wbuf )
3317 wbuf = wxConvISO8859_1.cMB2WX(s);
3318
3319 return wbuf;
3320 }
3321
3322 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3323 {
3324 if ( !ws )
3325 return wxCharBuffer();
3326
3327 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3328 if ( !buf )
3329 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3330
3331 return buf;
3332 }
3333
3334 #endif // wxUSE_UNICODE
3335
3336 // ----------------------------------------------------------------------------
3337 // globals
3338 // ----------------------------------------------------------------------------
3339
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3346
3347 #undef wxConvLibc
3348 #undef wxConvUTF8
3349 #undef wxConvUTF7
3350 #undef wxConvLocal
3351 #undef wxConvISO8859_1
3352
// Define everything needed for the global converter called "name":
//  - the exported pointer name##Ptr of type klass*, initially NULL
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass, which is constructed
//    with ctor_args the first time the accessor is called
//  - a file-static pointer initialized by calling the accessor
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
    { \
        static impl_klass name##Obj ctor_args; \
        return &name##Obj; \
    } \
    /* this ensures that all global converter objects are created */ \
    /* by the time static initialization is done, i.e. before any */ \
    /* thread is launched: */ \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the class of
// the object created is the same as the type of the pointer returned
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3367
// wxConvLibc is implemented by wxMBConv_win32 under Windows and by
// wxMBConvLibc elsewhere
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// both of these are wxCSConv objects, created for the system encoding and
// for ISO8859-1 respectively
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// wxConvFileName points to the CoreFoundation-based UTF-8 converter defined
// above under Darwin and to the wxConvLibc object everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3401
3402 #else // !wxUSE_WCHAR_T
3403
3404 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3405 // stand-ins in absence of wchar_t
3406 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3407 wxConvISO8859_1,
3408 wxConvLocal,
3409 wxConvUTF8;
3410
3411 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T