Ensure that strings returned by wxMBConv_cf are in NFC form.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #ifndef __WXWINCE__
32 #include <errno.h>
33 #endif
34
35 #include <ctype.h>
36 #include <string.h>
37 #include <stdlib.h>
38
39 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #define wxHAVE_WIN32_MB2WC
43 #endif
44
45 #ifdef HAVE_ICONV
46 #include <iconv.h>
47 #include "wx/thread.h"
48 #endif
49
50 #include "wx/encconv.h"
51 #include "wx/fontmap.h"
52
53 #ifdef __DARWIN__
54 #include "wx/osx/core/private/strconv_cf.h"
55 #endif //def __DARWIN__
56
57
58 #define TRACE_STRCONV wxT("strconv")
59
60 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61 // be 4 bytes
62 #if SIZEOF_WCHAR_T == 2
63 #define WC_UTF16
64 #endif
65
66
67 // ============================================================================
68 // implementation
69 // ============================================================================
70
// helper function of cMB2WC(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
79
80 // ----------------------------------------------------------------------------
81 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
82 // ----------------------------------------------------------------------------
83
84 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
85 {
86 if (input <= 0xffff)
87 {
88 if (output)
89 *output = (wxUint16) input;
90
91 return 1;
92 }
93 else if (input >= 0x110000)
94 {
95 return wxCONV_FAILED;
96 }
97 else
98 {
99 if (output)
100 {
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
103 }
104
105 return 2;
106 }
107 }
108
109 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
110 {
111 if ((*input < 0xd800) || (*input > 0xdfff))
112 {
113 output = *input;
114 return 1;
115 }
116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
117 {
118 output = *input;
119 return wxCONV_FAILED;
120 }
121 else
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
126 }
127
128 #ifdef WC_UTF16
129 typedef wchar_t wxDecodeSurrogate_t;
130 #else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132 #endif // WC_UTF16/!WC_UTF16
133
134 // returns the next UTF-32 character from the wchar_t buffer and advances the
135 // pointer to the character after this one
136 //
137 // if an invalid character is found, *pSrc is set to NULL, the caller must
138 // check for this
139 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
140 {
141 wxUint32 out;
142 const size_t
143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150 }
151
152 // ----------------------------------------------------------------------------
153 // wxMBConv
154 // ----------------------------------------------------------------------------
155
156 size_t
157 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
158 const char *src, size_t srcLen) const
159 {
160 // although new conversion classes are supposed to implement this function
161 // directly, the existing ones only implement the old MB2WC() and so, to
162 // avoid to have to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
166 //
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
170
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten = 0;
173
174 // the number of NULs terminating this string
175 size_t nulLen = 0; // not really needed, but just to avoid warnings
176
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
180 // NULs at the end
181 wxCharBuffer bufTmp;
182 const char *srcEnd;
183 if ( srcLen != wxNO_LEN )
184 {
185 // we need to know how to find the end of this string
186 nulLen = GetMBNulLen();
187 if ( nulLen == wxCONV_FAILED )
188 return wxCONV_FAILED;
189
190 // if there are enough NULs we can avoid the copy
191 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
192 {
193 // make a copy in order to properly NUL-terminate the string
194 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
195 char * const p = bufTmp.data();
196 memcpy(p, src, srcLen);
197 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
198 *s = '\0';
199
200 src = bufTmp;
201 }
202
203 srcEnd = src + srcLen;
204 }
205 else // quit after the first loop iteration
206 {
207 srcEnd = NULL;
208 }
209
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
212 // with the result
213 //
214 // all the complication come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
220 // string
221 //
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
225 for ( ;; )
226 {
227 // try to convert the current chunk
228 size_t lenChunk = MB2WC(NULL, src, 0);
229 if ( lenChunk == wxCONV_FAILED )
230 return wxCONV_FAILED;
231
232 dstWritten += lenChunk;
233 if ( !srcEnd )
234 dstWritten++;
235
236 if ( !lenChunk )
237 {
238 // nothing left in the input string, conversion succeeded
239 break;
240 }
241
242 if ( dst )
243 {
244 if ( dstWritten > dstLen )
245 return wxCONV_FAILED;
246
247 // +1 is for trailing NUL
248 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
252 if ( !srcEnd )
253 dst++;
254 }
255
256 if ( !srcEnd )
257 {
258 // we convert just one chunk in this case as this is the entire
259 // string anyhow (and we don't count the trailing NUL in this case)
260 break;
261 }
262
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
266 while ( NotAllNULs(src, nulLen) )
267 {
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
272 src += nulLen;
273 }
274
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
277 if ( src == srcEnd )
278 break;
279
280 // do count this terminator as it's inside the buffer we convert
281 dstWritten++;
282 if ( dst )
283 dst++;
284
285 src += nulLen; // skip the terminator itself
286
287 if ( src >= srcEnd )
288 break;
289 }
290
291 return dstWritten;
292 }
293
294 size_t
295 wxMBConv::FromWChar(char *dst, size_t dstLen,
296 const wchar_t *src, size_t srcLen) const
297 {
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten = 0;
300
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated = srcLen == wxNO_LEN;
305
306 // make a copy of the input string unless it is already properly
307 // NUL-terminated
308 wxWCharBuffer bufTmp;
309 if ( isNulTerminated )
310 {
311 srcLen = wxWcslen(src) + 1;
312 }
313 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
314 {
315 // make a copy in order to properly NUL-terminate the string
316 bufTmp = wxWCharBuffer(srcLen);
317 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
318 src = bufTmp;
319 }
320
321 const size_t lenNul = GetMBNulLen();
322 for ( const wchar_t * const srcEnd = src + srcLen;
323 src < srcEnd;
324 src++ /* skip L'\0' too */ )
325 {
326 // try to convert the current chunk
327 size_t lenChunk = WC2MB(NULL, src, 0);
328 if ( lenChunk == wxCONV_FAILED )
329 return wxCONV_FAILED;
330
331 dstWritten += lenChunk;
332
333 const wchar_t * const
334 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
335
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
338 // ourselves above
339 if ( chunkEnd < srcEnd )
340 dstWritten += lenNul;
341
342 if ( dst )
343 {
344 if ( dstWritten > dstLen )
345 return wxCONV_FAILED;
346
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
351 wxCharBuffer dstBuf;
352 char *dstTmp;
353 if ( chunkEnd == srcEnd )
354 {
355 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
356 dstTmp = dstBuf.data();
357 }
358 else
359 {
360 dstTmp = dst;
361 }
362
363 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
364 return wxCONV_FAILED;
365
366 if ( dstTmp != dst )
367 {
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst, dstTmp, lenChunk);
371
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
374 break;
375 }
376
377 dst += lenChunk;
378 if ( chunkEnd < srcEnd )
379 dst += lenNul;
380 }
381
382 src = chunkEnd;
383 }
384
385 return dstWritten;
386 }
387
388 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
389 {
390 size_t rc = ToWChar(outBuff, outLen, inBuff);
391 if ( rc != wxCONV_FAILED )
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399 }
400
401 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
402 {
403 size_t rc = FromWChar(outBuff, outLen, inBuff);
404 if ( rc != wxCONV_FAILED )
405 {
406 rc -= GetMBNulLen();
407 }
408
409 return rc;
410 }
411
412 wxMBConv::~wxMBConv()
413 {
414 // nothing to do here (necessary for Darwin linking probably)
415 }
416
417 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418 {
419 if ( psz )
420 {
421 // calculate the length of the buffer needed first
422 const size_t nLen = ToWChar(NULL, 0, psz);
423 if ( nLen != wxCONV_FAILED )
424 {
425 // now do the actual conversion
426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
427
428 // +1 for the trailing NULL
429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
430 return buf;
431 }
432 }
433
434 return wxWCharBuffer();
435 }
436
437 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438 {
439 if ( pwz )
440 {
441 const size_t nLen = FromWChar(NULL, 0, pwz);
442 if ( nLen != wxCONV_FAILED )
443 {
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451 }
452
453 const wxWCharBuffer
454 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
455 {
456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
457 if ( dstLen != wxCONV_FAILED )
458 {
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
463 wbuf.data()[dstLen] = L'\0';
464 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
465 {
466 if ( outLen )
467 {
468 *outLen = dstLen;
469
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen == wxNO_LEN )
476 (*outLen)--;
477 }
478
479 return wbuf;
480 }
481 }
482
483 if ( outLen )
484 *outLen = 0;
485
486 return wxWCharBuffer();
487 }
488
489 const wxCharBuffer
490 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
491 {
492 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
493 if ( dstLen != wxCONV_FAILED )
494 {
495 const size_t nulLen = GetMBNulLen();
496
497 // as above, ensure that the buffer is always NUL-terminated, even if
498 // the input is not
499 wxCharBuffer buf(dstLen + nulLen - 1);
500 memset(buf.data() + dstLen, 0, nulLen);
501 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
502 {
503 if ( outLen )
504 {
505 *outLen = dstLen;
506
507 if ( inLen == wxNO_LEN )
508 {
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
511 *outLen -= nulLen;
512 }
513 }
514
515 return buf;
516 }
517 }
518
519 if ( outLen )
520 *outLen = 0;
521
522 return wxCharBuffer();
523 }
524
525 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
526 {
527 const size_t srcLen = buf.length();
528 if ( srcLen )
529 {
530 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
531 if ( dstLen != wxCONV_FAILED )
532 {
533 wxWCharBuffer wbuf(dstLen);
534 wbuf.data()[dstLen] = L'\0';
535 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
536 return wbuf;
537 }
538 }
539
540 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
541 }
542
543 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
544 {
545 const size_t srcLen = wbuf.length();
546 if ( srcLen )
547 {
548 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
549 if ( dstLen != wxCONV_FAILED )
550 {
551 wxCharBuffer buf(dstLen);
552 buf.data()[dstLen] = '\0';
553 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
554 return buf;
555 }
556 }
557
558 return wxScopedCharBuffer::CreateNonOwned("", 0);
559 }
560
561 // ----------------------------------------------------------------------------
562 // wxMBConvLibc
563 // ----------------------------------------------------------------------------
564
565 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
566 {
567 return wxMB2WC(buf, psz, n);
568 }
569
570 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
571 {
572 return wxWC2MB(buf, psz, n);
573 }
574
575 // ----------------------------------------------------------------------------
576 // wxConvBrokenFileNames
577 // ----------------------------------------------------------------------------
578
579 #ifdef __UNIX__
580
581 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
582 {
583 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
584 wxStricmp(charset, wxT("UTF8")) == 0 )
585 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
586 else
587 m_conv = new wxCSConv(charset);
588 }
589
590 #endif // __UNIX__
591
592 // ----------------------------------------------------------------------------
593 // UTF-7
594 // ----------------------------------------------------------------------------
595
596 // Implementation (C) 2004 Fredrik Roubert
597 //
598 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
599
//
// BASE64 decoding table
//
// Indexed by the byte value: gives the 6 bit value of the bytes which are
// valid BASE64 characters and 0xff for all the other ones.
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: nothing here is valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
638
639 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
640 const char *src, size_t srcLen) const
641 {
642 DecoderState stateOrig,
643 *statePtr;
644 if ( srcLen == wxNO_LEN )
645 {
646 // convert the entire string, up to and including the trailing NUL
647 srcLen = strlen(src) + 1;
648
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr = &stateOrig;
652 }
653 else // when working with partial strings we do use the shift state
654 {
655 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
656
657 // also save the old state to be able to rollback to it on error
658 stateOrig = m_stateDecoder;
659 }
660
661 // but to simplify the code below we use this variable in both cases
662 DecoderState& state = *statePtr;
663
664
665 // number of characters [which would have been] written to dst [if it were
666 // not NULL]
667 size_t len = 0;
668
669 const char * const srcEnd = src + srcLen;
670
671 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
672 {
673 const unsigned char cc = *src++;
674
675 if ( state.IsShifted() )
676 {
677 const unsigned char dc = utf7unb64[cc];
678 if ( dc == 0xff )
679 {
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
684 // characters)
685 if ( state.isLSB || state.bit > 4 ||
686 (state.accum & ((1 << state.bit) - 1)) )
687 {
688 if ( !len )
689 state = stateOrig;
690
691 return wxCONV_FAILED;
692 }
693
694 state.ToDirect();
695
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
698 if ( cc == '-' )
699 continue;
700 }
701 else // valid encoded character
702 {
703 // mini base64 decoder: each character is 6 bits
704 state.bit += 6;
705 state.accum <<= 6;
706 state.accum += dc;
707
708 if ( state.bit >= 8 )
709 {
710 // got the full byte, consume it
711 state.bit -= 8;
712 unsigned char b = (state.accum >> state.bit) & 0x00ff;
713
714 if ( state.isLSB )
715 {
716 // we've got the full word, output it
717 if ( dst )
718 *dst++ = (state.msb << 8) | b;
719 len++;
720 state.isLSB = false;
721 }
722 else // MSB
723 {
724 // just store it while we wait for LSB
725 state.msb = b;
726 state.isLSB = true;
727 }
728 }
729 }
730 }
731
732 if ( state.IsDirect() )
733 {
734 // start of an encoded segment?
735 if ( cc == '+' )
736 {
737 if ( *src == '-' )
738 {
739 // just the encoded plus sign, don't switch to shifted mode
740 if ( dst )
741 *dst++ = '+';
742 len++;
743 src++;
744 }
745 else if ( utf7unb64[(unsigned)*src] == 0xff )
746 {
747 // empty encoded chunks are not allowed
748 if ( !len )
749 state = stateOrig;
750
751 return wxCONV_FAILED;
752 }
753 else // base-64 encoded chunk follows
754 {
755 state.ToShifted();
756 }
757 }
758 else // not '+'
759 {
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc >= 0x7f || (cc < ' ' &&
763 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
764 return wxCONV_FAILED;
765
766 if ( dst )
767 *dst++ = cc;
768 len++;
769 }
770 }
771 }
772
773 if ( !len )
774 {
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
777 // state
778 state = stateOrig;
779
780 return wxCONV_FAILED;
781 }
782
783 return len;
784 }
785
//
// BASE64 encoding table
//
// Maps each 6 bit value to the character representing it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
800
//
// UTF-7 encoding table
//
// Gives the class of each of the 128 ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 0x70..0x7f
};

// Returns true if wc is in the class 0 of the table above, i.e. can be
// written to UTF-7 output as is, and false for all the other characters,
// including all those outside of the ASCII range.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // compare as unsigned: wchar_t is a signed type on some platforms and a
    // negative value must be rejected and not used as the table index
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
825
826 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
827 const wchar_t *src, size_t srcLen) const
828 {
829 EncoderState stateOrig,
830 *statePtr;
831 if ( srcLen == wxNO_LEN )
832 {
833 // we don't apply the stored state when operating on entire strings at
834 // once
835 statePtr = &stateOrig;
836
837 srcLen = wxWcslen(src) + 1;
838 }
839 else // do use the mode we left the output in previously
840 {
841 stateOrig = m_stateEncoder;
842 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
843 }
844
845 EncoderState& state = *statePtr;
846
847
848 size_t len = 0;
849
850 const wchar_t * const srcEnd = src + srcLen;
851 while ( src < srcEnd && (!dst || len < dstLen) )
852 {
853 wchar_t cc = *src++;
854 if ( wxIsUTF7Direct(cc) )
855 {
856 if ( state.IsShifted() )
857 {
858 // pad with zeros the last encoded block if necessary
859 if ( state.bit )
860 {
861 if ( dst )
862 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
863 len++;
864 }
865
866 state.ToDirect();
867
868 if ( dst )
869 *dst++ = '-';
870 len++;
871 }
872
873 if ( dst )
874 *dst++ = (char)cc;
875 len++;
876 }
877 else if ( cc == '+' && state.IsDirect() )
878 {
879 if ( dst )
880 {
881 *dst++ = '+';
882 *dst++ = '-';
883 }
884
885 len += 2;
886 }
887 #ifndef WC_UTF16
888 else if (((wxUint32)cc) > 0xffff)
889 {
890 // no surrogate pair generation (yet?)
891 return wxCONV_FAILED;
892 }
893 #endif
894 else
895 {
896 if ( state.IsDirect() )
897 {
898 state.ToShifted();
899
900 if ( dst )
901 *dst++ = '+';
902 len++;
903 }
904
905 // BASE64 encode string
906 for ( ;; )
907 {
908 for ( unsigned lsb = 0; lsb < 2; lsb++ )
909 {
910 state.accum <<= 8;
911 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
912
913 for (state.bit += 8; state.bit >= 6; )
914 {
915 state.bit -= 6;
916 if ( dst )
917 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
918 len++;
919 }
920 }
921
922 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
923 break;
924
925 src++;
926 }
927 }
928 }
929
930 // we need to restore the original encoder state if we were called just to
931 // calculate the amount of space needed as we will presumably be called
932 // again to really convert the data now
933 if ( !dst )
934 state = stateOrig;
935
936 return len;
937 }
938
939 // ----------------------------------------------------------------------------
940 // UTF-8
941 // ----------------------------------------------------------------------------
942
943 static const wxUint32 utf8_max[]=
944 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
945
946 // boundaries of the private use area we use to (temporarily) remap invalid
947 // characters invalid in a UTF-8 encoded string
948 const wxUint32 wxUnicodePUA = 0x100000;
949 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
950
// this table is indexed by the first byte of a UTF-8 sequence and gives the
// total length of this sequence in bytes or 0 if the byte can't start one:
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F

    // 80..BF: invalid
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF

    // C0 and C1 are invalid too, C2..DF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF

    // F0..F4 start four-byte sequences, F5..FF are invalid again (5- or
    // 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F0..FF
};
985
986 size_t
987 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989 {
990 wchar_t *out = dstLen ? dst : NULL;
991 size_t written = 0;
992
993 if ( srcLen == wxNO_LEN )
994 srcLen = strlen(src) + 1;
995
996 for ( const char *p = src; ; p++ )
997 {
998 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
999 {
1000 // all done successfully, just add the trailing NULL if we are not
1001 // using explicit length
1002 if ( srcLen == wxNO_LEN )
1003 {
1004 if ( out )
1005 {
1006 if ( !dstLen )
1007 break;
1008
1009 *out = L'\0';
1010 }
1011
1012 written++;
1013 }
1014
1015 return written;
1016 }
1017
1018 if ( out && !dstLen-- )
1019 break;
1020
1021 wxUint32 code;
1022 unsigned char c = *p;
1023
1024 if ( c < 0x80 )
1025 {
1026 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1027 break;
1028
1029 if ( srcLen != wxNO_LEN )
1030 srcLen--;
1031
1032 code = c;
1033 }
1034 else
1035 {
1036 unsigned len = tableUtf8Lengths[c];
1037 if ( !len )
1038 break;
1039
1040 if ( srcLen < len ) // the test works for wxNO_LEN too
1041 break;
1042
1043 if ( srcLen != wxNO_LEN )
1044 srcLen -= len;
1045
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1053 //
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1057
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1059 // length:
1060 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1061
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1065
1066 len--; // it's more convenient to work with 0-based length here
1067
1068 // extract the lead byte's value bits:
1069 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1070 break;
1071
1072 code = c & leadValueMask[len];
1073
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len; --len )
1077 {
1078 c = *++p;
1079 if ( (c & 0xC0) != 0x80 )
1080 return wxCONV_FAILED;
1081
1082 code <<= 6;
1083 code |= c & 0x3F;
1084 }
1085 }
1086
1087 #ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1090 {
1091 if ( out )
1092 out++;
1093 written++;
1094 }
1095 #else // !WC_UTF16
1096 if ( out )
1097 *out = code;
1098 #endif // WC_UTF16/!WC_UTF16
1099
1100 if ( out )
1101 out++;
1102
1103 written++;
1104 }
1105
1106 return wxCONV_FAILED;
1107 }
1108
1109 size_t
1110 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1111 const wchar_t *src, size_t srcLen) const
1112 {
1113 char *out = dstLen ? dst : NULL;
1114 size_t written = 0;
1115
1116 for ( const wchar_t *wp = src; ; wp++ )
1117 {
1118 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1119 {
1120 // all done successfully, just add the trailing NULL if we are not
1121 // using explicit length
1122 if ( srcLen == wxNO_LEN )
1123 {
1124 if ( out )
1125 {
1126 if ( !dstLen )
1127 break;
1128
1129 *out = '\0';
1130 }
1131
1132 written++;
1133 }
1134
1135 return written;
1136 }
1137
1138 if ( srcLen != wxNO_LEN )
1139 srcLen--;
1140
1141 wxUint32 code;
1142 #ifdef WC_UTF16
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1145 {
1146 // skip the next char too as we decoded a surrogate
1147 wp++;
1148 }
1149 #else // wchar_t is UTF-32
1150 code = *wp & 0x7fffffff;
1151 #endif
1152
1153 unsigned len;
1154 if ( code <= 0x7F )
1155 {
1156 len = 1;
1157 if ( out )
1158 {
1159 if ( dstLen < len )
1160 break;
1161
1162 out[0] = (char)code;
1163 }
1164 }
1165 else if ( code <= 0x07FF )
1166 {
1167 len = 2;
1168 if ( out )
1169 {
1170 if ( dstLen < len )
1171 break;
1172
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1176 out[0] = 0xC0 | code;
1177 }
1178 }
1179 else if ( code < 0xFFFF )
1180 {
1181 len = 3;
1182 if ( out )
1183 {
1184 if ( dstLen < len )
1185 break;
1186
1187 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1188 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[0] = 0xE0 | code;
1190 }
1191 }
1192 else if ( code <= 0x10FFFF )
1193 {
1194 len = 4;
1195 if ( out )
1196 {
1197 if ( dstLen < len )
1198 break;
1199
1200 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1201 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[0] = 0xF0 | code;
1204 }
1205 }
1206 else
1207 {
1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1209 break;
1210 }
1211
1212 if ( out )
1213 {
1214 out += len;
1215 dstLen -= len;
1216 }
1217
1218 written += len;
1219 }
1220
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED;
1223 }
1224
// Decode UTF-8 input to wide characters, mapping invalid sequences according
// to m_options instead of failing.
//
// buf, n:      output buffer and its size in wchar_t units; buf may be NULL in
//              which case only the length is computed
// psz, srcLen: input and its length in bytes, or wxNO_LEN if NUL-terminated
//
// With MAP_INVALID_UTF8_NOT the strict base class implementation is used.
// Otherwise each byte of an invalid sequence is either mapped to
// wxUnicodePUA + byte (MAP_INVALID_UTF8_TO_PUA) or written as a backslash
// followed by 3 octal digits (MAP_INVALID_UTF8_TO_OCTAL).
//
// Returns the number of wide characters produced plus one.
size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
                             const char *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);

    size_t len = 0;

    // continue while there is input left and, if we have an output buffer,
    // while there is still room in it
    while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to re-scan its bytes
        // if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;

                // extract the value bits of the lead byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    // NOTE(review): this read is not bounded by srcLen and
                    // srcLen is not decremented for continuation bytes --
                    // confirm callers using an explicit length
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // presumably utf8_max[ocnt] is the largest value encodable
                // with one byte less, i.e. this rejects overlong forms
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    // map every byte consumed so far to wxUnicodePUA + byte
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    // write every byte consumed so far as "\ooo"
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the 4 characters are only stored if they all fit
                        // but len is advanced in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    // NUL-terminate the output if the input was NUL-terminated and there is
    // still room for it
    if (srcLen == wxNO_LEN && buf && (len < n))
        *buf = 0;

    return len + 1;
}
1372
// Check whether the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1377
// Encode wide characters as UTF-8, undoing the mappings which ToWChar()
// applies to invalid input according to m_options.
//
// buf, n:      output buffer and its size in bytes; buf may be NULL in which
//              case only the length is computed
// psz, srcLen: input and its length in wchar_t units, or wxNO_LEN if
//              NUL-terminated
//
// With MAP_INVALID_UTF8_NOT the strict base class implementation is used.
//
// Returns the number of bytes produced plus one.
size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
                               const wchar_t *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);

    size_t len = 0;

    // continue while there is input left and, if we have an output buffer,
    // while there is still room in it
    while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // skip a single unit if decoding failed
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // a character in the PUA range maps back to a single raw byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // a doubled backslash stands for a single one
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // a backslash followed by 3 octal digits stands for a raw byte
            //
            // NOTE(review): srcLen is not adjusted for the extra characters
            // consumed here and above -- confirm for explicit lengths
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this value
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // lead byte: marker bits followed by the most significant
                    // value bits (NB: this relies on right shift of a
                    // negative number being arithmetic)
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // continuation bytes: 10xxxxxx, most significant first
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    // NUL-terminate the output if the input was NUL-terminated and there is
    // still room for it
    if (srcLen == wxNO_LEN && buf && (len < n))
        *buf = 0;

    return len + 1;
}
1459
1460 // ============================================================================
1461 // UTF-16
1462 // ============================================================================
1463
// The "straight" converter is the one whose byte order is the same as the
// native one (selected using WORDS_BIGENDIAN) and the "swap" converter is the
// one for the opposite byte order: the functions below are implemented under
// these names.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1471
1472 /* static */
1473 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1474 {
1475 if ( srcLen == wxNO_LEN )
1476 {
1477 // count the number of bytes in input, including the trailing NULs
1478 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1479 for ( srcLen = 1; *inBuff++; srcLen++ )
1480 ;
1481
1482 srcLen *= BYTES_PER_CHAR;
1483 }
1484 else // we already have the length
1485 {
1486 // we can only convert an entire number of UTF-16 characters
1487 if ( srcLen % BYTES_PER_CHAR )
1488 return wxCONV_FAILED;
1489 }
1490
1491 return srcLen;
1492 }
1493
1494 // case when in-memory representation is UTF-16 too
1495 #ifdef WC_UTF16
1496
1497 // ----------------------------------------------------------------------------
1498 // conversions without endianness change
1499 // ----------------------------------------------------------------------------
1500
1501 size_t
1502 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1503 const char *src, size_t srcLen) const
1504 {
1505 // set up the scene for using memcpy() (which is presumably more efficient
1506 // than copying the bytes one by one)
1507 srcLen = GetLength(src, srcLen);
1508 if ( srcLen == wxNO_LEN )
1509 return wxCONV_FAILED;
1510
1511 const size_t inLen = srcLen / BYTES_PER_CHAR;
1512 if ( dst )
1513 {
1514 if ( dstLen < inLen )
1515 return wxCONV_FAILED;
1516
1517 memcpy(dst, src, srcLen);
1518 }
1519
1520 return inLen;
1521 }
1522
1523 size_t
1524 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1525 const wchar_t *src, size_t srcLen) const
1526 {
1527 if ( srcLen == wxNO_LEN )
1528 srcLen = wxWcslen(src) + 1;
1529
1530 srcLen *= BYTES_PER_CHAR;
1531
1532 if ( dst )
1533 {
1534 if ( dstLen < srcLen )
1535 return wxCONV_FAILED;
1536
1537 memcpy(dst, src, srcLen);
1538 }
1539
1540 return srcLen;
1541 }
1542
1543 // ----------------------------------------------------------------------------
1544 // endian-reversing conversions
1545 // ----------------------------------------------------------------------------
1546
1547 size_t
1548 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1549 const char *src, size_t srcLen) const
1550 {
1551 srcLen = GetLength(src, srcLen);
1552 if ( srcLen == wxNO_LEN )
1553 return wxCONV_FAILED;
1554
1555 srcLen /= BYTES_PER_CHAR;
1556
1557 if ( dst )
1558 {
1559 if ( dstLen < srcLen )
1560 return wxCONV_FAILED;
1561
1562 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1563 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1564 {
1565 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1566 }
1567 }
1568
1569 return srcLen;
1570 }
1571
1572 size_t
1573 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1574 const wchar_t *src, size_t srcLen) const
1575 {
1576 if ( srcLen == wxNO_LEN )
1577 srcLen = wxWcslen(src) + 1;
1578
1579 srcLen *= BYTES_PER_CHAR;
1580
1581 if ( dst )
1582 {
1583 if ( dstLen < srcLen )
1584 return wxCONV_FAILED;
1585
1586 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1587 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1588 {
1589 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1590 }
1591 }
1592
1593 return srcLen;
1594 }
1595
1596 #else // !WC_UTF16: wchar_t is UTF-32
1597
1598 // ----------------------------------------------------------------------------
1599 // conversions without endianness change
1600 // ----------------------------------------------------------------------------
1601
1602 size_t
1603 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1604 const char *src, size_t srcLen) const
1605 {
1606 srcLen = GetLength(src, srcLen);
1607 if ( srcLen == wxNO_LEN )
1608 return wxCONV_FAILED;
1609
1610 const size_t inLen = srcLen / BYTES_PER_CHAR;
1611 if ( !dst )
1612 {
1613 // optimization: return maximal space which could be needed for this
1614 // string even if the real size could be smaller if the buffer contains
1615 // any surrogates
1616 return inLen;
1617 }
1618
1619 size_t outLen = 0;
1620 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1621 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1622 {
1623 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1624 if ( !inBuff )
1625 return wxCONV_FAILED;
1626
1627 if ( ++outLen > dstLen )
1628 return wxCONV_FAILED;
1629
1630 *dst++ = ch;
1631 }
1632
1633
1634 return outLen;
1635 }
1636
1637 size_t
1638 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1639 const wchar_t *src, size_t srcLen) const
1640 {
1641 if ( srcLen == wxNO_LEN )
1642 srcLen = wxWcslen(src) + 1;
1643
1644 size_t outLen = 0;
1645 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1646 for ( size_t n = 0; n < srcLen; n++ )
1647 {
1648 wxUint16 cc[2];
1649 const size_t numChars = encode_utf16(*src++, cc);
1650 if ( numChars == wxCONV_FAILED )
1651 return wxCONV_FAILED;
1652
1653 outLen += numChars * BYTES_PER_CHAR;
1654 if ( outBuff )
1655 {
1656 if ( outLen > dstLen )
1657 return wxCONV_FAILED;
1658
1659 *outBuff++ = cc[0];
1660 if ( numChars == 2 )
1661 {
1662 // second character of a surrogate
1663 *outBuff++ = cc[1];
1664 }
1665 }
1666 }
1667
1668 return outLen;
1669 }
1670
1671 // ----------------------------------------------------------------------------
1672 // endian-reversing conversions
1673 // ----------------------------------------------------------------------------
1674
1675 size_t
1676 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1677 const char *src, size_t srcLen) const
1678 {
1679 srcLen = GetLength(src, srcLen);
1680 if ( srcLen == wxNO_LEN )
1681 return wxCONV_FAILED;
1682
1683 const size_t inLen = srcLen / BYTES_PER_CHAR;
1684 if ( !dst )
1685 {
1686 // optimization: return maximal space which could be needed for this
1687 // string even if the real size could be smaller if the buffer contains
1688 // any surrogates
1689 return inLen;
1690 }
1691
1692 size_t outLen = 0;
1693 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1694 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1695 {
1696 wxUint32 ch;
1697 wxUint16 tmp[2];
1698
1699 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1700 inBuff++;
1701 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1702
1703 const size_t numChars = decode_utf16(tmp, ch);
1704 if ( numChars == wxCONV_FAILED )
1705 return wxCONV_FAILED;
1706
1707 if ( numChars == 2 )
1708 inBuff++;
1709
1710 if ( ++outLen > dstLen )
1711 return wxCONV_FAILED;
1712
1713 *dst++ = ch;
1714 }
1715
1716
1717 return outLen;
1718 }
1719
1720 size_t
1721 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1722 const wchar_t *src, size_t srcLen) const
1723 {
1724 if ( srcLen == wxNO_LEN )
1725 srcLen = wxWcslen(src) + 1;
1726
1727 size_t outLen = 0;
1728 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1729 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1730 {
1731 wxUint16 cc[2];
1732 const size_t numChars = encode_utf16(*src, cc);
1733 if ( numChars == wxCONV_FAILED )
1734 return wxCONV_FAILED;
1735
1736 outLen += numChars * BYTES_PER_CHAR;
1737 if ( outBuff )
1738 {
1739 if ( outLen > dstLen )
1740 return wxCONV_FAILED;
1741
1742 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1743 if ( numChars == 2 )
1744 {
1745 // second character of a surrogate
1746 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1747 }
1748 }
1749 }
1750
1751 return outLen;
1752 }
1753
1754 #endif // WC_UTF16/!WC_UTF16
1755
1756
1757 // ============================================================================
1758 // UTF-32
1759 // ============================================================================
1760
// The "straight" converter is the one whose byte order is the same as the
// native one (selected using WORDS_BIGENDIAN) and the "swap" converter is the
// one for the opposite byte order: the functions below are implemented under
// these names.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// definitions of the global UTF-32 converter objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1772
1773 /* static */
1774 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1775 {
1776 if ( srcLen == wxNO_LEN )
1777 {
1778 // count the number of bytes in input, including the trailing NULs
1779 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1780 for ( srcLen = 1; *inBuff++; srcLen++ )
1781 ;
1782
1783 srcLen *= BYTES_PER_CHAR;
1784 }
1785 else // we already have the length
1786 {
1787 // we can only convert an entire number of UTF-32 characters
1788 if ( srcLen % BYTES_PER_CHAR )
1789 return wxCONV_FAILED;
1790 }
1791
1792 return srcLen;
1793 }
1794
1795 // case when in-memory representation is UTF-16
1796 #ifdef WC_UTF16
1797
1798 // ----------------------------------------------------------------------------
1799 // conversions without endianness change
1800 // ----------------------------------------------------------------------------
1801
1802 size_t
1803 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1804 const char *src, size_t srcLen) const
1805 {
1806 srcLen = GetLength(src, srcLen);
1807 if ( srcLen == wxNO_LEN )
1808 return wxCONV_FAILED;
1809
1810 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1811 const size_t inLen = srcLen / BYTES_PER_CHAR;
1812 size_t outLen = 0;
1813 for ( size_t n = 0; n < inLen; n++ )
1814 {
1815 wxUint16 cc[2];
1816 const size_t numChars = encode_utf16(*inBuff++, cc);
1817 if ( numChars == wxCONV_FAILED )
1818 return wxCONV_FAILED;
1819
1820 outLen += numChars;
1821 if ( dst )
1822 {
1823 if ( outLen > dstLen )
1824 return wxCONV_FAILED;
1825
1826 *dst++ = cc[0];
1827 if ( numChars == 2 )
1828 {
1829 // second character of a surrogate
1830 *dst++ = cc[1];
1831 }
1832 }
1833 }
1834
1835 return outLen;
1836 }
1837
1838 size_t
1839 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1840 const wchar_t *src, size_t srcLen) const
1841 {
1842 if ( srcLen == wxNO_LEN )
1843 srcLen = wxWcslen(src) + 1;
1844
1845 if ( !dst )
1846 {
1847 // optimization: return maximal space which could be needed for this
1848 // string instead of the exact amount which could be less if there are
1849 // any surrogates in the input
1850 //
1851 // we consider that surrogates are rare enough to make it worthwhile to
1852 // avoid running the loop below at the cost of slightly extra memory
1853 // consumption
1854 return srcLen * BYTES_PER_CHAR;
1855 }
1856
1857 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1858 size_t outLen = 0;
1859 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1860 {
1861 const wxUint32 ch = wxDecodeSurrogate(&src);
1862 if ( !src )
1863 return wxCONV_FAILED;
1864
1865 outLen += BYTES_PER_CHAR;
1866
1867 if ( outLen > dstLen )
1868 return wxCONV_FAILED;
1869
1870 *outBuff++ = ch;
1871 }
1872
1873 return outLen;
1874 }
1875
1876 // ----------------------------------------------------------------------------
1877 // endian-reversing conversions
1878 // ----------------------------------------------------------------------------
1879
1880 size_t
1881 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1882 const char *src, size_t srcLen) const
1883 {
1884 srcLen = GetLength(src, srcLen);
1885 if ( srcLen == wxNO_LEN )
1886 return wxCONV_FAILED;
1887
1888 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1889 const size_t inLen = srcLen / BYTES_PER_CHAR;
1890 size_t outLen = 0;
1891 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1892 {
1893 wxUint16 cc[2];
1894 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1895 if ( numChars == wxCONV_FAILED )
1896 return wxCONV_FAILED;
1897
1898 outLen += numChars;
1899 if ( dst )
1900 {
1901 if ( outLen > dstLen )
1902 return wxCONV_FAILED;
1903
1904 *dst++ = cc[0];
1905 if ( numChars == 2 )
1906 {
1907 // second character of a surrogate
1908 *dst++ = cc[1];
1909 }
1910 }
1911 }
1912
1913 return outLen;
1914 }
1915
1916 size_t
1917 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1918 const wchar_t *src, size_t srcLen) const
1919 {
1920 if ( srcLen == wxNO_LEN )
1921 srcLen = wxWcslen(src) + 1;
1922
1923 if ( !dst )
1924 {
1925 // optimization: return maximal space which could be needed for this
1926 // string instead of the exact amount which could be less if there are
1927 // any surrogates in the input
1928 //
1929 // we consider that surrogates are rare enough to make it worthwhile to
1930 // avoid running the loop below at the cost of slightly extra memory
1931 // consumption
1932 return srcLen*BYTES_PER_CHAR;
1933 }
1934
1935 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1936 size_t outLen = 0;
1937 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1938 {
1939 const wxUint32 ch = wxDecodeSurrogate(&src);
1940 if ( !src )
1941 return wxCONV_FAILED;
1942
1943 outLen += BYTES_PER_CHAR;
1944
1945 if ( outLen > dstLen )
1946 return wxCONV_FAILED;
1947
1948 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1949 }
1950
1951 return outLen;
1952 }
1953
1954 #else // !WC_UTF16: wchar_t is UTF-32
1955
1956 // ----------------------------------------------------------------------------
1957 // conversions without endianness change
1958 // ----------------------------------------------------------------------------
1959
1960 size_t
1961 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1962 const char *src, size_t srcLen) const
1963 {
1964 // use memcpy() as it should be much faster than hand-written loop
1965 srcLen = GetLength(src, srcLen);
1966 if ( srcLen == wxNO_LEN )
1967 return wxCONV_FAILED;
1968
1969 const size_t inLen = srcLen/BYTES_PER_CHAR;
1970 if ( dst )
1971 {
1972 if ( dstLen < inLen )
1973 return wxCONV_FAILED;
1974
1975 memcpy(dst, src, srcLen);
1976 }
1977
1978 return inLen;
1979 }
1980
1981 size_t
1982 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1983 const wchar_t *src, size_t srcLen) const
1984 {
1985 if ( srcLen == wxNO_LEN )
1986 srcLen = wxWcslen(src) + 1;
1987
1988 srcLen *= BYTES_PER_CHAR;
1989
1990 if ( dst )
1991 {
1992 if ( dstLen < srcLen )
1993 return wxCONV_FAILED;
1994
1995 memcpy(dst, src, srcLen);
1996 }
1997
1998 return srcLen;
1999 }
2000
2001 // ----------------------------------------------------------------------------
2002 // endian-reversing conversions
2003 // ----------------------------------------------------------------------------
2004
2005 size_t
2006 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2007 const char *src, size_t srcLen) const
2008 {
2009 srcLen = GetLength(src, srcLen);
2010 if ( srcLen == wxNO_LEN )
2011 return wxCONV_FAILED;
2012
2013 srcLen /= BYTES_PER_CHAR;
2014
2015 if ( dst )
2016 {
2017 if ( dstLen < srcLen )
2018 return wxCONV_FAILED;
2019
2020 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2021 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2022 {
2023 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2024 }
2025 }
2026
2027 return srcLen;
2028 }
2029
2030 size_t
2031 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2032 const wchar_t *src, size_t srcLen) const
2033 {
2034 if ( srcLen == wxNO_LEN )
2035 srcLen = wxWcslen(src) + 1;
2036
2037 srcLen *= BYTES_PER_CHAR;
2038
2039 if ( dst )
2040 {
2041 if ( dstLen < srcLen )
2042 return wxCONV_FAILED;
2043
2044 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2045 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2046 {
2047 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2048 }
2049 }
2050
2051 return srcLen;
2052 }
2053
2054 #endif // WC_UTF16/!WC_UTF16
2055
2056
2057 // ============================================================================
2058 // The classes doing conversion using the iconv_xxx() functions
2059 // ============================================================================
2060
2061 #ifdef HAVE_ICONV
2062
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the value returned by iconv() and the
// number of bytes left in the input buffer after the call.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of the input pointer to the type used for the second
// argument of iconv() (ICONV_CONST is defined elsewhere)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value returned by iconv_open() on failure
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP swaps the bytes of a single wchar_t and WC_ENC is the wx encoding
// corresponding to wchar_t: both depend on its size
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
2092
2093 // ----------------------------------------------------------------------------
2094 // wxMBConv_iconv: encapsulates an iconv character set
2095 // ----------------------------------------------------------------------------
2096
// Conversion class using the iconv library: it holds one iconv descriptor for
// each direction of the conversion between the charset with the name given to
// the constructor and the charset used for wchar_t.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the charset name as understood by iconv_open(); it is copied
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // implement base class virtual methods
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t GetMBNulLen() const;

#if wxUSE_UNICODE_UTF8
    virtual bool IsUTF8() const;
#endif

    // create a new converter for the same charset, also copying the cached
    // NUL length to it
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // the converter can be used only if both descriptors could be opened
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion (allocated with
    // wxStrdup() in the constructor and freed in the destructor)
    const char *m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
2152
2153 // make the constructor available for unit testing
2154 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2155 {
2156 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2157 if ( !result->IsOk() )
2158 {
2159 delete result;
2160 return 0;
2161 }
2162
2163 return result;
2164 }
2165
// definitions of the static members: the wide char charset name is empty
// until the first converter is created and determines it
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2168
2169 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2170 : m_name(wxStrdup(name))
2171 {
2172 m_minMBCharWidth = 0;
2173
2174 // check for charset that represents wchar_t:
2175 if ( ms_wcCharsetName.empty() )
2176 {
2177 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2178
2179 #if wxUSE_FONTMAP
2180 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2181 #else // !wxUSE_FONTMAP
2182 static const wxChar *const names_static[] =
2183 {
2184 #if SIZEOF_WCHAR_T == 4
2185 wxT("UCS-4"),
2186 #elif SIZEOF_WCHAR_T = 2
2187 wxT("UCS-2"),
2188 #endif
2189 NULL
2190 };
2191 const wxChar *const *names = names_static;
2192 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2193
2194 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2195 {
2196 const wxString nameCS(*names);
2197
2198 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2199 wxString nameXE(nameCS);
2200
2201 #ifdef WORDS_BIGENDIAN
2202 nameXE += wxT("BE");
2203 #else // little endian
2204 nameXE += wxT("LE");
2205 #endif
2206
2207 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2208 nameXE.c_str());
2209
2210 m2w = iconv_open(nameXE.ToAscii(), name);
2211 if ( m2w == ICONV_T_INVALID )
2212 {
2213 // try charset w/o bytesex info (e.g. "UCS4")
2214 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2215 nameCS.c_str());
2216 m2w = iconv_open(nameCS.ToAscii(), name);
2217
2218 // and check for bytesex ourselves:
2219 if ( m2w != ICONV_T_INVALID )
2220 {
2221 char buf[2], *bufPtr;
2222 wchar_t wbuf[2];
2223 size_t insz, outsz;
2224 size_t res;
2225
2226 buf[0] = 'A';
2227 buf[1] = 0;
2228 wbuf[0] = 0;
2229 insz = 2;
2230 outsz = SIZEOF_WCHAR_T * 2;
2231 char* wbufPtr = (char*)wbuf;
2232 bufPtr = buf;
2233
2234 res = iconv(
2235 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2236 &wbufPtr, &outsz);
2237
2238 if (ICONV_FAILED(res, insz))
2239 {
2240 wxLogLastError(wxT("iconv"));
2241 wxLogError(_("Conversion to charset '%s' doesn't work."),
2242 nameCS.c_str());
2243 }
2244 else // ok, can convert to this encoding, remember it
2245 {
2246 ms_wcCharsetName = nameCS;
2247 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2248 }
2249 }
2250 }
2251 else // use charset not requiring byte swapping
2252 {
2253 ms_wcCharsetName = nameXE;
2254 }
2255 }
2256
2257 wxLogTrace(TRACE_STRCONV,
2258 wxT("iconv wchar_t charset is \"%s\"%s"),
2259 ms_wcCharsetName.empty() ? wxString("<none>")
2260 : ms_wcCharsetName,
2261 ms_wcNeedsSwap ? wxT(" (needs swap)")
2262 : wxT(""));
2263 }
2264 else // we already have ms_wcCharsetName
2265 {
2266 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2267 }
2268
2269 if ( ms_wcCharsetName.empty() )
2270 {
2271 w2m = ICONV_T_INVALID;
2272 }
2273 else
2274 {
2275 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2276 if ( w2m == ICONV_T_INVALID )
2277 {
2278 wxLogTrace(TRACE_STRCONV,
2279 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2280 ms_wcCharsetName.c_str(), name);
2281 }
2282 }
2283 }
2284
2285 wxMBConv_iconv::~wxMBConv_iconv()
2286 {
2287 free(const_cast<char *>(m_name));
2288
2289 if ( m2w != ICONV_T_INVALID )
2290 iconv_close(m2w);
2291 if ( w2m != ICONV_T_INVALID )
2292 iconv_close(w2m);
2293 }
2294
2295 size_t
2296 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2297 const char *src, size_t srcLen) const
2298 {
2299 if ( srcLen == wxNO_LEN )
2300 {
2301 // find the string length: notice that must be done differently for
2302 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2303 // consecutive NULs
2304 const size_t nulLen = GetMBNulLen();
2305 switch ( nulLen )
2306 {
2307 default:
2308 return wxCONV_FAILED;
2309
2310 case 1:
2311 srcLen = strlen(src); // arguably more optimized than our version
2312 break;
2313
2314 case 2:
2315 case 4:
2316 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2317 // but they also have to start at character boundary and not
2318 // span two adjacent characters
2319 const char *p;
2320 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2321 ;
2322 srcLen = p - src;
2323 break;
2324 }
2325
2326 // when we're determining the length of the string ourselves we count
2327 // the terminating NUL(s) as part of it and always NUL-terminate the
2328 // output
2329 srcLen += nulLen;
2330 }
2331
2332 // we express length in the number of (wide) characters but iconv always
2333 // counts buffer sizes it in bytes
2334 dstLen *= SIZEOF_WCHAR_T;
2335
2336 #if wxUSE_THREADS
2337 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2338 // Unfortunately there are a couple of global wxCSConv objects such as
2339 // wxConvLocal that are used all over wx code, so we have to make sure
2340 // the handle is used by at most one thread at the time. Otherwise
2341 // only a few wx classes would be safe to use from non-main threads
2342 // as MB<->WC conversion would fail "randomly".
2343 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2344 #endif // wxUSE_THREADS
2345
2346 size_t res, cres;
2347 const char *pszPtr = src;
2348
2349 if ( dst )
2350 {
2351 char* bufPtr = (char*)dst;
2352
2353 // have destination buffer, convert there
2354 size_t dstLenOrig = dstLen;
2355 cres = iconv(m2w,
2356 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2357 &bufPtr, &dstLen);
2358
2359 // convert the number of bytes converted as returned by iconv to the
2360 // number of (wide) characters converted that we need
2361 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2362
2363 if (ms_wcNeedsSwap)
2364 {
2365 // convert to native endianness
2366 for ( unsigned i = 0; i < res; i++ )
2367 dst[i] = WC_BSWAP(dst[i]);
2368 }
2369 }
2370 else // no destination buffer
2371 {
2372 // convert using temp buffer to calculate the size of the buffer needed
2373 wchar_t tbuf[256];
2374 res = 0;
2375
2376 do
2377 {
2378 char* bufPtr = (char*)tbuf;
2379 dstLen = 8 * SIZEOF_WCHAR_T;
2380
2381 cres = iconv(m2w,
2382 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2383 &bufPtr, &dstLen );
2384
2385 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2386 }
2387 while ((cres == (size_t)-1) && (errno == E2BIG));
2388 }
2389
2390 if (ICONV_FAILED(cres, srcLen))
2391 {
2392 //VS: it is ok if iconv fails, hence trace only
2393 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2394 return wxCONV_FAILED;
2395 }
2396
2397 return res;
2398 }
2399
2400 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2401 const wchar_t *src, size_t srcLen) const
2402 {
2403 #if wxUSE_THREADS
2404 // NB: explained in MB2WC
2405 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2406 #endif
2407
2408 if ( srcLen == wxNO_LEN )
2409 srcLen = wxWcslen(src) + 1;
2410
2411 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2412 size_t outbuflen = dstLen;
2413 size_t res, cres;
2414
2415 wchar_t *tmpbuf = 0;
2416
2417 if (ms_wcNeedsSwap)
2418 {
2419 // need to copy to temp buffer to switch endianness
2420 // (doing WC_BSWAP twice on the original buffer won't work, as it
2421 // could be in read-only memory, or be accessed in some other thread)
2422 tmpbuf = (wchar_t *)malloc(inbuflen);
2423 for ( size_t i = 0; i < srcLen; i++ )
2424 tmpbuf[i] = WC_BSWAP(src[i]);
2425
2426 src = tmpbuf;
2427 }
2428
2429 char* inbuf = (char*)src;
2430 if ( dst )
2431 {
2432 // have destination buffer, convert there
2433 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2434
2435 res = dstLen - outbuflen;
2436 }
2437 else // no destination buffer
2438 {
2439 // convert using temp buffer to calculate the size of the buffer needed
2440 char tbuf[256];
2441 res = 0;
2442 do
2443 {
2444 dst = tbuf;
2445 outbuflen = WXSIZEOF(tbuf);
2446
2447 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2448
2449 res += WXSIZEOF(tbuf) - outbuflen;
2450 }
2451 while ((cres == (size_t)-1) && (errno == E2BIG));
2452 }
2453
2454 if (ms_wcNeedsSwap)
2455 {
2456 free(tmpbuf);
2457 }
2458
2459 if (ICONV_FAILED(cres, inbuflen))
2460 {
2461 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2462 return wxCONV_FAILED;
2463 }
2464
2465 return res;
2466 }
2467
2468 size_t wxMBConv_iconv::GetMBNulLen() const
2469 {
2470 if ( m_minMBCharWidth == 0 )
2471 {
2472 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2473
2474 #if wxUSE_THREADS
2475 // NB: explained in MB2WC
2476 wxMutexLocker lock(self->m_iconvMutex);
2477 #endif
2478
2479 const wchar_t *wnul = L"";
2480 char buf[8]; // should be enough for NUL in any encoding
2481 size_t inLen = sizeof(wchar_t),
2482 outLen = WXSIZEOF(buf);
2483 char *inBuff = (char *)wnul;
2484 char *outBuff = buf;
2485 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2486 {
2487 self->m_minMBCharWidth = (size_t)-1;
2488 }
2489 else // ok
2490 {
2491 self->m_minMBCharWidth = outBuff - buf;
2492 }
2493 }
2494
2495 return m_minMBCharWidth;
2496 }
2497
2498 #if wxUSE_UNICODE_UTF8
2499 bool wxMBConv_iconv::IsUTF8() const
2500 {
2501 return wxStricmp(m_name, "UTF-8") == 0 ||
2502 wxStricmp(m_name, "UTF8") == 0;
2503 }
2504 #endif
2505
2506 #endif // HAVE_ICONV
2507
2508
2509 // ============================================================================
2510 // Win32 conversion classes
2511 // ============================================================================
2512
2513 #ifdef wxHAVE_WIN32_MB2WC
2514
2515 // from utils.cpp
2516 #if wxUSE_FONTMAP
2517 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2518 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2519 #endif
2520
2521 class wxMBConv_win32 : public wxMBConv
2522 {
2523 public:
2524 wxMBConv_win32()
2525 {
2526 m_CodePage = CP_ACP;
2527 m_minMBCharWidth = 0;
2528 }
2529
2530 wxMBConv_win32(const wxMBConv_win32& conv)
2531 : wxMBConv()
2532 {
2533 m_CodePage = conv.m_CodePage;
2534 m_minMBCharWidth = conv.m_minMBCharWidth;
2535 }
2536
2537 #if wxUSE_FONTMAP
2538 wxMBConv_win32(const char* name)
2539 {
2540 m_CodePage = wxCharsetToCodepage(name);
2541 m_minMBCharWidth = 0;
2542 }
2543
2544 wxMBConv_win32(wxFontEncoding encoding)
2545 {
2546 m_CodePage = wxEncodingToCodepage(encoding);
2547 m_minMBCharWidth = 0;
2548 }
2549 #endif // wxUSE_FONTMAP
2550
2551 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2552 {
2553 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2554 // the behaviour is not compatible with the Unix version (using iconv)
2555 // and break the library itself, e.g. wxTextInputStream::NextChar()
2556 // wouldn't work if reading an incomplete MB char didn't result in an
2557 // error
2558 //
2559 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2560 // Win XP or newer and it is not supported for UTF-[78] so we always
2561 // use our own conversions in this case. See
2562 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2563 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2564 if ( m_CodePage == CP_UTF8 )
2565 {
2566 return wxMBConvUTF8().MB2WC(buf, psz, n);
2567 }
2568
2569 if ( m_CodePage == CP_UTF7 )
2570 {
2571 return wxMBConvUTF7().MB2WC(buf, psz, n);
2572 }
2573
2574 int flags = 0;
2575 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2576 IsAtLeastWin2kSP4() )
2577 {
2578 flags = MB_ERR_INVALID_CHARS;
2579 }
2580
2581 const size_t len = ::MultiByteToWideChar
2582 (
2583 m_CodePage, // code page
2584 flags, // flags: fall on error
2585 psz, // input string
2586 -1, // its length (NUL-terminated)
2587 buf, // output string
2588 buf ? n : 0 // size of output buffer
2589 );
2590 if ( !len )
2591 {
2592 // function totally failed
2593 return wxCONV_FAILED;
2594 }
2595
2596 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2597 // check if we succeeded, by doing a double trip:
2598 if ( !flags && buf )
2599 {
2600 const size_t mbLen = strlen(psz);
2601 wxCharBuffer mbBuf(mbLen);
2602 if ( ::WideCharToMultiByte
2603 (
2604 m_CodePage,
2605 0,
2606 buf,
2607 -1,
2608 mbBuf.data(),
2609 mbLen + 1, // size in bytes, not length
2610 NULL,
2611 NULL
2612 ) == 0 ||
2613 strcmp(mbBuf, psz) != 0 )
2614 {
2615 // we didn't obtain the same thing we started from, hence
2616 // the conversion was lossy and we consider that it failed
2617 return wxCONV_FAILED;
2618 }
2619 }
2620
2621 // note that it returns count of written chars for buf != NULL and size
2622 // of the needed buffer for buf == NULL so in either case the length of
2623 // the string (which never includes the terminating NUL) is one less
2624 return len - 1;
2625 }
2626
2627 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2628 {
2629 /*
2630 we have a problem here: by default, WideCharToMultiByte() may
2631 replace characters unrepresentable in the target code page with bad
2632 quality approximations such as turning "1/2" symbol (U+00BD) into
2633 "1" for the code pages which don't have it and we, obviously, want
2634 to avoid this at any price
2635
2636 the trouble is that this function does it _silently_, i.e. it won't
2637 even tell us whether it did or not... Win98/2000 and higher provide
2638 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2639 we have to resort to a round trip, i.e. check that converting back
2640 results in the same string -- this is, of course, expensive but
2641 otherwise we simply can't be sure to not garble the data.
2642 */
2643
2644 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2645 // it doesn't work with CJK encodings (which we test for rather roughly
2646 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2647 // supporting it
2648 BOOL usedDef wxDUMMY_INITIALIZE(false);
2649 BOOL *pUsedDef;
2650 int flags;
2651 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2652 {
2653 // it's our lucky day
2654 flags = WC_NO_BEST_FIT_CHARS;
2655 pUsedDef = &usedDef;
2656 }
2657 else // old system or unsupported encoding
2658 {
2659 flags = 0;
2660 pUsedDef = NULL;
2661 }
2662
2663 const size_t len = ::WideCharToMultiByte
2664 (
2665 m_CodePage, // code page
2666 flags, // either none or no best fit
2667 pwz, // input string
2668 -1, // it is (wide) NUL-terminated
2669 buf, // output buffer
2670 buf ? n : 0, // and its size
2671 NULL, // default "replacement" char
2672 pUsedDef // [out] was it used?
2673 );
2674
2675 if ( !len )
2676 {
2677 // function totally failed
2678 return wxCONV_FAILED;
2679 }
2680
2681 // we did something, check if we really succeeded
2682 if ( flags )
2683 {
2684 // check if the conversion failed, i.e. if any replacements
2685 // were done
2686 if ( usedDef )
2687 return wxCONV_FAILED;
2688 }
2689 else // we must resort to double tripping...
2690 {
2691 // first we need to ensure that we really have the MB data: this is
2692 // not the case if we're called with NULL buffer, in which case we
2693 // need to do the conversion yet again
2694 wxCharBuffer bufDef;
2695 if ( !buf )
2696 {
2697 bufDef = wxCharBuffer(len);
2698 buf = bufDef.data();
2699 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2700 buf, len, NULL, NULL) )
2701 return wxCONV_FAILED;
2702 }
2703
2704 if ( !n )
2705 n = wcslen(pwz);
2706 wxWCharBuffer wcBuf(n);
2707 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2708 wcscmp(wcBuf, pwz) != 0 )
2709 {
2710 // we didn't obtain the same thing we started from, hence
2711 // the conversion was lossy and we consider that it failed
2712 return wxCONV_FAILED;
2713 }
2714 }
2715
2716 // see the comment above for the reason of "len - 1"
2717 return len - 1;
2718 }
2719
2720 virtual size_t GetMBNulLen() const
2721 {
2722 if ( m_minMBCharWidth == 0 )
2723 {
2724 int len = ::WideCharToMultiByte
2725 (
2726 m_CodePage, // code page
2727 0, // no flags
2728 L"", // input string
2729 1, // translate just the NUL
2730 NULL, // output buffer
2731 0, // and its size
2732 NULL, // no replacement char
2733 NULL // [out] don't care if it was used
2734 );
2735
2736 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2737 switch ( len )
2738 {
2739 default:
2740 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2741 self->m_minMBCharWidth = (size_t)-1;
2742 break;
2743
2744 case 0:
2745 self->m_minMBCharWidth = (size_t)-1;
2746 break;
2747
2748 case 1:
2749 case 2:
2750 case 4:
2751 self->m_minMBCharWidth = len;
2752 break;
2753 }
2754 }
2755
2756 return m_minMBCharWidth;
2757 }
2758
2759 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2760
2761 bool IsOk() const { return m_CodePage != -1; }
2762
2763 private:
2764 static bool CanUseNoBestFit()
2765 {
2766 static int s_isWin98Or2k = -1;
2767
2768 if ( s_isWin98Or2k == -1 )
2769 {
2770 int verMaj, verMin;
2771 switch ( wxGetOsVersion(&verMaj, &verMin) )
2772 {
2773 case wxOS_WINDOWS_9X:
2774 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2775 break;
2776
2777 case wxOS_WINDOWS_NT:
2778 s_isWin98Or2k = verMaj >= 5;
2779 break;
2780
2781 default:
2782 // unknown: be conservative by default
2783 s_isWin98Or2k = 0;
2784 break;
2785 }
2786
2787 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2788 }
2789
2790 return s_isWin98Or2k == 1;
2791 }
2792
2793 static bool IsAtLeastWin2kSP4()
2794 {
2795 #ifdef __WXWINCE__
2796 return false;
2797 #else
2798 static int s_isAtLeastWin2kSP4 = -1;
2799
2800 if ( s_isAtLeastWin2kSP4 == -1 )
2801 {
2802 OSVERSIONINFOEX ver;
2803
2804 memset(&ver, 0, sizeof(ver));
2805 ver.dwOSVersionInfoSize = sizeof(ver);
2806 GetVersionEx((OSVERSIONINFO*)&ver);
2807
2808 s_isAtLeastWin2kSP4 =
2809 ((ver.dwMajorVersion > 5) || // Vista+
2810 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2811 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2812 ver.wServicePackMajor >= 4)) // 2000 SP4+
2813 ? 1 : 0;
2814 }
2815
2816 return s_isAtLeastWin2kSP4 == 1;
2817 #endif
2818 }
2819
2820
2821 // the code page we're working with
2822 long m_CodePage;
2823
2824 // cached result of GetMBNulLen(), set to 0 initially meaning
2825 // "unknown"
2826 size_t m_minMBCharWidth;
2827 };
2828
2829 #endif // wxHAVE_WIN32_MB2WC
2830
2831
2832 // ============================================================================
2833 // wxEncodingConverter based conversion classes
2834 // ============================================================================
2835
2836 #if wxUSE_FONTMAP
2837
2838 class wxMBConv_wxwin : public wxMBConv
2839 {
2840 private:
2841 void Init()
2842 {
2843 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2844 // The wxMBConv_cf class does a better job.
2845 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2846 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2847 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2848 }
2849
2850 public:
2851 // temporarily just use wxEncodingConverter stuff,
2852 // so that it works while a better implementation is built
2853 wxMBConv_wxwin(const char* name)
2854 {
2855 if (name)
2856 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2857 else
2858 m_enc = wxFONTENCODING_SYSTEM;
2859
2860 Init();
2861 }
2862
2863 wxMBConv_wxwin(wxFontEncoding enc)
2864 {
2865 m_enc = enc;
2866
2867 Init();
2868 }
2869
2870 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2871 {
2872 size_t inbuf = strlen(psz);
2873 if (buf)
2874 {
2875 if (!m2w.Convert(psz, buf))
2876 return wxCONV_FAILED;
2877 }
2878 return inbuf;
2879 }
2880
2881 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2882 {
2883 const size_t inbuf = wxWcslen(psz);
2884 if (buf)
2885 {
2886 if (!w2m.Convert(psz, buf))
2887 return wxCONV_FAILED;
2888 }
2889
2890 return inbuf;
2891 }
2892
2893 virtual size_t GetMBNulLen() const
2894 {
2895 switch ( m_enc )
2896 {
2897 case wxFONTENCODING_UTF16BE:
2898 case wxFONTENCODING_UTF16LE:
2899 return 2;
2900
2901 case wxFONTENCODING_UTF32BE:
2902 case wxFONTENCODING_UTF32LE:
2903 return 4;
2904
2905 default:
2906 return 1;
2907 }
2908 }
2909
2910 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2911
2912 bool IsOk() const { return m_ok; }
2913
2914 public:
2915 wxFontEncoding m_enc;
2916 wxEncodingConverter m2w, w2m;
2917
2918 private:
2919 // were we initialized successfully?
2920 bool m_ok;
2921
2922 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2923 };
2924
2925 // make the constructors available for unit testing
2926 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2927 {
2928 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2929 if ( !result->IsOk() )
2930 {
2931 delete result;
2932 return 0;
2933 }
2934
2935 return result;
2936 }
2937
2938 #endif // wxUSE_FONTMAP
2939
2940 // ============================================================================
2941 // wxCSConv implementation
2942 // ============================================================================
2943
2944 void wxCSConv::Init()
2945 {
2946 m_name = NULL;
2947 m_convReal = NULL;
2948 m_deferred = true;
2949 }
2950
2951 wxCSConv::wxCSConv(const wxString& charset)
2952 {
2953 Init();
2954
2955 if ( !charset.empty() )
2956 {
2957 SetName(charset.ToAscii());
2958 }
2959
2960 #if wxUSE_FONTMAP
2961 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2962 if ( m_encoding == wxFONTENCODING_MAX )
2963 {
2964 // set to unknown/invalid value
2965 m_encoding = wxFONTENCODING_SYSTEM;
2966 }
2967 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2968 {
2969 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2970 m_encoding = wxFONTENCODING_ISO8859_1;
2971 }
2972 #else
2973 m_encoding = wxFONTENCODING_SYSTEM;
2974 #endif
2975 }
2976
2977 wxCSConv::wxCSConv(wxFontEncoding encoding)
2978 {
2979 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2980 {
2981 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2982
2983 encoding = wxFONTENCODING_SYSTEM;
2984 }
2985
2986 Init();
2987
2988 m_encoding = encoding;
2989 }
2990
2991 wxCSConv::~wxCSConv()
2992 {
2993 Clear();
2994 }
2995
2996 wxCSConv::wxCSConv(const wxCSConv& conv)
2997 : wxMBConv()
2998 {
2999 Init();
3000
3001 SetName(conv.m_name);
3002 m_encoding = conv.m_encoding;
3003 }
3004
3005 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3006 {
3007 Clear();
3008
3009 SetName(conv.m_name);
3010 m_encoding = conv.m_encoding;
3011
3012 return *this;
3013 }
3014
3015 void wxCSConv::Clear()
3016 {
3017 free(m_name);
3018 wxDELETE(m_convReal);
3019
3020 m_name = NULL;
3021 }
3022
3023 void wxCSConv::SetName(const char *charset)
3024 {
3025 if (charset)
3026 {
3027 m_name = wxStrdup(charset);
3028 m_deferred = true;
3029 }
3030 }
3031
3032 #if wxUSE_FONTMAP
3033
3034 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3035 wxEncodingNameCache );
3036
3037 static wxEncodingNameCache gs_nameCache;
3038 #endif
3039
3040 wxMBConv *wxCSConv::DoCreate() const
3041 {
3042 #if wxUSE_FONTMAP
3043 wxLogTrace(TRACE_STRCONV,
3044 wxT("creating conversion for %s"),
3045 (m_name ? m_name
3046 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3047 #endif // wxUSE_FONTMAP
3048
3049 // check for the special case of ASCII or ISO8859-1 charset: as we have
3050 // special knowledge of it anyhow, we don't need to create a special
3051 // conversion object
3052 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3053 m_encoding == wxFONTENCODING_DEFAULT )
3054 {
3055 // don't convert at all
3056 return NULL;
3057 }
3058
3059 // we trust OS to do conversion better than we can so try external
3060 // conversion methods first
3061 //
3062 // the full order is:
3063 // 1. OS conversion (iconv() under Unix or Win32 API)
3064 // 2. hard coded conversions for UTF
3065 // 3. wxEncodingConverter as fall back
3066
3067 // step (1)
3068 #ifdef HAVE_ICONV
3069 #if !wxUSE_FONTMAP
3070 if ( m_name )
3071 #endif // !wxUSE_FONTMAP
3072 {
3073 #if wxUSE_FONTMAP
3074 wxFontEncoding encoding(m_encoding);
3075 #endif
3076
3077 if ( m_name )
3078 {
3079 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3080 if ( conv->IsOk() )
3081 return conv;
3082
3083 delete conv;
3084
3085 #if wxUSE_FONTMAP
3086 encoding =
3087 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3088 #endif // wxUSE_FONTMAP
3089 }
3090 #if wxUSE_FONTMAP
3091 {
3092 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3093 if ( it != gs_nameCache.end() )
3094 {
3095 if ( it->second.empty() )
3096 return NULL;
3097
3098 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3099 if ( conv->IsOk() )
3100 return conv;
3101
3102 delete conv;
3103 }
3104
3105 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3106 // CS : in case this does not return valid names (eg for MacRoman)
3107 // encoding got a 'failure' entry in the cache all the same,
3108 // although it just has to be created using a different method, so
3109 // only store failed iconv creation attempts (or perhaps we
3110 // shoulnd't do this at all ?)
3111 if ( names[0] != NULL )
3112 {
3113 for ( ; *names; ++names )
3114 {
3115 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3116 // will need changes that will obsolete this
3117 wxString name(*names);
3118 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3119 if ( conv->IsOk() )
3120 {
3121 gs_nameCache[encoding] = *names;
3122 return conv;
3123 }
3124
3125 delete conv;
3126 }
3127
3128 gs_nameCache[encoding] = wxT(""); // cache the failure
3129 }
3130 }
3131 #endif // wxUSE_FONTMAP
3132 }
3133 #endif // HAVE_ICONV
3134
3135 #ifdef wxHAVE_WIN32_MB2WC
3136 {
3137 #if wxUSE_FONTMAP
3138 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3139 : new wxMBConv_win32(m_encoding);
3140 if ( conv->IsOk() )
3141 return conv;
3142
3143 delete conv;
3144 #else
3145 return NULL;
3146 #endif
3147 }
3148 #endif // wxHAVE_WIN32_MB2WC
3149
3150 #ifdef __DARWIN__
3151 {
3152 // leave UTF16 and UTF32 to the built-ins of wx
3153 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3154 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3155 {
3156 #if wxUSE_FONTMAP
3157 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3158 : new wxMBConv_cf(m_encoding);
3159 #else
3160 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3161 #endif
3162
3163 if ( conv->IsOk() )
3164 return conv;
3165
3166 delete conv;
3167 }
3168 }
3169 #endif // __DARWIN__
3170
3171 // step (2)
3172 wxFontEncoding enc = m_encoding;
3173 #if wxUSE_FONTMAP
3174 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3175 {
3176 // use "false" to suppress interactive dialogs -- we can be called from
3177 // anywhere and popping up a dialog from here is the last thing we want to
3178 // do
3179 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3180 }
3181 #endif // wxUSE_FONTMAP
3182
3183 switch ( enc )
3184 {
3185 case wxFONTENCODING_UTF7:
3186 return new wxMBConvUTF7;
3187
3188 case wxFONTENCODING_UTF8:
3189 return new wxMBConvUTF8;
3190
3191 case wxFONTENCODING_UTF16BE:
3192 return new wxMBConvUTF16BE;
3193
3194 case wxFONTENCODING_UTF16LE:
3195 return new wxMBConvUTF16LE;
3196
3197 case wxFONTENCODING_UTF32BE:
3198 return new wxMBConvUTF32BE;
3199
3200 case wxFONTENCODING_UTF32LE:
3201 return new wxMBConvUTF32LE;
3202
3203 default:
3204 // nothing to do but put here to suppress gcc warnings
3205 break;
3206 }
3207
3208 // step (3)
3209 #if wxUSE_FONTMAP
3210 {
3211 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3212 : new wxMBConv_wxwin(m_encoding);
3213 if ( conv->IsOk() )
3214 return conv;
3215
3216 delete conv;
3217 }
3218
3219 wxLogTrace(TRACE_STRCONV,
3220 wxT("encoding \"%s\" is not supported by this system"),
3221 (m_name ? wxString(m_name)
3222 : wxFontMapperBase::GetEncodingName(m_encoding)));
3223 #endif // wxUSE_FONTMAP
3224
3225 return NULL;
3226 }
3227
3228 void wxCSConv::CreateConvIfNeeded() const
3229 {
3230 if ( m_deferred )
3231 {
3232 wxCSConv *self = const_cast<wxCSConv *>(this);
3233
3234 // if we don't have neither the name nor the encoding, use the default
3235 // encoding for this system
3236 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3237 {
3238 #if wxUSE_INTL
3239 self->m_encoding = wxLocale::GetSystemEncoding();
3240 #else
3241 // fallback to some reasonable default:
3242 self->m_encoding = wxFONTENCODING_ISO8859_1;
3243 #endif // wxUSE_INTL
3244 }
3245
3246 self->m_convReal = DoCreate();
3247 self->m_deferred = false;
3248 }
3249 }
3250
3251 bool wxCSConv::IsOk() const
3252 {
3253 CreateConvIfNeeded();
3254
3255 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3256 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3257 return true; // always ok as we do it ourselves
3258
3259 // m_convReal->IsOk() is called at its own creation, so we know it must
3260 // be ok if m_convReal is non-NULL
3261 return m_convReal != NULL;
3262 }
3263
3264 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3265 const char *src, size_t srcLen) const
3266 {
3267 CreateConvIfNeeded();
3268
3269 if (m_convReal)
3270 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3271
3272 // latin-1 (direct)
3273 if ( srcLen == wxNO_LEN )
3274 srcLen = strlen(src) + 1; // take trailing NUL too
3275
3276 if ( dst )
3277 {
3278 if ( dstLen < srcLen )
3279 return wxCONV_FAILED;
3280
3281 for ( size_t n = 0; n < srcLen; n++ )
3282 dst[n] = (unsigned char)(src[n]);
3283 }
3284
3285 return srcLen;
3286 }
3287
3288 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3289 const wchar_t *src, size_t srcLen) const
3290 {
3291 CreateConvIfNeeded();
3292
3293 if (m_convReal)
3294 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3295
3296 // latin-1 (direct)
3297 if ( srcLen == wxNO_LEN )
3298 srcLen = wxWcslen(src) + 1;
3299
3300 if ( dst )
3301 {
3302 if ( dstLen < srcLen )
3303 return wxCONV_FAILED;
3304
3305 for ( size_t n = 0; n < srcLen; n++ )
3306 {
3307 if ( src[n] > 0xFF )
3308 return wxCONV_FAILED;
3309
3310 dst[n] = (char)src[n];
3311 }
3312
3313 }
3314 else // still need to check the input validity
3315 {
3316 for ( size_t n = 0; n < srcLen; n++ )
3317 {
3318 if ( src[n] > 0xFF )
3319 return wxCONV_FAILED;
3320 }
3321 }
3322
3323 return srcLen;
3324 }
3325
3326 size_t wxCSConv::GetMBNulLen() const
3327 {
3328 CreateConvIfNeeded();
3329
3330 if ( m_convReal )
3331 {
3332 return m_convReal->GetMBNulLen();
3333 }
3334
3335 // otherwise, we are ISO-8859-1
3336 return 1;
3337 }
3338
3339 #if wxUSE_UNICODE_UTF8
3340 bool wxCSConv::IsUTF8() const
3341 {
3342 CreateConvIfNeeded();
3343
3344 if ( m_convReal )
3345 {
3346 return m_convReal->IsUTF8();
3347 }
3348
3349 // otherwise, we are ISO-8859-1
3350 return false;
3351 }
3352 #endif
3353
3354
3355 #if wxUSE_UNICODE
3356
3357 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3358 {
3359 if ( !s )
3360 return wxWCharBuffer();
3361
3362 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3363 if ( !wbuf )
3364 wbuf = wxMBConvUTF8().cMB2WX(s);
3365 if ( !wbuf )
3366 wbuf = wxConvISO8859_1.cMB2WX(s);
3367
3368 return wbuf;
3369 }
3370
3371 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3372 {
3373 if ( !ws )
3374 return wxCharBuffer();
3375
3376 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3377 if ( !buf )
3378 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3379
3380 return buf;
3381 }
3382
3383 #endif // wxUSE_UNICODE
3384
3385 // ----------------------------------------------------------------------------
3386 // globals
3387 // ----------------------------------------------------------------------------
3388
3389 // NB: The reason why we create converter objects in this convoluted way,
3390 // using a factory function instead of global variable, is that they
3391 // may be used at static initialization time (some of them are used by
3392 // wxString ctors and there may be a global wxString object). In other
3393 // words, possibly _before_ the converter global object would be
3394 // initialized.
3395
// The names below are used as plain identifiers (and pasted into other
// identifiers) in the definitions which follow.
// NOTE(review): presumably they are defined as macros in wx/strconv.h and
// this is why they need to be undefined first -- confirm.
3396 #undef wxConvLibc
3397 #undef wxConvUTF8
3398 #undef wxConvUTF7
3399 #undef wxConvLocal
3400 #undef wxConvISO8859_1
3401
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines:
//  - the exported pointer variable name##Ptr, initialized to NULL
//  - the exported accessor wxGet_##name##Ptr() returning, as klass*, the
//    address of its function-local static object of type impl_klass which
//    is constructed with ctor_args
//  - a file-static pointer initialized by calling this accessor
// The macro invocation must be followed by a semicolon.
3402 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3403 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3404 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3405 { \
3406 static impl_klass name##Obj ctor_args; \
3407 return &name##Obj; \
3408 } \
3409 /* this ensures that all global converter objects are created */ \
3410 /* by the time static initialization is done, i.e. before any */ \
3411 /* thread is launched: */ \
3412 static klass* gs_##name##instance = wxGet_##name##Ptr()
3413
// Same as WX_DEFINE_GLOBAL_CONV2() for the case when the type returned by the
// accessor and the type of the object are the same.
3414 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3415 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3416
3417 #ifdef __INTELC__
3418 // disable warning "variable 'xxx' was declared but never referenced"
3419 #pragma warning(disable: 177)
3420 #endif // Intel C++
3421
// wxConvLibc: Win32 API based converter under Windows, wxMBConvLibc elsewhere
// (the branch using wxMBConv_cf is disabled by "#elif 0")
3422 #ifdef __WINDOWS__
3423 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3424 #elif 0 // defined(__WXOSX__)
3425 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
3426 #else
3427 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3428 #endif
3429
3430 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3431 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3432 // provokes an error message about "not enough macro parameters"; and we
3433 // can't use "()" here as the name##Obj declaration would be parsed as a
3434 // function declaration then, so use a semicolon and live with an extra
3435 // empty statement (and hope that no compiler warns about this)
3436 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3437 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3438
3439 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3440 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3441
// initially wxConvCurrent points to the libc converter and wxConvUI to the
// wxConvLocal one
3442 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3443 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3444
3445 #ifdef __DARWIN__
3446 // It is important to use this conversion object under Darwin as it ensures
3447 // that Unicode strings are (re)composed correctly even though xnu kernel uses
3448 // decomposed form internally (at least for the file names).
3449 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3450 #endif
3451
// the converter used for the file names: the CoreFoundation-based UTF-8
// converter defined above under Darwin, the libc one everywhere else
3452 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3453 #ifdef __DARWIN__
3454 &wxConvMacUTF8DObj;
3455 #else // !__DARWIN__
3456 wxGet_wxConvLibcPtr();
3457 #endif // __DARWIN__/!__DARWIN__