]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Fix wxHtmlHelpData::SetTempDir() to behave correctly without trailing slash.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // (c) 2004 Ryan Norton, Fredrik Roubert
11 // Licence: wxWindows licence
12 /////////////////////////////////////////////////////////////////////////////
13
14 // For compilers that support precompilation, includes "wx.h".
15 #include "wx/wxprec.h"
16
17 #ifdef __BORLANDC__
18 #pragma hdrstop
19 #endif //__BORLANDC__
20
21 #ifndef WX_PRECOMP
22 #include "wx/intl.h"
23 #include "wx/log.h"
24 #include "wx/utils.h"
25 #include "wx/hashmap.h"
26 #endif
27
28 #include "wx/strconv.h"
29
30 #ifndef __WXWINCE__
31 #include <errno.h>
32 #endif
33
34 #include <ctype.h>
35 #include <string.h>
36 #include <stdlib.h>
37
38 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
39 #include "wx/msw/private.h"
40 #include "wx/msw/missing.h"
41 #define wxHAVE_WIN32_MB2WC
42 #endif
43
44 #ifdef HAVE_ICONV
45 #include <iconv.h>
46 #include "wx/thread.h"
47 #endif
48
49 #include "wx/encconv.h"
50 #include "wx/fontmap.h"
51
52 #ifdef __DARWIN__
53 #include "wx/osx/core/private/strconv_cf.h"
54 #endif //def __DARWIN__
55
56
57 #define TRACE_STRCONV wxT("strconv")
58
59 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
60 // be 4 bytes
61 #if SIZEOF_WCHAR_T == 2
62 #define WC_UTF16
63 #endif
64
65
66 // ============================================================================
67 // implementation
68 // ============================================================================
69
// Helper used when looking for multi-byte NUL terminators: returns true if at
// least one of the n bytes starting at p is not NUL, and false if all of them
// are NUL (which includes the degenerate n == 0 case).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
78
79 // ----------------------------------------------------------------------------
80 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
81 // ----------------------------------------------------------------------------
82
83 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
84 {
85 if (input <= 0xffff)
86 {
87 if (output)
88 *output = (wxUint16) input;
89
90 return 1;
91 }
92 else if (input >= 0x110000)
93 {
94 return wxCONV_FAILED;
95 }
96 else
97 {
98 if (output)
99 {
100 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
101 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
102 }
103
104 return 2;
105 }
106 }
107
108 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
109 {
110 if ((*input < 0xd800) || (*input > 0xdfff))
111 {
112 output = *input;
113 return 1;
114 }
115 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
116 {
117 output = *input;
118 return wxCONV_FAILED;
119 }
120 else
121 {
122 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
123 return 2;
124 }
125 }
126
127 #ifdef WC_UTF16
128 typedef wchar_t wxDecodeSurrogate_t;
129 #else // !WC_UTF16
130 typedef wxUint16 wxDecodeSurrogate_t;
131 #endif // WC_UTF16/!WC_UTF16
132
133 // returns the next UTF-32 character from the wchar_t buffer and advances the
134 // pointer to the character after this one
135 //
136 // if an invalid character is found, *pSrc is set to NULL, the caller must
137 // check for this
138 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
139 {
140 wxUint32 out;
141 const size_t
142 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
143 if ( n == wxCONV_FAILED )
144 *pSrc = NULL;
145 else
146 *pSrc += n;
147
148 return out;
149 }
150
151 // ----------------------------------------------------------------------------
152 // wxMBConv
153 // ----------------------------------------------------------------------------
154
155 size_t
156 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
157 const char *src, size_t srcLen) const
158 {
159 // although new conversion classes are supposed to implement this function
160 // directly, the existing ones only implement the old MB2WC() and so, to
161 // avoid to have to rewrite all conversion classes at once, we provide a
162 // default (but not efficient) implementation of this one in terms of the
163 // old function by copying the input to ensure that it's NUL-terminated and
164 // then using MB2WC() to convert it
165 //
166 // moreover, some conversion classes simply can't implement ToWChar()
167 // directly, the primary example is wxConvLibc: mbstowcs() only handles
168 // NUL-terminated strings
169
170 // the number of chars [which would be] written to dst [if it were not NULL]
171 size_t dstWritten = 0;
172
173 // the number of NULs terminating this string
174 size_t nulLen = 0; // not really needed, but just to avoid warnings
175
176 // if we were not given the input size we just have to assume that the
177 // string is properly terminated as we have no way of knowing how long it
178 // is anyhow, but if we do have the size check whether there are enough
179 // NULs at the end
180 wxCharBuffer bufTmp;
181 const char *srcEnd;
182 if ( srcLen != wxNO_LEN )
183 {
184 // we need to know how to find the end of this string
185 nulLen = GetMBNulLen();
186 if ( nulLen == wxCONV_FAILED )
187 return wxCONV_FAILED;
188
189 // if there are enough NULs we can avoid the copy
190 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
191 {
192 // make a copy in order to properly NUL-terminate the string
193 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
194 char * const p = bufTmp.data();
195 memcpy(p, src, srcLen);
196 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
197 *s = '\0';
198
199 src = bufTmp;
200 }
201
202 srcEnd = src + srcLen;
203 }
204 else // quit after the first loop iteration
205 {
206 srcEnd = NULL;
207 }
208
209 // the idea of this code is straightforward: it converts a NUL-terminated
210 // chunk of the string during each iteration and updates the output buffer
211 // with the result
212 //
213 // all the complication come from the fact that this function, for
214 // historical reasons, must behave in 2 subtly different ways when it's
215 // called with a fixed number of characters and when it's called for the
216 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
217 // must count all characters we convert, NUL or not; but in the latter we
218 // do not count the trailing NUL -- but still count all the NULs inside the
219 // string
220 //
221 // so for the (simple) former case we just always count the trailing NUL,
222 // but for the latter we need to wait until we see if there is going to be
223 // another loop iteration and only count it then
224 for ( ;; )
225 {
226 // try to convert the current chunk
227 size_t lenChunk = MB2WC(NULL, src, 0);
228 if ( lenChunk == wxCONV_FAILED )
229 return wxCONV_FAILED;
230
231 dstWritten += lenChunk;
232 if ( !srcEnd )
233 dstWritten++;
234
235 if ( !lenChunk )
236 {
237 // nothing left in the input string, conversion succeeded
238 break;
239 }
240
241 if ( dst )
242 {
243 if ( dstWritten > dstLen )
244 return wxCONV_FAILED;
245
246 // +1 is for trailing NUL
247 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
248 return wxCONV_FAILED;
249
250 dst += lenChunk;
251 if ( !srcEnd )
252 dst++;
253 }
254
255 if ( !srcEnd )
256 {
257 // we convert just one chunk in this case as this is the entire
258 // string anyhow (and we don't count the trailing NUL in this case)
259 break;
260 }
261
262 // advance the input pointer past the end of this chunk: notice that we
263 // will always stop before srcEnd because we know that the chunk is
264 // always properly NUL-terminated
265 while ( NotAllNULs(src, nulLen) )
266 {
267 // notice that we must skip over multiple bytes here as we suppose
268 // that if NUL takes 2 or 4 bytes, then all the other characters do
269 // too and so if advanced by a single byte we might erroneously
270 // detect sequences of NUL bytes in the middle of the input
271 src += nulLen;
272 }
273
274 // if the buffer ends before this NUL, we shouldn't count it in our
275 // output so skip the code below
276 if ( src == srcEnd )
277 break;
278
279 // do count this terminator as it's inside the buffer we convert
280 dstWritten++;
281 if ( dst )
282 dst++;
283
284 src += nulLen; // skip the terminator itself
285
286 if ( src >= srcEnd )
287 break;
288 }
289
290 return dstWritten;
291 }
292
293 size_t
294 wxMBConv::FromWChar(char *dst, size_t dstLen,
295 const wchar_t *src, size_t srcLen) const
296 {
297 // the number of chars [which would be] written to dst [if it were not NULL]
298 size_t dstWritten = 0;
299
300 // if we don't know its length we have no choice but to assume that it is
301 // NUL-terminated (notice that it can still be NUL-terminated even if
302 // explicit length is given but it doesn't change our return value)
303 const bool isNulTerminated = srcLen == wxNO_LEN;
304
305 // make a copy of the input string unless it is already properly
306 // NUL-terminated
307 wxWCharBuffer bufTmp;
308 if ( isNulTerminated )
309 {
310 srcLen = wxWcslen(src) + 1;
311 }
312 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
313 {
314 // make a copy in order to properly NUL-terminate the string
315 bufTmp = wxWCharBuffer(srcLen);
316 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
317 src = bufTmp;
318 }
319
320 const size_t lenNul = GetMBNulLen();
321 for ( const wchar_t * const srcEnd = src + srcLen;
322 src < srcEnd;
323 src++ /* skip L'\0' too */ )
324 {
325 // try to convert the current chunk
326 size_t lenChunk = WC2MB(NULL, src, 0);
327 if ( lenChunk == wxCONV_FAILED )
328 return wxCONV_FAILED;
329
330 dstWritten += lenChunk;
331
332 const wchar_t * const
333 chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
334
335 // our return value accounts for the trailing NUL(s), unlike that of
336 // WC2MB(), however don't do it for the last NUL we artificially added
337 // ourselves above
338 if ( chunkEnd < srcEnd )
339 dstWritten += lenNul;
340
341 if ( dst )
342 {
343 if ( dstWritten > dstLen )
344 return wxCONV_FAILED;
345
346 // if we know that there is enough space in the destination buffer
347 // (because we accounted for lenNul in dstWritten above), we can
348 // convert directly in place -- but otherwise we need another
349 // temporary buffer to ensure that we don't overwrite the output
350 wxCharBuffer dstBuf;
351 char *dstTmp;
352 if ( chunkEnd == srcEnd )
353 {
354 dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
355 dstTmp = dstBuf.data();
356 }
357 else
358 {
359 dstTmp = dst;
360 }
361
362 if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
363 return wxCONV_FAILED;
364
365 if ( dstTmp != dst )
366 {
367 // copy everything up to but excluding the terminating NUL(s)
368 // into the real output buffer
369 memcpy(dst, dstTmp, lenChunk);
370
371 // micro-optimization: if dstTmp != dst it means that chunkEnd
372 // == srcEnd and so we're done, no need to update anything below
373 break;
374 }
375
376 dst += lenChunk;
377 if ( chunkEnd < srcEnd )
378 dst += lenNul;
379 }
380
381 src = chunkEnd;
382 }
383
384 return dstWritten;
385 }
386
387 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
388 {
389 size_t rc = ToWChar(outBuff, outLen, inBuff);
390 if ( rc != wxCONV_FAILED )
391 {
392 // ToWChar() returns the buffer length, i.e. including the trailing
393 // NUL, while this method doesn't take it into account
394 rc--;
395 }
396
397 return rc;
398 }
399
400 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
401 {
402 size_t rc = FromWChar(outBuff, outLen, inBuff);
403 if ( rc != wxCONV_FAILED )
404 {
405 rc -= GetMBNulLen();
406 }
407
408 return rc;
409 }
410
411 wxMBConv::~wxMBConv()
412 {
413 // nothing to do here (necessary for Darwin linking probably)
414 }
415
416 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
417 {
418 if ( psz )
419 {
420 // calculate the length of the buffer needed first
421 const size_t nLen = ToWChar(NULL, 0, psz);
422 if ( nLen != wxCONV_FAILED )
423 {
424 // now do the actual conversion
425 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
426
427 // +1 for the trailing NULL
428 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
429 return buf;
430 }
431 }
432
433 return wxWCharBuffer();
434 }
435
436 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
437 {
438 if ( pwz )
439 {
440 const size_t nLen = FromWChar(NULL, 0, pwz);
441 if ( nLen != wxCONV_FAILED )
442 {
443 wxCharBuffer buf(nLen - 1);
444 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
445 return buf;
446 }
447 }
448
449 return wxCharBuffer();
450 }
451
452 const wxWCharBuffer
453 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
454 {
455 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
456 if ( dstLen != wxCONV_FAILED )
457 {
458 // notice that we allocate space for dstLen+1 wide characters here
459 // because we want the buffer to always be NUL-terminated, even if the
460 // input isn't (as otherwise the caller has no way to know its length)
461 wxWCharBuffer wbuf(dstLen);
462 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
463 {
464 if ( outLen )
465 {
466 *outLen = dstLen;
467
468 // we also need to handle NUL-terminated input strings
469 // specially: for them the output is the length of the string
470 // excluding the trailing NUL, however if we're asked to
471 // convert a specific number of characters we return the length
472 // of the resulting output even if it's NUL-terminated
473 if ( inLen == wxNO_LEN )
474 (*outLen)--;
475 }
476
477 return wbuf;
478 }
479 }
480
481 if ( outLen )
482 *outLen = 0;
483
484 return wxWCharBuffer();
485 }
486
487 const wxCharBuffer
488 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
489 {
490 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
491 if ( dstLen != wxCONV_FAILED )
492 {
493 const size_t nulLen = GetMBNulLen();
494
495 // as above, ensure that the buffer is always NUL-terminated, even if
496 // the input is not
497 wxCharBuffer buf(dstLen + nulLen - 1);
498 memset(buf.data() + dstLen, 0, nulLen);
499 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
500 {
501 if ( outLen )
502 {
503 *outLen = dstLen;
504
505 if ( inLen == wxNO_LEN )
506 {
507 // in this case both input and output are NUL-terminated
508 // and we're not supposed to count NUL
509 *outLen -= nulLen;
510 }
511 }
512
513 return buf;
514 }
515 }
516
517 if ( outLen )
518 *outLen = 0;
519
520 return wxCharBuffer();
521 }
522
523 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
524 {
525 const size_t srcLen = buf.length();
526 if ( srcLen )
527 {
528 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
529 if ( dstLen != wxCONV_FAILED )
530 {
531 wxWCharBuffer wbuf(dstLen);
532 wbuf.data()[dstLen] = L'\0';
533 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
534 return wbuf;
535 }
536 }
537
538 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
539 }
540
541 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
542 {
543 const size_t srcLen = wbuf.length();
544 if ( srcLen )
545 {
546 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
547 if ( dstLen != wxCONV_FAILED )
548 {
549 wxCharBuffer buf(dstLen);
550 buf.data()[dstLen] = '\0';
551 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
552 return buf;
553 }
554 }
555
556 return wxScopedCharBuffer::CreateNonOwned("", 0);
557 }
558
559 // ----------------------------------------------------------------------------
560 // wxMBConvLibc
561 // ----------------------------------------------------------------------------
562
563 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
564 {
565 return wxMB2WC(buf, psz, n);
566 }
567
568 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
569 {
570 return wxWC2MB(buf, psz, n);
571 }
572
573 // ----------------------------------------------------------------------------
574 // wxConvBrokenFileNames
575 // ----------------------------------------------------------------------------
576
577 #ifdef __UNIX__
578
579 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
580 {
581 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
582 wxStricmp(charset, wxT("UTF8")) == 0 )
583 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
584 else
585 m_conv = new wxCSConv(charset);
586 }
587
588 #endif // __UNIX__
589
590 // ----------------------------------------------------------------------------
591 // UTF-7
592 // ----------------------------------------------------------------------------
593
594 // Implementation (C) 2004 Fredrik Roubert
595 //
596 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
597
//
// BASE64 decoding table: maps a byte to the 6 bit value it stands for or to
// 0xff if the byte is not a character of the base64 alphabet
//
static const unsigned char utf7unb64[] =
{
    // 00..0F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 10..1F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 20..2F: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 30..3F: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 40..4F: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 50..5F: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 60..6F: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 70..7F: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 80..FF: no 8 bit byte is valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
636
637 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
638 const char *src, size_t srcLen) const
639 {
640 DecoderState stateOrig,
641 *statePtr;
642 if ( srcLen == wxNO_LEN )
643 {
644 // convert the entire string, up to and including the trailing NUL
645 srcLen = strlen(src) + 1;
646
647 // when working on the entire strings we don't update nor use the shift
648 // state from the previous call
649 statePtr = &stateOrig;
650 }
651 else // when working with partial strings we do use the shift state
652 {
653 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
654
655 // also save the old state to be able to rollback to it on error
656 stateOrig = m_stateDecoder;
657 }
658
659 // but to simplify the code below we use this variable in both cases
660 DecoderState& state = *statePtr;
661
662
663 // number of characters [which would have been] written to dst [if it were
664 // not NULL]
665 size_t len = 0;
666
667 const char * const srcEnd = src + srcLen;
668
669 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
670 {
671 const unsigned char cc = *src++;
672
673 if ( state.IsShifted() )
674 {
675 const unsigned char dc = utf7unb64[cc];
676 if ( dc == 0xff )
677 {
678 // end of encoded part, check that nothing was left: there can
679 // be up to 4 bits of 0 padding but nothing else (we also need
680 // to check isLSB as we count bits modulo 8 while a valid UTF-7
681 // encoded sequence must contain an integral number of UTF-16
682 // characters)
683 if ( state.isLSB || state.bit > 4 ||
684 (state.accum & ((1 << state.bit) - 1)) )
685 {
686 if ( !len )
687 state = stateOrig;
688
689 return wxCONV_FAILED;
690 }
691
692 state.ToDirect();
693
694 // re-parse this character normally below unless it's '-' which
695 // is consumed by the decoder
696 if ( cc == '-' )
697 continue;
698 }
699 else // valid encoded character
700 {
701 // mini base64 decoder: each character is 6 bits
702 state.bit += 6;
703 state.accum <<= 6;
704 state.accum += dc;
705
706 if ( state.bit >= 8 )
707 {
708 // got the full byte, consume it
709 state.bit -= 8;
710 unsigned char b = (state.accum >> state.bit) & 0x00ff;
711
712 if ( state.isLSB )
713 {
714 // we've got the full word, output it
715 if ( dst )
716 *dst++ = (state.msb << 8) | b;
717 len++;
718 state.isLSB = false;
719 }
720 else // MSB
721 {
722 // just store it while we wait for LSB
723 state.msb = b;
724 state.isLSB = true;
725 }
726 }
727 }
728 }
729
730 if ( state.IsDirect() )
731 {
732 // start of an encoded segment?
733 if ( cc == '+' )
734 {
735 if ( *src == '-' )
736 {
737 // just the encoded plus sign, don't switch to shifted mode
738 if ( dst )
739 *dst++ = '+';
740 len++;
741 src++;
742 }
743 else if ( utf7unb64[(unsigned)*src] == 0xff )
744 {
745 // empty encoded chunks are not allowed
746 if ( !len )
747 state = stateOrig;
748
749 return wxCONV_FAILED;
750 }
751 else // base-64 encoded chunk follows
752 {
753 state.ToShifted();
754 }
755 }
756 else // not '+'
757 {
758 // only printable 7 bit ASCII characters (with the exception of
759 // NUL, TAB, CR and LF) can be used directly
760 if ( cc >= 0x7f || (cc < ' ' &&
761 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
762 return wxCONV_FAILED;
763
764 if ( dst )
765 *dst++ = cc;
766 len++;
767 }
768 }
769 }
770
771 if ( !len )
772 {
773 // as we didn't read any characters we should be called with the same
774 // data (followed by some more new data) again later so don't save our
775 // state
776 state = stateOrig;
777
778 return wxCONV_FAILED;
779 }
780
781 return len;
782 }
783
//
// BASE64 encoding table: the character used for each of the 64 possible
// values of a 6 bit group
//
static const unsigned char utf7enb64[] =
{
    // 0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26..51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52..61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62, 63
    '+', '/'
};
798
//
// UTF-7 encoding table classifying each 7 bit ASCII character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 70..7F
};

// Returns true if wc is a character of class 0 in the table above, i.e. one
// which is written as is in UTF-7 output, and false for any other character,
// including all those outside of the 7 bit ASCII range.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // The range check must be done on an unsigned value: wchar_t is a signed
    // type on some platforms and a negative character would otherwise pass
    // the "< 0x80" test and be used as a negative index into the table.
    return static_cast<unsigned long>(wc) < 0x80 && utf7encode[wc] < 1;
}
823
824 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
825 const wchar_t *src, size_t srcLen) const
826 {
827 EncoderState stateOrig,
828 *statePtr;
829 if ( srcLen == wxNO_LEN )
830 {
831 // we don't apply the stored state when operating on entire strings at
832 // once
833 statePtr = &stateOrig;
834
835 srcLen = wxWcslen(src) + 1;
836 }
837 else // do use the mode we left the output in previously
838 {
839 stateOrig = m_stateEncoder;
840 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
841 }
842
843 EncoderState& state = *statePtr;
844
845
846 size_t len = 0;
847
848 const wchar_t * const srcEnd = src + srcLen;
849 while ( src < srcEnd && (!dst || len < dstLen) )
850 {
851 wchar_t cc = *src++;
852 if ( wxIsUTF7Direct(cc) )
853 {
854 if ( state.IsShifted() )
855 {
856 // pad with zeros the last encoded block if necessary
857 if ( state.bit )
858 {
859 if ( dst )
860 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
861 len++;
862 }
863
864 state.ToDirect();
865
866 if ( dst )
867 *dst++ = '-';
868 len++;
869 }
870
871 if ( dst )
872 *dst++ = (char)cc;
873 len++;
874 }
875 else if ( cc == '+' && state.IsDirect() )
876 {
877 if ( dst )
878 {
879 *dst++ = '+';
880 *dst++ = '-';
881 }
882
883 len += 2;
884 }
885 #ifndef WC_UTF16
886 else if (((wxUint32)cc) > 0xffff)
887 {
888 // no surrogate pair generation (yet?)
889 return wxCONV_FAILED;
890 }
891 #endif
892 else
893 {
894 if ( state.IsDirect() )
895 {
896 state.ToShifted();
897
898 if ( dst )
899 *dst++ = '+';
900 len++;
901 }
902
903 // BASE64 encode string
904 for ( ;; )
905 {
906 for ( unsigned lsb = 0; lsb < 2; lsb++ )
907 {
908 state.accum <<= 8;
909 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
910
911 for (state.bit += 8; state.bit >= 6; )
912 {
913 state.bit -= 6;
914 if ( dst )
915 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
916 len++;
917 }
918 }
919
920 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
921 break;
922
923 src++;
924 }
925 }
926 }
927
928 // we need to restore the original encoder state if we were called just to
929 // calculate the amount of space needed as we will presumably be called
930 // again to really convert the data now
931 if ( !dst )
932 state = stateOrig;
933
934 return len;
935 }
936
937 // ----------------------------------------------------------------------------
938 // UTF-8
939 // ----------------------------------------------------------------------------
940
941 static const wxUint32 utf8_max[]=
942 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
943
944 // boundaries of the private use area we use to (temporarily) remap invalid
945 // characters invalid in a UTF-8 encoded string
946 const wxUint32 wxUnicodePUA = 0x100000;
947 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
948
// Length of a UTF-8 sequence indexed by its first byte, 0 meaning that the
// byte can't start a sequence: this is the case for the continuation bytes
// 80..BF, for C0 and C1, and for F5..FF (5- or 6-byte sequences and sequences
// for code points above U+10FFFF, as restricted by RFC 3629).
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid, the rest start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: two-byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: four-byte sequences for F0..F4, F5..FF are invalid
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
983
984 size_t
985 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
986 const char *src, size_t srcLen) const
987 {
988 wchar_t *out = dstLen ? dst : NULL;
989 size_t written = 0;
990
991 if ( srcLen == wxNO_LEN )
992 srcLen = strlen(src) + 1;
993
994 for ( const char *p = src; ; p++ )
995 {
996 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
997 {
998 // all done successfully, just add the trailing NULL if we are not
999 // using explicit length
1000 if ( srcLen == wxNO_LEN )
1001 {
1002 if ( out )
1003 {
1004 if ( !dstLen )
1005 break;
1006
1007 *out = L'\0';
1008 }
1009
1010 written++;
1011 }
1012
1013 return written;
1014 }
1015
1016 if ( out && !dstLen-- )
1017 break;
1018
1019 wxUint32 code;
1020 unsigned char c = *p;
1021
1022 if ( c < 0x80 )
1023 {
1024 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1025 break;
1026
1027 if ( srcLen != wxNO_LEN )
1028 srcLen--;
1029
1030 code = c;
1031 }
1032 else
1033 {
1034 unsigned len = tableUtf8Lengths[c];
1035 if ( !len )
1036 break;
1037
1038 if ( srcLen < len ) // the test works for wxNO_LEN too
1039 break;
1040
1041 if ( srcLen != wxNO_LEN )
1042 srcLen -= len;
1043
1044 // Char. number range | UTF-8 octet sequence
1045 // (hexadecimal) | (binary)
1046 // ----------------------+----------------------------------------
1047 // 0000 0000 - 0000 007F | 0xxxxxxx
1048 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1049 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1050 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1051 //
1052 // Code point value is stored in bits marked with 'x',
1053 // lowest-order bit of the value on the right side in the diagram
1054 // above. (from RFC 3629)
1055
1056 // mask to extract lead byte's value ('x' bits above), by sequence
1057 // length:
1058 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1059
1060 // mask and value of lead byte's most significant bits, by length:
1061 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1062 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1063
1064 len--; // it's more convenient to work with 0-based length here
1065
1066 // extract the lead byte's value bits:
1067 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1068 break;
1069
1070 code = c & leadValueMask[len];
1071
1072 // all remaining bytes, if any, are handled in the same way
1073 // regardless of sequence's length:
1074 for ( ; len; --len )
1075 {
1076 c = *++p;
1077 if ( (c & 0xC0) != 0x80 )
1078 return wxCONV_FAILED;
1079
1080 code <<= 6;
1081 code |= c & 0x3F;
1082 }
1083 }
1084
1085 #ifdef WC_UTF16
1086 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1087 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1088 {
1089 if ( out )
1090 out++;
1091 written++;
1092 }
1093 #else // !WC_UTF16
1094 if ( out )
1095 *out = code;
1096 #endif // WC_UTF16/!WC_UTF16
1097
1098 if ( out )
1099 out++;
1100
1101 written++;
1102 }
1103
1104 return wxCONV_FAILED;
1105 }
1106
1107 size_t
1108 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1109 const wchar_t *src, size_t srcLen) const
1110 {
1111 char *out = dstLen ? dst : NULL;
1112 size_t written = 0;
1113
1114 for ( const wchar_t *wp = src; ; wp++ )
1115 {
1116 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
1117 {
1118 // all done successfully, just add the trailing NULL if we are not
1119 // using explicit length
1120 if ( srcLen == wxNO_LEN )
1121 {
1122 if ( out )
1123 {
1124 if ( !dstLen )
1125 break;
1126
1127 *out = '\0';
1128 }
1129
1130 written++;
1131 }
1132
1133 return written;
1134 }
1135
1136 if ( srcLen != wxNO_LEN )
1137 srcLen--;
1138
1139 wxUint32 code;
1140 #ifdef WC_UTF16
1141 // cast is ok for WC_UTF16
1142 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1143 {
1144 // skip the next char too as we decoded a surrogate
1145 wp++;
1146 if ( srcLen != wxNO_LEN )
1147 srcLen--;
1148 }
1149 #else // wchar_t is UTF-32
1150 code = *wp & 0x7fffffff;
1151 #endif
1152
1153 unsigned len;
1154 if ( code <= 0x7F )
1155 {
1156 len = 1;
1157 if ( out )
1158 {
1159 if ( dstLen < len )
1160 break;
1161
1162 out[0] = (char)code;
1163 }
1164 }
1165 else if ( code <= 0x07FF )
1166 {
1167 len = 2;
1168 if ( out )
1169 {
1170 if ( dstLen < len )
1171 break;
1172
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1176 out[0] = 0xC0 | code;
1177 }
1178 }
1179 else if ( code < 0xFFFF )
1180 {
1181 len = 3;
1182 if ( out )
1183 {
1184 if ( dstLen < len )
1185 break;
1186
1187 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1188 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[0] = 0xE0 | code;
1190 }
1191 }
1192 else if ( code <= 0x10FFFF )
1193 {
1194 len = 4;
1195 if ( out )
1196 {
1197 if ( dstLen < len )
1198 break;
1199
1200 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1201 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[0] = 0xF0 | code;
1204 }
1205 }
1206 else
1207 {
1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1209 break;
1210 }
1211
1212 if ( out )
1213 {
1214 out += len;
1215 dstLen -= len;
1216 }
1217
1218 written += len;
1219 }
1220
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED;
1223 }
1224
1225 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1226 const char *psz, size_t srcLen) const
1227 {
1228 if ( m_options == MAP_INVALID_UTF8_NOT )
1229 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1230
1231 size_t len = 0;
1232
1233 // The length can be either given explicitly or computed implicitly for the
1234 // NUL-terminated strings.
1235 const bool isNulTerminated = srcLen == wxNO_LEN;
1236 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1237 {
1238 const char *opsz = psz;
1239 bool invalid = false;
1240 unsigned char cc = *psz++, fc = cc;
1241 unsigned cnt;
1242 for (cnt = 0; fc & 0x80; cnt++)
1243 fc <<= 1;
1244
1245 if (!cnt)
1246 {
1247 // plain ASCII char
1248 if (buf)
1249 *buf++ = cc;
1250 len++;
1251
1252 // escape the escape character for octal escapes
1253 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1254 && cc == '\\' && (!buf || len < n))
1255 {
1256 if (buf)
1257 *buf++ = cc;
1258 len++;
1259 }
1260 }
1261 else
1262 {
1263 cnt--;
1264 if (!cnt)
1265 {
1266 // invalid UTF-8 sequence
1267 invalid = true;
1268 }
1269 else
1270 {
1271 unsigned ocnt = cnt - 1;
1272 wxUint32 res = cc & (0x3f >> cnt);
1273 while (cnt--)
1274 {
1275 cc = *psz;
1276 if ((cc & 0xC0) != 0x80)
1277 {
1278 // invalid UTF-8 sequence
1279 invalid = true;
1280 break;
1281 }
1282
1283 psz++;
1284 res = (res << 6) | (cc & 0x3f);
1285 }
1286
1287 if (invalid || res <= utf8_max[ocnt])
1288 {
1289 // illegal UTF-8 encoding
1290 invalid = true;
1291 }
1292 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1293 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1294 {
1295 // if one of our PUA characters turns up externally
1296 // it must also be treated as an illegal sequence
1297 // (a bit like you have to escape an escape character)
1298 invalid = true;
1299 }
1300 else
1301 {
1302 #ifdef WC_UTF16
1303 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1304 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1305 if (pa == wxCONV_FAILED)
1306 {
1307 invalid = true;
1308 }
1309 else
1310 {
1311 if (buf)
1312 buf += pa;
1313 len += pa;
1314 }
1315 #else // !WC_UTF16
1316 if (buf)
1317 *buf++ = (wchar_t)res;
1318 len++;
1319 #endif // WC_UTF16/!WC_UTF16
1320 }
1321 }
1322
1323 if (invalid)
1324 {
1325 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1326 {
1327 while (opsz < psz && (!buf || len < n))
1328 {
1329 #ifdef WC_UTF16
1330 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1331 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1332 wxASSERT(pa != wxCONV_FAILED);
1333 if (buf)
1334 buf += pa;
1335 opsz++;
1336 len += pa;
1337 #else
1338 if (buf)
1339 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1340 opsz++;
1341 len++;
1342 #endif
1343 }
1344 }
1345 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1346 {
1347 while (opsz < psz && (!buf || len < n))
1348 {
1349 if ( buf && len + 3 < n )
1350 {
1351 unsigned char on = *opsz;
1352 *buf++ = L'\\';
1353 *buf++ = (wchar_t)( L'0' + on / 0100 );
1354 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1355 *buf++ = (wchar_t)( L'0' + on % 010 );
1356 }
1357
1358 opsz++;
1359 len += 4;
1360 }
1361 }
1362 else // MAP_INVALID_UTF8_NOT
1363 {
1364 return wxCONV_FAILED;
1365 }
1366 }
1367 }
1368 }
1369
1370 if ( isNulTerminated )
1371 {
1372 // Add the trailing NUL in this case if we have a large enough buffer.
1373 if ( buf && (len < n) )
1374 *buf = 0;
1375
1376 // And count it in any case.
1377 len++;
1378 }
1379
1380 return len;
1381 }
1382
// Return true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1387
1388 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1389 const wchar_t *psz, size_t srcLen) const
1390 {
1391 if ( m_options == MAP_INVALID_UTF8_NOT )
1392 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1393
1394 size_t len = 0;
1395
1396 // The length can be either given explicitly or computed implicitly for the
1397 // NUL-terminated strings.
1398 const bool isNulTerminated = srcLen == wxNO_LEN;
1399 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1400 {
1401 wxUint32 cc;
1402
1403 #ifdef WC_UTF16
1404 // cast is ok for WC_UTF16
1405 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1406 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1407 #else
1408 cc = (*psz++) & 0x7fffffff;
1409 #endif
1410
1411 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1412 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1413 {
1414 if (buf)
1415 *buf++ = (char)(cc - wxUnicodePUA);
1416 len++;
1417 }
1418 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1419 && cc == L'\\' && psz[0] == L'\\' )
1420 {
1421 if (buf)
1422 *buf++ = (char)cc;
1423 psz++;
1424 len++;
1425 }
1426 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1427 cc == L'\\' &&
1428 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1429 {
1430 if (buf)
1431 {
1432 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1433 (psz[1] - L'0') * 010 +
1434 (psz[2] - L'0'));
1435 }
1436
1437 psz += 3;
1438 len++;
1439 }
1440 else
1441 {
1442 unsigned cnt;
1443 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1444 {
1445 }
1446
1447 if (!cnt)
1448 {
1449 // plain ASCII char
1450 if (buf)
1451 *buf++ = (char) cc;
1452 len++;
1453 }
1454 else
1455 {
1456 len += cnt + 1;
1457 if (buf)
1458 {
1459 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1460 while (cnt--)
1461 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1462 }
1463 }
1464 }
1465 }
1466
1467 if ( isNulTerminated )
1468 {
1469 // Add the trailing NUL in this case if we have a large enough buffer.
1470 if ( buf && (len < n) )
1471 *buf = 0;
1472
1473 // And count it in any case.
1474 len++;
1475 }
1476
1477 return len;
1478 }
1479
1480 // ============================================================================
1481 // UTF-16
1482 // ============================================================================
1483
1484 #ifdef WORDS_BIGENDIAN
1485 #define wxMBConvUTF16straight wxMBConvUTF16BE
1486 #define wxMBConvUTF16swap wxMBConvUTF16LE
1487 #else
1488 #define wxMBConvUTF16swap wxMBConvUTF16BE
1489 #define wxMBConvUTF16straight wxMBConvUTF16LE
1490 #endif
1491
1492 /* static */
1493 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1494 {
1495 if ( srcLen == wxNO_LEN )
1496 {
1497 // count the number of bytes in input, including the trailing NULs
1498 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1499 for ( srcLen = 1; *inBuff++; srcLen++ )
1500 ;
1501
1502 srcLen *= BYTES_PER_CHAR;
1503 }
1504 else // we already have the length
1505 {
1506 // we can only convert an entire number of UTF-16 characters
1507 if ( srcLen % BYTES_PER_CHAR )
1508 return wxCONV_FAILED;
1509 }
1510
1511 return srcLen;
1512 }
1513
1514 // case when in-memory representation is UTF-16 too
1515 #ifdef WC_UTF16
1516
1517 // ----------------------------------------------------------------------------
1518 // conversions without endianness change
1519 // ----------------------------------------------------------------------------
1520
1521 size_t
1522 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1523 const char *src, size_t srcLen) const
1524 {
1525 // set up the scene for using memcpy() (which is presumably more efficient
1526 // than copying the bytes one by one)
1527 srcLen = GetLength(src, srcLen);
1528 if ( srcLen == wxNO_LEN )
1529 return wxCONV_FAILED;
1530
1531 const size_t inLen = srcLen / BYTES_PER_CHAR;
1532 if ( dst )
1533 {
1534 if ( dstLen < inLen )
1535 return wxCONV_FAILED;
1536
1537 memcpy(dst, src, srcLen);
1538 }
1539
1540 return inLen;
1541 }
1542
1543 size_t
1544 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1545 const wchar_t *src, size_t srcLen) const
1546 {
1547 if ( srcLen == wxNO_LEN )
1548 srcLen = wxWcslen(src) + 1;
1549
1550 srcLen *= BYTES_PER_CHAR;
1551
1552 if ( dst )
1553 {
1554 if ( dstLen < srcLen )
1555 return wxCONV_FAILED;
1556
1557 memcpy(dst, src, srcLen);
1558 }
1559
1560 return srcLen;
1561 }
1562
1563 // ----------------------------------------------------------------------------
1564 // endian-reversing conversions
1565 // ----------------------------------------------------------------------------
1566
1567 size_t
1568 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1569 const char *src, size_t srcLen) const
1570 {
1571 srcLen = GetLength(src, srcLen);
1572 if ( srcLen == wxNO_LEN )
1573 return wxCONV_FAILED;
1574
1575 srcLen /= BYTES_PER_CHAR;
1576
1577 if ( dst )
1578 {
1579 if ( dstLen < srcLen )
1580 return wxCONV_FAILED;
1581
1582 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1583 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1584 {
1585 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1586 }
1587 }
1588
1589 return srcLen;
1590 }
1591
1592 size_t
1593 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1594 const wchar_t *src, size_t srcLen) const
1595 {
1596 if ( srcLen == wxNO_LEN )
1597 srcLen = wxWcslen(src) + 1;
1598
1599 srcLen *= BYTES_PER_CHAR;
1600
1601 if ( dst )
1602 {
1603 if ( dstLen < srcLen )
1604 return wxCONV_FAILED;
1605
1606 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1607 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1608 {
1609 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1610 }
1611 }
1612
1613 return srcLen;
1614 }
1615
1616 #else // !WC_UTF16: wchar_t is UTF-32
1617
1618 // ----------------------------------------------------------------------------
1619 // conversions without endianness change
1620 // ----------------------------------------------------------------------------
1621
1622 size_t
1623 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1624 const char *src, size_t srcLen) const
1625 {
1626 srcLen = GetLength(src, srcLen);
1627 if ( srcLen == wxNO_LEN )
1628 return wxCONV_FAILED;
1629
1630 const size_t inLen = srcLen / BYTES_PER_CHAR;
1631 if ( !dst )
1632 {
1633 // optimization: return maximal space which could be needed for this
1634 // string even if the real size could be smaller if the buffer contains
1635 // any surrogates
1636 return inLen;
1637 }
1638
1639 size_t outLen = 0;
1640 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1641 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1642 {
1643 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1644 if ( !inBuff )
1645 return wxCONV_FAILED;
1646
1647 if ( ++outLen > dstLen )
1648 return wxCONV_FAILED;
1649
1650 *dst++ = ch;
1651 }
1652
1653
1654 return outLen;
1655 }
1656
1657 size_t
1658 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1659 const wchar_t *src, size_t srcLen) const
1660 {
1661 if ( srcLen == wxNO_LEN )
1662 srcLen = wxWcslen(src) + 1;
1663
1664 size_t outLen = 0;
1665 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1666 for ( size_t n = 0; n < srcLen; n++ )
1667 {
1668 wxUint16 cc[2] = { 0 };
1669 const size_t numChars = encode_utf16(*src++, cc);
1670 if ( numChars == wxCONV_FAILED )
1671 return wxCONV_FAILED;
1672
1673 outLen += numChars * BYTES_PER_CHAR;
1674 if ( outBuff )
1675 {
1676 if ( outLen > dstLen )
1677 return wxCONV_FAILED;
1678
1679 *outBuff++ = cc[0];
1680 if ( numChars == 2 )
1681 {
1682 // second character of a surrogate
1683 *outBuff++ = cc[1];
1684 }
1685 }
1686 }
1687
1688 return outLen;
1689 }
1690
1691 // ----------------------------------------------------------------------------
1692 // endian-reversing conversions
1693 // ----------------------------------------------------------------------------
1694
1695 size_t
1696 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1697 const char *src, size_t srcLen) const
1698 {
1699 srcLen = GetLength(src, srcLen);
1700 if ( srcLen == wxNO_LEN )
1701 return wxCONV_FAILED;
1702
1703 const size_t inLen = srcLen / BYTES_PER_CHAR;
1704 if ( !dst )
1705 {
1706 // optimization: return maximal space which could be needed for this
1707 // string even if the real size could be smaller if the buffer contains
1708 // any surrogates
1709 return inLen;
1710 }
1711
1712 size_t outLen = 0;
1713 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1714 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1715 {
1716 wxUint32 ch;
1717 wxUint16 tmp[2];
1718
1719 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1720 inBuff++;
1721 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1722
1723 const size_t numChars = decode_utf16(tmp, ch);
1724 if ( numChars == wxCONV_FAILED )
1725 return wxCONV_FAILED;
1726
1727 if ( numChars == 2 )
1728 inBuff++;
1729
1730 if ( ++outLen > dstLen )
1731 return wxCONV_FAILED;
1732
1733 *dst++ = ch;
1734 }
1735
1736
1737 return outLen;
1738 }
1739
1740 size_t
1741 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1742 const wchar_t *src, size_t srcLen) const
1743 {
1744 if ( srcLen == wxNO_LEN )
1745 srcLen = wxWcslen(src) + 1;
1746
1747 size_t outLen = 0;
1748 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1749 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1750 {
1751 wxUint16 cc[2] = { 0 };
1752 const size_t numChars = encode_utf16(*src, cc);
1753 if ( numChars == wxCONV_FAILED )
1754 return wxCONV_FAILED;
1755
1756 outLen += numChars * BYTES_PER_CHAR;
1757 if ( outBuff )
1758 {
1759 if ( outLen > dstLen )
1760 return wxCONV_FAILED;
1761
1762 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1763 if ( numChars == 2 )
1764 {
1765 // second character of a surrogate
1766 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1767 }
1768 }
1769 }
1770
1771 return outLen;
1772 }
1773
1774 #endif // WC_UTF16/!WC_UTF16
1775
1776
1777 // ============================================================================
1778 // UTF-32
1779 // ============================================================================
1780
1781 #ifdef WORDS_BIGENDIAN
1782 #define wxMBConvUTF32straight wxMBConvUTF32BE
1783 #define wxMBConvUTF32swap wxMBConvUTF32LE
1784 #else
1785 #define wxMBConvUTF32swap wxMBConvUTF32BE
1786 #define wxMBConvUTF32straight wxMBConvUTF32LE
1787 #endif
1788
1789
1790 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1791 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1792
1793 /* static */
1794 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1795 {
1796 if ( srcLen == wxNO_LEN )
1797 {
1798 // count the number of bytes in input, including the trailing NULs
1799 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1800 for ( srcLen = 1; *inBuff++; srcLen++ )
1801 ;
1802
1803 srcLen *= BYTES_PER_CHAR;
1804 }
1805 else // we already have the length
1806 {
1807 // we can only convert an entire number of UTF-32 characters
1808 if ( srcLen % BYTES_PER_CHAR )
1809 return wxCONV_FAILED;
1810 }
1811
1812 return srcLen;
1813 }
1814
1815 // case when in-memory representation is UTF-16
1816 #ifdef WC_UTF16
1817
1818 // ----------------------------------------------------------------------------
1819 // conversions without endianness change
1820 // ----------------------------------------------------------------------------
1821
1822 size_t
1823 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1824 const char *src, size_t srcLen) const
1825 {
1826 srcLen = GetLength(src, srcLen);
1827 if ( srcLen == wxNO_LEN )
1828 return wxCONV_FAILED;
1829
1830 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1831 const size_t inLen = srcLen / BYTES_PER_CHAR;
1832 size_t outLen = 0;
1833 for ( size_t n = 0; n < inLen; n++ )
1834 {
1835 wxUint16 cc[2] = { 0 };
1836 const size_t numChars = encode_utf16(*inBuff++, cc);
1837 if ( numChars == wxCONV_FAILED )
1838 return wxCONV_FAILED;
1839
1840 outLen += numChars;
1841 if ( dst )
1842 {
1843 if ( outLen > dstLen )
1844 return wxCONV_FAILED;
1845
1846 *dst++ = cc[0];
1847 if ( numChars == 2 )
1848 {
1849 // second character of a surrogate
1850 *dst++ = cc[1];
1851 }
1852 }
1853 }
1854
1855 return outLen;
1856 }
1857
1858 size_t
1859 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1860 const wchar_t *src, size_t srcLen) const
1861 {
1862 if ( srcLen == wxNO_LEN )
1863 srcLen = wxWcslen(src) + 1;
1864
1865 if ( !dst )
1866 {
1867 // optimization: return maximal space which could be needed for this
1868 // string instead of the exact amount which could be less if there are
1869 // any surrogates in the input
1870 //
1871 // we consider that surrogates are rare enough to make it worthwhile to
1872 // avoid running the loop below at the cost of slightly extra memory
1873 // consumption
1874 return srcLen * BYTES_PER_CHAR;
1875 }
1876
1877 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1878 size_t outLen = 0;
1879 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1880 {
1881 const wxUint32 ch = wxDecodeSurrogate(&src);
1882 if ( !src )
1883 return wxCONV_FAILED;
1884
1885 outLen += BYTES_PER_CHAR;
1886
1887 if ( outLen > dstLen )
1888 return wxCONV_FAILED;
1889
1890 *outBuff++ = ch;
1891 }
1892
1893 return outLen;
1894 }
1895
1896 // ----------------------------------------------------------------------------
1897 // endian-reversing conversions
1898 // ----------------------------------------------------------------------------
1899
1900 size_t
1901 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1902 const char *src, size_t srcLen) const
1903 {
1904 srcLen = GetLength(src, srcLen);
1905 if ( srcLen == wxNO_LEN )
1906 return wxCONV_FAILED;
1907
1908 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1909 const size_t inLen = srcLen / BYTES_PER_CHAR;
1910 size_t outLen = 0;
1911 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1912 {
1913 wxUint16 cc[2] = { 0 };
1914 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1915 if ( numChars == wxCONV_FAILED )
1916 return wxCONV_FAILED;
1917
1918 outLen += numChars;
1919 if ( dst )
1920 {
1921 if ( outLen > dstLen )
1922 return wxCONV_FAILED;
1923
1924 *dst++ = cc[0];
1925 if ( numChars == 2 )
1926 {
1927 // second character of a surrogate
1928 *dst++ = cc[1];
1929 }
1930 }
1931 }
1932
1933 return outLen;
1934 }
1935
1936 size_t
1937 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1938 const wchar_t *src, size_t srcLen) const
1939 {
1940 if ( srcLen == wxNO_LEN )
1941 srcLen = wxWcslen(src) + 1;
1942
1943 if ( !dst )
1944 {
1945 // optimization: return maximal space which could be needed for this
1946 // string instead of the exact amount which could be less if there are
1947 // any surrogates in the input
1948 //
1949 // we consider that surrogates are rare enough to make it worthwhile to
1950 // avoid running the loop below at the cost of slightly extra memory
1951 // consumption
1952 return srcLen*BYTES_PER_CHAR;
1953 }
1954
1955 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1956 size_t outLen = 0;
1957 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1958 {
1959 const wxUint32 ch = wxDecodeSurrogate(&src);
1960 if ( !src )
1961 return wxCONV_FAILED;
1962
1963 outLen += BYTES_PER_CHAR;
1964
1965 if ( outLen > dstLen )
1966 return wxCONV_FAILED;
1967
1968 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1969 }
1970
1971 return outLen;
1972 }
1973
1974 #else // !WC_UTF16: wchar_t is UTF-32
1975
1976 // ----------------------------------------------------------------------------
1977 // conversions without endianness change
1978 // ----------------------------------------------------------------------------
1979
1980 size_t
1981 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1982 const char *src, size_t srcLen) const
1983 {
1984 // use memcpy() as it should be much faster than hand-written loop
1985 srcLen = GetLength(src, srcLen);
1986 if ( srcLen == wxNO_LEN )
1987 return wxCONV_FAILED;
1988
1989 const size_t inLen = srcLen/BYTES_PER_CHAR;
1990 if ( dst )
1991 {
1992 if ( dstLen < inLen )
1993 return wxCONV_FAILED;
1994
1995 memcpy(dst, src, srcLen);
1996 }
1997
1998 return inLen;
1999 }
2000
2001 size_t
2002 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
2003 const wchar_t *src, size_t srcLen) const
2004 {
2005 if ( srcLen == wxNO_LEN )
2006 srcLen = wxWcslen(src) + 1;
2007
2008 srcLen *= BYTES_PER_CHAR;
2009
2010 if ( dst )
2011 {
2012 if ( dstLen < srcLen )
2013 return wxCONV_FAILED;
2014
2015 memcpy(dst, src, srcLen);
2016 }
2017
2018 return srcLen;
2019 }
2020
2021 // ----------------------------------------------------------------------------
2022 // endian-reversing conversions
2023 // ----------------------------------------------------------------------------
2024
2025 size_t
2026 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2027 const char *src, size_t srcLen) const
2028 {
2029 srcLen = GetLength(src, srcLen);
2030 if ( srcLen == wxNO_LEN )
2031 return wxCONV_FAILED;
2032
2033 srcLen /= BYTES_PER_CHAR;
2034
2035 if ( dst )
2036 {
2037 if ( dstLen < srcLen )
2038 return wxCONV_FAILED;
2039
2040 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2041 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2042 {
2043 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2044 }
2045 }
2046
2047 return srcLen;
2048 }
2049
2050 size_t
2051 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2052 const wchar_t *src, size_t srcLen) const
2053 {
2054 if ( srcLen == wxNO_LEN )
2055 srcLen = wxWcslen(src) + 1;
2056
2057 srcLen *= BYTES_PER_CHAR;
2058
2059 if ( dst )
2060 {
2061 if ( dstLen < srcLen )
2062 return wxCONV_FAILED;
2063
2064 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2065 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2066 {
2067 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2068 }
2069 }
2070
2071 return srcLen;
2072 }
2073
2074 #endif // WC_UTF16/!WC_UTF16
2075
2076
2077 // ============================================================================
2078 // The classes doing conversion using the iconv_xxx() functions
2079 // ============================================================================
2080
2081 #ifdef HAVE_ICONV
2082
2083 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2084 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2085 // (unless there's yet another bug in glibc) the only case when iconv()
2086 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2087 // left in the input buffer -- when _real_ error occurs,
2088 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2089 // iconv() failure.
2090 // [This bug does not appear in glibc 2.2.]
2091 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2092 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2093 (errno != E2BIG || bufLeft != 0))
2094 #else
2095 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2096 #endif
2097
2098 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2099
2100 #define ICONV_T_INVALID ((iconv_t)-1)
2101
2102 #if SIZEOF_WCHAR_T == 4
2103 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2104 #define WC_ENC wxFONTENCODING_UTF32
2105 #elif SIZEOF_WCHAR_T == 2
2106 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2107 #define WC_ENC wxFONTENCODING_UTF16
2108 #else // sizeof(wchar_t) != 2 nor 4
2109 // does this ever happen?
2110 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2111 #endif
2112
2113 // ----------------------------------------------------------------------------
2114 // wxMBConv_iconv: encapsulates an iconv character set
2115 // ----------------------------------------------------------------------------
2116
2117 class wxMBConv_iconv : public wxMBConv
2118 {
2119 public:
2120 wxMBConv_iconv(const char *name);
2121 virtual ~wxMBConv_iconv();
2122
2123 // implement base class virtual methods
2124 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2125 const char *src, size_t srcLen = wxNO_LEN) const;
2126 virtual size_t FromWChar(char *dst, size_t dstLen,
2127 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2128 virtual size_t GetMBNulLen() const;
2129
2130 #if wxUSE_UNICODE_UTF8
2131 virtual bool IsUTF8() const;
2132 #endif
2133
2134 virtual wxMBConv *Clone() const
2135 {
2136 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
2137 p->m_minMBCharWidth = m_minMBCharWidth;
2138 return p;
2139 }
2140
2141 bool IsOk() const
2142 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2143
2144 protected:
2145 // the iconv handlers used to translate from multibyte
2146 // to wide char and in the other direction
2147 iconv_t m2w,
2148 w2m;
2149
2150 #if wxUSE_THREADS
2151 // guards access to m2w and w2m objects
2152 wxMutex m_iconvMutex;
2153 #endif
2154
2155 private:
2156 // the name (for iconv_open()) of a wide char charset -- if none is
2157 // available on this machine, it will remain NULL
2158 static wxString ms_wcCharsetName;
2159
2160 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2161 // different endian-ness than the native one
2162 static bool ms_wcNeedsSwap;
2163
2164
2165 // name of the encoding handled by this conversion
2166 const char *m_name;
2167
2168 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2169 // initially
2170 size_t m_minMBCharWidth;
2171 };
2172
2173 // make the constructor available for unit testing
2174 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2175 {
2176 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2177 if ( !result->IsOk() )
2178 {
2179 delete result;
2180 return 0;
2181 }
2182
2183 return result;
2184 }
2185
2186 wxString wxMBConv_iconv::ms_wcCharsetName;
2187 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2188
2189 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2190 : m_name(wxStrdup(name))
2191 {
2192 m_minMBCharWidth = 0;
2193
2194 // check for charset that represents wchar_t:
2195 if ( ms_wcCharsetName.empty() )
2196 {
2197 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2198
2199 #if wxUSE_FONTMAP
2200 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2201 #else // !wxUSE_FONTMAP
2202 static const wxChar *const names_static[] =
2203 {
2204 #if SIZEOF_WCHAR_T == 4
2205 wxT("UCS-4"),
2206 #elif SIZEOF_WCHAR_T == 2
2207 wxT("UCS-2"),
2208 #endif
2209 NULL
2210 };
2211 const wxChar *const *names = names_static;
2212 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2213
2214 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2215 {
2216 const wxString nameCS(*names);
2217
2218 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2219 wxString nameXE(nameCS);
2220
2221 #ifdef WORDS_BIGENDIAN
2222 nameXE += wxT("BE");
2223 #else // little endian
2224 nameXE += wxT("LE");
2225 #endif
2226
2227 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2228 nameXE.c_str());
2229
2230 m2w = iconv_open(nameXE.ToAscii(), name);
2231 if ( m2w == ICONV_T_INVALID )
2232 {
2233 // try charset w/o bytesex info (e.g. "UCS4")
2234 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2235 nameCS.c_str());
2236 m2w = iconv_open(nameCS.ToAscii(), name);
2237
2238 // and check for bytesex ourselves:
2239 if ( m2w != ICONV_T_INVALID )
2240 {
2241 char buf[2], *bufPtr;
2242 wchar_t wbuf[2];
2243 size_t insz, outsz;
2244 size_t res;
2245
2246 buf[0] = 'A';
2247 buf[1] = 0;
2248 wbuf[0] = 0;
2249 insz = 2;
2250 outsz = SIZEOF_WCHAR_T * 2;
2251 char* wbufPtr = (char*)wbuf;
2252 bufPtr = buf;
2253
2254 res = iconv(
2255 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2256 &wbufPtr, &outsz);
2257
2258 if (ICONV_FAILED(res, insz))
2259 {
2260 wxLogLastError(wxT("iconv"));
2261 wxLogError(_("Conversion to charset '%s' doesn't work."),
2262 nameCS.c_str());
2263 }
2264 else // ok, can convert to this encoding, remember it
2265 {
2266 ms_wcCharsetName = nameCS;
2267 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2268 }
2269 }
2270 }
2271 else // use charset not requiring byte swapping
2272 {
2273 ms_wcCharsetName = nameXE;
2274 }
2275 }
2276
2277 wxLogTrace(TRACE_STRCONV,
2278 wxT("iconv wchar_t charset is \"%s\"%s"),
2279 ms_wcCharsetName.empty() ? wxString("<none>")
2280 : ms_wcCharsetName,
2281 ms_wcNeedsSwap ? wxT(" (needs swap)")
2282 : wxT(""));
2283 }
2284 else // we already have ms_wcCharsetName
2285 {
2286 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2287 }
2288
2289 if ( ms_wcCharsetName.empty() )
2290 {
2291 w2m = ICONV_T_INVALID;
2292 }
2293 else
2294 {
2295 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2296 if ( w2m == ICONV_T_INVALID )
2297 {
2298 wxLogTrace(TRACE_STRCONV,
2299 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2300 ms_wcCharsetName.c_str(), name);
2301 }
2302 }
2303 }
2304
2305 wxMBConv_iconv::~wxMBConv_iconv()
2306 {
2307 free(const_cast<char *>(m_name));
2308
2309 if ( m2w != ICONV_T_INVALID )
2310 iconv_close(m2w);
2311 if ( w2m != ICONV_T_INVALID )
2312 iconv_close(w2m);
2313 }
2314
2315 size_t
2316 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2317 const char *src, size_t srcLen) const
2318 {
2319 if ( srcLen == wxNO_LEN )
2320 {
2321 // find the string length: notice that must be done differently for
2322 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2323 // consecutive NULs
2324 const size_t nulLen = GetMBNulLen();
2325 switch ( nulLen )
2326 {
2327 default:
2328 return wxCONV_FAILED;
2329
2330 case 1:
2331 srcLen = strlen(src); // arguably more optimized than our version
2332 break;
2333
2334 case 2:
2335 case 4:
2336 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2337 // but they also have to start at character boundary and not
2338 // span two adjacent characters
2339 const char *p;
2340 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2341 ;
2342 srcLen = p - src;
2343 break;
2344 }
2345
2346 // when we're determining the length of the string ourselves we count
2347 // the terminating NUL(s) as part of it and always NUL-terminate the
2348 // output
2349 srcLen += nulLen;
2350 }
2351
2352 // we express length in the number of (wide) characters but iconv always
2353 // counts buffer sizes it in bytes
2354 dstLen *= SIZEOF_WCHAR_T;
2355
2356 #if wxUSE_THREADS
2357 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2358 // Unfortunately there are a couple of global wxCSConv objects such as
2359 // wxConvLocal that are used all over wx code, so we have to make sure
2360 // the handle is used by at most one thread at the time. Otherwise
2361 // only a few wx classes would be safe to use from non-main threads
2362 // as MB<->WC conversion would fail "randomly".
2363 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2364 #endif // wxUSE_THREADS
2365
2366 size_t res, cres;
2367 const char *pszPtr = src;
2368
2369 if ( dst )
2370 {
2371 char* bufPtr = (char*)dst;
2372
2373 // have destination buffer, convert there
2374 size_t dstLenOrig = dstLen;
2375 cres = iconv(m2w,
2376 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2377 &bufPtr, &dstLen);
2378
2379 // convert the number of bytes converted as returned by iconv to the
2380 // number of (wide) characters converted that we need
2381 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2382
2383 if (ms_wcNeedsSwap)
2384 {
2385 // convert to native endianness
2386 for ( unsigned i = 0; i < res; i++ )
2387 dst[i] = WC_BSWAP(dst[i]);
2388 }
2389 }
2390 else // no destination buffer
2391 {
2392 // convert using temp buffer to calculate the size of the buffer needed
2393 wchar_t tbuf[256];
2394 res = 0;
2395
2396 do
2397 {
2398 char* bufPtr = (char*)tbuf;
2399 dstLen = 8 * SIZEOF_WCHAR_T;
2400
2401 cres = iconv(m2w,
2402 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2403 &bufPtr, &dstLen );
2404
2405 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2406 }
2407 while ((cres == (size_t)-1) && (errno == E2BIG));
2408 }
2409
2410 if (ICONV_FAILED(cres, srcLen))
2411 {
2412 //VS: it is ok if iconv fails, hence trace only
2413 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2414 return wxCONV_FAILED;
2415 }
2416
2417 return res;
2418 }
2419
2420 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2421 const wchar_t *src, size_t srcLen) const
2422 {
2423 #if wxUSE_THREADS
2424 // NB: explained in MB2WC
2425 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2426 #endif
2427
2428 if ( srcLen == wxNO_LEN )
2429 srcLen = wxWcslen(src) + 1;
2430
2431 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2432 size_t outbuflen = dstLen;
2433 size_t res, cres;
2434
2435 wchar_t *tmpbuf = 0;
2436
2437 if (ms_wcNeedsSwap)
2438 {
2439 // need to copy to temp buffer to switch endianness
2440 // (doing WC_BSWAP twice on the original buffer won't work, as it
2441 // could be in read-only memory, or be accessed in some other thread)
2442 tmpbuf = (wchar_t *)malloc(inbuflen);
2443 for ( size_t i = 0; i < srcLen; i++ )
2444 tmpbuf[i] = WC_BSWAP(src[i]);
2445
2446 src = tmpbuf;
2447 }
2448
2449 char* inbuf = (char*)src;
2450 if ( dst )
2451 {
2452 // have destination buffer, convert there
2453 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2454
2455 res = dstLen - outbuflen;
2456 }
2457 else // no destination buffer
2458 {
2459 // convert using temp buffer to calculate the size of the buffer needed
2460 char tbuf[256];
2461 res = 0;
2462 do
2463 {
2464 dst = tbuf;
2465 outbuflen = WXSIZEOF(tbuf);
2466
2467 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2468
2469 res += WXSIZEOF(tbuf) - outbuflen;
2470 }
2471 while ((cres == (size_t)-1) && (errno == E2BIG));
2472 }
2473
2474 if (ms_wcNeedsSwap)
2475 {
2476 free(tmpbuf);
2477 }
2478
2479 if (ICONV_FAILED(cres, inbuflen))
2480 {
2481 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2482 return wxCONV_FAILED;
2483 }
2484
2485 return res;
2486 }
2487
2488 size_t wxMBConv_iconv::GetMBNulLen() const
2489 {
2490 if ( m_minMBCharWidth == 0 )
2491 {
2492 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2493
2494 #if wxUSE_THREADS
2495 // NB: explained in MB2WC
2496 wxMutexLocker lock(self->m_iconvMutex);
2497 #endif
2498
2499 const wchar_t *wnul = L"";
2500 char buf[8]; // should be enough for NUL in any encoding
2501 size_t inLen = sizeof(wchar_t),
2502 outLen = WXSIZEOF(buf);
2503 char *inBuff = (char *)wnul;
2504 char *outBuff = buf;
2505 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2506 {
2507 self->m_minMBCharWidth = (size_t)-1;
2508 }
2509 else // ok
2510 {
2511 self->m_minMBCharWidth = outBuff - buf;
2512 }
2513 }
2514
2515 return m_minMBCharWidth;
2516 }
2517
2518 #if wxUSE_UNICODE_UTF8
2519 bool wxMBConv_iconv::IsUTF8() const
2520 {
2521 return wxStricmp(m_name, "UTF-8") == 0 ||
2522 wxStricmp(m_name, "UTF8") == 0;
2523 }
2524 #endif
2525
2526 #endif // HAVE_ICONV
2527
2528
2529 // ============================================================================
2530 // Win32 conversion classes
2531 // ============================================================================
2532
2533 #ifdef wxHAVE_WIN32_MB2WC
2534
2535 // from utils.cpp
2536 #if wxUSE_FONTMAP
2537 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2538 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2539 #endif
2540
2541 class wxMBConv_win32 : public wxMBConv
2542 {
2543 public:
2544 wxMBConv_win32()
2545 {
2546 m_CodePage = CP_ACP;
2547 m_minMBCharWidth = 0;
2548 }
2549
2550 wxMBConv_win32(const wxMBConv_win32& conv)
2551 : wxMBConv()
2552 {
2553 m_CodePage = conv.m_CodePage;
2554 m_minMBCharWidth = conv.m_minMBCharWidth;
2555 }
2556
2557 #if wxUSE_FONTMAP
2558 wxMBConv_win32(const char* name)
2559 {
2560 m_CodePage = wxCharsetToCodepage(name);
2561 m_minMBCharWidth = 0;
2562 }
2563
2564 wxMBConv_win32(wxFontEncoding encoding)
2565 {
2566 m_CodePage = wxEncodingToCodepage(encoding);
2567 m_minMBCharWidth = 0;
2568 }
2569 #endif // wxUSE_FONTMAP
2570
2571 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2572 {
2573 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2574 // the behaviour is not compatible with the Unix version (using iconv)
2575 // and break the library itself, e.g. wxTextInputStream::NextChar()
2576 // wouldn't work if reading an incomplete MB char didn't result in an
2577 // error
2578 //
2579 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2580 // Win XP or newer and it is not supported for UTF-[78] so we always
2581 // use our own conversions in this case. See
2582 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2583 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2584 if ( m_CodePage == CP_UTF8 )
2585 {
2586 return wxMBConvUTF8().MB2WC(buf, psz, n);
2587 }
2588
2589 if ( m_CodePage == CP_UTF7 )
2590 {
2591 return wxMBConvUTF7().MB2WC(buf, psz, n);
2592 }
2593
2594 int flags = 0;
2595 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2596 IsAtLeastWin2kSP4() )
2597 {
2598 flags = MB_ERR_INVALID_CHARS;
2599 }
2600
2601 const size_t len = ::MultiByteToWideChar
2602 (
2603 m_CodePage, // code page
2604 flags, // flags: fall on error
2605 psz, // input string
2606 -1, // its length (NUL-terminated)
2607 buf, // output string
2608 buf ? n : 0 // size of output buffer
2609 );
2610 if ( !len )
2611 {
2612 // function totally failed
2613 return wxCONV_FAILED;
2614 }
2615
2616 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2617 // check if we succeeded, by doing a double trip:
2618 if ( !flags && buf )
2619 {
2620 const size_t mbLen = strlen(psz);
2621 wxCharBuffer mbBuf(mbLen);
2622 if ( ::WideCharToMultiByte
2623 (
2624 m_CodePage,
2625 0,
2626 buf,
2627 -1,
2628 mbBuf.data(),
2629 mbLen + 1, // size in bytes, not length
2630 NULL,
2631 NULL
2632 ) == 0 ||
2633 strcmp(mbBuf, psz) != 0 )
2634 {
2635 // we didn't obtain the same thing we started from, hence
2636 // the conversion was lossy and we consider that it failed
2637 return wxCONV_FAILED;
2638 }
2639 }
2640
2641 // note that it returns count of written chars for buf != NULL and size
2642 // of the needed buffer for buf == NULL so in either case the length of
2643 // the string (which never includes the terminating NUL) is one less
2644 return len - 1;
2645 }
2646
2647 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2648 {
2649 /*
2650 we have a problem here: by default, WideCharToMultiByte() may
2651 replace characters unrepresentable in the target code page with bad
2652 quality approximations such as turning "1/2" symbol (U+00BD) into
2653 "1" for the code pages which don't have it and we, obviously, want
2654 to avoid this at any price
2655
2656 the trouble is that this function does it _silently_, i.e. it won't
2657 even tell us whether it did or not... Win98/2000 and higher provide
2658 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2659 we have to resort to a round trip, i.e. check that converting back
2660 results in the same string -- this is, of course, expensive but
2661 otherwise we simply can't be sure to not garble the data.
2662 */
2663
2664 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2665 // it doesn't work with CJK encodings (which we test for rather roughly
2666 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2667 // supporting it
2668 BOOL usedDef wxDUMMY_INITIALIZE(false);
2669 BOOL *pUsedDef;
2670 int flags;
2671 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2672 {
2673 // it's our lucky day
2674 flags = WC_NO_BEST_FIT_CHARS;
2675 pUsedDef = &usedDef;
2676 }
2677 else // old system or unsupported encoding
2678 {
2679 flags = 0;
2680 pUsedDef = NULL;
2681 }
2682
2683 const size_t len = ::WideCharToMultiByte
2684 (
2685 m_CodePage, // code page
2686 flags, // either none or no best fit
2687 pwz, // input string
2688 -1, // it is (wide) NUL-terminated
2689 buf, // output buffer
2690 buf ? n : 0, // and its size
2691 NULL, // default "replacement" char
2692 pUsedDef // [out] was it used?
2693 );
2694
2695 if ( !len )
2696 {
2697 // function totally failed
2698 return wxCONV_FAILED;
2699 }
2700
2701 // we did something, check if we really succeeded
2702 if ( flags )
2703 {
2704 // check if the conversion failed, i.e. if any replacements
2705 // were done
2706 if ( usedDef )
2707 return wxCONV_FAILED;
2708 }
2709 else // we must resort to double tripping...
2710 {
2711 // first we need to ensure that we really have the MB data: this is
2712 // not the case if we're called with NULL buffer, in which case we
2713 // need to do the conversion yet again
2714 wxCharBuffer bufDef;
2715 if ( !buf )
2716 {
2717 bufDef = wxCharBuffer(len);
2718 buf = bufDef.data();
2719 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2720 buf, len, NULL, NULL) )
2721 return wxCONV_FAILED;
2722 }
2723
2724 if ( !n )
2725 n = wcslen(pwz);
2726 wxWCharBuffer wcBuf(n);
2727 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2728 wcscmp(wcBuf, pwz) != 0 )
2729 {
2730 // we didn't obtain the same thing we started from, hence
2731 // the conversion was lossy and we consider that it failed
2732 return wxCONV_FAILED;
2733 }
2734 }
2735
2736 // see the comment above for the reason of "len - 1"
2737 return len - 1;
2738 }
2739
2740 virtual size_t GetMBNulLen() const
2741 {
2742 if ( m_minMBCharWidth == 0 )
2743 {
2744 int len = ::WideCharToMultiByte
2745 (
2746 m_CodePage, // code page
2747 0, // no flags
2748 L"", // input string
2749 1, // translate just the NUL
2750 NULL, // output buffer
2751 0, // and its size
2752 NULL, // no replacement char
2753 NULL // [out] don't care if it was used
2754 );
2755
2756 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2757 switch ( len )
2758 {
2759 default:
2760 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2761 self->m_minMBCharWidth = (size_t)-1;
2762 break;
2763
2764 case 0:
2765 self->m_minMBCharWidth = (size_t)-1;
2766 break;
2767
2768 case 1:
2769 case 2:
2770 case 4:
2771 self->m_minMBCharWidth = len;
2772 break;
2773 }
2774 }
2775
2776 return m_minMBCharWidth;
2777 }
2778
2779 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2780
2781 bool IsOk() const { return m_CodePage != -1; }
2782
2783 private:
2784 static bool CanUseNoBestFit()
2785 {
2786 static int s_isWin98Or2k = -1;
2787
2788 if ( s_isWin98Or2k == -1 )
2789 {
2790 int verMaj, verMin;
2791 switch ( wxGetOsVersion(&verMaj, &verMin) )
2792 {
2793 case wxOS_WINDOWS_9X:
2794 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2795 break;
2796
2797 case wxOS_WINDOWS_NT:
2798 s_isWin98Or2k = verMaj >= 5;
2799 break;
2800
2801 default:
2802 // unknown: be conservative by default
2803 s_isWin98Or2k = 0;
2804 break;
2805 }
2806
2807 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2808 }
2809
2810 return s_isWin98Or2k == 1;
2811 }
2812
2813 static bool IsAtLeastWin2kSP4()
2814 {
2815 #ifdef __WXWINCE__
2816 return false;
2817 #else
2818 static int s_isAtLeastWin2kSP4 = -1;
2819
2820 if ( s_isAtLeastWin2kSP4 == -1 )
2821 {
2822 OSVERSIONINFOEX ver;
2823
2824 memset(&ver, 0, sizeof(ver));
2825 ver.dwOSVersionInfoSize = sizeof(ver);
2826 GetVersionEx((OSVERSIONINFO*)&ver);
2827
2828 s_isAtLeastWin2kSP4 =
2829 ((ver.dwMajorVersion > 5) || // Vista+
2830 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2831 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2832 ver.wServicePackMajor >= 4)) // 2000 SP4+
2833 ? 1 : 0;
2834 }
2835
2836 return s_isAtLeastWin2kSP4 == 1;
2837 #endif
2838 }
2839
2840
2841 // the code page we're working with
2842 long m_CodePage;
2843
2844 // cached result of GetMBNulLen(), set to 0 initially meaning
2845 // "unknown"
2846 size_t m_minMBCharWidth;
2847 };
2848
2849 #endif // wxHAVE_WIN32_MB2WC
2850
2851
2852 // ============================================================================
2853 // wxEncodingConverter based conversion classes
2854 // ============================================================================
2855
2856 #if wxUSE_FONTMAP
2857
2858 class wxMBConv_wxwin : public wxMBConv
2859 {
2860 private:
2861 void Init()
2862 {
2863 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2864 // The wxMBConv_cf class does a better job.
2865 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2866 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2867 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2868 }
2869
2870 public:
2871 // temporarily just use wxEncodingConverter stuff,
2872 // so that it works while a better implementation is built
2873 wxMBConv_wxwin(const char* name)
2874 {
2875 if (name)
2876 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2877 else
2878 m_enc = wxFONTENCODING_SYSTEM;
2879
2880 Init();
2881 }
2882
2883 wxMBConv_wxwin(wxFontEncoding enc)
2884 {
2885 m_enc = enc;
2886
2887 Init();
2888 }
2889
2890 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2891 {
2892 size_t inbuf = strlen(psz);
2893 if (buf)
2894 {
2895 if (!m2w.Convert(psz, buf))
2896 return wxCONV_FAILED;
2897 }
2898 return inbuf;
2899 }
2900
2901 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2902 {
2903 const size_t inbuf = wxWcslen(psz);
2904 if (buf)
2905 {
2906 if (!w2m.Convert(psz, buf))
2907 return wxCONV_FAILED;
2908 }
2909
2910 return inbuf;
2911 }
2912
2913 virtual size_t GetMBNulLen() const
2914 {
2915 switch ( m_enc )
2916 {
2917 case wxFONTENCODING_UTF16BE:
2918 case wxFONTENCODING_UTF16LE:
2919 return 2;
2920
2921 case wxFONTENCODING_UTF32BE:
2922 case wxFONTENCODING_UTF32LE:
2923 return 4;
2924
2925 default:
2926 return 1;
2927 }
2928 }
2929
2930 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2931
2932 bool IsOk() const { return m_ok; }
2933
2934 public:
2935 wxFontEncoding m_enc;
2936 wxEncodingConverter m2w, w2m;
2937
2938 private:
2939 // were we initialized successfully?
2940 bool m_ok;
2941
2942 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2943 };
2944
2945 // make the constructors available for unit testing
2946 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2947 {
2948 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2949 if ( !result->IsOk() )
2950 {
2951 delete result;
2952 return 0;
2953 }
2954
2955 return result;
2956 }
2957
2958 #endif // wxUSE_FONTMAP
2959
2960 // ============================================================================
2961 // wxCSConv implementation
2962 // ============================================================================
2963
2964 void wxCSConv::Init()
2965 {
2966 m_name = NULL;
2967 m_convReal = NULL;
2968 }
2969
2970 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2971 {
2972 switch ( encoding )
2973 {
2974 case wxFONTENCODING_MAX:
2975 case wxFONTENCODING_SYSTEM:
2976 if ( m_name )
2977 {
2978 // It's ok to not have encoding value if we have a name for it.
2979 m_encoding = wxFONTENCODING_SYSTEM;
2980 }
2981 else // No name neither.
2982 {
2983 // Fall back to the system default encoding in this case (not
2984 // sure how much sense does this make but this is how the old
2985 // code used to behave).
2986 #if wxUSE_INTL
2987 m_encoding = wxLocale::GetSystemEncoding();
2988 if ( m_encoding == wxFONTENCODING_SYSTEM )
2989 #endif // wxUSE_INTL
2990 m_encoding = wxFONTENCODING_ISO8859_1;
2991 }
2992 break;
2993
2994 case wxFONTENCODING_DEFAULT:
2995 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2996 m_encoding = wxFONTENCODING_ISO8859_1;
2997 break;
2998
2999 default:
3000 // Just use the provided encoding.
3001 m_encoding = encoding;
3002 }
3003 }
3004
3005 wxCSConv::wxCSConv(const wxString& charset)
3006 {
3007 Init();
3008
3009 if ( !charset.empty() )
3010 {
3011 SetName(charset.ToAscii());
3012 }
3013
3014 #if wxUSE_FONTMAP
3015 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
3016 #else
3017 SetEncoding(wxFONTENCODING_SYSTEM);
3018 #endif
3019
3020 m_convReal = DoCreate();
3021 }
3022
3023 wxCSConv::wxCSConv(wxFontEncoding encoding)
3024 {
3025 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3026 {
3027 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3028
3029 encoding = wxFONTENCODING_SYSTEM;
3030 }
3031
3032 Init();
3033
3034 SetEncoding(encoding);
3035
3036 m_convReal = DoCreate();
3037 }
3038
3039 wxCSConv::~wxCSConv()
3040 {
3041 Clear();
3042 }
3043
3044 wxCSConv::wxCSConv(const wxCSConv& conv)
3045 : wxMBConv()
3046 {
3047 Init();
3048
3049 SetName(conv.m_name);
3050 SetEncoding(conv.m_encoding);
3051
3052 m_convReal = DoCreate();
3053 }
3054
3055 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3056 {
3057 Clear();
3058
3059 SetName(conv.m_name);
3060 SetEncoding(conv.m_encoding);
3061
3062 m_convReal = DoCreate();
3063
3064 return *this;
3065 }
3066
3067 void wxCSConv::Clear()
3068 {
3069 free(m_name);
3070 m_name = NULL;
3071
3072 wxDELETE(m_convReal);
3073 }
3074
3075 void wxCSConv::SetName(const char *charset)
3076 {
3077 if ( charset )
3078 m_name = wxStrdup(charset);
3079 }
3080
3081 #if wxUSE_FONTMAP
3082
3083 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3084 wxEncodingNameCache );
3085
3086 static wxEncodingNameCache gs_nameCache;
3087 #endif
3088
3089 wxMBConv *wxCSConv::DoCreate() const
3090 {
3091 #if wxUSE_FONTMAP
3092 wxLogTrace(TRACE_STRCONV,
3093 wxT("creating conversion for %s"),
3094 (m_name ? m_name
3095 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3096 #endif // wxUSE_FONTMAP
3097
3098 // check for the special case of ASCII or ISO8859-1 charset: as we have
3099 // special knowledge of it anyhow, we don't need to create a special
3100 // conversion object
3101 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3102 {
3103 // don't convert at all
3104 return NULL;
3105 }
3106
3107 // we trust OS to do conversion better than we can so try external
3108 // conversion methods first
3109 //
3110 // the full order is:
3111 // 1. OS conversion (iconv() under Unix or Win32 API)
3112 // 2. hard coded conversions for UTF
3113 // 3. wxEncodingConverter as fall back
3114
3115 // step (1)
3116 #ifdef HAVE_ICONV
3117 #if !wxUSE_FONTMAP
3118 if ( m_name )
3119 #endif // !wxUSE_FONTMAP
3120 {
3121 #if wxUSE_FONTMAP
3122 wxFontEncoding encoding(m_encoding);
3123 #endif
3124
3125 if ( m_name )
3126 {
3127 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3128 if ( conv->IsOk() )
3129 return conv;
3130
3131 delete conv;
3132
3133 #if wxUSE_FONTMAP
3134 encoding =
3135 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3136 #endif // wxUSE_FONTMAP
3137 }
3138 #if wxUSE_FONTMAP
3139 {
3140 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3141 if ( it != gs_nameCache.end() )
3142 {
3143 if ( it->second.empty() )
3144 return NULL;
3145
3146 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3147 if ( conv->IsOk() )
3148 return conv;
3149
3150 delete conv;
3151 }
3152
3153 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3154 // CS : in case this does not return valid names (eg for MacRoman)
3155 // encoding got a 'failure' entry in the cache all the same,
3156 // although it just has to be created using a different method, so
3157 // only store failed iconv creation attempts (or perhaps we
3158 // shoulnd't do this at all ?)
3159 if ( names[0] != NULL )
3160 {
3161 for ( ; *names; ++names )
3162 {
3163 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3164 // will need changes that will obsolete this
3165 wxString name(*names);
3166 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3167 if ( conv->IsOk() )
3168 {
3169 gs_nameCache[encoding] = *names;
3170 return conv;
3171 }
3172
3173 delete conv;
3174 }
3175
3176 gs_nameCache[encoding] = wxT(""); // cache the failure
3177 }
3178 }
3179 #endif // wxUSE_FONTMAP
3180 }
3181 #endif // HAVE_ICONV
3182
3183 #ifdef wxHAVE_WIN32_MB2WC
3184 {
3185 #if wxUSE_FONTMAP
3186 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3187 : new wxMBConv_win32(m_encoding);
3188 if ( conv->IsOk() )
3189 return conv;
3190
3191 delete conv;
3192 #else
3193 return NULL;
3194 #endif
3195 }
3196 #endif // wxHAVE_WIN32_MB2WC
3197
3198 #ifdef __DARWIN__
3199 {
3200 // leave UTF16 and UTF32 to the built-ins of wx
3201 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3202 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3203 {
3204 #if wxUSE_FONTMAP
3205 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3206 : new wxMBConv_cf(m_encoding);
3207 #else
3208 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3209 #endif
3210
3211 if ( conv->IsOk() )
3212 return conv;
3213
3214 delete conv;
3215 }
3216 }
3217 #endif // __DARWIN__
3218
3219 // step (2)
3220 wxFontEncoding enc = m_encoding;
3221 #if wxUSE_FONTMAP
3222 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3223 {
3224 // use "false" to suppress interactive dialogs -- we can be called from
3225 // anywhere and popping up a dialog from here is the last thing we want to
3226 // do
3227 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3228 }
3229 #endif // wxUSE_FONTMAP
3230
3231 switch ( enc )
3232 {
3233 case wxFONTENCODING_UTF7:
3234 return new wxMBConvUTF7;
3235
3236 case wxFONTENCODING_UTF8:
3237 return new wxMBConvUTF8;
3238
3239 case wxFONTENCODING_UTF16BE:
3240 return new wxMBConvUTF16BE;
3241
3242 case wxFONTENCODING_UTF16LE:
3243 return new wxMBConvUTF16LE;
3244
3245 case wxFONTENCODING_UTF32BE:
3246 return new wxMBConvUTF32BE;
3247
3248 case wxFONTENCODING_UTF32LE:
3249 return new wxMBConvUTF32LE;
3250
3251 default:
3252 // nothing to do but put here to suppress gcc warnings
3253 break;
3254 }
3255
3256 // step (3)
3257 #if wxUSE_FONTMAP
3258 {
3259 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3260 : new wxMBConv_wxwin(m_encoding);
3261 if ( conv->IsOk() )
3262 return conv;
3263
3264 delete conv;
3265 }
3266
3267 wxLogTrace(TRACE_STRCONV,
3268 wxT("encoding \"%s\" is not supported by this system"),
3269 (m_name ? wxString(m_name)
3270 : wxFontMapperBase::GetEncodingName(m_encoding)));
3271 #endif // wxUSE_FONTMAP
3272
3273 return NULL;
3274 }
3275
3276 bool wxCSConv::IsOk() const
3277 {
3278 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3279 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3280 return true; // always ok as we do it ourselves
3281
3282 // m_convReal->IsOk() is called at its own creation, so we know it must
3283 // be ok if m_convReal is non-NULL
3284 return m_convReal != NULL;
3285 }
3286
3287 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3288 const char *src, size_t srcLen) const
3289 {
3290 if (m_convReal)
3291 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3292
3293 // latin-1 (direct)
3294 if ( srcLen == wxNO_LEN )
3295 srcLen = strlen(src) + 1; // take trailing NUL too
3296
3297 if ( dst )
3298 {
3299 if ( dstLen < srcLen )
3300 return wxCONV_FAILED;
3301
3302 for ( size_t n = 0; n < srcLen; n++ )
3303 dst[n] = (unsigned char)(src[n]);
3304 }
3305
3306 return srcLen;
3307 }
3308
3309 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3310 const wchar_t *src, size_t srcLen) const
3311 {
3312 if (m_convReal)
3313 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3314
3315 // latin-1 (direct)
3316 if ( srcLen == wxNO_LEN )
3317 srcLen = wxWcslen(src) + 1;
3318
3319 if ( dst )
3320 {
3321 if ( dstLen < srcLen )
3322 return wxCONV_FAILED;
3323
3324 for ( size_t n = 0; n < srcLen; n++ )
3325 {
3326 if ( src[n] > 0xFF )
3327 return wxCONV_FAILED;
3328
3329 dst[n] = (char)src[n];
3330 }
3331
3332 }
3333 else // still need to check the input validity
3334 {
3335 for ( size_t n = 0; n < srcLen; n++ )
3336 {
3337 if ( src[n] > 0xFF )
3338 return wxCONV_FAILED;
3339 }
3340 }
3341
3342 return srcLen;
3343 }
3344
3345 size_t wxCSConv::GetMBNulLen() const
3346 {
3347 if ( m_convReal )
3348 return m_convReal->GetMBNulLen();
3349
3350 // otherwise, we are ISO-8859-1
3351 return 1;
3352 }
3353
3354 #if wxUSE_UNICODE_UTF8
3355 bool wxCSConv::IsUTF8() const
3356 {
3357 if ( m_convReal )
3358 return m_convReal->IsUTF8();
3359
3360 // otherwise, we are ISO-8859-1
3361 return false;
3362 }
3363 #endif
3364
3365
3366 #if wxUSE_UNICODE
3367
3368 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3369 {
3370 if ( !s )
3371 return wxWCharBuffer();
3372
3373 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3374 if ( !wbuf )
3375 wbuf = wxMBConvUTF8().cMB2WX(s);
3376 if ( !wbuf )
3377 wbuf = wxConvISO8859_1.cMB2WX(s);
3378
3379 return wbuf;
3380 }
3381
3382 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3383 {
3384 if ( !ws )
3385 return wxCharBuffer();
3386
3387 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3388 if ( !buf )
3389 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3390
3391 return buf;
3392 }
3393
3394 #endif // wxUSE_UNICODE
3395
3396 // ----------------------------------------------------------------------------
3397 // globals
3398 // ----------------------------------------------------------------------------
3399
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object), in other
//     words, possibly _before_ the global converter object would be
//     initialized.
3406
3407 #undef wxConvLibc
3408 #undef wxConvUTF8
3409 #undef wxConvUTF7
3410 #undef wxConvLocal
3411 #undef wxConvISO8859_1
3412
// Define, for the converter called "name":
//  - the exported pointer name##Ptr, initially NULL
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_type constructed with
//    ctor_args
//  - a file-static pointer initialized by calling the accessor
//
// Notice that the expansion deliberately doesn't end with a semicolon, it
// must be provided where the macro is used.
#define WX_DEFINE_GLOBAL_CONV2(base_type, impl_type, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(base_type*) name##Ptr = NULL;                    \
    WXDLLIMPEXP_BASE base_type* wxGet_##name##Ptr()                        \
    {                                                                      \
        static impl_type name##Obj ctor_args;                              \
        return &name##Obj;                                                 \
    }                                                                      \
    /* calling the accessor from a static initializer ensures that all */  \
    /* global converter objects are created by the time the static     */  \
    /* initialization is done, i.e. before any thread is launched:     */  \
    static base_type* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object is of the same type as
// the pointer to it.
#define WX_DEFINE_GLOBAL_CONV(conv_type, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(conv_type, conv_type, name, ctor_args)
3427
3428 #ifdef __INTELC__
3429 // disable warning "variable 'xxx' was declared but never referenced"
3430 #pragma warning(disable: 177)
3431 #endif // Intel C++
3432
3433 #ifdef __WINDOWS__
3434 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3435 #elif 0 // defined(__WXOSX__)
3436 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
3437 #else
3438 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3439 #endif
3440
3441 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3442 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3443 // provokes an error message about "not enough macro parameters"; and we
3444 // can't use "()" here as the name##Obj declaration would be parsed as a
3445 // function declaration then, so use a semicolon and live with an extra
3446 // empty statement (and hope that no compilers warns about this)
3447 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3448 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3449
3450 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3451 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3452
3453 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3454 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3455
3456 #ifdef __DARWIN__
3457 // It is important to use this conversion object under Darwin as it ensures
3458 // that Unicode strings are (re)composed correctly even though xnu kernel uses
3459 // decomposed form internally (at least for the file names).
3460 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3461 #endif
3462
3463 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3464 #ifdef __DARWIN__
3465 &wxConvMacUTF8DObj;
3466 #else // !__DARWIN__
3467 wxGet_wxConvLibcPtr();
3468 #endif // __DARWIN__/!__DARWIN__