]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Allow retrieving the descent and external leading of empty strings.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #ifndef __WXWINCE__
32 #include <errno.h>
33 #endif
34
35 #include <ctype.h>
36 #include <string.h>
37 #include <stdlib.h>
38
39 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #define wxHAVE_WIN32_MB2WC
43 #endif
44
45 #ifdef HAVE_ICONV
46 #include <iconv.h>
47 #include "wx/thread.h"
48 #endif
49
50 #include "wx/encconv.h"
51 #include "wx/fontmap.h"
52
53 #ifdef __DARWIN__
54 #include "wx/osx/core/private/strconv_cf.h"
55 #endif //def __DARWIN__
56
57
58 #define TRACE_STRCONV wxT("strconv")
59
60 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61 // be 4 bytes
62 #if SIZEOF_WCHAR_T == 2
63 #define WC_UTF16
64 #endif
65
66
67 // ============================================================================
68 // implementation
69 // ============================================================================
70
// helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL, i.e. false means that all of
// them are NUL (which is trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
79
80 // ----------------------------------------------------------------------------
81 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
82 // ----------------------------------------------------------------------------
83
84 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
85 {
86 if (input <= 0xffff)
87 {
88 if (output)
89 *output = (wxUint16) input;
90
91 return 1;
92 }
93 else if (input >= 0x110000)
94 {
95 return wxCONV_FAILED;
96 }
97 else
98 {
99 if (output)
100 {
101 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
102 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
103 }
104
105 return 2;
106 }
107 }
108
109 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
110 {
111 if ((*input < 0xd800) || (*input > 0xdfff))
112 {
113 output = *input;
114 return 1;
115 }
116 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
117 {
118 output = *input;
119 return wxCONV_FAILED;
120 }
121 else
122 {
123 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
124 return 2;
125 }
126 }
127
128 #ifdef WC_UTF16
129 typedef wchar_t wxDecodeSurrogate_t;
130 #else // !WC_UTF16
131 typedef wxUint16 wxDecodeSurrogate_t;
132 #endif // WC_UTF16/!WC_UTF16
133
134 // returns the next UTF-32 character from the wchar_t buffer and advances the
135 // pointer to the character after this one
136 //
137 // if an invalid character is found, *pSrc is set to NULL, the caller must
138 // check for this
139 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
140 {
141 wxUint32 out;
142 const size_t
143 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
144 if ( n == wxCONV_FAILED )
145 *pSrc = NULL;
146 else
147 *pSrc += n;
148
149 return out;
150 }
151
152 // ----------------------------------------------------------------------------
153 // wxMBConv
154 // ----------------------------------------------------------------------------
155
// Default implementation of ToWChar() written in terms of the old MB2WC().
//
// dst/dstLen describe the output buffer (dst may be NULL to only compute the
// length), src/srcLen the input; srcLen may be wxNO_LEN for a NUL-terminated
// input string.
//
// Returns the number of wide characters written (or which would be written)
// or wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if dst is too small.
// For wxNO_LEN input a single chunk is converted and the returned count
// includes its trailing NUL; for explicit length input all the NUL-separated
// chunks inside [src, src + srcLen) are converted.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complication come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways when it's
    // called with a fixed number of characters and when it's called for the
    // entire NUL-terminated string: in the former case (srcEnd != NULL) we
    // must count all characters we convert, NUL or not; but in the latter we
    // do not count the trailing NUL -- but still count all the NULs inside the
    // string
    //
    // so for the (simple) former case we just always count the trailing NUL,
    // but for the latter we need to wait until we see if there is going to be
    // another loop iteration and only count it then
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow (and we don't count the trailing NUL in this case)
            break;
        }

        // advance the input pointer past the end of this chunk: notice that we
        // will always stop before srcEnd because we know that the chunk is
        // always properly NUL-terminated
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        // if the buffer ends before this NUL, we shouldn't count it in our
        // output so skip the code below
        if ( src == srcEnd )
            break;

        // do count this terminator as it's inside the buffer we convert
        dstWritten++;
        if ( dst )
            dst++;

        src += nulLen; // skip the terminator itself

        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
293
// Default implementation of FromWChar() written in terms of the old WC2MB().
//
// dst/dstLen describe the output buffer in bytes (dst may be NULL to only
// compute the length), src/srcLen the input in wide characters; srcLen may be
// wxNO_LEN for a NUL-terminated input string.
//
// Returns the number of bytes written (or which would be written) or
// wxCONV_FAILED if WC2MB() fails or if dst is too small. The input is
// processed as a sequence of NUL-terminated chunks and GetMBNulLen() bytes are
// counted for each terminator which is really part of the input.
size_t
wxMBConv::FromWChar(char *dst, size_t dstLen,
                    const wchar_t *src, size_t srcLen) const
{
    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // if we don't know its length we have no choice but to assume that it is
    // NUL-terminated (notice that it can still be NUL-terminated even if
    // explicit length is given but it doesn't change our return value)
    const bool isNulTerminated = srcLen == wxNO_LEN;

    // make a copy of the input string unless it is already properly
    // NUL-terminated
    wxWCharBuffer bufTmp;
    if ( isNulTerminated )
    {
        srcLen = wxWcslen(src) + 1;
    }
    else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
    {
        // make a copy in order to properly NUL-terminate the string
        bufTmp = wxWCharBuffer(srcLen);
        memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
        src = bufTmp;
    }

    const size_t lenNul = GetMBNulLen();
    for ( const wchar_t * const srcEnd = src + srcLen;
          src < srcEnd;
          src++ /* skip L'\0' too */ )
    {
        // try to convert the current chunk
        size_t lenChunk = WC2MB(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;

        const wchar_t * const
            chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);

        // our return value accounts for the trailing NUL(s), unlike that of
        // WC2MB(), however don't do it for the last NUL we artificially added
        // ourselves above
        if ( chunkEnd < srcEnd )
            dstWritten += lenNul;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // if we know that there is enough space in the destination buffer
            // (because we accounted for lenNul in dstWritten above), we can
            // convert directly in place -- but otherwise we need another
            // temporary buffer to ensure that we don't overwrite the output
            wxCharBuffer dstBuf;
            char *dstTmp;
            if ( chunkEnd == srcEnd )
            {
                dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
                dstTmp = dstBuf.data();
            }
            else
            {
                dstTmp = dst;
            }

            if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
                return wxCONV_FAILED;

            if ( dstTmp != dst )
            {
                // copy everything up to but excluding the terminating NUL(s)
                // into the real output buffer
                memcpy(dst, dstTmp, lenChunk);

                // micro-optimization: if dstTmp != dst it means that chunkEnd
                // == srcEnd and so we're done, no need to update anything below
                break;
            }

            dst += lenChunk;
            if ( chunkEnd < srcEnd )
                dst += lenNul;
        }

        // continue after this chunk (the loop increment skips its terminator)
        src = chunkEnd;
    }

    return dstWritten;
}
387
388 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
389 {
390 size_t rc = ToWChar(outBuff, outLen, inBuff);
391 if ( rc != wxCONV_FAILED )
392 {
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
395 rc--;
396 }
397
398 return rc;
399 }
400
401 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
402 {
403 size_t rc = FromWChar(outBuff, outLen, inBuff);
404 if ( rc != wxCONV_FAILED )
405 {
406 rc -= GetMBNulLen();
407 }
408
409 return rc;
410 }
411
// Destructor defined out of line even though its body is empty.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
416
417 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
418 {
419 if ( psz )
420 {
421 // calculate the length of the buffer needed first
422 const size_t nLen = ToWChar(NULL, 0, psz);
423 if ( nLen != wxCONV_FAILED )
424 {
425 // now do the actual conversion
426 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
427
428 // +1 for the trailing NULL
429 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
430 return buf;
431 }
432 }
433
434 return wxWCharBuffer();
435 }
436
437 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
438 {
439 if ( pwz )
440 {
441 const size_t nLen = FromWChar(NULL, 0, pwz);
442 if ( nLen != wxCONV_FAILED )
443 {
444 wxCharBuffer buf(nLen - 1);
445 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
446 return buf;
447 }
448 }
449
450 return wxCharBuffer();
451 }
452
453 const wxWCharBuffer
454 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
455 {
456 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
457 if ( dstLen != wxCONV_FAILED )
458 {
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer wbuf(dstLen);
463 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
464 {
465 if ( outLen )
466 {
467 *outLen = dstLen;
468
469 // we also need to handle NUL-terminated input strings
470 // specially: for them the output is the length of the string
471 // excluding the trailing NUL, however if we're asked to
472 // convert a specific number of characters we return the length
473 // of the resulting output even if it's NUL-terminated
474 if ( inLen == wxNO_LEN )
475 (*outLen)--;
476 }
477
478 return wbuf;
479 }
480 }
481
482 if ( outLen )
483 *outLen = 0;
484
485 return wxWCharBuffer();
486 }
487
488 const wxCharBuffer
489 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
490 {
491 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
492 if ( dstLen != wxCONV_FAILED )
493 {
494 const size_t nulLen = GetMBNulLen();
495
496 // as above, ensure that the buffer is always NUL-terminated, even if
497 // the input is not
498 wxCharBuffer buf(dstLen + nulLen - 1);
499 memset(buf.data() + dstLen, 0, nulLen);
500 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
501 {
502 if ( outLen )
503 {
504 *outLen = dstLen;
505
506 if ( inLen == wxNO_LEN )
507 {
508 // in this case both input and output are NUL-terminated
509 // and we're not supposed to count NUL
510 *outLen -= nulLen;
511 }
512 }
513
514 return buf;
515 }
516 }
517
518 if ( outLen )
519 *outLen = 0;
520
521 return wxCharBuffer();
522 }
523
524 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
525 {
526 const size_t srcLen = buf.length();
527 if ( srcLen )
528 {
529 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
530 if ( dstLen != wxCONV_FAILED )
531 {
532 wxWCharBuffer wbuf(dstLen);
533 wbuf.data()[dstLen] = L'\0';
534 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
535 return wbuf;
536 }
537 }
538
539 return wxScopedWCharBuffer::CreateNonOwned(L"", 0);
540 }
541
542 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
543 {
544 const size_t srcLen = wbuf.length();
545 if ( srcLen )
546 {
547 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
548 if ( dstLen != wxCONV_FAILED )
549 {
550 wxCharBuffer buf(dstLen);
551 buf.data()[dstLen] = '\0';
552 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
553 return buf;
554 }
555 }
556
557 return wxScopedCharBuffer::CreateNonOwned("", 0);
558 }
559
560 // ----------------------------------------------------------------------------
561 // wxMBConvLibc
562 // ----------------------------------------------------------------------------
563
// Implements the old-style MB2WC() by passing all the arguments unchanged to
// wxMB2WC() and returning its result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
568
// Implements the old-style WC2MB() by passing all the arguments unchanged to
// wxWC2MB() and returning its result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
573
574 // ----------------------------------------------------------------------------
575 // wxConvBrokenFileNames
576 // ----------------------------------------------------------------------------
577
578 #ifdef __UNIX__
579
580 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
581 {
582 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
583 wxStricmp(charset, wxT("UTF8")) == 0 )
584 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
585 else
586 m_conv = new wxCSConv(charset);
587 }
588
589 #endif // __UNIX__
590
591 // ----------------------------------------------------------------------------
592 // UTF-7
593 // ----------------------------------------------------------------------------
594
595 // Implementation (C) 2004 Fredrik Roubert
596 //
597 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
598
599 //
600 // BASE64 decoding table
601 //
// maps each of the 256 possible byte values to the 6 bit value it stands for
// in BASE64 or to 0xff if the byte is not part of the BASE64 alphabet
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 'H'..'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 'P'..'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 'h'..'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 'p'..'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 'x'..'z'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x80 and up: invalid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
637
638 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
639 const char *src, size_t srcLen) const
640 {
641 DecoderState stateOrig,
642 *statePtr;
643 if ( srcLen == wxNO_LEN )
644 {
645 // convert the entire string, up to and including the trailing NUL
646 srcLen = strlen(src) + 1;
647
648 // when working on the entire strings we don't update nor use the shift
649 // state from the previous call
650 statePtr = &stateOrig;
651 }
652 else // when working with partial strings we do use the shift state
653 {
654 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
655
656 // also save the old state to be able to rollback to it on error
657 stateOrig = m_stateDecoder;
658 }
659
660 // but to simplify the code below we use this variable in both cases
661 DecoderState& state = *statePtr;
662
663
664 // number of characters [which would have been] written to dst [if it were
665 // not NULL]
666 size_t len = 0;
667
668 const char * const srcEnd = src + srcLen;
669
670 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
671 {
672 const unsigned char cc = *src++;
673
674 if ( state.IsShifted() )
675 {
676 const unsigned char dc = utf7unb64[cc];
677 if ( dc == 0xff )
678 {
679 // end of encoded part, check that nothing was left: there can
680 // be up to 4 bits of 0 padding but nothing else (we also need
681 // to check isLSB as we count bits modulo 8 while a valid UTF-7
682 // encoded sequence must contain an integral number of UTF-16
683 // characters)
684 if ( state.isLSB || state.bit > 4 ||
685 (state.accum & ((1 << state.bit) - 1)) )
686 {
687 if ( !len )
688 state = stateOrig;
689
690 return wxCONV_FAILED;
691 }
692
693 state.ToDirect();
694
695 // re-parse this character normally below unless it's '-' which
696 // is consumed by the decoder
697 if ( cc == '-' )
698 continue;
699 }
700 else // valid encoded character
701 {
702 // mini base64 decoder: each character is 6 bits
703 state.bit += 6;
704 state.accum <<= 6;
705 state.accum += dc;
706
707 if ( state.bit >= 8 )
708 {
709 // got the full byte, consume it
710 state.bit -= 8;
711 unsigned char b = (state.accum >> state.bit) & 0x00ff;
712
713 if ( state.isLSB )
714 {
715 // we've got the full word, output it
716 if ( dst )
717 *dst++ = (state.msb << 8) | b;
718 len++;
719 state.isLSB = false;
720 }
721 else // MSB
722 {
723 // just store it while we wait for LSB
724 state.msb = b;
725 state.isLSB = true;
726 }
727 }
728 }
729 }
730
731 if ( state.IsDirect() )
732 {
733 // start of an encoded segment?
734 if ( cc == '+' )
735 {
736 if ( *src == '-' )
737 {
738 // just the encoded plus sign, don't switch to shifted mode
739 if ( dst )
740 *dst++ = '+';
741 len++;
742 src++;
743 }
744 else if ( utf7unb64[(unsigned)*src] == 0xff )
745 {
746 // empty encoded chunks are not allowed
747 if ( !len )
748 state = stateOrig;
749
750 return wxCONV_FAILED;
751 }
752 else // base-64 encoded chunk follows
753 {
754 state.ToShifted();
755 }
756 }
757 else // not '+'
758 {
759 // only printable 7 bit ASCII characters (with the exception of
760 // NUL, TAB, CR and LF) can be used directly
761 if ( cc >= 0x7f || (cc < ' ' &&
762 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
763 return wxCONV_FAILED;
764
765 if ( dst )
766 *dst++ = cc;
767 len++;
768 }
769 }
770 }
771
772 if ( !len )
773 {
774 // as we didn't read any characters we should be called with the same
775 // data (followed by some more new data) again later so don't save our
776 // state
777 state = stateOrig;
778
779 return wxCONV_FAILED;
780 }
781
782 return len;
783 }
784
785 //
786 // BASE64 encoding table
787 //
// maps a 6 bit value (0..63) to the BASE64 character representing it
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
799
800 //
801 // UTF-7 encoding table
802 //
803 // 0 - Set D (directly encoded characters)
804 // 1 - Set O (optional direct characters)
805 // 2 - whitespace characters (optional)
806 // 3 - special characters
807 //
// class (0 to 3) of each of the 128 ASCII characters, indexed by their code
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// returns true if wc is a character of class 0 in the table above, i.e. one
// which is written directly, as is, in UTF-7 output
//
// notice that wchar_t can be a signed type, so the range check is done on the
// unsigned value to ensure that negative values are never used as the index
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    return static_cast<unsigned long>(wc) < 0x80 && utf7encode[wc] < 1;
}
824
// Encode wide characters as UTF-7.
//
// dst/dstLen describe the output buffer (dst may be NULL to only compute the
// length), src/srcLen the input. If srcLen is wxNO_LEN, the input is a
// NUL-terminated string which is encoded entirely, including its terminator,
// starting from a fresh encoder state. Otherwise exactly srcLen characters
// are encoded, using and updating the encoder state stored in this object.
//
// Returns the number of bytes produced or wxCONV_FAILED if a character
// greater than 0xffff is encountered when wchar_t is not UTF-16.
//
// NOTE(review): the len < dstLen test is only done once per iteration of the
// outer loop while a single iteration can output several bytes -- confirm
// that the callers always size dst using a previous call with NULL dst.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = const_cast<EncoderState *>(&m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    size_t len = 0;

    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                // explicitly terminate the encoded block
                if ( dst )
                    *dst++ = '-';
                len++;
            }

            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // a literal plus sign is written as "+-"
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            if ( state.IsDirect() )
            {
                state.ToShifted();

                // start of an encoded block
                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // feed the 2 bytes of this character, most significant one
                // first, into the accumulator
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // and output all complete groups of 6 bits we have
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // stop at the end of input or at the first character which
                // can be output directly: it is left for the outer loop
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
937
938 // ----------------------------------------------------------------------------
939 // UTF-8
940 // ----------------------------------------------------------------------------
941
// NOTE(review): presumably utf8_max[n] is the largest value which can be
// encoded using n + 1 bytes in the original (up to 6 bytes long) UTF-8
// scheme, with the last element acting as a catch-all -- confirm against the
// code using this table
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: the range spans
// 256 values starting at wxUnicodePUA
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
949
// this table gives the length in bytes of the UTF-8 sequence starting with
// the given byte, used as the index, or 0 if this byte can't start a valid
// sequence:
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid (continuation bytes can't start a sequence and C0
    // and C1 have no valid entry in this table):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
984
985 size_t
986 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
987 const char *src, size_t srcLen) const
988 {
989 wchar_t *out = dstLen ? dst : NULL;
990 size_t written = 0;
991
992 if ( srcLen == wxNO_LEN )
993 srcLen = strlen(src) + 1;
994
995 for ( const char *p = src; ; p++ )
996 {
997 if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
998 {
999 // all done successfully, just add the trailing NULL if we are not
1000 // using explicit length
1001 if ( srcLen == wxNO_LEN )
1002 {
1003 if ( out )
1004 {
1005 if ( !dstLen )
1006 break;
1007
1008 *out = L'\0';
1009 }
1010
1011 written++;
1012 }
1013
1014 return written;
1015 }
1016
1017 if ( out && !dstLen-- )
1018 break;
1019
1020 wxUint32 code;
1021 unsigned char c = *p;
1022
1023 if ( c < 0x80 )
1024 {
1025 if ( srcLen == 0 ) // the test works for wxNO_LEN too
1026 break;
1027
1028 if ( srcLen != wxNO_LEN )
1029 srcLen--;
1030
1031 code = c;
1032 }
1033 else
1034 {
1035 unsigned len = tableUtf8Lengths[c];
1036 if ( !len )
1037 break;
1038
1039 if ( srcLen < len ) // the test works for wxNO_LEN too
1040 break;
1041
1042 if ( srcLen != wxNO_LEN )
1043 srcLen -= len;
1044
1045 // Char. number range | UTF-8 octet sequence
1046 // (hexadecimal) | (binary)
1047 // ----------------------+----------------------------------------
1048 // 0000 0000 - 0000 007F | 0xxxxxxx
1049 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1050 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1051 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1052 //
1053 // Code point value is stored in bits marked with 'x',
1054 // lowest-order bit of the value on the right side in the diagram
1055 // above. (from RFC 3629)
1056
1057 // mask to extract lead byte's value ('x' bits above), by sequence
1058 // length:
1059 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1060
1061 // mask and value of lead byte's most significant bits, by length:
1062 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1063 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1064
1065 len--; // it's more convenient to work with 0-based length here
1066
1067 // extract the lead byte's value bits:
1068 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1069 break;
1070
1071 code = c & leadValueMask[len];
1072
1073 // all remaining bytes, if any, are handled in the same way
1074 // regardless of sequence's length:
1075 for ( ; len; --len )
1076 {
1077 c = *++p;
1078 if ( (c & 0xC0) != 0x80 )
1079 return wxCONV_FAILED;
1080
1081 code <<= 6;
1082 code |= c & 0x3F;
1083 }
1084 }
1085
1086 #ifdef WC_UTF16
1087 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1088 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1089 {
1090 if ( out )
1091 out++;
1092 written++;
1093 }
1094 #else // !WC_UTF16
1095 if ( out )
1096 *out = code;
1097 #endif // WC_UTF16/!WC_UTF16
1098
1099 if ( out )
1100 out++;
1101
1102 written++;
1103 }
1104
1105 return wxCONV_FAILED;
1106 }
1107
1108 size_t
1109 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1110 const wchar_t *src, size_t srcLen) const
1111 {
1112 char *out = dstLen ? dst : NULL;
1113 size_t written = 0;
1114
1115 for ( const wchar_t *wp = src; ; wp++ )
1116 {
1117 if ( (srcLen == wxNO_LEN ? !*wp : !srcLen) )
1118 {
1119 // all done successfully, just add the trailing NULL if we are not
1120 // using explicit length
1121 if ( srcLen == wxNO_LEN )
1122 {
1123 if ( out )
1124 {
1125 if ( !dstLen )
1126 break;
1127
1128 *out = '\0';
1129 }
1130
1131 written++;
1132 }
1133
1134 return written;
1135 }
1136
1137 if ( srcLen != wxNO_LEN )
1138 srcLen--;
1139
1140 wxUint32 code;
1141 #ifdef WC_UTF16
1142 // cast is ok for WC_UTF16
1143 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1144 {
1145 // skip the next char too as we decoded a surrogate
1146 wp++;
1147 if ( srcLen != wxNO_LEN )
1148 srcLen--;
1149 }
1150 #else // wchar_t is UTF-32
1151 code = *wp & 0x7fffffff;
1152 #endif
1153
1154 unsigned len;
1155 if ( code <= 0x7F )
1156 {
1157 len = 1;
1158 if ( out )
1159 {
1160 if ( dstLen < len )
1161 break;
1162
1163 out[0] = (char)code;
1164 }
1165 }
1166 else if ( code <= 0x07FF )
1167 {
1168 len = 2;
1169 if ( out )
1170 {
1171 if ( dstLen < len )
1172 break;
1173
1174 // NB: this line takes 6 least significant bits, encodes them as
1175 // 10xxxxxx and discards them so that the next byte can be encoded:
1176 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1177 out[0] = 0xC0 | code;
1178 }
1179 }
1180 else if ( code < 0xFFFF )
1181 {
1182 len = 3;
1183 if ( out )
1184 {
1185 if ( dstLen < len )
1186 break;
1187
1188 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1189 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1190 out[0] = 0xE0 | code;
1191 }
1192 }
1193 else if ( code <= 0x10FFFF )
1194 {
1195 len = 4;
1196 if ( out )
1197 {
1198 if ( dstLen < len )
1199 break;
1200
1201 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1202 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1203 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1204 out[0] = 0xF0 | code;
1205 }
1206 }
1207 else
1208 {
1209 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1210 break;
1211 }
1212
1213 if ( out )
1214 {
1215 out += len;
1216 dstLen -= len;
1217 }
1218
1219 written += len;
1220 }
1221
1222 // we only get here if an error occurs during decoding
1223 return wxCONV_FAILED;
1224 }
1225
1226 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1227 const char *psz, size_t srcLen) const
1228 {
1229 if ( m_options == MAP_INVALID_UTF8_NOT )
1230 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1231
1232 size_t len = 0;
1233
1234 // The length can be either given explicitly or computed implicitly for the
1235 // NUL-terminated strings.
1236 const bool isNulTerminated = srcLen == wxNO_LEN;
1237 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1238 {
1239 const char *opsz = psz;
1240 bool invalid = false;
1241 unsigned char cc = *psz++, fc = cc;
1242 unsigned cnt;
1243 for (cnt = 0; fc & 0x80; cnt++)
1244 fc <<= 1;
1245
1246 if (!cnt)
1247 {
1248 // plain ASCII char
1249 if (buf)
1250 *buf++ = cc;
1251 len++;
1252
1253 // escape the escape character for octal escapes
1254 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1255 && cc == '\\' && (!buf || len < n))
1256 {
1257 if (buf)
1258 *buf++ = cc;
1259 len++;
1260 }
1261 }
1262 else
1263 {
1264 cnt--;
1265 if (!cnt)
1266 {
1267 // invalid UTF-8 sequence
1268 invalid = true;
1269 }
1270 else
1271 {
1272 unsigned ocnt = cnt - 1;
1273 wxUint32 res = cc & (0x3f >> cnt);
1274 while (cnt--)
1275 {
1276 cc = *psz;
1277 if ((cc & 0xC0) != 0x80)
1278 {
1279 // invalid UTF-8 sequence
1280 invalid = true;
1281 break;
1282 }
1283
1284 psz++;
1285 res = (res << 6) | (cc & 0x3f);
1286 }
1287
1288 if (invalid || res <= utf8_max[ocnt])
1289 {
1290 // illegal UTF-8 encoding
1291 invalid = true;
1292 }
1293 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1294 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1295 {
1296 // if one of our PUA characters turns up externally
1297 // it must also be treated as an illegal sequence
1298 // (a bit like you have to escape an escape character)
1299 invalid = true;
1300 }
1301 else
1302 {
1303 #ifdef WC_UTF16
1304 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1305 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1306 if (pa == wxCONV_FAILED)
1307 {
1308 invalid = true;
1309 }
1310 else
1311 {
1312 if (buf)
1313 buf += pa;
1314 len += pa;
1315 }
1316 #else // !WC_UTF16
1317 if (buf)
1318 *buf++ = (wchar_t)res;
1319 len++;
1320 #endif // WC_UTF16/!WC_UTF16
1321 }
1322 }
1323
1324 if (invalid)
1325 {
1326 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1327 {
1328 while (opsz < psz && (!buf || len < n))
1329 {
1330 #ifdef WC_UTF16
1331 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1332 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1333 wxASSERT(pa != wxCONV_FAILED);
1334 if (buf)
1335 buf += pa;
1336 opsz++;
1337 len += pa;
1338 #else
1339 if (buf)
1340 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1341 opsz++;
1342 len++;
1343 #endif
1344 }
1345 }
1346 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1347 {
1348 while (opsz < psz && (!buf || len < n))
1349 {
1350 if ( buf && len + 3 < n )
1351 {
1352 unsigned char on = *opsz;
1353 *buf++ = L'\\';
1354 *buf++ = (wchar_t)( L'0' + on / 0100 );
1355 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1356 *buf++ = (wchar_t)( L'0' + on % 010 );
1357 }
1358
1359 opsz++;
1360 len += 4;
1361 }
1362 }
1363 else // MAP_INVALID_UTF8_NOT
1364 {
1365 return wxCONV_FAILED;
1366 }
1367 }
1368 }
1369 }
1370
1371 if ( isNulTerminated )
1372 {
1373 // Add the trailing NUL in this case if we have a large enough buffer.
1374 if ( buf && (len < n) )
1375 *buf = 0;
1376
1377 // And count it in any case.
1378 len++;
1379 }
1380
1381 return len;
1382 }
1383
// Return true if wch is one of the octal digits, i.e. L'0' to L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1388
1389 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1390 const wchar_t *psz, size_t srcLen) const
1391 {
1392 if ( m_options == MAP_INVALID_UTF8_NOT )
1393 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1394
1395 size_t len = 0;
1396
1397 // The length can be either given explicitly or computed implicitly for the
1398 // NUL-terminated strings.
1399 const bool isNulTerminated = srcLen == wxNO_LEN;
1400 while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
1401 {
1402 wxUint32 cc;
1403
1404 #ifdef WC_UTF16
1405 // cast is ok for WC_UTF16
1406 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1407 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1408 #else
1409 cc = (*psz++) & 0x7fffffff;
1410 #endif
1411
1412 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1413 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1414 {
1415 if (buf)
1416 *buf++ = (char)(cc - wxUnicodePUA);
1417 len++;
1418 }
1419 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1420 && cc == L'\\' && psz[0] == L'\\' )
1421 {
1422 if (buf)
1423 *buf++ = (char)cc;
1424 psz++;
1425 len++;
1426 }
1427 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1428 cc == L'\\' &&
1429 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1430 {
1431 if (buf)
1432 {
1433 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1434 (psz[1] - L'0') * 010 +
1435 (psz[2] - L'0'));
1436 }
1437
1438 psz += 3;
1439 len++;
1440 }
1441 else
1442 {
1443 unsigned cnt;
1444 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1445 {
1446 }
1447
1448 if (!cnt)
1449 {
1450 // plain ASCII char
1451 if (buf)
1452 *buf++ = (char) cc;
1453 len++;
1454 }
1455 else
1456 {
1457 len += cnt + 1;
1458 if (buf)
1459 {
1460 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1461 while (cnt--)
1462 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1463 }
1464 }
1465 }
1466 }
1467
1468 if ( isNulTerminated )
1469 {
1470 // Add the trailing NUL in this case if we have a large enough buffer.
1471 if ( buf && (len < n) )
1472 *buf = 0;
1473
1474 // And count it in any case.
1475 len++;
1476 }
1477
1478 return len;
1479 }
1480
1481 // ============================================================================
1482 // UTF-16
1483 // ============================================================================
1484
1485 #ifdef WORDS_BIGENDIAN
1486 #define wxMBConvUTF16straight wxMBConvUTF16BE
1487 #define wxMBConvUTF16swap wxMBConvUTF16LE
1488 #else
1489 #define wxMBConvUTF16swap wxMBConvUTF16BE
1490 #define wxMBConvUTF16straight wxMBConvUTF16LE
1491 #endif
1492
1493 /* static */
1494 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1495 {
1496 if ( srcLen == wxNO_LEN )
1497 {
1498 // count the number of bytes in input, including the trailing NULs
1499 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1500 for ( srcLen = 1; *inBuff++; srcLen++ )
1501 ;
1502
1503 srcLen *= BYTES_PER_CHAR;
1504 }
1505 else // we already have the length
1506 {
1507 // we can only convert an entire number of UTF-16 characters
1508 if ( srcLen % BYTES_PER_CHAR )
1509 return wxCONV_FAILED;
1510 }
1511
1512 return srcLen;
1513 }
1514
1515 // case when in-memory representation is UTF-16 too
1516 #ifdef WC_UTF16
1517
1518 // ----------------------------------------------------------------------------
1519 // conversions without endianness change
1520 // ----------------------------------------------------------------------------
1521
1522 size_t
1523 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1524 const char *src, size_t srcLen) const
1525 {
1526 // set up the scene for using memcpy() (which is presumably more efficient
1527 // than copying the bytes one by one)
1528 srcLen = GetLength(src, srcLen);
1529 if ( srcLen == wxNO_LEN )
1530 return wxCONV_FAILED;
1531
1532 const size_t inLen = srcLen / BYTES_PER_CHAR;
1533 if ( dst )
1534 {
1535 if ( dstLen < inLen )
1536 return wxCONV_FAILED;
1537
1538 memcpy(dst, src, srcLen);
1539 }
1540
1541 return inLen;
1542 }
1543
1544 size_t
1545 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1546 const wchar_t *src, size_t srcLen) const
1547 {
1548 if ( srcLen == wxNO_LEN )
1549 srcLen = wxWcslen(src) + 1;
1550
1551 srcLen *= BYTES_PER_CHAR;
1552
1553 if ( dst )
1554 {
1555 if ( dstLen < srcLen )
1556 return wxCONV_FAILED;
1557
1558 memcpy(dst, src, srcLen);
1559 }
1560
1561 return srcLen;
1562 }
1563
1564 // ----------------------------------------------------------------------------
1565 // endian-reversing conversions
1566 // ----------------------------------------------------------------------------
1567
1568 size_t
1569 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1570 const char *src, size_t srcLen) const
1571 {
1572 srcLen = GetLength(src, srcLen);
1573 if ( srcLen == wxNO_LEN )
1574 return wxCONV_FAILED;
1575
1576 srcLen /= BYTES_PER_CHAR;
1577
1578 if ( dst )
1579 {
1580 if ( dstLen < srcLen )
1581 return wxCONV_FAILED;
1582
1583 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1584 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1585 {
1586 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1587 }
1588 }
1589
1590 return srcLen;
1591 }
1592
1593 size_t
1594 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1595 const wchar_t *src, size_t srcLen) const
1596 {
1597 if ( srcLen == wxNO_LEN )
1598 srcLen = wxWcslen(src) + 1;
1599
1600 srcLen *= BYTES_PER_CHAR;
1601
1602 if ( dst )
1603 {
1604 if ( dstLen < srcLen )
1605 return wxCONV_FAILED;
1606
1607 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1608 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1609 {
1610 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1611 }
1612 }
1613
1614 return srcLen;
1615 }
1616
1617 #else // !WC_UTF16: wchar_t is UTF-32
1618
1619 // ----------------------------------------------------------------------------
1620 // conversions without endianness change
1621 // ----------------------------------------------------------------------------
1622
1623 size_t
1624 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1625 const char *src, size_t srcLen) const
1626 {
1627 srcLen = GetLength(src, srcLen);
1628 if ( srcLen == wxNO_LEN )
1629 return wxCONV_FAILED;
1630
1631 const size_t inLen = srcLen / BYTES_PER_CHAR;
1632 if ( !dst )
1633 {
1634 // optimization: return maximal space which could be needed for this
1635 // string even if the real size could be smaller if the buffer contains
1636 // any surrogates
1637 return inLen;
1638 }
1639
1640 size_t outLen = 0;
1641 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1642 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1643 {
1644 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1645 if ( !inBuff )
1646 return wxCONV_FAILED;
1647
1648 if ( ++outLen > dstLen )
1649 return wxCONV_FAILED;
1650
1651 *dst++ = ch;
1652 }
1653
1654
1655 return outLen;
1656 }
1657
1658 size_t
1659 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1660 const wchar_t *src, size_t srcLen) const
1661 {
1662 if ( srcLen == wxNO_LEN )
1663 srcLen = wxWcslen(src) + 1;
1664
1665 size_t outLen = 0;
1666 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1667 for ( size_t n = 0; n < srcLen; n++ )
1668 {
1669 wxUint16 cc[2] = { 0 };
1670 const size_t numChars = encode_utf16(*src++, cc);
1671 if ( numChars == wxCONV_FAILED )
1672 return wxCONV_FAILED;
1673
1674 outLen += numChars * BYTES_PER_CHAR;
1675 if ( outBuff )
1676 {
1677 if ( outLen > dstLen )
1678 return wxCONV_FAILED;
1679
1680 *outBuff++ = cc[0];
1681 if ( numChars == 2 )
1682 {
1683 // second character of a surrogate
1684 *outBuff++ = cc[1];
1685 }
1686 }
1687 }
1688
1689 return outLen;
1690 }
1691
1692 // ----------------------------------------------------------------------------
1693 // endian-reversing conversions
1694 // ----------------------------------------------------------------------------
1695
1696 size_t
1697 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1698 const char *src, size_t srcLen) const
1699 {
1700 srcLen = GetLength(src, srcLen);
1701 if ( srcLen == wxNO_LEN )
1702 return wxCONV_FAILED;
1703
1704 const size_t inLen = srcLen / BYTES_PER_CHAR;
1705 if ( !dst )
1706 {
1707 // optimization: return maximal space which could be needed for this
1708 // string even if the real size could be smaller if the buffer contains
1709 // any surrogates
1710 return inLen;
1711 }
1712
1713 size_t outLen = 0;
1714 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1715 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1716 {
1717 wxUint32 ch;
1718 wxUint16 tmp[2];
1719
1720 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1721 inBuff++;
1722 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1723
1724 const size_t numChars = decode_utf16(tmp, ch);
1725 if ( numChars == wxCONV_FAILED )
1726 return wxCONV_FAILED;
1727
1728 if ( numChars == 2 )
1729 inBuff++;
1730
1731 if ( ++outLen > dstLen )
1732 return wxCONV_FAILED;
1733
1734 *dst++ = ch;
1735 }
1736
1737
1738 return outLen;
1739 }
1740
1741 size_t
1742 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1743 const wchar_t *src, size_t srcLen) const
1744 {
1745 if ( srcLen == wxNO_LEN )
1746 srcLen = wxWcslen(src) + 1;
1747
1748 size_t outLen = 0;
1749 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1750 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1751 {
1752 wxUint16 cc[2] = { 0 };
1753 const size_t numChars = encode_utf16(*src, cc);
1754 if ( numChars == wxCONV_FAILED )
1755 return wxCONV_FAILED;
1756
1757 outLen += numChars * BYTES_PER_CHAR;
1758 if ( outBuff )
1759 {
1760 if ( outLen > dstLen )
1761 return wxCONV_FAILED;
1762
1763 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1764 if ( numChars == 2 )
1765 {
1766 // second character of a surrogate
1767 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1768 }
1769 }
1770 }
1771
1772 return outLen;
1773 }
1774
1775 #endif // WC_UTF16/!WC_UTF16
1776
1777
1778 // ============================================================================
1779 // UTF-32
1780 // ============================================================================
1781
1782 #ifdef WORDS_BIGENDIAN
1783 #define wxMBConvUTF32straight wxMBConvUTF32BE
1784 #define wxMBConvUTF32swap wxMBConvUTF32LE
1785 #else
1786 #define wxMBConvUTF32swap wxMBConvUTF32BE
1787 #define wxMBConvUTF32straight wxMBConvUTF32LE
1788 #endif
1789
1790
1791 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1792 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1793
1794 /* static */
1795 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1796 {
1797 if ( srcLen == wxNO_LEN )
1798 {
1799 // count the number of bytes in input, including the trailing NULs
1800 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1801 for ( srcLen = 1; *inBuff++; srcLen++ )
1802 ;
1803
1804 srcLen *= BYTES_PER_CHAR;
1805 }
1806 else // we already have the length
1807 {
1808 // we can only convert an entire number of UTF-32 characters
1809 if ( srcLen % BYTES_PER_CHAR )
1810 return wxCONV_FAILED;
1811 }
1812
1813 return srcLen;
1814 }
1815
1816 // case when in-memory representation is UTF-16
1817 #ifdef WC_UTF16
1818
1819 // ----------------------------------------------------------------------------
1820 // conversions without endianness change
1821 // ----------------------------------------------------------------------------
1822
1823 size_t
1824 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1825 const char *src, size_t srcLen) const
1826 {
1827 srcLen = GetLength(src, srcLen);
1828 if ( srcLen == wxNO_LEN )
1829 return wxCONV_FAILED;
1830
1831 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1832 const size_t inLen = srcLen / BYTES_PER_CHAR;
1833 size_t outLen = 0;
1834 for ( size_t n = 0; n < inLen; n++ )
1835 {
1836 wxUint16 cc[2] = { 0 };
1837 const size_t numChars = encode_utf16(*inBuff++, cc);
1838 if ( numChars == wxCONV_FAILED )
1839 return wxCONV_FAILED;
1840
1841 outLen += numChars;
1842 if ( dst )
1843 {
1844 if ( outLen > dstLen )
1845 return wxCONV_FAILED;
1846
1847 *dst++ = cc[0];
1848 if ( numChars == 2 )
1849 {
1850 // second character of a surrogate
1851 *dst++ = cc[1];
1852 }
1853 }
1854 }
1855
1856 return outLen;
1857 }
1858
1859 size_t
1860 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1861 const wchar_t *src, size_t srcLen) const
1862 {
1863 if ( srcLen == wxNO_LEN )
1864 srcLen = wxWcslen(src) + 1;
1865
1866 if ( !dst )
1867 {
1868 // optimization: return maximal space which could be needed for this
1869 // string instead of the exact amount which could be less if there are
1870 // any surrogates in the input
1871 //
1872 // we consider that surrogates are rare enough to make it worthwhile to
1873 // avoid running the loop below at the cost of slightly extra memory
1874 // consumption
1875 return srcLen * BYTES_PER_CHAR;
1876 }
1877
1878 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1879 size_t outLen = 0;
1880 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1881 {
1882 const wxUint32 ch = wxDecodeSurrogate(&src);
1883 if ( !src )
1884 return wxCONV_FAILED;
1885
1886 outLen += BYTES_PER_CHAR;
1887
1888 if ( outLen > dstLen )
1889 return wxCONV_FAILED;
1890
1891 *outBuff++ = ch;
1892 }
1893
1894 return outLen;
1895 }
1896
1897 // ----------------------------------------------------------------------------
1898 // endian-reversing conversions
1899 // ----------------------------------------------------------------------------
1900
1901 size_t
1902 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1903 const char *src, size_t srcLen) const
1904 {
1905 srcLen = GetLength(src, srcLen);
1906 if ( srcLen == wxNO_LEN )
1907 return wxCONV_FAILED;
1908
1909 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1910 const size_t inLen = srcLen / BYTES_PER_CHAR;
1911 size_t outLen = 0;
1912 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1913 {
1914 wxUint16 cc[2] = { 0 };
1915 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1916 if ( numChars == wxCONV_FAILED )
1917 return wxCONV_FAILED;
1918
1919 outLen += numChars;
1920 if ( dst )
1921 {
1922 if ( outLen > dstLen )
1923 return wxCONV_FAILED;
1924
1925 *dst++ = cc[0];
1926 if ( numChars == 2 )
1927 {
1928 // second character of a surrogate
1929 *dst++ = cc[1];
1930 }
1931 }
1932 }
1933
1934 return outLen;
1935 }
1936
1937 size_t
1938 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1939 const wchar_t *src, size_t srcLen) const
1940 {
1941 if ( srcLen == wxNO_LEN )
1942 srcLen = wxWcslen(src) + 1;
1943
1944 if ( !dst )
1945 {
1946 // optimization: return maximal space which could be needed for this
1947 // string instead of the exact amount which could be less if there are
1948 // any surrogates in the input
1949 //
1950 // we consider that surrogates are rare enough to make it worthwhile to
1951 // avoid running the loop below at the cost of slightly extra memory
1952 // consumption
1953 return srcLen*BYTES_PER_CHAR;
1954 }
1955
1956 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1957 size_t outLen = 0;
1958 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1959 {
1960 const wxUint32 ch = wxDecodeSurrogate(&src);
1961 if ( !src )
1962 return wxCONV_FAILED;
1963
1964 outLen += BYTES_PER_CHAR;
1965
1966 if ( outLen > dstLen )
1967 return wxCONV_FAILED;
1968
1969 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1970 }
1971
1972 return outLen;
1973 }
1974
1975 #else // !WC_UTF16: wchar_t is UTF-32
1976
1977 // ----------------------------------------------------------------------------
1978 // conversions without endianness change
1979 // ----------------------------------------------------------------------------
1980
1981 size_t
1982 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1983 const char *src, size_t srcLen) const
1984 {
1985 // use memcpy() as it should be much faster than hand-written loop
1986 srcLen = GetLength(src, srcLen);
1987 if ( srcLen == wxNO_LEN )
1988 return wxCONV_FAILED;
1989
1990 const size_t inLen = srcLen/BYTES_PER_CHAR;
1991 if ( dst )
1992 {
1993 if ( dstLen < inLen )
1994 return wxCONV_FAILED;
1995
1996 memcpy(dst, src, srcLen);
1997 }
1998
1999 return inLen;
2000 }
2001
2002 size_t
2003 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
2004 const wchar_t *src, size_t srcLen) const
2005 {
2006 if ( srcLen == wxNO_LEN )
2007 srcLen = wxWcslen(src) + 1;
2008
2009 srcLen *= BYTES_PER_CHAR;
2010
2011 if ( dst )
2012 {
2013 if ( dstLen < srcLen )
2014 return wxCONV_FAILED;
2015
2016 memcpy(dst, src, srcLen);
2017 }
2018
2019 return srcLen;
2020 }
2021
2022 // ----------------------------------------------------------------------------
2023 // endian-reversing conversions
2024 // ----------------------------------------------------------------------------
2025
2026 size_t
2027 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
2028 const char *src, size_t srcLen) const
2029 {
2030 srcLen = GetLength(src, srcLen);
2031 if ( srcLen == wxNO_LEN )
2032 return wxCONV_FAILED;
2033
2034 srcLen /= BYTES_PER_CHAR;
2035
2036 if ( dst )
2037 {
2038 if ( dstLen < srcLen )
2039 return wxCONV_FAILED;
2040
2041 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
2042 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
2043 {
2044 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
2045 }
2046 }
2047
2048 return srcLen;
2049 }
2050
2051 size_t
2052 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2053 const wchar_t *src, size_t srcLen) const
2054 {
2055 if ( srcLen == wxNO_LEN )
2056 srcLen = wxWcslen(src) + 1;
2057
2058 srcLen *= BYTES_PER_CHAR;
2059
2060 if ( dst )
2061 {
2062 if ( dstLen < srcLen )
2063 return wxCONV_FAILED;
2064
2065 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2066 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2067 {
2068 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2069 }
2070 }
2071
2072 return srcLen;
2073 }
2074
2075 #endif // WC_UTF16/!WC_UTF16
2076
2077
2078 // ============================================================================
2079 // The classes doing conversion using the iconv_xxx() functions
2080 // ============================================================================
2081
2082 #ifdef HAVE_ICONV
2083
2084 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2085 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2086 // (unless there's yet another bug in glibc) the only case when iconv()
2087 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2088 // left in the input buffer -- when _real_ error occurs,
2089 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2090 // iconv() failure.
2091 // [This bug does not appear in glibc 2.2.]
2092 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2093 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2094 (errno != E2BIG || bufLeft != 0))
2095 #else
2096 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2097 #endif
2098
2099 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2100
2101 #define ICONV_T_INVALID ((iconv_t)-1)
2102
2103 #if SIZEOF_WCHAR_T == 4
2104 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2105 #define WC_ENC wxFONTENCODING_UTF32
2106 #elif SIZEOF_WCHAR_T == 2
2107 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2108 #define WC_ENC wxFONTENCODING_UTF16
2109 #else // sizeof(wchar_t) != 2 nor 4
2110 // does this ever happen?
2111 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2112 #endif
2113
2114 // ----------------------------------------------------------------------------
2115 // wxMBConv_iconv: encapsulates an iconv character set
2116 // ----------------------------------------------------------------------------
2117
2118 class wxMBConv_iconv : public wxMBConv
2119 {
2120 public:
2121 wxMBConv_iconv(const char *name);
2122 virtual ~wxMBConv_iconv();
2123
2124 // implement base class virtual methods
2125 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2126 const char *src, size_t srcLen = wxNO_LEN) const;
2127 virtual size_t FromWChar(char *dst, size_t dstLen,
2128 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2129 virtual size_t GetMBNulLen() const;
2130
2131 #if wxUSE_UNICODE_UTF8
2132 virtual bool IsUTF8() const;
2133 #endif
2134
2135 virtual wxMBConv *Clone() const
2136 {
2137 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
2138 p->m_minMBCharWidth = m_minMBCharWidth;
2139 return p;
2140 }
2141
2142 bool IsOk() const
2143 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2144
2145 protected:
2146 // the iconv handlers used to translate from multibyte
2147 // to wide char and in the other direction
2148 iconv_t m2w,
2149 w2m;
2150
2151 #if wxUSE_THREADS
2152 // guards access to m2w and w2m objects
2153 wxMutex m_iconvMutex;
2154 #endif
2155
2156 private:
2157 // the name (for iconv_open()) of a wide char charset -- if none is
2158 // available on this machine, it will remain NULL
2159 static wxString ms_wcCharsetName;
2160
2161 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2162 // different endian-ness than the native one
2163 static bool ms_wcNeedsSwap;
2164
2165
2166 // name of the encoding handled by this conversion
2167 const char *m_name;
2168
2169 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2170 // initially
2171 size_t m_minMBCharWidth;
2172 };
2173
2174 // make the constructor available for unit testing
2175 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2176 {
2177 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2178 if ( !result->IsOk() )
2179 {
2180 delete result;
2181 return 0;
2182 }
2183
2184 return result;
2185 }
2186
2187 wxString wxMBConv_iconv::ms_wcCharsetName;
2188 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2189
2190 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2191 : m_name(wxStrdup(name))
2192 {
2193 m_minMBCharWidth = 0;
2194
2195 // check for charset that represents wchar_t:
2196 if ( ms_wcCharsetName.empty() )
2197 {
2198 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2199
2200 #if wxUSE_FONTMAP
2201 const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2202 #else // !wxUSE_FONTMAP
2203 static const wxChar *const names_static[] =
2204 {
2205 #if SIZEOF_WCHAR_T == 4
2206 wxT("UCS-4"),
2207 #elif SIZEOF_WCHAR_T == 2
2208 wxT("UCS-2"),
2209 #endif
2210 NULL
2211 };
2212 const wxChar *const *names = names_static;
2213 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2214
2215 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2216 {
2217 const wxString nameCS(*names);
2218
2219 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2220 wxString nameXE(nameCS);
2221
2222 #ifdef WORDS_BIGENDIAN
2223 nameXE += wxT("BE");
2224 #else // little endian
2225 nameXE += wxT("LE");
2226 #endif
2227
2228 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2229 nameXE.c_str());
2230
2231 m2w = iconv_open(nameXE.ToAscii(), name);
2232 if ( m2w == ICONV_T_INVALID )
2233 {
2234 // try charset w/o bytesex info (e.g. "UCS4")
2235 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2236 nameCS.c_str());
2237 m2w = iconv_open(nameCS.ToAscii(), name);
2238
2239 // and check for bytesex ourselves:
2240 if ( m2w != ICONV_T_INVALID )
2241 {
2242 char buf[2], *bufPtr;
2243 wchar_t wbuf[2];
2244 size_t insz, outsz;
2245 size_t res;
2246
2247 buf[0] = 'A';
2248 buf[1] = 0;
2249 wbuf[0] = 0;
2250 insz = 2;
2251 outsz = SIZEOF_WCHAR_T * 2;
2252 char* wbufPtr = (char*)wbuf;
2253 bufPtr = buf;
2254
2255 res = iconv(
2256 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2257 &wbufPtr, &outsz);
2258
2259 if (ICONV_FAILED(res, insz))
2260 {
2261 wxLogLastError(wxT("iconv"));
2262 wxLogError(_("Conversion to charset '%s' doesn't work."),
2263 nameCS.c_str());
2264 }
2265 else // ok, can convert to this encoding, remember it
2266 {
2267 ms_wcCharsetName = nameCS;
2268 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2269 }
2270 }
2271 }
2272 else // use charset not requiring byte swapping
2273 {
2274 ms_wcCharsetName = nameXE;
2275 }
2276 }
2277
2278 wxLogTrace(TRACE_STRCONV,
2279 wxT("iconv wchar_t charset is \"%s\"%s"),
2280 ms_wcCharsetName.empty() ? wxString("<none>")
2281 : ms_wcCharsetName,
2282 ms_wcNeedsSwap ? wxT(" (needs swap)")
2283 : wxT(""));
2284 }
2285 else // we already have ms_wcCharsetName
2286 {
2287 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2288 }
2289
2290 if ( ms_wcCharsetName.empty() )
2291 {
2292 w2m = ICONV_T_INVALID;
2293 }
2294 else
2295 {
2296 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2297 if ( w2m == ICONV_T_INVALID )
2298 {
2299 wxLogTrace(TRACE_STRCONV,
2300 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2301 ms_wcCharsetName.c_str(), name);
2302 }
2303 }
2304 }
2305
2306 wxMBConv_iconv::~wxMBConv_iconv()
2307 {
2308 free(const_cast<char *>(m_name));
2309
2310 if ( m2w != ICONV_T_INVALID )
2311 iconv_close(m2w);
2312 if ( w2m != ICONV_T_INVALID )
2313 iconv_close(w2m);
2314 }
2315
2316 size_t
2317 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2318 const char *src, size_t srcLen) const
2319 {
2320 if ( srcLen == wxNO_LEN )
2321 {
2322 // find the string length: notice that must be done differently for
2323 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2324 // consecutive NULs
2325 const size_t nulLen = GetMBNulLen();
2326 switch ( nulLen )
2327 {
2328 default:
2329 return wxCONV_FAILED;
2330
2331 case 1:
2332 srcLen = strlen(src); // arguably more optimized than our version
2333 break;
2334
2335 case 2:
2336 case 4:
2337 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2338 // but they also have to start at character boundary and not
2339 // span two adjacent characters
2340 const char *p;
2341 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2342 ;
2343 srcLen = p - src;
2344 break;
2345 }
2346
2347 // when we're determining the length of the string ourselves we count
2348 // the terminating NUL(s) as part of it and always NUL-terminate the
2349 // output
2350 srcLen += nulLen;
2351 }
2352
2353 // we express length in the number of (wide) characters but iconv always
2354 // counts buffer sizes it in bytes
2355 dstLen *= SIZEOF_WCHAR_T;
2356
2357 #if wxUSE_THREADS
2358 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2359 // Unfortunately there are a couple of global wxCSConv objects such as
2360 // wxConvLocal that are used all over wx code, so we have to make sure
2361 // the handle is used by at most one thread at the time. Otherwise
2362 // only a few wx classes would be safe to use from non-main threads
2363 // as MB<->WC conversion would fail "randomly".
2364 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2365 #endif // wxUSE_THREADS
2366
2367 size_t res, cres;
2368 const char *pszPtr = src;
2369
2370 if ( dst )
2371 {
2372 char* bufPtr = (char*)dst;
2373
2374 // have destination buffer, convert there
2375 size_t dstLenOrig = dstLen;
2376 cres = iconv(m2w,
2377 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2378 &bufPtr, &dstLen);
2379
2380 // convert the number of bytes converted as returned by iconv to the
2381 // number of (wide) characters converted that we need
2382 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2383
2384 if (ms_wcNeedsSwap)
2385 {
2386 // convert to native endianness
2387 for ( unsigned i = 0; i < res; i++ )
2388 dst[i] = WC_BSWAP(dst[i]);
2389 }
2390 }
2391 else // no destination buffer
2392 {
2393 // convert using temp buffer to calculate the size of the buffer needed
2394 wchar_t tbuf[256];
2395 res = 0;
2396
2397 do
2398 {
2399 char* bufPtr = (char*)tbuf;
2400 dstLen = 8 * SIZEOF_WCHAR_T;
2401
2402 cres = iconv(m2w,
2403 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2404 &bufPtr, &dstLen );
2405
2406 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2407 }
2408 while ((cres == (size_t)-1) && (errno == E2BIG));
2409 }
2410
2411 if (ICONV_FAILED(cres, srcLen))
2412 {
2413 //VS: it is ok if iconv fails, hence trace only
2414 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2415 return wxCONV_FAILED;
2416 }
2417
2418 return res;
2419 }
2420
2421 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2422 const wchar_t *src, size_t srcLen) const
2423 {
2424 #if wxUSE_THREADS
2425 // NB: explained in MB2WC
2426 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2427 #endif
2428
2429 if ( srcLen == wxNO_LEN )
2430 srcLen = wxWcslen(src) + 1;
2431
2432 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2433 size_t outbuflen = dstLen;
2434 size_t res, cres;
2435
2436 wchar_t *tmpbuf = 0;
2437
2438 if (ms_wcNeedsSwap)
2439 {
2440 // need to copy to temp buffer to switch endianness
2441 // (doing WC_BSWAP twice on the original buffer won't work, as it
2442 // could be in read-only memory, or be accessed in some other thread)
2443 tmpbuf = (wchar_t *)malloc(inbuflen);
2444 for ( size_t i = 0; i < srcLen; i++ )
2445 tmpbuf[i] = WC_BSWAP(src[i]);
2446
2447 src = tmpbuf;
2448 }
2449
2450 char* inbuf = (char*)src;
2451 if ( dst )
2452 {
2453 // have destination buffer, convert there
2454 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2455
2456 res = dstLen - outbuflen;
2457 }
2458 else // no destination buffer
2459 {
2460 // convert using temp buffer to calculate the size of the buffer needed
2461 char tbuf[256];
2462 res = 0;
2463 do
2464 {
2465 dst = tbuf;
2466 outbuflen = WXSIZEOF(tbuf);
2467
2468 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2469
2470 res += WXSIZEOF(tbuf) - outbuflen;
2471 }
2472 while ((cres == (size_t)-1) && (errno == E2BIG));
2473 }
2474
2475 if (ms_wcNeedsSwap)
2476 {
2477 free(tmpbuf);
2478 }
2479
2480 if (ICONV_FAILED(cres, inbuflen))
2481 {
2482 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2483 return wxCONV_FAILED;
2484 }
2485
2486 return res;
2487 }
2488
2489 size_t wxMBConv_iconv::GetMBNulLen() const
2490 {
2491 if ( m_minMBCharWidth == 0 )
2492 {
2493 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2494
2495 #if wxUSE_THREADS
2496 // NB: explained in MB2WC
2497 wxMutexLocker lock(self->m_iconvMutex);
2498 #endif
2499
2500 const wchar_t *wnul = L"";
2501 char buf[8]; // should be enough for NUL in any encoding
2502 size_t inLen = sizeof(wchar_t),
2503 outLen = WXSIZEOF(buf);
2504 char *inBuff = (char *)wnul;
2505 char *outBuff = buf;
2506 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2507 {
2508 self->m_minMBCharWidth = (size_t)-1;
2509 }
2510 else // ok
2511 {
2512 self->m_minMBCharWidth = outBuff - buf;
2513 }
2514 }
2515
2516 return m_minMBCharWidth;
2517 }
2518
2519 #if wxUSE_UNICODE_UTF8
2520 bool wxMBConv_iconv::IsUTF8() const
2521 {
2522 return wxStricmp(m_name, "UTF-8") == 0 ||
2523 wxStricmp(m_name, "UTF8") == 0;
2524 }
2525 #endif
2526
2527 #endif // HAVE_ICONV
2528
2529
2530 // ============================================================================
2531 // Win32 conversion classes
2532 // ============================================================================
2533
2534 #ifdef wxHAVE_WIN32_MB2WC
2535
2536 // from utils.cpp
2537 #if wxUSE_FONTMAP
2538 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2539 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2540 #endif
2541
2542 class wxMBConv_win32 : public wxMBConv
2543 {
2544 public:
2545 wxMBConv_win32()
2546 {
2547 m_CodePage = CP_ACP;
2548 m_minMBCharWidth = 0;
2549 }
2550
2551 wxMBConv_win32(const wxMBConv_win32& conv)
2552 : wxMBConv()
2553 {
2554 m_CodePage = conv.m_CodePage;
2555 m_minMBCharWidth = conv.m_minMBCharWidth;
2556 }
2557
2558 #if wxUSE_FONTMAP
2559 wxMBConv_win32(const char* name)
2560 {
2561 m_CodePage = wxCharsetToCodepage(name);
2562 m_minMBCharWidth = 0;
2563 }
2564
2565 wxMBConv_win32(wxFontEncoding encoding)
2566 {
2567 m_CodePage = wxEncodingToCodepage(encoding);
2568 m_minMBCharWidth = 0;
2569 }
2570 #endif // wxUSE_FONTMAP
2571
2572 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2573 {
2574 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2575 // the behaviour is not compatible with the Unix version (using iconv)
2576 // and break the library itself, e.g. wxTextInputStream::NextChar()
2577 // wouldn't work if reading an incomplete MB char didn't result in an
2578 // error
2579 //
2580 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2581 // Win XP or newer and it is not supported for UTF-[78] so we always
2582 // use our own conversions in this case. See
2583 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2584 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2585 if ( m_CodePage == CP_UTF8 )
2586 {
2587 return wxMBConvUTF8().MB2WC(buf, psz, n);
2588 }
2589
2590 if ( m_CodePage == CP_UTF7 )
2591 {
2592 return wxMBConvUTF7().MB2WC(buf, psz, n);
2593 }
2594
2595 int flags = 0;
2596 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2597 IsAtLeastWin2kSP4() )
2598 {
2599 flags = MB_ERR_INVALID_CHARS;
2600 }
2601
2602 const size_t len = ::MultiByteToWideChar
2603 (
2604 m_CodePage, // code page
2605 flags, // flags: fall on error
2606 psz, // input string
2607 -1, // its length (NUL-terminated)
2608 buf, // output string
2609 buf ? n : 0 // size of output buffer
2610 );
2611 if ( !len )
2612 {
2613 // function totally failed
2614 return wxCONV_FAILED;
2615 }
2616
2617 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2618 // check if we succeeded, by doing a double trip:
2619 if ( !flags && buf )
2620 {
2621 const size_t mbLen = strlen(psz);
2622 wxCharBuffer mbBuf(mbLen);
2623 if ( ::WideCharToMultiByte
2624 (
2625 m_CodePage,
2626 0,
2627 buf,
2628 -1,
2629 mbBuf.data(),
2630 mbLen + 1, // size in bytes, not length
2631 NULL,
2632 NULL
2633 ) == 0 ||
2634 strcmp(mbBuf, psz) != 0 )
2635 {
2636 // we didn't obtain the same thing we started from, hence
2637 // the conversion was lossy and we consider that it failed
2638 return wxCONV_FAILED;
2639 }
2640 }
2641
2642 // note that it returns count of written chars for buf != NULL and size
2643 // of the needed buffer for buf == NULL so in either case the length of
2644 // the string (which never includes the terminating NUL) is one less
2645 return len - 1;
2646 }
2647
2648 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2649 {
2650 /*
2651 we have a problem here: by default, WideCharToMultiByte() may
2652 replace characters unrepresentable in the target code page with bad
2653 quality approximations such as turning "1/2" symbol (U+00BD) into
2654 "1" for the code pages which don't have it and we, obviously, want
2655 to avoid this at any price
2656
2657 the trouble is that this function does it _silently_, i.e. it won't
2658 even tell us whether it did or not... Win98/2000 and higher provide
2659 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2660 we have to resort to a round trip, i.e. check that converting back
2661 results in the same string -- this is, of course, expensive but
2662 otherwise we simply can't be sure to not garble the data.
2663 */
2664
2665 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2666 // it doesn't work with CJK encodings (which we test for rather roughly
2667 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2668 // supporting it
2669 BOOL usedDef wxDUMMY_INITIALIZE(false);
2670 BOOL *pUsedDef;
2671 int flags;
2672 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2673 {
2674 // it's our lucky day
2675 flags = WC_NO_BEST_FIT_CHARS;
2676 pUsedDef = &usedDef;
2677 }
2678 else // old system or unsupported encoding
2679 {
2680 flags = 0;
2681 pUsedDef = NULL;
2682 }
2683
2684 const size_t len = ::WideCharToMultiByte
2685 (
2686 m_CodePage, // code page
2687 flags, // either none or no best fit
2688 pwz, // input string
2689 -1, // it is (wide) NUL-terminated
2690 buf, // output buffer
2691 buf ? n : 0, // and its size
2692 NULL, // default "replacement" char
2693 pUsedDef // [out] was it used?
2694 );
2695
2696 if ( !len )
2697 {
2698 // function totally failed
2699 return wxCONV_FAILED;
2700 }
2701
2702 // we did something, check if we really succeeded
2703 if ( flags )
2704 {
2705 // check if the conversion failed, i.e. if any replacements
2706 // were done
2707 if ( usedDef )
2708 return wxCONV_FAILED;
2709 }
2710 else // we must resort to double tripping...
2711 {
2712 // first we need to ensure that we really have the MB data: this is
2713 // not the case if we're called with NULL buffer, in which case we
2714 // need to do the conversion yet again
2715 wxCharBuffer bufDef;
2716 if ( !buf )
2717 {
2718 bufDef = wxCharBuffer(len);
2719 buf = bufDef.data();
2720 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2721 buf, len, NULL, NULL) )
2722 return wxCONV_FAILED;
2723 }
2724
2725 if ( !n )
2726 n = wcslen(pwz);
2727 wxWCharBuffer wcBuf(n);
2728 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2729 wcscmp(wcBuf, pwz) != 0 )
2730 {
2731 // we didn't obtain the same thing we started from, hence
2732 // the conversion was lossy and we consider that it failed
2733 return wxCONV_FAILED;
2734 }
2735 }
2736
2737 // see the comment above for the reason of "len - 1"
2738 return len - 1;
2739 }
2740
2741 virtual size_t GetMBNulLen() const
2742 {
2743 if ( m_minMBCharWidth == 0 )
2744 {
2745 int len = ::WideCharToMultiByte
2746 (
2747 m_CodePage, // code page
2748 0, // no flags
2749 L"", // input string
2750 1, // translate just the NUL
2751 NULL, // output buffer
2752 0, // and its size
2753 NULL, // no replacement char
2754 NULL // [out] don't care if it was used
2755 );
2756
2757 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2758 switch ( len )
2759 {
2760 default:
2761 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2762 self->m_minMBCharWidth = (size_t)-1;
2763 break;
2764
2765 case 0:
2766 self->m_minMBCharWidth = (size_t)-1;
2767 break;
2768
2769 case 1:
2770 case 2:
2771 case 4:
2772 self->m_minMBCharWidth = len;
2773 break;
2774 }
2775 }
2776
2777 return m_minMBCharWidth;
2778 }
2779
2780 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2781
2782 bool IsOk() const { return m_CodePage != -1; }
2783
2784 private:
2785 static bool CanUseNoBestFit()
2786 {
2787 static int s_isWin98Or2k = -1;
2788
2789 if ( s_isWin98Or2k == -1 )
2790 {
2791 int verMaj, verMin;
2792 switch ( wxGetOsVersion(&verMaj, &verMin) )
2793 {
2794 case wxOS_WINDOWS_9X:
2795 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2796 break;
2797
2798 case wxOS_WINDOWS_NT:
2799 s_isWin98Or2k = verMaj >= 5;
2800 break;
2801
2802 default:
2803 // unknown: be conservative by default
2804 s_isWin98Or2k = 0;
2805 break;
2806 }
2807
2808 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2809 }
2810
2811 return s_isWin98Or2k == 1;
2812 }
2813
2814 static bool IsAtLeastWin2kSP4()
2815 {
2816 #ifdef __WXWINCE__
2817 return false;
2818 #else
2819 static int s_isAtLeastWin2kSP4 = -1;
2820
2821 if ( s_isAtLeastWin2kSP4 == -1 )
2822 {
2823 OSVERSIONINFOEX ver;
2824
2825 memset(&ver, 0, sizeof(ver));
2826 ver.dwOSVersionInfoSize = sizeof(ver);
2827 GetVersionEx((OSVERSIONINFO*)&ver);
2828
2829 s_isAtLeastWin2kSP4 =
2830 ((ver.dwMajorVersion > 5) || // Vista+
2831 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2832 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2833 ver.wServicePackMajor >= 4)) // 2000 SP4+
2834 ? 1 : 0;
2835 }
2836
2837 return s_isAtLeastWin2kSP4 == 1;
2838 #endif
2839 }
2840
2841
2842 // the code page we're working with
2843 long m_CodePage;
2844
2845 // cached result of GetMBNulLen(), set to 0 initially meaning
2846 // "unknown"
2847 size_t m_minMBCharWidth;
2848 };
2849
2850 #endif // wxHAVE_WIN32_MB2WC
2851
2852
2853 // ============================================================================
2854 // wxEncodingConverter based conversion classes
2855 // ============================================================================
2856
2857 #if wxUSE_FONTMAP
2858
2859 class wxMBConv_wxwin : public wxMBConv
2860 {
2861 private:
2862 void Init()
2863 {
2864 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2865 // The wxMBConv_cf class does a better job.
2866 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2867 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2868 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2869 }
2870
2871 public:
2872 // temporarily just use wxEncodingConverter stuff,
2873 // so that it works while a better implementation is built
2874 wxMBConv_wxwin(const char* name)
2875 {
2876 if (name)
2877 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2878 else
2879 m_enc = wxFONTENCODING_SYSTEM;
2880
2881 Init();
2882 }
2883
2884 wxMBConv_wxwin(wxFontEncoding enc)
2885 {
2886 m_enc = enc;
2887
2888 Init();
2889 }
2890
2891 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2892 {
2893 size_t inbuf = strlen(psz);
2894 if (buf)
2895 {
2896 if (!m2w.Convert(psz, buf))
2897 return wxCONV_FAILED;
2898 }
2899 return inbuf;
2900 }
2901
2902 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2903 {
2904 const size_t inbuf = wxWcslen(psz);
2905 if (buf)
2906 {
2907 if (!w2m.Convert(psz, buf))
2908 return wxCONV_FAILED;
2909 }
2910
2911 return inbuf;
2912 }
2913
2914 virtual size_t GetMBNulLen() const
2915 {
2916 switch ( m_enc )
2917 {
2918 case wxFONTENCODING_UTF16BE:
2919 case wxFONTENCODING_UTF16LE:
2920 return 2;
2921
2922 case wxFONTENCODING_UTF32BE:
2923 case wxFONTENCODING_UTF32LE:
2924 return 4;
2925
2926 default:
2927 return 1;
2928 }
2929 }
2930
2931 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2932
2933 bool IsOk() const { return m_ok; }
2934
2935 public:
2936 wxFontEncoding m_enc;
2937 wxEncodingConverter m2w, w2m;
2938
2939 private:
2940 // were we initialized successfully?
2941 bool m_ok;
2942
2943 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2944 };
2945
2946 // make the constructors available for unit testing
2947 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2948 {
2949 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2950 if ( !result->IsOk() )
2951 {
2952 delete result;
2953 return 0;
2954 }
2955
2956 return result;
2957 }
2958
2959 #endif // wxUSE_FONTMAP
2960
2961 // ============================================================================
2962 // wxCSConv implementation
2963 // ============================================================================
2964
2965 void wxCSConv::Init()
2966 {
2967 m_name = NULL;
2968 m_convReal = NULL;
2969 }
2970
2971 void wxCSConv::SetEncoding(wxFontEncoding encoding)
2972 {
2973 switch ( encoding )
2974 {
2975 case wxFONTENCODING_MAX:
2976 case wxFONTENCODING_SYSTEM:
2977 if ( m_name )
2978 {
2979 // It's ok to not have encoding value if we have a name for it.
2980 m_encoding = wxFONTENCODING_SYSTEM;
2981 }
2982 else // No name neither.
2983 {
2984 // Fall back to the system default encoding in this case (not
2985 // sure how much sense does this make but this is how the old
2986 // code used to behave).
2987 #if wxUSE_INTL
2988 m_encoding = wxLocale::GetSystemEncoding();
2989 if ( m_encoding == wxFONTENCODING_SYSTEM )
2990 #endif // wxUSE_INTL
2991 m_encoding = wxFONTENCODING_ISO8859_1;
2992 }
2993 break;
2994
2995 case wxFONTENCODING_DEFAULT:
2996 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2997 m_encoding = wxFONTENCODING_ISO8859_1;
2998 break;
2999
3000 default:
3001 // Just use the provided encoding.
3002 m_encoding = encoding;
3003 }
3004 }
3005
3006 wxCSConv::wxCSConv(const wxString& charset)
3007 {
3008 Init();
3009
3010 if ( !charset.empty() )
3011 {
3012 SetName(charset.ToAscii());
3013 }
3014
3015 #if wxUSE_FONTMAP
3016 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset));
3017 #else
3018 SetEncoding(wxFONTENCODING_SYSTEM);
3019 #endif
3020
3021 m_convReal = DoCreate();
3022 }
3023
3024 wxCSConv::wxCSConv(wxFontEncoding encoding)
3025 {
3026 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3027 {
3028 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3029
3030 encoding = wxFONTENCODING_SYSTEM;
3031 }
3032
3033 Init();
3034
3035 SetEncoding(encoding);
3036
3037 m_convReal = DoCreate();
3038 }
3039
3040 wxCSConv::~wxCSConv()
3041 {
3042 Clear();
3043 }
3044
3045 wxCSConv::wxCSConv(const wxCSConv& conv)
3046 : wxMBConv()
3047 {
3048 Init();
3049
3050 SetName(conv.m_name);
3051 SetEncoding(conv.m_encoding);
3052
3053 m_convReal = DoCreate();
3054 }
3055
3056 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3057 {
3058 Clear();
3059
3060 SetName(conv.m_name);
3061 SetEncoding(conv.m_encoding);
3062
3063 m_convReal = DoCreate();
3064
3065 return *this;
3066 }
3067
3068 void wxCSConv::Clear()
3069 {
3070 free(m_name);
3071 m_name = NULL;
3072
3073 wxDELETE(m_convReal);
3074 }
3075
3076 void wxCSConv::SetName(const char *charset)
3077 {
3078 if ( charset )
3079 m_name = wxStrdup(charset);
3080 }
3081
3082 #if wxUSE_FONTMAP
3083
3084 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3085 wxEncodingNameCache );
3086
3087 static wxEncodingNameCache gs_nameCache;
3088 #endif
3089
3090 wxMBConv *wxCSConv::DoCreate() const
3091 {
3092 #if wxUSE_FONTMAP
3093 wxLogTrace(TRACE_STRCONV,
3094 wxT("creating conversion for %s"),
3095 (m_name ? m_name
3096 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3097 #endif // wxUSE_FONTMAP
3098
3099 // check for the special case of ASCII or ISO8859-1 charset: as we have
3100 // special knowledge of it anyhow, we don't need to create a special
3101 // conversion object
3102 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3103 {
3104 // don't convert at all
3105 return NULL;
3106 }
3107
3108 // we trust OS to do conversion better than we can so try external
3109 // conversion methods first
3110 //
3111 // the full order is:
3112 // 1. OS conversion (iconv() under Unix or Win32 API)
3113 // 2. hard coded conversions for UTF
3114 // 3. wxEncodingConverter as fall back
3115
3116 // step (1)
3117 #ifdef HAVE_ICONV
3118 #if !wxUSE_FONTMAP
3119 if ( m_name )
3120 #endif // !wxUSE_FONTMAP
3121 {
3122 #if wxUSE_FONTMAP
3123 wxFontEncoding encoding(m_encoding);
3124 #endif
3125
3126 if ( m_name )
3127 {
3128 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3129 if ( conv->IsOk() )
3130 return conv;
3131
3132 delete conv;
3133
3134 #if wxUSE_FONTMAP
3135 encoding =
3136 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3137 #endif // wxUSE_FONTMAP
3138 }
3139 #if wxUSE_FONTMAP
3140 {
3141 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3142 if ( it != gs_nameCache.end() )
3143 {
3144 if ( it->second.empty() )
3145 return NULL;
3146
3147 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3148 if ( conv->IsOk() )
3149 return conv;
3150
3151 delete conv;
3152 }
3153
3154 const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
3155 // CS : in case this does not return valid names (eg for MacRoman)
3156 // encoding got a 'failure' entry in the cache all the same,
3157 // although it just has to be created using a different method, so
3158 // only store failed iconv creation attempts (or perhaps we
3159 // shoulnd't do this at all ?)
3160 if ( names[0] != NULL )
3161 {
3162 for ( ; *names; ++names )
3163 {
3164 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3165 // will need changes that will obsolete this
3166 wxString name(*names);
3167 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3168 if ( conv->IsOk() )
3169 {
3170 gs_nameCache[encoding] = *names;
3171 return conv;
3172 }
3173
3174 delete conv;
3175 }
3176
3177 gs_nameCache[encoding] = wxT(""); // cache the failure
3178 }
3179 }
3180 #endif // wxUSE_FONTMAP
3181 }
3182 #endif // HAVE_ICONV
3183
3184 #ifdef wxHAVE_WIN32_MB2WC
3185 {
3186 #if wxUSE_FONTMAP
3187 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3188 : new wxMBConv_win32(m_encoding);
3189 if ( conv->IsOk() )
3190 return conv;
3191
3192 delete conv;
3193 #else
3194 return NULL;
3195 #endif
3196 }
3197 #endif // wxHAVE_WIN32_MB2WC
3198
3199 #ifdef __DARWIN__
3200 {
3201 // leave UTF16 and UTF32 to the built-ins of wx
3202 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3203 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3204 {
3205 #if wxUSE_FONTMAP
3206 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3207 : new wxMBConv_cf(m_encoding);
3208 #else
3209 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3210 #endif
3211
3212 if ( conv->IsOk() )
3213 return conv;
3214
3215 delete conv;
3216 }
3217 }
3218 #endif // __DARWIN__
3219
3220 // step (2)
3221 wxFontEncoding enc = m_encoding;
3222 #if wxUSE_FONTMAP
3223 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3224 {
3225 // use "false" to suppress interactive dialogs -- we can be called from
3226 // anywhere and popping up a dialog from here is the last thing we want to
3227 // do
3228 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3229 }
3230 #endif // wxUSE_FONTMAP
3231
3232 switch ( enc )
3233 {
3234 case wxFONTENCODING_UTF7:
3235 return new wxMBConvUTF7;
3236
3237 case wxFONTENCODING_UTF8:
3238 return new wxMBConvUTF8;
3239
3240 case wxFONTENCODING_UTF16BE:
3241 return new wxMBConvUTF16BE;
3242
3243 case wxFONTENCODING_UTF16LE:
3244 return new wxMBConvUTF16LE;
3245
3246 case wxFONTENCODING_UTF32BE:
3247 return new wxMBConvUTF32BE;
3248
3249 case wxFONTENCODING_UTF32LE:
3250 return new wxMBConvUTF32LE;
3251
3252 default:
3253 // nothing to do but put here to suppress gcc warnings
3254 break;
3255 }
3256
3257 // step (3)
3258 #if wxUSE_FONTMAP
3259 {
3260 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3261 : new wxMBConv_wxwin(m_encoding);
3262 if ( conv->IsOk() )
3263 return conv;
3264
3265 delete conv;
3266 }
3267
3268 wxLogTrace(TRACE_STRCONV,
3269 wxT("encoding \"%s\" is not supported by this system"),
3270 (m_name ? wxString(m_name)
3271 : wxFontMapperBase::GetEncodingName(m_encoding)));
3272 #endif // wxUSE_FONTMAP
3273
3274 return NULL;
3275 }
3276
3277 bool wxCSConv::IsOk() const
3278 {
3279 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3280 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3281 return true; // always ok as we do it ourselves
3282
3283 // m_convReal->IsOk() is called at its own creation, so we know it must
3284 // be ok if m_convReal is non-NULL
3285 return m_convReal != NULL;
3286 }
3287
3288 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3289 const char *src, size_t srcLen) const
3290 {
3291 if (m_convReal)
3292 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3293
3294 // latin-1 (direct)
3295 if ( srcLen == wxNO_LEN )
3296 srcLen = strlen(src) + 1; // take trailing NUL too
3297
3298 if ( dst )
3299 {
3300 if ( dstLen < srcLen )
3301 return wxCONV_FAILED;
3302
3303 for ( size_t n = 0; n < srcLen; n++ )
3304 dst[n] = (unsigned char)(src[n]);
3305 }
3306
3307 return srcLen;
3308 }
3309
3310 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3311 const wchar_t *src, size_t srcLen) const
3312 {
3313 if (m_convReal)
3314 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3315
3316 // latin-1 (direct)
3317 if ( srcLen == wxNO_LEN )
3318 srcLen = wxWcslen(src) + 1;
3319
3320 if ( dst )
3321 {
3322 if ( dstLen < srcLen )
3323 return wxCONV_FAILED;
3324
3325 for ( size_t n = 0; n < srcLen; n++ )
3326 {
3327 if ( src[n] > 0xFF )
3328 return wxCONV_FAILED;
3329
3330 dst[n] = (char)src[n];
3331 }
3332
3333 }
3334 else // still need to check the input validity
3335 {
3336 for ( size_t n = 0; n < srcLen; n++ )
3337 {
3338 if ( src[n] > 0xFF )
3339 return wxCONV_FAILED;
3340 }
3341 }
3342
3343 return srcLen;
3344 }
3345
3346 size_t wxCSConv::GetMBNulLen() const
3347 {
3348 if ( m_convReal )
3349 return m_convReal->GetMBNulLen();
3350
3351 // otherwise, we are ISO-8859-1
3352 return 1;
3353 }
3354
3355 #if wxUSE_UNICODE_UTF8
3356 bool wxCSConv::IsUTF8() const
3357 {
3358 if ( m_convReal )
3359 return m_convReal->IsUTF8();
3360
3361 // otherwise, we are ISO-8859-1
3362 return false;
3363 }
3364 #endif
3365
3366
3367 #if wxUSE_UNICODE
3368
3369 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3370 {
3371 if ( !s )
3372 return wxWCharBuffer();
3373
3374 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3375 if ( !wbuf )
3376 wbuf = wxMBConvUTF8().cMB2WX(s);
3377 if ( !wbuf )
3378 wbuf = wxConvISO8859_1.cMB2WX(s);
3379
3380 return wbuf;
3381 }
3382
3383 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3384 {
3385 if ( !ws )
3386 return wxCharBuffer();
3387
3388 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3389 if ( !buf )
3390 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3391
3392 return buf;
3393 }
3394
3395 #endif // wxUSE_UNICODE
3396
3397 // ----------------------------------------------------------------------------
3398 // globals
3399 // ----------------------------------------------------------------------------
3400
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object), in other
//     words, possibly _before_ the converter global object would be
//     initialized.
3407
3408 #undef wxConvLibc
3409 #undef wxConvUTF8
3410 #undef wxConvUTF7
3411 #undef wxConvLocal
3412 #undef wxConvISO8859_1
3413
// Define a global converter: this expands into the definitions of
//  - the exported pointer name##Ptr, initialized to NULL,
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of class impl_klass constructed with
//    ctor_args,
//  - a file-static pointer initialized by calling this accessor.
//
// klass is the type used for the pointers and impl_klass the type of the
// object itself. The macro invocation must be followed by a semicolon.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */   \
    /* by the time static initialization is done, i.e. before any */   \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()
3425
// Simplified version of WX_DEFINE_GLOBAL_CONV2 for the common case when the
// object is of the same class as the one used for the pointers to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args)                   \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3428
3429 #ifdef __INTELC__
3430 // disable warning "variable 'xxx' was declared but never referenced"
3431 #pragma warning(disable: 177)
3432 #endif // Intel C++
3433
3434 #ifdef __WINDOWS__
3435 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3436 #elif 0 // defined(__WXOSX__)
3437 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
3438 #else
3439 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3440 #endif
3441
3442 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3443 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3444 // provokes an error message about "not enough macro parameters"; and we
3445 // can't use "()" here as the name##Obj declaration would be parsed as a
3446 // function declaration then, so use a semicolon and live with an extra
3447 // empty statement (and hope that no compilers warns about this)
3448 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3449 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3450
3451 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3452 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3453
3454 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3455 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3456
3457 #ifdef __DARWIN__
3458 // It is important to use this conversion object under Darwin as it ensures
3459 // that Unicode strings are (re)composed correctly even though xnu kernel uses
3460 // decomposed form internally (at least for the file names).
3461 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3462 #endif
3463
3464 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3465 #ifdef __DARWIN__
3466 &wxConvMacUTF8DObj;
3467 #else // !__DARWIN__
3468 wxGet_wxConvLibcPtr();
3469 #endif // __DARWIN__/!__DARWIN__