]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Merge SOC2009_FSWATCHER branch into trunk.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV wxT("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
73 // helper function of ToWChar(): returns true unless all n bytes at this location are NUL
// Examines the n bytes starting at p and returns true as soon as a non-NUL
// byte is found among them; returns false when all n bytes are '\0' (which
// includes the degenerate case n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; ++i )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
// Default implementation of ToWChar() built on top of MB2WC().
//
// dst/dstLen: output buffer and its size in wide characters; if dst is NULL
//             nothing is stored and only the required length is computed
// src/srcLen: input bytes and their count, or wxNO_LEN if src is
//             NUL-terminated
//
// Returns the number of wide characters written (or which would have been
// written) or wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if the output
// doesn't fit into dstLen characters.
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168 //
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
// (the srcLen < nulLen test comes first so that src + srcLen - nulLen is
// only evaluated when it doesn't point before the start of the input)
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
214 // with the result
215 //
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
222 // string
223 //
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
//
// NOTE(review): the two paragraphs above don't use "former"/"latter"
// consistently; what the loop below actually does is: when srcEnd is NULL
// (NUL-terminated input) the terminating NUL is counted immediately and only
// one chunk is converted, otherwise a NUL is counted only if it is found
// before srcEnd
227 for ( ;; )
228 {
229 // try to convert the current chunk
230 size_t lenChunk = MB2WC(NULL, src, 0);
231 if ( lenChunk == wxCONV_FAILED )
232 return wxCONV_FAILED;
233
234 dstWritten += lenChunk;
235 if ( !srcEnd )
236 dstWritten++;
237
238 if ( !lenChunk )
239 {
240 // nothing left in the input string, conversion succeeded
241 break;
242 }
243
244 if ( dst )
245 {
246 if ( dstWritten > dstLen )
247 return wxCONV_FAILED;
248
249 // +1 is for trailing NUL
250 if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
251 return wxCONV_FAILED;
252
253 dst += lenChunk;
254 if ( !srcEnd )
255 dst++;
256 }
257
258 if ( !srcEnd )
259 {
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow (and we don't count the trailing NUL in this case)
262 break;
263 }
264
265 // advance the input pointer past the end of this chunk: notice that we
266 // will always stop before srcEnd because we know that the chunk is
267 // always properly NUL-terminated
268 while ( NotAllNULs(src, nulLen) )
269 {
270 // notice that we must skip over multiple bytes here as we suppose
271 // that if NUL takes 2 or 4 bytes, then all the other characters do
272 // too and so if advanced by a single byte we might erroneously
273 // detect sequences of NUL bytes in the middle of the input
274 src += nulLen;
275 }
276
277 // if the buffer ends before this NUL, we shouldn't count it in our
278 // output so skip the code below
279 if ( src == srcEnd )
280 break;
281
282 // do count this terminator as it's inside the buffer we convert
283 dstWritten++;
284 if ( dst )
285 dst++;
286
287 src += nulLen; // skip the terminator itself
288
289 if ( src >= srcEnd )
290 break;
291 }
292
293 return dstWritten;
294 }
295
296 size_t
297 wxMBConv::FromWChar(char *dst, size_t dstLen,
298 const wchar_t *src, size_t srcLen) const
299 {
300 // the number of chars [which would be] written to dst [if it were not NULL]
301 size_t dstWritten = 0;
302
303 // if we don't know its length we have no choice but to assume that it is
304 // NUL-terminated (notice that it can still be NUL-terminated even if
305 // explicit length is given but it doesn't change our return value)
306 const bool isNulTerminated = srcLen == wxNO_LEN;
307
308 // make a copy of the input string unless it is already properly
309 // NUL-terminated
310 wxWCharBuffer bufTmp;
311 if ( isNulTerminated )
312 {
313 srcLen = wxWcslen(src) + 1;
314 }
315 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
316 {
317 // make a copy in order to properly NUL-terminate the string
318 bufTmp = wxWCharBuffer(srcLen);
319 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
320 src = bufTmp;
321 }
322
323 const size_t lenNul = GetMBNulLen();
324 for ( const wchar_t * const srcEnd = src + srcLen;
325 src < srcEnd;
326 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
327 {
328 // try to convert the current chunk
329 size_t lenChunk = WC2MB(NULL, src, 0);
330
331 if ( lenChunk == wxCONV_FAILED )
332 return wxCONV_FAILED;
333
334 dstWritten += lenChunk;
335 if ( src + lenChunk < srcEnd || isNulTerminated )
336 dstWritten += lenNul;
337
338 if ( dst )
339 {
340 if ( dstWritten > dstLen )
341 return wxCONV_FAILED;
342
343 if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
344 return wxCONV_FAILED;
345
346 dst += lenChunk;
347 if ( src + lenChunk < srcEnd || isNulTerminated )
348 dst += lenNul;
349 }
350 }
351
352 return dstWritten;
353 }
354
355 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
356 {
357 size_t rc = ToWChar(outBuff, outLen, inBuff);
358 if ( rc != wxCONV_FAILED )
359 {
360 // ToWChar() returns the buffer length, i.e. including the trailing
361 // NUL, while this method doesn't take it into account
362 rc--;
363 }
364
365 return rc;
366 }
367
368 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
369 {
370 size_t rc = FromWChar(outBuff, outLen, inBuff);
371 if ( rc != wxCONV_FAILED )
372 {
373 rc -= GetMBNulLen();
374 }
375
376 return rc;
377 }
378
// Out-of-line definition of the destructor, the body is intentionally empty.
379 wxMBConv::~wxMBConv()
380 {
381 // nothing to do here (necessary for Darwin linking probably)
382 }
383
384 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
385 {
386 if ( psz )
387 {
388 // calculate the length of the buffer needed first
389 const size_t nLen = ToWChar(NULL, 0, psz);
390 if ( nLen != wxCONV_FAILED )
391 {
392 // now do the actual conversion
393 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
394
395 // +1 for the trailing NULL
396 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
397 return buf;
398 }
399 }
400
401 return wxWCharBuffer();
402 }
403
404 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
405 {
406 if ( pwz )
407 {
408 const size_t nLen = FromWChar(NULL, 0, pwz);
409 if ( nLen != wxCONV_FAILED )
410 {
411 wxCharBuffer buf(nLen - 1);
412 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
413 return buf;
414 }
415 }
416
417 return wxCharBuffer();
418 }
419
420 const wxWCharBuffer
421 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
422 {
423 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
424 if ( dstLen != wxCONV_FAILED )
425 {
426 // notice that we allocate space for dstLen+1 wide characters here
427 // because we want the buffer to always be NUL-terminated, even if the
428 // input isn't (as otherwise the caller has no way to know its length)
429 wxWCharBuffer wbuf(dstLen);
430 wbuf.data()[dstLen] = L'\0';
431 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
432 {
433 if ( outLen )
434 {
435 *outLen = dstLen;
436
437 // we also need to handle NUL-terminated input strings
438 // specially: for them the output is the length of the string
439 // excluding the trailing NUL, however if we're asked to
440 // convert a specific number of characters we return the length
441 // of the resulting output even if it's NUL-terminated
442 if ( inLen == wxNO_LEN )
443 (*outLen)--;
444 }
445
446 return wbuf;
447 }
448 }
449
450 if ( outLen )
451 *outLen = 0;
452
453 return wxWCharBuffer();
454 }
455
456 const wxCharBuffer
457 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
458 {
459 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
460 if ( dstLen != wxCONV_FAILED )
461 {
462 const size_t nulLen = GetMBNulLen();
463
464 // as above, ensure that the buffer is always NUL-terminated, even if
465 // the input is not
466 wxCharBuffer buf(dstLen + nulLen - 1);
467 memset(buf.data() + dstLen, 0, nulLen);
468 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
469 {
470 if ( outLen )
471 {
472 *outLen = dstLen;
473
474 if ( inLen == wxNO_LEN )
475 {
476 // in this case both input and output are NUL-terminated
477 // and we're not supposed to count NUL
478 *outLen -= nulLen;
479 }
480 }
481
482 return buf;
483 }
484 }
485
486 if ( outLen )
487 *outLen = 0;
488
489 return wxCharBuffer();
490 }
491
492 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
493 {
494 const size_t srcLen = buf.length();
495 if ( srcLen )
496 {
497 const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
498 if ( dstLen != wxCONV_FAILED )
499 {
500 wxWCharBuffer wbuf(dstLen);
501 wbuf.data()[dstLen] = L'\0';
502 if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
503 return wbuf;
504 }
505 }
506
507 return wxWCharBuffer();
508 }
509
510 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
511 {
512 const size_t srcLen = wbuf.length();
513 if ( srcLen )
514 {
515 const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
516 if ( dstLen != wxCONV_FAILED )
517 {
518 wxCharBuffer buf(dstLen);
519 buf.data()[dstLen] = '\0';
520 if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
521 return buf;
522 }
523 }
524
525 return wxCharBuffer();
526 }
527
528 // ----------------------------------------------------------------------------
529 // wxMBConvLibc
530 // ----------------------------------------------------------------------------
531
532 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
533 {
534 return wxMB2WC(buf, psz, n);
535 }
536
537 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
538 {
539 return wxWC2MB(buf, psz, n);
540 }
541
542 // ----------------------------------------------------------------------------
543 // wxConvBrokenFileNames
544 // ----------------------------------------------------------------------------
545
546 #ifdef __UNIX__
547
548 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
549 {
550 if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
551 wxStricmp(charset, wxT("UTF8")) == 0 )
552 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
553 else
554 m_conv = new wxCSConv(charset);
555 }
556
557 #endif // __UNIX__
558
559 // ----------------------------------------------------------------------------
560 // UTF-7
561 // ----------------------------------------------------------------------------
562
563 // Implementation (C) 2004 Fredrik Roubert
564 //
565 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
566
567 //
568 // BASE64 decoding table
569 //
// Maps each of the 256 possible byte values to its 6 bit base64 value:
// 'A'..'Z' map to 0x00..0x19, 'a'..'z' to 0x1a..0x33, '0'..'9' to 0x34..0x3d,
// '+' to 0x3e and '/' to 0x3f. All the other entries are 0xff which marks
// bytes that are not base64 characters.
570 static const unsigned char utf7unb64[] =
571 {
572 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
573 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
574 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
575 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
576 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
577 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
578 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
579 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
580 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
581 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
582 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
583 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
584 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
585 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
586 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
587 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
588 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
589 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
590 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
591 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
592 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
593 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
594 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
595 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
596 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
597 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
598 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
599 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
600 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
601 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
602 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
603 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
604 };
605
606 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
607 const char *src, size_t srcLen) const
608 {
609 DecoderState stateOrig,
610 *statePtr;
611 if ( srcLen == wxNO_LEN )
612 {
613 // convert the entire string, up to and including the trailing NUL
614 srcLen = strlen(src) + 1;
615
616 // when working on the entire strings we don't update nor use the shift
617 // state from the previous call
618 statePtr = &stateOrig;
619 }
620 else // when working with partial strings we do use the shift state
621 {
622 statePtr = const_cast<DecoderState *>(&m_stateDecoder);
623
624 // also save the old state to be able to rollback to it on error
625 stateOrig = m_stateDecoder;
626 }
627
628 // but to simplify the code below we use this variable in both cases
629 DecoderState& state = *statePtr;
630
631
632 // number of characters [which would have been] written to dst [if it were
633 // not NULL]
634 size_t len = 0;
635
636 const char * const srcEnd = src + srcLen;
637
638 while ( (src < srcEnd) && (!dst || (len < dstLen)) )
639 {
640 const unsigned char cc = *src++;
641
642 if ( state.IsShifted() )
643 {
644 const unsigned char dc = utf7unb64[cc];
645 if ( dc == 0xff )
646 {
647 // end of encoded part, check that nothing was left: there can
648 // be up to 4 bits of 0 padding but nothing else (we also need
649 // to check isLSB as we count bits modulo 8 while a valid UTF-7
650 // encoded sequence must contain an integral number of UTF-16
651 // characters)
652 if ( state.isLSB || state.bit > 4 ||
653 (state.accum & ((1 << state.bit) - 1)) )
654 {
655 if ( !len )
656 state = stateOrig;
657
658 return wxCONV_FAILED;
659 }
660
661 state.ToDirect();
662
663 // re-parse this character normally below unless it's '-' which
664 // is consumed by the decoder
665 if ( cc == '-' )
666 continue;
667 }
668 else // valid encoded character
669 {
670 // mini base64 decoder: each character is 6 bits
671 state.bit += 6;
672 state.accum <<= 6;
673 state.accum += dc;
674
675 if ( state.bit >= 8 )
676 {
677 // got the full byte, consume it
678 state.bit -= 8;
679 unsigned char b = (state.accum >> state.bit) & 0x00ff;
680
681 if ( state.isLSB )
682 {
683 // we've got the full word, output it
684 if ( dst )
685 *dst++ = (state.msb << 8) | b;
686 len++;
687 state.isLSB = false;
688 }
689 else // MSB
690 {
691 // just store it while we wait for LSB
692 state.msb = b;
693 state.isLSB = true;
694 }
695 }
696 }
697 }
698
699 if ( state.IsDirect() )
700 {
701 // start of an encoded segment?
702 if ( cc == '+' )
703 {
704 if ( *src == '-' )
705 {
706 // just the encoded plus sign, don't switch to shifted mode
707 if ( dst )
708 *dst++ = '+';
709 len++;
710 src++;
711 }
712 else if ( utf7unb64[(unsigned)*src] == 0xff )
713 {
714 // empty encoded chunks are not allowed
715 if ( !len )
716 state = stateOrig;
717
718 return wxCONV_FAILED;
719 }
720 else // base-64 encoded chunk follows
721 {
722 state.ToShifted();
723 }
724 }
725 else // not '+'
726 {
727 // only printable 7 bit ASCII characters (with the exception of
728 // NUL, TAB, CR and LF) can be used directly
729 if ( cc >= 0x7f || (cc < ' ' &&
730 !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
731 return wxCONV_FAILED;
732
733 if ( dst )
734 *dst++ = cc;
735 len++;
736 }
737 }
738 }
739
740 if ( !len )
741 {
742 // as we didn't read any characters we should be called with the same
743 // data (followed by some more new data) again later so don't save our
744 // state
745 state = stateOrig;
746
747 return wxCONV_FAILED;
748 }
749
750 return len;
751 }
752
753 //
754 // BASE64 encoding table
755 //
// Maps a 6 bit value (0..63) to the base64 character representing it:
// 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/' in this order.
756 static const unsigned char utf7enb64[] =
757 {
758 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
759 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
760 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
761 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
762 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
763 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
764 'w', 'x', 'y', 'z', '0', '1', '2', '3',
765 '4', '5', '6', '7', '8', '9', '+', '/'
766 };
767
768 //
769 // UTF-7 encoding table
770 //
771 // 0 - Set D (directly encoded characters)
772 // 1 - Set O (optional direct characters)
773 // 2 - whitespace characters (optional)
774 // 3 - special characters
775 //
// Class (0..3, see the legend in the comment preceding this table) of each of
// the 128 7 bit characters, indexed by the character code, 16 entries per row.
776 static const unsigned char utf7encode[128] =
777 {
778 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
779 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
780 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
781 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
782 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
783 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
784 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
785 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
786 };
787
788 static inline bool wxIsUTF7Direct(wchar_t wc)
789 {
790 return wc < 0x80 && utf7encode[wc] < 1;
791 }
792
// Encodes wide characters as UTF-7.
//
// dst/dstLen: output buffer and its size in bytes; if dst is NULL nothing is
//             stored and only the output length is computed
// src/srcLen: input wide characters and their count, or wxNO_LEN to convert
//             the entire NUL-terminated string including its terminating NUL
//
// When an explicit length is given, the shift state stored in m_stateEncoder
// is used and updated (unless dst is NULL, in which case it is restored before
// returning), otherwise a fresh local state is used.
//
// Returns the number of bytes produced or, when wchar_t is not UTF-16,
// wxCONV_FAILED if a character above 0xffff is found at the start of an
// encoded run.
//
// NOTE(review): dstLen is only tested in the condition of the outer loop, the
// code writing '-' after a shifted sequence, the "+-" pair and the base64
// characters doesn't recheck it -- confirm that callers always provide a
// buffer of the size returned by a previous call with NULL dst.
793 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
794 const wchar_t *src, size_t srcLen) const
795 {
796 EncoderState stateOrig,
797 *statePtr;
798 if ( srcLen == wxNO_LEN )
799 {
800 // we don't apply the stored state when operating on entire strings at
801 // once
802 statePtr = &stateOrig;
803
804 srcLen = wxWcslen(src) + 1;
805 }
806 else // do use the mode we left the output in previously
807 {
808 stateOrig = m_stateEncoder;
809 statePtr = const_cast<EncoderState *>(&m_stateEncoder);
810 }
811
812 EncoderState& state = *statePtr;
813
814
// number of bytes [which would have been] written to dst [if it were not
// NULL]
815 size_t len = 0;
816
817 const wchar_t * const srcEnd = src + srcLen;
818 while ( src < srcEnd && (!dst || len < dstLen) )
819 {
820 wchar_t cc = *src++;
821 if ( wxIsUTF7Direct(cc) )
822 {
823 if ( state.IsShifted() )
824 {
825 // pad with zeros the last encoded block if necessary
826 if ( state.bit )
827 {
828 if ( dst )
829 *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
830 len++;
831 }
832
833 state.ToDirect();
834
// explicitly terminate the encoded sequence
835 if ( dst )
836 *dst++ = '-';
837 len++;
838 }
839
840 if ( dst )
841 *dst++ = (char)cc;
842 len++;
843 }
844 else if ( cc == '+' && state.IsDirect() )
845 {
// a literal plus sign is written as "+-" in direct mode
846 if ( dst )
847 {
848 *dst++ = '+';
849 *dst++ = '-';
850 }
851
852 len += 2;
853 }
854 #ifndef WC_UTF16
855 else if (((wxUint32)cc) > 0xffff)
856 {
857 // no surrogate pair generation (yet?)
858 return wxCONV_FAILED;
859 }
860 #endif
861 else
862 {
863 if ( state.IsDirect() )
864 {
865 state.ToShifted();
866
// '+' starts the encoded sequence
867 if ( dst )
868 *dst++ = '+';
869 len++;
870 }
871
872 // BASE64 encode string
873 for ( ;; )
874 {
// feed the high byte of the character into the accumulator first,
// then the low one, emitting a base64 character each time at least
// 6 bits are available
875 for ( unsigned lsb = 0; lsb < 2; lsb++ )
876 {
877 state.accum <<= 8;
878 state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
879
880 for (state.bit += 8; state.bit >= 6; )
881 {
882 state.bit -= 6;
883 if ( dst )
884 *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
885 len++;
886 }
887 }
888
// continue encoding while there are more characters which can't be
// written directly; the next direct character is not consumed here
// but left to the outer loop
889 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
890 break;
891
892 src++;
893 }
894 }
895 }
896
897 // we need to restore the original encoder state if we were called just to
898 // calculate the amount of space needed as we will presumably be called
899 // again to really convert the data now
900 if ( !dst )
901 state = stateOrig;
902
903 return len;
904 }
905
906 // ----------------------------------------------------------------------------
907 // UTF-8
908 // ----------------------------------------------------------------------------
909
// the values are the largest numbers fitting in the payload bits of UTF-8
// sequences of 1, 2, ..., 6 bytes, followed by 0xffffffff as the last entry
910 static const wxUint32 utf8_max[]=
911 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
912
913 // boundaries of the private use area we use to (temporarily) remap
914 // characters invalid in a UTF-8 encoded string: wxUnicodePUAEnd is one past
// the last of the 256 values starting at wxUnicodePUA
915 const wxUint32 wxUnicodePUA = 0x100000;
916 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
917
918 // this table gives the length of the UTF-8 encoding from its first character:
// (indexed by the value of the first byte; 0 means that the byte can't start
// a valid sequence)
919 const unsigned char tableUtf8Lengths[256] = {
920 // single-byte sequences (ASCII):
921 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
922 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
923 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
924 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
925 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
926 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
927 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
928 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
929
930 // these are invalid:
931 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
932 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
933 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
934 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
935 0, 0, // C0,C1
936
937 // two-byte sequences:
938 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
939 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
940
941 // three-byte sequences:
942 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
943
944 // four-byte sequences:
945 4, 4, 4, 4, 4, // F0..F4
946
947 // these are invalid again (5- or 6-byte
948 // sequences and sequences for code points
949 // above U+10FFFF, as restricted by RFC 3629):
950 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
951 };
952
// Decodes UTF-8 input into wide characters, failing on any invalid sequence.
//
// dst/dstLen: output buffer and its size in wide characters; if dstLen is 0
//             nothing is stored and only the output length is computed
// src/srcLen: input bytes and their count, or wxNO_LEN to convert the entire
//             NUL-terminated string including its terminating NUL
//
// Returns the number of wide characters produced or wxCONV_FAILED if the
// input is not valid or the output buffer is too small.
953 size_t
954 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
955 const char *src, size_t srcLen) const
956 {
957 wchar_t *out = dstLen ? dst : NULL;
958 size_t written = 0;
959
// NOTE(review): as srcLen is replaced with the real length (including the
// trailing NUL) here, the "srcLen == wxNO_LEN" tests below don't seem to
// ever be true any more -- confirm before relying on those branches
960 if ( srcLen == wxNO_LEN )
961 srcLen = strlen(src) + 1;
962
963 for ( const char *p = src; ; p++ )
964 {
965 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
966 {
967 // all done successfully, just add the trailing NULL if we are not
968 // using explicit length
969 if ( srcLen == wxNO_LEN )
970 {
971 if ( out )
972 {
973 if ( !dstLen )
974 break;
975
976 *out = L'\0';
977 }
978
979 written++;
980 }
981
982 return written;
983 }
984
// check that there is still space in the output buffer: leaving the loop
// with break results in wxCONV_FAILED being returned
985 if ( out && !dstLen-- )
986 break;
987
988 wxUint32 code;
989 unsigned char c = *p;
990
991 if ( c < 0x80 )
992 {
993 if ( srcLen == 0 ) // the test works for wxNO_LEN too
994 break;
995
996 if ( srcLen != wxNO_LEN )
997 srcLen--;
998
999 code = c;
1000 }
1001 else
1002 {
1003 unsigned len = tableUtf8Lengths[c];
1004 if ( !len )
1005 break;
1006
1007 if ( srcLen < len ) // the test works for wxNO_LEN too
1008 break;
1009
1010 if ( srcLen != wxNO_LEN )
1011 srcLen -= len;
1012
1013 // Char. number range | UTF-8 octet sequence
1014 // (hexadecimal) | (binary)
1015 // ----------------------+----------------------------------------
1016 // 0000 0000 - 0000 007F | 0xxxxxxx
1017 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1018 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1019 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1020 //
1021 // Code point value is stored in bits marked with 'x',
1022 // lowest-order bit of the value on the right side in the diagram
1023 // above. (from RFC 3629)
1024
1025 // mask to extract lead byte's value ('x' bits above), by sequence
1026 // length:
1027 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1028
1029 // mask and value of lead byte's most significant bits, by length:
1030 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1031 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1032
1033 len--; // it's more convenient to work with 0-based length here
1034
1035 // extract the lead byte's value bits:
1036 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1037 break;
1038
1039 code = c & leadValueMask[len];
1040
1041 // all remaining bytes, if any, are handled in the same way
1042 // regardless of sequence's length:
1043 for ( ; len; --len )
1044 {
1045 c = *++p;
1046 if ( (c & 0xC0) != 0x80 )
1047 return wxCONV_FAILED;
1048
1049 code <<= 6;
1050 code |= c & 0x3F;
1051 }
1052 }
1053
1054 #ifdef WC_UTF16
1055 // cast is ok because wchar_t == wxUint16 if WC_UTF16
// NOTE(review): only one unit was accounted for by the dstLen check above,
// the second unit of a surrogate pair is stored without checking that there
// is space for it -- confirm
1056 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1057 {
1058 if ( out )
1059 out++;
1060 written++;
1061 }
1062 #else // !WC_UTF16
1063 if ( out )
1064 *out = code;
1065 #endif // WC_UTF16/!WC_UTF16
1066
1067 if ( out )
1068 out++;
1069
1070 written++;
1071 }
1072
// we only get here, via one of the break statements above, if an error
// occurred
1073 return wxCONV_FAILED;
1074 }
1075
1076 size_t
1077 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1078 const wchar_t *src, size_t srcLen) const
1079 {
1080 char *out = dstLen ? dst : NULL;
1081 size_t written = 0;
1082
1083 for ( const wchar_t *wp = src; ; wp++ )
1084 {
1085 if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1086 {
1087 // all done successfully, just add the trailing NULL if we are not
1088 // using explicit length
1089 if ( srcLen == wxNO_LEN )
1090 {
1091 if ( out )
1092 {
1093 if ( !dstLen )
1094 break;
1095
1096 *out = '\0';
1097 }
1098
1099 written++;
1100 }
1101
1102 return written;
1103 }
1104
1105 if ( srcLen != wxNO_LEN )
1106 srcLen--;
1107
1108 wxUint32 code;
1109 #ifdef WC_UTF16
1110 // cast is ok for WC_UTF16
1111 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1112 {
1113 // skip the next char too as we decoded a surrogate
1114 wp++;
1115 }
1116 #else // wchar_t is UTF-32
1117 code = *wp & 0x7fffffff;
1118 #endif
1119
1120 unsigned len;
1121 if ( code <= 0x7F )
1122 {
1123 len = 1;
1124 if ( out )
1125 {
1126 if ( dstLen < len )
1127 break;
1128
1129 out[0] = (char)code;
1130 }
1131 }
1132 else if ( code <= 0x07FF )
1133 {
1134 len = 2;
1135 if ( out )
1136 {
1137 if ( dstLen < len )
1138 break;
1139
1140 // NB: this line takes 6 least significant bits, encodes them as
1141 // 10xxxxxx and discards them so that the next byte can be encoded:
1142 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1143 out[0] = 0xC0 | code;
1144 }
1145 }
1146 else if ( code < 0xFFFF )
1147 {
1148 len = 3;
1149 if ( out )
1150 {
1151 if ( dstLen < len )
1152 break;
1153
1154 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1155 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1156 out[0] = 0xE0 | code;
1157 }
1158 }
1159 else if ( code <= 0x10FFFF )
1160 {
1161 len = 4;
1162 if ( out )
1163 {
1164 if ( dstLen < len )
1165 break;
1166
1167 out[3] = 0x80 | (code & 0x3F); code >>= 6;
1168 out[2] = 0x80 | (code & 0x3F); code >>= 6;
1169 out[1] = 0x80 | (code & 0x3F); code >>= 6;
1170 out[0] = 0xF0 | code;
1171 }
1172 }
1173 else
1174 {
1175 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1176 break;
1177 }
1178
1179 if ( out )
1180 {
1181 out += len;
1182 dstLen -= len;
1183 }
1184
1185 written += len;
1186 }
1187
1188 // we only get here if an error occurs during decoding
1189 return wxCONV_FAILED;
1190 }
1191
1192 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1193 const char *psz, size_t srcLen) const
1194 {
1195 if ( m_options == MAP_INVALID_UTF8_NOT )
1196 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1197
1198 size_t len = 0;
1199
1200 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1201 {
1202 const char *opsz = psz;
1203 bool invalid = false;
1204 unsigned char cc = *psz++, fc = cc;
1205 unsigned cnt;
1206 for (cnt = 0; fc & 0x80; cnt++)
1207 fc <<= 1;
1208
1209 if (!cnt)
1210 {
1211 // plain ASCII char
1212 if (buf)
1213 *buf++ = cc;
1214 len++;
1215
1216 // escape the escape character for octal escapes
1217 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1218 && cc == '\\' && (!buf || len < n))
1219 {
1220 if (buf)
1221 *buf++ = cc;
1222 len++;
1223 }
1224 }
1225 else
1226 {
1227 cnt--;
1228 if (!cnt)
1229 {
1230 // invalid UTF-8 sequence
1231 invalid = true;
1232 }
1233 else
1234 {
1235 unsigned ocnt = cnt - 1;
1236 wxUint32 res = cc & (0x3f >> cnt);
1237 while (cnt--)
1238 {
1239 cc = *psz;
1240 if ((cc & 0xC0) != 0x80)
1241 {
1242 // invalid UTF-8 sequence
1243 invalid = true;
1244 break;
1245 }
1246
1247 psz++;
1248 res = (res << 6) | (cc & 0x3f);
1249 }
1250
1251 if (invalid || res <= utf8_max[ocnt])
1252 {
1253 // illegal UTF-8 encoding
1254 invalid = true;
1255 }
1256 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1257 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1258 {
1259 // if one of our PUA characters turns up externally
1260 // it must also be treated as an illegal sequence
1261 // (a bit like you have to escape an escape character)
1262 invalid = true;
1263 }
1264 else
1265 {
1266 #ifdef WC_UTF16
1267 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1268 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1269 if (pa == wxCONV_FAILED)
1270 {
1271 invalid = true;
1272 }
1273 else
1274 {
1275 if (buf)
1276 buf += pa;
1277 len += pa;
1278 }
1279 #else // !WC_UTF16
1280 if (buf)
1281 *buf++ = (wchar_t)res;
1282 len++;
1283 #endif // WC_UTF16/!WC_UTF16
1284 }
1285 }
1286
1287 if (invalid)
1288 {
1289 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1290 {
1291 while (opsz < psz && (!buf || len < n))
1292 {
1293 #ifdef WC_UTF16
1294 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1295 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1296 wxASSERT(pa != wxCONV_FAILED);
1297 if (buf)
1298 buf += pa;
1299 opsz++;
1300 len += pa;
1301 #else
1302 if (buf)
1303 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1304 opsz++;
1305 len++;
1306 #endif
1307 }
1308 }
1309 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1310 {
1311 while (opsz < psz && (!buf || len < n))
1312 {
1313 if ( buf && len + 3 < n )
1314 {
1315 unsigned char on = *opsz;
1316 *buf++ = L'\\';
1317 *buf++ = (wchar_t)( L'0' + on / 0100 );
1318 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1319 *buf++ = (wchar_t)( L'0' + on % 010 );
1320 }
1321
1322 opsz++;
1323 len += 4;
1324 }
1325 }
1326 else // MAP_INVALID_UTF8_NOT
1327 {
1328 return wxCONV_FAILED;
1329 }
1330 }
1331 }
1332 }
1333
1334 if (srcLen == wxNO_LEN && buf && (len < n))
1335 *buf = 0;
1336
1337 return len + 1;
1338 }
1339
// Return true if wch is one of the octal digits, i.e. lies in L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1344
1345 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1346 const wchar_t *psz, size_t srcLen) const
1347 {
1348 if ( m_options == MAP_INVALID_UTF8_NOT )
1349 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1350
1351 size_t len = 0;
1352
1353 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1354 {
1355 wxUint32 cc;
1356
1357 #ifdef WC_UTF16
1358 // cast is ok for WC_UTF16
1359 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1360 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1361 #else
1362 cc = (*psz++) & 0x7fffffff;
1363 #endif
1364
1365 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1366 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1367 {
1368 if (buf)
1369 *buf++ = (char)(cc - wxUnicodePUA);
1370 len++;
1371 }
1372 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1373 && cc == L'\\' && psz[0] == L'\\' )
1374 {
1375 if (buf)
1376 *buf++ = (char)cc;
1377 psz++;
1378 len++;
1379 }
1380 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1381 cc == L'\\' &&
1382 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1383 {
1384 if (buf)
1385 {
1386 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1387 (psz[1] - L'0') * 010 +
1388 (psz[2] - L'0'));
1389 }
1390
1391 psz += 3;
1392 len++;
1393 }
1394 else
1395 {
1396 unsigned cnt;
1397 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1398 {
1399 }
1400
1401 if (!cnt)
1402 {
1403 // plain ASCII char
1404 if (buf)
1405 *buf++ = (char) cc;
1406 len++;
1407 }
1408 else
1409 {
1410 len += cnt + 1;
1411 if (buf)
1412 {
1413 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1414 while (cnt--)
1415 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1416 }
1417 }
1418 }
1419 }
1420
1421 if (srcLen == wxNO_LEN && buf && (len < n))
1422 *buf = 0;
1423
1424 return len + 1;
1425 }
1426
1427 // ============================================================================
1428 // UTF-16
1429 // ============================================================================
1430
1431 #ifdef WORDS_BIGENDIAN
1432 #define wxMBConvUTF16straight wxMBConvUTF16BE
1433 #define wxMBConvUTF16swap wxMBConvUTF16LE
1434 #else
1435 #define wxMBConvUTF16swap wxMBConvUTF16BE
1436 #define wxMBConvUTF16straight wxMBConvUTF16LE
1437 #endif
1438
1439 /* static */
1440 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1441 {
1442 if ( srcLen == wxNO_LEN )
1443 {
1444 // count the number of bytes in input, including the trailing NULs
1445 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1446 for ( srcLen = 1; *inBuff++; srcLen++ )
1447 ;
1448
1449 srcLen *= BYTES_PER_CHAR;
1450 }
1451 else // we already have the length
1452 {
1453 // we can only convert an entire number of UTF-16 characters
1454 if ( srcLen % BYTES_PER_CHAR )
1455 return wxCONV_FAILED;
1456 }
1457
1458 return srcLen;
1459 }
1460
1461 // case when in-memory representation is UTF-16 too
1462 #ifdef WC_UTF16
1463
1464 // ----------------------------------------------------------------------------
1465 // conversions without endianness change
1466 // ----------------------------------------------------------------------------
1467
1468 size_t
1469 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1470 const char *src, size_t srcLen) const
1471 {
1472 // set up the scene for using memcpy() (which is presumably more efficient
1473 // than copying the bytes one by one)
1474 srcLen = GetLength(src, srcLen);
1475 if ( srcLen == wxNO_LEN )
1476 return wxCONV_FAILED;
1477
1478 const size_t inLen = srcLen / BYTES_PER_CHAR;
1479 if ( dst )
1480 {
1481 if ( dstLen < inLen )
1482 return wxCONV_FAILED;
1483
1484 memcpy(dst, src, srcLen);
1485 }
1486
1487 return inLen;
1488 }
1489
1490 size_t
1491 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1492 const wchar_t *src, size_t srcLen) const
1493 {
1494 if ( srcLen == wxNO_LEN )
1495 srcLen = wxWcslen(src) + 1;
1496
1497 srcLen *= BYTES_PER_CHAR;
1498
1499 if ( dst )
1500 {
1501 if ( dstLen < srcLen )
1502 return wxCONV_FAILED;
1503
1504 memcpy(dst, src, srcLen);
1505 }
1506
1507 return srcLen;
1508 }
1509
1510 // ----------------------------------------------------------------------------
1511 // endian-reversing conversions
1512 // ----------------------------------------------------------------------------
1513
1514 size_t
1515 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1516 const char *src, size_t srcLen) const
1517 {
1518 srcLen = GetLength(src, srcLen);
1519 if ( srcLen == wxNO_LEN )
1520 return wxCONV_FAILED;
1521
1522 srcLen /= BYTES_PER_CHAR;
1523
1524 if ( dst )
1525 {
1526 if ( dstLen < srcLen )
1527 return wxCONV_FAILED;
1528
1529 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1530 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1531 {
1532 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1533 }
1534 }
1535
1536 return srcLen;
1537 }
1538
1539 size_t
1540 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1541 const wchar_t *src, size_t srcLen) const
1542 {
1543 if ( srcLen == wxNO_LEN )
1544 srcLen = wxWcslen(src) + 1;
1545
1546 srcLen *= BYTES_PER_CHAR;
1547
1548 if ( dst )
1549 {
1550 if ( dstLen < srcLen )
1551 return wxCONV_FAILED;
1552
1553 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1554 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1555 {
1556 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1557 }
1558 }
1559
1560 return srcLen;
1561 }
1562
1563 #else // !WC_UTF16: wchar_t is UTF-32
1564
1565 // ----------------------------------------------------------------------------
1566 // conversions without endianness change
1567 // ----------------------------------------------------------------------------
1568
1569 size_t
1570 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1571 const char *src, size_t srcLen) const
1572 {
1573 srcLen = GetLength(src, srcLen);
1574 if ( srcLen == wxNO_LEN )
1575 return wxCONV_FAILED;
1576
1577 const size_t inLen = srcLen / BYTES_PER_CHAR;
1578 if ( !dst )
1579 {
1580 // optimization: return maximal space which could be needed for this
1581 // string even if the real size could be smaller if the buffer contains
1582 // any surrogates
1583 return inLen;
1584 }
1585
1586 size_t outLen = 0;
1587 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1588 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1589 {
1590 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1591 if ( !inBuff )
1592 return wxCONV_FAILED;
1593
1594 if ( ++outLen > dstLen )
1595 return wxCONV_FAILED;
1596
1597 *dst++ = ch;
1598 }
1599
1600
1601 return outLen;
1602 }
1603
1604 size_t
1605 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1606 const wchar_t *src, size_t srcLen) const
1607 {
1608 if ( srcLen == wxNO_LEN )
1609 srcLen = wxWcslen(src) + 1;
1610
1611 size_t outLen = 0;
1612 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1613 for ( size_t n = 0; n < srcLen; n++ )
1614 {
1615 wxUint16 cc[2];
1616 const size_t numChars = encode_utf16(*src++, cc);
1617 if ( numChars == wxCONV_FAILED )
1618 return wxCONV_FAILED;
1619
1620 outLen += numChars * BYTES_PER_CHAR;
1621 if ( outBuff )
1622 {
1623 if ( outLen > dstLen )
1624 return wxCONV_FAILED;
1625
1626 *outBuff++ = cc[0];
1627 if ( numChars == 2 )
1628 {
1629 // second character of a surrogate
1630 *outBuff++ = cc[1];
1631 }
1632 }
1633 }
1634
1635 return outLen;
1636 }
1637
1638 // ----------------------------------------------------------------------------
1639 // endian-reversing conversions
1640 // ----------------------------------------------------------------------------
1641
1642 size_t
1643 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1644 const char *src, size_t srcLen) const
1645 {
1646 srcLen = GetLength(src, srcLen);
1647 if ( srcLen == wxNO_LEN )
1648 return wxCONV_FAILED;
1649
1650 const size_t inLen = srcLen / BYTES_PER_CHAR;
1651 if ( !dst )
1652 {
1653 // optimization: return maximal space which could be needed for this
1654 // string even if the real size could be smaller if the buffer contains
1655 // any surrogates
1656 return inLen;
1657 }
1658
1659 size_t outLen = 0;
1660 const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1661 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1662 {
1663 wxUint32 ch;
1664 wxUint16 tmp[2];
1665
1666 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1667 inBuff++;
1668 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1669
1670 const size_t numChars = decode_utf16(tmp, ch);
1671 if ( numChars == wxCONV_FAILED )
1672 return wxCONV_FAILED;
1673
1674 if ( numChars == 2 )
1675 inBuff++;
1676
1677 if ( ++outLen > dstLen )
1678 return wxCONV_FAILED;
1679
1680 *dst++ = ch;
1681 }
1682
1683
1684 return outLen;
1685 }
1686
1687 size_t
1688 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1689 const wchar_t *src, size_t srcLen) const
1690 {
1691 if ( srcLen == wxNO_LEN )
1692 srcLen = wxWcslen(src) + 1;
1693
1694 size_t outLen = 0;
1695 wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1696 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1697 {
1698 wxUint16 cc[2];
1699 const size_t numChars = encode_utf16(*src, cc);
1700 if ( numChars == wxCONV_FAILED )
1701 return wxCONV_FAILED;
1702
1703 outLen += numChars * BYTES_PER_CHAR;
1704 if ( outBuff )
1705 {
1706 if ( outLen > dstLen )
1707 return wxCONV_FAILED;
1708
1709 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1710 if ( numChars == 2 )
1711 {
1712 // second character of a surrogate
1713 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1714 }
1715 }
1716 }
1717
1718 return outLen;
1719 }
1720
1721 #endif // WC_UTF16/!WC_UTF16
1722
1723
1724 // ============================================================================
1725 // UTF-32
1726 // ============================================================================
1727
1728 #ifdef WORDS_BIGENDIAN
1729 #define wxMBConvUTF32straight wxMBConvUTF32BE
1730 #define wxMBConvUTF32swap wxMBConvUTF32LE
1731 #else
1732 #define wxMBConvUTF32swap wxMBConvUTF32BE
1733 #define wxMBConvUTF32straight wxMBConvUTF32LE
1734 #endif
1735
1736
1737 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1738 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1739
1740 /* static */
1741 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1742 {
1743 if ( srcLen == wxNO_LEN )
1744 {
1745 // count the number of bytes in input, including the trailing NULs
1746 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1747 for ( srcLen = 1; *inBuff++; srcLen++ )
1748 ;
1749
1750 srcLen *= BYTES_PER_CHAR;
1751 }
1752 else // we already have the length
1753 {
1754 // we can only convert an entire number of UTF-32 characters
1755 if ( srcLen % BYTES_PER_CHAR )
1756 return wxCONV_FAILED;
1757 }
1758
1759 return srcLen;
1760 }
1761
1762 // case when in-memory representation is UTF-16
1763 #ifdef WC_UTF16
1764
1765 // ----------------------------------------------------------------------------
1766 // conversions without endianness change
1767 // ----------------------------------------------------------------------------
1768
1769 size_t
1770 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1771 const char *src, size_t srcLen) const
1772 {
1773 srcLen = GetLength(src, srcLen);
1774 if ( srcLen == wxNO_LEN )
1775 return wxCONV_FAILED;
1776
1777 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1778 const size_t inLen = srcLen / BYTES_PER_CHAR;
1779 size_t outLen = 0;
1780 for ( size_t n = 0; n < inLen; n++ )
1781 {
1782 wxUint16 cc[2];
1783 const size_t numChars = encode_utf16(*inBuff++, cc);
1784 if ( numChars == wxCONV_FAILED )
1785 return wxCONV_FAILED;
1786
1787 outLen += numChars;
1788 if ( dst )
1789 {
1790 if ( outLen > dstLen )
1791 return wxCONV_FAILED;
1792
1793 *dst++ = cc[0];
1794 if ( numChars == 2 )
1795 {
1796 // second character of a surrogate
1797 *dst++ = cc[1];
1798 }
1799 }
1800 }
1801
1802 return outLen;
1803 }
1804
1805 size_t
1806 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1807 const wchar_t *src, size_t srcLen) const
1808 {
1809 if ( srcLen == wxNO_LEN )
1810 srcLen = wxWcslen(src) + 1;
1811
1812 if ( !dst )
1813 {
1814 // optimization: return maximal space which could be needed for this
1815 // string instead of the exact amount which could be less if there are
1816 // any surrogates in the input
1817 //
1818 // we consider that surrogates are rare enough to make it worthwhile to
1819 // avoid running the loop below at the cost of slightly extra memory
1820 // consumption
1821 return srcLen * BYTES_PER_CHAR;
1822 }
1823
1824 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1825 size_t outLen = 0;
1826 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1827 {
1828 const wxUint32 ch = wxDecodeSurrogate(&src);
1829 if ( !src )
1830 return wxCONV_FAILED;
1831
1832 outLen += BYTES_PER_CHAR;
1833
1834 if ( outLen > dstLen )
1835 return wxCONV_FAILED;
1836
1837 *outBuff++ = ch;
1838 }
1839
1840 return outLen;
1841 }
1842
1843 // ----------------------------------------------------------------------------
1844 // endian-reversing conversions
1845 // ----------------------------------------------------------------------------
1846
1847 size_t
1848 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1849 const char *src, size_t srcLen) const
1850 {
1851 srcLen = GetLength(src, srcLen);
1852 if ( srcLen == wxNO_LEN )
1853 return wxCONV_FAILED;
1854
1855 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1856 const size_t inLen = srcLen / BYTES_PER_CHAR;
1857 size_t outLen = 0;
1858 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1859 {
1860 wxUint16 cc[2];
1861 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1862 if ( numChars == wxCONV_FAILED )
1863 return wxCONV_FAILED;
1864
1865 outLen += numChars;
1866 if ( dst )
1867 {
1868 if ( outLen > dstLen )
1869 return wxCONV_FAILED;
1870
1871 *dst++ = cc[0];
1872 if ( numChars == 2 )
1873 {
1874 // second character of a surrogate
1875 *dst++ = cc[1];
1876 }
1877 }
1878 }
1879
1880 return outLen;
1881 }
1882
1883 size_t
1884 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1885 const wchar_t *src, size_t srcLen) const
1886 {
1887 if ( srcLen == wxNO_LEN )
1888 srcLen = wxWcslen(src) + 1;
1889
1890 if ( !dst )
1891 {
1892 // optimization: return maximal space which could be needed for this
1893 // string instead of the exact amount which could be less if there are
1894 // any surrogates in the input
1895 //
1896 // we consider that surrogates are rare enough to make it worthwhile to
1897 // avoid running the loop below at the cost of slightly extra memory
1898 // consumption
1899 return srcLen*BYTES_PER_CHAR;
1900 }
1901
1902 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1903 size_t outLen = 0;
1904 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1905 {
1906 const wxUint32 ch = wxDecodeSurrogate(&src);
1907 if ( !src )
1908 return wxCONV_FAILED;
1909
1910 outLen += BYTES_PER_CHAR;
1911
1912 if ( outLen > dstLen )
1913 return wxCONV_FAILED;
1914
1915 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1916 }
1917
1918 return outLen;
1919 }
1920
1921 #else // !WC_UTF16: wchar_t is UTF-32
1922
1923 // ----------------------------------------------------------------------------
1924 // conversions without endianness change
1925 // ----------------------------------------------------------------------------
1926
1927 size_t
1928 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1929 const char *src, size_t srcLen) const
1930 {
1931 // use memcpy() as it should be much faster than hand-written loop
1932 srcLen = GetLength(src, srcLen);
1933 if ( srcLen == wxNO_LEN )
1934 return wxCONV_FAILED;
1935
1936 const size_t inLen = srcLen/BYTES_PER_CHAR;
1937 if ( dst )
1938 {
1939 if ( dstLen < inLen )
1940 return wxCONV_FAILED;
1941
1942 memcpy(dst, src, srcLen);
1943 }
1944
1945 return inLen;
1946 }
1947
1948 size_t
1949 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1950 const wchar_t *src, size_t srcLen) const
1951 {
1952 if ( srcLen == wxNO_LEN )
1953 srcLen = wxWcslen(src) + 1;
1954
1955 srcLen *= BYTES_PER_CHAR;
1956
1957 if ( dst )
1958 {
1959 if ( dstLen < srcLen )
1960 return wxCONV_FAILED;
1961
1962 memcpy(dst, src, srcLen);
1963 }
1964
1965 return srcLen;
1966 }
1967
1968 // ----------------------------------------------------------------------------
1969 // endian-reversing conversions
1970 // ----------------------------------------------------------------------------
1971
1972 size_t
1973 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1974 const char *src, size_t srcLen) const
1975 {
1976 srcLen = GetLength(src, srcLen);
1977 if ( srcLen == wxNO_LEN )
1978 return wxCONV_FAILED;
1979
1980 srcLen /= BYTES_PER_CHAR;
1981
1982 if ( dst )
1983 {
1984 if ( dstLen < srcLen )
1985 return wxCONV_FAILED;
1986
1987 const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1988 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1989 {
1990 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1991 }
1992 }
1993
1994 return srcLen;
1995 }
1996
1997 size_t
1998 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1999 const wchar_t *src, size_t srcLen) const
2000 {
2001 if ( srcLen == wxNO_LEN )
2002 srcLen = wxWcslen(src) + 1;
2003
2004 srcLen *= BYTES_PER_CHAR;
2005
2006 if ( dst )
2007 {
2008 if ( dstLen < srcLen )
2009 return wxCONV_FAILED;
2010
2011 wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2012 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2013 {
2014 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2015 }
2016 }
2017
2018 return srcLen;
2019 }
2020
2021 #endif // WC_UTF16/!WC_UTF16
2022
2023
2024 // ============================================================================
2025 // The classes doing conversion using the iconv_xxx() functions
2026 // ============================================================================
2027
2028 #ifdef HAVE_ICONV
2029
2030 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2031 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2032 // (unless there's yet another bug in glibc) the only case when iconv()
2033 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2034 // left in the input buffer -- when _real_ error occurs,
2035 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2036 // iconv() failure.
2037 // [This bug does not appear in glibc 2.2.]
2038 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2039 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2040 (errno != E2BIG || bufLeft != 0))
2041 #else
2042 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2043 #endif
2044
2045 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2046
2047 #define ICONV_T_INVALID ((iconv_t)-1)
2048
2049 #if SIZEOF_WCHAR_T == 4
2050 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2051 #define WC_ENC wxFONTENCODING_UTF32
2052 #elif SIZEOF_WCHAR_T == 2
2053 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2054 #define WC_ENC wxFONTENCODING_UTF16
2055 #else // sizeof(wchar_t) != 2 nor 4
2056 // does this ever happen?
2057 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2058 #endif
2059
2060 // ----------------------------------------------------------------------------
2061 // wxMBConv_iconv: encapsulates an iconv character set
2062 // ----------------------------------------------------------------------------
2063
// Conversion between the multibyte encoding with the given name and wchar_t
// implemented using two iconv() conversion descriptors, one per direction.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the name of the multibyte encoding as passed to iconv_open();
    // use IsOk() to check whether the conversion can be used
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // implement base class virtual methods
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t GetMBNulLen() const;

#if wxUSE_UNICODE_UTF8
    virtual bool IsUTF8() const;
#endif

    // create a new conversion object for the same encoding name; the
    // cached NUL length is copied to avoid computing it again
    virtual wxMBConv *Clone() const
    {
        wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
        p->m_minMBCharWidth = m_minMBCharWidth;
        return p;
    }

    // true only if the conversion descriptors for both directions are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte
    // to wide char and in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;


    // name of the encoding handled by this conversion
    wxString m_name;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
2119
2120 // make the constructor available for unit testing
2121 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2122 {
2123 wxMBConv_iconv* result = new wxMBConv_iconv( name );
2124 if ( !result->IsOk() )
2125 {
2126 delete result;
2127 return 0;
2128 }
2129
2130 return result;
2131 }
2132
2133 wxString wxMBConv_iconv::ms_wcCharsetName;
2134 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2135
2136 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2137 : m_name(name)
2138 {
2139 m_minMBCharWidth = 0;
2140
2141 // check for charset that represents wchar_t:
2142 if ( ms_wcCharsetName.empty() )
2143 {
2144 wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2145
2146 #if wxUSE_FONTMAP
2147 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2148 #else // !wxUSE_FONTMAP
2149 static const wxChar *names_static[] =
2150 {
2151 #if SIZEOF_WCHAR_T == 4
2152 wxT("UCS-4"),
2153 #elif SIZEOF_WCHAR_T = 2
2154 wxT("UCS-2"),
2155 #endif
2156 NULL
2157 };
2158 const wxChar **names = names_static;
2159 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2160
2161 for ( ; *names && ms_wcCharsetName.empty(); ++names )
2162 {
2163 const wxString nameCS(*names);
2164
2165 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2166 wxString nameXE(nameCS);
2167
2168 #ifdef WORDS_BIGENDIAN
2169 nameXE += wxT("BE");
2170 #else // little endian
2171 nameXE += wxT("LE");
2172 #endif
2173
2174 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2175 nameXE.c_str());
2176
2177 m2w = iconv_open(nameXE.ToAscii(), name);
2178 if ( m2w == ICONV_T_INVALID )
2179 {
2180 // try charset w/o bytesex info (e.g. "UCS4")
2181 wxLogTrace(TRACE_STRCONV, wxT(" trying charset \"%s\""),
2182 nameCS.c_str());
2183 m2w = iconv_open(nameCS.ToAscii(), name);
2184
2185 // and check for bytesex ourselves:
2186 if ( m2w != ICONV_T_INVALID )
2187 {
2188 char buf[2], *bufPtr;
2189 wchar_t wbuf[2];
2190 size_t insz, outsz;
2191 size_t res;
2192
2193 buf[0] = 'A';
2194 buf[1] = 0;
2195 wbuf[0] = 0;
2196 insz = 2;
2197 outsz = SIZEOF_WCHAR_T * 2;
2198 char* wbufPtr = (char*)wbuf;
2199 bufPtr = buf;
2200
2201 res = iconv(
2202 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2203 &wbufPtr, &outsz);
2204
2205 if (ICONV_FAILED(res, insz))
2206 {
2207 wxLogLastError(wxT("iconv"));
2208 wxLogError(_("Conversion to charset '%s' doesn't work."),
2209 nameCS.c_str());
2210 }
2211 else // ok, can convert to this encoding, remember it
2212 {
2213 ms_wcCharsetName = nameCS;
2214 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2215 }
2216 }
2217 }
2218 else // use charset not requiring byte swapping
2219 {
2220 ms_wcCharsetName = nameXE;
2221 }
2222 }
2223
2224 wxLogTrace(TRACE_STRCONV,
2225 wxT("iconv wchar_t charset is \"%s\"%s"),
2226 ms_wcCharsetName.empty() ? wxString("<none>")
2227 : ms_wcCharsetName,
2228 ms_wcNeedsSwap ? wxT(" (needs swap)")
2229 : wxT(""));
2230 }
2231 else // we already have ms_wcCharsetName
2232 {
2233 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2234 }
2235
2236 if ( ms_wcCharsetName.empty() )
2237 {
2238 w2m = ICONV_T_INVALID;
2239 }
2240 else
2241 {
2242 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2243 if ( w2m == ICONV_T_INVALID )
2244 {
2245 wxLogTrace(TRACE_STRCONV,
2246 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2247 ms_wcCharsetName.c_str(), name);
2248 }
2249 }
2250 }
2251
2252 wxMBConv_iconv::~wxMBConv_iconv()
2253 {
2254 if ( m2w != ICONV_T_INVALID )
2255 iconv_close(m2w);
2256 if ( w2m != ICONV_T_INVALID )
2257 iconv_close(w2m);
2258 }
2259
2260 size_t
2261 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2262 const char *src, size_t srcLen) const
2263 {
2264 if ( srcLen == wxNO_LEN )
2265 {
2266 // find the string length: notice that must be done differently for
2267 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2268 // consecutive NULs
2269 const size_t nulLen = GetMBNulLen();
2270 switch ( nulLen )
2271 {
2272 default:
2273 return wxCONV_FAILED;
2274
2275 case 1:
2276 srcLen = strlen(src); // arguably more optimized than our version
2277 break;
2278
2279 case 2:
2280 case 4:
2281 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2282 // but they also have to start at character boundary and not
2283 // span two adjacent characters
2284 const char *p;
2285 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2286 ;
2287 srcLen = p - src;
2288 break;
2289 }
2290
2291 // when we're determining the length of the string ourselves we count
2292 // the terminating NUL(s) as part of it and always NUL-terminate the
2293 // output
2294 srcLen += nulLen;
2295 }
2296
2297 // we express length in the number of (wide) characters but iconv always
2298 // counts buffer sizes it in bytes
2299 dstLen *= SIZEOF_WCHAR_T;
2300
2301 #if wxUSE_THREADS
2302 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2303 // Unfortunately there are a couple of global wxCSConv objects such as
2304 // wxConvLocal that are used all over wx code, so we have to make sure
2305 // the handle is used by at most one thread at the time. Otherwise
2306 // only a few wx classes would be safe to use from non-main threads
2307 // as MB<->WC conversion would fail "randomly".
2308 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2309 #endif // wxUSE_THREADS
2310
2311 size_t res, cres;
2312 const char *pszPtr = src;
2313
2314 if ( dst )
2315 {
2316 char* bufPtr = (char*)dst;
2317
2318 // have destination buffer, convert there
2319 size_t dstLenOrig = dstLen;
2320 cres = iconv(m2w,
2321 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2322 &bufPtr, &dstLen);
2323
2324 // convert the number of bytes converted as returned by iconv to the
2325 // number of (wide) characters converted that we need
2326 res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2327
2328 if (ms_wcNeedsSwap)
2329 {
2330 // convert to native endianness
2331 for ( unsigned i = 0; i < res; i++ )
2332 dst[i] = WC_BSWAP(dst[i]);
2333 }
2334 }
2335 else // no destination buffer
2336 {
2337 // convert using temp buffer to calculate the size of the buffer needed
2338 wchar_t tbuf[256];
2339 res = 0;
2340
2341 do
2342 {
2343 char* bufPtr = (char*)tbuf;
2344 dstLen = 8 * SIZEOF_WCHAR_T;
2345
2346 cres = iconv(m2w,
2347 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2348 &bufPtr, &dstLen );
2349
2350 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2351 }
2352 while ((cres == (size_t)-1) && (errno == E2BIG));
2353 }
2354
2355 if (ICONV_FAILED(cres, srcLen))
2356 {
2357 //VS: it is ok if iconv fails, hence trace only
2358 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2359 return wxCONV_FAILED;
2360 }
2361
2362 return res;
2363 }
2364
2365 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2366 const wchar_t *src, size_t srcLen) const
2367 {
2368 #if wxUSE_THREADS
2369 // NB: explained in MB2WC
2370 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2371 #endif
2372
2373 if ( srcLen == wxNO_LEN )
2374 srcLen = wxWcslen(src) + 1;
2375
2376 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2377 size_t outbuflen = dstLen;
2378 size_t res, cres;
2379
2380 wchar_t *tmpbuf = 0;
2381
2382 if (ms_wcNeedsSwap)
2383 {
2384 // need to copy to temp buffer to switch endianness
2385 // (doing WC_BSWAP twice on the original buffer won't work, as it
2386 // could be in read-only memory, or be accessed in some other thread)
2387 tmpbuf = (wchar_t *)malloc(inbuflen);
2388 for ( size_t i = 0; i < srcLen; i++ )
2389 tmpbuf[i] = WC_BSWAP(src[i]);
2390
2391 src = tmpbuf;
2392 }
2393
2394 char* inbuf = (char*)src;
2395 if ( dst )
2396 {
2397 // have destination buffer, convert there
2398 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2399
2400 res = dstLen - outbuflen;
2401 }
2402 else // no destination buffer
2403 {
2404 // convert using temp buffer to calculate the size of the buffer needed
2405 char tbuf[256];
2406 res = 0;
2407 do
2408 {
2409 dst = tbuf;
2410 outbuflen = WXSIZEOF(tbuf);
2411
2412 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2413
2414 res += WXSIZEOF(tbuf) - outbuflen;
2415 }
2416 while ((cres == (size_t)-1) && (errno == E2BIG));
2417 }
2418
2419 if (ms_wcNeedsSwap)
2420 {
2421 free(tmpbuf);
2422 }
2423
2424 if (ICONV_FAILED(cres, inbuflen))
2425 {
2426 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2427 return wxCONV_FAILED;
2428 }
2429
2430 return res;
2431 }
2432
2433 size_t wxMBConv_iconv::GetMBNulLen() const
2434 {
2435 if ( m_minMBCharWidth == 0 )
2436 {
2437 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2438
2439 #if wxUSE_THREADS
2440 // NB: explained in MB2WC
2441 wxMutexLocker lock(self->m_iconvMutex);
2442 #endif
2443
2444 const wchar_t *wnul = L"";
2445 char buf[8]; // should be enough for NUL in any encoding
2446 size_t inLen = sizeof(wchar_t),
2447 outLen = WXSIZEOF(buf);
2448 char *inBuff = (char *)wnul;
2449 char *outBuff = buf;
2450 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2451 {
2452 self->m_minMBCharWidth = (size_t)-1;
2453 }
2454 else // ok
2455 {
2456 self->m_minMBCharWidth = outBuff - buf;
2457 }
2458 }
2459
2460 return m_minMBCharWidth;
2461 }
2462
2463 #if wxUSE_UNICODE_UTF8
2464 bool wxMBConv_iconv::IsUTF8() const
2465 {
2466 return wxStricmp(m_name, "UTF-8") == 0 ||
2467 wxStricmp(m_name, "UTF8") == 0;
2468 }
2469 #endif
2470
2471 #endif // HAVE_ICONV
2472
2473
2474 // ============================================================================
2475 // Win32 conversion classes
2476 // ============================================================================
2477
2478 #ifdef wxHAVE_WIN32_MB2WC
2479
2480 // from utils.cpp
2481 #if wxUSE_FONTMAP
2482 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2483 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2484 #endif
2485
2486 class wxMBConv_win32 : public wxMBConv
2487 {
2488 public:
2489 wxMBConv_win32()
2490 {
2491 m_CodePage = CP_ACP;
2492 m_minMBCharWidth = 0;
2493 }
2494
2495 wxMBConv_win32(const wxMBConv_win32& conv)
2496 : wxMBConv()
2497 {
2498 m_CodePage = conv.m_CodePage;
2499 m_minMBCharWidth = conv.m_minMBCharWidth;
2500 }
2501
2502 #if wxUSE_FONTMAP
2503 wxMBConv_win32(const char* name)
2504 {
2505 m_CodePage = wxCharsetToCodepage(name);
2506 m_minMBCharWidth = 0;
2507 }
2508
2509 wxMBConv_win32(wxFontEncoding encoding)
2510 {
2511 m_CodePage = wxEncodingToCodepage(encoding);
2512 m_minMBCharWidth = 0;
2513 }
2514 #endif // wxUSE_FONTMAP
2515
2516 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2517 {
2518 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2519 // the behaviour is not compatible with the Unix version (using iconv)
2520 // and break the library itself, e.g. wxTextInputStream::NextChar()
2521 // wouldn't work if reading an incomplete MB char didn't result in an
2522 // error
2523 //
2524 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2525 // Win XP or newer and it is not supported for UTF-[78] so we always
2526 // use our own conversions in this case. See
2527 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2528 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2529 if ( m_CodePage == CP_UTF8 )
2530 {
2531 return wxMBConvUTF8().MB2WC(buf, psz, n);
2532 }
2533
2534 if ( m_CodePage == CP_UTF7 )
2535 {
2536 return wxMBConvUTF7().MB2WC(buf, psz, n);
2537 }
2538
2539 int flags = 0;
2540 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2541 IsAtLeastWin2kSP4() )
2542 {
2543 flags = MB_ERR_INVALID_CHARS;
2544 }
2545
2546 const size_t len = ::MultiByteToWideChar
2547 (
2548 m_CodePage, // code page
2549 flags, // flags: fall on error
2550 psz, // input string
2551 -1, // its length (NUL-terminated)
2552 buf, // output string
2553 buf ? n : 0 // size of output buffer
2554 );
2555 if ( !len )
2556 {
2557 // function totally failed
2558 return wxCONV_FAILED;
2559 }
2560
2561 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2562 // check if we succeeded, by doing a double trip:
2563 if ( !flags && buf )
2564 {
2565 const size_t mbLen = strlen(psz);
2566 wxCharBuffer mbBuf(mbLen);
2567 if ( ::WideCharToMultiByte
2568 (
2569 m_CodePage,
2570 0,
2571 buf,
2572 -1,
2573 mbBuf.data(),
2574 mbLen + 1, // size in bytes, not length
2575 NULL,
2576 NULL
2577 ) == 0 ||
2578 strcmp(mbBuf, psz) != 0 )
2579 {
2580 // we didn't obtain the same thing we started from, hence
2581 // the conversion was lossy and we consider that it failed
2582 return wxCONV_FAILED;
2583 }
2584 }
2585
2586 // note that it returns count of written chars for buf != NULL and size
2587 // of the needed buffer for buf == NULL so in either case the length of
2588 // the string (which never includes the terminating NUL) is one less
2589 return len - 1;
2590 }
2591
2592 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2593 {
2594 /*
2595 we have a problem here: by default, WideCharToMultiByte() may
2596 replace characters unrepresentable in the target code page with bad
2597 quality approximations such as turning "1/2" symbol (U+00BD) into
2598 "1" for the code pages which don't have it and we, obviously, want
2599 to avoid this at any price
2600
2601 the trouble is that this function does it _silently_, i.e. it won't
2602 even tell us whether it did or not... Win98/2000 and higher provide
2603 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2604 we have to resort to a round trip, i.e. check that converting back
2605 results in the same string -- this is, of course, expensive but
2606 otherwise we simply can't be sure to not garble the data.
2607 */
2608
2609 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2610 // it doesn't work with CJK encodings (which we test for rather roughly
2611 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2612 // supporting it
2613 BOOL usedDef wxDUMMY_INITIALIZE(false);
2614 BOOL *pUsedDef;
2615 int flags;
2616 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2617 {
2618 // it's our lucky day
2619 flags = WC_NO_BEST_FIT_CHARS;
2620 pUsedDef = &usedDef;
2621 }
2622 else // old system or unsupported encoding
2623 {
2624 flags = 0;
2625 pUsedDef = NULL;
2626 }
2627
2628 const size_t len = ::WideCharToMultiByte
2629 (
2630 m_CodePage, // code page
2631 flags, // either none or no best fit
2632 pwz, // input string
2633 -1, // it is (wide) NUL-terminated
2634 buf, // output buffer
2635 buf ? n : 0, // and its size
2636 NULL, // default "replacement" char
2637 pUsedDef // [out] was it used?
2638 );
2639
2640 if ( !len )
2641 {
2642 // function totally failed
2643 return wxCONV_FAILED;
2644 }
2645
2646 // we did something, check if we really succeeded
2647 if ( flags )
2648 {
2649 // check if the conversion failed, i.e. if any replacements
2650 // were done
2651 if ( usedDef )
2652 return wxCONV_FAILED;
2653 }
2654 else // we must resort to double tripping...
2655 {
2656 // first we need to ensure that we really have the MB data: this is
2657 // not the case if we're called with NULL buffer, in which case we
2658 // need to do the conversion yet again
2659 wxCharBuffer bufDef;
2660 if ( !buf )
2661 {
2662 bufDef = wxCharBuffer(len);
2663 buf = bufDef.data();
2664 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2665 buf, len, NULL, NULL) )
2666 return wxCONV_FAILED;
2667 }
2668
2669 if ( !n )
2670 n = wcslen(pwz);
2671 wxWCharBuffer wcBuf(n);
2672 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2673 wcscmp(wcBuf, pwz) != 0 )
2674 {
2675 // we didn't obtain the same thing we started from, hence
2676 // the conversion was lossy and we consider that it failed
2677 return wxCONV_FAILED;
2678 }
2679 }
2680
2681 // see the comment above for the reason of "len - 1"
2682 return len - 1;
2683 }
2684
2685 virtual size_t GetMBNulLen() const
2686 {
2687 if ( m_minMBCharWidth == 0 )
2688 {
2689 int len = ::WideCharToMultiByte
2690 (
2691 m_CodePage, // code page
2692 0, // no flags
2693 L"", // input string
2694 1, // translate just the NUL
2695 NULL, // output buffer
2696 0, // and its size
2697 NULL, // no replacement char
2698 NULL // [out] don't care if it was used
2699 );
2700
2701 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2702 switch ( len )
2703 {
2704 default:
2705 wxLogDebug(wxT("Unexpected NUL length %d"), len);
2706 self->m_minMBCharWidth = (size_t)-1;
2707 break;
2708
2709 case 0:
2710 self->m_minMBCharWidth = (size_t)-1;
2711 break;
2712
2713 case 1:
2714 case 2:
2715 case 4:
2716 self->m_minMBCharWidth = len;
2717 break;
2718 }
2719 }
2720
2721 return m_minMBCharWidth;
2722 }
2723
2724 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2725
2726 bool IsOk() const { return m_CodePage != -1; }
2727
2728 private:
2729 static bool CanUseNoBestFit()
2730 {
2731 static int s_isWin98Or2k = -1;
2732
2733 if ( s_isWin98Or2k == -1 )
2734 {
2735 int verMaj, verMin;
2736 switch ( wxGetOsVersion(&verMaj, &verMin) )
2737 {
2738 case wxOS_WINDOWS_9X:
2739 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2740 break;
2741
2742 case wxOS_WINDOWS_NT:
2743 s_isWin98Or2k = verMaj >= 5;
2744 break;
2745
2746 default:
2747 // unknown: be conservative by default
2748 s_isWin98Or2k = 0;
2749 break;
2750 }
2751
2752 wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2753 }
2754
2755 return s_isWin98Or2k == 1;
2756 }
2757
2758 static bool IsAtLeastWin2kSP4()
2759 {
2760 #ifdef __WXWINCE__
2761 return false;
2762 #else
2763 static int s_isAtLeastWin2kSP4 = -1;
2764
2765 if ( s_isAtLeastWin2kSP4 == -1 )
2766 {
2767 OSVERSIONINFOEX ver;
2768
2769 memset(&ver, 0, sizeof(ver));
2770 ver.dwOSVersionInfoSize = sizeof(ver);
2771 GetVersionEx((OSVERSIONINFO*)&ver);
2772
2773 s_isAtLeastWin2kSP4 =
2774 ((ver.dwMajorVersion > 5) || // Vista+
2775 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2776 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2777 ver.wServicePackMajor >= 4)) // 2000 SP4+
2778 ? 1 : 0;
2779 }
2780
2781 return s_isAtLeastWin2kSP4 == 1;
2782 #endif
2783 }
2784
2785
2786 // the code page we're working with
2787 long m_CodePage;
2788
2789 // cached result of GetMBNulLen(), set to 0 initially meaning
2790 // "unknown"
2791 size_t m_minMBCharWidth;
2792 };
2793
2794 #endif // wxHAVE_WIN32_MB2WC
2795
2796
2797 // ============================================================================
2798 // wxEncodingConverter based conversion classes
2799 // ============================================================================
2800
2801 #if wxUSE_FONTMAP
2802
2803 class wxMBConv_wxwin : public wxMBConv
2804 {
2805 private:
2806 void Init()
2807 {
2808 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2809 // The wxMBConv_cf class does a better job.
2810 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2811 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2812 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2813 }
2814
2815 public:
2816 // temporarily just use wxEncodingConverter stuff,
2817 // so that it works while a better implementation is built
2818 wxMBConv_wxwin(const char* name)
2819 {
2820 if (name)
2821 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2822 else
2823 m_enc = wxFONTENCODING_SYSTEM;
2824
2825 Init();
2826 }
2827
2828 wxMBConv_wxwin(wxFontEncoding enc)
2829 {
2830 m_enc = enc;
2831
2832 Init();
2833 }
2834
2835 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2836 {
2837 size_t inbuf = strlen(psz);
2838 if (buf)
2839 {
2840 if (!m2w.Convert(psz, buf))
2841 return wxCONV_FAILED;
2842 }
2843 return inbuf;
2844 }
2845
2846 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2847 {
2848 const size_t inbuf = wxWcslen(psz);
2849 if (buf)
2850 {
2851 if (!w2m.Convert(psz, buf))
2852 return wxCONV_FAILED;
2853 }
2854
2855 return inbuf;
2856 }
2857
2858 virtual size_t GetMBNulLen() const
2859 {
2860 switch ( m_enc )
2861 {
2862 case wxFONTENCODING_UTF16BE:
2863 case wxFONTENCODING_UTF16LE:
2864 return 2;
2865
2866 case wxFONTENCODING_UTF32BE:
2867 case wxFONTENCODING_UTF32LE:
2868 return 4;
2869
2870 default:
2871 return 1;
2872 }
2873 }
2874
2875 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2876
2877 bool IsOk() const { return m_ok; }
2878
2879 public:
2880 wxFontEncoding m_enc;
2881 wxEncodingConverter m2w, w2m;
2882
2883 private:
2884 // were we initialized successfully?
2885 bool m_ok;
2886
2887 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2888 };
2889
2890 // make the constructors available for unit testing
2891 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2892 {
2893 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2894 if ( !result->IsOk() )
2895 {
2896 delete result;
2897 return 0;
2898 }
2899
2900 return result;
2901 }
2902
2903 #endif // wxUSE_FONTMAP
2904
2905 // ============================================================================
2906 // wxCSConv implementation
2907 // ============================================================================
2908
2909 void wxCSConv::Init()
2910 {
2911 m_name = NULL;
2912 m_convReal = NULL;
2913 m_deferred = true;
2914 }
2915
2916 wxCSConv::wxCSConv(const wxString& charset)
2917 {
2918 Init();
2919
2920 if ( !charset.empty() )
2921 {
2922 SetName(charset.ToAscii());
2923 }
2924
2925 #if wxUSE_FONTMAP
2926 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2927 if ( m_encoding == wxFONTENCODING_MAX )
2928 {
2929 // set to unknown/invalid value
2930 m_encoding = wxFONTENCODING_SYSTEM;
2931 }
2932 else if ( m_encoding == wxFONTENCODING_DEFAULT )
2933 {
2934 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2935 m_encoding = wxFONTENCODING_ISO8859_1;
2936 }
2937 #else
2938 m_encoding = wxFONTENCODING_SYSTEM;
2939 #endif
2940 }
2941
2942 wxCSConv::wxCSConv(wxFontEncoding encoding)
2943 {
2944 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2945 {
2946 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2947
2948 encoding = wxFONTENCODING_SYSTEM;
2949 }
2950
2951 Init();
2952
2953 m_encoding = encoding;
2954 }
2955
2956 wxCSConv::~wxCSConv()
2957 {
2958 Clear();
2959 }
2960
2961 wxCSConv::wxCSConv(const wxCSConv& conv)
2962 : wxMBConv()
2963 {
2964 Init();
2965
2966 SetName(conv.m_name);
2967 m_encoding = conv.m_encoding;
2968 }
2969
2970 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2971 {
2972 Clear();
2973
2974 SetName(conv.m_name);
2975 m_encoding = conv.m_encoding;
2976
2977 return *this;
2978 }
2979
2980 void wxCSConv::Clear()
2981 {
2982 free(m_name);
2983 delete m_convReal;
2984
2985 m_name = NULL;
2986 m_convReal = NULL;
2987 }
2988
2989 void wxCSConv::SetName(const char *charset)
2990 {
2991 if (charset)
2992 {
2993 m_name = wxStrdup(charset);
2994 m_deferred = true;
2995 }
2996 }
2997
2998 #if wxUSE_FONTMAP
2999
3000 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3001 wxEncodingNameCache );
3002
3003 static wxEncodingNameCache gs_nameCache;
3004 #endif
3005
3006 wxMBConv *wxCSConv::DoCreate() const
3007 {
3008 #if wxUSE_FONTMAP
3009 wxLogTrace(TRACE_STRCONV,
3010 wxT("creating conversion for %s"),
3011 (m_name ? m_name
3012 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3013 #endif // wxUSE_FONTMAP
3014
3015 // check for the special case of ASCII or ISO8859-1 charset: as we have
3016 // special knowledge of it anyhow, we don't need to create a special
3017 // conversion object
3018 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3019 m_encoding == wxFONTENCODING_DEFAULT )
3020 {
3021 // don't convert at all
3022 return NULL;
3023 }
3024
3025 // we trust OS to do conversion better than we can so try external
3026 // conversion methods first
3027 //
3028 // the full order is:
3029 // 1. OS conversion (iconv() under Unix or Win32 API)
3030 // 2. hard coded conversions for UTF
3031 // 3. wxEncodingConverter as fall back
3032
3033 // step (1)
3034 #ifdef HAVE_ICONV
3035 #if !wxUSE_FONTMAP
3036 if ( m_name )
3037 #endif // !wxUSE_FONTMAP
3038 {
3039 #if wxUSE_FONTMAP
3040 wxFontEncoding encoding(m_encoding);
3041 #endif
3042
3043 if ( m_name )
3044 {
3045 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3046 if ( conv->IsOk() )
3047 return conv;
3048
3049 delete conv;
3050
3051 #if wxUSE_FONTMAP
3052 encoding =
3053 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3054 #endif // wxUSE_FONTMAP
3055 }
3056 #if wxUSE_FONTMAP
3057 {
3058 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3059 if ( it != gs_nameCache.end() )
3060 {
3061 if ( it->second.empty() )
3062 return NULL;
3063
3064 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3065 if ( conv->IsOk() )
3066 return conv;
3067
3068 delete conv;
3069 }
3070
3071 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3072 // CS : in case this does not return valid names (eg for MacRoman)
3073 // encoding got a 'failure' entry in the cache all the same,
3074 // although it just has to be created using a different method, so
3075 // only store failed iconv creation attempts (or perhaps we
3076 // shoulnd't do this at all ?)
3077 if ( names[0] != NULL )
3078 {
3079 for ( ; *names; ++names )
3080 {
3081 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3082 // will need changes that will obsolete this
3083 wxString name(*names);
3084 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3085 if ( conv->IsOk() )
3086 {
3087 gs_nameCache[encoding] = *names;
3088 return conv;
3089 }
3090
3091 delete conv;
3092 }
3093
3094 gs_nameCache[encoding] = wxT(""); // cache the failure
3095 }
3096 }
3097 #endif // wxUSE_FONTMAP
3098 }
3099 #endif // HAVE_ICONV
3100
3101 #ifdef wxHAVE_WIN32_MB2WC
3102 {
3103 #if wxUSE_FONTMAP
3104 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3105 : new wxMBConv_win32(m_encoding);
3106 if ( conv->IsOk() )
3107 return conv;
3108
3109 delete conv;
3110 #else
3111 return NULL;
3112 #endif
3113 }
3114 #endif // wxHAVE_WIN32_MB2WC
3115
3116 #ifdef __DARWIN__
3117 {
3118 // leave UTF16 and UTF32 to the built-ins of wx
3119 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3120 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3121 {
3122 #if wxUSE_FONTMAP
3123 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3124 : new wxMBConv_cf(m_encoding);
3125 #else
3126 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3127 #endif
3128
3129 if ( conv->IsOk() )
3130 return conv;
3131
3132 delete conv;
3133 }
3134 }
3135 #endif // __DARWIN__
3136
3137 // step (2)
3138 wxFontEncoding enc = m_encoding;
3139 #if wxUSE_FONTMAP
3140 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3141 {
3142 // use "false" to suppress interactive dialogs -- we can be called from
3143 // anywhere and popping up a dialog from here is the last thing we want to
3144 // do
3145 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3146 }
3147 #endif // wxUSE_FONTMAP
3148
3149 switch ( enc )
3150 {
3151 case wxFONTENCODING_UTF7:
3152 return new wxMBConvUTF7;
3153
3154 case wxFONTENCODING_UTF8:
3155 return new wxMBConvUTF8;
3156
3157 case wxFONTENCODING_UTF16BE:
3158 return new wxMBConvUTF16BE;
3159
3160 case wxFONTENCODING_UTF16LE:
3161 return new wxMBConvUTF16LE;
3162
3163 case wxFONTENCODING_UTF32BE:
3164 return new wxMBConvUTF32BE;
3165
3166 case wxFONTENCODING_UTF32LE:
3167 return new wxMBConvUTF32LE;
3168
3169 default:
3170 // nothing to do but put here to suppress gcc warnings
3171 break;
3172 }
3173
3174 // step (3)
3175 #if wxUSE_FONTMAP
3176 {
3177 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3178 : new wxMBConv_wxwin(m_encoding);
3179 if ( conv->IsOk() )
3180 return conv;
3181
3182 delete conv;
3183 }
3184
3185 wxLogTrace(TRACE_STRCONV,
3186 wxT("encoding \"%s\" is not supported by this system"),
3187 (m_name ? wxString(m_name)
3188 : wxFontMapperBase::GetEncodingName(m_encoding)));
3189 #endif // wxUSE_FONTMAP
3190
3191 return NULL;
3192 }
3193
3194 void wxCSConv::CreateConvIfNeeded() const
3195 {
3196 if ( m_deferred )
3197 {
3198 wxCSConv *self = (wxCSConv *)this; // const_cast
3199
3200 // if we don't have neither the name nor the encoding, use the default
3201 // encoding for this system
3202 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3203 {
3204 #if wxUSE_INTL
3205 self->m_encoding = wxLocale::GetSystemEncoding();
3206 #else
3207 // fallback to some reasonable default:
3208 self->m_encoding = wxFONTENCODING_ISO8859_1;
3209 #endif // wxUSE_INTL
3210 }
3211
3212 self->m_convReal = DoCreate();
3213 self->m_deferred = false;
3214 }
3215 }
3216
3217 bool wxCSConv::IsOk() const
3218 {
3219 CreateConvIfNeeded();
3220
3221 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3222 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3223 return true; // always ok as we do it ourselves
3224
3225 // m_convReal->IsOk() is called at its own creation, so we know it must
3226 // be ok if m_convReal is non-NULL
3227 return m_convReal != NULL;
3228 }
3229
3230 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3231 const char *src, size_t srcLen) const
3232 {
3233 CreateConvIfNeeded();
3234
3235 if (m_convReal)
3236 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3237
3238 // latin-1 (direct)
3239 if ( srcLen == wxNO_LEN )
3240 srcLen = strlen(src) + 1; // take trailing NUL too
3241
3242 if ( dst )
3243 {
3244 if ( dstLen < srcLen )
3245 return wxCONV_FAILED;
3246
3247 for ( size_t n = 0; n < srcLen; n++ )
3248 dst[n] = (unsigned char)(src[n]);
3249 }
3250
3251 return srcLen;
3252 }
3253
3254 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3255 const wchar_t *src, size_t srcLen) const
3256 {
3257 CreateConvIfNeeded();
3258
3259 if (m_convReal)
3260 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3261
3262 // latin-1 (direct)
3263 if ( srcLen == wxNO_LEN )
3264 srcLen = wxWcslen(src) + 1;
3265
3266 if ( dst )
3267 {
3268 if ( dstLen < srcLen )
3269 return wxCONV_FAILED;
3270
3271 for ( size_t n = 0; n < srcLen; n++ )
3272 {
3273 if ( src[n] > 0xFF )
3274 return wxCONV_FAILED;
3275
3276 dst[n] = (char)src[n];
3277 }
3278
3279 }
3280 else // still need to check the input validity
3281 {
3282 for ( size_t n = 0; n < srcLen; n++ )
3283 {
3284 if ( src[n] > 0xFF )
3285 return wxCONV_FAILED;
3286 }
3287 }
3288
3289 return srcLen;
3290 }
3291
3292 size_t wxCSConv::GetMBNulLen() const
3293 {
3294 CreateConvIfNeeded();
3295
3296 if ( m_convReal )
3297 {
3298 return m_convReal->GetMBNulLen();
3299 }
3300
3301 // otherwise, we are ISO-8859-1
3302 return 1;
3303 }
3304
3305 #if wxUSE_UNICODE_UTF8
3306 bool wxCSConv::IsUTF8() const
3307 {
3308 CreateConvIfNeeded();
3309
3310 if ( m_convReal )
3311 {
3312 return m_convReal->IsUTF8();
3313 }
3314
3315 // otherwise, we are ISO-8859-1
3316 return false;
3317 }
3318 #endif
3319
3320
3321 #if wxUSE_UNICODE
3322
3323 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3324 {
3325 if ( !s )
3326 return wxWCharBuffer();
3327
3328 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3329 if ( !wbuf )
3330 wbuf = wxMBConvUTF8().cMB2WX(s);
3331 if ( !wbuf )
3332 wbuf = wxConvISO8859_1.cMB2WX(s);
3333
3334 return wbuf;
3335 }
3336
3337 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3338 {
3339 if ( !ws )
3340 return wxCharBuffer();
3341
3342 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3343 if ( !buf )
3344 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3345
3346 return buf;
3347 }
3348
3349 #endif // wxUSE_UNICODE
3350
3351 // ----------------------------------------------------------------------------
3352 // globals
3353 // ----------------------------------------------------------------------------
3354
3355 // NB: The reason why we create converter objects in this convoluted way,
3356 // using a factory function instead of a global variable, is that they
3357 // may be used at static initialization time (some of them are used by
3358 // wxString ctors and there may be a global wxString object). In other
3359 // words, possibly _before_ the converter global object would be
3360 // initialized.
3361
3362 #undef wxConvLibc
3363 #undef wxConvUTF8
3364 #undef wxConvUTF7
3365 #undef wxConvLocal
3366 #undef wxConvISO8859_1
3367
3368 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3369 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3370 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3371 { \
3372 static impl_klass name##Obj ctor_args; \
3373 return &name##Obj; \
3374 } \
3375 /* this ensures that all global converter objects are created */ \
3376 /* by the time static initialization is done, i.e. before any */ \
3377 /* thread is launched: */ \
3378 static klass* gs_##name##instance = wxGet_##name##Ptr()
3379
3380 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3381 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3382
3383 #ifdef __INTELC__
3384 // disable warning "variable 'xxx' was declared but never referenced"
3385 #pragma warning(disable: 177)
3386 #endif // Intel C++
3387
3388 #ifdef __WINDOWS__
3389 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3390 #elif 0 // defined(__WXOSX__)
3391 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
3392 #else
3393 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3394 #endif
3395
3396 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3397 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3398 // provokes an error message about "not enough macro parameters"; and we
3399 // can't use "()" here as the name##Obj declaration would be parsed as a
3400 // function declaration then, so use a semicolon and live with an extra
3401 // empty statement (and hope that no compiler warns about this)
3402 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3403 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3404
3405 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3406 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3407
3408 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3409 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3410
3411 #ifdef __DARWIN__
3412 // The xnu kernel always communicates file paths in decomposed UTF-8.
3413 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3414 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3415 #endif
3416
3417 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3418 #ifdef __DARWIN__
3419 &wxConvMacUTF8DObj;
3420 #else // !__DARWIN__
3421 wxGet_wxConvLibcPtr();
3422 #endif // __DARWIN__/!__DARWIN__
3423
3424 #else // !wxUSE_WCHAR_T
3425
3426 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3427 // stand-ins in absence of wchar_t
3428 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3429 wxConvISO8859_1,
3430 wxConvLocal,
3431 wxConvUTF8;
3432
3433 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T