]> git.saurik.com Git - wxWidgets.git/blame_incremental - src/common/strconv.cpp
don't create conversion objects unless we really need to convert
[wxWidgets.git] / src / common / strconv.cpp
... / ...
CommitLineData
1/////////////////////////////////////////////////////////////////////////////
2// Name: strconv.cpp
3// Purpose: Unicode conversion classes
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
11// (c) 2004 Ryan Norton, Fredrik Roubert
12// Licence: wxWindows licence
13/////////////////////////////////////////////////////////////////////////////
14
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
23#if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25#endif
26
27// For compilers that support precompilation, includes "wx.h".
28#include "wx/wxprec.h"
29
30#ifdef __BORLANDC__
31 #pragma hdrstop
32#endif
33
34#ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37#endif // WX_PRECOMP
38
39#include "wx/strconv.h"
40
41#if wxUSE_WCHAR_T
42
43#ifdef __WINDOWS__
44 #include "wx/msw/private.h"
45 #include "wx/msw/missing.h"
46#endif
47
48#ifndef __WXWINCE__
49#include <errno.h>
50#endif
51
52#include <ctype.h>
53#include <string.h>
54#include <stdlib.h>
55
56#if defined(__WIN32__) && !defined(__WXMICROWIN__)
57 #define wxHAVE_WIN32_MB2WC
58#endif // __WIN32__ but !__WXMICROWIN__
59
60// ----------------------------------------------------------------------------
61// headers
62// ----------------------------------------------------------------------------
63
64#ifdef __SALFORDC__
65 #include <clib.h>
66#endif
67
68#ifdef HAVE_ICONV
69 #include <iconv.h>
70 #include "wx/thread.h"
71#endif
72
73#include "wx/encconv.h"
74#include "wx/fontmap.h"
75#include "wx/utils.h"
76
77#ifdef __WXMAC__
78#ifndef __DARWIN__
79#include <ATSUnicode.h>
80#include <TextCommon.h>
81#include <TextEncodingConverter.h>
82#endif
83
84#include "wx/mac/private.h" // includes mac headers
85#endif
86
87#define TRACE_STRCONV _T("strconv")
88
89// ----------------------------------------------------------------------------
90// macros
91// ----------------------------------------------------------------------------
92
93#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
94#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
95
96#if SIZEOF_WCHAR_T == 4
97 #define WC_NAME "UCS4"
98 #define WC_BSWAP BSWAP_UCS4
99 #ifdef WORDS_BIGENDIAN
100 #define WC_NAME_BEST "UCS-4BE"
101 #else
102 #define WC_NAME_BEST "UCS-4LE"
103 #endif
104#elif SIZEOF_WCHAR_T == 2
105 #define WC_NAME "UTF16"
106 #define WC_BSWAP BSWAP_UTF16
107 #define WC_UTF16
108 #ifdef WORDS_BIGENDIAN
109 #define WC_NAME_BEST "UTF-16BE"
110 #else
111 #define WC_NAME_BEST "UTF-16LE"
112 #endif
113#else // sizeof(wchar_t) != 2 nor 4
114 // does this ever happen?
115 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
116#endif
117
118// ============================================================================
119// implementation
120// ============================================================================
121
122// ----------------------------------------------------------------------------
123// UTF-16 en/decoding to/from UCS-4
124// ----------------------------------------------------------------------------
125
126
127static size_t encode_utf16(wxUint32 input, wxUint16 *output)
128{
129 if (input<=0xffff)
130 {
131 if (output)
132 *output = (wxUint16) input;
133 return 1;
134 }
135 else if (input>=0x110000)
136 {
137 return (size_t)-1;
138 }
139 else
140 {
141 if (output)
142 {
143 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
144 *output = (wxUint16) ((input&0x3ff)+0xdc00);
145 }
146 return 2;
147 }
148}
149
150static size_t decode_utf16(const wxUint16* input, wxUint32& output)
151{
152 if ((*input<0xd800) || (*input>0xdfff))
153 {
154 output = *input;
155 return 1;
156 }
157 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
158 {
159 output = *input;
160 return (size_t)-1;
161 }
162 else
163 {
164 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
165 return 2;
166 }
167}
168
169
170// ----------------------------------------------------------------------------
171// wxMBConv
172// ----------------------------------------------------------------------------
173
174wxMBConv::~wxMBConv()
175{
176 // nothing to do here (necessary for Darwin linking probably)
177}
178
179const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
180{
181 if ( psz )
182 {
183 // calculate the length of the buffer needed first
184 size_t nLen = MB2WC(NULL, psz, 0);
185 if ( nLen != (size_t)-1 )
186 {
187 // now do the actual conversion
188 wxWCharBuffer buf(nLen);
189 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
190 if ( nLen != (size_t)-1 )
191 {
192 return buf;
193 }
194 }
195 }
196
197 wxWCharBuffer buf((wchar_t *)NULL);
198
199 return buf;
200}
201
202const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
203{
204 if ( pwz )
205 {
206 size_t nLen = WC2MB(NULL, pwz, 0);
207 if ( nLen != (size_t)-1 )
208 {
209 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
210 nLen = WC2MB(buf.data(), pwz, nLen + 4);
211 if ( nLen != (size_t)-1 )
212 {
213 return buf;
214 }
215 }
216 }
217
218 wxCharBuffer buf((char *)NULL);
219
220 return buf;
221}
222
223const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
224{
225 wxASSERT(pOutSize != NULL);
226
227 const char* szEnd = szString + nStringLen + 1;
228 const char* szPos = szString;
229 const char* szStart = szPos;
230
231 size_t nActualLength = 0;
232 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
233
234 wxWCharBuffer theBuffer(nCurrentSize);
235
236 //Convert the string until the length() is reached, continuing the
237 //loop every time a null character is reached
238 while(szPos != szEnd)
239 {
240 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
241
242 //Get the length of the current (sub)string
243 size_t nLen = MB2WC(NULL, szPos, 0);
244
245 //Invalid conversion?
246 if( nLen == (size_t)-1 )
247 {
248 *pOutSize = 0;
249 theBuffer.data()[0u] = wxT('\0');
250 return theBuffer;
251 }
252
253
254 //Increase the actual length (+1 for current null character)
255 nActualLength += nLen + 1;
256
257 //if buffer too big, realloc the buffer
258 if (nActualLength > (nCurrentSize+1))
259 {
260 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
261 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
262 theBuffer = theNewBuffer;
263 nCurrentSize <<= 1;
264 }
265
266 //Convert the current (sub)string
267 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
268 {
269 *pOutSize = 0;
270 theBuffer.data()[0u] = wxT('\0');
271 return theBuffer;
272 }
273
274 //Increment to next (sub)string
275 //Note that we have to use strlen instead of nLen here
276 //because XX2XX gives us the size of the output buffer,
277 //which is not necessarily the length of the string
278 szPos += strlen(szPos) + 1;
279 }
280
281 //success - return actual length and the buffer
282 *pOutSize = nActualLength;
283 return theBuffer;
284}
285
286const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
287{
288 wxASSERT(pOutSize != NULL);
289
290 const wchar_t* szEnd = szString + nStringLen + 1;
291 const wchar_t* szPos = szString;
292 const wchar_t* szStart = szPos;
293
294 size_t nActualLength = 0;
295 size_t nCurrentSize = nStringLen << 2; //try * 4 first
296
297 wxCharBuffer theBuffer(nCurrentSize);
298
299 //Convert the string until the length() is reached, continuing the
300 //loop every time a null character is reached
301 while(szPos != szEnd)
302 {
303 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
304
305 //Get the length of the current (sub)string
306 size_t nLen = WC2MB(NULL, szPos, 0);
307
308 //Invalid conversion?
309 if( nLen == (size_t)-1 )
310 {
311 *pOutSize = 0;
312 theBuffer.data()[0u] = wxT('\0');
313 return theBuffer;
314 }
315
316 //Increase the actual length (+1 for current null character)
317 nActualLength += nLen + 1;
318
319 //if buffer too big, realloc the buffer
320 if (nActualLength > (nCurrentSize+1))
321 {
322 wxCharBuffer theNewBuffer(nCurrentSize << 1);
323 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
324 theBuffer = theNewBuffer;
325 nCurrentSize <<= 1;
326 }
327
328 //Convert the current (sub)string
329 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
330 {
331 *pOutSize = 0;
332 theBuffer.data()[0u] = wxT('\0');
333 return theBuffer;
334 }
335
336 //Increment to next (sub)string
337 //Note that we have to use wxWcslen instead of nLen here
338 //because XX2XX gives us the size of the output buffer,
339 //which is not necessarily the length of the string
340 szPos += wxWcslen(szPos) + 1;
341 }
342
343 //success - return actual length and the buffer
344 *pOutSize = nActualLength;
345 return theBuffer;
346}
347
348// ----------------------------------------------------------------------------
349// wxMBConvLibc
350// ----------------------------------------------------------------------------
351
352size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
353{
354 return wxMB2WC(buf, psz, n);
355}
356
357size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
358{
359 return wxWC2MB(buf, psz, n);
360}
361
362#ifdef __UNIX__
363
364// ----------------------------------------------------------------------------
365// wxConvBrokenFileNames
366// ----------------------------------------------------------------------------
367
368wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
369{
370 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
371 || wxStricmp(charset, _T("UTF8")) == 0 )
372 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
373 else
374 m_conv = new wxCSConv(charset);
375}
376
377size_t
378wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
379 const char *psz,
380 size_t outputSize) const
381{
382 return m_conv->MB2WC( outputBuf, psz, outputSize );
383}
384
385size_t
386wxConvBrokenFileNames::WC2MB(char *outputBuf,
387 const wchar_t *psz,
388 size_t outputSize) const
389{
390 return m_conv->WC2MB( outputBuf, psz, outputSize );
391}
392
393#endif
394
395// ----------------------------------------------------------------------------
396// UTF-7
397// ----------------------------------------------------------------------------
398
399// Implementation (C) 2004 Fredrik Roubert
400
401//
402// BASE64 decoding table
403//
// Maps a byte to the 6-bit value of the base64 digit it represents
// ('A'-'Z' -> 0x00-0x19, 'a'-'z' -> 0x1a-0x33, '0'-'9' -> 0x34-0x3d,
// '+' -> 0x3e, '/' -> 0x3f); every other byte maps to 0xff.
// The table has 256 entries, 16 per row below.
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0x8f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x90 - 0x9f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xa0 - 0xaf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xb0 - 0xbf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xc0 - 0xcf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xd0 - 0xdf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xe0 - 0xef
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xf0 - 0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
439
440size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
441{
442 size_t len = 0;
443
444 while (*psz && ((!buf) || (len < n)))
445 {
446 unsigned char cc = *psz++;
447 if (cc != '+')
448 {
449 // plain ASCII char
450 if (buf)
451 *buf++ = cc;
452 len++;
453 }
454 else if (*psz == '-')
455 {
456 // encoded plus sign
457 if (buf)
458 *buf++ = cc;
459 len++;
460 psz++;
461 }
462 else
463 {
464 // BASE64 encoded string
465 bool lsb;
466 unsigned char c;
467 unsigned int d, l;
468 for (lsb = false, d = 0, l = 0;
469 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
470 {
471 d <<= 6;
472 d += cc;
473 for (l += 6; l >= 8; lsb = !lsb)
474 {
475 c = (unsigned char)((d >> (l -= 8)) % 256);
476 if (lsb)
477 {
478 if (buf)
479 *buf++ |= c;
480 len ++;
481 }
482 else
483 if (buf)
484 *buf = (wchar_t)(c << 8);
485 }
486 }
487 if (*psz == '-')
488 psz++;
489 }
490 }
491 if (buf && (len < n))
492 *buf = 0;
493 return len;
494}
495
496//
497// BASE64 encoding table
498//
// Maps a 6-bit value (0-63) to the base64 digit representing it. The table
// has exactly 64 entries and is not NUL-terminated.
static const unsigned char utf7enb64[] =
{
    //  0 - 15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16 - 31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32 - 47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48 - 63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
510
511//
512// UTF-7 encoding table
513//
514// 0 - Set D (directly encoded characters)
515// 1 - Set O (optional direct characters)
516// 2 - whitespace characters (optional)
517// 3 - special characters
518//
// Class of each of the 128 ASCII characters, using the numbering given in
// the comment above (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special).
static const unsigned char utf7encode[128] =
{
    // 0x00 - 0x0f: control characters, with TAB, LF and CR as whitespace
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    // 0x10 - 0x1f: control characters
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0x20 - 0x2f: space and punctuation, '+' and '/' are special
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    // 0x30 - 0x3f: digits and punctuation
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    // 0x40 - 0x4f: '@' and upper case letters
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x50 - 0x5f: upper case letters and punctuation, '\' is special
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    // 0x60 - 0x6f: '`' and lower case letters
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x70 - 0x7f: lower case letters and punctuation, '~' and DEL are special
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
530
531size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
532{
533
534
535 size_t len = 0;
536
537 while (*psz && ((!buf) || (len < n)))
538 {
539 wchar_t cc = *psz++;
540 if (cc < 0x80 && utf7encode[cc] < 1)
541 {
542 // plain ASCII char
543 if (buf)
544 *buf++ = (char)cc;
545 len++;
546 }
547#ifndef WC_UTF16
548 else if (((wxUint32)cc) > 0xffff)
549 {
550 // no surrogate pair generation (yet?)
551 return (size_t)-1;
552 }
553#endif
554 else
555 {
556 if (buf)
557 *buf++ = '+';
558 len++;
559 if (cc != '+')
560 {
561 // BASE64 encode string
562 unsigned int lsb, d, l;
563 for (d = 0, l = 0;; psz++)
564 {
565 for (lsb = 0; lsb < 2; lsb ++)
566 {
567 d <<= 8;
568 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
569
570 for (l += 8; l >= 6; )
571 {
572 l -= 6;
573 if (buf)
574 *buf++ = utf7enb64[(d >> l) % 64];
575 len++;
576 }
577 }
578 cc = *psz;
579 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
580 break;
581 }
582 if (l != 0)
583 {
584 if (buf)
585 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
586 len++;
587 }
588 }
589 if (buf)
590 *buf++ = '-';
591 len++;
592 }
593 }
594 if (buf && (len < n))
595 *buf = 0;
596 return len;
597}
598
599// ----------------------------------------------------------------------------
600// UTF-8
601// ----------------------------------------------------------------------------
602
603static wxUint32 utf8_max[]=
604 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
605
606// boundaries of the private use area we use to (temporarily) remap invalid
607// characters invalid in a UTF-8 encoded string
608const wxUint32 wxUnicodePUA = 0x100000;
609const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
610
611size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
612{
613 size_t len = 0;
614
615 while (*psz && ((!buf) || (len < n)))
616 {
617 const char *opsz = psz;
618 bool invalid = false;
619 unsigned char cc = *psz++, fc = cc;
620 unsigned cnt;
621 for (cnt = 0; fc & 0x80; cnt++)
622 fc <<= 1;
623 if (!cnt)
624 {
625 // plain ASCII char
626 if (buf)
627 *buf++ = cc;
628 len++;
629
630 // escape the escape character for octal escapes
631 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
632 && cc == '\\' && (!buf || len < n))
633 {
634 if (buf)
635 *buf++ = cc;
636 len++;
637 }
638 }
639 else
640 {
641 cnt--;
642 if (!cnt)
643 {
644 // invalid UTF-8 sequence
645 invalid = true;
646 }
647 else
648 {
649 unsigned ocnt = cnt - 1;
650 wxUint32 res = cc & (0x3f >> cnt);
651 while (cnt--)
652 {
653 cc = *psz;
654 if ((cc & 0xC0) != 0x80)
655 {
656 // invalid UTF-8 sequence
657 invalid = true;
658 break;
659 }
660 psz++;
661 res = (res << 6) | (cc & 0x3f);
662 }
663 if (invalid || res <= utf8_max[ocnt])
664 {
665 // illegal UTF-8 encoding
666 invalid = true;
667 }
668 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
669 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
670 {
671 // if one of our PUA characters turns up externally
672 // it must also be treated as an illegal sequence
673 // (a bit like you have to escape an escape character)
674 invalid = true;
675 }
676 else
677 {
678#ifdef WC_UTF16
679 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
680 size_t pa = encode_utf16(res, (wxUint16 *)buf);
681 if (pa == (size_t)-1)
682 {
683 invalid = true;
684 }
685 else
686 {
687 if (buf)
688 buf += pa;
689 len += pa;
690 }
691#else // !WC_UTF16
692 if (buf)
693 *buf++ = res;
694 len++;
695#endif // WC_UTF16/!WC_UTF16
696 }
697 }
698 if (invalid)
699 {
700 if (m_options & MAP_INVALID_UTF8_TO_PUA)
701 {
702 while (opsz < psz && (!buf || len < n))
703 {
704#ifdef WC_UTF16
705 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
706 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
707 wxASSERT(pa != (size_t)-1);
708 if (buf)
709 buf += pa;
710 opsz++;
711 len += pa;
712#else
713 if (buf)
714 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
715 opsz++;
716 len++;
717#endif
718 }
719 }
720 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
721 {
722 while (opsz < psz && (!buf || len < n))
723 {
724 if ( buf && len + 3 < n )
725 {
726 unsigned char n = *opsz;
727 *buf++ = L'\\';
728 *buf++ = (wchar_t)( L'0' + n / 0100 );
729 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
730 *buf++ = (wchar_t)( L'0' + n % 010 );
731 }
732 opsz++;
733 len += 4;
734 }
735 }
736 else // MAP_INVALID_UTF8_NOT
737 {
738 return (size_t)-1;
739 }
740 }
741 }
742 }
743 if (buf && (len < n))
744 *buf = 0;
745 return len;
746}
747
// Return true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
752
753size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
754{
755 size_t len = 0;
756
757 while (*psz && ((!buf) || (len < n)))
758 {
759 wxUint32 cc;
760#ifdef WC_UTF16
761 // cast is ok for WC_UTF16
762 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
763 psz += (pa == (size_t)-1) ? 1 : pa;
764#else
765 cc=(*psz++) & 0x7fffffff;
766#endif
767
768 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
769 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
770 {
771 if (buf)
772 *buf++ = (char)(cc - wxUnicodePUA);
773 len++;
774 }
775 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
776 && cc == L'\\' && psz[0] == L'\\' )
777 {
778 if (buf)
779 *buf++ = (char)cc;
780 psz++;
781 len++;
782 }
783 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
784 cc == L'\\' &&
785 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
786 {
787 if (buf)
788 {
789 *buf++ = (char) ((psz[0] - L'0')*0100 +
790 (psz[1] - L'0')*010 +
791 (psz[2] - L'0'));
792 }
793
794 psz += 3;
795 len++;
796 }
797 else
798 {
799 unsigned cnt;
800 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
801 if (!cnt)
802 {
803 // plain ASCII char
804 if (buf)
805 *buf++ = (char) cc;
806 len++;
807 }
808
809 else
810 {
811 len += cnt + 1;
812 if (buf)
813 {
814 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
815 while (cnt--)
816 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
817 }
818 }
819 }
820 }
821
822 if (buf && (len<n))
823 *buf = 0;
824
825 return len;
826}
827
828// ----------------------------------------------------------------------------
829// UTF-16
830// ----------------------------------------------------------------------------
831
// The methods below are written once for the UTF-16 converter using the byte
// order selected by WORDS_BIGENDIAN ("straight") and once for the converter
// using the opposite one ("swap"); map these names to the real classes.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight   wxMBConvUTF16BE
    #define wxMBConvUTF16swap       wxMBConvUTF16LE
#else
    #define wxMBConvUTF16straight   wxMBConvUTF16LE
    #define wxMBConvUTF16swap       wxMBConvUTF16BE
#endif
839
840
841#ifdef WC_UTF16
842
843// copy 16bit MB to 16bit String
844size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
845{
846 size_t len=0;
847
848 while (*(wxUint16*)psz && (!buf || len < n))
849 {
850 if (buf)
851 *buf++ = *(wxUint16*)psz;
852 len++;
853
854 psz += sizeof(wxUint16);
855 }
856 if (buf && len<n) *buf=0;
857
858 return len;
859}
860
861
862// copy 16bit String to 16bit MB
863size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
864{
865 size_t len=0;
866
867 while (*psz && (!buf || len < n))
868 {
869 if (buf)
870 {
871 *(wxUint16*)buf = *psz;
872 buf += sizeof(wxUint16);
873 }
874 len += sizeof(wxUint16);
875 psz++;
876 }
877 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
878
879 return len;
880}
881
882
883// swap 16bit MB to 16bit String
884size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
885{
886 size_t len=0;
887
888 while (*(wxUint16*)psz && (!buf || len < n))
889 {
890 if (buf)
891 {
892 ((char *)buf)[0] = psz[1];
893 ((char *)buf)[1] = psz[0];
894 buf++;
895 }
896 len++;
897 psz += sizeof(wxUint16);
898 }
899 if (buf && len<n) *buf=0;
900
901 return len;
902}
903
904
// swap 16bit String to 16bit MB
906size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
907{
908 size_t len=0;
909
910 while (*psz && (!buf || len < n))
911 {
912 if (buf)
913 {
914 *buf++ = ((char*)psz)[1];
915 *buf++ = ((char*)psz)[0];
916 }
917 len += sizeof(wxUint16);
918 psz++;
919 }
920 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
921
922 return len;
923}
924
925
926#else // WC_UTF16
927
928
929// copy 16bit MB to 32bit String
930size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
931{
932 size_t len=0;
933
934 while (*(wxUint16*)psz && (!buf || len < n))
935 {
936 wxUint32 cc;
937 size_t pa=decode_utf16((wxUint16*)psz, cc);
938 if (pa == (size_t)-1)
939 return pa;
940
941 if (buf)
942 *buf++ = cc;
943 len++;
944 psz += pa * sizeof(wxUint16);
945 }
946 if (buf && len<n) *buf=0;
947
948 return len;
949}
950
951
952// copy 32bit String to 16bit MB
953size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
954{
955 size_t len=0;
956
957 while (*psz && (!buf || len < n))
958 {
959 wxUint16 cc[2];
960 size_t pa=encode_utf16(*psz, cc);
961
962 if (pa == (size_t)-1)
963 return pa;
964
965 if (buf)
966 {
967 *(wxUint16*)buf = cc[0];
968 buf += sizeof(wxUint16);
969 if (pa > 1)
970 {
971 *(wxUint16*)buf = cc[1];
972 buf += sizeof(wxUint16);
973 }
974 }
975
976 len += pa*sizeof(wxUint16);
977 psz++;
978 }
979 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
980
981 return len;
982}
983
984
985// swap 16bit MB to 32bit String
986size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
987{
988 size_t len=0;
989
990 while (*(wxUint16*)psz && (!buf || len < n))
991 {
992 wxUint32 cc;
993 char tmp[4];
994 tmp[0]=psz[1]; tmp[1]=psz[0];
995 tmp[2]=psz[3]; tmp[3]=psz[2];
996
997 size_t pa=decode_utf16((wxUint16*)tmp, cc);
998 if (pa == (size_t)-1)
999 return pa;
1000
1001 if (buf)
1002 *buf++ = cc;
1003
1004 len++;
1005 psz += pa * sizeof(wxUint16);
1006 }
1007 if (buf && len<n) *buf=0;
1008
1009 return len;
1010}
1011
1012
1013// swap 32bit String to 16bit MB
1014size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1015{
1016 size_t len=0;
1017
1018 while (*psz && (!buf || len < n))
1019 {
1020 wxUint16 cc[2];
1021 size_t pa=encode_utf16(*psz, cc);
1022
1023 if (pa == (size_t)-1)
1024 return pa;
1025
1026 if (buf)
1027 {
1028 *buf++ = ((char*)cc)[1];
1029 *buf++ = ((char*)cc)[0];
1030 if (pa > 1)
1031 {
1032 *buf++ = ((char*)cc)[3];
1033 *buf++ = ((char*)cc)[2];
1034 }
1035 }
1036
1037 len += pa*sizeof(wxUint16);
1038 psz++;
1039 }
1040 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1041
1042 return len;
1043}
1044
1045#endif // WC_UTF16
1046
1047
1048// ----------------------------------------------------------------------------
1049// UTF-32
1050// ----------------------------------------------------------------------------
1051
1052#ifdef WORDS_BIGENDIAN
1053#define wxMBConvUTF32straight wxMBConvUTF32BE
1054#define wxMBConvUTF32swap wxMBConvUTF32LE
1055#else
1056#define wxMBConvUTF32swap wxMBConvUTF32BE
1057#define wxMBConvUTF32straight wxMBConvUTF32LE
1058#endif
1059
1060
1061WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1062WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1063
1064
1065#ifdef WC_UTF16
1066
1067// copy 32bit MB to 16bit String
1068size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1069{
1070 size_t len=0;
1071
1072 while (*(wxUint32*)psz && (!buf || len < n))
1073 {
1074 wxUint16 cc[2];
1075
1076 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1077 if (pa == (size_t)-1)
1078 return pa;
1079
1080 if (buf)
1081 {
1082 *buf++ = cc[0];
1083 if (pa > 1)
1084 *buf++ = cc[1];
1085 }
1086 len += pa;
1087 psz += sizeof(wxUint32);
1088 }
1089 if (buf && len<n) *buf=0;
1090
1091 return len;
1092}
1093
1094
1095// copy 16bit String to 32bit MB
1096size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1097{
1098 size_t len=0;
1099
1100 while (*psz && (!buf || len < n))
1101 {
1102 wxUint32 cc;
1103
1104 // cast is ok for WC_UTF16
1105 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1106 if (pa == (size_t)-1)
1107 return pa;
1108
1109 if (buf)
1110 {
1111 *(wxUint32*)buf = cc;
1112 buf += sizeof(wxUint32);
1113 }
1114 len += sizeof(wxUint32);
1115 psz += pa;
1116 }
1117
1118 if (buf && len<=n-sizeof(wxUint32))
1119 *(wxUint32*)buf=0;
1120
1121 return len;
1122}
1123
1124
1125
1126// swap 32bit MB to 16bit String
1127size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1128{
1129 size_t len=0;
1130
1131 while (*(wxUint32*)psz && (!buf || len < n))
1132 {
1133 char tmp[4];
1134 tmp[0] = psz[3]; tmp[1] = psz[2];
1135 tmp[2] = psz[1]; tmp[3] = psz[0];
1136
1137
1138 wxUint16 cc[2];
1139
1140 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1141 if (pa == (size_t)-1)
1142 return pa;
1143
1144 if (buf)
1145 {
1146 *buf++ = cc[0];
1147 if (pa > 1)
1148 *buf++ = cc[1];
1149 }
1150 len += pa;
1151 psz += sizeof(wxUint32);
1152 }
1153
1154 if (buf && len<n)
1155 *buf=0;
1156
1157 return len;
1158}
1159
1160
1161// swap 16bit String to 32bit MB
1162size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1163{
1164 size_t len=0;
1165
1166 while (*psz && (!buf || len < n))
1167 {
1168 char cc[4];
1169
1170 // cast is ok for WC_UTF16
1171 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1172 if (pa == (size_t)-1)
1173 return pa;
1174
1175 if (buf)
1176 {
1177 *buf++ = cc[3];
1178 *buf++ = cc[2];
1179 *buf++ = cc[1];
1180 *buf++ = cc[0];
1181 }
1182 len += sizeof(wxUint32);
1183 psz += pa;
1184 }
1185
1186 if (buf && len<=n-sizeof(wxUint32))
1187 *(wxUint32*)buf=0;
1188
1189 return len;
1190}
1191
1192#else // WC_UTF16
1193
1194
1195// copy 32bit MB to 32bit String
1196size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1197{
1198 size_t len=0;
1199
1200 while (*(wxUint32*)psz && (!buf || len < n))
1201 {
1202 if (buf)
1203 *buf++ = *(wxUint32*)psz;
1204 len++;
1205 psz += sizeof(wxUint32);
1206 }
1207
1208 if (buf && len<n)
1209 *buf=0;
1210
1211 return len;
1212}
1213
1214
1215// copy 32bit String to 32bit MB
1216size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1217{
1218 size_t len=0;
1219
1220 while (*psz && (!buf || len < n))
1221 {
1222 if (buf)
1223 {
1224 *(wxUint32*)buf = *psz;
1225 buf += sizeof(wxUint32);
1226 }
1227
1228 len += sizeof(wxUint32);
1229 psz++;
1230 }
1231
1232 if (buf && len<=n-sizeof(wxUint32))
1233 *(wxUint32*)buf=0;
1234
1235 return len;
1236}
1237
1238
1239// swap 32bit MB to 32bit String
1240size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1241{
1242 size_t len=0;
1243
1244 while (*(wxUint32*)psz && (!buf || len < n))
1245 {
1246 if (buf)
1247 {
1248 ((char *)buf)[0] = psz[3];
1249 ((char *)buf)[1] = psz[2];
1250 ((char *)buf)[2] = psz[1];
1251 ((char *)buf)[3] = psz[0];
1252 buf++;
1253 }
1254 len++;
1255 psz += sizeof(wxUint32);
1256 }
1257
1258 if (buf && len<n)
1259 *buf=0;
1260
1261 return len;
1262}
1263
1264
1265// swap 32bit String to 32bit MB
1266size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1267{
1268 size_t len=0;
1269
1270 while (*psz && (!buf || len < n))
1271 {
1272 if (buf)
1273 {
1274 *buf++ = ((char *)psz)[3];
1275 *buf++ = ((char *)psz)[2];
1276 *buf++ = ((char *)psz)[1];
1277 *buf++ = ((char *)psz)[0];
1278 }
1279 len += sizeof(wxUint32);
1280 psz++;
1281 }
1282
1283 if (buf && len<=n-sizeof(wxUint32))
1284 *(wxUint32*)buf=0;
1285
1286 return len;
1287}
1288
1289
1290#endif // WC_UTF16
1291
1292
1293// ============================================================================
1294// The classes doing conversion using the iconv_xxx() functions
1295// ============================================================================
1296
1297#ifdef HAVE_ICONV
1298
1299// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1300// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1301// (unless there's yet another bug in glibc) the only case when iconv()
1302// returns with (size_t)-1 (which means error) and says there are 0 bytes
1303// left in the input buffer -- when _real_ error occurs,
1304// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1305// iconv() failure.
1306// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call which returned `cres`
// and left `bufLeft` bytes in its input buffer must be considered failed
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    // glibc up to 2.1: an error return with E2BIG and nothing left in the
    // input buffer is not treated as a failure
    #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                         (errno != E2BIG || bufLeft != 0))
#else
    // everywhere else any error return is a failure, bufLeft is not used
    #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
#endif

// cast a pointer to the "ICONV_CONST char **" type (ICONV_CONST is not
// defined in this file)
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1315
1316// ----------------------------------------------------------------------------
1317// wxMBConv_iconv: encapsulates an iconv character set
1318// ----------------------------------------------------------------------------
1319
1320class wxMBConv_iconv : public wxMBConv
1321{
1322public:
1323 wxMBConv_iconv(const wxChar *name);
1324 virtual ~wxMBConv_iconv();
1325
1326 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1327 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1328
1329 bool IsOk() const
1330 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1331
1332protected:
1333 // the iconv handlers used to translate from multibyte to wide char and in
1334 // the other direction
1335 iconv_t m2w,
1336 w2m;
1337#if wxUSE_THREADS
1338 // guards access to m2w and w2m objects
1339 wxMutex m_iconvMutex;
1340#endif
1341
1342private:
1343 // the name (for iconv_open()) of a wide char charset -- if none is
1344 // available on this machine, it will remain NULL
1345 static const char *ms_wcCharsetName;
1346
1347 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1348 // different endian-ness than the native one
1349 static bool ms_wcNeedsSwap;
1350};
1351
1352// make the constructor available for unit testing
1353WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1354{
1355 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1356 if ( !result->IsOk() )
1357 {
1358 delete result;
1359 return 0;
1360 }
1361 return result;
1362}
1363
1364const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1365bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1366
1367wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1368{
1369 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1370 // names for the charsets
1371 const wxCharBuffer cname(wxString(name).ToAscii());
1372
1373 // check for charset that represents wchar_t:
1374 if (ms_wcCharsetName == NULL)
1375 {
1376 ms_wcNeedsSwap = false;
1377
1378 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1379 ms_wcCharsetName = WC_NAME_BEST;
1380 m2w = iconv_open(ms_wcCharsetName, cname);
1381
1382 if (m2w == (iconv_t)-1)
1383 {
1384 // try charset w/o bytesex info (e.g. "UCS4")
1385 // and check for bytesex ourselves:
1386 ms_wcCharsetName = WC_NAME;
1387 m2w = iconv_open(ms_wcCharsetName, cname);
1388
1389 // last bet, try if it knows WCHAR_T pseudo-charset
1390 if (m2w == (iconv_t)-1)
1391 {
1392 ms_wcCharsetName = "WCHAR_T";
1393 m2w = iconv_open(ms_wcCharsetName, cname);
1394 }
1395
1396 if (m2w != (iconv_t)-1)
1397 {
1398 char buf[2], *bufPtr;
1399 wchar_t wbuf[2], *wbufPtr;
1400 size_t insz, outsz;
1401 size_t res;
1402
1403 buf[0] = 'A';
1404 buf[1] = 0;
1405 wbuf[0] = 0;
1406 insz = 2;
1407 outsz = SIZEOF_WCHAR_T * 2;
1408 wbufPtr = wbuf;
1409 bufPtr = buf;
1410
1411 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1412 (char**)&wbufPtr, &outsz);
1413
1414 if (ICONV_FAILED(res, insz))
1415 {
1416 ms_wcCharsetName = NULL;
1417 wxLogLastError(wxT("iconv"));
1418 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1419 }
1420 else
1421 {
1422 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1423 }
1424 }
1425 else
1426 {
1427 ms_wcCharsetName = NULL;
1428
1429 // VS: we must not output an error here, since wxWidgets will safely
1430 // fall back to using wxEncodingConverter.
1431 wxLogTrace(TRACE_STRCONV, wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1432 //wxLogError(
1433 }
1434 }
1435 wxLogTrace(TRACE_STRCONV, wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1436 }
1437 else // we already have ms_wcCharsetName
1438 {
1439 m2w = iconv_open(ms_wcCharsetName, cname);
1440 }
1441
1442 // NB: don't ever pass NULL to iconv_open(), it may crash!
1443 if ( ms_wcCharsetName )
1444 {
1445 w2m = iconv_open( cname, ms_wcCharsetName);
1446 }
1447 else
1448 {
1449 w2m = (iconv_t)-1;
1450 }
1451}
1452
1453wxMBConv_iconv::~wxMBConv_iconv()
1454{
1455 if ( m2w != (iconv_t)-1 )
1456 iconv_close(m2w);
1457 if ( w2m != (iconv_t)-1 )
1458 iconv_close(w2m);
1459}
1460
1461size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1462{
1463#if wxUSE_THREADS
1464 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1465 // Unfortunately there is a couple of global wxCSConv objects such as
1466 // wxConvLocal that are used all over wx code, so we have to make sure
1467 // the handle is used by at most one thread at the time. Otherwise
1468 // only a few wx classes would be safe to use from non-main threads
1469 // as MB<->WC conversion would fail "randomly".
1470 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1471#endif
1472
1473 size_t inbuf = strlen(psz);
1474 size_t outbuf = n * SIZEOF_WCHAR_T;
1475 size_t res, cres;
1476 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1477 wchar_t *bufPtr = buf;
1478 const char *pszPtr = psz;
1479
1480 if (buf)
1481 {
1482 // have destination buffer, convert there
1483 cres = iconv(m2w,
1484 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1485 (char**)&bufPtr, &outbuf);
1486 res = n - (outbuf / SIZEOF_WCHAR_T);
1487
1488 if (ms_wcNeedsSwap)
1489 {
1490 // convert to native endianness
1491 WC_BSWAP(buf /* _not_ bufPtr */, res)
1492 }
1493
1494 // NB: iconv was given only strlen(psz) characters on input, and so
1495 // it couldn't convert the trailing zero. Let's do it ourselves
1496 // if there's some room left for it in the output buffer.
1497 if (res < n)
1498 buf[res] = 0;
1499 }
1500 else
1501 {
1502 // no destination buffer... convert using temp buffer
1503 // to calculate destination buffer requirement
1504 wchar_t tbuf[8];
1505 res = 0;
1506 do {
1507 bufPtr = tbuf;
1508 outbuf = 8*SIZEOF_WCHAR_T;
1509
1510 cres = iconv(m2w,
1511 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1512 (char**)&bufPtr, &outbuf );
1513
1514 res += 8-(outbuf/SIZEOF_WCHAR_T);
1515 } while ((cres==(size_t)-1) && (errno==E2BIG));
1516 }
1517
1518 if (ICONV_FAILED(cres, inbuf))
1519 {
1520 //VS: it is ok if iconv fails, hence trace only
1521 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1522 return (size_t)-1;
1523 }
1524
1525 return res;
1526}
1527
1528size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1529{
1530#if wxUSE_THREADS
1531 // NB: explained in MB2WC
1532 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1533#endif
1534
1535 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1536 size_t outbuf = n;
1537 size_t res, cres;
1538
1539 wchar_t *tmpbuf = 0;
1540
1541 if (ms_wcNeedsSwap)
1542 {
1543 // need to copy to temp buffer to switch endianness
1544 // this absolutely doesn't rock!
1545 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1546 // could be in read-only memory, or be accessed in some other thread)
1547 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1548 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1549 WC_BSWAP(tmpbuf, inbuf)
1550 psz=tmpbuf;
1551 }
1552
1553 if (buf)
1554 {
1555 // have destination buffer, convert there
1556 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1557
1558 res = n-outbuf;
1559
1560 // NB: iconv was given only wcslen(psz) characters on input, and so
1561 // it couldn't convert the trailing zero. Let's do it ourselves
1562 // if there's some room left for it in the output buffer.
1563 if (res < n)
1564 buf[0] = 0;
1565 }
1566 else
1567 {
1568 // no destination buffer... convert using temp buffer
1569 // to calculate destination buffer requirement
1570 char tbuf[16];
1571 res = 0;
1572 do {
1573 buf = tbuf; outbuf = 16;
1574
1575 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1576
1577 res += 16 - outbuf;
1578 } while ((cres==(size_t)-1) && (errno==E2BIG));
1579 }
1580
1581 if (ms_wcNeedsSwap)
1582 {
1583 free(tmpbuf);
1584 }
1585
1586 if (ICONV_FAILED(cres, inbuf))
1587 {
1588 //VS: it is ok if iconv fails, hence trace only
1589 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1590 return (size_t)-1;
1591 }
1592
1593 return res;
1594}
1595
1596#endif // HAVE_ICONV
1597
1598
1599// ============================================================================
1600// Win32 conversion classes
1601// ============================================================================
1602
1603#ifdef wxHAVE_WIN32_MB2WC
1604
1605// from utils.cpp
1606#if wxUSE_FONTMAP
1607extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1608extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1609#endif
1610
1611class wxMBConv_win32 : public wxMBConv
1612{
1613public:
1614 wxMBConv_win32()
1615 {
1616 m_CodePage = CP_ACP;
1617 }
1618
1619#if wxUSE_FONTMAP
1620 wxMBConv_win32(const wxChar* name)
1621 {
1622 m_CodePage = wxCharsetToCodepage(name);
1623 }
1624
1625 wxMBConv_win32(wxFontEncoding encoding)
1626 {
1627 m_CodePage = wxEncodingToCodepage(encoding);
1628 }
1629#endif
1630
1631 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1632 {
1633 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1634 // the behaviour is not compatible with the Unix version (using iconv)
1635 // and break the library itself, e.g. wxTextInputStream::NextChar()
1636 // wouldn't work if reading an incomplete MB char didn't result in an
1637 // error
1638 //
1639 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1640 // an error (tested under Windows Server 2003) and apparently it is
1641 // done on purpose, i.e. the function accepts any input in this case
1642 // and although I'd prefer to return error on ill-formed output, our
1643 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1644 // explicitly ill-formed according to RFC 2152) neither so we don't
1645 // even have any fallback here...
1646 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1647
1648 const size_t len = ::MultiByteToWideChar
1649 (
1650 m_CodePage, // code page
1651 flags, // flags: fall on error
1652 psz, // input string
1653 -1, // its length (NUL-terminated)
1654 buf, // output string
1655 buf ? n : 0 // size of output buffer
1656 );
1657
1658 // note that it returns count of written chars for buf != NULL and size
1659 // of the needed buffer for buf == NULL so in either case the length of
1660 // the string (which never includes the terminating NUL) is one less
1661 return len ? len - 1 : (size_t)-1;
1662 }
1663
1664 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1665 {
1666 /*
1667 we have a problem here: by default, WideCharToMultiByte() may
1668 replace characters unrepresentable in the target code page with bad
1669 quality approximations such as turning "1/2" symbol (U+00BD) into
1670 "1" for the code pages which don't have it and we, obviously, want
1671 to avoid this at any price
1672
1673 the trouble is that this function does it _silently_, i.e. it won't
1674 even tell us whether it did or not... Win98/2000 and higher provide
1675 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1676 we have to resort to a round trip, i.e. check that converting back
1677 results in the same string -- this is, of course, expensive but
1678 otherwise we simply can't be sure to not garble the data.
1679 */
1680
1681 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1682 // it doesn't work with CJK encodings (which we test for rather roughly
1683 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1684 // supporting it
1685 BOOL usedDef wxDUMMY_INITIALIZE(false);
1686 BOOL *pUsedDef;
1687 int flags;
1688 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1689 {
1690 // it's our lucky day
1691 flags = WC_NO_BEST_FIT_CHARS;
1692 pUsedDef = &usedDef;
1693 }
1694 else // old system or unsupported encoding
1695 {
1696 flags = 0;
1697 pUsedDef = NULL;
1698 }
1699
1700 const size_t len = ::WideCharToMultiByte
1701 (
1702 m_CodePage, // code page
1703 flags, // either none or no best fit
1704 pwz, // input string
1705 -1, // it is (wide) NUL-terminated
1706 buf, // output buffer
1707 buf ? n : 0, // and its size
1708 NULL, // default "replacement" char
1709 pUsedDef // [out] was it used?
1710 );
1711
1712 if ( !len )
1713 {
1714 // function totally failed
1715 return (size_t)-1;
1716 }
1717
1718 // if we were really converting, check if we succeeded
1719 if ( buf )
1720 {
1721 if ( flags )
1722 {
1723 // check if the conversion failed, i.e. if any replacements
1724 // were done
1725 if ( usedDef )
1726 return (size_t)-1;
1727 }
1728 else // we must resort to double tripping...
1729 {
1730 wxWCharBuffer wcBuf(n);
1731 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1732 wcscmp(wcBuf, pwz) != 0 )
1733 {
1734 // we didn't obtain the same thing we started from, hence
1735 // the conversion was lossy and we consider that it failed
1736 return (size_t)-1;
1737 }
1738 }
1739 }
1740
1741 // see the comment above for the reason of "len - 1"
1742 return len - 1;
1743 }
1744
1745 bool IsOk() const { return m_CodePage != -1; }
1746
1747private:
1748 static bool CanUseNoBestFit()
1749 {
1750 static int s_isWin98Or2k = -1;
1751
1752 if ( s_isWin98Or2k == -1 )
1753 {
1754 int verMaj, verMin;
1755 switch ( wxGetOsVersion(&verMaj, &verMin) )
1756 {
1757 case wxWIN95:
1758 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1759 break;
1760
1761 case wxWINDOWS_NT:
1762 s_isWin98Or2k = verMaj >= 5;
1763 break;
1764
1765 default:
1766 // unknown, be conseravtive by default
1767 s_isWin98Or2k = 0;
1768 }
1769
1770 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1771 }
1772
1773 return s_isWin98Or2k == 1;
1774 }
1775
1776 long m_CodePage;
1777};
1778
1779#endif // wxHAVE_WIN32_MB2WC
1780
1781// ============================================================================
1782// Cocoa conversion classes
1783// ============================================================================
1784
1785#if defined(__WXCOCOA__)
1786
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1790
1791#include <CoreFoundation/CFString.h>
1792#include <CoreFoundation/CFStringEncodingExt.h>
1793
1794CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1795{
1796 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1797 if ( encoding == wxFONTENCODING_DEFAULT )
1798 {
1799 enc = CFStringGetSystemEncoding();
1800 }
1801 else switch( encoding)
1802 {
1803 case wxFONTENCODING_ISO8859_1 :
1804 enc = kCFStringEncodingISOLatin1 ;
1805 break ;
1806 case wxFONTENCODING_ISO8859_2 :
1807 enc = kCFStringEncodingISOLatin2;
1808 break ;
1809 case wxFONTENCODING_ISO8859_3 :
1810 enc = kCFStringEncodingISOLatin3 ;
1811 break ;
1812 case wxFONTENCODING_ISO8859_4 :
1813 enc = kCFStringEncodingISOLatin4;
1814 break ;
1815 case wxFONTENCODING_ISO8859_5 :
1816 enc = kCFStringEncodingISOLatinCyrillic;
1817 break ;
1818 case wxFONTENCODING_ISO8859_6 :
1819 enc = kCFStringEncodingISOLatinArabic;
1820 break ;
1821 case wxFONTENCODING_ISO8859_7 :
1822 enc = kCFStringEncodingISOLatinGreek;
1823 break ;
1824 case wxFONTENCODING_ISO8859_8 :
1825 enc = kCFStringEncodingISOLatinHebrew;
1826 break ;
1827 case wxFONTENCODING_ISO8859_9 :
1828 enc = kCFStringEncodingISOLatin5;
1829 break ;
1830 case wxFONTENCODING_ISO8859_10 :
1831 enc = kCFStringEncodingISOLatin6;
1832 break ;
1833 case wxFONTENCODING_ISO8859_11 :
1834 enc = kCFStringEncodingISOLatinThai;
1835 break ;
1836 case wxFONTENCODING_ISO8859_13 :
1837 enc = kCFStringEncodingISOLatin7;
1838 break ;
1839 case wxFONTENCODING_ISO8859_14 :
1840 enc = kCFStringEncodingISOLatin8;
1841 break ;
1842 case wxFONTENCODING_ISO8859_15 :
1843 enc = kCFStringEncodingISOLatin9;
1844 break ;
1845
1846 case wxFONTENCODING_KOI8 :
1847 enc = kCFStringEncodingKOI8_R;
1848 break ;
1849 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1850 enc = kCFStringEncodingDOSRussian;
1851 break ;
1852
1853// case wxFONTENCODING_BULGARIAN :
1854// enc = ;
1855// break ;
1856
1857 case wxFONTENCODING_CP437 :
1858 enc =kCFStringEncodingDOSLatinUS ;
1859 break ;
1860 case wxFONTENCODING_CP850 :
1861 enc = kCFStringEncodingDOSLatin1;
1862 break ;
1863 case wxFONTENCODING_CP852 :
1864 enc = kCFStringEncodingDOSLatin2;
1865 break ;
1866 case wxFONTENCODING_CP855 :
1867 enc = kCFStringEncodingDOSCyrillic;
1868 break ;
1869 case wxFONTENCODING_CP866 :
1870 enc =kCFStringEncodingDOSRussian ;
1871 break ;
1872 case wxFONTENCODING_CP874 :
1873 enc = kCFStringEncodingDOSThai;
1874 break ;
1875 case wxFONTENCODING_CP932 :
1876 enc = kCFStringEncodingDOSJapanese;
1877 break ;
1878 case wxFONTENCODING_CP936 :
1879 enc =kCFStringEncodingDOSChineseSimplif ;
1880 break ;
1881 case wxFONTENCODING_CP949 :
1882 enc = kCFStringEncodingDOSKorean;
1883 break ;
1884 case wxFONTENCODING_CP950 :
1885 enc = kCFStringEncodingDOSChineseTrad;
1886 break ;
1887 case wxFONTENCODING_CP1250 :
1888 enc = kCFStringEncodingWindowsLatin2;
1889 break ;
1890 case wxFONTENCODING_CP1251 :
1891 enc =kCFStringEncodingWindowsCyrillic ;
1892 break ;
1893 case wxFONTENCODING_CP1252 :
1894 enc =kCFStringEncodingWindowsLatin1 ;
1895 break ;
1896 case wxFONTENCODING_CP1253 :
1897 enc = kCFStringEncodingWindowsGreek;
1898 break ;
1899 case wxFONTENCODING_CP1254 :
1900 enc = kCFStringEncodingWindowsLatin5;
1901 break ;
1902 case wxFONTENCODING_CP1255 :
1903 enc =kCFStringEncodingWindowsHebrew ;
1904 break ;
1905 case wxFONTENCODING_CP1256 :
1906 enc =kCFStringEncodingWindowsArabic ;
1907 break ;
1908 case wxFONTENCODING_CP1257 :
1909 enc = kCFStringEncodingWindowsBalticRim;
1910 break ;
1911// This only really encodes to UTF7 (if that) evidently
1912// case wxFONTENCODING_UTF7 :
1913// enc = kCFStringEncodingNonLossyASCII ;
1914// break ;
1915 case wxFONTENCODING_UTF8 :
1916 enc = kCFStringEncodingUTF8 ;
1917 break ;
1918 case wxFONTENCODING_EUC_JP :
1919 enc = kCFStringEncodingEUC_JP;
1920 break ;
1921 case wxFONTENCODING_UTF16 :
1922 enc = kCFStringEncodingUnicode ;
1923 break ;
1924 case wxFONTENCODING_MACROMAN :
1925 enc = kCFStringEncodingMacRoman ;
1926 break ;
1927 case wxFONTENCODING_MACJAPANESE :
1928 enc = kCFStringEncodingMacJapanese ;
1929 break ;
1930 case wxFONTENCODING_MACCHINESETRAD :
1931 enc = kCFStringEncodingMacChineseTrad ;
1932 break ;
1933 case wxFONTENCODING_MACKOREAN :
1934 enc = kCFStringEncodingMacKorean ;
1935 break ;
1936 case wxFONTENCODING_MACARABIC :
1937 enc = kCFStringEncodingMacArabic ;
1938 break ;
1939 case wxFONTENCODING_MACHEBREW :
1940 enc = kCFStringEncodingMacHebrew ;
1941 break ;
1942 case wxFONTENCODING_MACGREEK :
1943 enc = kCFStringEncodingMacGreek ;
1944 break ;
1945 case wxFONTENCODING_MACCYRILLIC :
1946 enc = kCFStringEncodingMacCyrillic ;
1947 break ;
1948 case wxFONTENCODING_MACDEVANAGARI :
1949 enc = kCFStringEncodingMacDevanagari ;
1950 break ;
1951 case wxFONTENCODING_MACGURMUKHI :
1952 enc = kCFStringEncodingMacGurmukhi ;
1953 break ;
1954 case wxFONTENCODING_MACGUJARATI :
1955 enc = kCFStringEncodingMacGujarati ;
1956 break ;
1957 case wxFONTENCODING_MACORIYA :
1958 enc = kCFStringEncodingMacOriya ;
1959 break ;
1960 case wxFONTENCODING_MACBENGALI :
1961 enc = kCFStringEncodingMacBengali ;
1962 break ;
1963 case wxFONTENCODING_MACTAMIL :
1964 enc = kCFStringEncodingMacTamil ;
1965 break ;
1966 case wxFONTENCODING_MACTELUGU :
1967 enc = kCFStringEncodingMacTelugu ;
1968 break ;
1969 case wxFONTENCODING_MACKANNADA :
1970 enc = kCFStringEncodingMacKannada ;
1971 break ;
1972 case wxFONTENCODING_MACMALAJALAM :
1973 enc = kCFStringEncodingMacMalayalam ;
1974 break ;
1975 case wxFONTENCODING_MACSINHALESE :
1976 enc = kCFStringEncodingMacSinhalese ;
1977 break ;
1978 case wxFONTENCODING_MACBURMESE :
1979 enc = kCFStringEncodingMacBurmese ;
1980 break ;
1981 case wxFONTENCODING_MACKHMER :
1982 enc = kCFStringEncodingMacKhmer ;
1983 break ;
1984 case wxFONTENCODING_MACTHAI :
1985 enc = kCFStringEncodingMacThai ;
1986 break ;
1987 case wxFONTENCODING_MACLAOTIAN :
1988 enc = kCFStringEncodingMacLaotian ;
1989 break ;
1990 case wxFONTENCODING_MACGEORGIAN :
1991 enc = kCFStringEncodingMacGeorgian ;
1992 break ;
1993 case wxFONTENCODING_MACARMENIAN :
1994 enc = kCFStringEncodingMacArmenian ;
1995 break ;
1996 case wxFONTENCODING_MACCHINESESIMP :
1997 enc = kCFStringEncodingMacChineseSimp ;
1998 break ;
1999 case wxFONTENCODING_MACTIBETAN :
2000 enc = kCFStringEncodingMacTibetan ;
2001 break ;
2002 case wxFONTENCODING_MACMONGOLIAN :
2003 enc = kCFStringEncodingMacMongolian ;
2004 break ;
2005 case wxFONTENCODING_MACETHIOPIC :
2006 enc = kCFStringEncodingMacEthiopic ;
2007 break ;
2008 case wxFONTENCODING_MACCENTRALEUR :
2009 enc = kCFStringEncodingMacCentralEurRoman ;
2010 break ;
2011 case wxFONTENCODING_MACVIATNAMESE :
2012 enc = kCFStringEncodingMacVietnamese ;
2013 break ;
2014 case wxFONTENCODING_MACARABICEXT :
2015 enc = kCFStringEncodingMacExtArabic ;
2016 break ;
2017 case wxFONTENCODING_MACSYMBOL :
2018 enc = kCFStringEncodingMacSymbol ;
2019 break ;
2020 case wxFONTENCODING_MACDINGBATS :
2021 enc = kCFStringEncodingMacDingbats ;
2022 break ;
2023 case wxFONTENCODING_MACTURKISH :
2024 enc = kCFStringEncodingMacTurkish ;
2025 break ;
2026 case wxFONTENCODING_MACCROATIAN :
2027 enc = kCFStringEncodingMacCroatian ;
2028 break ;
2029 case wxFONTENCODING_MACICELANDIC :
2030 enc = kCFStringEncodingMacIcelandic ;
2031 break ;
2032 case wxFONTENCODING_MACROMANIAN :
2033 enc = kCFStringEncodingMacRomanian ;
2034 break ;
2035 case wxFONTENCODING_MACCELTIC :
2036 enc = kCFStringEncodingMacCeltic ;
2037 break ;
2038 case wxFONTENCODING_MACGAELIC :
2039 enc = kCFStringEncodingMacGaelic ;
2040 break ;
2041// case wxFONTENCODING_MACKEYBOARD :
2042// enc = kCFStringEncodingMacKeyboardGlyphs ;
2043// break ;
2044 default :
2045 // because gcc is picky
2046 break ;
2047 } ;
2048 return enc ;
2049}
2050
2051class wxMBConv_cocoa : public wxMBConv
2052{
2053public:
2054 wxMBConv_cocoa()
2055 {
2056 Init(CFStringGetSystemEncoding()) ;
2057 }
2058
2059#if wxUSE_FONTMAP
2060 wxMBConv_cocoa(const wxChar* name)
2061 {
2062 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2063 }
2064#endif
2065
2066 wxMBConv_cocoa(wxFontEncoding encoding)
2067 {
2068 Init( wxCFStringEncFromFontEnc(encoding) );
2069 }
2070
2071 ~wxMBConv_cocoa()
2072 {
2073 }
2074
2075 void Init( CFStringEncoding encoding)
2076 {
2077 m_encoding = encoding ;
2078 }
2079
2080 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2081 {
2082 wxASSERT(szUnConv);
2083
2084 CFStringRef theString = CFStringCreateWithBytes (
2085 NULL, //the allocator
2086 (const UInt8*)szUnConv,
2087 strlen(szUnConv),
2088 m_encoding,
2089 false //no BOM/external representation
2090 );
2091
2092 wxASSERT(theString);
2093
2094 size_t nOutLength = CFStringGetLength(theString);
2095
2096 if (szOut == NULL)
2097 {
2098 CFRelease(theString);
2099 return nOutLength;
2100 }
2101
2102 CFRange theRange = { 0, nOutSize };
2103
2104#if SIZEOF_WCHAR_T == 4
2105 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2106#endif
2107
2108 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2109
2110 CFRelease(theString);
2111
2112 szUniCharBuffer[nOutLength] = '\0' ;
2113
2114#if SIZEOF_WCHAR_T == 4
2115 wxMBConvUTF16 converter ;
2116 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2117 delete[] szUniCharBuffer;
2118#endif
2119
2120 return nOutLength;
2121 }
2122
2123 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2124 {
2125 wxASSERT(szUnConv);
2126
2127 size_t nRealOutSize;
2128 size_t nBufSize = wxWcslen(szUnConv);
2129 UniChar* szUniBuffer = (UniChar*) szUnConv;
2130
2131#if SIZEOF_WCHAR_T == 4
2132 wxMBConvUTF16 converter ;
2133 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2134 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2135 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2136 nBufSize /= sizeof(UniChar);
2137#endif
2138
2139 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2140 NULL, //allocator
2141 szUniBuffer,
2142 nBufSize,
2143 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2144 );
2145
2146 wxASSERT(theString);
2147
2148 //Note that CER puts a BOM when converting to unicode
2149 //so we check and use getchars instead in that case
2150 if (m_encoding == kCFStringEncodingUnicode)
2151 {
2152 if (szOut != NULL)
2153 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2154
2155 nRealOutSize = CFStringGetLength(theString) + 1;
2156 }
2157 else
2158 {
2159 CFStringGetBytes(
2160 theString,
2161 CFRangeMake(0, CFStringGetLength(theString)),
2162 m_encoding,
2163 0, //what to put in characters that can't be converted -
2164 //0 tells CFString to return NULL if it meets such a character
2165 false, //not an external representation
2166 (UInt8*) szOut,
2167 nOutSize,
2168 (CFIndex*) &nRealOutSize
2169 );
2170 }
2171
2172 CFRelease(theString);
2173
2174#if SIZEOF_WCHAR_T == 4
2175 delete[] szUniBuffer;
2176#endif
2177
2178 return nRealOutSize - 1;
2179 }
2180
2181 bool IsOk() const
2182 {
2183 return m_encoding != kCFStringEncodingInvalidId &&
2184 CFStringIsEncodingAvailable(m_encoding);
2185 }
2186
2187private:
2188 CFStringEncoding m_encoding ;
2189};
2190
2191#endif // defined(__WXCOCOA__)
2192
2193// ============================================================================
2194// Mac conversion classes
2195// ============================================================================
2196
2197#if defined(__WXMAC__) && defined(TARGET_CARBON)
2198
2199class wxMBConv_mac : public wxMBConv
2200{
2201public:
2202 wxMBConv_mac()
2203 {
2204 Init(CFStringGetSystemEncoding()) ;
2205 }
2206
2207#if wxUSE_FONTMAP
2208 wxMBConv_mac(const wxChar* name)
2209 {
2210 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2211 }
2212#endif
2213
2214 wxMBConv_mac(wxFontEncoding encoding)
2215 {
2216 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2217 }
2218
2219 ~wxMBConv_mac()
2220 {
2221 OSStatus status = noErr ;
2222 status = TECDisposeConverter(m_MB2WC_converter);
2223 status = TECDisposeConverter(m_WC2MB_converter);
2224 }
2225
2226
2227 void Init( TextEncodingBase encoding)
2228 {
2229 OSStatus status = noErr ;
2230 m_char_encoding = encoding ;
2231 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2232
2233 status = TECCreateConverter(&m_MB2WC_converter,
2234 m_char_encoding,
2235 m_unicode_encoding);
2236 status = TECCreateConverter(&m_WC2MB_converter,
2237 m_unicode_encoding,
2238 m_char_encoding);
2239 }
2240
2241 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2242 {
2243 OSStatus status = noErr ;
2244 ByteCount byteOutLen ;
2245 ByteCount byteInLen = strlen(psz) ;
2246 wchar_t *tbuf = NULL ;
2247 UniChar* ubuf = NULL ;
2248 size_t res = 0 ;
2249
2250 if (buf == NULL)
2251 {
2252 //apple specs say at least 32
2253 n = wxMax( 32 , byteInLen ) ;
2254 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2255 }
2256 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2257#if SIZEOF_WCHAR_T == 4
2258 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2259#else
2260 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2261#endif
2262 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2263 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2264#if SIZEOF_WCHAR_T == 4
2265 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2266 // is not properly terminated we get random characters at the end
2267 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2268 wxMBConvUTF16 converter ;
2269 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2270 free( ubuf ) ;
2271#else
2272 res = byteOutLen / sizeof( UniChar ) ;
2273#endif
2274 if ( buf == NULL )
2275 free(tbuf) ;
2276
2277 if ( buf && res < n)
2278 buf[res] = 0;
2279
2280 return res ;
2281 }
2282
2283 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2284 {
2285 OSStatus status = noErr ;
2286 ByteCount byteOutLen ;
2287 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2288
2289 char *tbuf = NULL ;
2290
2291 if (buf == NULL)
2292 {
2293 //apple specs say at least 32
2294 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2295 tbuf = (char*) malloc( n ) ;
2296 }
2297
2298 ByteCount byteBufferLen = n ;
2299 UniChar* ubuf = NULL ;
2300#if SIZEOF_WCHAR_T == 4
2301 wxMBConvUTF16 converter ;
2302 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2303 byteInLen = unicharlen ;
2304 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2305 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2306#else
2307 ubuf = (UniChar*) psz ;
2308#endif
2309 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2310 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2311#if SIZEOF_WCHAR_T == 4
2312 free( ubuf ) ;
2313#endif
2314 if ( buf == NULL )
2315 free(tbuf) ;
2316
2317 size_t res = byteOutLen ;
2318 if ( buf && res < n)
2319 {
2320 buf[res] = 0;
2321
2322 //we need to double-trip to verify it didn't insert any ? in place
2323 //of bogus characters
2324 wxWCharBuffer wcBuf(n);
2325 size_t pszlen = wxWcslen(psz);
2326 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2327 wxWcslen(wcBuf) != pszlen ||
2328 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2329 {
2330 // we didn't obtain the same thing we started from, hence
2331 // the conversion was lossy and we consider that it failed
2332 return (size_t)-1;
2333 }
2334 }
2335
2336 return res ;
2337 }
2338
2339 bool IsOk() const
2340 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2341
2342private:
2343 TECObjectRef m_MB2WC_converter ;
2344 TECObjectRef m_WC2MB_converter ;
2345
2346 TextEncodingBase m_char_encoding ;
2347 TextEncodingBase m_unicode_encoding ;
2348};
2349
2350#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2351
2352// ============================================================================
2353// wxEncodingConverter based conversion classes
2354// ============================================================================
2355
2356#if wxUSE_FONTMAP
2357
2358class wxMBConv_wxwin : public wxMBConv
2359{
2360private:
2361 void Init()
2362 {
2363 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2364 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2365 }
2366
2367public:
2368 // temporarily just use wxEncodingConverter stuff,
2369 // so that it works while a better implementation is built
2370 wxMBConv_wxwin(const wxChar* name)
2371 {
2372 if (name)
2373 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2374 else
2375 m_enc = wxFONTENCODING_SYSTEM;
2376
2377 Init();
2378 }
2379
2380 wxMBConv_wxwin(wxFontEncoding enc)
2381 {
2382 m_enc = enc;
2383
2384 Init();
2385 }
2386
2387 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2388 {
2389 size_t inbuf = strlen(psz);
2390 if (buf)
2391 {
2392 if (!m2w.Convert(psz,buf))
2393 return (size_t)-1;
2394 }
2395 return inbuf;
2396 }
2397
2398 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2399 {
2400 const size_t inbuf = wxWcslen(psz);
2401 if (buf)
2402 {
2403 if (!w2m.Convert(psz,buf))
2404 return (size_t)-1;
2405 }
2406
2407 return inbuf;
2408 }
2409
2410 bool IsOk() const { return m_ok; }
2411
2412public:
2413 wxFontEncoding m_enc;
2414 wxEncodingConverter m2w, w2m;
2415
2416 // were we initialized successfully?
2417 bool m_ok;
2418
2419 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2420};
2421
2422// make the constructors available for unit testing
2423WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2424{
2425 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2426 if ( !result->IsOk() )
2427 {
2428 delete result;
2429 return 0;
2430 }
2431 return result;
2432}
2433
2434#endif // wxUSE_FONTMAP
2435
2436// ============================================================================
2437// wxCSConv implementation
2438// ============================================================================
2439
2440void wxCSConv::Init()
2441{
2442 m_name = NULL;
2443 m_convReal = NULL;
2444 m_deferred = true;
2445}
2446
2447wxCSConv::wxCSConv(const wxChar *charset)
2448{
2449 Init();
2450
2451 if ( charset )
2452 {
2453 SetName(charset);
2454 }
2455
2456 m_encoding = wxFONTENCODING_SYSTEM;
2457}
2458
2459wxCSConv::wxCSConv(wxFontEncoding encoding)
2460{
2461 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2462 {
2463 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2464
2465 encoding = wxFONTENCODING_SYSTEM;
2466 }
2467
2468 Init();
2469
2470 m_encoding = encoding;
2471}
2472
2473wxCSConv::~wxCSConv()
2474{
2475 Clear();
2476}
2477
2478wxCSConv::wxCSConv(const wxCSConv& conv)
2479 : wxMBConv()
2480{
2481 Init();
2482
2483 SetName(conv.m_name);
2484 m_encoding = conv.m_encoding;
2485}
2486
2487wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2488{
2489 Clear();
2490
2491 SetName(conv.m_name);
2492 m_encoding = conv.m_encoding;
2493
2494 return *this;
2495}
2496
2497void wxCSConv::Clear()
2498{
2499 free(m_name);
2500 delete m_convReal;
2501
2502 m_name = NULL;
2503 m_convReal = NULL;
2504}
2505
2506void wxCSConv::SetName(const wxChar *charset)
2507{
2508 if (charset)
2509 {
2510 m_name = wxStrdup(charset);
2511 m_deferred = true;
2512 }
2513}
2514
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map from a font encoding to a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache filled in by wxCSConv::DoCreate(): remembers, for each encoding, the
// name for which an iconv conversion could be created or an empty string if
// none of the names worked
static wxEncodingNameCache gs_nameCache;
#endif // wxUSE_FONTMAP
2523
2524wxMBConv *wxCSConv::DoCreate() const
2525{
2526#if wxUSE_FONTMAP
2527 wxLogTrace(TRACE_STRCONV,
2528 wxT("creating conversion for %s"),
2529 (m_name ? m_name
2530 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2531#endif // wxUSE_FONTMAP
2532
2533 // check for the special case of ASCII or ISO8859-1 charset: as we have
2534 // special knowledge of it anyhow, we don't need to create a special
2535 // conversion object
2536 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2537 {
2538 // don't convert at all
2539 return NULL;
2540 }
2541
2542 // we trust OS to do conversion better than we can so try external
2543 // conversion methods first
2544 //
2545 // the full order is:
2546 // 1. OS conversion (iconv() under Unix or Win32 API)
2547 // 2. hard coded conversions for UTF
2548 // 3. wxEncodingConverter as fall back
2549
2550 // step (1)
2551#ifdef HAVE_ICONV
2552#if !wxUSE_FONTMAP
2553 if ( m_name )
2554#endif // !wxUSE_FONTMAP
2555 {
2556 wxString name(m_name);
2557 wxFontEncoding encoding(m_encoding);
2558
2559 if ( !name.empty() )
2560 {
2561 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2562 if ( conv->IsOk() )
2563 return conv;
2564
2565 delete conv;
2566
2567#if wxUSE_FONTMAP
2568 encoding =
2569 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2570#endif // wxUSE_FONTMAP
2571 }
2572#if wxUSE_FONTMAP
2573 {
2574 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2575 if ( it != gs_nameCache.end() )
2576 {
2577 if ( it->second.empty() )
2578 return NULL;
2579
2580 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2581 if ( conv->IsOk() )
2582 return conv;
2583
2584 delete conv;
2585 }
2586
2587 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2588
2589 for ( ; *names; ++names )
2590 {
2591 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2592 if ( conv->IsOk() )
2593 {
2594 gs_nameCache[encoding] = *names;
2595 return conv;
2596 }
2597
2598 delete conv;
2599 }
2600
2601 gs_nameCache[encoding] = ""; // cache the failure
2602 }
2603#endif // wxUSE_FONTMAP
2604 }
2605#endif // HAVE_ICONV
2606
2607#ifdef wxHAVE_WIN32_MB2WC
2608 {
2609#if wxUSE_FONTMAP
2610 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2611 : new wxMBConv_win32(m_encoding);
2612 if ( conv->IsOk() )
2613 return conv;
2614
2615 delete conv;
2616#else
2617 return NULL;
2618#endif
2619 }
2620#endif // wxHAVE_WIN32_MB2WC
2621#if defined(__WXMAC__)
2622 {
2623 // leave UTF16 and UTF32 to the built-ins of wx
2624 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2625 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2626 {
2627
2628#if wxUSE_FONTMAP
2629 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2630 : new wxMBConv_mac(m_encoding);
2631#else
2632 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2633#endif
2634 if ( conv->IsOk() )
2635 return conv;
2636
2637 delete conv;
2638 }
2639 }
2640#endif
2641#if defined(__WXCOCOA__)
2642 {
2643 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2644 {
2645
2646#if wxUSE_FONTMAP
2647 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2648 : new wxMBConv_cocoa(m_encoding);
2649#else
2650 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2651#endif
2652 if ( conv->IsOk() )
2653 return conv;
2654
2655 delete conv;
2656 }
2657 }
2658#endif
2659 // step (2)
2660 wxFontEncoding enc = m_encoding;
2661#if wxUSE_FONTMAP
2662 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2663 {
2664 // use "false" to suppress interactive dialogs -- we can be called from
2665 // anywhere and popping up a dialog from here is the last thing we want to
2666 // do
2667 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2668 }
2669#endif // wxUSE_FONTMAP
2670
2671 switch ( enc )
2672 {
2673 case wxFONTENCODING_UTF7:
2674 return new wxMBConvUTF7;
2675
2676 case wxFONTENCODING_UTF8:
2677 return new wxMBConvUTF8;
2678
2679 case wxFONTENCODING_UTF16BE:
2680 return new wxMBConvUTF16BE;
2681
2682 case wxFONTENCODING_UTF16LE:
2683 return new wxMBConvUTF16LE;
2684
2685 case wxFONTENCODING_UTF32BE:
2686 return new wxMBConvUTF32BE;
2687
2688 case wxFONTENCODING_UTF32LE:
2689 return new wxMBConvUTF32LE;
2690
2691 default:
2692 // nothing to do but put here to suppress gcc warnings
2693 ;
2694 }
2695
2696 // step (3)
2697#if wxUSE_FONTMAP
2698 {
2699 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2700 : new wxMBConv_wxwin(m_encoding);
2701 if ( conv->IsOk() )
2702 return conv;
2703
2704 delete conv;
2705 }
2706#endif // wxUSE_FONTMAP
2707
2708 // NB: This is a hack to prevent deadlock. What could otherwise happen
2709 // in Unicode build: wxConvLocal creation ends up being here
2710 // because of some failure and logs the error. But wxLog will try to
2711 // attach timestamp, for which it will need wxConvLocal (to convert
2712 // time to char* and then wchar_t*), but that fails, tries to log
2713 // error, but wxLog has a (already locked) critical section that
2714 // guards static buffer.
2715 static bool alreadyLoggingError = false;
2716 if (!alreadyLoggingError)
2717 {
2718 alreadyLoggingError = true;
2719 wxLogError(_("Cannot convert from the charset '%s'!"),
2720 m_name ? m_name
2721 :
2722#if wxUSE_FONTMAP
2723 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2724#else // !wxUSE_FONTMAP
2725 wxString::Format(_("encoding %s"), m_encoding).c_str()
2726#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2727 );
2728 alreadyLoggingError = false;
2729 }
2730
2731 return NULL;
2732}
2733
2734void wxCSConv::CreateConvIfNeeded() const
2735{
2736 if ( m_deferred )
2737 {
2738 wxCSConv *self = (wxCSConv *)this; // const_cast
2739
2740#if wxUSE_INTL
2741 // if we don't have neither the name nor the encoding, use the default
2742 // encoding for this system
2743 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2744 {
2745 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2746 }
2747#endif // wxUSE_INTL
2748
2749 self->m_convReal = DoCreate();
2750 self->m_deferred = false;
2751 }
2752}
2753
2754size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2755{
2756 CreateConvIfNeeded();
2757
2758 if (m_convReal)
2759 return m_convReal->MB2WC(buf, psz, n);
2760
2761 // latin-1 (direct)
2762 size_t len = strlen(psz);
2763
2764 if (buf)
2765 {
2766 for (size_t c = 0; c <= len; c++)
2767 buf[c] = (unsigned char)(psz[c]);
2768 }
2769
2770 return len;
2771}
2772
2773size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2774{
2775 CreateConvIfNeeded();
2776
2777 if (m_convReal)
2778 return m_convReal->WC2MB(buf, psz, n);
2779
2780 // latin-1 (direct)
2781 const size_t len = wxWcslen(psz);
2782 if (buf)
2783 {
2784 for (size_t c = 0; c <= len; c++)
2785 {
2786 if (psz[c] > 0xFF)
2787 return (size_t)-1;
2788 buf[c] = (char)psz[c];
2789 }
2790 }
2791 else
2792 {
2793 for (size_t c = 0; c <= len; c++)
2794 {
2795 if (psz[c] > 0xFF)
2796 return (size_t)-1;
2797 }
2798 }
2799
2800 return len;
2801}
2802
2803// ----------------------------------------------------------------------------
2804// globals
2805// ----------------------------------------------------------------------------
2806
2807#ifdef __WINDOWS__
2808 static wxMBConv_win32 wxConvLibcObj;
2809#elif defined(__WXMAC__) && !defined(__MACH__)
2810 static wxMBConv_mac wxConvLibcObj ;
2811#else
2812 static wxMBConvLibc wxConvLibcObj;
2813#endif
2814
2815static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2816static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2817static wxMBConvUTF7 wxConvUTF7Obj;
2818static wxMBConvUTF8 wxConvUTF8Obj;
2819
2820WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2821WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2822WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2823WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2824WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2825WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2826WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2827#ifdef __WXOSX__
2828 wxConvUTF8Obj;
2829#else
2830 wxConvLibcObj;
2831#endif
2832
2833
2834#else // !wxUSE_WCHAR_T
2835
2836// stand-ins in absence of wchar_t
2837WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2838 wxConvISO8859_1,
2839 wxConvLocal,
2840 wxConvUTF8;
2841
2842#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2843
2844