]> git.saurik.com Git - wxWidgets.git/blame_incremental - src/common/strconv.cpp
wxMotif for OS/2 adjustments. Source cleaning.
[wxWidgets.git] / src / common / strconv.cpp
... / ...
CommitLineData
1/////////////////////////////////////////////////////////////////////////////
2// Name: src/common/strconv.cpp
3// Purpose: Unicode conversion classes
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
11// (c) 2004 Ryan Norton, Fredrik Roubert
12// Licence: wxWindows licence
13/////////////////////////////////////////////////////////////////////////////
14
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
23// For compilers that support precompilation, includes "wx.h".
24#include "wx/wxprec.h"
25
26#ifdef __BORLANDC__
27 #pragma hdrstop
28#endif
29
30#ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33#endif // WX_PRECOMP
34
35#include "wx/strconv.h"
36
37#if wxUSE_WCHAR_T
38
39#ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42#endif
43
44#ifndef __WXWINCE__
45#include <errno.h>
46#endif
47
48#include <ctype.h>
49#include <string.h>
50#include <stdlib.h>
51
52#if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54#endif // __WIN32__ but !__WXMICROWIN__
55
56#ifdef __SALFORDC__
57 #include <clib.h>
58#endif
59
60#ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63#endif
64
65#include "wx/encconv.h"
66#include "wx/fontmap.h"
67#include "wx/utils.h"
68
69#ifdef __WXMAC__
70#ifndef __DARWIN__
71#include <ATSUnicode.h>
72#include <TextCommon.h>
73#include <TextEncodingConverter.h>
74#endif
75
76#include "wx/mac/private.h" // includes mac headers
77#endif
78
79#define TRACE_STRCONV _T("strconv")
80
81// ============================================================================
82// implementation
83// ============================================================================
84
85// ----------------------------------------------------------------------------
86// UTF-16 en/decoding to/from UCS-4
87// ----------------------------------------------------------------------------
88
89
90static size_t encode_utf16(wxUint32 input, wxUint16 *output)
91{
92 if (input<=0xffff)
93 {
94 if (output)
95 *output = (wxUint16) input;
96 return 1;
97 }
98 else if (input>=0x110000)
99 {
100 return (size_t)-1;
101 }
102 else
103 {
104 if (output)
105 {
106 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
107 *output = (wxUint16) ((input&0x3ff)+0xdc00);
108 }
109 return 2;
110 }
111}
112
113static size_t decode_utf16(const wxUint16* input, wxUint32& output)
114{
115 if ((*input<0xd800) || (*input>0xdfff))
116 {
117 output = *input;
118 return 1;
119 }
120 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
121 {
122 output = *input;
123 return (size_t)-1;
124 }
125 else
126 {
127 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
128 return 2;
129 }
130}
131
132
133// ----------------------------------------------------------------------------
134// wxMBConv
135// ----------------------------------------------------------------------------
136
137wxMBConv::~wxMBConv()
138{
139 // nothing to do here (necessary for Darwin linking probably)
140}
141
142const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
143{
144 if ( psz )
145 {
146 // calculate the length of the buffer needed first
147 size_t nLen = MB2WC(NULL, psz, 0);
148 if ( nLen != (size_t)-1 )
149 {
150 // now do the actual conversion
151 wxWCharBuffer buf(nLen);
152 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
153 if ( nLen != (size_t)-1 )
154 {
155 return buf;
156 }
157 }
158 }
159
160 wxWCharBuffer buf((wchar_t *)NULL);
161
162 return buf;
163}
164
165const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
166{
167 if ( pwz )
168 {
169 size_t nLen = WC2MB(NULL, pwz, 0);
170 if ( nLen != (size_t)-1 )
171 {
172 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
173 nLen = WC2MB(buf.data(), pwz, nLen + 4);
174 if ( nLen != (size_t)-1 )
175 {
176 return buf;
177 }
178 }
179 }
180
181 wxCharBuffer buf((char *)NULL);
182
183 return buf;
184}
185
186const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
187{
188 wxASSERT(pOutSize != NULL);
189
190 const char* szEnd = szString + nStringLen + 1;
191 const char* szPos = szString;
192 const char* szStart = szPos;
193
194 size_t nActualLength = 0;
195 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
196
197 wxWCharBuffer theBuffer(nCurrentSize);
198
199 //Convert the string until the length() is reached, continuing the
200 //loop every time a null character is reached
201 while(szPos != szEnd)
202 {
203 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
204
205 //Get the length of the current (sub)string
206 size_t nLen = MB2WC(NULL, szPos, 0);
207
208 //Invalid conversion?
209 if( nLen == (size_t)-1 )
210 {
211 *pOutSize = 0;
212 theBuffer.data()[0u] = wxT('\0');
213 return theBuffer;
214 }
215
216
217 //Increase the actual length (+1 for current null character)
218 nActualLength += nLen + 1;
219
220 //if buffer too big, realloc the buffer
221 if (nActualLength > (nCurrentSize+1))
222 {
223 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
224 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
225 theBuffer = theNewBuffer;
226 nCurrentSize <<= 1;
227 }
228
229 //Convert the current (sub)string
230 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
231 {
232 *pOutSize = 0;
233 theBuffer.data()[0u] = wxT('\0');
234 return theBuffer;
235 }
236
237 //Increment to next (sub)string
238 //Note that we have to use strlen instead of nLen here
239 //because XX2XX gives us the size of the output buffer,
240 //which is not necessarily the length of the string
241 szPos += strlen(szPos) + 1;
242 }
243
244 //success - return actual length and the buffer
245 *pOutSize = nActualLength;
246 return theBuffer;
247}
248
249const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
250{
251 wxASSERT(pOutSize != NULL);
252
253 const wchar_t* szEnd = szString + nStringLen + 1;
254 const wchar_t* szPos = szString;
255 const wchar_t* szStart = szPos;
256
257 size_t nActualLength = 0;
258 size_t nCurrentSize = nStringLen << 2; //try * 4 first
259
260 wxCharBuffer theBuffer(nCurrentSize);
261
262 //Convert the string until the length() is reached, continuing the
263 //loop every time a null character is reached
264 while(szPos != szEnd)
265 {
266 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
267
268 //Get the length of the current (sub)string
269 size_t nLen = WC2MB(NULL, szPos, 0);
270
271 //Invalid conversion?
272 if( nLen == (size_t)-1 )
273 {
274 *pOutSize = 0;
275 theBuffer.data()[0u] = wxT('\0');
276 return theBuffer;
277 }
278
279 //Increase the actual length (+1 for current null character)
280 nActualLength += nLen + 1;
281
282 //if buffer too big, realloc the buffer
283 if (nActualLength > (nCurrentSize+1))
284 {
285 wxCharBuffer theNewBuffer(nCurrentSize << 1);
286 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
287 theBuffer = theNewBuffer;
288 nCurrentSize <<= 1;
289 }
290
291 //Convert the current (sub)string
292 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
293 {
294 *pOutSize = 0;
295 theBuffer.data()[0u] = wxT('\0');
296 return theBuffer;
297 }
298
299 //Increment to next (sub)string
300 //Note that we have to use wxWcslen instead of nLen here
301 //because XX2XX gives us the size of the output buffer,
302 //which is not necessarily the length of the string
303 szPos += wxWcslen(szPos) + 1;
304 }
305
306 //success - return actual length and the buffer
307 *pOutSize = nActualLength;
308 return theBuffer;
309}
310
311// ----------------------------------------------------------------------------
312// wxMBConvLibc
313// ----------------------------------------------------------------------------
314
315size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
316{
317 return wxMB2WC(buf, psz, n);
318}
319
320size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
321{
322 return wxWC2MB(buf, psz, n);
323}
324
325#ifdef __UNIX__
326
327// ----------------------------------------------------------------------------
328// wxConvBrokenFileNames
329// ----------------------------------------------------------------------------
330
331wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
332{
333 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
334 || wxStricmp(charset, _T("UTF8")) == 0 )
335 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
336 else
337 m_conv = new wxCSConv(charset);
338}
339
340size_t
341wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
342 const char *psz,
343 size_t outputSize) const
344{
345 return m_conv->MB2WC( outputBuf, psz, outputSize );
346}
347
348size_t
349wxConvBrokenFileNames::WC2MB(char *outputBuf,
350 const wchar_t *psz,
351 size_t outputSize) const
352{
353 return m_conv->WC2MB( outputBuf, psz, outputSize );
354}
355
356#endif
357
358// ----------------------------------------------------------------------------
359// UTF-7
360// ----------------------------------------------------------------------------
361
362// Implementation (C) 2004 Fredrik Roubert
363
364//
365// BASE64 decoding table
366//
// Indexed by byte value: the 6 bit BASE64 value of the character, or 0xff
// for bytes which are not BASE64 characters. One row per 16 byte values.
static const unsigned char utf7unb64[] =
{
    /* 0x00 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x10 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x20 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    /* 0x30 */ 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x40 */ 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    /* 0x50 */ 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x60 */ 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    /* 0x70 */ 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x80 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x90 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xa0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xb0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xc0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xd0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xe0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xf0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
402
// Decode the NUL-terminated UTF-7 text in psz to wide characters.
//
// buf  destination; when NULL nothing is stored and the characters are only
//      counted
// n    checked at the top of the outer loop only when buf is not NULL
//
// Returns the number of wide characters produced. A terminating NUL is
// stored only if buf is not NULL and fewer than n characters were produced.
403size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
404{
405 size_t len = 0;
406
407 while (*psz && ((!buf) || (len < n)))
408 {
409 unsigned char cc = *psz++;
410 if (cc != '+')
411 {
412 // plain ASCII char
 // (every byte other than '+' is copied as is, without a range check)
413 if (buf)
414 *buf++ = cc;
415 len++;
416 }
417 else if (*psz == '-')
418 {
419 // encoded plus sign
 // ("+-" stands for a literal '+'; the '-' is skipped below)
420 if (buf)
421 *buf++ = cc;
422 len++;
423 psz++;
424 }
425 else
426 {
427 // BASE64 encoded string
 // d accumulates 6 bits per input character and l counts the bits in d
 // not extracted yet. lsb tells which half of the 16 bit unit the next
 // extracted byte is: the first byte is shifted into the high half and
 // the second one is or'ed into the low half, completing the character.
 // The run ends at the first byte whose utf7unb64 entry is 0xff.
 // NOTE(review): len is not compared with n inside this loop, so a run
 // appears able to store more than n characters -- confirm callers
 // always pass a large enough n.
428 bool lsb;
429 unsigned char c;
430 unsigned int d, l;
431 for (lsb = false, d = 0, l = 0;
432 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
433 {
434 d <<= 6;
435 d += cc;
436 for (l += 6; l >= 8; lsb = !lsb)
437 {
438 c = (unsigned char)((d >> (l -= 8)) % 256);
439 if (lsb)
440 {
441 if (buf)
442 *buf++ |= c;
443 len ++;
444 }
445 else
446 if (buf)
447 *buf = (wchar_t)(c << 8);
448 }
449 }
 // a '-' ending the run is consumed; any other terminator is left to be
 // handled by the next iteration of the outer loop
450 if (*psz == '-')
451 psz++;
452 }
453 }
454 if (buf && (len < n))
455 *buf = 0;
456 return len;
457}
458
459//
460// BASE64 encoding table
461//
// Indexed by a 6 bit value: the BASE64 character representing it.
static const unsigned char utf7enb64[] =
{
    //  0 .. 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 .. 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 .. 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62, 63
    '+', '/'
};
473
474//
475// UTF-7 encoding table
476//
477// 0 - Set D (directly encoded characters)
478// 1 - Set O (optional direct characters)
479// 2 - whitespace characters (optional)
480// 3 - special characters
481//
// Class of each of the 128 ASCII characters, using the values 0..3 explained
// in the legend preceding this table. One row per 8 characters.
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x08 */ 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x18 */ 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0,
    /* 0x28 */ 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 */ 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 */ 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 */ 0, 0, 0, 1, 1, 1, 3, 3
};
493
// Encode the NUL-terminated wide string psz as UTF-7.
//
// buf  destination; when NULL nothing is stored and the bytes are only
//      counted
// n    checked at the top of the outer loop only when buf is not NULL
//
// Returns the number of bytes produced, or (size_t)-1 when WC_UTF16 is not
// defined and a character above 0xffff is met at the top of the outer loop.
// A terminating NUL is stored only if buf is not NULL and fewer than n bytes
// were produced.
494size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
495{
496
497
498 size_t len = 0;
499
500 while (*psz && ((!buf) || (len < n)))
501 {
502 wchar_t cc = *psz++;
 // only characters below 0x80 whose utf7encode entry is 0 are copied as is
503 if (cc < 0x80 && utf7encode[cc] < 1)
504 {
505 // plain ASCII char
506 if (buf)
507 *buf++ = (char)cc;
508 len++;
509 }
510#ifndef WC_UTF16
511 else if (((wxUint32)cc) > 0xffff)
512 {
513 // no surrogate pair generation (yet?)
514 return (size_t)-1;
515 }
516#endif
517 else
518 {
 // everything else is written as '+' ... '-'; a '+' itself has an empty
 // body, i.e. becomes "+-"
519 if (buf)
520 *buf++ = '+';
521 len++;
522 if (cc != '+')
523 {
524 // BASE64 encode string
 // Each character contributes 16 bits to d, high byte first; l counts
 // the bits in d not written out yet and 6 of them are emitted at a
 // time. The run goes on with the following characters until a NUL or
 // a directly encodable character is seen; that character is not
 // consumed here.
 // NOTE(review): len is not compared with n inside this loop.
525 unsigned int lsb, d, l;
526 for (d = 0, l = 0;; psz++)
527 {
528 for (lsb = 0; lsb < 2; lsb ++)
529 {
530 d <<= 8;
531 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
532
533 for (l += 8; l >= 6; )
534 {
535 l -= 6;
536 if (buf)
537 *buf++ = utf7enb64[(d >> l) % 64];
538 len++;
539 }
540 }
541 cc = *psz;
542 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
543 break;
544 }
 // bits left over are shifted up to the top of a last 6 bit group
545 if (l != 0)
546 {
547 if (buf)
548 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
549 len++;
550 }
551 }
552 if (buf)
553 *buf++ = '-';
554 len++;
555 }
556 }
557 if (buf && (len < n))
558 *buf = 0;
559 return len;
560}
561
562// ----------------------------------------------------------------------------
563// UTF-8
564// ----------------------------------------------------------------------------
565
566static wxUint32 utf8_max[]=
567 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
568
569// boundaries of the private use area we use to (temporarily) remap invalid
570// characters invalid in a UTF-8 encoded string
571const wxUint32 wxUnicodePUA = 0x100000;
572const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
573
// Decode the NUL-terminated UTF-8 text in psz to wide characters.
//
// buf  destination; when NULL nothing is stored and the characters are only
//      counted
// n    checked at the top of the outer loop only when buf is not NULL
//
// Invalid input is handled according to m_options: each offending byte is
// mapped to wxUnicodePUA + byte (MAP_INVALID_UTF8_TO_PUA), or replaced by a
// backslash followed by three octal digits (MAP_INVALID_UTF8_TO_OCTAL);
// otherwise (size_t)-1 is returned.
//
// Returns the number of wide characters produced. A terminating NUL is
// stored only if buf is not NULL and fewer than n characters were produced.
574size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
575{
576 size_t len = 0;
577
578 while (*psz && ((!buf) || (len < n)))
579 {
 // opsz remembers where this sequence starts so that its bytes can be
 // replayed one by one if it turns out to be invalid
580 const char *opsz = psz;
581 bool invalid = false;
582 unsigned char cc = *psz++, fc = cc;
583 unsigned cnt;
 // count the leading 1 bits of the first byte
584 for (cnt = 0; fc & 0x80; cnt++)
585 fc <<= 1;
586 if (!cnt)
587 {
588 // plain ASCII char
589 if (buf)
590 *buf++ = cc;
591 len++;
592
593 // escape the escape character for octal escapes
594 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
595 && cc == '\\' && (!buf || len < n))
596 {
597 if (buf)
598 *buf++ = cc;
599 len++;
600 }
601 }
602 else
603 {
 // cnt becomes the number of bytes expected after the first one; zero
 // means the first byte had exactly one leading 1 bit
604 cnt--;
605 if (!cnt)
606 {
607 // invalid UTF-8 sequence
608 invalid = true;
609 }
610 else
611 {
612 unsigned ocnt = cnt - 1;
613 wxUint32 res = cc & (0x3f >> cnt);
614 while (cnt--)
615 {
616 cc = *psz;
 // each following byte must have the form 10xxxxxx
617 if ((cc & 0xC0) != 0x80)
618 {
619 // invalid UTF-8 sequence
620 invalid = true;
621 break;
622 }
623 psz++;
624 res = (res << 6) | (cc & 0x3f);
625 }
 // a value not above utf8_max[ocnt] is rejected as well
626 if (invalid || res <= utf8_max[ocnt])
627 {
628 // illegal UTF-8 encoding
629 invalid = true;
630 }
631 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
632 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
633 {
634 // if one of our PUA characters turns up externally
635 // it must also be treated as an illegal sequence
636 // (a bit like you have to escape an escape character)
637 invalid = true;
638 }
639 else
640 {
641#ifdef WC_UTF16
642 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 // NOTE(review): when buf is not NULL encode_utf16() may store two units
 // here although only len < n was checked -- confirm callers always
 // leave room for a pair.
643 size_t pa = encode_utf16(res, (wxUint16 *)buf);
644 if (pa == (size_t)-1)
645 {
646 invalid = true;
647 }
648 else
649 {
650 if (buf)
651 buf += pa;
652 len += pa;
653 }
654#else // !WC_UTF16
655 if (buf)
656 *buf++ = (wchar_t)res;
657 len++;
658#endif // WC_UTF16/!WC_UTF16
659 }
660 }
661 if (invalid)
662 {
 // the bytes from opsz up to (not including) psz are replayed one by one
663 if (m_options & MAP_INVALID_UTF8_TO_PUA)
664 {
665 while (opsz < psz && (!buf || len < n))
666 {
667#ifdef WC_UTF16
668 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
669 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
670 wxASSERT(pa != (size_t)-1);
671 if (buf)
672 buf += pa;
673 opsz++;
674 len += pa;
675#else
676 if (buf)
677 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
678 opsz++;
679 len++;
680#endif
681 }
682 }
683 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
684 {
685 while (opsz < psz && (!buf || len < n))
686 {
 // NOTE(review): when buf is not NULL but fewer than 4 slots remain the
 // escape is not stored, yet len still grows by 4 -- confirm this is
 // the intended truncation behaviour.
687 if ( buf && len + 3 < n )
688 {
689 unsigned char on = *opsz;
690 *buf++ = L'\\';
691 *buf++ = (wchar_t)( L'0' + on / 0100 );
692 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
693 *buf++ = (wchar_t)( L'0' + on % 010 );
694 }
695 opsz++;
696 len += 4;
697 }
698 }
699 else // MAP_INVALID_UTF8_NOT
700 {
701 return (size_t)-1;
702 }
703 }
704 }
705 }
706 if (buf && (len < n))
707 *buf = 0;
708 return len;
709}
710
// Returns true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
715
// Encode the NUL-terminated wide string psz as UTF-8.
//
// buf  destination; when NULL nothing is stored and the bytes are only
//      counted
// n    checked at the top of the loop only when buf is not NULL
//
// Depending on m_options the mappings for invalid bytes are undone first:
// characters in [wxUnicodePUA, wxUnicodePUAEnd) become a single byte again
// (MAP_INVALID_UTF8_TO_PUA), a doubled backslash becomes one backslash and a
// backslash followed by three octal digits becomes the byte with that value
// (MAP_INVALID_UTF8_TO_OCTAL).
//
// Returns the number of bytes produced. A terminating NUL is stored only if
// buf is not NULL and fewer than n bytes were produced.
716size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
717{
718 size_t len = 0;
719
720 while (*psz && ((!buf) || (len < n)))
721 {
722 wxUint32 cc;
723#ifdef WC_UTF16
724 // cast is ok for WC_UTF16
 // when decode_utf16() fails only one unit is skipped and conversion goes
 // on with whatever it stored in cc
725 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
726 psz += (pa == (size_t)-1) ? 1 : pa;
727#else
728 cc=(*psz++) & 0x7fffffff;
729#endif
730
731 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
732 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
733 {
734 if (buf)
735 *buf++ = (char)(cc - wxUnicodePUA);
736 len++;
737 }
738 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
739 && cc == L'\\' && psz[0] == L'\\' )
740 {
741 if (buf)
742 *buf++ = (char)cc;
743 psz++;
744 len++;
745 }
746 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
747 cc == L'\\' &&
748 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
749 {
750 if (buf)
751 {
752 *buf++ = (char) ((psz[0] - L'0')*0100 +
753 (psz[1] - L'0')*010 +
754 (psz[2] - L'0'));
755 }
756
757 psz += 3;
758 len++;
759 }
760 else
761 {
 // cnt is the index of the first utf8_max entry not smaller than cc; it
 // is used below as the number of bytes following the first one
762 unsigned cnt;
763 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
764 if (!cnt)
765 {
766 // plain ASCII char
767 if (buf)
768 *buf++ = (char) cc;
769 len++;
770 }
771
772 else
773 {
 // NOTE(review): cnt + 1 bytes are stored here although only len < n was
 // checked at the top of the loop -- confirm callers always leave room
 // for a whole sequence.
774 len += cnt + 1;
775 if (buf)
776 {
 // first byte: the marker bits come from shifting -128 right by cnt and
 // are combined with the top bits of cc; every following byte is
 // 0x80 plus the next 6 bits
777 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
778 while (cnt--)
779 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
780 }
781 }
782 }
783 }
784
785 if (buf && (len<n))
786 *buf = 0;
787
788 return len;
789}
790
791// ----------------------------------------------------------------------------
792// UTF-16
793// ----------------------------------------------------------------------------
794
// "straight" names the UTF-16 flavour whose byte order is the one of this
// machine, "swap" the other one
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap wxMBConvUTF16BE
#else // big endian
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap wxMBConvUTF16LE
#endif
802
803
804#ifdef WC_UTF16
805
806// copy 16bit MB to 16bit String
807size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
808{
809 size_t len=0;
810
811 while (*(wxUint16*)psz && (!buf || len < n))
812 {
813 if (buf)
814 *buf++ = *(wxUint16*)psz;
815 len++;
816
817 psz += sizeof(wxUint16);
818 }
819 if (buf && len<n) *buf=0;
820
821 return len;
822}
823
824
825// copy 16bit String to 16bit MB
826size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
827{
828 size_t len=0;
829
830 while (*psz && (!buf || len < n))
831 {
832 if (buf)
833 {
834 *(wxUint16*)buf = *psz;
835 buf += sizeof(wxUint16);
836 }
837 len += sizeof(wxUint16);
838 psz++;
839 }
840 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
841
842 return len;
843}
844
845
846// swap 16bit MB to 16bit String
847size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
848{
849 size_t len=0;
850
851 while (*(wxUint16*)psz && (!buf || len < n))
852 {
853 if (buf)
854 {
855 ((char *)buf)[0] = psz[1];
856 ((char *)buf)[1] = psz[0];
857 buf++;
858 }
859 len++;
860 psz += sizeof(wxUint16);
861 }
862 if (buf && len<n) *buf=0;
863
864 return len;
865}
866
867
868// swap 16bit MB to 16bit String
869size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
870{
871 size_t len=0;
872
873 while (*psz && (!buf || len < n))
874 {
875 if (buf)
876 {
877 *buf++ = ((char*)psz)[1];
878 *buf++ = ((char*)psz)[0];
879 }
880 len += sizeof(wxUint16);
881 psz++;
882 }
883 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
884
885 return len;
886}
887
888
889#else // WC_UTF16
890
891
892// copy 16bit MB to 32bit String
893size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
894{
895 size_t len=0;
896
897 while (*(wxUint16*)psz && (!buf || len < n))
898 {
899 wxUint32 cc;
900 size_t pa=decode_utf16((wxUint16*)psz, cc);
901 if (pa == (size_t)-1)
902 return pa;
903
904 if (buf)
905 *buf++ = (wchar_t)cc;
906 len++;
907 psz += pa * sizeof(wxUint16);
908 }
909 if (buf && len<n) *buf=0;
910
911 return len;
912}
913
914
915// copy 32bit String to 16bit MB
916size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
917{
918 size_t len=0;
919
920 while (*psz && (!buf || len < n))
921 {
922 wxUint16 cc[2];
923 size_t pa=encode_utf16(*psz, cc);
924
925 if (pa == (size_t)-1)
926 return pa;
927
928 if (buf)
929 {
930 *(wxUint16*)buf = cc[0];
931 buf += sizeof(wxUint16);
932 if (pa > 1)
933 {
934 *(wxUint16*)buf = cc[1];
935 buf += sizeof(wxUint16);
936 }
937 }
938
939 len += pa*sizeof(wxUint16);
940 psz++;
941 }
942 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
943
944 return len;
945}
946
947
948// swap 16bit MB to 32bit String
949size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
950{
951 size_t len=0;
952
953 while (*(wxUint16*)psz && (!buf || len < n))
954 {
955 wxUint32 cc;
956 char tmp[4];
957 tmp[0]=psz[1]; tmp[1]=psz[0];
958 tmp[2]=psz[3]; tmp[3]=psz[2];
959
960 size_t pa=decode_utf16((wxUint16*)tmp, cc);
961 if (pa == (size_t)-1)
962 return pa;
963
964 if (buf)
965 *buf++ = (wchar_t)cc;
966
967 len++;
968 psz += pa * sizeof(wxUint16);
969 }
970 if (buf && len<n) *buf=0;
971
972 return len;
973}
974
975
976// swap 32bit String to 16bit MB
977size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
978{
979 size_t len=0;
980
981 while (*psz && (!buf || len < n))
982 {
983 wxUint16 cc[2];
984 size_t pa=encode_utf16(*psz, cc);
985
986 if (pa == (size_t)-1)
987 return pa;
988
989 if (buf)
990 {
991 *buf++ = ((char*)cc)[1];
992 *buf++ = ((char*)cc)[0];
993 if (pa > 1)
994 {
995 *buf++ = ((char*)cc)[3];
996 *buf++ = ((char*)cc)[2];
997 }
998 }
999
1000 len += pa*sizeof(wxUint16);
1001 psz++;
1002 }
1003 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1004
1005 return len;
1006}
1007
1008#endif // WC_UTF16
1009
1010
1011// ----------------------------------------------------------------------------
1012// UTF-32
1013// ----------------------------------------------------------------------------
1014
1015#ifdef WORDS_BIGENDIAN
1016#define wxMBConvUTF32straight wxMBConvUTF32BE
1017#define wxMBConvUTF32swap wxMBConvUTF32LE
1018#else
1019#define wxMBConvUTF32swap wxMBConvUTF32BE
1020#define wxMBConvUTF32straight wxMBConvUTF32LE
1021#endif
1022
1023
1024WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1025WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1026
1027
1028#ifdef WC_UTF16
1029
1030// copy 32bit MB to 16bit String
1031size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1032{
1033 size_t len=0;
1034
1035 while (*(wxUint32*)psz && (!buf || len < n))
1036 {
1037 wxUint16 cc[2];
1038
1039 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1040 if (pa == (size_t)-1)
1041 return pa;
1042
1043 if (buf)
1044 {
1045 *buf++ = cc[0];
1046 if (pa > 1)
1047 *buf++ = cc[1];
1048 }
1049 len += pa;
1050 psz += sizeof(wxUint32);
1051 }
1052 if (buf && len<n) *buf=0;
1053
1054 return len;
1055}
1056
1057
1058// copy 16bit String to 32bit MB
1059size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1060{
1061 size_t len=0;
1062
1063 while (*psz && (!buf || len < n))
1064 {
1065 wxUint32 cc;
1066
1067 // cast is ok for WC_UTF16
1068 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1069 if (pa == (size_t)-1)
1070 return pa;
1071
1072 if (buf)
1073 {
1074 *(wxUint32*)buf = cc;
1075 buf += sizeof(wxUint32);
1076 }
1077 len += sizeof(wxUint32);
1078 psz += pa;
1079 }
1080
1081 if (buf && len<=n-sizeof(wxUint32))
1082 *(wxUint32*)buf=0;
1083
1084 return len;
1085}
1086
1087
1088
1089// swap 32bit MB to 16bit String
1090size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1091{
1092 size_t len=0;
1093
1094 while (*(wxUint32*)psz && (!buf || len < n))
1095 {
1096 char tmp[4];
1097 tmp[0] = psz[3]; tmp[1] = psz[2];
1098 tmp[2] = psz[1]; tmp[3] = psz[0];
1099
1100
1101 wxUint16 cc[2];
1102
1103 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1104 if (pa == (size_t)-1)
1105 return pa;
1106
1107 if (buf)
1108 {
1109 *buf++ = cc[0];
1110 if (pa > 1)
1111 *buf++ = cc[1];
1112 }
1113 len += pa;
1114 psz += sizeof(wxUint32);
1115 }
1116
1117 if (buf && len<n)
1118 *buf=0;
1119
1120 return len;
1121}
1122
1123
1124// swap 16bit String to 32bit MB
1125size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1126{
1127 size_t len=0;
1128
1129 while (*psz && (!buf || len < n))
1130 {
1131 char cc[4];
1132
1133 // cast is ok for WC_UTF16
1134 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1135 if (pa == (size_t)-1)
1136 return pa;
1137
1138 if (buf)
1139 {
1140 *buf++ = cc[3];
1141 *buf++ = cc[2];
1142 *buf++ = cc[1];
1143 *buf++ = cc[0];
1144 }
1145 len += sizeof(wxUint32);
1146 psz += pa;
1147 }
1148
1149 if (buf && len<=n-sizeof(wxUint32))
1150 *(wxUint32*)buf=0;
1151
1152 return len;
1153}
1154
1155#else // WC_UTF16
1156
1157
1158// copy 32bit MB to 32bit String
1159size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1160{
1161 size_t len=0;
1162
1163 while (*(wxUint32*)psz && (!buf || len < n))
1164 {
1165 if (buf)
1166 *buf++ = (wchar_t)(*(wxUint32*)psz);
1167 len++;
1168 psz += sizeof(wxUint32);
1169 }
1170
1171 if (buf && len<n)
1172 *buf=0;
1173
1174 return len;
1175}
1176
1177
1178// copy 32bit String to 32bit MB
1179size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1180{
1181 size_t len=0;
1182
1183 while (*psz && (!buf || len < n))
1184 {
1185 if (buf)
1186 {
1187 *(wxUint32*)buf = *psz;
1188 buf += sizeof(wxUint32);
1189 }
1190
1191 len += sizeof(wxUint32);
1192 psz++;
1193 }
1194
1195 if (buf && len<=n-sizeof(wxUint32))
1196 *(wxUint32*)buf=0;
1197
1198 return len;
1199}
1200
1201
1202// swap 32bit MB to 32bit String
1203size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1204{
1205 size_t len=0;
1206
1207 while (*(wxUint32*)psz && (!buf || len < n))
1208 {
1209 if (buf)
1210 {
1211 ((char *)buf)[0] = psz[3];
1212 ((char *)buf)[1] = psz[2];
1213 ((char *)buf)[2] = psz[1];
1214 ((char *)buf)[3] = psz[0];
1215 buf++;
1216 }
1217 len++;
1218 psz += sizeof(wxUint32);
1219 }
1220
1221 if (buf && len<n)
1222 *buf=0;
1223
1224 return len;
1225}
1226
1227
1228// swap 32bit String to 32bit MB
1229size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1230{
1231 size_t len=0;
1232
1233 while (*psz && (!buf || len < n))
1234 {
1235 if (buf)
1236 {
1237 *buf++ = ((char *)psz)[3];
1238 *buf++ = ((char *)psz)[2];
1239 *buf++ = ((char *)psz)[1];
1240 *buf++ = ((char *)psz)[0];
1241 }
1242 len += sizeof(wxUint32);
1243 psz++;
1244 }
1245
1246 if (buf && len<=n-sizeof(wxUint32))
1247 *(wxUint32*)buf=0;
1248
1249 return len;
1250}
1251
1252
1253#endif // WC_UTF16
1254
1255
1256// ============================================================================
1257// The classes doing conversion using the iconv_xxx() functions
1258// ============================================================================
1259
1260#ifdef HAVE_ICONV
1261
1262// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1263// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1264// (unless there's yet another bug in glibc) the only case when iconv()
1265// returns with (size_t)-1 (which means error) and says there are 0 bytes
1266// left in the input buffer -- when _real_ error occurs,
1267// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1268// iconv() failure.
1269// [This bug does not appear in glibc 2.2.]
1270#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1271#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1272 (errno != E2BIG || bufLeft != 0))
1273#else
1274#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1275#endif
1276
1277#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1278
1279#define ICONV_T_INVALID ((iconv_t)-1)
1280
1281#if SIZEOF_WCHAR_T == 4
1282 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1283 #define WC_ENC wxFONTENCODING_UTF32
1284#elif SIZEOF_WCHAR_T == 2
1285 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1286 #define WC_ENC wxFONTENCODING_UTF16
1287#else // sizeof(wchar_t) != 2 nor 4
1288 // does this ever happen?
1289 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1290#endif
1291
1292// ----------------------------------------------------------------------------
1293// wxMBConv_iconv: encapsulates an iconv character set
1294// ----------------------------------------------------------------------------
1295
1296class wxMBConv_iconv : public wxMBConv
1297{
1298public:
1299 wxMBConv_iconv(const wxChar *name);
1300 virtual ~wxMBConv_iconv();
1301
1302 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1303 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1304
1305 bool IsOk() const
1306 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1307
1308protected:
1309 // the iconv handlers used to translate from multibyte to wide char and in
1310 // the other direction
1311 iconv_t m2w,
1312 w2m;
1313#if wxUSE_THREADS
1314 // guards access to m2w and w2m objects
1315 wxMutex m_iconvMutex;
1316#endif
1317
1318private:
1319 // the name (for iconv_open()) of a wide char charset -- if none is
1320 // available on this machine, it will remain NULL
1321 static wxString ms_wcCharsetName;
1322
1323 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1324 // different endian-ness than the native one
1325 static bool ms_wcNeedsSwap;
1326};
1327
1328// make the constructor available for unit testing
1329WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1330{
1331 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1332 if ( !result->IsOk() )
1333 {
1334 delete result;
1335 return 0;
1336 }
1337 return result;
1338}
1339
1340wxString wxMBConv_iconv::ms_wcCharsetName;
1341bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1342
1343wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1344{
1345 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1346 // names for the charsets
1347 const wxCharBuffer cname(wxString(name).ToAscii());
1348
1349 // check for charset that represents wchar_t:
1350 if ( ms_wcCharsetName.empty() )
1351 {
1352 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1353
1354#if wxUSE_FONTMAP
1355 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1356#else // !wxUSE_FONTMAP
1357 static const wxChar *names[] =
1358 {
1359#if SIZEOF_WCHAR_T == 4
1360 _T("UCS-4"),
1361#elif SIZEOF_WCHAR_T = 2
1362 _T("UCS-2"),
1363#endif
1364 NULL
1365 };
1366#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1367
1368 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1369 {
1370 const wxString nameCS(*names);
1371
1372 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1373 wxString nameXE(nameCS);
1374 #ifdef WORDS_BIGENDIAN
1375 nameXE += _T("BE");
1376 #else // little endian
1377 nameXE += _T("LE");
1378 #endif
1379
1380 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1381 nameXE.c_str());
1382
1383 m2w = iconv_open(nameXE.ToAscii(), cname);
1384 if ( m2w == ICONV_T_INVALID )
1385 {
1386 // try charset w/o bytesex info (e.g. "UCS4")
1387 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1388 nameCS.c_str());
1389 m2w = iconv_open(nameCS.ToAscii(), cname);
1390
1391 // and check for bytesex ourselves:
1392 if ( m2w != ICONV_T_INVALID )
1393 {
1394 char buf[2], *bufPtr;
1395 wchar_t wbuf[2], *wbufPtr;
1396 size_t insz, outsz;
1397 size_t res;
1398
1399 buf[0] = 'A';
1400 buf[1] = 0;
1401 wbuf[0] = 0;
1402 insz = 2;
1403 outsz = SIZEOF_WCHAR_T * 2;
1404 wbufPtr = wbuf;
1405 bufPtr = buf;
1406
1407 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1408 (char**)&wbufPtr, &outsz);
1409
1410 if (ICONV_FAILED(res, insz))
1411 {
1412 wxLogLastError(wxT("iconv"));
1413 wxLogError(_("Conversion to charset '%s' doesn't work."),
1414 nameCS.c_str());
1415 }
1416 else // ok, can convert to this encoding, remember it
1417 {
1418 ms_wcCharsetName = nameCS;
1419 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1420 }
1421 }
1422 }
1423 else // use charset not requiring byte swapping
1424 {
1425 ms_wcCharsetName = nameXE;
1426 }
1427 }
1428
1429 wxLogTrace(TRACE_STRCONV,
1430 wxT("iconv wchar_t charset is \"%s\"%s"),
1431 ms_wcCharsetName.empty() ? _T("<none>")
1432 : ms_wcCharsetName.c_str(),
1433 ms_wcNeedsSwap ? _T(" (needs swap)")
1434 : _T(""));
1435 }
1436 else // we already have ms_wcCharsetName
1437 {
1438 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1439 }
1440
1441 if ( ms_wcCharsetName.empty() )
1442 {
1443 w2m = ICONV_T_INVALID;
1444 }
1445 else
1446 {
1447 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1448 if ( w2m == ICONV_T_INVALID )
1449 {
1450 wxLogTrace(TRACE_STRCONV,
1451 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1452 ms_wcCharsetName.c_str(), cname.data());
1453 }
1454 }
1455}
1456
1457wxMBConv_iconv::~wxMBConv_iconv()
1458{
1459 if ( m2w != ICONV_T_INVALID )
1460 iconv_close(m2w);
1461 if ( w2m != ICONV_T_INVALID )
1462 iconv_close(w2m);
1463}
1464
1465size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1466{
1467#if wxUSE_THREADS
1468 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1469 // Unfortunately there is a couple of global wxCSConv objects such as
1470 // wxConvLocal that are used all over wx code, so we have to make sure
1471 // the handle is used by at most one thread at the time. Otherwise
1472 // only a few wx classes would be safe to use from non-main threads
1473 // as MB<->WC conversion would fail "randomly".
1474 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1475#endif
1476
1477 size_t inbuf = strlen(psz);
1478 size_t outbuf = n * SIZEOF_WCHAR_T;
1479 size_t res, cres;
1480 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1481 wchar_t *bufPtr = buf;
1482 const char *pszPtr = psz;
1483
1484 if (buf)
1485 {
1486 // have destination buffer, convert there
1487 cres = iconv(m2w,
1488 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1489 (char**)&bufPtr, &outbuf);
1490 res = n - (outbuf / SIZEOF_WCHAR_T);
1491
1492 if (ms_wcNeedsSwap)
1493 {
1494 // convert to native endianness
1495 for ( unsigned i = 0; i < res; i++ )
1496 buf[n] = WC_BSWAP(buf[i]);
1497 }
1498
1499 // NB: iconv was given only strlen(psz) characters on input, and so
1500 // it couldn't convert the trailing zero. Let's do it ourselves
1501 // if there's some room left for it in the output buffer.
1502 if (res < n)
1503 buf[res] = 0;
1504 }
1505 else
1506 {
1507 // no destination buffer... convert using temp buffer
1508 // to calculate destination buffer requirement
1509 wchar_t tbuf[8];
1510 res = 0;
1511 do {
1512 bufPtr = tbuf;
1513 outbuf = 8*SIZEOF_WCHAR_T;
1514
1515 cres = iconv(m2w,
1516 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1517 (char**)&bufPtr, &outbuf );
1518
1519 res += 8-(outbuf/SIZEOF_WCHAR_T);
1520 } while ((cres==(size_t)-1) && (errno==E2BIG));
1521 }
1522
1523 if (ICONV_FAILED(cres, inbuf))
1524 {
1525 //VS: it is ok if iconv fails, hence trace only
1526 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1527 return (size_t)-1;
1528 }
1529
1530 return res;
1531}
1532
1533size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1534{
1535#if wxUSE_THREADS
1536 // NB: explained in MB2WC
1537 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1538#endif
1539
1540 size_t inlen = wxWcslen(psz);
1541 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1542 size_t outbuf = n;
1543 size_t res, cres;
1544
1545 wchar_t *tmpbuf = 0;
1546
1547 if (ms_wcNeedsSwap)
1548 {
1549 // need to copy to temp buffer to switch endianness
1550 // (doing WC_BSWAP twice on the original buffer won't help, as it
1551 // could be in read-only memory, or be accessed in some other thread)
1552 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1553 for ( size_t i = 0; i < inlen; i++ )
1554 tmpbuf[n] = WC_BSWAP(psz[i]);
1555 tmpbuf[inlen] = L'\0';
1556 psz = tmpbuf;
1557 }
1558
1559 if (buf)
1560 {
1561 // have destination buffer, convert there
1562 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1563
1564 res = n-outbuf;
1565
1566 // NB: iconv was given only wcslen(psz) characters on input, and so
1567 // it couldn't convert the trailing zero. Let's do it ourselves
1568 // if there's some room left for it in the output buffer.
1569 if (res < n)
1570 buf[0] = 0;
1571 }
1572 else
1573 {
1574 // no destination buffer... convert using temp buffer
1575 // to calculate destination buffer requirement
1576 char tbuf[16];
1577 res = 0;
1578 do {
1579 buf = tbuf; outbuf = 16;
1580
1581 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1582
1583 res += 16 - outbuf;
1584 } while ((cres==(size_t)-1) && (errno==E2BIG));
1585 }
1586
1587 if (ms_wcNeedsSwap)
1588 {
1589 free(tmpbuf);
1590 }
1591
1592 if (ICONV_FAILED(cres, inbuf))
1593 {
1594 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1595 return (size_t)-1;
1596 }
1597
1598 return res;
1599}
1600
1601#endif // HAVE_ICONV
1602
1603
1604// ============================================================================
1605// Win32 conversion classes
1606// ============================================================================
1607
1608#ifdef wxHAVE_WIN32_MB2WC
1609
1610// from utils.cpp
1611#if wxUSE_FONTMAP
1612extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1613extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1614#endif
1615
1616class wxMBConv_win32 : public wxMBConv
1617{
1618public:
1619 wxMBConv_win32()
1620 {
1621 m_CodePage = CP_ACP;
1622 }
1623
1624#if wxUSE_FONTMAP
1625 wxMBConv_win32(const wxChar* name)
1626 {
1627 m_CodePage = wxCharsetToCodepage(name);
1628 }
1629
1630 wxMBConv_win32(wxFontEncoding encoding)
1631 {
1632 m_CodePage = wxEncodingToCodepage(encoding);
1633 }
1634#endif
1635
1636 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1637 {
1638 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1639 // the behaviour is not compatible with the Unix version (using iconv)
1640 // and break the library itself, e.g. wxTextInputStream::NextChar()
1641 // wouldn't work if reading an incomplete MB char didn't result in an
1642 // error
1643 //
1644 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1645 // an error (tested under Windows Server 2003) and apparently it is
1646 // done on purpose, i.e. the function accepts any input in this case
1647 // and although I'd prefer to return error on ill-formed output, our
1648 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1649 // explicitly ill-formed according to RFC 2152) neither so we don't
1650 // even have any fallback here...
1651 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1652
1653 const size_t len = ::MultiByteToWideChar
1654 (
1655 m_CodePage, // code page
1656 flags, // flags: fall on error
1657 psz, // input string
1658 -1, // its length (NUL-terminated)
1659 buf, // output string
1660 buf ? n : 0 // size of output buffer
1661 );
1662
1663 // note that it returns count of written chars for buf != NULL and size
1664 // of the needed buffer for buf == NULL so in either case the length of
1665 // the string (which never includes the terminating NUL) is one less
1666 return len ? len - 1 : (size_t)-1;
1667 }
1668
1669 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1670 {
1671 /*
1672 we have a problem here: by default, WideCharToMultiByte() may
1673 replace characters unrepresentable in the target code page with bad
1674 quality approximations such as turning "1/2" symbol (U+00BD) into
1675 "1" for the code pages which don't have it and we, obviously, want
1676 to avoid this at any price
1677
1678 the trouble is that this function does it _silently_, i.e. it won't
1679 even tell us whether it did or not... Win98/2000 and higher provide
1680 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1681 we have to resort to a round trip, i.e. check that converting back
1682 results in the same string -- this is, of course, expensive but
1683 otherwise we simply can't be sure to not garble the data.
1684 */
1685
1686 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1687 // it doesn't work with CJK encodings (which we test for rather roughly
1688 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1689 // supporting it
1690 BOOL usedDef wxDUMMY_INITIALIZE(false);
1691 BOOL *pUsedDef;
1692 int flags;
1693 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1694 {
1695 // it's our lucky day
1696 flags = WC_NO_BEST_FIT_CHARS;
1697 pUsedDef = &usedDef;
1698 }
1699 else // old system or unsupported encoding
1700 {
1701 flags = 0;
1702 pUsedDef = NULL;
1703 }
1704
1705 const size_t len = ::WideCharToMultiByte
1706 (
1707 m_CodePage, // code page
1708 flags, // either none or no best fit
1709 pwz, // input string
1710 -1, // it is (wide) NUL-terminated
1711 buf, // output buffer
1712 buf ? n : 0, // and its size
1713 NULL, // default "replacement" char
1714 pUsedDef // [out] was it used?
1715 );
1716
1717 if ( !len )
1718 {
1719 // function totally failed
1720 return (size_t)-1;
1721 }
1722
1723 // if we were really converting, check if we succeeded
1724 if ( buf )
1725 {
1726 if ( flags )
1727 {
1728 // check if the conversion failed, i.e. if any replacements
1729 // were done
1730 if ( usedDef )
1731 return (size_t)-1;
1732 }
1733 else // we must resort to double tripping...
1734 {
1735 wxWCharBuffer wcBuf(n);
1736 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1737 wcscmp(wcBuf, pwz) != 0 )
1738 {
1739 // we didn't obtain the same thing we started from, hence
1740 // the conversion was lossy and we consider that it failed
1741 return (size_t)-1;
1742 }
1743 }
1744 }
1745
1746 // see the comment above for the reason of "len - 1"
1747 return len - 1;
1748 }
1749
1750 bool IsOk() const { return m_CodePage != -1; }
1751
1752private:
1753 static bool CanUseNoBestFit()
1754 {
1755 static int s_isWin98Or2k = -1;
1756
1757 if ( s_isWin98Or2k == -1 )
1758 {
1759 int verMaj, verMin;
1760 switch ( wxGetOsVersion(&verMaj, &verMin) )
1761 {
1762 case wxWIN95:
1763 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1764 break;
1765
1766 case wxWINDOWS_NT:
1767 s_isWin98Or2k = verMaj >= 5;
1768 break;
1769
1770 default:
1771 // unknown, be conseravtive by default
1772 s_isWin98Or2k = 0;
1773 }
1774
1775 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1776 }
1777
1778 return s_isWin98Or2k == 1;
1779 }
1780
1781 long m_CodePage;
1782};
1783
1784#endif // wxHAVE_WIN32_MB2WC
1785
1786// ============================================================================
1787// Cocoa conversion classes
1788// ============================================================================
1789
1790#if defined(__WXCOCOA__)
1791
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1795
1796#include <CoreFoundation/CFString.h>
1797#include <CoreFoundation/CFStringEncodingExt.h>
1798
1799CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1800{
1801 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1802 if ( encoding == wxFONTENCODING_DEFAULT )
1803 {
1804 enc = CFStringGetSystemEncoding();
1805 }
1806 else switch( encoding)
1807 {
1808 case wxFONTENCODING_ISO8859_1 :
1809 enc = kCFStringEncodingISOLatin1 ;
1810 break ;
1811 case wxFONTENCODING_ISO8859_2 :
1812 enc = kCFStringEncodingISOLatin2;
1813 break ;
1814 case wxFONTENCODING_ISO8859_3 :
1815 enc = kCFStringEncodingISOLatin3 ;
1816 break ;
1817 case wxFONTENCODING_ISO8859_4 :
1818 enc = kCFStringEncodingISOLatin4;
1819 break ;
1820 case wxFONTENCODING_ISO8859_5 :
1821 enc = kCFStringEncodingISOLatinCyrillic;
1822 break ;
1823 case wxFONTENCODING_ISO8859_6 :
1824 enc = kCFStringEncodingISOLatinArabic;
1825 break ;
1826 case wxFONTENCODING_ISO8859_7 :
1827 enc = kCFStringEncodingISOLatinGreek;
1828 break ;
1829 case wxFONTENCODING_ISO8859_8 :
1830 enc = kCFStringEncodingISOLatinHebrew;
1831 break ;
1832 case wxFONTENCODING_ISO8859_9 :
1833 enc = kCFStringEncodingISOLatin5;
1834 break ;
1835 case wxFONTENCODING_ISO8859_10 :
1836 enc = kCFStringEncodingISOLatin6;
1837 break ;
1838 case wxFONTENCODING_ISO8859_11 :
1839 enc = kCFStringEncodingISOLatinThai;
1840 break ;
1841 case wxFONTENCODING_ISO8859_13 :
1842 enc = kCFStringEncodingISOLatin7;
1843 break ;
1844 case wxFONTENCODING_ISO8859_14 :
1845 enc = kCFStringEncodingISOLatin8;
1846 break ;
1847 case wxFONTENCODING_ISO8859_15 :
1848 enc = kCFStringEncodingISOLatin9;
1849 break ;
1850
1851 case wxFONTENCODING_KOI8 :
1852 enc = kCFStringEncodingKOI8_R;
1853 break ;
1854 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1855 enc = kCFStringEncodingDOSRussian;
1856 break ;
1857
1858// case wxFONTENCODING_BULGARIAN :
1859// enc = ;
1860// break ;
1861
1862 case wxFONTENCODING_CP437 :
1863 enc =kCFStringEncodingDOSLatinUS ;
1864 break ;
1865 case wxFONTENCODING_CP850 :
1866 enc = kCFStringEncodingDOSLatin1;
1867 break ;
1868 case wxFONTENCODING_CP852 :
1869 enc = kCFStringEncodingDOSLatin2;
1870 break ;
1871 case wxFONTENCODING_CP855 :
1872 enc = kCFStringEncodingDOSCyrillic;
1873 break ;
1874 case wxFONTENCODING_CP866 :
1875 enc =kCFStringEncodingDOSRussian ;
1876 break ;
1877 case wxFONTENCODING_CP874 :
1878 enc = kCFStringEncodingDOSThai;
1879 break ;
1880 case wxFONTENCODING_CP932 :
1881 enc = kCFStringEncodingDOSJapanese;
1882 break ;
1883 case wxFONTENCODING_CP936 :
1884 enc =kCFStringEncodingDOSChineseSimplif ;
1885 break ;
1886 case wxFONTENCODING_CP949 :
1887 enc = kCFStringEncodingDOSKorean;
1888 break ;
1889 case wxFONTENCODING_CP950 :
1890 enc = kCFStringEncodingDOSChineseTrad;
1891 break ;
1892 case wxFONTENCODING_CP1250 :
1893 enc = kCFStringEncodingWindowsLatin2;
1894 break ;
1895 case wxFONTENCODING_CP1251 :
1896 enc =kCFStringEncodingWindowsCyrillic ;
1897 break ;
1898 case wxFONTENCODING_CP1252 :
1899 enc =kCFStringEncodingWindowsLatin1 ;
1900 break ;
1901 case wxFONTENCODING_CP1253 :
1902 enc = kCFStringEncodingWindowsGreek;
1903 break ;
1904 case wxFONTENCODING_CP1254 :
1905 enc = kCFStringEncodingWindowsLatin5;
1906 break ;
1907 case wxFONTENCODING_CP1255 :
1908 enc =kCFStringEncodingWindowsHebrew ;
1909 break ;
1910 case wxFONTENCODING_CP1256 :
1911 enc =kCFStringEncodingWindowsArabic ;
1912 break ;
1913 case wxFONTENCODING_CP1257 :
1914 enc = kCFStringEncodingWindowsBalticRim;
1915 break ;
1916// This only really encodes to UTF7 (if that) evidently
1917// case wxFONTENCODING_UTF7 :
1918// enc = kCFStringEncodingNonLossyASCII ;
1919// break ;
1920 case wxFONTENCODING_UTF8 :
1921 enc = kCFStringEncodingUTF8 ;
1922 break ;
1923 case wxFONTENCODING_EUC_JP :
1924 enc = kCFStringEncodingEUC_JP;
1925 break ;
1926 case wxFONTENCODING_UTF16 :
1927 enc = kCFStringEncodingUnicode ;
1928 break ;
1929 case wxFONTENCODING_MACROMAN :
1930 enc = kCFStringEncodingMacRoman ;
1931 break ;
1932 case wxFONTENCODING_MACJAPANESE :
1933 enc = kCFStringEncodingMacJapanese ;
1934 break ;
1935 case wxFONTENCODING_MACCHINESETRAD :
1936 enc = kCFStringEncodingMacChineseTrad ;
1937 break ;
1938 case wxFONTENCODING_MACKOREAN :
1939 enc = kCFStringEncodingMacKorean ;
1940 break ;
1941 case wxFONTENCODING_MACARABIC :
1942 enc = kCFStringEncodingMacArabic ;
1943 break ;
1944 case wxFONTENCODING_MACHEBREW :
1945 enc = kCFStringEncodingMacHebrew ;
1946 break ;
1947 case wxFONTENCODING_MACGREEK :
1948 enc = kCFStringEncodingMacGreek ;
1949 break ;
1950 case wxFONTENCODING_MACCYRILLIC :
1951 enc = kCFStringEncodingMacCyrillic ;
1952 break ;
1953 case wxFONTENCODING_MACDEVANAGARI :
1954 enc = kCFStringEncodingMacDevanagari ;
1955 break ;
1956 case wxFONTENCODING_MACGURMUKHI :
1957 enc = kCFStringEncodingMacGurmukhi ;
1958 break ;
1959 case wxFONTENCODING_MACGUJARATI :
1960 enc = kCFStringEncodingMacGujarati ;
1961 break ;
1962 case wxFONTENCODING_MACORIYA :
1963 enc = kCFStringEncodingMacOriya ;
1964 break ;
1965 case wxFONTENCODING_MACBENGALI :
1966 enc = kCFStringEncodingMacBengali ;
1967 break ;
1968 case wxFONTENCODING_MACTAMIL :
1969 enc = kCFStringEncodingMacTamil ;
1970 break ;
1971 case wxFONTENCODING_MACTELUGU :
1972 enc = kCFStringEncodingMacTelugu ;
1973 break ;
1974 case wxFONTENCODING_MACKANNADA :
1975 enc = kCFStringEncodingMacKannada ;
1976 break ;
1977 case wxFONTENCODING_MACMALAJALAM :
1978 enc = kCFStringEncodingMacMalayalam ;
1979 break ;
1980 case wxFONTENCODING_MACSINHALESE :
1981 enc = kCFStringEncodingMacSinhalese ;
1982 break ;
1983 case wxFONTENCODING_MACBURMESE :
1984 enc = kCFStringEncodingMacBurmese ;
1985 break ;
1986 case wxFONTENCODING_MACKHMER :
1987 enc = kCFStringEncodingMacKhmer ;
1988 break ;
1989 case wxFONTENCODING_MACTHAI :
1990 enc = kCFStringEncodingMacThai ;
1991 break ;
1992 case wxFONTENCODING_MACLAOTIAN :
1993 enc = kCFStringEncodingMacLaotian ;
1994 break ;
1995 case wxFONTENCODING_MACGEORGIAN :
1996 enc = kCFStringEncodingMacGeorgian ;
1997 break ;
1998 case wxFONTENCODING_MACARMENIAN :
1999 enc = kCFStringEncodingMacArmenian ;
2000 break ;
2001 case wxFONTENCODING_MACCHINESESIMP :
2002 enc = kCFStringEncodingMacChineseSimp ;
2003 break ;
2004 case wxFONTENCODING_MACTIBETAN :
2005 enc = kCFStringEncodingMacTibetan ;
2006 break ;
2007 case wxFONTENCODING_MACMONGOLIAN :
2008 enc = kCFStringEncodingMacMongolian ;
2009 break ;
2010 case wxFONTENCODING_MACETHIOPIC :
2011 enc = kCFStringEncodingMacEthiopic ;
2012 break ;
2013 case wxFONTENCODING_MACCENTRALEUR :
2014 enc = kCFStringEncodingMacCentralEurRoman ;
2015 break ;
2016 case wxFONTENCODING_MACVIATNAMESE :
2017 enc = kCFStringEncodingMacVietnamese ;
2018 break ;
2019 case wxFONTENCODING_MACARABICEXT :
2020 enc = kCFStringEncodingMacExtArabic ;
2021 break ;
2022 case wxFONTENCODING_MACSYMBOL :
2023 enc = kCFStringEncodingMacSymbol ;
2024 break ;
2025 case wxFONTENCODING_MACDINGBATS :
2026 enc = kCFStringEncodingMacDingbats ;
2027 break ;
2028 case wxFONTENCODING_MACTURKISH :
2029 enc = kCFStringEncodingMacTurkish ;
2030 break ;
2031 case wxFONTENCODING_MACCROATIAN :
2032 enc = kCFStringEncodingMacCroatian ;
2033 break ;
2034 case wxFONTENCODING_MACICELANDIC :
2035 enc = kCFStringEncodingMacIcelandic ;
2036 break ;
2037 case wxFONTENCODING_MACROMANIAN :
2038 enc = kCFStringEncodingMacRomanian ;
2039 break ;
2040 case wxFONTENCODING_MACCELTIC :
2041 enc = kCFStringEncodingMacCeltic ;
2042 break ;
2043 case wxFONTENCODING_MACGAELIC :
2044 enc = kCFStringEncodingMacGaelic ;
2045 break ;
2046// case wxFONTENCODING_MACKEYBOARD :
2047// enc = kCFStringEncodingMacKeyboardGlyphs ;
2048// break ;
2049 default :
2050 // because gcc is picky
2051 break ;
2052 } ;
2053 return enc ;
2054}
2055
2056class wxMBConv_cocoa : public wxMBConv
2057{
2058public:
2059 wxMBConv_cocoa()
2060 {
2061 Init(CFStringGetSystemEncoding()) ;
2062 }
2063
2064#if wxUSE_FONTMAP
2065 wxMBConv_cocoa(const wxChar* name)
2066 {
2067 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2068 }
2069#endif
2070
2071 wxMBConv_cocoa(wxFontEncoding encoding)
2072 {
2073 Init( wxCFStringEncFromFontEnc(encoding) );
2074 }
2075
2076 ~wxMBConv_cocoa()
2077 {
2078 }
2079
2080 void Init( CFStringEncoding encoding)
2081 {
2082 m_encoding = encoding ;
2083 }
2084
2085 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2086 {
2087 wxASSERT(szUnConv);
2088
2089 CFStringRef theString = CFStringCreateWithBytes (
2090 NULL, //the allocator
2091 (const UInt8*)szUnConv,
2092 strlen(szUnConv),
2093 m_encoding,
2094 false //no BOM/external representation
2095 );
2096
2097 wxASSERT(theString);
2098
2099 size_t nOutLength = CFStringGetLength(theString);
2100
2101 if (szOut == NULL)
2102 {
2103 CFRelease(theString);
2104 return nOutLength;
2105 }
2106
2107 CFRange theRange = { 0, nOutSize };
2108
2109#if SIZEOF_WCHAR_T == 4
2110 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2111#endif
2112
2113 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2114
2115 CFRelease(theString);
2116
2117 szUniCharBuffer[nOutLength] = '\0' ;
2118
2119#if SIZEOF_WCHAR_T == 4
2120 wxMBConvUTF16 converter ;
2121 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2122 delete[] szUniCharBuffer;
2123#endif
2124
2125 return nOutLength;
2126 }
2127
2128 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2129 {
2130 wxASSERT(szUnConv);
2131
2132 size_t nRealOutSize;
2133 size_t nBufSize = wxWcslen(szUnConv);
2134 UniChar* szUniBuffer = (UniChar*) szUnConv;
2135
2136#if SIZEOF_WCHAR_T == 4
2137 wxMBConvUTF16 converter ;
2138 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2139 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2140 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2141 nBufSize /= sizeof(UniChar);
2142#endif
2143
2144 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2145 NULL, //allocator
2146 szUniBuffer,
2147 nBufSize,
2148 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2149 );
2150
2151 wxASSERT(theString);
2152
2153 //Note that CER puts a BOM when converting to unicode
2154 //so we check and use getchars instead in that case
2155 if (m_encoding == kCFStringEncodingUnicode)
2156 {
2157 if (szOut != NULL)
2158 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2159
2160 nRealOutSize = CFStringGetLength(theString) + 1;
2161 }
2162 else
2163 {
2164 CFStringGetBytes(
2165 theString,
2166 CFRangeMake(0, CFStringGetLength(theString)),
2167 m_encoding,
2168 0, //what to put in characters that can't be converted -
2169 //0 tells CFString to return NULL if it meets such a character
2170 false, //not an external representation
2171 (UInt8*) szOut,
2172 nOutSize,
2173 (CFIndex*) &nRealOutSize
2174 );
2175 }
2176
2177 CFRelease(theString);
2178
2179#if SIZEOF_WCHAR_T == 4
2180 delete[] szUniBuffer;
2181#endif
2182
2183 return nRealOutSize - 1;
2184 }
2185
2186 bool IsOk() const
2187 {
2188 return m_encoding != kCFStringEncodingInvalidId &&
2189 CFStringIsEncodingAvailable(m_encoding);
2190 }
2191
2192private:
2193 CFStringEncoding m_encoding ;
2194};
2195
2196#endif // defined(__WXCOCOA__)
2197
2198// ============================================================================
2199// Mac conversion classes
2200// ============================================================================
2201
2202#if defined(__WXMAC__) && defined(TARGET_CARBON)
2203
2204class wxMBConv_mac : public wxMBConv
2205{
2206public:
2207 wxMBConv_mac()
2208 {
2209 Init(CFStringGetSystemEncoding()) ;
2210 }
2211
2212#if wxUSE_FONTMAP
2213 wxMBConv_mac(const wxChar* name)
2214 {
2215 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2216 }
2217#endif
2218
2219 wxMBConv_mac(wxFontEncoding encoding)
2220 {
2221 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2222 }
2223
2224 ~wxMBConv_mac()
2225 {
2226 OSStatus status = noErr ;
2227 status = TECDisposeConverter(m_MB2WC_converter);
2228 status = TECDisposeConverter(m_WC2MB_converter);
2229 }
2230
2231
2232 void Init( TextEncodingBase encoding)
2233 {
2234 OSStatus status = noErr ;
2235 m_char_encoding = encoding ;
2236 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2237
2238 status = TECCreateConverter(&m_MB2WC_converter,
2239 m_char_encoding,
2240 m_unicode_encoding);
2241 status = TECCreateConverter(&m_WC2MB_converter,
2242 m_unicode_encoding,
2243 m_char_encoding);
2244 }
2245
2246 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2247 {
2248 OSStatus status = noErr ;
2249 ByteCount byteOutLen ;
2250 ByteCount byteInLen = strlen(psz) ;
2251 wchar_t *tbuf = NULL ;
2252 UniChar* ubuf = NULL ;
2253 size_t res = 0 ;
2254
2255 if (buf == NULL)
2256 {
2257 //apple specs say at least 32
2258 n = wxMax( 32 , byteInLen ) ;
2259 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2260 }
2261 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2262#if SIZEOF_WCHAR_T == 4
2263 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2264#else
2265 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2266#endif
2267 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2268 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2269#if SIZEOF_WCHAR_T == 4
2270 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2271 // is not properly terminated we get random characters at the end
2272 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2273 wxMBConvUTF16 converter ;
2274 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2275 free( ubuf ) ;
2276#else
2277 res = byteOutLen / sizeof( UniChar ) ;
2278#endif
2279 if ( buf == NULL )
2280 free(tbuf) ;
2281
2282 if ( buf && res < n)
2283 buf[res] = 0;
2284
2285 return res ;
2286 }
2287
2288 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2289 {
2290 OSStatus status = noErr ;
2291 ByteCount byteOutLen ;
2292 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2293
2294 char *tbuf = NULL ;
2295
2296 if (buf == NULL)
2297 {
2298 //apple specs say at least 32
2299 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2300 tbuf = (char*) malloc( n ) ;
2301 }
2302
2303 ByteCount byteBufferLen = n ;
2304 UniChar* ubuf = NULL ;
2305#if SIZEOF_WCHAR_T == 4
2306 wxMBConvUTF16 converter ;
2307 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2308 byteInLen = unicharlen ;
2309 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2310 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2311#else
2312 ubuf = (UniChar*) psz ;
2313#endif
2314 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2315 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2316#if SIZEOF_WCHAR_T == 4
2317 free( ubuf ) ;
2318#endif
2319 if ( buf == NULL )
2320 free(tbuf) ;
2321
2322 size_t res = byteOutLen ;
2323 if ( buf && res < n)
2324 {
2325 buf[res] = 0;
2326
2327 //we need to double-trip to verify it didn't insert any ? in place
2328 //of bogus characters
2329 wxWCharBuffer wcBuf(n);
2330 size_t pszlen = wxWcslen(psz);
2331 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2332 wxWcslen(wcBuf) != pszlen ||
2333 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2334 {
2335 // we didn't obtain the same thing we started from, hence
2336 // the conversion was lossy and we consider that it failed
2337 return (size_t)-1;
2338 }
2339 }
2340
2341 return res ;
2342 }
2343
2344 bool IsOk() const
2345 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2346
2347private:
2348 TECObjectRef m_MB2WC_converter ;
2349 TECObjectRef m_WC2MB_converter ;
2350
2351 TextEncodingBase m_char_encoding ;
2352 TextEncodingBase m_unicode_encoding ;
2353};
2354
2355#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2356
2357// ============================================================================
2358// wxEncodingConverter based conversion classes
2359// ============================================================================
2360
2361#if wxUSE_FONTMAP
2362
2363class wxMBConv_wxwin : public wxMBConv
2364{
2365private:
2366 void Init()
2367 {
2368 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2369 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2370 }
2371
2372public:
2373 // temporarily just use wxEncodingConverter stuff,
2374 // so that it works while a better implementation is built
2375 wxMBConv_wxwin(const wxChar* name)
2376 {
2377 if (name)
2378 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2379 else
2380 m_enc = wxFONTENCODING_SYSTEM;
2381
2382 Init();
2383 }
2384
2385 wxMBConv_wxwin(wxFontEncoding enc)
2386 {
2387 m_enc = enc;
2388
2389 Init();
2390 }
2391
2392 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2393 {
2394 size_t inbuf = strlen(psz);
2395 if (buf)
2396 {
2397 if (!m2w.Convert(psz,buf))
2398 return (size_t)-1;
2399 }
2400 return inbuf;
2401 }
2402
2403 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2404 {
2405 const size_t inbuf = wxWcslen(psz);
2406 if (buf)
2407 {
2408 if (!w2m.Convert(psz,buf))
2409 return (size_t)-1;
2410 }
2411
2412 return inbuf;
2413 }
2414
2415 bool IsOk() const { return m_ok; }
2416
2417public:
2418 wxFontEncoding m_enc;
2419 wxEncodingConverter m2w, w2m;
2420
2421 // were we initialized successfully?
2422 bool m_ok;
2423
2424 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2425};
2426
2427// make the constructors available for unit testing
2428WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2429{
2430 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2431 if ( !result->IsOk() )
2432 {
2433 delete result;
2434 return 0;
2435 }
2436 return result;
2437}
2438
2439#endif // wxUSE_FONTMAP
2440
2441// ============================================================================
2442// wxCSConv implementation
2443// ============================================================================
2444
2445void wxCSConv::Init()
2446{
2447 m_name = NULL;
2448 m_convReal = NULL;
2449 m_deferred = true;
2450}
2451
2452wxCSConv::wxCSConv(const wxChar *charset)
2453{
2454 Init();
2455
2456 if ( charset )
2457 {
2458 SetName(charset);
2459 }
2460
2461 m_encoding = wxFONTENCODING_SYSTEM;
2462}
2463
2464wxCSConv::wxCSConv(wxFontEncoding encoding)
2465{
2466 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2467 {
2468 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2469
2470 encoding = wxFONTENCODING_SYSTEM;
2471 }
2472
2473 Init();
2474
2475 m_encoding = encoding;
2476}
2477
2478wxCSConv::~wxCSConv()
2479{
2480 Clear();
2481}
2482
2483wxCSConv::wxCSConv(const wxCSConv& conv)
2484 : wxMBConv()
2485{
2486 Init();
2487
2488 SetName(conv.m_name);
2489 m_encoding = conv.m_encoding;
2490}
2491
2492wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2493{
2494 Clear();
2495
2496 SetName(conv.m_name);
2497 m_encoding = conv.m_encoding;
2498
2499 return *this;
2500}
2501
2502void wxCSConv::Clear()
2503{
2504 free(m_name);
2505 delete m_convReal;
2506
2507 m_name = NULL;
2508 m_convReal = NULL;
2509}
2510
2511void wxCSConv::SetName(const wxChar *charset)
2512{
2513 if (charset)
2514 {
2515 m_name = wxStrdup(charset);
2516 m_deferred = true;
2517 }
2518}
2519
#if wxUSE_FONTMAP

#include "wx/hashmap.h"

// hash map type associating a font encoding with a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

// cache of the charset names found to work with iconv for each encoding; an
// empty string is stored for the encodings for which no name worked
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2528
2529wxMBConv *wxCSConv::DoCreate() const
2530{
2531#if wxUSE_FONTMAP
2532 wxLogTrace(TRACE_STRCONV,
2533 wxT("creating conversion for %s"),
2534 (m_name ? m_name
2535 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2536#endif // wxUSE_FONTMAP
2537
2538 // check for the special case of ASCII or ISO8859-1 charset: as we have
2539 // special knowledge of it anyhow, we don't need to create a special
2540 // conversion object
2541 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2542 {
2543 // don't convert at all
2544 return NULL;
2545 }
2546
2547 // we trust OS to do conversion better than we can so try external
2548 // conversion methods first
2549 //
2550 // the full order is:
2551 // 1. OS conversion (iconv() under Unix or Win32 API)
2552 // 2. hard coded conversions for UTF
2553 // 3. wxEncodingConverter as fall back
2554
2555 // step (1)
2556#ifdef HAVE_ICONV
2557#if !wxUSE_FONTMAP
2558 if ( m_name )
2559#endif // !wxUSE_FONTMAP
2560 {
2561 wxString name(m_name);
2562 wxFontEncoding encoding(m_encoding);
2563
2564 if ( !name.empty() )
2565 {
2566 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2567 if ( conv->IsOk() )
2568 return conv;
2569
2570 delete conv;
2571
2572#if wxUSE_FONTMAP
2573 encoding =
2574 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2575#endif // wxUSE_FONTMAP
2576 }
2577#if wxUSE_FONTMAP
2578 {
2579 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2580 if ( it != gs_nameCache.end() )
2581 {
2582 if ( it->second.empty() )
2583 return NULL;
2584
2585 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2586 if ( conv->IsOk() )
2587 return conv;
2588
2589 delete conv;
2590 }
2591
2592 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2593
2594 for ( ; *names; ++names )
2595 {
2596 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2597 if ( conv->IsOk() )
2598 {
2599 gs_nameCache[encoding] = *names;
2600 return conv;
2601 }
2602
2603 delete conv;
2604 }
2605
2606 gs_nameCache[encoding] = _T(""); // cache the failure
2607 }
2608#endif // wxUSE_FONTMAP
2609 }
2610#endif // HAVE_ICONV
2611
2612#ifdef wxHAVE_WIN32_MB2WC
2613 {
2614#if wxUSE_FONTMAP
2615 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2616 : new wxMBConv_win32(m_encoding);
2617 if ( conv->IsOk() )
2618 return conv;
2619
2620 delete conv;
2621#else
2622 return NULL;
2623#endif
2624 }
2625#endif // wxHAVE_WIN32_MB2WC
2626#if defined(__WXMAC__)
2627 {
2628 // leave UTF16 and UTF32 to the built-ins of wx
2629 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2630 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2631 {
2632
2633#if wxUSE_FONTMAP
2634 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2635 : new wxMBConv_mac(m_encoding);
2636#else
2637 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2638#endif
2639 if ( conv->IsOk() )
2640 return conv;
2641
2642 delete conv;
2643 }
2644 }
2645#endif
2646#if defined(__WXCOCOA__)
2647 {
2648 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2649 {
2650
2651#if wxUSE_FONTMAP
2652 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2653 : new wxMBConv_cocoa(m_encoding);
2654#else
2655 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2656#endif
2657 if ( conv->IsOk() )
2658 return conv;
2659
2660 delete conv;
2661 }
2662 }
2663#endif
2664 // step (2)
2665 wxFontEncoding enc = m_encoding;
2666#if wxUSE_FONTMAP
2667 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2668 {
2669 // use "false" to suppress interactive dialogs -- we can be called from
2670 // anywhere and popping up a dialog from here is the last thing we want to
2671 // do
2672 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2673 }
2674#endif // wxUSE_FONTMAP
2675
2676 switch ( enc )
2677 {
2678 case wxFONTENCODING_UTF7:
2679 return new wxMBConvUTF7;
2680
2681 case wxFONTENCODING_UTF8:
2682 return new wxMBConvUTF8;
2683
2684 case wxFONTENCODING_UTF16BE:
2685 return new wxMBConvUTF16BE;
2686
2687 case wxFONTENCODING_UTF16LE:
2688 return new wxMBConvUTF16LE;
2689
2690 case wxFONTENCODING_UTF32BE:
2691 return new wxMBConvUTF32BE;
2692
2693 case wxFONTENCODING_UTF32LE:
2694 return new wxMBConvUTF32LE;
2695
2696 default:
2697 // nothing to do but put here to suppress gcc warnings
2698 ;
2699 }
2700
2701 // step (3)
2702#if wxUSE_FONTMAP
2703 {
2704 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2705 : new wxMBConv_wxwin(m_encoding);
2706 if ( conv->IsOk() )
2707 return conv;
2708
2709 delete conv;
2710 }
2711#endif // wxUSE_FONTMAP
2712
2713 // NB: This is a hack to prevent deadlock. What could otherwise happen
2714 // in Unicode build: wxConvLocal creation ends up being here
2715 // because of some failure and logs the error. But wxLog will try to
2716 // attach timestamp, for which it will need wxConvLocal (to convert
2717 // time to char* and then wchar_t*), but that fails, tries to log
2718 // error, but wxLog has a (already locked) critical section that
2719 // guards static buffer.
2720 static bool alreadyLoggingError = false;
2721 if (!alreadyLoggingError)
2722 {
2723 alreadyLoggingError = true;
2724 wxLogError(_("Cannot convert from the charset '%s'!"),
2725 m_name ? m_name
2726 :
2727#if wxUSE_FONTMAP
2728 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2729#else // !wxUSE_FONTMAP
2730 wxString::Format(_("encoding %s"), m_encoding).c_str()
2731#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2732 );
2733 alreadyLoggingError = false;
2734 }
2735
2736 return NULL;
2737}
2738
2739void wxCSConv::CreateConvIfNeeded() const
2740{
2741 if ( m_deferred )
2742 {
2743 wxCSConv *self = (wxCSConv *)this; // const_cast
2744
2745#if wxUSE_INTL
2746 // if we don't have neither the name nor the encoding, use the default
2747 // encoding for this system
2748 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2749 {
2750 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2751 }
2752#endif // wxUSE_INTL
2753
2754 self->m_convReal = DoCreate();
2755 self->m_deferred = false;
2756 }
2757}
2758
2759size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2760{
2761 CreateConvIfNeeded();
2762
2763 if (m_convReal)
2764 return m_convReal->MB2WC(buf, psz, n);
2765
2766 // latin-1 (direct)
2767 size_t len = strlen(psz);
2768
2769 if (buf)
2770 {
2771 for (size_t c = 0; c <= len; c++)
2772 buf[c] = (unsigned char)(psz[c]);
2773 }
2774
2775 return len;
2776}
2777
2778size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2779{
2780 CreateConvIfNeeded();
2781
2782 if (m_convReal)
2783 return m_convReal->WC2MB(buf, psz, n);
2784
2785 // latin-1 (direct)
2786 const size_t len = wxWcslen(psz);
2787 if (buf)
2788 {
2789 for (size_t c = 0; c <= len; c++)
2790 {
2791 if (psz[c] > 0xFF)
2792 return (size_t)-1;
2793 buf[c] = (char)psz[c];
2794 }
2795 }
2796 else
2797 {
2798 for (size_t c = 0; c <= len; c++)
2799 {
2800 if (psz[c] > 0xFF)
2801 return (size_t)-1;
2802 }
2803 }
2804
2805 return len;
2806}
2807
2808// ----------------------------------------------------------------------------
2809// globals
2810// ----------------------------------------------------------------------------
2811
2812#ifdef __WINDOWS__
2813 static wxMBConv_win32 wxConvLibcObj;
2814#elif defined(__WXMAC__) && !defined(__MACH__)
2815 static wxMBConv_mac wxConvLibcObj ;
2816#else
2817 static wxMBConvLibc wxConvLibcObj;
2818#endif
2819
2820static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2821static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2822static wxMBConvUTF7 wxConvUTF7Obj;
2823static wxMBConvUTF8 wxConvUTF8Obj;
2824
2825WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2826WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2827WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2828WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2829WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2830WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2831WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2832#ifdef __WXOSX__
2833 wxConvUTF8Obj;
2834#else
2835 wxConvLibcObj;
2836#endif
2837
2838
2839#else // !wxUSE_WCHAR_T
2840
2841// stand-ins in absence of wchar_t
2842WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2843 wxConvISO8859_1,
2844 wxConvLocal,
2845 wxConvUTF8;
2846
2847#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T