]> git.saurik.com Git - wxWidgets.git/blame_incremental - src/common/strconv.cpp
move code ignoring VK_SPACE and VK_RETURN WM_CHAR messages to MSWDefWindowProc()...
[wxWidgets.git] / src / common / strconv.cpp
... / ...
CommitLineData
1/////////////////////////////////////////////////////////////////////////////
2// Name: strconv.cpp
3// Purpose: Unicode conversion classes
4// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5// Ryan Norton, Fredrik Roubert (UTF7)
6// Modified by:
7// Created: 29/01/98
8// RCS-ID: $Id$
9// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10// (c) 2000-2003 Vadim Zeitlin
11// (c) 2004 Ryan Norton, Fredrik Roubert
12// Licence: wxWindows licence
13/////////////////////////////////////////////////////////////////////////////
14
15// ============================================================================
16// declarations
17// ============================================================================
18
19// ----------------------------------------------------------------------------
20// headers
21// ----------------------------------------------------------------------------
22
23#if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25#endif
26
27// For compilers that support precompilation, includes "wx.h".
28#include "wx/wxprec.h"
29
30#ifdef __BORLANDC__
31 #pragma hdrstop
32#endif
33
34#ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37#endif // WX_PRECOMP
38
39#include "wx/strconv.h"
40
41#if wxUSE_WCHAR_T
42
43#ifdef __WXMSW__
44 #include "wx/msw/private.h"
45#endif
46
47#ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49#endif
50
51#ifndef __WXWINCE__
52#include <errno.h>
53#endif
54
55#include <ctype.h>
56#include <string.h>
57#include <stdlib.h>
58
59#if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61#endif // __WIN32__ but !__WXMICROWIN__
62
63// ----------------------------------------------------------------------------
64// headers
65// ----------------------------------------------------------------------------
66
67#ifdef __SALFORDC__
68 #include <clib.h>
69#endif
70
71#ifdef HAVE_ICONV
72 #include <iconv.h>
73 #include "wx/thread.h"
74#endif
75
76#include "wx/encconv.h"
77#include "wx/fontmap.h"
78#include "wx/utils.h"
79
80#ifdef __WXMAC__
81#include <ATSUnicode.h>
82#include <TextCommon.h>
83#include <TextEncodingConverter.h>
84
85#include "wx/mac/private.h" // includes mac headers
86#endif
87// ----------------------------------------------------------------------------
88// macros
89// ----------------------------------------------------------------------------
90
// In-place helpers: apply wxUINT32_SWAP_ALWAYS / wxUINT16_SWAP_ALWAYS to each
// of the first len elements of str.
91#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
92#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
93
// Pick the charset names and the matching swap helper according to the size
// of wchar_t. WC_NAME_BEST additionally carries an explicit byte order chosen
// from WORDS_BIGENDIAN. WC_UTF16 is defined only when wchar_t is 2 bytes wide
// and selects the UTF-16 aware code paths further down in this file.
94#if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
99 #else
100 #define WC_NAME_BEST "UCS-4LE"
101 #endif
102#elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
105 #define WC_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
108 #else
109 #define WC_NAME_BEST "UTF-16LE"
110 #endif
111#else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
114#endif
115
116// ============================================================================
117// implementation
118// ============================================================================
119
120// ----------------------------------------------------------------------------
121// UTF-16 en/decoding to/from UCS-4
122// ----------------------------------------------------------------------------
123
124
125static size_t encode_utf16(wxUint32 input, wxUint16 *output)
126{
127 if (input<=0xffff)
128 {
129 if (output)
130 *output = (wxUint16) input;
131 return 1;
132 }
133 else if (input>=0x110000)
134 {
135 return (size_t)-1;
136 }
137 else
138 {
139 if (output)
140 {
141 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
142 *output = (wxUint16) ((input&0x3ff)+0xdc00);
143 }
144 return 2;
145 }
146}
147
148static size_t decode_utf16(const wxUint16* input, wxUint32& output)
149{
150 if ((*input<0xd800) || (*input>0xdfff))
151 {
152 output = *input;
153 return 1;
154 }
155 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
156 {
157 output = *input;
158 return (size_t)-1;
159 }
160 else
161 {
162 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
163 return 2;
164 }
165}
166
167
168// ----------------------------------------------------------------------------
169// wxMBConv
170// ----------------------------------------------------------------------------
171
172wxMBConv::~wxMBConv()
173{
174 // nothing to do here (necessary for Darwin linking probably)
175}
176
177const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
178{
179 if ( psz )
180 {
181 // calculate the length of the buffer needed first
182 size_t nLen = MB2WC(NULL, psz, 0);
183 if ( nLen != (size_t)-1 )
184 {
185 // now do the actual conversion
186 wxWCharBuffer buf(nLen);
187 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
188 if ( nLen != (size_t)-1 )
189 {
190 return buf;
191 }
192 }
193 }
194
195 wxWCharBuffer buf((wchar_t *)NULL);
196
197 return buf;
198}
199
200const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
201{
202 if ( pwz )
203 {
204 size_t nLen = WC2MB(NULL, pwz, 0);
205 if ( nLen != (size_t)-1 )
206 {
207 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
208 nLen = WC2MB(buf.data(), pwz, nLen + 4);
209 if ( nLen != (size_t)-1 )
210 {
211 return buf;
212 }
213 }
214 }
215
216 wxCharBuffer buf((char *)NULL);
217
218 return buf;
219}
220
221const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
222{
223 wxASSERT(pOutSize != NULL);
224
225 const char* szEnd = szString + nStringLen + 1;
226 const char* szPos = szString;
227 const char* szStart = szPos;
228
229 size_t nActualLength = 0;
230 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
231
232 wxWCharBuffer theBuffer(nCurrentSize);
233
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos != szEnd)
237 {
238 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
239
240 //Get the length of the current (sub)string
241 size_t nLen = MB2WC(NULL, szPos, 0);
242
243 //Invalid conversion?
244 if( nLen == (size_t)-1 )
245 {
246 *pOutSize = 0;
247 theBuffer.data()[0u] = wxT('\0');
248 return theBuffer;
249 }
250
251
252 //Increase the actual length (+1 for current null character)
253 nActualLength += nLen + 1;
254
255 //if buffer too big, realloc the buffer
256 if (nActualLength > (nCurrentSize+1))
257 {
258 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
259 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
260 theBuffer = theNewBuffer;
261 nCurrentSize <<= 1;
262 }
263
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
266 {
267 *pOutSize = 0;
268 theBuffer.data()[0u] = wxT('\0');
269 return theBuffer;
270 }
271
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //here because XX2XX gives us the size of the output buffer,
275 //not neccessarly the length of the string
276 szPos += strlen(szPos) + 1;
277 }
278
279 //success - return actual length and the buffer
280 *pOutSize = nActualLength;
281 return theBuffer;
282}
283
284const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
285{
286 wxASSERT(pOutSize != NULL);
287
288 const wchar_t* szEnd = szString + nStringLen + 1;
289 const wchar_t* szPos = szString;
290 const wchar_t* szStart = szPos;
291
292 size_t nActualLength = 0;
293 size_t nCurrentSize = nStringLen << 2; //try * 4 first
294
295 wxCharBuffer theBuffer(nCurrentSize);
296
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos != szEnd)
300 {
301 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
302
303 //Get the length of the current (sub)string
304 size_t nLen = WC2MB(NULL, szPos, 0);
305
306 //Invalid conversion?
307 if( nLen == (size_t)-1 )
308 {
309 *pOutSize = 0;
310 theBuffer.data()[0u] = wxT('\0');
311 return theBuffer;
312 }
313
314 //Increase the actual length (+1 for current null character)
315 nActualLength += nLen + 1;
316
317 //if buffer too big, realloc the buffer
318 if (nActualLength > (nCurrentSize+1))
319 {
320 wxCharBuffer theNewBuffer(nCurrentSize << 1);
321 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
322 theBuffer = theNewBuffer;
323 nCurrentSize <<= 1;
324 }
325
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
328 {
329 *pOutSize = 0;
330 theBuffer.data()[0u] = wxT('\0');
331 return theBuffer;
332 }
333
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //here because XX2XX gives us the size of the output buffer,
337 //not neccessarly the length of the string
338 szPos += wxWcslen(szPos) + 1;
339 }
340
341 //success - return actual length and the buffer
342 *pOutSize = nActualLength;
343 return theBuffer;
344}
345
346// ----------------------------------------------------------------------------
347// wxMBConvLibc
348// ----------------------------------------------------------------------------
349
350size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
351{
352 return wxMB2WC(buf, psz, n);
353}
354
355size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
356{
357 return wxWC2MB(buf, psz, n);
358}
359
360#ifdef __UNIX__
361
362// ----------------------------------------------------------------------------
363// wxConvBrokenFileNames
364// ----------------------------------------------------------------------------
365
366wxConvBrokenFileNames::wxConvBrokenFileNames()
367{
368 // decide which conversion to use for the file names
369
370 // (1) this variable exists for the sole purpose of specifying the encoding
371 // of the filenames for GTK+ programs, so use it if it is set
372 wxString encName(wxGetenv(_T("G_FILENAME_ENCODING")));
373 encName.MakeUpper();
374 if ( !encName.empty() && encName != _T("UTF-8") && encName != _T("UTF8") )
375 {
376 m_conv = new wxCSConv(encName);
377 }
378 else // no G_FILENAME_ENCODING
379 {
380 if ( encName.empty() )
381 encName = wxLocale::GetSystemEncodingName().Upper();
382
383 // (2) if a non default locale is set, assume that the user wants his
384 // filenames in this locale too
385 if ( !encName.empty() && encName != _T("UTF-8") && encName != _T("UTF8") )
386 {
387 wxSetEnv(_T("G_FILENAME_ENCODING"), encName);
388 m_conv = new wxMBConvLibc;
389 }
390 else
391 {
392 // (3) finally use UTF-8 by default
393 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
394 }
395 }
396}
397
398size_t
399wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
400 const char *psz,
401 size_t outputSize) const
402{
403 return m_conv->MB2WC( outputBuf, psz, outputSize );
404}
405
406size_t
407wxConvBrokenFileNames::WC2MB(char *outputBuf,
408 const wchar_t *psz,
409 size_t outputSize) const
410{
411 return m_conv->WC2MB( outputBuf, psz, outputSize );
412}
413
414#endif
415
416// ----------------------------------------------------------------------------
417// UTF-7
418// ----------------------------------------------------------------------------
419
420// Implementation (C) 2004 Fredrik Roubert
421
422//
423// BASE64 decoding table
424//
// Indexed by byte value (256 entries). Entries for 'A'-'Z', 'a'-'z', '0'-'9',
// '+' and '/' hold the 6-bit value of the digit (0x00-0x3f); every other
// entry is 0xff, which the UTF-7 decoder uses to detect the end of a BASE64
// run.
425static const unsigned char utf7unb64[] =
426{
427 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
428 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
429 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
430 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
431 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
432 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
433 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
434 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
435 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
436 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
437 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
438 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
439 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
440 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
441 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
442 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
443 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
444 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
445 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
446 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
447 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
448 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
449 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
450 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
451 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
452 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
453 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
454 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
455 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
456 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
457 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
458 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
459};
460
461size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
462{
463 size_t len = 0;
464
465 while (*psz && ((!buf) || (len < n)))
466 {
467 unsigned char cc = *psz++;
468 if (cc != '+')
469 {
470 // plain ASCII char
471 if (buf)
472 *buf++ = cc;
473 len++;
474 }
475 else if (*psz == '-')
476 {
477 // encoded plus sign
478 if (buf)
479 *buf++ = cc;
480 len++;
481 psz++;
482 }
483 else
484 {
485 // BASE64 encoded string
486 bool lsb;
487 unsigned char c;
488 unsigned int d, l;
489 for (lsb = false, d = 0, l = 0;
490 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
491 {
492 d <<= 6;
493 d += cc;
494 for (l += 6; l >= 8; lsb = !lsb)
495 {
496 c = (unsigned char)((d >> (l -= 8)) % 256);
497 if (lsb)
498 {
499 if (buf)
500 *buf++ |= c;
501 len ++;
502 }
503 else
504 if (buf)
505 *buf = (wchar_t)(c << 8);
506 }
507 }
508 if (*psz == '-')
509 psz++;
510 }
511 }
512 if (buf && (len < n))
513 *buf = 0;
514 return len;
515}
516
517//
518// BASE64 encoding table
519//
// Maps a 6-bit value (0-63) to its BASE64 digit: 'A'-'Z', 'a'-'z', '0'-'9',
// '+', '/'.
520static const unsigned char utf7enb64[] =
521{
522 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
523 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
524 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
525 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
526 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
527 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
528 'w', 'x', 'y', 'z', '0', '1', '2', '3',
529 '4', '5', '6', '7', '8', '9', '+', '/'
530};
531
532//
533// UTF-7 encoding table
534//
535// 0 - Set D (directly encoded characters)
536// 1 - Set O (optional direct characters)
537// 2 - whitespace characters (optional)
538// 3 - special characters
539//
// One class code (0-3, see the legend above) per 7-bit character, indexed by
// character value. The UTF-7 encoder in this file writes a character directly
// only when its entry is 0; everything else is BASE64 encoded.
540static const unsigned char utf7encode[128] =
541{
542 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
543 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
544 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
546 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
548 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
549 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
550};
551
552size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
553{
554
555
556 size_t len = 0;
557
558 while (*psz && ((!buf) || (len < n)))
559 {
560 wchar_t cc = *psz++;
561 if (cc < 0x80 && utf7encode[cc] < 1)
562 {
563 // plain ASCII char
564 if (buf)
565 *buf++ = (char)cc;
566 len++;
567 }
568#ifndef WC_UTF16
569 else if (((wxUint32)cc) > 0xffff)
570 {
571 // no surrogate pair generation (yet?)
572 return (size_t)-1;
573 }
574#endif
575 else
576 {
577 if (buf)
578 *buf++ = '+';
579 len++;
580 if (cc != '+')
581 {
582 // BASE64 encode string
583 unsigned int lsb, d, l;
584 for (d = 0, l = 0;; psz++)
585 {
586 for (lsb = 0; lsb < 2; lsb ++)
587 {
588 d <<= 8;
589 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
590
591 for (l += 8; l >= 6; )
592 {
593 l -= 6;
594 if (buf)
595 *buf++ = utf7enb64[(d >> l) % 64];
596 len++;
597 }
598 }
599 cc = *psz;
600 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
601 break;
602 }
603 if (l != 0)
604 {
605 if (buf)
606 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
607 len++;
608 }
609 }
610 if (buf)
611 *buf++ = '-';
612 len++;
613 }
614 }
615 if (buf && (len < n))
616 *buf = 0;
617 return len;
618}
619
620// ----------------------------------------------------------------------------
621// UTF-8
622// ----------------------------------------------------------------------------
623
624static wxUint32 utf8_max[]=
625 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
626
627// boundaries of the private use area we use to (temporarily) remap invalid
628// characters invalid in a UTF-8 encoded string
629const wxUint32 wxUnicodePUA = 0x100000;
630const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
631
632size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
633{
634 size_t len = 0;
635
636 while (*psz && ((!buf) || (len < n)))
637 {
638 const char *opsz = psz;
639 bool invalid = false;
640 unsigned char cc = *psz++, fc = cc;
641 unsigned cnt;
642 for (cnt = 0; fc & 0x80; cnt++)
643 fc <<= 1;
644 if (!cnt)
645 {
646 // plain ASCII char
647 if (buf)
648 *buf++ = cc;
649 len++;
650
651 // escape the escape character for octal escapes
652 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
653 && cc == '\\' && (!buf || len < n))
654 {
655 if (buf)
656 *buf++ = cc;
657 len++;
658 }
659 }
660 else
661 {
662 cnt--;
663 if (!cnt)
664 {
665 // invalid UTF-8 sequence
666 invalid = true;
667 }
668 else
669 {
670 unsigned ocnt = cnt - 1;
671 wxUint32 res = cc & (0x3f >> cnt);
672 while (cnt--)
673 {
674 cc = *psz;
675 if ((cc & 0xC0) != 0x80)
676 {
677 // invalid UTF-8 sequence
678 invalid = true;
679 break;
680 }
681 psz++;
682 res = (res << 6) | (cc & 0x3f);
683 }
684 if (invalid || res <= utf8_max[ocnt])
685 {
686 // illegal UTF-8 encoding
687 invalid = true;
688 }
689 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
690 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
691 {
692 // if one of our PUA characters turns up externally
693 // it must also be treated as an illegal sequence
694 // (a bit like you have to escape an escape character)
695 invalid = true;
696 }
697 else
698 {
699#ifdef WC_UTF16
700 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
701 size_t pa = encode_utf16(res, (wxUint16 *)buf);
702 if (pa == (size_t)-1)
703 {
704 invalid = true;
705 }
706 else
707 {
708 if (buf)
709 buf += pa;
710 len += pa;
711 }
712#else // !WC_UTF16
713 if (buf)
714 *buf++ = res;
715 len++;
716#endif // WC_UTF16/!WC_UTF16
717 }
718 }
719 if (invalid)
720 {
721 if (m_options & MAP_INVALID_UTF8_TO_PUA)
722 {
723 while (opsz < psz && (!buf || len < n))
724 {
725#ifdef WC_UTF16
726 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
727 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
728 wxASSERT(pa != (size_t)-1);
729 if (buf)
730 buf += pa;
731 opsz++;
732 len += pa;
733#else
734 if (buf)
735 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
736 opsz++;
737 len++;
738#endif
739 }
740 }
741 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
742 {
743 while (opsz < psz && (!buf || len < n))
744 {
745 if ( buf && len + 3 < n )
746 {
747 unsigned char n = *opsz;
748 *buf++ = L'\\';
749 *buf++ = (wchar_t)( L'0' + n / 0100 );
750 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
751 *buf++ = (wchar_t)( L'0' + n % 010 );
752 }
753 opsz++;
754 len += 4;
755 }
756 }
757 else // MAP_INVALID_UTF8_NOT
758 {
759 return (size_t)-1;
760 }
761 }
762 }
763 }
764 if (buf && (len < n))
765 *buf = 0;
766 return len;
767}
768
// Returns true if wch is one of the octal digits L'0' .. L'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
773
774size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
775{
776 size_t len = 0;
777
778 while (*psz && ((!buf) || (len < n)))
779 {
780 wxUint32 cc;
781#ifdef WC_UTF16
782 // cast is ok for WC_UTF16
783 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
784 psz += (pa == (size_t)-1) ? 1 : pa;
785#else
786 cc=(*psz++) & 0x7fffffff;
787#endif
788
789 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
790 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
791 {
792 if (buf)
793 *buf++ = (char)(cc - wxUnicodePUA);
794 len++;
795 }
796 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
797 && cc == L'\\' && psz[0] == L'\\' )
798 {
799 if (buf)
800 *buf++ = (char)cc;
801 psz++;
802 len++;
803 }
804 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
805 cc == L'\\' &&
806 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
807 {
808 if (buf)
809 {
810 *buf++ = (char) ((psz[0] - L'0')*0100 +
811 (psz[1] - L'0')*010 +
812 (psz[2] - L'0'));
813 }
814
815 psz += 3;
816 len++;
817 }
818 else
819 {
820 unsigned cnt;
821 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
822 if (!cnt)
823 {
824 // plain ASCII char
825 if (buf)
826 *buf++ = (char) cc;
827 len++;
828 }
829
830 else
831 {
832 len += cnt + 1;
833 if (buf)
834 {
835 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
836 while (cnt--)
837 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
838 }
839 }
840 }
841 }
842
843 if (buf && (len<n))
844 *buf = 0;
845
846 return len;
847}
848
849// ----------------------------------------------------------------------------
850// UTF-16
851// ----------------------------------------------------------------------------
852
// "straight" is an alias for the UTF-16 converter class selected when the
// byte order matches WORDS_BIGENDIAN (BE when it is defined, LE otherwise) and
// "swap" for the other one; the member functions below are written once under
// these aliases.
853#ifdef WORDS_BIGENDIAN
854 #define wxMBConvUTF16straight wxMBConvUTF16BE
855 #define wxMBConvUTF16swap wxMBConvUTF16LE
856#else
857 #define wxMBConvUTF16swap wxMBConvUTF16BE
858 #define wxMBConvUTF16straight wxMBConvUTF16LE
859#endif
860
861
862#ifdef WC_UTF16
863
864// copy 16bit MB to 16bit String
865size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
866{
867 size_t len=0;
868
869 while (*(wxUint16*)psz && (!buf || len < n))
870 {
871 if (buf)
872 *buf++ = *(wxUint16*)psz;
873 len++;
874
875 psz += sizeof(wxUint16);
876 }
877 if (buf && len<n) *buf=0;
878
879 return len;
880}
881
882
883// copy 16bit String to 16bit MB
884size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
885{
886 size_t len=0;
887
888 while (*psz && (!buf || len < n))
889 {
890 if (buf)
891 {
892 *(wxUint16*)buf = *psz;
893 buf += sizeof(wxUint16);
894 }
895 len += sizeof(wxUint16);
896 psz++;
897 }
898 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
899
900 return len;
901}
902
903
904// swap 16bit MB to 16bit String
905size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
906{
907 size_t len=0;
908
909 while (*(wxUint16*)psz && (!buf || len < n))
910 {
911 if (buf)
912 {
913 ((char *)buf)[0] = psz[1];
914 ((char *)buf)[1] = psz[0];
915 buf++;
916 }
917 len++;
918 psz += sizeof(wxUint16);
919 }
920 if (buf && len<n) *buf=0;
921
922 return len;
923}
924
925
926// swap 16bit String to 16bit MB
927size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
928{
929 size_t len=0;
930
931 while (*psz && (!buf || len < n))
932 {
933 if (buf)
934 {
935 *buf++ = ((char*)psz)[1];
936 *buf++ = ((char*)psz)[0];
937 }
938 len += sizeof(wxUint16);
939 psz++;
940 }
941 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
942
943 return len;
944}
945
946
947#else // WC_UTF16
948
949
950// copy 16bit MB to 32bit String
951size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
952{
953 size_t len=0;
954
955 while (*(wxUint16*)psz && (!buf || len < n))
956 {
957 wxUint32 cc;
958 size_t pa=decode_utf16((wxUint16*)psz, cc);
959 if (pa == (size_t)-1)
960 return pa;
961
962 if (buf)
963 *buf++ = cc;
964 len++;
965 psz += pa * sizeof(wxUint16);
966 }
967 if (buf && len<n) *buf=0;
968
969 return len;
970}
971
972
973// copy 32bit String to 16bit MB
974size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
975{
976 size_t len=0;
977
978 while (*psz && (!buf || len < n))
979 {
980 wxUint16 cc[2];
981 size_t pa=encode_utf16(*psz, cc);
982
983 if (pa == (size_t)-1)
984 return pa;
985
986 if (buf)
987 {
988 *(wxUint16*)buf = cc[0];
989 buf += sizeof(wxUint16);
990 if (pa > 1)
991 {
992 *(wxUint16*)buf = cc[1];
993 buf += sizeof(wxUint16);
994 }
995 }
996
997 len += pa*sizeof(wxUint16);
998 psz++;
999 }
1000 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1001
1002 return len;
1003}
1004
1005
1006// swap 16bit MB to 32bit String
1007size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1008{
1009 size_t len=0;
1010
1011 while (*(wxUint16*)psz && (!buf || len < n))
1012 {
1013 wxUint32 cc;
1014 char tmp[4];
1015 tmp[0]=psz[1]; tmp[1]=psz[0];
1016 tmp[2]=psz[3]; tmp[3]=psz[2];
1017
1018 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1019 if (pa == (size_t)-1)
1020 return pa;
1021
1022 if (buf)
1023 *buf++ = cc;
1024
1025 len++;
1026 psz += pa * sizeof(wxUint16);
1027 }
1028 if (buf && len<n) *buf=0;
1029
1030 return len;
1031}
1032
1033
1034// swap 32bit String to 16bit MB
1035size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1036{
1037 size_t len=0;
1038
1039 while (*psz && (!buf || len < n))
1040 {
1041 wxUint16 cc[2];
1042 size_t pa=encode_utf16(*psz, cc);
1043
1044 if (pa == (size_t)-1)
1045 return pa;
1046
1047 if (buf)
1048 {
1049 *buf++ = ((char*)cc)[1];
1050 *buf++ = ((char*)cc)[0];
1051 if (pa > 1)
1052 {
1053 *buf++ = ((char*)cc)[3];
1054 *buf++ = ((char*)cc)[2];
1055 }
1056 }
1057
1058 len += pa*sizeof(wxUint16);
1059 psz++;
1060 }
1061 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1062
1063 return len;
1064}
1065
1066#endif // WC_UTF16
1067
1068
1069// ----------------------------------------------------------------------------
1070// UTF-32
1071// ----------------------------------------------------------------------------
1072
// "straight" is an alias for the UTF-32 converter class selected when the
// byte order matches WORDS_BIGENDIAN (BE when it is defined, LE otherwise) and
// "swap" for the other one; the member functions below are written once under
// these aliases.
1073#ifdef WORDS_BIGENDIAN
1074#define wxMBConvUTF32straight wxMBConvUTF32BE
1075#define wxMBConvUTF32swap wxMBConvUTF32LE
1076#else
1077#define wxMBConvUTF32swap wxMBConvUTF32BE
1078#define wxMBConvUTF32straight wxMBConvUTF32LE
1079#endif
1080
1081
// Global instances of the two UTF-32 converter classes.
1082WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1083WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1084
1085
1086#ifdef WC_UTF16
1087
1088// copy 32bit MB to 16bit String
1089size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1090{
1091 size_t len=0;
1092
1093 while (*(wxUint32*)psz && (!buf || len < n))
1094 {
1095 wxUint16 cc[2];
1096
1097 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1098 if (pa == (size_t)-1)
1099 return pa;
1100
1101 if (buf)
1102 {
1103 *buf++ = cc[0];
1104 if (pa > 1)
1105 *buf++ = cc[1];
1106 }
1107 len += pa;
1108 psz += sizeof(wxUint32);
1109 }
1110 if (buf && len<n) *buf=0;
1111
1112 return len;
1113}
1114
1115
1116// copy 16bit String to 32bit MB
1117size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1118{
1119 size_t len=0;
1120
1121 while (*psz && (!buf || len < n))
1122 {
1123 wxUint32 cc;
1124
1125 // cast is ok for WC_UTF16
1126 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1127 if (pa == (size_t)-1)
1128 return pa;
1129
1130 if (buf)
1131 {
1132 *(wxUint32*)buf = cc;
1133 buf += sizeof(wxUint32);
1134 }
1135 len += sizeof(wxUint32);
1136 psz += pa;
1137 }
1138
1139 if (buf && len<=n-sizeof(wxUint32))
1140 *(wxUint32*)buf=0;
1141
1142 return len;
1143}
1144
1145
1146
1147// swap 32bit MB to 16bit String
1148size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1149{
1150 size_t len=0;
1151
1152 while (*(wxUint32*)psz && (!buf || len < n))
1153 {
1154 char tmp[4];
1155 tmp[0] = psz[3]; tmp[1] = psz[2];
1156 tmp[2] = psz[1]; tmp[3] = psz[0];
1157
1158
1159 wxUint16 cc[2];
1160
1161 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1162 if (pa == (size_t)-1)
1163 return pa;
1164
1165 if (buf)
1166 {
1167 *buf++ = cc[0];
1168 if (pa > 1)
1169 *buf++ = cc[1];
1170 }
1171 len += pa;
1172 psz += sizeof(wxUint32);
1173 }
1174
1175 if (buf && len<n)
1176 *buf=0;
1177
1178 return len;
1179}
1180
1181
1182// swap 16bit String to 32bit MB
1183size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1184{
1185 size_t len=0;
1186
1187 while (*psz && (!buf || len < n))
1188 {
1189 char cc[4];
1190
1191 // cast is ok for WC_UTF16
1192 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1193 if (pa == (size_t)-1)
1194 return pa;
1195
1196 if (buf)
1197 {
1198 *buf++ = cc[3];
1199 *buf++ = cc[2];
1200 *buf++ = cc[1];
1201 *buf++ = cc[0];
1202 }
1203 len += sizeof(wxUint32);
1204 psz += pa;
1205 }
1206
1207 if (buf && len<=n-sizeof(wxUint32))
1208 *(wxUint32*)buf=0;
1209
1210 return len;
1211}
1212
1213#else // WC_UTF16
1214
1215
1216// copy 32bit MB to 32bit String
1217size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1218{
1219 size_t len=0;
1220
1221 while (*(wxUint32*)psz && (!buf || len < n))
1222 {
1223 if (buf)
1224 *buf++ = *(wxUint32*)psz;
1225 len++;
1226 psz += sizeof(wxUint32);
1227 }
1228
1229 if (buf && len<n)
1230 *buf=0;
1231
1232 return len;
1233}
1234
1235
1236// copy 32bit String to 32bit MB
1237size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1238{
1239 size_t len=0;
1240
1241 while (*psz && (!buf || len < n))
1242 {
1243 if (buf)
1244 {
1245 *(wxUint32*)buf = *psz;
1246 buf += sizeof(wxUint32);
1247 }
1248
1249 len += sizeof(wxUint32);
1250 psz++;
1251 }
1252
1253 if (buf && len<=n-sizeof(wxUint32))
1254 *(wxUint32*)buf=0;
1255
1256 return len;
1257}
1258
1259
1260// swap 32bit MB to 32bit String
1261size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1262{
1263 size_t len=0;
1264
1265 while (*(wxUint32*)psz && (!buf || len < n))
1266 {
1267 if (buf)
1268 {
1269 ((char *)buf)[0] = psz[3];
1270 ((char *)buf)[1] = psz[2];
1271 ((char *)buf)[2] = psz[1];
1272 ((char *)buf)[3] = psz[0];
1273 buf++;
1274 }
1275 len++;
1276 psz += sizeof(wxUint32);
1277 }
1278
1279 if (buf && len<n)
1280 *buf=0;
1281
1282 return len;
1283}
1284
1285
1286// swap 32bit String to 32bit MB
1287size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1288{
1289 size_t len=0;
1290
1291 while (*psz && (!buf || len < n))
1292 {
1293 if (buf)
1294 {
1295 *buf++ = ((char *)psz)[3];
1296 *buf++ = ((char *)psz)[2];
1297 *buf++ = ((char *)psz)[1];
1298 *buf++ = ((char *)psz)[0];
1299 }
1300 len += sizeof(wxUint32);
1301 psz++;
1302 }
1303
1304 if (buf && len<=n-sizeof(wxUint32))
1305 *(wxUint32*)buf=0;
1306
1307 return len;
1308}
1309
1310
1311#endif // WC_UTF16
1312
1313
1314// ============================================================================
1315// The classes doing conversion using the iconv_xxx() functions
1316// ============================================================================
1317
1318#ifdef HAVE_ICONV
1319
1320// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1321// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1322// (unless there's yet another bug in glibc) the only case when iconv()
1323// returns with (size_t)-1 (which means error) and says there are 0 bytes
1324// left in the input buffer -- when _real_ error occurs,
1325// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1326// iconv() failure.
1327// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call returning cres, with
// bufLeft bytes remaining in its input buffer, should be treated as an error.
// The arguments are parenthesized so that arbitrary expressions can be passed.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the input buffer pointer to the type this platform's iconv() expects
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1336
1337// ----------------------------------------------------------------------------
1338// wxMBConv_iconv: encapsulates an iconv character set
1339// ----------------------------------------------------------------------------
1340
1341class wxMBConv_iconv : public wxMBConv
1342{
1343public:
1344 wxMBConv_iconv(const wxChar *name);
1345 virtual ~wxMBConv_iconv();
1346
1347 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1348 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1349
1350 bool IsOk() const
1351 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1352
1353protected:
1354 // the iconv handlers used to translate from multibyte to wide char and in
1355 // the other direction
1356 iconv_t m2w,
1357 w2m;
1358#if wxUSE_THREADS
1359 // guards access to m2w and w2m objects
1360 wxMutex m_iconvMutex;
1361#endif
1362
1363private:
1364 // the name (for iconv_open()) of a wide char charset -- if none is
1365 // available on this machine, it will remain NULL
1366 static const char *ms_wcCharsetName;
1367
1368 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1369 // different endian-ness than the native one
1370 static bool ms_wcNeedsSwap;
1371};
1372
1373const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1374bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1375
1376wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1377{
1378 // Do it the hard way
1379 char cname[100];
1380 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1381 cname[i] = (char) name[i];
1382
1383 // check for charset that represents wchar_t:
1384 if (ms_wcCharsetName == NULL)
1385 {
1386 ms_wcNeedsSwap = false;
1387
1388 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1389 ms_wcCharsetName = WC_NAME_BEST;
1390 m2w = iconv_open(ms_wcCharsetName, cname);
1391
1392 if (m2w == (iconv_t)-1)
1393 {
1394 // try charset w/o bytesex info (e.g. "UCS4")
1395 // and check for bytesex ourselves:
1396 ms_wcCharsetName = WC_NAME;
1397 m2w = iconv_open(ms_wcCharsetName, cname);
1398
1399 // last bet, try if it knows WCHAR_T pseudo-charset
1400 if (m2w == (iconv_t)-1)
1401 {
1402 ms_wcCharsetName = "WCHAR_T";
1403 m2w = iconv_open(ms_wcCharsetName, cname);
1404 }
1405
1406 if (m2w != (iconv_t)-1)
1407 {
1408 char buf[2], *bufPtr;
1409 wchar_t wbuf[2], *wbufPtr;
1410 size_t insz, outsz;
1411 size_t res;
1412
1413 buf[0] = 'A';
1414 buf[1] = 0;
1415 wbuf[0] = 0;
1416 insz = 2;
1417 outsz = SIZEOF_WCHAR_T * 2;
1418 wbufPtr = wbuf;
1419 bufPtr = buf;
1420
1421 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1422 (char**)&wbufPtr, &outsz);
1423
1424 if (ICONV_FAILED(res, insz))
1425 {
1426 ms_wcCharsetName = NULL;
1427 wxLogLastError(wxT("iconv"));
1428 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1429 }
1430 else
1431 {
1432 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1433 }
1434 }
1435 else
1436 {
1437 ms_wcCharsetName = NULL;
1438
1439 // VS: we must not output an error here, since wxWidgets will safely
1440 // fall back to using wxEncodingConverter.
1441 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1442 //wxLogError(
1443 }
1444 }
1445 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1446 }
1447 else // we already have ms_wcCharsetName
1448 {
1449 m2w = iconv_open(ms_wcCharsetName, cname);
1450 }
1451
1452 // NB: don't ever pass NULL to iconv_open(), it may crash!
1453 if ( ms_wcCharsetName )
1454 {
1455 w2m = iconv_open( cname, ms_wcCharsetName);
1456 }
1457 else
1458 {
1459 w2m = (iconv_t)-1;
1460 }
1461}
1462
1463wxMBConv_iconv::~wxMBConv_iconv()
1464{
1465 if ( m2w != (iconv_t)-1 )
1466 iconv_close(m2w);
1467 if ( w2m != (iconv_t)-1 )
1468 iconv_close(w2m);
1469}
1470
1471size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1472{
1473#if wxUSE_THREADS
1474 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1475 // Unfortunately there is a couple of global wxCSConv objects such as
1476 // wxConvLocal that are used all over wx code, so we have to make sure
1477 // the handle is used by at most one thread at the time. Otherwise
1478 // only a few wx classes would be safe to use from non-main threads
1479 // as MB<->WC conversion would fail "randomly".
1480 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1481#endif
1482
1483 size_t inbuf = strlen(psz);
1484 size_t outbuf = n * SIZEOF_WCHAR_T;
1485 size_t res, cres;
1486 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1487 wchar_t *bufPtr = buf;
1488 const char *pszPtr = psz;
1489
1490 if (buf)
1491 {
1492 // have destination buffer, convert there
1493 cres = iconv(m2w,
1494 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1495 (char**)&bufPtr, &outbuf);
1496 res = n - (outbuf / SIZEOF_WCHAR_T);
1497
1498 if (ms_wcNeedsSwap)
1499 {
1500 // convert to native endianness
1501 WC_BSWAP(buf /* _not_ bufPtr */, res)
1502 }
1503
1504 // NB: iconv was given only strlen(psz) characters on input, and so
1505 // it couldn't convert the trailing zero. Let's do it ourselves
1506 // if there's some room left for it in the output buffer.
1507 if (res < n)
1508 buf[res] = 0;
1509 }
1510 else
1511 {
1512 // no destination buffer... convert using temp buffer
1513 // to calculate destination buffer requirement
1514 wchar_t tbuf[8];
1515 res = 0;
1516 do {
1517 bufPtr = tbuf;
1518 outbuf = 8*SIZEOF_WCHAR_T;
1519
1520 cres = iconv(m2w,
1521 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1522 (char**)&bufPtr, &outbuf );
1523
1524 res += 8-(outbuf/SIZEOF_WCHAR_T);
1525 } while ((cres==(size_t)-1) && (errno==E2BIG));
1526 }
1527
1528 if (ICONV_FAILED(cres, inbuf))
1529 {
1530 //VS: it is ok if iconv fails, hence trace only
1531 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1532 return (size_t)-1;
1533 }
1534
1535 return res;
1536}
1537
1538size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1539{
1540#if wxUSE_THREADS
1541 // NB: explained in MB2WC
1542 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1543#endif
1544
1545 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1546 size_t outbuf = n;
1547 size_t res, cres;
1548
1549 wchar_t *tmpbuf = 0;
1550
1551 if (ms_wcNeedsSwap)
1552 {
1553 // need to copy to temp buffer to switch endianness
1554 // this absolutely doesn't rock!
1555 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1556 // could be in read-only memory, or be accessed in some other thread)
1557 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1558 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1559 WC_BSWAP(tmpbuf, inbuf)
1560 psz=tmpbuf;
1561 }
1562
1563 if (buf)
1564 {
1565 // have destination buffer, convert there
1566 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1567
1568 res = n-outbuf;
1569
1570 // NB: iconv was given only wcslen(psz) characters on input, and so
1571 // it couldn't convert the trailing zero. Let's do it ourselves
1572 // if there's some room left for it in the output buffer.
1573 if (res < n)
1574 buf[0] = 0;
1575 }
1576 else
1577 {
1578 // no destination buffer... convert using temp buffer
1579 // to calculate destination buffer requirement
1580 char tbuf[16];
1581 res = 0;
1582 do {
1583 buf = tbuf; outbuf = 16;
1584
1585 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1586
1587 res += 16 - outbuf;
1588 } while ((cres==(size_t)-1) && (errno==E2BIG));
1589 }
1590
1591 if (ms_wcNeedsSwap)
1592 {
1593 free(tmpbuf);
1594 }
1595
1596 if (ICONV_FAILED(cres, inbuf))
1597 {
1598 //VS: it is ok if iconv fails, hence trace only
1599 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1600 return (size_t)-1;
1601 }
1602
1603 return res;
1604}
1605
1606#endif // HAVE_ICONV
1607
1608
1609// ============================================================================
1610// Win32 conversion classes
1611// ============================================================================
1612
1613#ifdef wxHAVE_WIN32_MB2WC
1614
1615// from utils.cpp
1616#if wxUSE_FONTMAP
1617extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1618extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1619#endif
1620
1621class wxMBConv_win32 : public wxMBConv
1622{
1623public:
1624 wxMBConv_win32()
1625 {
1626 m_CodePage = CP_ACP;
1627 }
1628
1629#if wxUSE_FONTMAP
1630 wxMBConv_win32(const wxChar* name)
1631 {
1632 m_CodePage = wxCharsetToCodepage(name);
1633 }
1634
1635 wxMBConv_win32(wxFontEncoding encoding)
1636 {
1637 m_CodePage = wxEncodingToCodepage(encoding);
1638 }
1639#endif
1640
1641 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1642 {
1643 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1644 // the behaviour is not compatible with the Unix version (using iconv)
1645 // and break the library itself, e.g. wxTextInputStream::NextChar()
1646 // wouldn't work if reading an incomplete MB char didn't result in an
1647 // error
1648 //
1649 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1650 // an error (tested under Windows Server 2003) and apparently it is
1651 // done on purpose, i.e. the function accepts any input in this case
1652 // and although I'd prefer to return error on ill-formed output, our
1653 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1654 // explicitly ill-formed according to RFC 2152) neither so we don't
1655 // even have any fallback here...
1656 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1657
1658 const size_t len = ::MultiByteToWideChar
1659 (
1660 m_CodePage, // code page
1661 flags, // flags: fall on error
1662 psz, // input string
1663 -1, // its length (NUL-terminated)
1664 buf, // output string
1665 buf ? n : 0 // size of output buffer
1666 );
1667
1668 // note that it returns count of written chars for buf != NULL and size
1669 // of the needed buffer for buf == NULL so in either case the length of
1670 // the string (which never includes the terminating NUL) is one less
1671 return len ? len - 1 : (size_t)-1;
1672 }
1673
1674 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1675 {
1676 /*
1677 we have a problem here: by default, WideCharToMultiByte() may
1678 replace characters unrepresentable in the target code page with bad
1679 quality approximations such as turning "1/2" symbol (U+00BD) into
1680 "1" for the code pages which don't have it and we, obviously, want
1681 to avoid this at any price
1682
1683 the trouble is that this function does it _silently_, i.e. it won't
1684 even tell us whether it did or not... Win98/2000 and higher provide
1685 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1686 we have to resort to a round trip, i.e. check that converting back
1687 results in the same string -- this is, of course, expensive but
1688 otherwise we simply can't be sure to not garble the data.
1689 */
1690
1691 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1692 // it doesn't work with CJK encodings (which we test for rather roughly
1693 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1694 // supporting it
1695 BOOL usedDef wxDUMMY_INITIALIZE(false);
1696 BOOL *pUsedDef;
1697 int flags;
1698 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1699 {
1700 // it's our lucky day
1701 flags = WC_NO_BEST_FIT_CHARS;
1702 pUsedDef = &usedDef;
1703 }
1704 else // old system or unsupported encoding
1705 {
1706 flags = 0;
1707 pUsedDef = NULL;
1708 }
1709
1710 const size_t len = ::WideCharToMultiByte
1711 (
1712 m_CodePage, // code page
1713 flags, // either none or no best fit
1714 pwz, // input string
1715 -1, // it is (wide) NUL-terminated
1716 buf, // output buffer
1717 buf ? n : 0, // and its size
1718 NULL, // default "replacement" char
1719 pUsedDef // [out] was it used?
1720 );
1721
1722 if ( !len )
1723 {
1724 // function totally failed
1725 return (size_t)-1;
1726 }
1727
1728 // if we were really converting, check if we succeeded
1729 if ( buf )
1730 {
1731 if ( flags )
1732 {
1733 // check if the conversion failed, i.e. if any replacements
1734 // were done
1735 if ( usedDef )
1736 return (size_t)-1;
1737 }
1738 else // we must resort to double tripping...
1739 {
1740 wxWCharBuffer wcBuf(n);
1741 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1742 wcscmp(wcBuf, pwz) != 0 )
1743 {
1744 // we didn't obtain the same thing we started from, hence
1745 // the conversion was lossy and we consider that it failed
1746 return (size_t)-1;
1747 }
1748 }
1749 }
1750
1751 // see the comment above for the reason of "len - 1"
1752 return len - 1;
1753 }
1754
1755 bool IsOk() const { return m_CodePage != -1; }
1756
1757private:
1758 static bool CanUseNoBestFit()
1759 {
1760 static int s_isWin98Or2k = -1;
1761
1762 if ( s_isWin98Or2k == -1 )
1763 {
1764 int verMaj, verMin;
1765 switch ( wxGetOsVersion(&verMaj, &verMin) )
1766 {
1767 case wxWIN95:
1768 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1769 break;
1770
1771 case wxWINDOWS_NT:
1772 s_isWin98Or2k = verMaj >= 5;
1773 break;
1774
1775 default:
1776 // unknown, be conseravtive by default
1777 s_isWin98Or2k = 0;
1778 }
1779
1780 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1781 }
1782
1783 return s_isWin98Or2k == 1;
1784 }
1785
1786 long m_CodePage;
1787};
1788
1789#endif // wxHAVE_WIN32_MB2WC
1790
1791// ============================================================================
1792// Cocoa conversion classes
1793// ============================================================================
1794
1795#if defined(__WXCOCOA__)
1796
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1800
1801#include <CoreFoundation/CFString.h>
1802#include <CoreFoundation/CFStringEncodingExt.h>
1803
1804CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1805{
1806 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1807 if ( encoding == wxFONTENCODING_DEFAULT )
1808 {
1809 enc = CFStringGetSystemEncoding();
1810 }
1811 else switch( encoding)
1812 {
1813 case wxFONTENCODING_ISO8859_1 :
1814 enc = kCFStringEncodingISOLatin1 ;
1815 break ;
1816 case wxFONTENCODING_ISO8859_2 :
1817 enc = kCFStringEncodingISOLatin2;
1818 break ;
1819 case wxFONTENCODING_ISO8859_3 :
1820 enc = kCFStringEncodingISOLatin3 ;
1821 break ;
1822 case wxFONTENCODING_ISO8859_4 :
1823 enc = kCFStringEncodingISOLatin4;
1824 break ;
1825 case wxFONTENCODING_ISO8859_5 :
1826 enc = kCFStringEncodingISOLatinCyrillic;
1827 break ;
1828 case wxFONTENCODING_ISO8859_6 :
1829 enc = kCFStringEncodingISOLatinArabic;
1830 break ;
1831 case wxFONTENCODING_ISO8859_7 :
1832 enc = kCFStringEncodingISOLatinGreek;
1833 break ;
1834 case wxFONTENCODING_ISO8859_8 :
1835 enc = kCFStringEncodingISOLatinHebrew;
1836 break ;
1837 case wxFONTENCODING_ISO8859_9 :
1838 enc = kCFStringEncodingISOLatin5;
1839 break ;
1840 case wxFONTENCODING_ISO8859_10 :
1841 enc = kCFStringEncodingISOLatin6;
1842 break ;
1843 case wxFONTENCODING_ISO8859_11 :
1844 enc = kCFStringEncodingISOLatinThai;
1845 break ;
1846 case wxFONTENCODING_ISO8859_13 :
1847 enc = kCFStringEncodingISOLatin7;
1848 break ;
1849 case wxFONTENCODING_ISO8859_14 :
1850 enc = kCFStringEncodingISOLatin8;
1851 break ;
1852 case wxFONTENCODING_ISO8859_15 :
1853 enc = kCFStringEncodingISOLatin9;
1854 break ;
1855
1856 case wxFONTENCODING_KOI8 :
1857 enc = kCFStringEncodingKOI8_R;
1858 break ;
1859 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1860 enc = kCFStringEncodingDOSRussian;
1861 break ;
1862
1863// case wxFONTENCODING_BULGARIAN :
1864// enc = ;
1865// break ;
1866
1867 case wxFONTENCODING_CP437 :
1868 enc =kCFStringEncodingDOSLatinUS ;
1869 break ;
1870 case wxFONTENCODING_CP850 :
1871 enc = kCFStringEncodingDOSLatin1;
1872 break ;
1873 case wxFONTENCODING_CP852 :
1874 enc = kCFStringEncodingDOSLatin2;
1875 break ;
1876 case wxFONTENCODING_CP855 :
1877 enc = kCFStringEncodingDOSCyrillic;
1878 break ;
1879 case wxFONTENCODING_CP866 :
1880 enc =kCFStringEncodingDOSRussian ;
1881 break ;
1882 case wxFONTENCODING_CP874 :
1883 enc = kCFStringEncodingDOSThai;
1884 break ;
1885 case wxFONTENCODING_CP932 :
1886 enc = kCFStringEncodingDOSJapanese;
1887 break ;
1888 case wxFONTENCODING_CP936 :
1889 enc =kCFStringEncodingDOSChineseSimplif ;
1890 break ;
1891 case wxFONTENCODING_CP949 :
1892 enc = kCFStringEncodingDOSKorean;
1893 break ;
1894 case wxFONTENCODING_CP950 :
1895 enc = kCFStringEncodingDOSChineseTrad;
1896 break ;
1897 case wxFONTENCODING_CP1250 :
1898 enc = kCFStringEncodingWindowsLatin2;
1899 break ;
1900 case wxFONTENCODING_CP1251 :
1901 enc =kCFStringEncodingWindowsCyrillic ;
1902 break ;
1903 case wxFONTENCODING_CP1252 :
1904 enc =kCFStringEncodingWindowsLatin1 ;
1905 break ;
1906 case wxFONTENCODING_CP1253 :
1907 enc = kCFStringEncodingWindowsGreek;
1908 break ;
1909 case wxFONTENCODING_CP1254 :
1910 enc = kCFStringEncodingWindowsLatin5;
1911 break ;
1912 case wxFONTENCODING_CP1255 :
1913 enc =kCFStringEncodingWindowsHebrew ;
1914 break ;
1915 case wxFONTENCODING_CP1256 :
1916 enc =kCFStringEncodingWindowsArabic ;
1917 break ;
1918 case wxFONTENCODING_CP1257 :
1919 enc = kCFStringEncodingWindowsBalticRim;
1920 break ;
1921// This only really encodes to UTF7 (if that) evidently
1922// case wxFONTENCODING_UTF7 :
1923// enc = kCFStringEncodingNonLossyASCII ;
1924// break ;
1925 case wxFONTENCODING_UTF8 :
1926 enc = kCFStringEncodingUTF8 ;
1927 break ;
1928 case wxFONTENCODING_EUC_JP :
1929 enc = kCFStringEncodingEUC_JP;
1930 break ;
1931 case wxFONTENCODING_UTF16 :
1932 enc = kCFStringEncodingUnicode ;
1933 break ;
1934 case wxFONTENCODING_MACROMAN :
1935 enc = kCFStringEncodingMacRoman ;
1936 break ;
1937 case wxFONTENCODING_MACJAPANESE :
1938 enc = kCFStringEncodingMacJapanese ;
1939 break ;
1940 case wxFONTENCODING_MACCHINESETRAD :
1941 enc = kCFStringEncodingMacChineseTrad ;
1942 break ;
1943 case wxFONTENCODING_MACKOREAN :
1944 enc = kCFStringEncodingMacKorean ;
1945 break ;
1946 case wxFONTENCODING_MACARABIC :
1947 enc = kCFStringEncodingMacArabic ;
1948 break ;
1949 case wxFONTENCODING_MACHEBREW :
1950 enc = kCFStringEncodingMacHebrew ;
1951 break ;
1952 case wxFONTENCODING_MACGREEK :
1953 enc = kCFStringEncodingMacGreek ;
1954 break ;
1955 case wxFONTENCODING_MACCYRILLIC :
1956 enc = kCFStringEncodingMacCyrillic ;
1957 break ;
1958 case wxFONTENCODING_MACDEVANAGARI :
1959 enc = kCFStringEncodingMacDevanagari ;
1960 break ;
1961 case wxFONTENCODING_MACGURMUKHI :
1962 enc = kCFStringEncodingMacGurmukhi ;
1963 break ;
1964 case wxFONTENCODING_MACGUJARATI :
1965 enc = kCFStringEncodingMacGujarati ;
1966 break ;
1967 case wxFONTENCODING_MACORIYA :
1968 enc = kCFStringEncodingMacOriya ;
1969 break ;
1970 case wxFONTENCODING_MACBENGALI :
1971 enc = kCFStringEncodingMacBengali ;
1972 break ;
1973 case wxFONTENCODING_MACTAMIL :
1974 enc = kCFStringEncodingMacTamil ;
1975 break ;
1976 case wxFONTENCODING_MACTELUGU :
1977 enc = kCFStringEncodingMacTelugu ;
1978 break ;
1979 case wxFONTENCODING_MACKANNADA :
1980 enc = kCFStringEncodingMacKannada ;
1981 break ;
1982 case wxFONTENCODING_MACMALAJALAM :
1983 enc = kCFStringEncodingMacMalayalam ;
1984 break ;
1985 case wxFONTENCODING_MACSINHALESE :
1986 enc = kCFStringEncodingMacSinhalese ;
1987 break ;
1988 case wxFONTENCODING_MACBURMESE :
1989 enc = kCFStringEncodingMacBurmese ;
1990 break ;
1991 case wxFONTENCODING_MACKHMER :
1992 enc = kCFStringEncodingMacKhmer ;
1993 break ;
1994 case wxFONTENCODING_MACTHAI :
1995 enc = kCFStringEncodingMacThai ;
1996 break ;
1997 case wxFONTENCODING_MACLAOTIAN :
1998 enc = kCFStringEncodingMacLaotian ;
1999 break ;
2000 case wxFONTENCODING_MACGEORGIAN :
2001 enc = kCFStringEncodingMacGeorgian ;
2002 break ;
2003 case wxFONTENCODING_MACARMENIAN :
2004 enc = kCFStringEncodingMacArmenian ;
2005 break ;
2006 case wxFONTENCODING_MACCHINESESIMP :
2007 enc = kCFStringEncodingMacChineseSimp ;
2008 break ;
2009 case wxFONTENCODING_MACTIBETAN :
2010 enc = kCFStringEncodingMacTibetan ;
2011 break ;
2012 case wxFONTENCODING_MACMONGOLIAN :
2013 enc = kCFStringEncodingMacMongolian ;
2014 break ;
2015 case wxFONTENCODING_MACETHIOPIC :
2016 enc = kCFStringEncodingMacEthiopic ;
2017 break ;
2018 case wxFONTENCODING_MACCENTRALEUR :
2019 enc = kCFStringEncodingMacCentralEurRoman ;
2020 break ;
2021 case wxFONTENCODING_MACVIATNAMESE :
2022 enc = kCFStringEncodingMacVietnamese ;
2023 break ;
2024 case wxFONTENCODING_MACARABICEXT :
2025 enc = kCFStringEncodingMacExtArabic ;
2026 break ;
2027 case wxFONTENCODING_MACSYMBOL :
2028 enc = kCFStringEncodingMacSymbol ;
2029 break ;
2030 case wxFONTENCODING_MACDINGBATS :
2031 enc = kCFStringEncodingMacDingbats ;
2032 break ;
2033 case wxFONTENCODING_MACTURKISH :
2034 enc = kCFStringEncodingMacTurkish ;
2035 break ;
2036 case wxFONTENCODING_MACCROATIAN :
2037 enc = kCFStringEncodingMacCroatian ;
2038 break ;
2039 case wxFONTENCODING_MACICELANDIC :
2040 enc = kCFStringEncodingMacIcelandic ;
2041 break ;
2042 case wxFONTENCODING_MACROMANIAN :
2043 enc = kCFStringEncodingMacRomanian ;
2044 break ;
2045 case wxFONTENCODING_MACCELTIC :
2046 enc = kCFStringEncodingMacCeltic ;
2047 break ;
2048 case wxFONTENCODING_MACGAELIC :
2049 enc = kCFStringEncodingMacGaelic ;
2050 break ;
2051// case wxFONTENCODING_MACKEYBOARD :
2052// enc = kCFStringEncodingMacKeyboardGlyphs ;
2053// break ;
2054 default :
2055 // because gcc is picky
2056 break ;
2057 } ;
2058 return enc ;
2059}
2060
2061class wxMBConv_cocoa : public wxMBConv
2062{
2063public:
2064 wxMBConv_cocoa()
2065 {
2066 Init(CFStringGetSystemEncoding()) ;
2067 }
2068
2069#if wxUSE_FONTMAP
2070 wxMBConv_cocoa(const wxChar* name)
2071 {
2072 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2073 }
2074#endif
2075
2076 wxMBConv_cocoa(wxFontEncoding encoding)
2077 {
2078 Init( wxCFStringEncFromFontEnc(encoding) );
2079 }
2080
2081 ~wxMBConv_cocoa()
2082 {
2083 }
2084
2085 void Init( CFStringEncoding encoding)
2086 {
2087 m_encoding = encoding ;
2088 }
2089
2090 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2091 {
2092 wxASSERT(szUnConv);
2093
2094 CFStringRef theString = CFStringCreateWithBytes (
2095 NULL, //the allocator
2096 (const UInt8*)szUnConv,
2097 strlen(szUnConv),
2098 m_encoding,
2099 false //no BOM/external representation
2100 );
2101
2102 wxASSERT(theString);
2103
2104 size_t nOutLength = CFStringGetLength(theString);
2105
2106 if (szOut == NULL)
2107 {
2108 CFRelease(theString);
2109 return nOutLength;
2110 }
2111
2112 CFRange theRange = { 0, nOutSize };
2113
2114#if SIZEOF_WCHAR_T == 4
2115 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2116#endif
2117
2118 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2119
2120 CFRelease(theString);
2121
2122 szUniCharBuffer[nOutLength] = '\0' ;
2123
2124#if SIZEOF_WCHAR_T == 4
2125 wxMBConvUTF16 converter ;
2126 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2127 delete[] szUniCharBuffer;
2128#endif
2129
2130 return nOutLength;
2131 }
2132
2133 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2134 {
2135 wxASSERT(szUnConv);
2136
2137 size_t nRealOutSize;
2138 size_t nBufSize = wxWcslen(szUnConv);
2139 UniChar* szUniBuffer = (UniChar*) szUnConv;
2140
2141#if SIZEOF_WCHAR_T == 4
2142 wxMBConvUTF16BE converter ;
2143 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2144 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2145 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2146 nBufSize /= sizeof(UniChar);
2147#endif
2148
2149 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2150 NULL, //allocator
2151 szUniBuffer,
2152 nBufSize,
2153 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2154 );
2155
2156 wxASSERT(theString);
2157
2158 //Note that CER puts a BOM when converting to unicode
2159 //so we check and use getchars instead in that case
2160 if (m_encoding == kCFStringEncodingUnicode)
2161 {
2162 if (szOut != NULL)
2163 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2164
2165 nRealOutSize = CFStringGetLength(theString) + 1;
2166 }
2167 else
2168 {
2169 CFStringGetBytes(
2170 theString,
2171 CFRangeMake(0, CFStringGetLength(theString)),
2172 m_encoding,
2173 0, //what to put in characters that can't be converted -
2174 //0 tells CFString to return NULL if it meets such a character
2175 false, //not an external representation
2176 (UInt8*) szOut,
2177 nOutSize,
2178 (CFIndex*) &nRealOutSize
2179 );
2180 }
2181
2182 CFRelease(theString);
2183
2184#if SIZEOF_WCHAR_T == 4
2185 delete[] szUniBuffer;
2186#endif
2187
2188 return nRealOutSize - 1;
2189 }
2190
2191 bool IsOk() const
2192 {
2193 return m_encoding != kCFStringEncodingInvalidId &&
2194 CFStringIsEncodingAvailable(m_encoding);
2195 }
2196
2197private:
2198 CFStringEncoding m_encoding ;
2199};
2200
2201#endif // defined(__WXCOCOA__)
2202
2203// ============================================================================
2204// Mac conversion classes
2205// ============================================================================
2206
2207#if defined(__WXMAC__) && defined(TARGET_CARBON)
2208
2209class wxMBConv_mac : public wxMBConv
2210{
2211public:
2212 wxMBConv_mac()
2213 {
2214 Init(CFStringGetSystemEncoding()) ;
2215 }
2216
2217#if wxUSE_FONTMAP
2218 wxMBConv_mac(const wxChar* name)
2219 {
2220 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2221 }
2222#endif
2223
2224 wxMBConv_mac(wxFontEncoding encoding)
2225 {
2226 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2227 }
2228
2229 ~wxMBConv_mac()
2230 {
2231 OSStatus status = noErr ;
2232 status = TECDisposeConverter(m_MB2WC_converter);
2233 status = TECDisposeConverter(m_WC2MB_converter);
2234 }
2235
2236
2237 void Init( TextEncodingBase encoding)
2238 {
2239 OSStatus status = noErr ;
2240 m_char_encoding = encoding ;
2241 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2242
2243 status = TECCreateConverter(&m_MB2WC_converter,
2244 m_char_encoding,
2245 m_unicode_encoding);
2246 status = TECCreateConverter(&m_WC2MB_converter,
2247 m_unicode_encoding,
2248 m_char_encoding);
2249 }
2250
2251 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2252 {
2253 OSStatus status = noErr ;
2254 ByteCount byteOutLen ;
2255 ByteCount byteInLen = strlen(psz) ;
2256 wchar_t *tbuf = NULL ;
2257 UniChar* ubuf = NULL ;
2258 size_t res = 0 ;
2259
2260 if (buf == NULL)
2261 {
2262 //apple specs say at least 32
2263 n = wxMax( 32 , byteInLen ) ;
2264 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2265 }
2266 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2267#if SIZEOF_WCHAR_T == 4
2268 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2269#else
2270 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2271#endif
2272 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2273 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2274#if SIZEOF_WCHAR_T == 4
2275 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2276 // is not properly terminated we get random characters at the end
2277 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2278 wxMBConvUTF16BE converter ;
2279 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2280 free( ubuf ) ;
2281#else
2282 res = byteOutLen / sizeof( UniChar ) ;
2283#endif
2284 if ( buf == NULL )
2285 free(tbuf) ;
2286
2287 if ( buf && res < n)
2288 buf[res] = 0;
2289
2290 return res ;
2291 }
2292
2293 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2294 {
2295 OSStatus status = noErr ;
2296 ByteCount byteOutLen ;
2297 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2298
2299 char *tbuf = NULL ;
2300
2301 if (buf == NULL)
2302 {
2303 //apple specs say at least 32
2304 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2305 tbuf = (char*) malloc( n ) ;
2306 }
2307
2308 ByteCount byteBufferLen = n ;
2309 UniChar* ubuf = NULL ;
2310#if SIZEOF_WCHAR_T == 4
2311 wxMBConvUTF16BE converter ;
2312 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2313 byteInLen = unicharlen ;
2314 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2315 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2316#else
2317 ubuf = (UniChar*) psz ;
2318#endif
2319 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2320 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2321#if SIZEOF_WCHAR_T == 4
2322 free( ubuf ) ;
2323#endif
2324 if ( buf == NULL )
2325 free(tbuf) ;
2326
2327 size_t res = byteOutLen ;
2328 if ( buf && res < n)
2329 {
2330 buf[res] = 0;
2331
2332 //we need to double-trip to verify it didn't insert any ? in place
2333 //of bogus characters
2334 wxWCharBuffer wcBuf(n);
2335 size_t pszlen = wxWcslen(psz);
2336 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2337 wxWcslen(wcBuf) != pszlen ||
2338 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2339 {
2340 // we didn't obtain the same thing we started from, hence
2341 // the conversion was lossy and we consider that it failed
2342 return (size_t)-1;
2343 }
2344 }
2345
2346 return res ;
2347 }
2348
2349 bool IsOk() const
2350 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2351
2352private:
2353 TECObjectRef m_MB2WC_converter ;
2354 TECObjectRef m_WC2MB_converter ;
2355
2356 TextEncodingBase m_char_encoding ;
2357 TextEncodingBase m_unicode_encoding ;
2358};
2359
2360#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2361
2362// ============================================================================
2363// wxEncodingConverter based conversion classes
2364// ============================================================================
2365
2366#if wxUSE_FONTMAP
2367
2368class wxMBConv_wxwin : public wxMBConv
2369{
2370private:
2371 void Init()
2372 {
2373 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2374 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2375 }
2376
2377public:
2378 // temporarily just use wxEncodingConverter stuff,
2379 // so that it works while a better implementation is built
2380 wxMBConv_wxwin(const wxChar* name)
2381 {
2382 if (name)
2383 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2384 else
2385 m_enc = wxFONTENCODING_SYSTEM;
2386
2387 Init();
2388 }
2389
2390 wxMBConv_wxwin(wxFontEncoding enc)
2391 {
2392 m_enc = enc;
2393
2394 Init();
2395 }
2396
2397 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2398 {
2399 size_t inbuf = strlen(psz);
2400 if (buf)
2401 {
2402 if (!m2w.Convert(psz,buf))
2403 return (size_t)-1;
2404 }
2405 return inbuf;
2406 }
2407
2408 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2409 {
2410 const size_t inbuf = wxWcslen(psz);
2411 if (buf)
2412 {
2413 if (!w2m.Convert(psz,buf))
2414 return (size_t)-1;
2415 }
2416
2417 return inbuf;
2418 }
2419
2420 bool IsOk() const { return m_ok; }
2421
2422public:
2423 wxFontEncoding m_enc;
2424 wxEncodingConverter m2w, w2m;
2425
2426 // were we initialized successfully?
2427 bool m_ok;
2428
2429 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2430};
2431
2432#endif // wxUSE_FONTMAP
2433
2434// ============================================================================
2435// wxCSConv implementation
2436// ============================================================================
2437
2438void wxCSConv::Init()
2439{
2440 m_name = NULL;
2441 m_convReal = NULL;
2442 m_deferred = true;
2443}
2444
2445wxCSConv::wxCSConv(const wxChar *charset)
2446{
2447 Init();
2448
2449 if ( charset )
2450 {
2451 SetName(charset);
2452 }
2453
2454 m_encoding = wxFONTENCODING_SYSTEM;
2455}
2456
2457wxCSConv::wxCSConv(wxFontEncoding encoding)
2458{
2459 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2460 {
2461 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2462
2463 encoding = wxFONTENCODING_SYSTEM;
2464 }
2465
2466 Init();
2467
2468 m_encoding = encoding;
2469}
2470
2471wxCSConv::~wxCSConv()
2472{
2473 Clear();
2474}
2475
2476wxCSConv::wxCSConv(const wxCSConv& conv)
2477 : wxMBConv()
2478{
2479 Init();
2480
2481 SetName(conv.m_name);
2482 m_encoding = conv.m_encoding;
2483}
2484
2485wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2486{
2487 Clear();
2488
2489 SetName(conv.m_name);
2490 m_encoding = conv.m_encoding;
2491
2492 return *this;
2493}
2494
2495void wxCSConv::Clear()
2496{
2497 free(m_name);
2498 delete m_convReal;
2499
2500 m_name = NULL;
2501 m_convReal = NULL;
2502}
2503
2504void wxCSConv::SetName(const wxChar *charset)
2505{
2506 if (charset)
2507 {
2508 m_name = wxStrdup(charset);
2509 m_deferred = true;
2510 }
2511}
2512
2513wxMBConv *wxCSConv::DoCreate() const
2514{
2515 // check for the special case of ASCII or ISO8859-1 charset: as we have
2516 // special knowledge of it anyhow, we don't need to create a special
2517 // conversion object
2518 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2519 {
2520 // don't convert at all
2521 return NULL;
2522 }
2523
2524 // we trust OS to do conversion better than we can so try external
2525 // conversion methods first
2526 //
2527 // the full order is:
2528 // 1. OS conversion (iconv() under Unix or Win32 API)
2529 // 2. hard coded conversions for UTF
2530 // 3. wxEncodingConverter as fall back
2531
2532 // step (1)
2533#ifdef HAVE_ICONV
2534#if !wxUSE_FONTMAP
2535 if ( m_name )
2536#endif // !wxUSE_FONTMAP
2537 {
2538 wxString name(m_name);
2539
2540#if wxUSE_FONTMAP
2541 if ( name.empty() )
2542 name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2543#endif // wxUSE_FONTMAP
2544
2545 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2546 if ( conv->IsOk() )
2547 return conv;
2548
2549 delete conv;
2550 }
2551#endif // HAVE_ICONV
2552
2553#ifdef wxHAVE_WIN32_MB2WC
2554 {
2555#if wxUSE_FONTMAP
2556 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2557 : new wxMBConv_win32(m_encoding);
2558 if ( conv->IsOk() )
2559 return conv;
2560
2561 delete conv;
2562#else
2563 return NULL;
2564#endif
2565 }
2566#endif // wxHAVE_WIN32_MB2WC
2567#if defined(__WXMAC__)
2568 {
2569 // leave UTF16 and UTF32 to the built-ins of wx
2570 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2571 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2572 {
2573
2574#if wxUSE_FONTMAP
2575 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2576 : new wxMBConv_mac(m_encoding);
2577#else
2578 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2579#endif
2580 if ( conv->IsOk() )
2581 return conv;
2582
2583 delete conv;
2584 }
2585 }
2586#endif
2587#if defined(__WXCOCOA__)
2588 {
2589 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2590 {
2591
2592#if wxUSE_FONTMAP
2593 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2594 : new wxMBConv_cocoa(m_encoding);
2595#else
2596 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2597#endif
2598 if ( conv->IsOk() )
2599 return conv;
2600
2601 delete conv;
2602 }
2603 }
2604#endif
2605 // step (2)
2606 wxFontEncoding enc = m_encoding;
2607#if wxUSE_FONTMAP
2608 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2609 {
2610 // use "false" to suppress interactive dialogs -- we can be called from
2611 // anywhere and popping up a dialog from here is the last thing we want to
2612 // do
2613 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2614 }
2615#endif // wxUSE_FONTMAP
2616
2617 switch ( enc )
2618 {
2619 case wxFONTENCODING_UTF7:
2620 return new wxMBConvUTF7;
2621
2622 case wxFONTENCODING_UTF8:
2623 return new wxMBConvUTF8;
2624
2625 case wxFONTENCODING_UTF16BE:
2626 return new wxMBConvUTF16BE;
2627
2628 case wxFONTENCODING_UTF16LE:
2629 return new wxMBConvUTF16LE;
2630
2631 case wxFONTENCODING_UTF32BE:
2632 return new wxMBConvUTF32BE;
2633
2634 case wxFONTENCODING_UTF32LE:
2635 return new wxMBConvUTF32LE;
2636
2637 default:
2638 // nothing to do but put here to suppress gcc warnings
2639 ;
2640 }
2641
2642 // step (3)
2643#if wxUSE_FONTMAP
2644 {
2645 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2646 : new wxMBConv_wxwin(m_encoding);
2647 if ( conv->IsOk() )
2648 return conv;
2649
2650 delete conv;
2651 }
2652#endif // wxUSE_FONTMAP
2653
2654 // NB: This is a hack to prevent deadlock. What could otherwise happen
2655 // in Unicode build: wxConvLocal creation ends up being here
2656 // because of some failure and logs the error. But wxLog will try to
2657 // attach timestamp, for which it will need wxConvLocal (to convert
2658 // time to char* and then wchar_t*), but that fails, tries to log
2659 // error, but wxLog has a (already locked) critical section that
2660 // guards static buffer.
2661 static bool alreadyLoggingError = false;
2662 if (!alreadyLoggingError)
2663 {
2664 alreadyLoggingError = true;
2665 wxLogError(_("Cannot convert from the charset '%s'!"),
2666 m_name ? m_name
2667 :
2668#if wxUSE_FONTMAP
2669 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2670#else // !wxUSE_FONTMAP
2671 wxString::Format(_("encoding %s"), m_encoding).c_str()
2672#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2673 );
2674 alreadyLoggingError = false;
2675 }
2676
2677 return NULL;
2678}
2679
2680void wxCSConv::CreateConvIfNeeded() const
2681{
2682 if ( m_deferred )
2683 {
2684 wxCSConv *self = (wxCSConv *)this; // const_cast
2685
2686#if wxUSE_INTL
2687 // if we don't have neither the name nor the encoding, use the default
2688 // encoding for this system
2689 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2690 {
2691 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2692 }
2693#endif // wxUSE_INTL
2694
2695 self->m_convReal = DoCreate();
2696 self->m_deferred = false;
2697 }
2698}
2699
2700size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2701{
2702 CreateConvIfNeeded();
2703
2704 if (m_convReal)
2705 return m_convReal->MB2WC(buf, psz, n);
2706
2707 // latin-1 (direct)
2708 size_t len = strlen(psz);
2709
2710 if (buf)
2711 {
2712 for (size_t c = 0; c <= len; c++)
2713 buf[c] = (unsigned char)(psz[c]);
2714 }
2715
2716 return len;
2717}
2718
2719size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2720{
2721 CreateConvIfNeeded();
2722
2723 if (m_convReal)
2724 return m_convReal->WC2MB(buf, psz, n);
2725
2726 // latin-1 (direct)
2727 const size_t len = wxWcslen(psz);
2728 if (buf)
2729 {
2730 for (size_t c = 0; c <= len; c++)
2731 {
2732 if (psz[c] > 0xFF)
2733 return (size_t)-1;
2734 buf[c] = (char)psz[c];
2735 }
2736 }
2737 else
2738 {
2739 for (size_t c = 0; c <= len; c++)
2740 {
2741 if (psz[c] > 0xFF)
2742 return (size_t)-1;
2743 }
2744 }
2745
2746 return len;
2747}
2748
2749// ----------------------------------------------------------------------------
2750// globals
2751// ----------------------------------------------------------------------------
2752
2753#ifdef __WINDOWS__
2754 static wxMBConv_win32 wxConvLibcObj;
2755#elif defined(__WXMAC__) && !defined(__MACH__)
2756 static wxMBConv_mac wxConvLibcObj ;
2757#else
2758 static wxMBConvLibc wxConvLibcObj;
2759#endif
2760
2761static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2762static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2763static wxMBConvUTF7 wxConvUTF7Obj;
2764static wxMBConvUTF8 wxConvUTF8Obj;
2765
2766WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2767WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2768WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2769WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2770WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2771WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2772WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2773#ifdef __WXOSX__
2774 wxConvUTF8Obj;
2775#else
2776 wxConvLibcObj;
2777#endif
2778
2779
2780#else // !wxUSE_WCHAR_T
2781
2782// stand-ins in absence of wchar_t
2783WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2784 wxConvISO8859_1,
2785 wxConvLocal,
2786 wxConvUTF8;
2787
2788#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2789
2790