]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
1136bde236c55051afc78b73065f5b5ae31dde42
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
79 #define TRACE_STRCONV _T("strconv")
80
81 // ============================================================================
82 // implementation
83 // ============================================================================
84
85 // ----------------------------------------------------------------------------
86 // UTF-16 en/decoding to/from UCS-4
87 // ----------------------------------------------------------------------------
88
89
90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
91 {
92 if (input<=0xffff)
93 {
94 if (output)
95 *output = (wxUint16) input;
96 return 1;
97 }
98 else if (input>=0x110000)
99 {
100 return (size_t)-1;
101 }
102 else
103 {
104 if (output)
105 {
106 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
107 *output = (wxUint16) ((input&0x3ff)+0xdc00);
108 }
109 return 2;
110 }
111 }
112
113 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
114 {
115 if ((*input<0xd800) || (*input>0xdfff))
116 {
117 output = *input;
118 return 1;
119 }
120 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
121 {
122 output = *input;
123 return (size_t)-1;
124 }
125 else
126 {
127 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
128 return 2;
129 }
130 }
131
132
133 // ----------------------------------------------------------------------------
134 // wxMBConv
135 // ----------------------------------------------------------------------------
136
137 wxMBConv::~wxMBConv()
138 {
139 // nothing to do here (necessary for Darwin linking probably)
140 }
141
142 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
143 {
144 if ( psz )
145 {
146 // calculate the length of the buffer needed first
147 size_t nLen = MB2WC(NULL, psz, 0);
148 if ( nLen != (size_t)-1 )
149 {
150 // now do the actual conversion
151 wxWCharBuffer buf(nLen);
152 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
153 if ( nLen != (size_t)-1 )
154 {
155 return buf;
156 }
157 }
158 }
159
160 wxWCharBuffer buf((wchar_t *)NULL);
161
162 return buf;
163 }
164
165 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
166 {
167 if ( pwz )
168 {
169 size_t nLen = WC2MB(NULL, pwz, 0);
170 if ( nLen != (size_t)-1 )
171 {
172 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
173 nLen = WC2MB(buf.data(), pwz, nLen + 4);
174 if ( nLen != (size_t)-1 )
175 {
176 return buf;
177 }
178 }
179 }
180
181 wxCharBuffer buf((char *)NULL);
182
183 return buf;
184 }
185
186 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
187 {
188 wxASSERT(pOutSize != NULL);
189
190 const char* szEnd = szString + nStringLen + 1;
191 const char* szPos = szString;
192 const char* szStart = szPos;
193
194 size_t nActualLength = 0;
195 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
196
197 wxWCharBuffer theBuffer(nCurrentSize);
198
199 //Convert the string until the length() is reached, continuing the
200 //loop every time a null character is reached
201 while(szPos != szEnd)
202 {
203 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
204
205 //Get the length of the current (sub)string
206 size_t nLen = MB2WC(NULL, szPos, 0);
207
208 //Invalid conversion?
209 if( nLen == (size_t)-1 )
210 {
211 *pOutSize = 0;
212 theBuffer.data()[0u] = wxT('\0');
213 return theBuffer;
214 }
215
216
217 //Increase the actual length (+1 for current null character)
218 nActualLength += nLen + 1;
219
220 //if buffer too big, realloc the buffer
221 if (nActualLength > (nCurrentSize+1))
222 {
223 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
224 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
225 theBuffer = theNewBuffer;
226 nCurrentSize <<= 1;
227 }
228
229 //Convert the current (sub)string
230 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
231 {
232 *pOutSize = 0;
233 theBuffer.data()[0u] = wxT('\0');
234 return theBuffer;
235 }
236
237 //Increment to next (sub)string
238 //Note that we have to use strlen instead of nLen here
239 //because XX2XX gives us the size of the output buffer,
240 //which is not necessarily the length of the string
241 szPos += strlen(szPos) + 1;
242 }
243
244 //success - return actual length and the buffer
245 *pOutSize = nActualLength;
246 return theBuffer;
247 }
248
249 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
250 {
251 wxASSERT(pOutSize != NULL);
252
253 const wchar_t* szEnd = szString + nStringLen + 1;
254 const wchar_t* szPos = szString;
255 const wchar_t* szStart = szPos;
256
257 size_t nActualLength = 0;
258 size_t nCurrentSize = nStringLen << 2; //try * 4 first
259
260 wxCharBuffer theBuffer(nCurrentSize);
261
262 //Convert the string until the length() is reached, continuing the
263 //loop every time a null character is reached
264 while(szPos != szEnd)
265 {
266 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
267
268 //Get the length of the current (sub)string
269 size_t nLen = WC2MB(NULL, szPos, 0);
270
271 //Invalid conversion?
272 if( nLen == (size_t)-1 )
273 {
274 *pOutSize = 0;
275 theBuffer.data()[0u] = wxT('\0');
276 return theBuffer;
277 }
278
279 //Increase the actual length (+1 for current null character)
280 nActualLength += nLen + 1;
281
282 //if buffer too big, realloc the buffer
283 if (nActualLength > (nCurrentSize+1))
284 {
285 wxCharBuffer theNewBuffer(nCurrentSize << 1);
286 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
287 theBuffer = theNewBuffer;
288 nCurrentSize <<= 1;
289 }
290
291 //Convert the current (sub)string
292 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
293 {
294 *pOutSize = 0;
295 theBuffer.data()[0u] = wxT('\0');
296 return theBuffer;
297 }
298
299 //Increment to next (sub)string
300 //Note that we have to use wxWcslen instead of nLen here
301 //because XX2XX gives us the size of the output buffer,
302 //which is not necessarily the length of the string
303 szPos += wxWcslen(szPos) + 1;
304 }
305
306 //success - return actual length and the buffer
307 *pOutSize = nActualLength;
308 return theBuffer;
309 }
310
311 // ----------------------------------------------------------------------------
312 // wxMBConvLibc
313 // ----------------------------------------------------------------------------
314
315 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
316 {
317 return wxMB2WC(buf, psz, n);
318 }
319
320 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
321 {
322 return wxWC2MB(buf, psz, n);
323 }
324
325 #ifdef __UNIX__
326
327 // ----------------------------------------------------------------------------
328 // wxConvBrokenFileNames
329 // ----------------------------------------------------------------------------
330
331 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
332 {
333 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
334 || wxStricmp(charset, _T("UTF8")) == 0 )
335 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
336 else
337 m_conv = new wxCSConv(charset);
338 }
339
340 size_t
341 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
342 const char *psz,
343 size_t outputSize) const
344 {
345 return m_conv->MB2WC( outputBuf, psz, outputSize );
346 }
347
348 size_t
349 wxConvBrokenFileNames::WC2MB(char *outputBuf,
350 const wchar_t *psz,
351 size_t outputSize) const
352 {
353 return m_conv->WC2MB( outputBuf, psz, outputSize );
354 }
355
356 #endif
357
358 // ----------------------------------------------------------------------------
359 // UTF-7
360 // ----------------------------------------------------------------------------
361
362 // Implementation (C) 2004 Fredrik Roubert
363
//
// BASE64 decoding table: maps a byte to its 6 bit value or to 0xff if the
// byte is not a BASE64 character. One row per 16 byte values.
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: nothing valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
402
403 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
404 {
405 size_t len = 0;
406
407 while (*psz && ((!buf) || (len < n)))
408 {
409 unsigned char cc = *psz++;
410 if (cc != '+')
411 {
412 // plain ASCII char
413 if (buf)
414 *buf++ = cc;
415 len++;
416 }
417 else if (*psz == '-')
418 {
419 // encoded plus sign
420 if (buf)
421 *buf++ = cc;
422 len++;
423 psz++;
424 }
425 else
426 {
427 // BASE64 encoded string
428 bool lsb;
429 unsigned char c;
430 unsigned int d, l;
431 for (lsb = false, d = 0, l = 0;
432 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
433 {
434 d <<= 6;
435 d += cc;
436 for (l += 6; l >= 8; lsb = !lsb)
437 {
438 c = (unsigned char)((d >> (l -= 8)) % 256);
439 if (lsb)
440 {
441 if (buf)
442 *buf++ |= c;
443 len ++;
444 }
445 else
446 if (buf)
447 *buf = (wchar_t)(c << 8);
448 }
449 }
450 if (*psz == '-')
451 psz++;
452 }
453 }
454 if (buf && (len < n))
455 *buf = 0;
456 return len;
457 }
458
//
// BASE64 encoding table: maps a 6 bit value to its BASE64 character
//
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62, 63
    '+', '/'
};
473
//
// UTF-7 encoding table, indexed by ASCII code (one row per 16 codes)
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
493
494 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
495 {
496
497
498 size_t len = 0;
499
500 while (*psz && ((!buf) || (len < n)))
501 {
502 wchar_t cc = *psz++;
503 if (cc < 0x80 && utf7encode[cc] < 1)
504 {
505 // plain ASCII char
506 if (buf)
507 *buf++ = (char)cc;
508 len++;
509 }
510 #ifndef WC_UTF16
511 else if (((wxUint32)cc) > 0xffff)
512 {
513 // no surrogate pair generation (yet?)
514 return (size_t)-1;
515 }
516 #endif
517 else
518 {
519 if (buf)
520 *buf++ = '+';
521 len++;
522 if (cc != '+')
523 {
524 // BASE64 encode string
525 unsigned int lsb, d, l;
526 for (d = 0, l = 0;; psz++)
527 {
528 for (lsb = 0; lsb < 2; lsb ++)
529 {
530 d <<= 8;
531 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
532
533 for (l += 8; l >= 6; )
534 {
535 l -= 6;
536 if (buf)
537 *buf++ = utf7enb64[(d >> l) % 64];
538 len++;
539 }
540 }
541 cc = *psz;
542 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
543 break;
544 }
545 if (l != 0)
546 {
547 if (buf)
548 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
549 len++;
550 }
551 }
552 if (buf)
553 *buf++ = '-';
554 len++;
555 }
556 }
557 if (buf && (len < n))
558 *buf = 0;
559 return len;
560 }
561
562 // ----------------------------------------------------------------------------
563 // UTF-8
564 // ----------------------------------------------------------------------------
565
566 static wxUint32 utf8_max[]=
567 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
568
569 // boundaries of the private use area we use to (temporarily) remap invalid
570 // characters invalid in a UTF-8 encoded string
571 const wxUint32 wxUnicodePUA = 0x100000;
572 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
573
574 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
575 {
576 size_t len = 0;
577
578 while (*psz && ((!buf) || (len < n)))
579 {
580 const char *opsz = psz;
581 bool invalid = false;
582 unsigned char cc = *psz++, fc = cc;
583 unsigned cnt;
584 for (cnt = 0; fc & 0x80; cnt++)
585 fc <<= 1;
586 if (!cnt)
587 {
588 // plain ASCII char
589 if (buf)
590 *buf++ = cc;
591 len++;
592
593 // escape the escape character for octal escapes
594 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
595 && cc == '\\' && (!buf || len < n))
596 {
597 if (buf)
598 *buf++ = cc;
599 len++;
600 }
601 }
602 else
603 {
604 cnt--;
605 if (!cnt)
606 {
607 // invalid UTF-8 sequence
608 invalid = true;
609 }
610 else
611 {
612 unsigned ocnt = cnt - 1;
613 wxUint32 res = cc & (0x3f >> cnt);
614 while (cnt--)
615 {
616 cc = *psz;
617 if ((cc & 0xC0) != 0x80)
618 {
619 // invalid UTF-8 sequence
620 invalid = true;
621 break;
622 }
623 psz++;
624 res = (res << 6) | (cc & 0x3f);
625 }
626 if (invalid || res <= utf8_max[ocnt])
627 {
628 // illegal UTF-8 encoding
629 invalid = true;
630 }
631 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
632 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
633 {
634 // if one of our PUA characters turns up externally
635 // it must also be treated as an illegal sequence
636 // (a bit like you have to escape an escape character)
637 invalid = true;
638 }
639 else
640 {
641 #ifdef WC_UTF16
642 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
643 size_t pa = encode_utf16(res, (wxUint16 *)buf);
644 if (pa == (size_t)-1)
645 {
646 invalid = true;
647 }
648 else
649 {
650 if (buf)
651 buf += pa;
652 len += pa;
653 }
654 #else // !WC_UTF16
655 if (buf)
656 *buf++ = res;
657 len++;
658 #endif // WC_UTF16/!WC_UTF16
659 }
660 }
661 if (invalid)
662 {
663 if (m_options & MAP_INVALID_UTF8_TO_PUA)
664 {
665 while (opsz < psz && (!buf || len < n))
666 {
667 #ifdef WC_UTF16
668 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
669 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
670 wxASSERT(pa != (size_t)-1);
671 if (buf)
672 buf += pa;
673 opsz++;
674 len += pa;
675 #else
676 if (buf)
677 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
678 opsz++;
679 len++;
680 #endif
681 }
682 }
683 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
684 {
685 while (opsz < psz && (!buf || len < n))
686 {
687 if ( buf && len + 3 < n )
688 {
689 unsigned char n = *opsz;
690 *buf++ = L'\\';
691 *buf++ = (wchar_t)( L'0' + n / 0100 );
692 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
693 *buf++ = (wchar_t)( L'0' + n % 010 );
694 }
695 opsz++;
696 len += 4;
697 }
698 }
699 else // MAP_INVALID_UTF8_NOT
700 {
701 return (size_t)-1;
702 }
703 }
704 }
705 }
706 if (buf && (len < n))
707 *buf = 0;
708 return len;
709 }
710
// Return true if wch is one of the digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
715
716 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
717 {
718 size_t len = 0;
719
720 while (*psz && ((!buf) || (len < n)))
721 {
722 wxUint32 cc;
723 #ifdef WC_UTF16
724 // cast is ok for WC_UTF16
725 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
726 psz += (pa == (size_t)-1) ? 1 : pa;
727 #else
728 cc=(*psz++) & 0x7fffffff;
729 #endif
730
731 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
732 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
733 {
734 if (buf)
735 *buf++ = (char)(cc - wxUnicodePUA);
736 len++;
737 }
738 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
739 && cc == L'\\' && psz[0] == L'\\' )
740 {
741 if (buf)
742 *buf++ = (char)cc;
743 psz++;
744 len++;
745 }
746 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
747 cc == L'\\' &&
748 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
749 {
750 if (buf)
751 {
752 *buf++ = (char) ((psz[0] - L'0')*0100 +
753 (psz[1] - L'0')*010 +
754 (psz[2] - L'0'));
755 }
756
757 psz += 3;
758 len++;
759 }
760 else
761 {
762 unsigned cnt;
763 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
764 if (!cnt)
765 {
766 // plain ASCII char
767 if (buf)
768 *buf++ = (char) cc;
769 len++;
770 }
771
772 else
773 {
774 len += cnt + 1;
775 if (buf)
776 {
777 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
778 while (cnt--)
779 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
780 }
781 }
782 }
783 }
784
785 if (buf && (len<n))
786 *buf = 0;
787
788 return len;
789 }
790
791 // ----------------------------------------------------------------------------
792 // UTF-16
793 // ----------------------------------------------------------------------------
794
// "straight" is the converter whose byte order matches the one of this
// machine, "swap" the one for the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // big endian
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
802
803
804 #ifdef WC_UTF16
805
806 // copy 16bit MB to 16bit String
807 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
808 {
809 size_t len=0;
810
811 while (*(wxUint16*)psz && (!buf || len < n))
812 {
813 if (buf)
814 *buf++ = *(wxUint16*)psz;
815 len++;
816
817 psz += sizeof(wxUint16);
818 }
819 if (buf && len<n) *buf=0;
820
821 return len;
822 }
823
824
825 // copy 16bit String to 16bit MB
826 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
827 {
828 size_t len=0;
829
830 while (*psz && (!buf || len < n))
831 {
832 if (buf)
833 {
834 *(wxUint16*)buf = *psz;
835 buf += sizeof(wxUint16);
836 }
837 len += sizeof(wxUint16);
838 psz++;
839 }
840 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
841
842 return len;
843 }
844
845
846 // swap 16bit MB to 16bit String
847 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
848 {
849 size_t len=0;
850
851 while (*(wxUint16*)psz && (!buf || len < n))
852 {
853 if (buf)
854 {
855 ((char *)buf)[0] = psz[1];
856 ((char *)buf)[1] = psz[0];
857 buf++;
858 }
859 len++;
860 psz += sizeof(wxUint16);
861 }
862 if (buf && len<n) *buf=0;
863
864 return len;
865 }
866
867
868 // swap 16bit MB to 16bit String
869 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
870 {
871 size_t len=0;
872
873 while (*psz && (!buf || len < n))
874 {
875 if (buf)
876 {
877 *buf++ = ((char*)psz)[1];
878 *buf++ = ((char*)psz)[0];
879 }
880 len += sizeof(wxUint16);
881 psz++;
882 }
883 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
884
885 return len;
886 }
887
888
889 #else // WC_UTF16
890
891
892 // copy 16bit MB to 32bit String
893 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
894 {
895 size_t len=0;
896
897 while (*(wxUint16*)psz && (!buf || len < n))
898 {
899 wxUint32 cc;
900 size_t pa=decode_utf16((wxUint16*)psz, cc);
901 if (pa == (size_t)-1)
902 return pa;
903
904 if (buf)
905 *buf++ = cc;
906 len++;
907 psz += pa * sizeof(wxUint16);
908 }
909 if (buf && len<n) *buf=0;
910
911 return len;
912 }
913
914
915 // copy 32bit String to 16bit MB
916 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
917 {
918 size_t len=0;
919
920 while (*psz && (!buf || len < n))
921 {
922 wxUint16 cc[2];
923 size_t pa=encode_utf16(*psz, cc);
924
925 if (pa == (size_t)-1)
926 return pa;
927
928 if (buf)
929 {
930 *(wxUint16*)buf = cc[0];
931 buf += sizeof(wxUint16);
932 if (pa > 1)
933 {
934 *(wxUint16*)buf = cc[1];
935 buf += sizeof(wxUint16);
936 }
937 }
938
939 len += pa*sizeof(wxUint16);
940 psz++;
941 }
942 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
943
944 return len;
945 }
946
947
948 // swap 16bit MB to 32bit String
949 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
950 {
951 size_t len=0;
952
953 while (*(wxUint16*)psz && (!buf || len < n))
954 {
955 wxUint32 cc;
956 char tmp[4];
957 tmp[0]=psz[1]; tmp[1]=psz[0];
958 tmp[2]=psz[3]; tmp[3]=psz[2];
959
960 size_t pa=decode_utf16((wxUint16*)tmp, cc);
961 if (pa == (size_t)-1)
962 return pa;
963
964 if (buf)
965 *buf++ = cc;
966
967 len++;
968 psz += pa * sizeof(wxUint16);
969 }
970 if (buf && len<n) *buf=0;
971
972 return len;
973 }
974
975
976 // swap 32bit String to 16bit MB
977 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
978 {
979 size_t len=0;
980
981 while (*psz && (!buf || len < n))
982 {
983 wxUint16 cc[2];
984 size_t pa=encode_utf16(*psz, cc);
985
986 if (pa == (size_t)-1)
987 return pa;
988
989 if (buf)
990 {
991 *buf++ = ((char*)cc)[1];
992 *buf++ = ((char*)cc)[0];
993 if (pa > 1)
994 {
995 *buf++ = ((char*)cc)[3];
996 *buf++ = ((char*)cc)[2];
997 }
998 }
999
1000 len += pa*sizeof(wxUint16);
1001 psz++;
1002 }
1003 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1004
1005 return len;
1006 }
1007
1008 #endif // WC_UTF16
1009
1010
1011 // ----------------------------------------------------------------------------
1012 // UTF-32
1013 // ----------------------------------------------------------------------------
1014
1015 #ifdef WORDS_BIGENDIAN
1016 #define wxMBConvUTF32straight wxMBConvUTF32BE
1017 #define wxMBConvUTF32swap wxMBConvUTF32LE
1018 #else
1019 #define wxMBConvUTF32swap wxMBConvUTF32BE
1020 #define wxMBConvUTF32straight wxMBConvUTF32LE
1021 #endif
1022
1023
1024 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1025 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1026
1027
1028 #ifdef WC_UTF16
1029
1030 // copy 32bit MB to 16bit String
1031 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1032 {
1033 size_t len=0;
1034
1035 while (*(wxUint32*)psz && (!buf || len < n))
1036 {
1037 wxUint16 cc[2];
1038
1039 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1040 if (pa == (size_t)-1)
1041 return pa;
1042
1043 if (buf)
1044 {
1045 *buf++ = cc[0];
1046 if (pa > 1)
1047 *buf++ = cc[1];
1048 }
1049 len += pa;
1050 psz += sizeof(wxUint32);
1051 }
1052 if (buf && len<n) *buf=0;
1053
1054 return len;
1055 }
1056
1057
1058 // copy 16bit String to 32bit MB
1059 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1060 {
1061 size_t len=0;
1062
1063 while (*psz && (!buf || len < n))
1064 {
1065 wxUint32 cc;
1066
1067 // cast is ok for WC_UTF16
1068 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1069 if (pa == (size_t)-1)
1070 return pa;
1071
1072 if (buf)
1073 {
1074 *(wxUint32*)buf = cc;
1075 buf += sizeof(wxUint32);
1076 }
1077 len += sizeof(wxUint32);
1078 psz += pa;
1079 }
1080
1081 if (buf && len<=n-sizeof(wxUint32))
1082 *(wxUint32*)buf=0;
1083
1084 return len;
1085 }
1086
1087
1088
1089 // swap 32bit MB to 16bit String
1090 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1091 {
1092 size_t len=0;
1093
1094 while (*(wxUint32*)psz && (!buf || len < n))
1095 {
1096 char tmp[4];
1097 tmp[0] = psz[3]; tmp[1] = psz[2];
1098 tmp[2] = psz[1]; tmp[3] = psz[0];
1099
1100
1101 wxUint16 cc[2];
1102
1103 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1104 if (pa == (size_t)-1)
1105 return pa;
1106
1107 if (buf)
1108 {
1109 *buf++ = cc[0];
1110 if (pa > 1)
1111 *buf++ = cc[1];
1112 }
1113 len += pa;
1114 psz += sizeof(wxUint32);
1115 }
1116
1117 if (buf && len<n)
1118 *buf=0;
1119
1120 return len;
1121 }
1122
1123
1124 // swap 16bit String to 32bit MB
1125 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1126 {
1127 size_t len=0;
1128
1129 while (*psz && (!buf || len < n))
1130 {
1131 char cc[4];
1132
1133 // cast is ok for WC_UTF16
1134 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1135 if (pa == (size_t)-1)
1136 return pa;
1137
1138 if (buf)
1139 {
1140 *buf++ = cc[3];
1141 *buf++ = cc[2];
1142 *buf++ = cc[1];
1143 *buf++ = cc[0];
1144 }
1145 len += sizeof(wxUint32);
1146 psz += pa;
1147 }
1148
1149 if (buf && len<=n-sizeof(wxUint32))
1150 *(wxUint32*)buf=0;
1151
1152 return len;
1153 }
1154
1155 #else // WC_UTF16
1156
1157
1158 // copy 32bit MB to 32bit String
1159 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1160 {
1161 size_t len=0;
1162
1163 while (*(wxUint32*)psz && (!buf || len < n))
1164 {
1165 if (buf)
1166 *buf++ = *(wxUint32*)psz;
1167 len++;
1168 psz += sizeof(wxUint32);
1169 }
1170
1171 if (buf && len<n)
1172 *buf=0;
1173
1174 return len;
1175 }
1176
1177
1178 // copy 32bit String to 32bit MB
1179 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1180 {
1181 size_t len=0;
1182
1183 while (*psz && (!buf || len < n))
1184 {
1185 if (buf)
1186 {
1187 *(wxUint32*)buf = *psz;
1188 buf += sizeof(wxUint32);
1189 }
1190
1191 len += sizeof(wxUint32);
1192 psz++;
1193 }
1194
1195 if (buf && len<=n-sizeof(wxUint32))
1196 *(wxUint32*)buf=0;
1197
1198 return len;
1199 }
1200
1201
1202 // swap 32bit MB to 32bit String
1203 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1204 {
1205 size_t len=0;
1206
1207 while (*(wxUint32*)psz && (!buf || len < n))
1208 {
1209 if (buf)
1210 {
1211 ((char *)buf)[0] = psz[3];
1212 ((char *)buf)[1] = psz[2];
1213 ((char *)buf)[2] = psz[1];
1214 ((char *)buf)[3] = psz[0];
1215 buf++;
1216 }
1217 len++;
1218 psz += sizeof(wxUint32);
1219 }
1220
1221 if (buf && len<n)
1222 *buf=0;
1223
1224 return len;
1225 }
1226
1227
1228 // swap 32bit String to 32bit MB
1229 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1230 {
1231 size_t len=0;
1232
1233 while (*psz && (!buf || len < n))
1234 {
1235 if (buf)
1236 {
1237 *buf++ = ((char *)psz)[3];
1238 *buf++ = ((char *)psz)[2];
1239 *buf++ = ((char *)psz)[1];
1240 *buf++ = ((char *)psz)[0];
1241 }
1242 len += sizeof(wxUint32);
1243 psz++;
1244 }
1245
1246 if (buf && len<=n-sizeof(wxUint32))
1247 *(wxUint32*)buf=0;
1248
1249 return len;
1250 }
1251
1252
1253 #endif // WC_UTF16
1254
1255
1256 // ============================================================================
1257 // The classes doing conversion using the iconv_xxx() functions
1258 // ============================================================================
1259
1260 #ifdef HAVE_ICONV
1261
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// The macro arguments are parenthesized so that passing an expression (e.g.
// one using ?:) can't change the meaning of the comparisons.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast to the type used for the input buffer argument of iconv()
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// value of an invalid conversion descriptor
#define ICONV_T_INVALID ((iconv_t)-1)
1280
1281 #if SIZEOF_WCHAR_T == 4
1282 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1283 #define WC_ENC wxFONTENCODING_UTF32
1284 #elif SIZEOF_WCHAR_T == 2
1285 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1286 #define WC_ENC wxFONTENCODING_UTF16
1287 #else // sizeof(wchar_t) != 2 nor 4
1288 // does this ever happen?
1289 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1290 #endif
1291
1292 // ----------------------------------------------------------------------------
1293 // wxMBConv_iconv: encapsulates an iconv character set
1294 // ----------------------------------------------------------------------------
1295
1296 class wxMBConv_iconv : public wxMBConv
1297 {
1298 public:
1299 wxMBConv_iconv(const wxChar *name);
1300 virtual ~wxMBConv_iconv();
1301
1302 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1303 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1304
1305 bool IsOk() const
1306 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1307
1308 protected:
1309 // the iconv handlers used to translate from multibyte to wide char and in
1310 // the other direction
1311 iconv_t m2w,
1312 w2m;
1313 #if wxUSE_THREADS
1314 // guards access to m2w and w2m objects
1315 wxMutex m_iconvMutex;
1316 #endif
1317
1318 private:
1319 // the name (for iconv_open()) of a wide char charset -- if none is
1320 // available on this machine, it will remain NULL
1321 static wxString ms_wcCharsetName;
1322
1323 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1324 // different endian-ness than the native one
1325 static bool ms_wcNeedsSwap;
1326 };
1327
1328 // make the constructor available for unit testing
1329 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1330 {
1331 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1332 if ( !result->IsOk() )
1333 {
1334 delete result;
1335 return 0;
1336 }
1337 return result;
1338 }
1339
1340 wxString wxMBConv_iconv::ms_wcCharsetName;
1341 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1342
1343 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1344 {
1345 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1346 // names for the charsets
1347 const wxCharBuffer cname(wxString(name).ToAscii());
1348
1349 // check for charset that represents wchar_t:
1350 if ( ms_wcCharsetName.empty() )
1351 {
1352 #if wxUSE_FONTMAP
1353 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1354 #else // !wxUSE_FONTMAP
1355 static const wxChar *names[] =
1356 {
1357 #if SIZEOF_WCHAR_T == 4
1358 _T("UCS-4"),
1359 #elif SIZEOF_WCHAR_T = 2
1360 _T("UCS-2"),
1361 #endif
1362 NULL
1363 };
1364 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1365
1366 for ( ; *names; ++names )
1367 {
1368 const wxString name(*names);
1369
1370 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1371 wxString nameXE(name);
1372 #ifdef WORDS_BIGENDIAN
1373 nameXE += _T("BE");
1374 #else // little endian
1375 nameXE += _T("LE");
1376 #endif
1377
1378 m2w = iconv_open(nameXE.ToAscii(), cname);
1379 if ( m2w == ICONV_T_INVALID )
1380 {
1381 // try charset w/o bytesex info (e.g. "UCS4")
1382 m2w = iconv_open(name.ToAscii(), cname);
1383
1384 // and check for bytesex ourselves:
1385 if ( m2w != ICONV_T_INVALID )
1386 {
1387 char buf[2], *bufPtr;
1388 wchar_t wbuf[2], *wbufPtr;
1389 size_t insz, outsz;
1390 size_t res;
1391
1392 buf[0] = 'A';
1393 buf[1] = 0;
1394 wbuf[0] = 0;
1395 insz = 2;
1396 outsz = SIZEOF_WCHAR_T * 2;
1397 wbufPtr = wbuf;
1398 bufPtr = buf;
1399
1400 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1401 (char**)&wbufPtr, &outsz);
1402
1403 if (ICONV_FAILED(res, insz))
1404 {
1405 wxLogLastError(wxT("iconv"));
1406 wxLogError(_("Conversion to charset '%s' doesn't work."),
1407 name.c_str());
1408 }
1409 else // ok, can convert to this encoding, remember it
1410 {
1411 ms_wcCharsetName = name;
1412 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1413 }
1414 }
1415 }
1416 else // use charset not requiring byte swapping
1417 {
1418 ms_wcCharsetName = nameXE;
1419 }
1420 }
1421
1422 wxLogTrace(TRACE_STRCONV,
1423 wxT("iconv wchar_t charset is \"%s\"%s"),
1424 ms_wcCharsetName.empty() ? _T("<none>")
1425 : ms_wcCharsetName.c_str(),
1426 ms_wcNeedsSwap ? _T(" (needs swap)")
1427 : _T(""));
1428 }
1429 else // we already have ms_wcCharsetName
1430 {
1431 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1432 }
1433
1434 if ( ms_wcCharsetName.empty() )
1435 {
1436 w2m = ICONV_T_INVALID;
1437 }
1438 else
1439 {
1440 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1441 if ( w2m == ICONV_T_INVALID )
1442 {
1443 wxLogTrace(TRACE_STRCONV,
1444 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1445 ms_wcCharsetName.c_str(), cname.data());
1446 }
1447 }
1448 }
1449
1450 wxMBConv_iconv::~wxMBConv_iconv()
1451 {
1452 if ( m2w != ICONV_T_INVALID )
1453 iconv_close(m2w);
1454 if ( w2m != ICONV_T_INVALID )
1455 iconv_close(w2m);
1456 }
1457
1458 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1459 {
1460 #if wxUSE_THREADS
1461 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1462 // Unfortunately there is a couple of global wxCSConv objects such as
1463 // wxConvLocal that are used all over wx code, so we have to make sure
1464 // the handle is used by at most one thread at the time. Otherwise
1465 // only a few wx classes would be safe to use from non-main threads
1466 // as MB<->WC conversion would fail "randomly".
1467 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1468 #endif
1469
1470 size_t inbuf = strlen(psz);
1471 size_t outbuf = n * SIZEOF_WCHAR_T;
1472 size_t res, cres;
1473 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1474 wchar_t *bufPtr = buf;
1475 const char *pszPtr = psz;
1476
1477 if (buf)
1478 {
1479 // have destination buffer, convert there
1480 cres = iconv(m2w,
1481 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1482 (char**)&bufPtr, &outbuf);
1483 res = n - (outbuf / SIZEOF_WCHAR_T);
1484
1485 if (ms_wcNeedsSwap)
1486 {
1487 // convert to native endianness
1488 for ( unsigned n = 0; n < res; n++ )
1489 buf[n] = WC_BSWAP(buf[n]);
1490 }
1491
1492 // NB: iconv was given only strlen(psz) characters on input, and so
1493 // it couldn't convert the trailing zero. Let's do it ourselves
1494 // if there's some room left for it in the output buffer.
1495 if (res < n)
1496 buf[res] = 0;
1497 }
1498 else
1499 {
1500 // no destination buffer... convert using temp buffer
1501 // to calculate destination buffer requirement
1502 wchar_t tbuf[8];
1503 res = 0;
1504 do {
1505 bufPtr = tbuf;
1506 outbuf = 8*SIZEOF_WCHAR_T;
1507
1508 cres = iconv(m2w,
1509 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1510 (char**)&bufPtr, &outbuf );
1511
1512 res += 8-(outbuf/SIZEOF_WCHAR_T);
1513 } while ((cres==(size_t)-1) && (errno==E2BIG));
1514 }
1515
1516 if (ICONV_FAILED(cres, inbuf))
1517 {
1518 //VS: it is ok if iconv fails, hence trace only
1519 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1520 return (size_t)-1;
1521 }
1522
1523 return res;
1524 }
1525
1526 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1527 {
1528 #if wxUSE_THREADS
1529 // NB: explained in MB2WC
1530 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1531 #endif
1532
1533 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1534 size_t outbuf = n;
1535 size_t res, cres;
1536
1537 wchar_t *tmpbuf = 0;
1538
1539 if (ms_wcNeedsSwap)
1540 {
1541 // need to copy to temp buffer to switch endianness
1542 // (doing WC_BSWAP twice on the original buffer won't help, as it
1543 // could be in read-only memory, or be accessed in some other thread)
1544 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1545 for ( size_t n = 0; n < inbuf; n++ )
1546 tmpbuf[n] = WC_BSWAP(psz[n]);
1547 tmpbuf[inbuf] = L'\0';
1548 psz = tmpbuf;
1549 }
1550
1551 if (buf)
1552 {
1553 // have destination buffer, convert there
1554 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1555
1556 res = n-outbuf;
1557
1558 // NB: iconv was given only wcslen(psz) characters on input, and so
1559 // it couldn't convert the trailing zero. Let's do it ourselves
1560 // if there's some room left for it in the output buffer.
1561 if (res < n)
1562 buf[0] = 0;
1563 }
1564 else
1565 {
1566 // no destination buffer... convert using temp buffer
1567 // to calculate destination buffer requirement
1568 char tbuf[16];
1569 res = 0;
1570 do {
1571 buf = tbuf; outbuf = 16;
1572
1573 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1574
1575 res += 16 - outbuf;
1576 } while ((cres==(size_t)-1) && (errno==E2BIG));
1577 }
1578
1579 if (ms_wcNeedsSwap)
1580 {
1581 free(tmpbuf);
1582 }
1583
1584 if (ICONV_FAILED(cres, inbuf))
1585 {
1586 //VS: it is ok if iconv fails, hence trace only
1587 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1588 return (size_t)-1;
1589 }
1590
1591 return res;
1592 }
1593
1594 #endif // HAVE_ICONV
1595
1596
1597 // ============================================================================
1598 // Win32 conversion classes
1599 // ============================================================================
1600
1601 #ifdef wxHAVE_WIN32_MB2WC
1602
1603 // from utils.cpp
1604 #if wxUSE_FONTMAP
1605 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1606 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1607 #endif
1608
1609 class wxMBConv_win32 : public wxMBConv
1610 {
1611 public:
1612 wxMBConv_win32()
1613 {
1614 m_CodePage = CP_ACP;
1615 }
1616
1617 #if wxUSE_FONTMAP
1618 wxMBConv_win32(const wxChar* name)
1619 {
1620 m_CodePage = wxCharsetToCodepage(name);
1621 }
1622
1623 wxMBConv_win32(wxFontEncoding encoding)
1624 {
1625 m_CodePage = wxEncodingToCodepage(encoding);
1626 }
1627 #endif
1628
1629 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1630 {
1631 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1632 // the behaviour is not compatible with the Unix version (using iconv)
1633 // and break the library itself, e.g. wxTextInputStream::NextChar()
1634 // wouldn't work if reading an incomplete MB char didn't result in an
1635 // error
1636 //
1637 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1638 // an error (tested under Windows Server 2003) and apparently it is
1639 // done on purpose, i.e. the function accepts any input in this case
1640 // and although I'd prefer to return error on ill-formed output, our
1641 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1642 // explicitly ill-formed according to RFC 2152) neither so we don't
1643 // even have any fallback here...
1644 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1645
1646 const size_t len = ::MultiByteToWideChar
1647 (
1648 m_CodePage, // code page
1649 flags, // flags: fall on error
1650 psz, // input string
1651 -1, // its length (NUL-terminated)
1652 buf, // output string
1653 buf ? n : 0 // size of output buffer
1654 );
1655
1656 // note that it returns count of written chars for buf != NULL and size
1657 // of the needed buffer for buf == NULL so in either case the length of
1658 // the string (which never includes the terminating NUL) is one less
1659 return len ? len - 1 : (size_t)-1;
1660 }
1661
1662 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1663 {
1664 /*
1665 we have a problem here: by default, WideCharToMultiByte() may
1666 replace characters unrepresentable in the target code page with bad
1667 quality approximations such as turning "1/2" symbol (U+00BD) into
1668 "1" for the code pages which don't have it and we, obviously, want
1669 to avoid this at any price
1670
1671 the trouble is that this function does it _silently_, i.e. it won't
1672 even tell us whether it did or not... Win98/2000 and higher provide
1673 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1674 we have to resort to a round trip, i.e. check that converting back
1675 results in the same string -- this is, of course, expensive but
1676 otherwise we simply can't be sure to not garble the data.
1677 */
1678
1679 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1680 // it doesn't work with CJK encodings (which we test for rather roughly
1681 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1682 // supporting it
1683 BOOL usedDef wxDUMMY_INITIALIZE(false);
1684 BOOL *pUsedDef;
1685 int flags;
1686 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1687 {
1688 // it's our lucky day
1689 flags = WC_NO_BEST_FIT_CHARS;
1690 pUsedDef = &usedDef;
1691 }
1692 else // old system or unsupported encoding
1693 {
1694 flags = 0;
1695 pUsedDef = NULL;
1696 }
1697
1698 const size_t len = ::WideCharToMultiByte
1699 (
1700 m_CodePage, // code page
1701 flags, // either none or no best fit
1702 pwz, // input string
1703 -1, // it is (wide) NUL-terminated
1704 buf, // output buffer
1705 buf ? n : 0, // and its size
1706 NULL, // default "replacement" char
1707 pUsedDef // [out] was it used?
1708 );
1709
1710 if ( !len )
1711 {
1712 // function totally failed
1713 return (size_t)-1;
1714 }
1715
1716 // if we were really converting, check if we succeeded
1717 if ( buf )
1718 {
1719 if ( flags )
1720 {
1721 // check if the conversion failed, i.e. if any replacements
1722 // were done
1723 if ( usedDef )
1724 return (size_t)-1;
1725 }
1726 else // we must resort to double tripping...
1727 {
1728 wxWCharBuffer wcBuf(n);
1729 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1730 wcscmp(wcBuf, pwz) != 0 )
1731 {
1732 // we didn't obtain the same thing we started from, hence
1733 // the conversion was lossy and we consider that it failed
1734 return (size_t)-1;
1735 }
1736 }
1737 }
1738
1739 // see the comment above for the reason of "len - 1"
1740 return len - 1;
1741 }
1742
1743 bool IsOk() const { return m_CodePage != -1; }
1744
1745 private:
1746 static bool CanUseNoBestFit()
1747 {
1748 static int s_isWin98Or2k = -1;
1749
1750 if ( s_isWin98Or2k == -1 )
1751 {
1752 int verMaj, verMin;
1753 switch ( wxGetOsVersion(&verMaj, &verMin) )
1754 {
1755 case wxWIN95:
1756 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1757 break;
1758
1759 case wxWINDOWS_NT:
1760 s_isWin98Or2k = verMaj >= 5;
1761 break;
1762
1763 default:
1764 // unknown, be conseravtive by default
1765 s_isWin98Or2k = 0;
1766 }
1767
1768 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1769 }
1770
1771 return s_isWin98Or2k == 1;
1772 }
1773
1774 long m_CodePage;
1775 };
1776
1777 #endif // wxHAVE_WIN32_MB2WC
1778
1779 // ============================================================================
1780 // Cocoa conversion classes
1781 // ============================================================================
1782
1783 #if defined(__WXCOCOA__)
1784
1785 // RN: There is no UTF-32 support in either Core Foundation or
1786 // Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
1787 // quite a bit - it's just not public (yet).
1788
1789 #include <CoreFoundation/CFString.h>
1790 #include <CoreFoundation/CFStringEncodingExt.h>
1791
1792 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1793 {
1794 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1795 if ( encoding == wxFONTENCODING_DEFAULT )
1796 {
1797 enc = CFStringGetSystemEncoding();
1798 }
1799 else switch( encoding)
1800 {
1801 case wxFONTENCODING_ISO8859_1 :
1802 enc = kCFStringEncodingISOLatin1 ;
1803 break ;
1804 case wxFONTENCODING_ISO8859_2 :
1805 enc = kCFStringEncodingISOLatin2;
1806 break ;
1807 case wxFONTENCODING_ISO8859_3 :
1808 enc = kCFStringEncodingISOLatin3 ;
1809 break ;
1810 case wxFONTENCODING_ISO8859_4 :
1811 enc = kCFStringEncodingISOLatin4;
1812 break ;
1813 case wxFONTENCODING_ISO8859_5 :
1814 enc = kCFStringEncodingISOLatinCyrillic;
1815 break ;
1816 case wxFONTENCODING_ISO8859_6 :
1817 enc = kCFStringEncodingISOLatinArabic;
1818 break ;
1819 case wxFONTENCODING_ISO8859_7 :
1820 enc = kCFStringEncodingISOLatinGreek;
1821 break ;
1822 case wxFONTENCODING_ISO8859_8 :
1823 enc = kCFStringEncodingISOLatinHebrew;
1824 break ;
1825 case wxFONTENCODING_ISO8859_9 :
1826 enc = kCFStringEncodingISOLatin5;
1827 break ;
1828 case wxFONTENCODING_ISO8859_10 :
1829 enc = kCFStringEncodingISOLatin6;
1830 break ;
1831 case wxFONTENCODING_ISO8859_11 :
1832 enc = kCFStringEncodingISOLatinThai;
1833 break ;
1834 case wxFONTENCODING_ISO8859_13 :
1835 enc = kCFStringEncodingISOLatin7;
1836 break ;
1837 case wxFONTENCODING_ISO8859_14 :
1838 enc = kCFStringEncodingISOLatin8;
1839 break ;
1840 case wxFONTENCODING_ISO8859_15 :
1841 enc = kCFStringEncodingISOLatin9;
1842 break ;
1843
1844 case wxFONTENCODING_KOI8 :
1845 enc = kCFStringEncodingKOI8_R;
1846 break ;
1847 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1848 enc = kCFStringEncodingDOSRussian;
1849 break ;
1850
1851 // case wxFONTENCODING_BULGARIAN :
1852 // enc = ;
1853 // break ;
1854
1855 case wxFONTENCODING_CP437 :
1856 enc =kCFStringEncodingDOSLatinUS ;
1857 break ;
1858 case wxFONTENCODING_CP850 :
1859 enc = kCFStringEncodingDOSLatin1;
1860 break ;
1861 case wxFONTENCODING_CP852 :
1862 enc = kCFStringEncodingDOSLatin2;
1863 break ;
1864 case wxFONTENCODING_CP855 :
1865 enc = kCFStringEncodingDOSCyrillic;
1866 break ;
1867 case wxFONTENCODING_CP866 :
1868 enc =kCFStringEncodingDOSRussian ;
1869 break ;
1870 case wxFONTENCODING_CP874 :
1871 enc = kCFStringEncodingDOSThai;
1872 break ;
1873 case wxFONTENCODING_CP932 :
1874 enc = kCFStringEncodingDOSJapanese;
1875 break ;
1876 case wxFONTENCODING_CP936 :
1877 enc =kCFStringEncodingDOSChineseSimplif ;
1878 break ;
1879 case wxFONTENCODING_CP949 :
1880 enc = kCFStringEncodingDOSKorean;
1881 break ;
1882 case wxFONTENCODING_CP950 :
1883 enc = kCFStringEncodingDOSChineseTrad;
1884 break ;
1885 case wxFONTENCODING_CP1250 :
1886 enc = kCFStringEncodingWindowsLatin2;
1887 break ;
1888 case wxFONTENCODING_CP1251 :
1889 enc =kCFStringEncodingWindowsCyrillic ;
1890 break ;
1891 case wxFONTENCODING_CP1252 :
1892 enc =kCFStringEncodingWindowsLatin1 ;
1893 break ;
1894 case wxFONTENCODING_CP1253 :
1895 enc = kCFStringEncodingWindowsGreek;
1896 break ;
1897 case wxFONTENCODING_CP1254 :
1898 enc = kCFStringEncodingWindowsLatin5;
1899 break ;
1900 case wxFONTENCODING_CP1255 :
1901 enc =kCFStringEncodingWindowsHebrew ;
1902 break ;
1903 case wxFONTENCODING_CP1256 :
1904 enc =kCFStringEncodingWindowsArabic ;
1905 break ;
1906 case wxFONTENCODING_CP1257 :
1907 enc = kCFStringEncodingWindowsBalticRim;
1908 break ;
1909 // This only really encodes to UTF7 (if that) evidently
1910 // case wxFONTENCODING_UTF7 :
1911 // enc = kCFStringEncodingNonLossyASCII ;
1912 // break ;
1913 case wxFONTENCODING_UTF8 :
1914 enc = kCFStringEncodingUTF8 ;
1915 break ;
1916 case wxFONTENCODING_EUC_JP :
1917 enc = kCFStringEncodingEUC_JP;
1918 break ;
1919 case wxFONTENCODING_UTF16 :
1920 enc = kCFStringEncodingUnicode ;
1921 break ;
1922 case wxFONTENCODING_MACROMAN :
1923 enc = kCFStringEncodingMacRoman ;
1924 break ;
1925 case wxFONTENCODING_MACJAPANESE :
1926 enc = kCFStringEncodingMacJapanese ;
1927 break ;
1928 case wxFONTENCODING_MACCHINESETRAD :
1929 enc = kCFStringEncodingMacChineseTrad ;
1930 break ;
1931 case wxFONTENCODING_MACKOREAN :
1932 enc = kCFStringEncodingMacKorean ;
1933 break ;
1934 case wxFONTENCODING_MACARABIC :
1935 enc = kCFStringEncodingMacArabic ;
1936 break ;
1937 case wxFONTENCODING_MACHEBREW :
1938 enc = kCFStringEncodingMacHebrew ;
1939 break ;
1940 case wxFONTENCODING_MACGREEK :
1941 enc = kCFStringEncodingMacGreek ;
1942 break ;
1943 case wxFONTENCODING_MACCYRILLIC :
1944 enc = kCFStringEncodingMacCyrillic ;
1945 break ;
1946 case wxFONTENCODING_MACDEVANAGARI :
1947 enc = kCFStringEncodingMacDevanagari ;
1948 break ;
1949 case wxFONTENCODING_MACGURMUKHI :
1950 enc = kCFStringEncodingMacGurmukhi ;
1951 break ;
1952 case wxFONTENCODING_MACGUJARATI :
1953 enc = kCFStringEncodingMacGujarati ;
1954 break ;
1955 case wxFONTENCODING_MACORIYA :
1956 enc = kCFStringEncodingMacOriya ;
1957 break ;
1958 case wxFONTENCODING_MACBENGALI :
1959 enc = kCFStringEncodingMacBengali ;
1960 break ;
1961 case wxFONTENCODING_MACTAMIL :
1962 enc = kCFStringEncodingMacTamil ;
1963 break ;
1964 case wxFONTENCODING_MACTELUGU :
1965 enc = kCFStringEncodingMacTelugu ;
1966 break ;
1967 case wxFONTENCODING_MACKANNADA :
1968 enc = kCFStringEncodingMacKannada ;
1969 break ;
1970 case wxFONTENCODING_MACMALAJALAM :
1971 enc = kCFStringEncodingMacMalayalam ;
1972 break ;
1973 case wxFONTENCODING_MACSINHALESE :
1974 enc = kCFStringEncodingMacSinhalese ;
1975 break ;
1976 case wxFONTENCODING_MACBURMESE :
1977 enc = kCFStringEncodingMacBurmese ;
1978 break ;
1979 case wxFONTENCODING_MACKHMER :
1980 enc = kCFStringEncodingMacKhmer ;
1981 break ;
1982 case wxFONTENCODING_MACTHAI :
1983 enc = kCFStringEncodingMacThai ;
1984 break ;
1985 case wxFONTENCODING_MACLAOTIAN :
1986 enc = kCFStringEncodingMacLaotian ;
1987 break ;
1988 case wxFONTENCODING_MACGEORGIAN :
1989 enc = kCFStringEncodingMacGeorgian ;
1990 break ;
1991 case wxFONTENCODING_MACARMENIAN :
1992 enc = kCFStringEncodingMacArmenian ;
1993 break ;
1994 case wxFONTENCODING_MACCHINESESIMP :
1995 enc = kCFStringEncodingMacChineseSimp ;
1996 break ;
1997 case wxFONTENCODING_MACTIBETAN :
1998 enc = kCFStringEncodingMacTibetan ;
1999 break ;
2000 case wxFONTENCODING_MACMONGOLIAN :
2001 enc = kCFStringEncodingMacMongolian ;
2002 break ;
2003 case wxFONTENCODING_MACETHIOPIC :
2004 enc = kCFStringEncodingMacEthiopic ;
2005 break ;
2006 case wxFONTENCODING_MACCENTRALEUR :
2007 enc = kCFStringEncodingMacCentralEurRoman ;
2008 break ;
2009 case wxFONTENCODING_MACVIATNAMESE :
2010 enc = kCFStringEncodingMacVietnamese ;
2011 break ;
2012 case wxFONTENCODING_MACARABICEXT :
2013 enc = kCFStringEncodingMacExtArabic ;
2014 break ;
2015 case wxFONTENCODING_MACSYMBOL :
2016 enc = kCFStringEncodingMacSymbol ;
2017 break ;
2018 case wxFONTENCODING_MACDINGBATS :
2019 enc = kCFStringEncodingMacDingbats ;
2020 break ;
2021 case wxFONTENCODING_MACTURKISH :
2022 enc = kCFStringEncodingMacTurkish ;
2023 break ;
2024 case wxFONTENCODING_MACCROATIAN :
2025 enc = kCFStringEncodingMacCroatian ;
2026 break ;
2027 case wxFONTENCODING_MACICELANDIC :
2028 enc = kCFStringEncodingMacIcelandic ;
2029 break ;
2030 case wxFONTENCODING_MACROMANIAN :
2031 enc = kCFStringEncodingMacRomanian ;
2032 break ;
2033 case wxFONTENCODING_MACCELTIC :
2034 enc = kCFStringEncodingMacCeltic ;
2035 break ;
2036 case wxFONTENCODING_MACGAELIC :
2037 enc = kCFStringEncodingMacGaelic ;
2038 break ;
2039 // case wxFONTENCODING_MACKEYBOARD :
2040 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2041 // break ;
2042 default :
2043 // because gcc is picky
2044 break ;
2045 } ;
2046 return enc ;
2047 }
2048
2049 class wxMBConv_cocoa : public wxMBConv
2050 {
2051 public:
2052 wxMBConv_cocoa()
2053 {
2054 Init(CFStringGetSystemEncoding()) ;
2055 }
2056
2057 #if wxUSE_FONTMAP
2058 wxMBConv_cocoa(const wxChar* name)
2059 {
2060 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2061 }
2062 #endif
2063
2064 wxMBConv_cocoa(wxFontEncoding encoding)
2065 {
2066 Init( wxCFStringEncFromFontEnc(encoding) );
2067 }
2068
2069 ~wxMBConv_cocoa()
2070 {
2071 }
2072
2073 void Init( CFStringEncoding encoding)
2074 {
2075 m_encoding = encoding ;
2076 }
2077
2078 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2079 {
2080 wxASSERT(szUnConv);
2081
2082 CFStringRef theString = CFStringCreateWithBytes (
2083 NULL, //the allocator
2084 (const UInt8*)szUnConv,
2085 strlen(szUnConv),
2086 m_encoding,
2087 false //no BOM/external representation
2088 );
2089
2090 wxASSERT(theString);
2091
2092 size_t nOutLength = CFStringGetLength(theString);
2093
2094 if (szOut == NULL)
2095 {
2096 CFRelease(theString);
2097 return nOutLength;
2098 }
2099
2100 CFRange theRange = { 0, nOutSize };
2101
2102 #if SIZEOF_WCHAR_T == 4
2103 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2104 #endif
2105
2106 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2107
2108 CFRelease(theString);
2109
2110 szUniCharBuffer[nOutLength] = '\0' ;
2111
2112 #if SIZEOF_WCHAR_T == 4
2113 wxMBConvUTF16 converter ;
2114 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2115 delete[] szUniCharBuffer;
2116 #endif
2117
2118 return nOutLength;
2119 }
2120
2121 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2122 {
2123 wxASSERT(szUnConv);
2124
2125 size_t nRealOutSize;
2126 size_t nBufSize = wxWcslen(szUnConv);
2127 UniChar* szUniBuffer = (UniChar*) szUnConv;
2128
2129 #if SIZEOF_WCHAR_T == 4
2130 wxMBConvUTF16 converter ;
2131 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2132 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2133 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2134 nBufSize /= sizeof(UniChar);
2135 #endif
2136
2137 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2138 NULL, //allocator
2139 szUniBuffer,
2140 nBufSize,
2141 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2142 );
2143
2144 wxASSERT(theString);
2145
2146 //Note that CER puts a BOM when converting to unicode
2147 //so we check and use getchars instead in that case
2148 if (m_encoding == kCFStringEncodingUnicode)
2149 {
2150 if (szOut != NULL)
2151 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2152
2153 nRealOutSize = CFStringGetLength(theString) + 1;
2154 }
2155 else
2156 {
2157 CFStringGetBytes(
2158 theString,
2159 CFRangeMake(0, CFStringGetLength(theString)),
2160 m_encoding,
2161 0, //what to put in characters that can't be converted -
2162 //0 tells CFString to return NULL if it meets such a character
2163 false, //not an external representation
2164 (UInt8*) szOut,
2165 nOutSize,
2166 (CFIndex*) &nRealOutSize
2167 );
2168 }
2169
2170 CFRelease(theString);
2171
2172 #if SIZEOF_WCHAR_T == 4
2173 delete[] szUniBuffer;
2174 #endif
2175
2176 return nRealOutSize - 1;
2177 }
2178
2179 bool IsOk() const
2180 {
2181 return m_encoding != kCFStringEncodingInvalidId &&
2182 CFStringIsEncodingAvailable(m_encoding);
2183 }
2184
2185 private:
2186 CFStringEncoding m_encoding ;
2187 };
2188
2189 #endif // defined(__WXCOCOA__)
2190
2191 // ============================================================================
2192 // Mac conversion classes
2193 // ============================================================================
2194
2195 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2196
2197 class wxMBConv_mac : public wxMBConv
2198 {
2199 public:
2200 wxMBConv_mac()
2201 {
2202 Init(CFStringGetSystemEncoding()) ;
2203 }
2204
2205 #if wxUSE_FONTMAP
2206 wxMBConv_mac(const wxChar* name)
2207 {
2208 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2209 }
2210 #endif
2211
2212 wxMBConv_mac(wxFontEncoding encoding)
2213 {
2214 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2215 }
2216
2217 ~wxMBConv_mac()
2218 {
2219 OSStatus status = noErr ;
2220 status = TECDisposeConverter(m_MB2WC_converter);
2221 status = TECDisposeConverter(m_WC2MB_converter);
2222 }
2223
2224
2225 void Init( TextEncodingBase encoding)
2226 {
2227 OSStatus status = noErr ;
2228 m_char_encoding = encoding ;
2229 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2230
2231 status = TECCreateConverter(&m_MB2WC_converter,
2232 m_char_encoding,
2233 m_unicode_encoding);
2234 status = TECCreateConverter(&m_WC2MB_converter,
2235 m_unicode_encoding,
2236 m_char_encoding);
2237 }
2238
2239 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2240 {
2241 OSStatus status = noErr ;
2242 ByteCount byteOutLen ;
2243 ByteCount byteInLen = strlen(psz) ;
2244 wchar_t *tbuf = NULL ;
2245 UniChar* ubuf = NULL ;
2246 size_t res = 0 ;
2247
2248 if (buf == NULL)
2249 {
2250 //apple specs say at least 32
2251 n = wxMax( 32 , byteInLen ) ;
2252 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2253 }
2254 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2255 #if SIZEOF_WCHAR_T == 4
2256 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2257 #else
2258 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2259 #endif
2260 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2261 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2262 #if SIZEOF_WCHAR_T == 4
2263 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2264 // is not properly terminated we get random characters at the end
2265 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2266 wxMBConvUTF16 converter ;
2267 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2268 free( ubuf ) ;
2269 #else
2270 res = byteOutLen / sizeof( UniChar ) ;
2271 #endif
2272 if ( buf == NULL )
2273 free(tbuf) ;
2274
2275 if ( buf && res < n)
2276 buf[res] = 0;
2277
2278 return res ;
2279 }
2280
2281 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2282 {
2283 OSStatus status = noErr ;
2284 ByteCount byteOutLen ;
2285 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2286
2287 char *tbuf = NULL ;
2288
2289 if (buf == NULL)
2290 {
2291 //apple specs say at least 32
2292 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2293 tbuf = (char*) malloc( n ) ;
2294 }
2295
2296 ByteCount byteBufferLen = n ;
2297 UniChar* ubuf = NULL ;
2298 #if SIZEOF_WCHAR_T == 4
2299 wxMBConvUTF16 converter ;
2300 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2301 byteInLen = unicharlen ;
2302 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2303 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2304 #else
2305 ubuf = (UniChar*) psz ;
2306 #endif
2307 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2308 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2309 #if SIZEOF_WCHAR_T == 4
2310 free( ubuf ) ;
2311 #endif
2312 if ( buf == NULL )
2313 free(tbuf) ;
2314
2315 size_t res = byteOutLen ;
2316 if ( buf && res < n)
2317 {
2318 buf[res] = 0;
2319
2320 //we need to double-trip to verify it didn't insert any ? in place
2321 //of bogus characters
2322 wxWCharBuffer wcBuf(n);
2323 size_t pszlen = wxWcslen(psz);
2324 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2325 wxWcslen(wcBuf) != pszlen ||
2326 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2327 {
2328 // we didn't obtain the same thing we started from, hence
2329 // the conversion was lossy and we consider that it failed
2330 return (size_t)-1;
2331 }
2332 }
2333
2334 return res ;
2335 }
2336
2337 bool IsOk() const
2338 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2339
2340 private:
2341 TECObjectRef m_MB2WC_converter ;
2342 TECObjectRef m_WC2MB_converter ;
2343
2344 TextEncodingBase m_char_encoding ;
2345 TextEncodingBase m_unicode_encoding ;
2346 };
2347
2348 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2349
2350 // ============================================================================
2351 // wxEncodingConverter based conversion classes
2352 // ============================================================================
2353
2354 #if wxUSE_FONTMAP
2355
2356 class wxMBConv_wxwin : public wxMBConv
2357 {
2358 private:
2359 void Init()
2360 {
2361 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2362 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2363 }
2364
2365 public:
2366 // temporarily just use wxEncodingConverter stuff,
2367 // so that it works while a better implementation is built
2368 wxMBConv_wxwin(const wxChar* name)
2369 {
2370 if (name)
2371 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2372 else
2373 m_enc = wxFONTENCODING_SYSTEM;
2374
2375 Init();
2376 }
2377
2378 wxMBConv_wxwin(wxFontEncoding enc)
2379 {
2380 m_enc = enc;
2381
2382 Init();
2383 }
2384
2385 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2386 {
2387 size_t inbuf = strlen(psz);
2388 if (buf)
2389 {
2390 if (!m2w.Convert(psz,buf))
2391 return (size_t)-1;
2392 }
2393 return inbuf;
2394 }
2395
2396 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2397 {
2398 const size_t inbuf = wxWcslen(psz);
2399 if (buf)
2400 {
2401 if (!w2m.Convert(psz,buf))
2402 return (size_t)-1;
2403 }
2404
2405 return inbuf;
2406 }
2407
2408 bool IsOk() const { return m_ok; }
2409
2410 public:
2411 wxFontEncoding m_enc;
2412 wxEncodingConverter m2w, w2m;
2413
2414 // were we initialized successfully?
2415 bool m_ok;
2416
2417 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2418 };
2419
2420 // make the constructors available for unit testing
2421 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2422 {
2423 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2424 if ( !result->IsOk() )
2425 {
2426 delete result;
2427 return 0;
2428 }
2429 return result;
2430 }
2431
2432 #endif // wxUSE_FONTMAP
2433
2434 // ============================================================================
2435 // wxCSConv implementation
2436 // ============================================================================
2437
2438 void wxCSConv::Init()
2439 {
2440 m_name = NULL;
2441 m_convReal = NULL;
2442 m_deferred = true;
2443 }
2444
2445 wxCSConv::wxCSConv(const wxChar *charset)
2446 {
2447 Init();
2448
2449 if ( charset )
2450 {
2451 SetName(charset);
2452 }
2453
2454 m_encoding = wxFONTENCODING_SYSTEM;
2455 }
2456
2457 wxCSConv::wxCSConv(wxFontEncoding encoding)
2458 {
2459 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2460 {
2461 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2462
2463 encoding = wxFONTENCODING_SYSTEM;
2464 }
2465
2466 Init();
2467
2468 m_encoding = encoding;
2469 }
2470
2471 wxCSConv::~wxCSConv()
2472 {
2473 Clear();
2474 }
2475
2476 wxCSConv::wxCSConv(const wxCSConv& conv)
2477 : wxMBConv()
2478 {
2479 Init();
2480
2481 SetName(conv.m_name);
2482 m_encoding = conv.m_encoding;
2483 }
2484
2485 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2486 {
2487 Clear();
2488
2489 SetName(conv.m_name);
2490 m_encoding = conv.m_encoding;
2491
2492 return *this;
2493 }
2494
2495 void wxCSConv::Clear()
2496 {
2497 free(m_name);
2498 delete m_convReal;
2499
2500 m_name = NULL;
2501 m_convReal = NULL;
2502 }
2503
2504 void wxCSConv::SetName(const wxChar *charset)
2505 {
2506 if (charset)
2507 {
2508 m_name = wxStrdup(charset);
2509 m_deferred = true;
2510 }
2511 }
2512
2513 #if wxUSE_FONTMAP
2514 #include "wx/hashmap.h"
2515
2516 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2517 wxEncodingNameCache );
2518
2519 static wxEncodingNameCache gs_nameCache;
2520 #endif
2521
2522 wxMBConv *wxCSConv::DoCreate() const
2523 {
2524 #if wxUSE_FONTMAP
2525 wxLogTrace(TRACE_STRCONV,
2526 wxT("creating conversion for %s"),
2527 (m_name ? m_name
2528 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2529 #endif // wxUSE_FONTMAP
2530
2531 // check for the special case of ASCII or ISO8859-1 charset: as we have
2532 // special knowledge of it anyhow, we don't need to create a special
2533 // conversion object
2534 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2535 {
2536 // don't convert at all
2537 return NULL;
2538 }
2539
2540 // we trust OS to do conversion better than we can so try external
2541 // conversion methods first
2542 //
2543 // the full order is:
2544 // 1. OS conversion (iconv() under Unix or Win32 API)
2545 // 2. hard coded conversions for UTF
2546 // 3. wxEncodingConverter as fall back
2547
2548 // step (1)
2549 #ifdef HAVE_ICONV
2550 #if !wxUSE_FONTMAP
2551 if ( m_name )
2552 #endif // !wxUSE_FONTMAP
2553 {
2554 wxString name(m_name);
2555 wxFontEncoding encoding(m_encoding);
2556
2557 if ( !name.empty() )
2558 {
2559 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2560 if ( conv->IsOk() )
2561 return conv;
2562
2563 delete conv;
2564
2565 #if wxUSE_FONTMAP
2566 encoding =
2567 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2568 #endif // wxUSE_FONTMAP
2569 }
2570 #if wxUSE_FONTMAP
2571 {
2572 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2573 if ( it != gs_nameCache.end() )
2574 {
2575 if ( it->second.empty() )
2576 return NULL;
2577
2578 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2579 if ( conv->IsOk() )
2580 return conv;
2581
2582 delete conv;
2583 }
2584
2585 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2586
2587 for ( ; *names; ++names )
2588 {
2589 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2590 if ( conv->IsOk() )
2591 {
2592 gs_nameCache[encoding] = *names;
2593 return conv;
2594 }
2595
2596 delete conv;
2597 }
2598
2599 gs_nameCache[encoding] = _T(""); // cache the failure
2600 }
2601 #endif // wxUSE_FONTMAP
2602 }
2603 #endif // HAVE_ICONV
2604
2605 #ifdef wxHAVE_WIN32_MB2WC
2606 {
2607 #if wxUSE_FONTMAP
2608 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2609 : new wxMBConv_win32(m_encoding);
2610 if ( conv->IsOk() )
2611 return conv;
2612
2613 delete conv;
2614 #else
2615 return NULL;
2616 #endif
2617 }
2618 #endif // wxHAVE_WIN32_MB2WC
2619 #if defined(__WXMAC__)
2620 {
2621 // leave UTF16 and UTF32 to the built-ins of wx
2622 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2623 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2624 {
2625
2626 #if wxUSE_FONTMAP
2627 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2628 : new wxMBConv_mac(m_encoding);
2629 #else
2630 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2631 #endif
2632 if ( conv->IsOk() )
2633 return conv;
2634
2635 delete conv;
2636 }
2637 }
2638 #endif
2639 #if defined(__WXCOCOA__)
2640 {
2641 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2642 {
2643
2644 #if wxUSE_FONTMAP
2645 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2646 : new wxMBConv_cocoa(m_encoding);
2647 #else
2648 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2649 #endif
2650 if ( conv->IsOk() )
2651 return conv;
2652
2653 delete conv;
2654 }
2655 }
2656 #endif
2657 // step (2)
2658 wxFontEncoding enc = m_encoding;
2659 #if wxUSE_FONTMAP
2660 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2661 {
2662 // use "false" to suppress interactive dialogs -- we can be called from
2663 // anywhere and popping up a dialog from here is the last thing we want to
2664 // do
2665 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2666 }
2667 #endif // wxUSE_FONTMAP
2668
2669 switch ( enc )
2670 {
2671 case wxFONTENCODING_UTF7:
2672 return new wxMBConvUTF7;
2673
2674 case wxFONTENCODING_UTF8:
2675 return new wxMBConvUTF8;
2676
2677 case wxFONTENCODING_UTF16BE:
2678 return new wxMBConvUTF16BE;
2679
2680 case wxFONTENCODING_UTF16LE:
2681 return new wxMBConvUTF16LE;
2682
2683 case wxFONTENCODING_UTF32BE:
2684 return new wxMBConvUTF32BE;
2685
2686 case wxFONTENCODING_UTF32LE:
2687 return new wxMBConvUTF32LE;
2688
2689 default:
2690 // nothing to do but put here to suppress gcc warnings
2691 ;
2692 }
2693
2694 // step (3)
2695 #if wxUSE_FONTMAP
2696 {
2697 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2698 : new wxMBConv_wxwin(m_encoding);
2699 if ( conv->IsOk() )
2700 return conv;
2701
2702 delete conv;
2703 }
2704 #endif // wxUSE_FONTMAP
2705
2706 // NB: This is a hack to prevent deadlock. What could otherwise happen
2707 // in Unicode build: wxConvLocal creation ends up being here
2708 // because of some failure and logs the error. But wxLog will try to
2709 // attach timestamp, for which it will need wxConvLocal (to convert
2710 // time to char* and then wchar_t*), but that fails, tries to log
2711 // error, but wxLog has a (already locked) critical section that
2712 // guards static buffer.
2713 static bool alreadyLoggingError = false;
2714 if (!alreadyLoggingError)
2715 {
2716 alreadyLoggingError = true;
2717 wxLogError(_("Cannot convert from the charset '%s'!"),
2718 m_name ? m_name
2719 :
2720 #if wxUSE_FONTMAP
2721 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2722 #else // !wxUSE_FONTMAP
2723 wxString::Format(_("encoding %s"), m_encoding).c_str()
2724 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2725 );
2726 alreadyLoggingError = false;
2727 }
2728
2729 return NULL;
2730 }
2731
2732 void wxCSConv::CreateConvIfNeeded() const
2733 {
2734 if ( m_deferred )
2735 {
2736 wxCSConv *self = (wxCSConv *)this; // const_cast
2737
2738 #if wxUSE_INTL
2739 // if we don't have neither the name nor the encoding, use the default
2740 // encoding for this system
2741 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2742 {
2743 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2744 }
2745 #endif // wxUSE_INTL
2746
2747 self->m_convReal = DoCreate();
2748 self->m_deferred = false;
2749 }
2750 }
2751
2752 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2753 {
2754 CreateConvIfNeeded();
2755
2756 if (m_convReal)
2757 return m_convReal->MB2WC(buf, psz, n);
2758
2759 // latin-1 (direct)
2760 size_t len = strlen(psz);
2761
2762 if (buf)
2763 {
2764 for (size_t c = 0; c <= len; c++)
2765 buf[c] = (unsigned char)(psz[c]);
2766 }
2767
2768 return len;
2769 }
2770
2771 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2772 {
2773 CreateConvIfNeeded();
2774
2775 if (m_convReal)
2776 return m_convReal->WC2MB(buf, psz, n);
2777
2778 // latin-1 (direct)
2779 const size_t len = wxWcslen(psz);
2780 if (buf)
2781 {
2782 for (size_t c = 0; c <= len; c++)
2783 {
2784 if (psz[c] > 0xFF)
2785 return (size_t)-1;
2786 buf[c] = (char)psz[c];
2787 }
2788 }
2789 else
2790 {
2791 for (size_t c = 0; c <= len; c++)
2792 {
2793 if (psz[c] > 0xFF)
2794 return (size_t)-1;
2795 }
2796 }
2797
2798 return len;
2799 }
2800
2801 // ----------------------------------------------------------------------------
2802 // globals
2803 // ----------------------------------------------------------------------------
2804
2805 #ifdef __WINDOWS__
2806 static wxMBConv_win32 wxConvLibcObj;
2807 #elif defined(__WXMAC__) && !defined(__MACH__)
2808 static wxMBConv_mac wxConvLibcObj ;
2809 #else
2810 static wxMBConvLibc wxConvLibcObj;
2811 #endif
2812
2813 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2814 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2815 static wxMBConvUTF7 wxConvUTF7Obj;
2816 static wxMBConvUTF8 wxConvUTF8Obj;
2817
2818 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2819 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2820 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2821 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2822 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2823 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2824 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2825 #ifdef __WXOSX__
2826 wxConvUTF8Obj;
2827 #else
2828 wxConvLibcObj;
2829 #endif
2830
2831
2832 #else // !wxUSE_WCHAR_T
2833
2834 // stand-ins in absence of wchar_t
2835 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2836 wxConvISO8859_1,
2837 wxConvLocal,
2838 wxConvUTF8;
2839
2840 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2841
2842