]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Unicode build fix
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WINDOWS__
44 #include "wx/msw/private.h"
45 #include "wx/msw/missing.h"
46 #endif
47
48 #ifndef __WXWINCE__
49 #include <errno.h>
50 #endif
51
52 #include <ctype.h>
53 #include <string.h>
54 #include <stdlib.h>
55
56 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
57 #define wxHAVE_WIN32_MB2WC
58 #endif // __WIN32__ but !__WXMICROWIN__
59
60 #ifdef __SALFORDC__
61 #include <clib.h>
62 #endif
63
64 #ifdef HAVE_ICONV
65 #include <iconv.h>
66 #include "wx/thread.h"
67 #endif
68
69 #include "wx/encconv.h"
70 #include "wx/fontmap.h"
71 #include "wx/utils.h"
72
73 #ifdef __WXMAC__
74 #ifndef __DARWIN__
75 #include <ATSUnicode.h>
76 #include <TextCommon.h>
77 #include <TextEncodingConverter.h>
78 #endif
79
80 #include "wx/mac/private.h" // includes mac headers
81 #endif
82
83 #define TRACE_STRCONV _T("strconv")
84
85 // ============================================================================
86 // implementation
87 // ============================================================================
88
89 // ----------------------------------------------------------------------------
90 // UTF-16 en/decoding to/from UCS-4
91 // ----------------------------------------------------------------------------
92
93
94 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
95 {
96 if (input<=0xffff)
97 {
98 if (output)
99 *output = (wxUint16) input;
100 return 1;
101 }
102 else if (input>=0x110000)
103 {
104 return (size_t)-1;
105 }
106 else
107 {
108 if (output)
109 {
110 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
111 *output = (wxUint16) ((input&0x3ff)+0xdc00);
112 }
113 return 2;
114 }
115 }
116
117 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
118 {
119 if ((*input<0xd800) || (*input>0xdfff))
120 {
121 output = *input;
122 return 1;
123 }
124 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
125 {
126 output = *input;
127 return (size_t)-1;
128 }
129 else
130 {
131 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
132 return 2;
133 }
134 }
135
136
137 // ----------------------------------------------------------------------------
138 // wxMBConv
139 // ----------------------------------------------------------------------------
140
141 wxMBConv::~wxMBConv()
142 {
143 // nothing to do here (necessary for Darwin linking probably)
144 }
145
146 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
147 {
148 if ( psz )
149 {
150 // calculate the length of the buffer needed first
151 size_t nLen = MB2WC(NULL, psz, 0);
152 if ( nLen != (size_t)-1 )
153 {
154 // now do the actual conversion
155 wxWCharBuffer buf(nLen);
156 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
157 if ( nLen != (size_t)-1 )
158 {
159 return buf;
160 }
161 }
162 }
163
164 wxWCharBuffer buf((wchar_t *)NULL);
165
166 return buf;
167 }
168
169 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
170 {
171 if ( pwz )
172 {
173 size_t nLen = WC2MB(NULL, pwz, 0);
174 if ( nLen != (size_t)-1 )
175 {
176 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
177 nLen = WC2MB(buf.data(), pwz, nLen + 4);
178 if ( nLen != (size_t)-1 )
179 {
180 return buf;
181 }
182 }
183 }
184
185 wxCharBuffer buf((char *)NULL);
186
187 return buf;
188 }
189
190 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
191 {
192 wxASSERT(pOutSize != NULL);
193
194 const char* szEnd = szString + nStringLen + 1;
195 const char* szPos = szString;
196 const char* szStart = szPos;
197
198 size_t nActualLength = 0;
199 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
200
201 wxWCharBuffer theBuffer(nCurrentSize);
202
203 //Convert the string until the length() is reached, continuing the
204 //loop every time a null character is reached
205 while(szPos != szEnd)
206 {
207 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
208
209 //Get the length of the current (sub)string
210 size_t nLen = MB2WC(NULL, szPos, 0);
211
212 //Invalid conversion?
213 if( nLen == (size_t)-1 )
214 {
215 *pOutSize = 0;
216 theBuffer.data()[0u] = wxT('\0');
217 return theBuffer;
218 }
219
220
221 //Increase the actual length (+1 for current null character)
222 nActualLength += nLen + 1;
223
224 //if buffer too big, realloc the buffer
225 if (nActualLength > (nCurrentSize+1))
226 {
227 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
228 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
229 theBuffer = theNewBuffer;
230 nCurrentSize <<= 1;
231 }
232
233 //Convert the current (sub)string
234 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
235 {
236 *pOutSize = 0;
237 theBuffer.data()[0u] = wxT('\0');
238 return theBuffer;
239 }
240
241 //Increment to next (sub)string
242 //Note that we have to use strlen instead of nLen here
243 //because XX2XX gives us the size of the output buffer,
244 //which is not necessarily the length of the string
245 szPos += strlen(szPos) + 1;
246 }
247
248 //success - return actual length and the buffer
249 *pOutSize = nActualLength;
250 return theBuffer;
251 }
252
253 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
254 {
255 wxASSERT(pOutSize != NULL);
256
257 const wchar_t* szEnd = szString + nStringLen + 1;
258 const wchar_t* szPos = szString;
259 const wchar_t* szStart = szPos;
260
261 size_t nActualLength = 0;
262 size_t nCurrentSize = nStringLen << 2; //try * 4 first
263
264 wxCharBuffer theBuffer(nCurrentSize);
265
266 //Convert the string until the length() is reached, continuing the
267 //loop every time a null character is reached
268 while(szPos != szEnd)
269 {
270 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
271
272 //Get the length of the current (sub)string
273 size_t nLen = WC2MB(NULL, szPos, 0);
274
275 //Invalid conversion?
276 if( nLen == (size_t)-1 )
277 {
278 *pOutSize = 0;
279 theBuffer.data()[0u] = wxT('\0');
280 return theBuffer;
281 }
282
283 //Increase the actual length (+1 for current null character)
284 nActualLength += nLen + 1;
285
286 //if buffer too big, realloc the buffer
287 if (nActualLength > (nCurrentSize+1))
288 {
289 wxCharBuffer theNewBuffer(nCurrentSize << 1);
290 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
291 theBuffer = theNewBuffer;
292 nCurrentSize <<= 1;
293 }
294
295 //Convert the current (sub)string
296 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
297 {
298 *pOutSize = 0;
299 theBuffer.data()[0u] = wxT('\0');
300 return theBuffer;
301 }
302
303 //Increment to next (sub)string
304 //Note that we have to use wxWcslen instead of nLen here
305 //because XX2XX gives us the size of the output buffer,
306 //which is not necessarily the length of the string
307 szPos += wxWcslen(szPos) + 1;
308 }
309
310 //success - return actual length and the buffer
311 *pOutSize = nActualLength;
312 return theBuffer;
313 }
314
315 // ----------------------------------------------------------------------------
316 // wxMBConvLibc
317 // ----------------------------------------------------------------------------
318
319 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
320 {
321 return wxMB2WC(buf, psz, n);
322 }
323
324 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
325 {
326 return wxWC2MB(buf, psz, n);
327 }
328
329 #ifdef __UNIX__
330
331 // ----------------------------------------------------------------------------
332 // wxConvBrokenFileNames
333 // ----------------------------------------------------------------------------
334
335 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
336 {
337 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
338 || wxStricmp(charset, _T("UTF8")) == 0 )
339 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
340 else
341 m_conv = new wxCSConv(charset);
342 }
343
344 size_t
345 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
346 const char *psz,
347 size_t outputSize) const
348 {
349 return m_conv->MB2WC( outputBuf, psz, outputSize );
350 }
351
352 size_t
353 wxConvBrokenFileNames::WC2MB(char *outputBuf,
354 const wchar_t *psz,
355 size_t outputSize) const
356 {
357 return m_conv->WC2MB( outputBuf, psz, outputSize );
358 }
359
360 #endif
361
362 // ----------------------------------------------------------------------------
363 // UTF-7
364 // ----------------------------------------------------------------------------
365
366 // Implementation (C) 2004 Fredrik Roubert
367
//
// BASE64 decoding table: indexed by the byte value, gives the 6 bit value of
// a BASE64 digit or 0xff for the bytes which are not BASE64 digits
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x27: not BASE64 digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x28 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x5f: 'A' - 'Z'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x7f: 'a' - 'z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: not BASE64 digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
406
407 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
408 {
409 size_t len = 0;
410
411 while (*psz && ((!buf) || (len < n)))
412 {
413 unsigned char cc = *psz++;
414 if (cc != '+')
415 {
416 // plain ASCII char
417 if (buf)
418 *buf++ = cc;
419 len++;
420 }
421 else if (*psz == '-')
422 {
423 // encoded plus sign
424 if (buf)
425 *buf++ = cc;
426 len++;
427 psz++;
428 }
429 else
430 {
431 // BASE64 encoded string
432 bool lsb;
433 unsigned char c;
434 unsigned int d, l;
435 for (lsb = false, d = 0, l = 0;
436 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
437 {
438 d <<= 6;
439 d += cc;
440 for (l += 6; l >= 8; lsb = !lsb)
441 {
442 c = (unsigned char)((d >> (l -= 8)) % 256);
443 if (lsb)
444 {
445 if (buf)
446 *buf++ |= c;
447 len ++;
448 }
449 else
450 if (buf)
451 *buf = (wchar_t)(c << 8);
452 }
453 }
454 if (*psz == '-')
455 psz++;
456 }
457 }
458 if (buf && (len < n))
459 *buf = 0;
460 return len;
461 }
462
//
// BASE64 encoding table: gives the digit used for each 6 bit value
//
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z',
    // 52 - 63
    '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
477
//
// UTF-7 encoding table: gives the class of each ASCII character
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    // 0x00 - 0x1f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0x20 - 0x3f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    // 0x40 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    // 0x60 - 0x7f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
497
498 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
499 {
500
501
502 size_t len = 0;
503
504 while (*psz && ((!buf) || (len < n)))
505 {
506 wchar_t cc = *psz++;
507 if (cc < 0x80 && utf7encode[cc] < 1)
508 {
509 // plain ASCII char
510 if (buf)
511 *buf++ = (char)cc;
512 len++;
513 }
514 #ifndef WC_UTF16
515 else if (((wxUint32)cc) > 0xffff)
516 {
517 // no surrogate pair generation (yet?)
518 return (size_t)-1;
519 }
520 #endif
521 else
522 {
523 if (buf)
524 *buf++ = '+';
525 len++;
526 if (cc != '+')
527 {
528 // BASE64 encode string
529 unsigned int lsb, d, l;
530 for (d = 0, l = 0;; psz++)
531 {
532 for (lsb = 0; lsb < 2; lsb ++)
533 {
534 d <<= 8;
535 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
536
537 for (l += 8; l >= 6; )
538 {
539 l -= 6;
540 if (buf)
541 *buf++ = utf7enb64[(d >> l) % 64];
542 len++;
543 }
544 }
545 cc = *psz;
546 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
547 break;
548 }
549 if (l != 0)
550 {
551 if (buf)
552 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
553 len++;
554 }
555 }
556 if (buf)
557 *buf++ = '-';
558 len++;
559 }
560 }
561 if (buf && (len < n))
562 *buf = 0;
563 return len;
564 }
565
566 // ----------------------------------------------------------------------------
567 // UTF-8
568 // ----------------------------------------------------------------------------
569
570 static wxUint32 utf8_max[]=
571 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
572
573 // boundaries of the private use area we use to (temporarily) remap invalid
574 // characters invalid in a UTF-8 encoded string
575 const wxUint32 wxUnicodePUA = 0x100000;
576 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
577
578 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
579 {
580 size_t len = 0;
581
582 while (*psz && ((!buf) || (len < n)))
583 {
584 const char *opsz = psz;
585 bool invalid = false;
586 unsigned char cc = *psz++, fc = cc;
587 unsigned cnt;
588 for (cnt = 0; fc & 0x80; cnt++)
589 fc <<= 1;
590 if (!cnt)
591 {
592 // plain ASCII char
593 if (buf)
594 *buf++ = cc;
595 len++;
596
597 // escape the escape character for octal escapes
598 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
599 && cc == '\\' && (!buf || len < n))
600 {
601 if (buf)
602 *buf++ = cc;
603 len++;
604 }
605 }
606 else
607 {
608 cnt--;
609 if (!cnt)
610 {
611 // invalid UTF-8 sequence
612 invalid = true;
613 }
614 else
615 {
616 unsigned ocnt = cnt - 1;
617 wxUint32 res = cc & (0x3f >> cnt);
618 while (cnt--)
619 {
620 cc = *psz;
621 if ((cc & 0xC0) != 0x80)
622 {
623 // invalid UTF-8 sequence
624 invalid = true;
625 break;
626 }
627 psz++;
628 res = (res << 6) | (cc & 0x3f);
629 }
630 if (invalid || res <= utf8_max[ocnt])
631 {
632 // illegal UTF-8 encoding
633 invalid = true;
634 }
635 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
636 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
637 {
638 // if one of our PUA characters turns up externally
639 // it must also be treated as an illegal sequence
640 // (a bit like you have to escape an escape character)
641 invalid = true;
642 }
643 else
644 {
645 #ifdef WC_UTF16
646 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
647 size_t pa = encode_utf16(res, (wxUint16 *)buf);
648 if (pa == (size_t)-1)
649 {
650 invalid = true;
651 }
652 else
653 {
654 if (buf)
655 buf += pa;
656 len += pa;
657 }
658 #else // !WC_UTF16
659 if (buf)
660 *buf++ = res;
661 len++;
662 #endif // WC_UTF16/!WC_UTF16
663 }
664 }
665 if (invalid)
666 {
667 if (m_options & MAP_INVALID_UTF8_TO_PUA)
668 {
669 while (opsz < psz && (!buf || len < n))
670 {
671 #ifdef WC_UTF16
672 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
673 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
674 wxASSERT(pa != (size_t)-1);
675 if (buf)
676 buf += pa;
677 opsz++;
678 len += pa;
679 #else
680 if (buf)
681 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
682 opsz++;
683 len++;
684 #endif
685 }
686 }
687 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
688 {
689 while (opsz < psz && (!buf || len < n))
690 {
691 if ( buf && len + 3 < n )
692 {
693 unsigned char n = *opsz;
694 *buf++ = L'\\';
695 *buf++ = (wchar_t)( L'0' + n / 0100 );
696 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
697 *buf++ = (wchar_t)( L'0' + n % 010 );
698 }
699 opsz++;
700 len += 4;
701 }
702 }
703 else // MAP_INVALID_UTF8_NOT
704 {
705 return (size_t)-1;
706 }
707 }
708 }
709 }
710 if (buf && (len < n))
711 *buf = 0;
712 return len;
713 }
714
// Returns true if wch is one of the octal digits, i.e. L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
719
720 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
721 {
722 size_t len = 0;
723
724 while (*psz && ((!buf) || (len < n)))
725 {
726 wxUint32 cc;
727 #ifdef WC_UTF16
728 // cast is ok for WC_UTF16
729 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
730 psz += (pa == (size_t)-1) ? 1 : pa;
731 #else
732 cc=(*psz++) & 0x7fffffff;
733 #endif
734
735 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
736 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
737 {
738 if (buf)
739 *buf++ = (char)(cc - wxUnicodePUA);
740 len++;
741 }
742 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
743 && cc == L'\\' && psz[0] == L'\\' )
744 {
745 if (buf)
746 *buf++ = (char)cc;
747 psz++;
748 len++;
749 }
750 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
751 cc == L'\\' &&
752 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
753 {
754 if (buf)
755 {
756 *buf++ = (char) ((psz[0] - L'0')*0100 +
757 (psz[1] - L'0')*010 +
758 (psz[2] - L'0'));
759 }
760
761 psz += 3;
762 len++;
763 }
764 else
765 {
766 unsigned cnt;
767 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
768 if (!cnt)
769 {
770 // plain ASCII char
771 if (buf)
772 *buf++ = (char) cc;
773 len++;
774 }
775
776 else
777 {
778 len += cnt + 1;
779 if (buf)
780 {
781 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
782 while (cnt--)
783 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
784 }
785 }
786 }
787 }
788
789 if (buf && (len<n))
790 *buf = 0;
791
792 return len;
793 }
794
795 // ----------------------------------------------------------------------------
796 // UTF-16
797 // ----------------------------------------------------------------------------
798
// "straight" is the UTF-16 flavour selected for machines matching its byte
// order and "swap" the other one: the conversion code below is written only
// once in terms of these two names
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap wxMBConvUTF16LE
#else // little endian
    #define wxMBConvUTF16swap wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif // WORDS_BIGENDIAN/!WORDS_BIGENDIAN
806
807
808 #ifdef WC_UTF16
809
810 // copy 16bit MB to 16bit String
811 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
812 {
813 size_t len=0;
814
815 while (*(wxUint16*)psz && (!buf || len < n))
816 {
817 if (buf)
818 *buf++ = *(wxUint16*)psz;
819 len++;
820
821 psz += sizeof(wxUint16);
822 }
823 if (buf && len<n) *buf=0;
824
825 return len;
826 }
827
828
829 // copy 16bit String to 16bit MB
830 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
831 {
832 size_t len=0;
833
834 while (*psz && (!buf || len < n))
835 {
836 if (buf)
837 {
838 *(wxUint16*)buf = *psz;
839 buf += sizeof(wxUint16);
840 }
841 len += sizeof(wxUint16);
842 psz++;
843 }
844 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
845
846 return len;
847 }
848
849
850 // swap 16bit MB to 16bit String
851 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
852 {
853 size_t len=0;
854
855 while (*(wxUint16*)psz && (!buf || len < n))
856 {
857 if (buf)
858 {
859 ((char *)buf)[0] = psz[1];
860 ((char *)buf)[1] = psz[0];
861 buf++;
862 }
863 len++;
864 psz += sizeof(wxUint16);
865 }
866 if (buf && len<n) *buf=0;
867
868 return len;
869 }
870
871
872 // swap 16bit MB to 16bit String
873 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
874 {
875 size_t len=0;
876
877 while (*psz && (!buf || len < n))
878 {
879 if (buf)
880 {
881 *buf++ = ((char*)psz)[1];
882 *buf++ = ((char*)psz)[0];
883 }
884 len += sizeof(wxUint16);
885 psz++;
886 }
887 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
888
889 return len;
890 }
891
892
893 #else // WC_UTF16
894
895
896 // copy 16bit MB to 32bit String
897 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
898 {
899 size_t len=0;
900
901 while (*(wxUint16*)psz && (!buf || len < n))
902 {
903 wxUint32 cc;
904 size_t pa=decode_utf16((wxUint16*)psz, cc);
905 if (pa == (size_t)-1)
906 return pa;
907
908 if (buf)
909 *buf++ = cc;
910 len++;
911 psz += pa * sizeof(wxUint16);
912 }
913 if (buf && len<n) *buf=0;
914
915 return len;
916 }
917
918
919 // copy 32bit String to 16bit MB
920 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
921 {
922 size_t len=0;
923
924 while (*psz && (!buf || len < n))
925 {
926 wxUint16 cc[2];
927 size_t pa=encode_utf16(*psz, cc);
928
929 if (pa == (size_t)-1)
930 return pa;
931
932 if (buf)
933 {
934 *(wxUint16*)buf = cc[0];
935 buf += sizeof(wxUint16);
936 if (pa > 1)
937 {
938 *(wxUint16*)buf = cc[1];
939 buf += sizeof(wxUint16);
940 }
941 }
942
943 len += pa*sizeof(wxUint16);
944 psz++;
945 }
946 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
947
948 return len;
949 }
950
951
952 // swap 16bit MB to 32bit String
953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
954 {
955 size_t len=0;
956
957 while (*(wxUint16*)psz && (!buf || len < n))
958 {
959 wxUint32 cc;
960 char tmp[4];
961 tmp[0]=psz[1]; tmp[1]=psz[0];
962 tmp[2]=psz[3]; tmp[3]=psz[2];
963
964 size_t pa=decode_utf16((wxUint16*)tmp, cc);
965 if (pa == (size_t)-1)
966 return pa;
967
968 if (buf)
969 *buf++ = cc;
970
971 len++;
972 psz += pa * sizeof(wxUint16);
973 }
974 if (buf && len<n) *buf=0;
975
976 return len;
977 }
978
979
980 // swap 32bit String to 16bit MB
981 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
982 {
983 size_t len=0;
984
985 while (*psz && (!buf || len < n))
986 {
987 wxUint16 cc[2];
988 size_t pa=encode_utf16(*psz, cc);
989
990 if (pa == (size_t)-1)
991 return pa;
992
993 if (buf)
994 {
995 *buf++ = ((char*)cc)[1];
996 *buf++ = ((char*)cc)[0];
997 if (pa > 1)
998 {
999 *buf++ = ((char*)cc)[3];
1000 *buf++ = ((char*)cc)[2];
1001 }
1002 }
1003
1004 len += pa*sizeof(wxUint16);
1005 psz++;
1006 }
1007 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1008
1009 return len;
1010 }
1011
1012 #endif // WC_UTF16
1013
1014
1015 // ----------------------------------------------------------------------------
1016 // UTF-32
1017 // ----------------------------------------------------------------------------
1018
// "straight" is the UTF-32 flavour selected for machines matching its byte
// order and "swap" the other one: the conversion code below is written only
// once in terms of these two names
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap wxMBConvUTF32LE
#else // little endian
    #define wxMBConvUTF32swap wxMBConvUTF32BE
    #define wxMBConvUTF32straight wxMBConvUTF32LE
#endif // WORDS_BIGENDIAN/!WORDS_BIGENDIAN
1026
1027
1028 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1029 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1030
1031
1032 #ifdef WC_UTF16
1033
1034 // copy 32bit MB to 16bit String
1035 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1036 {
1037 size_t len=0;
1038
1039 while (*(wxUint32*)psz && (!buf || len < n))
1040 {
1041 wxUint16 cc[2];
1042
1043 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1044 if (pa == (size_t)-1)
1045 return pa;
1046
1047 if (buf)
1048 {
1049 *buf++ = cc[0];
1050 if (pa > 1)
1051 *buf++ = cc[1];
1052 }
1053 len += pa;
1054 psz += sizeof(wxUint32);
1055 }
1056 if (buf && len<n) *buf=0;
1057
1058 return len;
1059 }
1060
1061
1062 // copy 16bit String to 32bit MB
1063 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1064 {
1065 size_t len=0;
1066
1067 while (*psz && (!buf || len < n))
1068 {
1069 wxUint32 cc;
1070
1071 // cast is ok for WC_UTF16
1072 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1073 if (pa == (size_t)-1)
1074 return pa;
1075
1076 if (buf)
1077 {
1078 *(wxUint32*)buf = cc;
1079 buf += sizeof(wxUint32);
1080 }
1081 len += sizeof(wxUint32);
1082 psz += pa;
1083 }
1084
1085 if (buf && len<=n-sizeof(wxUint32))
1086 *(wxUint32*)buf=0;
1087
1088 return len;
1089 }
1090
1091
1092
1093 // swap 32bit MB to 16bit String
1094 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1095 {
1096 size_t len=0;
1097
1098 while (*(wxUint32*)psz && (!buf || len < n))
1099 {
1100 char tmp[4];
1101 tmp[0] = psz[3]; tmp[1] = psz[2];
1102 tmp[2] = psz[1]; tmp[3] = psz[0];
1103
1104
1105 wxUint16 cc[2];
1106
1107 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1108 if (pa == (size_t)-1)
1109 return pa;
1110
1111 if (buf)
1112 {
1113 *buf++ = cc[0];
1114 if (pa > 1)
1115 *buf++ = cc[1];
1116 }
1117 len += pa;
1118 psz += sizeof(wxUint32);
1119 }
1120
1121 if (buf && len<n)
1122 *buf=0;
1123
1124 return len;
1125 }
1126
1127
1128 // swap 16bit String to 32bit MB
1129 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1130 {
1131 size_t len=0;
1132
1133 while (*psz && (!buf || len < n))
1134 {
1135 char cc[4];
1136
1137 // cast is ok for WC_UTF16
1138 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1139 if (pa == (size_t)-1)
1140 return pa;
1141
1142 if (buf)
1143 {
1144 *buf++ = cc[3];
1145 *buf++ = cc[2];
1146 *buf++ = cc[1];
1147 *buf++ = cc[0];
1148 }
1149 len += sizeof(wxUint32);
1150 psz += pa;
1151 }
1152
1153 if (buf && len<=n-sizeof(wxUint32))
1154 *(wxUint32*)buf=0;
1155
1156 return len;
1157 }
1158
1159 #else // WC_UTF16
1160
1161
1162 // copy 32bit MB to 32bit String
1163 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1164 {
1165 size_t len=0;
1166
1167 while (*(wxUint32*)psz && (!buf || len < n))
1168 {
1169 if (buf)
1170 *buf++ = *(wxUint32*)psz;
1171 len++;
1172 psz += sizeof(wxUint32);
1173 }
1174
1175 if (buf && len<n)
1176 *buf=0;
1177
1178 return len;
1179 }
1180
1181
1182 // copy 32bit String to 32bit MB
1183 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1184 {
1185 size_t len=0;
1186
1187 while (*psz && (!buf || len < n))
1188 {
1189 if (buf)
1190 {
1191 *(wxUint32*)buf = *psz;
1192 buf += sizeof(wxUint32);
1193 }
1194
1195 len += sizeof(wxUint32);
1196 psz++;
1197 }
1198
1199 if (buf && len<=n-sizeof(wxUint32))
1200 *(wxUint32*)buf=0;
1201
1202 return len;
1203 }
1204
1205
1206 // swap 32bit MB to 32bit String
1207 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1208 {
1209 size_t len=0;
1210
1211 while (*(wxUint32*)psz && (!buf || len < n))
1212 {
1213 if (buf)
1214 {
1215 ((char *)buf)[0] = psz[3];
1216 ((char *)buf)[1] = psz[2];
1217 ((char *)buf)[2] = psz[1];
1218 ((char *)buf)[3] = psz[0];
1219 buf++;
1220 }
1221 len++;
1222 psz += sizeof(wxUint32);
1223 }
1224
1225 if (buf && len<n)
1226 *buf=0;
1227
1228 return len;
1229 }
1230
1231
1232 // swap 32bit String to 32bit MB
1233 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1234 {
1235 size_t len=0;
1236
1237 while (*psz && (!buf || len < n))
1238 {
1239 if (buf)
1240 {
1241 *buf++ = ((char *)psz)[3];
1242 *buf++ = ((char *)psz)[2];
1243 *buf++ = ((char *)psz)[1];
1244 *buf++ = ((char *)psz)[0];
1245 }
1246 len += sizeof(wxUint32);
1247 psz++;
1248 }
1249
1250 if (buf && len<=n-sizeof(wxUint32))
1251 *(wxUint32*)buf=0;
1252
1253 return len;
1254 }
1255
1256
1257 #endif // WC_UTF16
1258
1259
1260 // ============================================================================
1261 // The classes doing conversion using the iconv_xxx() functions
1262 // ============================================================================
1263
1264 #ifdef HAVE_ICONV
1265
1266 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1267 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1268 // (unless there's yet another bug in glibc) the only case when iconv()
1269 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1270 // left in the input buffer -- when _real_ error occurs,
1271 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1272 // iconv() failure.
1273 // [This bug does not appear in glibc 2.2.]
1274 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1275 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1276 (errno != E2BIG || bufLeft != 0))
1277 #else
1278 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1279 #endif
1280
1281 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1282
1283 #define ICONV_T_INVALID ((iconv_t)-1)
1284
1285 #if SIZEOF_WCHAR_T == 4
1286 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1287 #define WC_ENC wxFONTENCODING_UTF32
1288 #elif SIZEOF_WCHAR_T == 2
1289 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1290 #define WC_ENC wxFONTENCODING_UTF16
1291 #else // sizeof(wchar_t) != 2 nor 4
1292 // does this ever happen?
1293 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1294 #endif
1295
1296 // ----------------------------------------------------------------------------
1297 // wxMBConv_iconv: encapsulates an iconv character set
1298 // ----------------------------------------------------------------------------
1299
1300 class wxMBConv_iconv : public wxMBConv
1301 {
1302 public:
1303 wxMBConv_iconv(const wxChar *name);
1304 virtual ~wxMBConv_iconv();
1305
1306 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1307 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1308
1309 bool IsOk() const
1310 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1311
1312 protected:
1313 // the iconv handlers used to translate from multibyte to wide char and in
1314 // the other direction
1315 iconv_t m2w,
1316 w2m;
1317 #if wxUSE_THREADS
1318 // guards access to m2w and w2m objects
1319 wxMutex m_iconvMutex;
1320 #endif
1321
1322 private:
1323 // the name (for iconv_open()) of a wide char charset -- if none is
1324 // available on this machine, it will remain NULL
1325 static wxString ms_wcCharsetName;
1326
1327 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1328 // different endian-ness than the native one
1329 static bool ms_wcNeedsSwap;
1330 };
1331
1332 // make the constructor available for unit testing
1333 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1334 {
1335 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1336 if ( !result->IsOk() )
1337 {
1338 delete result;
1339 return 0;
1340 }
1341 return result;
1342 }
1343
1344 wxString wxMBConv_iconv::ms_wcCharsetName;
1345 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1346
1347 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1348 {
1349 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1350 // names for the charsets
1351 const wxCharBuffer cname(wxString(name).ToAscii());
1352
1353 // check for charset that represents wchar_t:
1354 if ( ms_wcCharsetName.empty() )
1355 {
1356 #if wxUSE_FONTMAP
1357 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1358 #else // !wxUSE_FONTMAP
1359 static const wxChar *names[] =
1360 {
1361 #if SIZEOF_WCHAR_T == 4
1362 _T("UCS-4"),
1363 #elif SIZEOF_WCHAR_T = 2
1364 _T("UCS-2"),
1365 #endif
1366 NULL
1367 };
1368 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1369
1370 for ( ; *names; ++names )
1371 {
1372 const wxString name(*names);
1373
1374 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1375 wxString nameXE(name);
1376 #ifdef WORDS_BIGENDIAN
1377 nameXE += _T("BE");
1378 #else // little endian
1379 nameXE += _T("LE");
1380 #endif
1381
1382 m2w = iconv_open(nameXE.ToAscii(), cname);
1383 if ( m2w == ICONV_T_INVALID )
1384 {
1385 // try charset w/o bytesex info (e.g. "UCS4")
1386 m2w = iconv_open(name.ToAscii(), cname);
1387
1388 // and check for bytesex ourselves:
1389 if ( m2w != ICONV_T_INVALID )
1390 {
1391 char buf[2], *bufPtr;
1392 wchar_t wbuf[2], *wbufPtr;
1393 size_t insz, outsz;
1394 size_t res;
1395
1396 buf[0] = 'A';
1397 buf[1] = 0;
1398 wbuf[0] = 0;
1399 insz = 2;
1400 outsz = SIZEOF_WCHAR_T * 2;
1401 wbufPtr = wbuf;
1402 bufPtr = buf;
1403
1404 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1405 (char**)&wbufPtr, &outsz);
1406
1407 if (ICONV_FAILED(res, insz))
1408 {
1409 wxLogLastError(wxT("iconv"));
1410 wxLogError(_("Conversion to charset '%s' doesn't work."),
1411 name.c_str());
1412 }
1413 else // ok, can convert to this encoding, remember it
1414 {
1415 ms_wcCharsetName = name;
1416 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1417 }
1418 }
1419 }
1420 else // use charset not requiring byte swapping
1421 {
1422 ms_wcCharsetName = nameXE;
1423 }
1424 }
1425
1426 wxLogTrace(TRACE_STRCONV,
1427 wxT("iconv wchar_t charset is \"%s\"%s"),
1428 ms_wcCharsetName.empty() ? _T("<none>")
1429 : ms_wcCharsetName.c_str(),
1430 ms_wcNeedsSwap ? _T(" (needs swap)")
1431 : _T(""));
1432 }
1433 else // we already have ms_wcCharsetName
1434 {
1435 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1436 }
1437
1438 if ( ms_wcCharsetName.empty() )
1439 {
1440 w2m = ICONV_T_INVALID;
1441 }
1442 else
1443 {
1444 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1445 if ( w2m == ICONV_T_INVALID )
1446 {
1447 wxLogTrace(TRACE_STRCONV,
1448 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1449 ms_wcCharsetName.c_str(), cname.data());
1450 }
1451 }
1452 }
1453
1454 wxMBConv_iconv::~wxMBConv_iconv()
1455 {
1456 if ( m2w != ICONV_T_INVALID )
1457 iconv_close(m2w);
1458 if ( w2m != ICONV_T_INVALID )
1459 iconv_close(w2m);
1460 }
1461
1462 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1463 {
1464 #if wxUSE_THREADS
1465 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1466 // Unfortunately there is a couple of global wxCSConv objects such as
1467 // wxConvLocal that are used all over wx code, so we have to make sure
1468 // the handle is used by at most one thread at the time. Otherwise
1469 // only a few wx classes would be safe to use from non-main threads
1470 // as MB<->WC conversion would fail "randomly".
1471 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1472 #endif
1473
1474 size_t inbuf = strlen(psz);
1475 size_t outbuf = n * SIZEOF_WCHAR_T;
1476 size_t res, cres;
1477 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1478 wchar_t *bufPtr = buf;
1479 const char *pszPtr = psz;
1480
1481 if (buf)
1482 {
1483 // have destination buffer, convert there
1484 cres = iconv(m2w,
1485 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1486 (char**)&bufPtr, &outbuf);
1487 res = n - (outbuf / SIZEOF_WCHAR_T);
1488
1489 if (ms_wcNeedsSwap)
1490 {
1491 // convert to native endianness
1492 for ( unsigned n = 0; n < res; n++ )
1493 buf[n] = WC_BSWAP(buf[n]);
1494 }
1495
1496 // NB: iconv was given only strlen(psz) characters on input, and so
1497 // it couldn't convert the trailing zero. Let's do it ourselves
1498 // if there's some room left for it in the output buffer.
1499 if (res < n)
1500 buf[res] = 0;
1501 }
1502 else
1503 {
1504 // no destination buffer... convert using temp buffer
1505 // to calculate destination buffer requirement
1506 wchar_t tbuf[8];
1507 res = 0;
1508 do {
1509 bufPtr = tbuf;
1510 outbuf = 8*SIZEOF_WCHAR_T;
1511
1512 cres = iconv(m2w,
1513 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1514 (char**)&bufPtr, &outbuf );
1515
1516 res += 8-(outbuf/SIZEOF_WCHAR_T);
1517 } while ((cres==(size_t)-1) && (errno==E2BIG));
1518 }
1519
1520 if (ICONV_FAILED(cres, inbuf))
1521 {
1522 //VS: it is ok if iconv fails, hence trace only
1523 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1524 return (size_t)-1;
1525 }
1526
1527 return res;
1528 }
1529
1530 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1531 {
1532 #if wxUSE_THREADS
1533 // NB: explained in MB2WC
1534 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1535 #endif
1536
1537 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1538 size_t outbuf = n;
1539 size_t res, cres;
1540
1541 wchar_t *tmpbuf = 0;
1542
1543 if (ms_wcNeedsSwap)
1544 {
1545 // need to copy to temp buffer to switch endianness
1546 // (doing WC_BSWAP twice on the original buffer won't help, as it
1547 // could be in read-only memory, or be accessed in some other thread)
1548 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1549 for ( size_t n = 0; n < inbuf; n++ )
1550 tmpbuf[n] = WC_BSWAP(psz[n]);
1551 tmpbuf[inbuf] = L'\0';
1552 psz = tmpbuf;
1553 }
1554
1555 if (buf)
1556 {
1557 // have destination buffer, convert there
1558 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1559
1560 res = n-outbuf;
1561
1562 // NB: iconv was given only wcslen(psz) characters on input, and so
1563 // it couldn't convert the trailing zero. Let's do it ourselves
1564 // if there's some room left for it in the output buffer.
1565 if (res < n)
1566 buf[0] = 0;
1567 }
1568 else
1569 {
1570 // no destination buffer... convert using temp buffer
1571 // to calculate destination buffer requirement
1572 char tbuf[16];
1573 res = 0;
1574 do {
1575 buf = tbuf; outbuf = 16;
1576
1577 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1578
1579 res += 16 - outbuf;
1580 } while ((cres==(size_t)-1) && (errno==E2BIG));
1581 }
1582
1583 if (ms_wcNeedsSwap)
1584 {
1585 free(tmpbuf);
1586 }
1587
1588 if (ICONV_FAILED(cres, inbuf))
1589 {
1590 //VS: it is ok if iconv fails, hence trace only
1591 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1592 return (size_t)-1;
1593 }
1594
1595 return res;
1596 }
1597
1598 #endif // HAVE_ICONV
1599
1600
1601 // ============================================================================
1602 // Win32 conversion classes
1603 // ============================================================================
1604
1605 #ifdef wxHAVE_WIN32_MB2WC
1606
1607 // from utils.cpp
1608 #if wxUSE_FONTMAP
1609 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1610 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1611 #endif
1612
1613 class wxMBConv_win32 : public wxMBConv
1614 {
1615 public:
1616 wxMBConv_win32()
1617 {
1618 m_CodePage = CP_ACP;
1619 }
1620
1621 #if wxUSE_FONTMAP
1622 wxMBConv_win32(const wxChar* name)
1623 {
1624 m_CodePage = wxCharsetToCodepage(name);
1625 }
1626
1627 wxMBConv_win32(wxFontEncoding encoding)
1628 {
1629 m_CodePage = wxEncodingToCodepage(encoding);
1630 }
1631 #endif
1632
1633 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1634 {
1635 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1636 // the behaviour is not compatible with the Unix version (using iconv)
1637 // and break the library itself, e.g. wxTextInputStream::NextChar()
1638 // wouldn't work if reading an incomplete MB char didn't result in an
1639 // error
1640 //
1641 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1642 // an error (tested under Windows Server 2003) and apparently it is
1643 // done on purpose, i.e. the function accepts any input in this case
1644 // and although I'd prefer to return error on ill-formed output, our
1645 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1646 // explicitly ill-formed according to RFC 2152) neither so we don't
1647 // even have any fallback here...
1648 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1649
1650 const size_t len = ::MultiByteToWideChar
1651 (
1652 m_CodePage, // code page
1653 flags, // flags: fall on error
1654 psz, // input string
1655 -1, // its length (NUL-terminated)
1656 buf, // output string
1657 buf ? n : 0 // size of output buffer
1658 );
1659
1660 // note that it returns count of written chars for buf != NULL and size
1661 // of the needed buffer for buf == NULL so in either case the length of
1662 // the string (which never includes the terminating NUL) is one less
1663 return len ? len - 1 : (size_t)-1;
1664 }
1665
1666 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1667 {
1668 /*
1669 we have a problem here: by default, WideCharToMultiByte() may
1670 replace characters unrepresentable in the target code page with bad
1671 quality approximations such as turning "1/2" symbol (U+00BD) into
1672 "1" for the code pages which don't have it and we, obviously, want
1673 to avoid this at any price
1674
1675 the trouble is that this function does it _silently_, i.e. it won't
1676 even tell us whether it did or not... Win98/2000 and higher provide
1677 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1678 we have to resort to a round trip, i.e. check that converting back
1679 results in the same string -- this is, of course, expensive but
1680 otherwise we simply can't be sure to not garble the data.
1681 */
1682
1683 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1684 // it doesn't work with CJK encodings (which we test for rather roughly
1685 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1686 // supporting it
1687 BOOL usedDef wxDUMMY_INITIALIZE(false);
1688 BOOL *pUsedDef;
1689 int flags;
1690 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1691 {
1692 // it's our lucky day
1693 flags = WC_NO_BEST_FIT_CHARS;
1694 pUsedDef = &usedDef;
1695 }
1696 else // old system or unsupported encoding
1697 {
1698 flags = 0;
1699 pUsedDef = NULL;
1700 }
1701
1702 const size_t len = ::WideCharToMultiByte
1703 (
1704 m_CodePage, // code page
1705 flags, // either none or no best fit
1706 pwz, // input string
1707 -1, // it is (wide) NUL-terminated
1708 buf, // output buffer
1709 buf ? n : 0, // and its size
1710 NULL, // default "replacement" char
1711 pUsedDef // [out] was it used?
1712 );
1713
1714 if ( !len )
1715 {
1716 // function totally failed
1717 return (size_t)-1;
1718 }
1719
1720 // if we were really converting, check if we succeeded
1721 if ( buf )
1722 {
1723 if ( flags )
1724 {
1725 // check if the conversion failed, i.e. if any replacements
1726 // were done
1727 if ( usedDef )
1728 return (size_t)-1;
1729 }
1730 else // we must resort to double tripping...
1731 {
1732 wxWCharBuffer wcBuf(n);
1733 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1734 wcscmp(wcBuf, pwz) != 0 )
1735 {
1736 // we didn't obtain the same thing we started from, hence
1737 // the conversion was lossy and we consider that it failed
1738 return (size_t)-1;
1739 }
1740 }
1741 }
1742
1743 // see the comment above for the reason of "len - 1"
1744 return len - 1;
1745 }
1746
1747 bool IsOk() const { return m_CodePage != -1; }
1748
1749 private:
1750 static bool CanUseNoBestFit()
1751 {
1752 static int s_isWin98Or2k = -1;
1753
1754 if ( s_isWin98Or2k == -1 )
1755 {
1756 int verMaj, verMin;
1757 switch ( wxGetOsVersion(&verMaj, &verMin) )
1758 {
1759 case wxWIN95:
1760 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1761 break;
1762
1763 case wxWINDOWS_NT:
1764 s_isWin98Or2k = verMaj >= 5;
1765 break;
1766
1767 default:
1768 // unknown, be conseravtive by default
1769 s_isWin98Or2k = 0;
1770 }
1771
1772 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1773 }
1774
1775 return s_isWin98Or2k == 1;
1776 }
1777
1778 long m_CodePage;
1779 };
1780
1781 #endif // wxHAVE_WIN32_MB2WC
1782
1783 // ============================================================================
1784 // Cocoa conversion classes
1785 // ============================================================================
1786
1787 #if defined(__WXCOCOA__)
1788
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1792
1793 #include <CoreFoundation/CFString.h>
1794 #include <CoreFoundation/CFStringEncodingExt.h>
1795
1796 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1797 {
1798 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1799 if ( encoding == wxFONTENCODING_DEFAULT )
1800 {
1801 enc = CFStringGetSystemEncoding();
1802 }
1803 else switch( encoding)
1804 {
1805 case wxFONTENCODING_ISO8859_1 :
1806 enc = kCFStringEncodingISOLatin1 ;
1807 break ;
1808 case wxFONTENCODING_ISO8859_2 :
1809 enc = kCFStringEncodingISOLatin2;
1810 break ;
1811 case wxFONTENCODING_ISO8859_3 :
1812 enc = kCFStringEncodingISOLatin3 ;
1813 break ;
1814 case wxFONTENCODING_ISO8859_4 :
1815 enc = kCFStringEncodingISOLatin4;
1816 break ;
1817 case wxFONTENCODING_ISO8859_5 :
1818 enc = kCFStringEncodingISOLatinCyrillic;
1819 break ;
1820 case wxFONTENCODING_ISO8859_6 :
1821 enc = kCFStringEncodingISOLatinArabic;
1822 break ;
1823 case wxFONTENCODING_ISO8859_7 :
1824 enc = kCFStringEncodingISOLatinGreek;
1825 break ;
1826 case wxFONTENCODING_ISO8859_8 :
1827 enc = kCFStringEncodingISOLatinHebrew;
1828 break ;
1829 case wxFONTENCODING_ISO8859_9 :
1830 enc = kCFStringEncodingISOLatin5;
1831 break ;
1832 case wxFONTENCODING_ISO8859_10 :
1833 enc = kCFStringEncodingISOLatin6;
1834 break ;
1835 case wxFONTENCODING_ISO8859_11 :
1836 enc = kCFStringEncodingISOLatinThai;
1837 break ;
1838 case wxFONTENCODING_ISO8859_13 :
1839 enc = kCFStringEncodingISOLatin7;
1840 break ;
1841 case wxFONTENCODING_ISO8859_14 :
1842 enc = kCFStringEncodingISOLatin8;
1843 break ;
1844 case wxFONTENCODING_ISO8859_15 :
1845 enc = kCFStringEncodingISOLatin9;
1846 break ;
1847
1848 case wxFONTENCODING_KOI8 :
1849 enc = kCFStringEncodingKOI8_R;
1850 break ;
1851 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1852 enc = kCFStringEncodingDOSRussian;
1853 break ;
1854
1855 // case wxFONTENCODING_BULGARIAN :
1856 // enc = ;
1857 // break ;
1858
1859 case wxFONTENCODING_CP437 :
1860 enc =kCFStringEncodingDOSLatinUS ;
1861 break ;
1862 case wxFONTENCODING_CP850 :
1863 enc = kCFStringEncodingDOSLatin1;
1864 break ;
1865 case wxFONTENCODING_CP852 :
1866 enc = kCFStringEncodingDOSLatin2;
1867 break ;
1868 case wxFONTENCODING_CP855 :
1869 enc = kCFStringEncodingDOSCyrillic;
1870 break ;
1871 case wxFONTENCODING_CP866 :
1872 enc =kCFStringEncodingDOSRussian ;
1873 break ;
1874 case wxFONTENCODING_CP874 :
1875 enc = kCFStringEncodingDOSThai;
1876 break ;
1877 case wxFONTENCODING_CP932 :
1878 enc = kCFStringEncodingDOSJapanese;
1879 break ;
1880 case wxFONTENCODING_CP936 :
1881 enc =kCFStringEncodingDOSChineseSimplif ;
1882 break ;
1883 case wxFONTENCODING_CP949 :
1884 enc = kCFStringEncodingDOSKorean;
1885 break ;
1886 case wxFONTENCODING_CP950 :
1887 enc = kCFStringEncodingDOSChineseTrad;
1888 break ;
1889 case wxFONTENCODING_CP1250 :
1890 enc = kCFStringEncodingWindowsLatin2;
1891 break ;
1892 case wxFONTENCODING_CP1251 :
1893 enc =kCFStringEncodingWindowsCyrillic ;
1894 break ;
1895 case wxFONTENCODING_CP1252 :
1896 enc =kCFStringEncodingWindowsLatin1 ;
1897 break ;
1898 case wxFONTENCODING_CP1253 :
1899 enc = kCFStringEncodingWindowsGreek;
1900 break ;
1901 case wxFONTENCODING_CP1254 :
1902 enc = kCFStringEncodingWindowsLatin5;
1903 break ;
1904 case wxFONTENCODING_CP1255 :
1905 enc =kCFStringEncodingWindowsHebrew ;
1906 break ;
1907 case wxFONTENCODING_CP1256 :
1908 enc =kCFStringEncodingWindowsArabic ;
1909 break ;
1910 case wxFONTENCODING_CP1257 :
1911 enc = kCFStringEncodingWindowsBalticRim;
1912 break ;
1913 // This only really encodes to UTF7 (if that) evidently
1914 // case wxFONTENCODING_UTF7 :
1915 // enc = kCFStringEncodingNonLossyASCII ;
1916 // break ;
1917 case wxFONTENCODING_UTF8 :
1918 enc = kCFStringEncodingUTF8 ;
1919 break ;
1920 case wxFONTENCODING_EUC_JP :
1921 enc = kCFStringEncodingEUC_JP;
1922 break ;
1923 case wxFONTENCODING_UTF16 :
1924 enc = kCFStringEncodingUnicode ;
1925 break ;
1926 case wxFONTENCODING_MACROMAN :
1927 enc = kCFStringEncodingMacRoman ;
1928 break ;
1929 case wxFONTENCODING_MACJAPANESE :
1930 enc = kCFStringEncodingMacJapanese ;
1931 break ;
1932 case wxFONTENCODING_MACCHINESETRAD :
1933 enc = kCFStringEncodingMacChineseTrad ;
1934 break ;
1935 case wxFONTENCODING_MACKOREAN :
1936 enc = kCFStringEncodingMacKorean ;
1937 break ;
1938 case wxFONTENCODING_MACARABIC :
1939 enc = kCFStringEncodingMacArabic ;
1940 break ;
1941 case wxFONTENCODING_MACHEBREW :
1942 enc = kCFStringEncodingMacHebrew ;
1943 break ;
1944 case wxFONTENCODING_MACGREEK :
1945 enc = kCFStringEncodingMacGreek ;
1946 break ;
1947 case wxFONTENCODING_MACCYRILLIC :
1948 enc = kCFStringEncodingMacCyrillic ;
1949 break ;
1950 case wxFONTENCODING_MACDEVANAGARI :
1951 enc = kCFStringEncodingMacDevanagari ;
1952 break ;
1953 case wxFONTENCODING_MACGURMUKHI :
1954 enc = kCFStringEncodingMacGurmukhi ;
1955 break ;
1956 case wxFONTENCODING_MACGUJARATI :
1957 enc = kCFStringEncodingMacGujarati ;
1958 break ;
1959 case wxFONTENCODING_MACORIYA :
1960 enc = kCFStringEncodingMacOriya ;
1961 break ;
1962 case wxFONTENCODING_MACBENGALI :
1963 enc = kCFStringEncodingMacBengali ;
1964 break ;
1965 case wxFONTENCODING_MACTAMIL :
1966 enc = kCFStringEncodingMacTamil ;
1967 break ;
1968 case wxFONTENCODING_MACTELUGU :
1969 enc = kCFStringEncodingMacTelugu ;
1970 break ;
1971 case wxFONTENCODING_MACKANNADA :
1972 enc = kCFStringEncodingMacKannada ;
1973 break ;
1974 case wxFONTENCODING_MACMALAJALAM :
1975 enc = kCFStringEncodingMacMalayalam ;
1976 break ;
1977 case wxFONTENCODING_MACSINHALESE :
1978 enc = kCFStringEncodingMacSinhalese ;
1979 break ;
1980 case wxFONTENCODING_MACBURMESE :
1981 enc = kCFStringEncodingMacBurmese ;
1982 break ;
1983 case wxFONTENCODING_MACKHMER :
1984 enc = kCFStringEncodingMacKhmer ;
1985 break ;
1986 case wxFONTENCODING_MACTHAI :
1987 enc = kCFStringEncodingMacThai ;
1988 break ;
1989 case wxFONTENCODING_MACLAOTIAN :
1990 enc = kCFStringEncodingMacLaotian ;
1991 break ;
1992 case wxFONTENCODING_MACGEORGIAN :
1993 enc = kCFStringEncodingMacGeorgian ;
1994 break ;
1995 case wxFONTENCODING_MACARMENIAN :
1996 enc = kCFStringEncodingMacArmenian ;
1997 break ;
1998 case wxFONTENCODING_MACCHINESESIMP :
1999 enc = kCFStringEncodingMacChineseSimp ;
2000 break ;
2001 case wxFONTENCODING_MACTIBETAN :
2002 enc = kCFStringEncodingMacTibetan ;
2003 break ;
2004 case wxFONTENCODING_MACMONGOLIAN :
2005 enc = kCFStringEncodingMacMongolian ;
2006 break ;
2007 case wxFONTENCODING_MACETHIOPIC :
2008 enc = kCFStringEncodingMacEthiopic ;
2009 break ;
2010 case wxFONTENCODING_MACCENTRALEUR :
2011 enc = kCFStringEncodingMacCentralEurRoman ;
2012 break ;
2013 case wxFONTENCODING_MACVIATNAMESE :
2014 enc = kCFStringEncodingMacVietnamese ;
2015 break ;
2016 case wxFONTENCODING_MACARABICEXT :
2017 enc = kCFStringEncodingMacExtArabic ;
2018 break ;
2019 case wxFONTENCODING_MACSYMBOL :
2020 enc = kCFStringEncodingMacSymbol ;
2021 break ;
2022 case wxFONTENCODING_MACDINGBATS :
2023 enc = kCFStringEncodingMacDingbats ;
2024 break ;
2025 case wxFONTENCODING_MACTURKISH :
2026 enc = kCFStringEncodingMacTurkish ;
2027 break ;
2028 case wxFONTENCODING_MACCROATIAN :
2029 enc = kCFStringEncodingMacCroatian ;
2030 break ;
2031 case wxFONTENCODING_MACICELANDIC :
2032 enc = kCFStringEncodingMacIcelandic ;
2033 break ;
2034 case wxFONTENCODING_MACROMANIAN :
2035 enc = kCFStringEncodingMacRomanian ;
2036 break ;
2037 case wxFONTENCODING_MACCELTIC :
2038 enc = kCFStringEncodingMacCeltic ;
2039 break ;
2040 case wxFONTENCODING_MACGAELIC :
2041 enc = kCFStringEncodingMacGaelic ;
2042 break ;
2043 // case wxFONTENCODING_MACKEYBOARD :
2044 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2045 // break ;
2046 default :
2047 // because gcc is picky
2048 break ;
2049 } ;
2050 return enc ;
2051 }
2052
2053 class wxMBConv_cocoa : public wxMBConv
2054 {
2055 public:
2056 wxMBConv_cocoa()
2057 {
2058 Init(CFStringGetSystemEncoding()) ;
2059 }
2060
2061 #if wxUSE_FONTMAP
2062 wxMBConv_cocoa(const wxChar* name)
2063 {
2064 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2065 }
2066 #endif
2067
2068 wxMBConv_cocoa(wxFontEncoding encoding)
2069 {
2070 Init( wxCFStringEncFromFontEnc(encoding) );
2071 }
2072
2073 ~wxMBConv_cocoa()
2074 {
2075 }
2076
2077 void Init( CFStringEncoding encoding)
2078 {
2079 m_encoding = encoding ;
2080 }
2081
2082 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2083 {
2084 wxASSERT(szUnConv);
2085
2086 CFStringRef theString = CFStringCreateWithBytes (
2087 NULL, //the allocator
2088 (const UInt8*)szUnConv,
2089 strlen(szUnConv),
2090 m_encoding,
2091 false //no BOM/external representation
2092 );
2093
2094 wxASSERT(theString);
2095
2096 size_t nOutLength = CFStringGetLength(theString);
2097
2098 if (szOut == NULL)
2099 {
2100 CFRelease(theString);
2101 return nOutLength;
2102 }
2103
2104 CFRange theRange = { 0, nOutSize };
2105
2106 #if SIZEOF_WCHAR_T == 4
2107 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2108 #endif
2109
2110 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2111
2112 CFRelease(theString);
2113
2114 szUniCharBuffer[nOutLength] = '\0' ;
2115
2116 #if SIZEOF_WCHAR_T == 4
2117 wxMBConvUTF16 converter ;
2118 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2119 delete[] szUniCharBuffer;
2120 #endif
2121
2122 return nOutLength;
2123 }
2124
2125 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2126 {
2127 wxASSERT(szUnConv);
2128
2129 size_t nRealOutSize;
2130 size_t nBufSize = wxWcslen(szUnConv);
2131 UniChar* szUniBuffer = (UniChar*) szUnConv;
2132
2133 #if SIZEOF_WCHAR_T == 4
2134 wxMBConvUTF16 converter ;
2135 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2136 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2137 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2138 nBufSize /= sizeof(UniChar);
2139 #endif
2140
2141 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2142 NULL, //allocator
2143 szUniBuffer,
2144 nBufSize,
2145 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2146 );
2147
2148 wxASSERT(theString);
2149
2150 //Note that CER puts a BOM when converting to unicode
2151 //so we check and use getchars instead in that case
2152 if (m_encoding == kCFStringEncodingUnicode)
2153 {
2154 if (szOut != NULL)
2155 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2156
2157 nRealOutSize = CFStringGetLength(theString) + 1;
2158 }
2159 else
2160 {
2161 CFStringGetBytes(
2162 theString,
2163 CFRangeMake(0, CFStringGetLength(theString)),
2164 m_encoding,
2165 0, //what to put in characters that can't be converted -
2166 //0 tells CFString to return NULL if it meets such a character
2167 false, //not an external representation
2168 (UInt8*) szOut,
2169 nOutSize,
2170 (CFIndex*) &nRealOutSize
2171 );
2172 }
2173
2174 CFRelease(theString);
2175
2176 #if SIZEOF_WCHAR_T == 4
2177 delete[] szUniBuffer;
2178 #endif
2179
2180 return nRealOutSize - 1;
2181 }
2182
2183 bool IsOk() const
2184 {
2185 return m_encoding != kCFStringEncodingInvalidId &&
2186 CFStringIsEncodingAvailable(m_encoding);
2187 }
2188
2189 private:
2190 CFStringEncoding m_encoding ;
2191 };
2192
2193 #endif // defined(__WXCOCOA__)
2194
2195 // ============================================================================
2196 // Mac conversion classes
2197 // ============================================================================
2198
2199 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2200
2201 class wxMBConv_mac : public wxMBConv
2202 {
2203 public:
2204 wxMBConv_mac()
2205 {
2206 Init(CFStringGetSystemEncoding()) ;
2207 }
2208
2209 #if wxUSE_FONTMAP
2210 wxMBConv_mac(const wxChar* name)
2211 {
2212 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2213 }
2214 #endif
2215
2216 wxMBConv_mac(wxFontEncoding encoding)
2217 {
2218 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2219 }
2220
2221 ~wxMBConv_mac()
2222 {
2223 OSStatus status = noErr ;
2224 status = TECDisposeConverter(m_MB2WC_converter);
2225 status = TECDisposeConverter(m_WC2MB_converter);
2226 }
2227
2228
2229 void Init( TextEncodingBase encoding)
2230 {
2231 OSStatus status = noErr ;
2232 m_char_encoding = encoding ;
2233 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2234
2235 status = TECCreateConverter(&m_MB2WC_converter,
2236 m_char_encoding,
2237 m_unicode_encoding);
2238 status = TECCreateConverter(&m_WC2MB_converter,
2239 m_unicode_encoding,
2240 m_char_encoding);
2241 }
2242
2243 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2244 {
2245 OSStatus status = noErr ;
2246 ByteCount byteOutLen ;
2247 ByteCount byteInLen = strlen(psz) ;
2248 wchar_t *tbuf = NULL ;
2249 UniChar* ubuf = NULL ;
2250 size_t res = 0 ;
2251
2252 if (buf == NULL)
2253 {
2254 //apple specs say at least 32
2255 n = wxMax( 32 , byteInLen ) ;
2256 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2257 }
2258 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2259 #if SIZEOF_WCHAR_T == 4
2260 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2261 #else
2262 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2263 #endif
2264 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2265 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2266 #if SIZEOF_WCHAR_T == 4
2267 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2268 // is not properly terminated we get random characters at the end
2269 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2270 wxMBConvUTF16 converter ;
2271 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2272 free( ubuf ) ;
2273 #else
2274 res = byteOutLen / sizeof( UniChar ) ;
2275 #endif
2276 if ( buf == NULL )
2277 free(tbuf) ;
2278
2279 if ( buf && res < n)
2280 buf[res] = 0;
2281
2282 return res ;
2283 }
2284
2285 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2286 {
2287 OSStatus status = noErr ;
2288 ByteCount byteOutLen ;
2289 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2290
2291 char *tbuf = NULL ;
2292
2293 if (buf == NULL)
2294 {
2295 //apple specs say at least 32
2296 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2297 tbuf = (char*) malloc( n ) ;
2298 }
2299
2300 ByteCount byteBufferLen = n ;
2301 UniChar* ubuf = NULL ;
2302 #if SIZEOF_WCHAR_T == 4
2303 wxMBConvUTF16 converter ;
2304 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2305 byteInLen = unicharlen ;
2306 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2307 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2308 #else
2309 ubuf = (UniChar*) psz ;
2310 #endif
2311 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2312 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2313 #if SIZEOF_WCHAR_T == 4
2314 free( ubuf ) ;
2315 #endif
2316 if ( buf == NULL )
2317 free(tbuf) ;
2318
2319 size_t res = byteOutLen ;
2320 if ( buf && res < n)
2321 {
2322 buf[res] = 0;
2323
2324 //we need to double-trip to verify it didn't insert any ? in place
2325 //of bogus characters
2326 wxWCharBuffer wcBuf(n);
2327 size_t pszlen = wxWcslen(psz);
2328 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2329 wxWcslen(wcBuf) != pszlen ||
2330 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2331 {
2332 // we didn't obtain the same thing we started from, hence
2333 // the conversion was lossy and we consider that it failed
2334 return (size_t)-1;
2335 }
2336 }
2337
2338 return res ;
2339 }
2340
2341 bool IsOk() const
2342 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2343
2344 private:
2345 TECObjectRef m_MB2WC_converter ;
2346 TECObjectRef m_WC2MB_converter ;
2347
2348 TextEncodingBase m_char_encoding ;
2349 TextEncodingBase m_unicode_encoding ;
2350 };
2351
2352 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2353
2354 // ============================================================================
2355 // wxEncodingConverter based conversion classes
2356 // ============================================================================
2357
2358 #if wxUSE_FONTMAP
2359
2360 class wxMBConv_wxwin : public wxMBConv
2361 {
2362 private:
2363 void Init()
2364 {
2365 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2366 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2367 }
2368
2369 public:
2370 // temporarily just use wxEncodingConverter stuff,
2371 // so that it works while a better implementation is built
2372 wxMBConv_wxwin(const wxChar* name)
2373 {
2374 if (name)
2375 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2376 else
2377 m_enc = wxFONTENCODING_SYSTEM;
2378
2379 Init();
2380 }
2381
2382 wxMBConv_wxwin(wxFontEncoding enc)
2383 {
2384 m_enc = enc;
2385
2386 Init();
2387 }
2388
2389 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2390 {
2391 size_t inbuf = strlen(psz);
2392 if (buf)
2393 {
2394 if (!m2w.Convert(psz,buf))
2395 return (size_t)-1;
2396 }
2397 return inbuf;
2398 }
2399
2400 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2401 {
2402 const size_t inbuf = wxWcslen(psz);
2403 if (buf)
2404 {
2405 if (!w2m.Convert(psz,buf))
2406 return (size_t)-1;
2407 }
2408
2409 return inbuf;
2410 }
2411
2412 bool IsOk() const { return m_ok; }
2413
2414 public:
2415 wxFontEncoding m_enc;
2416 wxEncodingConverter m2w, w2m;
2417
2418 // were we initialized successfully?
2419 bool m_ok;
2420
2421 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2422 };
2423
2424 // make the constructors available for unit testing
2425 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2426 {
2427 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2428 if ( !result->IsOk() )
2429 {
2430 delete result;
2431 return 0;
2432 }
2433 return result;
2434 }
2435
2436 #endif // wxUSE_FONTMAP
2437
2438 // ============================================================================
2439 // wxCSConv implementation
2440 // ============================================================================
2441
2442 void wxCSConv::Init()
2443 {
2444 m_name = NULL;
2445 m_convReal = NULL;
2446 m_deferred = true;
2447 }
2448
2449 wxCSConv::wxCSConv(const wxChar *charset)
2450 {
2451 Init();
2452
2453 if ( charset )
2454 {
2455 SetName(charset);
2456 }
2457
2458 m_encoding = wxFONTENCODING_SYSTEM;
2459 }
2460
2461 wxCSConv::wxCSConv(wxFontEncoding encoding)
2462 {
2463 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2464 {
2465 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2466
2467 encoding = wxFONTENCODING_SYSTEM;
2468 }
2469
2470 Init();
2471
2472 m_encoding = encoding;
2473 }
2474
2475 wxCSConv::~wxCSConv()
2476 {
2477 Clear();
2478 }
2479
2480 wxCSConv::wxCSConv(const wxCSConv& conv)
2481 : wxMBConv()
2482 {
2483 Init();
2484
2485 SetName(conv.m_name);
2486 m_encoding = conv.m_encoding;
2487 }
2488
2489 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2490 {
2491 Clear();
2492
2493 SetName(conv.m_name);
2494 m_encoding = conv.m_encoding;
2495
2496 return *this;
2497 }
2498
2499 void wxCSConv::Clear()
2500 {
2501 free(m_name);
2502 delete m_convReal;
2503
2504 m_name = NULL;
2505 m_convReal = NULL;
2506 }
2507
2508 void wxCSConv::SetName(const wxChar *charset)
2509 {
2510 if (charset)
2511 {
2512 m_name = wxStrdup(charset);
2513 m_deferred = true;
2514 }
2515 }
2516
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map from an encoding to a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

// used by wxCSConv::DoCreate() to remember, for each encoding, the name for
// which an iconv conversion could be created (an empty string is stored if
// none of the names worked)
static wxEncodingNameCache gs_nameCache;
#endif // wxUSE_FONTMAP
2525
2526 wxMBConv *wxCSConv::DoCreate() const
2527 {
2528 #if wxUSE_FONTMAP
2529 wxLogTrace(TRACE_STRCONV,
2530 wxT("creating conversion for %s"),
2531 (m_name ? m_name
2532 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2533 #endif // wxUSE_FONTMAP
2534
2535 // check for the special case of ASCII or ISO8859-1 charset: as we have
2536 // special knowledge of it anyhow, we don't need to create a special
2537 // conversion object
2538 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2539 {
2540 // don't convert at all
2541 return NULL;
2542 }
2543
2544 // we trust OS to do conversion better than we can so try external
2545 // conversion methods first
2546 //
2547 // the full order is:
2548 // 1. OS conversion (iconv() under Unix or Win32 API)
2549 // 2. hard coded conversions for UTF
2550 // 3. wxEncodingConverter as fall back
2551
2552 // step (1)
2553 #ifdef HAVE_ICONV
2554 #if !wxUSE_FONTMAP
2555 if ( m_name )
2556 #endif // !wxUSE_FONTMAP
2557 {
2558 wxString name(m_name);
2559 wxFontEncoding encoding(m_encoding);
2560
2561 if ( !name.empty() )
2562 {
2563 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2564 if ( conv->IsOk() )
2565 return conv;
2566
2567 delete conv;
2568
2569 #if wxUSE_FONTMAP
2570 encoding =
2571 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2572 #endif // wxUSE_FONTMAP
2573 }
2574 #if wxUSE_FONTMAP
2575 {
2576 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2577 if ( it != gs_nameCache.end() )
2578 {
2579 if ( it->second.empty() )
2580 return NULL;
2581
2582 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2583 if ( conv->IsOk() )
2584 return conv;
2585
2586 delete conv;
2587 }
2588
2589 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2590
2591 for ( ; *names; ++names )
2592 {
2593 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2594 if ( conv->IsOk() )
2595 {
2596 gs_nameCache[encoding] = *names;
2597 return conv;
2598 }
2599
2600 delete conv;
2601 }
2602
2603 gs_nameCache[encoding] = _T(""); // cache the failure
2604 }
2605 #endif // wxUSE_FONTMAP
2606 }
2607 #endif // HAVE_ICONV
2608
2609 #ifdef wxHAVE_WIN32_MB2WC
2610 {
2611 #if wxUSE_FONTMAP
2612 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2613 : new wxMBConv_win32(m_encoding);
2614 if ( conv->IsOk() )
2615 return conv;
2616
2617 delete conv;
2618 #else
2619 return NULL;
2620 #endif
2621 }
2622 #endif // wxHAVE_WIN32_MB2WC
2623 #if defined(__WXMAC__)
2624 {
2625 // leave UTF16 and UTF32 to the built-ins of wx
2626 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2627 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2628 {
2629
2630 #if wxUSE_FONTMAP
2631 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2632 : new wxMBConv_mac(m_encoding);
2633 #else
2634 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2635 #endif
2636 if ( conv->IsOk() )
2637 return conv;
2638
2639 delete conv;
2640 }
2641 }
2642 #endif
2643 #if defined(__WXCOCOA__)
2644 {
2645 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2646 {
2647
2648 #if wxUSE_FONTMAP
2649 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2650 : new wxMBConv_cocoa(m_encoding);
2651 #else
2652 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2653 #endif
2654 if ( conv->IsOk() )
2655 return conv;
2656
2657 delete conv;
2658 }
2659 }
2660 #endif
2661 // step (2)
2662 wxFontEncoding enc = m_encoding;
2663 #if wxUSE_FONTMAP
2664 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2665 {
2666 // use "false" to suppress interactive dialogs -- we can be called from
2667 // anywhere and popping up a dialog from here is the last thing we want to
2668 // do
2669 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2670 }
2671 #endif // wxUSE_FONTMAP
2672
2673 switch ( enc )
2674 {
2675 case wxFONTENCODING_UTF7:
2676 return new wxMBConvUTF7;
2677
2678 case wxFONTENCODING_UTF8:
2679 return new wxMBConvUTF8;
2680
2681 case wxFONTENCODING_UTF16BE:
2682 return new wxMBConvUTF16BE;
2683
2684 case wxFONTENCODING_UTF16LE:
2685 return new wxMBConvUTF16LE;
2686
2687 case wxFONTENCODING_UTF32BE:
2688 return new wxMBConvUTF32BE;
2689
2690 case wxFONTENCODING_UTF32LE:
2691 return new wxMBConvUTF32LE;
2692
2693 default:
2694 // nothing to do but put here to suppress gcc warnings
2695 ;
2696 }
2697
2698 // step (3)
2699 #if wxUSE_FONTMAP
2700 {
2701 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2702 : new wxMBConv_wxwin(m_encoding);
2703 if ( conv->IsOk() )
2704 return conv;
2705
2706 delete conv;
2707 }
2708 #endif // wxUSE_FONTMAP
2709
2710 // NB: This is a hack to prevent deadlock. What could otherwise happen
2711 // in Unicode build: wxConvLocal creation ends up being here
2712 // because of some failure and logs the error. But wxLog will try to
2713 // attach timestamp, for which it will need wxConvLocal (to convert
2714 // time to char* and then wchar_t*), but that fails, tries to log
2715 // error, but wxLog has a (already locked) critical section that
2716 // guards static buffer.
2717 static bool alreadyLoggingError = false;
2718 if (!alreadyLoggingError)
2719 {
2720 alreadyLoggingError = true;
2721 wxLogError(_("Cannot convert from the charset '%s'!"),
2722 m_name ? m_name
2723 :
2724 #if wxUSE_FONTMAP
2725 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2726 #else // !wxUSE_FONTMAP
2727 wxString::Format(_("encoding %s"), m_encoding).c_str()
2728 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2729 );
2730 alreadyLoggingError = false;
2731 }
2732
2733 return NULL;
2734 }
2735
2736 void wxCSConv::CreateConvIfNeeded() const
2737 {
2738 if ( m_deferred )
2739 {
2740 wxCSConv *self = (wxCSConv *)this; // const_cast
2741
2742 #if wxUSE_INTL
2743 // if we don't have neither the name nor the encoding, use the default
2744 // encoding for this system
2745 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2746 {
2747 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2748 }
2749 #endif // wxUSE_INTL
2750
2751 self->m_convReal = DoCreate();
2752 self->m_deferred = false;
2753 }
2754 }
2755
2756 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2757 {
2758 CreateConvIfNeeded();
2759
2760 if (m_convReal)
2761 return m_convReal->MB2WC(buf, psz, n);
2762
2763 // latin-1 (direct)
2764 size_t len = strlen(psz);
2765
2766 if (buf)
2767 {
2768 for (size_t c = 0; c <= len; c++)
2769 buf[c] = (unsigned char)(psz[c]);
2770 }
2771
2772 return len;
2773 }
2774
2775 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2776 {
2777 CreateConvIfNeeded();
2778
2779 if (m_convReal)
2780 return m_convReal->WC2MB(buf, psz, n);
2781
2782 // latin-1 (direct)
2783 const size_t len = wxWcslen(psz);
2784 if (buf)
2785 {
2786 for (size_t c = 0; c <= len; c++)
2787 {
2788 if (psz[c] > 0xFF)
2789 return (size_t)-1;
2790 buf[c] = (char)psz[c];
2791 }
2792 }
2793 else
2794 {
2795 for (size_t c = 0; c <= len; c++)
2796 {
2797 if (psz[c] > 0xFF)
2798 return (size_t)-1;
2799 }
2800 }
2801
2802 return len;
2803 }
2804
2805 // ----------------------------------------------------------------------------
2806 // globals
2807 // ----------------------------------------------------------------------------
2808
2809 #ifdef __WINDOWS__
2810 static wxMBConv_win32 wxConvLibcObj;
2811 #elif defined(__WXMAC__) && !defined(__MACH__)
2812 static wxMBConv_mac wxConvLibcObj ;
2813 #else
2814 static wxMBConvLibc wxConvLibcObj;
2815 #endif
2816
2817 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2818 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2819 static wxMBConvUTF7 wxConvUTF7Obj;
2820 static wxMBConvUTF8 wxConvUTF8Obj;
2821
2822 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2823 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2824 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2825 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2826 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2827 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2828 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2829 #ifdef __WXOSX__
2830 wxConvUTF8Obj;
2831 #else
2832 wxConvLibcObj;
2833 #endif
2834
2835
2836 #else // !wxUSE_WCHAR_T
2837
2838 // stand-ins in absence of wchar_t
2839 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2840 wxConvISO8859_1,
2841 wxConvLocal,
2842 wxConvUTF8;
2843
2844 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2845
2846