]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
bb08007cf4b0d7a4a1a57e2c243e109df43eccb3
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WINDOWS__
44 #include "wx/msw/private.h"
45 #include "wx/msw/missing.h"
46 #endif
47
48 #ifndef __WXWINCE__
49 #include <errno.h>
50 #endif
51
52 #include <ctype.h>
53 #include <string.h>
54 #include <stdlib.h>
55
56 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
57 #define wxHAVE_WIN32_MB2WC
58 #endif // __WIN32__ but !__WXMICROWIN__
59
60 #ifdef __SALFORDC__
61 #include <clib.h>
62 #endif
63
64 #ifdef HAVE_ICONV
65 #include <iconv.h>
66 #include "wx/thread.h"
67 #endif
68
69 #include "wx/encconv.h"
70 #include "wx/fontmap.h"
71 #include "wx/utils.h"
72
73 #ifdef __WXMAC__
74 #ifndef __DARWIN__
75 #include <ATSUnicode.h>
76 #include <TextCommon.h>
77 #include <TextEncodingConverter.h>
78 #endif
79
80 #include "wx/mac/private.h" // includes mac headers
81 #endif
82
83 #define TRACE_STRCONV _T("strconv")
84
85 // ============================================================================
86 // implementation
87 // ============================================================================
88
89 // ----------------------------------------------------------------------------
90 // UTF-16 en/decoding to/from UCS-4
91 // ----------------------------------------------------------------------------
92
93
94 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
95 {
96 if (input<=0xffff)
97 {
98 if (output)
99 *output = (wxUint16) input;
100 return 1;
101 }
102 else if (input>=0x110000)
103 {
104 return (size_t)-1;
105 }
106 else
107 {
108 if (output)
109 {
110 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
111 *output = (wxUint16) ((input&0x3ff)+0xdc00);
112 }
113 return 2;
114 }
115 }
116
117 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
118 {
119 if ((*input<0xd800) || (*input>0xdfff))
120 {
121 output = *input;
122 return 1;
123 }
124 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
125 {
126 output = *input;
127 return (size_t)-1;
128 }
129 else
130 {
131 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
132 return 2;
133 }
134 }
135
136
137 // ----------------------------------------------------------------------------
138 // wxMBConv
139 // ----------------------------------------------------------------------------
140
141 wxMBConv::~wxMBConv()
142 {
143 // nothing to do here (necessary for Darwin linking probably)
144 }
145
146 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
147 {
148 if ( psz )
149 {
150 // calculate the length of the buffer needed first
151 size_t nLen = MB2WC(NULL, psz, 0);
152 if ( nLen != (size_t)-1 )
153 {
154 // now do the actual conversion
155 wxWCharBuffer buf(nLen);
156 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
157 if ( nLen != (size_t)-1 )
158 {
159 return buf;
160 }
161 }
162 }
163
164 wxWCharBuffer buf((wchar_t *)NULL);
165
166 return buf;
167 }
168
169 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
170 {
171 if ( pwz )
172 {
173 size_t nLen = WC2MB(NULL, pwz, 0);
174 if ( nLen != (size_t)-1 )
175 {
176 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
177 nLen = WC2MB(buf.data(), pwz, nLen + 4);
178 if ( nLen != (size_t)-1 )
179 {
180 return buf;
181 }
182 }
183 }
184
185 wxCharBuffer buf((char *)NULL);
186
187 return buf;
188 }
189
190 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
191 {
192 wxASSERT(pOutSize != NULL);
193
194 const char* szEnd = szString + nStringLen + 1;
195 const char* szPos = szString;
196 const char* szStart = szPos;
197
198 size_t nActualLength = 0;
199 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
200
201 wxWCharBuffer theBuffer(nCurrentSize);
202
203 //Convert the string until the length() is reached, continuing the
204 //loop every time a null character is reached
205 while(szPos != szEnd)
206 {
207 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
208
209 //Get the length of the current (sub)string
210 size_t nLen = MB2WC(NULL, szPos, 0);
211
212 //Invalid conversion?
213 if( nLen == (size_t)-1 )
214 {
215 *pOutSize = 0;
216 theBuffer.data()[0u] = wxT('\0');
217 return theBuffer;
218 }
219
220
221 //Increase the actual length (+1 for current null character)
222 nActualLength += nLen + 1;
223
224 //if buffer too big, realloc the buffer
225 if (nActualLength > (nCurrentSize+1))
226 {
227 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
228 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
229 theBuffer = theNewBuffer;
230 nCurrentSize <<= 1;
231 }
232
233 //Convert the current (sub)string
234 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
235 {
236 *pOutSize = 0;
237 theBuffer.data()[0u] = wxT('\0');
238 return theBuffer;
239 }
240
241 //Increment to next (sub)string
242 //Note that we have to use strlen instead of nLen here
243 //because XX2XX gives us the size of the output buffer,
244 //which is not necessarily the length of the string
245 szPos += strlen(szPos) + 1;
246 }
247
248 //success - return actual length and the buffer
249 *pOutSize = nActualLength;
250 return theBuffer;
251 }
252
253 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
254 {
255 wxASSERT(pOutSize != NULL);
256
257 const wchar_t* szEnd = szString + nStringLen + 1;
258 const wchar_t* szPos = szString;
259 const wchar_t* szStart = szPos;
260
261 size_t nActualLength = 0;
262 size_t nCurrentSize = nStringLen << 2; //try * 4 first
263
264 wxCharBuffer theBuffer(nCurrentSize);
265
266 //Convert the string until the length() is reached, continuing the
267 //loop every time a null character is reached
268 while(szPos != szEnd)
269 {
270 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
271
272 //Get the length of the current (sub)string
273 size_t nLen = WC2MB(NULL, szPos, 0);
274
275 //Invalid conversion?
276 if( nLen == (size_t)-1 )
277 {
278 *pOutSize = 0;
279 theBuffer.data()[0u] = wxT('\0');
280 return theBuffer;
281 }
282
283 //Increase the actual length (+1 for current null character)
284 nActualLength += nLen + 1;
285
286 //if buffer too big, realloc the buffer
287 if (nActualLength > (nCurrentSize+1))
288 {
289 wxCharBuffer theNewBuffer(nCurrentSize << 1);
290 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
291 theBuffer = theNewBuffer;
292 nCurrentSize <<= 1;
293 }
294
295 //Convert the current (sub)string
296 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
297 {
298 *pOutSize = 0;
299 theBuffer.data()[0u] = wxT('\0');
300 return theBuffer;
301 }
302
303 //Increment to next (sub)string
304 //Note that we have to use wxWcslen instead of nLen here
305 //because XX2XX gives us the size of the output buffer,
306 //which is not necessarily the length of the string
307 szPos += wxWcslen(szPos) + 1;
308 }
309
310 //success - return actual length and the buffer
311 *pOutSize = nActualLength;
312 return theBuffer;
313 }
314
315 // ----------------------------------------------------------------------------
316 // wxMBConvLibc
317 // ----------------------------------------------------------------------------
318
319 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
320 {
321 return wxMB2WC(buf, psz, n);
322 }
323
324 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
325 {
326 return wxWC2MB(buf, psz, n);
327 }
328
329 #ifdef __UNIX__
330
331 // ----------------------------------------------------------------------------
332 // wxConvBrokenFileNames
333 // ----------------------------------------------------------------------------
334
335 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
336 {
337 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
338 || wxStricmp(charset, _T("UTF8")) == 0 )
339 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
340 else
341 m_conv = new wxCSConv(charset);
342 }
343
344 size_t
345 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
346 const char *psz,
347 size_t outputSize) const
348 {
349 return m_conv->MB2WC( outputBuf, psz, outputSize );
350 }
351
352 size_t
353 wxConvBrokenFileNames::WC2MB(char *outputBuf,
354 const wchar_t *psz,
355 size_t outputSize) const
356 {
357 return m_conv->WC2MB( outputBuf, psz, outputSize );
358 }
359
360 #endif
361
362 // ----------------------------------------------------------------------------
363 // UTF-7
364 // ----------------------------------------------------------------------------
365
366 // Implementation (C) 2004 Fredrik Roubert
367
368 //
369 // BASE64 decoding table
370 //
// Maps a byte value to the 6 bit BASE64 value it stands for, or to 0xff if
// the byte is not part of the BASE64 alphabet.
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x00 - 0x07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x08 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x10 - 0x17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x18 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x20 - 0x27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 0x28 - 0x2f: '+', '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // 0x30 - 0x37: '0'-'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x38 - 0x3f: '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 0x40 - 0x47: 'A'-'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 0x48 - 0x4f: 'H'-'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 0x50 - 0x57: 'P'-'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x58 - 0x5f: 'X'-'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 0x60 - 0x67: 'a'-'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 0x68 - 0x6f: 'h'-'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 0x70 - 0x77: 'p'-'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x78 - 0x7f: 'x'-'z'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x80 - 0x87
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x88 - 0x8f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x90 - 0x97
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x98 - 0x9f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa0 - 0xa7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa8 - 0xaf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb0 - 0xb7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb8 - 0xbf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc0 - 0xc7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc8 - 0xcf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd0 - 0xd7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd8 - 0xdf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe0 - 0xe7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe8 - 0xef
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xf0 - 0xf7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // 0xf8 - 0xff
};
406
407 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
408 {
409 size_t len = 0;
410
411 while (*psz && ((!buf) || (len < n)))
412 {
413 unsigned char cc = *psz++;
414 if (cc != '+')
415 {
416 // plain ASCII char
417 if (buf)
418 *buf++ = cc;
419 len++;
420 }
421 else if (*psz == '-')
422 {
423 // encoded plus sign
424 if (buf)
425 *buf++ = cc;
426 len++;
427 psz++;
428 }
429 else
430 {
431 // BASE64 encoded string
432 bool lsb;
433 unsigned char c;
434 unsigned int d, l;
435 for (lsb = false, d = 0, l = 0;
436 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
437 {
438 d <<= 6;
439 d += cc;
440 for (l += 6; l >= 8; lsb = !lsb)
441 {
442 c = (unsigned char)((d >> (l -= 8)) % 256);
443 if (lsb)
444 {
445 if (buf)
446 *buf++ |= c;
447 len ++;
448 }
449 else
450 if (buf)
451 *buf = (wchar_t)(c << 8);
452 }
453 }
454 if (*psz == '-')
455 psz++;
456 }
457 }
458 if (buf && (len < n))
459 *buf = 0;
460 return len;
461 }
462
463 //
464 // BASE64 encoding table
465 //
// Maps a 6 bit value (0..63) to the character of the BASE64 alphabet that
// represents it.
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62, 63
    '+', '/'
};
477
478 //
479 // UTF-7 encoding table
480 //
481 // 0 - Set D (directly encoded characters)
482 // 1 - Set O (optional direct characters)
483 // 2 - whitespace characters (optional)
484 // 3 - special characters
485 //
// Class of each 7 bit character for the UTF-7 encoder, indexed by character
// code:
//  0 - Set D (directly encoded characters)
//  1 - Set O (optional direct characters)
//  2 - whitespace characters (optional)
//  3 - special characters
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,   3, 2, 2, 3, 3, 2, 3, 3,   // 0x00 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,   // 0x10 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0,   0, 0, 1, 3, 0, 0, 0, 3,   // 0x20 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 1, 1, 1, 0,   // 0x30 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0x40 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 3, 1, 1, 1,   // 0x50 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 0x60 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 1, 1, 1, 3, 3    // 0x70 - 0x7f
};
497
498 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
499 {
500
501
502 size_t len = 0;
503
504 while (*psz && ((!buf) || (len < n)))
505 {
506 wchar_t cc = *psz++;
507 if (cc < 0x80 && utf7encode[cc] < 1)
508 {
509 // plain ASCII char
510 if (buf)
511 *buf++ = (char)cc;
512 len++;
513 }
514 #ifndef WC_UTF16
515 else if (((wxUint32)cc) > 0xffff)
516 {
517 // no surrogate pair generation (yet?)
518 return (size_t)-1;
519 }
520 #endif
521 else
522 {
523 if (buf)
524 *buf++ = '+';
525 len++;
526 if (cc != '+')
527 {
528 // BASE64 encode string
529 unsigned int lsb, d, l;
530 for (d = 0, l = 0;; psz++)
531 {
532 for (lsb = 0; lsb < 2; lsb ++)
533 {
534 d <<= 8;
535 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
536
537 for (l += 8; l >= 6; )
538 {
539 l -= 6;
540 if (buf)
541 *buf++ = utf7enb64[(d >> l) % 64];
542 len++;
543 }
544 }
545 cc = *psz;
546 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
547 break;
548 }
549 if (l != 0)
550 {
551 if (buf)
552 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
553 len++;
554 }
555 }
556 if (buf)
557 *buf++ = '-';
558 len++;
559 }
560 }
561 if (buf && (len < n))
562 *buf = 0;
563 return len;
564 }
565
566 // ----------------------------------------------------------------------------
567 // UTF-8
568 // ----------------------------------------------------------------------------
569
570 static wxUint32 utf8_max[]=
571 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
572
573 // boundaries of the private use area we use to (temporarily) remap invalid
574 // characters invalid in a UTF-8 encoded string
575 const wxUint32 wxUnicodePUA = 0x100000;
576 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
577
578 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
579 {
580 size_t len = 0;
581
582 while (*psz && ((!buf) || (len < n)))
583 {
584 const char *opsz = psz;
585 bool invalid = false;
586 unsigned char cc = *psz++, fc = cc;
587 unsigned cnt;
588 for (cnt = 0; fc & 0x80; cnt++)
589 fc <<= 1;
590 if (!cnt)
591 {
592 // plain ASCII char
593 if (buf)
594 *buf++ = cc;
595 len++;
596
597 // escape the escape character for octal escapes
598 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
599 && cc == '\\' && (!buf || len < n))
600 {
601 if (buf)
602 *buf++ = cc;
603 len++;
604 }
605 }
606 else
607 {
608 cnt--;
609 if (!cnt)
610 {
611 // invalid UTF-8 sequence
612 invalid = true;
613 }
614 else
615 {
616 unsigned ocnt = cnt - 1;
617 wxUint32 res = cc & (0x3f >> cnt);
618 while (cnt--)
619 {
620 cc = *psz;
621 if ((cc & 0xC0) != 0x80)
622 {
623 // invalid UTF-8 sequence
624 invalid = true;
625 break;
626 }
627 psz++;
628 res = (res << 6) | (cc & 0x3f);
629 }
630 if (invalid || res <= utf8_max[ocnt])
631 {
632 // illegal UTF-8 encoding
633 invalid = true;
634 }
635 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
636 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
637 {
638 // if one of our PUA characters turns up externally
639 // it must also be treated as an illegal sequence
640 // (a bit like you have to escape an escape character)
641 invalid = true;
642 }
643 else
644 {
645 #ifdef WC_UTF16
646 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
647 size_t pa = encode_utf16(res, (wxUint16 *)buf);
648 if (pa == (size_t)-1)
649 {
650 invalid = true;
651 }
652 else
653 {
654 if (buf)
655 buf += pa;
656 len += pa;
657 }
658 #else // !WC_UTF16
659 if (buf)
660 *buf++ = res;
661 len++;
662 #endif // WC_UTF16/!WC_UTF16
663 }
664 }
665 if (invalid)
666 {
667 if (m_options & MAP_INVALID_UTF8_TO_PUA)
668 {
669 while (opsz < psz && (!buf || len < n))
670 {
671 #ifdef WC_UTF16
672 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
673 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
674 wxASSERT(pa != (size_t)-1);
675 if (buf)
676 buf += pa;
677 opsz++;
678 len += pa;
679 #else
680 if (buf)
681 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
682 opsz++;
683 len++;
684 #endif
685 }
686 }
687 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
688 {
689 while (opsz < psz && (!buf || len < n))
690 {
691 if ( buf && len + 3 < n )
692 {
693 unsigned char n = *opsz;
694 *buf++ = L'\\';
695 *buf++ = (wchar_t)( L'0' + n / 0100 );
696 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
697 *buf++ = (wchar_t)( L'0' + n % 010 );
698 }
699 opsz++;
700 len += 4;
701 }
702 }
703 else // MAP_INVALID_UTF8_NOT
704 {
705 return (size_t)-1;
706 }
707 }
708 }
709 }
710 if (buf && (len < n))
711 *buf = 0;
712 return len;
713 }
714
// Returns true if wch is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if (wch < L'0')
        return false;

    return wch <= L'7';
}
719
720 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
721 {
722 size_t len = 0;
723
724 while (*psz && ((!buf) || (len < n)))
725 {
726 wxUint32 cc;
727 #ifdef WC_UTF16
728 // cast is ok for WC_UTF16
729 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
730 psz += (pa == (size_t)-1) ? 1 : pa;
731 #else
732 cc=(*psz++) & 0x7fffffff;
733 #endif
734
735 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
736 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
737 {
738 if (buf)
739 *buf++ = (char)(cc - wxUnicodePUA);
740 len++;
741 }
742 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
743 && cc == L'\\' && psz[0] == L'\\' )
744 {
745 if (buf)
746 *buf++ = (char)cc;
747 psz++;
748 len++;
749 }
750 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
751 cc == L'\\' &&
752 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
753 {
754 if (buf)
755 {
756 *buf++ = (char) ((psz[0] - L'0')*0100 +
757 (psz[1] - L'0')*010 +
758 (psz[2] - L'0'));
759 }
760
761 psz += 3;
762 len++;
763 }
764 else
765 {
766 unsigned cnt;
767 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
768 if (!cnt)
769 {
770 // plain ASCII char
771 if (buf)
772 *buf++ = (char) cc;
773 len++;
774 }
775
776 else
777 {
778 len += cnt + 1;
779 if (buf)
780 {
781 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
782 while (cnt--)
783 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
784 }
785 }
786 }
787 }
788
789 if (buf && (len<n))
790 *buf = 0;
791
792 return len;
793 }
794
795 // ----------------------------------------------------------------------------
796 // UTF-16
797 // ----------------------------------------------------------------------------
798
799 #ifdef WORDS_BIGENDIAN
800 #define wxMBConvUTF16straight wxMBConvUTF16BE
801 #define wxMBConvUTF16swap wxMBConvUTF16LE
802 #else
803 #define wxMBConvUTF16swap wxMBConvUTF16BE
804 #define wxMBConvUTF16straight wxMBConvUTF16LE
805 #endif
806
807
808 #ifdef WC_UTF16
809
810 // copy 16bit MB to 16bit String
811 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
812 {
813 size_t len=0;
814
815 while (*(wxUint16*)psz && (!buf || len < n))
816 {
817 if (buf)
818 *buf++ = *(wxUint16*)psz;
819 len++;
820
821 psz += sizeof(wxUint16);
822 }
823 if (buf && len<n) *buf=0;
824
825 return len;
826 }
827
828
829 // copy 16bit String to 16bit MB
830 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
831 {
832 size_t len=0;
833
834 while (*psz && (!buf || len < n))
835 {
836 if (buf)
837 {
838 *(wxUint16*)buf = *psz;
839 buf += sizeof(wxUint16);
840 }
841 len += sizeof(wxUint16);
842 psz++;
843 }
844 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
845
846 return len;
847 }
848
849
850 // swap 16bit MB to 16bit String
851 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
852 {
853 size_t len=0;
854
855 while (*(wxUint16*)psz && (!buf || len < n))
856 {
857 if (buf)
858 {
859 ((char *)buf)[0] = psz[1];
860 ((char *)buf)[1] = psz[0];
861 buf++;
862 }
863 len++;
864 psz += sizeof(wxUint16);
865 }
866 if (buf && len<n) *buf=0;
867
868 return len;
869 }
870
871
872 // swap 16bit MB to 16bit String
873 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
874 {
875 size_t len=0;
876
877 while (*psz && (!buf || len < n))
878 {
879 if (buf)
880 {
881 *buf++ = ((char*)psz)[1];
882 *buf++ = ((char*)psz)[0];
883 }
884 len += sizeof(wxUint16);
885 psz++;
886 }
887 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
888
889 return len;
890 }
891
892
893 #else // WC_UTF16
894
895
896 // copy 16bit MB to 32bit String
897 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
898 {
899 size_t len=0;
900
901 while (*(wxUint16*)psz && (!buf || len < n))
902 {
903 wxUint32 cc;
904 size_t pa=decode_utf16((wxUint16*)psz, cc);
905 if (pa == (size_t)-1)
906 return pa;
907
908 if (buf)
909 *buf++ = cc;
910 len++;
911 psz += pa * sizeof(wxUint16);
912 }
913 if (buf && len<n) *buf=0;
914
915 return len;
916 }
917
918
919 // copy 32bit String to 16bit MB
920 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
921 {
922 size_t len=0;
923
924 while (*psz && (!buf || len < n))
925 {
926 wxUint16 cc[2];
927 size_t pa=encode_utf16(*psz, cc);
928
929 if (pa == (size_t)-1)
930 return pa;
931
932 if (buf)
933 {
934 *(wxUint16*)buf = cc[0];
935 buf += sizeof(wxUint16);
936 if (pa > 1)
937 {
938 *(wxUint16*)buf = cc[1];
939 buf += sizeof(wxUint16);
940 }
941 }
942
943 len += pa*sizeof(wxUint16);
944 psz++;
945 }
946 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
947
948 return len;
949 }
950
951
952 // swap 16bit MB to 32bit String
953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
954 {
955 size_t len=0;
956
957 while (*(wxUint16*)psz && (!buf || len < n))
958 {
959 wxUint32 cc;
960 char tmp[4];
961 tmp[0]=psz[1]; tmp[1]=psz[0];
962 tmp[2]=psz[3]; tmp[3]=psz[2];
963
964 size_t pa=decode_utf16((wxUint16*)tmp, cc);
965 if (pa == (size_t)-1)
966 return pa;
967
968 if (buf)
969 *buf++ = cc;
970
971 len++;
972 psz += pa * sizeof(wxUint16);
973 }
974 if (buf && len<n) *buf=0;
975
976 return len;
977 }
978
979
980 // swap 32bit String to 16bit MB
981 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
982 {
983 size_t len=0;
984
985 while (*psz && (!buf || len < n))
986 {
987 wxUint16 cc[2];
988 size_t pa=encode_utf16(*psz, cc);
989
990 if (pa == (size_t)-1)
991 return pa;
992
993 if (buf)
994 {
995 *buf++ = ((char*)cc)[1];
996 *buf++ = ((char*)cc)[0];
997 if (pa > 1)
998 {
999 *buf++ = ((char*)cc)[3];
1000 *buf++ = ((char*)cc)[2];
1001 }
1002 }
1003
1004 len += pa*sizeof(wxUint16);
1005 psz++;
1006 }
1007 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1008
1009 return len;
1010 }
1011
1012 #endif // WC_UTF16
1013
1014
1015 // ----------------------------------------------------------------------------
1016 // UTF-32
1017 // ----------------------------------------------------------------------------
1018
1019 #ifdef WORDS_BIGENDIAN
1020 #define wxMBConvUTF32straight wxMBConvUTF32BE
1021 #define wxMBConvUTF32swap wxMBConvUTF32LE
1022 #else
1023 #define wxMBConvUTF32swap wxMBConvUTF32BE
1024 #define wxMBConvUTF32straight wxMBConvUTF32LE
1025 #endif
1026
1027
// definitions of the global UTF-32 converter objects, one per byte order
// NOTE(review): WXDLLIMPEXP_DATA_BASE presumably adds the DLL export/import
// decoration to the type given as its argument -- confirm in the wx headers
1028 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1029 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1030
1031
1032 #ifdef WC_UTF16
1033
1034 // copy 32bit MB to 16bit String
1035 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1036 {
1037 size_t len=0;
1038
1039 while (*(wxUint32*)psz && (!buf || len < n))
1040 {
1041 wxUint16 cc[2];
1042
1043 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1044 if (pa == (size_t)-1)
1045 return pa;
1046
1047 if (buf)
1048 {
1049 *buf++ = cc[0];
1050 if (pa > 1)
1051 *buf++ = cc[1];
1052 }
1053 len += pa;
1054 psz += sizeof(wxUint32);
1055 }
1056 if (buf && len<n) *buf=0;
1057
1058 return len;
1059 }
1060
1061
1062 // copy 16bit String to 32bit MB
1063 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1064 {
1065 size_t len=0;
1066
1067 while (*psz && (!buf || len < n))
1068 {
1069 wxUint32 cc;
1070
1071 // cast is ok for WC_UTF16
1072 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1073 if (pa == (size_t)-1)
1074 return pa;
1075
1076 if (buf)
1077 {
1078 *(wxUint32*)buf = cc;
1079 buf += sizeof(wxUint32);
1080 }
1081 len += sizeof(wxUint32);
1082 psz += pa;
1083 }
1084
1085 if (buf && len<=n-sizeof(wxUint32))
1086 *(wxUint32*)buf=0;
1087
1088 return len;
1089 }
1090
1091
1092
1093 // swap 32bit MB to 16bit String
1094 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1095 {
1096 size_t len=0;
1097
1098 while (*(wxUint32*)psz && (!buf || len < n))
1099 {
1100 char tmp[4];
1101 tmp[0] = psz[3]; tmp[1] = psz[2];
1102 tmp[2] = psz[1]; tmp[3] = psz[0];
1103
1104
1105 wxUint16 cc[2];
1106
1107 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1108 if (pa == (size_t)-1)
1109 return pa;
1110
1111 if (buf)
1112 {
1113 *buf++ = cc[0];
1114 if (pa > 1)
1115 *buf++ = cc[1];
1116 }
1117 len += pa;
1118 psz += sizeof(wxUint32);
1119 }
1120
1121 if (buf && len<n)
1122 *buf=0;
1123
1124 return len;
1125 }
1126
1127
1128 // swap 16bit String to 32bit MB
1129 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1130 {
1131 size_t len=0;
1132
1133 while (*psz && (!buf || len < n))
1134 {
1135 char cc[4];
1136
1137 // cast is ok for WC_UTF16
1138 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1139 if (pa == (size_t)-1)
1140 return pa;
1141
1142 if (buf)
1143 {
1144 *buf++ = cc[3];
1145 *buf++ = cc[2];
1146 *buf++ = cc[1];
1147 *buf++ = cc[0];
1148 }
1149 len += sizeof(wxUint32);
1150 psz += pa;
1151 }
1152
1153 if (buf && len<=n-sizeof(wxUint32))
1154 *(wxUint32*)buf=0;
1155
1156 return len;
1157 }
1158
1159 #else // WC_UTF16
1160
1161
1162 // copy 32bit MB to 32bit String
1163 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1164 {
1165 size_t len=0;
1166
1167 while (*(wxUint32*)psz && (!buf || len < n))
1168 {
1169 if (buf)
1170 *buf++ = *(wxUint32*)psz;
1171 len++;
1172 psz += sizeof(wxUint32);
1173 }
1174
1175 if (buf && len<n)
1176 *buf=0;
1177
1178 return len;
1179 }
1180
1181
1182 // copy 32bit String to 32bit MB
1183 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1184 {
1185 size_t len=0;
1186
1187 while (*psz && (!buf || len < n))
1188 {
1189 if (buf)
1190 {
1191 *(wxUint32*)buf = *psz;
1192 buf += sizeof(wxUint32);
1193 }
1194
1195 len += sizeof(wxUint32);
1196 psz++;
1197 }
1198
1199 if (buf && len<=n-sizeof(wxUint32))
1200 *(wxUint32*)buf=0;
1201
1202 return len;
1203 }
1204
1205
1206 // swap 32bit MB to 32bit String
1207 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1208 {
1209 size_t len=0;
1210
1211 while (*(wxUint32*)psz && (!buf || len < n))
1212 {
1213 if (buf)
1214 {
1215 ((char *)buf)[0] = psz[3];
1216 ((char *)buf)[1] = psz[2];
1217 ((char *)buf)[2] = psz[1];
1218 ((char *)buf)[3] = psz[0];
1219 buf++;
1220 }
1221 len++;
1222 psz += sizeof(wxUint32);
1223 }
1224
1225 if (buf && len<n)
1226 *buf=0;
1227
1228 return len;
1229 }
1230
1231
1232 // swap 32bit String to 32bit MB
1233 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1234 {
1235 size_t len=0;
1236
1237 while (*psz && (!buf || len < n))
1238 {
1239 if (buf)
1240 {
1241 *buf++ = ((char *)psz)[3];
1242 *buf++ = ((char *)psz)[2];
1243 *buf++ = ((char *)psz)[1];
1244 *buf++ = ((char *)psz)[0];
1245 }
1246 len += sizeof(wxUint32);
1247 psz++;
1248 }
1249
1250 if (buf && len<=n-sizeof(wxUint32))
1251 *(wxUint32*)buf=0;
1252
1253 return len;
1254 }
1255
1256
1257 #endif // WC_UTF16
1258
1259
1260 // ============================================================================
1261 // The classes doing conversion using the iconv_xxx() functions
1262 // ============================================================================
1263
1264 #ifdef HAVE_ICONV
1265
1266 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1267 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1268 // (unless there's yet another bug in glibc) the only case when iconv()
1269 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1270 // left in the input buffer -- when _real_ error occurs,
1271 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1272 // iconv() failure.
1273 // [This bug does not appear in glibc 2.2.]
1274 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1275 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1276 (errno != E2BIG || bufLeft != 0))
1277 #else
1278 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1279 #endif
1280
1281 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1282
1283 #define ICONV_T_INVALID ((iconv_t)-1)
1284
1285 #if SIZEOF_WCHAR_T == 4
1286 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1287 #define WC_ENC wxFONTENCODING_UTF32
1288 #elif SIZEOF_WCHAR_T == 2
1289 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1290 #define WC_ENC wxFONTENCODING_UTF16
1291 #else // sizeof(wchar_t) != 2 nor 4
1292 // does this ever happen?
1293 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1294 #endif
1295
1296 // ----------------------------------------------------------------------------
1297 // wxMBConv_iconv: encapsulates an iconv character set
1298 // ----------------------------------------------------------------------------
1299
1300 class wxMBConv_iconv : public wxMBConv
1301 {
1302 public:
1303 wxMBConv_iconv(const wxChar *name);
1304 virtual ~wxMBConv_iconv();
1305
1306 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1307 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1308
1309 bool IsOk() const
1310 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1311
1312 protected:
1313 // the iconv handlers used to translate from multibyte to wide char and in
1314 // the other direction
1315 iconv_t m2w,
1316 w2m;
1317 #if wxUSE_THREADS
1318 // guards access to m2w and w2m objects
1319 wxMutex m_iconvMutex;
1320 #endif
1321
1322 private:
1323 // the name (for iconv_open()) of a wide char charset -- if none is
1324 // available on this machine, it will remain NULL
1325 static wxString ms_wcCharsetName;
1326
1327 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1328 // different endian-ness than the native one
1329 static bool ms_wcNeedsSwap;
1330 };
1331
1332 // make the constructor available for unit testing
1333 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1334 {
1335 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1336 if ( !result->IsOk() )
1337 {
1338 delete result;
1339 return 0;
1340 }
1341 return result;
1342 }
1343
// definitions of the static members: both are set by the constructor the
// first time ms_wcCharsetName is found to be empty
wxString wxMBConv_iconv::ms_wcCharsetName = NULL;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1346
1347 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1348 {
1349 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1350 // names for the charsets
1351 const wxCharBuffer cname(wxString(name).ToAscii());
1352
1353 // check for charset that represents wchar_t:
1354 if ( ms_wcCharsetName.empty() )
1355 {
1356 #if wxUSE_FONTMAP
1357 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1358 #else // !wxUSE_FONTMAP
1359 static const wxChar *names[] =
1360 {
1361 #if SIZEOF_WCHAR_T == 4
1362 _T("UCS-4"),
1363 #elif SIZEOF_WCHAR_T = 2
1364 _T("UCS-2"),
1365 #endif
1366 NULL
1367 };
1368 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1369
1370 for ( ; *names; ++names )
1371 {
1372 const wxString name(*names);
1373
1374 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1375 wxString nameXE(name);
1376 #ifdef WORDS_BIGENDIAN
1377 nameXE += _T("BE");
1378 #else // little endian
1379 nameXE += _T("LE");
1380 #endif
1381
1382 m2w = iconv_open(nameXE.ToAscii(), cname);
1383 if ( m2w == ICONV_T_INVALID )
1384 {
1385 // try charset w/o bytesex info (e.g. "UCS4")
1386 m2w = iconv_open(name.ToAscii(), cname);
1387
1388 // and check for bytesex ourselves:
1389 if ( m2w != ICONV_T_INVALID )
1390 {
1391 char buf[2], *bufPtr;
1392 wchar_t wbuf[2], *wbufPtr;
1393 size_t insz, outsz;
1394 size_t res;
1395
1396 buf[0] = 'A';
1397 buf[1] = 0;
1398 wbuf[0] = 0;
1399 insz = 2;
1400 outsz = SIZEOF_WCHAR_T * 2;
1401 wbufPtr = wbuf;
1402 bufPtr = buf;
1403
1404 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1405 (char**)&wbufPtr, &outsz);
1406
1407 if (ICONV_FAILED(res, insz))
1408 {
1409 wxLogLastError(wxT("iconv"));
1410 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1411 }
1412 else // ok, can convert to this encoding, remember it
1413 {
1414 ms_wcCharsetName = name;
1415 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1416 }
1417 }
1418 }
1419 else // use charset not requiring byte swapping
1420 {
1421 ms_wcCharsetName = nameXE;
1422 }
1423 }
1424
1425 wxLogTrace(TRACE_STRCONV,
1426 wxT("iconv wchar_t charset is \"%s\"%s"),
1427 ms_wcCharsetName.empty() ? "<none>"
1428 : ms_wcCharsetName.c_str(),
1429 ms_wcNeedsSwap ? _T(" (needs swap)")
1430 : _T(""));
1431 }
1432 else // we already have ms_wcCharsetName
1433 {
1434 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1435 }
1436
1437 if ( ms_wcCharsetName.empty() )
1438 {
1439 w2m = ICONV_T_INVALID;
1440 }
1441 else
1442 {
1443 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1444 if ( w2m == ICONV_T_INVALID )
1445 {
1446 wxLogTrace(TRACE_STRCONV,
1447 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1448 ms_wcCharsetName.c_str(), cname);
1449 }
1450 }
1451 }
1452
1453 wxMBConv_iconv::~wxMBConv_iconv()
1454 {
1455 if ( m2w != ICONV_T_INVALID )
1456 iconv_close(m2w);
1457 if ( w2m != ICONV_T_INVALID )
1458 iconv_close(w2m);
1459 }
1460
1461 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1462 {
1463 #if wxUSE_THREADS
1464 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1465 // Unfortunately there is a couple of global wxCSConv objects such as
1466 // wxConvLocal that are used all over wx code, so we have to make sure
1467 // the handle is used by at most one thread at the time. Otherwise
1468 // only a few wx classes would be safe to use from non-main threads
1469 // as MB<->WC conversion would fail "randomly".
1470 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1471 #endif
1472
1473 size_t inbuf = strlen(psz);
1474 size_t outbuf = n * SIZEOF_WCHAR_T;
1475 size_t res, cres;
1476 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1477 wchar_t *bufPtr = buf;
1478 const char *pszPtr = psz;
1479
1480 if (buf)
1481 {
1482 // have destination buffer, convert there
1483 cres = iconv(m2w,
1484 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1485 (char**)&bufPtr, &outbuf);
1486 res = n - (outbuf / SIZEOF_WCHAR_T);
1487
1488 if (ms_wcNeedsSwap)
1489 {
1490 // convert to native endianness
1491 for ( unsigned n = 0; n < res; n++ )
1492 buf[n] = WC_BSWAP(buf[n]);
1493 }
1494
1495 // NB: iconv was given only strlen(psz) characters on input, and so
1496 // it couldn't convert the trailing zero. Let's do it ourselves
1497 // if there's some room left for it in the output buffer.
1498 if (res < n)
1499 buf[res] = 0;
1500 }
1501 else
1502 {
1503 // no destination buffer... convert using temp buffer
1504 // to calculate destination buffer requirement
1505 wchar_t tbuf[8];
1506 res = 0;
1507 do {
1508 bufPtr = tbuf;
1509 outbuf = 8*SIZEOF_WCHAR_T;
1510
1511 cres = iconv(m2w,
1512 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1513 (char**)&bufPtr, &outbuf );
1514
1515 res += 8-(outbuf/SIZEOF_WCHAR_T);
1516 } while ((cres==(size_t)-1) && (errno==E2BIG));
1517 }
1518
1519 if (ICONV_FAILED(cres, inbuf))
1520 {
1521 //VS: it is ok if iconv fails, hence trace only
1522 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1523 return (size_t)-1;
1524 }
1525
1526 return res;
1527 }
1528
1529 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1530 {
1531 #if wxUSE_THREADS
1532 // NB: explained in MB2WC
1533 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1534 #endif
1535
1536 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1537 size_t outbuf = n;
1538 size_t res, cres;
1539
1540 wchar_t *tmpbuf = 0;
1541
1542 if (ms_wcNeedsSwap)
1543 {
1544 // need to copy to temp buffer to switch endianness
1545 // (doing WC_BSWAP twice on the original buffer won't help, as it
1546 // could be in read-only memory, or be accessed in some other thread)
1547 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1548 for ( size_t n = 0; n < inbuf; n++ )
1549 tmpbuf[n] = WC_BSWAP(psz[n]);
1550 tmpbuf[inbuf] = L'\0';
1551 psz = tmpbuf;
1552 }
1553
1554 if (buf)
1555 {
1556 // have destination buffer, convert there
1557 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1558
1559 res = n-outbuf;
1560
1561 // NB: iconv was given only wcslen(psz) characters on input, and so
1562 // it couldn't convert the trailing zero. Let's do it ourselves
1563 // if there's some room left for it in the output buffer.
1564 if (res < n)
1565 buf[0] = 0;
1566 }
1567 else
1568 {
1569 // no destination buffer... convert using temp buffer
1570 // to calculate destination buffer requirement
1571 char tbuf[16];
1572 res = 0;
1573 do {
1574 buf = tbuf; outbuf = 16;
1575
1576 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1577
1578 res += 16 - outbuf;
1579 } while ((cres==(size_t)-1) && (errno==E2BIG));
1580 }
1581
1582 if (ms_wcNeedsSwap)
1583 {
1584 free(tmpbuf);
1585 }
1586
1587 if (ICONV_FAILED(cres, inbuf))
1588 {
1589 //VS: it is ok if iconv fails, hence trace only
1590 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1591 return (size_t)-1;
1592 }
1593
1594 return res;
1595 }
1596
1597 #endif // HAVE_ICONV
1598
1599
1600 // ============================================================================
1601 // Win32 conversion classes
1602 // ============================================================================
1603
1604 #ifdef wxHAVE_WIN32_MB2WC
1605
// from utils.cpp: these functions are used by the wxMBConv_win32 ctors to
// initialize the code page from a charset name or a font encoding
#if wxUSE_FONTMAP
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
#endif
1611
1612 class wxMBConv_win32 : public wxMBConv
1613 {
1614 public:
1615 wxMBConv_win32()
1616 {
1617 m_CodePage = CP_ACP;
1618 }
1619
1620 #if wxUSE_FONTMAP
1621 wxMBConv_win32(const wxChar* name)
1622 {
1623 m_CodePage = wxCharsetToCodepage(name);
1624 }
1625
1626 wxMBConv_win32(wxFontEncoding encoding)
1627 {
1628 m_CodePage = wxEncodingToCodepage(encoding);
1629 }
1630 #endif
1631
1632 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1633 {
1634 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1635 // the behaviour is not compatible with the Unix version (using iconv)
1636 // and break the library itself, e.g. wxTextInputStream::NextChar()
1637 // wouldn't work if reading an incomplete MB char didn't result in an
1638 // error
1639 //
1640 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1641 // an error (tested under Windows Server 2003) and apparently it is
1642 // done on purpose, i.e. the function accepts any input in this case
1643 // and although I'd prefer to return error on ill-formed output, our
1644 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1645 // explicitly ill-formed according to RFC 2152) neither so we don't
1646 // even have any fallback here...
1647 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1648
1649 const size_t len = ::MultiByteToWideChar
1650 (
1651 m_CodePage, // code page
1652 flags, // flags: fall on error
1653 psz, // input string
1654 -1, // its length (NUL-terminated)
1655 buf, // output string
1656 buf ? n : 0 // size of output buffer
1657 );
1658
1659 // note that it returns count of written chars for buf != NULL and size
1660 // of the needed buffer for buf == NULL so in either case the length of
1661 // the string (which never includes the terminating NUL) is one less
1662 return len ? len - 1 : (size_t)-1;
1663 }
1664
1665 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1666 {
1667 /*
1668 we have a problem here: by default, WideCharToMultiByte() may
1669 replace characters unrepresentable in the target code page with bad
1670 quality approximations such as turning "1/2" symbol (U+00BD) into
1671 "1" for the code pages which don't have it and we, obviously, want
1672 to avoid this at any price
1673
1674 the trouble is that this function does it _silently_, i.e. it won't
1675 even tell us whether it did or not... Win98/2000 and higher provide
1676 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1677 we have to resort to a round trip, i.e. check that converting back
1678 results in the same string -- this is, of course, expensive but
1679 otherwise we simply can't be sure to not garble the data.
1680 */
1681
1682 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1683 // it doesn't work with CJK encodings (which we test for rather roughly
1684 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1685 // supporting it
1686 BOOL usedDef wxDUMMY_INITIALIZE(false);
1687 BOOL *pUsedDef;
1688 int flags;
1689 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1690 {
1691 // it's our lucky day
1692 flags = WC_NO_BEST_FIT_CHARS;
1693 pUsedDef = &usedDef;
1694 }
1695 else // old system or unsupported encoding
1696 {
1697 flags = 0;
1698 pUsedDef = NULL;
1699 }
1700
1701 const size_t len = ::WideCharToMultiByte
1702 (
1703 m_CodePage, // code page
1704 flags, // either none or no best fit
1705 pwz, // input string
1706 -1, // it is (wide) NUL-terminated
1707 buf, // output buffer
1708 buf ? n : 0, // and its size
1709 NULL, // default "replacement" char
1710 pUsedDef // [out] was it used?
1711 );
1712
1713 if ( !len )
1714 {
1715 // function totally failed
1716 return (size_t)-1;
1717 }
1718
1719 // if we were really converting, check if we succeeded
1720 if ( buf )
1721 {
1722 if ( flags )
1723 {
1724 // check if the conversion failed, i.e. if any replacements
1725 // were done
1726 if ( usedDef )
1727 return (size_t)-1;
1728 }
1729 else // we must resort to double tripping...
1730 {
1731 wxWCharBuffer wcBuf(n);
1732 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1733 wcscmp(wcBuf, pwz) != 0 )
1734 {
1735 // we didn't obtain the same thing we started from, hence
1736 // the conversion was lossy and we consider that it failed
1737 return (size_t)-1;
1738 }
1739 }
1740 }
1741
1742 // see the comment above for the reason of "len - 1"
1743 return len - 1;
1744 }
1745
1746 bool IsOk() const { return m_CodePage != -1; }
1747
1748 private:
1749 static bool CanUseNoBestFit()
1750 {
1751 static int s_isWin98Or2k = -1;
1752
1753 if ( s_isWin98Or2k == -1 )
1754 {
1755 int verMaj, verMin;
1756 switch ( wxGetOsVersion(&verMaj, &verMin) )
1757 {
1758 case wxWIN95:
1759 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1760 break;
1761
1762 case wxWINDOWS_NT:
1763 s_isWin98Or2k = verMaj >= 5;
1764 break;
1765
1766 default:
1767 // unknown, be conseravtive by default
1768 s_isWin98Or2k = 0;
1769 }
1770
1771 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1772 }
1773
1774 return s_isWin98Or2k == 1;
1775 }
1776
1777 long m_CodePage;
1778 };
1779
1780 #endif // wxHAVE_WIN32_MB2WC
1781
1782 // ============================================================================
1783 // Cocoa conversion classes
1784 // ============================================================================
1785
1786 #if defined(__WXCOCOA__)
1787
1788 // RN: There is no UTF-32 support in either Core Foundation or
1789 // Cocoa. Strangely enough, internally Core Foundation uses
1790 // UTF 32 internally quite a bit - its just not public (yet).
1791
1792 #include <CoreFoundation/CFString.h>
1793 #include <CoreFoundation/CFStringEncodingExt.h>
1794
1795 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1796 {
1797 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1798 if ( encoding == wxFONTENCODING_DEFAULT )
1799 {
1800 enc = CFStringGetSystemEncoding();
1801 }
1802 else switch( encoding)
1803 {
1804 case wxFONTENCODING_ISO8859_1 :
1805 enc = kCFStringEncodingISOLatin1 ;
1806 break ;
1807 case wxFONTENCODING_ISO8859_2 :
1808 enc = kCFStringEncodingISOLatin2;
1809 break ;
1810 case wxFONTENCODING_ISO8859_3 :
1811 enc = kCFStringEncodingISOLatin3 ;
1812 break ;
1813 case wxFONTENCODING_ISO8859_4 :
1814 enc = kCFStringEncodingISOLatin4;
1815 break ;
1816 case wxFONTENCODING_ISO8859_5 :
1817 enc = kCFStringEncodingISOLatinCyrillic;
1818 break ;
1819 case wxFONTENCODING_ISO8859_6 :
1820 enc = kCFStringEncodingISOLatinArabic;
1821 break ;
1822 case wxFONTENCODING_ISO8859_7 :
1823 enc = kCFStringEncodingISOLatinGreek;
1824 break ;
1825 case wxFONTENCODING_ISO8859_8 :
1826 enc = kCFStringEncodingISOLatinHebrew;
1827 break ;
1828 case wxFONTENCODING_ISO8859_9 :
1829 enc = kCFStringEncodingISOLatin5;
1830 break ;
1831 case wxFONTENCODING_ISO8859_10 :
1832 enc = kCFStringEncodingISOLatin6;
1833 break ;
1834 case wxFONTENCODING_ISO8859_11 :
1835 enc = kCFStringEncodingISOLatinThai;
1836 break ;
1837 case wxFONTENCODING_ISO8859_13 :
1838 enc = kCFStringEncodingISOLatin7;
1839 break ;
1840 case wxFONTENCODING_ISO8859_14 :
1841 enc = kCFStringEncodingISOLatin8;
1842 break ;
1843 case wxFONTENCODING_ISO8859_15 :
1844 enc = kCFStringEncodingISOLatin9;
1845 break ;
1846
1847 case wxFONTENCODING_KOI8 :
1848 enc = kCFStringEncodingKOI8_R;
1849 break ;
1850 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1851 enc = kCFStringEncodingDOSRussian;
1852 break ;
1853
1854 // case wxFONTENCODING_BULGARIAN :
1855 // enc = ;
1856 // break ;
1857
1858 case wxFONTENCODING_CP437 :
1859 enc =kCFStringEncodingDOSLatinUS ;
1860 break ;
1861 case wxFONTENCODING_CP850 :
1862 enc = kCFStringEncodingDOSLatin1;
1863 break ;
1864 case wxFONTENCODING_CP852 :
1865 enc = kCFStringEncodingDOSLatin2;
1866 break ;
1867 case wxFONTENCODING_CP855 :
1868 enc = kCFStringEncodingDOSCyrillic;
1869 break ;
1870 case wxFONTENCODING_CP866 :
1871 enc =kCFStringEncodingDOSRussian ;
1872 break ;
1873 case wxFONTENCODING_CP874 :
1874 enc = kCFStringEncodingDOSThai;
1875 break ;
1876 case wxFONTENCODING_CP932 :
1877 enc = kCFStringEncodingDOSJapanese;
1878 break ;
1879 case wxFONTENCODING_CP936 :
1880 enc =kCFStringEncodingDOSChineseSimplif ;
1881 break ;
1882 case wxFONTENCODING_CP949 :
1883 enc = kCFStringEncodingDOSKorean;
1884 break ;
1885 case wxFONTENCODING_CP950 :
1886 enc = kCFStringEncodingDOSChineseTrad;
1887 break ;
1888 case wxFONTENCODING_CP1250 :
1889 enc = kCFStringEncodingWindowsLatin2;
1890 break ;
1891 case wxFONTENCODING_CP1251 :
1892 enc =kCFStringEncodingWindowsCyrillic ;
1893 break ;
1894 case wxFONTENCODING_CP1252 :
1895 enc =kCFStringEncodingWindowsLatin1 ;
1896 break ;
1897 case wxFONTENCODING_CP1253 :
1898 enc = kCFStringEncodingWindowsGreek;
1899 break ;
1900 case wxFONTENCODING_CP1254 :
1901 enc = kCFStringEncodingWindowsLatin5;
1902 break ;
1903 case wxFONTENCODING_CP1255 :
1904 enc =kCFStringEncodingWindowsHebrew ;
1905 break ;
1906 case wxFONTENCODING_CP1256 :
1907 enc =kCFStringEncodingWindowsArabic ;
1908 break ;
1909 case wxFONTENCODING_CP1257 :
1910 enc = kCFStringEncodingWindowsBalticRim;
1911 break ;
1912 // This only really encodes to UTF7 (if that) evidently
1913 // case wxFONTENCODING_UTF7 :
1914 // enc = kCFStringEncodingNonLossyASCII ;
1915 // break ;
1916 case wxFONTENCODING_UTF8 :
1917 enc = kCFStringEncodingUTF8 ;
1918 break ;
1919 case wxFONTENCODING_EUC_JP :
1920 enc = kCFStringEncodingEUC_JP;
1921 break ;
1922 case wxFONTENCODING_UTF16 :
1923 enc = kCFStringEncodingUnicode ;
1924 break ;
1925 case wxFONTENCODING_MACROMAN :
1926 enc = kCFStringEncodingMacRoman ;
1927 break ;
1928 case wxFONTENCODING_MACJAPANESE :
1929 enc = kCFStringEncodingMacJapanese ;
1930 break ;
1931 case wxFONTENCODING_MACCHINESETRAD :
1932 enc = kCFStringEncodingMacChineseTrad ;
1933 break ;
1934 case wxFONTENCODING_MACKOREAN :
1935 enc = kCFStringEncodingMacKorean ;
1936 break ;
1937 case wxFONTENCODING_MACARABIC :
1938 enc = kCFStringEncodingMacArabic ;
1939 break ;
1940 case wxFONTENCODING_MACHEBREW :
1941 enc = kCFStringEncodingMacHebrew ;
1942 break ;
1943 case wxFONTENCODING_MACGREEK :
1944 enc = kCFStringEncodingMacGreek ;
1945 break ;
1946 case wxFONTENCODING_MACCYRILLIC :
1947 enc = kCFStringEncodingMacCyrillic ;
1948 break ;
1949 case wxFONTENCODING_MACDEVANAGARI :
1950 enc = kCFStringEncodingMacDevanagari ;
1951 break ;
1952 case wxFONTENCODING_MACGURMUKHI :
1953 enc = kCFStringEncodingMacGurmukhi ;
1954 break ;
1955 case wxFONTENCODING_MACGUJARATI :
1956 enc = kCFStringEncodingMacGujarati ;
1957 break ;
1958 case wxFONTENCODING_MACORIYA :
1959 enc = kCFStringEncodingMacOriya ;
1960 break ;
1961 case wxFONTENCODING_MACBENGALI :
1962 enc = kCFStringEncodingMacBengali ;
1963 break ;
1964 case wxFONTENCODING_MACTAMIL :
1965 enc = kCFStringEncodingMacTamil ;
1966 break ;
1967 case wxFONTENCODING_MACTELUGU :
1968 enc = kCFStringEncodingMacTelugu ;
1969 break ;
1970 case wxFONTENCODING_MACKANNADA :
1971 enc = kCFStringEncodingMacKannada ;
1972 break ;
1973 case wxFONTENCODING_MACMALAJALAM :
1974 enc = kCFStringEncodingMacMalayalam ;
1975 break ;
1976 case wxFONTENCODING_MACSINHALESE :
1977 enc = kCFStringEncodingMacSinhalese ;
1978 break ;
1979 case wxFONTENCODING_MACBURMESE :
1980 enc = kCFStringEncodingMacBurmese ;
1981 break ;
1982 case wxFONTENCODING_MACKHMER :
1983 enc = kCFStringEncodingMacKhmer ;
1984 break ;
1985 case wxFONTENCODING_MACTHAI :
1986 enc = kCFStringEncodingMacThai ;
1987 break ;
1988 case wxFONTENCODING_MACLAOTIAN :
1989 enc = kCFStringEncodingMacLaotian ;
1990 break ;
1991 case wxFONTENCODING_MACGEORGIAN :
1992 enc = kCFStringEncodingMacGeorgian ;
1993 break ;
1994 case wxFONTENCODING_MACARMENIAN :
1995 enc = kCFStringEncodingMacArmenian ;
1996 break ;
1997 case wxFONTENCODING_MACCHINESESIMP :
1998 enc = kCFStringEncodingMacChineseSimp ;
1999 break ;
2000 case wxFONTENCODING_MACTIBETAN :
2001 enc = kCFStringEncodingMacTibetan ;
2002 break ;
2003 case wxFONTENCODING_MACMONGOLIAN :
2004 enc = kCFStringEncodingMacMongolian ;
2005 break ;
2006 case wxFONTENCODING_MACETHIOPIC :
2007 enc = kCFStringEncodingMacEthiopic ;
2008 break ;
2009 case wxFONTENCODING_MACCENTRALEUR :
2010 enc = kCFStringEncodingMacCentralEurRoman ;
2011 break ;
2012 case wxFONTENCODING_MACVIATNAMESE :
2013 enc = kCFStringEncodingMacVietnamese ;
2014 break ;
2015 case wxFONTENCODING_MACARABICEXT :
2016 enc = kCFStringEncodingMacExtArabic ;
2017 break ;
2018 case wxFONTENCODING_MACSYMBOL :
2019 enc = kCFStringEncodingMacSymbol ;
2020 break ;
2021 case wxFONTENCODING_MACDINGBATS :
2022 enc = kCFStringEncodingMacDingbats ;
2023 break ;
2024 case wxFONTENCODING_MACTURKISH :
2025 enc = kCFStringEncodingMacTurkish ;
2026 break ;
2027 case wxFONTENCODING_MACCROATIAN :
2028 enc = kCFStringEncodingMacCroatian ;
2029 break ;
2030 case wxFONTENCODING_MACICELANDIC :
2031 enc = kCFStringEncodingMacIcelandic ;
2032 break ;
2033 case wxFONTENCODING_MACROMANIAN :
2034 enc = kCFStringEncodingMacRomanian ;
2035 break ;
2036 case wxFONTENCODING_MACCELTIC :
2037 enc = kCFStringEncodingMacCeltic ;
2038 break ;
2039 case wxFONTENCODING_MACGAELIC :
2040 enc = kCFStringEncodingMacGaelic ;
2041 break ;
2042 // case wxFONTENCODING_MACKEYBOARD :
2043 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2044 // break ;
2045 default :
2046 // because gcc is picky
2047 break ;
2048 } ;
2049 return enc ;
2050 }
2051
2052 class wxMBConv_cocoa : public wxMBConv
2053 {
2054 public:
2055 wxMBConv_cocoa()
2056 {
2057 Init(CFStringGetSystemEncoding()) ;
2058 }
2059
2060 #if wxUSE_FONTMAP
2061 wxMBConv_cocoa(const wxChar* name)
2062 {
2063 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2064 }
2065 #endif
2066
2067 wxMBConv_cocoa(wxFontEncoding encoding)
2068 {
2069 Init( wxCFStringEncFromFontEnc(encoding) );
2070 }
2071
2072 ~wxMBConv_cocoa()
2073 {
2074 }
2075
2076 void Init( CFStringEncoding encoding)
2077 {
2078 m_encoding = encoding ;
2079 }
2080
2081 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2082 {
2083 wxASSERT(szUnConv);
2084
2085 CFStringRef theString = CFStringCreateWithBytes (
2086 NULL, //the allocator
2087 (const UInt8*)szUnConv,
2088 strlen(szUnConv),
2089 m_encoding,
2090 false //no BOM/external representation
2091 );
2092
2093 wxASSERT(theString);
2094
2095 size_t nOutLength = CFStringGetLength(theString);
2096
2097 if (szOut == NULL)
2098 {
2099 CFRelease(theString);
2100 return nOutLength;
2101 }
2102
2103 CFRange theRange = { 0, nOutSize };
2104
2105 #if SIZEOF_WCHAR_T == 4
2106 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2107 #endif
2108
2109 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2110
2111 CFRelease(theString);
2112
2113 szUniCharBuffer[nOutLength] = '\0' ;
2114
2115 #if SIZEOF_WCHAR_T == 4
2116 wxMBConvUTF16 converter ;
2117 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2118 delete[] szUniCharBuffer;
2119 #endif
2120
2121 return nOutLength;
2122 }
2123
2124 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2125 {
2126 wxASSERT(szUnConv);
2127
2128 size_t nRealOutSize;
2129 size_t nBufSize = wxWcslen(szUnConv);
2130 UniChar* szUniBuffer = (UniChar*) szUnConv;
2131
2132 #if SIZEOF_WCHAR_T == 4
2133 wxMBConvUTF16 converter ;
2134 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2135 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2136 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2137 nBufSize /= sizeof(UniChar);
2138 #endif
2139
2140 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2141 NULL, //allocator
2142 szUniBuffer,
2143 nBufSize,
2144 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2145 );
2146
2147 wxASSERT(theString);
2148
2149 //Note that CER puts a BOM when converting to unicode
2150 //so we check and use getchars instead in that case
2151 if (m_encoding == kCFStringEncodingUnicode)
2152 {
2153 if (szOut != NULL)
2154 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2155
2156 nRealOutSize = CFStringGetLength(theString) + 1;
2157 }
2158 else
2159 {
2160 CFStringGetBytes(
2161 theString,
2162 CFRangeMake(0, CFStringGetLength(theString)),
2163 m_encoding,
2164 0, //what to put in characters that can't be converted -
2165 //0 tells CFString to return NULL if it meets such a character
2166 false, //not an external representation
2167 (UInt8*) szOut,
2168 nOutSize,
2169 (CFIndex*) &nRealOutSize
2170 );
2171 }
2172
2173 CFRelease(theString);
2174
2175 #if SIZEOF_WCHAR_T == 4
2176 delete[] szUniBuffer;
2177 #endif
2178
2179 return nRealOutSize - 1;
2180 }
2181
2182 bool IsOk() const
2183 {
2184 return m_encoding != kCFStringEncodingInvalidId &&
2185 CFStringIsEncodingAvailable(m_encoding);
2186 }
2187
2188 private:
2189 CFStringEncoding m_encoding ;
2190 };
2191
2192 #endif // defined(__WXCOCOA__)
2193
2194 // ============================================================================
2195 // Mac conversion classes
2196 // ============================================================================
2197
2198 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2199
2200 class wxMBConv_mac : public wxMBConv
2201 {
2202 public:
2203 wxMBConv_mac()
2204 {
2205 Init(CFStringGetSystemEncoding()) ;
2206 }
2207
2208 #if wxUSE_FONTMAP
2209 wxMBConv_mac(const wxChar* name)
2210 {
2211 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2212 }
2213 #endif
2214
2215 wxMBConv_mac(wxFontEncoding encoding)
2216 {
2217 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2218 }
2219
2220 ~wxMBConv_mac()
2221 {
2222 OSStatus status = noErr ;
2223 status = TECDisposeConverter(m_MB2WC_converter);
2224 status = TECDisposeConverter(m_WC2MB_converter);
2225 }
2226
2227
2228 void Init( TextEncodingBase encoding)
2229 {
2230 OSStatus status = noErr ;
2231 m_char_encoding = encoding ;
2232 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2233
2234 status = TECCreateConverter(&m_MB2WC_converter,
2235 m_char_encoding,
2236 m_unicode_encoding);
2237 status = TECCreateConverter(&m_WC2MB_converter,
2238 m_unicode_encoding,
2239 m_char_encoding);
2240 }
2241
2242 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2243 {
2244 OSStatus status = noErr ;
2245 ByteCount byteOutLen ;
2246 ByteCount byteInLen = strlen(psz) ;
2247 wchar_t *tbuf = NULL ;
2248 UniChar* ubuf = NULL ;
2249 size_t res = 0 ;
2250
2251 if (buf == NULL)
2252 {
2253 //apple specs say at least 32
2254 n = wxMax( 32 , byteInLen ) ;
2255 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2256 }
2257 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2258 #if SIZEOF_WCHAR_T == 4
2259 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2260 #else
2261 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2262 #endif
2263 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2264 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2265 #if SIZEOF_WCHAR_T == 4
2266 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2267 // is not properly terminated we get random characters at the end
2268 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2269 wxMBConvUTF16 converter ;
2270 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2271 free( ubuf ) ;
2272 #else
2273 res = byteOutLen / sizeof( UniChar ) ;
2274 #endif
2275 if ( buf == NULL )
2276 free(tbuf) ;
2277
2278 if ( buf && res < n)
2279 buf[res] = 0;
2280
2281 return res ;
2282 }
2283
2284 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2285 {
2286 OSStatus status = noErr ;
2287 ByteCount byteOutLen ;
2288 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2289
2290 char *tbuf = NULL ;
2291
2292 if (buf == NULL)
2293 {
2294 //apple specs say at least 32
2295 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2296 tbuf = (char*) malloc( n ) ;
2297 }
2298
2299 ByteCount byteBufferLen = n ;
2300 UniChar* ubuf = NULL ;
2301 #if SIZEOF_WCHAR_T == 4
2302 wxMBConvUTF16 converter ;
2303 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2304 byteInLen = unicharlen ;
2305 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2306 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2307 #else
2308 ubuf = (UniChar*) psz ;
2309 #endif
2310 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2311 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2312 #if SIZEOF_WCHAR_T == 4
2313 free( ubuf ) ;
2314 #endif
2315 if ( buf == NULL )
2316 free(tbuf) ;
2317
2318 size_t res = byteOutLen ;
2319 if ( buf && res < n)
2320 {
2321 buf[res] = 0;
2322
2323 //we need to double-trip to verify it didn't insert any ? in place
2324 //of bogus characters
2325 wxWCharBuffer wcBuf(n);
2326 size_t pszlen = wxWcslen(psz);
2327 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2328 wxWcslen(wcBuf) != pszlen ||
2329 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2330 {
2331 // we didn't obtain the same thing we started from, hence
2332 // the conversion was lossy and we consider that it failed
2333 return (size_t)-1;
2334 }
2335 }
2336
2337 return res ;
2338 }
2339
2340 bool IsOk() const
2341 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2342
2343 private:
2344 TECObjectRef m_MB2WC_converter ;
2345 TECObjectRef m_WC2MB_converter ;
2346
2347 TextEncodingBase m_char_encoding ;
2348 TextEncodingBase m_unicode_encoding ;
2349 };
2350
2351 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2352
2353 // ============================================================================
2354 // wxEncodingConverter based conversion classes
2355 // ============================================================================
2356
2357 #if wxUSE_FONTMAP
2358
2359 class wxMBConv_wxwin : public wxMBConv
2360 {
2361 private:
2362 void Init()
2363 {
2364 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2365 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2366 }
2367
2368 public:
2369 // temporarily just use wxEncodingConverter stuff,
2370 // so that it works while a better implementation is built
2371 wxMBConv_wxwin(const wxChar* name)
2372 {
2373 if (name)
2374 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2375 else
2376 m_enc = wxFONTENCODING_SYSTEM;
2377
2378 Init();
2379 }
2380
2381 wxMBConv_wxwin(wxFontEncoding enc)
2382 {
2383 m_enc = enc;
2384
2385 Init();
2386 }
2387
2388 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2389 {
2390 size_t inbuf = strlen(psz);
2391 if (buf)
2392 {
2393 if (!m2w.Convert(psz,buf))
2394 return (size_t)-1;
2395 }
2396 return inbuf;
2397 }
2398
2399 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2400 {
2401 const size_t inbuf = wxWcslen(psz);
2402 if (buf)
2403 {
2404 if (!w2m.Convert(psz,buf))
2405 return (size_t)-1;
2406 }
2407
2408 return inbuf;
2409 }
2410
2411 bool IsOk() const { return m_ok; }
2412
2413 public:
2414 wxFontEncoding m_enc;
2415 wxEncodingConverter m2w, w2m;
2416
2417 // were we initialized successfully?
2418 bool m_ok;
2419
2420 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2421 };
2422
2423 // make the constructors available for unit testing
2424 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2425 {
2426 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2427 if ( !result->IsOk() )
2428 {
2429 delete result;
2430 return 0;
2431 }
2432 return result;
2433 }
2434
2435 #endif // wxUSE_FONTMAP
2436
2437 // ============================================================================
2438 // wxCSConv implementation
2439 // ============================================================================
2440
2441 void wxCSConv::Init()
2442 {
2443 m_name = NULL;
2444 m_convReal = NULL;
2445 m_deferred = true;
2446 }
2447
2448 wxCSConv::wxCSConv(const wxChar *charset)
2449 {
2450 Init();
2451
2452 if ( charset )
2453 {
2454 SetName(charset);
2455 }
2456
2457 m_encoding = wxFONTENCODING_SYSTEM;
2458 }
2459
2460 wxCSConv::wxCSConv(wxFontEncoding encoding)
2461 {
2462 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2463 {
2464 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2465
2466 encoding = wxFONTENCODING_SYSTEM;
2467 }
2468
2469 Init();
2470
2471 m_encoding = encoding;
2472 }
2473
2474 wxCSConv::~wxCSConv()
2475 {
2476 Clear();
2477 }
2478
2479 wxCSConv::wxCSConv(const wxCSConv& conv)
2480 : wxMBConv()
2481 {
2482 Init();
2483
2484 SetName(conv.m_name);
2485 m_encoding = conv.m_encoding;
2486 }
2487
2488 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2489 {
2490 Clear();
2491
2492 SetName(conv.m_name);
2493 m_encoding = conv.m_encoding;
2494
2495 return *this;
2496 }
2497
2498 void wxCSConv::Clear()
2499 {
2500 free(m_name);
2501 delete m_convReal;
2502
2503 m_name = NULL;
2504 m_convReal = NULL;
2505 }
2506
2507 void wxCSConv::SetName(const wxChar *charset)
2508 {
2509 if (charset)
2510 {
2511 m_name = wxStrdup(charset);
2512 m_deferred = true;
2513 }
2514 }
2515
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// map from the encoding to a charset name, used by wxCSConv::DoCreate() to
// remember the name iconv accepted for this encoding (an empty name records
// that none of them worked)
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
#endif // wxUSE_FONTMAP
2524
2525 wxMBConv *wxCSConv::DoCreate() const
2526 {
2527 #if wxUSE_FONTMAP
2528 wxLogTrace(TRACE_STRCONV,
2529 wxT("creating conversion for %s"),
2530 (m_name ? m_name
2531 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2532 #endif // wxUSE_FONTMAP
2533
2534 // check for the special case of ASCII or ISO8859-1 charset: as we have
2535 // special knowledge of it anyhow, we don't need to create a special
2536 // conversion object
2537 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2538 {
2539 // don't convert at all
2540 return NULL;
2541 }
2542
2543 // we trust OS to do conversion better than we can so try external
2544 // conversion methods first
2545 //
2546 // the full order is:
2547 // 1. OS conversion (iconv() under Unix or Win32 API)
2548 // 2. hard coded conversions for UTF
2549 // 3. wxEncodingConverter as fall back
2550
2551 // step (1)
2552 #ifdef HAVE_ICONV
2553 #if !wxUSE_FONTMAP
2554 if ( m_name )
2555 #endif // !wxUSE_FONTMAP
2556 {
2557 wxString name(m_name);
2558 wxFontEncoding encoding(m_encoding);
2559
2560 if ( !name.empty() )
2561 {
2562 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2563 if ( conv->IsOk() )
2564 return conv;
2565
2566 delete conv;
2567
2568 #if wxUSE_FONTMAP
2569 encoding =
2570 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2571 #endif // wxUSE_FONTMAP
2572 }
2573 #if wxUSE_FONTMAP
2574 {
2575 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2576 if ( it != gs_nameCache.end() )
2577 {
2578 if ( it->second.empty() )
2579 return NULL;
2580
2581 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2582 if ( conv->IsOk() )
2583 return conv;
2584
2585 delete conv;
2586 }
2587
2588 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2589
2590 for ( ; *names; ++names )
2591 {
2592 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2593 if ( conv->IsOk() )
2594 {
2595 gs_nameCache[encoding] = *names;
2596 return conv;
2597 }
2598
2599 delete conv;
2600 }
2601
2602 gs_nameCache[encoding] = _T(""); // cache the failure
2603 }
2604 #endif // wxUSE_FONTMAP
2605 }
2606 #endif // HAVE_ICONV
2607
2608 #ifdef wxHAVE_WIN32_MB2WC
2609 {
2610 #if wxUSE_FONTMAP
2611 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2612 : new wxMBConv_win32(m_encoding);
2613 if ( conv->IsOk() )
2614 return conv;
2615
2616 delete conv;
2617 #else
2618 return NULL;
2619 #endif
2620 }
2621 #endif // wxHAVE_WIN32_MB2WC
2622 #if defined(__WXMAC__)
2623 {
2624 // leave UTF16 and UTF32 to the built-ins of wx
2625 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2626 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2627 {
2628
2629 #if wxUSE_FONTMAP
2630 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2631 : new wxMBConv_mac(m_encoding);
2632 #else
2633 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2634 #endif
2635 if ( conv->IsOk() )
2636 return conv;
2637
2638 delete conv;
2639 }
2640 }
2641 #endif
2642 #if defined(__WXCOCOA__)
2643 {
2644 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2645 {
2646
2647 #if wxUSE_FONTMAP
2648 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2649 : new wxMBConv_cocoa(m_encoding);
2650 #else
2651 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2652 #endif
2653 if ( conv->IsOk() )
2654 return conv;
2655
2656 delete conv;
2657 }
2658 }
2659 #endif
2660 // step (2)
2661 wxFontEncoding enc = m_encoding;
2662 #if wxUSE_FONTMAP
2663 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2664 {
2665 // use "false" to suppress interactive dialogs -- we can be called from
2666 // anywhere and popping up a dialog from here is the last thing we want to
2667 // do
2668 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2669 }
2670 #endif // wxUSE_FONTMAP
2671
2672 switch ( enc )
2673 {
2674 case wxFONTENCODING_UTF7:
2675 return new wxMBConvUTF7;
2676
2677 case wxFONTENCODING_UTF8:
2678 return new wxMBConvUTF8;
2679
2680 case wxFONTENCODING_UTF16BE:
2681 return new wxMBConvUTF16BE;
2682
2683 case wxFONTENCODING_UTF16LE:
2684 return new wxMBConvUTF16LE;
2685
2686 case wxFONTENCODING_UTF32BE:
2687 return new wxMBConvUTF32BE;
2688
2689 case wxFONTENCODING_UTF32LE:
2690 return new wxMBConvUTF32LE;
2691
2692 default:
2693 // nothing to do but put here to suppress gcc warnings
2694 ;
2695 }
2696
2697 // step (3)
2698 #if wxUSE_FONTMAP
2699 {
2700 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2701 : new wxMBConv_wxwin(m_encoding);
2702 if ( conv->IsOk() )
2703 return conv;
2704
2705 delete conv;
2706 }
2707 #endif // wxUSE_FONTMAP
2708
2709 // NB: This is a hack to prevent deadlock. What could otherwise happen
2710 // in Unicode build: wxConvLocal creation ends up being here
2711 // because of some failure and logs the error. But wxLog will try to
2712 // attach timestamp, for which it will need wxConvLocal (to convert
2713 // time to char* and then wchar_t*), but that fails, tries to log
2714 // error, but wxLog has a (already locked) critical section that
2715 // guards static buffer.
2716 static bool alreadyLoggingError = false;
2717 if (!alreadyLoggingError)
2718 {
2719 alreadyLoggingError = true;
2720 wxLogError(_("Cannot convert from the charset '%s'!"),
2721 m_name ? m_name
2722 :
2723 #if wxUSE_FONTMAP
2724 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2725 #else // !wxUSE_FONTMAP
2726 wxString::Format(_("encoding %s"), m_encoding).c_str()
2727 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2728 );
2729 alreadyLoggingError = false;
2730 }
2731
2732 return NULL;
2733 }
2734
2735 void wxCSConv::CreateConvIfNeeded() const
2736 {
2737 if ( m_deferred )
2738 {
2739 wxCSConv *self = (wxCSConv *)this; // const_cast
2740
2741 #if wxUSE_INTL
2742 // if we don't have neither the name nor the encoding, use the default
2743 // encoding for this system
2744 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2745 {
2746 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2747 }
2748 #endif // wxUSE_INTL
2749
2750 self->m_convReal = DoCreate();
2751 self->m_deferred = false;
2752 }
2753 }
2754
2755 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2756 {
2757 CreateConvIfNeeded();
2758
2759 if (m_convReal)
2760 return m_convReal->MB2WC(buf, psz, n);
2761
2762 // latin-1 (direct)
2763 size_t len = strlen(psz);
2764
2765 if (buf)
2766 {
2767 for (size_t c = 0; c <= len; c++)
2768 buf[c] = (unsigned char)(psz[c]);
2769 }
2770
2771 return len;
2772 }
2773
2774 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2775 {
2776 CreateConvIfNeeded();
2777
2778 if (m_convReal)
2779 return m_convReal->WC2MB(buf, psz, n);
2780
2781 // latin-1 (direct)
2782 const size_t len = wxWcslen(psz);
2783 if (buf)
2784 {
2785 for (size_t c = 0; c <= len; c++)
2786 {
2787 if (psz[c] > 0xFF)
2788 return (size_t)-1;
2789 buf[c] = (char)psz[c];
2790 }
2791 }
2792 else
2793 {
2794 for (size_t c = 0; c <= len; c++)
2795 {
2796 if (psz[c] > 0xFF)
2797 return (size_t)-1;
2798 }
2799 }
2800
2801 return len;
2802 }
2803
2804 // ----------------------------------------------------------------------------
2805 // globals
2806 // ----------------------------------------------------------------------------
2807
2808 #ifdef __WINDOWS__
2809 static wxMBConv_win32 wxConvLibcObj;
2810 #elif defined(__WXMAC__) && !defined(__MACH__)
2811 static wxMBConv_mac wxConvLibcObj ;
2812 #else
2813 static wxMBConvLibc wxConvLibcObj;
2814 #endif
2815
2816 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2817 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2818 static wxMBConvUTF7 wxConvUTF7Obj;
2819 static wxMBConvUTF8 wxConvUTF8Obj;
2820
2821 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2822 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2823 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2824 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2825 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2826 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2827 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2828 #ifdef __WXOSX__
2829 wxConvUTF8Obj;
2830 #else
2831 wxConvLibcObj;
2832 #endif
2833
2834
2835 #else // !wxUSE_WCHAR_T
2836
2837 // stand-ins in absence of wchar_t
2838 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2839 wxConvISO8859_1,
2840 wxConvLocal,
2841 wxConvUTF8;
2842
2843 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2844
2845