]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
position is always unsigned in InsetPage(), no need to compare it with 0
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
79 #define TRACE_STRCONV _T("strconv")
80
81 // ============================================================================
82 // implementation
83 // ============================================================================
84
85 // ----------------------------------------------------------------------------
86 // UTF-16 en/decoding to/from UCS-4
87 // ----------------------------------------------------------------------------
88
89
90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
91 {
92 if (input<=0xffff)
93 {
94 if (output)
95 *output = (wxUint16) input;
96 return 1;
97 }
98 else if (input>=0x110000)
99 {
100 return (size_t)-1;
101 }
102 else
103 {
104 if (output)
105 {
106 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
107 *output = (wxUint16) ((input&0x3ff)+0xdc00);
108 }
109 return 2;
110 }
111 }
112
113 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
114 {
115 if ((*input<0xd800) || (*input>0xdfff))
116 {
117 output = *input;
118 return 1;
119 }
120 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
121 {
122 output = *input;
123 return (size_t)-1;
124 }
125 else
126 {
127 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
128 return 2;
129 }
130 }
131
132
133 // ----------------------------------------------------------------------------
134 // wxMBConv
135 // ----------------------------------------------------------------------------
136
137 wxMBConv::~wxMBConv()
138 {
139 // nothing to do here (necessary for Darwin linking probably)
140 }
141
142 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
143 {
144 if ( psz )
145 {
146 // calculate the length of the buffer needed first
147 size_t nLen = MB2WC(NULL, psz, 0);
148 if ( nLen != (size_t)-1 )
149 {
150 // now do the actual conversion
151 wxWCharBuffer buf(nLen);
152 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
153 if ( nLen != (size_t)-1 )
154 {
155 return buf;
156 }
157 }
158 }
159
160 wxWCharBuffer buf((wchar_t *)NULL);
161
162 return buf;
163 }
164
165 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
166 {
167 if ( pwz )
168 {
169 size_t nLen = WC2MB(NULL, pwz, 0);
170 if ( nLen != (size_t)-1 )
171 {
172 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
173 nLen = WC2MB(buf.data(), pwz, nLen + 4);
174 if ( nLen != (size_t)-1 )
175 {
176 return buf;
177 }
178 }
179 }
180
181 wxCharBuffer buf((char *)NULL);
182
183 return buf;
184 }
185
186 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
187 {
188 wxASSERT(pOutSize != NULL);
189
190 const char* szEnd = szString + nStringLen + 1;
191 const char* szPos = szString;
192 const char* szStart = szPos;
193
194 size_t nActualLength = 0;
195 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
196
197 wxWCharBuffer theBuffer(nCurrentSize);
198
199 //Convert the string until the length() is reached, continuing the
200 //loop every time a null character is reached
201 while(szPos != szEnd)
202 {
203 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
204
205 //Get the length of the current (sub)string
206 size_t nLen = MB2WC(NULL, szPos, 0);
207
208 //Invalid conversion?
209 if( nLen == (size_t)-1 )
210 {
211 *pOutSize = 0;
212 theBuffer.data()[0u] = wxT('\0');
213 return theBuffer;
214 }
215
216
217 //Increase the actual length (+1 for current null character)
218 nActualLength += nLen + 1;
219
220 //if buffer too big, realloc the buffer
221 if (nActualLength > (nCurrentSize+1))
222 {
223 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
224 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
225 theBuffer = theNewBuffer;
226 nCurrentSize <<= 1;
227 }
228
229 //Convert the current (sub)string
230 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
231 {
232 *pOutSize = 0;
233 theBuffer.data()[0u] = wxT('\0');
234 return theBuffer;
235 }
236
237 //Increment to next (sub)string
238 //Note that we have to use strlen instead of nLen here
239 //because XX2XX gives us the size of the output buffer,
240 //which is not necessarily the length of the string
241 szPos += strlen(szPos) + 1;
242 }
243
244 //success - return actual length and the buffer
245 *pOutSize = nActualLength;
246 return theBuffer;
247 }
248
249 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
250 {
251 wxASSERT(pOutSize != NULL);
252
253 const wchar_t* szEnd = szString + nStringLen + 1;
254 const wchar_t* szPos = szString;
255 const wchar_t* szStart = szPos;
256
257 size_t nActualLength = 0;
258 size_t nCurrentSize = nStringLen << 2; //try * 4 first
259
260 wxCharBuffer theBuffer(nCurrentSize);
261
262 //Convert the string until the length() is reached, continuing the
263 //loop every time a null character is reached
264 while(szPos != szEnd)
265 {
266 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
267
268 //Get the length of the current (sub)string
269 size_t nLen = WC2MB(NULL, szPos, 0);
270
271 //Invalid conversion?
272 if( nLen == (size_t)-1 )
273 {
274 *pOutSize = 0;
275 theBuffer.data()[0u] = wxT('\0');
276 return theBuffer;
277 }
278
279 //Increase the actual length (+1 for current null character)
280 nActualLength += nLen + 1;
281
282 //if buffer too big, realloc the buffer
283 if (nActualLength > (nCurrentSize+1))
284 {
285 wxCharBuffer theNewBuffer(nCurrentSize << 1);
286 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
287 theBuffer = theNewBuffer;
288 nCurrentSize <<= 1;
289 }
290
291 //Convert the current (sub)string
292 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
293 {
294 *pOutSize = 0;
295 theBuffer.data()[0u] = wxT('\0');
296 return theBuffer;
297 }
298
299 //Increment to next (sub)string
300 //Note that we have to use wxWcslen instead of nLen here
301 //because XX2XX gives us the size of the output buffer,
302 //which is not necessarily the length of the string
303 szPos += wxWcslen(szPos) + 1;
304 }
305
306 //success - return actual length and the buffer
307 *pOutSize = nActualLength;
308 return theBuffer;
309 }
310
311 // ----------------------------------------------------------------------------
312 // wxMBConvLibc
313 // ----------------------------------------------------------------------------
314
315 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
316 {
317 return wxMB2WC(buf, psz, n);
318 }
319
320 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
321 {
322 return wxWC2MB(buf, psz, n);
323 }
324
325 #ifdef __UNIX__
326
327 // ----------------------------------------------------------------------------
328 // wxConvBrokenFileNames
329 // ----------------------------------------------------------------------------
330
331 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
332 {
333 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
334 || wxStricmp(charset, _T("UTF8")) == 0 )
335 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
336 else
337 m_conv = new wxCSConv(charset);
338 }
339
340 size_t
341 wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
342 const char *psz,
343 size_t outputSize) const
344 {
345 return m_conv->MB2WC( outputBuf, psz, outputSize );
346 }
347
348 size_t
349 wxConvBrokenFileNames::WC2MB(char *outputBuf,
350 const wchar_t *psz,
351 size_t outputSize) const
352 {
353 return m_conv->WC2MB( outputBuf, psz, outputSize );
354 }
355
356 #endif
357
358 // ----------------------------------------------------------------------------
359 // UTF-7
360 // ----------------------------------------------------------------------------
361
362 // Implementation (C) 2004 Fredrik Roubert
363
//
// BASE64 decoding table: indexed by byte value, yields the 6 bit value of a
// BASE64 digit or 0xff for bytes which are not BASE64 digits
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
402
403 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
404 {
405 size_t len = 0;
406
407 while (*psz && ((!buf) || (len < n)))
408 {
409 unsigned char cc = *psz++;
410 if (cc != '+')
411 {
412 // plain ASCII char
413 if (buf)
414 *buf++ = cc;
415 len++;
416 }
417 else if (*psz == '-')
418 {
419 // encoded plus sign
420 if (buf)
421 *buf++ = cc;
422 len++;
423 psz++;
424 }
425 else
426 {
427 // BASE64 encoded string
428 bool lsb;
429 unsigned char c;
430 unsigned int d, l;
431 for (lsb = false, d = 0, l = 0;
432 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
433 {
434 d <<= 6;
435 d += cc;
436 for (l += 6; l >= 8; lsb = !lsb)
437 {
438 c = (unsigned char)((d >> (l -= 8)) % 256);
439 if (lsb)
440 {
441 if (buf)
442 *buf++ |= c;
443 len ++;
444 }
445 else
446 if (buf)
447 *buf = (wchar_t)(c << 8);
448 }
449 }
450 if (*psz == '-')
451 psz++;
452 }
453 }
454 if (buf && (len < n))
455 *buf = 0;
456 return len;
457 }
458
//
// BASE64 encoding table: maps a 6 bit value to its BASE64 digit
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
473
//
// UTF-7 encoding table, indexed by ASCII code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78
};
493
494 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
495 {
496
497
498 size_t len = 0;
499
500 while (*psz && ((!buf) || (len < n)))
501 {
502 wchar_t cc = *psz++;
503 if (cc < 0x80 && utf7encode[cc] < 1)
504 {
505 // plain ASCII char
506 if (buf)
507 *buf++ = (char)cc;
508 len++;
509 }
510 #ifndef WC_UTF16
511 else if (((wxUint32)cc) > 0xffff)
512 {
513 // no surrogate pair generation (yet?)
514 return (size_t)-1;
515 }
516 #endif
517 else
518 {
519 if (buf)
520 *buf++ = '+';
521 len++;
522 if (cc != '+')
523 {
524 // BASE64 encode string
525 unsigned int lsb, d, l;
526 for (d = 0, l = 0;; psz++)
527 {
528 for (lsb = 0; lsb < 2; lsb ++)
529 {
530 d <<= 8;
531 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
532
533 for (l += 8; l >= 6; )
534 {
535 l -= 6;
536 if (buf)
537 *buf++ = utf7enb64[(d >> l) % 64];
538 len++;
539 }
540 }
541 cc = *psz;
542 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
543 break;
544 }
545 if (l != 0)
546 {
547 if (buf)
548 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
549 len++;
550 }
551 }
552 if (buf)
553 *buf++ = '-';
554 len++;
555 }
556 }
557 if (buf && (len < n))
558 *buf = 0;
559 return len;
560 }
561
562 // ----------------------------------------------------------------------------
563 // UTF-8
564 // ----------------------------------------------------------------------------
565
566 static wxUint32 utf8_max[]=
567 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
568
569 // boundaries of the private use area we use to (temporarily) remap invalid
570 // characters invalid in a UTF-8 encoded string
571 const wxUint32 wxUnicodePUA = 0x100000;
572 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
573
574 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
575 {
576 size_t len = 0;
577
578 while (*psz && ((!buf) || (len < n)))
579 {
580 const char *opsz = psz;
581 bool invalid = false;
582 unsigned char cc = *psz++, fc = cc;
583 unsigned cnt;
584 for (cnt = 0; fc & 0x80; cnt++)
585 fc <<= 1;
586 if (!cnt)
587 {
588 // plain ASCII char
589 if (buf)
590 *buf++ = cc;
591 len++;
592
593 // escape the escape character for octal escapes
594 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
595 && cc == '\\' && (!buf || len < n))
596 {
597 if (buf)
598 *buf++ = cc;
599 len++;
600 }
601 }
602 else
603 {
604 cnt--;
605 if (!cnt)
606 {
607 // invalid UTF-8 sequence
608 invalid = true;
609 }
610 else
611 {
612 unsigned ocnt = cnt - 1;
613 wxUint32 res = cc & (0x3f >> cnt);
614 while (cnt--)
615 {
616 cc = *psz;
617 if ((cc & 0xC0) != 0x80)
618 {
619 // invalid UTF-8 sequence
620 invalid = true;
621 break;
622 }
623 psz++;
624 res = (res << 6) | (cc & 0x3f);
625 }
626 if (invalid || res <= utf8_max[ocnt])
627 {
628 // illegal UTF-8 encoding
629 invalid = true;
630 }
631 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
632 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
633 {
634 // if one of our PUA characters turns up externally
635 // it must also be treated as an illegal sequence
636 // (a bit like you have to escape an escape character)
637 invalid = true;
638 }
639 else
640 {
641 #ifdef WC_UTF16
642 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
643 size_t pa = encode_utf16(res, (wxUint16 *)buf);
644 if (pa == (size_t)-1)
645 {
646 invalid = true;
647 }
648 else
649 {
650 if (buf)
651 buf += pa;
652 len += pa;
653 }
654 #else // !WC_UTF16
655 if (buf)
656 *buf++ = res;
657 len++;
658 #endif // WC_UTF16/!WC_UTF16
659 }
660 }
661 if (invalid)
662 {
663 if (m_options & MAP_INVALID_UTF8_TO_PUA)
664 {
665 while (opsz < psz && (!buf || len < n))
666 {
667 #ifdef WC_UTF16
668 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
669 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
670 wxASSERT(pa != (size_t)-1);
671 if (buf)
672 buf += pa;
673 opsz++;
674 len += pa;
675 #else
676 if (buf)
677 *buf++ = wxUnicodePUA + (unsigned char)*opsz;
678 opsz++;
679 len++;
680 #endif
681 }
682 }
683 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
684 {
685 while (opsz < psz && (!buf || len < n))
686 {
687 if ( buf && len + 3 < n )
688 {
689 unsigned char n = *opsz;
690 *buf++ = L'\\';
691 *buf++ = (wchar_t)( L'0' + n / 0100 );
692 *buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
693 *buf++ = (wchar_t)( L'0' + n % 010 );
694 }
695 opsz++;
696 len += 4;
697 }
698 }
699 else // MAP_INVALID_UTF8_NOT
700 {
701 return (size_t)-1;
702 }
703 }
704 }
705 }
706 if (buf && (len < n))
707 *buf = 0;
708 return len;
709 }
710
// Return true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if (wch < L'0')
        return false;

    return wch <= L'7';
}
715
716 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
717 {
718 size_t len = 0;
719
720 while (*psz && ((!buf) || (len < n)))
721 {
722 wxUint32 cc;
723 #ifdef WC_UTF16
724 // cast is ok for WC_UTF16
725 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
726 psz += (pa == (size_t)-1) ? 1 : pa;
727 #else
728 cc=(*psz++) & 0x7fffffff;
729 #endif
730
731 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
732 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
733 {
734 if (buf)
735 *buf++ = (char)(cc - wxUnicodePUA);
736 len++;
737 }
738 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
739 && cc == L'\\' && psz[0] == L'\\' )
740 {
741 if (buf)
742 *buf++ = (char)cc;
743 psz++;
744 len++;
745 }
746 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
747 cc == L'\\' &&
748 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
749 {
750 if (buf)
751 {
752 *buf++ = (char) ((psz[0] - L'0')*0100 +
753 (psz[1] - L'0')*010 +
754 (psz[2] - L'0'));
755 }
756
757 psz += 3;
758 len++;
759 }
760 else
761 {
762 unsigned cnt;
763 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
764 if (!cnt)
765 {
766 // plain ASCII char
767 if (buf)
768 *buf++ = (char) cc;
769 len++;
770 }
771
772 else
773 {
774 len += cnt + 1;
775 if (buf)
776 {
777 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
778 while (cnt--)
779 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
780 }
781 }
782 }
783 }
784
785 if (buf && (len<n))
786 *buf = 0;
787
788 return len;
789 }
790
791 // ----------------------------------------------------------------------------
792 // UTF-16
793 // ----------------------------------------------------------------------------
794
795 #ifdef WORDS_BIGENDIAN
796 #define wxMBConvUTF16straight wxMBConvUTF16BE
797 #define wxMBConvUTF16swap wxMBConvUTF16LE
798 #else
799 #define wxMBConvUTF16swap wxMBConvUTF16BE
800 #define wxMBConvUTF16straight wxMBConvUTF16LE
801 #endif
802
803
804 #ifdef WC_UTF16
805
806 // copy 16bit MB to 16bit String
807 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
808 {
809 size_t len=0;
810
811 while (*(wxUint16*)psz && (!buf || len < n))
812 {
813 if (buf)
814 *buf++ = *(wxUint16*)psz;
815 len++;
816
817 psz += sizeof(wxUint16);
818 }
819 if (buf && len<n) *buf=0;
820
821 return len;
822 }
823
824
825 // copy 16bit String to 16bit MB
826 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
827 {
828 size_t len=0;
829
830 while (*psz && (!buf || len < n))
831 {
832 if (buf)
833 {
834 *(wxUint16*)buf = *psz;
835 buf += sizeof(wxUint16);
836 }
837 len += sizeof(wxUint16);
838 psz++;
839 }
840 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
841
842 return len;
843 }
844
845
846 // swap 16bit MB to 16bit String
847 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
848 {
849 size_t len=0;
850
851 while (*(wxUint16*)psz && (!buf || len < n))
852 {
853 if (buf)
854 {
855 ((char *)buf)[0] = psz[1];
856 ((char *)buf)[1] = psz[0];
857 buf++;
858 }
859 len++;
860 psz += sizeof(wxUint16);
861 }
862 if (buf && len<n) *buf=0;
863
864 return len;
865 }
866
867
868 // swap 16bit MB to 16bit String
869 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
870 {
871 size_t len=0;
872
873 while (*psz && (!buf || len < n))
874 {
875 if (buf)
876 {
877 *buf++ = ((char*)psz)[1];
878 *buf++ = ((char*)psz)[0];
879 }
880 len += sizeof(wxUint16);
881 psz++;
882 }
883 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
884
885 return len;
886 }
887
888
889 #else // WC_UTF16
890
891
892 // copy 16bit MB to 32bit String
893 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
894 {
895 size_t len=0;
896
897 while (*(wxUint16*)psz && (!buf || len < n))
898 {
899 wxUint32 cc;
900 size_t pa=decode_utf16((wxUint16*)psz, cc);
901 if (pa == (size_t)-1)
902 return pa;
903
904 if (buf)
905 *buf++ = cc;
906 len++;
907 psz += pa * sizeof(wxUint16);
908 }
909 if (buf && len<n) *buf=0;
910
911 return len;
912 }
913
914
915 // copy 32bit String to 16bit MB
916 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
917 {
918 size_t len=0;
919
920 while (*psz && (!buf || len < n))
921 {
922 wxUint16 cc[2];
923 size_t pa=encode_utf16(*psz, cc);
924
925 if (pa == (size_t)-1)
926 return pa;
927
928 if (buf)
929 {
930 *(wxUint16*)buf = cc[0];
931 buf += sizeof(wxUint16);
932 if (pa > 1)
933 {
934 *(wxUint16*)buf = cc[1];
935 buf += sizeof(wxUint16);
936 }
937 }
938
939 len += pa*sizeof(wxUint16);
940 psz++;
941 }
942 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
943
944 return len;
945 }
946
947
948 // swap 16bit MB to 32bit String
949 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
950 {
951 size_t len=0;
952
953 while (*(wxUint16*)psz && (!buf || len < n))
954 {
955 wxUint32 cc;
956 char tmp[4];
957 tmp[0]=psz[1]; tmp[1]=psz[0];
958 tmp[2]=psz[3]; tmp[3]=psz[2];
959
960 size_t pa=decode_utf16((wxUint16*)tmp, cc);
961 if (pa == (size_t)-1)
962 return pa;
963
964 if (buf)
965 *buf++ = cc;
966
967 len++;
968 psz += pa * sizeof(wxUint16);
969 }
970 if (buf && len<n) *buf=0;
971
972 return len;
973 }
974
975
976 // swap 32bit String to 16bit MB
977 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
978 {
979 size_t len=0;
980
981 while (*psz && (!buf || len < n))
982 {
983 wxUint16 cc[2];
984 size_t pa=encode_utf16(*psz, cc);
985
986 if (pa == (size_t)-1)
987 return pa;
988
989 if (buf)
990 {
991 *buf++ = ((char*)cc)[1];
992 *buf++ = ((char*)cc)[0];
993 if (pa > 1)
994 {
995 *buf++ = ((char*)cc)[3];
996 *buf++ = ((char*)cc)[2];
997 }
998 }
999
1000 len += pa*sizeof(wxUint16);
1001 psz++;
1002 }
1003 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1004
1005 return len;
1006 }
1007
1008 #endif // WC_UTF16
1009
1010
1011 // ----------------------------------------------------------------------------
1012 // UTF-32
1013 // ----------------------------------------------------------------------------
1014
1015 #ifdef WORDS_BIGENDIAN
1016 #define wxMBConvUTF32straight wxMBConvUTF32BE
1017 #define wxMBConvUTF32swap wxMBConvUTF32LE
1018 #else
1019 #define wxMBConvUTF32swap wxMBConvUTF32BE
1020 #define wxMBConvUTF32straight wxMBConvUTF32LE
1021 #endif
1022
1023
1024 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1025 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1026
1027
1028 #ifdef WC_UTF16
1029
1030 // copy 32bit MB to 16bit String
1031 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1032 {
1033 size_t len=0;
1034
1035 while (*(wxUint32*)psz && (!buf || len < n))
1036 {
1037 wxUint16 cc[2];
1038
1039 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1040 if (pa == (size_t)-1)
1041 return pa;
1042
1043 if (buf)
1044 {
1045 *buf++ = cc[0];
1046 if (pa > 1)
1047 *buf++ = cc[1];
1048 }
1049 len += pa;
1050 psz += sizeof(wxUint32);
1051 }
1052 if (buf && len<n) *buf=0;
1053
1054 return len;
1055 }
1056
1057
1058 // copy 16bit String to 32bit MB
1059 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1060 {
1061 size_t len=0;
1062
1063 while (*psz && (!buf || len < n))
1064 {
1065 wxUint32 cc;
1066
1067 // cast is ok for WC_UTF16
1068 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1069 if (pa == (size_t)-1)
1070 return pa;
1071
1072 if (buf)
1073 {
1074 *(wxUint32*)buf = cc;
1075 buf += sizeof(wxUint32);
1076 }
1077 len += sizeof(wxUint32);
1078 psz += pa;
1079 }
1080
1081 if (buf && len<=n-sizeof(wxUint32))
1082 *(wxUint32*)buf=0;
1083
1084 return len;
1085 }
1086
1087
1088
1089 // swap 32bit MB to 16bit String
1090 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1091 {
1092 size_t len=0;
1093
1094 while (*(wxUint32*)psz && (!buf || len < n))
1095 {
1096 char tmp[4];
1097 tmp[0] = psz[3]; tmp[1] = psz[2];
1098 tmp[2] = psz[1]; tmp[3] = psz[0];
1099
1100
1101 wxUint16 cc[2];
1102
1103 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1104 if (pa == (size_t)-1)
1105 return pa;
1106
1107 if (buf)
1108 {
1109 *buf++ = cc[0];
1110 if (pa > 1)
1111 *buf++ = cc[1];
1112 }
1113 len += pa;
1114 psz += sizeof(wxUint32);
1115 }
1116
1117 if (buf && len<n)
1118 *buf=0;
1119
1120 return len;
1121 }
1122
1123
1124 // swap 16bit String to 32bit MB
1125 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1126 {
1127 size_t len=0;
1128
1129 while (*psz && (!buf || len < n))
1130 {
1131 char cc[4];
1132
1133 // cast is ok for WC_UTF16
1134 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1135 if (pa == (size_t)-1)
1136 return pa;
1137
1138 if (buf)
1139 {
1140 *buf++ = cc[3];
1141 *buf++ = cc[2];
1142 *buf++ = cc[1];
1143 *buf++ = cc[0];
1144 }
1145 len += sizeof(wxUint32);
1146 psz += pa;
1147 }
1148
1149 if (buf && len<=n-sizeof(wxUint32))
1150 *(wxUint32*)buf=0;
1151
1152 return len;
1153 }
1154
1155 #else // WC_UTF16
1156
1157
1158 // copy 32bit MB to 32bit String
1159 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1160 {
1161 size_t len=0;
1162
1163 while (*(wxUint32*)psz && (!buf || len < n))
1164 {
1165 if (buf)
1166 *buf++ = *(wxUint32*)psz;
1167 len++;
1168 psz += sizeof(wxUint32);
1169 }
1170
1171 if (buf && len<n)
1172 *buf=0;
1173
1174 return len;
1175 }
1176
1177
1178 // copy 32bit String to 32bit MB
1179 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1180 {
1181 size_t len=0;
1182
1183 while (*psz && (!buf || len < n))
1184 {
1185 if (buf)
1186 {
1187 *(wxUint32*)buf = *psz;
1188 buf += sizeof(wxUint32);
1189 }
1190
1191 len += sizeof(wxUint32);
1192 psz++;
1193 }
1194
1195 if (buf && len<=n-sizeof(wxUint32))
1196 *(wxUint32*)buf=0;
1197
1198 return len;
1199 }
1200
1201
1202 // swap 32bit MB to 32bit String
1203 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1204 {
1205 size_t len=0;
1206
1207 while (*(wxUint32*)psz && (!buf || len < n))
1208 {
1209 if (buf)
1210 {
1211 ((char *)buf)[0] = psz[3];
1212 ((char *)buf)[1] = psz[2];
1213 ((char *)buf)[2] = psz[1];
1214 ((char *)buf)[3] = psz[0];
1215 buf++;
1216 }
1217 len++;
1218 psz += sizeof(wxUint32);
1219 }
1220
1221 if (buf && len<n)
1222 *buf=0;
1223
1224 return len;
1225 }
1226
1227
1228 // swap 32bit String to 32bit MB
1229 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1230 {
1231 size_t len=0;
1232
1233 while (*psz && (!buf || len < n))
1234 {
1235 if (buf)
1236 {
1237 *buf++ = ((char *)psz)[3];
1238 *buf++ = ((char *)psz)[2];
1239 *buf++ = ((char *)psz)[1];
1240 *buf++ = ((char *)psz)[0];
1241 }
1242 len += sizeof(wxUint32);
1243 psz++;
1244 }
1245
1246 if (buf && len<=n-sizeof(wxUint32))
1247 *(wxUint32*)buf=0;
1248
1249 return len;
1250 }
1251
1252
1253 #endif // WC_UTF16
1254
1255
1256 // ============================================================================
1257 // The classes doing conversion using the iconv_xxx() functions
1258 // ============================================================================
1259
1260 #ifdef HAVE_ICONV
1261
1262 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1263 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1264 // (unless there's yet another bug in glibc) the only case when iconv()
1265 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1266 // left in the input buffer -- when _real_ error occurs,
1267 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1268 // iconv() failure.
1269 // [This bug does not appear in glibc 2.2.]
1270 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1271 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1272 (errno != E2BIG || bufLeft != 0))
1273 #else
1274 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1275 #endif
1276
1277 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1278
1279 #define ICONV_T_INVALID ((iconv_t)-1)
1280
1281 #if SIZEOF_WCHAR_T == 4
1282 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1283 #define WC_ENC wxFONTENCODING_UTF32
1284 #elif SIZEOF_WCHAR_T == 2
1285 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1286 #define WC_ENC wxFONTENCODING_UTF16
1287 #else // sizeof(wchar_t) != 2 nor 4
1288 // does this ever happen?
1289 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1290 #endif
1291
1292 // ----------------------------------------------------------------------------
1293 // wxMBConv_iconv: encapsulates an iconv character set
1294 // ----------------------------------------------------------------------------
1295
1296 class wxMBConv_iconv : public wxMBConv
1297 {
1298 public:
1299 wxMBConv_iconv(const wxChar *name);
1300 virtual ~wxMBConv_iconv();
1301
1302 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1303 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1304
1305 bool IsOk() const
1306 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1307
1308 protected:
1309 // the iconv handlers used to translate from multibyte to wide char and in
1310 // the other direction
1311 iconv_t m2w,
1312 w2m;
1313 #if wxUSE_THREADS
1314 // guards access to m2w and w2m objects
1315 wxMutex m_iconvMutex;
1316 #endif
1317
1318 private:
1319 // the name (for iconv_open()) of a wide char charset -- if none is
1320 // available on this machine, it will remain NULL
1321 static wxString ms_wcCharsetName;
1322
1323 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1324 // different endian-ness than the native one
1325 static bool ms_wcNeedsSwap;
1326 };
1327
1328 // make the constructor available for unit testing
1329 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1330 {
1331 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1332 if ( !result->IsOk() )
1333 {
1334 delete result;
1335 return 0;
1336 }
1337 return result;
1338 }
1339
1340 wxString wxMBConv_iconv::ms_wcCharsetName;
1341 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1342
1343 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1344 {
1345 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1346 // names for the charsets
1347 const wxCharBuffer cname(wxString(name).ToAscii());
1348
1349 // check for charset that represents wchar_t:
1350 if ( ms_wcCharsetName.empty() )
1351 {
1352 #if wxUSE_FONTMAP
1353 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1354 #else // !wxUSE_FONTMAP
1355 static const wxChar *names[] =
1356 {
1357 #if SIZEOF_WCHAR_T == 4
1358 _T("UCS-4"),
1359 #elif SIZEOF_WCHAR_T = 2
1360 _T("UCS-2"),
1361 #endif
1362 NULL
1363 };
1364 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1365
1366 for ( ; *names; ++names )
1367 {
1368 const wxString name(*names);
1369
1370 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1371 wxString nameXE(name);
1372 #ifdef WORDS_BIGENDIAN
1373 nameXE += _T("BE");
1374 #else // little endian
1375 nameXE += _T("LE");
1376 #endif
1377
1378 m2w = iconv_open(nameXE.ToAscii(), cname);
1379 if ( m2w == ICONV_T_INVALID )
1380 {
1381 // try charset w/o bytesex info (e.g. "UCS4")
1382 m2w = iconv_open(name.ToAscii(), cname);
1383
1384 // and check for bytesex ourselves:
1385 if ( m2w != ICONV_T_INVALID )
1386 {
1387 char buf[2], *bufPtr;
1388 wchar_t wbuf[2], *wbufPtr;
1389 size_t insz, outsz;
1390 size_t res;
1391
1392 buf[0] = 'A';
1393 buf[1] = 0;
1394 wbuf[0] = 0;
1395 insz = 2;
1396 outsz = SIZEOF_WCHAR_T * 2;
1397 wbufPtr = wbuf;
1398 bufPtr = buf;
1399
1400 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1401 (char**)&wbufPtr, &outsz);
1402
1403 if (ICONV_FAILED(res, insz))
1404 {
1405 wxLogLastError(wxT("iconv"));
1406 wxLogError(_("Conversion to charset '%s' doesn't work."),
1407 name.c_str());
1408 }
1409 else // ok, can convert to this encoding, remember it
1410 {
1411 ms_wcCharsetName = name;
1412 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1413 }
1414 }
1415 }
1416 else // use charset not requiring byte swapping
1417 {
1418 ms_wcCharsetName = nameXE;
1419 }
1420 }
1421
1422 wxLogTrace(TRACE_STRCONV,
1423 wxT("iconv wchar_t charset is \"%s\"%s"),
1424 ms_wcCharsetName.empty() ? _T("<none>")
1425 : ms_wcCharsetName.c_str(),
1426 ms_wcNeedsSwap ? _T(" (needs swap)")
1427 : _T(""));
1428 }
1429 else // we already have ms_wcCharsetName
1430 {
1431 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1432 }
1433
1434 if ( ms_wcCharsetName.empty() )
1435 {
1436 w2m = ICONV_T_INVALID;
1437 }
1438 else
1439 {
1440 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1441 if ( w2m == ICONV_T_INVALID )
1442 {
1443 wxLogTrace(TRACE_STRCONV,
1444 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1445 ms_wcCharsetName.c_str(), cname.data());
1446 }
1447 }
1448 }
1449
1450 wxMBConv_iconv::~wxMBConv_iconv()
1451 {
1452 if ( m2w != ICONV_T_INVALID )
1453 iconv_close(m2w);
1454 if ( w2m != ICONV_T_INVALID )
1455 iconv_close(w2m);
1456 }
1457
1458 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1459 {
1460 #if wxUSE_THREADS
1461 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1462 // Unfortunately there is a couple of global wxCSConv objects such as
1463 // wxConvLocal that are used all over wx code, so we have to make sure
1464 // the handle is used by at most one thread at the time. Otherwise
1465 // only a few wx classes would be safe to use from non-main threads
1466 // as MB<->WC conversion would fail "randomly".
1467 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1468 #endif
1469
1470 size_t inbuf = strlen(psz);
1471 size_t outbuf = n * SIZEOF_WCHAR_T;
1472 size_t res, cres;
1473 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1474 wchar_t *bufPtr = buf;
1475 const char *pszPtr = psz;
1476
1477 if (buf)
1478 {
1479 // have destination buffer, convert there
1480 cres = iconv(m2w,
1481 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1482 (char**)&bufPtr, &outbuf);
1483 res = n - (outbuf / SIZEOF_WCHAR_T);
1484
1485 if (ms_wcNeedsSwap)
1486 {
1487 // convert to native endianness
1488 for ( unsigned n = 0; n < res; n++ )
1489 buf[n] = WC_BSWAP(buf[n]);
1490 }
1491
1492 // NB: iconv was given only strlen(psz) characters on input, and so
1493 // it couldn't convert the trailing zero. Let's do it ourselves
1494 // if there's some room left for it in the output buffer.
1495 if (res < n)
1496 buf[res] = 0;
1497 }
1498 else
1499 {
1500 // no destination buffer... convert using temp buffer
1501 // to calculate destination buffer requirement
1502 wchar_t tbuf[8];
1503 res = 0;
1504 do {
1505 bufPtr = tbuf;
1506 outbuf = 8*SIZEOF_WCHAR_T;
1507
1508 cres = iconv(m2w,
1509 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1510 (char**)&bufPtr, &outbuf );
1511
1512 res += 8-(outbuf/SIZEOF_WCHAR_T);
1513 } while ((cres==(size_t)-1) && (errno==E2BIG));
1514 }
1515
1516 if (ICONV_FAILED(cres, inbuf))
1517 {
1518 //VS: it is ok if iconv fails, hence trace only
1519 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1520 return (size_t)-1;
1521 }
1522
1523 return res;
1524 }
1525
1526 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1527 {
1528 #if wxUSE_THREADS
1529 // NB: explained in MB2WC
1530 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1531 #endif
1532
1533 size_t inlen = wxWcslen(psz);
1534 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1535 size_t outbuf = n;
1536 size_t res, cres;
1537
1538 wchar_t *tmpbuf = 0;
1539
1540 if (ms_wcNeedsSwap)
1541 {
1542 // need to copy to temp buffer to switch endianness
1543 // (doing WC_BSWAP twice on the original buffer won't help, as it
1544 // could be in read-only memory, or be accessed in some other thread)
1545 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1546 for ( size_t n = 0; n < inlen; n++ )
1547 tmpbuf[n] = WC_BSWAP(psz[n]);
1548 tmpbuf[inlen] = L'\0';
1549 psz = tmpbuf;
1550 }
1551
1552 if (buf)
1553 {
1554 // have destination buffer, convert there
1555 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1556
1557 res = n-outbuf;
1558
1559 // NB: iconv was given only wcslen(psz) characters on input, and so
1560 // it couldn't convert the trailing zero. Let's do it ourselves
1561 // if there's some room left for it in the output buffer.
1562 if (res < n)
1563 buf[0] = 0;
1564 }
1565 else
1566 {
1567 // no destination buffer... convert using temp buffer
1568 // to calculate destination buffer requirement
1569 char tbuf[16];
1570 res = 0;
1571 do {
1572 buf = tbuf; outbuf = 16;
1573
1574 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1575
1576 res += 16 - outbuf;
1577 } while ((cres==(size_t)-1) && (errno==E2BIG));
1578 }
1579
1580 if (ms_wcNeedsSwap)
1581 {
1582 free(tmpbuf);
1583 }
1584
1585 if (ICONV_FAILED(cres, inbuf))
1586 {
1587 //VS: it is ok if iconv fails, hence trace only
1588 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1589 return (size_t)-1;
1590 }
1591
1592 return res;
1593 }
1594
1595 #endif // HAVE_ICONV
1596
1597
1598 // ============================================================================
1599 // Win32 conversion classes
1600 // ============================================================================
1601
1602 #ifdef wxHAVE_WIN32_MB2WC
1603
1604 // from utils.cpp
1605 #if wxUSE_FONTMAP
1606 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1607 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1608 #endif
1609
1610 class wxMBConv_win32 : public wxMBConv
1611 {
1612 public:
1613 wxMBConv_win32()
1614 {
1615 m_CodePage = CP_ACP;
1616 }
1617
1618 #if wxUSE_FONTMAP
1619 wxMBConv_win32(const wxChar* name)
1620 {
1621 m_CodePage = wxCharsetToCodepage(name);
1622 }
1623
1624 wxMBConv_win32(wxFontEncoding encoding)
1625 {
1626 m_CodePage = wxEncodingToCodepage(encoding);
1627 }
1628 #endif
1629
1630 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1631 {
1632 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1633 // the behaviour is not compatible with the Unix version (using iconv)
1634 // and break the library itself, e.g. wxTextInputStream::NextChar()
1635 // wouldn't work if reading an incomplete MB char didn't result in an
1636 // error
1637 //
1638 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1639 // an error (tested under Windows Server 2003) and apparently it is
1640 // done on purpose, i.e. the function accepts any input in this case
1641 // and although I'd prefer to return error on ill-formed output, our
1642 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1643 // explicitly ill-formed according to RFC 2152) neither so we don't
1644 // even have any fallback here...
1645 int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1646
1647 const size_t len = ::MultiByteToWideChar
1648 (
1649 m_CodePage, // code page
1650 flags, // flags: fall on error
1651 psz, // input string
1652 -1, // its length (NUL-terminated)
1653 buf, // output string
1654 buf ? n : 0 // size of output buffer
1655 );
1656
1657 // note that it returns count of written chars for buf != NULL and size
1658 // of the needed buffer for buf == NULL so in either case the length of
1659 // the string (which never includes the terminating NUL) is one less
1660 return len ? len - 1 : (size_t)-1;
1661 }
1662
1663 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1664 {
1665 /*
1666 we have a problem here: by default, WideCharToMultiByte() may
1667 replace characters unrepresentable in the target code page with bad
1668 quality approximations such as turning "1/2" symbol (U+00BD) into
1669 "1" for the code pages which don't have it and we, obviously, want
1670 to avoid this at any price
1671
1672 the trouble is that this function does it _silently_, i.e. it won't
1673 even tell us whether it did or not... Win98/2000 and higher provide
1674 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1675 we have to resort to a round trip, i.e. check that converting back
1676 results in the same string -- this is, of course, expensive but
1677 otherwise we simply can't be sure to not garble the data.
1678 */
1679
1680 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1681 // it doesn't work with CJK encodings (which we test for rather roughly
1682 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1683 // supporting it
1684 BOOL usedDef wxDUMMY_INITIALIZE(false);
1685 BOOL *pUsedDef;
1686 int flags;
1687 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1688 {
1689 // it's our lucky day
1690 flags = WC_NO_BEST_FIT_CHARS;
1691 pUsedDef = &usedDef;
1692 }
1693 else // old system or unsupported encoding
1694 {
1695 flags = 0;
1696 pUsedDef = NULL;
1697 }
1698
1699 const size_t len = ::WideCharToMultiByte
1700 (
1701 m_CodePage, // code page
1702 flags, // either none or no best fit
1703 pwz, // input string
1704 -1, // it is (wide) NUL-terminated
1705 buf, // output buffer
1706 buf ? n : 0, // and its size
1707 NULL, // default "replacement" char
1708 pUsedDef // [out] was it used?
1709 );
1710
1711 if ( !len )
1712 {
1713 // function totally failed
1714 return (size_t)-1;
1715 }
1716
1717 // if we were really converting, check if we succeeded
1718 if ( buf )
1719 {
1720 if ( flags )
1721 {
1722 // check if the conversion failed, i.e. if any replacements
1723 // were done
1724 if ( usedDef )
1725 return (size_t)-1;
1726 }
1727 else // we must resort to double tripping...
1728 {
1729 wxWCharBuffer wcBuf(n);
1730 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1731 wcscmp(wcBuf, pwz) != 0 )
1732 {
1733 // we didn't obtain the same thing we started from, hence
1734 // the conversion was lossy and we consider that it failed
1735 return (size_t)-1;
1736 }
1737 }
1738 }
1739
1740 // see the comment above for the reason of "len - 1"
1741 return len - 1;
1742 }
1743
1744 bool IsOk() const { return m_CodePage != -1; }
1745
1746 private:
1747 static bool CanUseNoBestFit()
1748 {
1749 static int s_isWin98Or2k = -1;
1750
1751 if ( s_isWin98Or2k == -1 )
1752 {
1753 int verMaj, verMin;
1754 switch ( wxGetOsVersion(&verMaj, &verMin) )
1755 {
1756 case wxWIN95:
1757 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1758 break;
1759
1760 case wxWINDOWS_NT:
1761 s_isWin98Or2k = verMaj >= 5;
1762 break;
1763
1764 default:
1765 // unknown, be conseravtive by default
1766 s_isWin98Or2k = 0;
1767 }
1768
1769 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1770 }
1771
1772 return s_isWin98Or2k == 1;
1773 }
1774
1775 long m_CodePage;
1776 };
1777
1778 #endif // wxHAVE_WIN32_MB2WC
1779
1780 // ============================================================================
1781 // Cocoa conversion classes
1782 // ============================================================================
1783
1784 #if defined(__WXCOCOA__)
1785
1786 // RN: There is no UTF-32 support in either Core Foundation or
1787 // Cocoa. Strangely enough, Core Foundation uses UTF-32
1788 // internally quite a bit - it's just not public (yet).
1789
1790 #include <CoreFoundation/CFString.h>
1791 #include <CoreFoundation/CFStringEncodingExt.h>
1792
1793 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1794 {
1795 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1796 if ( encoding == wxFONTENCODING_DEFAULT )
1797 {
1798 enc = CFStringGetSystemEncoding();
1799 }
1800 else switch( encoding)
1801 {
1802 case wxFONTENCODING_ISO8859_1 :
1803 enc = kCFStringEncodingISOLatin1 ;
1804 break ;
1805 case wxFONTENCODING_ISO8859_2 :
1806 enc = kCFStringEncodingISOLatin2;
1807 break ;
1808 case wxFONTENCODING_ISO8859_3 :
1809 enc = kCFStringEncodingISOLatin3 ;
1810 break ;
1811 case wxFONTENCODING_ISO8859_4 :
1812 enc = kCFStringEncodingISOLatin4;
1813 break ;
1814 case wxFONTENCODING_ISO8859_5 :
1815 enc = kCFStringEncodingISOLatinCyrillic;
1816 break ;
1817 case wxFONTENCODING_ISO8859_6 :
1818 enc = kCFStringEncodingISOLatinArabic;
1819 break ;
1820 case wxFONTENCODING_ISO8859_7 :
1821 enc = kCFStringEncodingISOLatinGreek;
1822 break ;
1823 case wxFONTENCODING_ISO8859_8 :
1824 enc = kCFStringEncodingISOLatinHebrew;
1825 break ;
1826 case wxFONTENCODING_ISO8859_9 :
1827 enc = kCFStringEncodingISOLatin5;
1828 break ;
1829 case wxFONTENCODING_ISO8859_10 :
1830 enc = kCFStringEncodingISOLatin6;
1831 break ;
1832 case wxFONTENCODING_ISO8859_11 :
1833 enc = kCFStringEncodingISOLatinThai;
1834 break ;
1835 case wxFONTENCODING_ISO8859_13 :
1836 enc = kCFStringEncodingISOLatin7;
1837 break ;
1838 case wxFONTENCODING_ISO8859_14 :
1839 enc = kCFStringEncodingISOLatin8;
1840 break ;
1841 case wxFONTENCODING_ISO8859_15 :
1842 enc = kCFStringEncodingISOLatin9;
1843 break ;
1844
1845 case wxFONTENCODING_KOI8 :
1846 enc = kCFStringEncodingKOI8_R;
1847 break ;
1848 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1849 enc = kCFStringEncodingDOSRussian;
1850 break ;
1851
1852 // case wxFONTENCODING_BULGARIAN :
1853 // enc = ;
1854 // break ;
1855
1856 case wxFONTENCODING_CP437 :
1857 enc =kCFStringEncodingDOSLatinUS ;
1858 break ;
1859 case wxFONTENCODING_CP850 :
1860 enc = kCFStringEncodingDOSLatin1;
1861 break ;
1862 case wxFONTENCODING_CP852 :
1863 enc = kCFStringEncodingDOSLatin2;
1864 break ;
1865 case wxFONTENCODING_CP855 :
1866 enc = kCFStringEncodingDOSCyrillic;
1867 break ;
1868 case wxFONTENCODING_CP866 :
1869 enc =kCFStringEncodingDOSRussian ;
1870 break ;
1871 case wxFONTENCODING_CP874 :
1872 enc = kCFStringEncodingDOSThai;
1873 break ;
1874 case wxFONTENCODING_CP932 :
1875 enc = kCFStringEncodingDOSJapanese;
1876 break ;
1877 case wxFONTENCODING_CP936 :
1878 enc =kCFStringEncodingDOSChineseSimplif ;
1879 break ;
1880 case wxFONTENCODING_CP949 :
1881 enc = kCFStringEncodingDOSKorean;
1882 break ;
1883 case wxFONTENCODING_CP950 :
1884 enc = kCFStringEncodingDOSChineseTrad;
1885 break ;
1886 case wxFONTENCODING_CP1250 :
1887 enc = kCFStringEncodingWindowsLatin2;
1888 break ;
1889 case wxFONTENCODING_CP1251 :
1890 enc =kCFStringEncodingWindowsCyrillic ;
1891 break ;
1892 case wxFONTENCODING_CP1252 :
1893 enc =kCFStringEncodingWindowsLatin1 ;
1894 break ;
1895 case wxFONTENCODING_CP1253 :
1896 enc = kCFStringEncodingWindowsGreek;
1897 break ;
1898 case wxFONTENCODING_CP1254 :
1899 enc = kCFStringEncodingWindowsLatin5;
1900 break ;
1901 case wxFONTENCODING_CP1255 :
1902 enc =kCFStringEncodingWindowsHebrew ;
1903 break ;
1904 case wxFONTENCODING_CP1256 :
1905 enc =kCFStringEncodingWindowsArabic ;
1906 break ;
1907 case wxFONTENCODING_CP1257 :
1908 enc = kCFStringEncodingWindowsBalticRim;
1909 break ;
1910 // This only really encodes to UTF7 (if that) evidently
1911 // case wxFONTENCODING_UTF7 :
1912 // enc = kCFStringEncodingNonLossyASCII ;
1913 // break ;
1914 case wxFONTENCODING_UTF8 :
1915 enc = kCFStringEncodingUTF8 ;
1916 break ;
1917 case wxFONTENCODING_EUC_JP :
1918 enc = kCFStringEncodingEUC_JP;
1919 break ;
1920 case wxFONTENCODING_UTF16 :
1921 enc = kCFStringEncodingUnicode ;
1922 break ;
1923 case wxFONTENCODING_MACROMAN :
1924 enc = kCFStringEncodingMacRoman ;
1925 break ;
1926 case wxFONTENCODING_MACJAPANESE :
1927 enc = kCFStringEncodingMacJapanese ;
1928 break ;
1929 case wxFONTENCODING_MACCHINESETRAD :
1930 enc = kCFStringEncodingMacChineseTrad ;
1931 break ;
1932 case wxFONTENCODING_MACKOREAN :
1933 enc = kCFStringEncodingMacKorean ;
1934 break ;
1935 case wxFONTENCODING_MACARABIC :
1936 enc = kCFStringEncodingMacArabic ;
1937 break ;
1938 case wxFONTENCODING_MACHEBREW :
1939 enc = kCFStringEncodingMacHebrew ;
1940 break ;
1941 case wxFONTENCODING_MACGREEK :
1942 enc = kCFStringEncodingMacGreek ;
1943 break ;
1944 case wxFONTENCODING_MACCYRILLIC :
1945 enc = kCFStringEncodingMacCyrillic ;
1946 break ;
1947 case wxFONTENCODING_MACDEVANAGARI :
1948 enc = kCFStringEncodingMacDevanagari ;
1949 break ;
1950 case wxFONTENCODING_MACGURMUKHI :
1951 enc = kCFStringEncodingMacGurmukhi ;
1952 break ;
1953 case wxFONTENCODING_MACGUJARATI :
1954 enc = kCFStringEncodingMacGujarati ;
1955 break ;
1956 case wxFONTENCODING_MACORIYA :
1957 enc = kCFStringEncodingMacOriya ;
1958 break ;
1959 case wxFONTENCODING_MACBENGALI :
1960 enc = kCFStringEncodingMacBengali ;
1961 break ;
1962 case wxFONTENCODING_MACTAMIL :
1963 enc = kCFStringEncodingMacTamil ;
1964 break ;
1965 case wxFONTENCODING_MACTELUGU :
1966 enc = kCFStringEncodingMacTelugu ;
1967 break ;
1968 case wxFONTENCODING_MACKANNADA :
1969 enc = kCFStringEncodingMacKannada ;
1970 break ;
1971 case wxFONTENCODING_MACMALAJALAM :
1972 enc = kCFStringEncodingMacMalayalam ;
1973 break ;
1974 case wxFONTENCODING_MACSINHALESE :
1975 enc = kCFStringEncodingMacSinhalese ;
1976 break ;
1977 case wxFONTENCODING_MACBURMESE :
1978 enc = kCFStringEncodingMacBurmese ;
1979 break ;
1980 case wxFONTENCODING_MACKHMER :
1981 enc = kCFStringEncodingMacKhmer ;
1982 break ;
1983 case wxFONTENCODING_MACTHAI :
1984 enc = kCFStringEncodingMacThai ;
1985 break ;
1986 case wxFONTENCODING_MACLAOTIAN :
1987 enc = kCFStringEncodingMacLaotian ;
1988 break ;
1989 case wxFONTENCODING_MACGEORGIAN :
1990 enc = kCFStringEncodingMacGeorgian ;
1991 break ;
1992 case wxFONTENCODING_MACARMENIAN :
1993 enc = kCFStringEncodingMacArmenian ;
1994 break ;
1995 case wxFONTENCODING_MACCHINESESIMP :
1996 enc = kCFStringEncodingMacChineseSimp ;
1997 break ;
1998 case wxFONTENCODING_MACTIBETAN :
1999 enc = kCFStringEncodingMacTibetan ;
2000 break ;
2001 case wxFONTENCODING_MACMONGOLIAN :
2002 enc = kCFStringEncodingMacMongolian ;
2003 break ;
2004 case wxFONTENCODING_MACETHIOPIC :
2005 enc = kCFStringEncodingMacEthiopic ;
2006 break ;
2007 case wxFONTENCODING_MACCENTRALEUR :
2008 enc = kCFStringEncodingMacCentralEurRoman ;
2009 break ;
2010 case wxFONTENCODING_MACVIATNAMESE :
2011 enc = kCFStringEncodingMacVietnamese ;
2012 break ;
2013 case wxFONTENCODING_MACARABICEXT :
2014 enc = kCFStringEncodingMacExtArabic ;
2015 break ;
2016 case wxFONTENCODING_MACSYMBOL :
2017 enc = kCFStringEncodingMacSymbol ;
2018 break ;
2019 case wxFONTENCODING_MACDINGBATS :
2020 enc = kCFStringEncodingMacDingbats ;
2021 break ;
2022 case wxFONTENCODING_MACTURKISH :
2023 enc = kCFStringEncodingMacTurkish ;
2024 break ;
2025 case wxFONTENCODING_MACCROATIAN :
2026 enc = kCFStringEncodingMacCroatian ;
2027 break ;
2028 case wxFONTENCODING_MACICELANDIC :
2029 enc = kCFStringEncodingMacIcelandic ;
2030 break ;
2031 case wxFONTENCODING_MACROMANIAN :
2032 enc = kCFStringEncodingMacRomanian ;
2033 break ;
2034 case wxFONTENCODING_MACCELTIC :
2035 enc = kCFStringEncodingMacCeltic ;
2036 break ;
2037 case wxFONTENCODING_MACGAELIC :
2038 enc = kCFStringEncodingMacGaelic ;
2039 break ;
2040 // case wxFONTENCODING_MACKEYBOARD :
2041 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2042 // break ;
2043 default :
2044 // because gcc is picky
2045 break ;
2046 } ;
2047 return enc ;
2048 }
2049
2050 class wxMBConv_cocoa : public wxMBConv
2051 {
2052 public:
2053 wxMBConv_cocoa()
2054 {
2055 Init(CFStringGetSystemEncoding()) ;
2056 }
2057
2058 #if wxUSE_FONTMAP
2059 wxMBConv_cocoa(const wxChar* name)
2060 {
2061 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2062 }
2063 #endif
2064
2065 wxMBConv_cocoa(wxFontEncoding encoding)
2066 {
2067 Init( wxCFStringEncFromFontEnc(encoding) );
2068 }
2069
2070 ~wxMBConv_cocoa()
2071 {
2072 }
2073
2074 void Init( CFStringEncoding encoding)
2075 {
2076 m_encoding = encoding ;
2077 }
2078
2079 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2080 {
2081 wxASSERT(szUnConv);
2082
2083 CFStringRef theString = CFStringCreateWithBytes (
2084 NULL, //the allocator
2085 (const UInt8*)szUnConv,
2086 strlen(szUnConv),
2087 m_encoding,
2088 false //no BOM/external representation
2089 );
2090
2091 wxASSERT(theString);
2092
2093 size_t nOutLength = CFStringGetLength(theString);
2094
2095 if (szOut == NULL)
2096 {
2097 CFRelease(theString);
2098 return nOutLength;
2099 }
2100
2101 CFRange theRange = { 0, nOutSize };
2102
2103 #if SIZEOF_WCHAR_T == 4
2104 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2105 #endif
2106
2107 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2108
2109 CFRelease(theString);
2110
2111 szUniCharBuffer[nOutLength] = '\0' ;
2112
2113 #if SIZEOF_WCHAR_T == 4
2114 wxMBConvUTF16 converter ;
2115 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2116 delete[] szUniCharBuffer;
2117 #endif
2118
2119 return nOutLength;
2120 }
2121
2122 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2123 {
2124 wxASSERT(szUnConv);
2125
2126 size_t nRealOutSize;
2127 size_t nBufSize = wxWcslen(szUnConv);
2128 UniChar* szUniBuffer = (UniChar*) szUnConv;
2129
2130 #if SIZEOF_WCHAR_T == 4
2131 wxMBConvUTF16 converter ;
2132 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2133 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2134 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2135 nBufSize /= sizeof(UniChar);
2136 #endif
2137
2138 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2139 NULL, //allocator
2140 szUniBuffer,
2141 nBufSize,
2142 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2143 );
2144
2145 wxASSERT(theString);
2146
2147 //Note that CER puts a BOM when converting to unicode
2148 //so we check and use getchars instead in that case
2149 if (m_encoding == kCFStringEncodingUnicode)
2150 {
2151 if (szOut != NULL)
2152 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2153
2154 nRealOutSize = CFStringGetLength(theString) + 1;
2155 }
2156 else
2157 {
2158 CFStringGetBytes(
2159 theString,
2160 CFRangeMake(0, CFStringGetLength(theString)),
2161 m_encoding,
2162 0, //what to put in characters that can't be converted -
2163 //0 tells CFString to return NULL if it meets such a character
2164 false, //not an external representation
2165 (UInt8*) szOut,
2166 nOutSize,
2167 (CFIndex*) &nRealOutSize
2168 );
2169 }
2170
2171 CFRelease(theString);
2172
2173 #if SIZEOF_WCHAR_T == 4
2174 delete[] szUniBuffer;
2175 #endif
2176
2177 return nRealOutSize - 1;
2178 }
2179
2180 bool IsOk() const
2181 {
2182 return m_encoding != kCFStringEncodingInvalidId &&
2183 CFStringIsEncodingAvailable(m_encoding);
2184 }
2185
2186 private:
2187 CFStringEncoding m_encoding ;
2188 };
2189
2190 #endif // defined(__WXCOCOA__)
2191
2192 // ============================================================================
2193 // Mac conversion classes
2194 // ============================================================================
2195
2196 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2197
2198 class wxMBConv_mac : public wxMBConv
2199 {
2200 public:
2201 wxMBConv_mac()
2202 {
2203 Init(CFStringGetSystemEncoding()) ;
2204 }
2205
2206 #if wxUSE_FONTMAP
2207 wxMBConv_mac(const wxChar* name)
2208 {
2209 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2210 }
2211 #endif
2212
2213 wxMBConv_mac(wxFontEncoding encoding)
2214 {
2215 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2216 }
2217
2218 ~wxMBConv_mac()
2219 {
2220 OSStatus status = noErr ;
2221 status = TECDisposeConverter(m_MB2WC_converter);
2222 status = TECDisposeConverter(m_WC2MB_converter);
2223 }
2224
2225
2226 void Init( TextEncodingBase encoding)
2227 {
2228 OSStatus status = noErr ;
2229 m_char_encoding = encoding ;
2230 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2231
2232 status = TECCreateConverter(&m_MB2WC_converter,
2233 m_char_encoding,
2234 m_unicode_encoding);
2235 status = TECCreateConverter(&m_WC2MB_converter,
2236 m_unicode_encoding,
2237 m_char_encoding);
2238 }
2239
2240 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2241 {
2242 OSStatus status = noErr ;
2243 ByteCount byteOutLen ;
2244 ByteCount byteInLen = strlen(psz) ;
2245 wchar_t *tbuf = NULL ;
2246 UniChar* ubuf = NULL ;
2247 size_t res = 0 ;
2248
2249 if (buf == NULL)
2250 {
2251 //apple specs say at least 32
2252 n = wxMax( 32 , byteInLen ) ;
2253 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2254 }
2255 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2256 #if SIZEOF_WCHAR_T == 4
2257 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2258 #else
2259 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2260 #endif
2261 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2262 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2263 #if SIZEOF_WCHAR_T == 4
2264 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2265 // is not properly terminated we get random characters at the end
2266 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2267 wxMBConvUTF16 converter ;
2268 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2269 free( ubuf ) ;
2270 #else
2271 res = byteOutLen / sizeof( UniChar ) ;
2272 #endif
2273 if ( buf == NULL )
2274 free(tbuf) ;
2275
2276 if ( buf && res < n)
2277 buf[res] = 0;
2278
2279 return res ;
2280 }
2281
2282 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2283 {
2284 OSStatus status = noErr ;
2285 ByteCount byteOutLen ;
2286 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2287
2288 char *tbuf = NULL ;
2289
2290 if (buf == NULL)
2291 {
2292 //apple specs say at least 32
2293 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2294 tbuf = (char*) malloc( n ) ;
2295 }
2296
2297 ByteCount byteBufferLen = n ;
2298 UniChar* ubuf = NULL ;
2299 #if SIZEOF_WCHAR_T == 4
2300 wxMBConvUTF16 converter ;
2301 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2302 byteInLen = unicharlen ;
2303 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2304 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2305 #else
2306 ubuf = (UniChar*) psz ;
2307 #endif
2308 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2309 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2310 #if SIZEOF_WCHAR_T == 4
2311 free( ubuf ) ;
2312 #endif
2313 if ( buf == NULL )
2314 free(tbuf) ;
2315
2316 size_t res = byteOutLen ;
2317 if ( buf && res < n)
2318 {
2319 buf[res] = 0;
2320
2321 //we need to double-trip to verify it didn't insert any ? in place
2322 //of bogus characters
2323 wxWCharBuffer wcBuf(n);
2324 size_t pszlen = wxWcslen(psz);
2325 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2326 wxWcslen(wcBuf) != pszlen ||
2327 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2328 {
2329 // we didn't obtain the same thing we started from, hence
2330 // the conversion was lossy and we consider that it failed
2331 return (size_t)-1;
2332 }
2333 }
2334
2335 return res ;
2336 }
2337
2338 bool IsOk() const
2339 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2340
2341 private:
2342 TECObjectRef m_MB2WC_converter ;
2343 TECObjectRef m_WC2MB_converter ;
2344
2345 TextEncodingBase m_char_encoding ;
2346 TextEncodingBase m_unicode_encoding ;
2347 };
2348
2349 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2350
2351 // ============================================================================
2352 // wxEncodingConverter based conversion classes
2353 // ============================================================================
2354
2355 #if wxUSE_FONTMAP
2356
2357 class wxMBConv_wxwin : public wxMBConv
2358 {
2359 private:
2360 void Init()
2361 {
2362 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2363 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2364 }
2365
2366 public:
2367 // temporarily just use wxEncodingConverter stuff,
2368 // so that it works while a better implementation is built
2369 wxMBConv_wxwin(const wxChar* name)
2370 {
2371 if (name)
2372 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2373 else
2374 m_enc = wxFONTENCODING_SYSTEM;
2375
2376 Init();
2377 }
2378
2379 wxMBConv_wxwin(wxFontEncoding enc)
2380 {
2381 m_enc = enc;
2382
2383 Init();
2384 }
2385
2386 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2387 {
2388 size_t inbuf = strlen(psz);
2389 if (buf)
2390 {
2391 if (!m2w.Convert(psz,buf))
2392 return (size_t)-1;
2393 }
2394 return inbuf;
2395 }
2396
2397 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2398 {
2399 const size_t inbuf = wxWcslen(psz);
2400 if (buf)
2401 {
2402 if (!w2m.Convert(psz,buf))
2403 return (size_t)-1;
2404 }
2405
2406 return inbuf;
2407 }
2408
2409 bool IsOk() const { return m_ok; }
2410
2411 public:
2412 wxFontEncoding m_enc;
2413 wxEncodingConverter m2w, w2m;
2414
2415 // were we initialized successfully?
2416 bool m_ok;
2417
2418 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2419 };
2420
2421 // make the constructors available for unit testing
2422 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2423 {
2424 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2425 if ( !result->IsOk() )
2426 {
2427 delete result;
2428 return 0;
2429 }
2430 return result;
2431 }
2432
2433 #endif // wxUSE_FONTMAP
2434
2435 // ============================================================================
2436 // wxCSConv implementation
2437 // ============================================================================
2438
2439 void wxCSConv::Init()
2440 {
2441 m_name = NULL;
2442 m_convReal = NULL;
2443 m_deferred = true;
2444 }
2445
2446 wxCSConv::wxCSConv(const wxChar *charset)
2447 {
2448 Init();
2449
2450 if ( charset )
2451 {
2452 SetName(charset);
2453 }
2454
2455 m_encoding = wxFONTENCODING_SYSTEM;
2456 }
2457
2458 wxCSConv::wxCSConv(wxFontEncoding encoding)
2459 {
2460 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2461 {
2462 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2463
2464 encoding = wxFONTENCODING_SYSTEM;
2465 }
2466
2467 Init();
2468
2469 m_encoding = encoding;
2470 }
2471
2472 wxCSConv::~wxCSConv()
2473 {
2474 Clear();
2475 }
2476
2477 wxCSConv::wxCSConv(const wxCSConv& conv)
2478 : wxMBConv()
2479 {
2480 Init();
2481
2482 SetName(conv.m_name);
2483 m_encoding = conv.m_encoding;
2484 }
2485
2486 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2487 {
2488 Clear();
2489
2490 SetName(conv.m_name);
2491 m_encoding = conv.m_encoding;
2492
2493 return *this;
2494 }
2495
2496 void wxCSConv::Clear()
2497 {
2498 free(m_name);
2499 delete m_convReal;
2500
2501 m_name = NULL;
2502 m_convReal = NULL;
2503 }
2504
2505 void wxCSConv::SetName(const wxChar *charset)
2506 {
2507 if (charset)
2508 {
2509 m_name = wxStrdup(charset);
2510 m_deferred = true;
2511 }
2512 }
2513
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map type associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString,
                     wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding already tried with
// iconv, the charset name that worked, or an empty string if none did
static wxEncodingNameCache gs_nameCache;
#endif // wxUSE_FONTMAP
2522
2523 wxMBConv *wxCSConv::DoCreate() const
2524 {
2525 #if wxUSE_FONTMAP
2526 wxLogTrace(TRACE_STRCONV,
2527 wxT("creating conversion for %s"),
2528 (m_name ? m_name
2529 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2530 #endif // wxUSE_FONTMAP
2531
2532 // check for the special case of ASCII or ISO8859-1 charset: as we have
2533 // special knowledge of it anyhow, we don't need to create a special
2534 // conversion object
2535 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2536 {
2537 // don't convert at all
2538 return NULL;
2539 }
2540
2541 // we trust OS to do conversion better than we can so try external
2542 // conversion methods first
2543 //
2544 // the full order is:
2545 // 1. OS conversion (iconv() under Unix or Win32 API)
2546 // 2. hard coded conversions for UTF
2547 // 3. wxEncodingConverter as fall back
2548
2549 // step (1)
2550 #ifdef HAVE_ICONV
2551 #if !wxUSE_FONTMAP
2552 if ( m_name )
2553 #endif // !wxUSE_FONTMAP
2554 {
2555 wxString name(m_name);
2556 wxFontEncoding encoding(m_encoding);
2557
2558 if ( !name.empty() )
2559 {
2560 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2561 if ( conv->IsOk() )
2562 return conv;
2563
2564 delete conv;
2565
2566 #if wxUSE_FONTMAP
2567 encoding =
2568 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2569 #endif // wxUSE_FONTMAP
2570 }
2571 #if wxUSE_FONTMAP
2572 {
2573 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2574 if ( it != gs_nameCache.end() )
2575 {
2576 if ( it->second.empty() )
2577 return NULL;
2578
2579 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2580 if ( conv->IsOk() )
2581 return conv;
2582
2583 delete conv;
2584 }
2585
2586 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2587
2588 for ( ; *names; ++names )
2589 {
2590 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2591 if ( conv->IsOk() )
2592 {
2593 gs_nameCache[encoding] = *names;
2594 return conv;
2595 }
2596
2597 delete conv;
2598 }
2599
2600 gs_nameCache[encoding] = _T(""); // cache the failure
2601 }
2602 #endif // wxUSE_FONTMAP
2603 }
2604 #endif // HAVE_ICONV
2605
2606 #ifdef wxHAVE_WIN32_MB2WC
2607 {
2608 #if wxUSE_FONTMAP
2609 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2610 : new wxMBConv_win32(m_encoding);
2611 if ( conv->IsOk() )
2612 return conv;
2613
2614 delete conv;
2615 #else
2616 return NULL;
2617 #endif
2618 }
2619 #endif // wxHAVE_WIN32_MB2WC
2620 #if defined(__WXMAC__)
2621 {
2622 // leave UTF16 and UTF32 to the built-ins of wx
2623 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2624 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2625 {
2626
2627 #if wxUSE_FONTMAP
2628 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2629 : new wxMBConv_mac(m_encoding);
2630 #else
2631 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2632 #endif
2633 if ( conv->IsOk() )
2634 return conv;
2635
2636 delete conv;
2637 }
2638 }
2639 #endif
2640 #if defined(__WXCOCOA__)
2641 {
2642 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2643 {
2644
2645 #if wxUSE_FONTMAP
2646 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2647 : new wxMBConv_cocoa(m_encoding);
2648 #else
2649 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2650 #endif
2651 if ( conv->IsOk() )
2652 return conv;
2653
2654 delete conv;
2655 }
2656 }
2657 #endif
2658 // step (2)
2659 wxFontEncoding enc = m_encoding;
2660 #if wxUSE_FONTMAP
2661 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2662 {
2663 // use "false" to suppress interactive dialogs -- we can be called from
2664 // anywhere and popping up a dialog from here is the last thing we want to
2665 // do
2666 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2667 }
2668 #endif // wxUSE_FONTMAP
2669
2670 switch ( enc )
2671 {
2672 case wxFONTENCODING_UTF7:
2673 return new wxMBConvUTF7;
2674
2675 case wxFONTENCODING_UTF8:
2676 return new wxMBConvUTF8;
2677
2678 case wxFONTENCODING_UTF16BE:
2679 return new wxMBConvUTF16BE;
2680
2681 case wxFONTENCODING_UTF16LE:
2682 return new wxMBConvUTF16LE;
2683
2684 case wxFONTENCODING_UTF32BE:
2685 return new wxMBConvUTF32BE;
2686
2687 case wxFONTENCODING_UTF32LE:
2688 return new wxMBConvUTF32LE;
2689
2690 default:
2691 // nothing to do but put here to suppress gcc warnings
2692 ;
2693 }
2694
2695 // step (3)
2696 #if wxUSE_FONTMAP
2697 {
2698 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2699 : new wxMBConv_wxwin(m_encoding);
2700 if ( conv->IsOk() )
2701 return conv;
2702
2703 delete conv;
2704 }
2705 #endif // wxUSE_FONTMAP
2706
2707 // NB: This is a hack to prevent deadlock. What could otherwise happen
2708 // in Unicode build: wxConvLocal creation ends up being here
2709 // because of some failure and logs the error. But wxLog will try to
2710 // attach timestamp, for which it will need wxConvLocal (to convert
2711 // time to char* and then wchar_t*), but that fails, tries to log
2712 // error, but wxLog has a (already locked) critical section that
2713 // guards static buffer.
2714 static bool alreadyLoggingError = false;
2715 if (!alreadyLoggingError)
2716 {
2717 alreadyLoggingError = true;
2718 wxLogError(_("Cannot convert from the charset '%s'!"),
2719 m_name ? m_name
2720 :
2721 #if wxUSE_FONTMAP
2722 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2723 #else // !wxUSE_FONTMAP
2724 wxString::Format(_("encoding %s"), m_encoding).c_str()
2725 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2726 );
2727 alreadyLoggingError = false;
2728 }
2729
2730 return NULL;
2731 }
2732
2733 void wxCSConv::CreateConvIfNeeded() const
2734 {
2735 if ( m_deferred )
2736 {
2737 wxCSConv *self = (wxCSConv *)this; // const_cast
2738
2739 #if wxUSE_INTL
2740 // if we don't have neither the name nor the encoding, use the default
2741 // encoding for this system
2742 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2743 {
2744 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2745 }
2746 #endif // wxUSE_INTL
2747
2748 self->m_convReal = DoCreate();
2749 self->m_deferred = false;
2750 }
2751 }
2752
2753 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2754 {
2755 CreateConvIfNeeded();
2756
2757 if (m_convReal)
2758 return m_convReal->MB2WC(buf, psz, n);
2759
2760 // latin-1 (direct)
2761 size_t len = strlen(psz);
2762
2763 if (buf)
2764 {
2765 for (size_t c = 0; c <= len; c++)
2766 buf[c] = (unsigned char)(psz[c]);
2767 }
2768
2769 return len;
2770 }
2771
2772 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2773 {
2774 CreateConvIfNeeded();
2775
2776 if (m_convReal)
2777 return m_convReal->WC2MB(buf, psz, n);
2778
2779 // latin-1 (direct)
2780 const size_t len = wxWcslen(psz);
2781 if (buf)
2782 {
2783 for (size_t c = 0; c <= len; c++)
2784 {
2785 if (psz[c] > 0xFF)
2786 return (size_t)-1;
2787 buf[c] = (char)psz[c];
2788 }
2789 }
2790 else
2791 {
2792 for (size_t c = 0; c <= len; c++)
2793 {
2794 if (psz[c] > 0xFF)
2795 return (size_t)-1;
2796 }
2797 }
2798
2799 return len;
2800 }
2801
2802 // ----------------------------------------------------------------------------
2803 // globals
2804 // ----------------------------------------------------------------------------
2805
2806 #ifdef __WINDOWS__
2807 static wxMBConv_win32 wxConvLibcObj;
2808 #elif defined(__WXMAC__) && !defined(__MACH__)
2809 static wxMBConv_mac wxConvLibcObj ;
2810 #else
2811 static wxMBConvLibc wxConvLibcObj;
2812 #endif
2813
2814 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2815 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2816 static wxMBConvUTF7 wxConvUTF7Obj;
2817 static wxMBConvUTF8 wxConvUTF8Obj;
2818
2819 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2820 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2821 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2822 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2823 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2824 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2825 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2826 #ifdef __WXOSX__
2827 wxConvUTF8Obj;
2828 #else
2829 wxConvLibcObj;
2830 #endif
2831
2832
2833 #else // !wxUSE_WCHAR_T
2834
2835 // stand-ins in absence of wchar_t
2836 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2837 wxConvISO8859_1,
2838 wxConvLocal,
2839 wxConvUTF8;
2840
2841 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2842
2843