]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Add wxHL_* styles
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifndef WX_PRECOMP
19 #ifdef __WXMSW__
20 #include "wx/msw/missing.h"
21 #endif
22 #include "wx/intl.h"
23 #include "wx/log.h"
24 #include "wx/utils.h"
25 #include "wx/hashmap.h"
26 #endif
27
28 #include "wx/strconv.h"
29
30 #if wxUSE_WCHAR_T
31
32 #ifdef __WINDOWS__
33 #include "wx/msw/private.h"
34 #endif
35
36 #ifndef __WXWINCE__
37 #include <errno.h>
38 #endif
39
40 #include <ctype.h>
41 #include <string.h>
42 #include <stdlib.h>
43
44 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
45 #define wxHAVE_WIN32_MB2WC
46 #endif
47
48 #ifdef __SALFORDC__
49 #include <clib.h>
50 #endif
51
52 #ifdef HAVE_ICONV
53 #include <iconv.h>
54 #include "wx/thread.h"
55 #endif
56
57 #include "wx/encconv.h"
58 #include "wx/fontmap.h"
59
60 #ifdef __WXMAC__
61 #ifndef __DARWIN__
62 #include <ATSUnicode.h>
63 #include <TextCommon.h>
64 #include <TextEncodingConverter.h>
65 #endif
66
67 // includes Mac headers
68 #include "wx/mac/private.h"
69 #endif
70
71
72 #define TRACE_STRCONV _T("strconv")
73
74 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
75 // be 4 bytes
76 #if SIZEOF_WCHAR_T == 2
77 #define WC_UTF16
78 #endif
79
80
81 // ============================================================================
82 // implementation
83 // ============================================================================
84
// helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL and false if all of them are NUL
// (which is trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
93
94 // ----------------------------------------------------------------------------
95 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
96 // ----------------------------------------------------------------------------
97
98 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
99 {
100 if (input <= 0xffff)
101 {
102 if (output)
103 *output = (wxUint16) input;
104
105 return 1;
106 }
107 else if (input >= 0x110000)
108 {
109 return wxCONV_FAILED;
110 }
111 else
112 {
113 if (output)
114 {
115 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
116 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
117 }
118
119 return 2;
120 }
121 }
122
123 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
124 {
125 if ((*input < 0xd800) || (*input > 0xdfff))
126 {
127 output = *input;
128 return 1;
129 }
130 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
131 {
132 output = *input;
133 return wxCONV_FAILED;
134 }
135 else
136 {
137 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
138 return 2;
139 }
140 }
141
142 #ifdef WC_UTF16
143 typedef wchar_t wxDecodeSurrogate_t;
144 #else // !WC_UTF16
145 typedef wxUint16 wxDecodeSurrogate_t;
146 #endif // WC_UTF16/!WC_UTF16
147
148 // returns the next UTF-32 character from the wchar_t buffer and advances the
149 // pointer to the character after this one
150 //
151 // if an invalid character is found, *pSrc is set to NULL, the caller must
152 // check for this
153 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
154 {
155 wxUint32 out;
156 const size_t
157 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
158 if ( n == wxCONV_FAILED )
159 *pSrc = NULL;
160 else
161 *pSrc += n;
162
163 return out;
164 }
165
166 // ----------------------------------------------------------------------------
167 // wxMBConv
168 // ----------------------------------------------------------------------------
169
170 size_t
171 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
172 const char *src, size_t srcLen) const
173 {
174 // although new conversion classes are supposed to implement this function
175 // directly, the existins ones only implement the old MB2WC() and so, to
176 // avoid to have to rewrite all conversion classes at once, we provide a
177 // default (but not efficient) implementation of this one in terms of the
178 // old function by copying the input to ensure that it's NUL-terminated and
179 // then using MB2WC() to convert it
180
181 // the number of chars [which would be] written to dst [if it were not NULL]
182 size_t dstWritten = 0;
183
184 // the number of NULs terminating this string
185 size_t nulLen = 0; // not really needed, but just to avoid warnings
186
187 // if we were not given the input size we just have to assume that the
188 // string is properly terminated as we have no way of knowing how long it
189 // is anyhow, but if we do have the size check whether there are enough
190 // NULs at the end
191 wxCharBuffer bufTmp;
192 const char *srcEnd;
193 if ( srcLen != wxNO_LEN )
194 {
195 // we need to know how to find the end of this string
196 nulLen = GetMBNulLen();
197 if ( nulLen == wxCONV_FAILED )
198 return wxCONV_FAILED;
199
200 // if there are enough NULs we can avoid the copy
201 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
202 {
203 // make a copy in order to properly NUL-terminate the string
204 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
205 char * const p = bufTmp.data();
206 memcpy(p, src, srcLen);
207 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
208 *s = '\0';
209
210 src = bufTmp;
211 }
212
213 srcEnd = src + srcLen;
214 }
215 else // quit after the first loop iteration
216 {
217 srcEnd = NULL;
218 }
219
220 for ( ;; )
221 {
222 // try to convert the current chunk
223 size_t lenChunk = MB2WC(NULL, src, 0);
224 if ( lenChunk == wxCONV_FAILED )
225 return wxCONV_FAILED;
226
227 lenChunk++; // for the L'\0' at the end of this chunk
228
229 dstWritten += lenChunk;
230
231 if ( lenChunk == 1 )
232 {
233 // nothing left in the input string, conversion succeeded
234 break;
235 }
236
237 if ( dst )
238 {
239 if ( dstWritten > dstLen )
240 return wxCONV_FAILED;
241
242 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
243 return wxCONV_FAILED;
244
245 dst += lenChunk;
246 }
247
248 if ( !srcEnd )
249 {
250 // we convert just one chunk in this case as this is the entire
251 // string anyhow
252 break;
253 }
254
255 // advance the input pointer past the end of this chunk
256 while ( NotAllNULs(src, nulLen) )
257 {
258 // notice that we must skip over multiple bytes here as we suppose
259 // that if NUL takes 2 or 4 bytes, then all the other characters do
260 // too and so if advanced by a single byte we might erroneously
261 // detect sequences of NUL bytes in the middle of the input
262 src += nulLen;
263 }
264
265 src += nulLen; // skipping over its terminator as well
266
267 // note that ">=" (and not just "==") is needed here as the terminator
268 // we skipped just above could be inside or just after the buffer
269 // delimited by inEnd
270 if ( src >= srcEnd )
271 break;
272 }
273
274 return dstWritten;
275 }
276
277 size_t
278 wxMBConv::FromWChar(char *dst, size_t dstLen,
279 const wchar_t *src, size_t srcLen) const
280 {
281 // the number of chars [which would be] written to dst [if it were not NULL]
282 size_t dstWritten = 0;
283
284 // make a copy of the input string unless it is already properly
285 // NUL-terminated
286 //
287 // if we don't know its length we have no choice but to assume that it is,
288 // indeed, properly terminated
289 wxWCharBuffer bufTmp;
290 if ( srcLen == wxNO_LEN )
291 {
292 srcLen = wxWcslen(src) + 1;
293 }
294 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
295 {
296 // make a copy in order to properly NUL-terminate the string
297 bufTmp = wxWCharBuffer(srcLen);
298 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
299 src = bufTmp;
300 }
301
302 const size_t lenNul = GetMBNulLen();
303 for ( const wchar_t * const srcEnd = src + srcLen;
304 src < srcEnd;
305 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
306 {
307 // try to convert the current chunk
308 size_t lenChunk = WC2MB(NULL, src, 0);
309
310 if ( lenChunk == wxCONV_FAILED )
311 return wxCONV_FAILED;
312
313 lenChunk += lenNul;
314 dstWritten += lenChunk;
315
316 if ( dst )
317 {
318 if ( dstWritten > dstLen )
319 return wxCONV_FAILED;
320
321 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
322 return wxCONV_FAILED;
323
324 dst += lenChunk;
325 }
326 }
327
328 return dstWritten;
329 }
330
331 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
332 {
333 size_t rc = ToWChar(outBuff, outLen, inBuff);
334 if ( rc != wxCONV_FAILED )
335 {
336 // ToWChar() returns the buffer length, i.e. including the trailing
337 // NUL, while this method doesn't take it into account
338 rc--;
339 }
340
341 return rc;
342 }
343
344 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
345 {
346 size_t rc = FromWChar(outBuff, outLen, inBuff);
347 if ( rc != wxCONV_FAILED )
348 {
349 rc -= GetMBNulLen();
350 }
351
352 return rc;
353 }
354
355 wxMBConv::~wxMBConv()
356 {
357 // nothing to do here (necessary for Darwin linking probably)
358 }
359
360 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
361 {
362 if ( psz )
363 {
364 // calculate the length of the buffer needed first
365 const size_t nLen = MB2WC(NULL, psz, 0);
366 if ( nLen != wxCONV_FAILED )
367 {
368 // now do the actual conversion
369 wxWCharBuffer buf(nLen /* +1 added implicitly */);
370
371 // +1 for the trailing NULL
372 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
373 return buf;
374 }
375 }
376
377 return wxWCharBuffer();
378 }
379
380 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
381 {
382 if ( pwz )
383 {
384 const size_t nLen = WC2MB(NULL, pwz, 0);
385 if ( nLen != wxCONV_FAILED )
386 {
387 // extra space for trailing NUL(s)
388 static const size_t extraLen = GetMaxMBNulLen();
389
390 wxCharBuffer buf(nLen + extraLen - 1);
391 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
392 return buf;
393 }
394 }
395
396 return wxCharBuffer();
397 }
398
399 const wxWCharBuffer
400 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
401 {
402 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
403 if ( dstLen != wxCONV_FAILED )
404 {
405 wxWCharBuffer wbuf(dstLen - 1);
406 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
407 {
408 if ( outLen )
409 {
410 *outLen = dstLen;
411 if ( wbuf[dstLen - 1] == L'\0' )
412 (*outLen)--;
413 }
414
415 return wbuf;
416 }
417 }
418
419 if ( outLen )
420 *outLen = 0;
421
422 return wxWCharBuffer();
423 }
424
425 const wxCharBuffer
426 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
427 {
428 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
429 if ( dstLen != wxCONV_FAILED )
430 {
431 // special case of empty input: can't allocate 0 size buffer below as
432 // wxCharBuffer insists on NUL-terminating it
433 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
434 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
435 {
436 if ( outLen )
437 {
438 *outLen = dstLen;
439
440 const size_t nulLen = GetMBNulLen();
441 if ( dstLen >= nulLen &&
442 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
443 {
444 // in this case the output is NUL-terminated and we're not
445 // supposed to count NUL
446 *outLen -= nulLen;
447 }
448 }
449
450 return buf;
451 }
452 }
453
454 if ( outLen )
455 *outLen = 0;
456
457 return wxCharBuffer();
458 }
459
460 // ----------------------------------------------------------------------------
461 // wxMBConvLibc
462 // ----------------------------------------------------------------------------
463
464 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
465 {
466 return wxMB2WC(buf, psz, n);
467 }
468
469 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
470 {
471 return wxWC2MB(buf, psz, n);
472 }
473
474 // ----------------------------------------------------------------------------
475 // wxConvBrokenFileNames
476 // ----------------------------------------------------------------------------
477
478 #ifdef __UNIX__
479
480 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
481 {
482 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
483 || wxStricmp(charset, _T("UTF8")) == 0 )
484 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
485 else
486 m_conv = new wxCSConv(charset);
487 }
488
489 #endif // __UNIX__
490
491 // ----------------------------------------------------------------------------
492 // UTF-7
493 // ----------------------------------------------------------------------------
494
495 // Implementation (C) 2004 Fredrik Roubert
496
//
// BASE64 decoding table: maps a byte to the 6 bit value it represents or to
// 0xff if the byte is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
535
536 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
537 {
538 size_t len = 0;
539
540 while ( *psz && (!buf || (len < n)) )
541 {
542 unsigned char cc = *psz++;
543 if (cc != '+')
544 {
545 // plain ASCII char
546 if (buf)
547 *buf++ = cc;
548 len++;
549 }
550 else if (*psz == '-')
551 {
552 // encoded plus sign
553 if (buf)
554 *buf++ = cc;
555 len++;
556 psz++;
557 }
558 else // start of BASE64 encoded string
559 {
560 bool lsb, ok;
561 unsigned int d, l;
562 for ( ok = lsb = false, d = 0, l = 0;
563 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
564 psz++ )
565 {
566 d <<= 6;
567 d += cc;
568 for (l += 6; l >= 8; lsb = !lsb)
569 {
570 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
571 if (lsb)
572 {
573 if (buf)
574 *buf++ |= c;
575 len ++;
576 }
577 else
578 {
579 if (buf)
580 *buf = (wchar_t)(c << 8);
581 }
582
583 ok = true;
584 }
585 }
586
587 if ( !ok )
588 {
589 // in valid UTF7 we should have valid characters after '+'
590 return wxCONV_FAILED;
591 }
592
593 if (*psz == '-')
594 psz++;
595 }
596 }
597
598 if ( buf && (len < n) )
599 *buf = '\0';
600
601 return len;
602 }
603
//
// BASE64 encoding table: maps a 6 bit value to the character representing it
//
static const unsigned char utf7enb64[] =
{
    // 0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26..51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52..61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62 and 63
    '+', '/'
};
618
//
// UTF-7 encoding table giving the class of each of the ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 0x70..0x7f
};
638
639 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
640 {
641 size_t len = 0;
642
643 while (*psz && ((!buf) || (len < n)))
644 {
645 wchar_t cc = *psz++;
646 if (cc < 0x80 && utf7encode[cc] < 1)
647 {
648 // plain ASCII char
649 if (buf)
650 *buf++ = (char)cc;
651
652 len++;
653 }
654 #ifndef WC_UTF16
655 else if (((wxUint32)cc) > 0xffff)
656 {
657 // no surrogate pair generation (yet?)
658 return wxCONV_FAILED;
659 }
660 #endif
661 else
662 {
663 if (buf)
664 *buf++ = '+';
665
666 len++;
667 if (cc != '+')
668 {
669 // BASE64 encode string
670 unsigned int lsb, d, l;
671 for (d = 0, l = 0; /*nothing*/; psz++)
672 {
673 for (lsb = 0; lsb < 2; lsb ++)
674 {
675 d <<= 8;
676 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
677
678 for (l += 8; l >= 6; )
679 {
680 l -= 6;
681 if (buf)
682 *buf++ = utf7enb64[(d >> l) % 64];
683 len++;
684 }
685 }
686
687 cc = *psz;
688 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
689 break;
690 }
691
692 if (l != 0)
693 {
694 if (buf)
695 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
696
697 len++;
698 }
699 }
700
701 if (buf)
702 *buf++ = '-';
703 len++;
704 }
705 }
706
707 if (buf && (len < n))
708 *buf = 0;
709
710 return len;
711 }
712
713 // ----------------------------------------------------------------------------
714 // UTF-8
715 // ----------------------------------------------------------------------------
716
717 static wxUint32 utf8_max[]=
718 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
719
720 // boundaries of the private use area we use to (temporarily) remap invalid
721 // characters invalid in a UTF-8 encoded string
722 const wxUint32 wxUnicodePUA = 0x100000;
723 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
724
725 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
726 {
727 size_t len = 0;
728
729 while (*psz && ((!buf) || (len < n)))
730 {
731 const char *opsz = psz;
732 bool invalid = false;
733 unsigned char cc = *psz++, fc = cc;
734 unsigned cnt;
735 for (cnt = 0; fc & 0x80; cnt++)
736 fc <<= 1;
737
738 if (!cnt)
739 {
740 // plain ASCII char
741 if (buf)
742 *buf++ = cc;
743 len++;
744
745 // escape the escape character for octal escapes
746 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
747 && cc == '\\' && (!buf || len < n))
748 {
749 if (buf)
750 *buf++ = cc;
751 len++;
752 }
753 }
754 else
755 {
756 cnt--;
757 if (!cnt)
758 {
759 // invalid UTF-8 sequence
760 invalid = true;
761 }
762 else
763 {
764 unsigned ocnt = cnt - 1;
765 wxUint32 res = cc & (0x3f >> cnt);
766 while (cnt--)
767 {
768 cc = *psz;
769 if ((cc & 0xC0) != 0x80)
770 {
771 // invalid UTF-8 sequence
772 invalid = true;
773 break;
774 }
775
776 psz++;
777 res = (res << 6) | (cc & 0x3f);
778 }
779
780 if (invalid || res <= utf8_max[ocnt])
781 {
782 // illegal UTF-8 encoding
783 invalid = true;
784 }
785 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
786 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
787 {
788 // if one of our PUA characters turns up externally
789 // it must also be treated as an illegal sequence
790 // (a bit like you have to escape an escape character)
791 invalid = true;
792 }
793 else
794 {
795 #ifdef WC_UTF16
796 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
797 size_t pa = encode_utf16(res, (wxUint16 *)buf);
798 if (pa == wxCONV_FAILED)
799 {
800 invalid = true;
801 }
802 else
803 {
804 if (buf)
805 buf += pa;
806 len += pa;
807 }
808 #else // !WC_UTF16
809 if (buf)
810 *buf++ = (wchar_t)res;
811 len++;
812 #endif // WC_UTF16/!WC_UTF16
813 }
814 }
815
816 if (invalid)
817 {
818 if (m_options & MAP_INVALID_UTF8_TO_PUA)
819 {
820 while (opsz < psz && (!buf || len < n))
821 {
822 #ifdef WC_UTF16
823 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
824 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
825 wxASSERT(pa != wxCONV_FAILED);
826 if (buf)
827 buf += pa;
828 opsz++;
829 len += pa;
830 #else
831 if (buf)
832 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
833 opsz++;
834 len++;
835 #endif
836 }
837 }
838 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
839 {
840 while (opsz < psz && (!buf || len < n))
841 {
842 if ( buf && len + 3 < n )
843 {
844 unsigned char on = *opsz;
845 *buf++ = L'\\';
846 *buf++ = (wchar_t)( L'0' + on / 0100 );
847 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
848 *buf++ = (wchar_t)( L'0' + on % 010 );
849 }
850
851 opsz++;
852 len += 4;
853 }
854 }
855 else // MAP_INVALID_UTF8_NOT
856 {
857 return wxCONV_FAILED;
858 }
859 }
860 }
861 }
862
863 if (buf && (len < n))
864 *buf = 0;
865
866 return len;
867 }
868
// returns true if wch is one of the octal digits L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
873
874 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
875 {
876 size_t len = 0;
877
878 while (*psz && ((!buf) || (len < n)))
879 {
880 wxUint32 cc;
881
882 #ifdef WC_UTF16
883 // cast is ok for WC_UTF16
884 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
885 psz += (pa == wxCONV_FAILED) ? 1 : pa;
886 #else
887 cc = (*psz++) & 0x7fffffff;
888 #endif
889
890 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
891 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
892 {
893 if (buf)
894 *buf++ = (char)(cc - wxUnicodePUA);
895 len++;
896 }
897 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
898 && cc == L'\\' && psz[0] == L'\\' )
899 {
900 if (buf)
901 *buf++ = (char)cc;
902 psz++;
903 len++;
904 }
905 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
906 cc == L'\\' &&
907 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
908 {
909 if (buf)
910 {
911 *buf++ = (char) ((psz[0] - L'0') * 0100 +
912 (psz[1] - L'0') * 010 +
913 (psz[2] - L'0'));
914 }
915
916 psz += 3;
917 len++;
918 }
919 else
920 {
921 unsigned cnt;
922 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
923 {
924 }
925
926 if (!cnt)
927 {
928 // plain ASCII char
929 if (buf)
930 *buf++ = (char) cc;
931 len++;
932 }
933 else
934 {
935 len += cnt + 1;
936 if (buf)
937 {
938 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
939 while (cnt--)
940 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
941 }
942 }
943 }
944 }
945
946 if (buf && (len < n))
947 *buf = 0;
948
949 return len;
950 }
951
952 // ============================================================================
953 // UTF-16
954 // ============================================================================
955
956 #ifdef WORDS_BIGENDIAN
957 #define wxMBConvUTF16straight wxMBConvUTF16BE
958 #define wxMBConvUTF16swap wxMBConvUTF16LE
959 #else
960 #define wxMBConvUTF16swap wxMBConvUTF16BE
961 #define wxMBConvUTF16straight wxMBConvUTF16LE
962 #endif
963
964 /* static */
965 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
966 {
967 if ( srcLen == wxNO_LEN )
968 {
969 // count the number of bytes in input, including the trailing NULs
970 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
971 for ( srcLen = 1; *inBuff++; srcLen++ )
972 ;
973
974 srcLen *= BYTES_PER_CHAR;
975 }
976 else // we already have the length
977 {
978 // we can only convert an entire number of UTF-16 characters
979 if ( srcLen % BYTES_PER_CHAR )
980 return wxCONV_FAILED;
981 }
982
983 return srcLen;
984 }
985
986 // case when in-memory representation is UTF-16 too
987 #ifdef WC_UTF16
988
989 // ----------------------------------------------------------------------------
990 // conversions without endianness change
991 // ----------------------------------------------------------------------------
992
993 size_t
994 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
995 const char *src, size_t srcLen) const
996 {
997 // set up the scene for using memcpy() (which is presumably more efficient
998 // than copying the bytes one by one)
999 srcLen = GetLength(src, srcLen);
1000 if ( srcLen == wxNO_LEN )
1001 return wxCONV_FAILED;
1002
1003 const size_t inLen = srcLen / BYTES_PER_CHAR;
1004 if ( dst )
1005 {
1006 if ( dstLen < inLen )
1007 return wxCONV_FAILED;
1008
1009 memcpy(dst, src, srcLen);
1010 }
1011
1012 return inLen;
1013 }
1014
1015 size_t
1016 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1017 const wchar_t *src, size_t srcLen) const
1018 {
1019 if ( srcLen == wxNO_LEN )
1020 srcLen = wxWcslen(src) + 1;
1021
1022 srcLen *= BYTES_PER_CHAR;
1023
1024 if ( dst )
1025 {
1026 if ( dstLen < srcLen )
1027 return wxCONV_FAILED;
1028
1029 memcpy(dst, src, srcLen);
1030 }
1031
1032 return srcLen;
1033 }
1034
1035 // ----------------------------------------------------------------------------
1036 // endian-reversing conversions
1037 // ----------------------------------------------------------------------------
1038
1039 size_t
1040 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1041 const char *src, size_t srcLen) const
1042 {
1043 srcLen = GetLength(src, srcLen);
1044 if ( srcLen == wxNO_LEN )
1045 return wxCONV_FAILED;
1046
1047 srcLen /= BYTES_PER_CHAR;
1048
1049 if ( dst )
1050 {
1051 if ( dstLen < srcLen )
1052 return wxCONV_FAILED;
1053
1054 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1055 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1056 {
1057 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1058 }
1059 }
1060
1061 return srcLen;
1062 }
1063
1064 size_t
1065 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1066 const wchar_t *src, size_t srcLen) const
1067 {
1068 if ( srcLen == wxNO_LEN )
1069 srcLen = wxWcslen(src) + 1;
1070
1071 srcLen *= BYTES_PER_CHAR;
1072
1073 if ( dst )
1074 {
1075 if ( dstLen < srcLen )
1076 return wxCONV_FAILED;
1077
1078 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1079 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1080 {
1081 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1082 }
1083 }
1084
1085 return srcLen;
1086 }
1087
1088 #else // !WC_UTF16: wchar_t is UTF-32
1089
1090 // ----------------------------------------------------------------------------
1091 // conversions without endianness change
1092 // ----------------------------------------------------------------------------
1093
1094 size_t
1095 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1096 const char *src, size_t srcLen) const
1097 {
1098 srcLen = GetLength(src, srcLen);
1099 if ( srcLen == wxNO_LEN )
1100 return wxCONV_FAILED;
1101
1102 const size_t inLen = srcLen / BYTES_PER_CHAR;
1103 if ( !dst )
1104 {
1105 // optimization: return maximal space which could be needed for this
1106 // string even if the real size could be smaller if the buffer contains
1107 // any surrogates
1108 return inLen;
1109 }
1110
1111 size_t outLen = 0;
1112 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1113 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1114 {
1115 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1116 if ( !inBuff )
1117 return wxCONV_FAILED;
1118
1119 if ( ++outLen > dstLen )
1120 return wxCONV_FAILED;
1121
1122 *dst++ = ch;
1123 }
1124
1125
1126 return outLen;
1127 }
1128
1129 size_t
1130 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1131 const wchar_t *src, size_t srcLen) const
1132 {
1133 if ( srcLen == wxNO_LEN )
1134 srcLen = wxWcslen(src) + 1;
1135
1136 size_t outLen = 0;
1137 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1138 for ( size_t n = 0; n < srcLen; n++ )
1139 {
1140 wxUint16 cc[2];
1141 const size_t numChars = encode_utf16(*src++, cc);
1142 if ( numChars == wxCONV_FAILED )
1143 return wxCONV_FAILED;
1144
1145 outLen += numChars * BYTES_PER_CHAR;
1146 if ( outBuff )
1147 {
1148 if ( outLen > dstLen )
1149 return wxCONV_FAILED;
1150
1151 *outBuff++ = cc[0];
1152 if ( numChars == 2 )
1153 {
1154 // second character of a surrogate
1155 *outBuff++ = cc[1];
1156 }
1157 }
1158 }
1159
1160 return outLen;
1161 }
1162
1163 // ----------------------------------------------------------------------------
1164 // endian-reversing conversions
1165 // ----------------------------------------------------------------------------
1166
1167 size_t
1168 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1169 const char *src, size_t srcLen) const
1170 {
1171 srcLen = GetLength(src, srcLen);
1172 if ( srcLen == wxNO_LEN )
1173 return wxCONV_FAILED;
1174
1175 const size_t inLen = srcLen / BYTES_PER_CHAR;
1176 if ( !dst )
1177 {
1178 // optimization: return maximal space which could be needed for this
1179 // string even if the real size could be smaller if the buffer contains
1180 // any surrogates
1181 return inLen;
1182 }
1183
1184 size_t outLen = 0;
1185 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1186 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1187 {
1188 wxUint32 ch;
1189 wxUint16 tmp[2];
1190
1191 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1192 inBuff++;
1193 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1194
1195 const size_t numChars = decode_utf16(tmp, ch);
1196 if ( numChars == wxCONV_FAILED )
1197 return wxCONV_FAILED;
1198
1199 if ( numChars == 2 )
1200 inBuff++;
1201
1202 if ( ++outLen > dstLen )
1203 return wxCONV_FAILED;
1204
1205 *dst++ = ch;
1206 }
1207
1208
1209 return outLen;
1210 }
1211
1212 size_t
1213 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1214 const wchar_t *src, size_t srcLen) const
1215 {
1216 if ( srcLen == wxNO_LEN )
1217 srcLen = wxWcslen(src) + 1;
1218
1219 size_t outLen = 0;
1220 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1221 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1222 {
1223 wxUint16 cc[2];
1224 const size_t numChars = encode_utf16(*src, cc);
1225 if ( numChars == wxCONV_FAILED )
1226 return wxCONV_FAILED;
1227
1228 outLen += numChars * BYTES_PER_CHAR;
1229 if ( outBuff )
1230 {
1231 if ( outLen > dstLen )
1232 return wxCONV_FAILED;
1233
1234 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1235 if ( numChars == 2 )
1236 {
1237 // second character of a surrogate
1238 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1239 }
1240 }
1241 }
1242
1243 return outLen;
1244 }
1245
1246 #endif // WC_UTF16/!WC_UTF16
1247
1248
1249 // ============================================================================
1250 // UTF-32
1251 // ============================================================================
1252
1253 #ifdef WORDS_BIGENDIAN
1254 #define wxMBConvUTF32straight wxMBConvUTF32BE
1255 #define wxMBConvUTF32swap wxMBConvUTF32LE
1256 #else
1257 #define wxMBConvUTF32swap wxMBConvUTF32BE
1258 #define wxMBConvUTF32straight wxMBConvUTF32LE
1259 #endif
1260
1261
1262 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1263 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1264
1265 /* static */
1266 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1267 {
1268 if ( srcLen == wxNO_LEN )
1269 {
1270 // count the number of bytes in input, including the trailing NULs
1271 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1272 for ( srcLen = 1; *inBuff++; srcLen++ )
1273 ;
1274
1275 srcLen *= BYTES_PER_CHAR;
1276 }
1277 else // we already have the length
1278 {
1279 // we can only convert an entire number of UTF-32 characters
1280 if ( srcLen % BYTES_PER_CHAR )
1281 return wxCONV_FAILED;
1282 }
1283
1284 return srcLen;
1285 }
1286
1287 // case when in-memory representation is UTF-16
1288 #ifdef WC_UTF16
1289
1290 // ----------------------------------------------------------------------------
1291 // conversions without endianness change
1292 // ----------------------------------------------------------------------------
1293
1294 size_t
1295 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1296 const char *src, size_t srcLen) const
1297 {
1298 srcLen = GetLength(src, srcLen);
1299 if ( srcLen == wxNO_LEN )
1300 return wxCONV_FAILED;
1301
1302 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1303 const size_t inLen = srcLen / BYTES_PER_CHAR;
1304 size_t outLen = 0;
1305 for ( size_t n = 0; n < inLen; n++ )
1306 {
1307 wxUint16 cc[2];
1308 const size_t numChars = encode_utf16(*inBuff++, cc);
1309 if ( numChars == wxCONV_FAILED )
1310 return wxCONV_FAILED;
1311
1312 outLen += numChars;
1313 if ( dst )
1314 {
1315 if ( outLen > dstLen )
1316 return wxCONV_FAILED;
1317
1318 *dst++ = cc[0];
1319 if ( numChars == 2 )
1320 {
1321 // second character of a surrogate
1322 *dst++ = cc[1];
1323 }
1324 }
1325 }
1326
1327 return outLen;
1328 }
1329
1330 size_t
1331 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1332 const wchar_t *src, size_t srcLen) const
1333 {
1334 if ( srcLen == wxNO_LEN )
1335 srcLen = wxWcslen(src) + 1;
1336
1337 if ( !dst )
1338 {
1339 // optimization: return maximal space which could be needed for this
1340 // string instead of the exact amount which could be less if there are
1341 // any surrogates in the input
1342 //
1343 // we consider that surrogates are rare enough to make it worthwhile to
1344 // avoid running the loop below at the cost of slightly extra memory
1345 // consumption
1346 return srcLen * BYTES_PER_CHAR;
1347 }
1348
1349 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1350 size_t outLen = 0;
1351 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1352 {
1353 const wxUint32 ch = wxDecodeSurrogate(&src);
1354 if ( !src )
1355 return wxCONV_FAILED;
1356
1357 outLen += BYTES_PER_CHAR;
1358
1359 if ( outLen > dstLen )
1360 return wxCONV_FAILED;
1361
1362 *outBuff++ = ch;
1363 }
1364
1365 return outLen;
1366 }
1367
1368 // ----------------------------------------------------------------------------
1369 // endian-reversing conversions
1370 // ----------------------------------------------------------------------------
1371
1372 size_t
1373 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1374 const char *src, size_t srcLen) const
1375 {
1376 srcLen = GetLength(src, srcLen);
1377 if ( srcLen == wxNO_LEN )
1378 return wxCONV_FAILED;
1379
1380 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1381 const size_t inLen = srcLen / BYTES_PER_CHAR;
1382 size_t outLen = 0;
1383 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1384 {
1385 wxUint16 cc[2];
1386 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1387 if ( numChars == wxCONV_FAILED )
1388 return wxCONV_FAILED;
1389
1390 outLen += numChars;
1391 if ( dst )
1392 {
1393 if ( outLen > dstLen )
1394 return wxCONV_FAILED;
1395
1396 *dst++ = cc[0];
1397 if ( numChars == 2 )
1398 {
1399 // second character of a surrogate
1400 *dst++ = cc[1];
1401 }
1402 }
1403 }
1404
1405 return outLen;
1406 }
1407
1408 size_t
1409 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1410 const wchar_t *src, size_t srcLen) const
1411 {
1412 if ( srcLen == wxNO_LEN )
1413 srcLen = wxWcslen(src) + 1;
1414
1415 if ( !dst )
1416 {
1417 // optimization: return maximal space which could be needed for this
1418 // string instead of the exact amount which could be less if there are
1419 // any surrogates in the input
1420 //
1421 // we consider that surrogates are rare enough to make it worthwhile to
1422 // avoid running the loop below at the cost of slightly extra memory
1423 // consumption
1424 return srcLen*BYTES_PER_CHAR;
1425 }
1426
1427 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1428 size_t outLen = 0;
1429 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1430 {
1431 const wxUint32 ch = wxDecodeSurrogate(&src);
1432 if ( !src )
1433 return wxCONV_FAILED;
1434
1435 outLen += BYTES_PER_CHAR;
1436
1437 if ( outLen > dstLen )
1438 return wxCONV_FAILED;
1439
1440 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1441 }
1442
1443 return outLen;
1444 }
1445
1446 #else // !WC_UTF16: wchar_t is UTF-32
1447
1448 // ----------------------------------------------------------------------------
1449 // conversions without endianness change
1450 // ----------------------------------------------------------------------------
1451
1452 size_t
1453 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1454 const char *src, size_t srcLen) const
1455 {
1456 // use memcpy() as it should be much faster than hand-written loop
1457 srcLen = GetLength(src, srcLen);
1458 if ( srcLen == wxNO_LEN )
1459 return wxCONV_FAILED;
1460
1461 const size_t inLen = srcLen/BYTES_PER_CHAR;
1462 if ( dst )
1463 {
1464 if ( dstLen < inLen )
1465 return wxCONV_FAILED;
1466
1467 memcpy(dst, src, srcLen);
1468 }
1469
1470 return inLen;
1471 }
1472
1473 size_t
1474 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1475 const wchar_t *src, size_t srcLen) const
1476 {
1477 if ( srcLen == wxNO_LEN )
1478 srcLen = wxWcslen(src) + 1;
1479
1480 srcLen *= BYTES_PER_CHAR;
1481
1482 if ( dst )
1483 {
1484 if ( dstLen < srcLen )
1485 return wxCONV_FAILED;
1486
1487 memcpy(dst, src, srcLen);
1488 }
1489
1490 return srcLen;
1491 }
1492
1493 // ----------------------------------------------------------------------------
1494 // endian-reversing conversions
1495 // ----------------------------------------------------------------------------
1496
1497 size_t
1498 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1499 const char *src, size_t srcLen) const
1500 {
1501 srcLen = GetLength(src, srcLen);
1502 if ( srcLen == wxNO_LEN )
1503 return wxCONV_FAILED;
1504
1505 srcLen /= BYTES_PER_CHAR;
1506
1507 if ( dst )
1508 {
1509 if ( dstLen < srcLen )
1510 return wxCONV_FAILED;
1511
1512 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1513 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1514 {
1515 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1516 }
1517 }
1518
1519 return srcLen;
1520 }
1521
1522 size_t
1523 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1524 const wchar_t *src, size_t srcLen) const
1525 {
1526 if ( srcLen == wxNO_LEN )
1527 srcLen = wxWcslen(src) + 1;
1528
1529 srcLen *= BYTES_PER_CHAR;
1530
1531 if ( dst )
1532 {
1533 if ( dstLen < srcLen )
1534 return wxCONV_FAILED;
1535
1536 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1537 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1538 {
1539 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1540 }
1541 }
1542
1543 return srcLen;
1544 }
1545
1546 #endif // WC_UTF16/!WC_UTF16
1547
1548
1549 // ============================================================================
1550 // The classes doing conversion using the iconv_xxx() functions
1551 // ============================================================================
1552
1553 #ifdef HAVE_ICONV
1554
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the value returned by iconv() and the
// number of input bytes left after the call.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// casts the address of the input pointer to the type used for the input
// buffer argument of iconv() calls in this file
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value against which iconv_open() results are compared to detect failure
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP swaps the bytes of a single wchar_t and WC_ENC is the font
// encoding constant matching the size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1584
1585 // ----------------------------------------------------------------------------
1586 // wxMBConv_iconv: encapsulates an iconv character set
1587 // ----------------------------------------------------------------------------
1588
1589 class wxMBConv_iconv : public wxMBConv
1590 {
1591 public:
1592 wxMBConv_iconv(const wxChar *name);
1593 virtual ~wxMBConv_iconv();
1594
1595 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1596 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1597
1598 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1599 virtual size_t GetMBNulLen() const;
1600
1601 virtual wxMBConv *Clone() const
1602 {
1603 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1604 p->m_minMBCharWidth = m_minMBCharWidth;
1605 return p;
1606 }
1607
1608 bool IsOk() const
1609 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1610
1611 protected:
1612 // the iconv handlers used to translate from multibyte
1613 // to wide char and in the other direction
1614 iconv_t m2w,
1615 w2m;
1616
1617 #if wxUSE_THREADS
1618 // guards access to m2w and w2m objects
1619 wxMutex m_iconvMutex;
1620 #endif
1621
1622 private:
1623 // the name (for iconv_open()) of a wide char charset -- if none is
1624 // available on this machine, it will remain NULL
1625 static wxString ms_wcCharsetName;
1626
1627 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1628 // different endian-ness than the native one
1629 static bool ms_wcNeedsSwap;
1630
1631
1632 // name of the encoding handled by this conversion
1633 wxString m_name;
1634
1635 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1636 // initially
1637 size_t m_minMBCharWidth;
1638 };
1639
1640 // make the constructor available for unit testing
1641 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1642 {
1643 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1644 if ( !result->IsOk() )
1645 {
1646 delete result;
1647 return 0;
1648 }
1649
1650 return result;
1651 }
1652
// definitions of the static members of wxMBConv_iconv: the charset name is
// initially empty and no byte swapping is assumed to be needed
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1655
1656 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1657 : m_name(name)
1658 {
1659 m_minMBCharWidth = 0;
1660
1661 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1662 // names for the charsets
1663 const wxCharBuffer cname(wxString(name).ToAscii());
1664
1665 // check for charset that represents wchar_t:
1666 if ( ms_wcCharsetName.empty() )
1667 {
1668 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1669
1670 #if wxUSE_FONTMAP
1671 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1672 #else // !wxUSE_FONTMAP
1673 static const wxChar *names_static[] =
1674 {
1675 #if SIZEOF_WCHAR_T == 4
1676 _T("UCS-4"),
1677 #elif SIZEOF_WCHAR_T = 2
1678 _T("UCS-2"),
1679 #endif
1680 NULL
1681 };
1682 const wxChar **names = names_static;
1683 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1684
1685 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1686 {
1687 const wxString nameCS(*names);
1688
1689 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1690 wxString nameXE(nameCS);
1691
1692 #ifdef WORDS_BIGENDIAN
1693 nameXE += _T("BE");
1694 #else // little endian
1695 nameXE += _T("LE");
1696 #endif
1697
1698 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1699 nameXE.c_str());
1700
1701 m2w = iconv_open(nameXE.ToAscii(), cname);
1702 if ( m2w == ICONV_T_INVALID )
1703 {
1704 // try charset w/o bytesex info (e.g. "UCS4")
1705 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1706 nameCS.c_str());
1707 m2w = iconv_open(nameCS.ToAscii(), cname);
1708
1709 // and check for bytesex ourselves:
1710 if ( m2w != ICONV_T_INVALID )
1711 {
1712 char buf[2], *bufPtr;
1713 wchar_t wbuf[2], *wbufPtr;
1714 size_t insz, outsz;
1715 size_t res;
1716
1717 buf[0] = 'A';
1718 buf[1] = 0;
1719 wbuf[0] = 0;
1720 insz = 2;
1721 outsz = SIZEOF_WCHAR_T * 2;
1722 wbufPtr = wbuf;
1723 bufPtr = buf;
1724
1725 res = iconv(
1726 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1727 (char**)&wbufPtr, &outsz);
1728
1729 if (ICONV_FAILED(res, insz))
1730 {
1731 wxLogLastError(wxT("iconv"));
1732 wxLogError(_("Conversion to charset '%s' doesn't work."),
1733 nameCS.c_str());
1734 }
1735 else // ok, can convert to this encoding, remember it
1736 {
1737 ms_wcCharsetName = nameCS;
1738 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1739 }
1740 }
1741 }
1742 else // use charset not requiring byte swapping
1743 {
1744 ms_wcCharsetName = nameXE;
1745 }
1746 }
1747
1748 wxLogTrace(TRACE_STRCONV,
1749 wxT("iconv wchar_t charset is \"%s\"%s"),
1750 ms_wcCharsetName.empty() ? _T("<none>")
1751 : ms_wcCharsetName.c_str(),
1752 ms_wcNeedsSwap ? _T(" (needs swap)")
1753 : _T(""));
1754 }
1755 else // we already have ms_wcCharsetName
1756 {
1757 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1758 }
1759
1760 if ( ms_wcCharsetName.empty() )
1761 {
1762 w2m = ICONV_T_INVALID;
1763 }
1764 else
1765 {
1766 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1767 if ( w2m == ICONV_T_INVALID )
1768 {
1769 wxLogTrace(TRACE_STRCONV,
1770 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1771 ms_wcCharsetName.c_str(), cname.data());
1772 }
1773 }
1774 }
1775
1776 wxMBConv_iconv::~wxMBConv_iconv()
1777 {
1778 if ( m2w != ICONV_T_INVALID )
1779 iconv_close(m2w);
1780 if ( w2m != ICONV_T_INVALID )
1781 iconv_close(w2m);
1782 }
1783
1784 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1785 {
1786 // find the string length: notice that must be done differently for
1787 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1788 size_t inbuf;
1789 const size_t nulLen = GetMBNulLen();
1790 switch ( nulLen )
1791 {
1792 default:
1793 return wxCONV_FAILED;
1794
1795 case 1:
1796 inbuf = strlen(psz); // arguably more optimized than our version
1797 break;
1798
1799 case 2:
1800 case 4:
1801 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1802 // they also have to start at character boundary and not span two
1803 // adjacent characters
1804 const char *p;
1805 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1806 ;
1807 inbuf = p - psz;
1808 break;
1809 }
1810
1811 #if wxUSE_THREADS
1812 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1813 // Unfortunately there are a couple of global wxCSConv objects such as
1814 // wxConvLocal that are used all over wx code, so we have to make sure
1815 // the handle is used by at most one thread at the time. Otherwise
1816 // only a few wx classes would be safe to use from non-main threads
1817 // as MB<->WC conversion would fail "randomly".
1818 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1819 #endif // wxUSE_THREADS
1820
1821 size_t outbuf = n * SIZEOF_WCHAR_T;
1822 size_t res, cres;
1823 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1824 wchar_t *bufPtr = buf;
1825 const char *pszPtr = psz;
1826
1827 if (buf)
1828 {
1829 // have destination buffer, convert there
1830 cres = iconv(m2w,
1831 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1832 (char**)&bufPtr, &outbuf);
1833 res = n - (outbuf / SIZEOF_WCHAR_T);
1834
1835 if (ms_wcNeedsSwap)
1836 {
1837 // convert to native endianness
1838 for ( unsigned i = 0; i < res; i++ )
1839 buf[n] = WC_BSWAP(buf[i]);
1840 }
1841
1842 // NUL-terminate the string if there is any space left
1843 if (res < n)
1844 buf[res] = 0;
1845 }
1846 else
1847 {
1848 // no destination buffer... convert using temp buffer
1849 // to calculate destination buffer requirement
1850 wchar_t tbuf[8];
1851 res = 0;
1852
1853 do
1854 {
1855 bufPtr = tbuf;
1856 outbuf = 8 * SIZEOF_WCHAR_T;
1857
1858 cres = iconv(m2w,
1859 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1860 (char**)&bufPtr, &outbuf );
1861
1862 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1863 }
1864 while ((cres == (size_t)-1) && (errno == E2BIG));
1865 }
1866
1867 if (ICONV_FAILED(cres, inbuf))
1868 {
1869 //VS: it is ok if iconv fails, hence trace only
1870 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1871 return wxCONV_FAILED;
1872 }
1873
1874 return res;
1875 }
1876
1877 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1878 {
1879 #if wxUSE_THREADS
1880 // NB: explained in MB2WC
1881 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1882 #endif
1883
1884 size_t inlen = wxWcslen(psz);
1885 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1886 size_t outbuf = n;
1887 size_t res, cres;
1888
1889 wchar_t *tmpbuf = 0;
1890
1891 if (ms_wcNeedsSwap)
1892 {
1893 // need to copy to temp buffer to switch endianness
1894 // (doing WC_BSWAP twice on the original buffer won't help, as it
1895 // could be in read-only memory, or be accessed in some other thread)
1896 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1897 for ( size_t i = 0; i < inlen; i++ )
1898 tmpbuf[n] = WC_BSWAP(psz[i]);
1899
1900 tmpbuf[inlen] = L'\0';
1901 psz = tmpbuf;
1902 }
1903
1904 if (buf)
1905 {
1906 // have destination buffer, convert there
1907 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1908
1909 res = n - outbuf;
1910
1911 // NB: iconv was given only wcslen(psz) characters on input, and so
1912 // it couldn't convert the trailing zero. Let's do it ourselves
1913 // if there's some room left for it in the output buffer.
1914 if (res < n)
1915 buf[0] = 0;
1916 }
1917 else
1918 {
1919 // no destination buffer: convert using temp buffer
1920 // to calculate destination buffer requirement
1921 char tbuf[16];
1922 res = 0;
1923 do
1924 {
1925 buf = tbuf;
1926 outbuf = 16;
1927
1928 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1929
1930 res += 16 - outbuf;
1931 }
1932 while ((cres == (size_t)-1) && (errno == E2BIG));
1933 }
1934
1935 if (ms_wcNeedsSwap)
1936 {
1937 free(tmpbuf);
1938 }
1939
1940 if (ICONV_FAILED(cres, inbuf))
1941 {
1942 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1943 return wxCONV_FAILED;
1944 }
1945
1946 return res;
1947 }
1948
1949 size_t wxMBConv_iconv::GetMBNulLen() const
1950 {
1951 if ( m_minMBCharWidth == 0 )
1952 {
1953 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1954
1955 #if wxUSE_THREADS
1956 // NB: explained in MB2WC
1957 wxMutexLocker lock(self->m_iconvMutex);
1958 #endif
1959
1960 wchar_t *wnul = L"";
1961 char buf[8]; // should be enough for NUL in any encoding
1962 size_t inLen = sizeof(wchar_t),
1963 outLen = WXSIZEOF(buf);
1964 char *inBuff = (char *)wnul;
1965 char *outBuff = buf;
1966 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1967 {
1968 self->m_minMBCharWidth = (size_t)-1;
1969 }
1970 else // ok
1971 {
1972 self->m_minMBCharWidth = outBuff - buf;
1973 }
1974 }
1975
1976 return m_minMBCharWidth;
1977 }
1978
1979 #endif // HAVE_ICONV
1980
1981
1982 // ============================================================================
1983 // Win32 conversion classes
1984 // ============================================================================
1985
1986 #ifdef wxHAVE_WIN32_MB2WC
1987
1988 // from utils.cpp
1989 #if wxUSE_FONTMAP
1990 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1991 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1992 #endif
1993
1994 class wxMBConv_win32 : public wxMBConv
1995 {
1996 public:
1997 wxMBConv_win32()
1998 {
1999 m_CodePage = CP_ACP;
2000 m_minMBCharWidth = 0;
2001 }
2002
2003 wxMBConv_win32(const wxMBConv_win32& conv)
2004 : wxMBConv()
2005 {
2006 m_CodePage = conv.m_CodePage;
2007 m_minMBCharWidth = conv.m_minMBCharWidth;
2008 }
2009
2010 #if wxUSE_FONTMAP
2011 wxMBConv_win32(const wxChar* name)
2012 {
2013 m_CodePage = wxCharsetToCodepage(name);
2014 m_minMBCharWidth = 0;
2015 }
2016
2017 wxMBConv_win32(wxFontEncoding encoding)
2018 {
2019 m_CodePage = wxEncodingToCodepage(encoding);
2020 m_minMBCharWidth = 0;
2021 }
2022 #endif // wxUSE_FONTMAP
2023
2024 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2025 {
2026 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2027 // the behaviour is not compatible with the Unix version (using iconv)
2028 // and break the library itself, e.g. wxTextInputStream::NextChar()
2029 // wouldn't work if reading an incomplete MB char didn't result in an
2030 // error
2031 //
2032 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2033 // Win XP or newer and it is not supported for UTF-[78] so we always
2034 // use our own conversions in this case. See
2035 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2036 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2037 if ( m_CodePage == CP_UTF8 )
2038 {
2039 return wxConvUTF8.MB2WC(buf, psz, n);
2040 }
2041
2042 if ( m_CodePage == CP_UTF7 )
2043 {
2044 return wxConvUTF7.MB2WC(buf, psz, n);
2045 }
2046
2047 int flags = 0;
2048 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2049 IsAtLeastWin2kSP4() )
2050 {
2051 flags = MB_ERR_INVALID_CHARS;
2052 }
2053
2054 const size_t len = ::MultiByteToWideChar
2055 (
2056 m_CodePage, // code page
2057 flags, // flags: fall on error
2058 psz, // input string
2059 -1, // its length (NUL-terminated)
2060 buf, // output string
2061 buf ? n : 0 // size of output buffer
2062 );
2063 if ( !len )
2064 {
2065 // function totally failed
2066 return wxCONV_FAILED;
2067 }
2068
2069 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2070 // check if we succeeded, by doing a double trip:
2071 if ( !flags && buf )
2072 {
2073 const size_t mbLen = strlen(psz);
2074 wxCharBuffer mbBuf(mbLen);
2075 if ( ::WideCharToMultiByte
2076 (
2077 m_CodePage,
2078 0,
2079 buf,
2080 -1,
2081 mbBuf.data(),
2082 mbLen + 1, // size in bytes, not length
2083 NULL,
2084 NULL
2085 ) == 0 ||
2086 strcmp(mbBuf, psz) != 0 )
2087 {
2088 // we didn't obtain the same thing we started from, hence
2089 // the conversion was lossy and we consider that it failed
2090 return wxCONV_FAILED;
2091 }
2092 }
2093
2094 // note that it returns count of written chars for buf != NULL and size
2095 // of the needed buffer for buf == NULL so in either case the length of
2096 // the string (which never includes the terminating NUL) is one less
2097 return len - 1;
2098 }
2099
2100 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2101 {
2102 /*
2103 we have a problem here: by default, WideCharToMultiByte() may
2104 replace characters unrepresentable in the target code page with bad
2105 quality approximations such as turning "1/2" symbol (U+00BD) into
2106 "1" for the code pages which don't have it and we, obviously, want
2107 to avoid this at any price
2108
2109 the trouble is that this function does it _silently_, i.e. it won't
2110 even tell us whether it did or not... Win98/2000 and higher provide
2111 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2112 we have to resort to a round trip, i.e. check that converting back
2113 results in the same string -- this is, of course, expensive but
2114 otherwise we simply can't be sure to not garble the data.
2115 */
2116
2117 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2118 // it doesn't work with CJK encodings (which we test for rather roughly
2119 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2120 // supporting it
2121 BOOL usedDef wxDUMMY_INITIALIZE(false);
2122 BOOL *pUsedDef;
2123 int flags;
2124 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2125 {
2126 // it's our lucky day
2127 flags = WC_NO_BEST_FIT_CHARS;
2128 pUsedDef = &usedDef;
2129 }
2130 else // old system or unsupported encoding
2131 {
2132 flags = 0;
2133 pUsedDef = NULL;
2134 }
2135
2136 const size_t len = ::WideCharToMultiByte
2137 (
2138 m_CodePage, // code page
2139 flags, // either none or no best fit
2140 pwz, // input string
2141 -1, // it is (wide) NUL-terminated
2142 buf, // output buffer
2143 buf ? n : 0, // and its size
2144 NULL, // default "replacement" char
2145 pUsedDef // [out] was it used?
2146 );
2147
2148 if ( !len )
2149 {
2150 // function totally failed
2151 return wxCONV_FAILED;
2152 }
2153
2154 // if we were really converting, check if we succeeded
2155 if ( buf )
2156 {
2157 if ( flags )
2158 {
2159 // check if the conversion failed, i.e. if any replacements
2160 // were done
2161 if ( usedDef )
2162 return wxCONV_FAILED;
2163 }
2164 else // we must resort to double tripping...
2165 {
2166 wxWCharBuffer wcBuf(n);
2167 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2168 wcscmp(wcBuf, pwz) != 0 )
2169 {
2170 // we didn't obtain the same thing we started from, hence
2171 // the conversion was lossy and we consider that it failed
2172 return wxCONV_FAILED;
2173 }
2174 }
2175 }
2176
2177 // see the comment above for the reason of "len - 1"
2178 return len - 1;
2179 }
2180
2181 virtual size_t GetMBNulLen() const
2182 {
2183 if ( m_minMBCharWidth == 0 )
2184 {
2185 int len = ::WideCharToMultiByte
2186 (
2187 m_CodePage, // code page
2188 0, // no flags
2189 L"", // input string
2190 1, // translate just the NUL
2191 NULL, // output buffer
2192 0, // and its size
2193 NULL, // no replacement char
2194 NULL // [out] don't care if it was used
2195 );
2196
2197 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2198 switch ( len )
2199 {
2200 default:
2201 wxLogDebug(_T("Unexpected NUL length %d"), len);
2202 self->m_minMBCharWidth = (size_t)-1;
2203 break;
2204
2205 case 0:
2206 self->m_minMBCharWidth = (size_t)-1;
2207 break;
2208
2209 case 1:
2210 case 2:
2211 case 4:
2212 self->m_minMBCharWidth = len;
2213 break;
2214 }
2215 }
2216
2217 return m_minMBCharWidth;
2218 }
2219
2220 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2221
2222 bool IsOk() const { return m_CodePage != -1; }
2223
2224 private:
2225 static bool CanUseNoBestFit()
2226 {
2227 static int s_isWin98Or2k = -1;
2228
2229 if ( s_isWin98Or2k == -1 )
2230 {
2231 int verMaj, verMin;
2232 switch ( wxGetOsVersion(&verMaj, &verMin) )
2233 {
2234 case wxOS_WINDOWS_9X:
2235 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2236 break;
2237
2238 case wxOS_WINDOWS_NT:
2239 s_isWin98Or2k = verMaj >= 5;
2240 break;
2241
2242 default:
2243 // unknown: be conservative by default
2244 s_isWin98Or2k = 0;
2245 break;
2246 }
2247
2248 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2249 }
2250
2251 return s_isWin98Or2k == 1;
2252 }
2253
2254 static bool IsAtLeastWin2kSP4()
2255 {
2256 #ifdef __WXWINCE__
2257 return false;
2258 #else
2259 static int s_isAtLeastWin2kSP4 = -1;
2260
2261 if ( s_isAtLeastWin2kSP4 == -1 )
2262 {
2263 OSVERSIONINFOEX ver;
2264
2265 memset(&ver, 0, sizeof(ver));
2266 ver.dwOSVersionInfoSize = sizeof(ver);
2267 GetVersionEx((OSVERSIONINFO*)&ver);
2268
2269 s_isAtLeastWin2kSP4 =
2270 ((ver.dwMajorVersion > 5) || // Vista+
2271 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2272 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2273 ver.wServicePackMajor >= 4)) // 2000 SP4+
2274 ? 1 : 0;
2275 }
2276
2277 return s_isAtLeastWin2kSP4 == 1;
2278 #endif
2279 }
2280
2281
2282 // the code page we're working with
2283 long m_CodePage;
2284
2285 // cached result of GetMBNulLen(), set to 0 initially meaning
2286 // "unknown"
2287 size_t m_minMBCharWidth;
2288 };
2289
2290 #endif // wxHAVE_WIN32_MB2WC
2291
2292 // ============================================================================
2293 // Cocoa conversion classes
2294 // ============================================================================
2295
2296 #if defined(__WXCOCOA__)
2297
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2301
2302 #include <CoreFoundation/CFString.h>
2303 #include <CoreFoundation/CFStringEncodingExt.h>
2304
2305 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2306 {
2307 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2308
2309 switch (encoding)
2310 {
2311 case wxFONTENCODING_DEFAULT :
2312 enc = CFStringGetSystemEncoding();
2313 break ;
2314
2315 case wxFONTENCODING_ISO8859_1 :
2316 enc = kCFStringEncodingISOLatin1 ;
2317 break ;
2318 case wxFONTENCODING_ISO8859_2 :
2319 enc = kCFStringEncodingISOLatin2;
2320 break ;
2321 case wxFONTENCODING_ISO8859_3 :
2322 enc = kCFStringEncodingISOLatin3 ;
2323 break ;
2324 case wxFONTENCODING_ISO8859_4 :
2325 enc = kCFStringEncodingISOLatin4;
2326 break ;
2327 case wxFONTENCODING_ISO8859_5 :
2328 enc = kCFStringEncodingISOLatinCyrillic;
2329 break ;
2330 case wxFONTENCODING_ISO8859_6 :
2331 enc = kCFStringEncodingISOLatinArabic;
2332 break ;
2333 case wxFONTENCODING_ISO8859_7 :
2334 enc = kCFStringEncodingISOLatinGreek;
2335 break ;
2336 case wxFONTENCODING_ISO8859_8 :
2337 enc = kCFStringEncodingISOLatinHebrew;
2338 break ;
2339 case wxFONTENCODING_ISO8859_9 :
2340 enc = kCFStringEncodingISOLatin5;
2341 break ;
2342 case wxFONTENCODING_ISO8859_10 :
2343 enc = kCFStringEncodingISOLatin6;
2344 break ;
2345 case wxFONTENCODING_ISO8859_11 :
2346 enc = kCFStringEncodingISOLatinThai;
2347 break ;
2348 case wxFONTENCODING_ISO8859_13 :
2349 enc = kCFStringEncodingISOLatin7;
2350 break ;
2351 case wxFONTENCODING_ISO8859_14 :
2352 enc = kCFStringEncodingISOLatin8;
2353 break ;
2354 case wxFONTENCODING_ISO8859_15 :
2355 enc = kCFStringEncodingISOLatin9;
2356 break ;
2357
2358 case wxFONTENCODING_KOI8 :
2359 enc = kCFStringEncodingKOI8_R;
2360 break ;
2361 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2362 enc = kCFStringEncodingDOSRussian;
2363 break ;
2364
2365 // case wxFONTENCODING_BULGARIAN :
2366 // enc = ;
2367 // break ;
2368
2369 case wxFONTENCODING_CP437 :
2370 enc = kCFStringEncodingDOSLatinUS ;
2371 break ;
2372 case wxFONTENCODING_CP850 :
2373 enc = kCFStringEncodingDOSLatin1;
2374 break ;
2375 case wxFONTENCODING_CP852 :
2376 enc = kCFStringEncodingDOSLatin2;
2377 break ;
2378 case wxFONTENCODING_CP855 :
2379 enc = kCFStringEncodingDOSCyrillic;
2380 break ;
2381 case wxFONTENCODING_CP866 :
2382 enc = kCFStringEncodingDOSRussian ;
2383 break ;
2384 case wxFONTENCODING_CP874 :
2385 enc = kCFStringEncodingDOSThai;
2386 break ;
2387 case wxFONTENCODING_CP932 :
2388 enc = kCFStringEncodingDOSJapanese;
2389 break ;
2390 case wxFONTENCODING_CP936 :
2391 enc = kCFStringEncodingDOSChineseSimplif ;
2392 break ;
2393 case wxFONTENCODING_CP949 :
2394 enc = kCFStringEncodingDOSKorean;
2395 break ;
2396 case wxFONTENCODING_CP950 :
2397 enc = kCFStringEncodingDOSChineseTrad;
2398 break ;
2399 case wxFONTENCODING_CP1250 :
2400 enc = kCFStringEncodingWindowsLatin2;
2401 break ;
2402 case wxFONTENCODING_CP1251 :
2403 enc = kCFStringEncodingWindowsCyrillic ;
2404 break ;
2405 case wxFONTENCODING_CP1252 :
2406 enc = kCFStringEncodingWindowsLatin1 ;
2407 break ;
2408 case wxFONTENCODING_CP1253 :
2409 enc = kCFStringEncodingWindowsGreek;
2410 break ;
2411 case wxFONTENCODING_CP1254 :
2412 enc = kCFStringEncodingWindowsLatin5;
2413 break ;
2414 case wxFONTENCODING_CP1255 :
2415 enc = kCFStringEncodingWindowsHebrew ;
2416 break ;
2417 case wxFONTENCODING_CP1256 :
2418 enc = kCFStringEncodingWindowsArabic ;
2419 break ;
2420 case wxFONTENCODING_CP1257 :
2421 enc = kCFStringEncodingWindowsBalticRim;
2422 break ;
2423 // This only really encodes to UTF7 (if that) evidently
2424 // case wxFONTENCODING_UTF7 :
2425 // enc = kCFStringEncodingNonLossyASCII ;
2426 // break ;
2427 case wxFONTENCODING_UTF8 :
2428 enc = kCFStringEncodingUTF8 ;
2429 break ;
2430 case wxFONTENCODING_EUC_JP :
2431 enc = kCFStringEncodingEUC_JP;
2432 break ;
2433 case wxFONTENCODING_UTF16 :
2434 enc = kCFStringEncodingUnicode ;
2435 break ;
2436 case wxFONTENCODING_MACROMAN :
2437 enc = kCFStringEncodingMacRoman ;
2438 break ;
2439 case wxFONTENCODING_MACJAPANESE :
2440 enc = kCFStringEncodingMacJapanese ;
2441 break ;
2442 case wxFONTENCODING_MACCHINESETRAD :
2443 enc = kCFStringEncodingMacChineseTrad ;
2444 break ;
2445 case wxFONTENCODING_MACKOREAN :
2446 enc = kCFStringEncodingMacKorean ;
2447 break ;
2448 case wxFONTENCODING_MACARABIC :
2449 enc = kCFStringEncodingMacArabic ;
2450 break ;
2451 case wxFONTENCODING_MACHEBREW :
2452 enc = kCFStringEncodingMacHebrew ;
2453 break ;
2454 case wxFONTENCODING_MACGREEK :
2455 enc = kCFStringEncodingMacGreek ;
2456 break ;
2457 case wxFONTENCODING_MACCYRILLIC :
2458 enc = kCFStringEncodingMacCyrillic ;
2459 break ;
2460 case wxFONTENCODING_MACDEVANAGARI :
2461 enc = kCFStringEncodingMacDevanagari ;
2462 break ;
2463 case wxFONTENCODING_MACGURMUKHI :
2464 enc = kCFStringEncodingMacGurmukhi ;
2465 break ;
2466 case wxFONTENCODING_MACGUJARATI :
2467 enc = kCFStringEncodingMacGujarati ;
2468 break ;
2469 case wxFONTENCODING_MACORIYA :
2470 enc = kCFStringEncodingMacOriya ;
2471 break ;
2472 case wxFONTENCODING_MACBENGALI :
2473 enc = kCFStringEncodingMacBengali ;
2474 break ;
2475 case wxFONTENCODING_MACTAMIL :
2476 enc = kCFStringEncodingMacTamil ;
2477 break ;
2478 case wxFONTENCODING_MACTELUGU :
2479 enc = kCFStringEncodingMacTelugu ;
2480 break ;
2481 case wxFONTENCODING_MACKANNADA :
2482 enc = kCFStringEncodingMacKannada ;
2483 break ;
2484 case wxFONTENCODING_MACMALAJALAM :
2485 enc = kCFStringEncodingMacMalayalam ;
2486 break ;
2487 case wxFONTENCODING_MACSINHALESE :
2488 enc = kCFStringEncodingMacSinhalese ;
2489 break ;
2490 case wxFONTENCODING_MACBURMESE :
2491 enc = kCFStringEncodingMacBurmese ;
2492 break ;
2493 case wxFONTENCODING_MACKHMER :
2494 enc = kCFStringEncodingMacKhmer ;
2495 break ;
2496 case wxFONTENCODING_MACTHAI :
2497 enc = kCFStringEncodingMacThai ;
2498 break ;
2499 case wxFONTENCODING_MACLAOTIAN :
2500 enc = kCFStringEncodingMacLaotian ;
2501 break ;
2502 case wxFONTENCODING_MACGEORGIAN :
2503 enc = kCFStringEncodingMacGeorgian ;
2504 break ;
2505 case wxFONTENCODING_MACARMENIAN :
2506 enc = kCFStringEncodingMacArmenian ;
2507 break ;
2508 case wxFONTENCODING_MACCHINESESIMP :
2509 enc = kCFStringEncodingMacChineseSimp ;
2510 break ;
2511 case wxFONTENCODING_MACTIBETAN :
2512 enc = kCFStringEncodingMacTibetan ;
2513 break ;
2514 case wxFONTENCODING_MACMONGOLIAN :
2515 enc = kCFStringEncodingMacMongolian ;
2516 break ;
2517 case wxFONTENCODING_MACETHIOPIC :
2518 enc = kCFStringEncodingMacEthiopic ;
2519 break ;
2520 case wxFONTENCODING_MACCENTRALEUR :
2521 enc = kCFStringEncodingMacCentralEurRoman ;
2522 break ;
2523 case wxFONTENCODING_MACVIATNAMESE :
2524 enc = kCFStringEncodingMacVietnamese ;
2525 break ;
2526 case wxFONTENCODING_MACARABICEXT :
2527 enc = kCFStringEncodingMacExtArabic ;
2528 break ;
2529 case wxFONTENCODING_MACSYMBOL :
2530 enc = kCFStringEncodingMacSymbol ;
2531 break ;
2532 case wxFONTENCODING_MACDINGBATS :
2533 enc = kCFStringEncodingMacDingbats ;
2534 break ;
2535 case wxFONTENCODING_MACTURKISH :
2536 enc = kCFStringEncodingMacTurkish ;
2537 break ;
2538 case wxFONTENCODING_MACCROATIAN :
2539 enc = kCFStringEncodingMacCroatian ;
2540 break ;
2541 case wxFONTENCODING_MACICELANDIC :
2542 enc = kCFStringEncodingMacIcelandic ;
2543 break ;
2544 case wxFONTENCODING_MACROMANIAN :
2545 enc = kCFStringEncodingMacRomanian ;
2546 break ;
2547 case wxFONTENCODING_MACCELTIC :
2548 enc = kCFStringEncodingMacCeltic ;
2549 break ;
2550 case wxFONTENCODING_MACGAELIC :
2551 enc = kCFStringEncodingMacGaelic ;
2552 break ;
2553 // case wxFONTENCODING_MACKEYBOARD :
2554 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2555 // break ;
2556
2557 default :
2558 // because gcc is picky
2559 break ;
2560 }
2561
2562 return enc ;
2563 }
2564
2565 class wxMBConv_cocoa : public wxMBConv
2566 {
2567 public:
2568 wxMBConv_cocoa()
2569 {
2570 Init(CFStringGetSystemEncoding()) ;
2571 }
2572
2573 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2574 {
2575 m_encoding = conv.m_encoding;
2576 }
2577
2578 #if wxUSE_FONTMAP
2579 wxMBConv_cocoa(const wxChar* name)
2580 {
2581 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2582 }
2583 #endif
2584
2585 wxMBConv_cocoa(wxFontEncoding encoding)
2586 {
2587 Init( wxCFStringEncFromFontEnc(encoding) );
2588 }
2589
2590 virtual ~wxMBConv_cocoa()
2591 {
2592 }
2593
2594 void Init( CFStringEncoding encoding)
2595 {
2596 m_encoding = encoding ;
2597 }
2598
2599 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2600 {
2601 wxASSERT(szUnConv);
2602
2603 CFStringRef theString = CFStringCreateWithBytes (
2604 NULL, //the allocator
2605 (const UInt8*)szUnConv,
2606 strlen(szUnConv),
2607 m_encoding,
2608 false //no BOM/external representation
2609 );
2610
2611 wxASSERT(theString);
2612
2613 size_t nOutLength = CFStringGetLength(theString);
2614
2615 if (szOut == NULL)
2616 {
2617 CFRelease(theString);
2618 return nOutLength;
2619 }
2620
2621 CFRange theRange = { 0, nOutSize };
2622
2623 #if SIZEOF_WCHAR_T == 4
2624 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2625 #endif
2626
2627 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2628
2629 CFRelease(theString);
2630
2631 szUniCharBuffer[nOutLength] = '\0';
2632
2633 #if SIZEOF_WCHAR_T == 4
2634 wxMBConvUTF16 converter;
2635 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2636 delete [] szUniCharBuffer;
2637 #endif
2638
2639 return nOutLength;
2640 }
2641
2642 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2643 {
2644 wxASSERT(szUnConv);
2645
2646 size_t nRealOutSize;
2647 size_t nBufSize = wxWcslen(szUnConv);
2648 UniChar* szUniBuffer = (UniChar*) szUnConv;
2649
2650 #if SIZEOF_WCHAR_T == 4
2651 wxMBConvUTF16 converter ;
2652 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2653 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2654 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2655 nBufSize /= sizeof(UniChar);
2656 #endif
2657
2658 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2659 NULL, //allocator
2660 szUniBuffer,
2661 nBufSize,
2662 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2663 );
2664
2665 wxASSERT(theString);
2666
2667 //Note that CER puts a BOM when converting to unicode
2668 //so we check and use getchars instead in that case
2669 if (m_encoding == kCFStringEncodingUnicode)
2670 {
2671 if (szOut != NULL)
2672 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2673
2674 nRealOutSize = CFStringGetLength(theString) + 1;
2675 }
2676 else
2677 {
2678 CFStringGetBytes(
2679 theString,
2680 CFRangeMake(0, CFStringGetLength(theString)),
2681 m_encoding,
2682 0, //what to put in characters that can't be converted -
2683 //0 tells CFString to return NULL if it meets such a character
2684 false, //not an external representation
2685 (UInt8*) szOut,
2686 nOutSize,
2687 (CFIndex*) &nRealOutSize
2688 );
2689 }
2690
2691 CFRelease(theString);
2692
2693 #if SIZEOF_WCHAR_T == 4
2694 delete[] szUniBuffer;
2695 #endif
2696
2697 return nRealOutSize - 1;
2698 }
2699
2700 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2701
2702 bool IsOk() const
2703 {
2704 return m_encoding != kCFStringEncodingInvalidId &&
2705 CFStringIsEncodingAvailable(m_encoding);
2706 }
2707
2708 private:
2709 CFStringEncoding m_encoding ;
2710 };
2711
2712 #endif // defined(__WXCOCOA__)
2713
2714 // ============================================================================
2715 // Mac conversion classes
2716 // ============================================================================
2717
2718 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2719
2720 class wxMBConv_mac : public wxMBConv
2721 {
2722 public:
2723 wxMBConv_mac()
2724 {
2725 Init(CFStringGetSystemEncoding()) ;
2726 }
2727
2728 wxMBConv_mac(const wxMBConv_mac& conv)
2729 {
2730 Init(conv.m_char_encoding);
2731 }
2732
2733 #if wxUSE_FONTMAP
2734 wxMBConv_mac(const wxChar* name)
2735 {
2736 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2737 }
2738 #endif
2739
2740 wxMBConv_mac(wxFontEncoding encoding)
2741 {
2742 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2743 }
2744
2745 virtual ~wxMBConv_mac()
2746 {
2747 OSStatus status = noErr ;
2748 if (m_MB2WC_converter)
2749 status = TECDisposeConverter(m_MB2WC_converter);
2750 if (m_WC2MB_converter)
2751 status = TECDisposeConverter(m_WC2MB_converter);
2752 }
2753
2754 void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2755 TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2756 {
2757 m_MB2WC_converter = NULL ;
2758 m_WC2MB_converter = NULL ;
2759 m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2760 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2761 }
2762
2763 virtual void CreateIfNeeded() const
2764 {
2765 if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2766 {
2767 OSStatus status = noErr ;
2768 status = TECCreateConverter(&m_MB2WC_converter,
2769 m_char_encoding,
2770 m_unicode_encoding);
2771 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2772 status = TECCreateConverter(&m_WC2MB_converter,
2773 m_unicode_encoding,
2774 m_char_encoding);
2775 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2776 }
2777 }
2778
2779 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2780 {
2781 CreateIfNeeded() ;
2782 OSStatus status = noErr ;
2783 ByteCount byteOutLen ;
2784 ByteCount byteInLen = strlen(psz) + 1;
2785 wchar_t *tbuf = NULL ;
2786 UniChar* ubuf = NULL ;
2787 size_t res = 0 ;
2788
2789 if (buf == NULL)
2790 {
2791 // Apple specs say at least 32
2792 n = wxMax( 32, byteInLen ) ;
2793 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2794 }
2795
2796 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2797
2798 #if SIZEOF_WCHAR_T == 4
2799 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2800 #else
2801 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2802 #endif
2803
2804 status = TECConvertText(
2805 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2806 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2807
2808 #if SIZEOF_WCHAR_T == 4
2809 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2810 // is not properly terminated we get random characters at the end
2811 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2812 wxMBConvUTF16 converter ;
2813 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2814 free( ubuf ) ;
2815 #else
2816 res = byteOutLen / sizeof( UniChar ) ;
2817 #endif
2818
2819 if ( buf == NULL )
2820 free(tbuf) ;
2821
2822 if ( buf && res < n)
2823 buf[res] = 0;
2824
2825 return res ;
2826 }
2827
2828 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2829 {
2830 CreateIfNeeded() ;
2831 OSStatus status = noErr ;
2832 ByteCount byteOutLen ;
2833 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2834
2835 char *tbuf = NULL ;
2836
2837 if (buf == NULL)
2838 {
2839 // Apple specs say at least 32
2840 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2841 tbuf = (char*) malloc( n ) ;
2842 }
2843
2844 ByteCount byteBufferLen = n ;
2845 UniChar* ubuf = NULL ;
2846
2847 #if SIZEOF_WCHAR_T == 4
2848 wxMBConvUTF16 converter ;
2849 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2850 byteInLen = unicharlen ;
2851 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2852 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2853 #else
2854 ubuf = (UniChar*) psz ;
2855 #endif
2856
2857 status = TECConvertText(
2858 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2859 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2860
2861 #if SIZEOF_WCHAR_T == 4
2862 free( ubuf ) ;
2863 #endif
2864
2865 if ( buf == NULL )
2866 free(tbuf) ;
2867
2868 size_t res = byteOutLen ;
2869 if ( buf && res < n)
2870 {
2871 buf[res] = 0;
2872
2873 //we need to double-trip to verify it didn't insert any ? in place
2874 //of bogus characters
2875 wxWCharBuffer wcBuf(n);
2876 size_t pszlen = wxWcslen(psz);
2877 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2878 wxWcslen(wcBuf) != pszlen ||
2879 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2880 {
2881 // we didn't obtain the same thing we started from, hence
2882 // the conversion was lossy and we consider that it failed
2883 return wxCONV_FAILED;
2884 }
2885 }
2886
2887 return res ;
2888 }
2889
2890 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2891
2892 bool IsOk() const
2893 {
2894 CreateIfNeeded() ;
2895 return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2896 }
2897
2898 protected :
2899 mutable TECObjectRef m_MB2WC_converter;
2900 mutable TECObjectRef m_WC2MB_converter;
2901
2902 TextEncodingBase m_char_encoding;
2903 TextEncodingBase m_unicode_encoding;
2904 };
2905
2906 // MB is decomposed (D) normalized UTF8
2907
2908 class wxMBConv_macUTF8D : public wxMBConv_mac
2909 {
2910 public :
2911 wxMBConv_macUTF8D()
2912 {
2913 Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2914 m_uni = NULL;
2915 m_uniBack = NULL ;
2916 }
2917
2918 virtual ~wxMBConv_macUTF8D()
2919 {
2920 if (m_uni!=NULL)
2921 DisposeUnicodeToTextInfo(&m_uni);
2922 if (m_uniBack!=NULL)
2923 DisposeUnicodeToTextInfo(&m_uniBack);
2924 }
2925
2926 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2927 {
2928 CreateIfNeeded() ;
2929 OSStatus status = noErr ;
2930 ByteCount byteOutLen ;
2931 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2932
2933 char *tbuf = NULL ;
2934
2935 if (buf == NULL)
2936 {
2937 // Apple specs say at least 32
2938 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2939 tbuf = (char*) malloc( n ) ;
2940 }
2941
2942 ByteCount byteBufferLen = n ;
2943 UniChar* ubuf = NULL ;
2944
2945 #if SIZEOF_WCHAR_T == 4
2946 wxMBConvUTF16 converter ;
2947 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2948 byteInLen = unicharlen ;
2949 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2950 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2951 #else
2952 ubuf = (UniChar*) psz ;
2953 #endif
2954
2955 // ubuf is a non-decomposed UniChar buffer
2956
2957 ByteCount dcubuflen = byteInLen * 2 + 2 ;
2958 ByteCount dcubufread , dcubufwritten ;
2959 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2960
2961 ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2962 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ;
2963
2964 // we now convert that decomposed buffer into UTF8
2965
2966 status = TECConvertText(
2967 m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2968 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2969
2970 free( dcubuf );
2971
2972 #if SIZEOF_WCHAR_T == 4
2973 free( ubuf ) ;
2974 #endif
2975
2976 if ( buf == NULL )
2977 free(tbuf) ;
2978
2979 size_t res = byteOutLen ;
2980 if ( buf && res < n)
2981 {
2982 buf[res] = 0;
2983 // don't test for round-trip fidelity yet, we cannot guarantee it yet
2984 }
2985
2986 return res ;
2987 }
2988
2989 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2990 {
2991 CreateIfNeeded() ;
2992 OSStatus status = noErr ;
2993 ByteCount byteOutLen ;
2994 ByteCount byteInLen = strlen(psz) + 1;
2995 wchar_t *tbuf = NULL ;
2996 UniChar* ubuf = NULL ;
2997 size_t res = 0 ;
2998
2999 if (buf == NULL)
3000 {
3001 // Apple specs say at least 32
3002 n = wxMax( 32, byteInLen ) ;
3003 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
3004 }
3005
3006 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
3007
3008 #if SIZEOF_WCHAR_T == 4
3009 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
3010 #else
3011 ubuf = (UniChar*) (buf ? buf : tbuf) ;
3012 #endif
3013
3014 ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
3015 ByteCount dcubufread , dcubufwritten ;
3016 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3017
3018 status = TECConvertText(
3019 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
3020 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
3021 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3022 // is not properly terminated we get random characters at the end
3023 dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3024
3025 // now from the decomposed UniChar to properly composed uniChar
3026 ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
3027 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ;
3028
3029 free( dcubuf );
3030 byteOutLen = dcubufwritten ;
3031 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3032
3033
3034 #if SIZEOF_WCHAR_T == 4
3035 wxMBConvUTF16 converter ;
3036 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
3037 free( ubuf ) ;
3038 #else
3039 res = byteOutLen / sizeof( UniChar ) ;
3040 #endif
3041
3042 if ( buf == NULL )
3043 free(tbuf) ;
3044
3045 if ( buf && res < n)
3046 buf[res] = 0;
3047
3048 return res ;
3049 }
3050
3051 virtual void CreateIfNeeded() const
3052 {
3053 wxMBConv_mac::CreateIfNeeded() ;
3054 if ( m_uni == NULL )
3055 {
3056 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3057 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3058 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3059 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
3060 m_map.mappingVersion = kUnicodeUseLatestMapping;
3061
3062 OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
3063 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3064
3065 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3066 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3067 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3068 kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
3069 m_map.mappingVersion = kUnicodeUseLatestMapping;
3070 err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
3071 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3072 }
3073 }
3074 protected :
3075 mutable UnicodeToTextInfo m_uni;
3076 mutable UnicodeToTextInfo m_uniBack;
3077 mutable UnicodeMapping m_map;
3078 };
3079 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3080
3081 // ============================================================================
3082 // wxEncodingConverter based conversion classes
3083 // ============================================================================
3084
3085 #if wxUSE_FONTMAP
3086
3087 class wxMBConv_wxwin : public wxMBConv
3088 {
3089 private:
3090 void Init()
3091 {
3092 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3093 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3094 }
3095
3096 public:
3097 // temporarily just use wxEncodingConverter stuff,
3098 // so that it works while a better implementation is built
3099 wxMBConv_wxwin(const wxChar* name)
3100 {
3101 if (name)
3102 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3103 else
3104 m_enc = wxFONTENCODING_SYSTEM;
3105
3106 Init();
3107 }
3108
3109 wxMBConv_wxwin(wxFontEncoding enc)
3110 {
3111 m_enc = enc;
3112
3113 Init();
3114 }
3115
3116 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3117 {
3118 size_t inbuf = strlen(psz);
3119 if (buf)
3120 {
3121 if (!m2w.Convert(psz, buf))
3122 return wxCONV_FAILED;
3123 }
3124 return inbuf;
3125 }
3126
3127 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3128 {
3129 const size_t inbuf = wxWcslen(psz);
3130 if (buf)
3131 {
3132 if (!w2m.Convert(psz, buf))
3133 return wxCONV_FAILED;
3134 }
3135
3136 return inbuf;
3137 }
3138
3139 virtual size_t GetMBNulLen() const
3140 {
3141 switch ( m_enc )
3142 {
3143 case wxFONTENCODING_UTF16BE:
3144 case wxFONTENCODING_UTF16LE:
3145 return 2;
3146
3147 case wxFONTENCODING_UTF32BE:
3148 case wxFONTENCODING_UTF32LE:
3149 return 4;
3150
3151 default:
3152 return 1;
3153 }
3154 }
3155
3156 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3157
3158 bool IsOk() const { return m_ok; }
3159
3160 public:
3161 wxFontEncoding m_enc;
3162 wxEncodingConverter m2w, w2m;
3163
3164 private:
3165 // were we initialized successfully?
3166 bool m_ok;
3167
3168 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3169 };
3170
3171 // make the constructors available for unit testing
3172 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
3173 {
3174 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3175 if ( !result->IsOk() )
3176 {
3177 delete result;
3178 return 0;
3179 }
3180
3181 return result;
3182 }
3183
3184 #endif // wxUSE_FONTMAP
3185
3186 // ============================================================================
3187 // wxCSConv implementation
3188 // ============================================================================
3189
3190 void wxCSConv::Init()
3191 {
3192 m_name = NULL;
3193 m_convReal = NULL;
3194 m_deferred = true;
3195 }
3196
3197 wxCSConv::wxCSConv(const wxChar *charset)
3198 {
3199 Init();
3200
3201 if ( charset )
3202 {
3203 SetName(charset);
3204 }
3205
3206 #if wxUSE_FONTMAP
3207 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3208 #else
3209 m_encoding = wxFONTENCODING_SYSTEM;
3210 #endif
3211 }
3212
3213 wxCSConv::wxCSConv(wxFontEncoding encoding)
3214 {
3215 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3216 {
3217 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3218
3219 encoding = wxFONTENCODING_SYSTEM;
3220 }
3221
3222 Init();
3223
3224 m_encoding = encoding;
3225 }
3226
3227 wxCSConv::~wxCSConv()
3228 {
3229 Clear();
3230 }
3231
3232 wxCSConv::wxCSConv(const wxCSConv& conv)
3233 : wxMBConv()
3234 {
3235 Init();
3236
3237 SetName(conv.m_name);
3238 m_encoding = conv.m_encoding;
3239 }
3240
3241 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3242 {
3243 Clear();
3244
3245 SetName(conv.m_name);
3246 m_encoding = conv.m_encoding;
3247
3248 return *this;
3249 }
3250
3251 void wxCSConv::Clear()
3252 {
3253 free(m_name);
3254 delete m_convReal;
3255
3256 m_name = NULL;
3257 m_convReal = NULL;
3258 }
3259
3260 void wxCSConv::SetName(const wxChar *charset)
3261 {
3262 if (charset)
3263 {
3264 m_name = wxStrdup(charset);
3265 m_deferred = true;
3266 }
3267 }
3268
3269 #if wxUSE_FONTMAP
3270
// hash map type associating a charset name with a font encoding
3271 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3272 wxEncodingNameCache );
3273
// cache filled by wxCSConv::DoCreate() when iconv is used: for each encoding
// it remembers the name with which an iconv converter could be created or an
// empty string if all the names of this encoding were tried without success
3274 static wxEncodingNameCache gs_nameCache;
3275 #endif
3276
// Create the object which really performs the conversion for the charset
// m_name and/or the encoding m_encoding.
//
// Returns a new converter owned by the caller or NULL. NULL is returned both
// when no converter is needed (ISO8859-1 and wxFONTENCODING_DEFAULT are
// handled by wxCSConv itself) and when none could be created; in the latter
// case an error is logged before returning.
3277 wxMBConv *wxCSConv::DoCreate() const
3278 {
3279 #if wxUSE_FONTMAP
3280 wxLogTrace(TRACE_STRCONV,
3281 wxT("creating conversion for %s"),
3282 (m_name ? m_name
3283 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3284 #endif // wxUSE_FONTMAP
3285
3286 // check for the special case of ASCII or ISO8859-1 charset: as we have
3287 // special knowledge of it anyhow, we don't need to create a special
3288 // conversion object
3289 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3290 m_encoding == wxFONTENCODING_DEFAULT )
3291 {
3292 // don't convert at all
3293 return NULL;
3294 }
3295
3296 // we trust OS to do conversion better than we can so try external
3297 // conversion methods first
3298 //
3299 // the full order is:
3300 // 1. OS conversion (iconv() under Unix or Win32 API)
3301 // 2. hard coded conversions for UTF
3302 // 3. wxEncodingConverter as fall back
3303
3304 // step (1)
3305 #ifdef HAVE_ICONV
// without the font mapper only the charset name can be used with iconv
3306 #if !wxUSE_FONTMAP
3307 if ( m_name )
3308 #endif // !wxUSE_FONTMAP
3309 {
3310 wxString name(m_name);
3311 #if wxUSE_FONTMAP
3312 wxFontEncoding encoding(m_encoding);
3313 #endif
3314
// first try the name exactly as it was given to us
3315 if ( !name.empty() )
3316 {
3317 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3318 if ( conv->IsOk() )
3319 return conv;
3320
3321 delete conv;
3322
// iconv doesn't know this name: translate it to an encoding in order to
// try the other names of the same encoding below
3323 #if wxUSE_FONTMAP
3324 encoding =
3325 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3326 #endif // wxUSE_FONTMAP
3327 }
3328 #if wxUSE_FONTMAP
3329 {
// check if we have already found a working name for this encoding (or
// found out that there is none, which is cached as an empty string)
3330 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3331 if ( it != gs_nameCache.end() )
3332 {
3333 if ( it->second.empty() )
3334 return NULL;
3335
3336 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3337 if ( conv->IsOk() )
3338 return conv;
3339
3340 delete conv;
3341 }
3342
// nothing usable in the cache: try all the names of this encoding in turn
3343 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3344 // CS : in case this does not return valid names (eg for MacRoman) encoding
3345 // got a 'failure' entry in the cache all the same, although it just has to
3346 // be created using a different method, so only store failed iconv creation
3347 // attempts (or perhaps we shouldn't do this at all ?)
3348 if ( names[0] != NULL )
3349 {
3350 for ( ; *names; ++names )
3351 {
3352 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3353 if ( conv->IsOk() )
3354 {
3355 gs_nameCache[encoding] = *names;
3356 return conv;
3357 }
3358
3359 delete conv;
3360 }
3361
3362 gs_nameCache[encoding] = _T(""); // cache the failure
3363 }
3364 }
3365 #endif // wxUSE_FONTMAP
3366 }
3367 #endif // HAVE_ICONV
3368
3369 #ifdef wxHAVE_WIN32_MB2WC
3370 {
3371 #if wxUSE_FONTMAP
3372 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3373 : new wxMBConv_win32(m_encoding);
3374 if ( conv->IsOk() )
3375 return conv;
3376
3377 delete conv;
3378 #else
// NOTE(review): in this configuration the function returns here, so steps
// (2) and (3) below are never reached -- confirm that this is intended
3379 return NULL;
3380 #endif
3381 }
3382 #endif // wxHAVE_WIN32_MB2WC
3383
3384 #if defined(__WXMAC__)
3385 {
3386 // leave UTF16 and UTF32 to the built-ins of wx
3387 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3388 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3389 {
3390 #if wxUSE_FONTMAP
3391 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3392 : new wxMBConv_mac(m_encoding);
3393 #else
3394 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3395 #endif
3396 if ( conv->IsOk() )
3397 return conv;
3398
3399 delete conv;
3400 }
3401 }
3402 #endif
3403
3404 #if defined(__WXCOCOA__)
3405 {
3406 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3407 {
3408 #if wxUSE_FONTMAP
3409 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3410 : new wxMBConv_cocoa(m_encoding);
3411 #else
3412 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3413 #endif
3414
3415 if ( conv->IsOk() )
3416 return conv;
3417
3418 delete conv;
3419 }
3420 }
3421 #endif
3422 // step (2)
3423 wxFontEncoding enc = m_encoding;
3424 #if wxUSE_FONTMAP
3425 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3426 {
3427 // use "false" to suppress interactive dialogs -- we can be called from
3428 // anywhere and popping up a dialog from here is the last thing we want to
3429 // do
3430 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3431 }
3432 #endif // wxUSE_FONTMAP
3433
// the UTF encodings are implemented by wx itself
3434 switch ( enc )
3435 {
3436 case wxFONTENCODING_UTF7:
3437 return new wxMBConvUTF7;
3438
3439 case wxFONTENCODING_UTF8:
3440 return new wxMBConvUTF8;
3441
3442 case wxFONTENCODING_UTF16BE:
3443 return new wxMBConvUTF16BE;
3444
3445 case wxFONTENCODING_UTF16LE:
3446 return new wxMBConvUTF16LE;
3447
3448 case wxFONTENCODING_UTF32BE:
3449 return new wxMBConvUTF32BE;
3450
3451 case wxFONTENCODING_UTF32LE:
3452 return new wxMBConvUTF32LE;
3453
3454 default:
3455 // nothing to do but put here to suppress gcc warnings
3456 break;
3457 }
3458
3459 // step (3)
3460 #if wxUSE_FONTMAP
3461 {
3462 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3463 : new wxMBConv_wxwin(m_encoding);
3464 if ( conv->IsOk() )
3465 return conv;
3466
3467 delete conv;
3468 }
3469 #endif // wxUSE_FONTMAP
3470
3471 // NB: This is a hack to prevent deadlock. What could otherwise happen
3472 // in Unicode build: wxConvLocal creation ends up being here
3473 // because of some failure and logs the error. But wxLog will try to
3474 // attach a timestamp, for which it will need wxConvLocal (to convert
3475 // time to char* and then wchar_t*), but that fails, tries to log the
3476 // error, but wxLog has an (already locked) critical section that
3477 // guards the static buffer.
3478 static bool alreadyLoggingError = false;
3479 if (!alreadyLoggingError)
3480 {
3481 alreadyLoggingError = true;
3482 wxLogError(_("Cannot convert from the charset '%s'!"),
3483 m_name ? m_name
3484 :
3485 #if wxUSE_FONTMAP
3486 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3487 #else // !wxUSE_FONTMAP
3488 wxString::Format(_("encoding %i"), m_encoding).c_str()
3489 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3490 );
3491
3492 alreadyLoggingError = false;
3493 }
3494
// all the methods failed
3495 return NULL;
3496 }
3497
3498 void wxCSConv::CreateConvIfNeeded() const
3499 {
3500 if ( m_deferred )
3501 {
3502 wxCSConv *self = (wxCSConv *)this; // const_cast
3503
3504 // if we don't have neither the name nor the encoding, use the default
3505 // encoding for this system
3506 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3507 {
3508 #if wxUSE_INTL
3509 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3510 #else
3511 // fallback to some reasonable default:
3512 self->m_encoding = wxFONTENCODING_ISO8859_1;
3513 #endif // wxUSE_INTL
3514 }
3515
3516 self->m_convReal = DoCreate();
3517 self->m_deferred = false;
3518 }
3519 }
3520
3521 bool wxCSConv::IsOk() const
3522 {
3523 CreateConvIfNeeded();
3524
3525 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3526 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3527 return true; // always ok as we do it ourselves
3528
3529 // m_convReal->IsOk() is called at its own creation, so we know it must
3530 // be ok if m_convReal is non-NULL
3531 return m_convReal != NULL;
3532 }
3533
3534 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3535 const char *src, size_t srcLen) const
3536 {
3537 CreateConvIfNeeded();
3538
3539 if (m_convReal)
3540 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3541
3542 // latin-1 (direct)
3543 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3544 }
3545
3546 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3547 const wchar_t *src, size_t srcLen) const
3548 {
3549 CreateConvIfNeeded();
3550
3551 if (m_convReal)
3552 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3553
3554 // latin-1 (direct)
3555 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3556 }
3557
3558 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3559 {
3560 CreateConvIfNeeded();
3561
3562 if (m_convReal)
3563 return m_convReal->MB2WC(buf, psz, n);
3564
3565 // latin-1 (direct)
3566 size_t len = strlen(psz);
3567
3568 if (buf)
3569 {
3570 for (size_t c = 0; c <= len; c++)
3571 buf[c] = (unsigned char)(psz[c]);
3572 }
3573
3574 return len;
3575 }
3576
3577 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3578 {
3579 CreateConvIfNeeded();
3580
3581 if (m_convReal)
3582 return m_convReal->WC2MB(buf, psz, n);
3583
3584 // latin-1 (direct)
3585 const size_t len = wxWcslen(psz);
3586 if (buf)
3587 {
3588 for (size_t c = 0; c <= len; c++)
3589 {
3590 if (psz[c] > 0xFF)
3591 return wxCONV_FAILED;
3592
3593 buf[c] = (char)psz[c];
3594 }
3595 }
3596 else
3597 {
3598 for (size_t c = 0; c <= len; c++)
3599 {
3600 if (psz[c] > 0xFF)
3601 return wxCONV_FAILED;
3602 }
3603 }
3604
3605 return len;
3606 }
3607
3608 size_t wxCSConv::GetMBNulLen() const
3609 {
3610 CreateConvIfNeeded();
3611
3612 if ( m_convReal )
3613 {
3614 return m_convReal->GetMBNulLen();
3615 }
3616
3617 return 1;
3618 }
3619
3620 // ----------------------------------------------------------------------------
3621 // globals
3622 // ----------------------------------------------------------------------------
3623
3624 #ifdef __WINDOWS__
3625 static wxMBConv_win32 wxConvLibcObj;
3626 #elif defined(__WXMAC__) && !defined(__MACH__)
3627 static wxMBConv_mac wxConvLibcObj ;
3628 #else
3629 static wxMBConvLibc wxConvLibcObj;
3630 #endif
3631
3632 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3633 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3634 static wxMBConvUTF7 wxConvUTF7Obj;
3635 static wxMBConvUTF8 wxConvUTF8Obj;
3636 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3637 static wxMBConv_macUTF8D wxConvMacUTF8DObj;
3638 #endif
3639 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3640 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3641 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3642 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3643 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3644 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3645 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
3646 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3647 #ifdef __WXOSX__
3648 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3649 wxConvMacUTF8DObj;
3650 #else
3651 wxConvUTF8Obj;
3652 #endif
3653 #else
3654 wxConvLibcObj;
3655 #endif
3656
3657 #else // !wxUSE_WCHAR_T
3658
3659 // stand-ins in absence of wchar_t
3660 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3661 wxConvISO8859_1,
3662 wxConvLocal,
3663 wxConvUTF8;
3664
3665 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T