initial version of UTF-8 strings representation (still converting to wchar_t* a lot...
[wxWidgets.git] / src / common / string.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/string.cpp
3 // Purpose: wxString class
4 // Author: Vadim Zeitlin, Ryan Norton
5 // Modified by:
6 // Created: 29/01/98
7 // RCS-ID: $Id$
8 // Copyright: (c) 1998 Vadim Zeitlin <zeitlin@dptmaths.ens-cachan.fr>
9 // (c) 2004 Ryan Norton <wxprojects@comcast.net>
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
12
13 /*
14 * About ref counting:
15 * 1) all empty strings use g_strEmpty, nRefs = -1 (set in Init())
16 * 2) AllocBuffer() sets nRefs to 1, Lock() increments it by one
17 * 3) Unlock() decrements nRefs and frees memory if it goes to 0
18 */
19
20 // ===========================================================================
21 // headers, declarations, constants
22 // ===========================================================================
23
24 // For compilers that support precompilation, includes "wx.h".
25 #include "wx/wxprec.h"
26
27 #ifdef __BORLANDC__
28 #pragma hdrstop
29 #endif
30
31 #ifndef WX_PRECOMP
32 #include "wx/string.h"
33 #endif
34
35 #include <ctype.h>
36
37 #ifndef __WXWINCE__
38 #include <errno.h>
39 #endif
40
41 #include <string.h>
42 #include <stdlib.h>
43
44 #ifdef __SALFORDC__
45 #include <clib.h>
46 #endif
47
48 #include "wx/hashmap.h"
49
50 // string handling functions used by wxString:
51 #if wxUSE_UNICODE_UTF8
52 #define wxStringMemcpy memcpy
53 #define wxStringMemcmp memcmp
54 #define wxStringMemchr memchr
55 #define wxStringStrlen strlen
56 #else
57 #define wxStringMemcpy wxTmemcpy
58 #define wxStringMemcmp wxTmemcmp
59 #define wxStringMemchr wxTmemchr
60 #define wxStringStrlen wxStrlen
61 #endif
62
63
64 // ---------------------------------------------------------------------------
65 // static class variables definition
66 // ---------------------------------------------------------------------------
67
68 //According to STL _must_ be a -1 size_t
69 const size_t wxString::npos = (size_t) -1;
70
71 // ----------------------------------------------------------------------------
72 // global functions
73 // ----------------------------------------------------------------------------
74
75 #if wxUSE_STD_IOSTREAM
76
77 #include <iostream>
78
79 wxSTD ostream& operator<<(wxSTD ostream& os, const wxCStrData& str)
80 {
81 // FIXME-UTF8: always, not only if wxUSE_UNICODE
82 #if wxUSE_UNICODE && !defined(__BORLANDC__)
83 return os << str.AsWChar();
84 #else
85 return os << str.AsChar();
86 #endif
87 }
88
89 wxSTD ostream& operator<<(wxSTD ostream& os, const wxString& str)
90 {
91 return os << str.c_str();
92 }
93
94 wxSTD ostream& operator<<(wxSTD ostream& os, const wxCharBuffer& str)
95 {
96 return os << str.data();
97 }
98
99 #ifndef __BORLANDC__
100 wxSTD ostream& operator<<(wxSTD ostream& os, const wxWCharBuffer& str)
101 {
102 return os << str.data();
103 }
104 #endif
105
106 #endif // wxUSE_STD_IOSTREAM
107
108 // ===========================================================================
109 // wxString class core
110 // ===========================================================================
111
112 #if wxUSE_UNICODE_UTF8
113
114 // ---------------------------------------------------------------------------
115 // UTF-8 operations
116 // ---------------------------------------------------------------------------
117
118 //
119 // Table 3.1B from Unicode spec: Legal UTF-8 Byte Sequences
120 //
121 // Code Points | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
122 // -------------------+----------+----------+----------+----------+
123 // U+0000..U+007F | 00..7F | | | |
124 // U+0080..U+07FF | C2..DF | 80..BF | | |
125 // U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
126 // U+1000..U+FFFF | E1..EF | 80..BF | 80..BF | |
127 // U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
128 // U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
129 // U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
130 // -------------------+----------+----------+----------+----------+
131
132 bool wxString::IsValidUtf8String(const char *str)
133 {
134 if ( !str )
135 return true; // empty string is UTF8 string
136
137 const unsigned char *c = (const unsigned char*)str;
138
139 for ( ; *c; ++c )
140 {
141 unsigned char b = *c;
142
143 if ( b <= 0x7F ) // 00..7F
144 continue;
145
146 else if ( b < 0xC2 ) // invalid lead bytes: 80..C1
147 return false;
148
149 // two-byte sequences:
150 else if ( b <= 0xDF ) // C2..DF
151 {
152 b = *(++c);
153 if ( !(b >= 0x80 && b <= 0xBF ) )
154 return false;
155 }
156
157 // three-byte sequences:
158 else if ( b == 0xE0 )
159 {
160 b = *(++c);
161 if ( !(b >= 0xA0 && b <= 0xBF ) )
162 return false;
163 b = *(++c);
164 if ( !(b >= 0x80 && b <= 0xBF ) )
165 return false;
166 }
167 else if ( b <= 0xEF ) // E1..EF
168 {
169 for ( int i = 0; i < 2; ++i )
170 {
171 b = *(++c);
172 if ( !(b >= 0x80 && b <= 0xBF ) )
173 return false;
174 }
175 }
176
177 // four-byte sequences:
178 else if ( b == 0xF0 )
179 {
180 b = *(++c);
181 if ( !(b >= 0x90 && b <= 0xBF ) )
182 return false;
183 for ( int i = 0; i < 2; ++i )
184 {
185 b = *(++c);
186 if ( !(b >= 0x80 && b <= 0xBF ) )
187 return false;
188 }
189 }
190 else if ( b <= 0xF3 ) // F1..F3
191 {
192 for ( int i = 0; i < 3; ++i )
193 {
194 b = *(++c);
195 if ( !(b >= 0x80 && b <= 0xBF ) )
196 return false;
197 }
198 }
199 else if ( b == 0xF4 )
200 {
201 b = *(++c);
202 if ( !(b >= 0x80 && b <= 0x8F ) )
203 return false;
204 for ( int i = 0; i < 2; ++i )
205 {
206 b = *(++c);
207 if ( !(b >= 0x80 && b <= 0xBF ) )
208 return false;
209 }
210 }
211 else // otherwise, it's invalid lead byte
212 return false;
213 }
214
215 return true;
216 }
217
218 #ifdef __WXDEBUG__
219 /* static */
220 bool wxString::IsValidUtf8LeadByte(unsigned char c)
221 {
222 return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
223 }
224 #endif
225
226 unsigned char wxString::ms_utf8IterTable[256] = {
227 // single-byte sequences (ASCII):
228 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
229 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
230 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
231 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
232 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
233 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
234 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
235 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
236
237 // these are invalid, we use step 1 to skip
238 // over them (should never happen):
239 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8F
240 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90..9F
241 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0..AF
242 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0..BF
243 1, 1, // C0,C1
244
245 // two-byte sequences:
246 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
247 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
248
249 // three-byte sequences:
250 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
251
252 // four-byte sequences:
253 4, 4, 4, 4, 4, // F0..F4
254
255 // these are invalid again (5- or 6-byte
256 // sequences and sequences for code points
257 // above U+10FFFF, as restricted by RFC 3629):
258 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F5..FF
259 };
260
261 /* static */
262 void wxString::DecIter(wxStringImpl::const_iterator& i)
263 {
264 wxASSERT( IsValidUtf8LeadByte(*i) );
265
266 // Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in
267 // binary), so we just have to go back until we hit a byte that is either
268 // < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in binary; this
269 // includes some invalid values, but we can ignore it here, because we
270 // assume valid UTF-8 input for the purpose of efficient implementation).
271 --i;
272 while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
273 --i;
274 }
275
276 /* static */
277 void wxString::DecIter(wxStringImpl::iterator& i)
278 {
279 // FIXME-UTF8: use template instead
280 wxASSERT( IsValidUtf8LeadByte(*i) );
281 --i;
282 while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
283 --i;
284 }
285
286 /* static */
287 wxStringImpl::const_iterator
288 wxString::AddToIter(wxStringImpl::const_iterator i, int n)
289 {
290 wxStringImpl::const_iterator out(i);
291
292 if ( n > 0 )
293 {
294 for ( int j = 0; j < n; ++j )
295 IncIter(out);
296 }
297 else if ( n < 0 )
298 {
299 for ( int j = 0; j > n; --j )
300 DecIter(out);
301 }
302
303 return out;
304 }
305
306 wxStringImpl::iterator
307 wxString::AddToIter(wxStringImpl::iterator i, int n)
308 {
309 // FIXME-UTF8: use template instead
310 wxStringImpl::iterator out(i);
311
312 if ( n > 0 )
313 {
314 for ( int j = 0; j < n; ++j )
315 IncIter(out);
316 }
317 else if ( n < 0 )
318 {
319 for ( int j = 0; j > n; --j )
320 DecIter(out);
321 }
322
323 return out;
324 }
325
326
327 /* static */
328 int wxString::DiffIters(wxStringImpl::const_iterator i1,
329 wxStringImpl::const_iterator i2)
330 {
331 int dist = 0;
332
333 if ( i1 < i2 )
334 {
335 while ( i1 != i2 )
336 {
337 IncIter(i1);
338 dist--;
339 }
340 }
341 else if ( i2 < i1 )
342 {
343 while ( i2 != i1 )
344 {
345 IncIter(i2);
346 dist++;
347 }
348 }
349
350 return dist;
351 }
352
353 int wxString::DiffIters(wxStringImpl::iterator i1, wxStringImpl::iterator i2)
354 {
355 // FIXME-UTF8: use template instead
356 int dist = 0;
357
358 if ( i1 < i2 )
359 {
360 while ( i1 != i2 )
361 {
362 IncIter(i1);
363 dist--;
364 }
365 }
366 else if ( i2 < i1 )
367 {
368 while ( i2 != i1 )
369 {
370 IncIter(i2);
371 dist++;
372 }
373 }
374
375 return dist;
376 }
377
378 /* static */
379 wxString::Utf8CharBuffer wxString::EncodeChar(wxUniChar ch)
380 {
381 Utf8CharBuffer buf;
382 char *out = buf.data;
383
384 wxUniChar::value_type code = ch.GetValue();
385
386 // Char. number range | UTF-8 octet sequence
387 // (hexadecimal) | (binary)
388 // ----------------------+---------------------------------------------
389 // 0000 0000 - 0000 007F | 0xxxxxxx
390 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
391 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
392 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
393 //
394 // Code point value is stored in bits marked with 'x', lowest-order bit
395 // of the value on the right side in the diagram above.
396 // (from RFC 3629)
397
398 if ( code <= 0x7F )
399 {
400 out[1] = 0;
401 out[0] = (char)code;
402 }
403 else if ( code <= 0x07FF )
404 {
405 out[2] = 0;
406 // NB: this line takes 6 least significant bits, encodes them as
407 // 10xxxxxx and discards them so that the next byte can be encoded:
408 out[1] = 0x80 | (code & 0x3F); code >>= 6;
409 out[0] = 0xC0 | code;
410 }
411 else if ( code < 0xFFFF )
412 {
413 out[3] = 0;
414 out[2] = 0x80 | (code & 0x3F); code >>= 6;
415 out[1] = 0x80 | (code & 0x3F); code >>= 6;
416 out[0] = 0xE0 | code;
417 }
418 else if ( code <= 0x10FFFF )
419 {
420 out[4] = 0;
421 out[3] = 0x80 | (code & 0x3F); code >>= 6;
422 out[2] = 0x80 | (code & 0x3F); code >>= 6;
423 out[1] = 0x80 | (code & 0x3F); code >>= 6;
424 out[0] = 0xF0 | code;
425 }
426 else
427 {
428 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
429 out[0] = 0;
430 }
431
432 return buf;
433 }
434
435 /* static */
436 wxUniChar wxUniCharRef::DecodeChar(wxStringImpl::const_iterator i)
437 {
438 wxASSERT( wxString::IsValidUtf8LeadByte(*i) ); // FIXME-UTF8: no "wxString::"
439
440 wxUniChar::value_type code = 0;
441 size_t len = wxString::GetUtf8CharLength(*i);
442 wxASSERT_MSG( len <= 4, _T("invalid UTF-8 sequence length") );
443
444 // Char. number range | UTF-8 octet sequence
445 // (hexadecimal) | (binary)
446 // ----------------------+---------------------------------------------
447 // 0000 0000 - 0000 007F | 0xxxxxxx
448 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
449 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
450 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
451 //
452 // Code point value is stored in bits marked with 'x', lowest-order bit
453 // of the value on the right side in the diagram above.
454 // (from RFC 3629)
455
456 // mask to extract lead byte's value ('x' bits above), by sequence's length:
457 static const unsigned char s_leadValueMask[4] = { 0x7F, 0x1F, 0x0F, 0x07 };
458 #ifdef __WXDEBUG__
459 // mask and value of lead byte's most significant bits, by length:
460 static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 };
461 static const unsigned char s_leadMarkerVal[4] = { 0x00, 0xC0, 0xE0, 0xF0 };
462 #endif
463
464 // extract the lead byte's value bits:
465 wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) ==
466 s_leadMarkerVal[len-1],
467 _T("invalid UTF-8 lead byte") );
468 code = (unsigned char)*i & s_leadValueMask[len-1];
469
470 // all remaining bytes, if any, are handled in the same way regardless of
471 // sequence's length:
472 for ( ++i ; len > 1; --len, ++i )
473 {
474 wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80,
475 _T("invalid UTF-8 byte") );
476
477 code <<= 6;
478 code |= (unsigned char)*i & 0x3F;
479 }
480
481 return wxUniChar(code);
482 }
483
484 /* static */
485 wxCharBuffer wxString::EncodeNChars(size_t n, wxUniChar ch)
486 {
487 Utf8CharBuffer once(EncodeChar(ch));
488 // the IncIter() table can be used to determine the length of ch's encoding:
489 size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
490
491 wxCharBuffer buf(n * len);
492 char *ptr = buf.data();
493 for ( size_t i = 0; i < n; i++, ptr += len )
494 {
495 memcpy(ptr, once.data, len);
496 }
497
498 return buf;
499 }
500
501
502 void wxString::PosLenToImpl(size_t pos, size_t len,
503 size_t *implPos, size_t *implLen) const
504 {
505 if ( pos == npos )
506 *implPos = npos;
507 else
508 {
509 const_iterator i = begin() + pos;
510 *implPos = wxStringImpl::const_iterator(i) - m_impl.begin();
511 if ( len == npos )
512 *implLen = npos;
513 else
514 {
515 // too large length is interpreted as "to the end of the string"
516 // FIXME-UTF8: verify this is the case in std::string, assert
517 // otherwise
518 if ( pos + len > length() )
519 len = length() - pos;
520
521 *implLen = wxStringImpl::const_iterator(i + len) -
522 wxStringImpl::const_iterator(i);
523 }
524 }
525 }
526
527 #endif // wxUSE_UNICODE_UTF8
528
529 // ----------------------------------------------------------------------------
530 // wxCStrData converted strings caching
531 // ----------------------------------------------------------------------------
532
533 // FIXME-UTF8: temporarily disabled because it doesn't work with global
534 // string objects; re-enable after fixing this bug and benchmarking
535 // performance to see if using a hash is a good idea at all
536 #if 0
537
538 // For backward compatibility reasons, it must be possible to assign the value
539 // returned by wxString::c_str() to a char* or wchar_t* variable and work with
540 // it. Returning wxCharBuffer from (const char*)c_str() wouldn't do the trick,
541 // because the memory would be freed immediately, but it has to be valid as long
542 // as the string is not modified, so that code like this still works:
543 //
544 // const wxChar *s = str.c_str();
545 // while ( s ) { ... }
546
547 // FIXME-UTF8: not thread safe!
548 // FIXME-UTF8: we currently clear the cached conversion only when the string is
549 // destroyed, but we should do it when the string is modified, to
550 // keep memory usage down
551 // FIXME-UTF8: we do the conversion every time As[W]Char() is called, but if we
552 // invalidated the cache on every change, we could keep the previous
553 // conversion
554 // FIXME-UTF8: add tracing of usage of these two methods - new code is supposed
555 // to use mb_str() or wc_str() instead of (const [w]char*)c_str()
556
557 template<typename T>
558 static inline void DeleteStringFromConversionCache(T& hash, const wxString *s)
559 {
560 typename T::iterator i = hash.find(wxConstCast(s, wxString));
561 if ( i != hash.end() )
562 {
563 free(i->second);
564 hash.erase(i);
565 }
566 }
567
568 #if wxUSE_UNICODE
569 // NB: non-STL implementation doesn't compile with "const wxString*" key type,
570 // so we have to use wxString* here and const-cast when used
571 WX_DECLARE_HASH_MAP(wxString*, char*, wxPointerHash, wxPointerEqual,
572 wxStringCharConversionCache);
573 static wxStringCharConversionCache gs_stringsCharCache;
574
575 const char* wxCStrData::AsChar() const
576 {
577 // remove previously cache value, if any (see FIXMEs above):
578 DeleteStringFromConversionCache(gs_stringsCharCache, m_str);
579
580 // convert the string and keep it:
581 const char *s = gs_stringsCharCache[wxConstCast(m_str, wxString)] =
582 m_str->mb_str().release();
583
584 return s + m_offset;
585 }
586 #endif // wxUSE_UNICODE
587
588 #if !wxUSE_UNICODE_WCHAR
589 WX_DECLARE_HASH_MAP(wxString*, wchar_t*, wxPointerHash, wxPointerEqual,
590 wxStringWCharConversionCache);
591 static wxStringWCharConversionCache gs_stringsWCharCache;
592
593 const wchar_t* wxCStrData::AsWChar() const
594 {
595 // remove previously cache value, if any (see FIXMEs above):
596 DeleteStringFromConversionCache(gs_stringsWCharCache, m_str);
597
598 // convert the string and keep it:
599 const wchar_t *s = gs_stringsWCharCache[wxConstCast(m_str, wxString)] =
600 m_str->wc_str().release();
601
602 return s + m_offset;
603 }
604 #endif // !wxUSE_UNICODE_WCHAR
605
606 wxString::~wxString()
607 {
608 #if wxUSE_UNICODE
609 // FIXME-UTF8: do this only if locale is not UTF8 if wxUSE_UNICODE_UTF8
610 DeleteStringFromConversionCache(gs_stringsCharCache, this);
611 #endif
612 #if !wxUSE_UNICODE_WCHAR
613 DeleteStringFromConversionCache(gs_stringsWCharCache, this);
614 #endif
615 }
616 #endif
617
618 #if wxUSE_UNICODE
619 const char* wxCStrData::AsChar() const
620 {
621 wxString *str = wxConstCast(m_str, wxString);
622
623 // convert the string:
624 wxCharBuffer buf(str->mb_str());
625
626 // FIXME-UTF8: do the conversion in-place in the existing buffer
627 if ( str->m_convertedToChar &&
628 strlen(buf) == strlen(str->m_convertedToChar) )
629 {
630 // keep the same buffer for as long as possible, so that several calls
631 // to c_str() in a row still work:
632 strcpy(str->m_convertedToChar, buf);
633 }
634 else
635 {
636 str->m_convertedToChar = buf.release();
637 }
638
639 // and keep it:
640 return str->m_convertedToChar + m_offset;
641 }
642 #endif // wxUSE_UNICODE
643
644 #if !wxUSE_UNICODE_WCHAR
645 const wchar_t* wxCStrData::AsWChar() const
646 {
647 wxString *str = wxConstCast(m_str, wxString);
648
649 // convert the string:
650 wxWCharBuffer buf(str->wc_str());
651
652 // FIXME-UTF8: do the conversion in-place in the existing buffer
653 if ( str->m_convertedToWChar &&
654 wxWcslen(buf) == wxWcslen(str->m_convertedToWChar) )
655 {
656 // keep the same buffer for as long as possible, so that several calls
657 // to c_str() in a row still work:
658 memcpy(str->m_convertedToWChar, buf, sizeof(wchar_t) * wxWcslen(buf));
659 }
660 else
661 {
662 str->m_convertedToWChar = buf.release();
663 }
664
665 // and keep it:
666 return str->m_convertedToWChar + m_offset;
667 }
668 #endif // !wxUSE_UNICODE_WCHAR
669
670 // ===========================================================================
671 // wxString class core
672 // ===========================================================================
673
674 // ---------------------------------------------------------------------------
675 // construction and conversion
676 // ---------------------------------------------------------------------------
677
678 #if wxUSE_UNICODE_WCHAR
679 /* static */
680 wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength,
681 const wxMBConv& conv)
682 {
683 // anything to do?
684 if ( !psz || nLength == 0 )
685 return SubstrBufFromMB(L"", 0);
686
687 if ( nLength == npos )
688 nLength = wxNO_LEN;
689
690 size_t wcLen;
691 wxWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen));
692 if ( !wcLen )
693 return SubstrBufFromMB(_T(""), 0);
694 else
695 return SubstrBufFromMB(wcBuf, wcLen);
696 }
697 #endif // wxUSE_UNICODE_WCHAR
698
699 #if wxUSE_UNICODE_UTF8
700 /* static */
701 wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength,
702 const wxMBConv& conv)
703 {
704 // FIXME-UTF8: return as-is without copying under UTF8 locale, return
705 // converted string under other locales - needs wxCharBuffer
706 // changes
707
708 // anything to do?
709 if ( !psz || nLength == 0 )
710 return SubstrBufFromMB("", 0);
711
712 if ( nLength == npos )
713 nLength = wxNO_LEN;
714
715 // first convert to wide string:
716 size_t wcLen;
717 wxWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen));
718 if ( !wcLen )
719 return SubstrBufFromMB("", 0);
720
721 // and then to UTF-8:
722 SubstrBufFromMB buf(ConvertStr(wcBuf, wcLen, wxConvUTF8));
723 // widechar -> UTF-8 conversion isn't supposed to ever fail:
724 wxASSERT_MSG( buf.data, _T("conversion to UTF-8 failed") );
725
726 return buf;
727 }
728 #endif // wxUSE_UNICODE_UTF8
729
730 #if wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE
731 /* static */
732 wxString::SubstrBufFromWC wxString::ConvertStr(const wchar_t *pwz, size_t nLength,
733 const wxMBConv& conv)
734 {
735 // anything to do?
736 if ( !pwz || nLength == 0 )
737 return SubstrBufFromWC("", 0);
738
739 if ( nLength == npos )
740 nLength = wxNO_LEN;
741
742 size_t mbLen;
743 wxCharBuffer mbBuf(conv.cWC2MB(pwz, nLength, &mbLen));
744 if ( !mbLen )
745 return SubstrBufFromWC("", 0);
746 else
747 return SubstrBufFromWC(mbBuf, mbLen);
748 }
749 #endif // wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE
750
751
752 #if wxUSE_UNICODE_WCHAR
753
754 //Convert wxString in Unicode mode to a multi-byte string
755 const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const
756 {
757 return conv.cWC2MB(wx_str(), length() + 1 /* size, not length */, NULL);
758 }
759
760 #elif wxUSE_UNICODE_UTF8
761
762 const wxWCharBuffer wxString::wc_str() const
763 {
764 return wxConvUTF8.cMB2WC(m_impl.c_str(),
765 m_impl.length() + 1 /* size, not length */,
766 NULL);
767 }
768
769 const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const
770 {
771 // FIXME-UTF8: optimize the case when conv==wxConvUTF8 or wxConvLibc
772 // under UTF8 locale
773 // FIXME-UTF8: use wc_str() here once we have buffers with length
774
775 size_t wcLen;
776 wxWCharBuffer wcBuf(
777 wxConvUTF8.cMB2WC(m_impl.c_str(),
778 m_impl.length() + 1 /* size, not length */,
779 &wcLen));
780 if ( !wcLen )
781 return wxCharBuffer("");
782
783 return conv.cWC2MB(wcBuf, wcLen, NULL);
784 }
785
786 #else // ANSI
787
788 //Converts this string to a wide character string if unicode
789 //mode is not enabled and wxUSE_WCHAR_T is enabled
790 const wxWCharBuffer wxString::wc_str(const wxMBConv& conv) const
791 {
792 return conv.cMB2WC(wx_str(), length() + 1 /* size, not length */, NULL);
793 }
794
795 #endif // Unicode/ANSI
796
797 // shrink to minimal size (releasing extra memory)
798 bool wxString::Shrink()
799 {
800 wxString tmp(begin(), end());
801 swap(tmp);
802 return tmp.length() == length();
803 }
804
805 // deprecated compatibility code:
806 #if WXWIN_COMPATIBILITY_2_8 && !wxUSE_STL_BASED_WXSTRING && !wxUSE_UNICODE_UTF8
807 wxChar *wxString::GetWriteBuf(size_t nLen)
808 {
809 return DoGetWriteBuf(nLen);
810 }
811
812 void wxString::UngetWriteBuf()
813 {
814 DoUngetWriteBuf();
815 }
816
817 void wxString::UngetWriteBuf(size_t nLen)
818 {
819 DoUngetWriteBuf(nLen);
820 }
821 #endif // WXWIN_COMPATIBILITY_2_8 && !wxUSE_STL_BASED_WXSTRING && !wxUSE_UNICODE_UTF8
822
823
824 // ---------------------------------------------------------------------------
825 // data access
826 // ---------------------------------------------------------------------------
827
828 // all functions are inline in string.h
829
830 // ---------------------------------------------------------------------------
831 // concatenation operators
832 // ---------------------------------------------------------------------------
833
834 /*
835 * concatenation functions come in 5 flavours:
836 * string + string
837 * char + string and string + char
838 * C str + string and string + C str
839 */
840
841 wxString operator+(const wxString& str1, const wxString& str2)
842 {
843 #if !wxUSE_STL_BASED_WXSTRING
844 wxASSERT( str1.IsValid() );
845 wxASSERT( str2.IsValid() );
846 #endif
847
848 wxString s = str1;
849 s += str2;
850
851 return s;
852 }
853
854 wxString operator+(const wxString& str, wxUniChar ch)
855 {
856 #if !wxUSE_STL_BASED_WXSTRING
857 wxASSERT( str.IsValid() );
858 #endif
859
860 wxString s = str;
861 s += ch;
862
863 return s;
864 }
865
866 wxString operator+(wxUniChar ch, const wxString& str)
867 {
868 #if !wxUSE_STL_BASED_WXSTRING
869 wxASSERT( str.IsValid() );
870 #endif
871
872 wxString s = ch;
873 s += str;
874
875 return s;
876 }
877
878 wxString operator+(const wxString& str, const char *psz)
879 {
880 #if !wxUSE_STL_BASED_WXSTRING
881 wxASSERT( str.IsValid() );
882 #endif
883
884 wxString s;
885 if ( !s.Alloc(strlen(psz) + str.length()) ) {
886 wxFAIL_MSG( _T("out of memory in wxString::operator+") );
887 }
888 s += str;
889 s += psz;
890
891 return s;
892 }
893
894 wxString operator+(const wxString& str, const wchar_t *pwz)
895 {
896 #if !wxUSE_STL_BASED_WXSTRING
897 wxASSERT( str.IsValid() );
898 #endif
899
900 wxString s;
901 if ( !s.Alloc(wxWcslen(pwz) + str.length()) ) {
902 wxFAIL_MSG( _T("out of memory in wxString::operator+") );
903 }
904 s += str;
905 s += pwz;
906
907 return s;
908 }
909
910 wxString operator+(const char *psz, const wxString& str)
911 {
912 #if !wxUSE_STL_BASED_WXSTRING
913 wxASSERT( str.IsValid() );
914 #endif
915
916 wxString s;
917 if ( !s.Alloc(strlen(psz) + str.length()) ) {
918 wxFAIL_MSG( _T("out of memory in wxString::operator+") );
919 }
920 s = psz;
921 s += str;
922
923 return s;
924 }
925
926 wxString operator+(const wchar_t *pwz, const wxString& str)
927 {
928 #if !wxUSE_STL_BASED_WXSTRING
929 wxASSERT( str.IsValid() );
930 #endif
931
932 wxString s;
933 if ( !s.Alloc(wxWcslen(pwz) + str.length()) ) {
934 wxFAIL_MSG( _T("out of memory in wxString::operator+") );
935 }
936 s = pwz;
937 s += str;
938
939 return s;
940 }
941
942 // ---------------------------------------------------------------------------
943 // string comparison
944 // ---------------------------------------------------------------------------
945
946 #ifdef HAVE_STD_STRING_COMPARE
947
948 // NB: Comparison code (both if HAVE_STD_STRING_COMPARE and if not) works with
949 // UTF-8 encoded strings too, thanks to UTF-8's design which allows us to
950 // sort strings in characters code point order by sorting the byte sequence
951 // in byte values order (i.e. what strcmp() and memcmp() do).
952
953 int wxString::compare(const wxString& str) const
954 {
955 return m_impl.compare(str.m_impl);
956 }
957
958 int wxString::compare(size_t nStart, size_t nLen,
959 const wxString& str) const
960 {
961 size_t pos, len;
962 PosLenToImpl(nStart, nLen, &pos, &len);
963 return m_impl.compare(pos, len, str.m_impl);
964 }
965
966 int wxString::compare(size_t nStart, size_t nLen,
967 const wxString& str,
968 size_t nStart2, size_t nLen2) const
969 {
970 size_t pos, len;
971 PosLenToImpl(nStart, nLen, &pos, &len);
972
973 size_t pos2, len2;
974 str.PosLenToImpl(nStart2, nLen2, &pos2, &len2);
975
976 return m_impl.compare(pos, len, str.m_impl, pos2, len2);
977 }
978
979 int wxString::compare(const char* sz) const
980 {
981 return m_impl.compare(ImplStr(sz));
982 }
983
984 int wxString::compare(const wchar_t* sz) const
985 {
986 return m_impl.compare(ImplStr(sz));
987 }
988
989 int wxString::compare(size_t nStart, size_t nLen,
990 const char* sz, size_t nCount) const
991 {
992 size_t pos, len;
993 PosLenToImpl(nStart, nLen, &pos, &len);
994
995 SubstrBufFromMB str(ImplStr(sz, nCount));
996
997 return m_impl.compare(pos, len, str.data, str.len);
998 }
999
1000 int wxString::compare(size_t nStart, size_t nLen,
1001 const wchar_t* sz, size_t nCount) const
1002 {
1003 size_t pos, len;
1004 PosLenToImpl(nStart, nLen, &pos, &len);
1005
1006 SubstrBufFromWC str(ImplStr(sz, nCount));
1007
1008 return m_impl.compare(pos, len, str.data, str.len);
1009 }
1010
1011 #else // !HAVE_STD_STRING_COMPARE
1012
1013 static inline int wxDoCmp(const wxStringCharType* s1, size_t l1,
1014 const wxStringCharType* s2, size_t l2)
1015 {
1016 if( l1 == l2 )
1017 return wxStringMemcmp(s1, s2, l1);
1018 else if( l1 < l2 )
1019 {
1020 int ret = wxStringMemcmp(s1, s2, l1);
1021 return ret == 0 ? -1 : ret;
1022 }
1023 else
1024 {
1025 int ret = wxStringMemcmp(s1, s2, l2);
1026 return ret == 0 ? +1 : ret;
1027 }
1028 }
1029
1030 int wxString::compare(const wxString& str) const
1031 {
1032 return ::wxDoCmp(m_impl.data(), m_impl.length(),
1033 str.m_impl.data(), str.m_impl.length());
1034 }
1035
1036 int wxString::compare(size_t nStart, size_t nLen,
1037 const wxString& str) const
1038 {
1039 wxASSERT(nStart <= length());
1040 size_type strLen = length() - nStart;
1041 nLen = strLen < nLen ? strLen : nLen;
1042
1043 size_t pos, len;
1044 PosLenToImpl(nStart, nLen, &pos, &len);
1045
1046 return ::wxDoCmp(m_impl.data() + pos, len,
1047 str.m_impl.data(), str.m_impl.length());
1048 }
1049
1050 int wxString::compare(size_t nStart, size_t nLen,
1051 const wxString& str,
1052 size_t nStart2, size_t nLen2) const
1053 {
1054 wxASSERT(nStart <= length());
1055 wxASSERT(nStart2 <= str.length());
1056 size_type strLen = length() - nStart,
1057 strLen2 = str.length() - nStart2;
1058 nLen = strLen < nLen ? strLen : nLen;
1059 nLen2 = strLen2 < nLen2 ? strLen2 : nLen2;
1060
1061 size_t pos, len;
1062 PosLenToImpl(nStart, nLen, &pos, &len);
1063 size_t pos2, len2;
1064 str.PosLenToImpl(nStart2, nLen2, &pos2, &len2);
1065
1066 return ::wxDoCmp(m_impl.data() + pos, len,
1067 str.m_impl.data() + pos2, len2);
1068 }
1069
1070 int wxString::compare(const char* sz) const
1071 {
1072 SubstrBufFromMB str(ImplStr(sz, npos));
1073 if ( str.len == npos )
1074 str.len = wxStringStrlen(str.data);
1075 return ::wxDoCmp(m_impl.data(), m_impl.length(), str.data, str.len);
1076 }
1077
1078 int wxString::compare(const wchar_t* sz) const
1079 {
1080 SubstrBufFromWC str(ImplStr(sz, npos));
1081 if ( str.len == npos )
1082 str.len = wxStringStrlen(str.data);
1083 return ::wxDoCmp(m_impl.data(), m_impl.length(), str.data, str.len);
1084 }
1085
1086 int wxString::compare(size_t nStart, size_t nLen,
1087 const char* sz, size_t nCount) const
1088 {
1089 wxASSERT(nStart <= length());
1090 size_type strLen = length() - nStart;
1091 nLen = strLen < nLen ? strLen : nLen;
1092
1093 size_t pos, len;
1094 PosLenToImpl(nStart, nLen, &pos, &len);
1095
1096 SubstrBufFromMB str(ImplStr(sz, nCount));
1097 if ( str.len == npos )
1098 str.len = wxStringStrlen(str.data);
1099
1100 return ::wxDoCmp(m_impl.data() + pos, len, str.data, str.len);
1101 }
1102
1103 int wxString::compare(size_t nStart, size_t nLen,
1104 const wchar_t* sz, size_t nCount) const
1105 {
1106 wxASSERT(nStart <= length());
1107 size_type strLen = length() - nStart;
1108 nLen = strLen < nLen ? strLen : nLen;
1109
1110 size_t pos, len;
1111 PosLenToImpl(nStart, nLen, &pos, &len);
1112
1113 SubstrBufFromWC str(ImplStr(sz, nCount));
1114 if ( str.len == npos )
1115 str.len = wxStringStrlen(str.data);
1116
1117 return ::wxDoCmp(m_impl.data() + pos, len, str.data, str.len);
1118 }
1119
1120 #endif // HAVE_STD_STRING_COMPARE/!HAVE_STD_STRING_COMPARE
1121
1122
1123 // ---------------------------------------------------------------------------
1124 // find_{first,last}_[not]_of functions
1125 // ---------------------------------------------------------------------------
1126
1127 #if !wxUSE_STL_BASED_WXSTRING || wxUSE_UNICODE_UTF8
1128
1129 // NB: All these functions are implemented with the argument being wxChar*,
1130 // i.e. widechar string in any Unicode build, even though native string
1131 // representation is char* in the UTF-8 build. This is because we couldn't
1132 // use memchr() to determine if a character is in a set encoded as UTF-8.
1133
1134 size_t wxString::find_first_of(const wxChar* sz, size_t nStart) const
1135 {
1136 return find_first_of(sz, nStart, wxStrlen(sz));
1137 }
1138
1139 size_t wxString::find_first_not_of(const wxChar* sz, size_t nStart) const
1140 {
1141 return find_first_not_of(sz, nStart, wxStrlen(sz));
1142 }
1143
1144 size_t wxString::find_first_of(const wxChar* sz, size_t nStart, size_t n) const
1145 {
1146 wxASSERT_MSG( nStart <= length(), _T("invalid index") );
1147
1148 size_t idx = nStart;
1149 for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i )
1150 {
1151 if ( wxTmemchr(sz, *i, n) )
1152 return idx;
1153 }
1154
1155 return npos;
1156 }
1157
1158 size_t wxString::find_first_not_of(const wxChar* sz, size_t nStart, size_t n) const
1159 {
1160 wxASSERT_MSG( nStart <= length(), _T("invalid index") );
1161
1162 size_t idx = nStart;
1163 for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i )
1164 {
1165 if ( !wxTmemchr(sz, *i, n) )
1166 return idx;
1167 }
1168
1169 return npos;
1170 }
1171
1172
1173 size_t wxString::find_last_of(const wxChar* sz, size_t nStart) const
1174 {
1175 return find_last_of(sz, nStart, wxStrlen(sz));
1176 }
1177
1178 size_t wxString::find_last_not_of(const wxChar* sz, size_t nStart) const
1179 {
1180 return find_last_not_of(sz, nStart, wxStrlen(sz));
1181 }
1182
1183 size_t wxString::find_last_of(const wxChar* sz, size_t nStart, size_t n) const
1184 {
1185 size_t len = length();
1186
1187 if ( nStart == npos )
1188 {
1189 nStart = len - 1;
1190 }
1191 else
1192 {
1193 wxASSERT_MSG( nStart <= len, _T("invalid index") );
1194 }
1195
1196 size_t idx = nStart;
1197 for ( const_reverse_iterator i = rbegin() + (len - nStart - 1);
1198 i != rend(); --idx, ++i )
1199 {
1200 if ( wxTmemchr(sz, *i, n) )
1201 return idx;
1202 }
1203
1204 return npos;
1205 }
1206
1207 size_t wxString::find_last_not_of(const wxChar* sz, size_t nStart, size_t n) const
1208 {
1209 size_t len = length();
1210
1211 if ( nStart == npos )
1212 {
1213 nStart = len - 1;
1214 }
1215 else
1216 {
1217 wxASSERT_MSG( nStart <= len, _T("invalid index") );
1218 }
1219
1220 size_t idx = nStart;
1221 for ( const_reverse_iterator i = rbegin() + (len - nStart - 1);
1222 i != rend(); --idx, ++i )
1223 {
1224 if ( !wxTmemchr(sz, *i, n) )
1225 return idx;
1226 }
1227
1228 return npos;
1229 }
1230
1231 size_t wxString::find_first_not_of(wxUniChar ch, size_t nStart) const
1232 {
1233 wxASSERT_MSG( nStart <= length(), _T("invalid index") );
1234
1235 size_t idx = nStart;
1236 for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i )
1237 {
1238 if ( *i != ch )
1239 return idx;
1240 }
1241
1242 return npos;
1243 }
1244
1245 size_t wxString::find_last_not_of(wxUniChar ch, size_t nStart) const
1246 {
1247 size_t len = length();
1248
1249 if ( nStart == npos )
1250 {
1251 nStart = len - 1;
1252 }
1253 else
1254 {
1255 wxASSERT_MSG( nStart <= len, _T("invalid index") );
1256 }
1257
1258 size_t idx = nStart;
1259 for ( const_reverse_iterator i = rbegin() + (len - nStart - 1);
1260 i != rend(); --idx, ++i )
1261 {
1262 if ( *i != ch )
1263 return idx;
1264 }
1265
1266 return npos;
1267 }
1268
1269 // the functions above were implemented for wchar_t* arguments in Unicode
1270 // build and char* in ANSI build; below are implementations for the other
1271 // version:
1272 #if wxUSE_UNICODE
1273 #define wxOtherCharType char
1274 #define STRCONV (const wxChar*)wxConvLibc.cMB2WC
1275 #else
1276 #define wxOtherCharType wchar_t
1277 #define STRCONV (const wxChar*)wxConvLibc.cWC2MB
1278 #endif
1279
1280 size_t wxString::find_first_of(const wxOtherCharType* sz, size_t nStart) const
1281 { return find_first_of(STRCONV(sz), nStart); }
1282
1283 size_t wxString::find_first_of(const wxOtherCharType* sz, size_t nStart,
1284 size_t n) const
1285 { return find_first_of(STRCONV(sz, n, NULL), nStart, n); }
1286 size_t wxString::find_last_of(const wxOtherCharType* sz, size_t nStart) const
1287 { return find_last_of(STRCONV(sz), nStart); }
1288 size_t wxString::find_last_of(const wxOtherCharType* sz, size_t nStart,
1289 size_t n) const
1290 { return find_last_of(STRCONV(sz, n, NULL), nStart, n); }
1291 size_t wxString::find_first_not_of(const wxOtherCharType* sz, size_t nStart) const
1292 { return find_first_not_of(STRCONV(sz), nStart); }
1293 size_t wxString::find_first_not_of(const wxOtherCharType* sz, size_t nStart,
1294 size_t n) const
1295 { return find_first_not_of(STRCONV(sz, n, NULL), nStart, n); }
1296 size_t wxString::find_last_not_of(const wxOtherCharType* sz, size_t nStart) const
1297 { return find_last_not_of(STRCONV(sz), nStart); }
1298 size_t wxString::find_last_not_of(const wxOtherCharType* sz, size_t nStart,
1299 size_t n) const
1300 { return find_last_not_of(STRCONV(sz, n, NULL), nStart, n); }
1301
1302 #undef wxOtherCharType
1303 #undef STRCONV
1304
1305 #endif // !wxUSE_STL_BASED_WXSTRING || wxUSE_UNICODE_UTF8
1306
1307 // ===========================================================================
1308 // other common string functions
1309 // ===========================================================================
1310
1311 int wxString::CmpNoCase(const wxString& s) const
1312 {
1313 // FIXME-UTF8: use wxUniChar::ToLower/ToUpper once added
1314
1315 size_t idx = 0;
1316 const_iterator i1 = begin();
1317 const_iterator end1 = end();
1318 const_iterator i2 = s.begin();
1319 const_iterator end2 = s.end();
1320
1321 for ( ; i1 != end1 && i2 != end2; ++idx, ++i1, ++i2 )
1322 {
1323 wxUniChar lower1 = (wxChar)wxTolower(*i1);
1324 wxUniChar lower2 = (wxChar)wxTolower(*i2);
1325 if ( lower1 != lower2 )
1326 return lower1 < lower2 ? -1 : 1;
1327 }
1328
1329 size_t len1 = length();
1330 size_t len2 = s.length();
1331
1332 if ( len1 < len2 )
1333 return -1;
1334 else if ( len1 > len2 )
1335 return 1;
1336 return 0;
1337 }
1338
1339
1340 #if wxUSE_UNICODE
1341
1342 #ifdef __MWERKS__
1343 #ifndef __SCHAR_MAX__
1344 #define __SCHAR_MAX__ 127
1345 #endif
1346 #endif
1347
1348 wxString wxString::FromAscii(const char *ascii)
1349 {
1350 if (!ascii)
1351 return wxEmptyString;
1352
1353 size_t len = strlen( ascii );
1354 wxString res;
1355
1356 if ( len )
1357 {
1358 wxStringBuffer buf(res, len);
1359
1360 wchar_t *dest = buf;
1361
1362 for ( ;; )
1363 {
1364 if ( (*dest++ = (wchar_t)(unsigned char)*ascii++) == L'\0' )
1365 break;
1366 }
1367 }
1368
1369 return res;
1370 }
1371
1372 wxString wxString::FromAscii(const char ascii)
1373 {
1374 // What do we do with '\0' ?
1375
1376 wxString res;
1377 res += (wchar_t)(unsigned char) ascii;
1378
1379 return res;
1380 }
1381
1382 const wxCharBuffer wxString::ToAscii() const
1383 {
1384 // this will allocate enough space for the terminating NUL too
1385 wxCharBuffer buffer(length());
1386
1387
1388 char *dest = buffer.data();
1389
1390 const wchar_t *pwc = c_str();
1391 for ( ;; )
1392 {
1393 *dest++ = (char)(*pwc > SCHAR_MAX ? wxT('_') : *pwc);
1394
1395 // the output string can't have embedded NULs anyhow, so we can safely
1396 // stop at first of them even if we do have any
1397 if ( !*pwc++ )
1398 break;
1399 }
1400
1401 return buffer;
1402 }
1403
1404 #endif // Unicode
1405
1406 // extract string of length nCount starting at nFirst
1407 wxString wxString::Mid(size_t nFirst, size_t nCount) const
1408 {
1409 size_t nLen = length();
1410
1411 // default value of nCount is npos and means "till the end"
1412 if ( nCount == npos )
1413 {
1414 nCount = nLen - nFirst;
1415 }
1416
1417 // out-of-bounds requests return sensible things
1418 if ( nFirst + nCount > nLen )
1419 {
1420 nCount = nLen - nFirst;
1421 }
1422
1423 if ( nFirst > nLen )
1424 {
1425 // AllocCopy() will return empty string
1426 return wxEmptyString;
1427 }
1428
1429 wxString dest(*this, nFirst, nCount);
1430 if ( dest.length() != nCount )
1431 {
1432 wxFAIL_MSG( _T("out of memory in wxString::Mid") );
1433 }
1434
1435 return dest;
1436 }
1437
1438 // check that the string starts with prefix and return the rest of the string
1439 // in the provided pointer if it is not NULL, otherwise return false
1440 bool wxString::StartsWith(const wxChar *prefix, wxString *rest) const
1441 {
1442 wxASSERT_MSG( prefix, _T("invalid parameter in wxString::StartsWith") );
1443
1444 // first check if the beginning of the string matches the prefix: note
1445 // that we don't have to check that we don't run out of this string as
1446 // when we reach the terminating NUL, either prefix string ends too (and
1447 // then it's ok) or we break out of the loop because there is no match
1448 const wxChar *p = c_str();
1449 while ( *prefix )
1450 {
1451 if ( *prefix++ != *p++ )
1452 {
1453 // no match
1454 return false;
1455 }
1456 }
1457
1458 if ( rest )
1459 {
1460 // put the rest of the string into provided pointer
1461 *rest = p;
1462 }
1463
1464 return true;
1465 }
1466
1467
1468 // check that the string ends with suffix and return the rest of it in the
1469 // provided pointer if it is not NULL, otherwise return false
1470 bool wxString::EndsWith(const wxChar *suffix, wxString *rest) const
1471 {
1472 wxASSERT_MSG( suffix, _T("invalid parameter in wxString::EndssWith") );
1473
1474 int start = length() - wxStrlen(suffix);
1475
1476 if ( start < 0 || compare(start, npos, suffix) != 0 )
1477 return false;
1478
1479 if ( rest )
1480 {
1481 // put the rest of the string into provided pointer
1482 rest->assign(*this, 0, start);
1483 }
1484
1485 return true;
1486 }
1487
1488
1489 // extract nCount last (rightmost) characters
1490 wxString wxString::Right(size_t nCount) const
1491 {
1492 if ( nCount > length() )
1493 nCount = length();
1494
1495 wxString dest(*this, length() - nCount, nCount);
1496 if ( dest.length() != nCount ) {
1497 wxFAIL_MSG( _T("out of memory in wxString::Right") );
1498 }
1499 return dest;
1500 }
1501
1502 // get all characters after the last occurence of ch
1503 // (returns the whole string if ch not found)
1504 wxString wxString::AfterLast(wxUniChar ch) const
1505 {
1506 wxString str;
1507 int iPos = Find(ch, true);
1508 if ( iPos == wxNOT_FOUND )
1509 str = *this;
1510 else
1511 str = wx_str() + iPos + 1;
1512
1513 return str;
1514 }
1515
1516 // extract nCount first (leftmost) characters
1517 wxString wxString::Left(size_t nCount) const
1518 {
1519 if ( nCount > length() )
1520 nCount = length();
1521
1522 wxString dest(*this, 0, nCount);
1523 if ( dest.length() != nCount ) {
1524 wxFAIL_MSG( _T("out of memory in wxString::Left") );
1525 }
1526 return dest;
1527 }
1528
1529 // get all characters before the first occurence of ch
1530 // (returns the whole string if ch not found)
1531 wxString wxString::BeforeFirst(wxUniChar ch) const
1532 {
1533 int iPos = Find(ch);
1534 if ( iPos == wxNOT_FOUND ) iPos = length();
1535 return wxString(*this, 0, iPos);
1536 }
1537
1538 /// get all characters before the last occurence of ch
1539 /// (returns empty string if ch not found)
1540 wxString wxString::BeforeLast(wxUniChar ch) const
1541 {
1542 wxString str;
1543 int iPos = Find(ch, true);
1544 if ( iPos != wxNOT_FOUND && iPos != 0 )
1545 str = wxString(c_str(), iPos);
1546
1547 return str;
1548 }
1549
1550 /// get all characters after the first occurence of ch
1551 /// (returns empty string if ch not found)
1552 wxString wxString::AfterFirst(wxUniChar ch) const
1553 {
1554 wxString str;
1555 int iPos = Find(ch);
1556 if ( iPos != wxNOT_FOUND )
1557 str = wx_str() + iPos + 1;
1558
1559 return str;
1560 }
1561
1562 // replace first (or all) occurences of some substring with another one
1563 size_t wxString::Replace(const wxString& strOld,
1564 const wxString& strNew, bool bReplaceAll)
1565 {
1566 // if we tried to replace an empty string we'd enter an infinite loop below
1567 wxCHECK_MSG( !strOld.empty(), 0,
1568 _T("wxString::Replace(): invalid parameter") );
1569
1570 size_t uiCount = 0; // count of replacements made
1571
1572 size_t uiOldLen = strOld.length();
1573 size_t uiNewLen = strNew.length();
1574
1575 size_t dwPos = 0;
1576
1577 while ( (*this)[dwPos] != wxT('\0') )
1578 {
1579 //DO NOT USE STRSTR HERE
1580 //this string can contain embedded null characters,
1581 //so strstr will function incorrectly
1582 dwPos = find(strOld, dwPos);
1583 if ( dwPos == npos )
1584 break; // exit the loop
1585 else
1586 {
1587 //replace this occurance of the old string with the new one
1588 replace(dwPos, uiOldLen, strNew, uiNewLen);
1589
1590 //move up pos past the string that was replaced
1591 dwPos += uiNewLen;
1592
1593 //increase replace count
1594 ++uiCount;
1595
1596 // stop now?
1597 if ( !bReplaceAll )
1598 break; // exit the loop
1599 }
1600 }
1601
1602 return uiCount;
1603 }
1604
1605 bool wxString::IsAscii() const
1606 {
1607 const wxChar *s = (const wxChar*) *this;
1608 while(*s){
1609 if(!isascii(*s)) return(false);
1610 s++;
1611 }
1612 return(true);
1613 }
1614
1615 bool wxString::IsWord() const
1616 {
1617 const wxChar *s = (const wxChar*) *this;
1618 while(*s){
1619 if(!wxIsalpha(*s)) return(false);
1620 s++;
1621 }
1622 return(true);
1623 }
1624
1625 bool wxString::IsNumber() const
1626 {
1627 const wxChar *s = (const wxChar*) *this;
1628 if (wxStrlen(s))
1629 if ((s[0] == wxT('-')) || (s[0] == wxT('+'))) s++;
1630 while(*s){
1631 if(!wxIsdigit(*s)) return(false);
1632 s++;
1633 }
1634 return(true);
1635 }
1636
1637 wxString wxString::Strip(stripType w) const
1638 {
1639 wxString s = *this;
1640 if ( w & leading ) s.Trim(false);
1641 if ( w & trailing ) s.Trim(true);
1642 return s;
1643 }
1644
1645 // ---------------------------------------------------------------------------
1646 // case conversion
1647 // ---------------------------------------------------------------------------
1648
1649 wxString& wxString::MakeUpper()
1650 {
1651 for ( iterator it = begin(), en = end(); it != en; ++it )
1652 *it = (wxChar)wxToupper(*it);
1653
1654 return *this;
1655 }
1656
1657 wxString& wxString::MakeLower()
1658 {
1659 for ( iterator it = begin(), en = end(); it != en; ++it )
1660 *it = (wxChar)wxTolower(*it);
1661
1662 return *this;
1663 }
1664
1665 // ---------------------------------------------------------------------------
1666 // trimming and padding
1667 // ---------------------------------------------------------------------------
1668
1669 // some compilers (VC++ 6.0 not to name them) return true for a call to
1670 // isspace('ê') in the C locale which seems to be broken to me, but we have to
1671 // live with this by checking that the character is a 7 bit one - even if this
1672 // may fail to detect some spaces (I don't know if Unicode doesn't have
1673 // space-like symbols somewhere except in the first 128 chars), it is arguably
1674 // still better than trimming away accented letters
1675 inline int wxSafeIsspace(wxChar ch) { return (ch < 127) && wxIsspace(ch); }
1676
1677 // trims spaces (in the sense of isspace) from left or right side
1678 wxString& wxString::Trim(bool bFromRight)
1679 {
1680 // first check if we're going to modify the string at all
1681 if ( !empty() &&
1682 (
1683 (bFromRight && wxSafeIsspace(GetChar(length() - 1))) ||
1684 (!bFromRight && wxSafeIsspace(GetChar(0u)))
1685 )
1686 )
1687 {
1688 if ( bFromRight )
1689 {
1690 // find last non-space character
1691 reverse_iterator psz = rbegin();
1692 while ( (psz != rend()) && wxSafeIsspace(*psz) )
1693 psz++;
1694
1695 // truncate at trailing space start
1696 erase(psz.base(), end());
1697 }
1698 else
1699 {
1700 // find first non-space character
1701 iterator psz = begin();
1702 while ( (psz != end()) && wxSafeIsspace(*psz) )
1703 psz++;
1704
1705 // fix up data and length
1706 erase(begin(), psz);
1707 }
1708 }
1709
1710 return *this;
1711 }
1712
1713 // adds nCount characters chPad to the string from either side
1714 wxString& wxString::Pad(size_t nCount, wxUniChar chPad, bool bFromRight)
1715 {
1716 wxString s(chPad, nCount);
1717
1718 if ( bFromRight )
1719 *this += s;
1720 else
1721 {
1722 s += *this;
1723 swap(s);
1724 }
1725
1726 return *this;
1727 }
1728
1729 // truncate the string
1730 wxString& wxString::Truncate(size_t uiLen)
1731 {
1732 if ( uiLen < length() )
1733 {
1734 erase(begin() + uiLen, end());
1735 }
1736 //else: nothing to do, string is already short enough
1737
1738 return *this;
1739 }
1740
1741 // ---------------------------------------------------------------------------
1742 // finding (return wxNOT_FOUND if not found and index otherwise)
1743 // ---------------------------------------------------------------------------
1744
1745 // find a character
1746 int wxString::Find(wxUniChar ch, bool bFromEnd) const
1747 {
1748 size_type idx = bFromEnd ? find_last_of(ch) : find_first_of(ch);
1749
1750 return (idx == npos) ? wxNOT_FOUND : (int)idx;
1751 }
1752
1753 // ----------------------------------------------------------------------------
1754 // conversion to numbers
1755 // ----------------------------------------------------------------------------
1756
1757 // the implementation of all the functions below is exactly the same so factor
1758 // it out
1759
1760 template <typename T, typename F>
1761 bool wxStringToIntType(const wxChar *start,
1762 T *val,
1763 int base,
1764 F func)
1765 {
1766 wxCHECK_MSG( val, false, _T("NULL output pointer") );
1767 wxASSERT_MSG( !base || (base > 1 && base <= 36), _T("invalid base") );
1768
1769 #ifndef __WXWINCE__
1770 errno = 0;
1771 #endif
1772
1773 wxChar *end;
1774 *val = (*func)(start, &end, base);
1775
1776 // return true only if scan was stopped by the terminating NUL and if the
1777 // string was not empty to start with and no under/overflow occurred
1778 return !*end && (end != start)
1779 #ifndef __WXWINCE__
1780 && (errno != ERANGE)
1781 #endif
1782 ;
1783 }
1784
1785 bool wxString::ToLong(long *val, int base) const
1786 {
1787 return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtol);
1788 }
1789
1790 bool wxString::ToULong(unsigned long *val, int base) const
1791 {
1792 return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtoul);
1793 }
1794
1795 bool wxString::ToLongLong(wxLongLong_t *val, int base) const
1796 {
1797 return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtoll);
1798 }
1799
1800 bool wxString::ToULongLong(wxULongLong_t *val, int base) const
1801 {
1802 return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtoull);
1803 }
1804
1805 bool wxString::ToDouble(double *val) const
1806 {
1807 wxCHECK_MSG( val, false, _T("NULL pointer in wxString::ToDouble") );
1808
1809 #ifndef __WXWINCE__
1810 errno = 0;
1811 #endif
1812
1813 const wxChar *start = c_str();
1814 wxChar *end;
1815 *val = wxStrtod(start, &end);
1816
1817 // return true only if scan was stopped by the terminating NUL and if the
1818 // string was not empty to start with and no under/overflow occurred
1819 return !*end && (end != start)
1820 #ifndef __WXWINCE__
1821 && (errno != ERANGE)
1822 #endif
1823 ;
1824 }
1825
1826 // ---------------------------------------------------------------------------
1827 // formatted output
1828 // ---------------------------------------------------------------------------
1829
1830 /* static */
1831 #ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
1832 wxString wxStringPrintfMixinBase::DoFormat(const wxChar *format, ...)
1833 #else
1834 wxString wxString::DoFormat(const wxChar *format, ...)
1835 #endif
1836 {
1837 va_list argptr;
1838 va_start(argptr, format);
1839
1840 wxString s;
1841 s.PrintfV(format, argptr);
1842
1843 va_end(argptr);
1844
1845 return s;
1846 }
1847
1848 /* static */
1849 wxString wxString::FormatV(const wxString& format, va_list argptr)
1850 {
1851 wxString s;
1852 s.PrintfV(format, argptr);
1853 return s;
1854 }
1855
1856 #ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
1857 int wxStringPrintfMixinBase::DoPrintf(const wxChar *format, ...)
1858 #else
1859 int wxString::DoPrintf(const wxChar *format, ...)
1860 #endif
1861 {
1862 va_list argptr;
1863 va_start(argptr, format);
1864
1865 #ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
1866 // get a pointer to the wxString instance; we have to use dynamic_cast<>
1867 // because it's the only cast that works safely for downcasting when
1868 // multiple inheritance is used:
1869 wxString *str = static_cast<wxString*>(this);
1870 #else
1871 wxString *str = this;
1872 #endif
1873
1874 int iLen = str->PrintfV(format, argptr);
1875
1876 va_end(argptr);
1877
1878 return iLen;
1879 }
1880
1881 int wxString::PrintfV(const wxString& format, va_list argptr)
1882 {
1883 int size = 1024;
1884
1885 for ( ;; )
1886 {
1887 wxStringBuffer tmp(*this, size + 1);
1888 wxChar *buf = tmp;
1889
1890 if ( !buf )
1891 {
1892 // out of memory
1893 return -1;
1894 }
1895
1896 // wxVsnprintf() may modify the original arg pointer, so pass it
1897 // only a copy
1898 va_list argptrcopy;
1899 wxVaCopy(argptrcopy, argptr);
1900 int len = wxVsnprintf(buf, size, (const wxChar*)/*FIXME-UTF8*/format, argptrcopy);
1901 va_end(argptrcopy);
1902
1903 // some implementations of vsnprintf() don't NUL terminate
1904 // the string if there is not enough space for it so
1905 // always do it manually
1906 buf[size] = _T('\0');
1907
1908 // vsnprintf() may return either -1 (traditional Unix behaviour) or the
1909 // total number of characters which would have been written if the
1910 // buffer were large enough (newer standards such as Unix98)
1911 if ( len < 0 )
1912 {
1913 #if wxUSE_WXVSNPRINTF
1914 // we know that our own implementation of wxVsnprintf() returns -1
1915 // only for a format error - thus there's something wrong with
1916 // the user's format string
1917 return -1;
1918 #else // assume that system version only returns error if not enough space
1919 // still not enough, as we don't know how much we need, double the
1920 // current size of the buffer
1921 size *= 2;
1922 #endif // wxUSE_WXVSNPRINTF/!wxUSE_WXVSNPRINTF
1923 }
1924 else if ( len >= size )
1925 {
1926 #if wxUSE_WXVSNPRINTF
1927 // we know that our own implementation of wxVsnprintf() returns
1928 // size+1 when there's not enough space but that's not the size
1929 // of the required buffer!
1930 size *= 2; // so we just double the current size of the buffer
1931 #else
1932 // some vsnprintf() implementations NUL-terminate the buffer and
1933 // some don't in len == size case, to be safe always add 1
1934 size = len + 1;
1935 #endif
1936 }
1937 else // ok, there was enough space
1938 {
1939 break;
1940 }
1941 }
1942
1943 // we could have overshot
1944 Shrink();
1945
1946 return length();
1947 }
1948
1949 // ----------------------------------------------------------------------------
1950 // misc other operations
1951 // ----------------------------------------------------------------------------
1952
1953 // returns true if the string matches the pattern which may contain '*' and
1954 // '?' metacharacters (as usual, '?' matches any character and '*' any number
1955 // of them)
1956 bool wxString::Matches(const wxString& mask) const
1957 {
1958 // I disable this code as it doesn't seem to be faster (in fact, it seems
1959 // to be much slower) than the old, hand-written code below and using it
1960 // here requires always linking with libregex even if the user code doesn't
1961 // use it
1962 #if 0 // wxUSE_REGEX
1963 // first translate the shell-like mask into a regex
1964 wxString pattern;
1965 pattern.reserve(wxStrlen(pszMask));
1966
1967 pattern += _T('^');
1968 while ( *pszMask )
1969 {
1970 switch ( *pszMask )
1971 {
1972 case _T('?'):
1973 pattern += _T('.');
1974 break;
1975
1976 case _T('*'):
1977 pattern += _T(".*");
1978 break;
1979
1980 case _T('^'):
1981 case _T('.'):
1982 case _T('$'):
1983 case _T('('):
1984 case _T(')'):
1985 case _T('|'):
1986 case _T('+'):
1987 case _T('\\'):
1988 // these characters are special in a RE, quote them
1989 // (however note that we don't quote '[' and ']' to allow
1990 // using them for Unix shell like matching)
1991 pattern += _T('\\');
1992 // fall through
1993
1994 default:
1995 pattern += *pszMask;
1996 }
1997
1998 pszMask++;
1999 }
2000 pattern += _T('$');
2001
2002 // and now use it
2003 return wxRegEx(pattern, wxRE_NOSUB | wxRE_EXTENDED).Matches(c_str());
2004 #else // !wxUSE_REGEX
2005 // TODO: this is, of course, awfully inefficient...
2006
2007 // FIXME-UTF8: implement using iterators, remove #if
2008 #if wxUSE_UNICODE_UTF8
2009 wxWCharBuffer maskBuf = mask.wc_str();
2010 wxWCharBuffer txtBuf = wc_str();
2011 const wxChar *pszMask = maskBuf.data();
2012 const wxChar *pszTxt = txtBuf.data();
2013 #else
2014 const wxChar *pszMask = mask.wx_str();
2015 // the char currently being checked
2016 const wxChar *pszTxt = wx_str();
2017 #endif
2018
2019 // the last location where '*' matched
2020 const wxChar *pszLastStarInText = NULL;
2021 const wxChar *pszLastStarInMask = NULL;
2022
2023 match:
2024 for ( ; *pszMask != wxT('\0'); pszMask++, pszTxt++ ) {
2025 switch ( *pszMask ) {
2026 case wxT('?'):
2027 if ( *pszTxt == wxT('\0') )
2028 return false;
2029
2030 // pszTxt and pszMask will be incremented in the loop statement
2031
2032 break;
2033
2034 case wxT('*'):
2035 {
2036 // remember where we started to be able to backtrack later
2037 pszLastStarInText = pszTxt;
2038 pszLastStarInMask = pszMask;
2039
2040 // ignore special chars immediately following this one
2041 // (should this be an error?)
2042 while ( *pszMask == wxT('*') || *pszMask == wxT('?') )
2043 pszMask++;
2044
2045 // if there is nothing more, match
2046 if ( *pszMask == wxT('\0') )
2047 return true;
2048
2049 // are there any other metacharacters in the mask?
2050 size_t uiLenMask;
2051 const wxChar *pEndMask = wxStrpbrk(pszMask, wxT("*?"));
2052
2053 if ( pEndMask != NULL ) {
2054 // we have to match the string between two metachars
2055 uiLenMask = pEndMask - pszMask;
2056 }
2057 else {
2058 // we have to match the remainder of the string
2059 uiLenMask = wxStrlen(pszMask);
2060 }
2061
2062 wxString strToMatch(pszMask, uiLenMask);
2063 const wxChar* pMatch = wxStrstr(pszTxt, strToMatch);
2064 if ( pMatch == NULL )
2065 return false;
2066
2067 // -1 to compensate "++" in the loop
2068 pszTxt = pMatch + uiLenMask - 1;
2069 pszMask += uiLenMask - 1;
2070 }
2071 break;
2072
2073 default:
2074 if ( *pszMask != *pszTxt )
2075 return false;
2076 break;
2077 }
2078 }
2079
2080 // match only if nothing left
2081 if ( *pszTxt == wxT('\0') )
2082 return true;
2083
2084 // if we failed to match, backtrack if we can
2085 if ( pszLastStarInText ) {
2086 pszTxt = pszLastStarInText + 1;
2087 pszMask = pszLastStarInMask;
2088
2089 pszLastStarInText = NULL;
2090
2091 // don't bother resetting pszLastStarInMask, it's unnecessary
2092
2093 goto match;
2094 }
2095
2096 return false;
2097 #endif // wxUSE_REGEX/!wxUSE_REGEX
2098 }
2099
2100 // Count the number of chars
2101 int wxString::Freq(wxUniChar ch) const
2102 {
2103 int count = 0;
2104 for ( const_iterator i = begin(); i != end(); ++i )
2105 {
2106 if ( *i == ch )
2107 count ++;
2108 }
2109 return count;
2110 }
2111
2112 // convert to upper case, return the copy of the string
2113 wxString wxString::Upper() const
2114 { wxString s(*this); return s.MakeUpper(); }
2115
2116 // convert to lower case, return the copy of the string
2117 wxString wxString::Lower() const { wxString s(*this); return s.MakeLower(); }