src/common/string.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/string.cpp
   3 // Purpose:     wxString class
   4 // Author:      Vadim Zeitlin, Ryan Norton
   5 // Modified by:
   6 // Created:     29/01/98
   7 // RCS-ID:      $Id$
   8 // Copyright:   (c) 1998 Vadim Zeitlin <zeitlin@dptmaths.ens-cachan.fr>
   9 //              (c) 2004 Ryan Norton <wxprojects@comcast.net>
  10 // Licence:     wxWindows licence
  11 /////////////////////////////////////////////////////////////////////////////
  12
  13 /*
  14  * About ref counting:
  15  *  1) all empty strings use g_strEmpty, nRefs = -1 (set in Init())
  16  *  2) AllocBuffer() sets nRefs to 1, Lock() increments it by one
  17  *  3) Unlock() decrements nRefs and frees memory if it goes to 0
  18  */
  19
  20 // ===========================================================================
  21 // headers, declarations, constants
  22 // ===========================================================================
  23
  24 // For compilers that support precompilation, includes "wx.h".
  25 #include "wx/wxprec.h"
  26
  27 #ifdef __BORLANDC__
  28     #pragma hdrstop
  29 #endif
  30
  31 #ifndef WX_PRECOMP
  32     #include "wx/string.h"
  33 #endif
  34
  35 #include <ctype.h>
  36
  37 #ifndef __WXWINCE__
  38     #include <errno.h>
  39 #endif
  40
  41 #include <string.h>
  42 #include <stdlib.h>
  43
  44 #ifdef __SALFORDC__
  45     #include <clib.h>
  46 #endif
  47
  48 #include "wx/hashmap.h"
  49
// string handling functions used by wxString: they are selected at compile
// time so that the rest of this file can use a single set of names whatever
// the build configuration is
#if wxUSE_UNICODE_UTF8
    // UTF-8 build: use the standard byte-oriented C functions
    #define wxStringMemcpy   memcpy
    #define wxStringMemcmp   memcmp
    #define wxStringMemchr   memchr
    #define wxStringStrlen   strlen
#else
    // all the other builds: use the wx wrappers
    #define wxStringMemcpy   wxTmemcpy
    #define wxStringMemcmp   wxTmemcmp
    #define wxStringMemchr   wxTmemchr
    #define wxStringStrlen   wxStrlen
#endif
  62
  63
  64 // ---------------------------------------------------------------------------
  65 // static class variables definition
  66 // ---------------------------------------------------------------------------
  67
// According to the STL, npos _must_ be equal to (size_t)-1
const size_t wxString::npos = (size_t) -1;
  70
  71 // ----------------------------------------------------------------------------
  72 // global functions
  73 // ----------------------------------------------------------------------------
  74
  75 #if wxUSE_STD_IOSTREAM
  76
  77 #include <iostream>
  78
  79 wxSTD ostream& operator<<(wxSTD ostream& os, const wxCStrData& str)
  80 {
  81 // FIXME-UTF8: always, not only if wxUSE_UNICODE
  82 #if wxUSE_UNICODE && !defined(__BORLANDC__)
  83     return os << str.AsWChar();
  84 #else
  85     return os << str.AsChar();
  86 #endif
  87 }
  88
  89 wxSTD ostream& operator<<(wxSTD ostream& os, const wxString& str)
  90 {
  91     return os << str.c_str();
  92 }
  93
  94 wxSTD ostream& operator<<(wxSTD ostream& os, const wxCharBuffer& str)
  95 {
  96     return os << str.data();
  97 }
  98
  99 #ifndef __BORLANDC__
 100 wxSTD ostream& operator<<(wxSTD ostream& os, const wxWCharBuffer& str)
 101 {
 102     return os << str.data();
 103 }
 104 #endif
 105
 106 #endif // wxUSE_STD_IOSTREAM
 107
 108 // ===========================================================================
 109 // wxString class core
 110 // ===========================================================================
 111
 112 #if wxUSE_UNICODE_UTF8
 113
 114 // ---------------------------------------------------------------------------
 115 // UTF-8 operations
 116 // ---------------------------------------------------------------------------
 117
 118 //
 119 // Table 3.1B from Unicode spec: Legal UTF-8 Byte Sequences
 120 //
 121 //     Code Points    | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
 122 // -------------------+----------+----------+----------+----------+
 123 //   U+0000..U+007F   |  00..7F  |          |          |          |
 124 //   U+0080..U+07FF   |  C2..DF  |  80..BF  |          |          |
 125 //   U+0800..U+0FFF   |  E0      |  A0..BF  |  80..BF  |          |
 126 //   U+1000..U+FFFF   |  E1..EF  |  80..BF  |  80..BF  |          |
 127 //  U+10000..U+3FFFF  |  F0      |  90..BF  |  80..BF  |  80..BF  |
 128 //  U+40000..U+FFFFF  |  F1..F3  |  80..BF  |  80..BF  |  80..BF  |
 129 // U+100000..U+10FFFF |  F4      |  80..8F  |  80..BF  |  80..BF  |
 130 // -------------------+----------+----------+----------+----------+
 131
 132 bool wxString::IsValidUtf8String(const char *str)
 133 {
 134     if ( !str )
 135         return true; // empty string is UTF8 string
 136
 137     const unsigned char *c = (const unsigned char*)str;
 138
 139     for ( ; *c; ++c )
 140     {
 141         unsigned char b = *c;
 142
 143         if ( b <= 0x7F ) // 00..7F
 144             continue;
 145
 146         else if ( b < 0xC2 ) // invalid lead bytes: 80..C1
 147             return false;
 148
 149         // two-byte sequences:
 150         else if ( b <= 0xDF ) // C2..DF
 151         {
 152             b = *(++c);
 153             if ( !(b >= 0x80 && b <= 0xBF ) )
 154                 return false;
 155         }
 156
 157         // three-byte sequences:
 158         else if ( b == 0xE0 )
 159         {
 160             b = *(++c);
 161             if ( !(b >= 0xA0 && b <= 0xBF ) )
 162                 return false;
 163             b = *(++c);
 164             if ( !(b >= 0x80 && b <= 0xBF ) )
 165                 return false;
 166         }
 167         else if ( b <= 0xEF ) // E1..EF
 168         {
 169             for ( int i = 0; i < 2; ++i )
 170             {
 171                 b = *(++c);
 172                 if ( !(b >= 0x80 && b <= 0xBF ) )
 173                     return false;
 174             }
 175         }
 176
 177         // four-byte sequences:
 178         else if ( b == 0xF0 )
 179         {
 180             b = *(++c);
 181             if ( !(b >= 0x90 && b <= 0xBF ) )
 182                 return false;
 183             for ( int i = 0; i < 2; ++i )
 184             {
 185                 b = *(++c);
 186                 if ( !(b >= 0x80 && b <= 0xBF ) )
 187                     return false;
 188             }
 189         }
 190         else if ( b <= 0xF3 ) // F1..F3
 191         {
 192             for ( int i = 0; i < 3; ++i )
 193             {
 194                 b = *(++c);
 195                 if ( !(b >= 0x80 && b <= 0xBF ) )
 196                     return false;
 197             }
 198         }
 199         else if ( b == 0xF4 )
 200         {
 201             b = *(++c);
 202             if ( !(b >= 0x80 && b <= 0x8F ) )
 203                 return false;
 204             for ( int i = 0; i < 2; ++i )
 205             {
 206                 b = *(++c);
 207                 if ( !(b >= 0x80 && b <= 0xBF ) )
 208                     return false;
 209             }
 210         }
 211         else // otherwise, it's invalid lead byte
 212             return false;
 213     }
 214
 215     return true;
 216 }
 217
 218 #ifdef __WXDEBUG__
 219 /* static */
 220 bool wxString::IsValidUtf8LeadByte(unsigned char c)
 221 {
 222     return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
 223 }
 224 #endif
 225
// Table indexed by the value of a byte and giving the number of bytes to
// advance by when this byte is the first one of a sequence: 1 to 4 for the
// valid lead bytes and 1 for the bytes which can't start a sequence.
unsigned char wxString::ms_utf8IterTable[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid, we use step 1 to skip
    // over them (should never happen):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80..8F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 90..9F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A0..AF
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B0..BF
    1, 1,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   // F5..FF
};
 260
 261 /* static */
 262 void wxString::DecIter(wxStringImpl::const_iterator& i)
 263 {
 264     wxASSERT( IsValidUtf8LeadByte(*i) );
 265
 266     // Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in
 267     // binary), so we just have to go back until we hit a byte that is either
 268     // < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in binary; this
 269     // includes some invalid values, but we can ignore it here, because we
 270     // assume valid UTF-8 input for the purpose of efficient implementation).
 271     --i;
 272     while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
 273         --i;
 274 }
 275
 276 /* static */
 277 void wxString::DecIter(wxStringImpl::iterator& i)
 278 {
 279     // FIXME-UTF8: use template instead
 280     wxASSERT( IsValidUtf8LeadByte(*i) );
 281     --i;
 282     while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
 283         --i;
 284 }
 285
 286 /* static */
 287 wxStringImpl::const_iterator
 288 wxString::AddToIter(wxStringImpl::const_iterator i, int n)
 289 {
 290     wxStringImpl::const_iterator out(i);
 291
 292     if ( n > 0 )
 293     {
 294         for ( int j = 0; j < n; ++j )
 295             IncIter(out);
 296     }
 297     else if ( n < 0 )
 298     {
 299         for ( int j = 0; j > n; --j )
 300             DecIter(out);
 301     }
 302
 303     return out;
 304 }
 305
 306 wxStringImpl::iterator
 307 wxString::AddToIter(wxStringImpl::iterator i, int n)
 308 {
 309     // FIXME-UTF8: use template instead
 310     wxStringImpl::iterator out(i);
 311
 312     if ( n > 0 )
 313     {
 314         for ( int j = 0; j < n; ++j )
 315             IncIter(out);
 316     }
 317     else if ( n < 0 )
 318     {
 319         for ( int j = 0; j > n; --j )
 320             DecIter(out);
 321     }
 322
 323     return out;
 324 }
 325
 326
 327 /* static */
 328 int wxString::DiffIters(wxStringImpl::const_iterator i1,
 329                         wxStringImpl::const_iterator i2)
 330 {
 331     int dist = 0;
 332
 333     if ( i1 < i2 )
 334     {
 335         while ( i1 != i2 )
 336         {
 337             IncIter(i1);
 338             dist--;
 339         }
 340     }
 341     else if ( i2 < i1 )
 342     {
 343         while ( i2 != i1 )
 344         {
 345             IncIter(i2);
 346             dist++;
 347         }
 348     }
 349
 350     return dist;
 351 }
 352
 353 int wxString::DiffIters(wxStringImpl::iterator i1, wxStringImpl::iterator i2)
 354 {
 355     // FIXME-UTF8: use template instead
 356     int dist = 0;
 357
 358     if ( i1 < i2 )
 359     {
 360         while ( i1 != i2 )
 361         {
 362             IncIter(i1);
 363             dist--;
 364         }
 365     }
 366     else if ( i2 < i1 )
 367     {
 368         while ( i2 != i1 )
 369         {
 370             IncIter(i2);
 371             dist++;
 372         }
 373     }
 374
 375     return dist;
 376 }
 377
 378 /* static */
 379 wxString::Utf8CharBuffer wxString::EncodeChar(wxUniChar ch)
 380 {
 381     Utf8CharBuffer buf;
 382     char *out = buf.data;
 383
 384     wxUniChar::value_type code = ch.GetValue();
 385
 386     //    Char. number range   |        UTF-8 octet sequence
 387     //       (hexadecimal)     |              (binary)
 388     //   ----------------------+---------------------------------------------
 389     //   0000 0000 - 0000 007F | 0xxxxxxx
 390     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 391     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 392     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 393     //
 394     //   Code point value is stored in bits marked with 'x', lowest-order bit
 395     //   of the value on the right side in the diagram above.
 396     //                                                        (from RFC 3629)
 397
 398     if ( code <= 0x7F )
 399     {
 400         out[1] = 0;
 401         out[0] = (char)code;
 402     }
 403     else if ( code <= 0x07FF )
 404     {
 405         out[2] = 0;
 406         // NB: this line takes 6 least significant bits, encodes them as
 407         // 10xxxxxx and discards them so that the next byte can be encoded:
 408         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 409         out[0] = 0xC0 | code;
 410     }
 411     else if ( code < 0xFFFF )
 412     {
 413         out[3] = 0;
 414         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 415         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 416         out[0] = 0xE0 | code;
 417     }
 418     else if ( code <= 0x10FFFF )
 419     {
 420         out[4] = 0;
 421         out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 422         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 423         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 424         out[0] = 0xF0 | code;
 425     }
 426     else
 427     {
 428         wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 429         out[0] = 0;
 430     }
 431
 432     return buf;
 433 }
 434
 435 /* static */
 436 wxUniChar wxUniCharRef::DecodeChar(wxStringImpl::const_iterator i)
 437 {
 438     wxASSERT( wxString::IsValidUtf8LeadByte(*i) ); // FIXME-UTF8: no "wxString::"
 439
 440     wxUniChar::value_type code = 0;
 441     size_t len = wxString::GetUtf8CharLength(*i);
 442     wxASSERT_MSG( len <= 4, _T("invalid UTF-8 sequence length") );
 443
 444     //    Char. number range   |        UTF-8 octet sequence
 445     //       (hexadecimal)     |              (binary)
 446     //   ----------------------+---------------------------------------------
 447     //   0000 0000 - 0000 007F | 0xxxxxxx
 448     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 449     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 450     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 451     //
 452     //   Code point value is stored in bits marked with 'x', lowest-order bit
 453     //   of the value on the right side in the diagram above.
 454     //                                                        (from RFC 3629)
 455
 456     // mask to extract lead byte's value ('x' bits above), by sequence's length:
 457     static const unsigned char s_leadValueMask[4] =  { 0x7F, 0x1F, 0x0F, 0x07 };
 458 #ifdef __WXDEBUG__
 459     // mask and value of lead byte's most significant bits, by length:
 460     static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 };
 461     static const unsigned char s_leadMarkerVal[4] =  { 0x00, 0xC0, 0xE0, 0xF0 };
 462 #endif
 463
 464     // extract the lead byte's value bits:
 465     wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) ==
 466                   s_leadMarkerVal[len-1],
 467                   _T("invalid UTF-8 lead byte") );
 468     code = (unsigned char)*i & s_leadValueMask[len-1];
 469
 470     // all remaining bytes, if any, are handled in the same way regardless of
 471     // sequence's length:
 472     for ( ++i ; len > 1; --len, ++i )
 473     {
 474         wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80,
 475                       _T("invalid UTF-8 byte") );
 476
 477         code <<= 6;
 478         code |= (unsigned char)*i & 0x3F;
 479     }
 480
 481     return wxUniChar(code);
 482 }
 483
 484 /* static */
 485 wxCharBuffer wxString::EncodeNChars(size_t n, wxUniChar ch)
 486 {
 487     Utf8CharBuffer once(EncodeChar(ch));
 488     // the IncIter() table can be used to determine the length of ch's encoding:
 489     size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
 490
 491     wxCharBuffer buf(n * len);
 492     char *ptr = buf.data();
 493     for ( size_t i = 0; i < n; i++, ptr += len )
 494     {
 495         memcpy(ptr, once.data, len);
 496     }
 497
 498     return buf;
 499 }
 500
 501
 502 void wxString::PosLenToImpl(size_t pos, size_t len,
 503                             size_t *implPos, size_t *implLen) const
 504 {
 505     if ( pos == npos )
 506         *implPos = npos;
 507     else
 508     {
 509         const_iterator i = begin() + pos;
 510         *implPos = wxStringImpl::const_iterator(i) - m_impl.begin();
 511         if ( len == npos )
 512             *implLen = npos;
 513         else
 514         {
 515             // too large length is interpreted as "to the end of the string"
 516             // FIXME-UTF8: verify this is the case in std::string, assert
 517             // otherwise
 518             if ( pos + len > length() )
 519                 len = length() - pos;
 520
 521             *implLen = wxStringImpl::const_iterator(i + len) -
 522                        wxStringImpl::const_iterator(i);
 523         }
 524     }
 525 }
 526
 527 #endif // wxUSE_UNICODE_UTF8
 528
 529 // ----------------------------------------------------------------------------
 530 // wxCStrData converted strings caching
 531 // ----------------------------------------------------------------------------
 532
 533 // FIXME-UTF8: temporarily disabled because it doesn't work with global
 534 //             string objects; re-enable after fixing this bug and benchmarking
 535 //             performance to see if using a hash is a good idea at all
 536 #if 0
 537
 538 // For backward compatibility reasons, it must be possible to assign the value
 539 // returned by wxString::c_str() to a char* or wchar_t* variable and work with
 540 // it. Returning wxCharBuffer from (const char*)c_str() wouldn't do the trick,
 541 // because the memory would be freed immediately, but it has to be valid as long
 542 // as the string is not modified, so that code like this still works:
 543 //
 544 // const wxChar *s = str.c_str();
 545 // while ( s ) { ... }
 546
 547 // FIXME-UTF8: not thread safe!
 548 // FIXME-UTF8: we currently clear the cached conversion only when the string is
 549 //             destroyed, but we should do it when the string is modified, to
 550 //             keep memory usage down
 551 // FIXME-UTF8: we do the conversion every time As[W]Char() is called, but if we
 552 //             invalidated the cache on every change, we could keep the previous
 553 //             conversion
 554 // FIXME-UTF8: add tracing of usage of these two methods - new code is supposed
 555 //             to use mb_str() or wc_str() instead of (const [w]char*)c_str()
 556
 557 template<typename T>
 558 static inline void DeleteStringFromConversionCache(T& hash, const wxString *s)
 559 {
 560     typename T::iterator i = hash.find(wxConstCast(s, wxString));
 561     if ( i != hash.end() )
 562     {
 563         free(i->second);
 564         hash.erase(i);
 565     }
 566 }
 567
 568 #if wxUSE_UNICODE
 569 // NB: non-STL implementation doesn't compile with "const wxString*" key type,
 570 //     so we have to use wxString* here and const-cast when used
 571 WX_DECLARE_HASH_MAP(wxString*, char*, wxPointerHash, wxPointerEqual,
 572                     wxStringCharConversionCache);
 573 static wxStringCharConversionCache gs_stringsCharCache;
 574
 575 const char* wxCStrData::AsChar() const
 576 {
 577     // remove previously cache value, if any (see FIXMEs above):
 578     DeleteStringFromConversionCache(gs_stringsCharCache, m_str);
 579
 580     // convert the string and keep it:
 581     const char *s = gs_stringsCharCache[wxConstCast(m_str, wxString)] =
 582         m_str->mb_str().release();
 583
 584     return s + m_offset;
 585 }
 586 #endif // wxUSE_UNICODE
 587
 588 #if !wxUSE_UNICODE_WCHAR
 589 WX_DECLARE_HASH_MAP(wxString*, wchar_t*, wxPointerHash, wxPointerEqual,
 590                     wxStringWCharConversionCache);
 591 static wxStringWCharConversionCache gs_stringsWCharCache;
 592
 593 const wchar_t* wxCStrData::AsWChar() const
 594 {
 595     // remove previously cache value, if any (see FIXMEs above):
 596     DeleteStringFromConversionCache(gs_stringsWCharCache, m_str);
 597
 598     // convert the string and keep it:
 599     const wchar_t *s = gs_stringsWCharCache[wxConstCast(m_str, wxString)] =
 600         m_str->wc_str().release();
 601
 602     return s + m_offset;
 603 }
 604 #endif // !wxUSE_UNICODE_WCHAR
 605
// Destructor used with the (currently disabled) hash-based conversion caches:
// it removes the entries of this string from the caches defined in this build
// configuration.
wxString::~wxString()
{
#if wxUSE_UNICODE
    // FIXME-UTF8: do this only if locale is not UTF8 if wxUSE_UNICODE_UTF8
    DeleteStringFromConversionCache(gs_stringsCharCache, this);
#endif
#if !wxUSE_UNICODE_WCHAR
    DeleteStringFromConversionCache(gs_stringsWCharCache, this);
#endif
}
 616 #endif
 617
 618 #if wxUSE_UNICODE
 619 const char* wxCStrData::AsChar() const
 620 {
 621     wxString *str = wxConstCast(m_str, wxString);
 622
 623     // convert the string:
 624     wxCharBuffer buf(str->mb_str());
 625
 626     // FIXME-UTF8: do the conversion in-place in the existing buffer
 627     if ( str->m_convertedToChar &&
 628          strlen(buf) == strlen(str->m_convertedToChar) )
 629     {
 630         // keep the same buffer for as long as possible, so that several calls
 631         // to c_str() in a row still work:
 632         strcpy(str->m_convertedToChar, buf);
 633     }
 634     else
 635     {
 636         str->m_convertedToChar = buf.release();
 637     }
 638
 639     // and keep it:
 640     return str->m_convertedToChar + m_offset;
 641 }
 642 #endif // wxUSE_UNICODE
 643
 644 #if !wxUSE_UNICODE_WCHAR
 645 const wchar_t* wxCStrData::AsWChar() const
 646 {
 647     wxString *str = wxConstCast(m_str, wxString);
 648
 649     // convert the string:
 650     wxWCharBuffer buf(str->wc_str());
 651
 652     // FIXME-UTF8: do the conversion in-place in the existing buffer
 653     if ( str->m_convertedToWChar &&
 654          wxWcslen(buf) == wxWcslen(str->m_convertedToWChar) )
 655     {
 656         // keep the same buffer for as long as possible, so that several calls
 657         // to c_str() in a row still work:
 658         memcpy(str->m_convertedToWChar, buf, sizeof(wchar_t) * wxWcslen(buf));
 659     }
 660     else
 661     {
 662         str->m_convertedToWChar = buf.release();
 663     }
 664
 665     // and keep it:
 666     return str->m_convertedToWChar + m_offset;
 667 }
 668 #endif // !wxUSE_UNICODE_WCHAR
 669
 670 // ===========================================================================
 671 // wxString class core
 672 // ===========================================================================
 673
 674 // ---------------------------------------------------------------------------
 675 // construction and conversion
 676 // ---------------------------------------------------------------------------
 677
 678 #if wxUSE_UNICODE_WCHAR
 679 /* static */
 680 wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength,
 681                                                const wxMBConv& conv)
 682 {
 683     // anything to do?
 684     if ( !psz || nLength == 0 )
 685         return SubstrBufFromMB(L"", 0);
 686
 687     if ( nLength == npos )
 688         nLength = wxNO_LEN;
 689
 690     size_t wcLen;
 691     wxWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen));
 692     if ( !wcLen )
 693         return SubstrBufFromMB(_T(""), 0);
 694     else
 695         return SubstrBufFromMB(wcBuf, wcLen);
 696 }
 697 #endif // wxUSE_UNICODE_WCHAR
 698
 699 #if wxUSE_UNICODE_UTF8
 700 /* static */
 701 wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength,
 702                                                const wxMBConv& conv)
 703 {
 704     // FIXME-UTF8: return as-is without copying under UTF8 locale, return
 705     //             converted string under other locales - needs wxCharBuffer
 706     //             changes
 707
 708     // anything to do?
 709     if ( !psz || nLength == 0 )
 710         return SubstrBufFromMB("", 0);
 711
 712     if ( nLength == npos )
 713         nLength = wxNO_LEN;
 714
 715     // first convert to wide string:
 716     size_t wcLen;
 717     wxWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen));
 718     if ( !wcLen )
 719         return SubstrBufFromMB("", 0);
 720
 721     // and then to UTF-8:
 722     SubstrBufFromMB buf(ConvertStr(wcBuf, wcLen, wxConvUTF8));
 723     // widechar -> UTF-8 conversion isn't supposed to ever fail:
 724     wxASSERT_MSG( buf.data, _T("conversion to UTF-8 failed") );
 725
 726     return buf;
 727 }
 728 #endif // wxUSE_UNICODE_UTF8
 729
 730 #if wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE
 731 /* static */
 732 wxString::SubstrBufFromWC wxString::ConvertStr(const wchar_t *pwz, size_t nLength,
 733                                                const wxMBConv& conv)
 734 {
 735     // anything to do?
 736     if ( !pwz || nLength == 0 )
 737         return SubstrBufFromWC("", 0);
 738
 739     if ( nLength == npos )
 740         nLength = wxNO_LEN;
 741
 742     size_t mbLen;
 743     wxCharBuffer mbBuf(conv.cWC2MB(pwz, nLength, &mbLen));
 744     if ( !mbLen )
 745         return SubstrBufFromWC("", 0);
 746     else
 747         return SubstrBufFromWC(mbBuf, mbLen);
 748 }
 749 #endif // wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE
 750
 751
 752 #if wxUSE_UNICODE_WCHAR
 753
 754 //Convert wxString in Unicode mode to a multi-byte string
 755 const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const
 756 {
 757     return conv.cWC2MB(wx_str(), length() + 1 /* size, not length */, NULL);
 758 }
 759
 760 #elif wxUSE_UNICODE_UTF8
 761
 762 const wxWCharBuffer wxString::wc_str() const
 763 {
 764     return wxConvUTF8.cMB2WC(m_impl.c_str(),
 765                              m_impl.length() + 1 /* size, not length */,
 766                              NULL);
 767 }
 768
 769 const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const
 770 {
 771     // FIXME-UTF8: optimize the case when conv==wxConvUTF8 or wxConvLibc
 772     //             under UTF8 locale
 773     // FIXME-UTF8: use wc_str() here once we have buffers with length
 774
 775     size_t wcLen;
 776     wxWCharBuffer wcBuf(
 777             wxConvUTF8.cMB2WC(m_impl.c_str(),
 778                               m_impl.length() + 1 /* size, not length */,
 779                               &wcLen));
 780     if ( !wcLen )
 781         return wxCharBuffer("");
 782
 783     return conv.cWC2MB(wcBuf, wcLen, NULL);
 784 }
 785
 786 #else // ANSI
 787
 788 //Converts this string to a wide character string if unicode
 789 //mode is not enabled and wxUSE_WCHAR_T is enabled
 790 const wxWCharBuffer wxString::wc_str(const wxMBConv& conv) const
 791 {
 792     return conv.cMB2WC(wx_str(), length() + 1 /* size, not length */, NULL);
 793 }
 794
 795 #endif // Unicode/ANSI
 796
 797 // shrink to minimal size (releasing extra memory)
 798 bool wxString::Shrink()
 799 {
 800   wxString tmp(begin(), end());
 801   swap(tmp);
 802   return tmp.length() == length();
 803 }
 804
 805 // deprecated compatibility code:
 806 #if WXWIN_COMPATIBILITY_2_8 && !wxUSE_STL_BASED_WXSTRING && !wxUSE_UNICODE_UTF8
// Deprecated: simply forwards to DoGetWriteBuf() with the same length.
wxChar *wxString::GetWriteBuf(size_t nLen)
{
    return DoGetWriteBuf(nLen);
}

// Deprecated: simply forwards to DoUngetWriteBuf().
void wxString::UngetWriteBuf()
{
    DoUngetWriteBuf();
}

// Deprecated: simply forwards to DoUngetWriteBuf() with the same length.
void wxString::UngetWriteBuf(size_t nLen)
{
    DoUngetWriteBuf(nLen);
}
 821 #endif // WXWIN_COMPATIBILITY_2_8 && !wxUSE_STL_BASED_WXSTRING && !wxUSE_UNICODE_UTF8
 822
 823
 824 // ---------------------------------------------------------------------------
 825 // data access
 826 // ---------------------------------------------------------------------------
 827
 828 // all functions are inline in string.h
 829
 830 // ---------------------------------------------------------------------------
 831 // concatenation operators
 832 // ---------------------------------------------------------------------------
 833
 834 /*
 835  * concatenation functions come in 5 flavours:
 836  *  string + string
 837  *  char   + string      and      string + char
 838  *  C str  + string      and      string + C str
 839  */
 840
 841 wxString operator+(const wxString& str1, const wxString& str2)
 842 {
 843 #if !wxUSE_STL_BASED_WXSTRING
 844     wxASSERT( str1.IsValid() );
 845     wxASSERT( str2.IsValid() );
 846 #endif
 847
 848     wxString s = str1;
 849     s += str2;
 850
 851     return s;
 852 }
 853
 854 wxString operator+(const wxString& str, wxUniChar ch)
 855 {
 856 #if !wxUSE_STL_BASED_WXSTRING
 857     wxASSERT( str.IsValid() );
 858 #endif
 859
 860     wxString s = str;
 861     s += ch;
 862
 863     return s;
 864 }
 865
 866 wxString operator+(wxUniChar ch, const wxString& str)
 867 {
 868 #if !wxUSE_STL_BASED_WXSTRING
 869     wxASSERT( str.IsValid() );
 870 #endif
 871
 872     wxString s = ch;
 873     s += str;
 874
 875     return s;
 876 }
 877
 878 wxString operator+(const wxString& str, const char *psz)
 879 {
 880 #if !wxUSE_STL_BASED_WXSTRING
 881     wxASSERT( str.IsValid() );
 882 #endif
 883
 884     wxString s;
 885     if ( !s.Alloc(strlen(psz) + str.length()) ) {
 886         wxFAIL_MSG( _T("out of memory in wxString::operator+") );
 887     }
 888     s += str;
 889     s += psz;
 890
 891     return s;
 892 }
 893
 894 wxString operator+(const wxString& str, const wchar_t *pwz)
 895 {
 896 #if !wxUSE_STL_BASED_WXSTRING
 897     wxASSERT( str.IsValid() );
 898 #endif
 899
 900     wxString s;
 901     if ( !s.Alloc(wxWcslen(pwz) + str.length()) ) {
 902         wxFAIL_MSG( _T("out of memory in wxString::operator+") );
 903     }
 904     s += str;
 905     s += pwz;
 906
 907     return s;
 908 }
 909
 910 wxString operator+(const char *psz, const wxString& str)
 911 {
 912 #if !wxUSE_STL_BASED_WXSTRING
 913     wxASSERT( str.IsValid() );
 914 #endif
 915
 916     wxString s;
 917     if ( !s.Alloc(strlen(psz) + str.length()) ) {
 918         wxFAIL_MSG( _T("out of memory in wxString::operator+") );
 919     }
 920     s = psz;
 921     s += str;
 922
 923     return s;
 924 }
 925
 926 wxString operator+(const wchar_t *pwz, const wxString& str)
 927 {
 928 #if !wxUSE_STL_BASED_WXSTRING
 929     wxASSERT( str.IsValid() );
 930 #endif
 931
 932     wxString s;
 933     if ( !s.Alloc(wxWcslen(pwz) + str.length()) ) {
 934         wxFAIL_MSG( _T("out of memory in wxString::operator+") );
 935     }
 936     s = pwz;
 937     s += str;
 938
 939     return s;
 940 }
 941
 942 // ---------------------------------------------------------------------------
 943 // string comparison
 944 // ---------------------------------------------------------------------------
 945
 946 #ifdef HAVE_STD_STRING_COMPARE
 947
 948 // NB: Comparison code (both if HAVE_STD_STRING_COMPARE and if not) works with
 949 //     UTF-8 encoded strings too, thanks to UTF-8's design which allows us to
 950 //     sort strings in characters code point order by sorting the byte sequence
 951 //     in byte values order (i.e. what strcmp() and memcmp() do).
 952
 953 int wxString::compare(const wxString& str) const
 954 {
 955     return m_impl.compare(str.m_impl);
 956 }
 957
 958 int wxString::compare(size_t nStart, size_t nLen,
 959                       const wxString& str) const
 960 {
 961     size_t pos, len;
 962     PosLenToImpl(nStart, nLen, &pos, &len);
 963     return m_impl.compare(pos, len, str.m_impl);
 964 }
 965
 966 int wxString::compare(size_t nStart, size_t nLen,
 967                       const wxString& str,
 968                       size_t nStart2, size_t nLen2) const
 969 {
 970     size_t pos, len;
 971     PosLenToImpl(nStart, nLen, &pos, &len);
 972
 973     size_t pos2, len2;
 974     str.PosLenToImpl(nStart2, nLen2, &pos2, &len2);
 975
 976     return m_impl.compare(pos, len, str.m_impl, pos2, len2);
 977 }
 978
 979 int wxString::compare(const char* sz) const
 980 {
 981     return m_impl.compare(ImplStr(sz));
 982 }
 983
 984 int wxString::compare(const wchar_t* sz) const
 985 {
 986     return m_impl.compare(ImplStr(sz));
 987 }
 988
 989 int wxString::compare(size_t nStart, size_t nLen,
 990                       const char* sz, size_t nCount) const
 991 {
 992     size_t pos, len;
 993     PosLenToImpl(nStart, nLen, &pos, &len);
 994
 995     SubstrBufFromMB str(ImplStr(sz, nCount));
 996
 997     return m_impl.compare(pos, len, str.data, str.len);
 998 }
 999
1000 int wxString::compare(size_t nStart, size_t nLen,
1001                       const wchar_t* sz, size_t nCount) const
1002 {
1003     size_t pos, len;
1004     PosLenToImpl(nStart, nLen, &pos, &len);
1005
1006     SubstrBufFromWC str(ImplStr(sz, nCount));
1007
1008     return m_impl.compare(pos, len, str.data, str.len);
1009 }
1010
1011 #else // !HAVE_STD_STRING_COMPARE
1012
1013 static inline int wxDoCmp(const wxStringCharType* s1, size_t l1,
1014                           const wxStringCharType* s2, size_t l2)
1015 {
1016     if( l1 == l2 )
1017         return wxStringMemcmp(s1, s2, l1);
1018     else if( l1 < l2 )
1019     {
1020         int ret = wxStringMemcmp(s1, s2, l1);
1021         return ret == 0 ? -1 : ret;
1022     }
1023     else
1024     {
1025         int ret = wxStringMemcmp(s1, s2, l2);
1026         return ret == 0 ? +1 : ret;
1027     }
1028 }
1029
1030 int wxString::compare(const wxString& str) const
1031 {
1032     return ::wxDoCmp(m_impl.data(), m_impl.length(),
1033                      str.m_impl.data(), str.m_impl.length());
1034 }
1035
1036 int wxString::compare(size_t nStart, size_t nLen,
1037                       const wxString& str) const
1038 {
1039     wxASSERT(nStart <= length());
1040     size_type strLen = length() - nStart;
1041     nLen = strLen < nLen ? strLen : nLen;
1042
1043     size_t pos, len;
1044     PosLenToImpl(nStart, nLen, &pos, &len);
1045
1046     return ::wxDoCmp(m_impl.data() + pos,  len,
1047                      str.m_impl.data(), str.m_impl.length());
1048 }
1049
1050 int wxString::compare(size_t nStart, size_t nLen,
1051                       const wxString& str,
1052                       size_t nStart2, size_t nLen2) const
1053 {
1054     wxASSERT(nStart <= length());
1055     wxASSERT(nStart2 <= str.length());
1056     size_type strLen  =     length() - nStart,
1057               strLen2 = str.length() - nStart2;
1058     nLen  = strLen  < nLen  ? strLen  : nLen;
1059     nLen2 = strLen2 < nLen2 ? strLen2 : nLen2;
1060
1061     size_t pos, len;
1062     PosLenToImpl(nStart, nLen, &pos, &len);
1063     size_t pos2, len2;
1064     str.PosLenToImpl(nStart2, nLen2, &pos2, &len2);
1065
1066     return ::wxDoCmp(m_impl.data() + pos, len,
1067                      str.m_impl.data() + pos2, len2);
1068 }
1069
1070 int wxString::compare(const char* sz) const
1071 {
1072     SubstrBufFromMB str(ImplStr(sz, npos));
1073     if ( str.len == npos )
1074         str.len = wxStringStrlen(str.data);
1075     return ::wxDoCmp(m_impl.data(), m_impl.length(), str.data, str.len);
1076 }
1077
1078 int wxString::compare(const wchar_t* sz) const
1079 {
1080     SubstrBufFromWC str(ImplStr(sz, npos));
1081     if ( str.len == npos )
1082         str.len = wxStringStrlen(str.data);
1083     return ::wxDoCmp(m_impl.data(), m_impl.length(), str.data, str.len);
1084 }
1085
1086 int wxString::compare(size_t nStart, size_t nLen,
1087                       const char* sz, size_t nCount) const
1088 {
1089     wxASSERT(nStart <= length());
1090     size_type strLen = length() - nStart;
1091     nLen = strLen < nLen ? strLen : nLen;
1092
1093     size_t pos, len;
1094     PosLenToImpl(nStart, nLen, &pos, &len);
1095
1096     SubstrBufFromMB str(ImplStr(sz, nCount));
1097     if ( str.len == npos )
1098         str.len = wxStringStrlen(str.data);
1099
1100     return ::wxDoCmp(m_impl.data() + pos, len, str.data, str.len);
1101 }
1102
1103 int wxString::compare(size_t nStart, size_t nLen,
1104                       const wchar_t* sz, size_t nCount) const
1105 {
1106     wxASSERT(nStart <= length());
1107     size_type strLen = length() - nStart;
1108     nLen = strLen < nLen ? strLen : nLen;
1109
1110     size_t pos, len;
1111     PosLenToImpl(nStart, nLen, &pos, &len);
1112
1113     SubstrBufFromWC str(ImplStr(sz, nCount));
1114     if ( str.len == npos )
1115         str.len = wxStringStrlen(str.data);
1116
1117     return ::wxDoCmp(m_impl.data() + pos, len, str.data, str.len);
1118 }
1119
1120 #endif // HAVE_STD_STRING_COMPARE/!HAVE_STD_STRING_COMPARE
1121
1122
1123 // ---------------------------------------------------------------------------
1124 // find_{first,last}_[not]_of functions
1125 // ---------------------------------------------------------------------------
1126
1127 #if !wxUSE_STL_BASED_WXSTRING || wxUSE_UNICODE_UTF8
1128
// NB: All these functions are implemented with the argument being wxChar*,
//     i.e. a wide character string in any Unicode build, even though the
//     native string representation is char* in the UTF-8 build. This is
//     because we can't use memchr() to determine whether a character is in a
//     set encoded as UTF-8.
1133
1134 size_t wxString::find_first_of(const wxChar* sz, size_t nStart) const
1135 {
1136     return find_first_of(sz, nStart, wxStrlen(sz));
1137 }
1138
1139 size_t wxString::find_first_not_of(const wxChar* sz, size_t nStart) const
1140 {
1141     return find_first_not_of(sz, nStart, wxStrlen(sz));
1142 }
1143
1144 size_t wxString::find_first_of(const wxChar* sz, size_t nStart, size_t n) const
1145 {
1146     wxASSERT_MSG( nStart <= length(),  _T("invalid index") );
1147
1148     size_t idx = nStart;
1149     for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i )
1150     {
1151         if ( wxTmemchr(sz, *i, n) )
1152             return idx;
1153     }
1154
1155     return npos;
1156 }
1157
1158 size_t wxString::find_first_not_of(const wxChar* sz, size_t nStart, size_t n) const
1159 {
1160     wxASSERT_MSG( nStart <= length(),  _T("invalid index") );
1161
1162     size_t idx = nStart;
1163     for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i )
1164     {
1165         if ( !wxTmemchr(sz, *i, n) )
1166             return idx;
1167     }
1168
1169     return npos;
1170 }
1171
1172
1173 size_t wxString::find_last_of(const wxChar* sz, size_t nStart) const
1174 {
1175     return find_last_of(sz, nStart, wxStrlen(sz));
1176 }
1177
1178 size_t wxString::find_last_not_of(const wxChar* sz, size_t nStart) const
1179 {
1180     return find_last_not_of(sz, nStart, wxStrlen(sz));
1181 }
1182
1183 size_t wxString::find_last_of(const wxChar* sz, size_t nStart, size_t n) const
1184 {
1185     size_t len = length();
1186
1187     if ( nStart == npos )
1188     {
1189         nStart = len - 1;
1190     }
1191     else
1192     {
1193         wxASSERT_MSG( nStart <= len, _T("invalid index") );
1194     }
1195
1196     size_t idx = nStart;
1197     for ( const_reverse_iterator i = rbegin() + (len - nStart - 1);
1198           i != rend(); --idx, ++i )
1199     {
1200         if ( wxTmemchr(sz, *i, n) )
1201             return idx;
1202     }
1203
1204     return npos;
1205 }
1206
1207 size_t wxString::find_last_not_of(const wxChar* sz, size_t nStart, size_t n) const
1208 {
1209     size_t len = length();
1210
1211     if ( nStart == npos )
1212     {
1213         nStart = len - 1;
1214     }
1215     else
1216     {
1217         wxASSERT_MSG( nStart <= len, _T("invalid index") );
1218     }
1219
1220     size_t idx = nStart;
1221     for ( const_reverse_iterator i = rbegin() + (len - nStart - 1);
1222           i != rend(); --idx, ++i )
1223     {
1224         if ( !wxTmemchr(sz, *i, n) )
1225             return idx;
1226     }
1227
1228     return npos;
1229 }
1230
1231 size_t wxString::find_first_not_of(wxUniChar ch, size_t nStart) const
1232 {
1233     wxASSERT_MSG( nStart <= length(),  _T("invalid index") );
1234
1235     size_t idx = nStart;
1236     for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i )
1237     {
1238         if ( *i != ch )
1239             return idx;
1240     }
1241
1242     return npos;
1243 }
1244
1245 size_t wxString::find_last_not_of(wxUniChar ch, size_t nStart) const
1246 {
1247     size_t len = length();
1248
1249     if ( nStart == npos )
1250     {
1251         nStart = len - 1;
1252     }
1253     else
1254     {
1255         wxASSERT_MSG( nStart <= len, _T("invalid index") );
1256     }
1257
1258     size_t idx = nStart;
1259     for ( const_reverse_iterator i = rbegin() + (len - nStart - 1);
1260           i != rend(); --idx, ++i )
1261     {
1262         if ( *i != ch )
1263             return idx;
1264     }
1265
1266     return npos;
1267 }
1268
1269 // the functions above were implemented for wchar_t* arguments in Unicode
1270 // build and char* in ANSI build; below are implementations for the other
1271 // version:
// wxOtherCharType is the character type which is *not* wxChar in this build
// and STRCONV is the wxConvLibc conversion from it, with the result cast to
// const wxChar*. Both macros are local to this section and are #undef'ed
// at its end.
#if wxUSE_UNICODE
    #define wxOtherCharType char
    #define STRCONV         (const wxChar*)wxConvLibc.cMB2WC
#else
    #define wxOtherCharType wchar_t
    #define STRCONV         (const wxChar*)wxConvLibc.cWC2MB
#endif

// Each overload below converts its argument with STRCONV and forwards to the
// wxChar* version of the same function defined above.
//
// NOTE(review): the overloads taking a length pass the same n both to the
// conversion and to the wxChar* overload; presumably the number of characters
// after conversion can differ from n for multi-byte input -- confirm against
// the cMB2WC()/cWC2MB() documentation.
size_t wxString::find_first_of(const wxOtherCharType* sz, size_t nStart) const
    { return find_first_of(STRCONV(sz), nStart); }

size_t wxString::find_first_of(const wxOtherCharType* sz, size_t nStart,
                               size_t n) const
    { return find_first_of(STRCONV(sz, n, NULL), nStart, n); }
size_t wxString::find_last_of(const wxOtherCharType* sz, size_t nStart) const
    { return find_last_of(STRCONV(sz), nStart); }
size_t wxString::find_last_of(const wxOtherCharType* sz, size_t nStart,
                              size_t n) const
    { return find_last_of(STRCONV(sz, n, NULL), nStart, n); }
size_t wxString::find_first_not_of(const wxOtherCharType* sz, size_t nStart) const
    { return find_first_not_of(STRCONV(sz), nStart); }
size_t wxString::find_first_not_of(const wxOtherCharType* sz, size_t nStart,
                                   size_t n) const
    { return find_first_not_of(STRCONV(sz, n, NULL), nStart, n); }
size_t wxString::find_last_not_of(const wxOtherCharType* sz, size_t nStart) const
    { return find_last_not_of(STRCONV(sz), nStart); }
size_t wxString::find_last_not_of(const wxOtherCharType* sz, size_t nStart,
                                  size_t n) const
    { return find_last_not_of(STRCONV(sz, n, NULL), nStart, n); }

// the helper macros are not needed outside of this section
#undef wxOtherCharType
#undef STRCONV
1304
1305 #endif // !wxUSE_STL_BASED_WXSTRING || wxUSE_UNICODE_UTF8
1306
1307 // ===========================================================================
1308 // other common string functions
1309 // ===========================================================================
1310
1311 int wxString::CmpNoCase(const wxString& s) const
1312 {
1313     // FIXME-UTF8: use wxUniChar::ToLower/ToUpper once added
1314
1315     size_t idx = 0;
1316     const_iterator i1 = begin();
1317     const_iterator end1 = end();
1318     const_iterator i2 = s.begin();
1319     const_iterator end2 = s.end();
1320
1321     for ( ; i1 != end1 && i2 != end2; ++idx, ++i1, ++i2 )
1322     {
1323         wxUniChar lower1 = (wxChar)wxTolower(*i1);
1324         wxUniChar lower2 = (wxChar)wxTolower(*i2);
1325         if ( lower1 != lower2 )
1326             return lower1 < lower2 ? -1 : 1;
1327     }
1328
1329     size_t len1 = length();
1330     size_t len2 = s.length();
1331
1332     if ( len1 < len2 )
1333         return -1;
1334     else if ( len1 > len2 )
1335         return 1;
1336     return 0;
1337 }
1338
1339
1340 #if wxUSE_UNICODE
1341
1342 #ifdef __MWERKS__
1343 #ifndef __SCHAR_MAX__
1344 #define __SCHAR_MAX__ 127
1345 #endif
1346 #endif
1347
1348 wxString wxString::FromAscii(const char *ascii)
1349 {
1350     if (!ascii)
1351        return wxEmptyString;
1352
1353     size_t len = strlen( ascii );
1354     wxString res;
1355
1356     if ( len )
1357     {
1358         wxStringBuffer buf(res, len);
1359
1360         wchar_t *dest = buf;
1361
1362         for ( ;; )
1363         {
1364            if ( (*dest++ = (wchar_t)(unsigned char)*ascii++) == L'\0' )
1365                break;
1366         }
1367     }
1368
1369     return res;
1370 }
1371
1372 wxString wxString::FromAscii(const char ascii)
1373 {
1374     // What do we do with '\0' ?
1375
1376     wxString res;
1377     res += (wchar_t)(unsigned char) ascii;
1378
1379     return res;
1380 }
1381
1382 const wxCharBuffer wxString::ToAscii() const
1383 {
1384     // this will allocate enough space for the terminating NUL too
1385     wxCharBuffer buffer(length());
1386
1387
1388     char *dest = buffer.data();
1389
1390     const wchar_t *pwc = c_str();
1391     for ( ;; )
1392     {
1393         *dest++ = (char)(*pwc > SCHAR_MAX ? wxT('_') : *pwc);
1394
1395         // the output string can't have embedded NULs anyhow, so we can safely
1396         // stop at first of them even if we do have any
1397         if ( !*pwc++ )
1398             break;
1399     }
1400
1401     return buffer;
1402 }
1403
1404 #endif // Unicode
1405
1406 // extract string of length nCount starting at nFirst
1407 wxString wxString::Mid(size_t nFirst, size_t nCount) const
1408 {
1409     size_t nLen = length();
1410
1411     // default value of nCount is npos and means "till the end"
1412     if ( nCount == npos )
1413     {
1414         nCount = nLen - nFirst;
1415     }
1416
1417     // out-of-bounds requests return sensible things
1418     if ( nFirst + nCount > nLen )
1419     {
1420         nCount = nLen - nFirst;
1421     }
1422
1423     if ( nFirst > nLen )
1424     {
1425         // AllocCopy() will return empty string
1426         return wxEmptyString;
1427     }
1428
1429     wxString dest(*this, nFirst, nCount);
1430     if ( dest.length() != nCount )
1431     {
1432         wxFAIL_MSG( _T("out of memory in wxString::Mid") );
1433     }
1434
1435     return dest;
1436 }
1437
1438 // check that the string starts with prefix and return the rest of the string
1439 // in the provided pointer if it is not NULL, otherwise return false
1440 bool wxString::StartsWith(const wxChar *prefix, wxString *rest) const
1441 {
1442     wxASSERT_MSG( prefix, _T("invalid parameter in wxString::StartsWith") );
1443
1444     // first check if the beginning of the string matches the prefix: note
1445     // that we don't have to check that we don't run out of this string as
1446     // when we reach the terminating NUL, either prefix string ends too (and
1447     // then it's ok) or we break out of the loop because there is no match
1448     const wxChar *p = c_str();
1449     while ( *prefix )
1450     {
1451         if ( *prefix++ != *p++ )
1452         {
1453             // no match
1454             return false;
1455         }
1456     }
1457
1458     if ( rest )
1459     {
1460         // put the rest of the string into provided pointer
1461         *rest = p;
1462     }
1463
1464     return true;
1465 }
1466
1467
1468 // check that the string ends with suffix and return the rest of it in the
1469 // provided pointer if it is not NULL, otherwise return false
1470 bool wxString::EndsWith(const wxChar *suffix, wxString *rest) const
1471 {
1472     wxASSERT_MSG( suffix, _T("invalid parameter in wxString::EndssWith") );
1473
1474     int start = length() - wxStrlen(suffix);
1475
1476     if ( start < 0 || compare(start, npos, suffix) != 0 )
1477         return false;
1478
1479     if ( rest )
1480     {
1481         // put the rest of the string into provided pointer
1482         rest->assign(*this, 0, start);
1483     }
1484
1485     return true;
1486 }
1487
1488
1489 // extract nCount last (rightmost) characters
1490 wxString wxString::Right(size_t nCount) const
1491 {
1492   if ( nCount > length() )
1493     nCount = length();
1494
1495   wxString dest(*this, length() - nCount, nCount);
1496   if ( dest.length() != nCount ) {
1497     wxFAIL_MSG( _T("out of memory in wxString::Right") );
1498   }
1499   return dest;
1500 }
1501
1502 // get all characters after the last occurence of ch
1503 // (returns the whole string if ch not found)
1504 wxString wxString::AfterLast(wxUniChar ch) const
1505 {
1506   wxString str;
1507   int iPos = Find(ch, true);
1508   if ( iPos == wxNOT_FOUND )
1509     str = *this;
1510   else
1511     str = wx_str() + iPos + 1;
1512
1513   return str;
1514 }
1515
1516 // extract nCount first (leftmost) characters
1517 wxString wxString::Left(size_t nCount) const
1518 {
1519   if ( nCount > length() )
1520     nCount = length();
1521
1522   wxString dest(*this, 0, nCount);
1523   if ( dest.length() != nCount ) {
1524     wxFAIL_MSG( _T("out of memory in wxString::Left") );
1525   }
1526   return dest;
1527 }
1528
1529 // get all characters before the first occurence of ch
1530 // (returns the whole string if ch not found)
1531 wxString wxString::BeforeFirst(wxUniChar ch) const
1532 {
1533   int iPos = Find(ch);
1534   if ( iPos == wxNOT_FOUND ) iPos = length();
1535   return wxString(*this, 0, iPos);
1536 }
1537
1538 /// get all characters before the last occurence of ch
1539 /// (returns empty string if ch not found)
1540 wxString wxString::BeforeLast(wxUniChar ch) const
1541 {
1542   wxString str;
1543   int iPos = Find(ch, true);
1544   if ( iPos != wxNOT_FOUND && iPos != 0 )
1545     str = wxString(c_str(), iPos);
1546
1547   return str;
1548 }
1549
1550 /// get all characters after the first occurence of ch
1551 /// (returns empty string if ch not found)
1552 wxString wxString::AfterFirst(wxUniChar ch) const
1553 {
1554   wxString str;
1555   int iPos = Find(ch);
1556   if ( iPos != wxNOT_FOUND )
1557     str = wx_str() + iPos + 1;
1558
1559   return str;
1560 }
1561
1562 // replace first (or all) occurences of some substring with another one
1563 size_t wxString::Replace(const wxString& strOld,
1564                          const wxString& strNew, bool bReplaceAll)
1565 {
1566     // if we tried to replace an empty string we'd enter an infinite loop below
1567     wxCHECK_MSG( !strOld.empty(), 0,
1568                  _T("wxString::Replace(): invalid parameter") );
1569
1570     size_t uiCount = 0;   // count of replacements made
1571
1572     size_t uiOldLen = strOld.length();
1573     size_t uiNewLen = strNew.length();
1574
1575     size_t dwPos = 0;
1576
1577     while ( (*this)[dwPos] != wxT('\0') )
1578     {
1579         //DO NOT USE STRSTR HERE
1580         //this string can contain embedded null characters,
1581         //so strstr will function incorrectly
1582         dwPos = find(strOld, dwPos);
1583         if ( dwPos == npos )
1584             break;                  // exit the loop
1585         else
1586         {
1587             //replace this occurance of the old string with the new one
1588             replace(dwPos, uiOldLen, strNew, uiNewLen);
1589
1590             //move up pos past the string that was replaced
1591             dwPos += uiNewLen;
1592
1593             //increase replace count
1594             ++uiCount;
1595
1596             // stop now?
1597             if ( !bReplaceAll )
1598                 break;                  // exit the loop
1599         }
1600     }
1601
1602     return uiCount;
1603 }
1604
1605 bool wxString::IsAscii() const
1606 {
1607   const wxChar *s = (const wxChar*) *this;
1608   while(*s){
1609     if(!isascii(*s)) return(false);
1610     s++;
1611   }
1612   return(true);
1613 }
1614
1615 bool wxString::IsWord() const
1616 {
1617   const wxChar *s = (const wxChar*) *this;
1618   while(*s){
1619     if(!wxIsalpha(*s)) return(false);
1620     s++;
1621   }
1622   return(true);
1623 }
1624
1625 bool wxString::IsNumber() const
1626 {
1627   const wxChar *s = (const wxChar*) *this;
1628   if (wxStrlen(s))
1629      if ((s[0] == wxT('-')) || (s[0] == wxT('+'))) s++;
1630   while(*s){
1631     if(!wxIsdigit(*s)) return(false);
1632     s++;
1633   }
1634   return(true);
1635 }
1636
1637 wxString wxString::Strip(stripType w) const
1638 {
1639     wxString s = *this;
1640     if ( w & leading ) s.Trim(false);
1641     if ( w & trailing ) s.Trim(true);
1642     return s;
1643 }
1644
1645 // ---------------------------------------------------------------------------
1646 // case conversion
1647 // ---------------------------------------------------------------------------
1648
1649 wxString& wxString::MakeUpper()
1650 {
1651   for ( iterator it = begin(), en = end(); it != en; ++it )
1652     *it = (wxChar)wxToupper(*it);
1653
1654   return *this;
1655 }
1656
1657 wxString& wxString::MakeLower()
1658 {
1659   for ( iterator it = begin(), en = end(); it != en; ++it )
1660     *it = (wxChar)wxTolower(*it);
1661
1662   return *this;
1663 }
1664
1665 // ---------------------------------------------------------------------------
1666 // trimming and padding
1667 // ---------------------------------------------------------------------------
1668
1669 // some compilers (VC++ 6.0 not to name them) return true for a call to
1670 // isspace('ê') in the C locale which seems to be broken to me, but we have to
1671 // live with this by checking that the character is a 7 bit one - even if this
1672 // may fail to detect some spaces (I don't know if Unicode doesn't have
1673 // space-like symbols somewhere except in the first 128 chars), it is arguably
1674 // still better than trimming away accented letters
1675 inline int wxSafeIsspace(wxChar ch) { return (ch < 127) && wxIsspace(ch); }
1676
1677 // trims spaces (in the sense of isspace) from left or right side
1678 wxString& wxString::Trim(bool bFromRight)
1679 {
1680     // first check if we're going to modify the string at all
1681     if ( !empty() &&
1682          (
1683           (bFromRight && wxSafeIsspace(GetChar(length() - 1))) ||
1684           (!bFromRight && wxSafeIsspace(GetChar(0u)))
1685          )
1686        )
1687     {
1688         if ( bFromRight )
1689         {
1690             // find last non-space character
1691             reverse_iterator psz = rbegin();
1692             while ( (psz != rend()) && wxSafeIsspace(*psz) )
1693                 psz++;
1694
1695             // truncate at trailing space start
1696             erase(psz.base(), end());
1697         }
1698         else
1699         {
1700             // find first non-space character
1701             iterator psz = begin();
1702             while ( (psz != end()) && wxSafeIsspace(*psz) )
1703                 psz++;
1704
1705             // fix up data and length
1706             erase(begin(), psz);
1707         }
1708     }
1709
1710     return *this;
1711 }
1712
1713 // adds nCount characters chPad to the string from either side
1714 wxString& wxString::Pad(size_t nCount, wxUniChar chPad, bool bFromRight)
1715 {
1716     wxString s(chPad, nCount);
1717
1718     if ( bFromRight )
1719         *this += s;
1720     else
1721     {
1722         s += *this;
1723         swap(s);
1724     }
1725
1726     return *this;
1727 }
1728
1729 // truncate the string
1730 wxString& wxString::Truncate(size_t uiLen)
1731 {
1732     if ( uiLen < length() )
1733     {
1734         erase(begin() + uiLen, end());
1735     }
1736     //else: nothing to do, string is already short enough
1737
1738     return *this;
1739 }
1740
1741 // ---------------------------------------------------------------------------
1742 // finding (return wxNOT_FOUND if not found and index otherwise)
1743 // ---------------------------------------------------------------------------
1744
1745 // find a character
1746 int wxString::Find(wxUniChar ch, bool bFromEnd) const
1747 {
1748     size_type idx = bFromEnd ? find_last_of(ch) : find_first_of(ch);
1749
1750     return (idx == npos) ? wxNOT_FOUND : (int)idx;
1751 }
1752
1753 // ----------------------------------------------------------------------------
1754 // conversion to numbers
1755 // ----------------------------------------------------------------------------
1756
1757 // the implementation of all the functions below is exactly the same so factor
1758 // it out
1759
1760 template <typename T, typename F>
1761 bool wxStringToIntType(const wxChar *start,
1762                        T *val,
1763                        int base,
1764                        F func)
1765 {
1766     wxCHECK_MSG( val, false, _T("NULL output pointer") );
1767     wxASSERT_MSG( !base || (base > 1 && base <= 36), _T("invalid base") );
1768
1769 #ifndef __WXWINCE__
1770     errno = 0;
1771 #endif
1772
1773     wxChar *end;
1774     *val = (*func)(start, &end, base);
1775
1776     // return true only if scan was stopped by the terminating NUL and if the
1777     // string was not empty to start with and no under/overflow occurred
1778     return !*end && (end != start)
1779 #ifndef __WXWINCE__
1780         && (errno != ERANGE)
1781 #endif
1782     ;
1783 }
1784
1785 bool wxString::ToLong(long *val, int base) const
1786 {
1787     return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtol);
1788 }
1789
1790 bool wxString::ToULong(unsigned long *val, int base) const
1791 {
1792     return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtoul);
1793 }
1794
1795 bool wxString::ToLongLong(wxLongLong_t *val, int base) const
1796 {
1797     return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtoll);
1798 }
1799
1800 bool wxString::ToULongLong(wxULongLong_t *val, int base) const
1801 {
1802     return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtoull);
1803 }
1804
1805 bool wxString::ToDouble(double *val) const
1806 {
1807     wxCHECK_MSG( val, false, _T("NULL pointer in wxString::ToDouble") );
1808
1809 #ifndef __WXWINCE__
1810     errno = 0;
1811 #endif
1812
1813     const wxChar *start = c_str();
1814     wxChar *end;
1815     *val = wxStrtod(start, &end);
1816
1817     // return true only if scan was stopped by the terminating NUL and if the
1818     // string was not empty to start with and no under/overflow occurred
1819     return !*end && (end != start)
1820 #ifndef __WXWINCE__
1821         && (errno != ERANGE)
1822 #endif
1823     ;
1824 }
1825
1826 // ---------------------------------------------------------------------------
1827 // formatted output
1828 // ---------------------------------------------------------------------------
1829
1830 /* static */
1831 #ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
1832 wxString wxStringPrintfMixinBase::DoFormat(const wxChar *format, ...)
1833 #else
1834 wxString wxString::DoFormat(const wxChar *format, ...)
1835 #endif
1836 {
1837     va_list argptr;
1838     va_start(argptr, format);
1839
1840     wxString s;
1841     s.PrintfV(format, argptr);
1842
1843     va_end(argptr);
1844
1845     return s;
1846 }
1847
1848 /* static */
1849 wxString wxString::FormatV(const wxString& format, va_list argptr)
1850 {
1851     wxString s;
1852     s.PrintfV(format, argptr);
1853     return s;
1854 }
1855
1856 #ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
1857 int wxStringPrintfMixinBase::DoPrintf(const wxChar *format, ...)
1858 #else
1859 int wxString::DoPrintf(const wxChar *format, ...)
1860 #endif
1861 {
1862     va_list argptr;
1863     va_start(argptr, format);
1864
1865 #ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
1866     // get a pointer to the wxString instance; we have to use dynamic_cast<>
1867     // because it's the only cast that works safely for downcasting when
1868     // multiple inheritance is used:
1869     wxString *str = static_cast<wxString*>(this);
1870 #else
1871     wxString *str = this;
1872 #endif
1873
1874     int iLen = str->PrintfV(format, argptr);
1875
1876     va_end(argptr);
1877
1878     return iLen;
1879 }
1880
// Formats the arguments in argptr according to format directly into this
// string, replacing its previous contents.
//
// The output is written to a buffer of the string which starts at 1024
// characters and is enlarged, followed by another attempt, each time
// wxVsnprintf() indicates that the output didn't fit.
//
// Returns the length of the resulting string, or -1 if the buffer couldn't
// be obtained or, when wxUSE_WXVSNPRINTF is set, if wxVsnprintf() itself
// returned an error.
int wxString::PrintfV(const wxString& format, va_list argptr)
{
    int size = 1024;

    for ( ;; )
    {
        // tmp is destroyed at the end of each iteration, i.e. before the next
        // attempt and before Shrink() below is called
        wxStringBuffer tmp(*this, size + 1);
        wxChar *buf = tmp;

        if ( !buf )
        {
            // out of memory
            return -1;
        }

        // wxVsnprintf() may modify the original arg pointer, so pass it
        // only a copy
        va_list argptrcopy;
        wxVaCopy(argptrcopy, argptr);
        int len = wxVsnprintf(buf, size, (const wxChar*)/*FIXME-UTF8*/format, argptrcopy);
        va_end(argptrcopy);

        // some implementations of vsnprintf() don't NUL terminate
        // the string if there is not enough space for it so
        // always do it manually
        // (the buffer was requested with size + 1 characters above, which is
        // why index size is used here)
        buf[size] = _T('\0');

        // vsnprintf() may return either -1 (traditional Unix behaviour) or the
        // total number of characters which would have been written if the
        // buffer were large enough (newer standards such as Unix98)
        if ( len < 0 )
        {
#if wxUSE_WXVSNPRINTF
            // we know that our own implementation of wxVsnprintf() returns -1
            // only for a format error - thus there's something wrong with
            // the user's format string
            return -1;
#else // assume that system version only returns error if not enough space
            // still not enough, as we don't know how much we need, double the
            // current size of the buffer
            size *= 2;
#endif // wxUSE_WXVSNPRINTF/!wxUSE_WXVSNPRINTF
        }
        else if ( len >= size )
        {
#if wxUSE_WXVSNPRINTF
            // we know that our own implementation of wxVsnprintf() returns
            // size+1 when there's not enough space but that's not the size
            // of the required buffer!
            size *= 2;      // so we just double the current size of the buffer
#else
            // some vsnprintf() implementations NUL-terminate the buffer and
            // some don't in len == size case, to be safe always add 1
            size = len + 1;
#endif
        }
        else // ok, there was enough space
        {
            break;
        }
    }

    // we could have overshot
    Shrink();

    return length();
}
1948
1949 // ----------------------------------------------------------------------------
1950 // misc other operations
1951 // ----------------------------------------------------------------------------
1952
1953 // Returns true if the whole string matches the shell-like pattern `mask`, in
1954 // which '?' stands for any single character and '*' for any (possibly empty)
1955 // sequence of characters; every other mask character must match literally.
1956 bool wxString::Matches(const wxString& mask) const
1957 {
1958     // The regex-based code below is disabled as it doesn't seem to be faster
1959     // (in fact, it seems to be much slower) than the hand-written code and
1960     // using it requires always linking with libregex. NB: it refers to
1961     // pszMask, which is not declared in this branch, so it can't be enabled as is
1962 #if 0 // wxUSE_REGEX
1963     // first translate the shell-like mask into a regex
1964     wxString pattern;
1965     pattern.reserve(wxStrlen(pszMask));
1966
1967     pattern += _T('^');
1968     while ( *pszMask )
1969     {
1970         switch ( *pszMask )
1971         {
1972             case _T('?'):
1973                 pattern += _T('.');
1974                 break;
1975
1976             case _T('*'):
1977                 pattern += _T(".*");
1978                 break;
1979
1980             case _T('^'):
1981             case _T('.'):
1982             case _T('$'):
1983             case _T('('):
1984             case _T(')'):
1985             case _T('|'):
1986             case _T('+'):
1987             case _T('\\'):
1988                 // these characters are special in a RE, quote them
1989                 // (however note that we don't quote '[' and ']' to allow
1990                 // using them for Unix shell like matching)
1991                 pattern += _T('\\');
1992                 // fall through
1993
1994             default:
1995                 pattern += *pszMask;
1996         }
1997
1998         pszMask++;
1999     }
2000     pattern += _T('$');
2001
2002     // and now use it
2003     return wxRegEx(pattern, wxRE_NOSUB | wxRE_EXTENDED).Matches(c_str());
2004 #else // !wxUSE_REGEX
2005   // TODO: this is, of course, awfully inefficient...
2006
2007   // FIXME-UTF8: implement using iterators, remove #if
2008 #if wxUSE_UNICODE_UTF8
2009   wxWCharBuffer maskBuf = mask.wc_str();
2010   wxWCharBuffer txtBuf = wc_str();
2011   const wxChar *pszMask = maskBuf.data();
2012   const wxChar *pszTxt = txtBuf.data();
2013 #else
2014   const wxChar *pszMask = mask.wx_str();
2015   // the char currently being checked
2016   const wxChar *pszTxt = wx_str();
2017 #endif
2018
2019   // where the most recent '*' was met: position in the text and in the mask
2020   const wxChar *pszLastStarInText = NULL;
2021   const wxChar *pszLastStarInMask = NULL;
2022
2023 match:
2024   for ( ; *pszMask != wxT('\0'); pszMask++, pszTxt++ ) {
2025     switch ( *pszMask ) {
2026       case wxT('?'):
2027         if ( *pszTxt == wxT('\0') )
2028           return false; // no text left; backtracking only leaves even less
2029
2030         // pszTxt and pszMask will be incremented in the loop statement
2031
2032         break;
2033
2034       case wxT('*'):
2035         {
2036           // remember where we started to be able to backtrack later
2037           pszLastStarInText = pszTxt;
2038           pszLastStarInMask = pszMask;
2039
2040           // skip this '*' and any '*' or '?' immediately following it, so
2041           // that e.g. "*?" is treated like "*" (should this be an error?)
2042           while ( *pszMask == wxT('*') || *pszMask == wxT('?') )
2043             pszMask++;
2044
2045           // if there is nothing more, match
2046           if ( *pszMask == wxT('\0') )
2047             return true;
2048
2049           // are there any other metacharacters in the mask?
2050           size_t uiLenMask;
2051           const wxChar *pEndMask = wxStrpbrk(pszMask, wxT("*?"));
2052
2053           if ( pEndMask != NULL ) {
2054             // we have to match the string between two metachars
2055             uiLenMask = pEndMask - pszMask;
2056           }
2057           else {
2058             // we have to match the remainder of the string
2059             uiLenMask = wxStrlen(pszMask);
2060           }
2061
2062           wxString strToMatch(pszMask, uiLenMask);
2063           const wxChar* pMatch = wxStrstr(pszTxt, strToMatch);
2064           if ( pMatch == NULL )
2065             return false; // the literal doesn't occur in the rest of the text
2066
2067           // -1 to compensate "++" in the loop
2068           pszTxt = pMatch + uiLenMask - 1;
2069           pszMask += uiLenMask - 1;
2070         }
2071         break;
2072
2073       default:
2074         if ( *pszMask != *pszTxt )
2075           goto backtrack; // retry from the last '*' (if any), don't just fail
2076         break;
2077     }
2078   }
2079
2080   // match only if nothing left
2081   if ( *pszTxt == wxT('\0') )
2082     return true;
2083 backtrack:
2084   // if we failed to match, retry with the last '*' absorbing one more char
2085   if ( pszLastStarInText ) {
2086     pszTxt = pszLastStarInText + 1;
2087     pszMask = pszLastStarInMask;
2088
2089     pszLastStarInText = NULL;
2090
2091     // don't bother resetting pszLastStarInMask, it's unnecessary
2092
2093     goto match;
2094   }
2095
2096   return false;
2097 #endif // wxUSE_REGEX/!wxUSE_REGEX
2098 }
2099
2100 // Return how many times the character ch occurs in this string
2101 int wxString::Freq(wxUniChar ch) const
2102 {
2103     int nOccurrences = 0;
2104     const_iterator pos = begin();
2105     for ( ; pos != end(); ++pos )
2106     {
2107         nOccurrences += ( *pos == ch ) ? 1 : 0;
2108     }
2109     return nOccurrences;
2110 }
2111
2112 // return an upper-cased copy of the string, this object is not modified
2113 wxString wxString::Upper() const
2114 { return wxString(*this).MakeUpper(); }
2115
2116 // return a lower-cased copy of the string, this object is not modified
2117 wxString wxString::Lower() const { return wxString(*this).MakeLower(); }