src/common/string.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/string.cpp
   3 // Purpose:     wxString class
   4 // Author:      Vadim Zeitlin, Ryan Norton
   5 // Modified by:
   6 // Created:     29/01/98
   7 // RCS-ID:      $Id$
   8 // Copyright:   (c) 1998 Vadim Zeitlin <zeitlin@dptmaths.ens-cachan.fr>
   9 //              (c) 2004 Ryan Norton <wxprojects@comcast.net>
  10 // Licence:     wxWindows licence
  11 /////////////////////////////////////////////////////////////////////////////
  12
  13 /*
  14  * About ref counting:
  15  *  1) all empty strings use g_strEmpty, nRefs = -1 (set in Init())
  16  *  2) AllocBuffer() sets nRefs to 1, Lock() increments it by one
  17  *  3) Unlock() decrements nRefs and frees memory if it goes to 0
  18  */
  19
  20 // ===========================================================================
  21 // headers, declarations, constants
  22 // ===========================================================================
  23
  24 // For compilers that support precompilation, includes "wx.h".
  25 #include "wx/wxprec.h"
  26
  27 #ifdef __BORLANDC__
  28     #pragma hdrstop
  29 #endif
  30
  31 #ifndef WX_PRECOMP
  32     #include "wx/string.h"
  33 #endif
  34
  35 #include <ctype.h>
  36
  37 #ifndef __WXWINCE__
  38     #include <errno.h>
  39 #endif
  40
  41 #include <string.h>
  42 #include <stdlib.h>
  43
  44 #ifdef __SALFORDC__
  45     #include <clib.h>
  46 #endif
  47
  48 #include "wx/hashmap.h"
  49
// string handling functions used by wxString:
//
// The wxString* macros below select the memory/string primitives matching the
// character type of the internal representation: in the UTF-8 build they map
// to the plain C library functions, otherwise to the wxT*/wxStrlen variants.
#if wxUSE_UNICODE_UTF8
    #define wxStringMemcpy   memcpy
    #define wxStringMemcmp   memcmp
    #define wxStringMemchr   memchr
    #define wxStringStrlen   strlen
#else
    #define wxStringMemcpy   wxTmemcpy
    #define wxStringMemcmp   wxTmemcmp
    #define wxStringMemchr   wxTmemchr
    #define wxStringStrlen   wxStrlen
#endif
  62
  63
  64 // ---------------------------------------------------------------------------
  65 // static class variables definition
  66 // ---------------------------------------------------------------------------
  67
// "No position" marker; for compatibility with the STL string classes it
// _must_ be the value -1 converted to size_t.
const size_t wxString::npos = (size_t) -1;
  70
  71 // ----------------------------------------------------------------------------
  72 // global functions
  73 // ----------------------------------------------------------------------------
  74
  75 #if wxUSE_STD_IOSTREAM
  76
  77 #include <iostream>
  78
  79 wxSTD ostream& operator<<(wxSTD ostream& os, const wxCStrData& str)
  80 {
  81 // FIXME-UTF8: always, not only if wxUSE_UNICODE
  82 #if wxUSE_UNICODE && !defined(__BORLANDC__)
  83     return os << (const wchar_t*)str.AsWCharBuf();
  84 #else
  85     return os << (const char*)str.AsCharBuf();
  86 #endif
  87 }
  88
  89 wxSTD ostream& operator<<(wxSTD ostream& os, const wxString& str)
  90 {
  91     return os << str.c_str();
  92 }
  93
  94 wxSTD ostream& operator<<(wxSTD ostream& os, const wxCharBuffer& str)
  95 {
  96     return os << str.data();
  97 }
  98
  99 #ifndef __BORLANDC__
 100 wxSTD ostream& operator<<(wxSTD ostream& os, const wxWCharBuffer& str)
 101 {
 102     return os << str.data();
 103 }
 104 #endif
 105
 106 #endif // wxUSE_STD_IOSTREAM
 107
 108 // ===========================================================================
 109 // wxString class core
 110 // ===========================================================================
 111
 112 #if wxUSE_UNICODE_UTF8
 113
 114 // ---------------------------------------------------------------------------
 115 // UTF-8 operations
 116 // ---------------------------------------------------------------------------
 117
 118 //
 119 // Table 3.1B from Unicode spec: Legal UTF-8 Byte Sequences
 120 //
 121 //     Code Points    | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
 122 // -------------------+----------+----------+----------+----------+
 123 //   U+0000..U+007F   |  00..7F  |          |          |          |
 124 //   U+0080..U+07FF   |  C2..DF  |  80..BF  |          |          |
 125 //   U+0800..U+0FFF   |  E0      |  A0..BF  |  80..BF  |          |
 126 //   U+1000..U+FFFF   |  E1..EF  |  80..BF  |  80..BF  |          |
 127 //  U+10000..U+3FFFF  |  F0      |  90..BF  |  80..BF  |  80..BF  |
 128 //  U+40000..U+FFFFF  |  F1..F3  |  80..BF  |  80..BF  |  80..BF  |
 129 // U+100000..U+10FFFF |  F4      |  80..8F  |  80..BF  |  80..BF  |
 130 // -------------------+----------+----------+----------+----------+
 131
 132 bool wxString::IsValidUtf8String(const char *str)
 133 {
 134     if ( !str )
 135         return true; // empty string is UTF8 string
 136
 137     const unsigned char *c = (const unsigned char*)str;
 138
 139     for ( ; *c; ++c )
 140     {
 141         unsigned char b = *c;
 142
 143         if ( b <= 0x7F ) // 00..7F
 144             continue;
 145
 146         else if ( b < 0xC2 ) // invalid lead bytes: 80..C1
 147             return false;
 148
 149         // two-byte sequences:
 150         else if ( b <= 0xDF ) // C2..DF
 151         {
 152             b = *(++c);
 153             if ( !(b >= 0x80 && b <= 0xBF ) )
 154                 return false;
 155         }
 156
 157         // three-byte sequences:
 158         else if ( b == 0xE0 )
 159         {
 160             b = *(++c);
 161             if ( !(b >= 0xA0 && b <= 0xBF ) )
 162                 return false;
 163             b = *(++c);
 164             if ( !(b >= 0x80 && b <= 0xBF ) )
 165                 return false;
 166         }
 167         else if ( b <= 0xEF ) // E1..EF
 168         {
 169             for ( int i = 0; i < 2; ++i )
 170             {
 171                 b = *(++c);
 172                 if ( !(b >= 0x80 && b <= 0xBF ) )
 173                     return false;
 174             }
 175         }
 176
 177         // four-byte sequences:
 178         else if ( b == 0xF0 )
 179         {
 180             b = *(++c);
 181             if ( !(b >= 0x90 && b <= 0xBF ) )
 182                 return false;
 183             for ( int i = 0; i < 2; ++i )
 184             {
 185                 b = *(++c);
 186                 if ( !(b >= 0x80 && b <= 0xBF ) )
 187                     return false;
 188             }
 189         }
 190         else if ( b <= 0xF3 ) // F1..F3
 191         {
 192             for ( int i = 0; i < 3; ++i )
 193             {
 194                 b = *(++c);
 195                 if ( !(b >= 0x80 && b <= 0xBF ) )
 196                     return false;
 197             }
 198         }
 199         else if ( b == 0xF4 )
 200         {
 201             b = *(++c);
 202             if ( !(b >= 0x80 && b <= 0x8F ) )
 203                 return false;
 204             for ( int i = 0; i < 2; ++i )
 205             {
 206                 b = *(++c);
 207                 if ( !(b >= 0x80 && b <= 0xBF ) )
 208                     return false;
 209             }
 210         }
 211         else // otherwise, it's invalid lead byte
 212             return false;
 213     }
 214
 215     return true;
 216 }
 217
 218 #ifdef __WXDEBUG__
 219 /* static */
 220 bool wxString::IsValidUtf8LeadByte(unsigned char c)
 221 {
 222     return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
 223 }
 224 #endif
 225
// Lookup table indexed by the value of the first byte of a UTF-8 sequence;
// each entry is the number of bytes to step over to reach the next sequence
// (i.e. the length of the sequence started by that byte). Bytes which can't
// start a valid sequence map to 1 so that they are skipped one at a time.
unsigned char wxString::ms_utf8IterTable[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid, we use step 1 to skip
    // over them (should never happen):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80..8F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 90..9F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A0..AF
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B0..BF
    1, 1,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   // F5..FF
};
 260
 261 /* static */
 262 void wxString::DecIter(wxStringImpl::const_iterator& i)
 263 {
 264     wxASSERT( IsValidUtf8LeadByte(*i) );
 265
 266     // Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in
 267     // binary), so we just have to go back until we hit a byte that is either
 268     // < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in binary; this
 269     // includes some invalid values, but we can ignore it here, because we
 270     // assume valid UTF-8 input for the purpose of efficient implementation).
 271     --i;
 272     while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
 273         --i;
 274 }
 275
 276 /* static */
 277 void wxString::DecIter(wxStringImpl::iterator& i)
 278 {
 279     // FIXME-UTF8: use template instead
 280     wxASSERT( IsValidUtf8LeadByte(*i) );
 281     --i;
 282     while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
 283         --i;
 284 }
 285
 286 /* static */
 287 wxStringImpl::const_iterator
 288 wxString::AddToIter(wxStringImpl::const_iterator i, int n)
 289 {
 290     wxStringImpl::const_iterator out(i);
 291
 292     if ( n > 0 )
 293     {
 294         for ( int j = 0; j < n; ++j )
 295             IncIter(out);
 296     }
 297     else if ( n < 0 )
 298     {
 299         for ( int j = 0; j > n; --j )
 300             DecIter(out);
 301     }
 302
 303     return out;
 304 }
 305
 306 wxStringImpl::iterator
 307 wxString::AddToIter(wxStringImpl::iterator i, int n)
 308 {
 309     // FIXME-UTF8: use template instead
 310     wxStringImpl::iterator out(i);
 311
 312     if ( n > 0 )
 313     {
 314         for ( int j = 0; j < n; ++j )
 315             IncIter(out);
 316     }
 317     else if ( n < 0 )
 318     {
 319         for ( int j = 0; j > n; --j )
 320             DecIter(out);
 321     }
 322
 323     return out;
 324 }
 325
 326
 327 /* static */
 328 int wxString::DiffIters(wxStringImpl::const_iterator i1,
 329                         wxStringImpl::const_iterator i2)
 330 {
 331     int dist = 0;
 332
 333     if ( i1 < i2 )
 334     {
 335         while ( i1 != i2 )
 336         {
 337             IncIter(i1);
 338             dist--;
 339         }
 340     }
 341     else if ( i2 < i1 )
 342     {
 343         while ( i2 != i1 )
 344         {
 345             IncIter(i2);
 346             dist++;
 347         }
 348     }
 349
 350     return dist;
 351 }
 352
 353 int wxString::DiffIters(wxStringImpl::iterator i1, wxStringImpl::iterator i2)
 354 {
 355     // FIXME-UTF8: use template instead
 356     int dist = 0;
 357
 358     if ( i1 < i2 )
 359     {
 360         while ( i1 != i2 )
 361         {
 362             IncIter(i1);
 363             dist--;
 364         }
 365     }
 366     else if ( i2 < i1 )
 367     {
 368         while ( i2 != i1 )
 369         {
 370             IncIter(i2);
 371             dist++;
 372         }
 373     }
 374
 375     return dist;
 376 }
 377
 378 /* static */
 379 wxString::Utf8CharBuffer wxString::EncodeChar(wxUniChar ch)
 380 {
 381     Utf8CharBuffer buf;
 382     char *out = buf.data;
 383
 384     wxUniChar::value_type code = ch.GetValue();
 385
 386     //    Char. number range   |        UTF-8 octet sequence
 387     //       (hexadecimal)     |              (binary)
 388     //   ----------------------+---------------------------------------------
 389     //   0000 0000 - 0000 007F | 0xxxxxxx
 390     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 391     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 392     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 393     //
 394     //   Code point value is stored in bits marked with 'x', lowest-order bit
 395     //   of the value on the right side in the diagram above.
 396     //                                                        (from RFC 3629)
 397
 398     if ( code <= 0x7F )
 399     {
 400         out[1] = 0;
 401         out[0] = (char)code;
 402     }
 403     else if ( code <= 0x07FF )
 404     {
 405         out[2] = 0;
 406         // NB: this line takes 6 least significant bits, encodes them as
 407         // 10xxxxxx and discards them so that the next byte can be encoded:
 408         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 409         out[0] = 0xC0 | code;
 410     }
 411     else if ( code < 0xFFFF )
 412     {
 413         out[3] = 0;
 414         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 415         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 416         out[0] = 0xE0 | code;
 417     }
 418     else if ( code <= 0x10FFFF )
 419     {
 420         out[4] = 0;
 421         out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 422         out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 423         out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 424         out[0] = 0xF0 | code;
 425     }
 426     else
 427     {
 428         wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 429         out[0] = 0;
 430     }
 431
 432     return buf;
 433 }
 434
 435 /* static */
 436 wxUniChar wxUniCharRef::DecodeChar(wxStringImpl::const_iterator i)
 437 {
 438     wxASSERT( wxString::IsValidUtf8LeadByte(*i) ); // FIXME-UTF8: no "wxString::"
 439
 440     wxUniChar::value_type code = 0;
 441     size_t len = wxString::GetUtf8CharLength(*i);
 442     wxASSERT_MSG( len <= 4, _T("invalid UTF-8 sequence length") );
 443
 444     //    Char. number range   |        UTF-8 octet sequence
 445     //       (hexadecimal)     |              (binary)
 446     //   ----------------------+---------------------------------------------
 447     //   0000 0000 - 0000 007F | 0xxxxxxx
 448     //   0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 449     //   0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 450     //   0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 451     //
 452     //   Code point value is stored in bits marked with 'x', lowest-order bit
 453     //   of the value on the right side in the diagram above.
 454     //                                                        (from RFC 3629)
 455
 456     // mask to extract lead byte's value ('x' bits above), by sequence's length:
 457     static const unsigned char s_leadValueMask[4] =  { 0x7F, 0x1F, 0x0F, 0x07 };
 458 #ifdef __WXDEBUG__
 459     // mask and value of lead byte's most significant bits, by length:
 460     static const unsigned char s_leadMarkerMask[4] = { 0x80, 0xE0, 0xF0, 0xF8 };
 461     static const unsigned char s_leadMarkerVal[4] =  { 0x00, 0xC0, 0xE0, 0xF0 };
 462 #endif
 463
 464     // extract the lead byte's value bits:
 465     wxASSERT_MSG( ((unsigned char)*i & s_leadMarkerMask[len-1]) ==
 466                   s_leadMarkerVal[len-1],
 467                   _T("invalid UTF-8 lead byte") );
 468     code = (unsigned char)*i & s_leadValueMask[len-1];
 469
 470     // all remaining bytes, if any, are handled in the same way regardless of
 471     // sequence's length:
 472     for ( ++i ; len > 1; --len, ++i )
 473     {
 474         wxASSERT_MSG( ((unsigned char)*i & 0xC0) == 0x80,
 475                       _T("invalid UTF-8 byte") );
 476
 477         code <<= 6;
 478         code |= (unsigned char)*i & 0x3F;
 479     }
 480
 481     return wxUniChar(code);
 482 }
 483
 484 /* static */
 485 wxCharBuffer wxString::EncodeNChars(size_t n, wxUniChar ch)
 486 {
 487     Utf8CharBuffer once(EncodeChar(ch));
 488     // the IncIter() table can be used to determine the length of ch's encoding:
 489     size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
 490
 491     wxCharBuffer buf(n * len);
 492     char *ptr = buf.data();
 493     for ( size_t i = 0; i < n; i++, ptr += len )
 494     {
 495         memcpy(ptr, once.data, len);
 496     }
 497
 498     return buf;
 499 }
 500
 501
 502 void wxString::PosLenToImpl(size_t pos, size_t len,
 503                             size_t *implPos, size_t *implLen) const
 504 {
 505     if ( pos == npos )
 506         *implPos = npos;
 507     else
 508     {
 509         const_iterator i = begin() + pos;
 510         *implPos = wxStringImpl::const_iterator(i.impl()) - m_impl.begin();
 511         if ( len == npos )
 512             *implLen = npos;
 513         else
 514         {
 515             // too large length is interpreted as "to the end of the string"
 516             // FIXME-UTF8: verify this is the case in std::string, assert
 517             // otherwise
 518             if ( pos + len > length() )
 519                 len = length() - pos;
 520
 521             *implLen = (i + len).impl() - i.impl();
 522         }
 523     }
 524 }
 525
 526 #endif // wxUSE_UNICODE_UTF8
 527
 528 // ----------------------------------------------------------------------------
 529 // wxCStrData converted strings caching
 530 // ----------------------------------------------------------------------------
 531
 532 // FIXME-UTF8: temporarily disabled because it doesn't work with global
 533 //             string objects; re-enable after fixing this bug and benchmarking
 534 //             performance to see if using a hash is a good idea at all
 535 #if 0
 536
 537 // For backward compatibility reasons, it must be possible to assign the value
 538 // returned by wxString::c_str() to a char* or wchar_t* variable and work with
 539 // it. Returning wxCharBuffer from (const char*)c_str() wouldn't do the trick,
 540 // because the memory would be freed immediately, but it has to be valid as long
 541 // as the string is not modified, so that code like this still works:
 542 //
 543 // const wxChar *s = str.c_str();
 544 // while ( s ) { ... }
 545
 546 // FIXME-UTF8: not thread safe!
 547 // FIXME-UTF8: we currently clear the cached conversion only when the string is
 548 //             destroyed, but we should do it when the string is modified, to
 549 //             keep memory usage down
 550 // FIXME-UTF8: we do the conversion every time As[W]Char() is called, but if we
 551 //             invalidated the cache on every change, we could keep the previous
 552 //             conversion
 553 // FIXME-UTF8: add tracing of usage of these two methods - new code is supposed
 554 //             to use mb_str() or wc_str() instead of (const [w]char*)c_str()
 555
 556 template<typename T>
 557 static inline void DeleteStringFromConversionCache(T& hash, const wxString *s)
 558 {
 559     typename T::iterator i = hash.find(wxConstCast(s, wxString));
 560     if ( i != hash.end() )
 561     {
 562         free(i->second);
 563         hash.erase(i);
 564     }
 565 }
 566
 567 #if wxUSE_UNICODE
// NB: non-STL implementation doesn't compile with "const wxString*" key type,
//     so we have to use wxString* here and const-cast when used
WX_DECLARE_HASH_MAP(wxString*, char*, wxPointerHash, wxPointerEqual,
                    wxStringCharConversionCache);
// cache of narrow conversions, keyed by the address of the string converted
static wxStringCharConversionCache gs_stringsCharCache;

// Converts the string to its narrow representation, stores the result in the
// global cache (replacing any previous entry for this string) and returns a
// pointer into the cached buffer at m_offset.
const char* wxCStrData::AsChar() const
{
    // remove previously cached value, if any (see FIXMEs above):
    DeleteStringFromConversionCache(gs_stringsCharCache, m_str);

    // convert the string and keep it:
    const char *s = gs_stringsCharCache[wxConstCast(m_str, wxString)] =
        m_str->mb_str().release();

    return s + m_offset;
}
 585 #endif // wxUSE_UNICODE
 586
 587 #if !wxUSE_UNICODE_WCHAR
WX_DECLARE_HASH_MAP(wxString*, wchar_t*, wxPointerHash, wxPointerEqual,
                    wxStringWCharConversionCache);
// cache of wide conversions, keyed by the address of the string converted
static wxStringWCharConversionCache gs_stringsWCharCache;

// Converts the string to its wide representation, stores the result in the
// global cache (replacing any previous entry for this string) and returns a
// pointer into the cached buffer at m_offset.
const wchar_t* wxCStrData::AsWChar() const
{
    // remove previously cached value, if any (see FIXMEs above):
    DeleteStringFromConversionCache(gs_stringsWCharCache, m_str);

    // convert the string and keep it:
    const wchar_t *s = gs_stringsWCharCache[wxConstCast(m_str, wxString)] =
        m_str->wc_str().release();

    return s + m_offset;
}
 603 #endif // !wxUSE_UNICODE_WCHAR
 604
// Destructor of the cache-based implementation: drops the conversions cached
// for this string, if any, from the global caches used in this build.
wxString::~wxString()
{
#if wxUSE_UNICODE
    // FIXME-UTF8: do this only if locale is not UTF8 if wxUSE_UNICODE_UTF8
    DeleteStringFromConversionCache(gs_stringsCharCache, this);
#endif
#if !wxUSE_UNICODE_WCHAR
    DeleteStringFromConversionCache(gs_stringsWCharCache, this);
#endif
}
 615 #endif
 616
 617 #if wxUSE_UNICODE
 618 const char* wxCStrData::AsChar() const
 619 {
 620     wxString *str = wxConstCast(m_str, wxString);
 621
 622     // convert the string:
 623     wxCharBuffer buf(str->mb_str());
 624
 625     // FIXME-UTF8: do the conversion in-place in the existing buffer
 626     if ( str->m_convertedToChar &&
 627          strlen(buf) == strlen(str->m_convertedToChar) )
 628     {
 629         // keep the same buffer for as long as possible, so that several calls
 630         // to c_str() in a row still work:
 631         strcpy(str->m_convertedToChar, buf);
 632     }
 633     else
 634     {
 635         str->m_convertedToChar = buf.release();
 636     }
 637
 638     // and keep it:
 639     return str->m_convertedToChar + m_offset;
 640 }
 641 #endif // wxUSE_UNICODE
 642
 643 #if !wxUSE_UNICODE_WCHAR
 644 const wchar_t* wxCStrData::AsWChar() const
 645 {
 646     wxString *str = wxConstCast(m_str, wxString);
 647
 648     // convert the string:
 649     wxWCharBuffer buf(str->wc_str());
 650
 651     // FIXME-UTF8: do the conversion in-place in the existing buffer
 652     if ( str->m_convertedToWChar &&
 653          wxWcslen(buf) == wxWcslen(str->m_convertedToWChar) )
 654     {
 655         // keep the same buffer for as long as possible, so that several calls
 656         // to c_str() in a row still work:
 657         memcpy(str->m_convertedToWChar, buf, sizeof(wchar_t) * wxWcslen(buf));
 658     }
 659     else
 660     {
 661         str->m_convertedToWChar = buf.release();
 662     }
 663
 664     // and keep it:
 665     return str->m_convertedToWChar + m_offset;
 666 }
 667 #endif // !wxUSE_UNICODE_WCHAR
 668
 669 // ===========================================================================
 670 // wxString class core
 671 // ===========================================================================
 672
 673 // ---------------------------------------------------------------------------
 674 // construction and conversion
 675 // ---------------------------------------------------------------------------
 676
 677 #if wxUSE_UNICODE_WCHAR
 678 /* static */
 679 wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength,
 680                                                const wxMBConv& conv)
 681 {
 682     // anything to do?
 683     if ( !psz || nLength == 0 )
 684         return SubstrBufFromMB(L"", 0);
 685
 686     if ( nLength == npos )
 687         nLength = wxNO_LEN;
 688
 689     size_t wcLen;
 690     wxWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen));
 691     if ( !wcLen )
 692         return SubstrBufFromMB(_T(""), 0);
 693     else
 694         return SubstrBufFromMB(wcBuf, wcLen);
 695 }
 696 #endif // wxUSE_UNICODE_WCHAR
 697
 698 #if wxUSE_UNICODE_UTF8
 699 /* static */
 700 wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength,
 701                                                const wxMBConv& conv)
 702 {
 703     // FIXME-UTF8: return as-is without copying under UTF8 locale, return
 704     //             converted string under other locales - needs wxCharBuffer
 705     //             changes
 706
 707     // anything to do?
 708     if ( !psz || nLength == 0 )
 709         return SubstrBufFromMB("", 0);
 710
 711     if ( nLength == npos )
 712         nLength = wxNO_LEN;
 713
 714     // first convert to wide string:
 715     size_t wcLen;
 716     wxWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen));
 717     if ( !wcLen )
 718         return SubstrBufFromMB("", 0);
 719
 720     // and then to UTF-8:
 721     SubstrBufFromMB buf(ConvertStr(wcBuf, wcLen, wxConvUTF8));
 722     // widechar -> UTF-8 conversion isn't supposed to ever fail:
 723     wxASSERT_MSG( buf.data, _T("conversion to UTF-8 failed") );
 724
 725     return buf;
 726 }
 727 #endif // wxUSE_UNICODE_UTF8
 728
 729 #if wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE
 730 /* static */
 731 wxString::SubstrBufFromWC wxString::ConvertStr(const wchar_t *pwz, size_t nLength,
 732                                                const wxMBConv& conv)
 733 {
 734     // anything to do?
 735     if ( !pwz || nLength == 0 )
 736         return SubstrBufFromWC("", 0);
 737
 738     if ( nLength == npos )
 739         nLength = wxNO_LEN;
 740
 741     size_t mbLen;
 742     wxCharBuffer mbBuf(conv.cWC2MB(pwz, nLength, &mbLen));
 743     if ( !mbLen )
 744         return SubstrBufFromWC("", 0);
 745     else
 746         return SubstrBufFromWC(mbBuf, mbLen);
 747 }
 748 #endif // wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE
 749
 750
 751 #if wxUSE_UNICODE_WCHAR
 752
 753 //Convert wxString in Unicode mode to a multi-byte string
 754 const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const
 755 {
 756     return conv.cWC2MB(wx_str(), length() + 1 /* size, not length */, NULL);
 757 }
 758
 759 #elif wxUSE_UNICODE_UTF8
 760
 761 const wxWCharBuffer wxString::wc_str() const
 762 {
 763     return wxConvUTF8.cMB2WC(m_impl.c_str(),
 764                              m_impl.length() + 1 /* size, not length */,
 765                              NULL);
 766 }
 767
 768 const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const
 769 {
 770     // FIXME-UTF8: optimize the case when conv==wxConvUTF8 or wxConvLibc
 771     //             under UTF8 locale
 772     // FIXME-UTF8: use wc_str() here once we have buffers with length
 773
 774     size_t wcLen;
 775     wxWCharBuffer wcBuf(
 776             wxConvUTF8.cMB2WC(m_impl.c_str(),
 777                               m_impl.length() + 1 /* size, not length */,
 778                               &wcLen));
 779     if ( !wcLen )
 780         return wxCharBuffer("");
 781
 782     return conv.cWC2MB(wcBuf, wcLen, NULL);
 783 }
 784
 785 #else // ANSI
 786
 787 //Converts this string to a wide character string if unicode
 788 //mode is not enabled and wxUSE_WCHAR_T is enabled
 789 const wxWCharBuffer wxString::wc_str(const wxMBConv& conv) const
 790 {
 791     return conv.cMB2WC(wx_str(), length() + 1 /* size, not length */, NULL);
 792 }
 793
 794 #endif // Unicode/ANSI
 795
 796 // shrink to minimal size (releasing extra memory)
 797 bool wxString::Shrink()
 798 {
 799   wxString tmp(begin(), end());
 800   swap(tmp);
 801   return tmp.length() == length();
 802 }
 803
 804 // deprecated compatibility code:
 805 #if WXWIN_COMPATIBILITY_2_8 && !wxUSE_STL_BASED_WXSTRING && !wxUSE_UNICODE_UTF8
// Compatibility wrapper: forwards to DoGetWriteBuf() and returns its result.
wxChar *wxString::GetWriteBuf(size_t nLen)
{
    return DoGetWriteBuf(nLen);
}

// Compatibility wrapper: forwards to DoUngetWriteBuf() without a length.
void wxString::UngetWriteBuf()
{
    DoUngetWriteBuf();
}

// Compatibility wrapper: forwards to DoUngetWriteBuf() with the given length.
void wxString::UngetWriteBuf(size_t nLen)
{
    DoUngetWriteBuf(nLen);
}
 820 #endif // WXWIN_COMPATIBILITY_2_8 && !wxUSE_STL_BASED_WXSTRING && !wxUSE_UNICODE_UTF8
 821
 822
 823 // ---------------------------------------------------------------------------
 824 // data access
 825 // ---------------------------------------------------------------------------
 826
 827 // all functions are inline in string.h
 828
 829 // ---------------------------------------------------------------------------
 830 // concatenation operators
 831 // ---------------------------------------------------------------------------
 832
 833 /*
 834  * concatenation functions come in 5 flavours:
 835  *  string + string
 836  *  char   + string      and      string + char
 837  *  C str  + string      and      string + C str
 838  */
 839
 840 wxString operator+(const wxString& str1, const wxString& str2)
 841 {
 842 #if !wxUSE_STL_BASED_WXSTRING
 843     wxASSERT( str1.IsValid() );
 844     wxASSERT( str2.IsValid() );
 845 #endif
 846
 847     wxString s = str1;
 848     s += str2;
 849
 850     return s;
 851 }
 852
 853 wxString operator+(const wxString& str, wxUniChar ch)
 854 {
 855 #if !wxUSE_STL_BASED_WXSTRING
 856     wxASSERT( str.IsValid() );
 857 #endif
 858
 859     wxString s = str;
 860     s += ch;
 861
 862     return s;
 863 }
 864
 865 wxString operator+(wxUniChar ch, const wxString& str)
 866 {
 867 #if !wxUSE_STL_BASED_WXSTRING
 868     wxASSERT( str.IsValid() );
 869 #endif
 870
 871     wxString s = ch;
 872     s += str;
 873
 874     return s;
 875 }
 876
 877 wxString operator+(const wxString& str, const char *psz)
 878 {
 879 #if !wxUSE_STL_BASED_WXSTRING
 880     wxASSERT( str.IsValid() );
 881 #endif
 882
 883     wxString s;
 884     if ( !s.Alloc(strlen(psz) + str.length()) ) {
 885         wxFAIL_MSG( _T("out of memory in wxString::operator+") );
 886     }
 887     s += str;
 888     s += psz;
 889
 890     return s;
 891 }
 892
 893 wxString operator+(const wxString& str, const wchar_t *pwz)
 894 {
 895 #if !wxUSE_STL_BASED_WXSTRING
 896     wxASSERT( str.IsValid() );
 897 #endif
 898
 899     wxString s;
 900     if ( !s.Alloc(wxWcslen(pwz) + str.length()) ) {
 901         wxFAIL_MSG( _T("out of memory in wxString::operator+") );
 902     }
 903     s += str;
 904     s += pwz;
 905
 906     return s;
 907 }
 908
 909 wxString operator+(const char *psz, const wxString& str)
 910 {
 911 #if !wxUSE_STL_BASED_WXSTRING
 912     wxASSERT( str.IsValid() );
 913 #endif
 914
 915     wxString s;
 916     if ( !s.Alloc(strlen(psz) + str.length()) ) {
 917         wxFAIL_MSG( _T("out of memory in wxString::operator+") );
 918     }
 919     s = psz;
 920     s += str;
 921
 922     return s;
 923 }
 924
 925 wxString operator+(const wchar_t *pwz, const wxString& str)
 926 {
 927 #if !wxUSE_STL_BASED_WXSTRING
 928     wxASSERT( str.IsValid() );
 929 #endif
 930
 931     wxString s;
 932     if ( !s.Alloc(wxWcslen(pwz) + str.length()) ) {
 933         wxFAIL_MSG( _T("out of memory in wxString::operator+") );
 934     }
 935     s = pwz;
 936     s += str;
 937
 938     return s;
 939 }
 940
 941 // ---------------------------------------------------------------------------
 942 // string comparison
 943 // ---------------------------------------------------------------------------
 944
 945 #ifdef HAVE_STD_STRING_COMPARE
 946
 947 // NB: Comparison code (both if HAVE_STD_STRING_COMPARE and if not) works with
 948 //     UTF-8 encoded strings too, thanks to UTF-8's design which allows us to
 949 //     sort strings in characters code point order by sorting the byte sequence
 950 //     in byte values order (i.e. what strcmp() and memcmp() do).
 951
 952 int wxString::compare(const wxString& str) const
 953 {
 954     return m_impl.compare(str.m_impl);
 955 }
 956
 957 int wxString::compare(size_t nStart, size_t nLen,
 958                       const wxString& str) const
 959 {
 960     size_t pos, len;
 961     PosLenToImpl(nStart, nLen, &pos, &len);
 962     return m_impl.compare(pos, len, str.m_impl);
 963 }
 964
 965 int wxString::compare(size_t nStart, size_t nLen,
 966                       const wxString& str,
 967                       size_t nStart2, size_t nLen2) const
 968 {
 969     size_t pos, len;
 970     PosLenToImpl(nStart, nLen, &pos, &len);
 971
 972     size_t pos2, len2;
 973     str.PosLenToImpl(nStart2, nLen2, &pos2, &len2);
 974
 975     return m_impl.compare(pos, len, str.m_impl, pos2, len2);
 976 }
 977
 978 int wxString::compare(const char* sz) const
 979 {
 980     return m_impl.compare(ImplStr(sz));
 981 }
 982
 983 int wxString::compare(const wchar_t* sz) const
 984 {
 985     return m_impl.compare(ImplStr(sz));
 986 }
 987
 988 int wxString::compare(size_t nStart, size_t nLen,
 989                       const char* sz, size_t nCount) const
 990 {
 991     size_t pos, len;
 992     PosLenToImpl(nStart, nLen, &pos, &len);
 993
 994     SubstrBufFromMB str(ImplStr(sz, nCount));
 995
 996     return m_impl.compare(pos, len, str.data, str.len);
 997 }
 998
 999 int wxString::compare(size_t nStart, size_t nLen,
1000                       const wchar_t* sz, size_t nCount) const
1001 {
1002     size_t pos, len;
1003     PosLenToImpl(nStart, nLen, &pos, &len);
1004
1005     SubstrBufFromWC str(ImplStr(sz, nCount));
1006
1007     return m_impl.compare(pos, len, str.data, str.len);
1008 }
1009
1010 #else // !HAVE_STD_STRING_COMPARE
1011
1012 static inline int wxDoCmp(const wxStringCharType* s1, size_t l1,
1013                           const wxStringCharType* s2, size_t l2)
1014 {
1015     if( l1 == l2 )
1016         return wxStringMemcmp(s1, s2, l1);
1017     else if( l1 < l2 )
1018     {
1019         int ret = wxStringMemcmp(s1, s2, l1);
1020         return ret == 0 ? -1 : ret;
1021     }
1022     else
1023     {
1024         int ret = wxStringMemcmp(s1, s2, l2);
1025         return ret == 0 ? +1 : ret;
1026     }
1027 }
1028
1029 int wxString::compare(const wxString& str) const
1030 {
1031     return ::wxDoCmp(m_impl.data(), m_impl.length(),
1032                      str.m_impl.data(), str.m_impl.length());
1033 }
1034
1035 int wxString::compare(size_t nStart, size_t nLen,
1036                       const wxString& str) const
1037 {
1038     wxASSERT(nStart <= length());
1039     size_type strLen = length() - nStart;
1040     nLen = strLen < nLen ? strLen : nLen;
1041
1042     size_t pos, len;
1043     PosLenToImpl(nStart, nLen, &pos, &len);
1044
1045     return ::wxDoCmp(m_impl.data() + pos,  len,
1046                      str.m_impl.data(), str.m_impl.length());
1047 }
1048
1049 int wxString::compare(size_t nStart, size_t nLen,
1050                       const wxString& str,
1051                       size_t nStart2, size_t nLen2) const
1052 {
1053     wxASSERT(nStart <= length());
1054     wxASSERT(nStart2 <= str.length());
1055     size_type strLen  =     length() - nStart,
1056               strLen2 = str.length() - nStart2;
1057     nLen  = strLen  < nLen  ? strLen  : nLen;
1058     nLen2 = strLen2 < nLen2 ? strLen2 : nLen2;
1059
1060     size_t pos, len;
1061     PosLenToImpl(nStart, nLen, &pos, &len);
1062     size_t pos2, len2;
1063     str.PosLenToImpl(nStart2, nLen2, &pos2, &len2);
1064
1065     return ::wxDoCmp(m_impl.data() + pos, len,
1066                      str.m_impl.data() + pos2, len2);
1067 }
1068
1069 int wxString::compare(const char* sz) const
1070 {
1071     SubstrBufFromMB str(ImplStr(sz, npos));
1072     if ( str.len == npos )
1073         str.len = wxStringStrlen(str.data);
1074     return ::wxDoCmp(m_impl.data(), m_impl.length(), str.data, str.len);
1075 }
1076
1077 int wxString::compare(const wchar_t* sz) const
1078 {
1079     SubstrBufFromWC str(ImplStr(sz, npos));
1080     if ( str.len == npos )
1081         str.len = wxStringStrlen(str.data);
1082     return ::wxDoCmp(m_impl.data(), m_impl.length(), str.data, str.len);
1083 }
1084
1085 int wxString::compare(size_t nStart, size_t nLen,
1086                       const char* sz, size_t nCount) const
1087 {
1088     wxASSERT(nStart <= length());
1089     size_type strLen = length() - nStart;
1090     nLen = strLen < nLen ? strLen : nLen;
1091
1092     size_t pos, len;
1093     PosLenToImpl(nStart, nLen, &pos, &len);
1094
1095     SubstrBufFromMB str(ImplStr(sz, nCount));
1096     if ( str.len == npos )
1097         str.len = wxStringStrlen(str.data);
1098
1099     return ::wxDoCmp(m_impl.data() + pos, len, str.data, str.len);
1100 }
1101
1102 int wxString::compare(size_t nStart, size_t nLen,
1103                       const wchar_t* sz, size_t nCount) const
1104 {
1105     wxASSERT(nStart <= length());
1106     size_type strLen = length() - nStart;
1107     nLen = strLen < nLen ? strLen : nLen;
1108
1109     size_t pos, len;
1110     PosLenToImpl(nStart, nLen, &pos, &len);
1111
1112     SubstrBufFromWC str(ImplStr(sz, nCount));
1113     if ( str.len == npos )
1114         str.len = wxStringStrlen(str.data);
1115
1116     return ::wxDoCmp(m_impl.data() + pos, len, str.data, str.len);
1117 }
1118
1119 #endif // HAVE_STD_STRING_COMPARE/!HAVE_STD_STRING_COMPARE
1120
1121
1122 // ---------------------------------------------------------------------------
1123 // find_{first,last}_[not]_of functions
1124 // ---------------------------------------------------------------------------
1125
1126 #if !wxUSE_STL_BASED_WXSTRING || wxUSE_UNICODE_UTF8
1127
1128 // NB: All these functions are implemented  with the argument being wxChar*,
1129 //     i.e. widechar string in any Unicode build, even though native string
1130 //     representation is char* in the UTF-8 build. This is because we couldn't
1131 //     use memchr() to determine if a character is in a set encoded as UTF-8.
1132
1133 size_t wxString::find_first_of(const wxChar* sz, size_t nStart) const
1134 {
1135     return find_first_of(sz, nStart, wxStrlen(sz));
1136 }
1137
1138 size_t wxString::find_first_not_of(const wxChar* sz, size_t nStart) const
1139 {
1140     return find_first_not_of(sz, nStart, wxStrlen(sz));
1141 }
1142
1143 size_t wxString::find_first_of(const wxChar* sz, size_t nStart, size_t n) const
1144 {
1145     wxASSERT_MSG( nStart <= length(),  _T("invalid index") );
1146
1147     size_t idx = nStart;
1148     for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i )
1149     {
1150         if ( wxTmemchr(sz, *i, n) )
1151             return idx;
1152     }
1153
1154     return npos;
1155 }
1156
1157 size_t wxString::find_first_not_of(const wxChar* sz, size_t nStart, size_t n) const
1158 {
1159     wxASSERT_MSG( nStart <= length(),  _T("invalid index") );
1160
1161     size_t idx = nStart;
1162     for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i )
1163     {
1164         if ( !wxTmemchr(sz, *i, n) )
1165             return idx;
1166     }
1167
1168     return npos;
1169 }
1170
1171
1172 size_t wxString::find_last_of(const wxChar* sz, size_t nStart) const
1173 {
1174     return find_last_of(sz, nStart, wxStrlen(sz));
1175 }
1176
1177 size_t wxString::find_last_not_of(const wxChar* sz, size_t nStart) const
1178 {
1179     return find_last_not_of(sz, nStart, wxStrlen(sz));
1180 }
1181
1182 size_t wxString::find_last_of(const wxChar* sz, size_t nStart, size_t n) const
1183 {
1184     size_t len = length();
1185
1186     if ( nStart == npos )
1187     {
1188         nStart = len - 1;
1189     }
1190     else
1191     {
1192         wxASSERT_MSG( nStart <= len, _T("invalid index") );
1193     }
1194
1195     size_t idx = nStart;
1196     for ( const_reverse_iterator i = rbegin() + (len - nStart - 1);
1197           i != rend(); --idx, ++i )
1198     {
1199         if ( wxTmemchr(sz, *i, n) )
1200             return idx;
1201     }
1202
1203     return npos;
1204 }
1205
1206 size_t wxString::find_last_not_of(const wxChar* sz, size_t nStart, size_t n) const
1207 {
1208     size_t len = length();
1209
1210     if ( nStart == npos )
1211     {
1212         nStart = len - 1;
1213     }
1214     else
1215     {
1216         wxASSERT_MSG( nStart <= len, _T("invalid index") );
1217     }
1218
1219     size_t idx = nStart;
1220     for ( const_reverse_iterator i = rbegin() + (len - nStart - 1);
1221           i != rend(); --idx, ++i )
1222     {
1223         if ( !wxTmemchr(sz, *i, n) )
1224             return idx;
1225     }
1226
1227     return npos;
1228 }
1229
1230 size_t wxString::find_first_not_of(wxUniChar ch, size_t nStart) const
1231 {
1232     wxASSERT_MSG( nStart <= length(),  _T("invalid index") );
1233
1234     size_t idx = nStart;
1235     for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i )
1236     {
1237         if ( *i != ch )
1238             return idx;
1239     }
1240
1241     return npos;
1242 }
1243
1244 size_t wxString::find_last_not_of(wxUniChar ch, size_t nStart) const
1245 {
1246     size_t len = length();
1247
1248     if ( nStart == npos )
1249     {
1250         nStart = len - 1;
1251     }
1252     else
1253     {
1254         wxASSERT_MSG( nStart <= len, _T("invalid index") );
1255     }
1256
1257     size_t idx = nStart;
1258     for ( const_reverse_iterator i = rbegin() + (len - nStart - 1);
1259           i != rend(); --idx, ++i )
1260     {
1261         if ( *i != ch )
1262             return idx;
1263     }
1264
1265     return npos;
1266 }
1267
1268 // the functions above were implemented for wchar_t* arguments in Unicode
1269 // build and char* in ANSI build; below are implementations for the other
1270 // version:
// wxOtherCharType is the character type that is NOT wxChar in the current
// build, and STRCONV is the wxConvLibc call whose result is cast to
// "const wxChar*" so that the wxChar-based overloads can be reused.
#if wxUSE_UNICODE
    #define wxOtherCharType char
    #define STRCONV         (const wxChar*)wxConvLibc.cMB2WC
#else
    #define wxOtherCharType wchar_t
    #define STRCONV         (const wxChar*)wxConvLibc.cWC2MB
#endif

// Each overload below converts its argument with STRCONV and forwards to the
// wxChar-based function of the same name.
//
// NOTE(review): the overloads taking a count pass the same n both to the
// conversion and to the wxChar overload; if the converted text has a
// different number of characters than the input, n may not describe the
// converted buffer correctly -- confirm against the wxMBConv API.

size_t wxString::find_first_of(const wxOtherCharType* sz, size_t nStart) const
    { return find_first_of(STRCONV(sz), nStart); }

size_t wxString::find_first_of(const wxOtherCharType* sz, size_t nStart,
                               size_t n) const
    { return find_first_of(STRCONV(sz, n, NULL), nStart, n); }
size_t wxString::find_last_of(const wxOtherCharType* sz, size_t nStart) const
    { return find_last_of(STRCONV(sz), nStart); }
size_t wxString::find_last_of(const wxOtherCharType* sz, size_t nStart,
                              size_t n) const
    { return find_last_of(STRCONV(sz, n, NULL), nStart, n); }
size_t wxString::find_first_not_of(const wxOtherCharType* sz, size_t nStart) const
    { return find_first_not_of(STRCONV(sz), nStart); }
size_t wxString::find_first_not_of(const wxOtherCharType* sz, size_t nStart,
                                   size_t n) const
    { return find_first_not_of(STRCONV(sz, n, NULL), nStart, n); }
size_t wxString::find_last_not_of(const wxOtherCharType* sz, size_t nStart) const
    { return find_last_not_of(STRCONV(sz), nStart); }
size_t wxString::find_last_not_of(const wxOtherCharType* sz, size_t nStart,
                                  size_t n) const
    { return find_last_not_of(STRCONV(sz, n, NULL), nStart, n); }

// the helper macros are local to this group of overloads
#undef wxOtherCharType
#undef STRCONV
1303
1304 #endif // !wxUSE_STL_BASED_WXSTRING || wxUSE_UNICODE_UTF8
1305
1306 // ===========================================================================
1307 // other common string functions
1308 // ===========================================================================
1309
1310 int wxString::CmpNoCase(const wxString& s) const
1311 {
1312     // FIXME-UTF8: use wxUniChar::ToLower/ToUpper once added
1313
1314     size_t idx = 0;
1315     const_iterator i1 = begin();
1316     const_iterator end1 = end();
1317     const_iterator i2 = s.begin();
1318     const_iterator end2 = s.end();
1319
1320     for ( ; i1 != end1 && i2 != end2; ++idx, ++i1, ++i2 )
1321     {
1322         wxUniChar lower1 = (wxChar)wxTolower(*i1);
1323         wxUniChar lower2 = (wxChar)wxTolower(*i2);
1324         if ( lower1 != lower2 )
1325             return lower1 < lower2 ? -1 : 1;
1326     }
1327
1328     size_t len1 = length();
1329     size_t len2 = s.length();
1330
1331     if ( len1 < len2 )
1332         return -1;
1333     else if ( len1 > len2 )
1334         return 1;
1335     return 0;
1336 }
1337
1338
1339 #if wxUSE_UNICODE
1340
1341 #ifdef __MWERKS__
1342 #ifndef __SCHAR_MAX__
1343 #define __SCHAR_MAX__ 127
1344 #endif
1345 #endif
1346
1347 wxString wxString::FromAscii(const char *ascii)
1348 {
1349     if (!ascii)
1350        return wxEmptyString;
1351
1352     size_t len = strlen( ascii );
1353     wxString res;
1354
1355     if ( len )
1356     {
1357         wxStringBuffer buf(res, len);
1358
1359         wchar_t *dest = buf;
1360
1361         for ( ;; )
1362         {
1363            if ( (*dest++ = (wchar_t)(unsigned char)*ascii++) == L'\0' )
1364                break;
1365         }
1366     }
1367
1368     return res;
1369 }
1370
1371 wxString wxString::FromAscii(const char ascii)
1372 {
1373     // What do we do with '\0' ?
1374
1375     wxString res;
1376     res += (wchar_t)(unsigned char) ascii;
1377
1378     return res;
1379 }
1380
1381 const wxCharBuffer wxString::ToAscii() const
1382 {
1383     // this will allocate enough space for the terminating NUL too
1384     wxCharBuffer buffer(length());
1385
1386
1387     char *dest = buffer.data();
1388
1389     const wchar_t *pwc = c_str();
1390     for ( ;; )
1391     {
1392         *dest++ = (char)(*pwc > SCHAR_MAX ? wxT('_') : *pwc);
1393
1394         // the output string can't have embedded NULs anyhow, so we can safely
1395         // stop at first of them even if we do have any
1396         if ( !*pwc++ )
1397             break;
1398     }
1399
1400     return buffer;
1401 }
1402
1403 #endif // Unicode
1404
1405 // extract string of length nCount starting at nFirst
1406 wxString wxString::Mid(size_t nFirst, size_t nCount) const
1407 {
1408     size_t nLen = length();
1409
1410     // default value of nCount is npos and means "till the end"
1411     if ( nCount == npos )
1412     {
1413         nCount = nLen - nFirst;
1414     }
1415
1416     // out-of-bounds requests return sensible things
1417     if ( nFirst + nCount > nLen )
1418     {
1419         nCount = nLen - nFirst;
1420     }
1421
1422     if ( nFirst > nLen )
1423     {
1424         // AllocCopy() will return empty string
1425         return wxEmptyString;
1426     }
1427
1428     wxString dest(*this, nFirst, nCount);
1429     if ( dest.length() != nCount )
1430     {
1431         wxFAIL_MSG( _T("out of memory in wxString::Mid") );
1432     }
1433
1434     return dest;
1435 }
1436
1437 // check that the string starts with prefix and return the rest of the string
1438 // in the provided pointer if it is not NULL, otherwise return false
1439 bool wxString::StartsWith(const wxChar *prefix, wxString *rest) const
1440 {
1441     wxASSERT_MSG( prefix, _T("invalid parameter in wxString::StartsWith") );
1442
1443     // first check if the beginning of the string matches the prefix: note
1444     // that we don't have to check that we don't run out of this string as
1445     // when we reach the terminating NUL, either prefix string ends too (and
1446     // then it's ok) or we break out of the loop because there is no match
1447     const wxChar *p = c_str();
1448     while ( *prefix )
1449     {
1450         if ( *prefix++ != *p++ )
1451         {
1452             // no match
1453             return false;
1454         }
1455     }
1456
1457     if ( rest )
1458     {
1459         // put the rest of the string into provided pointer
1460         *rest = p;
1461     }
1462
1463     return true;
1464 }
1465
1466
1467 // check that the string ends with suffix and return the rest of it in the
1468 // provided pointer if it is not NULL, otherwise return false
1469 bool wxString::EndsWith(const wxChar *suffix, wxString *rest) const
1470 {
1471     wxASSERT_MSG( suffix, _T("invalid parameter in wxString::EndssWith") );
1472
1473     int start = length() - wxStrlen(suffix);
1474
1475     if ( start < 0 || compare(start, npos, suffix) != 0 )
1476         return false;
1477
1478     if ( rest )
1479     {
1480         // put the rest of the string into provided pointer
1481         rest->assign(*this, 0, start);
1482     }
1483
1484     return true;
1485 }
1486
1487
1488 // extract nCount last (rightmost) characters
1489 wxString wxString::Right(size_t nCount) const
1490 {
1491   if ( nCount > length() )
1492     nCount = length();
1493
1494   wxString dest(*this, length() - nCount, nCount);
1495   if ( dest.length() != nCount ) {
1496     wxFAIL_MSG( _T("out of memory in wxString::Right") );
1497   }
1498   return dest;
1499 }
1500
1501 // get all characters after the last occurence of ch
1502 // (returns the whole string if ch not found)
1503 wxString wxString::AfterLast(wxUniChar ch) const
1504 {
1505   wxString str;
1506   int iPos = Find(ch, true);
1507   if ( iPos == wxNOT_FOUND )
1508     str = *this;
1509   else
1510     str = wx_str() + iPos + 1;
1511
1512   return str;
1513 }
1514
1515 // extract nCount first (leftmost) characters
1516 wxString wxString::Left(size_t nCount) const
1517 {
1518   if ( nCount > length() )
1519     nCount = length();
1520
1521   wxString dest(*this, 0, nCount);
1522   if ( dest.length() != nCount ) {
1523     wxFAIL_MSG( _T("out of memory in wxString::Left") );
1524   }
1525   return dest;
1526 }
1527
1528 // get all characters before the first occurence of ch
1529 // (returns the whole string if ch not found)
1530 wxString wxString::BeforeFirst(wxUniChar ch) const
1531 {
1532   int iPos = Find(ch);
1533   if ( iPos == wxNOT_FOUND ) iPos = length();
1534   return wxString(*this, 0, iPos);
1535 }
1536
1537 /// get all characters before the last occurence of ch
1538 /// (returns empty string if ch not found)
1539 wxString wxString::BeforeLast(wxUniChar ch) const
1540 {
1541   wxString str;
1542   int iPos = Find(ch, true);
1543   if ( iPos != wxNOT_FOUND && iPos != 0 )
1544     str = wxString(c_str(), iPos);
1545
1546   return str;
1547 }
1548
1549 /// get all characters after the first occurence of ch
1550 /// (returns empty string if ch not found)
1551 wxString wxString::AfterFirst(wxUniChar ch) const
1552 {
1553   wxString str;
1554   int iPos = Find(ch);
1555   if ( iPos != wxNOT_FOUND )
1556     str = wx_str() + iPos + 1;
1557
1558   return str;
1559 }
1560
1561 // replace first (or all) occurences of some substring with another one
1562 size_t wxString::Replace(const wxString& strOld,
1563                          const wxString& strNew, bool bReplaceAll)
1564 {
1565     // if we tried to replace an empty string we'd enter an infinite loop below
1566     wxCHECK_MSG( !strOld.empty(), 0,
1567                  _T("wxString::Replace(): invalid parameter") );
1568
1569     size_t uiCount = 0;   // count of replacements made
1570
1571     size_t uiOldLen = strOld.length();
1572     size_t uiNewLen = strNew.length();
1573
1574     size_t dwPos = 0;
1575
1576     while ( (*this)[dwPos] != wxT('\0') )
1577     {
1578         //DO NOT USE STRSTR HERE
1579         //this string can contain embedded null characters,
1580         //so strstr will function incorrectly
1581         dwPos = find(strOld, dwPos);
1582         if ( dwPos == npos )
1583             break;                  // exit the loop
1584         else
1585         {
1586             //replace this occurance of the old string with the new one
1587             replace(dwPos, uiOldLen, strNew, uiNewLen);
1588
1589             //move up pos past the string that was replaced
1590             dwPos += uiNewLen;
1591
1592             //increase replace count
1593             ++uiCount;
1594
1595             // stop now?
1596             if ( !bReplaceAll )
1597                 break;                  // exit the loop
1598         }
1599     }
1600
1601     return uiCount;
1602 }
1603
1604 bool wxString::IsAscii() const
1605 {
1606     for ( const_iterator i = begin(); i != end(); ++i )
1607     {
1608         if ( !(*i).IsAscii() )
1609             return false;
1610     }
1611
1612     return true;
1613 }
1614
1615 bool wxString::IsWord() const
1616 {
1617     for ( const_iterator i = begin(); i != end(); ++i )
1618     {
1619         if ( !wxIsalpha(*i) )
1620             return false;
1621     }
1622
1623     return true;
1624 }
1625
1626 bool wxString::IsNumber() const
1627 {
1628     if ( empty() )
1629         return true;
1630
1631     const_iterator i = begin();
1632
1633     if ( *i == _T('-') || *i == _T('+') )
1634         ++i;
1635
1636     for ( ; i != end(); ++i )
1637     {
1638         if ( !wxIsdigit(*i) )
1639             return false;
1640     }
1641
1642     return true;
1643 }
1644
1645 wxString wxString::Strip(stripType w) const
1646 {
1647     wxString s = *this;
1648     if ( w & leading ) s.Trim(false);
1649     if ( w & trailing ) s.Trim(true);
1650     return s;
1651 }
1652
1653 // ---------------------------------------------------------------------------
1654 // case conversion
1655 // ---------------------------------------------------------------------------
1656
1657 wxString& wxString::MakeUpper()
1658 {
1659   for ( iterator it = begin(), en = end(); it != en; ++it )
1660     *it = (wxChar)wxToupper(*it);
1661
1662   return *this;
1663 }
1664
1665 wxString& wxString::MakeLower()
1666 {
1667   for ( iterator it = begin(), en = end(); it != en; ++it )
1668     *it = (wxChar)wxTolower(*it);
1669
1670   return *this;
1671 }
1672
1673 // ---------------------------------------------------------------------------
1674 // trimming and padding
1675 // ---------------------------------------------------------------------------
1676
1677 // some compilers (VC++ 6.0 not to name them) return true for a call to
1678 // isspace('ê') in the C locale which seems to be broken to me, but we have to
1679 // live with this by checking that the character is a 7 bit one - even if this
1680 // may fail to detect some spaces (I don't know if Unicode doesn't have
1681 // space-like symbols somewhere except in the first 128 chars), it is arguably
1682 // still better than trimming away accented letters
1683 inline int wxSafeIsspace(wxChar ch) { return (ch < 127) && wxIsspace(ch); }
1684
1685 // trims spaces (in the sense of isspace) from left or right side
1686 wxString& wxString::Trim(bool bFromRight)
1687 {
1688     // first check if we're going to modify the string at all
1689     if ( !empty() &&
1690          (
1691           (bFromRight && wxSafeIsspace(GetChar(length() - 1))) ||
1692           (!bFromRight && wxSafeIsspace(GetChar(0u)))
1693          )
1694        )
1695     {
1696         if ( bFromRight )
1697         {
1698             // find last non-space character
1699             reverse_iterator psz = rbegin();
1700             while ( (psz != rend()) && wxSafeIsspace(*psz) )
1701                 psz++;
1702
1703             // truncate at trailing space start
1704             erase(psz.base(), end());
1705         }
1706         else
1707         {
1708             // find first non-space character
1709             iterator psz = begin();
1710             while ( (psz != end()) && wxSafeIsspace(*psz) )
1711                 psz++;
1712
1713             // fix up data and length
1714             erase(begin(), psz);
1715         }
1716     }
1717
1718     return *this;
1719 }
1720
1721 // adds nCount characters chPad to the string from either side
1722 wxString& wxString::Pad(size_t nCount, wxUniChar chPad, bool bFromRight)
1723 {
1724     wxString s(chPad, nCount);
1725
1726     if ( bFromRight )
1727         *this += s;
1728     else
1729     {
1730         s += *this;
1731         swap(s);
1732     }
1733
1734     return *this;
1735 }
1736
1737 // truncate the string
1738 wxString& wxString::Truncate(size_t uiLen)
1739 {
1740     if ( uiLen < length() )
1741     {
1742         erase(begin() + uiLen, end());
1743     }
1744     //else: nothing to do, string is already short enough
1745
1746     return *this;
1747 }
1748
1749 // ---------------------------------------------------------------------------
1750 // finding (return wxNOT_FOUND if not found and index otherwise)
1751 // ---------------------------------------------------------------------------
1752
1753 // find a character
1754 int wxString::Find(wxUniChar ch, bool bFromEnd) const
1755 {
1756     size_type idx = bFromEnd ? find_last_of(ch) : find_first_of(ch);
1757
1758     return (idx == npos) ? wxNOT_FOUND : (int)idx;
1759 }
1760
1761 // ----------------------------------------------------------------------------
1762 // conversion to numbers
1763 // ----------------------------------------------------------------------------
1764
1765 // the implementation of all the functions below is exactly the same so factor
1766 // it out
1767
1768 template <typename T, typename F>
1769 bool wxStringToIntType(const wxChar *start,
1770                        T *val,
1771                        int base,
1772                        F func)
1773 {
1774     wxCHECK_MSG( val, false, _T("NULL output pointer") );
1775     wxASSERT_MSG( !base || (base > 1 && base <= 36), _T("invalid base") );
1776
1777 #ifndef __WXWINCE__
1778     errno = 0;
1779 #endif
1780
1781     wxChar *end;
1782     *val = (*func)(start, &end, base);
1783
1784     // return true only if scan was stopped by the terminating NUL and if the
1785     // string was not empty to start with and no under/overflow occurred
1786     return !*end && (end != start)
1787 #ifndef __WXWINCE__
1788         && (errno != ERANGE)
1789 #endif
1790     ;
1791 }
1792
1793 bool wxString::ToLong(long *val, int base) const
1794 {
1795     return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtol);
1796 }
1797
1798 bool wxString::ToULong(unsigned long *val, int base) const
1799 {
1800     return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtoul);
1801 }
1802
1803 bool wxString::ToLongLong(wxLongLong_t *val, int base) const
1804 {
1805     return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtoll);
1806 }
1807
1808 bool wxString::ToULongLong(wxULongLong_t *val, int base) const
1809 {
1810     return wxStringToIntType((const wxChar*)c_str(), val, base, wxStrtoull);
1811 }
1812
1813 bool wxString::ToDouble(double *val) const
1814 {
1815     wxCHECK_MSG( val, false, _T("NULL pointer in wxString::ToDouble") );
1816
1817 #ifndef __WXWINCE__
1818     errno = 0;
1819 #endif
1820
1821     const wxChar *start = c_str();
1822     wxChar *end;
1823     *val = wxStrtod(start, &end);
1824
1825     // return true only if scan was stopped by the terminating NUL and if the
1826     // string was not empty to start with and no under/overflow occurred
1827     return !*end && (end != start)
1828 #ifndef __WXWINCE__
1829         && (errno != ERANGE)
1830 #endif
1831     ;
1832 }
1833
1834 // ---------------------------------------------------------------------------
1835 // formatted output
1836 // ---------------------------------------------------------------------------
1837
// Create a new string from a printf()-style format and variable arguments.
// The function is defined as a member of wxStringPrintfMixinBase when
// wxNEEDS_WXSTRING_PRINTF_MIXIN is defined and of wxString otherwise; the
// body is the same in both cases.
/* static */
#ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
wxString wxStringPrintfMixinBase::DoFormat(const wxChar *format, ...)
#else
wxString wxString::DoFormat(const wxChar *format, ...)
#endif
{
    va_list argptr;
    va_start(argptr, format);

    // the actual formatting is done by PrintfV() on a fresh string
    wxString s;
    s.PrintfV(format, argptr);

    va_end(argptr);

    return s;
}
1855
1856 /* static */
1857 wxString wxString::FormatV(const wxString& format, va_list argptr)
1858 {
1859     wxString s;
1860     s.PrintfV(format, argptr);
1861     return s;
1862 }
1863
// Format into this string using a printf()-style format and variable
// arguments; returns the value returned by PrintfV(). The function is
// defined as a member of wxStringPrintfMixinBase when
// wxNEEDS_WXSTRING_PRINTF_MIXIN is defined and of wxString otherwise.
#ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
int wxStringPrintfMixinBase::DoPrintf(const wxChar *format, ...)
#else
int wxString::DoPrintf(const wxChar *format, ...)
#endif
{
    va_list argptr;
    va_start(argptr, format);

#ifdef wxNEEDS_WXSTRING_PRINTF_MIXIN
    // get a pointer to the wxString instance: "this" is the mixin base here,
    // so it is downcast to wxString with static_cast<>
    // NOTE(review): an earlier version of this comment said dynamic_cast<>
    // had to be used; the code uses static_cast<> -- confirm which is meant
    wxString *str = static_cast<wxString*>(this);
#else
    wxString *str = this;
#endif

    int iLen = str->PrintfV(format, argptr);

    va_end(argptr);

    return iLen;
}
1888
// Format into this string using a printf()-style format and an argument
// list.
//
// The output is produced directly in this string's buffer, which starts at
// 1024 characters and is enlarged until wxVsnprintf() reports that the
// result fitted. Returns the length of the resulting string, or -1 if the
// buffer could not be obtained or (with wxUSE_WXVSNPRINTF) the format was
// invalid.
int wxString::PrintfV(const wxString& format, va_list argptr)
{
    int size = 1024;

    for ( ;; )
    {
        // request size + 1 characters so that buf[size] below is writable
        wxStringBuffer tmp(*this, size + 1);
        wxChar *buf = tmp;

        if ( !buf )
        {
            // out of memory
            return -1;
        }

        // wxVsnprintf() may modify the original arg pointer, so pass it
        // only a copy
        va_list argptrcopy;
        wxVaCopy(argptrcopy, argptr);
        int len = wxVsnprintf(buf, size, (const wxChar*)/*FIXME-UTF8*/format, argptrcopy);
        va_end(argptrcopy);

        // some implementations of vsnprintf() don't NUL terminate
        // the string if there is not enough space for it so
        // always do it manually
        buf[size] = _T('\0');

        // vsnprintf() may return either -1 (traditional Unix behaviour) or the
        // total number of characters which would have been written if the
        // buffer were large enough (newer standards such as Unix98)
        if ( len < 0 )
        {
#if wxUSE_WXVSNPRINTF
            // we know that our own implementation of wxVsnprintf() returns -1
            // only for a format error - thus there's something wrong with
            // the user's format string
            return -1;
#else // assume that system version only returns error if not enough space
            // still not enough, as we don't know how much we need, double the
            // current size of the buffer
            // NOTE(review): nothing bounds this doubling, so a call that
            // keeps failing would loop until size overflows -- confirm
            size *= 2;
#endif // wxUSE_WXVSNPRINTF/!wxUSE_WXVSNPRINTF
        }
        else if ( len >= size )
        {
#if wxUSE_WXVSNPRINTF
            // we know that our own implementation of wxVsnprintf() returns
            // size+1 when there's not enough space but that's not the size
            // of the required buffer!
            size *= 2;      // so we just double the current size of the buffer
#else
            // some vsnprintf() implementations NUL-terminate the buffer and
            // some don't in len == size case, to be safe always add 1
            size = len + 1;
#endif
        }
        else // ok, there was enough space
        {
            break;
        }
    }

    // we could have overshot
    Shrink();

    return length();
}
1956
1957 // ----------------------------------------------------------------------------
1958 // misc other operations
1959 // ----------------------------------------------------------------------------
1960
1961 // returns true if the string matches the pattern which may contain '*' and
1962 // '?' metacharacters (as usual, '?' matches any character and '*' any number
1963 // of them)
1964 bool wxString::Matches(const wxString& mask) const
1965 {
1966     // I disable this code as it doesn't seem to be faster (in fact, it seems
1967     // to be much slower) than the old, hand-written code below and using it
1968     // here requires always linking with libregex even if the user code doesn't
1969     // use it
1970 #if 0 // wxUSE_REGEX
1971     // first translate the shell-like mask into a regex
1972     wxString pattern;
1973     pattern.reserve(wxStrlen(pszMask));
1974
1975     pattern += _T('^');
1976     while ( *pszMask )
1977     {
1978         switch ( *pszMask )
1979         {
1980             case _T('?'):
1981                 pattern += _T('.');
1982                 break;
1983
1984             case _T('*'):
1985                 pattern += _T(".*");
1986                 break;
1987
1988             case _T('^'):
1989             case _T('.'):
1990             case _T('$'):
1991             case _T('('):
1992             case _T(')'):
1993             case _T('|'):
1994             case _T('+'):
1995             case _T('\\'):
1996                 // these characters are special in a RE, quote them
1997                 // (however note that we don't quote '[' and ']' to allow
1998                 // using them for Unix shell like matching)
1999                 pattern += _T('\\');
2000                 // fall through
2001
2002             default:
2003                 pattern += *pszMask;
2004         }
2005
2006         pszMask++;
2007     }
2008     pattern += _T('$');
2009
2010     // and now use it
2011     return wxRegEx(pattern, wxRE_NOSUB | wxRE_EXTENDED).Matches(c_str());
2012 #else // !wxUSE_REGEX
2013   // TODO: this is, of course, awfully inefficient...
2014
2015   // FIXME-UTF8: implement using iterators, remove #if
2016 #if wxUSE_UNICODE_UTF8
2017   wxWCharBuffer maskBuf = mask.wc_str();
2018   wxWCharBuffer txtBuf = wc_str();
2019   const wxChar *pszMask = maskBuf.data();
2020   const wxChar *pszTxt = txtBuf.data();
2021 #else
2022   const wxChar *pszMask = mask.wx_str();
2023   // the char currently being checked
2024   const wxChar *pszTxt = wx_str();
2025 #endif
2026
2027   // the last location where '*' matched
2028   const wxChar *pszLastStarInText = NULL;
2029   const wxChar *pszLastStarInMask = NULL;
2030
2031 match:
2032   for ( ; *pszMask != wxT('\0'); pszMask++, pszTxt++ ) {
2033     switch ( *pszMask ) {
2034       case wxT('?'):
2035         if ( *pszTxt == wxT('\0') )
2036           return false;
2037
2038         // pszTxt and pszMask will be incremented in the loop statement
2039
2040         break;
2041
2042       case wxT('*'):
2043         {
2044           // remember where we started to be able to backtrack later
2045           pszLastStarInText = pszTxt;
2046           pszLastStarInMask = pszMask;
2047
2048           // ignore special chars immediately following this one
2049           // (should this be an error?)
2050           while ( *pszMask == wxT('*') || *pszMask == wxT('?') )
2051             pszMask++;
2052
2053           // if there is nothing more, match
2054           if ( *pszMask == wxT('\0') )
2055             return true;
2056
2057           // are there any other metacharacters in the mask?
2058           size_t uiLenMask;
2059           const wxChar *pEndMask = wxStrpbrk(pszMask, wxT("*?"));
2060
2061           if ( pEndMask != NULL ) {
2062             // we have to match the string between two metachars
2063             uiLenMask = pEndMask - pszMask;
2064           }
2065           else {
2066             // we have to match the remainder of the string
2067             uiLenMask = wxStrlen(pszMask);
2068           }
2069
2070           wxString strToMatch(pszMask, uiLenMask);
2071           const wxChar* pMatch = wxStrstr(pszTxt, strToMatch);
2072           if ( pMatch == NULL )
2073             return false;
2074
2075           // -1 to compensate "++" in the loop
2076           pszTxt = pMatch + uiLenMask - 1;
2077           pszMask += uiLenMask - 1;
2078         }
2079         break;
2080
2081       default:
2082         if ( *pszMask != *pszTxt )
2083           return false;
2084         break;
2085     }
2086   }
2087
2088   // match only if nothing left
2089   if ( *pszTxt == wxT('\0') )
2090     return true;
2091
2092   // if we failed to match, backtrack if we can
2093   if ( pszLastStarInText ) {
2094     pszTxt = pszLastStarInText + 1;
2095     pszMask = pszLastStarInMask;
2096
2097     pszLastStarInText = NULL;
2098
2099     // don't bother resetting pszLastStarInMask, it's unnecessary
2100
2101     goto match;
2102   }
2103
2104   return false;
2105 #endif // wxUSE_REGEX/!wxUSE_REGEX
2106 }
2107
2108 // Count the number of chars
2109 int wxString::Freq(wxUniChar ch) const
2110 {
2111     int count = 0;
2112     for ( const_iterator i = begin(); i != end(); ++i )
2113     {
2114         if ( *i == ch )
2115             count ++;
2116     }
2117     return count;
2118 }
2119
2120 // convert to upper case, return the copy of the string
2121 wxString wxString::Upper() const
2122 { wxString s(*this); return s.MakeUpper(); }
2123
2124 // convert to lower case, return the copy of the string
2125 wxString wxString::Lower() const { wxString s(*this); return s.MakeLower(); }