X-Git-Url: https://git.saurik.com/wxWidgets.git/blobdiff_plain/7a906e1a2125a3db29b9777ad2384485d2f78ea4..a94c4b852932bb38a5aeef612d6cedb57bd6e2ac:/src/common/string.cpp diff --git a/src/common/string.cpp b/src/common/string.cpp index 36efc0ce8e..a0d42cb42f 100644 --- a/src/common/string.cpp +++ b/src/common/string.cpp @@ -24,6 +24,8 @@ #ifndef WX_PRECOMP #include "wx/string.h" #include "wx/wxcrtvararg.h" + #include "wx/intl.h" + #include "wx/log.h" #endif #include @@ -35,11 +37,17 @@ #include #include -#ifdef __SALFORDC__ - #include -#endif - #include "wx/hashmap.h" +#include "wx/vector.h" +#include "wx/xlocale.h" + +#ifdef __WXMSW__ + #include "wx/msw/wrapwin.h" +#endif // __WXMSW__ + +#if wxUSE_STD_IOSTREAM + #include +#endif // string handling functions used by wxString: #if wxUSE_UNICODE_UTF8 @@ -54,6 +62,22 @@ #define wxStringStrlen wxStrlen #endif +// define a function declared in wx/buffer.h here as we don't have buffer.cpp +// and don't want to add it just because of this simple function +namespace wxPrivate +{ + +// wxXXXBuffer classes can be (implicitly) used during global statics +// initialization so wrap the status UntypedBufferData variable in a function +// to make it safe to access it even before all global statics are initialized +UntypedBufferData *GetUntypedNullData() +{ + static UntypedBufferData s_untypedNullData(NULL, 0); + + return &s_untypedNullData; +} + +} // namespace wxPrivate // --------------------------------------------------------------------------- // static class variables definition @@ -62,6 +86,105 @@ //According to STL _must_ be a -1 size_t const size_t wxString::npos = (size_t) -1; +#if wxUSE_STRING_POS_CACHE + +#ifdef wxHAS_COMPILER_TLS + +wxTLS_TYPE(wxString::Cache) wxString::ms_cache; + +#else // !wxHAS_COMPILER_TLS + +struct wxStrCacheInitializer +{ + wxStrCacheInitializer() + { + // calling this function triggers s_cache initialization in it, and + // from now on it becomes safe to call from multiple threads + 
wxString::GetCache(); + } +}; + +/* +wxString::Cache& wxString::GetCache() +{ + static wxTLS_TYPE(Cache) s_cache; + + return wxTLS_VALUE(s_cache); +} +*/ + +static wxStrCacheInitializer gs_stringCacheInit; + +#endif // wxHAS_COMPILER_TLS/!wxHAS_COMPILER_TLS + +// gdb seems to be unable to display thread-local variables correctly, at least +// not my 6.4.98 version under amd64, so provide this debugging helper to do it +#if wxDEBUG_LEVEL >= 2 + +struct wxStrCacheDumper +{ + static void ShowAll() + { + puts("*** wxString cache dump:"); + for ( unsigned n = 0; n < wxString::Cache::SIZE; n++ ) + { + const wxString::Cache::Element& + c = wxString::GetCacheBegin()[n]; + + printf("\t%u%s\t%p: pos=(%lu, %lu), len=%ld\n", + n, + n == wxString::LastUsedCacheElement() ? " [*]" : "", + c.str, + (unsigned long)c.pos, + (unsigned long)c.impl, + (long)c.len); + } + } +}; + +void wxDumpStrCache() { wxStrCacheDumper::ShowAll(); } + +#endif // wxDEBUG_LEVEL >= 2 + +#ifdef wxPROFILE_STRING_CACHE + +wxString::CacheStats wxString::ms_cacheStats; + +struct wxStrCacheStatsDumper +{ + ~wxStrCacheStatsDumper() + { + const wxString::CacheStats& stats = wxString::ms_cacheStats; + + if ( stats.postot ) + { + puts("*** wxString cache statistics:"); + printf("\tTotal non-trivial calls to PosToImpl(): %u\n", + stats.postot); + printf("\tHits %u (of which %u not used) or %.2f%%\n", + stats.poshits, + stats.mishits, + 100.*float(stats.poshits - stats.mishits)/stats.postot); + printf("\tAverage position requested: %.2f\n", + float(stats.sumpos) / stats.postot); + printf("\tAverage offset after cached hint: %.2f\n", + float(stats.sumofs) / stats.postot); + } + + if ( stats.lentot ) + { + printf("\tNumber of calls to length(): %u, hits=%.2f%%\n", + stats.lentot, 100.*float(stats.lenhits)/stats.lentot); + } + } +}; + +static wxStrCacheStatsDumper s_showCacheStats; + +#endif // wxPROFILE_STRING_CACHE + +#endif // wxUSE_STRING_POS_CACHE + // 
---------------------------------------------------------------------------- // global functions // ---------------------------------------------------------------------------- @@ -73,7 +196,13 @@ const size_t wxString::npos = (size_t) -1; wxSTD ostream& operator<<(wxSTD ostream& os, const wxCStrData& str) { #if wxUSE_UNICODE && !wxUSE_UNICODE_UTF8 - return os << (const char *)str.AsCharBuf(); + const wxScopedCharBuffer buf(str.AsCharBuf()); + if ( !buf ) + os.clear(wxSTD ios_base::failbit); + else + os << buf.data(); + + return os; #else return os << str.AsInternal(); #endif @@ -84,18 +213,37 @@ wxSTD ostream& operator<<(wxSTD ostream& os, const wxString& str) return os << str.c_str(); } -wxSTD ostream& operator<<(wxSTD ostream& os, const wxCharBuffer& str) +wxSTD ostream& operator<<(wxSTD ostream& os, const wxScopedCharBuffer& str) { return os << str.data(); } #ifndef __BORLANDC__ -wxSTD ostream& operator<<(wxSTD ostream& os, const wxWCharBuffer& str) +wxSTD ostream& operator<<(wxSTD ostream& os, const wxScopedWCharBuffer& str) { return os << str.data(); } #endif +#if wxUSE_UNICODE && defined(HAVE_WOSTREAM) + +wxSTD wostream& operator<<(wxSTD wostream& wos, const wxString& str) +{ + return wos << str.wc_str(); +} + +wxSTD wostream& operator<<(wxSTD wostream& wos, const wxCStrData& str) +{ + return wos << str.AsWChar(); +} + +wxSTD wostream& operator<<(wxSTD wostream& wos, const wxScopedWCharBuffer& str) +{ + return wos << str.data(); +} + +#endif // wxUSE_UNICODE && defined(HAVE_WOSTREAM) + #endif // wxUSE_STD_IOSTREAM // =========================================================================== @@ -108,22 +256,30 @@ void wxString::PosLenToImpl(size_t pos, size_t len, size_t *implPos, size_t *implLen) const { if ( pos == npos ) + { *implPos = npos; - else + } + else // have valid start position { - const_iterator i = begin() + pos; - *implPos = wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); + const const_iterator b = GetIterForNthChar(pos); + *implPos 
= wxStringImpl::const_iterator(b.impl()) - m_impl.begin(); if ( len == npos ) + { *implLen = npos; - else + } + else // have valid length too { - // too large length is interpreted as "to the end of the string" - // FIXME-UTF8: verify this is the case in std::string, assert - // otherwise - if ( pos + len > length() ) - len = length() - pos; - - *implLen = (i + len).impl() - i.impl(); + // we need to handle the case of length specifying a substring + // going beyond the end of the string, just as std::string does + const const_iterator e(end()); + const_iterator i(b); + while ( len && i <= e ) + { + ++i; + --len; + } + + *implLen = i.impl() - b.impl(); } } } @@ -219,95 +375,6 @@ wxString::~wxString() } #endif -#if wxUSE_UNICODE && !wxUSE_UTF8_LOCALE_ONLY -const char* wxCStrData::AsChar() const -{ -#if wxUSE_UNICODE_UTF8 - if ( wxLocaleIsUtf8 ) - return AsInternal(); -#endif - // under non-UTF8 locales, we have to convert the internal UTF-8 - // representation using wxConvLibc and cache the result - - wxString *str = wxConstCast(m_str, wxString); - - // convert the string: - // - // FIXME-UTF8: we'd like to do the conversion in the existing buffer (if we - // have it) but it's unfortunately not obvious to implement - // because we don't know how big buffer do we need for the - // given string length (in case of multibyte encodings, e.g. - // ISO-2022-JP or UTF-8 when internal representation is wchar_t) - // - // One idea would be to store more than just m_convertedToChar - // in wxString: then we could record the length of the string - // which was converted the last time and try to reuse the same - // buffer if the current length is not greater than it (this - // could still fail because string could have been modified in - // place but it would work most of the time, so we'd do it and - // only allocate the new buffer if in-place conversion returned - // an error). 
We could also store a bit saying if the string - // was modified since the last conversion (and update it in all - // operation modifying the string, of course) to avoid unneeded - // consequential conversions. But both of these ideas require - // adding more fields to wxString and require profiling results - // to be sure that we really gain enough from them to justify - // doing it. - wxCharBuffer buf(str->mb_str()); - - // if it failed, return empty string and not NULL to avoid crashes in code - // written with either wxWidgets 2 wxString or std::string behaviour in - // mind: neither of them ever returns NULL and so we shouldn't neither - if ( !buf ) - return ""; - - if ( str->m_convertedToChar && - strlen(buf) == strlen(str->m_convertedToChar) ) - { - // keep the same buffer for as long as possible, so that several calls - // to c_str() in a row still work: - strcpy(str->m_convertedToChar, buf); - } - else - { - str->m_convertedToChar = buf.release(); - } - - // and keep it: - return str->m_convertedToChar + m_offset; -} -#endif // wxUSE_UNICODE && !wxUSE_UTF8_LOCALE_ONLY - -#if !wxUSE_UNICODE_WCHAR -const wchar_t* wxCStrData::AsWChar() const -{ - wxString *str = wxConstCast(m_str, wxString); - - // convert the string: - wxWCharBuffer buf(str->wc_str()); - - // notice that here, unlike above in AsChar(), conversion can't fail as our - // internal UTF-8 is always well-formed -- or the string was corrupted and - // all bets are off anyhow - - // FIXME-UTF8: do the conversion in-place in the existing buffer - if ( str->m_convertedToWChar && - wxWcslen(buf) == wxWcslen(str->m_convertedToWChar) ) - { - // keep the same buffer for as long as possible, so that several calls - // to c_str() in a row still work: - memcpy(str->m_convertedToWChar, buf, sizeof(wchar_t) * wxWcslen(buf)); - } - else - { - str->m_convertedToWChar = buf.release(); - } - - // and keep it: - return str->m_convertedToWChar + m_offset; -} -#endif // !wxUSE_UNICODE_WCHAR - // 
=========================================================================== // wxString class core // =========================================================================== @@ -323,15 +390,15 @@ wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength, { // anything to do? if ( !psz || nLength == 0 ) - return SubstrBufFromMB(L"", 0); + return SubstrBufFromMB(wxWCharBuffer(L""), 0); if ( nLength == npos ) nLength = wxNO_LEN; size_t wcLen; - wxWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen)); + wxScopedWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen)); if ( !wcLen ) - return SubstrBufFromMB(_T(""), 0); + return SubstrBufFromMB(wxWCharBuffer(L""), 0); else return SubstrBufFromMB(wcBuf, wcLen); } @@ -344,7 +411,7 @@ wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength, { // anything to do? if ( !psz || nLength == 0 ) - return SubstrBufFromMB("", 0); + return SubstrBufFromMB(wxCharBuffer(""), 0); // if psz is already in UTF-8, we don't have to do the roundtrip to // wchar_t* and back: @@ -354,7 +421,11 @@ wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength, // UTF-8 sequence and psz may be invalid: if ( wxStringOperations::IsValidUtf8String(psz, nLength) ) { - return SubstrBufFromMB(wxCharBuffer::CreateNonOwned(psz), nLength); + // we must pass the real string length to SubstrBufFromMB ctor + if ( nLength == npos ) + nLength = psz ? 
strlen(psz) : 0; + return SubstrBufFromMB(wxScopedCharBuffer::CreateNonOwned(psz, nLength), + nLength); } // else: do the roundtrip through wchar_t* } @@ -364,14 +435,14 @@ wxString::SubstrBufFromMB wxString::ConvertStr(const char *psz, size_t nLength, // first convert to wide string: size_t wcLen; - wxWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen)); + wxScopedWCharBuffer wcBuf(conv.cMB2WC(psz, nLength, &wcLen)); if ( !wcLen ) - return SubstrBufFromMB("", 0); + return SubstrBufFromMB(wxCharBuffer(""), 0); // and then to UTF-8: SubstrBufFromMB buf(ConvertStr(wcBuf, wcLen, wxMBConvStrictUTF8())); // widechar -> UTF-8 conversion isn't supposed to ever fail: - wxASSERT_MSG( buf.data, _T("conversion to UTF-8 failed") ); + wxASSERT_MSG( buf.data, wxT("conversion to UTF-8 failed") ); return buf; } @@ -384,71 +455,111 @@ wxString::SubstrBufFromWC wxString::ConvertStr(const wchar_t *pwz, size_t nLengt { // anything to do? if ( !pwz || nLength == 0 ) - return SubstrBufFromWC("", 0); + return SubstrBufFromWC(wxCharBuffer(""), 0); if ( nLength == npos ) nLength = wxNO_LEN; size_t mbLen; - wxCharBuffer mbBuf(conv.cWC2MB(pwz, nLength, &mbLen)); + wxScopedCharBuffer mbBuf(conv.cWC2MB(pwz, nLength, &mbLen)); if ( !mbLen ) - return SubstrBufFromWC("", 0); + return SubstrBufFromWC(wxCharBuffer(""), 0); else return SubstrBufFromWC(mbBuf, mbLen); } #endif // wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE +// This std::string::c_str()-like method returns a wide char pointer to string +// contents. In wxUSE_UNICODE_WCHAR case it is trivial as it can simply return +// a pointer to the internal representation. Otherwise a conversion is required +// and it returns a temporary buffer. +// +// However for compatibility with c_str() and to avoid breaking existing code +// doing +// +// for ( const wchar_t *p = s.wc_str(); *p; p++ ) +// ... use *p... 
+// +// we actually need to ensure that the returned buffer is _not_ temporary and +// so we use wxString::m_convertedToWChar to store the returned data +#if !wxUSE_UNICODE_WCHAR -#if wxUSE_UNICODE_WCHAR - -//Convert wxString in Unicode mode to a multi-byte string -const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const +const wchar_t *wxString::AsWChar(const wxMBConv& conv) const { - return conv.cWC2MB(wx_str(), length() + 1 /* size, not length */, NULL); -} + const char * const strMB = m_impl.c_str(); + const size_t lenMB = m_impl.length(); -#elif wxUSE_UNICODE_UTF8 + // find out the size of the buffer needed + const size_t lenWC = conv.ToWChar(NULL, 0, strMB, lenMB); + if ( lenWC == wxCONV_FAILED ) + return NULL; -const wxWCharBuffer wxString::wc_str() const -{ - return wxMBConvStrictUTF8().cMB2WC - ( - m_impl.c_str(), - m_impl.length() + 1, // size, not length - NULL - ); + // keep the same buffer if the string size didn't change: this is not only + // an optimization but also ensure that code which modifies string + // character by character (without changing its length) can continue to use + // the pointer returned by a previous wc_str() call even after changing the + // string + + // TODO-UTF8: we could check for ">" instead of "!=" here as this would + // allow to save on buffer reallocations but at the cost of + // consuming (even) more memory, we should benchmark this to + // determine if it's worth doing + if ( !m_convertedToWChar.m_str || lenWC != m_convertedToWChar.m_len ) + { + if ( !const_cast(this)->m_convertedToWChar.Extend(lenWC) ) + return NULL; + } + + // finally do convert + m_convertedToWChar.m_str[lenWC] = L'\0'; + if ( conv.ToWChar(m_convertedToWChar.m_str, lenWC, + strMB, lenMB) == wxCONV_FAILED ) + return NULL; + + return m_convertedToWChar.m_str; } -const wxCharBuffer wxString::mb_str(const wxMBConv& conv) const +#endif // !wxUSE_UNICODE_WCHAR + + +// Same thing for mb_str() which returns a normal char pointer to string +// 
contents: this always requires converting it to the specified encoding in +// non-ANSI build except if we need to convert to UTF-8 and this is what we +// already use internally. +#if wxUSE_UNICODE + +const char *wxString::AsChar(const wxMBConv& conv) const { +#if wxUSE_UNICODE_UTF8 if ( conv.IsUTF8() ) - return wxCharBuffer::CreateNonOwned(m_impl.c_str()); + return m_impl.c_str(); - // FIXME-UTF8: use wc_str() here once we have buffers with length + const wchar_t * const strWC = AsWChar(wxMBConvStrictUTF8()); + const size_t lenWC = m_convertedToWChar.m_len; +#else // wxUSE_UNICODE_WCHAR + const wchar_t * const strWC = m_impl.c_str(); + const size_t lenWC = m_impl.length(); +#endif // wxUSE_UNICODE_UTF8/wxUSE_UNICODE_WCHAR - size_t wcLen; - wxWCharBuffer wcBuf(wxMBConvStrictUTF8().cMB2WC - ( - m_impl.c_str(), - m_impl.length() + 1, // size - &wcLen - )); - if ( !wcLen ) - return wxCharBuffer(""); + const size_t lenMB = conv.FromWChar(NULL, 0, strWC, lenWC); + if ( lenMB == wxCONV_FAILED ) + return NULL; - return conv.cWC2MB(wcBuf, wcLen+1, NULL); -} + if ( !m_convertedToChar.m_str || lenMB != m_convertedToChar.m_len ) + { + if ( !const_cast(this)->m_convertedToChar.Extend(lenMB) ) + return NULL; + } -#else // ANSI + m_convertedToChar.m_str[lenMB] = '\0'; + if ( conv.FromWChar(m_convertedToChar.m_str, lenMB, + strWC, lenWC) == wxCONV_FAILED ) + return NULL; -//Converts this string to a wide character string if unicode -//mode is not enabled and wxUSE_WCHAR_T is enabled -const wxWCharBuffer wxString::wc_str(const wxMBConv& conv) const -{ - return conv.cMB2WC(wx_str(), length() + 1 /* size, not length */, NULL); + return m_convertedToChar.m_str; } -#endif // Unicode/ANSI +#endif // wxUSE_UNICODE // shrink to minimal size (releasing extra memory) bool wxString::Shrink() @@ -539,7 +650,7 @@ wxString operator+(const wxString& str, const char *psz) wxString s; if ( !s.Alloc(strlen(psz) + str.length()) ) { - wxFAIL_MSG( _T("out of memory in wxString::operator+") ); + 
wxFAIL_MSG( wxT("out of memory in wxString::operator+") ); } s += str; s += psz; @@ -555,7 +666,7 @@ wxString operator+(const wxString& str, const wchar_t *pwz) wxString s; if ( !s.Alloc(wxWcslen(pwz) + str.length()) ) { - wxFAIL_MSG( _T("out of memory in wxString::operator+") ); + wxFAIL_MSG( wxT("out of memory in wxString::operator+") ); } s += str; s += pwz; @@ -571,7 +682,7 @@ wxString operator+(const char *psz, const wxString& str) wxString s; if ( !s.Alloc(strlen(psz) + str.length()) ) { - wxFAIL_MSG( _T("out of memory in wxString::operator+") ); + wxFAIL_MSG( wxT("out of memory in wxString::operator+") ); } s = psz; s += str; @@ -587,7 +698,7 @@ wxString operator+(const wchar_t *pwz, const wxString& str) wxString s; if ( !s.Alloc(wxWcslen(pwz) + str.length()) ) { - wxFAIL_MSG( _T("out of memory in wxString::operator+") ); + wxFAIL_MSG( wxT("out of memory in wxString::operator+") ); } s = pwz; s += str; @@ -805,7 +916,7 @@ size_t wxString::find_first_not_of(const wxChar* sz, size_t nStart) const size_t wxString::find_first_of(const wxChar* sz, size_t nStart, size_t n) const { - wxASSERT_MSG( nStart <= length(), _T("invalid index") ); + wxASSERT_MSG( nStart <= length(), wxT("invalid index") ); size_t idx = nStart; for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i ) @@ -819,7 +930,7 @@ size_t wxString::find_first_of(const wxChar* sz, size_t nStart, size_t n) const size_t wxString::find_first_not_of(const wxChar* sz, size_t nStart, size_t n) const { - wxASSERT_MSG( nStart <= length(), _T("invalid index") ); + wxASSERT_MSG( nStart <= length(), wxT("invalid index") ); size_t idx = nStart; for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i ) @@ -852,7 +963,7 @@ size_t wxString::find_last_of(const wxChar* sz, size_t nStart, size_t n) const } else { - wxASSERT_MSG( nStart <= len, _T("invalid index") ); + wxASSERT_MSG( nStart <= len, wxT("invalid index") ); } size_t idx = nStart; @@ -876,7 +987,7 @@ size_t 
wxString::find_last_not_of(const wxChar* sz, size_t nStart, size_t n) con } else { - wxASSERT_MSG( nStart <= len, _T("invalid index") ); + wxASSERT_MSG( nStart <= len, wxT("invalid index") ); } size_t idx = nStart; @@ -892,7 +1003,7 @@ size_t wxString::find_last_not_of(const wxChar* sz, size_t nStart, size_t n) con size_t wxString::find_first_not_of(wxUniChar ch, size_t nStart) const { - wxASSERT_MSG( nStart <= length(), _T("invalid index") ); + wxASSERT_MSG( nStart <= length(), wxT("invalid index") ); size_t idx = nStart; for ( const_iterator i = begin() + nStart; i != end(); ++idx, ++i ) @@ -914,7 +1025,7 @@ size_t wxString::find_last_not_of(wxUniChar ch, size_t nStart) const } else { - wxASSERT_MSG( nStart <= len, _T("invalid index") ); + wxASSERT_MSG( nStart <= len, wxT("invalid index") ); } size_t idx = nStart; @@ -972,8 +1083,42 @@ size_t wxString::find_last_not_of(const wxOtherCharType* sz, size_t nStart, int wxString::CmpNoCase(const wxString& s) const { - // FIXME-UTF8: use wxUniChar::ToLower/ToUpper once added +#if defined(__WXMSW__) && !wxUSE_UNICODE_UTF8 + // Prefer to use CompareString() if available as it's more efficient than + // doing it manually or even using wxStricmp() (see #10375) + // + // Also note that not using NORM_STRINGSORT may result in not having a + // strict weak ordering (e.g. s1 < s2 and s2 < s3 but s3 < s1) and so break + // algorithms such as std::sort that rely on it. It's also more consistent + // with the fall back version below. 
+ switch ( ::CompareString(LOCALE_USER_DEFAULT, + NORM_IGNORECASE | SORT_STRINGSORT, + m_impl.c_str(), m_impl.length(), + s.m_impl.c_str(), s.m_impl.length()) ) + { + case CSTR_LESS_THAN: + return -1; + + case CSTR_EQUAL: + return 0; + + case CSTR_GREATER_THAN: + return 1; + default: + wxFAIL_MSG( "unexpected CompareString() return value" ); + // fall through + + case 0: + wxLogLastError("CompareString"); + // use generic code below + } +#endif // __WXMSW__ && !wxUSE_UNICODE_UTF8 + + // do the comparison manually: notice that we can't use wxStricmp() as it + // doesn't handle embedded NULs + + // FIXME-UTF8: use wxUniChar::ToLower/ToUpper once added const_iterator i1 = begin(); const_iterator end1 = end(); const_iterator i2 = s.begin(); @@ -1021,7 +1166,7 @@ wxString wxString::FromAscii(const char *ascii, size_t len) { unsigned char c = (unsigned char)*ascii++; wxASSERT_MSG( c < 0x80, - _T("Non-ASCII value passed to FromAscii().") ); + wxT("Non-ASCII value passed to FromAscii().") ); *dest++ = (wchar_t)c; } @@ -1041,13 +1186,13 @@ wxString wxString::FromAscii(char ascii) unsigned char c = (unsigned char)ascii; - wxASSERT_MSG( c < 0x80, _T("Non-ASCII value passed to FromAscii().") ); + wxASSERT_MSG( c < 0x80, wxT("Non-ASCII value passed to FromAscii().") ); // NB: the cast to wchar_t causes interpretation of 'ascii' as Latin1 value return wxString(wxUniChar((wchar_t)c)); } -const wxCharBuffer wxString::ToAscii() const +const wxScopedCharBuffer wxString::ToAscii() const { // this will allocate enough space for the terminating NUL too wxCharBuffer buffer(length()); @@ -1096,7 +1241,7 @@ wxString wxString::Mid(size_t nFirst, size_t nCount) const wxString dest(*this, nFirst, nCount); if ( dest.length() != nCount ) { - wxFAIL_MSG( _T("out of memory in wxString::Mid") ); + wxFAIL_MSG( wxT("out of memory in wxString::Mid") ); } return dest; @@ -1146,12 +1291,12 @@ wxString wxString::Right(size_t nCount) const wxString dest(*this, length() - nCount, nCount); if ( 
dest.length() != nCount ) { - wxFAIL_MSG( _T("out of memory in wxString::Right") ); + wxFAIL_MSG( wxT("out of memory in wxString::Right") ); } return dest; } -// get all characters after the last occurence of ch +// get all characters after the last occurrence of ch // (returns the whole string if ch not found) wxString wxString::AfterLast(wxUniChar ch) const { @@ -1160,7 +1305,7 @@ wxString wxString::AfterLast(wxUniChar ch) const if ( iPos == wxNOT_FOUND ) str = *this; else - str = wx_str() + iPos + 1; + str.assign(*this, iPos + 1, npos); return str; } @@ -1173,21 +1318,22 @@ wxString wxString::Left(size_t nCount) const wxString dest(*this, 0, nCount); if ( dest.length() != nCount ) { - wxFAIL_MSG( _T("out of memory in wxString::Left") ); + wxFAIL_MSG( wxT("out of memory in wxString::Left") ); } return dest; } -// get all characters before the first occurence of ch +// get all characters before the first occurrence of ch // (returns the whole string if ch not found) wxString wxString::BeforeFirst(wxUniChar ch) const { int iPos = Find(ch); - if ( iPos == wxNOT_FOUND ) iPos = length(); + if ( iPos == wxNOT_FOUND ) + iPos = length(); return wxString(*this, 0, iPos); } -/// get all characters before the last occurence of ch +/// get all characters before the last occurrence of ch /// (returns empty string if ch not found) wxString wxString::BeforeLast(wxUniChar ch) const { @@ -1199,56 +1345,112 @@ wxString wxString::BeforeLast(wxUniChar ch) const return str; } -/// get all characters after the first occurence of ch +/// get all characters after the first occurrence of ch /// (returns empty string if ch not found) wxString wxString::AfterFirst(wxUniChar ch) const { wxString str; int iPos = Find(ch); if ( iPos != wxNOT_FOUND ) - str = wx_str() + iPos + 1; + str.assign(*this, iPos + 1, npos); return str; } -// replace first (or all) occurences of some substring with another one +// replace first (or all) occurrences of some substring with another one size_t 
wxString::Replace(const wxString& strOld, const wxString& strNew, bool bReplaceAll) { // if we tried to replace an empty string we'd enter an infinite loop below wxCHECK_MSG( !strOld.empty(), 0, - _T("wxString::Replace(): invalid parameter") ); + wxT("wxString::Replace(): invalid parameter") ); + + wxSTRING_INVALIDATE_CACHE(); size_t uiCount = 0; // count of replacements made - size_t uiOldLen = strOld.length(); - size_t uiNewLen = strNew.length(); + // optimize the special common case: replacement of one character by + // another one (in UTF-8 case we can only do this for ASCII characters) + // + // benchmarks show that this special version is around 3 times faster + // (depending on the proportion of matching characters and UTF-8/wchar_t + // build) + if ( strOld.m_impl.length() == 1 && strNew.m_impl.length() == 1 ) + { + const wxStringCharType chOld = strOld.m_impl[0], + chNew = strNew.m_impl[0]; + + // this loop is the simplified version of the one below + for ( size_t pos = 0; ; ) + { + pos = m_impl.find(chOld, pos); + if ( pos == npos ) + break; - size_t dwPos = 0; + m_impl[pos++] = chNew; - while ( (*this)[dwPos] != wxT('\0') ) + uiCount++; + + if ( !bReplaceAll ) + break; + } + } + else if ( !bReplaceAll) { - //DO NOT USE STRSTR HERE - //this string can contain embedded null characters, - //so strstr will function incorrectly - dwPos = find(strOld, dwPos); - if ( dwPos == npos ) - break; // exit the loop - else + size_t pos = m_impl.find(strOld, 0); + if ( pos != npos ) { - //replace this occurance of the old string with the new one - replace(dwPos, uiOldLen, strNew, uiNewLen); + m_impl.replace(pos, strOld.m_impl.length(), strNew.m_impl); + uiCount = 1; + } + } + else // replace all occurrences + { + const size_t uiOldLen = strOld.m_impl.length(); + const size_t uiNewLen = strNew.m_impl.length(); - //move up pos past the string that was replaced - dwPos += uiNewLen; + // first scan the string to find all positions at which the replacement + // should be 
made + wxVector replacePositions; - //increase replace count + size_t pos; + for ( pos = m_impl.find(strOld.m_impl, 0); + pos != npos; + pos = m_impl.find(strOld.m_impl, pos + uiOldLen)) + { + replacePositions.push_back(pos); ++uiCount; + } - // stop now? - if ( !bReplaceAll ) - break; // exit the loop + if ( !uiCount ) + return 0; + + // allocate enough memory for the whole new string + wxString tmp; + tmp.m_impl.reserve(m_impl.length() + uiCount*(uiNewLen - uiOldLen)); + + // copy this string to tmp doing replacements on the fly + size_t replNum = 0; + for ( pos = 0; replNum < uiCount; replNum++ ) + { + const size_t nextReplPos = replacePositions[replNum]; + + if ( pos != nextReplPos ) + { + tmp.m_impl.append(m_impl, pos, nextReplPos - pos); + } + + tmp.m_impl.append(strNew.m_impl); + pos = nextReplPos + uiOldLen; + } + + if ( pos != m_impl.length() ) + { + // append the rest of the string unchanged + tmp.m_impl.append(m_impl, pos, m_impl.length() - pos); } + + swap(tmp); } return uiCount; @@ -1283,7 +1485,7 @@ bool wxString::IsNumber() const const_iterator i = begin(); - if ( *i == _T('-') || *i == _T('+') ) + if ( *i == wxT('-') || *i == wxT('+') ) ++i; for ( ; i != end(); ++i ) @@ -1323,14 +1525,28 @@ wxString& wxString::MakeLower() return *this; } +wxString& wxString::MakeCapitalized() +{ + const iterator en = end(); + iterator it = begin(); + if ( it != en ) + { + *it = (wxChar)wxToupper(*it); + for ( ++it; it != en; ++it ) + *it = (wxChar)wxTolower(*it); + } + + return *this; +} + // --------------------------------------------------------------------------- // trimming and padding // --------------------------------------------------------------------------- // some compilers (VC++ 6.0 not to name them) return true for a call to -// isspace('ê') in the C locale which seems to be broken to me, but we have to -// live with this by checking that the character is a 7 bit one - even if this -// may fail to detect some spaces (I don't know if Unicode doesn't 
have +// isspace('\xEA') in the C locale which seems to be broken to me, but we have +// to live with this by checking that the character is a 7 bit one - even if +// this may fail to detect some spaces (I don't know if Unicode doesn't have // space-like symbols somewhere except in the first 128 chars), it is arguably // still better than trimming away accented letters inline int wxSafeIsspace(wxChar ch) { return (ch < 127) && wxIsspace(ch); } @@ -1425,61 +1641,183 @@ int wxString::Find(wxUniChar ch, bool bFromEnd) const #define DO_IF_NOT_WINCE(x) #endif -#define WX_STRING_TO_INT_TYPE(val, base, func) \ - wxCHECK_MSG( val, false, _T("NULL output pointer") ); \ - wxASSERT_MSG( !base || (base > 1 && base <= 36), _T("invalid base") ); \ - \ +#define WX_STRING_TO_X_TYPE_START \ + wxCHECK_MSG( pVal, false, wxT("NULL output pointer") ); \ DO_IF_NOT_WINCE( errno = 0; ) \ - \ const wxStringCharType *start = wx_str(); \ - wxStringCharType *end; \ - *val = func(start, &end, base); \ - \ - /* return true only if scan was stopped by the terminating NUL and */ \ - /* if the string was not empty to start with and no under/overflow */ \ - /* occurred: */ \ - return !*end && (end != start) \ - DO_IF_NOT_WINCE( && (errno != ERANGE) ) + wxStringCharType *end; + +// notice that we return false without modifying the output parameter at all if +// nothing could be parsed but we do modify it and return false then if we did +// parse something successfully but not the entire string +#define WX_STRING_TO_X_TYPE_END \ + if ( end == start DO_IF_NOT_WINCE(|| errno == ERANGE) ) \ + return false; \ + *pVal = val; \ + return !*end; + +bool wxString::ToLong(long *pVal, int base) const +{ + wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") ); + + WX_STRING_TO_X_TYPE_START + long val = wxStrtol(start, &end, base); + WX_STRING_TO_X_TYPE_END +} -bool wxString::ToLong(long *val, int base) const +bool wxString::ToULong(unsigned long *pVal, int base) const { - 
WX_STRING_TO_INT_TYPE(val, base, wxStrtol); + wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") ); + + WX_STRING_TO_X_TYPE_START + unsigned long val = wxStrtoul(start, &end, base); + WX_STRING_TO_X_TYPE_END } -bool wxString::ToULong(unsigned long *val, int base) const +bool wxString::ToLongLong(wxLongLong_t *pVal, int base) const { - WX_STRING_TO_INT_TYPE(val, base, wxStrtoul); + wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") ); + + WX_STRING_TO_X_TYPE_START + wxLongLong_t val = wxStrtoll(start, &end, base); + WX_STRING_TO_X_TYPE_END } -bool wxString::ToLongLong(wxLongLong_t *val, int base) const +bool wxString::ToULongLong(wxULongLong_t *pVal, int base) const { - WX_STRING_TO_INT_TYPE(val, base, wxStrtoll); + wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") ); + + WX_STRING_TO_X_TYPE_START + wxULongLong_t val = wxStrtoull(start, &end, base); + WX_STRING_TO_X_TYPE_END } -bool wxString::ToULongLong(wxULongLong_t *val, int base) const +bool wxString::ToDouble(double *pVal) const { - WX_STRING_TO_INT_TYPE(val, base, wxStrtoull); + WX_STRING_TO_X_TYPE_START + double val = wxStrtod(start, &end); + WX_STRING_TO_X_TYPE_END } -bool wxString::ToDouble(double *val) const +#if wxUSE_XLOCALE + +bool wxString::ToCLong(long *pVal, int base) const { - wxCHECK_MSG( val, false, _T("NULL pointer in wxString::ToDouble") ); + wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") ); -#ifndef __WXWINCE__ - errno = 0; + WX_STRING_TO_X_TYPE_START +#if (wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE) && defined(wxHAS_XLOCALE_SUPPORT) + long val = wxStrtol_lA(start, &end, base, wxCLocale); +#else + long val = wxStrtol_l(start, &end, base, wxCLocale); #endif + WX_STRING_TO_X_TYPE_END +} + +bool wxString::ToCULong(unsigned long *pVal, int base) const +{ + wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") ); - const wxChar *start = c_str(); - wxChar *end; - *val = wxStrtod(start, &end); + 
WX_STRING_TO_X_TYPE_START +#if (wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE) && defined(wxHAS_XLOCALE_SUPPORT) + unsigned long val = wxStrtoul_lA(start, &end, base, wxCLocale); +#else + unsigned long val = wxStrtoul_l(start, &end, base, wxCLocale); +#endif + WX_STRING_TO_X_TYPE_END +} - // return true only if scan was stopped by the terminating NUL and if the - // string was not empty to start with and no under/overflow occurred - return !*end && (end != start) -#ifndef __WXWINCE__ - && (errno != ERANGE) +bool wxString::ToCDouble(double *pVal) const +{ + WX_STRING_TO_X_TYPE_START +#if (wxUSE_UNICODE_UTF8 || !wxUSE_UNICODE) && defined(wxHAS_XLOCALE_SUPPORT) + double val = wxStrtod_lA(start, &end, wxCLocale); +#else + double val = wxStrtod_l(start, &end, wxCLocale); #endif - ; + WX_STRING_TO_X_TYPE_END +} + +#else // wxUSE_XLOCALE + +// Provide implementation of these functions even when wxUSE_XLOCALE is +// disabled, we still need them in wxWidgets internal code. + +// For integers we just assume the current locale uses the same number +// representation as the C one as there is nothing else we can do. +bool wxString::ToCLong(long *pVal, int base) const +{ + return ToLong(pVal, base); +} + +bool wxString::ToCULong(unsigned long *pVal, int base) const +{ + return ToULong(pVal, base); +} + +// For floating point numbers we have to handle the problem of the decimal +// point which is different in different locales. +bool wxString::ToCDouble(double *pVal) const +{ + // Create a copy of this string using the decimal separator of the current + // locale instead of the point, as this is what ToDouble() can parse. +#if wxUSE_INTL + wxString sep = wxLocale::GetInfo(wxLOCALE_DECIMAL_POINT, + wxLOCALE_CAT_NUMBER); + if ( sep == "." ) + { + // We can avoid an unnecessary string copy in this case.
+ return ToDouble(pVal); + } +#else // !wxUSE_INTL + // We don't know what the current separator is so it might even be a point + // already, try to parse the string as a double: + if ( ToDouble(pVal) ) + { + // It must have been the point, nothing else to do. + return true; + } + + // Try to guess the separator, using the most common alternative value. + wxString sep(","); +#endif // wxUSE_INTL/!wxUSE_INTL + wxString cstr(*this); + cstr.Replace(".", sep); + + return cstr.ToDouble(pVal); +} + +#endif // wxUSE_XLOCALE/!wxUSE_XLOCALE + +// ---------------------------------------------------------------------------- +// number to string conversion +// ---------------------------------------------------------------------------- + +/* static */ +wxString wxString::FromCDouble(double val) +{ +#if wxUSE_STD_IOSTREAM && wxUSE_STD_STRING + // We assume that we can use the ostream and not wstream for numbers. + wxSTD ostringstream os; + os << val; + return os.str(); +#else // wxUSE_STD_IOSTREAM + // Can't use iostream locale support, fall back to the manual method + // instead. + wxString s = FromDouble(val); +#if wxUSE_INTL + wxString sep = wxLocale::GetInfo(wxLOCALE_DECIMAL_POINT, + wxLOCALE_CAT_NUMBER); +#else // !wxUSE_INTL + // As above, this is the most common alternative value. Notice that here it + // doesn't matter if we guess wrongly and the current separator is already + // ".": we'll just waste a call to Replace() in this case. + wxString sep(","); +#endif // wxUSE_INTL/!wxUSE_INTL + + s.Replace(sep, "."); + return s; +#endif // wxUSE_STD_IOSTREAM/!wxUSE_STD_IOSTREAM } // --------------------------------------------------------------------------- @@ -1571,6 +1909,60 @@ int wxString::DoPrintfUtf8(const char *format, ...) } #endif // wxUSE_UNICODE_UTF8 +/* + Uses wxVsnprintf and places the result into this string. + + In ANSI build, wxVsnprintf is effectively vsnprintf but in Unicode build + it is vswprintf.
Due to a discrepancy between vsnprintf and vswprintf in + the ISO C99 (and thus SUSv3) standard the return value for the case of + an undersized buffer is inconsistent. For conforming vsnprintf + implementations the function must return the number of characters that + would have been printed had the buffer been large enough. For conforming + vswprintf implementations the function must return a negative number + and set errno. + + What vswprintf sets errno to is undefined but Darwin seems to set it to + EOVERFLOW. The only expected errno are EILSEQ and EINVAL. Both of + those are defined in the standard and backed up by several conformance + statements. Note that ENOMEM mentioned in the manual page does not + apply to swprintf, only wprintf and fwprintf. + + Official manual page: + http://www.opengroup.org/onlinepubs/009695399/functions/swprintf.html + + Some conformance statements (AIX, Solaris): + http://www.opengroup.org/csq/view.mhtml?RID=ibm%2FSD1%2F3 + http://www.theopengroup.org/csq/view.mhtml?norationale=1&noreferences=1&RID=Fujitsu%2FSE2%2F10 + + Since EILSEQ and EINVAL are rather common but EOVERFLOW is not and since + EILSEQ and EINVAL are specifically defined to mean the error is other than + an undersized buffer and no other errno are defined we treat those two + as meaning hard errors and everything else gets the old behavior which + is to keep looping and increasing buffer size until the function succeeds. + + In practice it's impossible to determine before compilation which behavior + may be used. The vswprintf function may have vsnprintf-like behavior or + vice-versa. Behavior detected on one release can theoretically change + with an updated release. Not to mention that configure testing for it + would require the test to be run on the host system, not the build system + which makes cross compilation difficult. 
Therefore, we make no assumptions + about behavior and try our best to handle every known case, including the + case where wxVsnprintf returns a negative number and fails to set errno. + + There is yet one more non-standard implementation and that is our own. + Fortunately, that can be detected at compile-time. + + On top of all that, ISO C99 explicitly defines snprintf to write a null + character to the last position of the specified buffer. That would be at + the given buffer size minus 1. It is supposed to do this even if it + turns out that the buffer is sized too small. + + Darwin (tested on 10.5) follows the C99 behavior exactly. + + Glibc 2.6 almost follows the C99 behavior except vswprintf never sets + errno even when it fails. However, it only seems to ever fail due + to an undersized buffer. +*/ #if wxUSE_UNICODE_UTF8 template #else @@ -1608,13 +2000,20 @@ static int DoStringPrintfV(wxString& str, // only a copy va_list argptrcopy; wxVaCopy(argptrcopy, argptr); + +#ifndef __WXWINCE__ + // Set errno to 0 to make it determinate if wxVsnprintf fails to set it. + errno = 0; +#endif int len = wxVsnprintf(buf, size, format, argptrcopy); va_end(argptrcopy); // some implementations of vsnprintf() don't NUL terminate // the string if there is not enough space for it so // always do it manually - buf[size] = _T('\0'); + // FIXME: This really seems to be wrong and would be an off-by-one + // bug except the code above allocates an extra character.
+ buf[size] = wxT('\0'); // vsnprintf() may return either -1 (traditional Unix behaviour) or the // total number of characters which would have been written if the @@ -1635,19 +2034,33 @@ static int DoStringPrintfV(wxString& str, // assume it only returns error if there is not enough space, but // as we don't know how much we need, double the current size of // the buffer - size *= 2; +#ifndef __WXWINCE__ + if( (errno == EILSEQ) || (errno == EINVAL) ) + // If errno was set to one of the two well-known hard errors + // then fail immediately to avoid an infinite loop. + return -1; + else +#endif // __WXWINCE__ + // still not enough, as we don't know how much we need, double the + // current size of the buffer + size *= 2; #endif // wxUSE_WXVSNPRINTF/!wxUSE_WXVSNPRINTF } else if ( len >= size ) { #if wxUSE_WXVSNPRINTF - // we know that our own implementation of wxVsnprintf() returns + // we know that our own implementation of wxVsnprintf() returns // size+1 when there's not enough space but that's not the size // of the required buffer! size *= 2; // so we just double the current size of the buffer #else // some vsnprintf() implementations NUL-terminate the buffer and // some don't in len == size case, to be safe always add 1 + // FIXME: I don't quite understand this comment. The vsnprintf + // function is specifically defined to return the number of + // characters printed not including the null terminator. + // So OF COURSE you need to add 1 to get the right buffer size. + // The following line is definitely correct, no question. 
size = len + 1; #endif } @@ -1706,31 +2119,31 @@ bool wxString::Matches(const wxString& mask) const wxString pattern; pattern.reserve(wxStrlen(pszMask)); - pattern += _T('^'); + pattern += wxT('^'); while ( *pszMask ) { switch ( *pszMask ) { - case _T('?'): - pattern += _T('.'); + case wxT('?'): + pattern += wxT('.'); break; - case _T('*'): - pattern += _T(".*"); + case wxT('*'): + pattern += wxT(".*"); break; - case _T('^'): - case _T('.'): - case _T('$'): - case _T('('): - case _T(')'): - case _T('|'): - case _T('+'): - case _T('\\'): + case wxT('^'): + case wxT('.'): + case wxT('$'): + case wxT('('): + case wxT(')'): + case wxT('|'): + case wxT('+'): + case wxT('\\'): // these characters are special in a RE, quote them // (however note that we don't quote '[' and ']' to allow // using them for Unix shell like matching) - pattern += _T('\\'); + pattern += wxT('\\'); // fall through default: @@ -1739,7 +2152,7 @@ bool wxString::Matches(const wxString& mask) const pszMask++; } - pattern += _T('$'); + pattern += wxT('$'); // and now use it return wxRegEx(pattern, wxRE_NOSUB | wxRE_EXTENDED).Matches(c_str()); @@ -1748,8 +2161,8 @@ bool wxString::Matches(const wxString& mask) const // FIXME-UTF8: implement using iterators, remove #if #if wxUSE_UNICODE_UTF8 - wxWCharBuffer maskBuf = mask.wc_str(); - wxWCharBuffer txtBuf = wc_str(); + const wxScopedWCharBuffer maskBuf = mask.wc_str(); + const wxScopedWCharBuffer txtBuf = wc_str(); const wxChar *pszMask = maskBuf.data(); const wxChar *pszTxt = txtBuf.data(); #else @@ -1851,38 +2264,3 @@ int wxString::Freq(wxUniChar ch) const return count; } -// convert to upper case, return the copy of the string -wxString wxString::Upper() const -{ wxString s(*this); return s.MakeUpper(); } - -// convert to lower case, return the copy of the string -wxString wxString::Lower() const { wxString s(*this); return s.MakeLower(); } - -// ---------------------------------------------------------------------------- -// wxUTF8StringBuffer -// 
---------------------------------------------------------------------------- - -#if wxUSE_UNICODE_WCHAR -wxUTF8StringBuffer::~wxUTF8StringBuffer() -{ - wxMBConvStrictUTF8 conv; - size_t wlen = conv.ToWChar(NULL, 0, m_buf); - wxCHECK_RET( wlen != wxCONV_FAILED, "invalid UTF-8 data in string buffer?" ); - - wxStringInternalBuffer wbuf(m_str, wlen); - conv.ToWChar(wbuf, wlen, m_buf); -} - -wxUTF8StringBufferLength::~wxUTF8StringBufferLength() -{ - wxCHECK_RET(m_lenSet, "length not set"); - - wxMBConvStrictUTF8 conv; - size_t wlen = conv.ToWChar(NULL, 0, m_buf, m_len); - wxCHECK_RET( wlen != wxCONV_FAILED, "invalid UTF-8 data in string buffer?" ); - - wxStringInternalBufferLength wbuf(m_str, wlen); - conv.ToWChar(wbuf, wlen, m_buf, m_len); - wbuf.SetLength(wlen); -} -#endif // wxUSE_UNICODE_WCHAR