From f0dfc29c713f1341a33b272aa395db9c36405a83 Mon Sep 17 00:00:00 2001 From: =?utf8?q?V=C3=A1clav=20Slav=C3=ADk?= Date: Tue, 24 Jul 2007 20:00:12 +0000 Subject: [PATCH] optimized wxStringTokenizer: it's now slightly faster in wchar_t build and much faster in UTF-8 build git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@47707 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- include/wx/tokenzr.h | 24 ++++++++--- src/common/tokenzr.cpp | 93 +++++++++++++++++++++++++++++++++++------- 2 files changed, 98 insertions(+), 19 deletions(-) diff --git a/include/wx/tokenzr.h b/include/wx/tokenzr.h index d47d52c324..fca43655d9 100644 --- a/include/wx/tokenzr.h +++ b/include/wx/tokenzr.h @@ -72,12 +72,12 @@ public: // get current tokenizer state // returns the part of the string which remains to tokenize (*not* the // initial string) - wxString GetString() const { return m_string.substr(m_pos); } + wxString GetString() const { return wxString(m_pos, m_string.end()); } // returns the current position (i.e. one index after the last // returned token or 0 if GetNextToken() has never been called) in the // original string - size_t GetPosition() const { return m_pos; } + size_t GetPosition() const { return m_pos - m_string.begin(); } // misc // get the current mode - can be different from the one passed to the @@ -111,10 +111,24 @@ public: protected: bool IsOk() const { return m_mode != wxTOKEN_INVALID; } - wxString m_string, // the string we tokenize - m_delims; // all possible delimiters + bool DoHasMoreTokens() const; - size_t m_pos; // the current position in m_string + enum MoreTokensState + { + MoreTokens_Unknown, + MoreTokens_Yes, + MoreTokens_No + }; + + MoreTokensState m_hasMoreTokens; + + wxString m_string; // the string we tokenize + wxString::const_iterator m_stringEnd; + // FIXME-UTF8: use wxWcharBuffer + wxWxCharBuffer m_delims; // all possible delimiters + size_t m_delimsLen; + + wxString::const_iterator m_pos; // the current position in m_string wxStringTokenizerMode m_mode; // see wxTOKEN_XXX values diff --git a/src/common/tokenzr.cpp b/src/common/tokenzr.cpp index ea615aaf57..12aa6c2e1d 100644 --- a/src/common/tokenzr.cpp +++ b/src/common/tokenzr.cpp @@ -38,6 +38,42 @@ // implementation // ============================================================================ +// ---------------------------------------------------------------------------- +// helpers +// ---------------------------------------------------------------------------- + +static wxString::const_iterator +find_first_of(const wxChar *delims, size_t len, + const wxString::const_iterator& from, + const wxString::const_iterator& end) +{ + wxASSERT_MSG( from <= end, _T("invalid index") ); + + for ( wxString::const_iterator i = from; i != end; ++i ) + { + if ( wxTmemchr(delims, *i, len) ) + return i; + } + + return end; +} + +static wxString::const_iterator +find_first_not_of(const wxChar *delims, size_t len, + const wxString::const_iterator& from, + const wxString::const_iterator& end) +{ + wxASSERT_MSG( from <= end, _T("invalid index") ); + + for ( wxString::const_iterator i = from; i != end; ++i ) + { + if ( !wxTmemchr(delims, *i, len) ) + return i; + } + + return end; +} + // ---------------------------------------------------------------------------- // wxStringTokenizer construction // ---------------------------------------------------------------------------- @@ -78,7 +114,13 @@ void wxStringTokenizer::SetString(const wxString& str, } } - m_delims = delims; +#if wxUSE_UNICODE // FIXME-UTF8: only wc_str() + m_delims = delims.wc_str(); +#else + m_delims = delims.mb_str(); +#endif + m_delimsLen = delims.length(); + m_mode = mode; Reinit(str); @@ -89,8 +131,10 @@ void wxStringTokenizer::Reinit(const wxString& str) wxASSERT_MSG( IsOk(), _T("you should call SetString() first") ); m_string = str; - m_pos = 0; + m_stringEnd = m_string.end(); + m_pos = m_string.begin(); m_lastDelim = _T('\0'); + m_hasMoreTokens = MoreTokens_Unknown; } // ---------------------------------------------------------------------------- @@ -99,10 +143,29 @@ void wxStringTokenizer::Reinit(const wxString& str) // do we have more of them? bool wxStringTokenizer::HasMoreTokens() const +{ + // GetNextToken() calls HasMoreTokens() and so HasMoreTokens() is called + // twice in every interation in the following common usage patten: + // while ( HasMoreTokens() ) + // GetNextToken(); + // We optimize this case by caching HasMoreTokens() return value here: + if ( m_hasMoreTokens == MoreTokens_Unknown ) + { + bool r = DoHasMoreTokens(); + wxConstCast(this, wxStringTokenizer)->m_hasMoreTokens = + r ? MoreTokens_Yes : MoreTokens_No; + return r; + } + else + return m_hasMoreTokens == MoreTokens_Yes; +} + +bool wxStringTokenizer::DoHasMoreTokens() const { wxCHECK_MSG( IsOk(), false, _T("you should call SetString() first") ); - if ( m_string.find_first_not_of(m_delims, m_pos) != wxString::npos ) + if ( find_first_not_of(m_delims, m_delimsLen, m_pos, m_stringEnd) + != m_stringEnd ) { // there are non delimiter characters left, so we do have more tokens return true; @@ -114,7 +177,7 @@ bool wxStringTokenizer::HasMoreTokens() const case wxTOKEN_RET_DELIMS: // special hack for wxTOKEN_RET_EMPTY: we should return the initial // empty token even if there are only delimiters after it - return m_pos == 0 && !m_string.empty(); + return !m_string.empty() && m_pos == m_string.begin(); case wxTOKEN_RET_EMPTY_ALL: // special hack for wxTOKEN_RET_EMPTY_ALL: we can know if we had @@ -123,7 +186,7 @@ bool wxStringTokenizer::HasMoreTokens() const // up to the end of the string in GetNextToken(), but if it is not // NUL yet we still have this last token to return even if m_pos is // already at m_string.length() - return m_pos < m_string.length() || m_lastDelim != _T('\0'); + return m_pos < m_stringEnd || m_lastDelim != _T('\0'); case wxTOKEN_INVALID: case wxTOKEN_DEFAULT: @@ -147,7 +210,7 @@ size_t wxStringTokenizer::CountTokens() const // important if its implementation here is not as efficient as it // could be -- but OTOH like this we're sure to get the correct answer // in all modes - wxStringTokenizer tkz(m_string.c_str() + m_pos, m_delims, m_mode); + wxStringTokenizer tkz(wxString(m_pos, m_stringEnd), m_delims, m_mode); size_t count = 0; while ( tkz.HasMoreTokens() ) @@ -174,18 +237,21 @@ wxString wxStringTokenizer::GetNextToken() break; } + m_hasMoreTokens = MoreTokens_Unknown; + // find the end of this token - size_t pos = m_string.find_first_of(m_delims, m_pos); + wxString::const_iterator pos = + find_first_of(m_delims, m_delimsLen, m_pos, m_stringEnd); // and the start of the next one - if ( pos == wxString::npos ) + if ( pos == m_stringEnd ) { // no more delimiters, the token is everything till the end of // string - token.assign(m_string, m_pos, wxString::npos); + token.assign(m_pos, m_stringEnd); // skip the token - m_pos = m_string.length(); + m_pos = m_stringEnd; // it wasn't terminated m_lastDelim = _T('\0'); @@ -194,16 +260,15 @@ wxString wxStringTokenizer::GetNextToken() { // in wxTOKEN_RET_DELIMS mode we return the delimiter character // with token, otherwise leave it out - size_t len = pos - m_pos; if ( m_mode == wxTOKEN_RET_DELIMS ) - len++; + ++pos; - token.assign(m_string, m_pos, len); + token.assign(m_pos, pos); // skip the token and the trailing delimiter m_pos = pos + 1; - m_lastDelim = m_string[pos]; + m_lastDelim = (pos == m_stringEnd) ? _T('\0') : (wxChar)*pos; } } while ( !AllowEmpty() && token.empty() ); -- 2.45.2