src/common/tokenzr.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/tokenzr.cpp
   3 // Purpose:     String tokenizer
   4 // Author:      Guilhem Lavaux
   5 // Modified by: Vadim Zeitlin (almost full rewrite)
   6 // Created:     04/22/98
   7 // Copyright:   (c) Guilhem Lavaux
   8 // Licence:     wxWindows licence
   9 /////////////////////////////////////////////////////////////////////////////
  10
  11 // ============================================================================
  12 // declarations
  13 // ============================================================================
  14
  15 // ----------------------------------------------------------------------------
  16 // headers
  17 // ----------------------------------------------------------------------------
  18
  19 // For compilers that support precompilation, includes "wx.h".
  20 #include "wx/wxprec.h"
  21
  22 #ifdef __BORLANDC__
  23     #pragma hdrstop
  24 #endif
  25
  26 #include "wx/tokenzr.h"
  27
  28 #ifndef WX_PRECOMP
  29     #include "wx/arrstr.h"
  30     #include "wx/crt.h"
  31 #endif
  32
  33 // Required for wxIs... functions
  34 #include <ctype.h>
  35
  36 // ============================================================================
  37 // implementation
  38 // ============================================================================
  39
  40 // ----------------------------------------------------------------------------
  41 // helpers
  42 // ----------------------------------------------------------------------------
  43
  44 static wxString::const_iterator
  45 find_first_of(const wxChar *delims, size_t len,
  46               const wxString::const_iterator& from,
  47               const wxString::const_iterator& end)
  48 {
  49     wxASSERT_MSG( from <= end,  wxT("invalid index") );
  50
  51     for ( wxString::const_iterator i = from; i != end; ++i )
  52     {
  53         if ( wxTmemchr(delims, *i, len) )
  54             return i;
  55     }
  56
  57     return end;
  58 }
  59
  60 static wxString::const_iterator
  61 find_first_not_of(const wxChar *delims, size_t len,
  62                   const wxString::const_iterator& from,
  63                   const wxString::const_iterator& end)
  64 {
  65     wxASSERT_MSG( from <= end,  wxT("invalid index") );
  66
  67     for ( wxString::const_iterator i = from; i != end; ++i )
  68     {
  69         if ( !wxTmemchr(delims, *i, len) )
  70             return i;
  71     }
  72
  73     return end;
  74 }
  75
  76 // ----------------------------------------------------------------------------
  77 // wxStringTokenizer construction
  78 // ----------------------------------------------------------------------------
  79
  80 wxStringTokenizer::wxStringTokenizer(const wxString& str,
  81                                      const wxString& delims,
  82                                      wxStringTokenizerMode mode)
  83 {
  84     SetString(str, delims, mode);
  85 }
  86
  87 void wxStringTokenizer::SetString(const wxString& str,
  88                                   const wxString& delims,
  89                                   wxStringTokenizerMode mode)
  90 {
  91     if ( mode == wxTOKEN_DEFAULT )
  92     {
  93         // by default, we behave like strtok() if the delimiters are only
  94         // whitespace characters and as wxTOKEN_RET_EMPTY otherwise (for
  95         // whitespace delimiters, strtok() behaviour is better because we want
  96         // to count consecutive spaces as one delimiter)
  97         wxString::const_iterator p;
  98         for ( p = delims.begin(); p != delims.end(); ++p )
  99         {
 100             if ( !wxIsspace(*p) )
 101                 break;
 102         }
 103
 104         if ( p != delims.end() )
 105         {
 106             // not whitespace char in delims
 107             mode = wxTOKEN_RET_EMPTY;
 108         }
 109         else
 110         {
 111             // only whitespaces
 112             mode = wxTOKEN_STRTOK;
 113         }
 114     }
 115
 116 #if wxUSE_UNICODE // FIXME-UTF8: only wc_str()
 117     m_delims = delims.wc_str();
 118 #else
 119     m_delims = delims.mb_str();
 120 #endif
 121     m_delimsLen = delims.length();
 122
 123     m_mode = mode;
 124
 125     Reinit(str);
 126 }
 127
 128 void wxStringTokenizer::Reinit(const wxString& str)
 129 {
 130     wxASSERT_MSG( IsOk(), wxT("you should call SetString() first") );
 131
 132     m_string = str;
 133     m_stringEnd = m_string.end();
 134     m_pos = m_string.begin();
 135     m_lastDelim = wxT('\0');
 136     m_hasMoreTokens = MoreTokens_Unknown;
 137 }
 138
 139 // ----------------------------------------------------------------------------
 140 // access to the tokens
 141 // ----------------------------------------------------------------------------
 142
 143 // do we have more of them?
 144 bool wxStringTokenizer::HasMoreTokens() const
 145 {
 146     // GetNextToken() calls HasMoreTokens() and so HasMoreTokens() is called
 147     // twice in every interation in the following common usage patten:
 148     //     while ( HasMoreTokens() )
 149     //        GetNextToken();
 150     // We optimize this case by caching HasMoreTokens() return value here:
 151     if ( m_hasMoreTokens == MoreTokens_Unknown )
 152     {
 153         bool r = DoHasMoreTokens();
 154         wxConstCast(this, wxStringTokenizer)->m_hasMoreTokens =
 155             r ? MoreTokens_Yes : MoreTokens_No;
 156         return r;
 157     }
 158     else
 159         return m_hasMoreTokens == MoreTokens_Yes;
 160 }
 161
 162 bool wxStringTokenizer::DoHasMoreTokens() const
 163 {
 164     wxCHECK_MSG( IsOk(), false, wxT("you should call SetString() first") );
 165
 166     if ( find_first_not_of(m_delims, m_delimsLen, m_pos, m_stringEnd)
 167          != m_stringEnd )
 168     {
 169         // there are non delimiter characters left, so we do have more tokens
 170         return true;
 171     }
 172
 173     switch ( m_mode )
 174     {
 175         case wxTOKEN_RET_EMPTY:
 176         case wxTOKEN_RET_DELIMS:
 177             // special hack for wxTOKEN_RET_EMPTY: we should return the initial
 178             // empty token even if there are only delimiters after it
 179             return !m_string.empty() && m_pos == m_string.begin();
 180
 181         case wxTOKEN_RET_EMPTY_ALL:
 182             // special hack for wxTOKEN_RET_EMPTY_ALL: we can know if we had
 183             // already returned the trailing empty token after the last
 184             // delimiter by examining m_lastDelim: it is set to NUL if we run
 185             // up to the end of the string in GetNextToken(), but if it is not
 186             // NUL yet we still have this last token to return even if m_pos is
 187             // already at m_string.length()
 188             return m_pos < m_stringEnd || m_lastDelim != wxT('\0');
 189
 190         case wxTOKEN_INVALID:
 191         case wxTOKEN_DEFAULT:
 192             wxFAIL_MSG( wxT("unexpected tokenizer mode") );
 193             // fall through
 194
 195         case wxTOKEN_STRTOK:
 196             // never return empty delimiters
 197             break;
 198     }
 199
 200     return false;
 201 }
 202
 203 // count the number of (remaining) tokens in the string
 204 size_t wxStringTokenizer::CountTokens() const
 205 {
 206     wxCHECK_MSG( IsOk(), 0, wxT("you should call SetString() first") );
 207
 208     // VZ: this function is IMHO not very useful, so it's probably not very
 209     //     important if its implementation here is not as efficient as it
 210     //     could be -- but OTOH like this we're sure to get the correct answer
 211     //     in all modes
 212     wxStringTokenizer tkz(wxString(m_pos, m_stringEnd), m_delims, m_mode);
 213
 214     size_t count = 0;
 215     while ( tkz.HasMoreTokens() )
 216     {
 217         count++;
 218
 219         (void)tkz.GetNextToken();
 220     }
 221
 222     return count;
 223 }
 224
 225 // ----------------------------------------------------------------------------
 226 // token extraction
 227 // ----------------------------------------------------------------------------
 228
 229 wxString wxStringTokenizer::GetNextToken()
 230 {
 231     wxString token;
 232     do
 233     {
 234         if ( !HasMoreTokens() )
 235         {
 236             break;
 237         }
 238
 239         m_hasMoreTokens = MoreTokens_Unknown;
 240
 241         // find the end of this token
 242         wxString::const_iterator pos =
 243             find_first_of(m_delims, m_delimsLen, m_pos, m_stringEnd);
 244
 245         // and the start of the next one
 246         if ( pos == m_stringEnd )
 247         {
 248             // no more delimiters, the token is everything till the end of
 249             // string
 250             token.assign(m_pos, m_stringEnd);
 251
 252             // skip the token
 253             m_pos = m_stringEnd;
 254
 255             // it wasn't terminated
 256             m_lastDelim = wxT('\0');
 257         }
 258         else // we found a delimiter at pos
 259         {
 260             // in wxTOKEN_RET_DELIMS mode we return the delimiter character
 261             // with token, otherwise leave it out
 262             wxString::const_iterator tokenEnd(pos);
 263             if ( m_mode == wxTOKEN_RET_DELIMS )
 264                 ++tokenEnd;
 265
 266             token.assign(m_pos, tokenEnd);
 267
 268             // skip the token and the trailing delimiter
 269             m_pos = pos + 1;
 270
 271             m_lastDelim = (pos == m_stringEnd) ? wxT('\0') : (wxChar)*pos;
 272         }
 273     }
 274     while ( !AllowEmpty() && token.empty() );
 275
 276     return token;
 277 }
 278
 279 // ----------------------------------------------------------------------------
 280 // public functions
 281 // ----------------------------------------------------------------------------
 282
 283 wxArrayString wxStringTokenize(const wxString& str,
 284                                const wxString& delims,
 285                                wxStringTokenizerMode mode)
 286 {
 287     wxArrayString tokens;
 288     wxStringTokenizer tk(str, delims, mode);
 289     while ( tk.HasMoreTokens() )
 290     {
 291         tokens.Add(tk.GetNextToken());
 292     }
 293
 294     return tokens;
 295 }