src/common/tokenzr.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/tokenzr.cpp
   3 // Purpose:     String tokenizer
   4 // Author:      Guilhem Lavaux
   5 // Modified by: Vadim Zeitlin (almost full rewrite)
   6 // Created:     04/22/98
   7 // RCS-ID:      $Id$
   8 // Copyright:   (c) Guilhem Lavaux
   9 // Licence:     wxWindows licence
  10 /////////////////////////////////////////////////////////////////////////////
  11
  12 // ============================================================================
  13 // declarations
  14 // ============================================================================
  15
  16 // ----------------------------------------------------------------------------
  17 // headers
  18 // ----------------------------------------------------------------------------
  19
  20 // For compilers that support precompilation, includes "wx.h".
  21 #include "wx/wxprec.h"
  22
  23 #ifdef __BORLANDC__
  24     #pragma hdrstop
  25 #endif
  26
  27 #include "wx/tokenzr.h"
  28
  29 #ifndef WX_PRECOMP
  30     #include "wx/arrstr.h"
  31     #include "wx/crt.h"
  32 #endif
  33
  34 // Required for wxIs... functions
  35 #include <ctype.h>
  36
  37 // ============================================================================
  38 // implementation
  39 // ============================================================================
  40
  41 // ----------------------------------------------------------------------------
  42 // helpers
  43 // ----------------------------------------------------------------------------
  44
  45 static wxString::const_iterator
  46 find_first_of(const wxChar *delims, size_t len,
  47               const wxString::const_iterator& from,
  48               const wxString::const_iterator& end)
  49 {
  50     wxASSERT_MSG( from <= end,  wxT("invalid index") );
  51
  52     for ( wxString::const_iterator i = from; i != end; ++i )
  53     {
  54         if ( wxTmemchr(delims, *i, len) )
  55             return i;
  56     }
  57
  58     return end;
  59 }
  60
  61 static wxString::const_iterator
  62 find_first_not_of(const wxChar *delims, size_t len,
  63                   const wxString::const_iterator& from,
  64                   const wxString::const_iterator& end)
  65 {
  66     wxASSERT_MSG( from <= end,  wxT("invalid index") );
  67
  68     for ( wxString::const_iterator i = from; i != end; ++i )
  69     {
  70         if ( !wxTmemchr(delims, *i, len) )
  71             return i;
  72     }
  73
  74     return end;
  75 }
  76
  77 // ----------------------------------------------------------------------------
  78 // wxStringTokenizer construction
  79 // ----------------------------------------------------------------------------
  80
  81 wxStringTokenizer::wxStringTokenizer(const wxString& str,
  82                                      const wxString& delims,
  83                                      wxStringTokenizerMode mode)
  84 {
  85     SetString(str, delims, mode);
  86 }
  87
  88 void wxStringTokenizer::SetString(const wxString& str,
  89                                   const wxString& delims,
  90                                   wxStringTokenizerMode mode)
  91 {
  92     if ( mode == wxTOKEN_DEFAULT )
  93     {
  94         // by default, we behave like strtok() if the delimiters are only
  95         // whitespace characters and as wxTOKEN_RET_EMPTY otherwise (for
  96         // whitespace delimiters, strtok() behaviour is better because we want
  97         // to count consecutive spaces as one delimiter)
  98         wxString::const_iterator p;
  99         for ( p = delims.begin(); p != delims.end(); ++p )
 100         {
 101             if ( !wxIsspace(*p) )
 102                 break;
 103         }
 104
 105         if ( p != delims.end() )
 106         {
 107             // not whitespace char in delims
 108             mode = wxTOKEN_RET_EMPTY;
 109         }
 110         else
 111         {
 112             // only whitespaces
 113             mode = wxTOKEN_STRTOK;
 114         }
 115     }
 116
 117 #if wxUSE_UNICODE // FIXME-UTF8: only wc_str()
 118     m_delims = delims.wc_str();
 119 #else
 120     m_delims = delims.mb_str();
 121 #endif
 122     m_delimsLen = delims.length();
 123
 124     m_mode = mode;
 125
 126     Reinit(str);
 127 }
 128
 129 void wxStringTokenizer::Reinit(const wxString& str)
 130 {
 131     wxASSERT_MSG( IsOk(), wxT("you should call SetString() first") );
 132
 133     m_string = str;
 134     m_stringEnd = m_string.end();
 135     m_pos = m_string.begin();
 136     m_lastDelim = wxT('\0');
 137     m_hasMoreTokens = MoreTokens_Unknown;
 138 }
 139
 140 // ----------------------------------------------------------------------------
 141 // access to the tokens
 142 // ----------------------------------------------------------------------------
 143
 144 // do we have more of them?
 145 bool wxStringTokenizer::HasMoreTokens() const
 146 {
 147     // GetNextToken() calls HasMoreTokens() and so HasMoreTokens() is called
 148     // twice in every interation in the following common usage patten:
 149     //     while ( HasMoreTokens() )
 150     //        GetNextToken();
 151     // We optimize this case by caching HasMoreTokens() return value here:
 152     if ( m_hasMoreTokens == MoreTokens_Unknown )
 153     {
 154         bool r = DoHasMoreTokens();
 155         wxConstCast(this, wxStringTokenizer)->m_hasMoreTokens =
 156             r ? MoreTokens_Yes : MoreTokens_No;
 157         return r;
 158     }
 159     else
 160         return m_hasMoreTokens == MoreTokens_Yes;
 161 }
 162
 163 bool wxStringTokenizer::DoHasMoreTokens() const
 164 {
 165     wxCHECK_MSG( IsOk(), false, wxT("you should call SetString() first") );
 166
 167     if ( find_first_not_of(m_delims, m_delimsLen, m_pos, m_stringEnd)
 168          != m_stringEnd )
 169     {
 170         // there are non delimiter characters left, so we do have more tokens
 171         return true;
 172     }
 173
 174     switch ( m_mode )
 175     {
 176         case wxTOKEN_RET_EMPTY:
 177         case wxTOKEN_RET_DELIMS:
 178             // special hack for wxTOKEN_RET_EMPTY: we should return the initial
 179             // empty token even if there are only delimiters after it
 180             return !m_string.empty() && m_pos == m_string.begin();
 181
 182         case wxTOKEN_RET_EMPTY_ALL:
 183             // special hack for wxTOKEN_RET_EMPTY_ALL: we can know if we had
 184             // already returned the trailing empty token after the last
 185             // delimiter by examining m_lastDelim: it is set to NUL if we run
 186             // up to the end of the string in GetNextToken(), but if it is not
 187             // NUL yet we still have this last token to return even if m_pos is
 188             // already at m_string.length()
 189             return m_pos < m_stringEnd || m_lastDelim != wxT('\0');
 190
 191         case wxTOKEN_INVALID:
 192         case wxTOKEN_DEFAULT:
 193             wxFAIL_MSG( wxT("unexpected tokenizer mode") );
 194             // fall through
 195
 196         case wxTOKEN_STRTOK:
 197             // never return empty delimiters
 198             break;
 199     }
 200
 201     return false;
 202 }
 203
 204 // count the number of (remaining) tokens in the string
 205 size_t wxStringTokenizer::CountTokens() const
 206 {
 207     wxCHECK_MSG( IsOk(), 0, wxT("you should call SetString() first") );
 208
 209     // VZ: this function is IMHO not very useful, so it's probably not very
 210     //     important if its implementation here is not as efficient as it
 211     //     could be -- but OTOH like this we're sure to get the correct answer
 212     //     in all modes
 213     wxStringTokenizer tkz(wxString(m_pos, m_stringEnd), m_delims, m_mode);
 214
 215     size_t count = 0;
 216     while ( tkz.HasMoreTokens() )
 217     {
 218         count++;
 219
 220         (void)tkz.GetNextToken();
 221     }
 222
 223     return count;
 224 }
 225
 226 // ----------------------------------------------------------------------------
 227 // token extraction
 228 // ----------------------------------------------------------------------------
 229
 230 wxString wxStringTokenizer::GetNextToken()
 231 {
 232     wxString token;
 233     do
 234     {
 235         if ( !HasMoreTokens() )
 236         {
 237             break;
 238         }
 239
 240         m_hasMoreTokens = MoreTokens_Unknown;
 241
 242         // find the end of this token
 243         wxString::const_iterator pos =
 244             find_first_of(m_delims, m_delimsLen, m_pos, m_stringEnd);
 245
 246         // and the start of the next one
 247         if ( pos == m_stringEnd )
 248         {
 249             // no more delimiters, the token is everything till the end of
 250             // string
 251             token.assign(m_pos, m_stringEnd);
 252
 253             // skip the token
 254             m_pos = m_stringEnd;
 255
 256             // it wasn't terminated
 257             m_lastDelim = wxT('\0');
 258         }
 259         else // we found a delimiter at pos
 260         {
 261             // in wxTOKEN_RET_DELIMS mode we return the delimiter character
 262             // with token, otherwise leave it out
 263             wxString::const_iterator tokenEnd(pos);
 264             if ( m_mode == wxTOKEN_RET_DELIMS )
 265                 ++tokenEnd;
 266
 267             token.assign(m_pos, tokenEnd);
 268
 269             // skip the token and the trailing delimiter
 270             m_pos = pos + 1;
 271
 272             m_lastDelim = (pos == m_stringEnd) ? wxT('\0') : (wxChar)*pos;
 273         }
 274     }
 275     while ( !AllowEmpty() && token.empty() );
 276
 277     return token;
 278 }
 279
 280 // ----------------------------------------------------------------------------
 281 // public functions
 282 // ----------------------------------------------------------------------------
 283
 284 wxArrayString wxStringTokenize(const wxString& str,
 285                                const wxString& delims,
 286                                wxStringTokenizerMode mode)
 287 {
 288     wxArrayString tokens;
 289     wxStringTokenizer tk(str, delims, mode);
 290     while ( tk.HasMoreTokens() )
 291     {
 292         tokens.Add(tk.GetNextToken());
 293     }
 294
 295     return tokens;
 296 }