Commit | Line | Data |
---|---|---|
f4ada568 | 1 | ///////////////////////////////////////////////////////////////////////////// |
aaa6d89a | 2 | // Name: src/common/tokenzr.cpp |
f4ada568 GL |
3 | // Purpose: String tokenizer |
4 | // Author: Guilhem Lavaux | |
1e6feb95 | 5 | // Modified by: Vadim Zeitlin (almost full rewrite) |
f4ada568 GL |
6 | // Created: 04/22/98 |
7 | // RCS-ID: $Id$ | |
8 | // Copyright: (c) Guilhem Lavaux | |
65571936 | 9 | // Licence: wxWindows licence |
f4ada568 GL |
10 | ///////////////////////////////////////////////////////////////////////////// |
11 | ||
bbf8fc53 VZ |
12 | // ============================================================================ |
13 | // declarations | |
14 | // ============================================================================ | |
15 | ||
16 | // ---------------------------------------------------------------------------- | |
17 | // headers | |
18 | // ---------------------------------------------------------------------------- | |
19 | ||
fcc6dddd JS |
20 | // For compilers that support precompilation, includes "wx.h". |
21 | #include "wx/wxprec.h" | |
22 | ||
23 | #ifdef __BORLANDC__ | |
85833f5c | 24 | #pragma hdrstop |
fcc6dddd JS |
25 | #endif |
26 | ||
f4ada568 | 27 | #include "wx/tokenzr.h" |
aaa6d89a WS |
28 | |
29 | #ifndef WX_PRECOMP | |
30 | #include "wx/arrstr.h" | |
0bf751e7 | 31 | #include "wx/crt.h" |
aaa6d89a | 32 | #endif |
f4ada568 | 33 | |
3f8e5072 JS |
34 | // Required for wxIs... functions |
35 | #include <ctype.h> | |
36 | ||
bbf8fc53 VZ |
37 | // ============================================================================ |
38 | // implementation | |
39 | // ============================================================================ | |
40 | ||
f0dfc29c VS |
41 | // ---------------------------------------------------------------------------- |
42 | // helpers | |
43 | // ---------------------------------------------------------------------------- | |
44 | ||
45 | static wxString::const_iterator | |
46 | find_first_of(const wxChar *delims, size_t len, | |
47 | const wxString::const_iterator& from, | |
48 | const wxString::const_iterator& end) | |
49 | { | |
9a83f860 | 50 | wxASSERT_MSG( from <= end, wxT("invalid index") ); |
f0dfc29c VS |
51 | |
52 | for ( wxString::const_iterator i = from; i != end; ++i ) | |
53 | { | |
54 | if ( wxTmemchr(delims, *i, len) ) | |
55 | return i; | |
56 | } | |
57 | ||
58 | return end; | |
59 | } | |
60 | ||
61 | static wxString::const_iterator | |
62 | find_first_not_of(const wxChar *delims, size_t len, | |
63 | const wxString::const_iterator& from, | |
64 | const wxString::const_iterator& end) | |
65 | { | |
9a83f860 | 66 | wxASSERT_MSG( from <= end, wxT("invalid index") ); |
f0dfc29c VS |
67 | |
68 | for ( wxString::const_iterator i = from; i != end; ++i ) | |
69 | { | |
70 | if ( !wxTmemchr(delims, *i, len) ) | |
71 | return i; | |
72 | } | |
73 | ||
74 | return end; | |
75 | } | |
76 | ||
bbf8fc53 VZ |
77 | // ---------------------------------------------------------------------------- |
78 | // wxStringTokenizer construction | |
79 | // ---------------------------------------------------------------------------- | |
80 | ||
7c968cee | 81 | wxStringTokenizer::wxStringTokenizer(const wxString& str, |
f4ada568 | 82 | const wxString& delims, |
7c968cee | 83 | wxStringTokenizerMode mode) |
bbf8fc53 | 84 | { |
7c968cee | 85 | SetString(str, delims, mode); |
bbf8fc53 VZ |
86 | } |
87 | ||
7c968cee | 88 | void wxStringTokenizer::SetString(const wxString& str, |
bbf8fc53 | 89 | const wxString& delims, |
7c968cee | 90 | wxStringTokenizerMode mode) |
f4ada568 | 91 | { |
7c968cee VZ |
92 | if ( mode == wxTOKEN_DEFAULT ) |
93 | { | |
94 | // by default, we behave like strtok() if the delimiters are only | |
95 | // whitespace characters and as wxTOKEN_RET_EMPTY otherwise (for | |
96 | // whitespace delimiters, strtok() behaviour is better because we want | |
97 | // to count consecutive spaces as one delimiter) | |
66c71d8a VS |
98 | wxString::const_iterator p; |
99 | for ( p = delims.begin(); p != delims.end(); ++p ) | |
7c968cee VZ |
100 | { |
101 | if ( !wxIsspace(*p) ) | |
102 | break; | |
103 | } | |
104 | ||
66c71d8a | 105 | if ( p != delims.end() ) |
7c968cee VZ |
106 | { |
107 | // not whitespace char in delims | |
108 | mode = wxTOKEN_RET_EMPTY; | |
109 | } | |
110 | else | |
111 | { | |
112 | // only whitespaces | |
113 | mode = wxTOKEN_STRTOK; | |
114 | } | |
115 | } | |
116 | ||
f0dfc29c VS |
117 | #if wxUSE_UNICODE // FIXME-UTF8: only wc_str() |
118 | m_delims = delims.wc_str(); | |
119 | #else | |
120 | m_delims = delims.mb_str(); | |
121 | #endif | |
122 | m_delimsLen = delims.length(); | |
123 | ||
7c968cee | 124 | m_mode = mode; |
bbf8fc53 | 125 | |
7c968cee | 126 | Reinit(str); |
f4ada568 GL |
127 | } |
128 | ||
7c968cee | 129 | void wxStringTokenizer::Reinit(const wxString& str) |
f4ada568 | 130 | { |
9a83f860 | 131 | wxASSERT_MSG( IsOk(), wxT("you should call SetString() first") ); |
7c968cee VZ |
132 | |
133 | m_string = str; | |
f0dfc29c VS |
134 | m_stringEnd = m_string.end(); |
135 | m_pos = m_string.begin(); | |
9a83f860 | 136 | m_lastDelim = wxT('\0'); |
f0dfc29c | 137 | m_hasMoreTokens = MoreTokens_Unknown; |
f4ada568 GL |
138 | } |
139 | ||
bbf8fc53 | 140 | // ---------------------------------------------------------------------------- |
7c968cee | 141 | // access to the tokens |
bbf8fc53 VZ |
142 | // ---------------------------------------------------------------------------- |
143 | ||
7c968cee VZ |
144 | // do we have more of them? |
145 | bool wxStringTokenizer::HasMoreTokens() const | |
f0dfc29c VS |
146 | { |
147 | // GetNextToken() calls HasMoreTokens() and so HasMoreTokens() is called | |
148 | // twice in every interation in the following common usage patten: | |
149 | // while ( HasMoreTokens() ) | |
150 | // GetNextToken(); | |
151 | // We optimize this case by caching HasMoreTokens() return value here: | |
152 | if ( m_hasMoreTokens == MoreTokens_Unknown ) | |
153 | { | |
154 | bool r = DoHasMoreTokens(); | |
155 | wxConstCast(this, wxStringTokenizer)->m_hasMoreTokens = | |
156 | r ? MoreTokens_Yes : MoreTokens_No; | |
157 | return r; | |
158 | } | |
159 | else | |
160 | return m_hasMoreTokens == MoreTokens_Yes; | |
161 | } | |
162 | ||
163 | bool wxStringTokenizer::DoHasMoreTokens() const | |
f4ada568 | 164 | { |
9a83f860 | 165 | wxCHECK_MSG( IsOk(), false, wxT("you should call SetString() first") ); |
7c968cee | 166 | |
f0dfc29c VS |
167 | if ( find_first_not_of(m_delims, m_delimsLen, m_pos, m_stringEnd) |
168 | != m_stringEnd ) | |
bbf8fc53 | 169 | { |
4626c57c VZ |
170 | // there are non delimiter characters left, so we do have more tokens |
171 | return true; | |
7c968cee | 172 | } |
4626c57c VZ |
173 | |
174 | switch ( m_mode ) | |
7c968cee | 175 | { |
4626c57c VZ |
176 | case wxTOKEN_RET_EMPTY: |
177 | case wxTOKEN_RET_DELIMS: | |
178 | // special hack for wxTOKEN_RET_EMPTY: we should return the initial | |
179 | // empty token even if there are only delimiters after it | |
f0dfc29c | 180 | return !m_string.empty() && m_pos == m_string.begin(); |
4626c57c VZ |
181 | |
182 | case wxTOKEN_RET_EMPTY_ALL: | |
183 | // special hack for wxTOKEN_RET_EMPTY_ALL: we can know if we had | |
184 | // already returned the trailing empty token after the last | |
185 | // delimiter by examining m_lastDelim: it is set to NUL if we run | |
186 | // up to the end of the string in GetNextToken(), but if it is not | |
187 | // NUL yet we still have this last token to return even if m_pos is | |
188 | // already at m_string.length() | |
9a83f860 | 189 | return m_pos < m_stringEnd || m_lastDelim != wxT('\0'); |
4626c57c VZ |
190 | |
191 | case wxTOKEN_INVALID: | |
192 | case wxTOKEN_DEFAULT: | |
9a83f860 | 193 | wxFAIL_MSG( wxT("unexpected tokenizer mode") ); |
4626c57c VZ |
194 | // fall through |
195 | ||
196 | case wxTOKEN_STRTOK: | |
197 | // never return empty delimiters | |
198 | break; | |
7c968cee | 199 | } |
4626c57c VZ |
200 | |
201 | return false; | |
7c968cee | 202 | } |
bbf8fc53 | 203 | |
4626c57c | 204 | // count the number of (remaining) tokens in the string |
7c968cee VZ |
205 | size_t wxStringTokenizer::CountTokens() const |
206 | { | |
9a83f860 | 207 | wxCHECK_MSG( IsOk(), 0, wxT("you should call SetString() first") ); |
bbf8fc53 | 208 | |
7c968cee | 209 | // VZ: this function is IMHO not very useful, so it's probably not very |
4626c57c VZ |
210 | // important if its implementation here is not as efficient as it |
211 | // could be -- but OTOH like this we're sure to get the correct answer | |
7c968cee | 212 | // in all modes |
f0dfc29c | 213 | wxStringTokenizer tkz(wxString(m_pos, m_stringEnd), m_delims, m_mode); |
bbf8fc53 | 214 | |
7c968cee | 215 | size_t count = 0; |
4626c57c | 216 | while ( tkz.HasMoreTokens() ) |
bbf8fc53 VZ |
217 | { |
218 | count++; | |
7c968cee | 219 | |
4626c57c | 220 | (void)tkz.GetNextToken(); |
bbf8fc53 VZ |
221 | } |
222 | ||
223 | return count; | |
224 | } | |
225 | ||
226 | // ---------------------------------------------------------------------------- | |
227 | // token extraction | |
228 | // ---------------------------------------------------------------------------- | |
229 | ||
230 | wxString wxStringTokenizer::GetNextToken() | |
231 | { | |
232 | wxString token; | |
7c968cee | 233 | do |
bbf8fc53 | 234 | { |
7c968cee | 235 | if ( !HasMoreTokens() ) |
85833f5c | 236 | { |
7c968cee | 237 | break; |
85833f5c | 238 | } |
4626c57c | 239 | |
f0dfc29c VS |
240 | m_hasMoreTokens = MoreTokens_Unknown; |
241 | ||
7c968cee | 242 | // find the end of this token |
f0dfc29c VS |
243 | wxString::const_iterator pos = |
244 | find_first_of(m_delims, m_delimsLen, m_pos, m_stringEnd); | |
7c968cee VZ |
245 | |
246 | // and the start of the next one | |
f0dfc29c | 247 | if ( pos == m_stringEnd ) |
85833f5c | 248 | { |
7c968cee VZ |
249 | // no more delimiters, the token is everything till the end of |
250 | // string | |
f0dfc29c | 251 | token.assign(m_pos, m_stringEnd); |
7c968cee | 252 | |
4626c57c | 253 | // skip the token |
f0dfc29c | 254 | m_pos = m_stringEnd; |
bbf8fc53 | 255 | |
4626c57c | 256 | // it wasn't terminated |
9a83f860 | 257 | m_lastDelim = wxT('\0'); |
85833f5c | 258 | } |
4626c57c | 259 | else // we found a delimiter at pos |
7c968cee | 260 | { |
7c968cee | 261 | // in wxTOKEN_RET_DELIMS mode we return the delimiter character |
4626c57c | 262 | // with token, otherwise leave it out |
f06a1f33 | 263 | wxString::const_iterator tokenEnd(pos); |
4626c57c | 264 | if ( m_mode == wxTOKEN_RET_DELIMS ) |
f06a1f33 | 265 | ++tokenEnd; |
4626c57c | 266 | |
f06a1f33 | 267 | token.assign(m_pos, tokenEnd); |
dab58492 | 268 | |
4626c57c VZ |
269 | // skip the token and the trailing delimiter |
270 | m_pos = pos + 1; | |
bbf8fc53 | 271 | |
9a83f860 | 272 | m_lastDelim = (pos == m_stringEnd) ? wxT('\0') : (wxChar)*pos; |
7c968cee | 273 | } |
85833f5c | 274 | } |
4626c57c | 275 | while ( !AllowEmpty() && token.empty() ); |
bbf8fc53 VZ |
276 | |
277 | return token; | |
f4ada568 | 278 | } |
1e6feb95 VZ |
279 | |
280 | // ---------------------------------------------------------------------------- | |
281 | // public functions | |
282 | // ---------------------------------------------------------------------------- | |
283 | ||
284 | wxArrayString wxStringTokenize(const wxString& str, | |
285 | const wxString& delims, | |
286 | wxStringTokenizerMode mode) | |
287 | { | |
288 | wxArrayString tokens; | |
289 | wxStringTokenizer tk(str, delims, mode); | |
290 | while ( tk.HasMoreTokens() ) | |
291 | { | |
292 | tokens.Add(tk.GetNextToken()); | |
293 | } | |
294 | ||
295 | return tokens; | |
296 | } |