]> git.saurik.com Git - wxWidgets.git/blob - src/common/regex.cpp
Added wxTreebook:
[wxWidgets.git] / src / common / regex.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/regex.cpp
3 // Purpose: regular expression matching
4 // Author: Karsten Ballüder and Vadim Zeitlin
5 // Modified by:
6 // Created: 13.07.01
7 // RCS-ID: $Id$
8 // Copyright: (c) 2000 Karsten Ballüder <ballueder@gmx.net>
9 // 2001 Vadim Zeitlin <vadim@wxwindows.org>
10 // Licence: wxWindows licence
11 ///////////////////////////////////////////////////////////////////////////////
12
13 // ============================================================================
14 // declarations
15 // ============================================================================
16
17 // ----------------------------------------------------------------------------
18 // headers
19 // ----------------------------------------------------------------------------
20
21 // For compilers that support precompilation, includes "wx.h".
22 #include "wx/wxprec.h"
23
24 #ifdef __BORLANDC__
25 #pragma hdrstop
26 #endif
27
28 #if wxUSE_REGEX
29
30 #ifndef WX_PRECOMP
31 #include "wx/object.h"
32 #include "wx/string.h"
33 #include "wx/log.h"
34 #include "wx/intl.h"
35 #endif //WX_PRECOMP
36
37 // FreeBSD, Watcom and DMars require this, CW doesn't have nor need it.
38 // Others also don't seem to need it. If you have an error related to
39 // (not) including <sys/types.h> please report details to
40 // wx-dev@lists.wxwindows.org
41 #if defined(__UNIX__) || defined(__WATCOMC__) || defined(__DIGITALMARS__)
42 # include <sys/types.h>
43 #endif
44
45 #include <regex.h>
46 #include "wx/regex.h"
47
48 // ----------------------------------------------------------------------------
49 // private classes
50 // ----------------------------------------------------------------------------
51
52 // the real implementation of wxRegEx
53 class wxRegExImpl
54 {
55 public:
56 // ctor and dtor
57 wxRegExImpl();
58 ~wxRegExImpl();
59
60 // return true if Compile() had been called successfully
61 bool IsValid() const { return m_isCompiled; }
62
63 // RE operations
64 bool Compile(const wxString& expr, int flags = 0);
65 bool Matches(const wxChar *str, int flags = 0) const;
66 bool GetMatch(size_t *start, size_t *len, size_t index = 0) const;
67 size_t GetMatchCount() const;
68 int Replace(wxString *pattern, const wxString& replacement,
69 size_t maxMatches = 0) const;
70
71 private:
72 // return the string containing the error message for the given err code
73 wxString GetErrorMsg(int errorcode, bool badconv) const;
74
75 // init the members
76 void Init()
77 {
78 m_isCompiled = false;
79 m_Matches = NULL;
80 m_nMatches = 0;
81 }
82
83 // free the RE if compiled
84 void Free()
85 {
86 if ( IsValid() )
87 {
88 regfree(&m_RegEx);
89 }
90
91 delete [] m_Matches;
92 }
93
94 // free the RE if any and reinit the members
95 void Reinit()
96 {
97 Free();
98 Init();
99 }
100
101
102 // compiled RE
103 regex_t m_RegEx;
104
105 // the subexpressions data
106 regmatch_t *m_Matches;
107 size_t m_nMatches;
108
109 // true if m_RegEx is valid
110 bool m_isCompiled;
111 };
112
113 // ============================================================================
114 // implementation
115 // ============================================================================
116
117 // ----------------------------------------------------------------------------
118 // wxRegExImpl
119 // ----------------------------------------------------------------------------
120
121 wxRegExImpl::wxRegExImpl()
122 {
123 Init();
124 }
125
126 wxRegExImpl::~wxRegExImpl()
127 {
128 Free();
129 }
130
131 wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
132 {
133 #if wxUSE_UNICODE && !defined(__REG_NOFRONT)
134 // currently only needed when using system library in Unicode mode
135 if ( badconv )
136 {
137 return _("conversion to 8-bit encoding failed");
138 }
139 #else
140 // 'use' badconv to avoid a compiler warning
141 (void)badconv;
142 #endif
143
144 wxString szError;
145
146 // first get the string length needed
147 int len = regerror(errorcode, &m_RegEx, NULL, 0);
148 if ( len > 0 )
149 {
150 char* szcmbError = new char[++len];
151
152 (void)regerror(errorcode, &m_RegEx, szcmbError, len);
153
154 szError = wxConvertMB2WX(szcmbError);
155 delete [] szcmbError;
156 }
157 else // regerror() returned 0
158 {
159 szError = _("unknown error");
160 }
161
162 return szError;
163 }
164
165 bool wxRegExImpl::Compile(const wxString& expr, int flags)
166 {
167 Reinit();
168
169 #ifdef WX_NO_REGEX_ADVANCED
170 # define FLAVORS wxRE_BASIC
171 #else
172 # define FLAVORS (wxRE_ADVANCED | wxRE_BASIC)
173 wxASSERT_MSG( (flags & FLAVORS) != FLAVORS,
174 _T("incompatible flags in wxRegEx::Compile") );
175 #endif
176 wxASSERT_MSG( !(flags & ~(FLAVORS | wxRE_ICASE | wxRE_NOSUB | wxRE_NEWLINE)),
177 _T("unrecognized flags in wxRegEx::Compile") );
178
179 // translate our flags to regcomp() ones
180 int flagsRE = 0;
181 if ( !(flags & wxRE_BASIC) )
182 #ifndef WX_NO_REGEX_ADVANCED
183 if (flags & wxRE_ADVANCED)
184 flagsRE |= REG_ADVANCED;
185 else
186 #endif
187 flagsRE |= REG_EXTENDED;
188 if ( flags & wxRE_ICASE )
189 flagsRE |= REG_ICASE;
190 if ( flags & wxRE_NOSUB )
191 flagsRE |= REG_NOSUB;
192 if ( flags & wxRE_NEWLINE )
193 flagsRE |= REG_NEWLINE;
194
195 // compile it
196 #ifdef __REG_NOFRONT
197 bool conv = true;
198 int errorcode = wx_re_comp(&m_RegEx, expr, expr.length(), flagsRE);
199 #else
200 const wxWX2MBbuf conv = expr.mbc_str();
201 int errorcode = conv ? regcomp(&m_RegEx, conv, flagsRE) : REG_BADPAT;
202 #endif
203
204 if ( errorcode )
205 {
206 wxLogError(_("Invalid regular expression '%s': %s"),
207 expr.c_str(), GetErrorMsg(errorcode, !conv).c_str());
208
209 m_isCompiled = false;
210 }
211 else // ok
212 {
213 // don't allocate the matches array now, but do it later if necessary
214 if ( flags & wxRE_NOSUB )
215 {
216 // we don't need it at all
217 m_nMatches = 0;
218 }
219 else
220 {
221 // we will alloc the array later (only if really needed) but count
222 // the number of sub-expressions in the regex right now
223
224 // there is always one for the whole expression
225 m_nMatches = 1;
226
227 // and some more for bracketed subexperessions
228 for ( const wxChar *cptr = expr.c_str(); *cptr; cptr++ )
229 {
230 if ( *cptr == _T('\\') )
231 {
232 // in basic RE syntax groups are inside \(...\)
233 if ( *++cptr == _T('(') && (flags & wxRE_BASIC) )
234 {
235 m_nMatches++;
236 }
237 }
238 else if ( *cptr == _T('(') && !(flags & wxRE_BASIC) )
239 {
240 // we know that the previous character is not an unquoted
241 // backslash because it would have been eaten above, so we
242 // have a bare '(' and this indicates a group start for the
243 // extended syntax. '(?' is used for extensions by perl-
244 // like REs (e.g. advanced), and is not valid for POSIX
245 // extended, so ignore them always.
246 if ( cptr[1] != _T('?') )
247 m_nMatches++;
248 }
249 }
250 }
251
252 m_isCompiled = true;
253 }
254
255 return IsValid();
256 }
257
258 bool wxRegExImpl::Matches(const wxChar *str, int flags) const
259 {
260 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
261
262 // translate our flags to regexec() ones
263 wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL)),
264 _T("unrecognized flags in wxRegEx::Matches") );
265
266 int flagsRE = 0;
267 if ( flags & wxRE_NOTBOL )
268 flagsRE |= REG_NOTBOL;
269 if ( flags & wxRE_NOTEOL )
270 flagsRE |= REG_NOTEOL;
271
272 // allocate matches array if needed
273 wxRegExImpl *self = wxConstCast(this, wxRegExImpl);
274 if ( !m_Matches && m_nMatches )
275 {
276 self->m_Matches = new regmatch_t[m_nMatches];
277 }
278
279 // do match it
280 #ifdef __REG_NOFRONT
281 bool conv = true;
282 int rc = wx_re_exec(&self->m_RegEx, str, wxStrlen(str), NULL, m_nMatches, m_Matches, flagsRE);
283 #else
284 const wxWX2MBbuf conv = wxConvertWX2MB(str);
285 int rc = conv ? regexec(&self->m_RegEx, conv, m_nMatches, m_Matches, flagsRE) : REG_BADPAT;
286 #endif
287
288 switch ( rc )
289 {
290 case 0:
291 // matched successfully
292 return true;
293
294 default:
295 // an error occurred
296 wxLogError(_("Failed to match '%s' in regular expression: %s"),
297 str, GetErrorMsg(rc, !conv).c_str());
298 // fall through
299
300 case REG_NOMATCH:
301 // no match
302 return false;
303 }
304 }
305
306 bool wxRegExImpl::GetMatch(size_t *start, size_t *len, size_t index) const
307 {
308 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
309 wxCHECK_MSG( m_nMatches, false, _T("can't use with wxRE_NOSUB") );
310 wxCHECK_MSG( m_Matches, false, _T("must call Matches() first") );
311 wxCHECK_MSG( index < m_nMatches, false, _T("invalid match index") );
312
313 const regmatch_t& match = m_Matches[index];
314
315 // we need the casts because rm_so can be a 64 bit quantity
316 if ( start )
317 *start = wx_truncate_cast(size_t, match.rm_so);
318 if ( len )
319 *len = wx_truncate_cast(size_t, match.rm_eo - match.rm_so);
320
321 return true;
322 }
323
324 size_t wxRegExImpl::GetMatchCount() const
325 {
326 wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") );
327 wxCHECK_MSG( m_nMatches, 0, _T("can't use with wxRE_NOSUB") );
328
329 return m_nMatches;
330 }
331
332 int wxRegExImpl::Replace(wxString *text,
333 const wxString& replacement,
334 size_t maxMatches) const
335 {
336 wxCHECK_MSG( text, wxNOT_FOUND, _T("NULL text in wxRegEx::Replace") );
337 wxCHECK_MSG( IsValid(), wxNOT_FOUND, _T("must successfully Compile() first") );
338
339 // the replacement text
340 wxString textNew;
341
342 // attempt at optimization: don't iterate over the string if it doesn't
343 // contain back references at all
344 bool mayHaveBackrefs =
345 replacement.find_first_of(_T("\\&")) != wxString::npos;
346
347 if ( !mayHaveBackrefs )
348 {
349 textNew = replacement;
350 }
351
352 // the position where we start looking for the match
353 //
354 // NB: initial version had a nasty bug because it used a wxChar* instead of
355 // an index but the problem is that replace() in the loop invalidates
356 // all pointers into the string so we have to use indices instead
357 size_t matchStart = 0;
358
359 // number of replacement made: we won't make more than maxMatches of them
360 // (unless maxMatches is 0 which doesn't limit the number of replacements)
361 size_t countRepl = 0;
362
363 // note that "^" shouldn't match after the first call to Matches() so we
364 // use wxRE_NOTBOL to prevent it from happening
365 while ( (!maxMatches || countRepl < maxMatches) &&
366 Matches(text->c_str() + matchStart, countRepl ? wxRE_NOTBOL : 0) )
367 {
368 // the string possibly contains back references: we need to calculate
369 // the replacement text anew after each match
370 if ( mayHaveBackrefs )
371 {
372 mayHaveBackrefs = false;
373 textNew.clear();
374 textNew.reserve(replacement.length());
375
376 for ( const wxChar *p = replacement.c_str(); *p; p++ )
377 {
378 size_t index = (size_t)-1;
379
380 if ( *p == _T('\\') )
381 {
382 if ( wxIsdigit(*++p) )
383 {
384 // back reference
385 wxChar *end;
386 index = (size_t)wxStrtoul(p, &end, 10);
387 p = end - 1; // -1 to compensate for p++ in the loop
388 }
389 //else: backslash used as escape character
390 }
391 else if ( *p == _T('&') )
392 {
393 // treat this as "\0" for compatbility with ed and such
394 index = 0;
395 }
396
397 // do we have a back reference?
398 if ( index != (size_t)-1 )
399 {
400 // yes, get its text
401 size_t start, len;
402 if ( !GetMatch(&start, &len, index) )
403 {
404 wxFAIL_MSG( _T("invalid back reference") );
405
406 // just eat it...
407 }
408 else
409 {
410 textNew += wxString(text->c_str() + matchStart + start,
411 len);
412
413 mayHaveBackrefs = true;
414 }
415 }
416 else // ordinary character
417 {
418 textNew += *p;
419 }
420 }
421 }
422
423 size_t start, len;
424 if ( !GetMatch(&start, &len) )
425 {
426 // we did have match as Matches() returned true above!
427 wxFAIL_MSG( _T("internal logic error in wxRegEx::Replace") );
428
429 return wxNOT_FOUND;
430 }
431
432 matchStart += start;
433 text->replace(matchStart, len, textNew);
434
435 countRepl++;
436
437 matchStart += textNew.length();
438 }
439
440 return countRepl;
441 }
442
443 // ----------------------------------------------------------------------------
444 // wxRegEx: all methods are mostly forwarded to wxRegExImpl
445 // ----------------------------------------------------------------------------
446
447 void wxRegEx::Init()
448 {
449 m_impl = NULL;
450 }
451
452
453 wxRegEx::~wxRegEx()
454 {
455 delete m_impl;
456 }
457
458 bool wxRegEx::Compile(const wxString& expr, int flags)
459 {
460 if ( !m_impl )
461 {
462 m_impl = new wxRegExImpl;
463 }
464
465 if ( !m_impl->Compile(expr, flags) )
466 {
467 // error message already given in wxRegExImpl::Compile
468 delete m_impl;
469 m_impl = NULL;
470
471 return false;
472 }
473
474 return true;
475 }
476
477 bool wxRegEx::Matches(const wxChar *str, int flags) const
478 {
479 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
480
481 return m_impl->Matches(str, flags);
482 }
483
484 bool wxRegEx::GetMatch(size_t *start, size_t *len, size_t index) const
485 {
486 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
487
488 return m_impl->GetMatch(start, len, index);
489 }
490
491 wxString wxRegEx::GetMatch(const wxString& text, size_t index) const
492 {
493 size_t start, len;
494 if ( !GetMatch(&start, &len, index) )
495 return wxEmptyString;
496
497 return text.Mid(start, len);
498 }
499
500 size_t wxRegEx::GetMatchCount() const
501 {
502 wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") );
503
504 return m_impl->GetMatchCount();
505 }
506
507 int wxRegEx::Replace(wxString *pattern,
508 const wxString& replacement,
509 size_t maxMatches) const
510 {
511 wxCHECK_MSG( IsValid(), wxNOT_FOUND, _T("must successfully Compile() first") );
512
513 return m_impl->Replace(pattern, replacement, maxMatches);
514 }
515
516 #endif // wxUSE_REGEX