Change the name of a symbol to make it more descriptive
[wxWidgets.git] / src / common / regex.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/regex.cpp
3 // Purpose: regular expression matching
4 // Author: Karsten Ballüder and Vadim Zeitlin
5 // Modified by:
6 // Created: 13.07.01
7 // RCS-ID: $Id$
8 // Copyright: (c) 2000 Karsten Ballüder <ballueder@gmx.net>
9 // 2001 Vadim Zeitlin <vadim@wxwindows.org>
10 // Licence: wxWindows licence
11 ///////////////////////////////////////////////////////////////////////////////
12
13 // ============================================================================
14 // declarations
15 // ============================================================================
16
17 // ----------------------------------------------------------------------------
18 // headers
19 // ----------------------------------------------------------------------------
20
21 // For compilers that support precompilation, includes "wx.h".
22 #include "wx/wxprec.h"
23
24 #ifdef __BORLANDC__
25 #pragma hdrstop
26 #endif
27
28 #if wxUSE_REGEX
29
30 #ifndef WX_PRECOMP
31 #include "wx/object.h"
32 #include "wx/string.h"
33 #include "wx/log.h"
34 #include "wx/intl.h"
35 #endif //WX_PRECOMP
36
37 // FreeBSD, Watcom and DMars require this, CW doesn't have nor need it.
38 // Others also don't seem to need it. If you have an error related to
39 // (not) including <sys/types.h> please report details to
40 // wx-dev@lists.wxwindows.org
41 #if defined(__UNIX__) || defined(__WATCOMC__) || defined(__DIGITALMARS__)
42 # include <sys/types.h>
43 #endif
44
45 #include <regex.h>
46 #include "wx/regex.h"
47
48 // defined when the regex lib uses 'char' but 'wxChar' is wide
49 #if wxUSE_UNICODE && !defined(__REG_NOFRONT)
50 # define WXREGEX_CONVERT_TO_MB
51 #endif
52
53 // ----------------------------------------------------------------------------
54 // private classes
55 // ----------------------------------------------------------------------------
56
57 // the character type used by the regular expression engine
58 #ifndef WXREGEX_CONVERT_TO_MB
59 typedef wxChar wxRegChar;
60 #else
61 typedef char wxRegChar;
62 #endif
63
64 // the real implementation of wxRegEx
65 class wxRegExImpl
66 {
67 public:
68 // ctor and dtor
69 wxRegExImpl();
70 ~wxRegExImpl();
71
72 // return true if Compile() had been called successfully
73 bool IsValid() const { return m_isCompiled; }
74
75 // RE operations
76 bool Compile(const wxString& expr, int flags = 0);
77 bool Matches(const wxRegChar *str, int flags, size_t len) const;
78 bool GetMatch(size_t *start, size_t *len, size_t index = 0) const;
79 size_t GetMatchCount() const;
80 int Replace(wxString *pattern, const wxString& replacement,
81 size_t maxMatches = 0) const;
82
83 private:
84 // return the string containing the error message for the given err code
85 wxString GetErrorMsg(int errorcode, bool badconv) const;
86
87 // init the members
88 void Init()
89 {
90 m_isCompiled = false;
91 m_Matches = NULL;
92 m_nMatches = 0;
93 }
94
95 // free the RE if compiled
96 void Free()
97 {
98 if ( IsValid() )
99 {
100 regfree(&m_RegEx);
101 }
102
103 delete [] m_Matches;
104 }
105
106 // free the RE if any and reinit the members
107 void Reinit()
108 {
109 Free();
110 Init();
111 }
112
113
114 // compiled RE
115 regex_t m_RegEx;
116
117 // the subexpressions data
118 regmatch_t *m_Matches;
119 size_t m_nMatches;
120
121 // true if m_RegEx is valid
122 bool m_isCompiled;
123 };
124
125 // ============================================================================
126 // implementation
127 // ============================================================================
128
129 // ----------------------------------------------------------------------------
130 // wxRegExImpl
131 // ----------------------------------------------------------------------------
132
133 wxRegExImpl::wxRegExImpl()
134 {
135 Init();
136 }
137
138 wxRegExImpl::~wxRegExImpl()
139 {
140 Free();
141 }
142
143 wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
144 {
145 #ifdef WXREGEX_CONVERT_TO_MB
146 // currently only needed when using system library in Unicode mode
147 if ( badconv )
148 {
149 return _("conversion to 8-bit encoding failed");
150 }
151 #else
152 // 'use' badconv to avoid a compiler warning
153 (void)badconv;
154 #endif
155
156 wxString szError;
157
158 // first get the string length needed
159 int len = regerror(errorcode, &m_RegEx, NULL, 0);
160 if ( len > 0 )
161 {
162 char* szcmbError = new char[++len];
163
164 (void)regerror(errorcode, &m_RegEx, szcmbError, len);
165
166 szError = wxConvertMB2WX(szcmbError);
167 delete [] szcmbError;
168 }
169 else // regerror() returned 0
170 {
171 szError = _("unknown error");
172 }
173
174 return szError;
175 }
176
177 bool wxRegExImpl::Compile(const wxString& expr, int flags)
178 {
179 Reinit();
180
181 #ifdef WX_NO_REGEX_ADVANCED
182 # define FLAVORS wxRE_BASIC
183 #else
184 # define FLAVORS (wxRE_ADVANCED | wxRE_BASIC)
185 wxASSERT_MSG( (flags & FLAVORS) != FLAVORS,
186 _T("incompatible flags in wxRegEx::Compile") );
187 #endif
188 wxASSERT_MSG( !(flags & ~(FLAVORS | wxRE_ICASE | wxRE_NOSUB | wxRE_NEWLINE)),
189 _T("unrecognized flags in wxRegEx::Compile") );
190
191 // translate our flags to regcomp() ones
192 int flagsRE = 0;
193 if ( !(flags & wxRE_BASIC) )
194 #ifndef WX_NO_REGEX_ADVANCED
195 if (flags & wxRE_ADVANCED)
196 flagsRE |= REG_ADVANCED;
197 else
198 #endif
199 flagsRE |= REG_EXTENDED;
200 if ( flags & wxRE_ICASE )
201 flagsRE |= REG_ICASE;
202 if ( flags & wxRE_NOSUB )
203 flagsRE |= REG_NOSUB;
204 if ( flags & wxRE_NEWLINE )
205 flagsRE |= REG_NEWLINE;
206
207 // compile it
208 #ifdef __REG_NOFRONT
209 bool conv = true;
210 int errorcode = wx_re_comp(&m_RegEx, expr, expr.length(), flagsRE);
211 #else
212 const wxWX2MBbuf conv = expr.mbc_str();
213 int errorcode = conv ? regcomp(&m_RegEx, conv, flagsRE) : REG_BADPAT;
214 #endif
215
216 if ( errorcode )
217 {
218 wxLogError(_("Invalid regular expression '%s': %s"),
219 expr.c_str(), GetErrorMsg(errorcode, !conv).c_str());
220
221 m_isCompiled = false;
222 }
223 else // ok
224 {
225 // don't allocate the matches array now, but do it later if necessary
226 if ( flags & wxRE_NOSUB )
227 {
228 // we don't need it at all
229 m_nMatches = 0;
230 }
231 else
232 {
233 // we will alloc the array later (only if really needed) but count
234 // the number of sub-expressions in the regex right now
235
236 // there is always one for the whole expression
237 m_nMatches = 1;
238
239 // and some more for bracketed subexperessions
240 for ( const wxChar *cptr = expr.c_str(); *cptr; cptr++ )
241 {
242 if ( *cptr == _T('\\') )
243 {
244 // in basic RE syntax groups are inside \(...\)
245 if ( *++cptr == _T('(') && (flags & wxRE_BASIC) )
246 {
247 m_nMatches++;
248 }
249 }
250 else if ( *cptr == _T('(') && !(flags & wxRE_BASIC) )
251 {
252 // we know that the previous character is not an unquoted
253 // backslash because it would have been eaten above, so we
254 // have a bare '(' and this indicates a group start for the
255 // extended syntax. '(?' is used for extensions by perl-
256 // like REs (e.g. advanced), and is not valid for POSIX
257 // extended, so ignore them always.
258 if ( cptr[1] != _T('?') )
259 m_nMatches++;
260 }
261 }
262 }
263
264 m_isCompiled = true;
265 }
266
267 return IsValid();
268 }
269
270 bool wxRegExImpl::Matches(const wxRegChar *str, int flags, size_t len) const
271 {
272 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
273
274 // translate our flags to regexec() ones
275 wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL)),
276 _T("unrecognized flags in wxRegEx::Matches") );
277
278 int flagsRE = 0;
279 if ( flags & wxRE_NOTBOL )
280 flagsRE |= REG_NOTBOL;
281 if ( flags & wxRE_NOTEOL )
282 flagsRE |= REG_NOTEOL;
283
284 // allocate matches array if needed
285 wxRegExImpl *self = wxConstCast(this, wxRegExImpl);
286 if ( !m_Matches && m_nMatches )
287 {
288 self->m_Matches = new regmatch_t[m_nMatches];
289 }
290
291 // do match it
292 #ifdef __REG_NOFRONT
293 int rc = wx_re_exec(&self->m_RegEx, str, len, NULL, m_nMatches, m_Matches, flagsRE);
294 #else
295 int rc = str ? regexec(&self->m_RegEx, str, m_nMatches, m_Matches, flagsRE) : REG_BADPAT;
296 #endif
297
298 switch ( rc )
299 {
300 case 0:
301 // matched successfully
302 return true;
303
304 default:
305 // an error occurred
306 wxLogError(_("Failed to find match for regular expression: %s"),
307 GetErrorMsg(rc, !str).c_str());
308 // fall through
309
310 case REG_NOMATCH:
311 // no match
312 return false;
313 }
314 }
315
316 bool wxRegExImpl::GetMatch(size_t *start, size_t *len, size_t index) const
317 {
318 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
319 wxCHECK_MSG( m_nMatches, false, _T("can't use with wxRE_NOSUB") );
320 wxCHECK_MSG( m_Matches, false, _T("must call Matches() first") );
321 wxCHECK_MSG( index < m_nMatches, false, _T("invalid match index") );
322
323 const regmatch_t& match = m_Matches[index];
324
325 // we need the casts because rm_so can be a 64 bit quantity
326 if ( start )
327 *start = wx_truncate_cast(size_t, match.rm_so);
328 if ( len )
329 *len = wx_truncate_cast(size_t, match.rm_eo - match.rm_so);
330
331 return true;
332 }
333
334 size_t wxRegExImpl::GetMatchCount() const
335 {
336 wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") );
337 wxCHECK_MSG( m_nMatches, 0, _T("can't use with wxRE_NOSUB") );
338
339 return m_nMatches;
340 }
341
342 int wxRegExImpl::Replace(wxString *text,
343 const wxString& replacement,
344 size_t maxMatches) const
345 {
346 wxCHECK_MSG( text, wxNOT_FOUND, _T("NULL text in wxRegEx::Replace") );
347 wxCHECK_MSG( IsValid(), wxNOT_FOUND, _T("must successfully Compile() first") );
348
349 // the input string
350 #ifndef WXREGEX_CONVERT_TO_MB
351 const wxChar *textstr = text->c_str();
352 size_t textlen = text->length();
353 #else
354 const wxWX2MBbuf textstr = wxConvertWX2MB(*text);
355 if (!textstr)
356 {
357 wxLogError(_("Failed to find match for regular expression: %s"),
358 GetErrorMsg(0, true).c_str());
359 return 0;
360 }
361 size_t textlen = strlen(textstr);
362 text->clear();
363 #endif
364
365 // the replacement text
366 wxString textNew;
367
368 // the result, allow 25% extra
369 wxString result;
370 result.reserve(5 * textlen / 4);
371
372 // attempt at optimization: don't iterate over the string if it doesn't
373 // contain back references at all
374 bool mayHaveBackrefs =
375 replacement.find_first_of(_T("\\&")) != wxString::npos;
376
377 if ( !mayHaveBackrefs )
378 {
379 textNew = replacement;
380 }
381
382 // the position where we start looking for the match
383 size_t matchStart = 0;
384
385 // number of replacement made: we won't make more than maxMatches of them
386 // (unless maxMatches is 0 which doesn't limit the number of replacements)
387 size_t countRepl = 0;
388
389 // note that "^" shouldn't match after the first call to Matches() so we
390 // use wxRE_NOTBOL to prevent it from happening
391 while ( (!maxMatches || countRepl < maxMatches) &&
392 Matches(textstr + matchStart,
393 countRepl ? wxRE_NOTBOL : 0,
394 textlen - matchStart) )
395 {
396 // the string possibly contains back references: we need to calculate
397 // the replacement text anew after each match
398 if ( mayHaveBackrefs )
399 {
400 mayHaveBackrefs = false;
401 textNew.clear();
402 textNew.reserve(replacement.length());
403
404 for ( const wxChar *p = replacement.c_str(); *p; p++ )
405 {
406 size_t index = (size_t)-1;
407
408 if ( *p == _T('\\') )
409 {
410 if ( wxIsdigit(*++p) )
411 {
412 // back reference
413 wxChar *end;
414 index = (size_t)wxStrtoul(p, &end, 10);
415 p = end - 1; // -1 to compensate for p++ in the loop
416 }
417 //else: backslash used as escape character
418 }
419 else if ( *p == _T('&') )
420 {
421 // treat this as "\0" for compatbility with ed and such
422 index = 0;
423 }
424
425 // do we have a back reference?
426 if ( index != (size_t)-1 )
427 {
428 // yes, get its text
429 size_t start, len;
430 if ( !GetMatch(&start, &len, index) )
431 {
432 wxFAIL_MSG( _T("invalid back reference") );
433
434 // just eat it...
435 }
436 else
437 {
438 textNew += wxString(textstr + matchStart + start,
439 *wxConvCurrent, len);
440
441 mayHaveBackrefs = true;
442 }
443 }
444 else // ordinary character
445 {
446 textNew += *p;
447 }
448 }
449 }
450
451 size_t start, len;
452 if ( !GetMatch(&start, &len) )
453 {
454 // we did have match as Matches() returned true above!
455 wxFAIL_MSG( _T("internal logic error in wxRegEx::Replace") );
456
457 return wxNOT_FOUND;
458 }
459
460 // an insurance against implementations that don't grow exponentially
461 // to ensure building the result takes linear time
462 if (result.capacity() < result.length() + start + textNew.length())
463 result.reserve(2 * result.length());
464
465 #ifndef WXREGEX_CONVERT_TO_MB
466 result.append(*text, matchStart, start);
467 #else
468 result.append(wxString(textstr + matchStart, *wxConvCurrent, start));
469 #endif
470 matchStart += start;
471 result.append(textNew);
472
473 countRepl++;
474
475 matchStart += len;
476 }
477
478 #ifndef WXREGEX_CONVERT_TO_MB
479 result.append(*text, matchStart, wxString::npos);
480 #else
481 result.append(wxString(textstr + matchStart, *wxConvCurrent));
482 #endif
483 *text = result;
484
485 return countRepl;
486 }
487
488 // ----------------------------------------------------------------------------
489 // wxRegEx: all methods are mostly forwarded to wxRegExImpl
490 // ----------------------------------------------------------------------------
491
492 void wxRegEx::Init()
493 {
494 m_impl = NULL;
495 }
496
497 wxRegEx::~wxRegEx()
498 {
499 delete m_impl;
500 }
501
502 bool wxRegEx::Compile(const wxString& expr, int flags)
503 {
504 if ( !m_impl )
505 {
506 m_impl = new wxRegExImpl;
507 }
508
509 if ( !m_impl->Compile(expr, flags) )
510 {
511 // error message already given in wxRegExImpl::Compile
512 delete m_impl;
513 m_impl = NULL;
514
515 return false;
516 }
517
518 return true;
519 }
520
521 bool wxRegEx::Matches(const wxChar *str, int flags) const
522 {
523 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
524
525 #ifndef WXREGEX_CONVERT_TO_MB
526 return m_impl->Matches(str, flags, wxStrlen(str));
527 #else
528 return m_impl->Matches(wxConvertWX2MB(str), flags, wxStrlen(str));
529 #endif
530 }
531
532 bool wxRegEx::GetMatch(size_t *start, size_t *len, size_t index) const
533 {
534 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
535
536 return m_impl->GetMatch(start, len, index);
537 }
538
539 wxString wxRegEx::GetMatch(const wxString& text, size_t index) const
540 {
541 size_t start, len;
542 if ( !GetMatch(&start, &len, index) )
543 return wxEmptyString;
544
545 return text.Mid(start, len);
546 }
547
548 size_t wxRegEx::GetMatchCount() const
549 {
550 wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") );
551
552 return m_impl->GetMatchCount();
553 }
554
555 int wxRegEx::Replace(wxString *pattern,
556 const wxString& replacement,
557 size_t maxMatches) const
558 {
559 wxCHECK_MSG( IsValid(), wxNOT_FOUND, _T("must successfully Compile() first") );
560
561 return m_impl->Replace(pattern, replacement, maxMatches);
562 }
563
564 #endif // wxUSE_REGEX