]> git.saurik.com Git - wxWidgets.git/blob - src/common/regex.cpp
fixed layout of tables that are first element on a page
[wxWidgets.git] / src / common / regex.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/regex.cpp
3 // Purpose: regular expression matching
4 // Author: Karsten Ballüder and Vadim Zeitlin
5 // Modified by:
6 // Created: 13.07.01
7 // RCS-ID: $Id$
8 // Copyright: (c) 2000 Karsten Ballüder <ballueder@gmx.net>
9 // 2001 Vadim Zeitlin <vadim@wxwindows.org>
10 // Licence: wxWindows licence
11 ///////////////////////////////////////////////////////////////////////////////
12
13 // ============================================================================
14 // declarations
15 // ============================================================================
16
17 // ----------------------------------------------------------------------------
18 // headers
19 // ----------------------------------------------------------------------------
20
21 // For compilers that support precompilation, includes "wx.h".
22 #include "wx/wxprec.h"
23
24 #ifdef __BORLANDC__
25 #pragma hdrstop
26 #endif
27
28 #if wxUSE_REGEX
29
30 #ifndef WX_PRECOMP
31 #include "wx/object.h"
32 #include "wx/string.h"
33 #include "wx/log.h"
34 #include "wx/intl.h"
35 #endif //WX_PRECOMP
36
37 // FreeBSD, Watcom and DMars require this, CW doesn't have nor need it.
38 // Others also don't seem to need it. If you have an error related to
39 // (not) including <sys/types.h> please report details to
40 // wx-dev@lists.wxwindows.org
41 #if defined(__UNIX__) || defined(__WATCOMC__) || defined(__DIGITALMARS__)
42 # include <sys/types.h>
43 #endif
44
45 #include <regex.h>
46 #include "wx/regex.h"
47
// Configuration macros derived from which regex library <regex.h> provides
// (__REG_NOFRONT is taken as the marker of the built-in library):
//
//  WXREGEX_USING_BUILTIN    defined only when the built-in library is used
//  WXREGEX_BUILTIN_ONLY(x)  expands to ",x" with the built-in library and to
//                           nothing otherwise; used for the extra trailing
//                           argument only the built-in library takes
//  WXREGEX_CONVERT_TO_MB    defined when the library works on chars while
//                           wxChar is wide, so strings must be converted
#ifndef __REG_NOFRONT
#   define WXREGEX_BUILTIN_ONLY(x)
#   if wxUSE_UNICODE
#       define WXREGEX_CONVERT_TO_MB
#   endif
#else
#   define WXREGEX_USING_BUILTIN
#   define WXREGEX_BUILTIN_ONLY(x) ,x
#endif
61
62 // ----------------------------------------------------------------------------
63 // private classes
64 // ----------------------------------------------------------------------------
65
66 // the character type used by the regular expression engine
67 #ifndef WXREGEX_CONVERT_TO_MB
68 typedef wxChar wxRegChar;
69 #else
70 typedef char wxRegChar;
71 #endif
72
73 // the real implementation of wxRegEx
74 class wxRegExImpl
75 {
76 public:
77 // ctor and dtor
78 wxRegExImpl();
79 ~wxRegExImpl();
80
81 // return true if Compile() had been called successfully
82 bool IsValid() const { return m_isCompiled; }
83
84 // RE operations
85 bool Compile(const wxString& expr, int flags = 0);
86 bool Matches(const wxRegChar *str, int flags
87 WXREGEX_BUILTIN_ONLY(size_t len)) const;
88 bool GetMatch(size_t *start, size_t *len, size_t index = 0) const;
89 size_t GetMatchCount() const;
90 int Replace(wxString *pattern, const wxString& replacement,
91 size_t maxMatches = 0) const;
92
93 private:
94 // return the string containing the error message for the given err code
95 wxString GetErrorMsg(int errorcode, bool badconv) const;
96
97 // init the members
98 void Init()
99 {
100 m_isCompiled = false;
101 m_Matches = NULL;
102 m_nMatches = 0;
103 }
104
105 // free the RE if compiled
106 void Free()
107 {
108 if ( IsValid() )
109 {
110 regfree(&m_RegEx);
111 }
112
113 delete [] m_Matches;
114 }
115
116 // free the RE if any and reinit the members
117 void Reinit()
118 {
119 Free();
120 Init();
121 }
122
123
124 // compiled RE
125 regex_t m_RegEx;
126
127 // the subexpressions data
128 regmatch_t *m_Matches;
129 size_t m_nMatches;
130
131 // true if m_RegEx is valid
132 bool m_isCompiled;
133 };
134
135 // ============================================================================
136 // implementation
137 // ============================================================================
138
139 // ----------------------------------------------------------------------------
140 // wxRegExImpl
141 // ----------------------------------------------------------------------------
142
143 wxRegExImpl::wxRegExImpl()
144 {
145 Init();
146 }
147
148 wxRegExImpl::~wxRegExImpl()
149 {
150 Free();
151 }
152
153 wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
154 {
155 #ifdef WXREGEX_CONVERT_TO_MB
156 // currently only needed when using system library in Unicode mode
157 if ( badconv )
158 {
159 return _("conversion to 8-bit encoding failed");
160 }
161 #else
162 // 'use' badconv to avoid a compiler warning
163 (void)badconv;
164 #endif
165
166 wxString szError;
167
168 // first get the string length needed
169 int len = regerror(errorcode, &m_RegEx, NULL, 0);
170 if ( len > 0 )
171 {
172 char* szcmbError = new char[++len];
173
174 (void)regerror(errorcode, &m_RegEx, szcmbError, len);
175
176 szError = wxConvertMB2WX(szcmbError);
177 delete [] szcmbError;
178 }
179 else // regerror() returned 0
180 {
181 szError = _("unknown error");
182 }
183
184 return szError;
185 }
186
187 bool wxRegExImpl::Compile(const wxString& expr, int flags)
188 {
189 Reinit();
190
191 #ifdef WX_NO_REGEX_ADVANCED
192 # define FLAVORS wxRE_BASIC
193 #else
194 # define FLAVORS (wxRE_ADVANCED | wxRE_BASIC)
195 wxASSERT_MSG( (flags & FLAVORS) != FLAVORS,
196 _T("incompatible flags in wxRegEx::Compile") );
197 #endif
198 wxASSERT_MSG( !(flags & ~(FLAVORS | wxRE_ICASE | wxRE_NOSUB | wxRE_NEWLINE)),
199 _T("unrecognized flags in wxRegEx::Compile") );
200
201 // translate our flags to regcomp() ones
202 int flagsRE = 0;
203 if ( !(flags & wxRE_BASIC) )
204 #ifndef WX_NO_REGEX_ADVANCED
205 if (flags & wxRE_ADVANCED)
206 flagsRE |= REG_ADVANCED;
207 else
208 #endif
209 flagsRE |= REG_EXTENDED;
210 if ( flags & wxRE_ICASE )
211 flagsRE |= REG_ICASE;
212 if ( flags & wxRE_NOSUB )
213 flagsRE |= REG_NOSUB;
214 if ( flags & wxRE_NEWLINE )
215 flagsRE |= REG_NEWLINE;
216
217 // compile it
218 #ifdef WXREGEX_USING_BUILTIN
219 bool conv = true;
220 int errorcode = wx_re_comp(&m_RegEx, expr, expr.length(), flagsRE);
221 #else
222 const wxWX2MBbuf conv = expr.mbc_str();
223 int errorcode = conv ? regcomp(&m_RegEx, conv, flagsRE) : REG_BADPAT;
224 #endif
225
226 if ( errorcode )
227 {
228 wxLogError(_("Invalid regular expression '%s': %s"),
229 expr.c_str(), GetErrorMsg(errorcode, !conv).c_str());
230
231 m_isCompiled = false;
232 }
233 else // ok
234 {
235 // don't allocate the matches array now, but do it later if necessary
236 if ( flags & wxRE_NOSUB )
237 {
238 // we don't need it at all
239 m_nMatches = 0;
240 }
241 else
242 {
243 // we will alloc the array later (only if really needed) but count
244 // the number of sub-expressions in the regex right now
245
246 // there is always one for the whole expression
247 m_nMatches = 1;
248
249 // and some more for bracketed subexperessions
250 for ( const wxChar *cptr = expr.c_str(); *cptr; cptr++ )
251 {
252 if ( *cptr == _T('\\') )
253 {
254 // in basic RE syntax groups are inside \(...\)
255 if ( *++cptr == _T('(') && (flags & wxRE_BASIC) )
256 {
257 m_nMatches++;
258 }
259 }
260 else if ( *cptr == _T('(') && !(flags & wxRE_BASIC) )
261 {
262 // we know that the previous character is not an unquoted
263 // backslash because it would have been eaten above, so we
264 // have a bare '(' and this indicates a group start for the
265 // extended syntax. '(?' is used for extensions by perl-
266 // like REs (e.g. advanced), and is not valid for POSIX
267 // extended, so ignore them always.
268 if ( cptr[1] != _T('?') )
269 m_nMatches++;
270 }
271 }
272 }
273
274 m_isCompiled = true;
275 }
276
277 return IsValid();
278 }
279
280 bool wxRegExImpl::Matches(const wxRegChar *str,
281 int flags
282 WXREGEX_BUILTIN_ONLY(size_t len)) const
283 {
284 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
285
286 // translate our flags to regexec() ones
287 wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL)),
288 _T("unrecognized flags in wxRegEx::Matches") );
289
290 int flagsRE = 0;
291 if ( flags & wxRE_NOTBOL )
292 flagsRE |= REG_NOTBOL;
293 if ( flags & wxRE_NOTEOL )
294 flagsRE |= REG_NOTEOL;
295
296 // allocate matches array if needed
297 wxRegExImpl *self = wxConstCast(this, wxRegExImpl);
298 if ( !m_Matches && m_nMatches )
299 {
300 self->m_Matches = new regmatch_t[m_nMatches];
301 }
302
303 // do match it
304 #ifdef WXREGEX_USING_BUILTIN
305 int rc = wx_re_exec(&self->m_RegEx, str, len, NULL, m_nMatches, m_Matches, flagsRE);
306 #else
307 int rc = str ? regexec(&self->m_RegEx, str, m_nMatches, m_Matches, flagsRE) : REG_BADPAT;
308 #endif
309
310 switch ( rc )
311 {
312 case 0:
313 // matched successfully
314 return true;
315
316 default:
317 // an error occurred
318 wxLogError(_("Failed to find match for regular expression: %s"),
319 GetErrorMsg(rc, !str).c_str());
320 // fall through
321
322 case REG_NOMATCH:
323 // no match
324 return false;
325 }
326 }
327
328 bool wxRegExImpl::GetMatch(size_t *start, size_t *len, size_t index) const
329 {
330 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
331 wxCHECK_MSG( m_nMatches, false, _T("can't use with wxRE_NOSUB") );
332 wxCHECK_MSG( m_Matches, false, _T("must call Matches() first") );
333 wxCHECK_MSG( index < m_nMatches, false, _T("invalid match index") );
334
335 const regmatch_t& match = m_Matches[index];
336
337 // we need the casts because rm_so can be a 64 bit quantity
338 if ( start )
339 *start = wx_truncate_cast(size_t, match.rm_so);
340 if ( len )
341 *len = wx_truncate_cast(size_t, match.rm_eo - match.rm_so);
342
343 return true;
344 }
345
346 size_t wxRegExImpl::GetMatchCount() const
347 {
348 wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") );
349 wxCHECK_MSG( m_nMatches, 0, _T("can't use with wxRE_NOSUB") );
350
351 return m_nMatches;
352 }
353
354 int wxRegExImpl::Replace(wxString *text,
355 const wxString& replacement,
356 size_t maxMatches) const
357 {
358 wxCHECK_MSG( text, wxNOT_FOUND, _T("NULL text in wxRegEx::Replace") );
359 wxCHECK_MSG( IsValid(), wxNOT_FOUND, _T("must successfully Compile() first") );
360
361 // the input string
362 #ifndef WXREGEX_CONVERT_TO_MB
363 const wxChar *textstr = text->c_str();
364 size_t textlen = text->length();
365 #else
366 const wxWX2MBbuf textstr = wxConvertWX2MB(*text);
367 if (!textstr)
368 {
369 wxLogError(_("Failed to find match for regular expression: %s"),
370 GetErrorMsg(0, true).c_str());
371 return 0;
372 }
373 size_t textlen = strlen(textstr);
374 text->clear();
375 #endif
376
377 // the replacement text
378 wxString textNew;
379
380 // the result, allow 25% extra
381 wxString result;
382 result.reserve(5 * textlen / 4);
383
384 // attempt at optimization: don't iterate over the string if it doesn't
385 // contain back references at all
386 bool mayHaveBackrefs =
387 replacement.find_first_of(_T("\\&")) != wxString::npos;
388
389 if ( !mayHaveBackrefs )
390 {
391 textNew = replacement;
392 }
393
394 // the position where we start looking for the match
395 size_t matchStart = 0;
396
397 // number of replacement made: we won't make more than maxMatches of them
398 // (unless maxMatches is 0 which doesn't limit the number of replacements)
399 size_t countRepl = 0;
400
401 // note that "^" shouldn't match after the first call to Matches() so we
402 // use wxRE_NOTBOL to prevent it from happening
403 while ( (!maxMatches || countRepl < maxMatches) &&
404 Matches(textstr + matchStart,
405 countRepl ? wxRE_NOTBOL : 0
406 WXREGEX_BUILTIN_ONLY(textlen - matchStart)) )
407 {
408 // the string possibly contains back references: we need to calculate
409 // the replacement text anew after each match
410 if ( mayHaveBackrefs )
411 {
412 mayHaveBackrefs = false;
413 textNew.clear();
414 textNew.reserve(replacement.length());
415
416 for ( const wxChar *p = replacement.c_str(); *p; p++ )
417 {
418 size_t index = (size_t)-1;
419
420 if ( *p == _T('\\') )
421 {
422 if ( wxIsdigit(*++p) )
423 {
424 // back reference
425 wxChar *end;
426 index = (size_t)wxStrtoul(p, &end, 10);
427 p = end - 1; // -1 to compensate for p++ in the loop
428 }
429 //else: backslash used as escape character
430 }
431 else if ( *p == _T('&') )
432 {
433 // treat this as "\0" for compatbility with ed and such
434 index = 0;
435 }
436
437 // do we have a back reference?
438 if ( index != (size_t)-1 )
439 {
440 // yes, get its text
441 size_t start, len;
442 if ( !GetMatch(&start, &len, index) )
443 {
444 wxFAIL_MSG( _T("invalid back reference") );
445
446 // just eat it...
447 }
448 else
449 {
450 textNew += wxString(textstr + matchStart + start,
451 *wxConvCurrent, len);
452
453 mayHaveBackrefs = true;
454 }
455 }
456 else // ordinary character
457 {
458 textNew += *p;
459 }
460 }
461 }
462
463 size_t start, len;
464 if ( !GetMatch(&start, &len) )
465 {
466 // we did have match as Matches() returned true above!
467 wxFAIL_MSG( _T("internal logic error in wxRegEx::Replace") );
468
469 return wxNOT_FOUND;
470 }
471
472 // an insurance against implementations that don't grow exponentially
473 // to ensure building the result takes linear time
474 if (result.capacity() < result.length() + start + textNew.length())
475 result.reserve(2 * result.length());
476
477 #ifndef WXREGEX_CONVERT_TO_MB
478 result.append(*text, matchStart, start);
479 #else
480 result.append(wxString(textstr + matchStart, *wxConvCurrent, start));
481 #endif
482 matchStart += start;
483 result.append(textNew);
484
485 countRepl++;
486
487 matchStart += len;
488 }
489
490 #ifndef WXREGEX_CONVERT_TO_MB
491 result.append(*text, matchStart, wxString::npos);
492 #else
493 result.append(wxString(textstr + matchStart, *wxConvCurrent));
494 #endif
495 *text = result;
496
497 return countRepl;
498 }
499
500 // ----------------------------------------------------------------------------
501 // wxRegEx: all methods are mostly forwarded to wxRegExImpl
502 // ----------------------------------------------------------------------------
503
504 void wxRegEx::Init()
505 {
506 m_impl = NULL;
507 }
508
509 wxRegEx::~wxRegEx()
510 {
511 delete m_impl;
512 }
513
514 bool wxRegEx::Compile(const wxString& expr, int flags)
515 {
516 if ( !m_impl )
517 {
518 m_impl = new wxRegExImpl;
519 }
520
521 if ( !m_impl->Compile(expr, flags) )
522 {
523 // error message already given in wxRegExImpl::Compile
524 delete m_impl;
525 m_impl = NULL;
526
527 return false;
528 }
529
530 return true;
531 }
532
533 bool wxRegEx::Matches(const wxChar *str, int flags, size_t len) const
534 {
535 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
536 (void)len;
537
538 #ifdef WXREGEX_CONVERT_TO_MB
539 return m_impl->Matches(wxConvertWX2MB(str), flags);
540 #else
541 return m_impl->Matches(str, flags WXREGEX_BUILTIN_ONLY(len));
542 #endif
543 }
544
545 bool wxRegEx::Matches(const wxChar *str, int flags) const
546 {
547 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
548
549 #ifdef WXREGEX_CONVERT_TO_MB
550 return m_impl->Matches(wxConvertWX2MB(str), flags);
551 #else
552 return m_impl->Matches(str, flags WXREGEX_BUILTIN_ONLY(wxStrlen(str)));
553 #endif
554 }
555
556 bool wxRegEx::GetMatch(size_t *start, size_t *len, size_t index) const
557 {
558 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
559
560 return m_impl->GetMatch(start, len, index);
561 }
562
563 wxString wxRegEx::GetMatch(const wxString& text, size_t index) const
564 {
565 size_t start, len;
566 if ( !GetMatch(&start, &len, index) )
567 return wxEmptyString;
568
569 return text.Mid(start, len);
570 }
571
572 size_t wxRegEx::GetMatchCount() const
573 {
574 wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") );
575
576 return m_impl->GetMatchCount();
577 }
578
579 int wxRegEx::Replace(wxString *pattern,
580 const wxString& replacement,
581 size_t maxMatches) const
582 {
583 wxCHECK_MSG( IsValid(), wxNOT_FOUND, _T("must successfully Compile() first") );
584
585 return m_impl->Replace(pattern, replacement, maxMatches);
586 }
587
588 #endif // wxUSE_REGEX