]>
git.saurik.com Git - wxWidgets.git/blob - src/common/regex.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/regex.cpp
3 // Purpose: regular expression matching
4 // Author: Karsten Ballüder and Vadim Zeitlin
8 // Copyright: (c) 2000 Karsten Ballüder <ballueder@gmx.net>
9 // 2001 Vadim Zeitlin <vadim@wxwindows.org>
10 // Licence: wxWindows licence
11 ///////////////////////////////////////////////////////////////////////////////
13 // ============================================================================
15 // ============================================================================
17 // ----------------------------------------------------------------------------
19 // ----------------------------------------------------------------------------
21 // For compilers that support precompilation, includes "wx.h".
22 #include "wx/wxprec.h"
31 #include "wx/object.h"
32 #include "wx/string.h"
37 // FreeBSD, Watcom and Digital Mars require this; CW neither has nor needs it.
38 // Others also don't seem to need it. If you have an error related to
39 // (not) including <sys/types.h> please report details to
40 // wx-dev@lists.wxwindows.org
41 #if defined(__UNIX__) || defined(__WATCOMC__) || defined(__DIGITALMARS__)
42 # include <sys/types.h>
48 // defined when the regex lib uses 'char' but 'wxChar' is wide
49 #if wxUSE_UNICODE && !defined(__REG_NOFRONT)
50 # define WXREGEX_CONVERT_TO_MB
53 // ----------------------------------------------------------------------------
55 // ----------------------------------------------------------------------------
57 // the character type used by the regular expression engine
58 #ifndef WXREGEX_CONVERT_TO_MB
59 typedef wxChar wxRegChar
;
61 typedef char wxRegChar
;
64 // the real implementation of wxRegEx
72 // return true if Compile() had been called successfully
73 bool IsValid() const { return m_isCompiled
; }
76 bool Compile(const wxString
& expr
, int flags
= 0);
77 bool Matches(const wxRegChar
*str
, int flags
, size_t len
) const;
78 bool GetMatch(size_t *start
, size_t *len
, size_t index
= 0) const;
79 size_t GetMatchCount() const;
80 int Replace(wxString
*pattern
, const wxString
& replacement
,
81 size_t maxMatches
= 0) const;
84 // return the string containing the error message for the given err code
85 wxString
GetErrorMsg(int errorcode
, bool badconv
) const;
95 // free the RE if compiled
106 // free the RE if any and reinit the members
117 // the subexpressions data
118 regmatch_t
*m_Matches
;
121 // true if m_RegEx is valid
125 // ============================================================================
127 // ============================================================================
129 // ----------------------------------------------------------------------------
131 // ----------------------------------------------------------------------------
133 wxRegExImpl::wxRegExImpl()
138 wxRegExImpl::~wxRegExImpl()
143 wxString
wxRegExImpl::GetErrorMsg(int errorcode
, bool badconv
) const
145 #ifdef WXREGEX_CONVERT_TO_MB
146 // currently only needed when using system library in Unicode mode
149 return _("conversion to 8-bit encoding failed");
152 // 'use' badconv to avoid a compiler warning
158 // first get the string length needed
159 int len
= regerror(errorcode
, &m_RegEx
, NULL
, 0);
162 char* szcmbError
= new char[++len
];
164 (void)regerror(errorcode
, &m_RegEx
, szcmbError
, len
);
166 szError
= wxConvertMB2WX(szcmbError
);
167 delete [] szcmbError
;
169 else // regerror() returned 0
171 szError
= _("unknown error");
177 bool wxRegExImpl::Compile(const wxString
& expr
, int flags
)
181 #ifdef WX_NO_REGEX_ADVANCED
182 # define FLAVORS wxRE_BASIC
184 # define FLAVORS (wxRE_ADVANCED | wxRE_BASIC)
185 wxASSERT_MSG( (flags
& FLAVORS
) != FLAVORS
,
186 _T("incompatible flags in wxRegEx::Compile") );
188 wxASSERT_MSG( !(flags
& ~(FLAVORS
| wxRE_ICASE
| wxRE_NOSUB
| wxRE_NEWLINE
)),
189 _T("unrecognized flags in wxRegEx::Compile") );
191 // translate our flags to regcomp() ones
193 if ( !(flags
& wxRE_BASIC
) )
194 #ifndef WX_NO_REGEX_ADVANCED
195 if (flags
& wxRE_ADVANCED
)
196 flagsRE
|= REG_ADVANCED
;
199 flagsRE
|= REG_EXTENDED
;
200 if ( flags
& wxRE_ICASE
)
201 flagsRE
|= REG_ICASE
;
202 if ( flags
& wxRE_NOSUB
)
203 flagsRE
|= REG_NOSUB
;
204 if ( flags
& wxRE_NEWLINE
)
205 flagsRE
|= REG_NEWLINE
;
210 int errorcode
= wx_re_comp(&m_RegEx
, expr
, expr
.length(), flagsRE
);
212 const wxWX2MBbuf conv
= expr
.mbc_str();
213 int errorcode
= conv
? regcomp(&m_RegEx
, conv
, flagsRE
) : REG_BADPAT
;
218 wxLogError(_("Invalid regular expression '%s': %s"),
219 expr
.c_str(), GetErrorMsg(errorcode
, !conv
).c_str());
221 m_isCompiled
= false;
225 // don't allocate the matches array now, but do it later if necessary
226 if ( flags
& wxRE_NOSUB
)
228 // we don't need it at all
233 // we will alloc the array later (only if really needed) but count
234 // the number of sub-expressions in the regex right now
236 // there is always one for the whole expression
239 // and some more for bracketed subexpressions
240 for ( const wxChar
*cptr
= expr
.c_str(); *cptr
; cptr
++ )
242 if ( *cptr
== _T('\\') )
244 // in basic RE syntax groups are inside \(...\)
245 if ( *++cptr
== _T('(') && (flags
& wxRE_BASIC
) )
250 else if ( *cptr
== _T('(') && !(flags
& wxRE_BASIC
) )
252 // we know that the previous character is not an unquoted
253 // backslash because it would have been eaten above, so we
254 // have a bare '(' and this indicates a group start for the
255 // extended syntax. '(?' is used for extensions by perl-
256 // like REs (e.g. advanced), and is not valid for POSIX
257 // extended, so ignore them always.
258 if ( cptr
[1] != _T('?') )
270 bool wxRegExImpl::Matches(const wxRegChar
*str
, int flags
, size_t len
) const
272 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
274 // translate our flags to regexec() ones
275 wxASSERT_MSG( !(flags
& ~(wxRE_NOTBOL
| wxRE_NOTEOL
)),
276 _T("unrecognized flags in wxRegEx::Matches") );
279 if ( flags
& wxRE_NOTBOL
)
280 flagsRE
|= REG_NOTBOL
;
281 if ( flags
& wxRE_NOTEOL
)
282 flagsRE
|= REG_NOTEOL
;
284 // allocate matches array if needed
285 wxRegExImpl
*self
= wxConstCast(this, wxRegExImpl
);
286 if ( !m_Matches
&& m_nMatches
)
288 self
->m_Matches
= new regmatch_t
[m_nMatches
];
293 int rc
= wx_re_exec(&self
->m_RegEx
, str
, len
, NULL
, m_nMatches
, m_Matches
, flagsRE
);
295 int rc
= str
? regexec(&self
->m_RegEx
, str
, m_nMatches
, m_Matches
, flagsRE
) : REG_BADPAT
;
301 // matched successfully
306 wxLogError(_("Failed to find match for regular expression: %s"),
307 GetErrorMsg(rc
, !str
).c_str());
316 bool wxRegExImpl::GetMatch(size_t *start
, size_t *len
, size_t index
) const
318 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
319 wxCHECK_MSG( m_nMatches
, false, _T("can't use with wxRE_NOSUB") );
320 wxCHECK_MSG( m_Matches
, false, _T("must call Matches() first") );
321 wxCHECK_MSG( index
< m_nMatches
, false, _T("invalid match index") );
323 const regmatch_t
& match
= m_Matches
[index
];
325 // we need the casts because rm_so can be a 64 bit quantity
327 *start
= wx_truncate_cast(size_t, match
.rm_so
);
329 *len
= wx_truncate_cast(size_t, match
.rm_eo
- match
.rm_so
);
334 size_t wxRegExImpl::GetMatchCount() const
336 wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") );
337 wxCHECK_MSG( m_nMatches
, 0, _T("can't use with wxRE_NOSUB") );
342 int wxRegExImpl::Replace(wxString
*text
,
343 const wxString
& replacement
,
344 size_t maxMatches
) const
346 wxCHECK_MSG( text
, wxNOT_FOUND
, _T("NULL text in wxRegEx::Replace") );
347 wxCHECK_MSG( IsValid(), wxNOT_FOUND
, _T("must successfully Compile() first") );
350 #ifndef WXREGEX_CONVERT_TO_MB
351 const wxChar
*textstr
= text
->c_str();
352 size_t textlen
= text
->length();
354 const wxWX2MBbuf textstr
= wxConvertWX2MB(*text
);
357 wxLogError(_("Failed to find match for regular expression: %s"),
358 GetErrorMsg(0, true).c_str());
361 size_t textlen
= strlen(textstr
);
365 // the replacement text
368 // the result, allow 25% extra
370 result
.reserve(5 * textlen
/ 4);
372 // attempt at optimization: don't iterate over the string if it doesn't
373 // contain back references at all
374 bool mayHaveBackrefs
=
375 replacement
.find_first_of(_T("\\&")) != wxString::npos
;
377 if ( !mayHaveBackrefs
)
379 textNew
= replacement
;
382 // the position where we start looking for the match
383 size_t matchStart
= 0;
385 // number of replacement made: we won't make more than maxMatches of them
386 // (unless maxMatches is 0 which doesn't limit the number of replacements)
387 size_t countRepl
= 0;
389 // note that "^" shouldn't match after the first call to Matches() so we
390 // use wxRE_NOTBOL to prevent it from happening
391 while ( (!maxMatches
|| countRepl
< maxMatches
) &&
392 Matches(textstr
+ matchStart
,
393 countRepl
? wxRE_NOTBOL
: 0,
394 textlen
- matchStart
) )
396 // the string possibly contains back references: we need to calculate
397 // the replacement text anew after each match
398 if ( mayHaveBackrefs
)
400 mayHaveBackrefs
= false;
402 textNew
.reserve(replacement
.length());
404 for ( const wxChar
*p
= replacement
.c_str(); *p
; p
++ )
406 size_t index
= (size_t)-1;
408 if ( *p
== _T('\\') )
410 if ( wxIsdigit(*++p
) )
414 index
= (size_t)wxStrtoul(p
, &end
, 10);
415 p
= end
- 1; // -1 to compensate for p++ in the loop
417 //else: backslash used as escape character
419 else if ( *p
== _T('&') )
421 // treat this as "\0" for compatibility with ed and such
425 // do we have a back reference?
426 if ( index
!= (size_t)-1 )
430 if ( !GetMatch(&start
, &len
, index
) )
432 wxFAIL_MSG( _T("invalid back reference") );
438 textNew
+= wxString(textstr
+ matchStart
+ start
,
439 *wxConvCurrent
, len
);
441 mayHaveBackrefs
= true;
444 else // ordinary character
452 if ( !GetMatch(&start
, &len
) )
454 // we did have a match, as Matches() returned true above!
455 wxFAIL_MSG( _T("internal logic error in wxRegEx::Replace") );
460 // an insurance against implementations that don't grow exponentially
461 // to ensure building the result takes linear time
462 if (result
.capacity() < result
.length() + start
+ textNew
.length())
463 result
.reserve(2 * result
.length());
465 #ifndef WXREGEX_CONVERT_TO_MB
466 result
.append(*text
, matchStart
, start
);
468 result
.append(wxString(textstr
+ matchStart
, *wxConvCurrent
, start
));
471 result
.append(textNew
);
478 #ifndef WXREGEX_CONVERT_TO_MB
479 result
.append(*text
, matchStart
, wxString::npos
);
481 result
.append(wxString(textstr
+ matchStart
, *wxConvCurrent
));
488 // ----------------------------------------------------------------------------
489 // wxRegEx: all methods are mostly forwarded to wxRegExImpl
490 // ----------------------------------------------------------------------------
502 bool wxRegEx::Compile(const wxString
& expr
, int flags
)
506 m_impl
= new wxRegExImpl
;
509 if ( !m_impl
->Compile(expr
, flags
) )
511 // error message already given in wxRegExImpl::Compile
521 bool wxRegEx::Matches(const wxChar
*str
, int flags
) const
523 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
525 #ifndef WXREGEX_CONVERT_TO_MB
526 return m_impl
->Matches(str
, flags
, wxStrlen(str
));
528 return m_impl
->Matches(wxConvertWX2MB(str
), flags
, wxStrlen(str
));
532 bool wxRegEx::GetMatch(size_t *start
, size_t *len
, size_t index
) const
534 wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") );
536 return m_impl
->GetMatch(start
, len
, index
);
539 wxString
wxRegEx::GetMatch(const wxString
& text
, size_t index
) const
542 if ( !GetMatch(&start
, &len
, index
) )
543 return wxEmptyString
;
545 return text
.Mid(start
, len
);
548 size_t wxRegEx::GetMatchCount() const
550 wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") );
552 return m_impl
->GetMatchCount();
555 int wxRegEx::Replace(wxString
*pattern
,
556 const wxString
& replacement
,
557 size_t maxMatches
) const
559 wxCHECK_MSG( IsValid(), wxNOT_FOUND
, _T("must successfully Compile() first") );
561 return m_impl
->Replace(pattern
, replacement
, maxMatches
);
564 #endif // wxUSE_REGEX