]>
git.saurik.com Git - wxWidgets.git/blob - src/common/regex.cpp
   1 /////////////////////////////////////////////////////////////////////////////// 
   2 // Name:        src/common/regex.cpp 
   3 // Purpose:     regular expression matching 
   4 // Author:      Karsten Ballüder and Vadim Zeitlin 
   8 // Copyright:   (c) 2000 Karsten Ballüder <ballueder@gmx.net> 
   9 //                  2001 Vadim Zeitlin <vadim@wxwindows.org> 
  10 // Licence:     wxWindows licence 
  11 /////////////////////////////////////////////////////////////////////////////// 
  13 // ============================================================================ 
  15 // ============================================================================ 
  17 // ---------------------------------------------------------------------------- 
  19 // ---------------------------------------------------------------------------- 
  21 // For compilers that support precompilation, includes "wx.h". 
  22 #include "wx/wxprec.h" 
  31     #include "wx/object.h" 
  32     #include "wx/string.h" 
  37 // FreeBSD, Watcom and DMars require this, CW doesn't have nor need it. 
  38 // Others also don't seem to need it. If you have an error related to 
  39 // (not) including <sys/types.h> please report details to 
  40 // wx-dev@lists.wxwindows.org 
  41 #if defined(__UNIX__) || defined(__WATCOMC__) || defined(__DIGITALMARS__) 
  42 #   include <sys/types.h> 
  48 // WXREGEX_USING_BUILTIN    defined when using the built-in regex lib 
  49 // WXREGEX_USING_RE_SEARCH  defined when using re_search in the GNU regex lib 
  50 // WXREGEX_IF_NEED_LEN()    wrap the len parameter only used with the built-in 
  52 // WXREGEX_CONVERT_TO_MB    defined when the regex lib is using chars and 
  53 //                          wxChar is wide, so conversion must be done 
  54 // WXREGEX_CHAR(x)          Convert wxChar to wxRegChar 
  57 #   define WXREGEX_USING_BUILTIN 
  58 #   define WXREGEX_IF_NEED_LEN(x) ,x 
  59 #   define WXREGEX_CHAR(x) x 
  61 #   ifdef HAVE_RE_SEARCH 
  62 #       define WXREGEX_IF_NEED_LEN(x) ,x 
  63 #       define WXREGEX_USING_RE_SEARCH 
  65 #       define WXREGEX_IF_NEED_LEN(x) 
  68 #       define WXREGEX_CONVERT_TO_MB 
  70 #   define WXREGEX_CHAR(x) wxConvertWX2MB(x) 
  71 #   define wx_regfree regfree 
  72 #   define wx_regerror regerror 
  75 // ---------------------------------------------------------------------------- 
  77 // ---------------------------------------------------------------------------- 
  79 #ifndef WXREGEX_USING_RE_SEARCH 
  81 // the array of offsets for the matches, the usual POSIX regmatch_t array. 
  85     typedef regmatch_t 
*match_type
; 
  87     wxRegExMatches(size_t n
)        { m_matches 
= new regmatch_t
[n
]; } 
  88     ~wxRegExMatches()               { delete [] m_matches
; } 
  90     // we just use casts here because the fields of regmatch_t struct may be 64 
  91     // bit but we're limited to size_t in our public API and are not going to 
  92     // change it because operating on strings longer than 4GB using it is 
  93     // absolutely impractical anyhow 
  94     size_t Start(size_t n
) const 
  96         return wx_truncate_cast(size_t, m_matches
[n
].rm_so
); 
  99     size_t End(size_t n
) const 
 101         return wx_truncate_cast(size_t, m_matches
[n
].rm_eo
); 
 104     regmatch_t 
*get() const         { return m_matches
; } 
 107     regmatch_t 
*m_matches
; 
 110 #else // WXREGEX_USING_RE_SEARCH 
 112 // the array of offsets for the matches, the struct used by the GNU lib 
 116     typedef re_registers 
*match_type
; 
 118     wxRegExMatches(size_t n
) 
 120         m_matches
.num_regs 
= n
; 
 121         m_matches
.start 
= new regoff_t
[n
]; 
 122         m_matches
.end 
= new regoff_t
[n
]; 
 127         delete [] m_matches
.start
; 
 128         delete [] m_matches
.end
; 
 131     size_t Start(size_t n
) const    { return m_matches
.start
[n
]; } 
 132     size_t End(size_t n
) const      { return m_matches
.end
[n
]; } 
 134     re_registers 
*get()             { return &m_matches
; } 
 137     re_registers m_matches
; 
 140 #endif // WXREGEX_USING_RE_SEARCH 
 142 // the character type used by the regular expression engine 
 143 #ifndef WXREGEX_CONVERT_TO_MB 
 144 typedef wxChar wxRegChar
; 
 146 typedef char wxRegChar
; 
 149 // the real implementation of wxRegEx 
 157     // return true if Compile() had been called successfully 
 158     bool IsValid() const { return m_isCompiled
; } 
 161     bool Compile(const wxString
& expr
, int flags 
= 0); 
 162     bool Matches(const wxRegChar 
*str
, int flags
 
 163                  WXREGEX_IF_NEED_LEN(size_t len
)) const; 
 164     bool GetMatch(size_t *start
, size_t *len
, size_t index 
= 0) const; 
 165     size_t GetMatchCount() const; 
 166     int Replace(wxString 
*pattern
, const wxString
& replacement
, 
 167                 size_t maxMatches 
= 0) const; 
 170     // return the string containing the error message for the given err code 
 171     wxString 
GetErrorMsg(int errorcode
, bool badconv
) const; 
 176         m_isCompiled 
= false; 
 181     // free the RE if compiled 
 186             wx_regfree(&m_RegEx
); 
 192     // free the RE if any and reinit the members 
 202     // the subexpressions data 
 203     wxRegExMatches 
*m_Matches
; 
 206     // true if m_RegEx is valid 
 211 // ============================================================================ 
 213 // ============================================================================ 
 215 // ---------------------------------------------------------------------------- 
 217 // ---------------------------------------------------------------------------- 
 219 wxRegExImpl::wxRegExImpl() 
 224 wxRegExImpl::~wxRegExImpl() 
 229 wxString 
wxRegExImpl::GetErrorMsg(int errorcode
, bool badconv
) const 
 231 #ifdef WXREGEX_CONVERT_TO_MB 
 232     // currently only needed when using system library in Unicode mode 
 235         return _("conversion to 8-bit encoding failed"); 
 238     // 'use' badconv to avoid a compiler warning 
 244     // first get the string length needed 
 245     int len 
= wx_regerror(errorcode
, &m_RegEx
, NULL
, 0); 
 248         char* szcmbError 
= new char[++len
]; 
 250         (void)wx_regerror(errorcode
, &m_RegEx
, szcmbError
, len
); 
 252         szError 
= wxConvertMB2WX(szcmbError
); 
 253         delete [] szcmbError
; 
 255     else // regerror() returned 0 
 257         szError 
= _("unknown error"); 
 263 bool wxRegExImpl::Compile(const wxString
& expr
, int flags
) 
 267 #ifdef WX_NO_REGEX_ADVANCED 
 268 #   define FLAVORS wxRE_BASIC 
 270 #   define FLAVORS (wxRE_ADVANCED | wxRE_BASIC) 
 271     wxASSERT_MSG( (flags 
& FLAVORS
) != FLAVORS
, 
 272                   _T("incompatible flags in wxRegEx::Compile") ); 
 274     wxASSERT_MSG( !(flags 
& ~(FLAVORS 
| wxRE_ICASE 
| wxRE_NOSUB 
| wxRE_NEWLINE
)), 
 275                   _T("unrecognized flags in wxRegEx::Compile") ); 
 277     // translate our flags to regcomp() ones 
 279     if ( !(flags 
& wxRE_BASIC
) ) 
 280 #ifndef WX_NO_REGEX_ADVANCED 
 281         if (flags 
& wxRE_ADVANCED
) 
 282             flagsRE 
|= REG_ADVANCED
; 
 285             flagsRE 
|= REG_EXTENDED
; 
 286     if ( flags 
& wxRE_ICASE 
) 
 287         flagsRE 
|= REG_ICASE
; 
 288     if ( flags 
& wxRE_NOSUB 
) 
 289         flagsRE 
|= REG_NOSUB
; 
 290     if ( flags 
& wxRE_NEWLINE 
) 
 291         flagsRE 
|= REG_NEWLINE
; 
 294 #ifdef WXREGEX_USING_BUILTIN 
 296     int errorcode 
= wx_re_comp(&m_RegEx
, expr
, expr
.length(), flagsRE
); 
 298     const wxWX2MBbuf conv 
= expr
.mbc_str(); 
 299     int errorcode 
= conv 
? regcomp(&m_RegEx
, conv
, flagsRE
) : REG_BADPAT
; 
 304         wxLogError(_("Invalid regular expression '%s': %s"), 
 305                    expr
.c_str(), GetErrorMsg(errorcode
, !conv
).c_str()); 
 307         m_isCompiled 
= false; 
 311         // don't allocate the matches array now, but do it later if necessary 
 312         if ( flags 
& wxRE_NOSUB 
) 
 314             // we don't need it at all 
 319             // we will alloc the array later (only if really needed) but count 
 320             // the number of sub-expressions in the regex right now 
 322             // there is always one for the whole expression 
 325             // and some more for bracketed subexperessions 
 326             for ( const wxChar 
*cptr 
= expr
.c_str(); *cptr
; cptr
++ ) 
 328                 if ( *cptr 
== _T('\\') ) 
 330                     // in basic RE syntax groups are inside \(...\) 
 331                     if ( *++cptr 
== _T('(') && (flags 
& wxRE_BASIC
) ) 
 336                 else if ( *cptr 
== _T('(') && !(flags 
& wxRE_BASIC
) ) 
 338                     // we know that the previous character is not an unquoted 
 339                     // backslash because it would have been eaten above, so we 
 340                     // have a bare '(' and this indicates a group start for the 
 341                     // extended syntax. '(?' is used for extensions by perl- 
 342                     // like REs (e.g. advanced), and is not valid for POSIX 
 343                     // extended, so ignore them always. 
 344                     if ( cptr
[1] != _T('?') ) 
 356 #ifdef WXREGEX_USING_RE_SEARCH 
 358 // On GNU, regexec is implemented as a wrapper around re_search. re_search 
 359 // requires a length parameter which the POSIX regexec does not have, 
 360 // therefore regexec must do a strlen on the search text each time it is 
 361 // called. This can drastically affect performance when matching is done in 
 362 // a loop along a string, such as during a search and replace. Therefore if 
 363 // re_search is detected by configure, it is used directly. 
 365 static int ReSearch(const regex_t 
*preg
, 
 368                     re_registers 
*matches
, 
 371     regex_t 
*pattern 
= wx_const_cast(regex_t
*, preg
); 
 373     pattern
->not_bol 
= (eflags 
& REG_NOTBOL
) != 0; 
 374     pattern
->not_eol 
= (eflags 
& REG_NOTEOL
) != 0; 
 375     pattern
->regs_allocated 
= REGS_FIXED
; 
 377     int ret 
= re_search(pattern
, text
, len
, 0, len
, matches
); 
 378     return ret 
>= 0 ? 0 : REG_NOMATCH
; 
 381 #endif // WXREGEX_USING_RE_SEARCH 
 383 bool wxRegExImpl::Matches(const wxRegChar 
*str
, 
 385                           WXREGEX_IF_NEED_LEN(size_t len
)) const 
 387     wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") ); 
 389     // translate our flags to regexec() ones 
 390     wxASSERT_MSG( !(flags 
& ~(wxRE_NOTBOL 
| wxRE_NOTEOL
)), 
 391                   _T("unrecognized flags in wxRegEx::Matches") ); 
 394     if ( flags 
& wxRE_NOTBOL 
) 
 395         flagsRE 
|= REG_NOTBOL
; 
 396     if ( flags 
& wxRE_NOTEOL 
) 
 397         flagsRE 
|= REG_NOTEOL
; 
 399     // allocate matches array if needed 
 400     wxRegExImpl 
*self 
= wxConstCast(this, wxRegExImpl
); 
 401     if ( !m_Matches 
&& m_nMatches 
) 
 403         self
->m_Matches 
= new wxRegExMatches(m_nMatches
); 
 406     wxRegExMatches::match_type matches 
= m_Matches 
? m_Matches
->get() : NULL
; 
 409 #if defined WXREGEX_USING_BUILTIN 
 410     int rc 
= wx_re_exec(&self
->m_RegEx
, str
, len
, NULL
, m_nMatches
, matches
, flagsRE
); 
 411 #elif defined WXREGEX_USING_RE_SEARCH 
 412     int rc 
= str 
? ReSearch(&self
->m_RegEx
, str
, len
, matches
, flagsRE
) : REG_BADPAT
; 
 414     int rc 
= str 
? regexec(&self
->m_RegEx
, str
, m_nMatches
, matches
, flagsRE
) : REG_BADPAT
; 
 420             // matched successfully 
 425             wxLogError(_("Failed to find match for regular expression: %s"), 
 426                        GetErrorMsg(rc
, !str
).c_str()); 
 435 bool wxRegExImpl::GetMatch(size_t *start
, size_t *len
, size_t index
) const 
 437     wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") ); 
 438     wxCHECK_MSG( m_nMatches
, false, _T("can't use with wxRE_NOSUB") ); 
 439     wxCHECK_MSG( m_Matches
, false, _T("must call Matches() first") ); 
 440     wxCHECK_MSG( index 
< m_nMatches
, false, _T("invalid match index") ); 
 443         *start 
= m_Matches
->Start(index
); 
 445         *len 
= m_Matches
->End(index
) - m_Matches
->Start(index
); 
 450 size_t wxRegExImpl::GetMatchCount() const 
 452     wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") ); 
 453     wxCHECK_MSG( m_nMatches
, 0, _T("can't use with wxRE_NOSUB") ); 
 458 int wxRegExImpl::Replace(wxString 
*text
, 
 459                          const wxString
& replacement
, 
 460                          size_t maxMatches
) const 
 462     wxCHECK_MSG( text
, wxNOT_FOUND
, _T("NULL text in wxRegEx::Replace") ); 
 463     wxCHECK_MSG( IsValid(), wxNOT_FOUND
, _T("must successfully Compile() first") ); 
 466 #ifndef WXREGEX_CONVERT_TO_MB 
 467     const wxChar 
*textstr 
= text
->c_str(); 
 468     size_t textlen 
= text
->length(); 
 470     const wxWX2MBbuf textstr 
= WXREGEX_CHAR(*text
); 
 473         wxLogError(_("Failed to find match for regular expression: %s"), 
 474                    GetErrorMsg(0, true).c_str()); 
 477     size_t textlen 
= strlen(textstr
); 
 481     // the replacement text 
 484     // the result, allow 25% extra 
 486     result
.reserve(5 * textlen 
/ 4); 
 488     // attempt at optimization: don't iterate over the string if it doesn't 
 489     // contain back references at all 
 490     bool mayHaveBackrefs 
= 
 491         replacement
.find_first_of(_T("\\&")) != wxString::npos
; 
 493     if ( !mayHaveBackrefs 
) 
 495         textNew 
= replacement
; 
 498     // the position where we start looking for the match 
 499     size_t matchStart 
= 0; 
 501     // number of replacement made: we won't make more than maxMatches of them 
 502     // (unless maxMatches is 0 which doesn't limit the number of replacements) 
 503     size_t countRepl 
= 0; 
 505     // note that "^" shouldn't match after the first call to Matches() so we 
 506     // use wxRE_NOTBOL to prevent it from happening 
 507     while ( (!maxMatches 
|| countRepl 
< maxMatches
) && 
 508             Matches(textstr 
+ matchStart
, 
 509                     countRepl 
? wxRE_NOTBOL 
: 0 
 510                     WXREGEX_IF_NEED_LEN(textlen 
- matchStart
)) ) 
 512         // the string possibly contains back references: we need to calculate 
 513         // the replacement text anew after each match 
 514         if ( mayHaveBackrefs 
) 
 516             mayHaveBackrefs 
= false; 
 518             textNew
.reserve(replacement
.length()); 
 520             for ( const wxChar 
*p 
= replacement
.c_str(); *p
; p
++ ) 
 522                 size_t index 
= (size_t)-1; 
 524                 if ( *p 
== _T('\\') ) 
 526                     if ( wxIsdigit(*++p
) ) 
 530                         index 
= (size_t)wxStrtoul(p
, &end
, 10); 
 531                         p 
= end 
- 1; // -1 to compensate for p++ in the loop 
 533                     //else: backslash used as escape character 
 535                 else if ( *p 
== _T('&') ) 
 537                     // treat this as "\0" for compatbility with ed and such 
 541                 // do we have a back reference? 
 542                 if ( index 
!= (size_t)-1 ) 
 546                     if ( !GetMatch(&start
, &len
, index
) ) 
 548                         wxFAIL_MSG( _T("invalid back reference") ); 
 554                         textNew 
+= wxString(textstr 
+ matchStart 
+ start
, 
 555                                             *wxConvCurrent
, len
); 
 557                         mayHaveBackrefs 
= true; 
 560                 else // ordinary character 
 568         if ( !GetMatch(&start
, &len
) ) 
 570             // we did have match as Matches() returned true above! 
 571             wxFAIL_MSG( _T("internal logic error in wxRegEx::Replace") ); 
 576         // an insurance against implementations that don't grow exponentially 
 577         // to ensure building the result takes linear time 
 578         if (result
.capacity() < result
.length() + start 
+ textNew
.length()) 
 579             result
.reserve(2 * result
.length()); 
 581 #ifndef WXREGEX_CONVERT_TO_MB 
 582         result
.append(*text
, matchStart
, start
); 
 584         result
.append(wxString(textstr 
+ matchStart
, *wxConvCurrent
, start
)); 
 587         result
.append(textNew
); 
 594 #ifndef WXREGEX_CONVERT_TO_MB 
 595     result
.append(*text
, matchStart
, wxString::npos
); 
 597     result
.append(wxString(textstr 
+ matchStart
, *wxConvCurrent
)); 
 604 // ---------------------------------------------------------------------------- 
 605 // wxRegEx: all methods are mostly forwarded to wxRegExImpl 
 606 // ---------------------------------------------------------------------------- 
 618 bool wxRegEx::Compile(const wxString
& expr
, int flags
) 
 622         m_impl 
= new wxRegExImpl
; 
 625     if ( !m_impl
->Compile(expr
, flags
) ) 
 627         // error message already given in wxRegExImpl::Compile 
 637 bool wxRegEx::Matches(const wxChar 
*str
, int flags
, size_t len
) const 
 639     wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") ); 
 642     return m_impl
->Matches(WXREGEX_CHAR(str
), flags 
WXREGEX_IF_NEED_LEN(len
)); 
 645 bool wxRegEx::Matches(const wxChar 
*str
, int flags
) const 
 647     wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") ); 
 649     return m_impl
->Matches(WXREGEX_CHAR(str
), 
 651                            WXREGEX_IF_NEED_LEN(wxStrlen(str
))); 
 654 bool wxRegEx::GetMatch(size_t *start
, size_t *len
, size_t index
) const 
 656     wxCHECK_MSG( IsValid(), false, _T("must successfully Compile() first") ); 
 658     return m_impl
->GetMatch(start
, len
, index
); 
 661 wxString 
wxRegEx::GetMatch(const wxString
& text
, size_t index
) const 
 664     if ( !GetMatch(&start
, &len
, index
) ) 
 665         return wxEmptyString
; 
 667     return text
.Mid(start
, len
); 
 670 size_t wxRegEx::GetMatchCount() const 
 672     wxCHECK_MSG( IsValid(), 0, _T("must successfully Compile() first") ); 
 674     return m_impl
->GetMatchCount(); 
 677 int wxRegEx::Replace(wxString 
*pattern
, 
 678                      const wxString
& replacement
, 
 679                      size_t maxMatches
) const 
 681     wxCHECK_MSG( IsValid(), wxNOT_FOUND
, _T("must successfully Compile() first") ); 
 683     return m_impl
->Replace(pattern
, replacement
, maxMatches
); 
 686 #endif // wxUSE_REGEX