]>
git.saurik.com Git - wxWidgets.git/blob - src/common/regex.cpp
   1 /////////////////////////////////////////////////////////////////////////////// 
   2 // Name:        src/common/regex.cpp 
   3 // Purpose:     regular expression matching 
   4 // Author:      Karsten Ballueder and Vadim Zeitlin 
   8 // Copyright:   (c) 2000 Karsten Ballueder <ballueder@gmx.net> 
   9 //                  2001 Vadim Zeitlin <vadim@wxwindows.org> 
  10 // Licence:     wxWindows licence 
  11 /////////////////////////////////////////////////////////////////////////////// 
  13 // ============================================================================ 
  15 // ============================================================================ 
  17 // ---------------------------------------------------------------------------- 
  19 // ---------------------------------------------------------------------------- 
  21 // For compilers that support precompilation, includes "wx.h". 
  22 #include "wx/wxprec.h" 
  33     #include "wx/object.h" 
  39 // FreeBSD, Watcom and DMars require this, CW doesn't have nor need it. 
  40 // Others also don't seem to need it. If you have an error related to 
  41 // (not) including <sys/types.h> please report details to 
  42 // wx-dev@lists.wxwindows.org 
  43 #if defined(__UNIX__) || defined(__WATCOMC__) || defined(__DIGITALMARS__) 
  44 #   include <sys/types.h> 
  49 // WXREGEX_USING_BUILTIN    defined when using the built-in regex lib 
  50 // WXREGEX_USING_RE_SEARCH  defined when using re_search in the GNU regex lib 
  51 // WXREGEX_IF_NEED_LEN()    wrap the len parameter only used with the built-in 
  53 // WXREGEX_CONVERT_TO_MB    defined when the regex lib is using chars and 
  54 //                          wxChar is wide, so conversion must be done 
  55 // WXREGEX_CHAR(x)          Convert wxChar to wxRegChar 
  58 #   define WXREGEX_USING_BUILTIN 
  59 #   define WXREGEX_IF_NEED_LEN(x) ,x 
  61 #       define WXREGEX_CHAR(x) (x).wc_str() 
  63 #       define WXREGEX_CHAR(x) (x).mb_str() 
  66 #   ifdef HAVE_RE_SEARCH 
  67 #       define WXREGEX_IF_NEED_LEN(x) ,x 
  68 #       define WXREGEX_USING_RE_SEARCH 
  70 #       define WXREGEX_IF_NEED_LEN(x) 
  73 #       define WXREGEX_CONVERT_TO_MB 
  75 #   define WXREGEX_CHAR(x) (x).mb_str() 
  76 #   define wx_regfree regfree 
  77 #   define wx_regerror regerror 
  80 // ---------------------------------------------------------------------------- 
  82 // ---------------------------------------------------------------------------- 
  84 #ifndef WXREGEX_USING_RE_SEARCH 
  86 // the array of offsets for the matches, the usual POSIX regmatch_t array. 
  90     typedef regmatch_t 
*match_type
; 
  // Allocate the match array: m_matches is set to a new[]-allocated array of
  // n regmatch_t entries (the destructor releases it with delete []).
  92     wxRegExMatches(size_t n
)        { m_matches 
= new regmatch_t
[n
]; } 
  // Free the regmatch_t array held in m_matches; delete [] pairs with the
  // new regmatch_t[n] done in the constructor.
  93     ~wxRegExMatches()               { delete [] m_matches
; } 
  95     // we just use casts here because the fields of regmatch_t struct may be 64 
  96     // bit but we're limited to size_t in our public API and are not going to 
  97     // change it because operating on strings longer than 4GB using it is 
  98     // absolutely impractical anyhow 
  99     size_t Start(size_t n
) const 
 101         return wx_truncate_cast(size_t, m_matches
[n
].rm_so
); 
 104     size_t End(size_t n
) const 
 106         return wx_truncate_cast(size_t, m_matches
[n
].rm_eo
); 
 // Return the raw regmatch_t array stored in m_matches. The pointer is not
 // copied or released here: this object's destructor still deletes it.
 109     regmatch_t 
*get() const         { return m_matches
; } 
 112     regmatch_t 
*m_matches
; 
 115 #else // WXREGEX_USING_RE_SEARCH 
 117 // the array of offsets for the matches, the struct used by the GNU lib 
 121     typedef re_registers 
*match_type
; 
 123     wxRegExMatches(size_t n
) 
 125         m_matches
.num_regs 
= n
; 
 126         m_matches
.start 
= new regoff_t
[n
]; 
 127         m_matches
.end 
= new regoff_t
[n
]; 
 132         delete [] m_matches
.start
; 
 133         delete [] m_matches
.end
; 
 // Return entry n of the m_matches.start array (regoff_t values allocated in
 // the constructor), implicitly converted to size_t. n is not range-checked
 // here.
 136     size_t Start(size_t n
) const    { return m_matches
.start
[n
]; } 
 // Return entry n of the m_matches.end array (regoff_t values allocated in
 // the constructor), implicitly converted to size_t. n is not range-checked
 // here.
 137     size_t End(size_t n
) const      { return m_matches
.end
[n
]; } 
 // Return the address of the re_registers member m_matches, which is stored
 // by value in this object, so the pointer is only valid while the object
 // exists.
 139     re_registers 
*get()             { return &m_matches
; } 
 142     re_registers m_matches
; 
 145 #endif // WXREGEX_USING_RE_SEARCH 
 147 // the character type used by the regular expression engine 
 148 #ifndef WXREGEX_CONVERT_TO_MB 
 149 typedef wxChar wxRegChar
; 
 151 typedef char wxRegChar
; 
 154 // the real implementation of wxRegEx 
 162     // return true if Compile() has been called successfully, i.e. the
 // m_isCompiled flag is set (Compile() resets it to false on failure)
 163     bool IsValid() const { return m_isCompiled
; } 
 166     bool Compile(const wxString
& expr
, int flags 
= 0); 
 167     bool Matches(const wxRegChar 
*str
, int flags
 
 168                  WXREGEX_IF_NEED_LEN(size_t len
)) const; 
 169     bool GetMatch(size_t *start
, size_t *len
, size_t index 
= 0) const; 
 170     size_t GetMatchCount() const; 
 171     int Replace(wxString 
*pattern
, const wxString
& replacement
, 
 172                 size_t maxMatches 
= 0) const; 
 175     // return the string containing the error message for the given err code 
 176     wxString 
GetErrorMsg(int errorcode
, bool badconv
) const; 
 181         m_isCompiled 
= false; 
 186     // free the RE if compiled 
 191             wx_regfree(&m_RegEx
); 
 197     // free the RE if any and reinit the members 
 207     // the subexpressions data 
 208     wxRegExMatches 
*m_Matches
; 
 211     // true if m_RegEx is valid 
 216 // ============================================================================ 
 218 // ============================================================================ 
 220 // ---------------------------------------------------------------------------- 
 222 // ---------------------------------------------------------------------------- 
 224 wxRegExImpl::wxRegExImpl() 
 229 wxRegExImpl::~wxRegExImpl() 
 234 wxString 
wxRegExImpl::GetErrorMsg(int errorcode
, bool badconv
) const 
 236 #ifdef WXREGEX_CONVERT_TO_MB 
 237     // currently only needed when using system library in Unicode mode 
 240         return _("conversion to 8-bit encoding failed"); 
 243     // 'use' badconv to avoid a compiler warning 
 249     // first get the string length needed 
 250     int len 
= wx_regerror(errorcode
, &m_RegEx
, NULL
, 0); 
 253         char* szcmbError 
= new char[++len
]; 
 255         (void)wx_regerror(errorcode
, &m_RegEx
, szcmbError
, len
); 
 257         szError 
= wxConvLibc
.cMB2WX(szcmbError
); 
 258         delete [] szcmbError
; 
 260     else // regerror() returned 0 
 262         szError 
= _("unknown error"); 
 268 bool wxRegExImpl::Compile(const wxString
& expr
, int flags
) 
 272 #ifdef WX_NO_REGEX_ADVANCED 
 273 #   define FLAVORS wxRE_BASIC 
 275 #   define FLAVORS (wxRE_ADVANCED | wxRE_BASIC) 
 276     wxASSERT_MSG( (flags 
& FLAVORS
) != FLAVORS
, 
 277                   wxT("incompatible flags in wxRegEx::Compile") ); 
 279     wxASSERT_MSG( !(flags 
& ~(FLAVORS 
| wxRE_ICASE 
| wxRE_NOSUB 
| wxRE_NEWLINE
)), 
 280                   wxT("unrecognized flags in wxRegEx::Compile") ); 
 282     // translate our flags to regcomp() ones 
 284     if ( !(flags 
& wxRE_BASIC
) ) 
 286 #ifndef WX_NO_REGEX_ADVANCED 
 287         if (flags 
& wxRE_ADVANCED
) 
 288             flagsRE 
|= REG_ADVANCED
; 
 291             flagsRE 
|= REG_EXTENDED
; 
 293     if ( flags 
& wxRE_ICASE 
) 
 294         flagsRE 
|= REG_ICASE
; 
 295     if ( flags 
& wxRE_NOSUB 
) 
 296         flagsRE 
|= REG_NOSUB
; 
 297     if ( flags 
& wxRE_NEWLINE 
) 
 298         flagsRE 
|= REG_NEWLINE
; 
 301 #ifdef WXREGEX_USING_BUILTIN 
 303     // FIXME-UTF8: use wc_str() after removing ANSI build 
 304     int errorcode 
= wx_re_comp(&m_RegEx
, expr
.c_str(), expr
.length(), flagsRE
); 
 306     // FIXME-UTF8: this is potentially broken, we shouldn't even try it 
 307     //             and should always use builtin regex library (or PCRE?) 
 308     const wxWX2MBbuf conv 
= expr
.mbc_str(); 
 309     int errorcode 
= conv 
? regcomp(&m_RegEx
, conv
, flagsRE
) : REG_BADPAT
; 
 314         wxLogError(_("Invalid regular expression '%s': %s"), 
 315                    expr
.c_str(), GetErrorMsg(errorcode
, !conv
).c_str()); 
 317         m_isCompiled 
= false; 
 321         // don't allocate the matches array now, but do it later if necessary 
 322         if ( flags 
& wxRE_NOSUB 
) 
 324             // we don't need it at all 
 329             // we will alloc the array later (only if really needed) but count 
 330             // the number of sub-expressions in the regex right now 
 332             // there is always one for the whole expression 
 335             // and some more for bracketed subexpressions 
 336             for ( const wxChar 
*cptr 
= expr
.c_str(); *cptr
; cptr
++ ) 
 338                 if ( *cptr 
== wxT('\\') ) 
 340                     // in basic RE syntax groups are inside \(...\) 
 341                     if ( *++cptr 
== wxT('(') && (flags 
& wxRE_BASIC
) ) 
 346                 else if ( *cptr 
== wxT('(') && !(flags 
& wxRE_BASIC
) ) 
 348                     // we know that the previous character is not an unquoted 
 349                     // backslash because it would have been eaten above, so we 
 350                     // have a bare '(' and this indicates a group start for the 
 351                     // extended syntax. '(?' is used for extensions by perl- 
 352                     // like REs (e.g. advanced), and is not valid for POSIX 
 353                     // extended, so ignore them always. 
 354                     if ( cptr
[1] != wxT('?') ) 
 366 #ifdef WXREGEX_USING_RE_SEARCH 
 368 // On GNU, regexec is implemented as a wrapper around re_search. re_search 
 369 // requires a length parameter which the POSIX regexec does not have, 
 370 // therefore regexec must do a strlen on the search text each time it is 
 371 // called. This can drastically affect performance when matching is done in 
 372 // a loop along a string, such as during a search and replace. Therefore if 
 373 // re_search is detected by configure, it is used directly. 
 375 static int ReSearch(const regex_t 
*preg
, 
 378                     re_registers 
*matches
, 
 381     regex_t 
*pattern 
= const_cast<regex_t
*>(preg
); 
 383     pattern
->not_bol 
= (eflags 
& REG_NOTBOL
) != 0; 
 384     pattern
->not_eol 
= (eflags 
& REG_NOTEOL
) != 0; 
 385     pattern
->regs_allocated 
= REGS_FIXED
; 
 387     int ret 
= re_search(pattern
, text
, len
, 0, len
, matches
); 
 388     return ret 
>= 0 ? 0 : REG_NOMATCH
; 
 391 #endif // WXREGEX_USING_RE_SEARCH 
 393 bool wxRegExImpl::Matches(const wxRegChar 
*str
, 
 395                           WXREGEX_IF_NEED_LEN(size_t len
)) const 
 397     wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") ); 
 399     // translate our flags to regexec() ones 
 400     wxASSERT_MSG( !(flags 
& ~(wxRE_NOTBOL 
| wxRE_NOTEOL
)), 
 401                   wxT("unrecognized flags in wxRegEx::Matches") ); 
 404     if ( flags 
& wxRE_NOTBOL 
) 
 405         flagsRE 
|= REG_NOTBOL
; 
 406     if ( flags 
& wxRE_NOTEOL 
) 
 407         flagsRE 
|= REG_NOTEOL
; 
 409     // allocate matches array if needed 
 410     wxRegExImpl 
*self 
= wxConstCast(this, wxRegExImpl
); 
 411     if ( !m_Matches 
&& m_nMatches 
) 
 413         self
->m_Matches 
= new wxRegExMatches(m_nMatches
); 
 416     wxRegExMatches::match_type matches 
= m_Matches 
? m_Matches
->get() : NULL
; 
 419 #if defined WXREGEX_USING_BUILTIN 
 420     int rc 
= wx_re_exec(&self
->m_RegEx
, str
, len
, NULL
, m_nMatches
, matches
, flagsRE
); 
 421 #elif defined WXREGEX_USING_RE_SEARCH 
 422     int rc 
= str 
? ReSearch(&self
->m_RegEx
, str
, len
, matches
, flagsRE
) : REG_BADPAT
; 
 424     int rc 
= str 
? regexec(&self
->m_RegEx
, str
, m_nMatches
, matches
, flagsRE
) : REG_BADPAT
; 
 430             // matched successfully 
 435             wxLogError(_("Failed to find match for regular expression: %s"), 
 436                        GetErrorMsg(rc
, !str
).c_str()); 
 445 bool wxRegExImpl::GetMatch(size_t *start
, size_t *len
, size_t index
) const 
 447     wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") ); 
 448     wxCHECK_MSG( m_nMatches
, false, wxT("can't use with wxRE_NOSUB") ); 
 449     wxCHECK_MSG( m_Matches
, false, wxT("must call Matches() first") ); 
 450     wxCHECK_MSG( index 
< m_nMatches
, false, wxT("invalid match index") ); 
 453         *start 
= m_Matches
->Start(index
); 
 455         *len 
= m_Matches
->End(index
) - m_Matches
->Start(index
); 
 460 size_t wxRegExImpl::GetMatchCount() const 
 462     wxCHECK_MSG( IsValid(), 0, wxT("must successfully Compile() first") ); 
 463     wxCHECK_MSG( m_nMatches
, 0, wxT("can't use with wxRE_NOSUB") ); 
 468 int wxRegExImpl::Replace(wxString 
*text
, 
 469                          const wxString
& replacement
, 
 470                          size_t maxMatches
) const 
 472     wxCHECK_MSG( text
, wxNOT_FOUND
, wxT("NULL text in wxRegEx::Replace") ); 
 473     wxCHECK_MSG( IsValid(), wxNOT_FOUND
, wxT("must successfully Compile() first") ); 
 476 #ifndef WXREGEX_CONVERT_TO_MB 
 477     const wxChar 
*textstr 
= text
->c_str(); 
 478     size_t textlen 
= text
->length(); 
 480     const wxWX2MBbuf textstr 
= WXREGEX_CHAR(*text
); 
 483         wxLogError(_("Failed to find match for regular expression: %s"), 
 484                    GetErrorMsg(0, true).c_str()); 
 487     size_t textlen 
= strlen(textstr
); 
 491     // the replacement text 
 494     // the result, allow 25% extra 
 496     result
.reserve(5 * textlen 
/ 4); 
 498     // attempt at optimization: don't iterate over the string if it doesn't 
 499     // contain back references at all 
 500     bool mayHaveBackrefs 
= 
 501         replacement
.find_first_of(wxT("\\&")) != wxString::npos
; 
 503     if ( !mayHaveBackrefs 
) 
 505         textNew 
= replacement
; 
 508     // the position where we start looking for the match 
 509     size_t matchStart 
= 0; 
 511     // number of replacement made: we won't make more than maxMatches of them 
 512     // (unless maxMatches is 0 which doesn't limit the number of replacements) 
 513     size_t countRepl 
= 0; 
 515     // note that "^" shouldn't match after the first call to Matches() so we 
 516     // use wxRE_NOTBOL to prevent it from happening 
 517     while ( (!maxMatches 
|| countRepl 
< maxMatches
) && 
 519 #ifndef WXREGEX_CONVERT_TO_MB 
 520                     textstr 
+ matchStart
, 
 522                     textstr
.data() + matchStart
, 
 524                     countRepl 
? wxRE_NOTBOL 
: 0 
 525                     WXREGEX_IF_NEED_LEN(textlen 
- matchStart
)) ) 
 527         // the string possibly contains back references: we need to calculate 
 528         // the replacement text anew after each match 
 529         if ( mayHaveBackrefs 
) 
 531             mayHaveBackrefs 
= false; 
 533             textNew
.reserve(replacement
.length()); 
 535             for ( const wxChar 
*p 
= replacement
.c_str(); *p
; p
++ ) 
 537                 size_t index 
= (size_t)-1; 
 539                 if ( *p 
== wxT('\\') ) 
 541                     if ( wxIsdigit(*++p
) ) 
 545                         index 
= (size_t)wxStrtoul(p
, &end
, 10); 
 546                         p 
= end 
- 1; // -1 to compensate for p++ in the loop 
 548                     //else: backslash used as escape character 
 550                 else if ( *p 
== wxT('&') ) 
 552                     // treat this as "\0" for compatibility with ed and such 
 556                 // do we have a back reference? 
 557                 if ( index 
!= (size_t)-1 ) 
 561                     if ( !GetMatch(&start
, &len
, index
) ) 
 563                         wxFAIL_MSG( wxT("invalid back reference") ); 
 570 #ifndef WXREGEX_CONVERT_TO_MB 
 575                                 + matchStart 
+ start
, 
 576                                 *wxConvCurrent
, len
); 
 578                         mayHaveBackrefs 
= true; 
 581                 else // ordinary character 
 589         if ( !GetMatch(&start
, &len
) ) 
 591             // we did have match as Matches() returned true above! 
 592             wxFAIL_MSG( wxT("internal logic error in wxRegEx::Replace") ); 
 597         // an insurance against implementations that don't grow exponentially 
 598         // to ensure building the result takes linear time 
 599         if (result
.capacity() < result
.length() + start 
+ textNew
.length()) 
 600             result
.reserve(2 * result
.length()); 
 602 #ifndef WXREGEX_CONVERT_TO_MB 
 603         result
.append(*text
, matchStart
, start
); 
 605         result
.append(wxString(textstr
.data() + matchStart
, *wxConvCurrent
, start
)); 
 608         result
.append(textNew
); 
 615 #ifndef WXREGEX_CONVERT_TO_MB 
 616     result
.append(*text
, matchStart
, wxString::npos
); 
 618     result
.append(wxString(textstr
.data() + matchStart
, *wxConvCurrent
)); 
 625 // ---------------------------------------------------------------------------- 
 626 // wxRegEx: all methods are mostly forwarded to wxRegExImpl 
 627 // ---------------------------------------------------------------------------- 
 639 bool wxRegEx::Compile(const wxString
& expr
, int flags
) 
 643         m_impl 
= new wxRegExImpl
; 
 646     if ( !m_impl
->Compile(expr
, flags
) ) 
 648         // error message already given in wxRegExImpl::Compile 
 658 bool wxRegEx::Matches(const wxString
& str
, int flags
) const 
 660     wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") ); 
 662     return m_impl
->Matches(WXREGEX_CHAR(str
), flags
 
 663                             WXREGEX_IF_NEED_LEN(str
.length())); 
 666 bool wxRegEx::GetMatch(size_t *start
, size_t *len
, size_t index
) const 
 668     wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") ); 
 670     return m_impl
->GetMatch(start
, len
, index
); 
 673 wxString 
wxRegEx::GetMatch(const wxString
& text
, size_t index
) const 
 676     if ( !GetMatch(&start
, &len
, index
) ) 
 677         return wxEmptyString
; 
 679     return text
.Mid(start
, len
); 
 682 size_t wxRegEx::GetMatchCount() const 
 684     wxCHECK_MSG( IsValid(), 0, wxT("must successfully Compile() first") ); 
 686     return m_impl
->GetMatchCount(); 
 689 int wxRegEx::Replace(wxString 
*pattern
, 
 690                      const wxString
& replacement
, 
 691                      size_t maxMatches
) const 
 693     wxCHECK_MSG( IsValid(), wxNOT_FOUND
, wxT("must successfully Compile() first") ); 
 695     return m_impl
->Replace(pattern
, replacement
, maxMatches
); 
 698 #endif // wxUSE_REGEX