contrib/src/stc/scintilla/src/RESearch.cxx

   1 // Scintilla source code edit control
   2 /** @file RESearch.cxx
   3  ** Regular expression search library.
   4  **/
   5
   6 /*
   7  * regex - Regular expression pattern matching  and replacement
   8  *
   9  * By:  Ozan S. Yigit (oz)
  10  *      Dept. of Computer Science
  11  *      York University
  12  *
  13  * Original code available from http://www.cs.yorku.ca/~oz/
  14  * Translation to C++ by Neil Hodgson neilh@scintilla.org
  15  * Removed all use of register.
  16  * Converted to modern function prototypes.
  17  * Put all global/static variables into an object so this code can be
  18  * used from multiple threads etc.
  19  *
  20  * These routines are the PUBLIC DOMAIN equivalents of regex
  21  * routines as found in 4.nBSD UN*X, with minor extensions.
  22  *
  23  * These routines are derived from various implementations found
  24  * in software tools books, and Conroy's grep. They are NOT derived
  25  * from licensed/restricted software.
  26  * For more interesting/academic/complicated implementations,
  27  * see Henry Spencer's regexp routines, or GNU Emacs pattern
  28  * matching module.
  29  *
  30  * Modification history:
  31  *
  32  * $Log$
  33  * Revision 1.1  2001/09/01 03:05:24  RD
  34  * Upgraded to version 1.39 of Scintilla, and upated wxStyledTextCtrl
  35  * accordingly.
  36  *
  37  * Revision 1.6  2001/04/29 13:32:10  nyamatongwe
  38  * Addition of new target methods - versions of ReplaceTarget that take counted
  39  * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
  40  * series of calls rather than a structure.
  41  * Handling of \000 in search and replace.
  42  * Handling of /escapes within character ranges of regular expressions.
  43  * Some handling of bare ^ and $ regular expressions.
  44  *
  45  * Revision 1.5  2001/04/20 07:36:09  nyamatongwe
  46  * Removed DEBUG code that failed to compile on GTK+.
  47  *
  48  * Revision 1.4  2001/04/13 03:52:13  nyamatongwe
  49  * Added URL to find original code to comments.
  50  *
  51  * Revision 1.3  2001/04/06 12:24:21  nyamatongwe
  52  * Made regular expression searching work on a line by line basis, made ^ and
  53  * $ work, made [set] work, and added a case insensitive option.
  54  *
  55  * Revision 1.2  2001/04/05 01:58:04  nyamatongwe
  56  * Replace target functionality to make find and replace operations faster
  57  * by diminishing screen updates and allow for \d patterns in the replacement
  58  * text.
  59  *
  60  * Revision 1.1  2001/04/04 12:52:44  nyamatongwe
  61  * Moved to public domain regular expresion implementation.
  62  *
  63  * Revision 1.4  1991/10/17  03:56:42  oz
  64  * miscellaneous changes, small cleanups etc.
  65  *
  66  * Revision 1.3  1989/04/01  14:18:09  oz
  67  * Change all references to a dfa: this is actually an nfa.
  68  *
  69  * Revision 1.2  88/08/28  15:36:04  oz
  70  * Use a complement bitmap to represent NCL.
  71  * This removes the need to have seperate
  72  * code in the PMatch case block - it is
  73  * just CCL code now.
  74  *
  75  * Use the actual CCL code in the CLO
  76  * section of PMatch. No need for a recursive
  77  * PMatch call.
  78  *
  79  * Use a bitmap table to set char bits in an
  80  * 8-bit chunk.
  81  *
  82  * Interfaces:
  83  *      RESearch::Compile:        compile a regular expression into a NFA.
  84  *
  85  *                      char *RESearch::Compile(s)
  86  *                      char *s;
  87  *
  88  *      RESearch::Execute:        execute the NFA to match a pattern.
  89  *
  90  *                      int RESearch::Execute(s)
  91  *                      char *s;
  92  *
  93  *      RESearch::ModifyWord            change RESearch::Execute's understanding of what a "word"
  94  *                      looks like (for \< and \>) by adding into the
  95  *                      hidden word-syntax table.
  96  *
  97  *                      void RESearch::ModifyWord(s)
  98  *                      char *s;
  99  *
 100  *      RESearch::Substitute:   substitute the matched portions in a new string.
 101  *
 102  *                      int RESearch::Substitute(src, dst)
 103  *                      char *src;
 104  *                      char *dst;
 105  *
 106  *      re_fail:        failure routine for RESearch::Execute.
 107  *
 108  *                      void re_fail(msg, op)
 109  *                      char *msg;
 110  *                      char op;
 111  *
 112  * Regular Expressions:
 113  *
 114  *      [1]     char    matches itself, unless it is a special
 115  *                      character (metachar): . \ [ ] * + ^ $
 116  *
 117  *      [2]     .       matches any character.
 118  *
 119  *      [3]     \       matches the character following it, except
 120  *                      when followed by a left or right round bracket,
 121  *                      a digit 1 to 9 or a left or right angle bracket.
 122  *                      (see [7], [8] and [9])
 123  *                      It is used as an escape character for all
 124  *                      other meta-characters, and itself. When used
 125  *                      in a set ([4]), it is treated as an ordinary
 126  *                      character.
 127  *
 128  *      [4]     [set]   matches one of the characters in the set.
 129  *                      If the first character in the set is "^",
 130  *                      it matches a character NOT in the set, i.e.
 131  *                      complements the set. A shorthand S-E is
 132  *                      used to specify a set of characters S upto
 133  *                      E, inclusive. The special characters "]" and
 134  *                      "-" have no special meaning if they appear
 135  *                      as the first chars in the set.
 136  *                      examples:        match:
 137  *
 138  *                              [a-z]    any lowercase alpha
 139  *
 140  *                              [^]-]    any char except ] and -
 141  *
 142  *                              [^A-Z]   any char except uppercase
 143  *                                       alpha
 144  *
 145  *                              [a-zA-Z] any alpha
 146  *
 147  *      [5]     *       any regular expression form [1] to [4], followed by
 148  *                      closure char (*) matches zero or more matches of
 149  *                      that form.
 150  *
 151  *      [6]     +       same as [5], except it matches one or more.
 152  *
 153  *      [7]             a regular expression in the form [1] to [10], enclosed
 154  *                      as \(form\) matches what form matches. The enclosure
 155  *                      creates a set of tags, used for [8] and for
 156  *                      pattern substution. The tagged forms are numbered
 157  *                      starting from 1.
 158  *
 159  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
 160  *                      previously tagged regular expression ([7]) matched.
 161  *
 162  *      [9]     \<      a regular expression starting with a \< construct
 163  *              \>      and/or ending with a \> construct, restricts the
 164  *                      pattern matching to the beginning of a word, and/or
 165  *                      the end of a word. A word is defined to be a character
 166  *                      string beginning and/or ending with the characters
 167  *                      A-Z a-z 0-9 and _. It must also be preceded and/or
 168  *                      followed by any character outside those mentioned.
 169  *
 170  *      [10]            a composite regular expression xy where x and y
 171  *                      are in the form [1] to [10] matches the longest
 172  *                      match of x followed by a match for y.
 173  *
 174  *      [11]    ^       a regular expression starting with a ^ character
 175  *              $       and/or ending with a $ character, restricts the
 176  *                      pattern matching to the beginning of the line,
 177  *                      or the end of line. [anchors] Elsewhere in the
 178  *                      pattern, ^ and $ are treated as ordinary characters.
 179  *
 180  *
 181  * Acknowledgements:
 182  *
 183  *      HCR's Hugh Redelmeier has been most helpful in various
 184  *      stages of development. He convinced me to include BOW
 185  *      and EOW constructs, originally invented by Rob Pike at
 186  *      the University of Toronto.
 187  *
 188  * References:
 189  *              Software tools                  Kernighan & Plauger
 190  *              Software tools in Pascal        Kernighan & Plauger
 191  *              Grep [rsx-11 C dist]            David Conroy
 192  *              ed - text editor                Un*x Programmer's Manual
 193  *              Advanced editing on Un*x        B. W. Kernighan
 194  *              RegExp routines                 Henry Spencer
 195  *
 196  * Notes:
 197  *
 198  *      This implementation uses a bit-set representation for character
 199  *      classes for speed and compactness. Each character is represented
 200  *      by one bit in a 128-bit block. Thus, CCL always takes a
 201  *      constant 16 bytes in the internal nfa, and RESearch::Execute does a single
 202  *      bit comparison to locate the character in the set.
 203  *
 204  * Examples:
 205  *
 206  *      pattern:        foo*.*
 207  *      compile:        CHR f CHR o CLO CHR o END CLO ANY END END
 208  *      matches:        fo foo fooo foobar fobar foxx ...
 209  *
 210  *      pattern:        fo[ob]a[rz]
 211  *      compile:        CHR f CHR o CCL bitset CHR a CCL bitset END
 212  *      matches:        fobar fooar fobaz fooaz
 213  *
 214  *      pattern:        foo\\+
 215  *      compile:        CHR f CHR o CHR o CHR \ CLO CHR \ END END
 216  *      matches:        foo\ foo\\ foo\\\  ...
 217  *
 218  *      pattern:        \(foo\)[1-3]\1  (same as foo[1-3]foo)
 219  *      compile:        BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
 220  *      matches:        foo1foo foo2foo foo3foo
 221  *
 222  *      pattern:        \(fo.*\)-\1
 223  *      compile:        BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
 224  *      matches:        foo-foo fo-fo fob-fob foobar-foobar ...
 225  */
 226
 227 #include "RESearch.h"
 228
 229 #define OKP     1
 230 #define NOP     0
 231
 232 #define CHR     1
 233 #define ANY     2
 234 #define CCL     3
 235 #define BOL     4
 236 #define EOL     5
 237 #define BOT     6
 238 #define EOT     7
 239 #define BOW     8
 240 #define EOW     9
 241 #define REF     10
 242 #define CLO     11
 243
 244 #define END     0
 245
 246 /*
 247  * The following defines are not meant to be changeable.
 248  * They are for readability only.
 249  */
 250 #define BLKIND  0170
 251 #define BITIND  07
 252
 253 #define ASCIIB  0177
 254
 255 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
 256
 257 #define badpat(x)       (*nfa = END, x)
 258
 259 RESearch::RESearch() {
 260         Init();
 261 }
 262
 263 RESearch::~RESearch() {
 264         Clear();
 265 }
 266
 267 void RESearch::Init() {
 268         sta = NOP;                      /* status of lastpat */
 269         bol = 0;
 270         for (int i=0; i<MAXTAG; i++)
 271                 pat[i] = 0;
 272         for (int j=0; j<BITBLK; j++)
 273                 bittab[j] = 0;
 274 }
 275
 276 void RESearch::Clear() {
 277         for (int i=0; i<MAXTAG; i++) {
 278                 delete []pat[i];
 279                 pat[i] = 0;
 280                 bopat[i] = NOTFOUND;
 281                 eopat[i] = NOTFOUND;
 282         }
 283 }
 284
 285 bool RESearch::GrabMatches(CharacterIndexer &ci) {
 286         bool success = true;
 287         for (unsigned int i=0; i<MAXTAG; i++) {
 288                 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
 289                         unsigned int len = eopat[i] - bopat[i];
 290                         pat[i] = new char[len + 1];
 291                         if (pat[i]) {
 292                                 for (unsigned int j=0; j<len; j++)
 293                                         pat[i][j] = ci.CharAt(bopat[i] + j);
 294                                 pat[i][len] = '\0';
 295                         } else {
 296                                 success = false;
 297                         }
 298                 }
 299         }
 300         return success;
 301 }
 302
 303 void RESearch::ChSet(char c) {
 304         bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
 305 }
 306
 307 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
 308         if (caseSensitive) {
 309                 ChSet(c);
 310         } else {
 311                 if ((c >= 'a') && (c <= 'z')) {
 312                         ChSet(c);
 313                         ChSet(static_cast<char>(c - 'a' + 'A'));
 314                 } else if ((c >= 'A') && (c <= 'Z')) {
 315                         ChSet(c);
 316                         ChSet(static_cast<char>(c - 'A' + 'a'));
 317                 } else {
 318                         ChSet(c);
 319                 }
 320         }
 321 }
 322
 323 const char escapeValue(char ch) {
 324         switch (ch) {
 325         case 'a':       return '\a';
 326         case 'b':       return '\b';
 327         case 'f':       return '\f';
 328         case 'n':       return '\n';
 329         case 'r':       return '\r';
 330         case 't':       return '\t';
 331         case 'v':       return '\v';
 332         }
 333         return 0;
 334 }
 335
 336 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive) {
 337         char *mp=nfa;          /* nfa pointer       */
 338         char *lp;              /* saved pointer..   */
 339         char *sp=nfa;          /* another one..     */
 340
 341         int tagi = 0;          /* tag stack index   */
 342         int tagc = 1;          /* actual tag count  */
 343
 344         int n;
 345         char mask;              /* xor mask -CCL/NCL */
 346         int c1, c2;
 347
 348         if (!pat || !length)
 349                 if (sta)
 350                         return 0;
 351                 else
 352                         return badpat("No previous regular expression");
 353         sta = NOP;
 354
 355         const char *p=pat;               /* pattern pointer   */
 356         for (int i=0; i<length; i++, p++) {
 357                 lp = mp;
 358                 switch(*p) {
 359
 360                 case '.':               /* match any char..  */
 361                         *mp++ = ANY;
 362                         break;
 363
 364                 case '^':               /* match beginning.. */
 365                         if (p == pat)
 366                                 *mp++ = BOL;
 367                         else {
 368                                 *mp++ = CHR;
 369                                 *mp++ = *p;
 370                         }
 371                         break;
 372
 373                 case '$':               /* match endofline.. */
 374                         if (!*(p+1))
 375                                 *mp++ = EOL;
 376                         else {
 377                                 *mp++ = CHR;
 378                                 *mp++ = *p;
 379                         }
 380                         break;
 381
 382                 case '[':               /* match char class..*/
 383                         *mp++ = CCL;
 384
 385                         i++;
 386                         if (*++p == '^') {
 387                                 mask = '\377';
 388                                 i++;
 389                                 p++;
 390                         } else
 391                                 mask = 0;
 392
 393                         if (*p == '-') {                /* real dash */
 394                                 i++;
 395                                 ChSet(*p++);
 396                         }
 397                         if (*p == ']') {        /* real brace */
 398                                 i++;
 399                                 ChSet(*p++);
 400                         }
 401                         while (*p && *p != ']') {
 402                                 if (*p == '-' && *(p+1) && *(p+1) != ']') {
 403                                         i++;
 404                                         p++;
 405                                         c1 = *(p-2) + 1;
 406                                         i++;
 407                                         c2 = *p++;
 408                                         while (c1 <= c2) {
 409                                                 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
 410                                         }
 411                                 } else if (*p == '\\' && *(p+1)) {
 412                                         i++;
 413                                         p++;
 414                                         char escape = escapeValue(*p);
 415                                         if (escape)
 416                                                 ChSetWithCase(escape, caseSensitive);
 417                                         else
 418                                                 ChSetWithCase(*p, caseSensitive);
 419                                         i++;
 420                                         p++;
 421                                 } else {
 422                                         i++;
 423                                         ChSetWithCase(*p++, caseSensitive);
 424                                 }
 425                         }
 426                         if (!*p)
 427                                 return badpat("Missing ]");
 428
 429                         for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 430                                 *mp++ = static_cast<char>(mask ^ bittab[n]);
 431
 432                         break;
 433
 434                 case '*':               /* match 0 or more.. */
 435                 case '+':               /* match 1 or more.. */
 436                         if (p == pat)
 437                                 return badpat("Empty closure");
 438                         lp = sp;                /* previous opcode */
 439                         if (*lp == CLO)         /* equivalence..   */
 440                                 break;
 441                         switch(*lp) {
 442
 443                         case BOL:
 444                         case BOT:
 445                         case EOT:
 446                         case BOW:
 447                         case EOW:
 448                         case REF:
 449                                 return badpat("Illegal closure");
 450                         default:
 451                                 break;
 452                         }
 453
 454                         if (*p == '+')
 455                                 for (sp = mp; lp < sp; lp++)
 456                                         *mp++ = *lp;
 457
 458                         *mp++ = END;
 459                         *mp++ = END;
 460                         sp = mp;
 461                         while (--mp > lp)
 462                                 *mp = mp[-1];
 463                         *mp = CLO;
 464                         mp = sp;
 465                         break;
 466
 467                 case '\\':              /* tags, backrefs .. */
 468                         i++;
 469                         switch(*++p) {
 470
 471                         case '(':
 472                                 if (tagc < MAXTAG) {
 473                                         tagstk[++tagi] = tagc;
 474                                         *mp++ = BOT;
 475                                         *mp++ = static_cast<char>(tagc++);
 476                                 }
 477                                 else
 478                                         return badpat("Too many \\(\\) pairs");
 479                                 break;
 480                         case ')':
 481                                 if (*sp == BOT)
 482                                         return badpat("Null pattern inside \\(\\)");
 483                                 if (tagi > 0) {
 484                                         *mp++ = static_cast<char>(EOT);
 485                                         *mp++ = static_cast<char>(tagstk[tagi--]);
 486                                 }
 487                                 else
 488                                         return badpat("Unmatched \\)");
 489                                 break;
 490                         case '<':
 491                                 *mp++ = BOW;
 492                                 break;
 493                         case '>':
 494                                 if (*sp == BOW)
 495                                         return badpat("Null pattern inside \\<\\>");
 496                                 *mp++ = EOW;
 497                                 break;
 498                         case '1':
 499                         case '2':
 500                         case '3':
 501                         case '4':
 502                         case '5':
 503                         case '6':
 504                         case '7':
 505                         case '8':
 506                         case '9':
 507                                 n = *p-'0';
 508                                 if (tagi > 0 && tagstk[tagi] == n)
 509                                         return badpat("Cyclical reference");
 510                                 if (tagc > n) {
 511                                         *mp++ = static_cast<char>(REF);
 512                                         *mp++ = static_cast<char>(n);
 513                                 }
 514                                 else
 515                                         return badpat("Undetermined reference");
 516                                 break;
 517                         case 'a':
 518                         case 'b':
 519                         case 'n':
 520                         case 'f':
 521                         case 'r':
 522                         case 't':
 523                         case 'v':
 524                                 *mp++ = CHR;
 525                                 *mp++ = escapeValue(*p);
 526                                 break;
 527                         default:
 528                                 *mp++ = CHR;
 529                                 *mp++ = *p;
 530                         }
 531                         break;
 532
 533                 default :               /* an ordinary char  */
 534                         if (caseSensitive) {
 535                                 *mp++ = CHR;
 536                                 *mp++ = *p;
 537                         } else {
 538                                 *mp++ = CCL;
 539                                 mask = 0;
 540                                 ChSetWithCase(*p, false);
 541                                 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 542                                         *mp++ = static_cast<char>(mask ^ bittab[n]);
 543                         }
 544                         break;
 545                 }
 546                 sp = lp;
 547         }
 548         if (tagi > 0)
 549                 return badpat("Unmatched \\(");
 550         *mp = END;
 551         sta = OKP;
 552         return 0;
 553 }
 554
 555 /*
 556  * RESearch::Execute:
 557  *      execute nfa to find a match.
 558  *
 559  *      special cases: (nfa[0])
 560  *              BOL
 561  *                      Match only once, starting from the
 562  *                      beginning.
 563  *              CHR
 564  *                      First locate the character without
 565  *                      calling PMatch, and if found, call
 566  *                      PMatch for the remaining string.
 567  *              END
 568  *                      RESearch::Compile failed, poor luser did not
 569  *                      check for it. Fail fast.
 570  *
 571  *      If a match is found, bopat[0] and eopat[0] are set
 572  *      to the beginning and the end of the matched fragment,
 573  *      respectively.
 574  *
 575  */
 576
 577 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
 578         char c;
 579         int ep = NOTFOUND;
 580         char *ap = nfa;
 581
 582         bol = lp;
 583         failure = 0;
 584
 585         Clear();
 586
 587         switch(*ap) {
 588
 589         case BOL:                       /* anchored: match from BOL only */
 590                 ep = PMatch(ci, lp, endp, ap);
 591                 break;
 592         case EOL:                       /* just searching for end of line normal path doesn't work */
 593                 if (*(ap+1) == END) {
 594                         lp = endp;
 595                         ep = lp;
 596                         break;
 597                 } else {
 598                         return 0;
 599                 }
 600         case CHR:                       /* ordinary char: locate it fast */
 601                 c = *(ap+1);
 602                 while ((lp < endp) && (ci.CharAt(lp) != c))
 603                         lp++;
 604                 if (lp >= endp)         /* if EOS, fail, else fall thru. */
 605                         return 0;
 606         default:                        /* regular matching all the way. */
 607                 while (lp < endp) {
 608                         ep = PMatch(ci, lp, endp, ap);
 609                         if (ep != NOTFOUND)
 610                                 break;
 611                         lp++;
 612                 }
 613                 break;
 614         case END:                       /* munged automaton. fail always */
 615                 return 0;
 616         }
 617         if (ep == NOTFOUND)
 618                 return 0;
 619
 620         bopat[0] = lp;
 621         eopat[0] = ep;
 622         return 1;
 623 }
 624
 625 /*
 626  * PMatch: internal routine for the hard part
 627  *
 628  *      This code is partly snarfed from an early grep written by
 629  *      David Conroy. The backref and tag stuff, and various other
 630  *      innovations are by oz.
 631  *
 632  *      special case optimizations: (nfa[n], nfa[n+1])
 633  *              CLO ANY
 634  *                      We KNOW .* will match everything upto the
 635  *                      end of line. Thus, directly go to the end of
 636  *                      line, without recursive PMatch calls. As in
 637  *                      the other closure cases, the remaining pattern
 638  *                      must be matched by moving backwards on the
 639  *                      string recursively, to find a match for xy
 640  *                      (x is ".*" and y is the remaining pattern)
 641  *                      where the match satisfies the LONGEST match for
 642  *                      x followed by a match for y.
 643  *              CLO CHR
 644  *                      We can again scan the string forward for the
 645  *                      single char and at the point of failure, we
 646  *                      execute the remaining nfa recursively, same as
 647  *                      above.
 648  *
 649  *      At the end of a successful match, bopat[n] and eopat[n]
 650  *      are set to the beginning and end of subpatterns matched
 651  *      by tagged expressions (n = 1 to 9).
 652  *
 653  */
 654
 655 extern void re_fail(char *,char);
 656
 657 /*
 658  * character classification table for word boundary operators BOW
 659  * and EOW. the reason for not using ctype macros is that we can
 660  * let the user add into our own table. see RESearch::ModifyWord. This table
 661  * is not in the bitset form, since we may wish to extend it in the
 662  * future for other character classifications.
 663  *
 664  *      TRUE for 0-9 A-Z a-z _
 665  */
 666 static char chrtyp[MAXCHR] = {
 667         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 668         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 669         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 670         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 671         0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
 672         1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
 673         0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
 674         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 675         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 676         1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
 677         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 678         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 679         1, 1, 1, 0, 0, 0, 0, 0
 680         };
 681
 682 #define inascii(x)      (0177&(x))
 683 #define iswordc(x)      chrtyp[inascii(x)]
 684 #define isinset(x,y)    ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
 685
 686 /*
 687  * skip values for CLO XXX to skip past the closure
 688  */
 689
 690 #define ANYSKIP 2       /* [CLO] ANY END ...         */
 691 #define CHRSKIP 3       /* [CLO] CHR chr END ...     */
 692 #define CCLSKIP 18      /* [CLO] CCL 16bytes END ... */
 693
 694 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
 695         int op, c, n;
 696         int e;          /* extra pointer for CLO */
 697         int bp;         /* beginning of subpat.. */
 698         int ep;         /* ending of subpat..    */
 699         int are;                        /* to save the line ptr. */
 700
 701         while ((op = *ap++) != END)
 702                 switch(op) {
 703
 704                 case CHR:
 705                         if (ci.CharAt(lp++) != *ap++)
 706                                 return NOTFOUND;
 707                         break;
 708                 case ANY:
 709                         if (lp++ >= endp)
 710                                 return NOTFOUND;
 711                         break;
 712                 case CCL:
 713                         c = ci.CharAt(lp++);
 714                         if (!isinset(ap,c))
 715                                 return NOTFOUND;
 716                         ap += BITBLK;
 717                         break;
 718                 case BOL:
 719                         if (lp != bol)
 720                                 return NOTFOUND;
 721                         break;
 722                 case EOL:
 723                         if (lp < endp)
 724                                 return NOTFOUND;
 725                         break;
 726                 case BOT:
 727                         bopat[*ap++] = lp;
 728                         break;
 729                 case EOT:
 730                         eopat[*ap++] = lp;
 731                         break;
 732                 case BOW:
 733                         if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
 734                                 return NOTFOUND;
 735                         break;
 736                 case EOW:
 737                         if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
 738                                 return NOTFOUND;
 739                         break;
 740                 case REF:
 741                         n = *ap++;
 742                         bp = bopat[n];
 743                         ep = eopat[n];
 744                         while (bp < ep)
 745                                 if (ci.CharAt(bp++) != ci.CharAt(lp++))
 746                                         return NOTFOUND;
 747                         break;
 748                 case CLO:
 749                         are = lp;
 750                         switch(*ap) {
 751
 752                         case ANY:
 753                                 while (lp < endp)
 754                                         lp++;
 755                                 n = ANYSKIP;
 756                                 break;
 757                         case CHR:
 758                                 c = *(ap+1);
 759                                 while ((lp < endp) && (c == ci.CharAt(lp)))
 760                                         lp++;
 761                                 n = CHRSKIP;
 762                                 break;
 763                         case CCL:
 764                                 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
 765                                         lp++;
 766                                 n = CCLSKIP;
 767                                 break;
 768                         default:
 769                                 failure = true;
 770                                 //re_fail("closure: bad nfa.", *ap);
 771                                 return NOTFOUND;
 772                         }
 773
 774                         ap += n;
 775
 776                         while (lp >= are) {
 777                                 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
 778                                         return e;
 779                                 --lp;
 780                         }
 781                         return NOTFOUND;
 782                 default:
 783                         //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
 784                         return NOTFOUND;
 785                 }
 786         return lp;
 787 }
 788
 789 /*
 790  * RESearch::ModifyWord:
 791  *      add new characters into the word table to change RESearch::Execute's
 792  *      understanding of what a word should look like. Note that we
 793  *      only accept additions into the word definition.
 794  *
 795  *      If the string parameter is 0 or null string, the table is
 796  *      reset back to the default containing A-Z a-z 0-9 _. [We use
 797  *      the compact bitset representation for the default table]
 798  */
 799
 800 static char deftab[16] = {
 801         0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
 802         '\376', '\377', '\377', 007
 803 };
 804
 805 void RESearch::ModifyWord(char *s) {
 806         int i;
 807
 808         if (!s || !*s) {
 809                 for (i = 0; i < MAXCHR; i++)
 810                         if (!isinset(deftab,i))
 811                                 iswordc(i) = 0;
 812         }
 813         else
 814                 while(*s)
 815                         iswordc(*s++) = 1;
 816 }
 817
 818 /*
 819  * RESearch::Substitute:
 820  *      substitute the matched portions of the src in dst.
 821  *
 822  *      &       substitute the entire matched pattern.
 823  *
 824  *      \digit  substitute a subpattern, with the given tag number.
 825  *              Tags are numbered from 1 to 9. If the particular
 826  *              tagged subpattern does not exist, null is substituted.
 827  */
 828 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
 829         char c;
 830         int  pin;
 831         int bp;
 832         int ep;
 833
 834         if (!*src || !bopat[0])
 835                 return 0;
 836
 837         while ((c = *src++) != 0) {
 838                 switch(c) {
 839
 840                 case '&':
 841                         pin = 0;
 842                         break;
 843
 844                 case '\\':
 845                         c = *src++;
 846                         if (c >= '0' && c <= '9') {
 847                                 pin = c - '0';
 848                                 break;
 849                         }
 850
 851                 default:
 852                         *dst++ = c;
 853                         continue;
 854                 }
 855
 856                 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
 857                         while (ci.CharAt(bp) && bp < ep)
 858                                 *dst++ = ci.CharAt(bp++);
 859                         if (bp < ep)
 860                                 return 0;
 861                 }
 862         }
 863         *dst = (char) 0;
 864         return 1;
 865 }