src/stc/scintilla/src/RESearch.cxx

   1 // Scintilla source code edit control
   2 /** @file RESearch.cxx
   3  ** Regular expression search library.
   4  **/
   5
   6 /*
   7  * regex - Regular expression pattern matching  and replacement
   8  *
   9  * By:  Ozan S. Yigit (oz)
  10  *      Dept. of Computer Science
  11  *      York University
  12  *
  13  * Original code available from http://www.cs.yorku.ca/~oz/
  14  * Translation to C++ by Neil Hodgson neilh@scintilla.org
  15  * Removed all use of register.
  16  * Converted to modern function prototypes.
  17  * Put all global/static variables into an object so this code can be
  18  * used from multiple threads etc.
  19  *
  20  * These routines are the PUBLIC DOMAIN equivalents of regex
  21  * routines as found in 4.nBSD UN*X, with minor extensions.
  22  *
  23  * These routines are derived from various implementations found
  24  * in software tools books, and Conroy's grep. They are NOT derived
  25  * from licensed/restricted software.
  26  * For more interesting/academic/complicated implementations,
  27  * see Henry Spencer's regexp routines, or GNU Emacs pattern
  28  * matching module.
  29  *
  30  * Modification history:
  31  *
  32  * $Log$
  33  * Revision 1.5  2002/09/11 00:55:27  RD
  34  * Update to Scintilla 1.48
  35  *
  36  * Revision 1.6  2001/04/29 13:32:10  nyamatongwe
  37  * Addition of new target methods - versions of ReplaceTarget that take counted
  38  * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
  39  * series of calls rather than a structure.
  40  * Handling of \000 in search and replace.
  41  * Handling of /escapes within character ranges of regular expressions.
  42  * Some handling of bare ^ and $ regular expressions.
  43  *
  44  * Revision 1.5  2001/04/20 07:36:09  nyamatongwe
  45  * Removed DEBUG code that failed to compile on GTK+.
  46  *
  47  * Revision 1.4  2001/04/13 03:52:13  nyamatongwe
  48  * Added URL to find original code to comments.
  49  *
  50  * Revision 1.3  2001/04/06 12:24:21  nyamatongwe
  51  * Made regular expression searching work on a line by line basis, made ^ and
  52  * $ work, made [set] work, and added a case insensitive option.
  53  *
  54  * Revision 1.2  2001/04/05 01:58:04  nyamatongwe
  55  * Replace target functionality to make find and replace operations faster
  56  * by diminishing screen updates and allow for \d patterns in the replacement
  57  * text.
  58  *
  59  * Revision 1.1  2001/04/04 12:52:44  nyamatongwe
  60  * Moved to public domain regular expresion implementation.
  61  *
  62  * Revision 1.4  1991/10/17  03:56:42  oz
  63  * miscellaneous changes, small cleanups etc.
  64  *
  65  * Revision 1.3  1989/04/01  14:18:09  oz
  66  * Change all references to a dfa: this is actually an nfa.
  67  *
  68  * Revision 1.2  88/08/28  15:36:04  oz
  69  * Use a complement bitmap to represent NCL.
  70  * This removes the need to have seperate
  71  * code in the PMatch case block - it is
  72  * just CCL code now.
  73  *
  74  * Use the actual CCL code in the CLO
  75  * section of PMatch. No need for a recursive
  76  * PMatch call.
  77  *
  78  * Use a bitmap table to set char bits in an
  79  * 8-bit chunk.
  80  *
  81  * Interfaces:
  82  *      RESearch::Compile:        compile a regular expression into a NFA.
  83  *
  84  *                      char *RESearch::Compile(s)
  85  *                      char *s;
  86  *
  87  *      RESearch::Execute:        execute the NFA to match a pattern.
  88  *
  89  *                      int RESearch::Execute(s)
  90  *                      char *s;
  91  *
  92  *      RESearch::ModifyWord            change RESearch::Execute's understanding of what a "word"
  93  *                      looks like (for \< and \>) by adding into the
  94  *                      hidden word-syntax table.
  95  *
  96  *                      void RESearch::ModifyWord(s)
  97  *                      char *s;
  98  *
  99  *      RESearch::Substitute:   substitute the matched portions in a new string.
 100  *
 101  *                      int RESearch::Substitute(src, dst)
 102  *                      char *src;
 103  *                      char *dst;
 104  *
 105  *      re_fail:        failure routine for RESearch::Execute.
 106  *
 107  *                      void re_fail(msg, op)
 108  *                      char *msg;
 109  *                      char op;
 110  *
 111  * Regular Expressions:
 112  *
 113  *      [1]     char    matches itself, unless it is a special
 114  *                      character (metachar): . \ [ ] * + ^ $
 115  *
 116  *      [2]     .       matches any character.
 117  *
 118  *      [3]     \       matches the character following it, except
 119  *                      when followed by a left or right round bracket,
 120  *                      a digit 1 to 9 or a left or right angle bracket.
 121  *                      (see [7], [8] and [9])
 122  *                      It is used as an escape character for all
 123  *                      other meta-characters, and itself. When used
 124  *                      in a set ([4]), it is treated as an ordinary
 125  *                      character.
 126  *
 127  *      [4]     [set]   matches one of the characters in the set.
 128  *                      If the first character in the set is "^",
 129  *                      it matches a character NOT in the set, i.e.
 130  *                      complements the set. A shorthand S-E is
 131  *                      used to specify a set of characters S upto
 132  *                      E, inclusive. The special characters "]" and
 133  *                      "-" have no special meaning if they appear
 134  *                      as the first chars in the set.
 135  *                      examples:        match:
 136  *
 137  *                              [a-z]    any lowercase alpha
 138  *
 139  *                              [^]-]    any char except ] and -
 140  *
 141  *                              [^A-Z]   any char except uppercase
 142  *                                       alpha
 143  *
 144  *                              [a-zA-Z] any alpha
 145  *
 146  *      [5]     *       any regular expression form [1] to [4], followed by
 147  *                      closure char (*) matches zero or more matches of
 148  *                      that form.
 149  *
 150  *      [6]     +       same as [5], except it matches one or more.
 151  *
 152  *      [7]             a regular expression in the form [1] to [10], enclosed
 153  *                      as \(form\) matches what form matches. The enclosure
 154  *                      creates a set of tags, used for [8] and for
 155  *                      pattern substution. The tagged forms are numbered
 156  *                      starting from 1.
 157  *
 158  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
 159  *                      previously tagged regular expression ([7]) matched.
 160  *
 161  *      [9]     \<      a regular expression starting with a \< construct
 162  *              \>      and/or ending with a \> construct, restricts the
 163  *                      pattern matching to the beginning of a word, and/or
 164  *                      the end of a word. A word is defined to be a character
 165  *                      string beginning and/or ending with the characters
 166  *                      A-Z a-z 0-9 and _. It must also be preceded and/or
 167  *                      followed by any character outside those mentioned.
 168  *
 169  *      [10]            a composite regular expression xy where x and y
 170  *                      are in the form [1] to [10] matches the longest
 171  *                      match of x followed by a match for y.
 172  *
 173  *      [11]    ^       a regular expression starting with a ^ character
 174  *              $       and/or ending with a $ character, restricts the
 175  *                      pattern matching to the beginning of the line,
 176  *                      or the end of line. [anchors] Elsewhere in the
 177  *                      pattern, ^ and $ are treated as ordinary characters.
 178  *
 179  *
 180  * Acknowledgements:
 181  *
 182  *      HCR's Hugh Redelmeier has been most helpful in various
 183  *      stages of development. He convinced me to include BOW
 184  *      and EOW constructs, originally invented by Rob Pike at
 185  *      the University of Toronto.
 186  *
 187  * References:
 188  *              Software tools                  Kernighan & Plauger
 189  *              Software tools in Pascal        Kernighan & Plauger
 190  *              Grep [rsx-11 C dist]            David Conroy
 191  *              ed - text editor                Un*x Programmer's Manual
 192  *              Advanced editing on Un*x        B. W. Kernighan
 193  *              RegExp routines                 Henry Spencer
 194  *
 195  * Notes:
 196  *
 197  *      This implementation uses a bit-set representation for character
 198  *      classes for speed and compactness. Each character is represented
 199  *      by one bit in a 128-bit block. Thus, CCL always takes a
 200  *      constant 16 bytes in the internal nfa, and RESearch::Execute does a single
 201  *      bit comparison to locate the character in the set.
 202  *
 203  * Examples:
 204  *
 205  *      pattern:        foo*.*
 206  *      compile:        CHR f CHR o CLO CHR o END CLO ANY END END
 207  *      matches:        fo foo fooo foobar fobar foxx ...
 208  *
 209  *      pattern:        fo[ob]a[rz]
 210  *      compile:        CHR f CHR o CCL bitset CHR a CCL bitset END
 211  *      matches:        fobar fooar fobaz fooaz
 212  *
 213  *      pattern:        foo\\+
 214  *      compile:        CHR f CHR o CHR o CHR \ CLO CHR \ END END
 215  *      matches:        foo\ foo\\ foo\\\  ...
 216  *
 217  *      pattern:        \(foo\)[1-3]\1  (same as foo[1-3]foo)
 218  *      compile:        BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
 219  *      matches:        foo1foo foo2foo foo3foo
 220  *
 221  *      pattern:        \(fo.*\)-\1
 222  *      compile:        BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
 223  *      matches:        foo-foo fo-fo fob-fob foobar-foobar ...
 224  */
 225
 226 #include "RESearch.h"
 227
 228 #define OKP     1
 229 #define NOP     0
 230
 231 #define CHR     1
 232 #define ANY     2
 233 #define CCL     3
 234 #define BOL     4
 235 #define EOL     5
 236 #define BOT     6
 237 #define EOT     7
 238 #define BOW     8
 239 #define EOW     9
 240 #define REF     10
 241 #define CLO     11
 242
 243 #define END     0
 244
 245 /*
 246  * The following defines are not meant to be changeable.
 247  * They are for readability only.
 248  */
 249 #define BLKIND  0170
 250 #define BITIND  07
 251
 252 #define ASCIIB  0177
 253
 254 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
 255
 256 #define badpat(x)       (*nfa = END, x)
 257
 258 RESearch::RESearch() {
 259         Init();
 260 }
 261
 262 RESearch::~RESearch() {
 263         Clear();
 264 }
 265
 266 void RESearch::Init() {
 267         sta = NOP;                      /* status of lastpat */
 268         bol = 0;
 269         for (int i=0; i<MAXTAG; i++)
 270                 pat[i] = 0;
 271         for (int j=0; j<BITBLK; j++)
 272                 bittab[j] = 0;
 273 }
 274
 275 void RESearch::Clear() {
 276         for (int i=0; i<MAXTAG; i++) {
 277                 delete []pat[i];
 278                 pat[i] = 0;
 279                 bopat[i] = NOTFOUND;
 280                 eopat[i] = NOTFOUND;
 281         }
 282 }
 283
 284 bool RESearch::GrabMatches(CharacterIndexer &ci) {
 285         bool success = true;
 286         for (unsigned int i=0; i<MAXTAG; i++) {
 287                 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
 288                         unsigned int len = eopat[i] - bopat[i];
 289                         pat[i] = new char[len + 1];
 290                         if (pat[i]) {
 291                                 for (unsigned int j=0; j<len; j++)
 292                                         pat[i][j] = ci.CharAt(bopat[i] + j);
 293                                 pat[i][len] = '\0';
 294                         } else {
 295                                 success = false;
 296                         }
 297                 }
 298         }
 299         return success;
 300 }
 301
 302 void RESearch::ChSet(char c) {
 303         bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
 304 }
 305
 306 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
 307         if (caseSensitive) {
 308                 ChSet(c);
 309         } else {
 310                 if ((c >= 'a') && (c <= 'z')) {
 311                         ChSet(c);
 312                         ChSet(static_cast<char>(c - 'a' + 'A'));
 313                 } else if ((c >= 'A') && (c <= 'Z')) {
 314                         ChSet(c);
 315                         ChSet(static_cast<char>(c - 'A' + 'a'));
 316                 } else {
 317                         ChSet(c);
 318                 }
 319         }
 320 }
 321
 322 const char escapeValue(char ch) {
 323         switch (ch) {
 324         case 'a':       return '\a';
 325         case 'b':       return '\b';
 326         case 'f':       return '\f';
 327         case 'n':       return '\n';
 328         case 'r':       return '\r';
 329         case 't':       return '\t';
 330         case 'v':       return '\v';
 331         }
 332         return 0;
 333 }
 334
 335 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive) {
 336         char *mp=nfa;          /* nfa pointer       */
 337         char *lp;              /* saved pointer..   */
 338         char *sp=nfa;          /* another one..     */
 339
 340         int tagi = 0;          /* tag stack index   */
 341         int tagc = 1;          /* actual tag count  */
 342
 343         int n;
 344         char mask;              /* xor mask -CCL/NCL */
 345         int c1, c2;
 346
 347         if (!pat || !length)
 348                 if (sta)
 349                         return 0;
 350                 else
 351                         return badpat("No previous regular expression");
 352         sta = NOP;
 353
 354         const char *p=pat;               /* pattern pointer   */
 355         for (int i=0; i<length; i++, p++) {
 356                 lp = mp;
 357                 switch(*p) {
 358
 359                 case '.':               /* match any char..  */
 360                         *mp++ = ANY;
 361                         break;
 362
 363                 case '^':               /* match beginning.. */
 364                         if (p == pat)
 365                                 *mp++ = BOL;
 366                         else {
 367                                 *mp++ = CHR;
 368                                 *mp++ = *p;
 369                         }
 370                         break;
 371
 372                 case '$':               /* match endofline.. */
 373                         if (!*(p+1))
 374                                 *mp++ = EOL;
 375                         else {
 376                                 *mp++ = CHR;
 377                                 *mp++ = *p;
 378                         }
 379                         break;
 380
 381                 case '[':               /* match char class..*/
 382                         *mp++ = CCL;
 383
 384                         i++;
 385                         if (*++p == '^') {
 386                                 mask = '\377';
 387                                 i++;
 388                                 p++;
 389                         } else
 390                                 mask = 0;
 391
 392                         if (*p == '-') {                /* real dash */
 393                                 i++;
 394                                 ChSet(*p++);
 395                         }
 396                         if (*p == ']') {        /* real brace */
 397                                 i++;
 398                                 ChSet(*p++);
 399                         }
 400                         while (*p && *p != ']') {
 401                                 if (*p == '-' && *(p+1) && *(p+1) != ']') {
 402                                         i++;
 403                                         p++;
 404                                         c1 = *(p-2) + 1;
 405                                         i++;
 406                                         c2 = *p++;
 407                                         while (c1 <= c2) {
 408                                                 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
 409                                         }
 410                                 } else if (*p == '\\' && *(p+1)) {
 411                                         i++;
 412                                         p++;
 413                                         char escape = escapeValue(*p);
 414                                         if (escape)
 415                                                 ChSetWithCase(escape, caseSensitive);
 416                                         else
 417                                                 ChSetWithCase(*p, caseSensitive);
 418                                         i++;
 419                                         p++;
 420                                 } else {
 421                                         i++;
 422                                         ChSetWithCase(*p++, caseSensitive);
 423                                 }
 424                         }
 425                         if (!*p)
 426                                 return badpat("Missing ]");
 427
 428                         for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 429                                 *mp++ = static_cast<char>(mask ^ bittab[n]);
 430
 431                         break;
 432
 433                 case '*':               /* match 0 or more.. */
 434                 case '+':               /* match 1 or more.. */
 435                         if (p == pat)
 436                                 return badpat("Empty closure");
 437                         lp = sp;                /* previous opcode */
 438                         if (*lp == CLO)         /* equivalence..   */
 439                                 break;
 440                         switch(*lp) {
 441
 442                         case BOL:
 443                         case BOT:
 444                         case EOT:
 445                         case BOW:
 446                         case EOW:
 447                         case REF:
 448                                 return badpat("Illegal closure");
 449                         default:
 450                                 break;
 451                         }
 452
 453                         if (*p == '+')
 454                                 for (sp = mp; lp < sp; lp++)
 455                                         *mp++ = *lp;
 456
 457                         *mp++ = END;
 458                         *mp++ = END;
 459                         sp = mp;
 460                         while (--mp > lp)
 461                                 *mp = mp[-1];
 462                         *mp = CLO;
 463                         mp = sp;
 464                         break;
 465
 466                 case '\\':              /* tags, backrefs .. */
 467                         i++;
 468                         switch(*++p) {
 469
 470                         case '(':
 471                                 if (tagc < MAXTAG) {
 472                                         tagstk[++tagi] = tagc;
 473                                         *mp++ = BOT;
 474                                         *mp++ = static_cast<char>(tagc++);
 475                                 }
 476                                 else
 477                                         return badpat("Too many \\(\\) pairs");
 478                                 break;
 479                         case ')':
 480                                 if (*sp == BOT)
 481                                         return badpat("Null pattern inside \\(\\)");
 482                                 if (tagi > 0) {
 483                                         *mp++ = static_cast<char>(EOT);
 484                                         *mp++ = static_cast<char>(tagstk[tagi--]);
 485                                 }
 486                                 else
 487                                         return badpat("Unmatched \\)");
 488                                 break;
 489                         case '<':
 490                                 *mp++ = BOW;
 491                                 break;
 492                         case '>':
 493                                 if (*sp == BOW)
 494                                         return badpat("Null pattern inside \\<\\>");
 495                                 *mp++ = EOW;
 496                                 break;
 497                         case '1':
 498                         case '2':
 499                         case '3':
 500                         case '4':
 501                         case '5':
 502                         case '6':
 503                         case '7':
 504                         case '8':
 505                         case '9':
 506                                 n = *p-'0';
 507                                 if (tagi > 0 && tagstk[tagi] == n)
 508                                         return badpat("Cyclical reference");
 509                                 if (tagc > n) {
 510                                         *mp++ = static_cast<char>(REF);
 511                                         *mp++ = static_cast<char>(n);
 512                                 }
 513                                 else
 514                                         return badpat("Undetermined reference");
 515                                 break;
 516                         case 'a':
 517                         case 'b':
 518                         case 'n':
 519                         case 'f':
 520                         case 'r':
 521                         case 't':
 522                         case 'v':
 523                                 *mp++ = CHR;
 524                                 *mp++ = escapeValue(*p);
 525                                 break;
 526                         default:
 527                                 *mp++ = CHR;
 528                                 *mp++ = *p;
 529                         }
 530                         break;
 531
 532                 default :               /* an ordinary char  */
 533                         if (caseSensitive) {
 534                                 *mp++ = CHR;
 535                                 *mp++ = *p;
 536                         } else {
 537                                 *mp++ = CCL;
 538                                 mask = 0;
 539                                 ChSetWithCase(*p, false);
 540                                 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 541                                         *mp++ = static_cast<char>(mask ^ bittab[n]);
 542                         }
 543                         break;
 544                 }
 545                 sp = lp;
 546         }
 547         if (tagi > 0)
 548                 return badpat("Unmatched \\(");
 549         *mp = END;
 550         sta = OKP;
 551         return 0;
 552 }
 553
 554 /*
 555  * RESearch::Execute:
 556  *      execute nfa to find a match.
 557  *
 558  *      special cases: (nfa[0])
 559  *              BOL
 560  *                      Match only once, starting from the
 561  *                      beginning.
 562  *              CHR
 563  *                      First locate the character without
 564  *                      calling PMatch, and if found, call
 565  *                      PMatch for the remaining string.
 566  *              END
 567  *                      RESearch::Compile failed, poor luser did not
 568  *                      check for it. Fail fast.
 569  *
 570  *      If a match is found, bopat[0] and eopat[0] are set
 571  *      to the beginning and the end of the matched fragment,
 572  *      respectively.
 573  *
 574  */
 575
 576 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
 577         char c;
 578         int ep = NOTFOUND;
 579         char *ap = nfa;
 580
 581         bol = lp;
 582         failure = 0;
 583
 584         Clear();
 585
 586         switch(*ap) {
 587
 588         case BOL:                       /* anchored: match from BOL only */
 589                 ep = PMatch(ci, lp, endp, ap);
 590                 break;
 591         case EOL:                       /* just searching for end of line normal path doesn't work */
 592                 if (*(ap+1) == END) {
 593                         lp = endp;
 594                         ep = lp;
 595                         break;
 596                 } else {
 597                         return 0;
 598                 }
 599         case CHR:                       /* ordinary char: locate it fast */
 600                 c = *(ap+1);
 601                 while ((lp < endp) && (ci.CharAt(lp) != c))
 602                         lp++;
 603                 if (lp >= endp)         /* if EOS, fail, else fall thru. */
 604                         return 0;
 605         default:                        /* regular matching all the way. */
 606                 while (lp < endp) {
 607                         ep = PMatch(ci, lp, endp, ap);
 608                         if (ep != NOTFOUND)
 609                                 break;
 610                         lp++;
 611                 }
 612                 break;
 613         case END:                       /* munged automaton. fail always */
 614                 return 0;
 615         }
 616         if (ep == NOTFOUND)
 617                 return 0;
 618
 619         bopat[0] = lp;
 620         eopat[0] = ep;
 621         return 1;
 622 }
 623
 624 /*
 625  * PMatch: internal routine for the hard part
 626  *
 627  *      This code is partly snarfed from an early grep written by
 628  *      David Conroy. The backref and tag stuff, and various other
 629  *      innovations are by oz.
 630  *
 631  *      special case optimizations: (nfa[n], nfa[n+1])
 632  *              CLO ANY
 633  *                      We KNOW .* will match everything upto the
 634  *                      end of line. Thus, directly go to the end of
 635  *                      line, without recursive PMatch calls. As in
 636  *                      the other closure cases, the remaining pattern
 637  *                      must be matched by moving backwards on the
 638  *                      string recursively, to find a match for xy
 639  *                      (x is ".*" and y is the remaining pattern)
 640  *                      where the match satisfies the LONGEST match for
 641  *                      x followed by a match for y.
 642  *              CLO CHR
 643  *                      We can again scan the string forward for the
 644  *                      single char and at the point of failure, we
 645  *                      execute the remaining nfa recursively, same as
 646  *                      above.
 647  *
 648  *      At the end of a successful match, bopat[n] and eopat[n]
 649  *      are set to the beginning and end of subpatterns matched
 650  *      by tagged expressions (n = 1 to 9).
 651  *
 652  */
 653
 654 extern void re_fail(char *,char);
 655
 656 /*
 657  * character classification table for word boundary operators BOW
 658  * and EOW. the reason for not using ctype macros is that we can
 659  * let the user add into our own table. see RESearch::ModifyWord. This table
 660  * is not in the bitset form, since we may wish to extend it in the
 661  * future for other character classifications.
 662  *
 663  *      TRUE for 0-9 A-Z a-z _
 664  */
 665 static char chrtyp[MAXCHR] = {
 666         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 667         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 668         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 669         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 670         0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
 671         1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
 672         0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
 673         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 674         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 675         1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
 676         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 677         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 678         1, 1, 1, 0, 0, 0, 0, 0
 679         };
 680
 681 #define inascii(x)      (0177&(x))
 682 #define iswordc(x)      chrtyp[inascii(x)]
 683 #define isinset(x,y)    ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
 684
 685 /*
 686  * skip values for CLO XXX to skip past the closure
 687  */
 688
 689 #define ANYSKIP 2       /* [CLO] ANY END ...         */
 690 #define CHRSKIP 3       /* [CLO] CHR chr END ...     */
 691 #define CCLSKIP 18      /* [CLO] CCL 16bytes END ... */
 692
 693 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
 694         int op, c, n;
 695         int e;          /* extra pointer for CLO */
 696         int bp;         /* beginning of subpat.. */
 697         int ep;         /* ending of subpat..    */
 698         int are;                        /* to save the line ptr. */
 699
 700         while ((op = *ap++) != END)
 701                 switch(op) {
 702
 703                 case CHR:
 704                         if (ci.CharAt(lp++) != *ap++)
 705                                 return NOTFOUND;
 706                         break;
 707                 case ANY:
 708                         if (lp++ >= endp)
 709                                 return NOTFOUND;
 710                         break;
 711                 case CCL:
 712                         c = ci.CharAt(lp++);
 713                         if (!isinset(ap,c))
 714                                 return NOTFOUND;
 715                         ap += BITBLK;
 716                         break;
 717                 case BOL:
 718                         if (lp != bol)
 719                                 return NOTFOUND;
 720                         break;
 721                 case EOL:
 722                         if (lp < endp)
 723                                 return NOTFOUND;
 724                         break;
 725                 case BOT:
 726                         bopat[*ap++] = lp;
 727                         break;
 728                 case EOT:
 729                         eopat[*ap++] = lp;
 730                         break;
 731                 case BOW:
 732                         if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
 733                                 return NOTFOUND;
 734                         break;
 735                 case EOW:
 736                         if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
 737                                 return NOTFOUND;
 738                         break;
 739                 case REF:
 740                         n = *ap++;
 741                         bp = bopat[n];
 742                         ep = eopat[n];
 743                         while (bp < ep)
 744                                 if (ci.CharAt(bp++) != ci.CharAt(lp++))
 745                                         return NOTFOUND;
 746                         break;
 747                 case CLO:
 748                         are = lp;
 749                         switch(*ap) {
 750
 751                         case ANY:
 752                                 while (lp < endp)
 753                                         lp++;
 754                                 n = ANYSKIP;
 755                                 break;
 756                         case CHR:
 757                                 c = *(ap+1);
 758                                 while ((lp < endp) && (c == ci.CharAt(lp)))
 759                                         lp++;
 760                                 n = CHRSKIP;
 761                                 break;
 762                         case CCL:
 763                                 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
 764                                         lp++;
 765                                 n = CCLSKIP;
 766                                 break;
 767                         default:
 768                                 failure = true;
 769                                 //re_fail("closure: bad nfa.", *ap);
 770                                 return NOTFOUND;
 771                         }
 772
 773                         ap += n;
 774
 775                         while (lp >= are) {
 776                                 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
 777                                         return e;
 778                                 --lp;
 779                         }
 780                         return NOTFOUND;
 781                 default:
 782                         //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
 783                         return NOTFOUND;
 784                 }
 785         return lp;
 786 }
 787
 788 /*
 789  * RESearch::ModifyWord:
 790  *      add new characters into the word table to change RESearch::Execute's
 791  *      understanding of what a word should look like. Note that we
 792  *      only accept additions into the word definition.
 793  *
 794  *      If the string parameter is 0 or null string, the table is
 795  *      reset back to the default containing A-Z a-z 0-9 _. [We use
 796  *      the compact bitset representation for the default table]
 797  */
 798
 799 static char deftab[16] = {
 800         0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
 801         '\376', '\377', '\377', 007
 802 };
 803
 804 void RESearch::ModifyWord(char *s) {
 805         int i;
 806
 807         if (!s || !*s) {
 808                 for (i = 0; i < MAXCHR; i++)
 809                         if (!isinset(deftab,i))
 810                                 iswordc(i) = 0;
 811         }
 812         else
 813                 while(*s)
 814                         iswordc(*s++) = 1;
 815 }
 816
 817 /*
 818  * RESearch::Substitute:
 819  *      substitute the matched portions of the src in dst.
 820  *
 821  *      &       substitute the entire matched pattern.
 822  *
 823  *      \digit  substitute a subpattern, with the given tag number.
 824  *              Tags are numbered from 1 to 9. If the particular
 825  *              tagged subpattern does not exist, null is substituted.
 826  */
 827 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
 828         char c;
 829         int  pin;
 830         int bp;
 831         int ep;
 832
 833         if (!*src || !bopat[0])
 834                 return 0;
 835
 836         while ((c = *src++) != 0) {
 837                 switch(c) {
 838
 839                 case '&':
 840                         pin = 0;
 841                         break;
 842
 843                 case '\\':
 844                         c = *src++;
 845                         if (c >= '0' && c <= '9') {
 846                                 pin = c - '0';
 847                                 break;
 848                         }
 849
 850                 default:
 851                         *dst++ = c;
 852                         continue;
 853                 }
 854
 855                 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
 856                         while (ci.CharAt(bp) && bp < ep)
 857                                 *dst++ = ci.CharAt(bp++);
 858                         if (bp < ep)
 859                                 return 0;
 860                 }
 861         }
 862         *dst = (char) 0;
 863         return 1;
 864 }