src/stc/scintilla/src/RESearch.cxx

   1 // Scintilla source code edit control
   2 /** @file RESearch.cxx
   3  ** Regular expression search library.
   4  **/
   5
   6 /*
   7  * regex - Regular expression pattern matching  and replacement
   8  *
   9  * By:  Ozan S. Yigit (oz)
  10  *      Dept. of Computer Science
  11  *      York University
  12  *
  13  * Original code available from http://www.cs.yorku.ca/~oz/
  14  * Translation to C++ by Neil Hodgson neilh@scintilla.org
  15  * Removed all use of register.
  16  * Converted to modern function prototypes.
  17  * Put all global/static variables into an object so this code can be
  18  * used from multiple threads etc.
  19  *
  20  * These routines are the PUBLIC DOMAIN equivalents of regex
  21  * routines as found in 4.nBSD UN*X, with minor extensions.
  22  *
  23  * These routines are derived from various implementations found
  24  * in software tools books, and Conroy's grep. They are NOT derived
  25  * from licensed/restricted software.
  26  * For more interesting/academic/complicated implementations,
  27  * see Henry Spencer's regexp routines, or GNU Emacs pattern
  28  * matching module.
  29  *
  30  * Modification history:
  31  *
  32  * $Log$
  33  * Revision 1.8  2003/09/18 05:05:38  RD
  34  * Updated to Scintilla 1.54
  35  * Applied most of patch #806092
  36  * Added new wxSTC sample from Otto Wyss
  37  *
  38  * Revision 1.9  2003/03/21 10:36:08  nyamatongwe
  39  * Detect patterns too long in regular expression search.
  40  *
  41  * Revision 1.8  2003/03/04 10:53:59  nyamatongwe
  42  * Patch from Jakub to optionally implement more POSIX compatible regular
  43  * expressions. \(..\) changes to (..)
  44  * Fixes problem where find previous would not find earlier matches on same
  45  * line.
  46  *
  47  * Revision 1.8  2003/03/03 20:12:56  vrana
  48  * Added posix syntax.
  49  *
  50  * Revision 1.7  2002/09/28 00:33:28  nyamatongwe
  51  * Fixed problem with character ranges caused by expansion to 8 bits.
  52  *
  53  * Revision 1.6  2001/04/29 13:32:10  nyamatongwe
  54  * Addition of new target methods - versions of ReplaceTarget that take counted
  55  * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
  56  * series of calls rather than a structure.
  57  * Handling of \000 in search and replace.
  58  * Handling of /escapes within character ranges of regular expressions.
  59  * Some handling of bare ^ and $ regular expressions.
  60  *
  61  * Revision 1.5  2001/04/20 07:36:09  nyamatongwe
  62  * Removed DEBUG code that failed to compile on GTK+.
  63  *
  64  * Revision 1.4  2001/04/13 03:52:13  nyamatongwe
  65  * Added URL to find original code to comments.
  66  *
  67  * Revision 1.3  2001/04/06 12:24:21  nyamatongwe
  68  * Made regular expression searching work on a line by line basis, made ^ and
  69  * $ work, made [set] work, and added a case insensitive option.
  70  *
  71  * Revision 1.2  2001/04/05 01:58:04  nyamatongwe
  72  * Replace target functionality to make find and replace operations faster
  73  * by diminishing screen updates and allow for \d patterns in the replacement
  74  * text.
  75  *
  76  * Revision 1.1  2001/04/04 12:52:44  nyamatongwe
  77  * Moved to public domain regular expresion implementation.
  78  *
  79  * Revision 1.4  1991/10/17  03:56:42  oz
  80  * miscellaneous changes, small cleanups etc.
  81  *
  82  * Revision 1.3  1989/04/01  14:18:09  oz
  83  * Change all references to a dfa: this is actually an nfa.
  84  *
  85  * Revision 1.2  88/08/28  15:36:04  oz
  86  * Use a complement bitmap to represent NCL.
  87  * This removes the need to have seperate
  88  * code in the PMatch case block - it is
  89  * just CCL code now.
  90  *
  91  * Use the actual CCL code in the CLO
  92  * section of PMatch. No need for a recursive
  93  * PMatch call.
  94  *
  95  * Use a bitmap table to set char bits in an
  96  * 8-bit chunk.
  97  *
  98  * Interfaces:
  99  *      RESearch::Compile:        compile a regular expression into a NFA.
 100  *
 101  *                      char *RESearch::Compile(s)
 102  *                      char *s;
 103  *
 104  *      RESearch::Execute:        execute the NFA to match a pattern.
 105  *
 106  *                      int RESearch::Execute(s)
 107  *                      char *s;
 108  *
 109  *      RESearch::ModifyWord            change RESearch::Execute's understanding of what a "word"
 110  *                      looks like (for \< and \>) by adding into the
 111  *                      hidden word-syntax table.
 112  *
 113  *                      void RESearch::ModifyWord(s)
 114  *                      char *s;
 115  *
 116  *      RESearch::Substitute:   substitute the matched portions in a new string.
 117  *
 118  *                      int RESearch::Substitute(src, dst)
 119  *                      char *src;
 120  *                      char *dst;
 121  *
 122  *      re_fail:        failure routine for RESearch::Execute.
 123  *
 124  *                      void re_fail(msg, op)
 125  *                      char *msg;
 126  *                      char op;
 127  *
 128  * Regular Expressions:
 129  *
 130  *      [1]     char    matches itself, unless it is a special
 131  *                      character (metachar): . \ [ ] * + ^ $
 132  *
 133  *      [2]     .       matches any character.
 134  *
 135  *      [3]     \       matches the character following it, except
 136  *                      when followed by a left or right round bracket,
 137  *                      a digit 1 to 9 or a left or right angle bracket.
 138  *                      (see [7], [8] and [9])
 139  *                      It is used as an escape character for all
 140  *                      other meta-characters, and itself. When used
 141  *                      in a set ([4]), it is treated as an ordinary
 142  *                      character.
 143  *
 144  *      [4]     [set]   matches one of the characters in the set.
 145  *                      If the first character in the set is "^",
 146  *                      it matches a character NOT in the set, i.e.
 147  *                      complements the set. A shorthand S-E is
 148  *                      used to specify a set of characters S upto
 149  *                      E, inclusive. The special characters "]" and
 150  *                      "-" have no special meaning if they appear
 151  *                      as the first chars in the set.
 152  *                      examples:        match:
 153  *
 154  *                              [a-z]    any lowercase alpha
 155  *
 156  *                              [^]-]    any char except ] and -
 157  *
 158  *                              [^A-Z]   any char except uppercase
 159  *                                       alpha
 160  *
 161  *                              [a-zA-Z] any alpha
 162  *
 163  *      [5]     *       any regular expression form [1] to [4], followed by
 164  *                      closure char (*) matches zero or more matches of
 165  *                      that form.
 166  *
 167  *      [6]     +       same as [5], except it matches one or more.
 168  *
 169  *      [7]             a regular expression in the form [1] to [10], enclosed
 170  *                      as \(form\) matches what form matches. The enclosure
 171  *                      creates a set of tags, used for [8] and for
 172  *                      pattern substution. The tagged forms are numbered
 173  *                      starting from 1.
 174  *
 175  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
 176  *                      previously tagged regular expression ([7]) matched.
 177  *
 178  *      [9]     \<      a regular expression starting with a \< construct
 179  *              \>      and/or ending with a \> construct, restricts the
 180  *                      pattern matching to the beginning of a word, and/or
 181  *                      the end of a word. A word is defined to be a character
 182  *                      string beginning and/or ending with the characters
 183  *                      A-Z a-z 0-9 and _. It must also be preceded and/or
 184  *                      followed by any character outside those mentioned.
 185  *
 186  *      [10]            a composite regular expression xy where x and y
 187  *                      are in the form [1] to [10] matches the longest
 188  *                      match of x followed by a match for y.
 189  *
 190  *      [11]    ^       a regular expression starting with a ^ character
 191  *              $       and/or ending with a $ character, restricts the
 192  *                      pattern matching to the beginning of the line,
 193  *                      or the end of line. [anchors] Elsewhere in the
 194  *                      pattern, ^ and $ are treated as ordinary characters.
 195  *
 196  *
 197  * Acknowledgements:
 198  *
 199  *      HCR's Hugh Redelmeier has been most helpful in various
 200  *      stages of development. He convinced me to include BOW
 201  *      and EOW constructs, originally invented by Rob Pike at
 202  *      the University of Toronto.
 203  *
 204  * References:
 205  *              Software tools                  Kernighan & Plauger
 206  *              Software tools in Pascal        Kernighan & Plauger
 207  *              Grep [rsx-11 C dist]            David Conroy
 208  *              ed - text editor                Un*x Programmer's Manual
 209  *              Advanced editing on Un*x        B. W. Kernighan
 210  *              RegExp routines                 Henry Spencer
 211  *
 212  * Notes:
 213  *
 214  *      This implementation uses a bit-set representation for character
 215  *      classes for speed and compactness. Each character is represented
 216  *      by one bit in a 128-bit block. Thus, CCL always takes a
 217  *      constant 16 bytes in the internal nfa, and RESearch::Execute does a single
 218  *      bit comparison to locate the character in the set.
 219  *
 220  * Examples:
 221  *
 222  *      pattern:        foo*.*
 223  *      compile:        CHR f CHR o CLO CHR o END CLO ANY END END
 224  *      matches:        fo foo fooo foobar fobar foxx ...
 225  *
 226  *      pattern:        fo[ob]a[rz]
 227  *      compile:        CHR f CHR o CCL bitset CHR a CCL bitset END
 228  *      matches:        fobar fooar fobaz fooaz
 229  *
 230  *      pattern:        foo\\+
 231  *      compile:        CHR f CHR o CHR o CHR \ CLO CHR \ END END
 232  *      matches:        foo\ foo\\ foo\\\  ...
 233  *
 234  *      pattern:        \(foo\)[1-3]\1  (same as foo[1-3]foo)
 235  *      compile:        BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
 236  *      matches:        foo1foo foo2foo foo3foo
 237  *
 238  *      pattern:        \(fo.*\)-\1
 239  *      compile:        BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
 240  *      matches:        foo-foo fo-fo fob-fob foobar-foobar ...
 241  */
 242
 243 #include "RESearch.h"
 244
 245 #define OKP     1
 246 #define NOP     0
 247
 248 #define CHR     1
 249 #define ANY     2
 250 #define CCL     3
 251 #define BOL     4
 252 #define EOL     5
 253 #define BOT     6
 254 #define EOT     7
 255 #define BOW     8
 256 #define EOW     9
 257 #define REF     10
 258 #define CLO     11
 259
 260 #define END     0
 261
 262 /*
 263  * The following defines are not meant to be changeable.
 264  * They are for readability only.
 265  */
 266 #define BLKIND  0170
 267 #define BITIND  07
 268
 269 #define ASCIIB  0177
 270
 271 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
 272
 273 #define badpat(x)       (*nfa = END, x)
 274
 275 RESearch::RESearch() {
 276         Init();
 277 }
 278
 279 RESearch::~RESearch() {
 280         Clear();
 281 }
 282
 283 void RESearch::Init() {
 284         sta = NOP;                      /* status of lastpat */
 285         bol = 0;
 286         for (int i=0; i<MAXTAG; i++)
 287                 pat[i] = 0;
 288         for (int j=0; j<BITBLK; j++)
 289                 bittab[j] = 0;
 290 }
 291
 292 void RESearch::Clear() {
 293         for (int i=0; i<MAXTAG; i++) {
 294                 delete []pat[i];
 295                 pat[i] = 0;
 296                 bopat[i] = NOTFOUND;
 297                 eopat[i] = NOTFOUND;
 298         }
 299 }
 300
 301 bool RESearch::GrabMatches(CharacterIndexer &ci) {
 302         bool success = true;
 303         for (unsigned int i=0; i<MAXTAG; i++) {
 304                 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
 305                         unsigned int len = eopat[i] - bopat[i];
 306                         pat[i] = new char[len + 1];
 307                         if (pat[i]) {
 308                                 for (unsigned int j=0; j<len; j++)
 309                                         pat[i][j] = ci.CharAt(bopat[i] + j);
 310                                 pat[i][len] = '\0';
 311                         } else {
 312                                 success = false;
 313                         }
 314                 }
 315         }
 316         return success;
 317 }
 318
 319 void RESearch::ChSet(char c) {
 320         bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
 321 }
 322
 323 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
 324         if (caseSensitive) {
 325                 ChSet(c);
 326         } else {
 327                 if ((c >= 'a') && (c <= 'z')) {
 328                         ChSet(c);
 329                         ChSet(static_cast<char>(c - 'a' + 'A'));
 330                 } else if ((c >= 'A') && (c <= 'Z')) {
 331                         ChSet(c);
 332                         ChSet(static_cast<char>(c - 'A' + 'a'));
 333                 } else {
 334                         ChSet(c);
 335                 }
 336         }
 337 }
 338
 339 const char escapeValue(char ch) {
 340         switch (ch) {
 341         case 'a':       return '\a';
 342         case 'b':       return '\b';
 343         case 'f':       return '\f';
 344         case 'n':       return '\n';
 345         case 'r':       return '\r';
 346         case 't':       return '\t';
 347         case 'v':       return '\v';
 348         }
 349         return 0;
 350 }
 351
 352 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
 353         char *mp=nfa;          /* nfa pointer       */
 354         char *lp;              /* saved pointer..   */
 355         char *sp=nfa;          /* another one..     */
 356     char *mpMax = mp + MAXNFA - BITBLK - 10;
 357
 358         int tagi = 0;          /* tag stack index   */
 359         int tagc = 1;          /* actual tag count  */
 360
 361         int n;
 362         char mask;              /* xor mask -CCL/NCL */
 363         int c1, c2;
 364
 365         if (!pat || !length)
 366                 if (sta)
 367                         return 0;
 368                 else
 369                         return badpat("No previous regular expression");
 370         sta = NOP;
 371
 372         const char *p=pat;               /* pattern pointer   */
 373         for (int i=0; i<length; i++, p++) {
 374                 if (mp > mpMax)
 375                         return badpat("Pattern too long");
 376                 lp = mp;
 377                 switch(*p) {
 378
 379                 case '.':               /* match any char..  */
 380                         *mp++ = ANY;
 381                         break;
 382
 383                 case '^':               /* match beginning.. */
 384                         if (p == pat)
 385                                 *mp++ = BOL;
 386                         else {
 387                                 *mp++ = CHR;
 388                                 *mp++ = *p;
 389                         }
 390                         break;
 391
 392                 case '$':               /* match endofline.. */
 393                         if (!*(p+1))
 394                                 *mp++ = EOL;
 395                         else {
 396                                 *mp++ = CHR;
 397                                 *mp++ = *p;
 398                         }
 399                         break;
 400
 401                 case '[':               /* match char class..*/
 402                         *mp++ = CCL;
 403
 404                         i++;
 405                         if (*++p == '^') {
 406                                 mask = '\377';
 407                                 i++;
 408                                 p++;
 409                         } else
 410                                 mask = 0;
 411
 412                         if (*p == '-') {                /* real dash */
 413                                 i++;
 414                                 ChSet(*p++);
 415                         }
 416                         if (*p == ']') {        /* real brace */
 417                                 i++;
 418                                 ChSet(*p++);
 419                         }
 420                         while (*p && *p != ']') {
 421                                 if (*p == '-' && *(p+1) && *(p+1) != ']') {
 422                                         i++;
 423                                         p++;
 424                                         c1 = *(p-2) + 1;
 425                                         i++;
 426                                         c2 = *p++;
 427                                         while (c1 <= c2) {
 428                                                 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
 429                                         }
 430                                 } else if (*p == '\\' && *(p+1)) {
 431                                         i++;
 432                                         p++;
 433                                         char escape = escapeValue(*p);
 434                                         if (escape)
 435                                                 ChSetWithCase(escape, caseSensitive);
 436                                         else
 437                                                 ChSetWithCase(*p, caseSensitive);
 438                                         i++;
 439                                         p++;
 440                                 } else {
 441                                         i++;
 442                                         ChSetWithCase(*p++, caseSensitive);
 443                                 }
 444                         }
 445                         if (!*p)
 446                                 return badpat("Missing ]");
 447
 448                         for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 449                                 *mp++ = static_cast<char>(mask ^ bittab[n]);
 450
 451                         break;
 452
 453                 case '*':               /* match 0 or more.. */
 454                 case '+':               /* match 1 or more.. */
 455                         if (p == pat)
 456                                 return badpat("Empty closure");
 457                         lp = sp;                /* previous opcode */
 458                         if (*lp == CLO)         /* equivalence..   */
 459                                 break;
 460                         switch(*lp) {
 461
 462                         case BOL:
 463                         case BOT:
 464                         case EOT:
 465                         case BOW:
 466                         case EOW:
 467                         case REF:
 468                                 return badpat("Illegal closure");
 469                         default:
 470                                 break;
 471                         }
 472
 473                         if (*p == '+')
 474                                 for (sp = mp; lp < sp; lp++)
 475                                         *mp++ = *lp;
 476
 477                         *mp++ = END;
 478                         *mp++ = END;
 479                         sp = mp;
 480                         while (--mp > lp)
 481                                 *mp = mp[-1];
 482                         *mp = CLO;
 483                         mp = sp;
 484                         break;
 485
 486                 case '\\':              /* tags, backrefs .. */
 487                         i++;
 488                         switch(*++p) {
 489
 490                         case '<':
 491                                 *mp++ = BOW;
 492                                 break;
 493                         case '>':
 494                                 if (*sp == BOW)
 495                                         return badpat("Null pattern inside \\<\\>");
 496                                 *mp++ = EOW;
 497                                 break;
 498                         case '1':
 499                         case '2':
 500                         case '3':
 501                         case '4':
 502                         case '5':
 503                         case '6':
 504                         case '7':
 505                         case '8':
 506                         case '9':
 507                                 n = *p-'0';
 508                                 if (tagi > 0 && tagstk[tagi] == n)
 509                                         return badpat("Cyclical reference");
 510                                 if (tagc > n) {
 511                                         *mp++ = static_cast<char>(REF);
 512                                         *mp++ = static_cast<char>(n);
 513                                 }
 514                                 else
 515                                         return badpat("Undetermined reference");
 516                                 break;
 517                         case 'a':
 518                         case 'b':
 519                         case 'n':
 520                         case 'f':
 521                         case 'r':
 522                         case 't':
 523                         case 'v':
 524                                 *mp++ = CHR;
 525                                 *mp++ = escapeValue(*p);
 526                                 break;
 527                         default:
 528                                 if (!posix && *p == '(') {
 529                                         if (tagc < MAXTAG) {
 530                                                 tagstk[++tagi] = tagc;
 531                                                 *mp++ = BOT;
 532                                                 *mp++ = static_cast<char>(tagc++);
 533                                         }
 534                                         else
 535                                                 return badpat("Too many \\(\\) pairs");
 536                                 } else if (!posix && *p == ')') {
 537                                         if (*sp == BOT)
 538                                                 return badpat("Null pattern inside \\(\\)");
 539                                         if (tagi > 0) {
 540                                                 *mp++ = static_cast<char>(EOT);
 541                                                 *mp++ = static_cast<char>(tagstk[tagi--]);
 542                                         }
 543                                         else
 544                                                 return badpat("Unmatched \\)");
 545                                 } else {
 546                                         *mp++ = CHR;
 547                                         *mp++ = *p;
 548                                 }
 549                         }
 550                         break;
 551
 552                 default :               /* an ordinary char  */
 553                         if (posix && *p == '(') {
 554                                 if (tagc < MAXTAG) {
 555                                         tagstk[++tagi] = tagc;
 556                                         *mp++ = BOT;
 557                                         *mp++ = static_cast<char>(tagc++);
 558                                 }
 559                                 else
 560                                         return badpat("Too many () pairs");
 561                         } else if (posix && *p == ')') {
 562                                 if (*sp == BOT)
 563                                         return badpat("Null pattern inside ()");
 564                                 if (tagi > 0) {
 565                                         *mp++ = static_cast<char>(EOT);
 566                                         *mp++ = static_cast<char>(tagstk[tagi--]);
 567                                 }
 568                                 else
 569                                         return badpat("Unmatched )");
 570                         } else if (caseSensitive) {
 571                                 *mp++ = CHR;
 572                                 *mp++ = *p;
 573                         } else {
 574                                 *mp++ = CCL;
 575                                 mask = 0;
 576                                 ChSetWithCase(*p, false);
 577                                 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 578                                         *mp++ = static_cast<char>(mask ^ bittab[n]);
 579                         }
 580                         break;
 581                 }
 582                 sp = lp;
 583         }
 584         if (tagi > 0)
 585                 return badpat((posix ? "Unmatched (" : "Unmatched \\("));
 586         *mp = END;
 587         sta = OKP;
 588         return 0;
 589 }
 590
 591 /*
 592  * RESearch::Execute:
 593  *      execute nfa to find a match.
 594  *
 595  *      special cases: (nfa[0])
 596  *              BOL
 597  *                      Match only once, starting from the
 598  *                      beginning.
 599  *              CHR
 600  *                      First locate the character without
 601  *                      calling PMatch, and if found, call
 602  *                      PMatch for the remaining string.
 603  *              END
 604  *                      RESearch::Compile failed, poor luser did not
 605  *                      check for it. Fail fast.
 606  *
 607  *      If a match is found, bopat[0] and eopat[0] are set
 608  *      to the beginning and the end of the matched fragment,
 609  *      respectively.
 610  *
 611  */
 612
 613 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
 614         char c;
 615         int ep = NOTFOUND;
 616         char *ap = nfa;
 617
 618         bol = lp;
 619         failure = 0;
 620
 621         Clear();
 622
 623         switch(*ap) {
 624
 625         case BOL:                       /* anchored: match from BOL only */
 626                 ep = PMatch(ci, lp, endp, ap);
 627                 break;
 628         case EOL:                       /* just searching for end of line normal path doesn't work */
 629                 if (*(ap+1) == END) {
 630                         lp = endp;
 631                         ep = lp;
 632                         break;
 633                 } else {
 634                         return 0;
 635                 }
 636         case CHR:                       /* ordinary char: locate it fast */
 637                 c = *(ap+1);
 638                 while ((lp < endp) && (ci.CharAt(lp) != c))
 639                         lp++;
 640                 if (lp >= endp)         /* if EOS, fail, else fall thru. */
 641                         return 0;
 642         default:                        /* regular matching all the way. */
 643                 while (lp < endp) {
 644                         ep = PMatch(ci, lp, endp, ap);
 645                         if (ep != NOTFOUND)
 646                                 break;
 647                         lp++;
 648                 }
 649                 break;
 650         case END:                       /* munged automaton. fail always */
 651                 return 0;
 652         }
 653         if (ep == NOTFOUND)
 654                 return 0;
 655
 656         bopat[0] = lp;
 657         eopat[0] = ep;
 658         return 1;
 659 }
 660
 661 /*
 662  * PMatch: internal routine for the hard part
 663  *
 664  *      This code is partly snarfed from an early grep written by
 665  *      David Conroy. The backref and tag stuff, and various other
 666  *      innovations are by oz.
 667  *
 668  *      special case optimizations: (nfa[n], nfa[n+1])
 669  *              CLO ANY
 670  *                      We KNOW .* will match everything upto the
 671  *                      end of line. Thus, directly go to the end of
 672  *                      line, without recursive PMatch calls. As in
 673  *                      the other closure cases, the remaining pattern
 674  *                      must be matched by moving backwards on the
 675  *                      string recursively, to find a match for xy
 676  *                      (x is ".*" and y is the remaining pattern)
 677  *                      where the match satisfies the LONGEST match for
 678  *                      x followed by a match for y.
 679  *              CLO CHR
 680  *                      We can again scan the string forward for the
 681  *                      single char and at the point of failure, we
 682  *                      execute the remaining nfa recursively, same as
 683  *                      above.
 684  *
 685  *      At the end of a successful match, bopat[n] and eopat[n]
 686  *      are set to the beginning and end of subpatterns matched
 687  *      by tagged expressions (n = 1 to 9).
 688  *
 689  */
 690
 691 extern void re_fail(char *,char);
 692
 693 /*
 694  * character classification table for word boundary operators BOW
 695  * and EOW. the reason for not using ctype macros is that we can
 696  * let the user add into our own table. see RESearch::ModifyWord. This table
 697  * is not in the bitset form, since we may wish to extend it in the
 698  * future for other character classifications.
 699  *
 700  *      TRUE for 0-9 A-Z a-z _
 701  */
 702 static char chrtyp[MAXCHR] = {
 703         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 704         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 705         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 706         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 707         0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
 708         1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
 709         0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
 710         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 711         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 712         1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
 713         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 714         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 715         1, 1, 1, 0, 0, 0, 0, 0
 716         };
 717
 718 #define inascii(x)      (0177&(x))
 719 #define iswordc(x)      chrtyp[inascii(x)]
 720 #define isinset(x,y)    ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
 721
 722 /*
 723  * skip values for CLO XXX to skip past the closure
 724  */
 725
 726 #define ANYSKIP 2       /* [CLO] ANY END ...         */
 727 #define CHRSKIP 3       /* [CLO] CHR chr END ...     */
 728 #define CCLSKIP 34      /* [CLO] CCL 32bytes END ... */
 729
 730 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
 731         int op, c, n;
 732         int e;          /* extra pointer for CLO */
 733         int bp;         /* beginning of subpat.. */
 734         int ep;         /* ending of subpat..    */
 735         int are;                        /* to save the line ptr. */
 736
 737         while ((op = *ap++) != END)
 738                 switch(op) {
 739
 740                 case CHR:
 741                         if (ci.CharAt(lp++) != *ap++)
 742                                 return NOTFOUND;
 743                         break;
 744                 case ANY:
 745                         if (lp++ >= endp)
 746                                 return NOTFOUND;
 747                         break;
 748                 case CCL:
 749                         c = ci.CharAt(lp++);
 750                         if (!isinset(ap,c))
 751                                 return NOTFOUND;
 752                         ap += BITBLK;
 753                         break;
 754                 case BOL:
 755                         if (lp != bol)
 756                                 return NOTFOUND;
 757                         break;
 758                 case EOL:
 759                         if (lp < endp)
 760                                 return NOTFOUND;
 761                         break;
 762                 case BOT:
 763                         bopat[*ap++] = lp;
 764                         break;
 765                 case EOT:
 766                         eopat[*ap++] = lp;
 767                         break;
 768                 case BOW:
 769                         if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
 770                                 return NOTFOUND;
 771                         break;
 772                 case EOW:
 773                         if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
 774                                 return NOTFOUND;
 775                         break;
 776                 case REF:
 777                         n = *ap++;
 778                         bp = bopat[n];
 779                         ep = eopat[n];
 780                         while (bp < ep)
 781                                 if (ci.CharAt(bp++) != ci.CharAt(lp++))
 782                                         return NOTFOUND;
 783                         break;
 784                 case CLO:
 785                         are = lp;
 786                         switch(*ap) {
 787
 788                         case ANY:
 789                                 while (lp < endp)
 790                                         lp++;
 791                                 n = ANYSKIP;
 792                                 break;
 793                         case CHR:
 794                                 c = *(ap+1);
 795                                 while ((lp < endp) && (c == ci.CharAt(lp)))
 796                                         lp++;
 797                                 n = CHRSKIP;
 798                                 break;
 799                         case CCL:
 800                                 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
 801                                         lp++;
 802                                 n = CCLSKIP;
 803                                 break;
 804                         default:
 805                                 failure = true;
 806                                 //re_fail("closure: bad nfa.", *ap);
 807                                 return NOTFOUND;
 808                         }
 809
 810                         ap += n;
 811
 812                         while (lp >= are) {
 813                                 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
 814                                         return e;
 815                                 --lp;
 816                         }
 817                         return NOTFOUND;
 818                 default:
 819                         //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
 820                         return NOTFOUND;
 821                 }
 822         return lp;
 823 }
 824
 825 /*
 826  * RESearch::ModifyWord:
 827  *      add new characters into the word table to change RESearch::Execute's
 828  *      understanding of what a word should look like. Note that we
 829  *      only accept additions into the word definition.
 830  *
 831  *      If the string parameter is 0 or null string, the table is
 832  *      reset back to the default containing A-Z a-z 0-9 _. [We use
 833  *      the compact bitset representation for the default table]
 834  */
 835
 836 static char deftab[16] = {
 837         0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
 838         '\376', '\377', '\377', 007
 839 };
 840
 841 void RESearch::ModifyWord(char *s) {
 842         int i;
 843
 844         if (!s || !*s) {
 845                 for (i = 0; i < MAXCHR; i++)
 846                         if (!isinset(deftab,i))
 847                                 iswordc(i) = 0;
 848         }
 849         else
 850                 while(*s)
 851                         iswordc(*s++) = 1;
 852 }
 853
 854 /*
 855  * RESearch::Substitute:
 856  *      substitute the matched portions of the src in dst.
 857  *
 858  *      &       substitute the entire matched pattern.
 859  *
 860  *      \digit  substitute a subpattern, with the given tag number.
 861  *              Tags are numbered from 1 to 9. If the particular
 862  *              tagged subpattern does not exist, null is substituted.
 863  */
 864 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
 865         char c;
 866         int  pin;
 867         int bp;
 868         int ep;
 869
 870         if (!*src || !bopat[0])
 871                 return 0;
 872
 873         while ((c = *src++) != 0) {
 874                 switch(c) {
 875
 876                 case '&':
 877                         pin = 0;
 878                         break;
 879
 880                 case '\\':
 881                         c = *src++;
 882                         if (c >= '0' && c <= '9') {
 883                                 pin = c - '0';
 884                                 break;
 885                         }
 886
 887                 default:
 888                         *dst++ = c;
 889                         continue;
 890                 }
 891
 892                 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
 893                         while (ci.CharAt(bp) && bp < ep)
 894                                 *dst++ = ci.CharAt(bp++);
 895                         if (bp < ep)
 896                                 return 0;
 897                 }
 898         }
 899         *dst = (char) 0;
 900         return 1;
 901 }