contrib/src/stc/scintilla/src/RESearch.cxx

   1 // Scintilla source code edit control
   2 /** @file RESearch.cxx
   3  ** Regular expression search library.
   4  **/
   5
   6 /*
   7  * regex - Regular expression pattern matching  and replacement
   8  *
   9  * By:  Ozan S. Yigit (oz)
  10  *      Dept. of Computer Science
  11  *      York University
  12  *
  13  * Original code available from http://www.cs.yorku.ca/~oz/
  14  * Translation to C++ by Neil Hodgson neilh@scintilla.org
  15  * Removed all use of register.
  16  * Converted to modern function prototypes.
  17  * Put all global/static variables into an object so this code can be
  18  * used from multiple threads etc.
  19  *
  20  * These routines are the PUBLIC DOMAIN equivalents of regex
  21  * routines as found in 4.nBSD UN*X, with minor extensions.
  22  *
  23  * These routines are derived from various implementations found
  24  * in software tools books, and Conroy's grep. They are NOT derived
  25  * from licensed/restricted software.
  26  * For more interesting/academic/complicated implementations,
  27  * see Henry Spencer's regexp routines, or GNU Emacs pattern
  28  * matching module.
  29  *
  30  * Modification history:
  31  *
  32  * $Log$
  33  * Revision 1.6  2003/04/19 19:59:49  RD
  34  * Updated Scintilla to 1.52 (on the trunk this time too)
  35  *
  36  * Revision 1.9  2003/03/21 10:36:08  nyamatongwe
  37  * Detect patterns too long in regular expression search.
  38  *
  39  * Revision 1.8  2003/03/04 10:53:59  nyamatongwe
  40  * Patch from Jakub to optionally implement more POSIX compatible regular
  41  * expressions. \(..\) changes to (..)
  42  * Fixes problem where find previous would not find earlier matches on same
  43  * line.
  44  *
  45  * Revision 1.8  2003/03/03 20:12:56  vrana
  46  * Added posix syntax.
  47  *
  48  * Revision 1.7  2002/09/28 00:33:28  nyamatongwe
  49  * Fixed problem with character ranges caused by expansion to 8 bits.
  50  *
  51  * Revision 1.6  2001/04/29 13:32:10  nyamatongwe
  52  * Addition of new target methods - versions of ReplaceTarget that take counted
  53  * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
  54  * series of calls rather than a structure.
  55  * Handling of \000 in search and replace.
  56  * Handling of /escapes within character ranges of regular expressions.
  57  * Some handling of bare ^ and $ regular expressions.
  58  *
  59  * Revision 1.5  2001/04/20 07:36:09  nyamatongwe
  60  * Removed DEBUG code that failed to compile on GTK+.
  61  *
  62  * Revision 1.4  2001/04/13 03:52:13  nyamatongwe
  63  * Added URL to find original code to comments.
  64  *
  65  * Revision 1.3  2001/04/06 12:24:21  nyamatongwe
  66  * Made regular expression searching work on a line by line basis, made ^ and
  67  * $ work, made [set] work, and added a case insensitive option.
  68  *
  69  * Revision 1.2  2001/04/05 01:58:04  nyamatongwe
  70  * Replace target functionality to make find and replace operations faster
  71  * by diminishing screen updates and allow for \d patterns in the replacement
  72  * text.
  73  *
  74  * Revision 1.1  2001/04/04 12:52:44  nyamatongwe
  75  * Moved to public domain regular expresion implementation.
  76  *
  77  * Revision 1.4  1991/10/17  03:56:42  oz
  78  * miscellaneous changes, small cleanups etc.
  79  *
  80  * Revision 1.3  1989/04/01  14:18:09  oz
  81  * Change all references to a dfa: this is actually an nfa.
  82  *
  83  * Revision 1.2  88/08/28  15:36:04  oz
  84  * Use a complement bitmap to represent NCL.
  85  * This removes the need to have seperate
  86  * code in the PMatch case block - it is
  87  * just CCL code now.
  88  *
  89  * Use the actual CCL code in the CLO
  90  * section of PMatch. No need for a recursive
  91  * PMatch call.
  92  *
  93  * Use a bitmap table to set char bits in an
  94  * 8-bit chunk.
  95  *
  96  * Interfaces:
  97  *      RESearch::Compile:        compile a regular expression into a NFA.
  98  *
  99  *                      char *RESearch::Compile(s)
 100  *                      char *s;
 101  *
 102  *      RESearch::Execute:        execute the NFA to match a pattern.
 103  *
 104  *                      int RESearch::Execute(s)
 105  *                      char *s;
 106  *
 107  *      RESearch::ModifyWord            change RESearch::Execute's understanding of what a "word"
 108  *                      looks like (for \< and \>) by adding into the
 109  *                      hidden word-syntax table.
 110  *
 111  *                      void RESearch::ModifyWord(s)
 112  *                      char *s;
 113  *
 114  *      RESearch::Substitute:   substitute the matched portions in a new string.
 115  *
 116  *                      int RESearch::Substitute(src, dst)
 117  *                      char *src;
 118  *                      char *dst;
 119  *
 120  *      re_fail:        failure routine for RESearch::Execute.
 121  *
 122  *                      void re_fail(msg, op)
 123  *                      char *msg;
 124  *                      char op;
 125  *
 126  * Regular Expressions:
 127  *
 128  *      [1]     char    matches itself, unless it is a special
 129  *                      character (metachar): . \ [ ] * + ^ $
 130  *
 131  *      [2]     .       matches any character.
 132  *
 133  *      [3]     \       matches the character following it, except
 134  *                      when followed by a left or right round bracket,
 135  *                      a digit 1 to 9 or a left or right angle bracket.
 136  *                      (see [7], [8] and [9])
 137  *                      It is used as an escape character for all
 138  *                      other meta-characters, and itself. When used
 139  *                      in a set ([4]), it is treated as an ordinary
 140  *                      character.
 141  *
 142  *      [4]     [set]   matches one of the characters in the set.
 143  *                      If the first character in the set is "^",
 144  *                      it matches a character NOT in the set, i.e.
 145  *                      complements the set. A shorthand S-E is
 146  *                      used to specify a set of characters S upto
 147  *                      E, inclusive. The special characters "]" and
 148  *                      "-" have no special meaning if they appear
 149  *                      as the first chars in the set.
 150  *                      examples:        match:
 151  *
 152  *                              [a-z]    any lowercase alpha
 153  *
 154  *                              [^]-]    any char except ] and -
 155  *
 156  *                              [^A-Z]   any char except uppercase
 157  *                                       alpha
 158  *
 159  *                              [a-zA-Z] any alpha
 160  *
 161  *      [5]     *       any regular expression form [1] to [4], followed by
 162  *                      closure char (*) matches zero or more matches of
 163  *                      that form.
 164  *
 165  *      [6]     +       same as [5], except it matches one or more.
 166  *
 167  *      [7]             a regular expression in the form [1] to [10], enclosed
 168  *                      as \(form\) matches what form matches. The enclosure
 169  *                      creates a set of tags, used for [8] and for
 170  *                      pattern substution. The tagged forms are numbered
 171  *                      starting from 1.
 172  *
 173  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
 174  *                      previously tagged regular expression ([7]) matched.
 175  *
 176  *      [9]     \<      a regular expression starting with a \< construct
 177  *              \>      and/or ending with a \> construct, restricts the
 178  *                      pattern matching to the beginning of a word, and/or
 179  *                      the end of a word. A word is defined to be a character
 180  *                      string beginning and/or ending with the characters
 181  *                      A-Z a-z 0-9 and _. It must also be preceded and/or
 182  *                      followed by any character outside those mentioned.
 183  *
 184  *      [10]            a composite regular expression xy where x and y
 185  *                      are in the form [1] to [10] matches the longest
 186  *                      match of x followed by a match for y.
 187  *
 188  *      [11]    ^       a regular expression starting with a ^ character
 189  *              $       and/or ending with a $ character, restricts the
 190  *                      pattern matching to the beginning of the line,
 191  *                      or the end of line. [anchors] Elsewhere in the
 192  *                      pattern, ^ and $ are treated as ordinary characters.
 193  *
 194  *
 195  * Acknowledgements:
 196  *
 197  *      HCR's Hugh Redelmeier has been most helpful in various
 198  *      stages of development. He convinced me to include BOW
 199  *      and EOW constructs, originally invented by Rob Pike at
 200  *      the University of Toronto.
 201  *
 202  * References:
 203  *              Software tools                  Kernighan & Plauger
 204  *              Software tools in Pascal        Kernighan & Plauger
 205  *              Grep [rsx-11 C dist]            David Conroy
 206  *              ed - text editor                Un*x Programmer's Manual
 207  *              Advanced editing on Un*x        B. W. Kernighan
 208  *              RegExp routines                 Henry Spencer
 209  *
 210  * Notes:
 211  *
 212  *      This implementation uses a bit-set representation for character
 213  *      classes for speed and compactness. Each character is represented
 214  *      by one bit in a 128-bit block. Thus, CCL always takes a
 215  *      constant 16 bytes in the internal nfa, and RESearch::Execute does a single
 216  *      bit comparison to locate the character in the set.
 217  *
 218  * Examples:
 219  *
 220  *      pattern:        foo*.*
 221  *      compile:        CHR f CHR o CLO CHR o END CLO ANY END END
 222  *      matches:        fo foo fooo foobar fobar foxx ...
 223  *
 224  *      pattern:        fo[ob]a[rz]
 225  *      compile:        CHR f CHR o CCL bitset CHR a CCL bitset END
 226  *      matches:        fobar fooar fobaz fooaz
 227  *
 228  *      pattern:        foo\\+
 229  *      compile:        CHR f CHR o CHR o CHR \ CLO CHR \ END END
 230  *      matches:        foo\ foo\\ foo\\\  ...
 231  *
 232  *      pattern:        \(foo\)[1-3]\1  (same as foo[1-3]foo)
 233  *      compile:        BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
 234  *      matches:        foo1foo foo2foo foo3foo
 235  *
 236  *      pattern:        \(fo.*\)-\1
 237  *      compile:        BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
 238  *      matches:        foo-foo fo-fo fob-fob foobar-foobar ...
 239  */
 240
 241 #include "RESearch.h"
 242
 243 #define OKP     1
 244 #define NOP     0
 245
 246 #define CHR     1
 247 #define ANY     2
 248 #define CCL     3
 249 #define BOL     4
 250 #define EOL     5
 251 #define BOT     6
 252 #define EOT     7
 253 #define BOW     8
 254 #define EOW     9
 255 #define REF     10
 256 #define CLO     11
 257
 258 #define END     0
 259
 260 /*
 261  * The following defines are not meant to be changeable.
 262  * They are for readability only.
 263  */
 264 #define BLKIND  0170
 265 #define BITIND  07
 266
 267 #define ASCIIB  0177
 268
 269 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
 270
 271 #define badpat(x)       (*nfa = END, x)
 272
 273 RESearch::RESearch() {
 274         Init();
 275 }
 276
 277 RESearch::~RESearch() {
 278         Clear();
 279 }
 280
 281 void RESearch::Init() {
 282         sta = NOP;                      /* status of lastpat */
 283         bol = 0;
 284         for (int i=0; i<MAXTAG; i++)
 285                 pat[i] = 0;
 286         for (int j=0; j<BITBLK; j++)
 287                 bittab[j] = 0;
 288 }
 289
 290 void RESearch::Clear() {
 291         for (int i=0; i<MAXTAG; i++) {
 292                 delete []pat[i];
 293                 pat[i] = 0;
 294                 bopat[i] = NOTFOUND;
 295                 eopat[i] = NOTFOUND;
 296         }
 297 }
 298
 299 bool RESearch::GrabMatches(CharacterIndexer &ci) {
 300         bool success = true;
 301         for (unsigned int i=0; i<MAXTAG; i++) {
 302                 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
 303                         unsigned int len = eopat[i] - bopat[i];
 304                         pat[i] = new char[len + 1];
 305                         if (pat[i]) {
 306                                 for (unsigned int j=0; j<len; j++)
 307                                         pat[i][j] = ci.CharAt(bopat[i] + j);
 308                                 pat[i][len] = '\0';
 309                         } else {
 310                                 success = false;
 311                         }
 312                 }
 313         }
 314         return success;
 315 }
 316
 317 void RESearch::ChSet(char c) {
 318         bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
 319 }
 320
 321 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
 322         if (caseSensitive) {
 323                 ChSet(c);
 324         } else {
 325                 if ((c >= 'a') && (c <= 'z')) {
 326                         ChSet(c);
 327                         ChSet(static_cast<char>(c - 'a' + 'A'));
 328                 } else if ((c >= 'A') && (c <= 'Z')) {
 329                         ChSet(c);
 330                         ChSet(static_cast<char>(c - 'A' + 'a'));
 331                 } else {
 332                         ChSet(c);
 333                 }
 334         }
 335 }
 336
 337 const char escapeValue(char ch) {
 338         switch (ch) {
 339         case 'a':       return '\a';
 340         case 'b':       return '\b';
 341         case 'f':       return '\f';
 342         case 'n':       return '\n';
 343         case 'r':       return '\r';
 344         case 't':       return '\t';
 345         case 'v':       return '\v';
 346         }
 347         return 0;
 348 }
 349
 350 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
 351         char *mp=nfa;          /* nfa pointer       */
 352         char *lp;              /* saved pointer..   */
 353         char *sp=nfa;          /* another one..     */
 354     char *mpMax = mp + MAXNFA - BITBLK - 10;
 355
 356         int tagi = 0;          /* tag stack index   */
 357         int tagc = 1;          /* actual tag count  */
 358
 359         int n;
 360         char mask;              /* xor mask -CCL/NCL */
 361         int c1, c2;
 362
 363         if (!pat || !length)
 364                 if (sta)
 365                         return 0;
 366                 else
 367                         return badpat("No previous regular expression");
 368         sta = NOP;
 369
 370         const char *p=pat;               /* pattern pointer   */
 371         for (int i=0; i<length; i++, p++) {
 372                 if (mp > mpMax)
 373                         return badpat("Pattern too long");
 374                 lp = mp;
 375                 switch(*p) {
 376
 377                 case '.':               /* match any char..  */
 378                         *mp++ = ANY;
 379                         break;
 380
 381                 case '^':               /* match beginning.. */
 382                         if (p == pat)
 383                                 *mp++ = BOL;
 384                         else {
 385                                 *mp++ = CHR;
 386                                 *mp++ = *p;
 387                         }
 388                         break;
 389
 390                 case '$':               /* match endofline.. */
 391                         if (!*(p+1))
 392                                 *mp++ = EOL;
 393                         else {
 394                                 *mp++ = CHR;
 395                                 *mp++ = *p;
 396                         }
 397                         break;
 398
 399                 case '[':               /* match char class..*/
 400                         *mp++ = CCL;
 401
 402                         i++;
 403                         if (*++p == '^') {
 404                                 mask = '\377';
 405                                 i++;
 406                                 p++;
 407                         } else
 408                                 mask = 0;
 409
 410                         if (*p == '-') {                /* real dash */
 411                                 i++;
 412                                 ChSet(*p++);
 413                         }
 414                         if (*p == ']') {        /* real brace */
 415                                 i++;
 416                                 ChSet(*p++);
 417                         }
 418                         while (*p && *p != ']') {
 419                                 if (*p == '-' && *(p+1) && *(p+1) != ']') {
 420                                         i++;
 421                                         p++;
 422                                         c1 = *(p-2) + 1;
 423                                         i++;
 424                                         c2 = *p++;
 425                                         while (c1 <= c2) {
 426                                                 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
 427                                         }
 428                                 } else if (*p == '\\' && *(p+1)) {
 429                                         i++;
 430                                         p++;
 431                                         char escape = escapeValue(*p);
 432                                         if (escape)
 433                                                 ChSetWithCase(escape, caseSensitive);
 434                                         else
 435                                                 ChSetWithCase(*p, caseSensitive);
 436                                         i++;
 437                                         p++;
 438                                 } else {
 439                                         i++;
 440                                         ChSetWithCase(*p++, caseSensitive);
 441                                 }
 442                         }
 443                         if (!*p)
 444                                 return badpat("Missing ]");
 445
 446                         for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 447                                 *mp++ = static_cast<char>(mask ^ bittab[n]);
 448
 449                         break;
 450
 451                 case '*':               /* match 0 or more.. */
 452                 case '+':               /* match 1 or more.. */
 453                         if (p == pat)
 454                                 return badpat("Empty closure");
 455                         lp = sp;                /* previous opcode */
 456                         if (*lp == CLO)         /* equivalence..   */
 457                                 break;
 458                         switch(*lp) {
 459
 460                         case BOL:
 461                         case BOT:
 462                         case EOT:
 463                         case BOW:
 464                         case EOW:
 465                         case REF:
 466                                 return badpat("Illegal closure");
 467                         default:
 468                                 break;
 469                         }
 470
 471                         if (*p == '+')
 472                                 for (sp = mp; lp < sp; lp++)
 473                                         *mp++ = *lp;
 474
 475                         *mp++ = END;
 476                         *mp++ = END;
 477                         sp = mp;
 478                         while (--mp > lp)
 479                                 *mp = mp[-1];
 480                         *mp = CLO;
 481                         mp = sp;
 482                         break;
 483
 484                 case '\\':              /* tags, backrefs .. */
 485                         i++;
 486                         switch(*++p) {
 487
 488                         case '<':
 489                                 *mp++ = BOW;
 490                                 break;
 491                         case '>':
 492                                 if (*sp == BOW)
 493                                         return badpat("Null pattern inside \\<\\>");
 494                                 *mp++ = EOW;
 495                                 break;
 496                         case '1':
 497                         case '2':
 498                         case '3':
 499                         case '4':
 500                         case '5':
 501                         case '6':
 502                         case '7':
 503                         case '8':
 504                         case '9':
 505                                 n = *p-'0';
 506                                 if (tagi > 0 && tagstk[tagi] == n)
 507                                         return badpat("Cyclical reference");
 508                                 if (tagc > n) {
 509                                         *mp++ = static_cast<char>(REF);
 510                                         *mp++ = static_cast<char>(n);
 511                                 }
 512                                 else
 513                                         return badpat("Undetermined reference");
 514                                 break;
 515                         case 'a':
 516                         case 'b':
 517                         case 'n':
 518                         case 'f':
 519                         case 'r':
 520                         case 't':
 521                         case 'v':
 522                                 *mp++ = CHR;
 523                                 *mp++ = escapeValue(*p);
 524                                 break;
 525                         default:
 526                                 if (!posix && *p == '(') {
 527                                         if (tagc < MAXTAG) {
 528                                                 tagstk[++tagi] = tagc;
 529                                                 *mp++ = BOT;
 530                                                 *mp++ = static_cast<char>(tagc++);
 531                                         }
 532                                         else
 533                                                 return badpat("Too many \\(\\) pairs");
 534                                 } else if (!posix && *p == ')') {
 535                                         if (*sp == BOT)
 536                                                 return badpat("Null pattern inside \\(\\)");
 537                                         if (tagi > 0) {
 538                                                 *mp++ = static_cast<char>(EOT);
 539                                                 *mp++ = static_cast<char>(tagstk[tagi--]);
 540                                         }
 541                                         else
 542                                                 return badpat("Unmatched \\)");
 543                                 } else {
 544                                         *mp++ = CHR;
 545                                         *mp++ = *p;
 546                                 }
 547                         }
 548                         break;
 549
 550                 default :               /* an ordinary char  */
 551                         if (posix && *p == '(') {
 552                                 if (tagc < MAXTAG) {
 553                                         tagstk[++tagi] = tagc;
 554                                         *mp++ = BOT;
 555                                         *mp++ = static_cast<char>(tagc++);
 556                                 }
 557                                 else
 558                                         return badpat("Too many () pairs");
 559                         } else if (posix && *p == ')') {
 560                                 if (*sp == BOT)
 561                                         return badpat("Null pattern inside ()");
 562                                 if (tagi > 0) {
 563                                         *mp++ = static_cast<char>(EOT);
 564                                         *mp++ = static_cast<char>(tagstk[tagi--]);
 565                                 }
 566                                 else
 567                                         return badpat("Unmatched )");
 568                         } else if (caseSensitive) {
 569                                 *mp++ = CHR;
 570                                 *mp++ = *p;
 571                         } else {
 572                                 *mp++ = CCL;
 573                                 mask = 0;
 574                                 ChSetWithCase(*p, false);
 575                                 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
 576                                         *mp++ = static_cast<char>(mask ^ bittab[n]);
 577                         }
 578                         break;
 579                 }
 580                 sp = lp;
 581         }
 582         if (tagi > 0)
 583                 return badpat((posix ? "Unmatched (" : "Unmatched \\("));
 584         *mp = END;
 585         sta = OKP;
 586         return 0;
 587 }
 588
 589 /*
 590  * RESearch::Execute:
 591  *      execute nfa to find a match.
 592  *
 593  *      special cases: (nfa[0])
 594  *              BOL
 595  *                      Match only once, starting from the
 596  *                      beginning.
 597  *              CHR
 598  *                      First locate the character without
 599  *                      calling PMatch, and if found, call
 600  *                      PMatch for the remaining string.
 601  *              END
 602  *                      RESearch::Compile failed, poor luser did not
 603  *                      check for it. Fail fast.
 604  *
 605  *      If a match is found, bopat[0] and eopat[0] are set
 606  *      to the beginning and the end of the matched fragment,
 607  *      respectively.
 608  *
 609  */
 610
 611 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
 612         char c;
 613         int ep = NOTFOUND;
 614         char *ap = nfa;
 615
 616         bol = lp;
 617         failure = 0;
 618
 619         Clear();
 620
 621         switch(*ap) {
 622
 623         case BOL:                       /* anchored: match from BOL only */
 624                 ep = PMatch(ci, lp, endp, ap);
 625                 break;
 626         case EOL:                       /* just searching for end of line normal path doesn't work */
 627                 if (*(ap+1) == END) {
 628                         lp = endp;
 629                         ep = lp;
 630                         break;
 631                 } else {
 632                         return 0;
 633                 }
 634         case CHR:                       /* ordinary char: locate it fast */
 635                 c = *(ap+1);
 636                 while ((lp < endp) && (ci.CharAt(lp) != c))
 637                         lp++;
 638                 if (lp >= endp)         /* if EOS, fail, else fall thru. */
 639                         return 0;
 640         default:                        /* regular matching all the way. */
 641                 while (lp < endp) {
 642                         ep = PMatch(ci, lp, endp, ap);
 643                         if (ep != NOTFOUND)
 644                                 break;
 645                         lp++;
 646                 }
 647                 break;
 648         case END:                       /* munged automaton. fail always */
 649                 return 0;
 650         }
 651         if (ep == NOTFOUND)
 652                 return 0;
 653
 654         bopat[0] = lp;
 655         eopat[0] = ep;
 656         return 1;
 657 }
 658
 659 /*
 660  * PMatch: internal routine for the hard part
 661  *
 662  *      This code is partly snarfed from an early grep written by
 663  *      David Conroy. The backref and tag stuff, and various other
 664  *      innovations are by oz.
 665  *
 666  *      special case optimizations: (nfa[n], nfa[n+1])
 667  *              CLO ANY
 668  *                      We KNOW .* will match everything upto the
 669  *                      end of line. Thus, directly go to the end of
 670  *                      line, without recursive PMatch calls. As in
 671  *                      the other closure cases, the remaining pattern
 672  *                      must be matched by moving backwards on the
 673  *                      string recursively, to find a match for xy
 674  *                      (x is ".*" and y is the remaining pattern)
 675  *                      where the match satisfies the LONGEST match for
 676  *                      x followed by a match for y.
 677  *              CLO CHR
 678  *                      We can again scan the string forward for the
 679  *                      single char and at the point of failure, we
 680  *                      execute the remaining nfa recursively, same as
 681  *                      above.
 682  *
 683  *      At the end of a successful match, bopat[n] and eopat[n]
 684  *      are set to the beginning and end of subpatterns matched
 685  *      by tagged expressions (n = 1 to 9).
 686  *
 687  */
 688
 689 extern void re_fail(char *,char);
 690
 691 /*
 692  * character classification table for word boundary operators BOW
 693  * and EOW. the reason for not using ctype macros is that we can
 694  * let the user add into our own table. see RESearch::ModifyWord. This table
 695  * is not in the bitset form, since we may wish to extend it in the
 696  * future for other character classifications.
 697  *
 698  *      TRUE for 0-9 A-Z a-z _
 699  */
 700 static char chrtyp[MAXCHR] = {
 701         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 702         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 703         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 704         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 705         0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
 706         1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
 707         0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
 708         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 709         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 710         1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
 711         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 712         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 713         1, 1, 1, 0, 0, 0, 0, 0
 714         };
 715
 716 #define inascii(x)      (0177&(x))
 717 #define iswordc(x)      chrtyp[inascii(x)]
 718 #define isinset(x,y)    ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
 719
 720 /*
 721  * skip values for CLO XXX to skip past the closure
 722  */
 723
 724 #define ANYSKIP 2       /* [CLO] ANY END ...         */
 725 #define CHRSKIP 3       /* [CLO] CHR chr END ...     */
 726 #define CCLSKIP 34      /* [CLO] CCL 32bytes END ... */
 727
 728 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
 729         int op, c, n;
 730         int e;          /* extra pointer for CLO */
 731         int bp;         /* beginning of subpat.. */
 732         int ep;         /* ending of subpat..    */
 733         int are;                        /* to save the line ptr. */
 734
 735         while ((op = *ap++) != END)
 736                 switch(op) {
 737
 738                 case CHR:
 739                         if (ci.CharAt(lp++) != *ap++)
 740                                 return NOTFOUND;
 741                         break;
 742                 case ANY:
 743                         if (lp++ >= endp)
 744                                 return NOTFOUND;
 745                         break;
 746                 case CCL:
 747                         c = ci.CharAt(lp++);
 748                         if (!isinset(ap,c))
 749                                 return NOTFOUND;
 750                         ap += BITBLK;
 751                         break;
 752                 case BOL:
 753                         if (lp != bol)
 754                                 return NOTFOUND;
 755                         break;
 756                 case EOL:
 757                         if (lp < endp)
 758                                 return NOTFOUND;
 759                         break;
 760                 case BOT:
 761                         bopat[*ap++] = lp;
 762                         break;
 763                 case EOT:
 764                         eopat[*ap++] = lp;
 765                         break;
 766                 case BOW:
 767                         if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
 768                                 return NOTFOUND;
 769                         break;
 770                 case EOW:
 771                         if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
 772                                 return NOTFOUND;
 773                         break;
 774                 case REF:
 775                         n = *ap++;
 776                         bp = bopat[n];
 777                         ep = eopat[n];
 778                         while (bp < ep)
 779                                 if (ci.CharAt(bp++) != ci.CharAt(lp++))
 780                                         return NOTFOUND;
 781                         break;
 782                 case CLO:
 783                         are = lp;
 784                         switch(*ap) {
 785
 786                         case ANY:
 787                                 while (lp < endp)
 788                                         lp++;
 789                                 n = ANYSKIP;
 790                                 break;
 791                         case CHR:
 792                                 c = *(ap+1);
 793                                 while ((lp < endp) && (c == ci.CharAt(lp)))
 794                                         lp++;
 795                                 n = CHRSKIP;
 796                                 break;
 797                         case CCL:
 798                                 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
 799                                         lp++;
 800                                 n = CCLSKIP;
 801                                 break;
 802                         default:
 803                                 failure = true;
 804                                 //re_fail("closure: bad nfa.", *ap);
 805                                 return NOTFOUND;
 806                         }
 807
 808                         ap += n;
 809
 810                         while (lp >= are) {
 811                                 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
 812                                         return e;
 813                                 --lp;
 814                         }
 815                         return NOTFOUND;
 816                 default:
 817                         //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
 818                         return NOTFOUND;
 819                 }
 820         return lp;
 821 }
 822
 823 /*
 824  * RESearch::ModifyWord:
 825  *      add new characters into the word table to change RESearch::Execute's
 826  *      understanding of what a word should look like. Note that we
 827  *      only accept additions into the word definition.
 828  *
 829  *      If the string parameter is 0 or null string, the table is
 830  *      reset back to the default containing A-Z a-z 0-9 _. [We use
 831  *      the compact bitset representation for the default table]
 832  */
 833
 834 static char deftab[16] = {
 835         0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
 836         '\376', '\377', '\377', 007
 837 };
 838
 839 void RESearch::ModifyWord(char *s) {
 840         int i;
 841
 842         if (!s || !*s) {
 843                 for (i = 0; i < MAXCHR; i++)
 844                         if (!isinset(deftab,i))
 845                                 iswordc(i) = 0;
 846         }
 847         else
 848                 while(*s)
 849                         iswordc(*s++) = 1;
 850 }
 851
 852 /*
 853  * RESearch::Substitute:
 854  *      substitute the matched portions of the src in dst.
 855  *
 856  *      &       substitute the entire matched pattern.
 857  *
 858  *      \digit  substitute a subpattern, with the given tag number.
 859  *              Tags are numbered from 1 to 9. If the particular
 860  *              tagged subpattern does not exist, null is substituted.
 861  */
 862 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
 863         char c;
 864         int  pin;
 865         int bp;
 866         int ep;
 867
 868         if (!*src || !bopat[0])
 869                 return 0;
 870
 871         while ((c = *src++) != 0) {
 872                 switch(c) {
 873
 874                 case '&':
 875                         pin = 0;
 876                         break;
 877
 878                 case '\\':
 879                         c = *src++;
 880                         if (c >= '0' && c <= '9') {
 881                                 pin = c - '0';
 882                                 break;
 883                         }
 884
 885                 default:
 886                         *dst++ = c;
 887                         continue;
 888                 }
 889
 890                 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
 891                         while (ci.CharAt(bp) && bp < ep)
 892                                 *dst++ = ci.CharAt(bp++);
 893                         if (bp < ep)
 894                                 return 0;
 895                 }
 896         }
 897         *dst = (char) 0;
 898         return 1;
 899 }