]> git.saurik.com Git - wxWidgets.git/blob - src/stc/scintilla/src/RESearch.cxx
Updated Scintilla to 1.52 (on the trunk this time too)
[wxWidgets.git] / src / stc / scintilla / src / RESearch.cxx
1 // Scintilla source code edit control
2 /** @file RESearch.cxx
3 ** Regular expression search library.
4 **/
5
6 /*
7 * regex - Regular expression pattern matching and replacement
8 *
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
11 * York University
12 *
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
19 *
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
22 *
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
28 * matching module.
29 *
30 * Modification history:
31 *
32 * $Log$
33 * Revision 1.6 2003/04/19 19:59:49 RD
34 * Updated Scintilla to 1.52 (on the trunk this time too)
35 *
36 * Revision 1.9 2003/03/21 10:36:08 nyamatongwe
37 * Detect patterns too long in regular expression search.
38 *
39 * Revision 1.8 2003/03/04 10:53:59 nyamatongwe
40 * Patch from Jakub to optionally implement more POSIX compatible regular
41 * expressions. \(..\) changes to (..)
42 * Fixes problem where find previous would not find earlier matches on same
43 * line.
44 *
45 * Revision 1.8 2003/03/03 20:12:56 vrana
46 * Added posix syntax.
47 *
48 * Revision 1.7 2002/09/28 00:33:28 nyamatongwe
49 * Fixed problem with character ranges caused by expansion to 8 bits.
50 *
51 * Revision 1.6 2001/04/29 13:32:10 nyamatongwe
52 * Addition of new target methods - versions of ReplaceTarget that take counted
53 * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
54 * series of calls rather than a structure.
55 * Handling of \000 in search and replace.
56 * Handling of /escapes within character ranges of regular expressions.
57 * Some handling of bare ^ and $ regular expressions.
58 *
59 * Revision 1.5 2001/04/20 07:36:09 nyamatongwe
60 * Removed DEBUG code that failed to compile on GTK+.
61 *
62 * Revision 1.4 2001/04/13 03:52:13 nyamatongwe
63 * Added URL to find original code to comments.
64 *
65 * Revision 1.3 2001/04/06 12:24:21 nyamatongwe
66 * Made regular expression searching work on a line by line basis, made ^ and
67 * $ work, made [set] work, and added a case insensitive option.
68 *
69 * Revision 1.2 2001/04/05 01:58:04 nyamatongwe
70 * Replace target functionality to make find and replace operations faster
71 * by diminishing screen updates and allow for \d patterns in the replacement
72 * text.
73 *
74 * Revision 1.1 2001/04/04 12:52:44 nyamatongwe
75 * Moved to public domain regular expresion implementation.
76 *
77 * Revision 1.4 1991/10/17 03:56:42 oz
78 * miscellaneous changes, small cleanups etc.
79 *
80 * Revision 1.3 1989/04/01 14:18:09 oz
81 * Change all references to a dfa: this is actually an nfa.
82 *
83 * Revision 1.2 88/08/28 15:36:04 oz
84 * Use a complement bitmap to represent NCL.
85 * This removes the need to have seperate
86 * code in the PMatch case block - it is
87 * just CCL code now.
88 *
89 * Use the actual CCL code in the CLO
90 * section of PMatch. No need for a recursive
91 * PMatch call.
92 *
93 * Use a bitmap table to set char bits in an
94 * 8-bit chunk.
95 *
96 * Interfaces:
97 * RESearch::Compile: compile a regular expression into a NFA.
98 *
99 * char *RESearch::Compile(s)
100 * char *s;
101 *
102 * RESearch::Execute: execute the NFA to match a pattern.
103 *
104 * int RESearch::Execute(s)
105 * char *s;
106 *
107 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
108 * looks like (for \< and \>) by adding into the
109 * hidden word-syntax table.
110 *
111 * void RESearch::ModifyWord(s)
112 * char *s;
113 *
114 * RESearch::Substitute: substitute the matched portions in a new string.
115 *
116 * int RESearch::Substitute(src, dst)
117 * char *src;
118 * char *dst;
119 *
120 * re_fail: failure routine for RESearch::Execute.
121 *
122 * void re_fail(msg, op)
123 * char *msg;
124 * char op;
125 *
126 * Regular Expressions:
127 *
128 * [1] char matches itself, unless it is a special
129 * character (metachar): . \ [ ] * + ^ $
130 *
131 * [2] . matches any character.
132 *
133 * [3] \ matches the character following it, except
134 * when followed by a left or right round bracket,
135 * a digit 1 to 9 or a left or right angle bracket.
136 * (see [7], [8] and [9])
137 * It is used as an escape character for all
138 * other meta-characters, and itself. When used
139 * in a set ([4]), it is treated as an ordinary
140 * character.
141 *
142 * [4] [set] matches one of the characters in the set.
143 * If the first character in the set is "^",
144 * it matches a character NOT in the set, i.e.
145 * complements the set. A shorthand S-E is
146 * used to specify a set of characters S upto
147 * E, inclusive. The special characters "]" and
148 * "-" have no special meaning if they appear
149 * as the first chars in the set.
150 * examples: match:
151 *
152 * [a-z] any lowercase alpha
153 *
154 * [^]-] any char except ] and -
155 *
156 * [^A-Z] any char except uppercase
157 * alpha
158 *
159 * [a-zA-Z] any alpha
160 *
161 * [5] * any regular expression form [1] to [4], followed by
162 * closure char (*) matches zero or more matches of
163 * that form.
164 *
165 * [6] + same as [5], except it matches one or more.
166 *
167 * [7] a regular expression in the form [1] to [10], enclosed
168 * as \(form\) matches what form matches. The enclosure
169 * creates a set of tags, used for [8] and for
170 * pattern substution. The tagged forms are numbered
171 * starting from 1.
172 *
173 * [8] a \ followed by a digit 1 to 9 matches whatever a
174 * previously tagged regular expression ([7]) matched.
175 *
176 * [9] \< a regular expression starting with a \< construct
177 * \> and/or ending with a \> construct, restricts the
178 * pattern matching to the beginning of a word, and/or
179 * the end of a word. A word is defined to be a character
180 * string beginning and/or ending with the characters
181 * A-Z a-z 0-9 and _. It must also be preceded and/or
182 * followed by any character outside those mentioned.
183 *
184 * [10] a composite regular expression xy where x and y
185 * are in the form [1] to [10] matches the longest
186 * match of x followed by a match for y.
187 *
188 * [11] ^ a regular expression starting with a ^ character
189 * $ and/or ending with a $ character, restricts the
190 * pattern matching to the beginning of the line,
191 * or the end of line. [anchors] Elsewhere in the
192 * pattern, ^ and $ are treated as ordinary characters.
193 *
194 *
195 * Acknowledgements:
196 *
197 * HCR's Hugh Redelmeier has been most helpful in various
198 * stages of development. He convinced me to include BOW
199 * and EOW constructs, originally invented by Rob Pike at
200 * the University of Toronto.
201 *
202 * References:
203 * Software tools Kernighan & Plauger
204 * Software tools in Pascal Kernighan & Plauger
205 * Grep [rsx-11 C dist] David Conroy
206 * ed - text editor Un*x Programmer's Manual
207 * Advanced editing on Un*x B. W. Kernighan
208 * RegExp routines Henry Spencer
209 *
210 * Notes:
211 *
212 * This implementation uses a bit-set representation for character
213 * classes for speed and compactness. Each character is represented
214 * by one bit in a 128-bit block. Thus, CCL always takes a
215 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
216 * bit comparison to locate the character in the set.
217 *
218 * Examples:
219 *
220 * pattern: foo*.*
221 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
222 * matches: fo foo fooo foobar fobar foxx ...
223 *
224 * pattern: fo[ob]a[rz]
225 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
226 * matches: fobar fooar fobaz fooaz
227 *
228 * pattern: foo\\+
229 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
230 * matches: foo\ foo\\ foo\\\ ...
231 *
232 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
233 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
234 * matches: foo1foo foo2foo foo3foo
235 *
236 * pattern: \(fo.*\)-\1
237 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
238 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
239 */
240
241 #include "RESearch.h"
242
243 #define OKP 1
244 #define NOP 0
245
246 #define CHR 1
247 #define ANY 2
248 #define CCL 3
249 #define BOL 4
250 #define EOL 5
251 #define BOT 6
252 #define EOT 7
253 #define BOW 8
254 #define EOW 9
255 #define REF 10
256 #define CLO 11
257
258 #define END 0
259
260 /*
261 * The following defines are not meant to be changeable.
262 * They are for readability only.
263 */
264 #define BLKIND 0170
265 #define BITIND 07
266
267 #define ASCIIB 0177
268
269 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
270
271 #define badpat(x) (*nfa = END, x)
272
273 RESearch::RESearch() {
274 Init();
275 }
276
277 RESearch::~RESearch() {
278 Clear();
279 }
280
281 void RESearch::Init() {
282 sta = NOP; /* status of lastpat */
283 bol = 0;
284 for (int i=0; i<MAXTAG; i++)
285 pat[i] = 0;
286 for (int j=0; j<BITBLK; j++)
287 bittab[j] = 0;
288 }
289
290 void RESearch::Clear() {
291 for (int i=0; i<MAXTAG; i++) {
292 delete []pat[i];
293 pat[i] = 0;
294 bopat[i] = NOTFOUND;
295 eopat[i] = NOTFOUND;
296 }
297 }
298
299 bool RESearch::GrabMatches(CharacterIndexer &ci) {
300 bool success = true;
301 for (unsigned int i=0; i<MAXTAG; i++) {
302 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
303 unsigned int len = eopat[i] - bopat[i];
304 pat[i] = new char[len + 1];
305 if (pat[i]) {
306 for (unsigned int j=0; j<len; j++)
307 pat[i][j] = ci.CharAt(bopat[i] + j);
308 pat[i][len] = '\0';
309 } else {
310 success = false;
311 }
312 }
313 }
314 return success;
315 }
316
317 void RESearch::ChSet(char c) {
318 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
319 }
320
321 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
322 if (caseSensitive) {
323 ChSet(c);
324 } else {
325 if ((c >= 'a') && (c <= 'z')) {
326 ChSet(c);
327 ChSet(static_cast<char>(c - 'a' + 'A'));
328 } else if ((c >= 'A') && (c <= 'Z')) {
329 ChSet(c);
330 ChSet(static_cast<char>(c - 'A' + 'a'));
331 } else {
332 ChSet(c);
333 }
334 }
335 }
336
337 const char escapeValue(char ch) {
338 switch (ch) {
339 case 'a': return '\a';
340 case 'b': return '\b';
341 case 'f': return '\f';
342 case 'n': return '\n';
343 case 'r': return '\r';
344 case 't': return '\t';
345 case 'v': return '\v';
346 }
347 return 0;
348 }
349
350 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
351 char *mp=nfa; /* nfa pointer */
352 char *lp; /* saved pointer.. */
353 char *sp=nfa; /* another one.. */
354 char *mpMax = mp + MAXNFA - BITBLK - 10;
355
356 int tagi = 0; /* tag stack index */
357 int tagc = 1; /* actual tag count */
358
359 int n;
360 char mask; /* xor mask -CCL/NCL */
361 int c1, c2;
362
363 if (!pat || !length)
364 if (sta)
365 return 0;
366 else
367 return badpat("No previous regular expression");
368 sta = NOP;
369
370 const char *p=pat; /* pattern pointer */
371 for (int i=0; i<length; i++, p++) {
372 if (mp > mpMax)
373 return badpat("Pattern too long");
374 lp = mp;
375 switch(*p) {
376
377 case '.': /* match any char.. */
378 *mp++ = ANY;
379 break;
380
381 case '^': /* match beginning.. */
382 if (p == pat)
383 *mp++ = BOL;
384 else {
385 *mp++ = CHR;
386 *mp++ = *p;
387 }
388 break;
389
390 case '$': /* match endofline.. */
391 if (!*(p+1))
392 *mp++ = EOL;
393 else {
394 *mp++ = CHR;
395 *mp++ = *p;
396 }
397 break;
398
399 case '[': /* match char class..*/
400 *mp++ = CCL;
401
402 i++;
403 if (*++p == '^') {
404 mask = '\377';
405 i++;
406 p++;
407 } else
408 mask = 0;
409
410 if (*p == '-') { /* real dash */
411 i++;
412 ChSet(*p++);
413 }
414 if (*p == ']') { /* real brace */
415 i++;
416 ChSet(*p++);
417 }
418 while (*p && *p != ']') {
419 if (*p == '-' && *(p+1) && *(p+1) != ']') {
420 i++;
421 p++;
422 c1 = *(p-2) + 1;
423 i++;
424 c2 = *p++;
425 while (c1 <= c2) {
426 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
427 }
428 } else if (*p == '\\' && *(p+1)) {
429 i++;
430 p++;
431 char escape = escapeValue(*p);
432 if (escape)
433 ChSetWithCase(escape, caseSensitive);
434 else
435 ChSetWithCase(*p, caseSensitive);
436 i++;
437 p++;
438 } else {
439 i++;
440 ChSetWithCase(*p++, caseSensitive);
441 }
442 }
443 if (!*p)
444 return badpat("Missing ]");
445
446 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
447 *mp++ = static_cast<char>(mask ^ bittab[n]);
448
449 break;
450
451 case '*': /* match 0 or more.. */
452 case '+': /* match 1 or more.. */
453 if (p == pat)
454 return badpat("Empty closure");
455 lp = sp; /* previous opcode */
456 if (*lp == CLO) /* equivalence.. */
457 break;
458 switch(*lp) {
459
460 case BOL:
461 case BOT:
462 case EOT:
463 case BOW:
464 case EOW:
465 case REF:
466 return badpat("Illegal closure");
467 default:
468 break;
469 }
470
471 if (*p == '+')
472 for (sp = mp; lp < sp; lp++)
473 *mp++ = *lp;
474
475 *mp++ = END;
476 *mp++ = END;
477 sp = mp;
478 while (--mp > lp)
479 *mp = mp[-1];
480 *mp = CLO;
481 mp = sp;
482 break;
483
484 case '\\': /* tags, backrefs .. */
485 i++;
486 switch(*++p) {
487
488 case '<':
489 *mp++ = BOW;
490 break;
491 case '>':
492 if (*sp == BOW)
493 return badpat("Null pattern inside \\<\\>");
494 *mp++ = EOW;
495 break;
496 case '1':
497 case '2':
498 case '3':
499 case '4':
500 case '5':
501 case '6':
502 case '7':
503 case '8':
504 case '9':
505 n = *p-'0';
506 if (tagi > 0 && tagstk[tagi] == n)
507 return badpat("Cyclical reference");
508 if (tagc > n) {
509 *mp++ = static_cast<char>(REF);
510 *mp++ = static_cast<char>(n);
511 }
512 else
513 return badpat("Undetermined reference");
514 break;
515 case 'a':
516 case 'b':
517 case 'n':
518 case 'f':
519 case 'r':
520 case 't':
521 case 'v':
522 *mp++ = CHR;
523 *mp++ = escapeValue(*p);
524 break;
525 default:
526 if (!posix && *p == '(') {
527 if (tagc < MAXTAG) {
528 tagstk[++tagi] = tagc;
529 *mp++ = BOT;
530 *mp++ = static_cast<char>(tagc++);
531 }
532 else
533 return badpat("Too many \\(\\) pairs");
534 } else if (!posix && *p == ')') {
535 if (*sp == BOT)
536 return badpat("Null pattern inside \\(\\)");
537 if (tagi > 0) {
538 *mp++ = static_cast<char>(EOT);
539 *mp++ = static_cast<char>(tagstk[tagi--]);
540 }
541 else
542 return badpat("Unmatched \\)");
543 } else {
544 *mp++ = CHR;
545 *mp++ = *p;
546 }
547 }
548 break;
549
550 default : /* an ordinary char */
551 if (posix && *p == '(') {
552 if (tagc < MAXTAG) {
553 tagstk[++tagi] = tagc;
554 *mp++ = BOT;
555 *mp++ = static_cast<char>(tagc++);
556 }
557 else
558 return badpat("Too many () pairs");
559 } else if (posix && *p == ')') {
560 if (*sp == BOT)
561 return badpat("Null pattern inside ()");
562 if (tagi > 0) {
563 *mp++ = static_cast<char>(EOT);
564 *mp++ = static_cast<char>(tagstk[tagi--]);
565 }
566 else
567 return badpat("Unmatched )");
568 } else if (caseSensitive) {
569 *mp++ = CHR;
570 *mp++ = *p;
571 } else {
572 *mp++ = CCL;
573 mask = 0;
574 ChSetWithCase(*p, false);
575 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
576 *mp++ = static_cast<char>(mask ^ bittab[n]);
577 }
578 break;
579 }
580 sp = lp;
581 }
582 if (tagi > 0)
583 return badpat((posix ? "Unmatched (" : "Unmatched \\("));
584 *mp = END;
585 sta = OKP;
586 return 0;
587 }
588
589 /*
590 * RESearch::Execute:
591 * execute nfa to find a match.
592 *
593 * special cases: (nfa[0])
594 * BOL
595 * Match only once, starting from the
596 * beginning.
597 * CHR
598 * First locate the character without
599 * calling PMatch, and if found, call
600 * PMatch for the remaining string.
601 * END
602 * RESearch::Compile failed, poor luser did not
603 * check for it. Fail fast.
604 *
605 * If a match is found, bopat[0] and eopat[0] are set
606 * to the beginning and the end of the matched fragment,
607 * respectively.
608 *
609 */
610
611 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
612 char c;
613 int ep = NOTFOUND;
614 char *ap = nfa;
615
616 bol = lp;
617 failure = 0;
618
619 Clear();
620
621 switch(*ap) {
622
623 case BOL: /* anchored: match from BOL only */
624 ep = PMatch(ci, lp, endp, ap);
625 break;
626 case EOL: /* just searching for end of line normal path doesn't work */
627 if (*(ap+1) == END) {
628 lp = endp;
629 ep = lp;
630 break;
631 } else {
632 return 0;
633 }
634 case CHR: /* ordinary char: locate it fast */
635 c = *(ap+1);
636 while ((lp < endp) && (ci.CharAt(lp) != c))
637 lp++;
638 if (lp >= endp) /* if EOS, fail, else fall thru. */
639 return 0;
640 default: /* regular matching all the way. */
641 while (lp < endp) {
642 ep = PMatch(ci, lp, endp, ap);
643 if (ep != NOTFOUND)
644 break;
645 lp++;
646 }
647 break;
648 case END: /* munged automaton. fail always */
649 return 0;
650 }
651 if (ep == NOTFOUND)
652 return 0;
653
654 bopat[0] = lp;
655 eopat[0] = ep;
656 return 1;
657 }
658
659 /*
660 * PMatch: internal routine for the hard part
661 *
662 * This code is partly snarfed from an early grep written by
663 * David Conroy. The backref and tag stuff, and various other
664 * innovations are by oz.
665 *
666 * special case optimizations: (nfa[n], nfa[n+1])
667 * CLO ANY
668 * We KNOW .* will match everything upto the
669 * end of line. Thus, directly go to the end of
670 * line, without recursive PMatch calls. As in
671 * the other closure cases, the remaining pattern
672 * must be matched by moving backwards on the
673 * string recursively, to find a match for xy
674 * (x is ".*" and y is the remaining pattern)
675 * where the match satisfies the LONGEST match for
676 * x followed by a match for y.
677 * CLO CHR
678 * We can again scan the string forward for the
679 * single char and at the point of failure, we
680 * execute the remaining nfa recursively, same as
681 * above.
682 *
683 * At the end of a successful match, bopat[n] and eopat[n]
684 * are set to the beginning and end of subpatterns matched
685 * by tagged expressions (n = 1 to 9).
686 *
687 */
688
689 extern void re_fail(char *,char);
690
691 /*
692 * character classification table for word boundary operators BOW
693 * and EOW. the reason for not using ctype macros is that we can
694 * let the user add into our own table. see RESearch::ModifyWord. This table
695 * is not in the bitset form, since we may wish to extend it in the
696 * future for other character classifications.
697 *
698 * TRUE for 0-9 A-Z a-z _
699 */
700 static char chrtyp[MAXCHR] = {
701 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
702 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
703 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
704 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
705 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
706 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
707 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
708 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
709 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
710 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
711 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
712 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
713 1, 1, 1, 0, 0, 0, 0, 0
714 };
715
716 #define inascii(x) (0177&(x))
717 #define iswordc(x) chrtyp[inascii(x)]
718 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
719
720 /*
721 * skip values for CLO XXX to skip past the closure
722 */
723
724 #define ANYSKIP 2 /* [CLO] ANY END ... */
725 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
726 #define CCLSKIP 34 /* [CLO] CCL 32bytes END ... */
727
728 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
729 int op, c, n;
730 int e; /* extra pointer for CLO */
731 int bp; /* beginning of subpat.. */
732 int ep; /* ending of subpat.. */
733 int are; /* to save the line ptr. */
734
735 while ((op = *ap++) != END)
736 switch(op) {
737
738 case CHR:
739 if (ci.CharAt(lp++) != *ap++)
740 return NOTFOUND;
741 break;
742 case ANY:
743 if (lp++ >= endp)
744 return NOTFOUND;
745 break;
746 case CCL:
747 c = ci.CharAt(lp++);
748 if (!isinset(ap,c))
749 return NOTFOUND;
750 ap += BITBLK;
751 break;
752 case BOL:
753 if (lp != bol)
754 return NOTFOUND;
755 break;
756 case EOL:
757 if (lp < endp)
758 return NOTFOUND;
759 break;
760 case BOT:
761 bopat[*ap++] = lp;
762 break;
763 case EOT:
764 eopat[*ap++] = lp;
765 break;
766 case BOW:
767 if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
768 return NOTFOUND;
769 break;
770 case EOW:
771 if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
772 return NOTFOUND;
773 break;
774 case REF:
775 n = *ap++;
776 bp = bopat[n];
777 ep = eopat[n];
778 while (bp < ep)
779 if (ci.CharAt(bp++) != ci.CharAt(lp++))
780 return NOTFOUND;
781 break;
782 case CLO:
783 are = lp;
784 switch(*ap) {
785
786 case ANY:
787 while (lp < endp)
788 lp++;
789 n = ANYSKIP;
790 break;
791 case CHR:
792 c = *(ap+1);
793 while ((lp < endp) && (c == ci.CharAt(lp)))
794 lp++;
795 n = CHRSKIP;
796 break;
797 case CCL:
798 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
799 lp++;
800 n = CCLSKIP;
801 break;
802 default:
803 failure = true;
804 //re_fail("closure: bad nfa.", *ap);
805 return NOTFOUND;
806 }
807
808 ap += n;
809
810 while (lp >= are) {
811 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
812 return e;
813 --lp;
814 }
815 return NOTFOUND;
816 default:
817 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
818 return NOTFOUND;
819 }
820 return lp;
821 }
822
823 /*
824 * RESearch::ModifyWord:
825 * add new characters into the word table to change RESearch::Execute's
826 * understanding of what a word should look like. Note that we
827 * only accept additions into the word definition.
828 *
829 * If the string parameter is 0 or null string, the table is
830 * reset back to the default containing A-Z a-z 0-9 _. [We use
831 * the compact bitset representation for the default table]
832 */
833
834 static char deftab[16] = {
835 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
836 '\376', '\377', '\377', 007
837 };
838
839 void RESearch::ModifyWord(char *s) {
840 int i;
841
842 if (!s || !*s) {
843 for (i = 0; i < MAXCHR; i++)
844 if (!isinset(deftab,i))
845 iswordc(i) = 0;
846 }
847 else
848 while(*s)
849 iswordc(*s++) = 1;
850 }
851
852 /*
853 * RESearch::Substitute:
854 * substitute the matched portions of the src in dst.
855 *
856 * & substitute the entire matched pattern.
857 *
858 * \digit substitute a subpattern, with the given tag number.
859 * Tags are numbered from 1 to 9. If the particular
860 * tagged subpattern does not exist, null is substituted.
861 */
862 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
863 char c;
864 int pin;
865 int bp;
866 int ep;
867
868 if (!*src || !bopat[0])
869 return 0;
870
871 while ((c = *src++) != 0) {
872 switch(c) {
873
874 case '&':
875 pin = 0;
876 break;
877
878 case '\\':
879 c = *src++;
880 if (c >= '0' && c <= '9') {
881 pin = c - '0';
882 break;
883 }
884
885 default:
886 *dst++ = c;
887 continue;
888 }
889
890 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
891 while (ci.CharAt(bp) && bp < ep)
892 *dst++ = ci.CharAt(bp++);
893 if (bp < ep)
894 return 0;
895 }
896 }
897 *dst = (char) 0;
898 return 1;
899 }