]> git.saurik.com Git - wxWidgets.git/blob - src/stc/scintilla/src/RESearch.cxx
Added 'Back' to accelerator possibilities
[wxWidgets.git] / src / stc / scintilla / src / RESearch.cxx
1 // Scintilla source code edit control
2 /** @file RESearch.cxx
3 ** Regular expression search library.
4 **/
5
6 /*
7 * regex - Regular expression pattern matching and replacement
8 *
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
11 * York University
12 *
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
19 *
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
22 *
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
28 * matching module.
29 *
30 * Modification history:
31 *
32 * $Log$
33 * Revision 1.8 2003/09/18 05:05:38 RD
34 * Updated to Scintilla 1.54
35 * Applied most of patch #806092
36 * Added new wxSTC sample from Otto Wyss
37 *
38 * Revision 1.9 2003/03/21 10:36:08 nyamatongwe
39 * Detect patterns too long in regular expression search.
40 *
41 * Revision 1.8 2003/03/04 10:53:59 nyamatongwe
42 * Patch from Jakub to optionally implement more POSIX compatible regular
43 * expressions. \(..\) changes to (..)
44 * Fixes problem where find previous would not find earlier matches on same
45 * line.
46 *
47 * Revision 1.8 2003/03/03 20:12:56 vrana
48 * Added posix syntax.
49 *
50 * Revision 1.7 2002/09/28 00:33:28 nyamatongwe
51 * Fixed problem with character ranges caused by expansion to 8 bits.
52 *
53 * Revision 1.6 2001/04/29 13:32:10 nyamatongwe
54 * Addition of new target methods - versions of ReplaceTarget that take counted
55 * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
56 * series of calls rather than a structure.
57 * Handling of \000 in search and replace.
58 * Handling of /escapes within character ranges of regular expressions.
59 * Some handling of bare ^ and $ regular expressions.
60 *
61 * Revision 1.5 2001/04/20 07:36:09 nyamatongwe
62 * Removed DEBUG code that failed to compile on GTK+.
63 *
64 * Revision 1.4 2001/04/13 03:52:13 nyamatongwe
65 * Added URL to find original code to comments.
66 *
67 * Revision 1.3 2001/04/06 12:24:21 nyamatongwe
68 * Made regular expression searching work on a line by line basis, made ^ and
69 * $ work, made [set] work, and added a case insensitive option.
70 *
71 * Revision 1.2 2001/04/05 01:58:04 nyamatongwe
72 * Replace target functionality to make find and replace operations faster
73 * by diminishing screen updates and allow for \d patterns in the replacement
74 * text.
75 *
76 * Revision 1.1 2001/04/04 12:52:44 nyamatongwe
77 * Moved to public domain regular expresion implementation.
78 *
79 * Revision 1.4 1991/10/17 03:56:42 oz
80 * miscellaneous changes, small cleanups etc.
81 *
82 * Revision 1.3 1989/04/01 14:18:09 oz
83 * Change all references to a dfa: this is actually an nfa.
84 *
85 * Revision 1.2 88/08/28 15:36:04 oz
86 * Use a complement bitmap to represent NCL.
87 * This removes the need to have seperate
88 * code in the PMatch case block - it is
89 * just CCL code now.
90 *
91 * Use the actual CCL code in the CLO
92 * section of PMatch. No need for a recursive
93 * PMatch call.
94 *
95 * Use a bitmap table to set char bits in an
96 * 8-bit chunk.
97 *
98 * Interfaces:
99 * RESearch::Compile: compile a regular expression into a NFA.
100 *
101 * char *RESearch::Compile(s)
102 * char *s;
103 *
104 * RESearch::Execute: execute the NFA to match a pattern.
105 *
106 * int RESearch::Execute(s)
107 * char *s;
108 *
109 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
110 * looks like (for \< and \>) by adding into the
111 * hidden word-syntax table.
112 *
113 * void RESearch::ModifyWord(s)
114 * char *s;
115 *
116 * RESearch::Substitute: substitute the matched portions in a new string.
117 *
118 * int RESearch::Substitute(src, dst)
119 * char *src;
120 * char *dst;
121 *
122 * re_fail: failure routine for RESearch::Execute.
123 *
124 * void re_fail(msg, op)
125 * char *msg;
126 * char op;
127 *
128 * Regular Expressions:
129 *
130 * [1] char matches itself, unless it is a special
131 * character (metachar): . \ [ ] * + ^ $
132 *
133 * [2] . matches any character.
134 *
135 * [3] \ matches the character following it, except
136 * when followed by a left or right round bracket,
137 * a digit 1 to 9 or a left or right angle bracket.
138 * (see [7], [8] and [9])
139 * It is used as an escape character for all
140 * other meta-characters, and itself. When used
141 * in a set ([4]), it is treated as an ordinary
142 * character.
143 *
144 * [4] [set] matches one of the characters in the set.
145 * If the first character in the set is "^",
146 * it matches a character NOT in the set, i.e.
147 * complements the set. A shorthand S-E is
148 * used to specify a set of characters S upto
149 * E, inclusive. The special characters "]" and
150 * "-" have no special meaning if they appear
151 * as the first chars in the set.
152 * examples: match:
153 *
154 * [a-z] any lowercase alpha
155 *
156 * [^]-] any char except ] and -
157 *
158 * [^A-Z] any char except uppercase
159 * alpha
160 *
161 * [a-zA-Z] any alpha
162 *
163 * [5] * any regular expression form [1] to [4], followed by
164 * closure char (*) matches zero or more matches of
165 * that form.
166 *
167 * [6] + same as [5], except it matches one or more.
168 *
169 * [7] a regular expression in the form [1] to [10], enclosed
170 * as \(form\) matches what form matches. The enclosure
171 * creates a set of tags, used for [8] and for
172 * pattern substution. The tagged forms are numbered
173 * starting from 1.
174 *
175 * [8] a \ followed by a digit 1 to 9 matches whatever a
176 * previously tagged regular expression ([7]) matched.
177 *
178 * [9] \< a regular expression starting with a \< construct
179 * \> and/or ending with a \> construct, restricts the
180 * pattern matching to the beginning of a word, and/or
181 * the end of a word. A word is defined to be a character
182 * string beginning and/or ending with the characters
183 * A-Z a-z 0-9 and _. It must also be preceded and/or
184 * followed by any character outside those mentioned.
185 *
186 * [10] a composite regular expression xy where x and y
187 * are in the form [1] to [10] matches the longest
188 * match of x followed by a match for y.
189 *
190 * [11] ^ a regular expression starting with a ^ character
191 * $ and/or ending with a $ character, restricts the
192 * pattern matching to the beginning of the line,
193 * or the end of line. [anchors] Elsewhere in the
194 * pattern, ^ and $ are treated as ordinary characters.
195 *
196 *
197 * Acknowledgements:
198 *
199 * HCR's Hugh Redelmeier has been most helpful in various
200 * stages of development. He convinced me to include BOW
201 * and EOW constructs, originally invented by Rob Pike at
202 * the University of Toronto.
203 *
204 * References:
205 * Software tools Kernighan & Plauger
206 * Software tools in Pascal Kernighan & Plauger
207 * Grep [rsx-11 C dist] David Conroy
208 * ed - text editor Un*x Programmer's Manual
209 * Advanced editing on Un*x B. W. Kernighan
210 * RegExp routines Henry Spencer
211 *
212 * Notes:
213 *
214 * This implementation uses a bit-set representation for character
215 * classes for speed and compactness. Each character is represented
216 * by one bit in a 128-bit block. Thus, CCL always takes a
217 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
218 * bit comparison to locate the character in the set.
219 *
220 * Examples:
221 *
222 * pattern: foo*.*
223 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
224 * matches: fo foo fooo foobar fobar foxx ...
225 *
226 * pattern: fo[ob]a[rz]
227 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
228 * matches: fobar fooar fobaz fooaz
229 *
230 * pattern: foo\\+
231 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
232 * matches: foo\ foo\\ foo\\\ ...
233 *
234 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
235 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
236 * matches: foo1foo foo2foo foo3foo
237 *
238 * pattern: \(fo.*\)-\1
239 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
240 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
241 */
242
243 #include "RESearch.h"
244
245 #define OKP 1
246 #define NOP 0
247
248 #define CHR 1
249 #define ANY 2
250 #define CCL 3
251 #define BOL 4
252 #define EOL 5
253 #define BOT 6
254 #define EOT 7
255 #define BOW 8
256 #define EOW 9
257 #define REF 10
258 #define CLO 11
259
260 #define END 0
261
262 /*
263 * The following defines are not meant to be changeable.
264 * They are for readability only.
265 */
266 #define BLKIND 0170
267 #define BITIND 07
268
269 #define ASCIIB 0177
270
271 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
272
273 #define badpat(x) (*nfa = END, x)
274
275 RESearch::RESearch() {
276 Init();
277 }
278
279 RESearch::~RESearch() {
280 Clear();
281 }
282
283 void RESearch::Init() {
284 sta = NOP; /* status of lastpat */
285 bol = 0;
286 for (int i=0; i<MAXTAG; i++)
287 pat[i] = 0;
288 for (int j=0; j<BITBLK; j++)
289 bittab[j] = 0;
290 }
291
292 void RESearch::Clear() {
293 for (int i=0; i<MAXTAG; i++) {
294 delete []pat[i];
295 pat[i] = 0;
296 bopat[i] = NOTFOUND;
297 eopat[i] = NOTFOUND;
298 }
299 }
300
301 bool RESearch::GrabMatches(CharacterIndexer &ci) {
302 bool success = true;
303 for (unsigned int i=0; i<MAXTAG; i++) {
304 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
305 unsigned int len = eopat[i] - bopat[i];
306 pat[i] = new char[len + 1];
307 if (pat[i]) {
308 for (unsigned int j=0; j<len; j++)
309 pat[i][j] = ci.CharAt(bopat[i] + j);
310 pat[i][len] = '\0';
311 } else {
312 success = false;
313 }
314 }
315 }
316 return success;
317 }
318
319 void RESearch::ChSet(char c) {
320 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
321 }
322
323 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
324 if (caseSensitive) {
325 ChSet(c);
326 } else {
327 if ((c >= 'a') && (c <= 'z')) {
328 ChSet(c);
329 ChSet(static_cast<char>(c - 'a' + 'A'));
330 } else if ((c >= 'A') && (c <= 'Z')) {
331 ChSet(c);
332 ChSet(static_cast<char>(c - 'A' + 'a'));
333 } else {
334 ChSet(c);
335 }
336 }
337 }
338
339 const char escapeValue(char ch) {
340 switch (ch) {
341 case 'a': return '\a';
342 case 'b': return '\b';
343 case 'f': return '\f';
344 case 'n': return '\n';
345 case 'r': return '\r';
346 case 't': return '\t';
347 case 'v': return '\v';
348 }
349 return 0;
350 }
351
352 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
353 char *mp=nfa; /* nfa pointer */
354 char *lp; /* saved pointer.. */
355 char *sp=nfa; /* another one.. */
356 char *mpMax = mp + MAXNFA - BITBLK - 10;
357
358 int tagi = 0; /* tag stack index */
359 int tagc = 1; /* actual tag count */
360
361 int n;
362 char mask; /* xor mask -CCL/NCL */
363 int c1, c2;
364
365 if (!pat || !length)
366 if (sta)
367 return 0;
368 else
369 return badpat("No previous regular expression");
370 sta = NOP;
371
372 const char *p=pat; /* pattern pointer */
373 for (int i=0; i<length; i++, p++) {
374 if (mp > mpMax)
375 return badpat("Pattern too long");
376 lp = mp;
377 switch(*p) {
378
379 case '.': /* match any char.. */
380 *mp++ = ANY;
381 break;
382
383 case '^': /* match beginning.. */
384 if (p == pat)
385 *mp++ = BOL;
386 else {
387 *mp++ = CHR;
388 *mp++ = *p;
389 }
390 break;
391
392 case '$': /* match endofline.. */
393 if (!*(p+1))
394 *mp++ = EOL;
395 else {
396 *mp++ = CHR;
397 *mp++ = *p;
398 }
399 break;
400
401 case '[': /* match char class..*/
402 *mp++ = CCL;
403
404 i++;
405 if (*++p == '^') {
406 mask = '\377';
407 i++;
408 p++;
409 } else
410 mask = 0;
411
412 if (*p == '-') { /* real dash */
413 i++;
414 ChSet(*p++);
415 }
416 if (*p == ']') { /* real brace */
417 i++;
418 ChSet(*p++);
419 }
420 while (*p && *p != ']') {
421 if (*p == '-' && *(p+1) && *(p+1) != ']') {
422 i++;
423 p++;
424 c1 = *(p-2) + 1;
425 i++;
426 c2 = *p++;
427 while (c1 <= c2) {
428 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
429 }
430 } else if (*p == '\\' && *(p+1)) {
431 i++;
432 p++;
433 char escape = escapeValue(*p);
434 if (escape)
435 ChSetWithCase(escape, caseSensitive);
436 else
437 ChSetWithCase(*p, caseSensitive);
438 i++;
439 p++;
440 } else {
441 i++;
442 ChSetWithCase(*p++, caseSensitive);
443 }
444 }
445 if (!*p)
446 return badpat("Missing ]");
447
448 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
449 *mp++ = static_cast<char>(mask ^ bittab[n]);
450
451 break;
452
453 case '*': /* match 0 or more.. */
454 case '+': /* match 1 or more.. */
455 if (p == pat)
456 return badpat("Empty closure");
457 lp = sp; /* previous opcode */
458 if (*lp == CLO) /* equivalence.. */
459 break;
460 switch(*lp) {
461
462 case BOL:
463 case BOT:
464 case EOT:
465 case BOW:
466 case EOW:
467 case REF:
468 return badpat("Illegal closure");
469 default:
470 break;
471 }
472
473 if (*p == '+')
474 for (sp = mp; lp < sp; lp++)
475 *mp++ = *lp;
476
477 *mp++ = END;
478 *mp++ = END;
479 sp = mp;
480 while (--mp > lp)
481 *mp = mp[-1];
482 *mp = CLO;
483 mp = sp;
484 break;
485
486 case '\\': /* tags, backrefs .. */
487 i++;
488 switch(*++p) {
489
490 case '<':
491 *mp++ = BOW;
492 break;
493 case '>':
494 if (*sp == BOW)
495 return badpat("Null pattern inside \\<\\>");
496 *mp++ = EOW;
497 break;
498 case '1':
499 case '2':
500 case '3':
501 case '4':
502 case '5':
503 case '6':
504 case '7':
505 case '8':
506 case '9':
507 n = *p-'0';
508 if (tagi > 0 && tagstk[tagi] == n)
509 return badpat("Cyclical reference");
510 if (tagc > n) {
511 *mp++ = static_cast<char>(REF);
512 *mp++ = static_cast<char>(n);
513 }
514 else
515 return badpat("Undetermined reference");
516 break;
517 case 'a':
518 case 'b':
519 case 'n':
520 case 'f':
521 case 'r':
522 case 't':
523 case 'v':
524 *mp++ = CHR;
525 *mp++ = escapeValue(*p);
526 break;
527 default:
528 if (!posix && *p == '(') {
529 if (tagc < MAXTAG) {
530 tagstk[++tagi] = tagc;
531 *mp++ = BOT;
532 *mp++ = static_cast<char>(tagc++);
533 }
534 else
535 return badpat("Too many \\(\\) pairs");
536 } else if (!posix && *p == ')') {
537 if (*sp == BOT)
538 return badpat("Null pattern inside \\(\\)");
539 if (tagi > 0) {
540 *mp++ = static_cast<char>(EOT);
541 *mp++ = static_cast<char>(tagstk[tagi--]);
542 }
543 else
544 return badpat("Unmatched \\)");
545 } else {
546 *mp++ = CHR;
547 *mp++ = *p;
548 }
549 }
550 break;
551
552 default : /* an ordinary char */
553 if (posix && *p == '(') {
554 if (tagc < MAXTAG) {
555 tagstk[++tagi] = tagc;
556 *mp++ = BOT;
557 *mp++ = static_cast<char>(tagc++);
558 }
559 else
560 return badpat("Too many () pairs");
561 } else if (posix && *p == ')') {
562 if (*sp == BOT)
563 return badpat("Null pattern inside ()");
564 if (tagi > 0) {
565 *mp++ = static_cast<char>(EOT);
566 *mp++ = static_cast<char>(tagstk[tagi--]);
567 }
568 else
569 return badpat("Unmatched )");
570 } else if (caseSensitive) {
571 *mp++ = CHR;
572 *mp++ = *p;
573 } else {
574 *mp++ = CCL;
575 mask = 0;
576 ChSetWithCase(*p, false);
577 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
578 *mp++ = static_cast<char>(mask ^ bittab[n]);
579 }
580 break;
581 }
582 sp = lp;
583 }
584 if (tagi > 0)
585 return badpat((posix ? "Unmatched (" : "Unmatched \\("));
586 *mp = END;
587 sta = OKP;
588 return 0;
589 }
590
591 /*
592 * RESearch::Execute:
593 * execute nfa to find a match.
594 *
595 * special cases: (nfa[0])
596 * BOL
597 * Match only once, starting from the
598 * beginning.
599 * CHR
600 * First locate the character without
601 * calling PMatch, and if found, call
602 * PMatch for the remaining string.
603 * END
604 * RESearch::Compile failed, poor luser did not
605 * check for it. Fail fast.
606 *
607 * If a match is found, bopat[0] and eopat[0] are set
608 * to the beginning and the end of the matched fragment,
609 * respectively.
610 *
611 */
612
613 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
614 char c;
615 int ep = NOTFOUND;
616 char *ap = nfa;
617
618 bol = lp;
619 failure = 0;
620
621 Clear();
622
623 switch(*ap) {
624
625 case BOL: /* anchored: match from BOL only */
626 ep = PMatch(ci, lp, endp, ap);
627 break;
628 case EOL: /* just searching for end of line normal path doesn't work */
629 if (*(ap+1) == END) {
630 lp = endp;
631 ep = lp;
632 break;
633 } else {
634 return 0;
635 }
636 case CHR: /* ordinary char: locate it fast */
637 c = *(ap+1);
638 while ((lp < endp) && (ci.CharAt(lp) != c))
639 lp++;
640 if (lp >= endp) /* if EOS, fail, else fall thru. */
641 return 0;
642 default: /* regular matching all the way. */
643 while (lp < endp) {
644 ep = PMatch(ci, lp, endp, ap);
645 if (ep != NOTFOUND)
646 break;
647 lp++;
648 }
649 break;
650 case END: /* munged automaton. fail always */
651 return 0;
652 }
653 if (ep == NOTFOUND)
654 return 0;
655
656 bopat[0] = lp;
657 eopat[0] = ep;
658 return 1;
659 }
660
661 /*
662 * PMatch: internal routine for the hard part
663 *
664 * This code is partly snarfed from an early grep written by
665 * David Conroy. The backref and tag stuff, and various other
666 * innovations are by oz.
667 *
668 * special case optimizations: (nfa[n], nfa[n+1])
669 * CLO ANY
670 * We KNOW .* will match everything upto the
671 * end of line. Thus, directly go to the end of
672 * line, without recursive PMatch calls. As in
673 * the other closure cases, the remaining pattern
674 * must be matched by moving backwards on the
675 * string recursively, to find a match for xy
676 * (x is ".*" and y is the remaining pattern)
677 * where the match satisfies the LONGEST match for
678 * x followed by a match for y.
679 * CLO CHR
680 * We can again scan the string forward for the
681 * single char and at the point of failure, we
682 * execute the remaining nfa recursively, same as
683 * above.
684 *
685 * At the end of a successful match, bopat[n] and eopat[n]
686 * are set to the beginning and end of subpatterns matched
687 * by tagged expressions (n = 1 to 9).
688 *
689 */
690
691 extern void re_fail(char *,char);
692
693 /*
694 * character classification table for word boundary operators BOW
695 * and EOW. the reason for not using ctype macros is that we can
696 * let the user add into our own table. see RESearch::ModifyWord. This table
697 * is not in the bitset form, since we may wish to extend it in the
698 * future for other character classifications.
699 *
700 * TRUE for 0-9 A-Z a-z _
701 */
702 static char chrtyp[MAXCHR] = {
703 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
704 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
705 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
706 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
707 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
708 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
709 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
710 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
711 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
712 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
713 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
714 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
715 1, 1, 1, 0, 0, 0, 0, 0
716 };
717
718 #define inascii(x) (0177&(x))
719 #define iswordc(x) chrtyp[inascii(x)]
720 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
721
722 /*
723 * skip values for CLO XXX to skip past the closure
724 */
725
726 #define ANYSKIP 2 /* [CLO] ANY END ... */
727 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
728 #define CCLSKIP 34 /* [CLO] CCL 32bytes END ... */
729
730 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
731 int op, c, n;
732 int e; /* extra pointer for CLO */
733 int bp; /* beginning of subpat.. */
734 int ep; /* ending of subpat.. */
735 int are; /* to save the line ptr. */
736
737 while ((op = *ap++) != END)
738 switch(op) {
739
740 case CHR:
741 if (ci.CharAt(lp++) != *ap++)
742 return NOTFOUND;
743 break;
744 case ANY:
745 if (lp++ >= endp)
746 return NOTFOUND;
747 break;
748 case CCL:
749 c = ci.CharAt(lp++);
750 if (!isinset(ap,c))
751 return NOTFOUND;
752 ap += BITBLK;
753 break;
754 case BOL:
755 if (lp != bol)
756 return NOTFOUND;
757 break;
758 case EOL:
759 if (lp < endp)
760 return NOTFOUND;
761 break;
762 case BOT:
763 bopat[*ap++] = lp;
764 break;
765 case EOT:
766 eopat[*ap++] = lp;
767 break;
768 case BOW:
769 if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
770 return NOTFOUND;
771 break;
772 case EOW:
773 if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
774 return NOTFOUND;
775 break;
776 case REF:
777 n = *ap++;
778 bp = bopat[n];
779 ep = eopat[n];
780 while (bp < ep)
781 if (ci.CharAt(bp++) != ci.CharAt(lp++))
782 return NOTFOUND;
783 break;
784 case CLO:
785 are = lp;
786 switch(*ap) {
787
788 case ANY:
789 while (lp < endp)
790 lp++;
791 n = ANYSKIP;
792 break;
793 case CHR:
794 c = *(ap+1);
795 while ((lp < endp) && (c == ci.CharAt(lp)))
796 lp++;
797 n = CHRSKIP;
798 break;
799 case CCL:
800 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
801 lp++;
802 n = CCLSKIP;
803 break;
804 default:
805 failure = true;
806 //re_fail("closure: bad nfa.", *ap);
807 return NOTFOUND;
808 }
809
810 ap += n;
811
812 while (lp >= are) {
813 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
814 return e;
815 --lp;
816 }
817 return NOTFOUND;
818 default:
819 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
820 return NOTFOUND;
821 }
822 return lp;
823 }
824
825 /*
826 * RESearch::ModifyWord:
827 * add new characters into the word table to change RESearch::Execute's
828 * understanding of what a word should look like. Note that we
829 * only accept additions into the word definition.
830 *
831 * If the string parameter is 0 or null string, the table is
832 * reset back to the default containing A-Z a-z 0-9 _. [We use
833 * the compact bitset representation for the default table]
834 */
835
836 static char deftab[16] = {
837 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
838 '\376', '\377', '\377', 007
839 };
840
841 void RESearch::ModifyWord(char *s) {
842 int i;
843
844 if (!s || !*s) {
845 for (i = 0; i < MAXCHR; i++)
846 if (!isinset(deftab,i))
847 iswordc(i) = 0;
848 }
849 else
850 while(*s)
851 iswordc(*s++) = 1;
852 }
853
854 /*
855 * RESearch::Substitute:
856 * substitute the matched portions of the src in dst.
857 *
858 * & substitute the entire matched pattern.
859 *
860 * \digit substitute a subpattern, with the given tag number.
861 * Tags are numbered from 1 to 9. If the particular
862 * tagged subpattern does not exist, null is substituted.
863 */
864 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
865 char c;
866 int pin;
867 int bp;
868 int ep;
869
870 if (!*src || !bopat[0])
871 return 0;
872
873 while ((c = *src++) != 0) {
874 switch(c) {
875
876 case '&':
877 pin = 0;
878 break;
879
880 case '\\':
881 c = *src++;
882 if (c >= '0' && c <= '9') {
883 pin = c - '0';
884 break;
885 }
886
887 default:
888 *dst++ = c;
889 continue;
890 }
891
892 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
893 while (ci.CharAt(bp) && bp < ep)
894 *dst++ = ci.CharAt(bp++);
895 if (bp < ep)
896 return 0;
897 }
898 }
899 *dst = (char) 0;
900 return 1;
901 }