]> git.saurik.com Git - wxWidgets.git/blob - contrib/src/stc/scintilla/src/RESearch.cxx
07534db1ef020406ab7185bfc8161defe90d273b
[wxWidgets.git] / contrib / src / stc / scintilla / src / RESearch.cxx
1 // Scintilla source code edit control
2 /** @file RESearch.cxx
3 ** Regular expression search library.
4 **/
5
6 /*
7 * regex - Regular expression pattern matching and replacement
8 *
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
11 * York University
12 *
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
19 *
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
22 *
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
28 * matching module.
29 *
30 * Modification history:
31 *
32 * $Log$
33 * Revision 1.1 2001/09/01 03:05:24 RD
34 * Upgraded to version 1.39 of Scintilla, and upated wxStyledTextCtrl
35 * accordingly.
36 *
37 * Revision 1.6 2001/04/29 13:32:10 nyamatongwe
38 * Addition of new target methods - versions of ReplaceTarget that take counted
39 * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
40 * series of calls rather than a structure.
41 * Handling of \000 in search and replace.
42 * Handling of /escapes within character ranges of regular expressions.
43 * Some handling of bare ^ and $ regular expressions.
44 *
45 * Revision 1.5 2001/04/20 07:36:09 nyamatongwe
46 * Removed DEBUG code that failed to compile on GTK+.
47 *
48 * Revision 1.4 2001/04/13 03:52:13 nyamatongwe
49 * Added URL to find original code to comments.
50 *
51 * Revision 1.3 2001/04/06 12:24:21 nyamatongwe
52 * Made regular expression searching work on a line by line basis, made ^ and
53 * $ work, made [set] work, and added a case insensitive option.
54 *
55 * Revision 1.2 2001/04/05 01:58:04 nyamatongwe
56 * Replace target functionality to make find and replace operations faster
57 * by diminishing screen updates and allow for \d patterns in the replacement
58 * text.
59 *
60 * Revision 1.1 2001/04/04 12:52:44 nyamatongwe
61 * Moved to public domain regular expresion implementation.
62 *
63 * Revision 1.4 1991/10/17 03:56:42 oz
64 * miscellaneous changes, small cleanups etc.
65 *
66 * Revision 1.3 1989/04/01 14:18:09 oz
67 * Change all references to a dfa: this is actually an nfa.
68 *
69 * Revision 1.2 88/08/28 15:36:04 oz
70 * Use a complement bitmap to represent NCL.
71 * This removes the need to have seperate
72 * code in the PMatch case block - it is
73 * just CCL code now.
74 *
75 * Use the actual CCL code in the CLO
76 * section of PMatch. No need for a recursive
77 * PMatch call.
78 *
79 * Use a bitmap table to set char bits in an
80 * 8-bit chunk.
81 *
82 * Interfaces:
83 * RESearch::Compile: compile a regular expression into a NFA.
84 *
85 * char *RESearch::Compile(s)
86 * char *s;
87 *
88 * RESearch::Execute: execute the NFA to match a pattern.
89 *
90 * int RESearch::Execute(s)
91 * char *s;
92 *
93 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
94 * looks like (for \< and \>) by adding into the
95 * hidden word-syntax table.
96 *
97 * void RESearch::ModifyWord(s)
98 * char *s;
99 *
100 * RESearch::Substitute: substitute the matched portions in a new string.
101 *
102 * int RESearch::Substitute(src, dst)
103 * char *src;
104 * char *dst;
105 *
106 * re_fail: failure routine for RESearch::Execute.
107 *
108 * void re_fail(msg, op)
109 * char *msg;
110 * char op;
111 *
112 * Regular Expressions:
113 *
114 * [1] char matches itself, unless it is a special
115 * character (metachar): . \ [ ] * + ^ $
116 *
117 * [2] . matches any character.
118 *
119 * [3] \ matches the character following it, except
120 * when followed by a left or right round bracket,
121 * a digit 1 to 9 or a left or right angle bracket.
122 * (see [7], [8] and [9])
123 * It is used as an escape character for all
124 * other meta-characters, and itself. When used
125 * in a set ([4]), it is treated as an ordinary
126 * character.
127 *
128 * [4] [set] matches one of the characters in the set.
129 * If the first character in the set is "^",
130 * it matches a character NOT in the set, i.e.
131 * complements the set. A shorthand S-E is
132 * used to specify a set of characters S upto
133 * E, inclusive. The special characters "]" and
134 * "-" have no special meaning if they appear
135 * as the first chars in the set.
136 * examples: match:
137 *
138 * [a-z] any lowercase alpha
139 *
140 * [^]-] any char except ] and -
141 *
142 * [^A-Z] any char except uppercase
143 * alpha
144 *
145 * [a-zA-Z] any alpha
146 *
147 * [5] * any regular expression form [1] to [4], followed by
148 * closure char (*) matches zero or more matches of
149 * that form.
150 *
151 * [6] + same as [5], except it matches one or more.
152 *
153 * [7] a regular expression in the form [1] to [10], enclosed
154 * as \(form\) matches what form matches. The enclosure
155 * creates a set of tags, used for [8] and for
156 * pattern substution. The tagged forms are numbered
157 * starting from 1.
158 *
159 * [8] a \ followed by a digit 1 to 9 matches whatever a
160 * previously tagged regular expression ([7]) matched.
161 *
162 * [9] \< a regular expression starting with a \< construct
163 * \> and/or ending with a \> construct, restricts the
164 * pattern matching to the beginning of a word, and/or
165 * the end of a word. A word is defined to be a character
166 * string beginning and/or ending with the characters
167 * A-Z a-z 0-9 and _. It must also be preceded and/or
168 * followed by any character outside those mentioned.
169 *
170 * [10] a composite regular expression xy where x and y
171 * are in the form [1] to [10] matches the longest
172 * match of x followed by a match for y.
173 *
174 * [11] ^ a regular expression starting with a ^ character
175 * $ and/or ending with a $ character, restricts the
176 * pattern matching to the beginning of the line,
177 * or the end of line. [anchors] Elsewhere in the
178 * pattern, ^ and $ are treated as ordinary characters.
179 *
180 *
181 * Acknowledgements:
182 *
183 * HCR's Hugh Redelmeier has been most helpful in various
184 * stages of development. He convinced me to include BOW
185 * and EOW constructs, originally invented by Rob Pike at
186 * the University of Toronto.
187 *
188 * References:
189 * Software tools Kernighan & Plauger
190 * Software tools in Pascal Kernighan & Plauger
191 * Grep [rsx-11 C dist] David Conroy
192 * ed - text editor Un*x Programmer's Manual
193 * Advanced editing on Un*x B. W. Kernighan
194 * RegExp routines Henry Spencer
195 *
196 * Notes:
197 *
198 * This implementation uses a bit-set representation for character
199 * classes for speed and compactness. Each character is represented
200 * by one bit in a 128-bit block. Thus, CCL always takes a
201 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
202 * bit comparison to locate the character in the set.
203 *
204 * Examples:
205 *
206 * pattern: foo*.*
207 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
208 * matches: fo foo fooo foobar fobar foxx ...
209 *
210 * pattern: fo[ob]a[rz]
211 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
212 * matches: fobar fooar fobaz fooaz
213 *
214 * pattern: foo\\+
215 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
216 * matches: foo\ foo\\ foo\\\ ...
217 *
218 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
219 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
220 * matches: foo1foo foo2foo foo3foo
221 *
222 * pattern: \(fo.*\)-\1
223 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
224 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
225 */
226
227 #include "RESearch.h"
228
229 #define OKP 1
230 #define NOP 0
231
232 #define CHR 1
233 #define ANY 2
234 #define CCL 3
235 #define BOL 4
236 #define EOL 5
237 #define BOT 6
238 #define EOT 7
239 #define BOW 8
240 #define EOW 9
241 #define REF 10
242 #define CLO 11
243
244 #define END 0
245
246 /*
247 * The following defines are not meant to be changeable.
248 * They are for readability only.
249 */
250 #define BLKIND 0170
251 #define BITIND 07
252
253 #define ASCIIB 0177
254
255 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
256
257 #define badpat(x) (*nfa = END, x)
258
259 RESearch::RESearch() {
260 Init();
261 }
262
263 RESearch::~RESearch() {
264 Clear();
265 }
266
267 void RESearch::Init() {
268 sta = NOP; /* status of lastpat */
269 bol = 0;
270 for (int i=0; i<MAXTAG; i++)
271 pat[i] = 0;
272 for (int j=0; j<BITBLK; j++)
273 bittab[j] = 0;
274 }
275
276 void RESearch::Clear() {
277 for (int i=0; i<MAXTAG; i++) {
278 delete []pat[i];
279 pat[i] = 0;
280 bopat[i] = NOTFOUND;
281 eopat[i] = NOTFOUND;
282 }
283 }
284
285 bool RESearch::GrabMatches(CharacterIndexer &ci) {
286 bool success = true;
287 for (unsigned int i=0; i<MAXTAG; i++) {
288 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
289 unsigned int len = eopat[i] - bopat[i];
290 pat[i] = new char[len + 1];
291 if (pat[i]) {
292 for (unsigned int j=0; j<len; j++)
293 pat[i][j] = ci.CharAt(bopat[i] + j);
294 pat[i][len] = '\0';
295 } else {
296 success = false;
297 }
298 }
299 }
300 return success;
301 }
302
303 void RESearch::ChSet(char c) {
304 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
305 }
306
307 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
308 if (caseSensitive) {
309 ChSet(c);
310 } else {
311 if ((c >= 'a') && (c <= 'z')) {
312 ChSet(c);
313 ChSet(static_cast<char>(c - 'a' + 'A'));
314 } else if ((c >= 'A') && (c <= 'Z')) {
315 ChSet(c);
316 ChSet(static_cast<char>(c - 'A' + 'a'));
317 } else {
318 ChSet(c);
319 }
320 }
321 }
322
323 const char escapeValue(char ch) {
324 switch (ch) {
325 case 'a': return '\a';
326 case 'b': return '\b';
327 case 'f': return '\f';
328 case 'n': return '\n';
329 case 'r': return '\r';
330 case 't': return '\t';
331 case 'v': return '\v';
332 }
333 return 0;
334 }
335
336 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive) {
337 char *mp=nfa; /* nfa pointer */
338 char *lp; /* saved pointer.. */
339 char *sp=nfa; /* another one.. */
340
341 int tagi = 0; /* tag stack index */
342 int tagc = 1; /* actual tag count */
343
344 int n;
345 char mask; /* xor mask -CCL/NCL */
346 int c1, c2;
347
348 if (!pat || !length)
349 if (sta)
350 return 0;
351 else
352 return badpat("No previous regular expression");
353 sta = NOP;
354
355 const char *p=pat; /* pattern pointer */
356 for (int i=0; i<length; i++, p++) {
357 lp = mp;
358 switch(*p) {
359
360 case '.': /* match any char.. */
361 *mp++ = ANY;
362 break;
363
364 case '^': /* match beginning.. */
365 if (p == pat)
366 *mp++ = BOL;
367 else {
368 *mp++ = CHR;
369 *mp++ = *p;
370 }
371 break;
372
373 case '$': /* match endofline.. */
374 if (!*(p+1))
375 *mp++ = EOL;
376 else {
377 *mp++ = CHR;
378 *mp++ = *p;
379 }
380 break;
381
382 case '[': /* match char class..*/
383 *mp++ = CCL;
384
385 i++;
386 if (*++p == '^') {
387 mask = '\377';
388 i++;
389 p++;
390 } else
391 mask = 0;
392
393 if (*p == '-') { /* real dash */
394 i++;
395 ChSet(*p++);
396 }
397 if (*p == ']') { /* real brace */
398 i++;
399 ChSet(*p++);
400 }
401 while (*p && *p != ']') {
402 if (*p == '-' && *(p+1) && *(p+1) != ']') {
403 i++;
404 p++;
405 c1 = *(p-2) + 1;
406 i++;
407 c2 = *p++;
408 while (c1 <= c2) {
409 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
410 }
411 } else if (*p == '\\' && *(p+1)) {
412 i++;
413 p++;
414 char escape = escapeValue(*p);
415 if (escape)
416 ChSetWithCase(escape, caseSensitive);
417 else
418 ChSetWithCase(*p, caseSensitive);
419 i++;
420 p++;
421 } else {
422 i++;
423 ChSetWithCase(*p++, caseSensitive);
424 }
425 }
426 if (!*p)
427 return badpat("Missing ]");
428
429 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
430 *mp++ = static_cast<char>(mask ^ bittab[n]);
431
432 break;
433
434 case '*': /* match 0 or more.. */
435 case '+': /* match 1 or more.. */
436 if (p == pat)
437 return badpat("Empty closure");
438 lp = sp; /* previous opcode */
439 if (*lp == CLO) /* equivalence.. */
440 break;
441 switch(*lp) {
442
443 case BOL:
444 case BOT:
445 case EOT:
446 case BOW:
447 case EOW:
448 case REF:
449 return badpat("Illegal closure");
450 default:
451 break;
452 }
453
454 if (*p == '+')
455 for (sp = mp; lp < sp; lp++)
456 *mp++ = *lp;
457
458 *mp++ = END;
459 *mp++ = END;
460 sp = mp;
461 while (--mp > lp)
462 *mp = mp[-1];
463 *mp = CLO;
464 mp = sp;
465 break;
466
467 case '\\': /* tags, backrefs .. */
468 i++;
469 switch(*++p) {
470
471 case '(':
472 if (tagc < MAXTAG) {
473 tagstk[++tagi] = tagc;
474 *mp++ = BOT;
475 *mp++ = static_cast<char>(tagc++);
476 }
477 else
478 return badpat("Too many \\(\\) pairs");
479 break;
480 case ')':
481 if (*sp == BOT)
482 return badpat("Null pattern inside \\(\\)");
483 if (tagi > 0) {
484 *mp++ = static_cast<char>(EOT);
485 *mp++ = static_cast<char>(tagstk[tagi--]);
486 }
487 else
488 return badpat("Unmatched \\)");
489 break;
490 case '<':
491 *mp++ = BOW;
492 break;
493 case '>':
494 if (*sp == BOW)
495 return badpat("Null pattern inside \\<\\>");
496 *mp++ = EOW;
497 break;
498 case '1':
499 case '2':
500 case '3':
501 case '4':
502 case '5':
503 case '6':
504 case '7':
505 case '8':
506 case '9':
507 n = *p-'0';
508 if (tagi > 0 && tagstk[tagi] == n)
509 return badpat("Cyclical reference");
510 if (tagc > n) {
511 *mp++ = static_cast<char>(REF);
512 *mp++ = static_cast<char>(n);
513 }
514 else
515 return badpat("Undetermined reference");
516 break;
517 case 'a':
518 case 'b':
519 case 'n':
520 case 'f':
521 case 'r':
522 case 't':
523 case 'v':
524 *mp++ = CHR;
525 *mp++ = escapeValue(*p);
526 break;
527 default:
528 *mp++ = CHR;
529 *mp++ = *p;
530 }
531 break;
532
533 default : /* an ordinary char */
534 if (caseSensitive) {
535 *mp++ = CHR;
536 *mp++ = *p;
537 } else {
538 *mp++ = CCL;
539 mask = 0;
540 ChSetWithCase(*p, false);
541 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
542 *mp++ = static_cast<char>(mask ^ bittab[n]);
543 }
544 break;
545 }
546 sp = lp;
547 }
548 if (tagi > 0)
549 return badpat("Unmatched \\(");
550 *mp = END;
551 sta = OKP;
552 return 0;
553 }
554
555 /*
556 * RESearch::Execute:
557 * execute nfa to find a match.
558 *
559 * special cases: (nfa[0])
560 * BOL
561 * Match only once, starting from the
562 * beginning.
563 * CHR
564 * First locate the character without
565 * calling PMatch, and if found, call
566 * PMatch for the remaining string.
567 * END
568 * RESearch::Compile failed, poor luser did not
569 * check for it. Fail fast.
570 *
571 * If a match is found, bopat[0] and eopat[0] are set
572 * to the beginning and the end of the matched fragment,
573 * respectively.
574 *
575 */
576
577 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
578 char c;
579 int ep = NOTFOUND;
580 char *ap = nfa;
581
582 bol = lp;
583 failure = 0;
584
585 Clear();
586
587 switch(*ap) {
588
589 case BOL: /* anchored: match from BOL only */
590 ep = PMatch(ci, lp, endp, ap);
591 break;
592 case EOL: /* just searching for end of line normal path doesn't work */
593 if (*(ap+1) == END) {
594 lp = endp;
595 ep = lp;
596 break;
597 } else {
598 return 0;
599 }
600 case CHR: /* ordinary char: locate it fast */
601 c = *(ap+1);
602 while ((lp < endp) && (ci.CharAt(lp) != c))
603 lp++;
604 if (lp >= endp) /* if EOS, fail, else fall thru. */
605 return 0;
606 default: /* regular matching all the way. */
607 while (lp < endp) {
608 ep = PMatch(ci, lp, endp, ap);
609 if (ep != NOTFOUND)
610 break;
611 lp++;
612 }
613 break;
614 case END: /* munged automaton. fail always */
615 return 0;
616 }
617 if (ep == NOTFOUND)
618 return 0;
619
620 bopat[0] = lp;
621 eopat[0] = ep;
622 return 1;
623 }
624
625 /*
626 * PMatch: internal routine for the hard part
627 *
628 * This code is partly snarfed from an early grep written by
629 * David Conroy. The backref and tag stuff, and various other
630 * innovations are by oz.
631 *
632 * special case optimizations: (nfa[n], nfa[n+1])
633 * CLO ANY
634 * We KNOW .* will match everything upto the
635 * end of line. Thus, directly go to the end of
636 * line, without recursive PMatch calls. As in
637 * the other closure cases, the remaining pattern
638 * must be matched by moving backwards on the
639 * string recursively, to find a match for xy
640 * (x is ".*" and y is the remaining pattern)
641 * where the match satisfies the LONGEST match for
642 * x followed by a match for y.
643 * CLO CHR
644 * We can again scan the string forward for the
645 * single char and at the point of failure, we
646 * execute the remaining nfa recursively, same as
647 * above.
648 *
649 * At the end of a successful match, bopat[n] and eopat[n]
650 * are set to the beginning and end of subpatterns matched
651 * by tagged expressions (n = 1 to 9).
652 *
653 */
654
655 extern void re_fail(char *,char);
656
657 /*
658 * character classification table for word boundary operators BOW
659 * and EOW. the reason for not using ctype macros is that we can
660 * let the user add into our own table. see RESearch::ModifyWord. This table
661 * is not in the bitset form, since we may wish to extend it in the
662 * future for other character classifications.
663 *
664 * TRUE for 0-9 A-Z a-z _
665 */
666 static char chrtyp[MAXCHR] = {
667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
670 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
671 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
672 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
673 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
674 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
675 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
676 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
677 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
678 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
679 1, 1, 1, 0, 0, 0, 0, 0
680 };
681
682 #define inascii(x) (0177&(x))
683 #define iswordc(x) chrtyp[inascii(x)]
684 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
685
686 /*
687 * skip values for CLO XXX to skip past the closure
688 */
689
690 #define ANYSKIP 2 /* [CLO] ANY END ... */
691 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
692 #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
693
694 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
695 int op, c, n;
696 int e; /* extra pointer for CLO */
697 int bp; /* beginning of subpat.. */
698 int ep; /* ending of subpat.. */
699 int are; /* to save the line ptr. */
700
701 while ((op = *ap++) != END)
702 switch(op) {
703
704 case CHR:
705 if (ci.CharAt(lp++) != *ap++)
706 return NOTFOUND;
707 break;
708 case ANY:
709 if (lp++ >= endp)
710 return NOTFOUND;
711 break;
712 case CCL:
713 c = ci.CharAt(lp++);
714 if (!isinset(ap,c))
715 return NOTFOUND;
716 ap += BITBLK;
717 break;
718 case BOL:
719 if (lp != bol)
720 return NOTFOUND;
721 break;
722 case EOL:
723 if (lp < endp)
724 return NOTFOUND;
725 break;
726 case BOT:
727 bopat[*ap++] = lp;
728 break;
729 case EOT:
730 eopat[*ap++] = lp;
731 break;
732 case BOW:
733 if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
734 return NOTFOUND;
735 break;
736 case EOW:
737 if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
738 return NOTFOUND;
739 break;
740 case REF:
741 n = *ap++;
742 bp = bopat[n];
743 ep = eopat[n];
744 while (bp < ep)
745 if (ci.CharAt(bp++) != ci.CharAt(lp++))
746 return NOTFOUND;
747 break;
748 case CLO:
749 are = lp;
750 switch(*ap) {
751
752 case ANY:
753 while (lp < endp)
754 lp++;
755 n = ANYSKIP;
756 break;
757 case CHR:
758 c = *(ap+1);
759 while ((lp < endp) && (c == ci.CharAt(lp)))
760 lp++;
761 n = CHRSKIP;
762 break;
763 case CCL:
764 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
765 lp++;
766 n = CCLSKIP;
767 break;
768 default:
769 failure = true;
770 //re_fail("closure: bad nfa.", *ap);
771 return NOTFOUND;
772 }
773
774 ap += n;
775
776 while (lp >= are) {
777 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
778 return e;
779 --lp;
780 }
781 return NOTFOUND;
782 default:
783 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
784 return NOTFOUND;
785 }
786 return lp;
787 }
788
789 /*
790 * RESearch::ModifyWord:
791 * add new characters into the word table to change RESearch::Execute's
792 * understanding of what a word should look like. Note that we
793 * only accept additions into the word definition.
794 *
795 * If the string parameter is 0 or null string, the table is
796 * reset back to the default containing A-Z a-z 0-9 _. [We use
797 * the compact bitset representation for the default table]
798 */
799
800 static char deftab[16] = {
801 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
802 '\376', '\377', '\377', 007
803 };
804
805 void RESearch::ModifyWord(char *s) {
806 int i;
807
808 if (!s || !*s) {
809 for (i = 0; i < MAXCHR; i++)
810 if (!isinset(deftab,i))
811 iswordc(i) = 0;
812 }
813 else
814 while(*s)
815 iswordc(*s++) = 1;
816 }
817
818 /*
819 * RESearch::Substitute:
820 * substitute the matched portions of the src in dst.
821 *
822 * & substitute the entire matched pattern.
823 *
824 * \digit substitute a subpattern, with the given tag number.
825 * Tags are numbered from 1 to 9. If the particular
826 * tagged subpattern does not exist, null is substituted.
827 */
828 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
829 char c;
830 int pin;
831 int bp;
832 int ep;
833
834 if (!*src || !bopat[0])
835 return 0;
836
837 while ((c = *src++) != 0) {
838 switch(c) {
839
840 case '&':
841 pin = 0;
842 break;
843
844 case '\\':
845 c = *src++;
846 if (c >= '0' && c <= '9') {
847 pin = c - '0';
848 break;
849 }
850
851 default:
852 *dst++ = c;
853 continue;
854 }
855
856 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
857 while (ci.CharAt(bp) && bp < ep)
858 *dst++ = ci.CharAt(bp++);
859 if (bp < ep)
860 return 0;
861 }
862 }
863 *dst = (char) 0;
864 return 1;
865 }