]> git.saurik.com Git - wxWidgets.git/blob - src/stc/scintilla/src/RESearch.cxx
f176bbf19e02107aac4100a5fc1492f5d4596343
[wxWidgets.git] / src / stc / scintilla / src / RESearch.cxx
1 // Scintilla source code edit control
2 /** @file RESearch.cxx
3 ** Regular expression search library.
4 **/
5
6 /*
7 * regex - Regular expression pattern matching and replacement
8 *
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
11 * York University
12 *
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
19 *
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
22 *
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
28 * matching module.
29 *
30 * Modification history:
31 *
32 * $Log$
33 * Revision 1.5 2002/09/11 00:55:27 RD
34 * Update to Scintilla 1.48
35 *
36 * Revision 1.6 2001/04/29 13:32:10 nyamatongwe
37 * Addition of new target methods - versions of ReplaceTarget that take counted
38 * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
39 * series of calls rather than a structure.
40 * Handling of \000 in search and replace.
41 * Handling of /escapes within character ranges of regular expressions.
42 * Some handling of bare ^ and $ regular expressions.
43 *
44 * Revision 1.5 2001/04/20 07:36:09 nyamatongwe
45 * Removed DEBUG code that failed to compile on GTK+.
46 *
47 * Revision 1.4 2001/04/13 03:52:13 nyamatongwe
48 * Added URL to find original code to comments.
49 *
50 * Revision 1.3 2001/04/06 12:24:21 nyamatongwe
51 * Made regular expression searching work on a line by line basis, made ^ and
52 * $ work, made [set] work, and added a case insensitive option.
53 *
54 * Revision 1.2 2001/04/05 01:58:04 nyamatongwe
55 * Replace target functionality to make find and replace operations faster
56 * by diminishing screen updates and allow for \d patterns in the replacement
57 * text.
58 *
59 * Revision 1.1 2001/04/04 12:52:44 nyamatongwe
60 * Moved to public domain regular expresion implementation.
61 *
62 * Revision 1.4 1991/10/17 03:56:42 oz
63 * miscellaneous changes, small cleanups etc.
64 *
65 * Revision 1.3 1989/04/01 14:18:09 oz
66 * Change all references to a dfa: this is actually an nfa.
67 *
68 * Revision 1.2 88/08/28 15:36:04 oz
69 * Use a complement bitmap to represent NCL.
70 * This removes the need to have seperate
71 * code in the PMatch case block - it is
72 * just CCL code now.
73 *
74 * Use the actual CCL code in the CLO
75 * section of PMatch. No need for a recursive
76 * PMatch call.
77 *
78 * Use a bitmap table to set char bits in an
79 * 8-bit chunk.
80 *
81 * Interfaces:
82 * RESearch::Compile: compile a regular expression into a NFA.
83 *
84 * char *RESearch::Compile(s)
85 * char *s;
86 *
87 * RESearch::Execute: execute the NFA to match a pattern.
88 *
89 * int RESearch::Execute(s)
90 * char *s;
91 *
92 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
93 * looks like (for \< and \>) by adding into the
94 * hidden word-syntax table.
95 *
96 * void RESearch::ModifyWord(s)
97 * char *s;
98 *
99 * RESearch::Substitute: substitute the matched portions in a new string.
100 *
101 * int RESearch::Substitute(src, dst)
102 * char *src;
103 * char *dst;
104 *
105 * re_fail: failure routine for RESearch::Execute.
106 *
107 * void re_fail(msg, op)
108 * char *msg;
109 * char op;
110 *
111 * Regular Expressions:
112 *
113 * [1] char matches itself, unless it is a special
114 * character (metachar): . \ [ ] * + ^ $
115 *
116 * [2] . matches any character.
117 *
118 * [3] \ matches the character following it, except
119 * when followed by a left or right round bracket,
120 * a digit 1 to 9 or a left or right angle bracket.
121 * (see [7], [8] and [9])
122 * It is used as an escape character for all
123 * other meta-characters, and itself. When used
124 * in a set ([4]), it is treated as an ordinary
125 * character.
126 *
127 * [4] [set] matches one of the characters in the set.
128 * If the first character in the set is "^",
129 * it matches a character NOT in the set, i.e.
130 * complements the set. A shorthand S-E is
131 * used to specify a set of characters S upto
132 * E, inclusive. The special characters "]" and
133 * "-" have no special meaning if they appear
134 * as the first chars in the set.
135 * examples: match:
136 *
137 * [a-z] any lowercase alpha
138 *
139 * [^]-] any char except ] and -
140 *
141 * [^A-Z] any char except uppercase
142 * alpha
143 *
144 * [a-zA-Z] any alpha
145 *
146 * [5] * any regular expression form [1] to [4], followed by
147 * closure char (*) matches zero or more matches of
148 * that form.
149 *
150 * [6] + same as [5], except it matches one or more.
151 *
152 * [7] a regular expression in the form [1] to [10], enclosed
153 * as \(form\) matches what form matches. The enclosure
154 * creates a set of tags, used for [8] and for
155 * pattern substution. The tagged forms are numbered
156 * starting from 1.
157 *
158 * [8] a \ followed by a digit 1 to 9 matches whatever a
159 * previously tagged regular expression ([7]) matched.
160 *
161 * [9] \< a regular expression starting with a \< construct
162 * \> and/or ending with a \> construct, restricts the
163 * pattern matching to the beginning of a word, and/or
164 * the end of a word. A word is defined to be a character
165 * string beginning and/or ending with the characters
166 * A-Z a-z 0-9 and _. It must also be preceded and/or
167 * followed by any character outside those mentioned.
168 *
169 * [10] a composite regular expression xy where x and y
170 * are in the form [1] to [10] matches the longest
171 * match of x followed by a match for y.
172 *
173 * [11] ^ a regular expression starting with a ^ character
174 * $ and/or ending with a $ character, restricts the
175 * pattern matching to the beginning of the line,
176 * or the end of line. [anchors] Elsewhere in the
177 * pattern, ^ and $ are treated as ordinary characters.
178 *
179 *
180 * Acknowledgements:
181 *
182 * HCR's Hugh Redelmeier has been most helpful in various
183 * stages of development. He convinced me to include BOW
184 * and EOW constructs, originally invented by Rob Pike at
185 * the University of Toronto.
186 *
187 * References:
188 * Software tools Kernighan & Plauger
189 * Software tools in Pascal Kernighan & Plauger
190 * Grep [rsx-11 C dist] David Conroy
191 * ed - text editor Un*x Programmer's Manual
192 * Advanced editing on Un*x B. W. Kernighan
193 * RegExp routines Henry Spencer
194 *
195 * Notes:
196 *
197 * This implementation uses a bit-set representation for character
198 * classes for speed and compactness. Each character is represented
199 * by one bit in a 128-bit block. Thus, CCL always takes a
200 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
201 * bit comparison to locate the character in the set.
202 *
203 * Examples:
204 *
205 * pattern: foo*.*
206 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
207 * matches: fo foo fooo foobar fobar foxx ...
208 *
209 * pattern: fo[ob]a[rz]
210 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
211 * matches: fobar fooar fobaz fooaz
212 *
213 * pattern: foo\\+
214 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
215 * matches: foo\ foo\\ foo\\\ ...
216 *
217 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
218 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
219 * matches: foo1foo foo2foo foo3foo
220 *
221 * pattern: \(fo.*\)-\1
222 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
223 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
224 */
225
226 #include "RESearch.h"
227
228 #define OKP 1
229 #define NOP 0
230
231 #define CHR 1
232 #define ANY 2
233 #define CCL 3
234 #define BOL 4
235 #define EOL 5
236 #define BOT 6
237 #define EOT 7
238 #define BOW 8
239 #define EOW 9
240 #define REF 10
241 #define CLO 11
242
243 #define END 0
244
245 /*
246 * The following defines are not meant to be changeable.
247 * They are for readability only.
248 */
249 #define BLKIND 0170
250 #define BITIND 07
251
252 #define ASCIIB 0177
253
254 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
255
256 #define badpat(x) (*nfa = END, x)
257
258 RESearch::RESearch() {
259 Init();
260 }
261
262 RESearch::~RESearch() {
263 Clear();
264 }
265
266 void RESearch::Init() {
267 sta = NOP; /* status of lastpat */
268 bol = 0;
269 for (int i=0; i<MAXTAG; i++)
270 pat[i] = 0;
271 for (int j=0; j<BITBLK; j++)
272 bittab[j] = 0;
273 }
274
275 void RESearch::Clear() {
276 for (int i=0; i<MAXTAG; i++) {
277 delete []pat[i];
278 pat[i] = 0;
279 bopat[i] = NOTFOUND;
280 eopat[i] = NOTFOUND;
281 }
282 }
283
284 bool RESearch::GrabMatches(CharacterIndexer &ci) {
285 bool success = true;
286 for (unsigned int i=0; i<MAXTAG; i++) {
287 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
288 unsigned int len = eopat[i] - bopat[i];
289 pat[i] = new char[len + 1];
290 if (pat[i]) {
291 for (unsigned int j=0; j<len; j++)
292 pat[i][j] = ci.CharAt(bopat[i] + j);
293 pat[i][len] = '\0';
294 } else {
295 success = false;
296 }
297 }
298 }
299 return success;
300 }
301
302 void RESearch::ChSet(char c) {
303 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
304 }
305
306 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
307 if (caseSensitive) {
308 ChSet(c);
309 } else {
310 if ((c >= 'a') && (c <= 'z')) {
311 ChSet(c);
312 ChSet(static_cast<char>(c - 'a' + 'A'));
313 } else if ((c >= 'A') && (c <= 'Z')) {
314 ChSet(c);
315 ChSet(static_cast<char>(c - 'A' + 'a'));
316 } else {
317 ChSet(c);
318 }
319 }
320 }
321
322 const char escapeValue(char ch) {
323 switch (ch) {
324 case 'a': return '\a';
325 case 'b': return '\b';
326 case 'f': return '\f';
327 case 'n': return '\n';
328 case 'r': return '\r';
329 case 't': return '\t';
330 case 'v': return '\v';
331 }
332 return 0;
333 }
334
335 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive) {
336 char *mp=nfa; /* nfa pointer */
337 char *lp; /* saved pointer.. */
338 char *sp=nfa; /* another one.. */
339
340 int tagi = 0; /* tag stack index */
341 int tagc = 1; /* actual tag count */
342
343 int n;
344 char mask; /* xor mask -CCL/NCL */
345 int c1, c2;
346
347 if (!pat || !length)
348 if (sta)
349 return 0;
350 else
351 return badpat("No previous regular expression");
352 sta = NOP;
353
354 const char *p=pat; /* pattern pointer */
355 for (int i=0; i<length; i++, p++) {
356 lp = mp;
357 switch(*p) {
358
359 case '.': /* match any char.. */
360 *mp++ = ANY;
361 break;
362
363 case '^': /* match beginning.. */
364 if (p == pat)
365 *mp++ = BOL;
366 else {
367 *mp++ = CHR;
368 *mp++ = *p;
369 }
370 break;
371
372 case '$': /* match endofline.. */
373 if (!*(p+1))
374 *mp++ = EOL;
375 else {
376 *mp++ = CHR;
377 *mp++ = *p;
378 }
379 break;
380
381 case '[': /* match char class..*/
382 *mp++ = CCL;
383
384 i++;
385 if (*++p == '^') {
386 mask = '\377';
387 i++;
388 p++;
389 } else
390 mask = 0;
391
392 if (*p == '-') { /* real dash */
393 i++;
394 ChSet(*p++);
395 }
396 if (*p == ']') { /* real brace */
397 i++;
398 ChSet(*p++);
399 }
400 while (*p && *p != ']') {
401 if (*p == '-' && *(p+1) && *(p+1) != ']') {
402 i++;
403 p++;
404 c1 = *(p-2) + 1;
405 i++;
406 c2 = *p++;
407 while (c1 <= c2) {
408 ChSetWithCase(static_cast<char>(c1++), caseSensitive);
409 }
410 } else if (*p == '\\' && *(p+1)) {
411 i++;
412 p++;
413 char escape = escapeValue(*p);
414 if (escape)
415 ChSetWithCase(escape, caseSensitive);
416 else
417 ChSetWithCase(*p, caseSensitive);
418 i++;
419 p++;
420 } else {
421 i++;
422 ChSetWithCase(*p++, caseSensitive);
423 }
424 }
425 if (!*p)
426 return badpat("Missing ]");
427
428 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
429 *mp++ = static_cast<char>(mask ^ bittab[n]);
430
431 break;
432
433 case '*': /* match 0 or more.. */
434 case '+': /* match 1 or more.. */
435 if (p == pat)
436 return badpat("Empty closure");
437 lp = sp; /* previous opcode */
438 if (*lp == CLO) /* equivalence.. */
439 break;
440 switch(*lp) {
441
442 case BOL:
443 case BOT:
444 case EOT:
445 case BOW:
446 case EOW:
447 case REF:
448 return badpat("Illegal closure");
449 default:
450 break;
451 }
452
453 if (*p == '+')
454 for (sp = mp; lp < sp; lp++)
455 *mp++ = *lp;
456
457 *mp++ = END;
458 *mp++ = END;
459 sp = mp;
460 while (--mp > lp)
461 *mp = mp[-1];
462 *mp = CLO;
463 mp = sp;
464 break;
465
466 case '\\': /* tags, backrefs .. */
467 i++;
468 switch(*++p) {
469
470 case '(':
471 if (tagc < MAXTAG) {
472 tagstk[++tagi] = tagc;
473 *mp++ = BOT;
474 *mp++ = static_cast<char>(tagc++);
475 }
476 else
477 return badpat("Too many \\(\\) pairs");
478 break;
479 case ')':
480 if (*sp == BOT)
481 return badpat("Null pattern inside \\(\\)");
482 if (tagi > 0) {
483 *mp++ = static_cast<char>(EOT);
484 *mp++ = static_cast<char>(tagstk[tagi--]);
485 }
486 else
487 return badpat("Unmatched \\)");
488 break;
489 case '<':
490 *mp++ = BOW;
491 break;
492 case '>':
493 if (*sp == BOW)
494 return badpat("Null pattern inside \\<\\>");
495 *mp++ = EOW;
496 break;
497 case '1':
498 case '2':
499 case '3':
500 case '4':
501 case '5':
502 case '6':
503 case '7':
504 case '8':
505 case '9':
506 n = *p-'0';
507 if (tagi > 0 && tagstk[tagi] == n)
508 return badpat("Cyclical reference");
509 if (tagc > n) {
510 *mp++ = static_cast<char>(REF);
511 *mp++ = static_cast<char>(n);
512 }
513 else
514 return badpat("Undetermined reference");
515 break;
516 case 'a':
517 case 'b':
518 case 'n':
519 case 'f':
520 case 'r':
521 case 't':
522 case 'v':
523 *mp++ = CHR;
524 *mp++ = escapeValue(*p);
525 break;
526 default:
527 *mp++ = CHR;
528 *mp++ = *p;
529 }
530 break;
531
532 default : /* an ordinary char */
533 if (caseSensitive) {
534 *mp++ = CHR;
535 *mp++ = *p;
536 } else {
537 *mp++ = CCL;
538 mask = 0;
539 ChSetWithCase(*p, false);
540 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
541 *mp++ = static_cast<char>(mask ^ bittab[n]);
542 }
543 break;
544 }
545 sp = lp;
546 }
547 if (tagi > 0)
548 return badpat("Unmatched \\(");
549 *mp = END;
550 sta = OKP;
551 return 0;
552 }
553
554 /*
555 * RESearch::Execute:
556 * execute nfa to find a match.
557 *
558 * special cases: (nfa[0])
559 * BOL
560 * Match only once, starting from the
561 * beginning.
562 * CHR
563 * First locate the character without
564 * calling PMatch, and if found, call
565 * PMatch for the remaining string.
566 * END
567 * RESearch::Compile failed, poor luser did not
568 * check for it. Fail fast.
569 *
570 * If a match is found, bopat[0] and eopat[0] are set
571 * to the beginning and the end of the matched fragment,
572 * respectively.
573 *
574 */
575
576 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
577 char c;
578 int ep = NOTFOUND;
579 char *ap = nfa;
580
581 bol = lp;
582 failure = 0;
583
584 Clear();
585
586 switch(*ap) {
587
588 case BOL: /* anchored: match from BOL only */
589 ep = PMatch(ci, lp, endp, ap);
590 break;
591 case EOL: /* just searching for end of line normal path doesn't work */
592 if (*(ap+1) == END) {
593 lp = endp;
594 ep = lp;
595 break;
596 } else {
597 return 0;
598 }
599 case CHR: /* ordinary char: locate it fast */
600 c = *(ap+1);
601 while ((lp < endp) && (ci.CharAt(lp) != c))
602 lp++;
603 if (lp >= endp) /* if EOS, fail, else fall thru. */
604 return 0;
605 default: /* regular matching all the way. */
606 while (lp < endp) {
607 ep = PMatch(ci, lp, endp, ap);
608 if (ep != NOTFOUND)
609 break;
610 lp++;
611 }
612 break;
613 case END: /* munged automaton. fail always */
614 return 0;
615 }
616 if (ep == NOTFOUND)
617 return 0;
618
619 bopat[0] = lp;
620 eopat[0] = ep;
621 return 1;
622 }
623
624 /*
625 * PMatch: internal routine for the hard part
626 *
627 * This code is partly snarfed from an early grep written by
628 * David Conroy. The backref and tag stuff, and various other
629 * innovations are by oz.
630 *
631 * special case optimizations: (nfa[n], nfa[n+1])
632 * CLO ANY
633 * We KNOW .* will match everything upto the
634 * end of line. Thus, directly go to the end of
635 * line, without recursive PMatch calls. As in
636 * the other closure cases, the remaining pattern
637 * must be matched by moving backwards on the
638 * string recursively, to find a match for xy
639 * (x is ".*" and y is the remaining pattern)
640 * where the match satisfies the LONGEST match for
641 * x followed by a match for y.
642 * CLO CHR
643 * We can again scan the string forward for the
644 * single char and at the point of failure, we
645 * execute the remaining nfa recursively, same as
646 * above.
647 *
648 * At the end of a successful match, bopat[n] and eopat[n]
649 * are set to the beginning and end of subpatterns matched
650 * by tagged expressions (n = 1 to 9).
651 *
652 */
653
654 extern void re_fail(char *,char);
655
656 /*
657 * character classification table for word boundary operators BOW
658 * and EOW. the reason for not using ctype macros is that we can
659 * let the user add into our own table. see RESearch::ModifyWord. This table
660 * is not in the bitset form, since we may wish to extend it in the
661 * future for other character classifications.
662 *
663 * TRUE for 0-9 A-Z a-z _
664 */
665 static char chrtyp[MAXCHR] = {
666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
670 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
671 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
672 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
673 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
674 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
675 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
676 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
677 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
678 1, 1, 1, 0, 0, 0, 0, 0
679 };
680
681 #define inascii(x) (0177&(x))
682 #define iswordc(x) chrtyp[inascii(x)]
683 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
684
685 /*
686 * skip values for CLO XXX to skip past the closure
687 */
688
689 #define ANYSKIP 2 /* [CLO] ANY END ... */
690 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
691 #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
692
693 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
694 int op, c, n;
695 int e; /* extra pointer for CLO */
696 int bp; /* beginning of subpat.. */
697 int ep; /* ending of subpat.. */
698 int are; /* to save the line ptr. */
699
700 while ((op = *ap++) != END)
701 switch(op) {
702
703 case CHR:
704 if (ci.CharAt(lp++) != *ap++)
705 return NOTFOUND;
706 break;
707 case ANY:
708 if (lp++ >= endp)
709 return NOTFOUND;
710 break;
711 case CCL:
712 c = ci.CharAt(lp++);
713 if (!isinset(ap,c))
714 return NOTFOUND;
715 ap += BITBLK;
716 break;
717 case BOL:
718 if (lp != bol)
719 return NOTFOUND;
720 break;
721 case EOL:
722 if (lp < endp)
723 return NOTFOUND;
724 break;
725 case BOT:
726 bopat[*ap++] = lp;
727 break;
728 case EOT:
729 eopat[*ap++] = lp;
730 break;
731 case BOW:
732 if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
733 return NOTFOUND;
734 break;
735 case EOW:
736 if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
737 return NOTFOUND;
738 break;
739 case REF:
740 n = *ap++;
741 bp = bopat[n];
742 ep = eopat[n];
743 while (bp < ep)
744 if (ci.CharAt(bp++) != ci.CharAt(lp++))
745 return NOTFOUND;
746 break;
747 case CLO:
748 are = lp;
749 switch(*ap) {
750
751 case ANY:
752 while (lp < endp)
753 lp++;
754 n = ANYSKIP;
755 break;
756 case CHR:
757 c = *(ap+1);
758 while ((lp < endp) && (c == ci.CharAt(lp)))
759 lp++;
760 n = CHRSKIP;
761 break;
762 case CCL:
763 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
764 lp++;
765 n = CCLSKIP;
766 break;
767 default:
768 failure = true;
769 //re_fail("closure: bad nfa.", *ap);
770 return NOTFOUND;
771 }
772
773 ap += n;
774
775 while (lp >= are) {
776 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
777 return e;
778 --lp;
779 }
780 return NOTFOUND;
781 default:
782 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
783 return NOTFOUND;
784 }
785 return lp;
786 }
787
788 /*
789 * RESearch::ModifyWord:
790 * add new characters into the word table to change RESearch::Execute's
791 * understanding of what a word should look like. Note that we
792 * only accept additions into the word definition.
793 *
794 * If the string parameter is 0 or null string, the table is
795 * reset back to the default containing A-Z a-z 0-9 _. [We use
796 * the compact bitset representation for the default table]
797 */
798
799 static char deftab[16] = {
800 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
801 '\376', '\377', '\377', 007
802 };
803
804 void RESearch::ModifyWord(char *s) {
805 int i;
806
807 if (!s || !*s) {
808 for (i = 0; i < MAXCHR; i++)
809 if (!isinset(deftab,i))
810 iswordc(i) = 0;
811 }
812 else
813 while(*s)
814 iswordc(*s++) = 1;
815 }
816
817 /*
818 * RESearch::Substitute:
819 * substitute the matched portions of the src in dst.
820 *
821 * & substitute the entire matched pattern.
822 *
823 * \digit substitute a subpattern, with the given tag number.
824 * Tags are numbered from 1 to 9. If the particular
825 * tagged subpattern does not exist, null is substituted.
826 */
827 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
828 char c;
829 int pin;
830 int bp;
831 int ep;
832
833 if (!*src || !bopat[0])
834 return 0;
835
836 while ((c = *src++) != 0) {
837 switch(c) {
838
839 case '&':
840 pin = 0;
841 break;
842
843 case '\\':
844 c = *src++;
845 if (c >= '0' && c <= '9') {
846 pin = c - '0';
847 break;
848 }
849
850 default:
851 *dst++ = c;
852 continue;
853 }
854
855 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
856 while (ci.CharAt(bp) && bp < ep)
857 *dst++ = ci.CharAt(bp++);
858 if (bp < ep)
859 return 0;
860 }
861 }
862 *dst = (char) 0;
863 return 1;
864 }