]>
git.saurik.com Git - wxWidgets.git/blob - contrib/src/stc/scintilla/src/RESearch.cxx
656570a24b9b2309c765f7756eb1e07e8f4b0faa
1 // Scintilla source code edit control
3 ** Regular expression search library.
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
30 * Modification history:
33 * Revision 1.6 2003/04/19 19:59:49 RD
34 * Updated Scintilla to 1.52 (on the trunk this time too)
36 * Revision 1.9 2003/03/21 10:36:08 nyamatongwe
37 * Detect patterns too long in regular expression search.
39 * Revision 1.8 2003/03/04 10:53:59 nyamatongwe
40 * Patch from Jakub to optionally implement more POSIX compatible regular
41 * expressions. \(..\) changes to (..)
42 * Fixes problem where find previous would not find earlier matches on same
45 * Revision 1.8 2003/03/03 20:12:56 vrana
48 * Revision 1.7 2002/09/28 00:33:28 nyamatongwe
49 * Fixed problem with character ranges caused by expansion to 8 bits.
51 * Revision 1.6 2001/04/29 13:32:10 nyamatongwe
52 * Addition of new target methods - versions of ReplaceTarget that take counted
53 * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
54 * series of calls rather than a structure.
55 * Handling of \000 in search and replace.
56 * Handling of /escapes within character ranges of regular expressions.
57 * Some handling of bare ^ and $ regular expressions.
59 * Revision 1.5 2001/04/20 07:36:09 nyamatongwe
60 * Removed DEBUG code that failed to compile on GTK+.
62 * Revision 1.4 2001/04/13 03:52:13 nyamatongwe
63 * Added URL to find original code to comments.
65 * Revision 1.3 2001/04/06 12:24:21 nyamatongwe
66 * Made regular expression searching work on a line by line basis, made ^ and
67 * $ work, made [set] work, and added a case insensitive option.
69 * Revision 1.2 2001/04/05 01:58:04 nyamatongwe
70 * Replace target functionality to make find and replace operations faster
71 * by diminishing screen updates and allow for \d patterns in the replacement
74 * Revision 1.1 2001/04/04 12:52:44 nyamatongwe
75 * Moved to public domain regular expresion implementation.
77 * Revision 1.4 1991/10/17 03:56:42 oz
78 * miscellaneous changes, small cleanups etc.
80 * Revision 1.3 1989/04/01 14:18:09 oz
81 * Change all references to a dfa: this is actually an nfa.
83 * Revision 1.2 88/08/28 15:36:04 oz
84 * Use a complement bitmap to represent NCL.
85 * This removes the need to have seperate
86 * code in the PMatch case block - it is
89 * Use the actual CCL code in the CLO
90 * section of PMatch. No need for a recursive
93 * Use a bitmap table to set char bits in an
97 * RESearch::Compile: compile a regular expression into a NFA.
99 * char *RESearch::Compile(s)
102 * RESearch::Execute: execute the NFA to match a pattern.
104 * int RESearch::Execute(s)
107 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
108 * looks like (for \< and \>) by adding into the
109 * hidden word-syntax table.
111 * void RESearch::ModifyWord(s)
114 * RESearch::Substitute: substitute the matched portions in a new string.
116 * int RESearch::Substitute(src, dst)
120 * re_fail: failure routine for RESearch::Execute.
122 * void re_fail(msg, op)
126 * Regular Expressions:
128 * [1] char matches itself, unless it is a special
129 * character (metachar): . \ [ ] * + ^ $
131 * [2] . matches any character.
133 * [3] \ matches the character following it, except
134 * when followed by a left or right round bracket,
135 * a digit 1 to 9 or a left or right angle bracket.
136 * (see [7], [8] and [9])
137 * It is used as an escape character for all
138 * other meta-characters, and itself. When used
139 * in a set ([4]), it is treated as an ordinary
142 * [4] [set] matches one of the characters in the set.
143 * If the first character in the set is "^",
144 * it matches a character NOT in the set, i.e.
145 * complements the set. A shorthand S-E is
146 * used to specify a set of characters S upto
147 * E, inclusive. The special characters "]" and
148 * "-" have no special meaning if they appear
149 * as the first chars in the set.
152 * [a-z] any lowercase alpha
154 * [^]-] any char except ] and -
156 * [^A-Z] any char except uppercase
161 * [5] * any regular expression form [1] to [4], followed by
162 * closure char (*) matches zero or more matches of
165 * [6] + same as [5], except it matches one or more.
167 * [7] a regular expression in the form [1] to [10], enclosed
168 * as \(form\) matches what form matches. The enclosure
169 * creates a set of tags, used for [8] and for
170 * pattern substution. The tagged forms are numbered
173 * [8] a \ followed by a digit 1 to 9 matches whatever a
174 * previously tagged regular expression ([7]) matched.
176 * [9] \< a regular expression starting with a \< construct
177 * \> and/or ending with a \> construct, restricts the
178 * pattern matching to the beginning of a word, and/or
179 * the end of a word. A word is defined to be a character
180 * string beginning and/or ending with the characters
181 * A-Z a-z 0-9 and _. It must also be preceded and/or
182 * followed by any character outside those mentioned.
184 * [10] a composite regular expression xy where x and y
185 * are in the form [1] to [10] matches the longest
186 * match of x followed by a match for y.
188 * [11] ^ a regular expression starting with a ^ character
189 * $ and/or ending with a $ character, restricts the
190 * pattern matching to the beginning of the line,
191 * or the end of line. [anchors] Elsewhere in the
192 * pattern, ^ and $ are treated as ordinary characters.
197 * HCR's Hugh Redelmeier has been most helpful in various
198 * stages of development. He convinced me to include BOW
199 * and EOW constructs, originally invented by Rob Pike at
200 * the University of Toronto.
203 * Software tools Kernighan & Plauger
204 * Software tools in Pascal Kernighan & Plauger
205 * Grep [rsx-11 C dist] David Conroy
206 * ed - text editor Un*x Programmer's Manual
207 * Advanced editing on Un*x B. W. Kernighan
208 * RegExp routines Henry Spencer
212 * This implementation uses a bit-set representation for character
213 * classes for speed and compactness. Each character is represented
214 * by one bit in a 128-bit block. Thus, CCL always takes a
215 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
216 * bit comparison to locate the character in the set.
221 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
222 * matches: fo foo fooo foobar fobar foxx ...
224 * pattern: fo[ob]a[rz]
225 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
226 * matches: fobar fooar fobaz fooaz
229 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
230 * matches: foo\ foo\\ foo\\\ ...
232 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
233 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
234 * matches: foo1foo foo2foo foo3foo
236 * pattern: \(fo.*\)-\1
237 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
238 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
241 #include "RESearch.h"
261 * The following defines are not meant to be changeable.
262 * They are for readability only.
269 const char bitarr
[] = {1,2,4,8,16,32,64,'\200'};
271 #define badpat(x) (*nfa = END, x)
273 RESearch::RESearch() {
277 RESearch::~RESearch() {
281 void RESearch::Init() {
282 sta
= NOP
; /* status of lastpat */
284 for (int i
=0; i
<MAXTAG
; i
++)
286 for (int j
=0; j
<BITBLK
; j
++)
290 void RESearch::Clear() {
291 for (int i
=0; i
<MAXTAG
; i
++) {
299 bool RESearch::GrabMatches(CharacterIndexer
&ci
) {
301 for (unsigned int i
=0; i
<MAXTAG
; i
++) {
302 if ((bopat
[i
] != NOTFOUND
) && (eopat
[i
] != NOTFOUND
)) {
303 unsigned int len
= eopat
[i
] - bopat
[i
];
304 pat
[i
] = new char[len
+ 1];
306 for (unsigned int j
=0; j
<len
; j
++)
307 pat
[i
][j
] = ci
.CharAt(bopat
[i
] + j
);
317 void RESearch::ChSet(char c
) {
318 bittab
[((c
) & BLKIND
) >> 3] |= bitarr
[(c
) & BITIND
];
321 void RESearch::ChSetWithCase(char c
, bool caseSensitive
) {
325 if ((c
>= 'a') && (c
<= 'z')) {
327 ChSet(static_cast<char>(c
- 'a' + 'A'));
328 } else if ((c
>= 'A') && (c
<= 'Z')) {
330 ChSet(static_cast<char>(c
- 'A' + 'a'));
337 const char escapeValue(char ch
) {
339 case 'a': return '\a';
340 case 'b': return '\b';
341 case 'f': return '\f';
342 case 'n': return '\n';
343 case 'r': return '\r';
344 case 't': return '\t';
345 case 'v': return '\v';
350 const char *RESearch::Compile(const char *pat
, int length
, bool caseSensitive
, bool posix
) {
351 char *mp
=nfa
; /* nfa pointer */
352 char *lp
; /* saved pointer.. */
353 char *sp
=nfa
; /* another one.. */
354 char *mpMax
= mp
+ MAXNFA
- BITBLK
- 10;
356 int tagi
= 0; /* tag stack index */
357 int tagc
= 1; /* actual tag count */
360 char mask
; /* xor mask -CCL/NCL */
367 return badpat("No previous regular expression");
370 const char *p
=pat
; /* pattern pointer */
371 for (int i
=0; i
<length
; i
++, p
++) {
373 return badpat("Pattern too long");
377 case '.': /* match any char.. */
381 case '^': /* match beginning.. */
390 case '$': /* match endofline.. */
399 case '[': /* match char class..*/
410 if (*p
== '-') { /* real dash */
414 if (*p
== ']') { /* real brace */
418 while (*p
&& *p
!= ']') {
419 if (*p
== '-' && *(p
+1) && *(p
+1) != ']') {
426 ChSetWithCase(static_cast<char>(c1
++), caseSensitive
);
428 } else if (*p
== '\\' && *(p
+1)) {
431 char escape
= escapeValue(*p
);
433 ChSetWithCase(escape
, caseSensitive
);
435 ChSetWithCase(*p
, caseSensitive
);
440 ChSetWithCase(*p
++, caseSensitive
);
444 return badpat("Missing ]");
446 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
447 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
451 case '*': /* match 0 or more.. */
452 case '+': /* match 1 or more.. */
454 return badpat("Empty closure");
455 lp
= sp
; /* previous opcode */
456 if (*lp
== CLO
) /* equivalence.. */
466 return badpat("Illegal closure");
472 for (sp
= mp
; lp
< sp
; lp
++)
484 case '\\': /* tags, backrefs .. */
493 return badpat("Null pattern inside \\<\\>");
506 if (tagi
> 0 && tagstk
[tagi
] == n
)
507 return badpat("Cyclical reference");
509 *mp
++ = static_cast<char>(REF
);
510 *mp
++ = static_cast<char>(n
);
513 return badpat("Undetermined reference");
523 *mp
++ = escapeValue(*p
);
526 if (!posix
&& *p
== '(') {
528 tagstk
[++tagi
] = tagc
;
530 *mp
++ = static_cast<char>(tagc
++);
533 return badpat("Too many \\(\\) pairs");
534 } else if (!posix
&& *p
== ')') {
536 return badpat("Null pattern inside \\(\\)");
538 *mp
++ = static_cast<char>(EOT
);
539 *mp
++ = static_cast<char>(tagstk
[tagi
--]);
542 return badpat("Unmatched \\)");
550 default : /* an ordinary char */
551 if (posix
&& *p
== '(') {
553 tagstk
[++tagi
] = tagc
;
555 *mp
++ = static_cast<char>(tagc
++);
558 return badpat("Too many () pairs");
559 } else if (posix
&& *p
== ')') {
561 return badpat("Null pattern inside ()");
563 *mp
++ = static_cast<char>(EOT
);
564 *mp
++ = static_cast<char>(tagstk
[tagi
--]);
567 return badpat("Unmatched )");
568 } else if (caseSensitive
) {
574 ChSetWithCase(*p
, false);
575 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
576 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
583 return badpat((posix
? "Unmatched (" : "Unmatched \\("));
591 * execute nfa to find a match.
593 * special cases: (nfa[0])
595 * Match only once, starting from the
598 * First locate the character without
599 * calling PMatch, and if found, call
600 * PMatch for the remaining string.
602 * RESearch::Compile failed, poor luser did not
603 * check for it. Fail fast.
605 * If a match is found, bopat[0] and eopat[0] are set
606 * to the beginning and the end of the matched fragment,
611 int RESearch::Execute(CharacterIndexer
&ci
, int lp
, int endp
) {
623 case BOL
: /* anchored: match from BOL only */
624 ep
= PMatch(ci
, lp
, endp
, ap
);
626 case EOL
: /* just searching for end of line normal path doesn't work */
627 if (*(ap
+1) == END
) {
634 case CHR
: /* ordinary char: locate it fast */
636 while ((lp
< endp
) && (ci
.CharAt(lp
) != c
))
638 if (lp
>= endp
) /* if EOS, fail, else fall thru. */
640 default: /* regular matching all the way. */
642 ep
= PMatch(ci
, lp
, endp
, ap
);
648 case END
: /* munged automaton. fail always */
660 * PMatch: internal routine for the hard part
662 * This code is partly snarfed from an early grep written by
663 * David Conroy. The backref and tag stuff, and various other
664 * innovations are by oz.
666 * special case optimizations: (nfa[n], nfa[n+1])
668 * We KNOW .* will match everything upto the
669 * end of line. Thus, directly go to the end of
670 * line, without recursive PMatch calls. As in
671 * the other closure cases, the remaining pattern
672 * must be matched by moving backwards on the
673 * string recursively, to find a match for xy
674 * (x is ".*" and y is the remaining pattern)
675 * where the match satisfies the LONGEST match for
676 * x followed by a match for y.
678 * We can again scan the string forward for the
679 * single char and at the point of failure, we
680 * execute the remaining nfa recursively, same as
683 * At the end of a successful match, bopat[n] and eopat[n]
684 * are set to the beginning and end of subpatterns matched
685 * by tagged expressions (n = 1 to 9).
689 extern void re_fail(char *,char);
692 * character classification table for word boundary operators BOW
693 * and EOW. the reason for not using ctype macros is that we can
694 * let the user add into our own table. see RESearch::ModifyWord. This table
695 * is not in the bitset form, since we may wish to extend it in the
696 * future for other character classifications.
698 * TRUE for 0-9 A-Z a-z _
700 static char chrtyp
[MAXCHR
] = {
701 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
702 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
703 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
704 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
705 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
706 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
707 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
708 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
709 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
710 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
711 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
712 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
713 1, 1, 1, 0, 0, 0, 0, 0
716 #define inascii(x) (0177&(x))
717 #define iswordc(x) chrtyp[inascii(x)]
718 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
721 * skip values for CLO XXX to skip past the closure
724 #define ANYSKIP 2 /* [CLO] ANY END ... */
725 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
726 #define CCLSKIP 34 /* [CLO] CCL 32bytes END ... */
728 int RESearch::PMatch(CharacterIndexer
&ci
, int lp
, int endp
, char *ap
) {
730 int e
; /* extra pointer for CLO */
731 int bp
; /* beginning of subpat.. */
732 int ep
; /* ending of subpat.. */
733 int are
; /* to save the line ptr. */
735 while ((op
= *ap
++) != END
)
739 if (ci
.CharAt(lp
++) != *ap
++)
767 if (lp
!=bol
&& iswordc(ci
.CharAt(lp
-1)) || !iswordc(ci
.CharAt(lp
)))
771 if (lp
==bol
|| !iswordc(ci
.CharAt(lp
-1)) || iswordc(ci
.CharAt(lp
)))
779 if (ci
.CharAt(bp
++) != ci
.CharAt(lp
++))
793 while ((lp
< endp
) && (c
== ci
.CharAt(lp
)))
798 while ((lp
< endp
) && isinset(ap
+1,ci
.CharAt(lp
)))
804 //re_fail("closure: bad nfa.", *ap);
811 if ((e
= PMatch(ci
, lp
, endp
, ap
)) != NOTFOUND
)
817 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
824 * RESearch::ModifyWord:
825 * add new characters into the word table to change RESearch::Execute's
826 * understanding of what a word should look like. Note that we
827 * only accept additions into the word definition.
829 * If the string parameter is 0 or null string, the table is
830 * reset back to the default containing A-Z a-z 0-9 _. [We use
831 * the compact bitset representation for the default table]
834 static char deftab
[16] = {
835 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
836 '\376', '\377', '\377', 007
839 void RESearch::ModifyWord(char *s
) {
843 for (i
= 0; i
< MAXCHR
; i
++)
844 if (!isinset(deftab
,i
))
853 * RESearch::Substitute:
854 * substitute the matched portions of the src in dst.
856 * & substitute the entire matched pattern.
858 * \digit substitute a subpattern, with the given tag number.
859 * Tags are numbered from 1 to 9. If the particular
860 * tagged subpattern does not exist, null is substituted.
862 int RESearch::Substitute(CharacterIndexer
&ci
, char *src
, char *dst
) {
868 if (!*src
|| !bopat
[0])
871 while ((c
= *src
++) != 0) {
880 if (c
>= '0' && c
<= '9') {
890 if ((bp
= bopat
[pin
]) != 0 && (ep
= eopat
[pin
]) != 0) {
891 while (ci
.CharAt(bp
) && bp
< ep
)
892 *dst
++ = ci
.CharAt(bp
++);