]>
git.saurik.com Git - wxWidgets.git/blob - src/stc/scintilla/src/RESearch.cxx
f176bbf19e02107aac4100a5fc1492f5d4596343
1 // Scintilla source code edit control
3 ** Regular expression search library.
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
30 * Modification history:
33 * Revision 1.5 2002/09/11 00:55:27 RD
34 * Update to Scintilla 1.48
36 * Revision 1.6 2001/04/29 13:32:10 nyamatongwe
37 * Addition of new target methods - versions of ReplaceTarget that take counted
38 * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
39 * series of calls rather than a structure.
40 * Handling of \000 in search and replace.
41 * Handling of /escapes within character ranges of regular expressions.
42 * Some handling of bare ^ and $ regular expressions.
44 * Revision 1.5 2001/04/20 07:36:09 nyamatongwe
45 * Removed DEBUG code that failed to compile on GTK+.
47 * Revision 1.4 2001/04/13 03:52:13 nyamatongwe
48 * Added URL to find original code to comments.
50 * Revision 1.3 2001/04/06 12:24:21 nyamatongwe
51 * Made regular expression searching work on a line by line basis, made ^ and
52 * $ work, made [set] work, and added a case insensitive option.
54 * Revision 1.2 2001/04/05 01:58:04 nyamatongwe
55 * Replace target functionality to make find and replace operations faster
56 * by diminishing screen updates and allow for \d patterns in the replacement
59 * Revision 1.1 2001/04/04 12:52:44 nyamatongwe
60 * Moved to public domain regular expresion implementation.
62 * Revision 1.4 1991/10/17 03:56:42 oz
63 * miscellaneous changes, small cleanups etc.
65 * Revision 1.3 1989/04/01 14:18:09 oz
66 * Change all references to a dfa: this is actually an nfa.
68 * Revision 1.2 88/08/28 15:36:04 oz
69 * Use a complement bitmap to represent NCL.
70 * This removes the need to have seperate
71 * code in the PMatch case block - it is
74 * Use the actual CCL code in the CLO
75 * section of PMatch. No need for a recursive
78 * Use a bitmap table to set char bits in an
82 * RESearch::Compile: compile a regular expression into a NFA.
84 * char *RESearch::Compile(s)
87 * RESearch::Execute: execute the NFA to match a pattern.
89 * int RESearch::Execute(s)
92 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
93 * looks like (for \< and \>) by adding into the
94 * hidden word-syntax table.
96 * void RESearch::ModifyWord(s)
99 * RESearch::Substitute: substitute the matched portions in a new string.
101 * int RESearch::Substitute(src, dst)
105 * re_fail: failure routine for RESearch::Execute.
107 * void re_fail(msg, op)
111 * Regular Expressions:
113 * [1] char matches itself, unless it is a special
114 * character (metachar): . \ [ ] * + ^ $
116 * [2] . matches any character.
118 * [3] \ matches the character following it, except
119 * when followed by a left or right round bracket,
120 * a digit 1 to 9 or a left or right angle bracket.
121 * (see [7], [8] and [9])
122 * It is used as an escape character for all
123 * other meta-characters, and itself. When used
124 * in a set ([4]), it is treated as an ordinary
127 * [4] [set] matches one of the characters in the set.
128 * If the first character in the set is "^",
129 * it matches a character NOT in the set, i.e.
130 * complements the set. A shorthand S-E is
131 * used to specify a set of characters S upto
132 * E, inclusive. The special characters "]" and
133 * "-" have no special meaning if they appear
134 * as the first chars in the set.
137 * [a-z] any lowercase alpha
139 * [^]-] any char except ] and -
141 * [^A-Z] any char except uppercase
146 * [5] * any regular expression form [1] to [4], followed by
147 * closure char (*) matches zero or more matches of
150 * [6] + same as [5], except it matches one or more.
152 * [7] a regular expression in the form [1] to [10], enclosed
153 * as \(form\) matches what form matches. The enclosure
154 * creates a set of tags, used for [8] and for
155 * pattern substution. The tagged forms are numbered
158 * [8] a \ followed by a digit 1 to 9 matches whatever a
159 * previously tagged regular expression ([7]) matched.
161 * [9] \< a regular expression starting with a \< construct
162 * \> and/or ending with a \> construct, restricts the
163 * pattern matching to the beginning of a word, and/or
164 * the end of a word. A word is defined to be a character
165 * string beginning and/or ending with the characters
166 * A-Z a-z 0-9 and _. It must also be preceded and/or
167 * followed by any character outside those mentioned.
169 * [10] a composite regular expression xy where x and y
170 * are in the form [1] to [10] matches the longest
171 * match of x followed by a match for y.
173 * [11] ^ a regular expression starting with a ^ character
174 * $ and/or ending with a $ character, restricts the
175 * pattern matching to the beginning of the line,
176 * or the end of line. [anchors] Elsewhere in the
177 * pattern, ^ and $ are treated as ordinary characters.
182 * HCR's Hugh Redelmeier has been most helpful in various
183 * stages of development. He convinced me to include BOW
184 * and EOW constructs, originally invented by Rob Pike at
185 * the University of Toronto.
188 * Software tools Kernighan & Plauger
189 * Software tools in Pascal Kernighan & Plauger
190 * Grep [rsx-11 C dist] David Conroy
191 * ed - text editor Un*x Programmer's Manual
192 * Advanced editing on Un*x B. W. Kernighan
193 * RegExp routines Henry Spencer
197 * This implementation uses a bit-set representation for character
198 * classes for speed and compactness. Each character is represented
199 * by one bit in a 128-bit block. Thus, CCL always takes a
200 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
201 * bit comparison to locate the character in the set.
206 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
207 * matches: fo foo fooo foobar fobar foxx ...
209 * pattern: fo[ob]a[rz]
210 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
211 * matches: fobar fooar fobaz fooaz
214 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
215 * matches: foo\ foo\\ foo\\\ ...
217 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
218 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
219 * matches: foo1foo foo2foo foo3foo
221 * pattern: \(fo.*\)-\1
222 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
223 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
226 #include "RESearch.h"
246 * The following defines are not meant to be changeable.
247 * They are for readability only.
254 const char bitarr
[] = {1,2,4,8,16,32,64,'\200'};
256 #define badpat(x) (*nfa = END, x)
258 RESearch::RESearch() {
262 RESearch::~RESearch() {
266 void RESearch::Init() {
267 sta
= NOP
; /* status of lastpat */
269 for (int i
=0; i
<MAXTAG
; i
++)
271 for (int j
=0; j
<BITBLK
; j
++)
275 void RESearch::Clear() {
276 for (int i
=0; i
<MAXTAG
; i
++) {
284 bool RESearch::GrabMatches(CharacterIndexer
&ci
) {
286 for (unsigned int i
=0; i
<MAXTAG
; i
++) {
287 if ((bopat
[i
] != NOTFOUND
) && (eopat
[i
] != NOTFOUND
)) {
288 unsigned int len
= eopat
[i
] - bopat
[i
];
289 pat
[i
] = new char[len
+ 1];
291 for (unsigned int j
=0; j
<len
; j
++)
292 pat
[i
][j
] = ci
.CharAt(bopat
[i
] + j
);
302 void RESearch::ChSet(char c
) {
303 bittab
[((c
) & BLKIND
) >> 3] |= bitarr
[(c
) & BITIND
];
306 void RESearch::ChSetWithCase(char c
, bool caseSensitive
) {
310 if ((c
>= 'a') && (c
<= 'z')) {
312 ChSet(static_cast<char>(c
- 'a' + 'A'));
313 } else if ((c
>= 'A') && (c
<= 'Z')) {
315 ChSet(static_cast<char>(c
- 'A' + 'a'));
322 const char escapeValue(char ch
) {
324 case 'a': return '\a';
325 case 'b': return '\b';
326 case 'f': return '\f';
327 case 'n': return '\n';
328 case 'r': return '\r';
329 case 't': return '\t';
330 case 'v': return '\v';
335 const char *RESearch::Compile(const char *pat
, int length
, bool caseSensitive
) {
336 char *mp
=nfa
; /* nfa pointer */
337 char *lp
; /* saved pointer.. */
338 char *sp
=nfa
; /* another one.. */
340 int tagi
= 0; /* tag stack index */
341 int tagc
= 1; /* actual tag count */
344 char mask
; /* xor mask -CCL/NCL */
351 return badpat("No previous regular expression");
354 const char *p
=pat
; /* pattern pointer */
355 for (int i
=0; i
<length
; i
++, p
++) {
359 case '.': /* match any char.. */
363 case '^': /* match beginning.. */
372 case '$': /* match endofline.. */
381 case '[': /* match char class..*/
392 if (*p
== '-') { /* real dash */
396 if (*p
== ']') { /* real brace */
400 while (*p
&& *p
!= ']') {
401 if (*p
== '-' && *(p
+1) && *(p
+1) != ']') {
408 ChSetWithCase(static_cast<char>(c1
++), caseSensitive
);
410 } else if (*p
== '\\' && *(p
+1)) {
413 char escape
= escapeValue(*p
);
415 ChSetWithCase(escape
, caseSensitive
);
417 ChSetWithCase(*p
, caseSensitive
);
422 ChSetWithCase(*p
++, caseSensitive
);
426 return badpat("Missing ]");
428 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
429 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
433 case '*': /* match 0 or more.. */
434 case '+': /* match 1 or more.. */
436 return badpat("Empty closure");
437 lp
= sp
; /* previous opcode */
438 if (*lp
== CLO
) /* equivalence.. */
448 return badpat("Illegal closure");
454 for (sp
= mp
; lp
< sp
; lp
++)
466 case '\\': /* tags, backrefs .. */
472 tagstk
[++tagi
] = tagc
;
474 *mp
++ = static_cast<char>(tagc
++);
477 return badpat("Too many \\(\\) pairs");
481 return badpat("Null pattern inside \\(\\)");
483 *mp
++ = static_cast<char>(EOT
);
484 *mp
++ = static_cast<char>(tagstk
[tagi
--]);
487 return badpat("Unmatched \\)");
494 return badpat("Null pattern inside \\<\\>");
507 if (tagi
> 0 && tagstk
[tagi
] == n
)
508 return badpat("Cyclical reference");
510 *mp
++ = static_cast<char>(REF
);
511 *mp
++ = static_cast<char>(n
);
514 return badpat("Undetermined reference");
524 *mp
++ = escapeValue(*p
);
532 default : /* an ordinary char */
539 ChSetWithCase(*p
, false);
540 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
541 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
548 return badpat("Unmatched \\(");
556 * execute nfa to find a match.
558 * special cases: (nfa[0])
560 * Match only once, starting from the
563 * First locate the character without
564 * calling PMatch, and if found, call
565 * PMatch for the remaining string.
567 * RESearch::Compile failed, poor luser did not
568 * check for it. Fail fast.
570 * If a match is found, bopat[0] and eopat[0] are set
571 * to the beginning and the end of the matched fragment,
576 int RESearch::Execute(CharacterIndexer
&ci
, int lp
, int endp
) {
588 case BOL
: /* anchored: match from BOL only */
589 ep
= PMatch(ci
, lp
, endp
, ap
);
591 case EOL
: /* just searching for end of line normal path doesn't work */
592 if (*(ap
+1) == END
) {
599 case CHR
: /* ordinary char: locate it fast */
601 while ((lp
< endp
) && (ci
.CharAt(lp
) != c
))
603 if (lp
>= endp
) /* if EOS, fail, else fall thru. */
605 default: /* regular matching all the way. */
607 ep
= PMatch(ci
, lp
, endp
, ap
);
613 case END
: /* munged automaton. fail always */
625 * PMatch: internal routine for the hard part
627 * This code is partly snarfed from an early grep written by
628 * David Conroy. The backref and tag stuff, and various other
629 * innovations are by oz.
631 * special case optimizations: (nfa[n], nfa[n+1])
633 * We KNOW .* will match everything upto the
634 * end of line. Thus, directly go to the end of
635 * line, without recursive PMatch calls. As in
636 * the other closure cases, the remaining pattern
637 * must be matched by moving backwards on the
638 * string recursively, to find a match for xy
639 * (x is ".*" and y is the remaining pattern)
640 * where the match satisfies the LONGEST match for
641 * x followed by a match for y.
643 * We can again scan the string forward for the
644 * single char and at the point of failure, we
645 * execute the remaining nfa recursively, same as
648 * At the end of a successful match, bopat[n] and eopat[n]
649 * are set to the beginning and end of subpatterns matched
650 * by tagged expressions (n = 1 to 9).
654 extern void re_fail(char *,char);
657 * character classification table for word boundary operators BOW
658 * and EOW. the reason for not using ctype macros is that we can
659 * let the user add into our own table. see RESearch::ModifyWord. This table
660 * is not in the bitset form, since we may wish to extend it in the
661 * future for other character classifications.
663 * TRUE for 0-9 A-Z a-z _
665 static char chrtyp
[MAXCHR
] = {
666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
670 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
671 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
672 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
673 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
674 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
675 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
676 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
677 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
678 1, 1, 1, 0, 0, 0, 0, 0
681 #define inascii(x) (0177&(x))
682 #define iswordc(x) chrtyp[inascii(x)]
683 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
686 * skip values for CLO XXX to skip past the closure
689 #define ANYSKIP 2 /* [CLO] ANY END ... */
690 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
691 #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
693 int RESearch::PMatch(CharacterIndexer
&ci
, int lp
, int endp
, char *ap
) {
695 int e
; /* extra pointer for CLO */
696 int bp
; /* beginning of subpat.. */
697 int ep
; /* ending of subpat.. */
698 int are
; /* to save the line ptr. */
700 while ((op
= *ap
++) != END
)
704 if (ci
.CharAt(lp
++) != *ap
++)
732 if (lp
!=bol
&& iswordc(ci
.CharAt(lp
-1)) || !iswordc(ci
.CharAt(lp
)))
736 if (lp
==bol
|| !iswordc(ci
.CharAt(lp
-1)) || iswordc(ci
.CharAt(lp
)))
744 if (ci
.CharAt(bp
++) != ci
.CharAt(lp
++))
758 while ((lp
< endp
) && (c
== ci
.CharAt(lp
)))
763 while ((lp
< endp
) && isinset(ap
+1,ci
.CharAt(lp
)))
769 //re_fail("closure: bad nfa.", *ap);
776 if ((e
= PMatch(ci
, lp
, endp
, ap
)) != NOTFOUND
)
782 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
789 * RESearch::ModifyWord:
790 * add new characters into the word table to change RESearch::Execute's
791 * understanding of what a word should look like. Note that we
792 * only accept additions into the word definition.
794 * If the string parameter is 0 or null string, the table is
795 * reset back to the default containing A-Z a-z 0-9 _. [We use
796 * the compact bitset representation for the default table]
799 static char deftab
[16] = {
800 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
801 '\376', '\377', '\377', 007
804 void RESearch::ModifyWord(char *s
) {
808 for (i
= 0; i
< MAXCHR
; i
++)
809 if (!isinset(deftab
,i
))
818 * RESearch::Substitute:
819 * substitute the matched portions of the src in dst.
821 * & substitute the entire matched pattern.
823 * \digit substitute a subpattern, with the given tag number.
824 * Tags are numbered from 1 to 9. If the particular
825 * tagged subpattern does not exist, null is substituted.
827 int RESearch::Substitute(CharacterIndexer
&ci
, char *src
, char *dst
) {
833 if (!*src
|| !bopat
[0])
836 while ((c
= *src
++) != 0) {
845 if (c
>= '0' && c
<= '9') {
855 if ((bp
= bopat
[pin
]) != 0 && (ep
= eopat
[pin
]) != 0) {
856 while (ci
.CharAt(bp
) && bp
< ep
)
857 *dst
++ = ci
.CharAt(bp
++);