]>
git.saurik.com Git - wxWidgets.git/blob - contrib/src/stc/scintilla/src/RESearch.cxx
07534db1ef020406ab7185bfc8161defe90d273b
1 // Scintilla source code edit control
3 ** Regular expression search library.
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
30 * Modification history:
33 * Revision 1.1 2001/09/01 03:05:24 RD
34 * Upgraded to version 1.39 of Scintilla, and upated wxStyledTextCtrl
37 * Revision 1.6 2001/04/29 13:32:10 nyamatongwe
38 * Addition of new target methods - versions of ReplaceTarget that take counted
39 * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
40 * series of calls rather than a structure.
41 * Handling of \000 in search and replace.
42 * Handling of /escapes within character ranges of regular expressions.
43 * Some handling of bare ^ and $ regular expressions.
45 * Revision 1.5 2001/04/20 07:36:09 nyamatongwe
46 * Removed DEBUG code that failed to compile on GTK+.
48 * Revision 1.4 2001/04/13 03:52:13 nyamatongwe
49 * Added URL to find original code to comments.
51 * Revision 1.3 2001/04/06 12:24:21 nyamatongwe
52 * Made regular expression searching work on a line by line basis, made ^ and
53 * $ work, made [set] work, and added a case insensitive option.
55 * Revision 1.2 2001/04/05 01:58:04 nyamatongwe
56 * Replace target functionality to make find and replace operations faster
57 * by diminishing screen updates and allow for \d patterns in the replacement
60 * Revision 1.1 2001/04/04 12:52:44 nyamatongwe
61 * Moved to public domain regular expresion implementation.
63 * Revision 1.4 1991/10/17 03:56:42 oz
64 * miscellaneous changes, small cleanups etc.
66 * Revision 1.3 1989/04/01 14:18:09 oz
67 * Change all references to a dfa: this is actually an nfa.
69 * Revision 1.2 88/08/28 15:36:04 oz
70 * Use a complement bitmap to represent NCL.
71 * This removes the need to have seperate
72 * code in the PMatch case block - it is
75 * Use the actual CCL code in the CLO
76 * section of PMatch. No need for a recursive
79 * Use a bitmap table to set char bits in an
83 * RESearch::Compile: compile a regular expression into a NFA.
85 * char *RESearch::Compile(s)
88 * RESearch::Execute: execute the NFA to match a pattern.
90 * int RESearch::Execute(s)
93 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
94 * looks like (for \< and \>) by adding into the
95 * hidden word-syntax table.
97 * void RESearch::ModifyWord(s)
100 * RESearch::Substitute: substitute the matched portions in a new string.
102 * int RESearch::Substitute(src, dst)
106 * re_fail: failure routine for RESearch::Execute.
108 * void re_fail(msg, op)
112 * Regular Expressions:
114 * [1] char matches itself, unless it is a special
115 * character (metachar): . \ [ ] * + ^ $
117 * [2] . matches any character.
119 * [3] \ matches the character following it, except
120 * when followed by a left or right round bracket,
121 * a digit 1 to 9 or a left or right angle bracket.
122 * (see [7], [8] and [9])
123 * It is used as an escape character for all
124 * other meta-characters, and itself. When used
125 * in a set ([4]), it is treated as an ordinary
128 * [4] [set] matches one of the characters in the set.
129 * If the first character in the set is "^",
130 * it matches a character NOT in the set, i.e.
131 * complements the set. A shorthand S-E is
132 * used to specify a set of characters S upto
133 * E, inclusive. The special characters "]" and
134 * "-" have no special meaning if they appear
135 * as the first chars in the set.
138 * [a-z] any lowercase alpha
140 * [^]-] any char except ] and -
142 * [^A-Z] any char except uppercase
147 * [5] * any regular expression form [1] to [4], followed by
148 * closure char (*) matches zero or more matches of
151 * [6] + same as [5], except it matches one or more.
153 * [7] a regular expression in the form [1] to [10], enclosed
154 * as \(form\) matches what form matches. The enclosure
155 * creates a set of tags, used for [8] and for
156 * pattern substution. The tagged forms are numbered
159 * [8] a \ followed by a digit 1 to 9 matches whatever a
160 * previously tagged regular expression ([7]) matched.
162 * [9] \< a regular expression starting with a \< construct
163 * \> and/or ending with a \> construct, restricts the
164 * pattern matching to the beginning of a word, and/or
165 * the end of a word. A word is defined to be a character
166 * string beginning and/or ending with the characters
167 * A-Z a-z 0-9 and _. It must also be preceded and/or
168 * followed by any character outside those mentioned.
170 * [10] a composite regular expression xy where x and y
171 * are in the form [1] to [10] matches the longest
172 * match of x followed by a match for y.
174 * [11] ^ a regular expression starting with a ^ character
175 * $ and/or ending with a $ character, restricts the
176 * pattern matching to the beginning of the line,
177 * or the end of line. [anchors] Elsewhere in the
178 * pattern, ^ and $ are treated as ordinary characters.
183 * HCR's Hugh Redelmeier has been most helpful in various
184 * stages of development. He convinced me to include BOW
185 * and EOW constructs, originally invented by Rob Pike at
186 * the University of Toronto.
189 * Software tools Kernighan & Plauger
190 * Software tools in Pascal Kernighan & Plauger
191 * Grep [rsx-11 C dist] David Conroy
192 * ed - text editor Un*x Programmer's Manual
193 * Advanced editing on Un*x B. W. Kernighan
194 * RegExp routines Henry Spencer
198 * This implementation uses a bit-set representation for character
199 * classes for speed and compactness. Each character is represented
200 * by one bit in a 128-bit block. Thus, CCL always takes a
201 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
202 * bit comparison to locate the character in the set.
207 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
208 * matches: fo foo fooo foobar fobar foxx ...
210 * pattern: fo[ob]a[rz]
211 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
212 * matches: fobar fooar fobaz fooaz
215 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
216 * matches: foo\ foo\\ foo\\\ ...
218 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
219 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
220 * matches: foo1foo foo2foo foo3foo
222 * pattern: \(fo.*\)-\1
223 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
224 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
227 #include "RESearch.h"
247 * The following defines are not meant to be changeable.
248 * They are for readability only.
255 const char bitarr
[] = {1,2,4,8,16,32,64,'\200'};
257 #define badpat(x) (*nfa = END, x)
259 RESearch::RESearch() {
263 RESearch::~RESearch() {
267 void RESearch::Init() {
268 sta
= NOP
; /* status of lastpat */
270 for (int i
=0; i
<MAXTAG
; i
++)
272 for (int j
=0; j
<BITBLK
; j
++)
276 void RESearch::Clear() {
277 for (int i
=0; i
<MAXTAG
; i
++) {
285 bool RESearch::GrabMatches(CharacterIndexer
&ci
) {
287 for (unsigned int i
=0; i
<MAXTAG
; i
++) {
288 if ((bopat
[i
] != NOTFOUND
) && (eopat
[i
] != NOTFOUND
)) {
289 unsigned int len
= eopat
[i
] - bopat
[i
];
290 pat
[i
] = new char[len
+ 1];
292 for (unsigned int j
=0; j
<len
; j
++)
293 pat
[i
][j
] = ci
.CharAt(bopat
[i
] + j
);
303 void RESearch::ChSet(char c
) {
304 bittab
[((c
) & BLKIND
) >> 3] |= bitarr
[(c
) & BITIND
];
307 void RESearch::ChSetWithCase(char c
, bool caseSensitive
) {
311 if ((c
>= 'a') && (c
<= 'z')) {
313 ChSet(static_cast<char>(c
- 'a' + 'A'));
314 } else if ((c
>= 'A') && (c
<= 'Z')) {
316 ChSet(static_cast<char>(c
- 'A' + 'a'));
323 const char escapeValue(char ch
) {
325 case 'a': return '\a';
326 case 'b': return '\b';
327 case 'f': return '\f';
328 case 'n': return '\n';
329 case 'r': return '\r';
330 case 't': return '\t';
331 case 'v': return '\v';
336 const char *RESearch::Compile(const char *pat
, int length
, bool caseSensitive
) {
337 char *mp
=nfa
; /* nfa pointer */
338 char *lp
; /* saved pointer.. */
339 char *sp
=nfa
; /* another one.. */
341 int tagi
= 0; /* tag stack index */
342 int tagc
= 1; /* actual tag count */
345 char mask
; /* xor mask -CCL/NCL */
352 return badpat("No previous regular expression");
355 const char *p
=pat
; /* pattern pointer */
356 for (int i
=0; i
<length
; i
++, p
++) {
360 case '.': /* match any char.. */
364 case '^': /* match beginning.. */
373 case '$': /* match endofline.. */
382 case '[': /* match char class..*/
393 if (*p
== '-') { /* real dash */
397 if (*p
== ']') { /* real brace */
401 while (*p
&& *p
!= ']') {
402 if (*p
== '-' && *(p
+1) && *(p
+1) != ']') {
409 ChSetWithCase(static_cast<char>(c1
++), caseSensitive
);
411 } else if (*p
== '\\' && *(p
+1)) {
414 char escape
= escapeValue(*p
);
416 ChSetWithCase(escape
, caseSensitive
);
418 ChSetWithCase(*p
, caseSensitive
);
423 ChSetWithCase(*p
++, caseSensitive
);
427 return badpat("Missing ]");
429 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
430 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
434 case '*': /* match 0 or more.. */
435 case '+': /* match 1 or more.. */
437 return badpat("Empty closure");
438 lp
= sp
; /* previous opcode */
439 if (*lp
== CLO
) /* equivalence.. */
449 return badpat("Illegal closure");
455 for (sp
= mp
; lp
< sp
; lp
++)
467 case '\\': /* tags, backrefs .. */
473 tagstk
[++tagi
] = tagc
;
475 *mp
++ = static_cast<char>(tagc
++);
478 return badpat("Too many \\(\\) pairs");
482 return badpat("Null pattern inside \\(\\)");
484 *mp
++ = static_cast<char>(EOT
);
485 *mp
++ = static_cast<char>(tagstk
[tagi
--]);
488 return badpat("Unmatched \\)");
495 return badpat("Null pattern inside \\<\\>");
508 if (tagi
> 0 && tagstk
[tagi
] == n
)
509 return badpat("Cyclical reference");
511 *mp
++ = static_cast<char>(REF
);
512 *mp
++ = static_cast<char>(n
);
515 return badpat("Undetermined reference");
525 *mp
++ = escapeValue(*p
);
533 default : /* an ordinary char */
540 ChSetWithCase(*p
, false);
541 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
542 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
549 return badpat("Unmatched \\(");
557 * execute nfa to find a match.
559 * special cases: (nfa[0])
561 * Match only once, starting from the
564 * First locate the character without
565 * calling PMatch, and if found, call
566 * PMatch for the remaining string.
568 * RESearch::Compile failed, poor luser did not
569 * check for it. Fail fast.
571 * If a match is found, bopat[0] and eopat[0] are set
572 * to the beginning and the end of the matched fragment,
577 int RESearch::Execute(CharacterIndexer
&ci
, int lp
, int endp
) {
589 case BOL
: /* anchored: match from BOL only */
590 ep
= PMatch(ci
, lp
, endp
, ap
);
592 case EOL
: /* just searching for end of line normal path doesn't work */
593 if (*(ap
+1) == END
) {
600 case CHR
: /* ordinary char: locate it fast */
602 while ((lp
< endp
) && (ci
.CharAt(lp
) != c
))
604 if (lp
>= endp
) /* if EOS, fail, else fall thru. */
606 default: /* regular matching all the way. */
608 ep
= PMatch(ci
, lp
, endp
, ap
);
614 case END
: /* munged automaton. fail always */
626 * PMatch: internal routine for the hard part
628 * This code is partly snarfed from an early grep written by
629 * David Conroy. The backref and tag stuff, and various other
630 * innovations are by oz.
632 * special case optimizations: (nfa[n], nfa[n+1])
634 * We KNOW .* will match everything upto the
635 * end of line. Thus, directly go to the end of
636 * line, without recursive PMatch calls. As in
637 * the other closure cases, the remaining pattern
638 * must be matched by moving backwards on the
639 * string recursively, to find a match for xy
640 * (x is ".*" and y is the remaining pattern)
641 * where the match satisfies the LONGEST match for
642 * x followed by a match for y.
644 * We can again scan the string forward for the
645 * single char and at the point of failure, we
646 * execute the remaining nfa recursively, same as
649 * At the end of a successful match, bopat[n] and eopat[n]
650 * are set to the beginning and end of subpatterns matched
651 * by tagged expressions (n = 1 to 9).
655 extern void re_fail(char *,char);
658 * character classification table for word boundary operators BOW
659 * and EOW. the reason for not using ctype macros is that we can
660 * let the user add into our own table. see RESearch::ModifyWord. This table
661 * is not in the bitset form, since we may wish to extend it in the
662 * future for other character classifications.
664 * TRUE for 0-9 A-Z a-z _
666 static char chrtyp
[MAXCHR
] = {
667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
670 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
671 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
672 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
673 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
674 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
675 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
676 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
677 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
678 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
679 1, 1, 1, 0, 0, 0, 0, 0
682 #define inascii(x) (0177&(x))
683 #define iswordc(x) chrtyp[inascii(x)]
684 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
687 * skip values for CLO XXX to skip past the closure
690 #define ANYSKIP 2 /* [CLO] ANY END ... */
691 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
692 #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
694 int RESearch::PMatch(CharacterIndexer
&ci
, int lp
, int endp
, char *ap
) {
696 int e
; /* extra pointer for CLO */
697 int bp
; /* beginning of subpat.. */
698 int ep
; /* ending of subpat.. */
699 int are
; /* to save the line ptr. */
701 while ((op
= *ap
++) != END
)
705 if (ci
.CharAt(lp
++) != *ap
++)
733 if (lp
!=bol
&& iswordc(ci
.CharAt(lp
-1)) || !iswordc(ci
.CharAt(lp
)))
737 if (lp
==bol
|| !iswordc(ci
.CharAt(lp
-1)) || iswordc(ci
.CharAt(lp
)))
745 if (ci
.CharAt(bp
++) != ci
.CharAt(lp
++))
759 while ((lp
< endp
) && (c
== ci
.CharAt(lp
)))
764 while ((lp
< endp
) && isinset(ap
+1,ci
.CharAt(lp
)))
770 //re_fail("closure: bad nfa.", *ap);
777 if ((e
= PMatch(ci
, lp
, endp
, ap
)) != NOTFOUND
)
783 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
790 * RESearch::ModifyWord:
791 * add new characters into the word table to change RESearch::Execute's
792 * understanding of what a word should look like. Note that we
793 * only accept additions into the word definition.
795 * If the string parameter is 0 or null string, the table is
796 * reset back to the default containing A-Z a-z 0-9 _. [We use
797 * the compact bitset representation for the default table]
800 static char deftab
[16] = {
801 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
802 '\376', '\377', '\377', 007
805 void RESearch::ModifyWord(char *s
) {
809 for (i
= 0; i
< MAXCHR
; i
++)
810 if (!isinset(deftab
,i
))
819 * RESearch::Substitute:
820 * substitute the matched portions of the src in dst.
822 * & substitute the entire matched pattern.
824 * \digit substitute a subpattern, with the given tag number.
825 * Tags are numbered from 1 to 9. If the particular
826 * tagged subpattern does not exist, null is substituted.
828 int RESearch::Substitute(CharacterIndexer
&ci
, char *src
, char *dst
) {
834 if (!*src
|| !bopat
[0])
837 while ((c
= *src
++) != 0) {
846 if (c
>= '0' && c
<= '9') {
856 if ((bp
= bopat
[pin
]) != 0 && (ep
= eopat
[pin
]) != 0) {
857 while (ci
.CharAt(bp
) && bp
< ep
)
858 *dst
++ = ci
.CharAt(bp
++);