]>
git.saurik.com Git - wxWidgets.git/blob - src/stc/scintilla/src/RESearch.cxx
1 // Scintilla source code edit control
3 ** Regular expression search library.
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
30 * Modification history:
33 * Revision 1.8 2003/09/18 05:05:38 RD
34 * Updated to Scintilla 1.54
35 * Applied most of patch #806092
36 * Added new wxSTC sample from Otto Wyss
38 * Revision 1.9 2003/03/21 10:36:08 nyamatongwe
39 * Detect patterns too long in regular expression search.
41 * Revision 1.8 2003/03/04 10:53:59 nyamatongwe
42 * Patch from Jakub to optionally implement more POSIX compatible regular
43 * expressions. \(..\) changes to (..)
44 * Fixes problem where find previous would not find earlier matches on same
47 * Revision 1.8 2003/03/03 20:12:56 vrana
50 * Revision 1.7 2002/09/28 00:33:28 nyamatongwe
51 * Fixed problem with character ranges caused by expansion to 8 bits.
53 * Revision 1.6 2001/04/29 13:32:10 nyamatongwe
54 * Addition of new target methods - versions of ReplaceTarget that take counted
55 * strings to allow for nulls, SearchInTarget and Get/SetSearchFlags to use a
56 * series of calls rather than a structure.
57 * Handling of \000 in search and replace.
58 * Handling of /escapes within character ranges of regular expressions.
59 * Some handling of bare ^ and $ regular expressions.
61 * Revision 1.5 2001/04/20 07:36:09 nyamatongwe
62 * Removed DEBUG code that failed to compile on GTK+.
64 * Revision 1.4 2001/04/13 03:52:13 nyamatongwe
65 * Added URL to find original code to comments.
67 * Revision 1.3 2001/04/06 12:24:21 nyamatongwe
68 * Made regular expression searching work on a line by line basis, made ^ and
69 * $ work, made [set] work, and added a case insensitive option.
71 * Revision 1.2 2001/04/05 01:58:04 nyamatongwe
72 * Replace target functionality to make find and replace operations faster
73 * by diminishing screen updates and allow for \d patterns in the replacement
76 * Revision 1.1 2001/04/04 12:52:44 nyamatongwe
77 * Moved to public domain regular expresion implementation.
79 * Revision 1.4 1991/10/17 03:56:42 oz
80 * miscellaneous changes, small cleanups etc.
82 * Revision 1.3 1989/04/01 14:18:09 oz
83 * Change all references to a dfa: this is actually an nfa.
85 * Revision 1.2 88/08/28 15:36:04 oz
86 * Use a complement bitmap to represent NCL.
87 * This removes the need to have seperate
88 * code in the PMatch case block - it is
91 * Use the actual CCL code in the CLO
92 * section of PMatch. No need for a recursive
95 * Use a bitmap table to set char bits in an
99 * RESearch::Compile: compile a regular expression into a NFA.
101 * char *RESearch::Compile(s)
104 * RESearch::Execute: execute the NFA to match a pattern.
106 * int RESearch::Execute(s)
109 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
110 * looks like (for \< and \>) by adding into the
111 * hidden word-syntax table.
113 * void RESearch::ModifyWord(s)
116 * RESearch::Substitute: substitute the matched portions in a new string.
118 * int RESearch::Substitute(src, dst)
122 * re_fail: failure routine for RESearch::Execute.
124 * void re_fail(msg, op)
128 * Regular Expressions:
130 * [1] char matches itself, unless it is a special
131 * character (metachar): . \ [ ] * + ^ $
133 * [2] . matches any character.
135 * [3] \ matches the character following it, except
136 * when followed by a left or right round bracket,
137 * a digit 1 to 9 or a left or right angle bracket.
138 * (see [7], [8] and [9])
139 * It is used as an escape character for all
140 * other meta-characters, and itself. When used
141 * in a set ([4]), it is treated as an ordinary
144 * [4] [set] matches one of the characters in the set.
145 * If the first character in the set is "^",
146 * it matches a character NOT in the set, i.e.
147 * complements the set. A shorthand S-E is
148 * used to specify a set of characters S upto
149 * E, inclusive. The special characters "]" and
150 * "-" have no special meaning if they appear
151 * as the first chars in the set.
154 * [a-z] any lowercase alpha
156 * [^]-] any char except ] and -
158 * [^A-Z] any char except uppercase
163 * [5] * any regular expression form [1] to [4], followed by
164 * closure char (*) matches zero or more matches of
167 * [6] + same as [5], except it matches one or more.
169 * [7] a regular expression in the form [1] to [10], enclosed
170 * as \(form\) matches what form matches. The enclosure
171 * creates a set of tags, used for [8] and for
172 * pattern substution. The tagged forms are numbered
175 * [8] a \ followed by a digit 1 to 9 matches whatever a
176 * previously tagged regular expression ([7]) matched.
178 * [9] \< a regular expression starting with a \< construct
179 * \> and/or ending with a \> construct, restricts the
180 * pattern matching to the beginning of a word, and/or
181 * the end of a word. A word is defined to be a character
182 * string beginning and/or ending with the characters
183 * A-Z a-z 0-9 and _. It must also be preceded and/or
184 * followed by any character outside those mentioned.
186 * [10] a composite regular expression xy where x and y
187 * are in the form [1] to [10] matches the longest
188 * match of x followed by a match for y.
190 * [11] ^ a regular expression starting with a ^ character
191 * $ and/or ending with a $ character, restricts the
192 * pattern matching to the beginning of the line,
193 * or the end of line. [anchors] Elsewhere in the
194 * pattern, ^ and $ are treated as ordinary characters.
199 * HCR's Hugh Redelmeier has been most helpful in various
200 * stages of development. He convinced me to include BOW
201 * and EOW constructs, originally invented by Rob Pike at
202 * the University of Toronto.
205 * Software tools Kernighan & Plauger
206 * Software tools in Pascal Kernighan & Plauger
207 * Grep [rsx-11 C dist] David Conroy
208 * ed - text editor Un*x Programmer's Manual
209 * Advanced editing on Un*x B. W. Kernighan
210 * RegExp routines Henry Spencer
214 * This implementation uses a bit-set representation for character
215 * classes for speed and compactness. Each character is represented
216 * by one bit in a 128-bit block. Thus, CCL always takes a
217 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
218 * bit comparison to locate the character in the set.
223 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
224 * matches: fo foo fooo foobar fobar foxx ...
226 * pattern: fo[ob]a[rz]
227 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
228 * matches: fobar fooar fobaz fooaz
231 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
232 * matches: foo\ foo\\ foo\\\ ...
234 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
235 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
236 * matches: foo1foo foo2foo foo3foo
238 * pattern: \(fo.*\)-\1
239 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
240 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
243 #include "RESearch.h"
263 * The following defines are not meant to be changeable.
264 * They are for readability only.
271 const char bitarr
[] = {1,2,4,8,16,32,64,'\200'};
273 #define badpat(x) (*nfa = END, x)
275 RESearch::RESearch() {
279 RESearch::~RESearch() {
283 void RESearch::Init() {
284 sta
= NOP
; /* status of lastpat */
286 for (int i
=0; i
<MAXTAG
; i
++)
288 for (int j
=0; j
<BITBLK
; j
++)
292 void RESearch::Clear() {
293 for (int i
=0; i
<MAXTAG
; i
++) {
301 bool RESearch::GrabMatches(CharacterIndexer
&ci
) {
303 for (unsigned int i
=0; i
<MAXTAG
; i
++) {
304 if ((bopat
[i
] != NOTFOUND
) && (eopat
[i
] != NOTFOUND
)) {
305 unsigned int len
= eopat
[i
] - bopat
[i
];
306 pat
[i
] = new char[len
+ 1];
308 for (unsigned int j
=0; j
<len
; j
++)
309 pat
[i
][j
] = ci
.CharAt(bopat
[i
] + j
);
319 void RESearch::ChSet(char c
) {
320 bittab
[((c
) & BLKIND
) >> 3] |= bitarr
[(c
) & BITIND
];
323 void RESearch::ChSetWithCase(char c
, bool caseSensitive
) {
327 if ((c
>= 'a') && (c
<= 'z')) {
329 ChSet(static_cast<char>(c
- 'a' + 'A'));
330 } else if ((c
>= 'A') && (c
<= 'Z')) {
332 ChSet(static_cast<char>(c
- 'A' + 'a'));
339 const char escapeValue(char ch
) {
341 case 'a': return '\a';
342 case 'b': return '\b';
343 case 'f': return '\f';
344 case 'n': return '\n';
345 case 'r': return '\r';
346 case 't': return '\t';
347 case 'v': return '\v';
352 const char *RESearch::Compile(const char *pat
, int length
, bool caseSensitive
, bool posix
) {
353 char *mp
=nfa
; /* nfa pointer */
354 char *lp
; /* saved pointer.. */
355 char *sp
=nfa
; /* another one.. */
356 char *mpMax
= mp
+ MAXNFA
- BITBLK
- 10;
358 int tagi
= 0; /* tag stack index */
359 int tagc
= 1; /* actual tag count */
362 char mask
; /* xor mask -CCL/NCL */
369 return badpat("No previous regular expression");
372 const char *p
=pat
; /* pattern pointer */
373 for (int i
=0; i
<length
; i
++, p
++) {
375 return badpat("Pattern too long");
379 case '.': /* match any char.. */
383 case '^': /* match beginning.. */
392 case '$': /* match endofline.. */
401 case '[': /* match char class..*/
412 if (*p
== '-') { /* real dash */
416 if (*p
== ']') { /* real brace */
420 while (*p
&& *p
!= ']') {
421 if (*p
== '-' && *(p
+1) && *(p
+1) != ']') {
428 ChSetWithCase(static_cast<char>(c1
++), caseSensitive
);
430 } else if (*p
== '\\' && *(p
+1)) {
433 char escape
= escapeValue(*p
);
435 ChSetWithCase(escape
, caseSensitive
);
437 ChSetWithCase(*p
, caseSensitive
);
442 ChSetWithCase(*p
++, caseSensitive
);
446 return badpat("Missing ]");
448 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
449 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
453 case '*': /* match 0 or more.. */
454 case '+': /* match 1 or more.. */
456 return badpat("Empty closure");
457 lp
= sp
; /* previous opcode */
458 if (*lp
== CLO
) /* equivalence.. */
468 return badpat("Illegal closure");
474 for (sp
= mp
; lp
< sp
; lp
++)
486 case '\\': /* tags, backrefs .. */
495 return badpat("Null pattern inside \\<\\>");
508 if (tagi
> 0 && tagstk
[tagi
] == n
)
509 return badpat("Cyclical reference");
511 *mp
++ = static_cast<char>(REF
);
512 *mp
++ = static_cast<char>(n
);
515 return badpat("Undetermined reference");
525 *mp
++ = escapeValue(*p
);
528 if (!posix
&& *p
== '(') {
530 tagstk
[++tagi
] = tagc
;
532 *mp
++ = static_cast<char>(tagc
++);
535 return badpat("Too many \\(\\) pairs");
536 } else if (!posix
&& *p
== ')') {
538 return badpat("Null pattern inside \\(\\)");
540 *mp
++ = static_cast<char>(EOT
);
541 *mp
++ = static_cast<char>(tagstk
[tagi
--]);
544 return badpat("Unmatched \\)");
552 default : /* an ordinary char */
553 if (posix
&& *p
== '(') {
555 tagstk
[++tagi
] = tagc
;
557 *mp
++ = static_cast<char>(tagc
++);
560 return badpat("Too many () pairs");
561 } else if (posix
&& *p
== ')') {
563 return badpat("Null pattern inside ()");
565 *mp
++ = static_cast<char>(EOT
);
566 *mp
++ = static_cast<char>(tagstk
[tagi
--]);
569 return badpat("Unmatched )");
570 } else if (caseSensitive
) {
576 ChSetWithCase(*p
, false);
577 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
578 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
585 return badpat((posix
? "Unmatched (" : "Unmatched \\("));
593 * execute nfa to find a match.
595 * special cases: (nfa[0])
597 * Match only once, starting from the
600 * First locate the character without
601 * calling PMatch, and if found, call
602 * PMatch for the remaining string.
604 * RESearch::Compile failed, poor luser did not
605 * check for it. Fail fast.
607 * If a match is found, bopat[0] and eopat[0] are set
608 * to the beginning and the end of the matched fragment,
613 int RESearch::Execute(CharacterIndexer
&ci
, int lp
, int endp
) {
625 case BOL
: /* anchored: match from BOL only */
626 ep
= PMatch(ci
, lp
, endp
, ap
);
628 case EOL
: /* just searching for end of line normal path doesn't work */
629 if (*(ap
+1) == END
) {
636 case CHR
: /* ordinary char: locate it fast */
638 while ((lp
< endp
) && (ci
.CharAt(lp
) != c
))
640 if (lp
>= endp
) /* if EOS, fail, else fall thru. */
642 default: /* regular matching all the way. */
644 ep
= PMatch(ci
, lp
, endp
, ap
);
650 case END
: /* munged automaton. fail always */
662 * PMatch: internal routine for the hard part
664 * This code is partly snarfed from an early grep written by
665 * David Conroy. The backref and tag stuff, and various other
666 * innovations are by oz.
668 * special case optimizations: (nfa[n], nfa[n+1])
670 * We KNOW .* will match everything upto the
671 * end of line. Thus, directly go to the end of
672 * line, without recursive PMatch calls. As in
673 * the other closure cases, the remaining pattern
674 * must be matched by moving backwards on the
675 * string recursively, to find a match for xy
676 * (x is ".*" and y is the remaining pattern)
677 * where the match satisfies the LONGEST match for
678 * x followed by a match for y.
680 * We can again scan the string forward for the
681 * single char and at the point of failure, we
682 * execute the remaining nfa recursively, same as
685 * At the end of a successful match, bopat[n] and eopat[n]
686 * are set to the beginning and end of subpatterns matched
687 * by tagged expressions (n = 1 to 9).
691 extern void re_fail(char *,char);
694 * character classification table for word boundary operators BOW
695 * and EOW. the reason for not using ctype macros is that we can
696 * let the user add into our own table. see RESearch::ModifyWord. This table
697 * is not in the bitset form, since we may wish to extend it in the
698 * future for other character classifications.
700 * TRUE for 0-9 A-Z a-z _
702 static char chrtyp
[MAXCHR
] = {
703 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
704 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
705 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
706 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
707 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
708 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
709 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
710 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
711 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
712 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
713 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
714 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
715 1, 1, 1, 0, 0, 0, 0, 0
718 #define inascii(x) (0177&(x))
719 #define iswordc(x) chrtyp[inascii(x)]
720 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
723 * skip values for CLO XXX to skip past the closure
726 #define ANYSKIP 2 /* [CLO] ANY END ... */
727 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
728 #define CCLSKIP 34 /* [CLO] CCL 32bytes END ... */
730 int RESearch::PMatch(CharacterIndexer
&ci
, int lp
, int endp
, char *ap
) {
732 int e
; /* extra pointer for CLO */
733 int bp
; /* beginning of subpat.. */
734 int ep
; /* ending of subpat.. */
735 int are
; /* to save the line ptr. */
737 while ((op
= *ap
++) != END
)
741 if (ci
.CharAt(lp
++) != *ap
++)
769 if (lp
!=bol
&& iswordc(ci
.CharAt(lp
-1)) || !iswordc(ci
.CharAt(lp
)))
773 if (lp
==bol
|| !iswordc(ci
.CharAt(lp
-1)) || iswordc(ci
.CharAt(lp
)))
781 if (ci
.CharAt(bp
++) != ci
.CharAt(lp
++))
795 while ((lp
< endp
) && (c
== ci
.CharAt(lp
)))
800 while ((lp
< endp
) && isinset(ap
+1,ci
.CharAt(lp
)))
806 //re_fail("closure: bad nfa.", *ap);
813 if ((e
= PMatch(ci
, lp
, endp
, ap
)) != NOTFOUND
)
819 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
826 * RESearch::ModifyWord:
827 * add new characters into the word table to change RESearch::Execute's
828 * understanding of what a word should look like. Note that we
829 * only accept additions into the word definition.
831 * If the string parameter is 0 or null string, the table is
832 * reset back to the default containing A-Z a-z 0-9 _. [We use
833 * the compact bitset representation for the default table]
836 static char deftab
[16] = {
837 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
838 '\376', '\377', '\377', 007
841 void RESearch::ModifyWord(char *s
) {
845 for (i
= 0; i
< MAXCHR
; i
++)
846 if (!isinset(deftab
,i
))
855 * RESearch::Substitute:
856 * substitute the matched portions of the src in dst.
858 * & substitute the entire matched pattern.
860 * \digit substitute a subpattern, with the given tag number.
861 * Tags are numbered from 1 to 9. If the particular
862 * tagged subpattern does not exist, null is substituted.
864 int RESearch::Substitute(CharacterIndexer
&ci
, char *src
, char *dst
) {
870 if (!*src
|| !bopat
[0])
873 while ((c
= *src
++) != 0) {
882 if (c
>= '0' && c
<= '9') {
892 if ((bp
= bopat
[pin
]) != 0 && (ep
= eopat
[pin
]) != 0) {
893 while (ci
.CharAt(bp
) && bp
< ep
)
894 *dst
++ = ci
.CharAt(bp
++);