]>
git.saurik.com Git - wxWidgets.git/blob - contrib/src/stc/scintilla/src/RESearch.cxx
b7ea71bfb9631d81d3b92743c6626b2dfda35cee
1 // Scintilla source code edit control
3 ** Regular expression search library.
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads etc.
20 * These routines are the PUBLIC DOMAIN equivalents of regex
21 * routines as found in 4.nBSD UN*X, with minor extensions.
23 * These routines are derived from various implementations found
24 * in software tools books, and Conroy's grep. They are NOT derived
25 * from licensed/restricted software.
26 * For more interesting/academic/complicated implementations,
27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
30 * Modification history removed.
33 * RESearch::Compile: compile a regular expression into a NFA.
35 * char *RESearch::Compile(s)
38 * RESearch::Execute: execute the NFA to match a pattern.
40 * int RESearch::Execute(s)
43 * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
44 * looks like (for \< and \>) by adding into the
45 * hidden word-syntax table.
47 * void RESearch::ModifyWord(s)
50 * RESearch::Substitute: substitute the matched portions in a new string.
52 * int RESearch::Substitute(src, dst)
56 * re_fail: failure routine for RESearch::Execute.
58 * void re_fail(msg, op)
62 * Regular Expressions:
64 * [1] char matches itself, unless it is a special
65 * character (metachar): . \ [ ] * + ^ $
67 * [2] . matches any character.
69 * [3] \ matches the character following it, except
70 * when followed by a left or right round bracket,
71 * a digit 1 to 9 or a left or right angle bracket.
72 * (see [7], [8] and [9])
73 * It is used as an escape character for all
74 * other meta-characters, and itself. When used
75 * in a set ([4]), it is treated as an ordinary
78 * [4] [set] matches one of the characters in the set.
79 * If the first character in the set is "^",
80 * it matches a character NOT in the set, i.e.
81 * complements the set. A shorthand S-E is
82 * used to specify a set of characters S upto
83 * E, inclusive. The special characters "]" and
84 * "-" have no special meaning if they appear
85 * as the first chars in the set.
88 * [a-z] any lowercase alpha
90 * [^]-] any char except ] and -
92 * [^A-Z] any char except uppercase
97 * [5] * any regular expression form [1] to [4], followed by
98 * closure char (*) matches zero or more matches of
101 * [6] + same as [5], except it matches one or more.
103 * [7] a regular expression in the form [1] to [10], enclosed
104 * as \(form\) matches what form matches. The enclosure
105 * creates a set of tags, used for [8] and for
106 * pattern substution. The tagged forms are numbered
109 * [8] a \ followed by a digit 1 to 9 matches whatever a
110 * previously tagged regular expression ([7]) matched.
112 * [9] \< a regular expression starting with a \< construct
113 * \> and/or ending with a \> construct, restricts the
114 * pattern matching to the beginning of a word, and/or
115 * the end of a word. A word is defined to be a character
116 * string beginning and/or ending with the characters
117 * A-Z a-z 0-9 and _. It must also be preceded and/or
118 * followed by any character outside those mentioned.
120 * [10] a composite regular expression xy where x and y
121 * are in the form [1] to [10] matches the longest
122 * match of x followed by a match for y.
124 * [11] ^ a regular expression starting with a ^ character
125 * $ and/or ending with a $ character, restricts the
126 * pattern matching to the beginning of the line,
127 * or the end of line. [anchors] Elsewhere in the
128 * pattern, ^ and $ are treated as ordinary characters.
133 * HCR's Hugh Redelmeier has been most helpful in various
134 * stages of development. He convinced me to include BOW
135 * and EOW constructs, originally invented by Rob Pike at
136 * the University of Toronto.
139 * Software tools Kernighan & Plauger
140 * Software tools in Pascal Kernighan & Plauger
141 * Grep [rsx-11 C dist] David Conroy
142 * ed - text editor Un*x Programmer's Manual
143 * Advanced editing on Un*x B. W. Kernighan
144 * RegExp routines Henry Spencer
148 * This implementation uses a bit-set representation for character
149 * classes for speed and compactness. Each character is represented
150 * by one bit in a 128-bit block. Thus, CCL always takes a
151 * constant 16 bytes in the internal nfa, and RESearch::Execute does a single
152 * bit comparison to locate the character in the set.
157 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
158 * matches: fo foo fooo foobar fobar foxx ...
160 * pattern: fo[ob]a[rz]
161 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
162 * matches: fobar fooar fobaz fooaz
165 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
166 * matches: foo\ foo\\ foo\\\ ...
168 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
169 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
170 * matches: foo1foo foo2foo foo3foo
172 * pattern: \(fo.*\)-\1
173 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
174 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
177 #include "RESearch.h"
197 * The following defines are not meant to be changeable.
198 * They are for readability only.
205 const char bitarr
[] = {1,2,4,8,16,32,64,'\200'};
207 #define badpat(x) (*nfa = END, x)
209 RESearch::RESearch() {
213 RESearch::~RESearch() {
217 void RESearch::Init() {
218 sta
= NOP
; /* status of lastpat */
220 for (int i
=0; i
<MAXTAG
; i
++)
222 for (int j
=0; j
<BITBLK
; j
++)
226 void RESearch::Clear() {
227 for (int i
=0; i
<MAXTAG
; i
++) {
235 bool RESearch::GrabMatches(CharacterIndexer
&ci
) {
237 for (unsigned int i
=0; i
<MAXTAG
; i
++) {
238 if ((bopat
[i
] != NOTFOUND
) && (eopat
[i
] != NOTFOUND
)) {
239 unsigned int len
= eopat
[i
] - bopat
[i
];
240 pat
[i
] = new char[len
+ 1];
242 for (unsigned int j
=0; j
<len
; j
++)
243 pat
[i
][j
] = ci
.CharAt(bopat
[i
] + j
);
253 void RESearch::ChSet(char c
) {
254 bittab
[((c
) & BLKIND
) >> 3] |= bitarr
[(c
) & BITIND
];
257 void RESearch::ChSetWithCase(char c
, bool caseSensitive
) {
261 if ((c
>= 'a') && (c
<= 'z')) {
263 ChSet(static_cast<char>(c
- 'a' + 'A'));
264 } else if ((c
>= 'A') && (c
<= 'Z')) {
266 ChSet(static_cast<char>(c
- 'A' + 'a'));
273 const char escapeValue(char ch
) {
275 case 'a': return '\a';
276 case 'b': return '\b';
277 case 'f': return '\f';
278 case 'n': return '\n';
279 case 'r': return '\r';
280 case 't': return '\t';
281 case 'v': return '\v';
286 const char *RESearch::Compile(const char *pat
, int length
, bool caseSensitive
, bool posix
) {
287 char *mp
=nfa
; /* nfa pointer */
288 char *lp
; /* saved pointer.. */
289 char *sp
=nfa
; /* another one.. */
290 char *mpMax
= mp
+ MAXNFA
- BITBLK
- 10;
292 int tagi
= 0; /* tag stack index */
293 int tagc
= 1; /* actual tag count */
296 char mask
; /* xor mask -CCL/NCL */
303 return badpat("No previous regular expression");
306 const char *p
=pat
; /* pattern pointer */
307 for (int i
=0; i
<length
; i
++, p
++) {
309 return badpat("Pattern too long");
313 case '.': /* match any char.. */
317 case '^': /* match beginning.. */
326 case '$': /* match endofline.. */
335 case '[': /* match char class..*/
346 if (*p
== '-') { /* real dash */
350 if (*p
== ']') { /* real brace */
354 while (*p
&& *p
!= ']') {
355 if (*p
== '-' && *(p
+1) && *(p
+1) != ']') {
362 ChSetWithCase(static_cast<char>(c1
++), caseSensitive
);
364 } else if (*p
== '\\' && *(p
+1)) {
367 char escape
= escapeValue(*p
);
369 ChSetWithCase(escape
, caseSensitive
);
371 ChSetWithCase(*p
, caseSensitive
);
376 ChSetWithCase(*p
++, caseSensitive
);
380 return badpat("Missing ]");
382 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
383 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
387 case '*': /* match 0 or more.. */
388 case '+': /* match 1 or more.. */
390 return badpat("Empty closure");
391 lp
= sp
; /* previous opcode */
392 if (*lp
== CLO
) /* equivalence.. */
402 return badpat("Illegal closure");
408 for (sp
= mp
; lp
< sp
; lp
++)
420 case '\\': /* tags, backrefs .. */
429 return badpat("Null pattern inside \\<\\>");
442 if (tagi
> 0 && tagstk
[tagi
] == n
)
443 return badpat("Cyclical reference");
445 *mp
++ = static_cast<char>(REF
);
446 *mp
++ = static_cast<char>(n
);
449 return badpat("Undetermined reference");
459 *mp
++ = escapeValue(*p
);
462 if (!posix
&& *p
== '(') {
464 tagstk
[++tagi
] = tagc
;
466 *mp
++ = static_cast<char>(tagc
++);
469 return badpat("Too many \\(\\) pairs");
470 } else if (!posix
&& *p
== ')') {
472 return badpat("Null pattern inside \\(\\)");
474 *mp
++ = static_cast<char>(EOT
);
475 *mp
++ = static_cast<char>(tagstk
[tagi
--]);
478 return badpat("Unmatched \\)");
486 default : /* an ordinary char */
487 if (posix
&& *p
== '(') {
489 tagstk
[++tagi
] = tagc
;
491 *mp
++ = static_cast<char>(tagc
++);
494 return badpat("Too many () pairs");
495 } else if (posix
&& *p
== ')') {
497 return badpat("Null pattern inside ()");
499 *mp
++ = static_cast<char>(EOT
);
500 *mp
++ = static_cast<char>(tagstk
[tagi
--]);
503 return badpat("Unmatched )");
504 } else if (caseSensitive
) {
510 ChSetWithCase(*p
, false);
511 for (n
= 0; n
< BITBLK
; bittab
[n
++] = (char) 0)
512 *mp
++ = static_cast<char>(mask
^ bittab
[n
]);
519 return badpat((posix
? "Unmatched (" : "Unmatched \\("));
527 * execute nfa to find a match.
529 * special cases: (nfa[0])
531 * Match only once, starting from the
534 * First locate the character without
535 * calling PMatch, and if found, call
536 * PMatch for the remaining string.
538 * RESearch::Compile failed, poor luser did not
539 * check for it. Fail fast.
541 * If a match is found, bopat[0] and eopat[0] are set
542 * to the beginning and the end of the matched fragment,
547 int RESearch::Execute(CharacterIndexer
&ci
, int lp
, int endp
) {
559 case BOL
: /* anchored: match from BOL only */
560 ep
= PMatch(ci
, lp
, endp
, ap
);
562 case EOL
: /* just searching for end of line normal path doesn't work */
563 if (*(ap
+1) == END
) {
570 case CHR
: /* ordinary char: locate it fast */
572 while ((lp
< endp
) && (ci
.CharAt(lp
) != c
))
574 if (lp
>= endp
) /* if EOS, fail, else fall thru. */
576 default: /* regular matching all the way. */
578 ep
= PMatch(ci
, lp
, endp
, ap
);
584 case END
: /* munged automaton. fail always */
596 * PMatch: internal routine for the hard part
598 * This code is partly snarfed from an early grep written by
599 * David Conroy. The backref and tag stuff, and various other
600 * innovations are by oz.
602 * special case optimizations: (nfa[n], nfa[n+1])
604 * We KNOW .* will match everything upto the
605 * end of line. Thus, directly go to the end of
606 * line, without recursive PMatch calls. As in
607 * the other closure cases, the remaining pattern
608 * must be matched by moving backwards on the
609 * string recursively, to find a match for xy
610 * (x is ".*" and y is the remaining pattern)
611 * where the match satisfies the LONGEST match for
612 * x followed by a match for y.
614 * We can again scan the string forward for the
615 * single char and at the point of failure, we
616 * execute the remaining nfa recursively, same as
619 * At the end of a successful match, bopat[n] and eopat[n]
620 * are set to the beginning and end of subpatterns matched
621 * by tagged expressions (n = 1 to 9).
625 extern void re_fail(char *,char);
628 * character classification table for word boundary operators BOW
629 * and EOW. the reason for not using ctype macros is that we can
630 * let the user add into our own table. see RESearch::ModifyWord. This table
631 * is not in the bitset form, since we may wish to extend it in the
632 * future for other character classifications.
634 * TRUE for 0-9 A-Z a-z _
636 static char chrtyp
[MAXCHR
] = {
637 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
639 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
641 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
642 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
643 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
644 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
645 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
646 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
647 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
648 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
649 1, 1, 1, 0, 0, 0, 0, 0
652 #define inascii(x) (0177&(x))
653 #define iswordc(x) chrtyp[inascii(x)]
654 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
657 * skip values for CLO XXX to skip past the closure
660 #define ANYSKIP 2 /* [CLO] ANY END ... */
661 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
662 #define CCLSKIP 34 /* [CLO] CCL 32bytes END ... */
664 int RESearch::PMatch(CharacterIndexer
&ci
, int lp
, int endp
, char *ap
) {
666 int e
; /* extra pointer for CLO */
667 int bp
; /* beginning of subpat.. */
668 int ep
; /* ending of subpat.. */
669 int are
; /* to save the line ptr. */
671 while ((op
= *ap
++) != END
)
675 if (ci
.CharAt(lp
++) != *ap
++)
703 if (lp
!=bol
&& iswordc(ci
.CharAt(lp
-1)) || !iswordc(ci
.CharAt(lp
)))
707 if (lp
==bol
|| !iswordc(ci
.CharAt(lp
-1)) || iswordc(ci
.CharAt(lp
)))
715 if (ci
.CharAt(bp
++) != ci
.CharAt(lp
++))
729 while ((lp
< endp
) && (c
== ci
.CharAt(lp
)))
734 while ((lp
< endp
) && isinset(ap
+1,ci
.CharAt(lp
)))
740 //re_fail("closure: bad nfa.", *ap);
747 if ((e
= PMatch(ci
, lp
, endp
, ap
)) != NOTFOUND
)
753 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
760 * RESearch::ModifyWord:
761 * add new characters into the word table to change RESearch::Execute's
762 * understanding of what a word should look like. Note that we
763 * only accept additions into the word definition.
765 * If the string parameter is 0 or null string, the table is
766 * reset back to the default containing A-Z a-z 0-9 _. [We use
767 * the compact bitset representation for the default table]
770 static char deftab
[16] = {
771 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
772 '\376', '\377', '\377', 007
775 void RESearch::ModifyWord(char *s
) {
779 for (i
= 0; i
< MAXCHR
; i
++)
780 if (!isinset(deftab
,i
))
789 * RESearch::Substitute:
790 * substitute the matched portions of the src in dst.
792 * & substitute the entire matched pattern.
794 * \digit substitute a subpattern, with the given tag number.
795 * Tags are numbered from 1 to 9. If the particular
796 * tagged subpattern does not exist, null is substituted.
798 int RESearch::Substitute(CharacterIndexer
&ci
, char *src
, char *dst
) {
804 if (!*src
|| !bopat
[0])
807 while ((c
= *src
++) != 0) {
816 if (c
>= '0' && c
<= '9') {
826 if ((bp
= bopat
[pin
]) != 0 && (ep
= eopat
[pin
]) != 0) {
827 while (ci
.CharAt(bp
) && bp
< ep
)
828 *dst
++ = ci
.CharAt(bp
++);