]>
git.saurik.com Git - wxWidgets.git/blob - src/regex/regc_lex.c
3 * This file is #included by regcomp.c.
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 /* scanning macros (know about v) */
34 #define ATEOS() (v->now >= v->stop)
35 #define HAVE(n) (v->stop - v->now >= (n))
36 #define NEXT1(c) (!ATEOS() && *v->now == CHR(c))
37 #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
38 #define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \
39 *(v->now+1) == CHR(b) && \
40 *(v->now+2) == CHR(c))
41 #define SET(c) (v->nexttype = (c))
42 #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n))
43 #define RET(c) return (SET(c), 1)
44 #define RETV(c, n) return (SETV(c, n), 1)
45 #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */
46 #define LASTTYPE(t) (v->lasttype == (t))
48 /* lexical contexts */
49 #define L_ERE 1 /* mainline ERE/ARE */
50 #define L_BRE 2 /* mainline BRE */
51 #define L_Q 3 /* REG_QUOTE */
52 #define L_EBND 4 /* ERE/ARE bound */
53 #define L_BBND 5 /* BRE bound */
54 #define L_BRACK 6 /* brackets */
55 #define L_CEL 7 /* collating element */
56 #define L_ECL 8 /* equivalence class */
57 #define L_CCL 9 /* character class */
58 #define INTOCON(c) (v->lexcon = (c))
59 #define INCON(con) (v->lexcon == (con))
61 /* construct pointer past end of chr array */
62 #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))
65 - lexstart - set up lexical stuff, scan leading options
66 ^ static VOID lexstart(struct vars *);
72 prefixes(v
); /* may turn on new type bits etc. */
75 if (v
->cflags
®_QUOTE
) {
76 assert(!(v
->cflags
&(REG_ADVANCED
|REG_EXPANDED
|REG_NEWLINE
)));
78 } else if (v
->cflags
®_EXTENDED
) {
79 assert(!(v
->cflags
®_QUOTE
));
82 assert(!(v
->cflags
&(REG_QUOTE
|REG_ADVF
)));
86 v
->nexttype
= EMPTY
; /* remember we were at the start */
87 next(v
); /* set up the first token */
91 - prefixes - implement various special prefixes
92 ^ static VOID prefixes(struct vars *);
98 /* literal string doesn't get any of this stuff */
99 if (v
->cflags
®_QUOTE
)
102 /* initial "***" gets special things */
103 if (HAVE(4) && NEXT3('*', '*', '*'))
104 switch (*(v
->now
+ 3)) {
105 case CHR('?'): /* "***?" error, msg shows version */
107 return; /* proceed no further */
109 case CHR('='): /* "***=" shifts to literal string */
111 v
->cflags
|= REG_QUOTE
;
112 v
->cflags
&= ~(REG_ADVANCED
|REG_EXPANDED
|REG_NEWLINE
);
114 return; /* and there can be no more prefixes */
116 case CHR(':'): /* "***:" shifts to AREs */
118 v
->cflags
|= REG_ADVANCED
;
121 default: /* otherwise *** is just an error */
127 /* BREs and EREs don't get embedded options */
128 if ((v
->cflags
®_ADVANCED
) != REG_ADVANCED
)
131 /* embedded options (AREs only) */
132 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v
->now
+ 2))) {
135 for (; !ATEOS() && iscalpha(*v
->now
); v
->now
++)
137 case CHR('b'): /* BREs (but why???) */
138 v
->cflags
&= ~(REG_ADVANCED
|REG_QUOTE
);
140 case CHR('c'): /* case sensitive */
141 v
->cflags
&= ~REG_ICASE
;
143 case CHR('e'): /* plain EREs */
144 v
->cflags
|= REG_EXTENDED
;
145 v
->cflags
&= ~(REG_ADVF
|REG_QUOTE
);
147 case CHR('i'): /* case insensitive */
148 v
->cflags
|= REG_ICASE
;
150 case CHR('m'): /* Perloid synonym for n */
151 case CHR('n'): /* \n affects ^ $ . [^ */
152 v
->cflags
|= REG_NEWLINE
;
154 case CHR('p'): /* ~Perl, \n affects . [^ */
155 v
->cflags
|= REG_NLSTOP
;
156 v
->cflags
&= ~REG_NLANCH
;
158 case CHR('q'): /* literal string */
159 v
->cflags
|= REG_QUOTE
;
160 v
->cflags
&= ~REG_ADVANCED
;
162 case CHR('s'): /* single line, \n ordinary */
163 v
->cflags
&= ~REG_NEWLINE
;
165 case CHR('t'): /* tight syntax */
166 v
->cflags
&= ~REG_EXPANDED
;
168 case CHR('w'): /* weird, \n affects ^ $ only */
169 v
->cflags
&= ~REG_NLSTOP
;
170 v
->cflags
|= REG_NLANCH
;
172 case CHR('x'): /* expanded syntax */
173 v
->cflags
|= REG_EXPANDED
;
184 if (v
->cflags
®_QUOTE
)
185 v
->cflags
&= ~(REG_EXPANDED
|REG_NEWLINE
);
190 - lexnest - "call a subroutine", interpolating string at the lexical level
191 * Note, this is not a very general facility. There are a number of
192 * implicit assumptions about what sorts of strings can be subroutines.
193 ^ static VOID lexnest(struct vars *, chr *, chr *);
196 lexnest(v
, beginp
, endp
)
198 chr
*beginp
; /* start of interpolation */
199 chr
*endp
; /* one past end of interpolation */
201 assert(v
->savenow
== NULL
); /* only one level of nesting */
203 v
->savestop
= v
->stop
;
209 * string constants to interpolate as expansions of things like \d
211 static chr backd
[] = { /* \d */
212 CHR('['), CHR('['), CHR(':'),
213 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
214 CHR(':'), CHR(']'), CHR(']')
216 static chr backD
[] = { /* \D */
217 CHR('['), CHR('^'), CHR('['), CHR(':'),
218 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
219 CHR(':'), CHR(']'), CHR(']')
221 static chr brbackd
[] = { /* \d within brackets */
223 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
226 static chr backs
[] = { /* \s */
227 CHR('['), CHR('['), CHR(':'),
228 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
229 CHR(':'), CHR(']'), CHR(']')
231 static chr backS
[] = { /* \S */
232 CHR('['), CHR('^'), CHR('['), CHR(':'),
233 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
234 CHR(':'), CHR(']'), CHR(']')
236 static chr brbacks
[] = { /* \s within brackets */
238 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
241 static chr backw
[] = { /* \w */
242 CHR('['), CHR('['), CHR(':'),
243 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
244 CHR(':'), CHR(']'), CHR('_'), CHR(']')
246 static chr backW
[] = { /* \W */
247 CHR('['), CHR('^'), CHR('['), CHR(':'),
248 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
249 CHR(':'), CHR(']'), CHR('_'), CHR(']')
251 static chr brbackw
[] = { /* \w within brackets */
253 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
254 CHR(':'), CHR(']'), CHR('_')
258 - lexword - interpolate a bracket expression for word characters
259 * Possibly ought to inquire whether there is a "word" character class.
260 ^ static VOID lexword(struct vars *);
266 lexnest(v
, backw
, ENDOF(backw
));
270 - next - get next token
271 ^ static int next(struct vars *);
273 static int /* 1 normal, 0 failure */
279 /* errors yield an infinite sequence of failures */
281 return 0; /* the error has set nexttype to EOS */
283 /* remember flavor of last token */
284 v
->lasttype
= v
->nexttype
;
287 if (v
->nexttype
== EMPTY
&& (v
->cflags
®_BOSONLY
)) {
288 /* at start of a REG_BOSONLY RE */
289 RETV(SBEGIN
, 0); /* same as \A */
292 /* if we're nested and we've hit end, return to outer level */
293 if (v
->savenow
!= NULL
&& ATEOS()) {
295 v
->stop
= v
->savestop
;
296 v
->savenow
= v
->savestop
= NULL
;
299 /* skip white space etc. if appropriate (not in literal or []) */
300 if (v
->cflags
®_EXPANDED
)
310 /* handle EOS, depending on context */
332 /* okay, time to actually get a character */
335 /* deal with the easy contexts, punt EREs to code below */
337 case L_BRE
: /* punt BREs to separate function */
338 return brenext(v
, c
);
340 case L_ERE
: /* see below */
342 case L_Q
: /* literal strings are easy */
345 case L_BBND
: /* bounds are fairly simple */
348 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
349 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
350 case CHR('8'): case CHR('9'):
351 RETV(DIGIT
, (chr
)DIGITVAL(c
));
356 case CHR('}'): /* ERE bound ends with } */
359 if ((v
->cflags
®_ADVF
) && NEXT1('?')) {
368 case CHR('\\'): /* BRE bound ends with \} */
369 if (INCON(L_BBND
) && NEXT1('}')) {
382 case L_BRACK
: /* brackets are not too hard */
388 INTOCON((v
->cflags
®_EXTENDED
) ?
395 if (!(v
->cflags
®_ADVF
))
400 (DISCARD
)lexescape(v
);
401 switch (v
->nexttype
) { /* not all escapes okay here */
406 switch (v
->nextvalue
) {
408 lexnest(v
, brbackd
, ENDOF(brbackd
));
411 lexnest(v
, brbacks
, ENDOF(brbacks
));
414 lexnest(v
, brbackw
, ENDOF(brbackw
));
420 /* lexnest done, back up and try again */
421 v
->nexttype
= v
->lasttype
;
425 /* not one of the acceptable escapes */
429 if (LASTTYPE('[') || NEXT1(']'))
440 /* might or might not be locale-specific */
466 case L_CEL
: /* collating elements are easy */
467 if (c
== CHR('.') && NEXT1(']')) {
474 case L_ECL
: /* ditto equivalence classes */
475 if (c
== CHR('=') && NEXT1(']')) {
482 case L_CCL
: /* ditto character classes */
483 if (c
== CHR(':') && NEXT1(']')) {
495 /* that got rid of everything except EREs and AREs */
496 assert(INCON(L_ERE
));
498 /* deal with EREs and AREs, except for backslashes */
504 if ((v
->cflags
®_ADVF
) && NEXT1('?')) {
512 if ((v
->cflags
®_ADVF
) && NEXT1('?')) {
520 if ((v
->cflags
®_ADVF
) && NEXT1('?')) {
527 case CHR('{'): /* bounds start or plain character */
528 if (v
->cflags
®_EXPANDED
)
530 if (ATEOS() || !iscdigit(*v
->now
)) {
541 case CHR('('): /* parenthesis, or advanced extension */
542 if ((v
->cflags
®_ADVF
) && NEXT1('?')) {
546 case CHR(':'): /* non-capturing paren */
549 case CHR('#'): /* comment */
550 while (!ATEOS() && *v
->now
!= CHR(')'))
554 assert(v
->nexttype
== v
->lasttype
);
557 case CHR('='): /* positive lookahead */
558 NOTE(REG_ULOOKAHEAD
);
561 case CHR('!'): /* negative lookahead */
562 NOTE(REG_ULOOKAHEAD
);
571 if (v
->cflags
®_NOSUB
)
572 RETV('(', 0); /* all parens non-capturing */
582 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
583 if (HAVE(6) && *(v
->now
+0) == CHR('[') &&
584 *(v
->now
+1) == CHR(':') &&
585 (*(v
->now
+2) == CHR('<') ||
586 *(v
->now
+2) == CHR('>')) &&
587 *(v
->now
+3) == CHR(':') &&
588 *(v
->now
+4) == CHR(']') &&
589 *(v
->now
+5) == CHR(']')) {
593 RET((c
== CHR('<')) ? '<' : '>');
611 case CHR('\\'): /* mostly punt backslashes to code below */
615 default: /* ordinary character */
620 /* ERE/ARE backslash handling; backslash already eaten */
622 if (!(v
->cflags
®_ADVF
)) { /* only AREs have non-trivial escapes */
623 if (iscalnum(*v
->now
)) {
627 RETV(PLAIN
, *v
->now
++);
629 (DISCARD
)lexescape(v
);
632 if (v
->nexttype
== CCLASS
) { /* fudge at lexical level */
633 switch (v
->nextvalue
) {
634 case 'd': lexnest(v
, backd
, ENDOF(backd
)); break;
635 case 'D': lexnest(v
, backD
, ENDOF(backD
)); break;
636 case 's': lexnest(v
, backs
, ENDOF(backs
)); break;
637 case 'S': lexnest(v
, backS
, ENDOF(backS
)); break;
638 case 'w': lexnest(v
, backw
, ENDOF(backw
)); break;
639 case 'W': lexnest(v
, backW
, ENDOF(backW
)); break;
645 /* lexnest done, back up and try again */
646 v
->nexttype
= v
->lasttype
;
649 /* otherwise, lexescape has already done the work */
654 - lexescape - parse an ARE backslash escape (backslash already eaten)
655 * Note slightly nonstandard use of the CCLASS type code.
656 ^ static int lexescape(struct vars *);
658 static int /* not actually used, but convenient for RETV */
663 static chr alert
[] = {
664 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
667 CHR('E'), CHR('S'), CHR('C')
671 assert(v
->cflags
®_ADVF
);
681 RETV(PLAIN
, chrnamed(v
, alert
, ENDOF(alert
), CHR('\007')));
687 RETV(PLAIN
, CHR('\b'));
690 RETV(PLAIN
, CHR('\\'));
696 RETV(PLAIN
, (chr
)(*v
->now
++ & 037));
708 RETV(PLAIN
, chrnamed(v
, esc
, ENDOF(esc
), CHR('\033')));
711 RETV(PLAIN
, CHR('\f'));
720 RETV(PLAIN
, CHR('\n'));
723 RETV(PLAIN
, CHR('\r'));
734 RETV(PLAIN
, CHR('\t'));
737 c
= lexdigits(v
, 16, 4, 4);
743 c
= lexdigits(v
, 16, 8, 8);
749 RETV(PLAIN
, CHR('\v'));
761 c
= lexdigits(v
, 16, 1, 255); /* REs >255 long outside spec */
777 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
778 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
781 v
->now
--; /* put first digit back */
782 c
= lexdigits(v
, 10, 1, 255); /* REs >255 long outside spec */
785 /* ugly heuristic (first test is "exactly 1 digit?") */
786 if (v
->now
- save
== 0 || (int)c
<= v
->nsubexp
) {
788 RETV(BACKREF
, (chr
)c
);
790 /* oops, doesn't look like it's a backref after all... */
792 /* and fall through into octal number */
795 v
->now
--; /* put first digit back */
796 c
= lexdigits(v
, 8, 1, 3);
803 FAILW(REG_EESCAPE
); /* unknown alphabetic escape */
810 - lexdigits - slurp up digits and return chr value
811 ^ static chr lexdigits(struct vars *, int, int, int);
813 static chr
/* chr value; errors signalled via ERR */
814 lexdigits(v
, base
, minlen
, maxlen
)
820 uchr n
; /* unsigned to avoid overflow misbehavior */
824 CONST uchr ub
= (uchr
) base
;
827 for (len
= 0; len
< maxlen
&& !ATEOS(); len
++) {
830 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
831 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
832 case CHR('8'): case CHR('9'):
835 case CHR('a'): case CHR('A'): d
= 10; break;
836 case CHR('b'): case CHR('B'): d
= 11; break;
837 case CHR('c'): case CHR('C'): d
= 12; break;
838 case CHR('d'): case CHR('D'): d
= 13; break;
839 case CHR('e'): case CHR('E'): d
= 14; break;
840 case CHR('f'): case CHR('F'): d
= 15; break;
842 v
->now
--; /* oops, not a digit at all */
847 if (d
>= base
) { /* not a plausible digit */
852 break; /* NOTE BREAK OUT */
862 - brenext - get next BRE token
863 * This is much like EREs except for all the stupid backslashes and the
864 * context-dependency of some things.
865 ^ static int brenext(struct vars *, pchr);
867 static int /* 1 normal, 0 failure */
876 if (LASTTYPE(EMPTY
) || LASTTYPE('(') || LASTTYPE('^'))
881 if (HAVE(6) && *(v
->now
+0) == CHR('[') &&
882 *(v
->now
+1) == CHR(':') &&
883 (*(v
->now
+2) == CHR('<') ||
884 *(v
->now
+2) == CHR('>')) &&
885 *(v
->now
+3) == CHR(':') &&
886 *(v
->now
+4) == CHR(']') &&
887 *(v
->now
+5) == CHR(']')) {
891 RET((c
== CHR('<')) ? '<' : '>');
913 if (v
->cflags
®_EXPANDED
)
917 if (NEXT2('\\', ')')) {
924 break; /* see below */
930 assert(c
== CHR('\\'));
956 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
957 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
960 RETV(BACKREF
, (chr
)DIGITVAL(c
));
975 - skip - skip white space and comments in expanded form
976 ^ static VOID skip(struct vars *);
984 assert(v
->cflags
®_EXPANDED
);
987 while (!ATEOS() && iscspace(*v
->now
))
989 if (ATEOS() || *v
->now
!= CHR('#'))
990 break; /* NOTE BREAK OUT */
992 while (!ATEOS() && *v
->now
!= CHR('\n'))
994 /* leave the newline to be picked up by the iscspace loop */
1002 - newline - return the chr for a newline
1003 * This helps confine use of CHR to this source file.
1004 ^ static chr newline(NOPARMS);
1013 - ch - return the chr sequence for regc_locale.c's fake collating element ch
1014 * This helps confine use of CHR to this source file. Beware that the caller
1015 * knows how long the sequence is.
1017 ^ static chr *ch(NOPARMS);
1024 static chr chstr
[] = { CHR('c'), CHR('h'), CHR('\0') };
1031 - chrnamed - return the chr known by a given (chr string) name
1032 * The code is a bit clumsy, but this routine gets only such specialized
1033 * use that it hardly matters.
1034 ^ static chr chrnamed(struct vars *, chr *, chr *, pchr);
1037 chrnamed(v
, startp
, endp
, lastresort
)
1039 chr
*startp
; /* start of name */
1040 chr
*endp
; /* just past end of name */
1041 pchr lastresort
; /* what to return if name lookup fails */
1050 c
= element(v
, startp
, endp
);
1055 return (chr
)lastresort
;
1057 cv
= range(v
, c
, c
, 0);
1059 return (chr
)lastresort
;