3 * This file is #included by regcomp.c.
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 /* scanning macros (know about v) */
36 #define ATEOS() (v->now >= v->stop)
37 #define HAVE(n) (v->stop - v->now >= (n))
38 #define NEXT1(c) (!ATEOS() && *v->now == CHR(c))
39 #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
40 #define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \
41 *(v->now+1) == CHR(b) && \
42 *(v->now+2) == CHR(c))
43 #define SET(c) (v->nexttype = (c))
44 #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n))
45 #define RET(c) return (SET(c), 1)
46 #define RETV(c, n) return (SETV(c, n), 1)
47 #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */
48 #define LASTTYPE(t) (v->lasttype == (t))
50 /* lexical contexts */
51 #define L_ERE 1 /* mainline ERE/ARE */
52 #define L_BRE 2 /* mainline BRE */
53 #define L_Q 3 /* REG_QUOTE */
54 #define L_EBND 4 /* ERE/ARE bound */
55 #define L_BBND 5 /* BRE bound */
56 #define L_BRACK 6 /* brackets */
57 #define L_CEL 7 /* collating element */
58 #define L_ECL 8 /* equivalence class */
59 #define L_CCL 9 /* character class */
60 #define INTOCON(c) (v->lexcon = (c))
61 #define INCON(con) (v->lexcon == (con))
63 /* construct pointer past end of chr array */
64 #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))
67 * lexstart - set up lexical stuff, scan leading options
70 lexstart(struct vars
* v
)
72 prefixes(v
); /* may turn on new type bits etc. */
75 if (v
->cflags
& REG_QUOTE
)
77 assert(!(v
->cflags
& (REG_ADVANCED
| REG_EXPANDED
| REG_NEWLINE
)));
80 else if (v
->cflags
& REG_EXTENDED
)
82 assert(!(v
->cflags
& REG_QUOTE
));
87 assert(!(v
->cflags
& (REG_QUOTE
| REG_ADVF
)));
91 v
->nexttype
= EMPTY
; /* remember we were at the start */
92 next(v
); /* set up the first token */
96 * prefixes - implement various special prefixes
99 prefixes(struct vars
* v
)
101 /* literal string doesn't get any of this stuff */
102 if (v
->cflags
& REG_QUOTE
)
105 /* initial "***" gets special things */
106 if (HAVE(4) && NEXT3('*', '*', '*'))
107 switch (*(v
->now
+ 3))
109 case CHR('?'): /* "***?" error, msg shows version */
111 return; /* proceed no further */
113 case CHR('='): /* "***=" shifts to literal string */
115 v
->cflags
|= REG_QUOTE
;
116 v
->cflags
&= ~(REG_ADVANCED
| REG_EXPANDED
| REG_NEWLINE
);
118 return; /* and there can be no more prefixes */
120 case CHR(':'): /* "***:" shifts to AREs */
122 v
->cflags
|= REG_ADVANCED
;
125 default: /* otherwise *** is just an error */
131 /* BREs and EREs don't get embedded options */
132 if ((v
->cflags
& REG_ADVANCED
) != REG_ADVANCED
)
135 /* embedded options (AREs only) */
136 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v
->now
+ 2)))
140 for (; !ATEOS() && iscalpha(*v
->now
); v
->now
++)
143 case CHR('b'): /* BREs (but why???) */
144 v
->cflags
&= ~(REG_ADVANCED
| REG_QUOTE
);
146 case CHR('c'): /* case sensitive */
147 v
->cflags
&= ~REG_ICASE
;
149 case CHR('e'): /* plain EREs */
150 v
->cflags
|= REG_EXTENDED
;
151 v
->cflags
&= ~(REG_ADVF
| REG_QUOTE
);
153 case CHR('i'): /* case insensitive */
154 v
->cflags
|= REG_ICASE
;
156 case CHR('m'): /* Perloid synonym for n */
157 case CHR('n'): /* \n affects ^ $ . [^ */
158 v
->cflags
|= REG_NEWLINE
;
160 case CHR('p'): /* ~Perl, \n affects . [^ */
161 v
->cflags
|= REG_NLSTOP
;
162 v
->cflags
&= ~REG_NLANCH
;
164 case CHR('q'): /* literal string */
165 v
->cflags
|= REG_QUOTE
;
166 v
->cflags
&= ~REG_ADVANCED
;
168 case CHR('s'): /* single line, \n ordinary */
169 v
->cflags
&= ~REG_NEWLINE
;
171 case CHR('t'): /* tight syntax */
172 v
->cflags
&= ~REG_EXPANDED
;
174 case CHR('w'): /* weird, \n affects ^ $ only */
175 v
->cflags
&= ~REG_NLSTOP
;
176 v
->cflags
|= REG_NLANCH
;
178 case CHR('x'): /* expanded syntax */
179 v
->cflags
|= REG_EXPANDED
;
191 if (v
->cflags
& REG_QUOTE
)
192 v
->cflags
&= ~(REG_EXPANDED
| REG_NEWLINE
);
197 * lexnest - "call a subroutine", interpolating string at the lexical level
199 * Note, this is not a very general facility. There are a number of
200 * implicit assumptions about what sorts of strings can be subroutines.
203 lexnest(struct vars
* v
,
204 chr
*beginp
, /* start of interpolation */
205 chr
*endp
) /* one past end of interpolation */
207 assert(v
->savenow
== NULL
); /* only one level of nesting */
209 v
->savestop
= v
->stop
;
215 * string constants to interpolate as expansions of things like \d
217 static chr backd
[] = { /* \d */
218 CHR('['), CHR('['), CHR(':'),
219 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
220 CHR(':'), CHR(']'), CHR(']')
222 static chr backD
[] = { /* \D */
223 CHR('['), CHR('^'), CHR('['), CHR(':'),
224 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
225 CHR(':'), CHR(']'), CHR(']')
227 static chr brbackd
[] = { /* \d within brackets */
229 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
232 static chr backs
[] = { /* \s */
233 CHR('['), CHR('['), CHR(':'),
234 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
235 CHR(':'), CHR(']'), CHR(']')
237 static chr backS
[] = { /* \S */
238 CHR('['), CHR('^'), CHR('['), CHR(':'),
239 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
240 CHR(':'), CHR(']'), CHR(']')
242 static chr brbacks
[] = { /* \s within brackets */
244 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
247 static chr backw
[] = { /* \w */
248 CHR('['), CHR('['), CHR(':'),
249 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
250 CHR(':'), CHR(']'), CHR('_'), CHR(']')
252 static chr backW
[] = { /* \W */
253 CHR('['), CHR('^'), CHR('['), CHR(':'),
254 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
255 CHR(':'), CHR(']'), CHR('_'), CHR(']')
257 static chr brbackw
[] = { /* \w within brackets */
259 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
260 CHR(':'), CHR(']'), CHR('_')
264 * lexword - interpolate a bracket expression for word characters
265 * Possibly ought to inquire whether there is a "word" character class.
268 lexword(struct vars
* v
)
270 lexnest(v
, backw
, ENDOF(backw
));
274 * next - get next token
276 static int /* 1 normal, 0 failure */
277 next(struct vars
* v
)
281 /* errors yield an infinite sequence of failures */
283 return 0; /* the error has set nexttype to EOS */
285 /* remember flavor of last token */
286 v
->lasttype
= v
->nexttype
;
289 if (v
->nexttype
== EMPTY
&& (v
->cflags
& REG_BOSONLY
))
291 /* at start of a REG_BOSONLY RE */
292 RETV(SBEGIN
, 0); /* same as \A */
295 /* if we're nested and we've hit end, return to outer level */
296 if (v
->savenow
!= NULL
&& ATEOS())
299 v
->stop
= v
->savestop
;
300 v
->savenow
= v
->savestop
= NULL
;
303 /* skip white space etc. if appropriate (not in literal or []) */
304 if (v
->cflags
& REG_EXPANDED
)
315 /* handle EOS, depending on context */
339 /* okay, time to actually get a character */
342 /* deal with the easy contexts, punt EREs to code below */
345 case L_BRE
: /* punt BREs to separate function */
346 return brenext(v
, c
);
348 case L_ERE
: /* see below */
350 case L_Q
: /* literal strings are easy */
353 case L_BBND
: /* bounds are fairly simple */
367 RETV(DIGIT
, (chr
) DIGITVAL(c
));
372 case CHR('}'): /* ERE bound ends with } */
376 if ((v
->cflags
& REG_ADVF
) && NEXT1('?'))
387 case CHR('\\'): /* BRE bound ends with \} */
388 if (INCON(L_BBND
) && NEXT1('}'))
403 case L_BRACK
: /* brackets are not too hard */
411 INTOCON((v
->cflags
& REG_EXTENDED
) ?
418 if (!(v
->cflags
& REG_ADVF
))
423 (DISCARD
) lexescape(v
);
425 { /* not all escapes okay here */
430 switch (v
->nextvalue
)
433 lexnest(v
, brbackd
, ENDOF(brbackd
));
436 lexnest(v
, brbacks
, ENDOF(brbacks
));
439 lexnest(v
, brbackw
, ENDOF(brbackw
));
445 /* lexnest done, back up and try again */
446 v
->nexttype
= v
->lasttype
;
450 /* not one of the acceptable escapes */
454 if (LASTTYPE('[') || NEXT1(']'))
466 /* might or might not be locale-specific */
492 case L_CEL
: /* collating elements are easy */
493 if (c
== CHR('.') && NEXT1(']'))
502 case L_ECL
: /* ditto equivalence classes */
503 if (c
== CHR('=') && NEXT1(']'))
512 case L_CCL
: /* ditto character classes */
513 if (c
== CHR(':') && NEXT1(']'))
527 /* that got rid of everything except EREs and AREs */
528 assert(INCON(L_ERE
));
530 /* deal with EREs and AREs, except for backslashes */
537 if ((v
->cflags
& REG_ADVF
) && NEXT1('?'))
546 if ((v
->cflags
& REG_ADVF
) && NEXT1('?'))
555 if ((v
->cflags
& REG_ADVF
) && NEXT1('?'))
563 case CHR('{'): /* bounds start or plain character */
564 if (v
->cflags
& REG_EXPANDED
)
566 if (ATEOS() || !iscdigit(*v
->now
))
580 case CHR('('): /* parenthesis, or advanced extension */
581 if ((v
->cflags
& REG_ADVF
) && NEXT1('?'))
587 case CHR(':'): /* non-capturing paren */
590 case CHR('#'): /* comment */
591 while (!ATEOS() && *v
->now
!= CHR(')'))
595 assert(v
->nexttype
== v
->lasttype
);
598 case CHR('='): /* positive lookahead */
599 NOTE(REG_ULOOKAHEAD
);
602 case CHR('!'): /* negative lookahead */
603 NOTE(REG_ULOOKAHEAD
);
612 if (v
->cflags
& REG_NOSUB
)
613 RETV('(', 0); /* all parens non-capturing */
622 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
623 if (HAVE(6) && *(v
->now
+ 0) == CHR('[') &&
624 *(v
->now
+ 1) == CHR(':') &&
625 (*(v
->now
+ 2) == CHR('<') ||
626 *(v
->now
+ 2) == CHR('>')) &&
627 *(v
->now
+ 3) == CHR(':') &&
628 *(v
->now
+ 4) == CHR(']') &&
629 *(v
->now
+ 5) == CHR(']'))
634 RET((c
== CHR('<')) ? '<' : '>');
653 case CHR('\\'): /* mostly punt backslashes to code below */
657 default: /* ordinary character */
662 /* ERE/ARE backslash handling; backslash already eaten */
664 if (!(v
->cflags
& REG_ADVF
))
665 { /* only AREs have non-trivial escapes */
666 if (iscalnum(*v
->now
))
671 RETV(PLAIN
, *v
->now
++);
673 (DISCARD
) lexescape(v
);
676 if (v
->nexttype
== CCLASS
)
677 { /* fudge at lexical level */
678 switch (v
->nextvalue
)
681 lexnest(v
, backd
, ENDOF(backd
));
684 lexnest(v
, backD
, ENDOF(backD
));
687 lexnest(v
, backs
, ENDOF(backs
));
690 lexnest(v
, backS
, ENDOF(backS
));
693 lexnest(v
, backw
, ENDOF(backw
));
696 lexnest(v
, backW
, ENDOF(backW
));
703 /* lexnest done, back up and try again */
704 v
->nexttype
= v
->lasttype
;
707 /* otherwise, lexescape has already done the work */
712 * lexescape - parse an ARE backslash escape (backslash already eaten)
713 * Note slightly nonstandard use of the CCLASS type code.
715 static int /* not actually used, but convenient for
717 lexescape(struct vars
* v
)
720 static chr alert
[] = {
721 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
724 CHR('E'), CHR('S'), CHR('C')
728 assert(v
->cflags
& REG_ADVF
);
739 RETV(PLAIN
, chrnamed(v
, alert
, ENDOF(alert
), CHR('\007')));
745 RETV(PLAIN
, CHR('\b'));
748 RETV(PLAIN
, CHR('\\'));
754 RETV(PLAIN
, (chr
) (*v
->now
++ & 037));
766 RETV(PLAIN
, chrnamed(v
, esc
, ENDOF(esc
), CHR('\033')));
769 RETV(PLAIN
, CHR('\f'));
778 RETV(PLAIN
, CHR('\n'));
781 RETV(PLAIN
, CHR('\r'));
792 RETV(PLAIN
, CHR('\t'));
795 c
= lexdigits(v
, 16, 4, 4);
801 c
= lexdigits(v
, 16, 8, 8);
807 RETV(PLAIN
, CHR('\v'));
819 c
= lexdigits(v
, 16, 1, 255); /* REs >255 long outside
846 v
->now
--; /* put first digit back */
847 c
= lexdigits(v
, 10, 1, 255); /* REs >255 long outside
851 /* ugly heuristic (first test is "exactly 1 digit?") */
852 if (v
->now
- save
== 0 || (int) c
<= v
->nsubexp
)
855 RETV(BACKREF
, (chr
) c
);
857 /* oops, doesn't look like it's a backref after all... */
859 /* and fall through into octal number */
862 v
->now
--; /* put first digit back */
863 c
= lexdigits(v
, 8, 1, 3);
870 FAILW(REG_EESCAPE
); /* unknown alphabetic escape */
877 * lexdigits - slurp up digits and return chr value
879 static chr
/* chr value; errors signalled via ERR */
880 lexdigits(struct vars
* v
,
885 uchr n
; /* unsigned to avoid overflow misbehavior */
889 const uchr ub
= (uchr
) base
;
892 for (len
= 0; len
< maxlen
&& !ATEOS(); len
++)
934 v
->now
--; /* oops, not a digit at all */
940 { /* not a plausible digit */
945 break; /* NOTE BREAK OUT */
946 n
= n
* ub
+ (uchr
) d
;
955 * brenext - get next BRE token
957 * This is much like EREs except for all the stupid backslashes and the
958 * context-dependency of some things.
960 static int /* 1 normal, 0 failure */
961 brenext(struct vars
* v
,
969 if (LASTTYPE(EMPTY
) || LASTTYPE('(') || LASTTYPE('^'))
974 if (HAVE(6) && *(v
->now
+ 0) == CHR('[') &&
975 *(v
->now
+ 1) == CHR(':') &&
976 (*(v
->now
+ 2) == CHR('<') ||
977 *(v
->now
+ 2) == CHR('>')) &&
978 *(v
->now
+ 3) == CHR(':') &&
979 *(v
->now
+ 4) == CHR(']') &&
980 *(v
->now
+ 5) == CHR(']'))
985 RET((c
== CHR('<')) ? '<' : '>');
1009 if (v
->cflags
& REG_EXPANDED
)
1013 if (NEXT2('\\', ')'))
1021 break; /* see below */
1027 assert(c
== CHR('\\'));
1047 NOTE(REG_UNONPOSIX
);
1051 NOTE(REG_UNONPOSIX
);
1064 RETV(BACKREF
, (chr
) DIGITVAL(c
));
1080 * skip - skip white space and comments in expanded form
1083 skip(struct vars
* v
)
1085 chr
*start
= v
->now
;
1087 assert(v
->cflags
& REG_EXPANDED
);
1091 while (!ATEOS() && iscspace(*v
->now
))
1093 if (ATEOS() || *v
->now
!= CHR('#'))
1094 break; /* NOTE BREAK OUT */
1096 while (!ATEOS() && *v
->now
!= CHR('\n'))
1098 /* leave the newline to be picked up by the iscspace loop */
1101 if (v
->now
!= start
)
1102 NOTE(REG_UNONPOSIX
);
1106 * newline - return the chr for a newline
1108 * This helps confine use of CHR to this source file.
1117 * chrnamed - return the chr known by a given (chr string) name
1119 * The code is a bit clumsy, but this routine gets only such specialized
1120 * use that it hardly matters.
1123 chrnamed(struct vars
* v
,
1124 chr
*startp
, /* start of name */
1125 chr
*endp
, /* just past end of name */
1126 chr lastresort
) /* what to return if name lookup fails */
1135 c
= element(v
, startp
, endp
);
1140 return (chr
) lastresort
;
1142 cv
= range(v
, c
, c
, 0);
1144 return (chr
) lastresort
;