Commit | Line | Data |
---|---|---|
38d679fd RN |
1 | /* |
2 | * lexical analyzer | |
3 | * This file is #included by regcomp.c. | |
4 | * | |
3ca4086b VS |
5 | * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. |
6 | * | |
38d679fd RN |
7 | * Development of this software was funded, in part, by Cray Research Inc., |
8 | * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics | |
9 | * Corporation, none of whom are responsible for the results. The author | |
3ca4086b VS |
10 | * thanks all of them. |
11 | * | |
38d679fd RN |
12 | * Redistribution and use in source and binary forms -- with or without |
13 | * modification -- are permitted for any purpose, provided that | |
14 | * redistributions in source form retain this entire copyright notice and | |
15 | * indicate the origin and nature of any modifications. | |
3ca4086b | 16 | * |
38d679fd RN |
17 | * I'd appreciate being given credit for this package in the documentation |
18 | * of software which uses it, but that is not a requirement. | |
3ca4086b | 19 | * |
38d679fd RN |
20 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, |
21 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY | |
22 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL | |
23 | * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
24 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
26 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |
27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | |
28 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |
29 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
30 | * | |
38d679fd RN |
31 | */ |
32 | ||
33 | /* scanning macros (know about v) */ | |
3ca4086b VS |
34 | #define ATEOS() (v->now >= v->stop) |
35 | #define HAVE(n) (v->stop - v->now >= (n)) | |
36 | #define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) | |
37 | #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) | |
38 | #define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \ | |
38d679fd RN |
39 | *(v->now+1) == CHR(b) && \ |
40 | *(v->now+2) == CHR(c)) | |
3ca4086b VS |
41 | #define SET(c) (v->nexttype = (c)) |
42 | #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) | |
43 | #define RET(c) return (SET(c), 1) | |
44 | #define RETV(c, n) return (SETV(c, n), 1) | |
45 | #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ | |
46 | #define LASTTYPE(t) (v->lasttype == (t)) | |
38d679fd RN |
47 | |
48 | /* lexical contexts */ | |
3ca4086b VS |
49 | #define L_ERE 1 /* mainline ERE/ARE */ |
50 | #define L_BRE 2 /* mainline BRE */ | |
51 | #define L_Q 3 /* REG_QUOTE */ | |
52 | #define L_EBND 4 /* ERE/ARE bound */ | |
53 | #define L_BBND 5 /* BRE bound */ | |
54 | #define L_BRACK 6 /* brackets */ | |
55 | #define L_CEL 7 /* collating element */ | |
56 | #define L_ECL 8 /* equivalence class */ | |
57 | #define L_CCL 9 /* character class */ | |
58 | #define INTOCON(c) (v->lexcon = (c)) | |
59 | #define INCON(con) (v->lexcon == (con)) | |
38d679fd RN |
60 | |
61 | /* construct pointer past end of chr array */ | |
3ca4086b | 62 | #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) |
38d679fd RN |
63 | |
64 | /* | |
3ca4086b VS |
65 | - lexstart - set up lexical stuff, scan leading options |
66 | ^ static VOID lexstart(struct vars *); | |
38d679fd | 67 | */ |
3ca4086b VS |
68 | static VOID |
69 | lexstart(v) | |
70 | struct vars *v; | |
38d679fd | 71 | { |
3ca4086b | 72 | prefixes(v); /* may turn on new type bits etc. */ |
38d679fd RN |
73 | NOERR(); |
74 | ||
3ca4086b VS |
75 | if (v->cflags®_QUOTE) { |
76 | assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))); | |
38d679fd | 77 | INTOCON(L_Q); |
3ca4086b VS |
78 | } else if (v->cflags®_EXTENDED) { |
79 | assert(!(v->cflags®_QUOTE)); | |
38d679fd | 80 | INTOCON(L_ERE); |
3ca4086b VS |
81 | } else { |
82 | assert(!(v->cflags&(REG_QUOTE|REG_ADVF))); | |
38d679fd RN |
83 | INTOCON(L_BRE); |
84 | } | |
85 | ||
86 | v->nexttype = EMPTY; /* remember we were at the start */ | |
3ca4086b | 87 | next(v); /* set up the first token */ |
38d679fd RN |
88 | } |
89 | ||
90 | /* | |
3ca4086b VS |
91 | - prefixes - implement various special prefixes |
92 | ^ static VOID prefixes(struct vars *); | |
38d679fd | 93 | */ |
3ca4086b VS |
94 | static VOID |
95 | prefixes(v) | |
96 | struct vars *v; | |
38d679fd RN |
97 | { |
98 | /* literal string doesn't get any of this stuff */ | |
3ca4086b | 99 | if (v->cflags®_QUOTE) |
38d679fd RN |
100 | return; |
101 | ||
3ca4086b | 102 | /* initial "***" gets special things */ |
38d679fd | 103 | if (HAVE(4) && NEXT3('*', '*', '*')) |
3ca4086b VS |
104 | switch (*(v->now + 3)) { |
105 | case CHR('?'): /* "***?" error, msg shows version */ | |
106 | ERR(REG_BADPAT); | |
107 | return; /* proceed no further */ | |
108 | break; | |
109 | case CHR('='): /* "***=" shifts to literal string */ | |
110 | NOTE(REG_UNONPOSIX); | |
111 | v->cflags |= REG_QUOTE; | |
112 | v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE); | |
113 | v->now += 4; | |
114 | return; /* and there can be no more prefixes */ | |
115 | break; | |
116 | case CHR(':'): /* "***:" shifts to AREs */ | |
117 | NOTE(REG_UNONPOSIX); | |
118 | v->cflags |= REG_ADVANCED; | |
119 | v->now += 4; | |
120 | break; | |
121 | default: /* otherwise *** is just an error */ | |
122 | ERR(REG_BADRPT); | |
123 | return; | |
124 | break; | |
38d679fd RN |
125 | } |
126 | ||
127 | /* BREs and EREs don't get embedded options */ | |
3ca4086b | 128 | if ((v->cflags®_ADVANCED) != REG_ADVANCED) |
38d679fd RN |
129 | return; |
130 | ||
131 | /* embedded options (AREs only) */ | |
3ca4086b | 132 | if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) { |
38d679fd RN |
133 | NOTE(REG_UNONPOSIX); |
134 | v->now += 2; | |
135 | for (; !ATEOS() && iscalpha(*v->now); v->now++) | |
3ca4086b VS |
136 | switch (*v->now) { |
137 | case CHR('b'): /* BREs (but why???) */ | |
138 | v->cflags &= ~(REG_ADVANCED|REG_QUOTE); | |
139 | break; | |
140 | case CHR('c'): /* case sensitive */ | |
141 | v->cflags &= ~REG_ICASE; | |
142 | break; | |
143 | case CHR('e'): /* plain EREs */ | |
144 | v->cflags |= REG_EXTENDED; | |
145 | v->cflags &= ~(REG_ADVF|REG_QUOTE); | |
146 | break; | |
147 | case CHR('i'): /* case insensitive */ | |
148 | v->cflags |= REG_ICASE; | |
149 | break; | |
150 | case CHR('m'): /* Perloid synonym for n */ | |
151 | case CHR('n'): /* \n affects ^ $ . [^ */ | |
152 | v->cflags |= REG_NEWLINE; | |
153 | break; | |
154 | case CHR('p'): /* ~Perl, \n affects . [^ */ | |
155 | v->cflags |= REG_NLSTOP; | |
156 | v->cflags &= ~REG_NLANCH; | |
157 | break; | |
158 | case CHR('q'): /* literal string */ | |
159 | v->cflags |= REG_QUOTE; | |
160 | v->cflags &= ~REG_ADVANCED; | |
161 | break; | |
162 | case CHR('s'): /* single line, \n ordinary */ | |
163 | v->cflags &= ~REG_NEWLINE; | |
164 | break; | |
165 | case CHR('t'): /* tight syntax */ | |
166 | v->cflags &= ~REG_EXPANDED; | |
167 | break; | |
168 | case CHR('w'): /* weird, \n affects ^ $ only */ | |
169 | v->cflags &= ~REG_NLSTOP; | |
170 | v->cflags |= REG_NLANCH; | |
171 | break; | |
172 | case CHR('x'): /* expanded syntax */ | |
173 | v->cflags |= REG_EXPANDED; | |
174 | break; | |
175 | default: | |
176 | ERR(REG_BADOPT); | |
177 | return; | |
38d679fd | 178 | } |
3ca4086b | 179 | if (!NEXT1(')')) { |
38d679fd RN |
180 | ERR(REG_BADOPT); |
181 | return; | |
182 | } | |
183 | v->now++; | |
3ca4086b VS |
184 | if (v->cflags®_QUOTE) |
185 | v->cflags &= ~(REG_EXPANDED|REG_NEWLINE); | |
38d679fd RN |
186 | } |
187 | } | |
188 | ||
189 | /* | |
3ca4086b | 190 | - lexnest - "call a subroutine", interpolating string at the lexical level |
38d679fd RN |
191 | * Note, this is not a very general facility. There are a number of |
192 | * implicit assumptions about what sorts of strings can be subroutines. | |
3ca4086b | 193 | ^ static VOID lexnest(struct vars *, chr *, chr *); |
38d679fd | 194 | */ |
3ca4086b VS |
195 | static VOID |
196 | lexnest(v, beginp, endp) | |
197 | struct vars *v; | |
198 | chr *beginp; /* start of interpolation */ | |
199 | chr *endp; /* one past end of interpolation */ | |
38d679fd | 200 | { |
3ca4086b | 201 | assert(v->savenow == NULL); /* only one level of nesting */ |
38d679fd RN |
202 | v->savenow = v->now; |
203 | v->savestop = v->stop; | |
204 | v->now = beginp; | |
205 | v->stop = endp; | |
206 | } | |
207 | ||
208 | /* | |
209 | * string constants to interpolate as expansions of things like \d | |
210 | */ | |
3ca4086b | 211 | static chr backd[] = { /* \d */ |
38d679fd RN |
212 | CHR('['), CHR('['), CHR(':'), |
213 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), | |
214 | CHR(':'), CHR(']'), CHR(']') | |
215 | }; | |
3ca4086b | 216 | static chr backD[] = { /* \D */ |
38d679fd RN |
217 | CHR('['), CHR('^'), CHR('['), CHR(':'), |
218 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), | |
219 | CHR(':'), CHR(']'), CHR(']') | |
220 | }; | |
3ca4086b | 221 | static chr brbackd[] = { /* \d within brackets */ |
38d679fd RN |
222 | CHR('['), CHR(':'), |
223 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), | |
224 | CHR(':'), CHR(']') | |
225 | }; | |
3ca4086b | 226 | static chr backs[] = { /* \s */ |
38d679fd RN |
227 | CHR('['), CHR('['), CHR(':'), |
228 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), | |
229 | CHR(':'), CHR(']'), CHR(']') | |
230 | }; | |
3ca4086b | 231 | static chr backS[] = { /* \S */ |
38d679fd RN |
232 | CHR('['), CHR('^'), CHR('['), CHR(':'), |
233 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), | |
234 | CHR(':'), CHR(']'), CHR(']') | |
235 | }; | |
3ca4086b | 236 | static chr brbacks[] = { /* \s within brackets */ |
38d679fd RN |
237 | CHR('['), CHR(':'), |
238 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), | |
239 | CHR(':'), CHR(']') | |
240 | }; | |
3ca4086b | 241 | static chr backw[] = { /* \w */ |
38d679fd RN |
242 | CHR('['), CHR('['), CHR(':'), |
243 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), | |
244 | CHR(':'), CHR(']'), CHR('_'), CHR(']') | |
245 | }; | |
3ca4086b | 246 | static chr backW[] = { /* \W */ |
38d679fd RN |
247 | CHR('['), CHR('^'), CHR('['), CHR(':'), |
248 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), | |
249 | CHR(':'), CHR(']'), CHR('_'), CHR(']') | |
250 | }; | |
3ca4086b | 251 | static chr brbackw[] = { /* \w within brackets */ |
38d679fd RN |
252 | CHR('['), CHR(':'), |
253 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), | |
254 | CHR(':'), CHR(']'), CHR('_') | |
255 | }; | |
256 | ||
257 | /* | |
3ca4086b | 258 | - lexword - interpolate a bracket expression for word characters |
38d679fd | 259 | * Possibly ought to inquire whether there is a "word" character class. |
3ca4086b | 260 | ^ static VOID lexword(struct vars *); |
38d679fd | 261 | */ |
3ca4086b VS |
262 | static VOID |
263 | lexword(v) | |
264 | struct vars *v; | |
38d679fd RN |
265 | { |
266 | lexnest(v, backw, ENDOF(backw)); | |
267 | } | |
268 | ||
269 | /* | |
3ca4086b VS |
270 | - next - get next token |
271 | ^ static int next(struct vars *); | |
38d679fd | 272 | */ |
3ca4086b VS |
273 | static int /* 1 normal, 0 failure */ |
274 | next(v) | |
275 | struct vars *v; | |
38d679fd | 276 | { |
3ca4086b | 277 | chr c; |
38d679fd RN |
278 | |
279 | /* errors yield an infinite sequence of failures */ | |
280 | if (ISERR()) | |
3ca4086b | 281 | return 0; /* the error has set nexttype to EOS */ |
38d679fd RN |
282 | |
283 | /* remember flavor of last token */ | |
284 | v->lasttype = v->nexttype; | |
285 | ||
286 | /* REG_BOSONLY */ | |
3ca4086b | 287 | if (v->nexttype == EMPTY && (v->cflags®_BOSONLY)) { |
38d679fd RN |
288 | /* at start of a REG_BOSONLY RE */ |
289 | RETV(SBEGIN, 0); /* same as \A */ | |
290 | } | |
291 | ||
292 | /* if we're nested and we've hit end, return to outer level */ | |
3ca4086b | 293 | if (v->savenow != NULL && ATEOS()) { |
38d679fd RN |
294 | v->now = v->savenow; |
295 | v->stop = v->savestop; | |
296 | v->savenow = v->savestop = NULL; | |
297 | } | |
298 | ||
299 | /* skip white space etc. if appropriate (not in literal or []) */ | |
3ca4086b VS |
300 | if (v->cflags®_EXPANDED) |
301 | switch (v->lexcon) { | |
302 | case L_ERE: | |
303 | case L_BRE: | |
304 | case L_EBND: | |
305 | case L_BBND: | |
306 | skip(v); | |
307 | break; | |
38d679fd RN |
308 | } |
309 | ||
310 | /* handle EOS, depending on context */ | |
3ca4086b VS |
311 | if (ATEOS()) { |
312 | switch (v->lexcon) { | |
313 | case L_ERE: | |
314 | case L_BRE: | |
315 | case L_Q: | |
316 | RET(EOS); | |
317 | break; | |
318 | case L_EBND: | |
319 | case L_BBND: | |
320 | FAILW(REG_EBRACE); | |
321 | break; | |
322 | case L_BRACK: | |
323 | case L_CEL: | |
324 | case L_ECL: | |
325 | case L_CCL: | |
326 | FAILW(REG_EBRACK); | |
327 | break; | |
38d679fd RN |
328 | } |
329 | assert(NOTREACHED); | |
330 | } | |
331 | ||
332 | /* okay, time to actually get a character */ | |
333 | c = *v->now++; | |
334 | ||
335 | /* deal with the easy contexts, punt EREs to code below */ | |
3ca4086b VS |
336 | switch (v->lexcon) { |
337 | case L_BRE: /* punt BREs to separate function */ | |
338 | return brenext(v, c); | |
339 | break; | |
340 | case L_ERE: /* see below */ | |
341 | break; | |
342 | case L_Q: /* literal strings are easy */ | |
343 | RETV(PLAIN, c); | |
344 | break; | |
345 | case L_BBND: /* bounds are fairly simple */ | |
346 | case L_EBND: | |
347 | switch (c) { | |
348 | case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): | |
349 | case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): | |
350 | case CHR('8'): case CHR('9'): | |
351 | RETV(DIGIT, (chr)DIGITVAL(c)); | |
38d679fd | 352 | break; |
3ca4086b VS |
353 | case CHR(','): |
354 | RET(','); | |
38d679fd | 355 | break; |
3ca4086b VS |
356 | case CHR('}'): /* ERE bound ends with } */ |
357 | if (INCON(L_EBND)) { | |
358 | INTOCON(L_ERE); | |
359 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
360 | v->now++; | |
361 | NOTE(REG_UNONPOSIX); | |
362 | RETV('}', 0); | |
363 | } | |
364 | RETV('}', 1); | |
365 | } else | |
366 | FAILW(REG_BADBR); | |
38d679fd | 367 | break; |
3ca4086b VS |
368 | case CHR('\\'): /* BRE bound ends with \} */ |
369 | if (INCON(L_BBND) && NEXT1('}')) { | |
370 | v->now++; | |
371 | INTOCON(L_BRE); | |
372 | RET('}'); | |
373 | } else | |
374 | FAILW(REG_BADBR); | |
375 | break; | |
376 | default: | |
377 | FAILW(REG_BADBR); | |
378 | break; | |
379 | } | |
380 | assert(NOTREACHED); | |
381 | break; | |
382 | case L_BRACK: /* brackets are not too hard */ | |
383 | switch (c) { | |
384 | case CHR(']'): | |
385 | if (LASTTYPE('[')) | |
386 | RETV(PLAIN, c); | |
387 | else { | |
388 | INTOCON((v->cflags®_EXTENDED) ? | |
389 | L_ERE : L_BRE); | |
390 | RET(']'); | |
38d679fd | 391 | } |
38d679fd | 392 | break; |
3ca4086b VS |
393 | case CHR('\\'): |
394 | NOTE(REG_UBBS); | |
395 | if (!(v->cflags®_ADVF)) | |
396 | RETV(PLAIN, c); | |
397 | NOTE(REG_UNONPOSIX); | |
398 | if (ATEOS()) | |
399 | FAILW(REG_EESCAPE); | |
400 | (DISCARD)lexescape(v); | |
401 | switch (v->nexttype) { /* not all escapes okay here */ | |
402 | case PLAIN: | |
403 | return 1; | |
404 | break; | |
405 | case CCLASS: | |
406 | switch (v->nextvalue) { | |
407 | case 'd': | |
408 | lexnest(v, brbackd, ENDOF(brbackd)); | |
38d679fd | 409 | break; |
3ca4086b VS |
410 | case 's': |
411 | lexnest(v, brbacks, ENDOF(brbacks)); | |
38d679fd | 412 | break; |
3ca4086b VS |
413 | case 'w': |
414 | lexnest(v, brbackw, ENDOF(brbackw)); | |
38d679fd RN |
415 | break; |
416 | default: | |
3ca4086b | 417 | FAILW(REG_EESCAPE); |
38d679fd | 418 | break; |
3ca4086b VS |
419 | } |
420 | /* lexnest done, back up and try again */ | |
421 | v->nexttype = v->lasttype; | |
422 | return next(v); | |
423 | break; | |
38d679fd | 424 | } |
3ca4086b VS |
425 | /* not one of the acceptable escapes */ |
426 | FAILW(REG_EESCAPE); | |
38d679fd | 427 | break; |
3ca4086b VS |
428 | case CHR('-'): |
429 | if (LASTTYPE('[') || NEXT1(']')) | |
38d679fd | 430 | RETV(PLAIN, c); |
38d679fd | 431 | else |
3ca4086b | 432 | RETV(RANGE, c); |
38d679fd | 433 | break; |
3ca4086b VS |
434 | case CHR('['): |
435 | if (ATEOS()) | |
436 | FAILW(REG_EBRACK); | |
437 | switch (*v->now++) { | |
438 | case CHR('.'): | |
439 | INTOCON(L_CEL); | |
440 | /* might or might not be locale-specific */ | |
441 | RET(COLLEL); | |
442 | break; | |
443 | case CHR('='): | |
444 | INTOCON(L_ECL); | |
445 | NOTE(REG_ULOCALE); | |
446 | RET(ECLASS); | |
447 | break; | |
448 | case CHR(':'): | |
449 | INTOCON(L_CCL); | |
450 | NOTE(REG_ULOCALE); | |
451 | RET(CCLASS); | |
452 | break; | |
453 | default: /* oops */ | |
454 | v->now--; | |
38d679fd | 455 | RETV(PLAIN, c); |
3ca4086b VS |
456 | break; |
457 | } | |
458 | assert(NOTREACHED); | |
38d679fd RN |
459 | break; |
460 | default: | |
3ca4086b | 461 | RETV(PLAIN, c); |
38d679fd | 462 | break; |
3ca4086b VS |
463 | } |
464 | assert(NOTREACHED); | |
465 | break; | |
466 | case L_CEL: /* collating elements are easy */ | |
467 | if (c == CHR('.') && NEXT1(']')) { | |
468 | v->now++; | |
469 | INTOCON(L_BRACK); | |
470 | RETV(END, '.'); | |
471 | } else | |
472 | RETV(PLAIN, c); | |
473 | break; | |
474 | case L_ECL: /* ditto equivalence classes */ | |
475 | if (c == CHR('=') && NEXT1(']')) { | |
476 | v->now++; | |
477 | INTOCON(L_BRACK); | |
478 | RETV(END, '='); | |
479 | } else | |
480 | RETV(PLAIN, c); | |
481 | break; | |
482 | case L_CCL: /* ditto character classes */ | |
483 | if (c == CHR(':') && NEXT1(']')) { | |
484 | v->now++; | |
485 | INTOCON(L_BRACK); | |
486 | RETV(END, ':'); | |
487 | } else | |
488 | RETV(PLAIN, c); | |
489 | break; | |
490 | default: | |
491 | assert(NOTREACHED); | |
492 | break; | |
38d679fd RN |
493 | } |
494 | ||
495 | /* that got rid of everything except EREs and AREs */ | |
496 | assert(INCON(L_ERE)); | |
497 | ||
498 | /* deal with EREs and AREs, except for backslashes */ | |
3ca4086b VS |
499 | switch (c) { |
500 | case CHR('|'): | |
501 | RET('|'); | |
502 | break; | |
503 | case CHR('*'): | |
504 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
505 | v->now++; | |
506 | NOTE(REG_UNONPOSIX); | |
507 | RETV('*', 0); | |
508 | } | |
509 | RETV('*', 1); | |
510 | break; | |
511 | case CHR('+'): | |
512 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
513 | v->now++; | |
514 | NOTE(REG_UNONPOSIX); | |
515 | RETV('+', 0); | |
516 | } | |
517 | RETV('+', 1); | |
518 | break; | |
519 | case CHR('?'): | |
520 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
521 | v->now++; | |
522 | NOTE(REG_UNONPOSIX); | |
523 | RETV('?', 0); | |
524 | } | |
525 | RETV('?', 1); | |
526 | break; | |
527 | case CHR('{'): /* bounds start or plain character */ | |
528 | if (v->cflags®_EXPANDED) | |
529 | skip(v); | |
530 | if (ATEOS() || !iscdigit(*v->now)) { | |
531 | NOTE(REG_UBRACES); | |
532 | NOTE(REG_UUNSPEC); | |
533 | RETV(PLAIN, c); | |
534 | } else { | |
535 | NOTE(REG_UBOUNDS); | |
536 | INTOCON(L_EBND); | |
537 | RET('{'); | |
538 | } | |
539 | assert(NOTREACHED); | |
540 | break; | |
541 | case CHR('('): /* parenthesis, or advanced extension */ | |
542 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
543 | NOTE(REG_UNONPOSIX); | |
544 | v->now++; | |
545 | switch (*v->now++) { | |
546 | case CHR(':'): /* non-capturing paren */ | |
547 | RETV('(', 0); | |
548 | break; | |
549 | case CHR('#'): /* comment */ | |
550 | while (!ATEOS() && *v->now != CHR(')')) | |
551 | v->now++; | |
552 | if (!ATEOS()) | |
553 | v->now++; | |
554 | assert(v->nexttype == v->lasttype); | |
555 | return next(v); | |
556 | break; | |
557 | case CHR('='): /* positive lookahead */ | |
558 | NOTE(REG_ULOOKAHEAD); | |
559 | RETV(LACON, 1); | |
560 | break; | |
561 | case CHR('!'): /* negative lookahead */ | |
562 | NOTE(REG_ULOOKAHEAD); | |
563 | RETV(LACON, 0); | |
564 | break; | |
565 | default: | |
566 | FAILW(REG_BADRPT); | |
567 | break; | |
38d679fd RN |
568 | } |
569 | assert(NOTREACHED); | |
3ca4086b VS |
570 | } |
571 | if (v->cflags®_NOSUB) | |
572 | RETV('(', 0); /* all parens non-capturing */ | |
573 | else | |
574 | RETV('(', 1); | |
575 | break; | |
576 | case CHR(')'): | |
577 | if (LASTTYPE('(')) { | |
578 | NOTE(REG_UUNSPEC); | |
579 | } | |
580 | RETV(')', c); | |
581 | break; | |
582 | case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ | |
583 | if (HAVE(6) && *(v->now+0) == CHR('[') && | |
584 | *(v->now+1) == CHR(':') && | |
585 | (*(v->now+2) == CHR('<') || | |
586 | *(v->now+2) == CHR('>')) && | |
587 | *(v->now+3) == CHR(':') && | |
588 | *(v->now+4) == CHR(']') && | |
589 | *(v->now+5) == CHR(']')) { | |
590 | c = *(v->now+2); | |
591 | v->now += 6; | |
592 | NOTE(REG_UNONPOSIX); | |
593 | RET((c == CHR('<')) ? '<' : '>'); | |
594 | } | |
595 | INTOCON(L_BRACK); | |
596 | if (NEXT1('^')) { | |
597 | v->now++; | |
598 | RETV('[', 0); | |
599 | } | |
600 | RETV('[', 1); | |
601 | break; | |
602 | case CHR('.'): | |
603 | RET('.'); | |
604 | break; | |
605 | case CHR('^'): | |
606 | RET('^'); | |
607 | break; | |
608 | case CHR('$'): | |
609 | RET('$'); | |
610 | break; | |
611 | case CHR('\\'): /* mostly punt backslashes to code below */ | |
612 | if (ATEOS()) | |
613 | FAILW(REG_EESCAPE); | |
614 | break; | |
615 | default: /* ordinary character */ | |
616 | RETV(PLAIN, c); | |
617 | break; | |
38d679fd RN |
618 | } |
619 | ||
620 | /* ERE/ARE backslash handling; backslash already eaten */ | |
621 | assert(!ATEOS()); | |
3ca4086b VS |
622 | if (!(v->cflags®_ADVF)) { /* only AREs have non-trivial escapes */ |
623 | if (iscalnum(*v->now)) { | |
38d679fd RN |
624 | NOTE(REG_UBSALNUM); |
625 | NOTE(REG_UUNSPEC); | |
626 | } | |
627 | RETV(PLAIN, *v->now++); | |
628 | } | |
3ca4086b | 629 | (DISCARD)lexescape(v); |
38d679fd RN |
630 | if (ISERR()) |
631 | FAILW(REG_EESCAPE); | |
3ca4086b VS |
632 | if (v->nexttype == CCLASS) { /* fudge at lexical level */ |
633 | switch (v->nextvalue) { | |
634 | case 'd': lexnest(v, backd, ENDOF(backd)); break; | |
635 | case 'D': lexnest(v, backD, ENDOF(backD)); break; | |
636 | case 's': lexnest(v, backs, ENDOF(backs)); break; | |
637 | case 'S': lexnest(v, backS, ENDOF(backS)); break; | |
638 | case 'w': lexnest(v, backw, ENDOF(backw)); break; | |
639 | case 'W': lexnest(v, backW, ENDOF(backW)); break; | |
640 | default: | |
641 | assert(NOTREACHED); | |
642 | FAILW(REG_ASSERT); | |
643 | break; | |
38d679fd RN |
644 | } |
645 | /* lexnest done, back up and try again */ | |
646 | v->nexttype = v->lasttype; | |
647 | return next(v); | |
648 | } | |
649 | /* otherwise, lexescape has already done the work */ | |
650 | return !ISERR(); | |
651 | } | |
652 | ||
653 | /* | |
3ca4086b | 654 | - lexescape - parse an ARE backslash escape (backslash already eaten) |
38d679fd | 655 | * Note slightly nonstandard use of the CCLASS type code. |
3ca4086b | 656 | ^ static int lexescape(struct vars *); |
38d679fd | 657 | */ |
3ca4086b VS |
658 | static int /* not actually used, but convenient for RETV */ |
659 | lexescape(v) | |
660 | struct vars *v; | |
38d679fd | 661 | { |
3ca4086b VS |
662 | chr c; |
663 | static chr alert[] = { | |
38d679fd RN |
664 | CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') |
665 | }; | |
3ca4086b | 666 | static chr esc[] = { |
38d679fd RN |
667 | CHR('E'), CHR('S'), CHR('C') |
668 | }; | |
3ca4086b | 669 | chr *save; |
38d679fd | 670 | |
3ca4086b | 671 | assert(v->cflags®_ADVF); |
38d679fd RN |
672 | |
673 | assert(!ATEOS()); | |
674 | c = *v->now++; | |
675 | if (!iscalnum(c)) | |
676 | RETV(PLAIN, c); | |
677 | ||
678 | NOTE(REG_UNONPOSIX); | |
3ca4086b VS |
679 | switch (c) { |
680 | case CHR('a'): | |
681 | RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); | |
682 | break; | |
683 | case CHR('A'): | |
684 | RETV(SBEGIN, 0); | |
685 | break; | |
686 | case CHR('b'): | |
687 | RETV(PLAIN, CHR('\b')); | |
688 | break; | |
689 | case CHR('B'): | |
690 | RETV(PLAIN, CHR('\\')); | |
691 | break; | |
692 | case CHR('c'): | |
693 | NOTE(REG_UUNPORT); | |
694 | if (ATEOS()) | |
695 | FAILW(REG_EESCAPE); | |
696 | RETV(PLAIN, (chr)(*v->now++ & 037)); | |
697 | break; | |
698 | case CHR('d'): | |
699 | NOTE(REG_ULOCALE); | |
700 | RETV(CCLASS, 'd'); | |
701 | break; | |
702 | case CHR('D'): | |
703 | NOTE(REG_ULOCALE); | |
704 | RETV(CCLASS, 'D'); | |
705 | break; | |
706 | case CHR('e'): | |
707 | NOTE(REG_UUNPORT); | |
708 | RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); | |
709 | break; | |
710 | case CHR('f'): | |
711 | RETV(PLAIN, CHR('\f')); | |
712 | break; | |
713 | case CHR('m'): | |
714 | RET('<'); | |
715 | break; | |
716 | case CHR('M'): | |
717 | RET('>'); | |
718 | break; | |
719 | case CHR('n'): | |
720 | RETV(PLAIN, CHR('\n')); | |
721 | break; | |
722 | case CHR('r'): | |
723 | RETV(PLAIN, CHR('\r')); | |
724 | break; | |
725 | case CHR('s'): | |
726 | NOTE(REG_ULOCALE); | |
727 | RETV(CCLASS, 's'); | |
728 | break; | |
729 | case CHR('S'): | |
730 | NOTE(REG_ULOCALE); | |
731 | RETV(CCLASS, 'S'); | |
732 | break; | |
733 | case CHR('t'): | |
734 | RETV(PLAIN, CHR('\t')); | |
735 | break; | |
736 | case CHR('u'): | |
737 | c = lexdigits(v, 16, 4, 4); | |
738 | if (ISERR()) | |
739 | FAILW(REG_EESCAPE); | |
740 | RETV(PLAIN, c); | |
741 | break; | |
742 | case CHR('U'): | |
743 | c = lexdigits(v, 16, 8, 8); | |
744 | if (ISERR()) | |
745 | FAILW(REG_EESCAPE); | |
746 | RETV(PLAIN, c); | |
747 | break; | |
748 | case CHR('v'): | |
749 | RETV(PLAIN, CHR('\v')); | |
750 | break; | |
751 | case CHR('w'): | |
752 | NOTE(REG_ULOCALE); | |
753 | RETV(CCLASS, 'w'); | |
754 | break; | |
755 | case CHR('W'): | |
756 | NOTE(REG_ULOCALE); | |
757 | RETV(CCLASS, 'W'); | |
758 | break; | |
759 | case CHR('x'): | |
760 | NOTE(REG_UUNPORT); | |
761 | c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ | |
762 | if (ISERR()) | |
763 | FAILW(REG_EESCAPE); | |
764 | RETV(PLAIN, c); | |
765 | break; | |
766 | case CHR('y'): | |
767 | NOTE(REG_ULOCALE); | |
768 | RETV(WBDRY, 0); | |
769 | break; | |
770 | case CHR('Y'): | |
771 | NOTE(REG_ULOCALE); | |
772 | RETV(NWBDRY, 0); | |
773 | break; | |
774 | case CHR('Z'): | |
775 | RETV(SEND, 0); | |
776 | break; | |
777 | case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): | |
778 | case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): | |
779 | case CHR('9'): | |
780 | save = v->now; | |
781 | v->now--; /* put first digit back */ | |
782 | c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ | |
783 | if (ISERR()) | |
784 | FAILW(REG_EESCAPE); | |
785 | /* ugly heuristic (first test is "exactly 1 digit?") */ | |
786 | if (v->now - save == 0 || (int)c <= v->nsubexp) { | |
787 | NOTE(REG_UBACKREF); | |
788 | RETV(BACKREF, (chr)c); | |
789 | } | |
790 | /* oops, doesn't look like it's a backref after all... */ | |
791 | v->now = save; | |
792 | /* and fall through into octal number */ | |
793 | case CHR('0'): | |
794 | NOTE(REG_UUNPORT); | |
795 | v->now--; /* put first digit back */ | |
796 | c = lexdigits(v, 8, 1, 3); | |
797 | if (ISERR()) | |
798 | FAILW(REG_EESCAPE); | |
799 | RETV(PLAIN, c); | |
800 | break; | |
801 | default: | |
802 | assert(iscalpha(c)); | |
803 | FAILW(REG_EESCAPE); /* unknown alphabetic escape */ | |
804 | break; | |
38d679fd RN |
805 | } |
806 | assert(NOTREACHED); | |
807 | } | |
808 | ||
809 | /* | |
3ca4086b VS |
810 | - lexdigits - slurp up digits and return chr value |
811 | ^ static chr lexdigits(struct vars *, int, int, int); | |
38d679fd | 812 | */ |
3ca4086b VS |
813 | static chr /* chr value; errors signalled via ERR */ |
814 | lexdigits(v, base, minlen, maxlen) | |
815 | struct vars *v; | |
816 | int base; | |
817 | int minlen; | |
818 | int maxlen; | |
38d679fd | 819 | { |
3ca4086b VS |
820 | uchr n; /* unsigned to avoid overflow misbehavior */ |
821 | int len; | |
822 | chr c; | |
823 | int d; | |
824 | CONST uchr ub = (uchr) base; | |
38d679fd RN |
825 | |
826 | n = 0; | |
3ca4086b | 827 | for (len = 0; len < maxlen && !ATEOS(); len++) { |
38d679fd | 828 | c = *v->now++; |
3ca4086b VS |
829 | switch (c) { |
830 | case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): | |
831 | case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): | |
832 | case CHR('8'): case CHR('9'): | |
833 | d = DIGITVAL(c); | |
834 | break; | |
835 | case CHR('a'): case CHR('A'): d = 10; break; | |
836 | case CHR('b'): case CHR('B'): d = 11; break; | |
837 | case CHR('c'): case CHR('C'): d = 12; break; | |
838 | case CHR('d'): case CHR('D'): d = 13; break; | |
839 | case CHR('e'): case CHR('E'): d = 14; break; | |
840 | case CHR('f'): case CHR('F'): d = 15; break; | |
841 | default: | |
842 | v->now--; /* oops, not a digit at all */ | |
843 | d = -1; | |
844 | break; | |
38d679fd RN |
845 | } |
846 | ||
3ca4086b | 847 | if (d >= base) { /* not a plausible digit */ |
38d679fd RN |
848 | v->now--; |
849 | d = -1; | |
850 | } | |
851 | if (d < 0) | |
3ca4086b VS |
852 | break; /* NOTE BREAK OUT */ |
853 | n = n*ub + (uchr)d; | |
38d679fd RN |
854 | } |
855 | if (len < minlen) | |
856 | ERR(REG_EESCAPE); | |
857 | ||
3ca4086b | 858 | return (chr)n; |
38d679fd RN |
859 | } |
860 | ||
861 | /* | |
3ca4086b | 862 | - brenext - get next BRE token |
38d679fd RN |
863 | * This is much like EREs except for all the stupid backslashes and the |
864 | * context-dependency of some things. | |
3ca4086b | 865 | ^ static int brenext(struct vars *, pchr); |
38d679fd | 866 | */ |
3ca4086b VS |
867 | static int /* 1 normal, 0 failure */ |
868 | brenext(v, pc) | |
869 | struct vars *v; | |
870 | pchr pc; | |
38d679fd | 871 | { |
3ca4086b | 872 | chr c = (chr)pc; |
38d679fd | 873 | |
3ca4086b VS |
874 | switch (c) { |
875 | case CHR('*'): | |
876 | if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) | |
38d679fd | 877 | RETV(PLAIN, c); |
3ca4086b VS |
878 | RET('*'); |
879 | break; | |
880 | case CHR('['): | |
881 | if (HAVE(6) && *(v->now+0) == CHR('[') && | |
882 | *(v->now+1) == CHR(':') && | |
883 | (*(v->now+2) == CHR('<') || | |
884 | *(v->now+2) == CHR('>')) && | |
885 | *(v->now+3) == CHR(':') && | |
886 | *(v->now+4) == CHR(']') && | |
887 | *(v->now+5) == CHR(']')) { | |
888 | c = *(v->now+2); | |
889 | v->now += 6; | |
890 | NOTE(REG_UNONPOSIX); | |
891 | RET((c == CHR('<')) ? '<' : '>'); | |
892 | } | |
893 | INTOCON(L_BRACK); | |
894 | if (NEXT1('^')) { | |
895 | v->now++; | |
896 | RETV('[', 0); | |
897 | } | |
898 | RETV('[', 1); | |
899 | break; | |
900 | case CHR('.'): | |
901 | RET('.'); | |
902 | break; | |
903 | case CHR('^'): | |
904 | if (LASTTYPE(EMPTY)) | |
905 | RET('^'); | |
906 | if (LASTTYPE('(')) { | |
907 | NOTE(REG_UUNSPEC); | |
908 | RET('^'); | |
909 | } | |
910 | RETV(PLAIN, c); | |
911 | break; | |
912 | case CHR('$'): | |
913 | if (v->cflags®_EXPANDED) | |
914 | skip(v); | |
915 | if (ATEOS()) | |
916 | RET('$'); | |
917 | if (NEXT2('\\', ')')) { | |
918 | NOTE(REG_UUNSPEC); | |
919 | RET('$'); | |
920 | } | |
921 | RETV(PLAIN, c); | |
922 | break; | |
923 | case CHR('\\'): | |
924 | break; /* see below */ | |
925 | default: | |
926 | RETV(PLAIN, c); | |
927 | break; | |
38d679fd RN |
928 | } |
929 | ||
930 | assert(c == CHR('\\')); | |
931 | ||
932 | if (ATEOS()) | |
933 | FAILW(REG_EESCAPE); | |
934 | ||
935 | c = *v->now++; | |
3ca4086b VS |
936 | switch (c) { |
937 | case CHR('{'): | |
938 | INTOCON(L_BBND); | |
939 | NOTE(REG_UBOUNDS); | |
940 | RET('{'); | |
941 | break; | |
942 | case CHR('('): | |
943 | RETV('(', 1); | |
944 | break; | |
945 | case CHR(')'): | |
946 | RETV(')', c); | |
947 | break; | |
948 | case CHR('<'): | |
949 | NOTE(REG_UNONPOSIX); | |
950 | RET('<'); | |
951 | break; | |
952 | case CHR('>'): | |
953 | NOTE(REG_UNONPOSIX); | |
954 | RET('>'); | |
955 | break; | |
956 | case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): | |
957 | case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): | |
958 | case CHR('9'): | |
959 | NOTE(REG_UBACKREF); | |
960 | RETV(BACKREF, (chr)DIGITVAL(c)); | |
961 | break; | |
962 | default: | |
963 | if (iscalnum(c)) { | |
964 | NOTE(REG_UBSALNUM); | |
965 | NOTE(REG_UUNSPEC); | |
966 | } | |
967 | RETV(PLAIN, c); | |
968 | break; | |
38d679fd RN |
969 | } |
970 | ||
971 | assert(NOTREACHED); | |
972 | } | |
973 | ||
974 | /* | |
3ca4086b VS |
975 | - skip - skip white space and comments in expanded form |
976 | ^ static VOID skip(struct vars *); | |
38d679fd | 977 | */ |
3ca4086b VS |
978 | static VOID |
979 | skip(v) | |
980 | struct vars *v; | |
38d679fd | 981 | { |
3ca4086b | 982 | chr *start = v->now; |
38d679fd | 983 | |
3ca4086b | 984 | assert(v->cflags®_EXPANDED); |
38d679fd | 985 | |
3ca4086b | 986 | for (;;) { |
38d679fd RN |
987 | while (!ATEOS() && iscspace(*v->now)) |
988 | v->now++; | |
989 | if (ATEOS() || *v->now != CHR('#')) | |
990 | break; /* NOTE BREAK OUT */ | |
991 | assert(NEXT1('#')); | |
992 | while (!ATEOS() && *v->now != CHR('\n')) | |
993 | v->now++; | |
994 | /* leave the newline to be picked up by the iscspace loop */ | |
995 | } | |
996 | ||
997 | if (v->now != start) | |
998 | NOTE(REG_UNONPOSIX); | |
999 | } | |
1000 | ||
1001 | /* | |
3ca4086b | 1002 | - newline - return the chr for a newline |
38d679fd | 1003 | * This helps confine use of CHR to this source file. |
3ca4086b | 1004 | ^ static chr newline(NOPARMS); |
38d679fd RN |
1005 | */ |
1006 | static chr | |
3ca4086b | 1007 | newline() |
38d679fd RN |
1008 | { |
1009 | return CHR('\n'); | |
1010 | } | |
1011 | ||
1012 | /* | |
3ca4086b VS |
1013 | - ch - return the chr sequence for regc_locale.c's fake collating element ch |
1014 | * This helps confine use of CHR to this source file. Beware that the caller | |
1015 | * knows how long the sequence is. | |
1016 | ^ #ifdef REG_DEBUG | |
1017 | ^ static chr *ch(NOPARMS); | |
1018 | ^ #endif | |
1019 | */ | |
1020 | #ifdef REG_DEBUG | |
1021 | static chr * | |
1022 | ch() | |
1023 | { | |
1024 | static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') }; | |
1025 | ||
1026 | return chstr; | |
1027 | } | |
1028 | #endif | |
1029 | ||
1030 | /* | |
1031 | - chrnamed - return the chr known by a given (chr string) name | |
38d679fd RN |
1032 | * The code is a bit clumsy, but this routine gets only such specialized |
1033 | * use that it hardly matters. | |
3ca4086b | 1034 | ^ static chr chrnamed(struct vars *, chr *, chr *, pchr); |
38d679fd RN |
1035 | */ |
1036 | static chr | |
3ca4086b VS |
1037 | chrnamed(v, startp, endp, lastresort) |
1038 | struct vars *v; | |
1039 | chr *startp; /* start of name */ | |
1040 | chr *endp; /* just past end of name */ | |
1041 | pchr lastresort; /* what to return if name lookup fails */ | |
38d679fd | 1042 | { |
3ca4086b VS |
1043 | celt c; |
1044 | int errsave; | |
1045 | int e; | |
38d679fd RN |
1046 | struct cvec *cv; |
1047 | ||
1048 | errsave = v->err; | |
1049 | v->err = 0; | |
1050 | c = element(v, startp, endp); | |
1051 | e = v->err; | |
1052 | v->err = errsave; | |
1053 | ||
1054 | if (e != 0) | |
3ca4086b | 1055 | return (chr)lastresort; |
38d679fd RN |
1056 | |
1057 | cv = range(v, c, c, 0); | |
1058 | if (cv->nchrs == 0) | |
3ca4086b | 1059 | return (chr)lastresort; |
38d679fd RN |
1060 | return cv->chrs[0]; |
1061 | } |