Commit | Line | Data |
---|---|---|
38d679fd RN |
1 | /* |
2 | * lexical analyzer | |
3 | * This file is #included by regcomp.c. | |
4 | * | |
5 | * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. | |
6 | * | |
7 | * Development of this software was funded, in part, by Cray Research Inc., | |
8 | * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics | |
9 | * Corporation, none of whom are responsible for the results. The author | |
10 | * thanks all of them. | |
11 | * | |
12 | * Redistribution and use in source and binary forms -- with or without | |
13 | * modification -- are permitted for any purpose, provided that | |
14 | * redistributions in source form retain this entire copyright notice and | |
15 | * indicate the origin and nature of any modifications. | |
16 | * | |
17 | * I'd appreciate being given credit for this package in the documentation | |
18 | * of software which uses it, but that is not a requirement. | |
19 | * | |
20 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, | |
21 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY | |
22 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL | |
23 | * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
24 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
26 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |
27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | |
28 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |
29 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
30 | * | |
31 | * $Header$ | |
32 | * | |
33 | */ | |
34 | ||
35 | /* scanning macros (know about v) */ | |
36 | #define ATEOS() (v->now >= v->stop) | |
37 | #define HAVE(n) (v->stop - v->now >= (n)) | |
38 | #define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) | |
39 | #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) | |
40 | #define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \ | |
41 | *(v->now+1) == CHR(b) && \ | |
42 | *(v->now+2) == CHR(c)) | |
43 | #define SET(c) (v->nexttype = (c)) | |
44 | #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) | |
45 | #define RET(c) return (SET(c), 1) | |
46 | #define RETV(c, n) return (SETV(c, n), 1) | |
47 | #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ | |
48 | #define LASTTYPE(t) (v->lasttype == (t)) | |
49 | ||
50 | /* lexical contexts */ | |
51 | #define L_ERE 1 /* mainline ERE/ARE */ | |
52 | #define L_BRE 2 /* mainline BRE */ | |
53 | #define L_Q 3 /* REG_QUOTE */ | |
54 | #define L_EBND 4 /* ERE/ARE bound */ | |
55 | #define L_BBND 5 /* BRE bound */ | |
56 | #define L_BRACK 6 /* brackets */ | |
57 | #define L_CEL 7 /* collating element */ | |
58 | #define L_ECL 8 /* equivalence class */ | |
59 | #define L_CCL 9 /* character class */ | |
60 | #define INTOCON(c) (v->lexcon = (c)) | |
61 | #define INCON(con) (v->lexcon == (con)) | |
62 | ||
63 | /* construct pointer past end of chr array */ | |
64 | #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) | |
65 | ||
66 | /* | |
67 | * lexstart - set up lexical stuff, scan leading options | |
68 | */ | |
69 | static void | |
70 | lexstart(struct vars * v) | |
71 | { | |
72 | prefixes(v); /* may turn on new type bits etc. */ | |
73 | NOERR(); | |
74 | ||
75 | if (v->cflags & REG_QUOTE) | |
76 | { | |
77 | assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE))); | |
78 | INTOCON(L_Q); | |
79 | } | |
80 | else if (v->cflags & REG_EXTENDED) | |
81 | { | |
82 | assert(!(v->cflags & REG_QUOTE)); | |
83 | INTOCON(L_ERE); | |
84 | } | |
85 | else | |
86 | { | |
87 | assert(!(v->cflags & (REG_QUOTE | REG_ADVF))); | |
88 | INTOCON(L_BRE); | |
89 | } | |
90 | ||
91 | v->nexttype = EMPTY; /* remember we were at the start */ | |
92 | next(v); /* set up the first token */ | |
93 | } | |
94 | ||
95 | /* | |
96 | * prefixes - implement various special prefixes | |
97 | */ | |
98 | static void | |
99 | prefixes(struct vars * v) | |
100 | { | |
101 | /* literal string doesn't get any of this stuff */ | |
102 | if (v->cflags & REG_QUOTE) | |
103 | return; | |
104 | ||
105 | /* initial "***" gets special things */ | |
106 | if (HAVE(4) && NEXT3('*', '*', '*')) | |
107 | switch (*(v->now + 3)) | |
108 | { | |
109 | case CHR('?'): /* "***?" error, msg shows version */ | |
110 | ERR(REG_BADPAT); | |
111 | return; /* proceed no further */ | |
112 | break; | |
113 | case CHR('='): /* "***=" shifts to literal string */ | |
114 | NOTE(REG_UNONPOSIX); | |
115 | v->cflags |= REG_QUOTE; | |
116 | v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE); | |
117 | v->now += 4; | |
118 | return; /* and there can be no more prefixes */ | |
119 | break; | |
120 | case CHR(':'): /* "***:" shifts to AREs */ | |
121 | NOTE(REG_UNONPOSIX); | |
122 | v->cflags |= REG_ADVANCED; | |
123 | v->now += 4; | |
124 | break; | |
125 | default: /* otherwise *** is just an error */ | |
126 | ERR(REG_BADRPT); | |
127 | return; | |
128 | break; | |
129 | } | |
130 | ||
131 | /* BREs and EREs don't get embedded options */ | |
132 | if ((v->cflags & REG_ADVANCED) != REG_ADVANCED) | |
133 | return; | |
134 | ||
135 | /* embedded options (AREs only) */ | |
136 | if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) | |
137 | { | |
138 | NOTE(REG_UNONPOSIX); | |
139 | v->now += 2; | |
140 | for (; !ATEOS() && iscalpha(*v->now); v->now++) | |
141 | switch (*v->now) | |
142 | { | |
143 | case CHR('b'): /* BREs (but why???) */ | |
144 | v->cflags &= ~(REG_ADVANCED | REG_QUOTE); | |
145 | break; | |
146 | case CHR('c'): /* case sensitive */ | |
147 | v->cflags &= ~REG_ICASE; | |
148 | break; | |
149 | case CHR('e'): /* plain EREs */ | |
150 | v->cflags |= REG_EXTENDED; | |
151 | v->cflags &= ~(REG_ADVF | REG_QUOTE); | |
152 | break; | |
153 | case CHR('i'): /* case insensitive */ | |
154 | v->cflags |= REG_ICASE; | |
155 | break; | |
156 | case CHR('m'): /* Perloid synonym for n */ | |
157 | case CHR('n'): /* \n affects ^ $ . [^ */ | |
158 | v->cflags |= REG_NEWLINE; | |
159 | break; | |
160 | case CHR('p'): /* ~Perl, \n affects . [^ */ | |
161 | v->cflags |= REG_NLSTOP; | |
162 | v->cflags &= ~REG_NLANCH; | |
163 | break; | |
164 | case CHR('q'): /* literal string */ | |
165 | v->cflags |= REG_QUOTE; | |
166 | v->cflags &= ~REG_ADVANCED; | |
167 | break; | |
168 | case CHR('s'): /* single line, \n ordinary */ | |
169 | v->cflags &= ~REG_NEWLINE; | |
170 | break; | |
171 | case CHR('t'): /* tight syntax */ | |
172 | v->cflags &= ~REG_EXPANDED; | |
173 | break; | |
174 | case CHR('w'): /* weird, \n affects ^ $ only */ | |
175 | v->cflags &= ~REG_NLSTOP; | |
176 | v->cflags |= REG_NLANCH; | |
177 | break; | |
178 | case CHR('x'): /* expanded syntax */ | |
179 | v->cflags |= REG_EXPANDED; | |
180 | break; | |
181 | default: | |
182 | ERR(REG_BADOPT); | |
183 | return; | |
184 | } | |
185 | if (!NEXT1(')')) | |
186 | { | |
187 | ERR(REG_BADOPT); | |
188 | return; | |
189 | } | |
190 | v->now++; | |
191 | if (v->cflags & REG_QUOTE) | |
192 | v->cflags &= ~(REG_EXPANDED | REG_NEWLINE); | |
193 | } | |
194 | } | |
195 | ||
196 | /* | |
197 | * lexnest - "call a subroutine", interpolating string at the lexical level | |
198 | * | |
199 | * Note, this is not a very general facility. There are a number of | |
200 | * implicit assumptions about what sorts of strings can be subroutines. | |
201 | */ | |
202 | static void | |
203 | lexnest(struct vars * v, | |
204 | chr *beginp, /* start of interpolation */ | |
205 | chr *endp) /* one past end of interpolation */ | |
206 | { | |
207 | assert(v->savenow == NULL); /* only one level of nesting */ | |
208 | v->savenow = v->now; | |
209 | v->savestop = v->stop; | |
210 | v->now = beginp; | |
211 | v->stop = endp; | |
212 | } | |
213 | ||
214 | /* | |
215 | * string constants to interpolate as expansions of things like \d | |
216 | */ | |
217 | static chr backd[] = { /* \d */ | |
218 | CHR('['), CHR('['), CHR(':'), | |
219 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), | |
220 | CHR(':'), CHR(']'), CHR(']') | |
221 | }; | |
222 | static chr backD[] = { /* \D */ | |
223 | CHR('['), CHR('^'), CHR('['), CHR(':'), | |
224 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), | |
225 | CHR(':'), CHR(']'), CHR(']') | |
226 | }; | |
227 | static chr brbackd[] = { /* \d within brackets */ | |
228 | CHR('['), CHR(':'), | |
229 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), | |
230 | CHR(':'), CHR(']') | |
231 | }; | |
232 | static chr backs[] = { /* \s */ | |
233 | CHR('['), CHR('['), CHR(':'), | |
234 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), | |
235 | CHR(':'), CHR(']'), CHR(']') | |
236 | }; | |
237 | static chr backS[] = { /* \S */ | |
238 | CHR('['), CHR('^'), CHR('['), CHR(':'), | |
239 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), | |
240 | CHR(':'), CHR(']'), CHR(']') | |
241 | }; | |
242 | static chr brbacks[] = { /* \s within brackets */ | |
243 | CHR('['), CHR(':'), | |
244 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), | |
245 | CHR(':'), CHR(']') | |
246 | }; | |
247 | static chr backw[] = { /* \w */ | |
248 | CHR('['), CHR('['), CHR(':'), | |
249 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), | |
250 | CHR(':'), CHR(']'), CHR('_'), CHR(']') | |
251 | }; | |
252 | static chr backW[] = { /* \W */ | |
253 | CHR('['), CHR('^'), CHR('['), CHR(':'), | |
254 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), | |
255 | CHR(':'), CHR(']'), CHR('_'), CHR(']') | |
256 | }; | |
257 | static chr brbackw[] = { /* \w within brackets */ | |
258 | CHR('['), CHR(':'), | |
259 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), | |
260 | CHR(':'), CHR(']'), CHR('_') | |
261 | }; | |
262 | ||
263 | /* | |
264 | * lexword - interpolate a bracket expression for word characters | |
265 | * Possibly ought to inquire whether there is a "word" character class. | |
266 | */ | |
267 | static void | |
268 | lexword(struct vars * v) | |
269 | { | |
270 | lexnest(v, backw, ENDOF(backw)); | |
271 | } | |
272 | ||
273 | /* | |
274 | * next - get next token | |
275 | */ | |
276 | static int /* 1 normal, 0 failure */ | |
277 | next(struct vars * v) | |
278 | { | |
279 | chr c; | |
280 | ||
281 | /* errors yield an infinite sequence of failures */ | |
282 | if (ISERR()) | |
283 | return 0; /* the error has set nexttype to EOS */ | |
284 | ||
285 | /* remember flavor of last token */ | |
286 | v->lasttype = v->nexttype; | |
287 | ||
288 | /* REG_BOSONLY */ | |
289 | if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY)) | |
290 | { | |
291 | /* at start of a REG_BOSONLY RE */ | |
292 | RETV(SBEGIN, 0); /* same as \A */ | |
293 | } | |
294 | ||
295 | /* if we're nested and we've hit end, return to outer level */ | |
296 | if (v->savenow != NULL && ATEOS()) | |
297 | { | |
298 | v->now = v->savenow; | |
299 | v->stop = v->savestop; | |
300 | v->savenow = v->savestop = NULL; | |
301 | } | |
302 | ||
303 | /* skip white space etc. if appropriate (not in literal or []) */ | |
304 | if (v->cflags & REG_EXPANDED) | |
305 | switch (v->lexcon) | |
306 | { | |
307 | case L_ERE: | |
308 | case L_BRE: | |
309 | case L_EBND: | |
310 | case L_BBND: | |
311 | skip(v); | |
312 | break; | |
313 | } | |
314 | ||
315 | /* handle EOS, depending on context */ | |
316 | if (ATEOS()) | |
317 | { | |
318 | switch (v->lexcon) | |
319 | { | |
320 | case L_ERE: | |
321 | case L_BRE: | |
322 | case L_Q: | |
323 | RET(EOS); | |
324 | break; | |
325 | case L_EBND: | |
326 | case L_BBND: | |
327 | FAILW(REG_EBRACE); | |
328 | break; | |
329 | case L_BRACK: | |
330 | case L_CEL: | |
331 | case L_ECL: | |
332 | case L_CCL: | |
333 | FAILW(REG_EBRACK); | |
334 | break; | |
335 | } | |
336 | assert(NOTREACHED); | |
337 | } | |
338 | ||
339 | /* okay, time to actually get a character */ | |
340 | c = *v->now++; | |
341 | ||
342 | /* deal with the easy contexts, punt EREs to code below */ | |
343 | switch (v->lexcon) | |
344 | { | |
345 | case L_BRE: /* punt BREs to separate function */ | |
346 | return brenext(v, c); | |
347 | break; | |
348 | case L_ERE: /* see below */ | |
349 | break; | |
350 | case L_Q: /* literal strings are easy */ | |
351 | RETV(PLAIN, c); | |
352 | break; | |
353 | case L_BBND: /* bounds are fairly simple */ | |
354 | case L_EBND: | |
355 | switch (c) | |
356 | { | |
357 | case CHR('0'): | |
358 | case CHR('1'): | |
359 | case CHR('2'): | |
360 | case CHR('3'): | |
361 | case CHR('4'): | |
362 | case CHR('5'): | |
363 | case CHR('6'): | |
364 | case CHR('7'): | |
365 | case CHR('8'): | |
366 | case CHR('9'): | |
367 | RETV(DIGIT, (chr) DIGITVAL(c)); | |
368 | break; | |
369 | case CHR(','): | |
370 | RET(','); | |
371 | break; | |
372 | case CHR('}'): /* ERE bound ends with } */ | |
373 | if (INCON(L_EBND)) | |
374 | { | |
375 | INTOCON(L_ERE); | |
376 | if ((v->cflags & REG_ADVF) && NEXT1('?')) | |
377 | { | |
378 | v->now++; | |
379 | NOTE(REG_UNONPOSIX); | |
380 | RETV('}', 0); | |
381 | } | |
382 | RETV('}', 1); | |
383 | } | |
384 | else | |
385 | FAILW(REG_BADBR); | |
386 | break; | |
387 | case CHR('\\'): /* BRE bound ends with \} */ | |
388 | if (INCON(L_BBND) && NEXT1('}')) | |
389 | { | |
390 | v->now++; | |
391 | INTOCON(L_BRE); | |
392 | RET('}'); | |
393 | } | |
394 | else | |
395 | FAILW(REG_BADBR); | |
396 | break; | |
397 | default: | |
398 | FAILW(REG_BADBR); | |
399 | break; | |
400 | } | |
401 | assert(NOTREACHED); | |
402 | break; | |
403 | case L_BRACK: /* brackets are not too hard */ | |
404 | switch (c) | |
405 | { | |
406 | case CHR(']'): | |
407 | if (LASTTYPE('[')) | |
408 | RETV(PLAIN, c); | |
409 | else | |
410 | { | |
411 | INTOCON((v->cflags & REG_EXTENDED) ? | |
412 | L_ERE : L_BRE); | |
413 | RET(']'); | |
414 | } | |
415 | break; | |
416 | case CHR('\\'): | |
417 | NOTE(REG_UBBS); | |
418 | if (!(v->cflags & REG_ADVF)) | |
419 | RETV(PLAIN, c); | |
420 | NOTE(REG_UNONPOSIX); | |
421 | if (ATEOS()) | |
422 | FAILW(REG_EESCAPE); | |
423 | (DISCARD) lexescape(v); | |
424 | switch (v->nexttype) | |
425 | { /* not all escapes okay here */ | |
426 | case PLAIN: | |
427 | return 1; | |
428 | break; | |
429 | case CCLASS: | |
430 | switch (v->nextvalue) | |
431 | { | |
432 | case 'd': | |
433 | lexnest(v, brbackd, ENDOF(brbackd)); | |
434 | break; | |
435 | case 's': | |
436 | lexnest(v, brbacks, ENDOF(brbacks)); | |
437 | break; | |
438 | case 'w': | |
439 | lexnest(v, brbackw, ENDOF(brbackw)); | |
440 | break; | |
441 | default: | |
442 | FAILW(REG_EESCAPE); | |
443 | break; | |
444 | } | |
445 | /* lexnest done, back up and try again */ | |
446 | v->nexttype = v->lasttype; | |
447 | return next(v); | |
448 | break; | |
449 | } | |
450 | /* not one of the acceptable escapes */ | |
451 | FAILW(REG_EESCAPE); | |
452 | break; | |
453 | case CHR('-'): | |
454 | if (LASTTYPE('[') || NEXT1(']')) | |
455 | RETV(PLAIN, c); | |
456 | else | |
457 | RETV(RANGE, c); | |
458 | break; | |
459 | case CHR('['): | |
460 | if (ATEOS()) | |
461 | FAILW(REG_EBRACK); | |
462 | switch (*v->now++) | |
463 | { | |
464 | case CHR('.'): | |
465 | INTOCON(L_CEL); | |
466 | /* might or might not be locale-specific */ | |
467 | RET(COLLEL); | |
468 | break; | |
469 | case CHR('='): | |
470 | INTOCON(L_ECL); | |
471 | NOTE(REG_ULOCALE); | |
472 | RET(ECLASS); | |
473 | break; | |
474 | case CHR(':'): | |
475 | INTOCON(L_CCL); | |
476 | NOTE(REG_ULOCALE); | |
477 | RET(CCLASS); | |
478 | break; | |
479 | default: /* oops */ | |
480 | v->now--; | |
481 | RETV(PLAIN, c); | |
482 | break; | |
483 | } | |
484 | assert(NOTREACHED); | |
485 | break; | |
486 | default: | |
487 | RETV(PLAIN, c); | |
488 | break; | |
489 | } | |
490 | assert(NOTREACHED); | |
491 | break; | |
492 | case L_CEL: /* collating elements are easy */ | |
493 | if (c == CHR('.') && NEXT1(']')) | |
494 | { | |
495 | v->now++; | |
496 | INTOCON(L_BRACK); | |
497 | RETV(END, '.'); | |
498 | } | |
499 | else | |
500 | RETV(PLAIN, c); | |
501 | break; | |
502 | case L_ECL: /* ditto equivalence classes */ | |
503 | if (c == CHR('=') && NEXT1(']')) | |
504 | { | |
505 | v->now++; | |
506 | INTOCON(L_BRACK); | |
507 | RETV(END, '='); | |
508 | } | |
509 | else | |
510 | RETV(PLAIN, c); | |
511 | break; | |
512 | case L_CCL: /* ditto character classes */ | |
513 | if (c == CHR(':') && NEXT1(']')) | |
514 | { | |
515 | v->now++; | |
516 | INTOCON(L_BRACK); | |
517 | RETV(END, ':'); | |
518 | } | |
519 | else | |
520 | RETV(PLAIN, c); | |
521 | break; | |
522 | default: | |
523 | assert(NOTREACHED); | |
524 | break; | |
525 | } | |
526 | ||
527 | /* that got rid of everything except EREs and AREs */ | |
528 | assert(INCON(L_ERE)); | |
529 | ||
530 | /* deal with EREs and AREs, except for backslashes */ | |
531 | switch (c) | |
532 | { | |
533 | case CHR('|'): | |
534 | RET('|'); | |
535 | break; | |
536 | case CHR('*'): | |
537 | if ((v->cflags & REG_ADVF) && NEXT1('?')) | |
538 | { | |
539 | v->now++; | |
540 | NOTE(REG_UNONPOSIX); | |
541 | RETV('*', 0); | |
542 | } | |
543 | RETV('*', 1); | |
544 | break; | |
545 | case CHR('+'): | |
546 | if ((v->cflags & REG_ADVF) && NEXT1('?')) | |
547 | { | |
548 | v->now++; | |
549 | NOTE(REG_UNONPOSIX); | |
550 | RETV('+', 0); | |
551 | } | |
552 | RETV('+', 1); | |
553 | break; | |
554 | case CHR('?'): | |
555 | if ((v->cflags & REG_ADVF) && NEXT1('?')) | |
556 | { | |
557 | v->now++; | |
558 | NOTE(REG_UNONPOSIX); | |
559 | RETV('?', 0); | |
560 | } | |
561 | RETV('?', 1); | |
562 | break; | |
563 | case CHR('{'): /* bounds start or plain character */ | |
564 | if (v->cflags & REG_EXPANDED) | |
565 | skip(v); | |
566 | if (ATEOS() || !iscdigit(*v->now)) | |
567 | { | |
568 | NOTE(REG_UBRACES); | |
569 | NOTE(REG_UUNSPEC); | |
570 | RETV(PLAIN, c); | |
571 | } | |
572 | else | |
573 | { | |
574 | NOTE(REG_UBOUNDS); | |
575 | INTOCON(L_EBND); | |
576 | RET('{'); | |
577 | } | |
578 | assert(NOTREACHED); | |
579 | break; | |
580 | case CHR('('): /* parenthesis, or advanced extension */ | |
581 | if ((v->cflags & REG_ADVF) && NEXT1('?')) | |
582 | { | |
583 | NOTE(REG_UNONPOSIX); | |
584 | v->now++; | |
585 | switch (*v->now++) | |
586 | { | |
587 | case CHR(':'): /* non-capturing paren */ | |
588 | RETV('(', 0); | |
589 | break; | |
590 | case CHR('#'): /* comment */ | |
591 | while (!ATEOS() && *v->now != CHR(')')) | |
592 | v->now++; | |
593 | if (!ATEOS()) | |
594 | v->now++; | |
595 | assert(v->nexttype == v->lasttype); | |
596 | return next(v); | |
597 | break; | |
598 | case CHR('='): /* positive lookahead */ | |
599 | NOTE(REG_ULOOKAHEAD); | |
600 | RETV(LACON, 1); | |
601 | break; | |
602 | case CHR('!'): /* negative lookahead */ | |
603 | NOTE(REG_ULOOKAHEAD); | |
604 | RETV(LACON, 0); | |
605 | break; | |
606 | default: | |
607 | FAILW(REG_BADRPT); | |
608 | break; | |
609 | } | |
610 | assert(NOTREACHED); | |
611 | } | |
612 | if (v->cflags & REG_NOSUB) | |
613 | RETV('(', 0); /* all parens non-capturing */ | |
614 | else | |
615 | RETV('(', 1); | |
616 | break; | |
617 | case CHR(')'): | |
618 | if (LASTTYPE('(')) | |
619 | NOTE(REG_UUNSPEC); | |
620 | RETV(')', c); | |
621 | break; | |
622 | case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ | |
623 | if (HAVE(6) && *(v->now + 0) == CHR('[') && | |
624 | *(v->now + 1) == CHR(':') && | |
625 | (*(v->now + 2) == CHR('<') || | |
626 | *(v->now + 2) == CHR('>')) && | |
627 | *(v->now + 3) == CHR(':') && | |
628 | *(v->now + 4) == CHR(']') && | |
629 | *(v->now + 5) == CHR(']')) | |
630 | { | |
631 | c = *(v->now + 2); | |
632 | v->now += 6; | |
633 | NOTE(REG_UNONPOSIX); | |
634 | RET((c == CHR('<')) ? '<' : '>'); | |
635 | } | |
636 | INTOCON(L_BRACK); | |
637 | if (NEXT1('^')) | |
638 | { | |
639 | v->now++; | |
640 | RETV('[', 0); | |
641 | } | |
642 | RETV('[', 1); | |
643 | break; | |
644 | case CHR('.'): | |
645 | RET('.'); | |
646 | break; | |
647 | case CHR('^'): | |
648 | RET('^'); | |
649 | break; | |
650 | case CHR('$'): | |
651 | RET('$'); | |
652 | break; | |
653 | case CHR('\\'): /* mostly punt backslashes to code below */ | |
654 | if (ATEOS()) | |
655 | FAILW(REG_EESCAPE); | |
656 | break; | |
657 | default: /* ordinary character */ | |
658 | RETV(PLAIN, c); | |
659 | break; | |
660 | } | |
661 | ||
662 | /* ERE/ARE backslash handling; backslash already eaten */ | |
663 | assert(!ATEOS()); | |
664 | if (!(v->cflags & REG_ADVF)) | |
665 | { /* only AREs have non-trivial escapes */ | |
666 | if (iscalnum(*v->now)) | |
667 | { | |
668 | NOTE(REG_UBSALNUM); | |
669 | NOTE(REG_UUNSPEC); | |
670 | } | |
671 | RETV(PLAIN, *v->now++); | |
672 | } | |
673 | (DISCARD) lexescape(v); | |
674 | if (ISERR()) | |
675 | FAILW(REG_EESCAPE); | |
676 | if (v->nexttype == CCLASS) | |
677 | { /* fudge at lexical level */ | |
678 | switch (v->nextvalue) | |
679 | { | |
680 | case 'd': | |
681 | lexnest(v, backd, ENDOF(backd)); | |
682 | break; | |
683 | case 'D': | |
684 | lexnest(v, backD, ENDOF(backD)); | |
685 | break; | |
686 | case 's': | |
687 | lexnest(v, backs, ENDOF(backs)); | |
688 | break; | |
689 | case 'S': | |
690 | lexnest(v, backS, ENDOF(backS)); | |
691 | break; | |
692 | case 'w': | |
693 | lexnest(v, backw, ENDOF(backw)); | |
694 | break; | |
695 | case 'W': | |
696 | lexnest(v, backW, ENDOF(backW)); | |
697 | break; | |
698 | default: | |
699 | assert(NOTREACHED); | |
700 | FAILW(REG_ASSERT); | |
701 | break; | |
702 | } | |
703 | /* lexnest done, back up and try again */ | |
704 | v->nexttype = v->lasttype; | |
705 | return next(v); | |
706 | } | |
707 | /* otherwise, lexescape has already done the work */ | |
708 | return !ISERR(); | |
709 | } | |
710 | ||
711 | /* | |
712 | * lexescape - parse an ARE backslash escape (backslash already eaten) | |
713 | * Note slightly nonstandard use of the CCLASS type code. | |
714 | */ | |
715 | static int /* not actually used, but convenient for | |
716 | * RETV */ | |
717 | lexescape(struct vars * v) | |
718 | { | |
719 | chr c; | |
720 | static chr alert[] = { | |
721 | CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') | |
722 | }; | |
723 | static chr esc[] = { | |
724 | CHR('E'), CHR('S'), CHR('C') | |
725 | }; | |
726 | chr *save; | |
727 | ||
728 | assert(v->cflags & REG_ADVF); | |
729 | ||
730 | assert(!ATEOS()); | |
731 | c = *v->now++; | |
732 | if (!iscalnum(c)) | |
733 | RETV(PLAIN, c); | |
734 | ||
735 | NOTE(REG_UNONPOSIX); | |
736 | switch (c) | |
737 | { | |
738 | case CHR('a'): | |
739 | RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); | |
740 | break; | |
741 | case CHR('A'): | |
742 | RETV(SBEGIN, 0); | |
743 | break; | |
744 | case CHR('b'): | |
745 | RETV(PLAIN, CHR('\b')); | |
746 | break; | |
747 | case CHR('B'): | |
748 | RETV(PLAIN, CHR('\\')); | |
749 | break; | |
750 | case CHR('c'): | |
751 | NOTE(REG_UUNPORT); | |
752 | if (ATEOS()) | |
753 | FAILW(REG_EESCAPE); | |
754 | RETV(PLAIN, (chr) (*v->now++ & 037)); | |
755 | break; | |
756 | case CHR('d'): | |
757 | NOTE(REG_ULOCALE); | |
758 | RETV(CCLASS, 'd'); | |
759 | break; | |
760 | case CHR('D'): | |
761 | NOTE(REG_ULOCALE); | |
762 | RETV(CCLASS, 'D'); | |
763 | break; | |
764 | case CHR('e'): | |
765 | NOTE(REG_UUNPORT); | |
766 | RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); | |
767 | break; | |
768 | case CHR('f'): | |
769 | RETV(PLAIN, CHR('\f')); | |
770 | break; | |
771 | case CHR('m'): | |
772 | RET('<'); | |
773 | break; | |
774 | case CHR('M'): | |
775 | RET('>'); | |
776 | break; | |
777 | case CHR('n'): | |
778 | RETV(PLAIN, CHR('\n')); | |
779 | break; | |
780 | case CHR('r'): | |
781 | RETV(PLAIN, CHR('\r')); | |
782 | break; | |
783 | case CHR('s'): | |
784 | NOTE(REG_ULOCALE); | |
785 | RETV(CCLASS, 's'); | |
786 | break; | |
787 | case CHR('S'): | |
788 | NOTE(REG_ULOCALE); | |
789 | RETV(CCLASS, 'S'); | |
790 | break; | |
791 | case CHR('t'): | |
792 | RETV(PLAIN, CHR('\t')); | |
793 | break; | |
794 | case CHR('u'): | |
795 | c = lexdigits(v, 16, 4, 4); | |
796 | if (ISERR()) | |
797 | FAILW(REG_EESCAPE); | |
798 | RETV(PLAIN, c); | |
799 | break; | |
800 | case CHR('U'): | |
801 | c = lexdigits(v, 16, 8, 8); | |
802 | if (ISERR()) | |
803 | FAILW(REG_EESCAPE); | |
804 | RETV(PLAIN, c); | |
805 | break; | |
806 | case CHR('v'): | |
807 | RETV(PLAIN, CHR('\v')); | |
808 | break; | |
809 | case CHR('w'): | |
810 | NOTE(REG_ULOCALE); | |
811 | RETV(CCLASS, 'w'); | |
812 | break; | |
813 | case CHR('W'): | |
814 | NOTE(REG_ULOCALE); | |
815 | RETV(CCLASS, 'W'); | |
816 | break; | |
817 | case CHR('x'): | |
818 | NOTE(REG_UUNPORT); | |
819 | c = lexdigits(v, 16, 1, 255); /* REs >255 long outside | |
820 | * spec */ | |
821 | if (ISERR()) | |
822 | FAILW(REG_EESCAPE); | |
823 | RETV(PLAIN, c); | |
824 | break; | |
825 | case CHR('y'): | |
826 | NOTE(REG_ULOCALE); | |
827 | RETV(WBDRY, 0); | |
828 | break; | |
829 | case CHR('Y'): | |
830 | NOTE(REG_ULOCALE); | |
831 | RETV(NWBDRY, 0); | |
832 | break; | |
833 | case CHR('Z'): | |
834 | RETV(SEND, 0); | |
835 | break; | |
836 | case CHR('1'): | |
837 | case CHR('2'): | |
838 | case CHR('3'): | |
839 | case CHR('4'): | |
840 | case CHR('5'): | |
841 | case CHR('6'): | |
842 | case CHR('7'): | |
843 | case CHR('8'): | |
844 | case CHR('9'): | |
845 | save = v->now; | |
846 | v->now--; /* put first digit back */ | |
847 | c = lexdigits(v, 10, 1, 255); /* REs >255 long outside | |
848 | * spec */ | |
849 | if (ISERR()) | |
850 | FAILW(REG_EESCAPE); | |
851 | /* ugly heuristic (first test is "exactly 1 digit?") */ | |
852 | if (v->now - save == 0 || (int) c <= v->nsubexp) | |
853 | { | |
854 | NOTE(REG_UBACKREF); | |
855 | RETV(BACKREF, (chr) c); | |
856 | } | |
857 | /* oops, doesn't look like it's a backref after all... */ | |
858 | v->now = save; | |
859 | /* and fall through into octal number */ | |
860 | case CHR('0'): | |
861 | NOTE(REG_UUNPORT); | |
862 | v->now--; /* put first digit back */ | |
863 | c = lexdigits(v, 8, 1, 3); | |
864 | if (ISERR()) | |
865 | FAILW(REG_EESCAPE); | |
866 | RETV(PLAIN, c); | |
867 | break; | |
868 | default: | |
869 | assert(iscalpha(c)); | |
870 | FAILW(REG_EESCAPE); /* unknown alphabetic escape */ | |
871 | break; | |
872 | } | |
873 | assert(NOTREACHED); | |
874 | } | |
875 | ||
876 | /* | |
877 | * lexdigits - slurp up digits and return chr value | |
878 | */ | |
879 | static chr /* chr value; errors signalled via ERR */ | |
880 | lexdigits(struct vars * v, | |
881 | int base, | |
882 | int minlen, | |
883 | int maxlen) | |
884 | { | |
885 | uchr n; /* unsigned to avoid overflow misbehavior */ | |
886 | int len; | |
887 | chr c; | |
888 | int d; | |
889 | const uchr ub = (uchr) base; | |
890 | ||
891 | n = 0; | |
892 | for (len = 0; len < maxlen && !ATEOS(); len++) | |
893 | { | |
894 | c = *v->now++; | |
895 | switch (c) | |
896 | { | |
897 | case CHR('0'): | |
898 | case CHR('1'): | |
899 | case CHR('2'): | |
900 | case CHR('3'): | |
901 | case CHR('4'): | |
902 | case CHR('5'): | |
903 | case CHR('6'): | |
904 | case CHR('7'): | |
905 | case CHR('8'): | |
906 | case CHR('9'): | |
907 | d = DIGITVAL(c); | |
908 | break; | |
909 | case CHR('a'): | |
910 | case CHR('A'): | |
911 | d = 10; | |
912 | break; | |
913 | case CHR('b'): | |
914 | case CHR('B'): | |
915 | d = 11; | |
916 | break; | |
917 | case CHR('c'): | |
918 | case CHR('C'): | |
919 | d = 12; | |
920 | break; | |
921 | case CHR('d'): | |
922 | case CHR('D'): | |
923 | d = 13; | |
924 | break; | |
925 | case CHR('e'): | |
926 | case CHR('E'): | |
927 | d = 14; | |
928 | break; | |
929 | case CHR('f'): | |
930 | case CHR('F'): | |
931 | d = 15; | |
932 | break; | |
933 | default: | |
934 | v->now--; /* oops, not a digit at all */ | |
935 | d = -1; | |
936 | break; | |
937 | } | |
938 | ||
939 | if (d >= base) | |
940 | { /* not a plausible digit */ | |
941 | v->now--; | |
942 | d = -1; | |
943 | } | |
944 | if (d < 0) | |
945 | break; /* NOTE BREAK OUT */ | |
946 | n = n * ub + (uchr) d; | |
947 | } | |
948 | if (len < minlen) | |
949 | ERR(REG_EESCAPE); | |
950 | ||
951 | return (chr) n; | |
952 | } | |
953 | ||
954 | /* | |
955 | * brenext - get next BRE token | |
956 | * | |
957 | * This is much like EREs except for all the stupid backslashes and the | |
958 | * context-dependency of some things. | |
959 | */ | |
960 | static int /* 1 normal, 0 failure */ | |
961 | brenext(struct vars * v, | |
962 | chr pc) | |
963 | { | |
964 | chr c = (chr) pc; | |
965 | ||
966 | switch (c) | |
967 | { | |
968 | case CHR('*'): | |
969 | if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) | |
970 | RETV(PLAIN, c); | |
971 | RET('*'); | |
972 | break; | |
973 | case CHR('['): | |
974 | if (HAVE(6) && *(v->now + 0) == CHR('[') && | |
975 | *(v->now + 1) == CHR(':') && | |
976 | (*(v->now + 2) == CHR('<') || | |
977 | *(v->now + 2) == CHR('>')) && | |
978 | *(v->now + 3) == CHR(':') && | |
979 | *(v->now + 4) == CHR(']') && | |
980 | *(v->now + 5) == CHR(']')) | |
981 | { | |
982 | c = *(v->now + 2); | |
983 | v->now += 6; | |
984 | NOTE(REG_UNONPOSIX); | |
985 | RET((c == CHR('<')) ? '<' : '>'); | |
986 | } | |
987 | INTOCON(L_BRACK); | |
988 | if (NEXT1('^')) | |
989 | { | |
990 | v->now++; | |
991 | RETV('[', 0); | |
992 | } | |
993 | RETV('[', 1); | |
994 | break; | |
995 | case CHR('.'): | |
996 | RET('.'); | |
997 | break; | |
998 | case CHR('^'): | |
999 | if (LASTTYPE(EMPTY)) | |
1000 | RET('^'); | |
1001 | if (LASTTYPE('(')) | |
1002 | { | |
1003 | NOTE(REG_UUNSPEC); | |
1004 | RET('^'); | |
1005 | } | |
1006 | RETV(PLAIN, c); | |
1007 | break; | |
1008 | case CHR('$'): | |
1009 | if (v->cflags & REG_EXPANDED) | |
1010 | skip(v); | |
1011 | if (ATEOS()) | |
1012 | RET('$'); | |
1013 | if (NEXT2('\\', ')')) | |
1014 | { | |
1015 | NOTE(REG_UUNSPEC); | |
1016 | RET('$'); | |
1017 | } | |
1018 | RETV(PLAIN, c); | |
1019 | break; | |
1020 | case CHR('\\'): | |
1021 | break; /* see below */ | |
1022 | default: | |
1023 | RETV(PLAIN, c); | |
1024 | break; | |
1025 | } | |
1026 | ||
1027 | assert(c == CHR('\\')); | |
1028 | ||
1029 | if (ATEOS()) | |
1030 | FAILW(REG_EESCAPE); | |
1031 | ||
1032 | c = *v->now++; | |
1033 | switch (c) | |
1034 | { | |
1035 | case CHR('{'): | |
1036 | INTOCON(L_BBND); | |
1037 | NOTE(REG_UBOUNDS); | |
1038 | RET('{'); | |
1039 | break; | |
1040 | case CHR('('): | |
1041 | RETV('(', 1); | |
1042 | break; | |
1043 | case CHR(')'): | |
1044 | RETV(')', c); | |
1045 | break; | |
1046 | case CHR('<'): | |
1047 | NOTE(REG_UNONPOSIX); | |
1048 | RET('<'); | |
1049 | break; | |
1050 | case CHR('>'): | |
1051 | NOTE(REG_UNONPOSIX); | |
1052 | RET('>'); | |
1053 | break; | |
1054 | case CHR('1'): | |
1055 | case CHR('2'): | |
1056 | case CHR('3'): | |
1057 | case CHR('4'): | |
1058 | case CHR('5'): | |
1059 | case CHR('6'): | |
1060 | case CHR('7'): | |
1061 | case CHR('8'): | |
1062 | case CHR('9'): | |
1063 | NOTE(REG_UBACKREF); | |
1064 | RETV(BACKREF, (chr) DIGITVAL(c)); | |
1065 | break; | |
1066 | default: | |
1067 | if (iscalnum(c)) | |
1068 | { | |
1069 | NOTE(REG_UBSALNUM); | |
1070 | NOTE(REG_UUNSPEC); | |
1071 | } | |
1072 | RETV(PLAIN, c); | |
1073 | break; | |
1074 | } | |
1075 | ||
1076 | assert(NOTREACHED); | |
1077 | } | |
1078 | ||
1079 | /* | |
1080 | * skip - skip white space and comments in expanded form | |
1081 | */ | |
1082 | static void | |
1083 | skip(struct vars * v) | |
1084 | { | |
1085 | chr *start = v->now; | |
1086 | ||
1087 | assert(v->cflags & REG_EXPANDED); | |
1088 | ||
1089 | for (;;) | |
1090 | { | |
1091 | while (!ATEOS() && iscspace(*v->now)) | |
1092 | v->now++; | |
1093 | if (ATEOS() || *v->now != CHR('#')) | |
1094 | break; /* NOTE BREAK OUT */ | |
1095 | assert(NEXT1('#')); | |
1096 | while (!ATEOS() && *v->now != CHR('\n')) | |
1097 | v->now++; | |
1098 | /* leave the newline to be picked up by the iscspace loop */ | |
1099 | } | |
1100 | ||
1101 | if (v->now != start) | |
1102 | NOTE(REG_UNONPOSIX); | |
1103 | } | |
1104 | ||
1105 | /* | |
1106 | * newline - return the chr for a newline | |
1107 | * | |
1108 | * This helps confine use of CHR to this source file. | |
1109 | */ | |
1110 | static chr | |
1111 | newline(void) | |
1112 | { | |
1113 | return CHR('\n'); | |
1114 | } | |
1115 | ||
1116 | /* | |
1117 | * chrnamed - return the chr known by a given (chr string) name | |
1118 | * | |
1119 | * The code is a bit clumsy, but this routine gets only such specialized | |
1120 | * use that it hardly matters. | |
1121 | */ | |
1122 | static chr | |
1123 | chrnamed(struct vars * v, | |
1124 | chr *startp, /* start of name */ | |
1125 | chr *endp, /* just past end of name */ | |
1126 | chr lastresort) /* what to return if name lookup fails */ | |
1127 | { | |
1128 | celt c; | |
1129 | int errsave; | |
1130 | int e; | |
1131 | struct cvec *cv; | |
1132 | ||
1133 | errsave = v->err; | |
1134 | v->err = 0; | |
1135 | c = element(v, startp, endp); | |
1136 | e = v->err; | |
1137 | v->err = errsave; | |
1138 | ||
1139 | if (e != 0) | |
1140 | return (chr) lastresort; | |
1141 | ||
1142 | cv = range(v, c, c, 0); | |
1143 | if (cv->nchrs == 0) | |
1144 | return (chr) lastresort; | |
1145 | return cv->chrs[0]; | |
1146 | } |