]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * lexical analyzer | |
3 | * This file is #included by regcomp.c. | |
4 | * | |
5 | * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. | |
6 | * | |
7 | * Development of this software was funded, in part, by Cray Research Inc., | |
8 | * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics | |
9 | * Corporation, none of whom are responsible for the results. The author | |
10 | * thanks all of them. | |
11 | * | |
12 | * Redistribution and use in source and binary forms -- with or without | |
13 | * modification -- are permitted for any purpose, provided that | |
14 | * redistributions in source form retain this entire copyright notice and | |
15 | * indicate the origin and nature of any modifications. | |
16 | * | |
17 | * I'd appreciate being given credit for this package in the documentation | |
18 | * of software which uses it, but that is not a requirement. | |
19 | * | |
20 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, | |
21 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY | |
22 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL | |
23 | * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
24 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
25 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |
26 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |
27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | |
28 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |
29 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
30 | * | |
31 | */ | |
32 | ||
33 | /* scanning macros (know about v) */ | |
34 | #define ATEOS() (v->now >= v->stop) | |
35 | #define HAVE(n) (v->stop - v->now >= (n)) | |
36 | #define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) | |
37 | #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) | |
38 | #define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \ | |
39 | *(v->now+1) == CHR(b) && \ | |
40 | *(v->now+2) == CHR(c)) | |
41 | #define SET(c) (v->nexttype = (c)) | |
42 | #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) | |
43 | #define RET(c) return (SET(c), 1) | |
44 | #define RETV(c, n) return (SETV(c, n), 1) | |
45 | #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ | |
46 | #define LASTTYPE(t) (v->lasttype == (t)) | |
47 | ||
48 | /* lexical contexts */ | |
49 | #define L_ERE 1 /* mainline ERE/ARE */ | |
50 | #define L_BRE 2 /* mainline BRE */ | |
51 | #define L_Q 3 /* REG_QUOTE */ | |
52 | #define L_EBND 4 /* ERE/ARE bound */ | |
53 | #define L_BBND 5 /* BRE bound */ | |
54 | #define L_BRACK 6 /* brackets */ | |
55 | #define L_CEL 7 /* collating element */ | |
56 | #define L_ECL 8 /* equivalence class */ | |
57 | #define L_CCL 9 /* character class */ | |
58 | #define INTOCON(c) (v->lexcon = (c)) | |
59 | #define INCON(con) (v->lexcon == (con)) | |
60 | ||
61 | /* construct pointer past end of chr array */ | |
62 | #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) | |
63 | ||
64 | /* | |
65 | - lexstart - set up lexical stuff, scan leading options | |
66 | ^ static VOID lexstart(struct vars *); | |
67 | */ | |
68 | static VOID | |
69 | lexstart(v) | |
70 | struct vars *v; | |
71 | { | |
72 | prefixes(v); /* may turn on new type bits etc. */ | |
73 | NOERR(); | |
74 | ||
75 | if (v->cflags®_QUOTE) { | |
76 | assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))); | |
77 | INTOCON(L_Q); | |
78 | } else if (v->cflags®_EXTENDED) { | |
79 | assert(!(v->cflags®_QUOTE)); | |
80 | INTOCON(L_ERE); | |
81 | } else { | |
82 | assert(!(v->cflags&(REG_QUOTE|REG_ADVF))); | |
83 | INTOCON(L_BRE); | |
84 | } | |
85 | ||
86 | v->nexttype = EMPTY; /* remember we were at the start */ | |
87 | next(v); /* set up the first token */ | |
88 | } | |
89 | ||
90 | /* | |
91 | - prefixes - implement various special prefixes | |
92 | ^ static VOID prefixes(struct vars *); | |
93 | */ | |
94 | static VOID | |
95 | prefixes(v) | |
96 | struct vars *v; | |
97 | { | |
98 | /* literal string doesn't get any of this stuff */ | |
99 | if (v->cflags®_QUOTE) | |
100 | return; | |
101 | ||
102 | /* initial "***" gets special things */ | |
103 | if (HAVE(4) && NEXT3('*', '*', '*')) | |
104 | switch (*(v->now + 3)) { | |
105 | case CHR('?'): /* "***?" error, msg shows version */ | |
106 | ERR(REG_BADPAT); | |
107 | return; /* proceed no further */ | |
108 | break; | |
109 | case CHR('='): /* "***=" shifts to literal string */ | |
110 | NOTE(REG_UNONPOSIX); | |
111 | v->cflags |= REG_QUOTE; | |
112 | v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE); | |
113 | v->now += 4; | |
114 | return; /* and there can be no more prefixes */ | |
115 | break; | |
116 | case CHR(':'): /* "***:" shifts to AREs */ | |
117 | NOTE(REG_UNONPOSIX); | |
118 | v->cflags |= REG_ADVANCED; | |
119 | v->now += 4; | |
120 | break; | |
121 | default: /* otherwise *** is just an error */ | |
122 | ERR(REG_BADRPT); | |
123 | return; | |
124 | break; | |
125 | } | |
126 | ||
127 | /* BREs and EREs don't get embedded options */ | |
128 | if ((v->cflags®_ADVANCED) != REG_ADVANCED) | |
129 | return; | |
130 | ||
131 | /* embedded options (AREs only) */ | |
132 | if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) { | |
133 | NOTE(REG_UNONPOSIX); | |
134 | v->now += 2; | |
135 | for (; !ATEOS() && iscalpha(*v->now); v->now++) | |
136 | switch (*v->now) { | |
137 | case CHR('b'): /* BREs (but why???) */ | |
138 | v->cflags &= ~(REG_ADVANCED|REG_QUOTE); | |
139 | break; | |
140 | case CHR('c'): /* case sensitive */ | |
141 | v->cflags &= ~REG_ICASE; | |
142 | break; | |
143 | case CHR('e'): /* plain EREs */ | |
144 | v->cflags |= REG_EXTENDED; | |
145 | v->cflags &= ~(REG_ADVF|REG_QUOTE); | |
146 | break; | |
147 | case CHR('i'): /* case insensitive */ | |
148 | v->cflags |= REG_ICASE; | |
149 | break; | |
150 | case CHR('m'): /* Perloid synonym for n */ | |
151 | case CHR('n'): /* \n affects ^ $ . [^ */ | |
152 | v->cflags |= REG_NEWLINE; | |
153 | break; | |
154 | case CHR('p'): /* ~Perl, \n affects . [^ */ | |
155 | v->cflags |= REG_NLSTOP; | |
156 | v->cflags &= ~REG_NLANCH; | |
157 | break; | |
158 | case CHR('q'): /* literal string */ | |
159 | v->cflags |= REG_QUOTE; | |
160 | v->cflags &= ~REG_ADVANCED; | |
161 | break; | |
162 | case CHR('s'): /* single line, \n ordinary */ | |
163 | v->cflags &= ~REG_NEWLINE; | |
164 | break; | |
165 | case CHR('t'): /* tight syntax */ | |
166 | v->cflags &= ~REG_EXPANDED; | |
167 | break; | |
168 | case CHR('w'): /* weird, \n affects ^ $ only */ | |
169 | v->cflags &= ~REG_NLSTOP; | |
170 | v->cflags |= REG_NLANCH; | |
171 | break; | |
172 | case CHR('x'): /* expanded syntax */ | |
173 | v->cflags |= REG_EXPANDED; | |
174 | break; | |
175 | default: | |
176 | ERR(REG_BADOPT); | |
177 | return; | |
178 | } | |
179 | if (!NEXT1(')')) { | |
180 | ERR(REG_BADOPT); | |
181 | return; | |
182 | } | |
183 | v->now++; | |
184 | if (v->cflags®_QUOTE) | |
185 | v->cflags &= ~(REG_EXPANDED|REG_NEWLINE); | |
186 | } | |
187 | } | |
188 | ||
189 | /* | |
190 | - lexnest - "call a subroutine", interpolating string at the lexical level | |
191 | * Note, this is not a very general facility. There are a number of | |
192 | * implicit assumptions about what sorts of strings can be subroutines. | |
193 | ^ static VOID lexnest(struct vars *, chr *, chr *); | |
194 | */ | |
195 | static VOID | |
196 | lexnest(v, beginp, endp) | |
197 | struct vars *v; | |
198 | chr *beginp; /* start of interpolation */ | |
199 | chr *endp; /* one past end of interpolation */ | |
200 | { | |
201 | assert(v->savenow == NULL); /* only one level of nesting */ | |
202 | v->savenow = v->now; | |
203 | v->savestop = v->stop; | |
204 | v->now = beginp; | |
205 | v->stop = endp; | |
206 | } | |
207 | ||
208 | /* | |
209 | * string constants to interpolate as expansions of things like \d | |
210 | */ | |
211 | static chr backd[] = { /* \d */ | |
212 | CHR('['), CHR('['), CHR(':'), | |
213 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), | |
214 | CHR(':'), CHR(']'), CHR(']') | |
215 | }; | |
216 | static chr backD[] = { /* \D */ | |
217 | CHR('['), CHR('^'), CHR('['), CHR(':'), | |
218 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), | |
219 | CHR(':'), CHR(']'), CHR(']') | |
220 | }; | |
221 | static chr brbackd[] = { /* \d within brackets */ | |
222 | CHR('['), CHR(':'), | |
223 | CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), | |
224 | CHR(':'), CHR(']') | |
225 | }; | |
226 | static chr backs[] = { /* \s */ | |
227 | CHR('['), CHR('['), CHR(':'), | |
228 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), | |
229 | CHR(':'), CHR(']'), CHR(']') | |
230 | }; | |
231 | static chr backS[] = { /* \S */ | |
232 | CHR('['), CHR('^'), CHR('['), CHR(':'), | |
233 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), | |
234 | CHR(':'), CHR(']'), CHR(']') | |
235 | }; | |
236 | static chr brbacks[] = { /* \s within brackets */ | |
237 | CHR('['), CHR(':'), | |
238 | CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), | |
239 | CHR(':'), CHR(']') | |
240 | }; | |
241 | static chr backw[] = { /* \w */ | |
242 | CHR('['), CHR('['), CHR(':'), | |
243 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), | |
244 | CHR(':'), CHR(']'), CHR('_'), CHR(']') | |
245 | }; | |
246 | static chr backW[] = { /* \W */ | |
247 | CHR('['), CHR('^'), CHR('['), CHR(':'), | |
248 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), | |
249 | CHR(':'), CHR(']'), CHR('_'), CHR(']') | |
250 | }; | |
251 | static chr brbackw[] = { /* \w within brackets */ | |
252 | CHR('['), CHR(':'), | |
253 | CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), | |
254 | CHR(':'), CHR(']'), CHR('_') | |
255 | }; | |
256 | ||
257 | /* | |
258 | - lexword - interpolate a bracket expression for word characters | |
259 | * Possibly ought to inquire whether there is a "word" character class. | |
260 | ^ static VOID lexword(struct vars *); | |
261 | */ | |
262 | static VOID | |
263 | lexword(v) | |
264 | struct vars *v; | |
265 | { | |
266 | lexnest(v, backw, ENDOF(backw)); | |
267 | } | |
268 | ||
269 | /* | |
270 | - next - get next token | |
271 | ^ static int next(struct vars *); | |
272 | */ | |
273 | static int /* 1 normal, 0 failure */ | |
274 | next(v) | |
275 | struct vars *v; | |
276 | { | |
277 | chr c; | |
278 | ||
279 | /* errors yield an infinite sequence of failures */ | |
280 | if (ISERR()) | |
281 | return 0; /* the error has set nexttype to EOS */ | |
282 | ||
283 | /* remember flavor of last token */ | |
284 | v->lasttype = v->nexttype; | |
285 | ||
286 | /* REG_BOSONLY */ | |
287 | if (v->nexttype == EMPTY && (v->cflags®_BOSONLY)) { | |
288 | /* at start of a REG_BOSONLY RE */ | |
289 | RETV(SBEGIN, 0); /* same as \A */ | |
290 | } | |
291 | ||
292 | /* if we're nested and we've hit end, return to outer level */ | |
293 | if (v->savenow != NULL && ATEOS()) { | |
294 | v->now = v->savenow; | |
295 | v->stop = v->savestop; | |
296 | v->savenow = v->savestop = NULL; | |
297 | } | |
298 | ||
299 | /* skip white space etc. if appropriate (not in literal or []) */ | |
300 | if (v->cflags®_EXPANDED) | |
301 | switch (v->lexcon) { | |
302 | case L_ERE: | |
303 | case L_BRE: | |
304 | case L_EBND: | |
305 | case L_BBND: | |
306 | skip(v); | |
307 | break; | |
308 | } | |
309 | ||
310 | /* handle EOS, depending on context */ | |
311 | if (ATEOS()) { | |
312 | switch (v->lexcon) { | |
313 | case L_ERE: | |
314 | case L_BRE: | |
315 | case L_Q: | |
316 | RET(EOS); | |
317 | break; | |
318 | case L_EBND: | |
319 | case L_BBND: | |
320 | FAILW(REG_EBRACE); | |
321 | break; | |
322 | case L_BRACK: | |
323 | case L_CEL: | |
324 | case L_ECL: | |
325 | case L_CCL: | |
326 | FAILW(REG_EBRACK); | |
327 | break; | |
328 | } | |
329 | assert(NOTREACHED); | |
330 | } | |
331 | ||
332 | /* okay, time to actually get a character */ | |
333 | c = *v->now++; | |
334 | ||
335 | /* deal with the easy contexts, punt EREs to code below */ | |
336 | switch (v->lexcon) { | |
337 | case L_BRE: /* punt BREs to separate function */ | |
338 | return brenext(v, c); | |
339 | break; | |
340 | case L_ERE: /* see below */ | |
341 | break; | |
342 | case L_Q: /* literal strings are easy */ | |
343 | RETV(PLAIN, c); | |
344 | break; | |
345 | case L_BBND: /* bounds are fairly simple */ | |
346 | case L_EBND: | |
347 | switch (c) { | |
348 | case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): | |
349 | case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): | |
350 | case CHR('8'): case CHR('9'): | |
351 | RETV(DIGIT, (chr)DIGITVAL(c)); | |
352 | break; | |
353 | case CHR(','): | |
354 | RET(','); | |
355 | break; | |
356 | case CHR('}'): /* ERE bound ends with } */ | |
357 | if (INCON(L_EBND)) { | |
358 | INTOCON(L_ERE); | |
359 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
360 | v->now++; | |
361 | NOTE(REG_UNONPOSIX); | |
362 | RETV('}', 0); | |
363 | } | |
364 | RETV('}', 1); | |
365 | } else | |
366 | FAILW(REG_BADBR); | |
367 | break; | |
368 | case CHR('\\'): /* BRE bound ends with \} */ | |
369 | if (INCON(L_BBND) && NEXT1('}')) { | |
370 | v->now++; | |
371 | INTOCON(L_BRE); | |
372 | RET('}'); | |
373 | } else | |
374 | FAILW(REG_BADBR); | |
375 | break; | |
376 | default: | |
377 | FAILW(REG_BADBR); | |
378 | break; | |
379 | } | |
380 | assert(NOTREACHED); | |
381 | break; | |
382 | case L_BRACK: /* brackets are not too hard */ | |
383 | switch (c) { | |
384 | case CHR(']'): | |
385 | if (LASTTYPE('[')) | |
386 | RETV(PLAIN, c); | |
387 | else { | |
388 | INTOCON((v->cflags®_EXTENDED) ? | |
389 | L_ERE : L_BRE); | |
390 | RET(']'); | |
391 | } | |
392 | break; | |
393 | case CHR('\\'): | |
394 | NOTE(REG_UBBS); | |
395 | if (!(v->cflags®_ADVF)) | |
396 | RETV(PLAIN, c); | |
397 | NOTE(REG_UNONPOSIX); | |
398 | if (ATEOS()) | |
399 | FAILW(REG_EESCAPE); | |
400 | (DISCARD)lexescape(v); | |
401 | switch (v->nexttype) { /* not all escapes okay here */ | |
402 | case PLAIN: | |
403 | return 1; | |
404 | break; | |
405 | case CCLASS: | |
406 | switch (v->nextvalue) { | |
407 | case 'd': | |
408 | lexnest(v, brbackd, ENDOF(brbackd)); | |
409 | break; | |
410 | case 's': | |
411 | lexnest(v, brbacks, ENDOF(brbacks)); | |
412 | break; | |
413 | case 'w': | |
414 | lexnest(v, brbackw, ENDOF(brbackw)); | |
415 | break; | |
416 | default: | |
417 | FAILW(REG_EESCAPE); | |
418 | break; | |
419 | } | |
420 | /* lexnest done, back up and try again */ | |
421 | v->nexttype = v->lasttype; | |
422 | return next(v); | |
423 | break; | |
424 | } | |
425 | /* not one of the acceptable escapes */ | |
426 | FAILW(REG_EESCAPE); | |
427 | break; | |
428 | case CHR('-'): | |
429 | if (LASTTYPE('[') || NEXT1(']')) | |
430 | RETV(PLAIN, c); | |
431 | else | |
432 | RETV(RANGE, c); | |
433 | break; | |
434 | case CHR('['): | |
435 | if (ATEOS()) | |
436 | FAILW(REG_EBRACK); | |
437 | switch (*v->now++) { | |
438 | case CHR('.'): | |
439 | INTOCON(L_CEL); | |
440 | /* might or might not be locale-specific */ | |
441 | RET(COLLEL); | |
442 | break; | |
443 | case CHR('='): | |
444 | INTOCON(L_ECL); | |
445 | NOTE(REG_ULOCALE); | |
446 | RET(ECLASS); | |
447 | break; | |
448 | case CHR(':'): | |
449 | INTOCON(L_CCL); | |
450 | NOTE(REG_ULOCALE); | |
451 | RET(CCLASS); | |
452 | break; | |
453 | default: /* oops */ | |
454 | v->now--; | |
455 | RETV(PLAIN, c); | |
456 | break; | |
457 | } | |
458 | assert(NOTREACHED); | |
459 | break; | |
460 | default: | |
461 | RETV(PLAIN, c); | |
462 | break; | |
463 | } | |
464 | assert(NOTREACHED); | |
465 | break; | |
466 | case L_CEL: /* collating elements are easy */ | |
467 | if (c == CHR('.') && NEXT1(']')) { | |
468 | v->now++; | |
469 | INTOCON(L_BRACK); | |
470 | RETV(END, '.'); | |
471 | } else | |
472 | RETV(PLAIN, c); | |
473 | break; | |
474 | case L_ECL: /* ditto equivalence classes */ | |
475 | if (c == CHR('=') && NEXT1(']')) { | |
476 | v->now++; | |
477 | INTOCON(L_BRACK); | |
478 | RETV(END, '='); | |
479 | } else | |
480 | RETV(PLAIN, c); | |
481 | break; | |
482 | case L_CCL: /* ditto character classes */ | |
483 | if (c == CHR(':') && NEXT1(']')) { | |
484 | v->now++; | |
485 | INTOCON(L_BRACK); | |
486 | RETV(END, ':'); | |
487 | } else | |
488 | RETV(PLAIN, c); | |
489 | break; | |
490 | default: | |
491 | assert(NOTREACHED); | |
492 | break; | |
493 | } | |
494 | ||
495 | /* that got rid of everything except EREs and AREs */ | |
496 | assert(INCON(L_ERE)); | |
497 | ||
498 | /* deal with EREs and AREs, except for backslashes */ | |
499 | switch (c) { | |
500 | case CHR('|'): | |
501 | RET('|'); | |
502 | break; | |
503 | case CHR('*'): | |
504 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
505 | v->now++; | |
506 | NOTE(REG_UNONPOSIX); | |
507 | RETV('*', 0); | |
508 | } | |
509 | RETV('*', 1); | |
510 | break; | |
511 | case CHR('+'): | |
512 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
513 | v->now++; | |
514 | NOTE(REG_UNONPOSIX); | |
515 | RETV('+', 0); | |
516 | } | |
517 | RETV('+', 1); | |
518 | break; | |
519 | case CHR('?'): | |
520 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
521 | v->now++; | |
522 | NOTE(REG_UNONPOSIX); | |
523 | RETV('?', 0); | |
524 | } | |
525 | RETV('?', 1); | |
526 | break; | |
527 | case CHR('{'): /* bounds start or plain character */ | |
528 | if (v->cflags®_EXPANDED) | |
529 | skip(v); | |
530 | if (ATEOS() || !iscdigit(*v->now)) { | |
531 | NOTE(REG_UBRACES); | |
532 | NOTE(REG_UUNSPEC); | |
533 | RETV(PLAIN, c); | |
534 | } else { | |
535 | NOTE(REG_UBOUNDS); | |
536 | INTOCON(L_EBND); | |
537 | RET('{'); | |
538 | } | |
539 | assert(NOTREACHED); | |
540 | break; | |
541 | case CHR('('): /* parenthesis, or advanced extension */ | |
542 | if ((v->cflags®_ADVF) && NEXT1('?')) { | |
543 | NOTE(REG_UNONPOSIX); | |
544 | v->now++; | |
545 | switch (*v->now++) { | |
546 | case CHR(':'): /* non-capturing paren */ | |
547 | RETV('(', 0); | |
548 | break; | |
549 | case CHR('#'): /* comment */ | |
550 | while (!ATEOS() && *v->now != CHR(')')) | |
551 | v->now++; | |
552 | if (!ATEOS()) | |
553 | v->now++; | |
554 | assert(v->nexttype == v->lasttype); | |
555 | return next(v); | |
556 | break; | |
557 | case CHR('='): /* positive lookahead */ | |
558 | NOTE(REG_ULOOKAHEAD); | |
559 | RETV(LACON, 1); | |
560 | break; | |
561 | case CHR('!'): /* negative lookahead */ | |
562 | NOTE(REG_ULOOKAHEAD); | |
563 | RETV(LACON, 0); | |
564 | break; | |
565 | default: | |
566 | FAILW(REG_BADRPT); | |
567 | break; | |
568 | } | |
569 | assert(NOTREACHED); | |
570 | } | |
571 | if (v->cflags®_NOSUB) | |
572 | RETV('(', 0); /* all parens non-capturing */ | |
573 | else | |
574 | RETV('(', 1); | |
575 | break; | |
576 | case CHR(')'): | |
577 | if (LASTTYPE('(')) { | |
578 | NOTE(REG_UUNSPEC); | |
579 | } | |
580 | RETV(')', c); | |
581 | break; | |
582 | case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ | |
583 | if (HAVE(6) && *(v->now+0) == CHR('[') && | |
584 | *(v->now+1) == CHR(':') && | |
585 | (*(v->now+2) == CHR('<') || | |
586 | *(v->now+2) == CHR('>')) && | |
587 | *(v->now+3) == CHR(':') && | |
588 | *(v->now+4) == CHR(']') && | |
589 | *(v->now+5) == CHR(']')) { | |
590 | c = *(v->now+2); | |
591 | v->now += 6; | |
592 | NOTE(REG_UNONPOSIX); | |
593 | RET((c == CHR('<')) ? '<' : '>'); | |
594 | } | |
595 | INTOCON(L_BRACK); | |
596 | if (NEXT1('^')) { | |
597 | v->now++; | |
598 | RETV('[', 0); | |
599 | } | |
600 | RETV('[', 1); | |
601 | break; | |
602 | case CHR('.'): | |
603 | RET('.'); | |
604 | break; | |
605 | case CHR('^'): | |
606 | RET('^'); | |
607 | break; | |
608 | case CHR('$'): | |
609 | RET('$'); | |
610 | break; | |
611 | case CHR('\\'): /* mostly punt backslashes to code below */ | |
612 | if (ATEOS()) | |
613 | FAILW(REG_EESCAPE); | |
614 | break; | |
615 | default: /* ordinary character */ | |
616 | RETV(PLAIN, c); | |
617 | break; | |
618 | } | |
619 | ||
620 | /* ERE/ARE backslash handling; backslash already eaten */ | |
621 | assert(!ATEOS()); | |
622 | if (!(v->cflags®_ADVF)) { /* only AREs have non-trivial escapes */ | |
623 | if (iscalnum(*v->now)) { | |
624 | NOTE(REG_UBSALNUM); | |
625 | NOTE(REG_UUNSPEC); | |
626 | } | |
627 | RETV(PLAIN, *v->now++); | |
628 | } | |
629 | (DISCARD)lexescape(v); | |
630 | if (ISERR()) | |
631 | FAILW(REG_EESCAPE); | |
632 | if (v->nexttype == CCLASS) { /* fudge at lexical level */ | |
633 | switch (v->nextvalue) { | |
634 | case 'd': lexnest(v, backd, ENDOF(backd)); break; | |
635 | case 'D': lexnest(v, backD, ENDOF(backD)); break; | |
636 | case 's': lexnest(v, backs, ENDOF(backs)); break; | |
637 | case 'S': lexnest(v, backS, ENDOF(backS)); break; | |
638 | case 'w': lexnest(v, backw, ENDOF(backw)); break; | |
639 | case 'W': lexnest(v, backW, ENDOF(backW)); break; | |
640 | default: | |
641 | assert(NOTREACHED); | |
642 | FAILW(REG_ASSERT); | |
643 | break; | |
644 | } | |
645 | /* lexnest done, back up and try again */ | |
646 | v->nexttype = v->lasttype; | |
647 | return next(v); | |
648 | } | |
649 | /* otherwise, lexescape has already done the work */ | |
650 | return !ISERR(); | |
651 | } | |
652 | ||
653 | /* | |
654 | - lexescape - parse an ARE backslash escape (backslash already eaten) | |
655 | * Note slightly nonstandard use of the CCLASS type code. | |
656 | ^ static int lexescape(struct vars *); | |
657 | */ | |
658 | static int /* not actually used, but convenient for RETV */ | |
659 | lexescape(v) | |
660 | struct vars *v; | |
661 | { | |
662 | chr c; | |
663 | static chr alert[] = { | |
664 | CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') | |
665 | }; | |
666 | static chr esc[] = { | |
667 | CHR('E'), CHR('S'), CHR('C') | |
668 | }; | |
669 | chr *save; | |
670 | ||
671 | assert(v->cflags®_ADVF); | |
672 | ||
673 | assert(!ATEOS()); | |
674 | c = *v->now++; | |
675 | if (!iscalnum(c)) | |
676 | RETV(PLAIN, c); | |
677 | ||
678 | NOTE(REG_UNONPOSIX); | |
679 | switch (c) { | |
680 | case CHR('a'): | |
681 | RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); | |
682 | break; | |
683 | case CHR('A'): | |
684 | RETV(SBEGIN, 0); | |
685 | break; | |
686 | case CHR('b'): | |
687 | RETV(PLAIN, CHR('\b')); | |
688 | break; | |
689 | case CHR('B'): | |
690 | RETV(PLAIN, CHR('\\')); | |
691 | break; | |
692 | case CHR('c'): | |
693 | NOTE(REG_UUNPORT); | |
694 | if (ATEOS()) | |
695 | FAILW(REG_EESCAPE); | |
696 | RETV(PLAIN, (chr)(*v->now++ & 037)); | |
697 | break; | |
698 | case CHR('d'): | |
699 | NOTE(REG_ULOCALE); | |
700 | RETV(CCLASS, 'd'); | |
701 | break; | |
702 | case CHR('D'): | |
703 | NOTE(REG_ULOCALE); | |
704 | RETV(CCLASS, 'D'); | |
705 | break; | |
706 | case CHR('e'): | |
707 | NOTE(REG_UUNPORT); | |
708 | RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); | |
709 | break; | |
710 | case CHR('f'): | |
711 | RETV(PLAIN, CHR('\f')); | |
712 | break; | |
713 | case CHR('m'): | |
714 | RET('<'); | |
715 | break; | |
716 | case CHR('M'): | |
717 | RET('>'); | |
718 | break; | |
719 | case CHR('n'): | |
720 | RETV(PLAIN, CHR('\n')); | |
721 | break; | |
722 | case CHR('r'): | |
723 | RETV(PLAIN, CHR('\r')); | |
724 | break; | |
725 | case CHR('s'): | |
726 | NOTE(REG_ULOCALE); | |
727 | RETV(CCLASS, 's'); | |
728 | break; | |
729 | case CHR('S'): | |
730 | NOTE(REG_ULOCALE); | |
731 | RETV(CCLASS, 'S'); | |
732 | break; | |
733 | case CHR('t'): | |
734 | RETV(PLAIN, CHR('\t')); | |
735 | break; | |
736 | case CHR('u'): | |
737 | c = lexdigits(v, 16, 4, 4); | |
738 | if (ISERR()) | |
739 | FAILW(REG_EESCAPE); | |
740 | RETV(PLAIN, c); | |
741 | break; | |
742 | case CHR('U'): | |
743 | c = lexdigits(v, 16, 8, 8); | |
744 | if (ISERR()) | |
745 | FAILW(REG_EESCAPE); | |
746 | RETV(PLAIN, c); | |
747 | break; | |
748 | case CHR('v'): | |
749 | RETV(PLAIN, CHR('\v')); | |
750 | break; | |
751 | case CHR('w'): | |
752 | NOTE(REG_ULOCALE); | |
753 | RETV(CCLASS, 'w'); | |
754 | break; | |
755 | case CHR('W'): | |
756 | NOTE(REG_ULOCALE); | |
757 | RETV(CCLASS, 'W'); | |
758 | break; | |
759 | case CHR('x'): | |
760 | NOTE(REG_UUNPORT); | |
761 | c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ | |
762 | if (ISERR()) | |
763 | FAILW(REG_EESCAPE); | |
764 | RETV(PLAIN, c); | |
765 | break; | |
766 | case CHR('y'): | |
767 | NOTE(REG_ULOCALE); | |
768 | RETV(WBDRY, 0); | |
769 | break; | |
770 | case CHR('Y'): | |
771 | NOTE(REG_ULOCALE); | |
772 | RETV(NWBDRY, 0); | |
773 | break; | |
774 | case CHR('Z'): | |
775 | RETV(SEND, 0); | |
776 | break; | |
777 | case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): | |
778 | case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): | |
779 | case CHR('9'): | |
780 | save = v->now; | |
781 | v->now--; /* put first digit back */ | |
782 | c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ | |
783 | if (ISERR()) | |
784 | FAILW(REG_EESCAPE); | |
785 | /* ugly heuristic (first test is "exactly 1 digit?") */ | |
786 | if (v->now - save == 0 || (int)c <= v->nsubexp) { | |
787 | NOTE(REG_UBACKREF); | |
788 | RETV(BACKREF, (chr)c); | |
789 | } | |
790 | /* oops, doesn't look like it's a backref after all... */ | |
791 | v->now = save; | |
792 | /* and fall through into octal number */ | |
793 | case CHR('0'): | |
794 | NOTE(REG_UUNPORT); | |
795 | v->now--; /* put first digit back */ | |
796 | c = lexdigits(v, 8, 1, 3); | |
797 | if (ISERR()) | |
798 | FAILW(REG_EESCAPE); | |
799 | RETV(PLAIN, c); | |
800 | break; | |
801 | default: | |
802 | assert(iscalpha(c)); | |
803 | FAILW(REG_EESCAPE); /* unknown alphabetic escape */ | |
804 | break; | |
805 | } | |
806 | assert(NOTREACHED); | |
807 | } | |
808 | ||
809 | /* | |
810 | - lexdigits - slurp up digits and return chr value | |
811 | ^ static chr lexdigits(struct vars *, int, int, int); | |
812 | */ | |
813 | static chr /* chr value; errors signalled via ERR */ | |
814 | lexdigits(v, base, minlen, maxlen) | |
815 | struct vars *v; | |
816 | int base; | |
817 | int minlen; | |
818 | int maxlen; | |
819 | { | |
820 | uchr n; /* unsigned to avoid overflow misbehavior */ | |
821 | int len; | |
822 | chr c; | |
823 | int d; | |
824 | CONST uchr ub = (uchr) base; | |
825 | ||
826 | n = 0; | |
827 | for (len = 0; len < maxlen && !ATEOS(); len++) { | |
828 | c = *v->now++; | |
829 | switch (c) { | |
830 | case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): | |
831 | case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): | |
832 | case CHR('8'): case CHR('9'): | |
833 | d = DIGITVAL(c); | |
834 | break; | |
835 | case CHR('a'): case CHR('A'): d = 10; break; | |
836 | case CHR('b'): case CHR('B'): d = 11; break; | |
837 | case CHR('c'): case CHR('C'): d = 12; break; | |
838 | case CHR('d'): case CHR('D'): d = 13; break; | |
839 | case CHR('e'): case CHR('E'): d = 14; break; | |
840 | case CHR('f'): case CHR('F'): d = 15; break; | |
841 | default: | |
842 | v->now--; /* oops, not a digit at all */ | |
843 | d = -1; | |
844 | break; | |
845 | } | |
846 | ||
847 | if (d >= base) { /* not a plausible digit */ | |
848 | v->now--; | |
849 | d = -1; | |
850 | } | |
851 | if (d < 0) | |
852 | break; /* NOTE BREAK OUT */ | |
853 | n = n*ub + (uchr)d; | |
854 | } | |
855 | if (len < minlen) | |
856 | ERR(REG_EESCAPE); | |
857 | ||
858 | return (chr)n; | |
859 | } | |
860 | ||
861 | /* | |
862 | - brenext - get next BRE token | |
863 | * This is much like EREs except for all the stupid backslashes and the | |
864 | * context-dependency of some things. | |
865 | ^ static int brenext(struct vars *, pchr); | |
866 | */ | |
867 | static int /* 1 normal, 0 failure */ | |
868 | brenext(v, pc) | |
869 | struct vars *v; | |
870 | pchr pc; | |
871 | { | |
872 | chr c = (chr)pc; | |
873 | ||
874 | switch (c) { | |
875 | case CHR('*'): | |
876 | if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) | |
877 | RETV(PLAIN, c); | |
878 | RET('*'); | |
879 | break; | |
880 | case CHR('['): | |
881 | if (HAVE(6) && *(v->now+0) == CHR('[') && | |
882 | *(v->now+1) == CHR(':') && | |
883 | (*(v->now+2) == CHR('<') || | |
884 | *(v->now+2) == CHR('>')) && | |
885 | *(v->now+3) == CHR(':') && | |
886 | *(v->now+4) == CHR(']') && | |
887 | *(v->now+5) == CHR(']')) { | |
888 | c = *(v->now+2); | |
889 | v->now += 6; | |
890 | NOTE(REG_UNONPOSIX); | |
891 | RET((c == CHR('<')) ? '<' : '>'); | |
892 | } | |
893 | INTOCON(L_BRACK); | |
894 | if (NEXT1('^')) { | |
895 | v->now++; | |
896 | RETV('[', 0); | |
897 | } | |
898 | RETV('[', 1); | |
899 | break; | |
900 | case CHR('.'): | |
901 | RET('.'); | |
902 | break; | |
903 | case CHR('^'): | |
904 | if (LASTTYPE(EMPTY)) | |
905 | RET('^'); | |
906 | if (LASTTYPE('(')) { | |
907 | NOTE(REG_UUNSPEC); | |
908 | RET('^'); | |
909 | } | |
910 | RETV(PLAIN, c); | |
911 | break; | |
912 | case CHR('$'): | |
913 | if (v->cflags®_EXPANDED) | |
914 | skip(v); | |
915 | if (ATEOS()) | |
916 | RET('$'); | |
917 | if (NEXT2('\\', ')')) { | |
918 | NOTE(REG_UUNSPEC); | |
919 | RET('$'); | |
920 | } | |
921 | RETV(PLAIN, c); | |
922 | break; | |
923 | case CHR('\\'): | |
924 | break; /* see below */ | |
925 | default: | |
926 | RETV(PLAIN, c); | |
927 | break; | |
928 | } | |
929 | ||
930 | assert(c == CHR('\\')); | |
931 | ||
932 | if (ATEOS()) | |
933 | FAILW(REG_EESCAPE); | |
934 | ||
935 | c = *v->now++; | |
936 | switch (c) { | |
937 | case CHR('{'): | |
938 | INTOCON(L_BBND); | |
939 | NOTE(REG_UBOUNDS); | |
940 | RET('{'); | |
941 | break; | |
942 | case CHR('('): | |
943 | RETV('(', 1); | |
944 | break; | |
945 | case CHR(')'): | |
946 | RETV(')', c); | |
947 | break; | |
948 | case CHR('<'): | |
949 | NOTE(REG_UNONPOSIX); | |
950 | RET('<'); | |
951 | break; | |
952 | case CHR('>'): | |
953 | NOTE(REG_UNONPOSIX); | |
954 | RET('>'); | |
955 | break; | |
956 | case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): | |
957 | case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): | |
958 | case CHR('9'): | |
959 | NOTE(REG_UBACKREF); | |
960 | RETV(BACKREF, (chr)DIGITVAL(c)); | |
961 | break; | |
962 | default: | |
963 | if (iscalnum(c)) { | |
964 | NOTE(REG_UBSALNUM); | |
965 | NOTE(REG_UUNSPEC); | |
966 | } | |
967 | RETV(PLAIN, c); | |
968 | break; | |
969 | } | |
970 | ||
971 | assert(NOTREACHED); | |
972 | } | |
973 | ||
974 | /* | |
975 | - skip - skip white space and comments in expanded form | |
976 | ^ static VOID skip(struct vars *); | |
977 | */ | |
978 | static VOID | |
979 | skip(v) | |
980 | struct vars *v; | |
981 | { | |
982 | chr *start = v->now; | |
983 | ||
984 | assert(v->cflags®_EXPANDED); | |
985 | ||
986 | for (;;) { | |
987 | while (!ATEOS() && iscspace(*v->now)) | |
988 | v->now++; | |
989 | if (ATEOS() || *v->now != CHR('#')) | |
990 | break; /* NOTE BREAK OUT */ | |
991 | assert(NEXT1('#')); | |
992 | while (!ATEOS() && *v->now != CHR('\n')) | |
993 | v->now++; | |
994 | /* leave the newline to be picked up by the iscspace loop */ | |
995 | } | |
996 | ||
997 | if (v->now != start) | |
998 | NOTE(REG_UNONPOSIX); | |
999 | } | |
1000 | ||
1001 | /* | |
1002 | - newline - return the chr for a newline | |
1003 | * This helps confine use of CHR to this source file. | |
1004 | ^ static chr newline(NOPARMS); | |
1005 | */ | |
1006 | static chr | |
1007 | newline() | |
1008 | { | |
1009 | return CHR('\n'); | |
1010 | } | |
1011 | ||
1012 | /* | |
1013 | - ch - return the chr sequence for regc_locale.c's fake collating element ch | |
1014 | * This helps confine use of CHR to this source file. Beware that the caller | |
1015 | * knows how long the sequence is. | |
1016 | ^ #ifdef REG_DEBUG | |
1017 | ^ static chr *ch(NOPARMS); | |
1018 | ^ #endif | |
1019 | */ | |
1020 | #ifdef REG_DEBUG | |
1021 | static chr * | |
1022 | ch() | |
1023 | { | |
1024 | static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') }; | |
1025 | ||
1026 | return chstr; | |
1027 | } | |
1028 | #endif | |
1029 | ||
1030 | /* | |
1031 | - chrnamed - return the chr known by a given (chr string) name | |
1032 | * The code is a bit clumsy, but this routine gets only such specialized | |
1033 | * use that it hardly matters. | |
1034 | ^ static chr chrnamed(struct vars *, chr *, chr *, pchr); | |
1035 | */ | |
1036 | static chr | |
1037 | chrnamed(v, startp, endp, lastresort) | |
1038 | struct vars *v; | |
1039 | chr *startp; /* start of name */ | |
1040 | chr *endp; /* just past end of name */ | |
1041 | pchr lastresort; /* what to return if name lookup fails */ | |
1042 | { | |
1043 | celt c; | |
1044 | int errsave; | |
1045 | int e; | |
1046 | struct cvec *cv; | |
1047 | ||
1048 | errsave = v->err; | |
1049 | v->err = 0; | |
1050 | c = element(v, startp, endp); | |
1051 | e = v->err; | |
1052 | v->err = errsave; | |
1053 | ||
1054 | if (e != 0) | |
1055 | return (chr)lastresort; | |
1056 | ||
1057 | cv = range(v, c, c, 0); | |
1058 | if (cv->nchrs == 0) | |
1059 | return (chr)lastresort; | |
1060 | return cv->chrs[0]; | |
1061 | } |