]> git.saurik.com Git - wxWidgets.git/blame - src/regex/regc_lex.c
fix for always terminating intermediate UniChar String for 4 bytes wchar_t
[wxWidgets.git] / src / regex / regc_lex.c
CommitLineData
38d679fd
RN
1/*
2 * lexical analyzer
3 * This file is #included by regcomp.c.
4 *
3ca4086b
VS
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
38d679fd
RN
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
3ca4086b
VS
10 * thanks all of them.
11 *
38d679fd
RN
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
3ca4086b 16 *
38d679fd
RN
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
3ca4086b 19 *
38d679fd
RN
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
38d679fd
RN
31 */
32
/*
 * Scanning macros.  Every one of them expects a "struct vars *v" to be in
 * scope: v->now is the current read position, v->stop is one past the last
 * chr available at the current nesting level.
 */
#define	ATEOS()		(v->now >= v->stop)		/* input exhausted? */
#define	HAVE(n)		(v->stop - v->now >= (n))	/* >= n chrs left? */
#define	NEXT1(c)	(!ATEOS() && v->now[0] == CHR(c))
#define	NEXT2(a,b)	(HAVE(2) && v->now[0] == CHR(a) && v->now[1] == CHR(b))
#define	NEXT3(a,b,c)	(HAVE(3) && v->now[0] == CHR(a) && \
						v->now[1] == CHR(b) && \
						v->now[2] == CHR(c))
/* record the token being returned (type only, or type plus value) */
#define	SET(c)		(v->nexttype = (c))
#define	SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))
/* record the token and return success (1) from the calling function */
#define	RET(c)		return (SET(c), 1)
#define	RETV(c, n)	return (SETV(c, n), 1)
/* report error e and return failure (0) from the calling function */
#define	FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
/* was the previously returned token of type t? */
#define	LASTTYPE(t)	(v->lasttype == (t))

/* lexical contexts: which sub-language the scanner is currently inside */
#define	L_ERE	1	/* mainline ERE/ARE */
#define	L_BRE	2	/* mainline BRE */
#define	L_Q	3	/* REG_QUOTE */
#define	L_EBND	4	/* ERE/ARE bound */
#define	L_BBND	5	/* BRE bound */
#define	L_BRACK	6	/* brackets */
#define	L_CEL	7	/* collating element */
#define	L_ECL	8	/* equivalence class */
#define	L_CCL	9	/* character class */
#define	INTOCON(c)	(v->lexcon = (c))	/* switch context */
#define	INCON(con)	(v->lexcon == (con))	/* test context */

/* construct pointer past end of chr array (true arrays only, not pointers) */
#define	ENDOF(array)	((array) + sizeof(array)/sizeof(chr))
38d679fd
RN
63
64/*
3ca4086b
VS
65 - lexstart - set up lexical stuff, scan leading options
66 ^ static VOID lexstart(struct vars *);
38d679fd 67 */
3ca4086b
VS
68static VOID
69lexstart(v)
70struct vars *v;
38d679fd 71{
3ca4086b 72 prefixes(v); /* may turn on new type bits etc. */
38d679fd
RN
73 NOERR();
74
3ca4086b
VS
75 if (v->cflags&REG_QUOTE) {
76 assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)));
38d679fd 77 INTOCON(L_Q);
3ca4086b
VS
78 } else if (v->cflags&REG_EXTENDED) {
79 assert(!(v->cflags&REG_QUOTE));
38d679fd 80 INTOCON(L_ERE);
3ca4086b
VS
81 } else {
82 assert(!(v->cflags&(REG_QUOTE|REG_ADVF)));
38d679fd
RN
83 INTOCON(L_BRE);
84 }
85
86 v->nexttype = EMPTY; /* remember we were at the start */
3ca4086b 87 next(v); /* set up the first token */
38d679fd
RN
88}
89
90/*
3ca4086b
VS
91 - prefixes - implement various special prefixes
92 ^ static VOID prefixes(struct vars *);
38d679fd 93 */
3ca4086b
VS
94static VOID
95prefixes(v)
96struct vars *v;
38d679fd
RN
97{
98 /* literal string doesn't get any of this stuff */
3ca4086b 99 if (v->cflags&REG_QUOTE)
38d679fd
RN
100 return;
101
3ca4086b 102 /* initial "***" gets special things */
38d679fd 103 if (HAVE(4) && NEXT3('*', '*', '*'))
3ca4086b
VS
104 switch (*(v->now + 3)) {
105 case CHR('?'): /* "***?" error, msg shows version */
106 ERR(REG_BADPAT);
107 return; /* proceed no further */
108 break;
109 case CHR('='): /* "***=" shifts to literal string */
110 NOTE(REG_UNONPOSIX);
111 v->cflags |= REG_QUOTE;
112 v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE);
113 v->now += 4;
114 return; /* and there can be no more prefixes */
115 break;
116 case CHR(':'): /* "***:" shifts to AREs */
117 NOTE(REG_UNONPOSIX);
118 v->cflags |= REG_ADVANCED;
119 v->now += 4;
120 break;
121 default: /* otherwise *** is just an error */
122 ERR(REG_BADRPT);
123 return;
124 break;
38d679fd
RN
125 }
126
127 /* BREs and EREs don't get embedded options */
3ca4086b 128 if ((v->cflags&REG_ADVANCED) != REG_ADVANCED)
38d679fd
RN
129 return;
130
131 /* embedded options (AREs only) */
3ca4086b 132 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) {
38d679fd
RN
133 NOTE(REG_UNONPOSIX);
134 v->now += 2;
135 for (; !ATEOS() && iscalpha(*v->now); v->now++)
3ca4086b
VS
136 switch (*v->now) {
137 case CHR('b'): /* BREs (but why???) */
138 v->cflags &= ~(REG_ADVANCED|REG_QUOTE);
139 break;
140 case CHR('c'): /* case sensitive */
141 v->cflags &= ~REG_ICASE;
142 break;
143 case CHR('e'): /* plain EREs */
144 v->cflags |= REG_EXTENDED;
145 v->cflags &= ~(REG_ADVF|REG_QUOTE);
146 break;
147 case CHR('i'): /* case insensitive */
148 v->cflags |= REG_ICASE;
149 break;
150 case CHR('m'): /* Perloid synonym for n */
151 case CHR('n'): /* \n affects ^ $ . [^ */
152 v->cflags |= REG_NEWLINE;
153 break;
154 case CHR('p'): /* ~Perl, \n affects . [^ */
155 v->cflags |= REG_NLSTOP;
156 v->cflags &= ~REG_NLANCH;
157 break;
158 case CHR('q'): /* literal string */
159 v->cflags |= REG_QUOTE;
160 v->cflags &= ~REG_ADVANCED;
161 break;
162 case CHR('s'): /* single line, \n ordinary */
163 v->cflags &= ~REG_NEWLINE;
164 break;
165 case CHR('t'): /* tight syntax */
166 v->cflags &= ~REG_EXPANDED;
167 break;
168 case CHR('w'): /* weird, \n affects ^ $ only */
169 v->cflags &= ~REG_NLSTOP;
170 v->cflags |= REG_NLANCH;
171 break;
172 case CHR('x'): /* expanded syntax */
173 v->cflags |= REG_EXPANDED;
174 break;
175 default:
176 ERR(REG_BADOPT);
177 return;
38d679fd 178 }
3ca4086b 179 if (!NEXT1(')')) {
38d679fd
RN
180 ERR(REG_BADOPT);
181 return;
182 }
183 v->now++;
3ca4086b
VS
184 if (v->cflags&REG_QUOTE)
185 v->cflags &= ~(REG_EXPANDED|REG_NEWLINE);
38d679fd
RN
186 }
187}
188
189/*
3ca4086b 190 - lexnest - "call a subroutine", interpolating string at the lexical level
38d679fd
RN
191 * Note, this is not a very general facility. There are a number of
192 * implicit assumptions about what sorts of strings can be subroutines.
3ca4086b 193 ^ static VOID lexnest(struct vars *, chr *, chr *);
38d679fd 194 */
3ca4086b
VS
195static VOID
196lexnest(v, beginp, endp)
197struct vars *v;
198chr *beginp; /* start of interpolation */
199chr *endp; /* one past end of interpolation */
38d679fd 200{
3ca4086b 201 assert(v->savenow == NULL); /* only one level of nesting */
38d679fd
RN
202 v->savenow = v->now;
203 v->savestop = v->stop;
204 v->now = beginp;
205 v->stop = endp;
206}
207
208/*
209 * string constants to interpolate as expansions of things like \d
210 */
3ca4086b 211static chr backd[] = { /* \d */
38d679fd
RN
212 CHR('['), CHR('['), CHR(':'),
213 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
214 CHR(':'), CHR(']'), CHR(']')
215};
3ca4086b 216static chr backD[] = { /* \D */
38d679fd
RN
217 CHR('['), CHR('^'), CHR('['), CHR(':'),
218 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
219 CHR(':'), CHR(']'), CHR(']')
220};
3ca4086b 221static chr brbackd[] = { /* \d within brackets */
38d679fd
RN
222 CHR('['), CHR(':'),
223 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
224 CHR(':'), CHR(']')
225};
3ca4086b 226static chr backs[] = { /* \s */
38d679fd
RN
227 CHR('['), CHR('['), CHR(':'),
228 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
229 CHR(':'), CHR(']'), CHR(']')
230};
3ca4086b 231static chr backS[] = { /* \S */
38d679fd
RN
232 CHR('['), CHR('^'), CHR('['), CHR(':'),
233 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
234 CHR(':'), CHR(']'), CHR(']')
235};
3ca4086b 236static chr brbacks[] = { /* \s within brackets */
38d679fd
RN
237 CHR('['), CHR(':'),
238 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
239 CHR(':'), CHR(']')
240};
3ca4086b 241static chr backw[] = { /* \w */
38d679fd
RN
242 CHR('['), CHR('['), CHR(':'),
243 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
244 CHR(':'), CHR(']'), CHR('_'), CHR(']')
245};
3ca4086b 246static chr backW[] = { /* \W */
38d679fd
RN
247 CHR('['), CHR('^'), CHR('['), CHR(':'),
248 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
249 CHR(':'), CHR(']'), CHR('_'), CHR(']')
250};
3ca4086b 251static chr brbackw[] = { /* \w within brackets */
38d679fd
RN
252 CHR('['), CHR(':'),
253 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
254 CHR(':'), CHR(']'), CHR('_')
255};
256
257/*
3ca4086b 258 - lexword - interpolate a bracket expression for word characters
38d679fd 259 * Possibly ought to inquire whether there is a "word" character class.
3ca4086b 260 ^ static VOID lexword(struct vars *);
38d679fd 261 */
3ca4086b
VS
262static VOID
263lexword(v)
264struct vars *v;
38d679fd
RN
265{
266 lexnest(v, backw, ENDOF(backw));
267}
268
269/*
3ca4086b
VS
270 - next - get next token
271 ^ static int next(struct vars *);
38d679fd 272 */
3ca4086b
VS
273static int /* 1 normal, 0 failure */
274next(v)
275struct vars *v;
38d679fd 276{
3ca4086b 277 chr c;
38d679fd
RN
278
279 /* errors yield an infinite sequence of failures */
280 if (ISERR())
3ca4086b 281 return 0; /* the error has set nexttype to EOS */
38d679fd
RN
282
283 /* remember flavor of last token */
284 v->lasttype = v->nexttype;
285
286 /* REG_BOSONLY */
3ca4086b 287 if (v->nexttype == EMPTY && (v->cflags&REG_BOSONLY)) {
38d679fd
RN
288 /* at start of a REG_BOSONLY RE */
289 RETV(SBEGIN, 0); /* same as \A */
290 }
291
292 /* if we're nested and we've hit end, return to outer level */
3ca4086b 293 if (v->savenow != NULL && ATEOS()) {
38d679fd
RN
294 v->now = v->savenow;
295 v->stop = v->savestop;
296 v->savenow = v->savestop = NULL;
297 }
298
299 /* skip white space etc. if appropriate (not in literal or []) */
3ca4086b
VS
300 if (v->cflags&REG_EXPANDED)
301 switch (v->lexcon) {
302 case L_ERE:
303 case L_BRE:
304 case L_EBND:
305 case L_BBND:
306 skip(v);
307 break;
38d679fd
RN
308 }
309
310 /* handle EOS, depending on context */
3ca4086b
VS
311 if (ATEOS()) {
312 switch (v->lexcon) {
313 case L_ERE:
314 case L_BRE:
315 case L_Q:
316 RET(EOS);
317 break;
318 case L_EBND:
319 case L_BBND:
320 FAILW(REG_EBRACE);
321 break;
322 case L_BRACK:
323 case L_CEL:
324 case L_ECL:
325 case L_CCL:
326 FAILW(REG_EBRACK);
327 break;
38d679fd
RN
328 }
329 assert(NOTREACHED);
330 }
331
332 /* okay, time to actually get a character */
333 c = *v->now++;
334
335 /* deal with the easy contexts, punt EREs to code below */
3ca4086b
VS
336 switch (v->lexcon) {
337 case L_BRE: /* punt BREs to separate function */
338 return brenext(v, c);
339 break;
340 case L_ERE: /* see below */
341 break;
342 case L_Q: /* literal strings are easy */
343 RETV(PLAIN, c);
344 break;
345 case L_BBND: /* bounds are fairly simple */
346 case L_EBND:
347 switch (c) {
348 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
349 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
350 case CHR('8'): case CHR('9'):
351 RETV(DIGIT, (chr)DIGITVAL(c));
38d679fd 352 break;
3ca4086b
VS
353 case CHR(','):
354 RET(',');
38d679fd 355 break;
3ca4086b
VS
356 case CHR('}'): /* ERE bound ends with } */
357 if (INCON(L_EBND)) {
358 INTOCON(L_ERE);
359 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
360 v->now++;
361 NOTE(REG_UNONPOSIX);
362 RETV('}', 0);
363 }
364 RETV('}', 1);
365 } else
366 FAILW(REG_BADBR);
38d679fd 367 break;
3ca4086b
VS
368 case CHR('\\'): /* BRE bound ends with \} */
369 if (INCON(L_BBND) && NEXT1('}')) {
370 v->now++;
371 INTOCON(L_BRE);
372 RET('}');
373 } else
374 FAILW(REG_BADBR);
375 break;
376 default:
377 FAILW(REG_BADBR);
378 break;
379 }
380 assert(NOTREACHED);
381 break;
382 case L_BRACK: /* brackets are not too hard */
383 switch (c) {
384 case CHR(']'):
385 if (LASTTYPE('['))
386 RETV(PLAIN, c);
387 else {
388 INTOCON((v->cflags&REG_EXTENDED) ?
389 L_ERE : L_BRE);
390 RET(']');
38d679fd 391 }
38d679fd 392 break;
3ca4086b
VS
393 case CHR('\\'):
394 NOTE(REG_UBBS);
395 if (!(v->cflags&REG_ADVF))
396 RETV(PLAIN, c);
397 NOTE(REG_UNONPOSIX);
398 if (ATEOS())
399 FAILW(REG_EESCAPE);
400 (DISCARD)lexescape(v);
401 switch (v->nexttype) { /* not all escapes okay here */
402 case PLAIN:
403 return 1;
404 break;
405 case CCLASS:
406 switch (v->nextvalue) {
407 case 'd':
408 lexnest(v, brbackd, ENDOF(brbackd));
38d679fd 409 break;
3ca4086b
VS
410 case 's':
411 lexnest(v, brbacks, ENDOF(brbacks));
38d679fd 412 break;
3ca4086b
VS
413 case 'w':
414 lexnest(v, brbackw, ENDOF(brbackw));
38d679fd
RN
415 break;
416 default:
3ca4086b 417 FAILW(REG_EESCAPE);
38d679fd 418 break;
3ca4086b
VS
419 }
420 /* lexnest done, back up and try again */
421 v->nexttype = v->lasttype;
422 return next(v);
423 break;
38d679fd 424 }
3ca4086b
VS
425 /* not one of the acceptable escapes */
426 FAILW(REG_EESCAPE);
38d679fd 427 break;
3ca4086b
VS
428 case CHR('-'):
429 if (LASTTYPE('[') || NEXT1(']'))
38d679fd 430 RETV(PLAIN, c);
38d679fd 431 else
3ca4086b 432 RETV(RANGE, c);
38d679fd 433 break;
3ca4086b
VS
434 case CHR('['):
435 if (ATEOS())
436 FAILW(REG_EBRACK);
437 switch (*v->now++) {
438 case CHR('.'):
439 INTOCON(L_CEL);
440 /* might or might not be locale-specific */
441 RET(COLLEL);
442 break;
443 case CHR('='):
444 INTOCON(L_ECL);
445 NOTE(REG_ULOCALE);
446 RET(ECLASS);
447 break;
448 case CHR(':'):
449 INTOCON(L_CCL);
450 NOTE(REG_ULOCALE);
451 RET(CCLASS);
452 break;
453 default: /* oops */
454 v->now--;
38d679fd 455 RETV(PLAIN, c);
3ca4086b
VS
456 break;
457 }
458 assert(NOTREACHED);
38d679fd
RN
459 break;
460 default:
3ca4086b 461 RETV(PLAIN, c);
38d679fd 462 break;
3ca4086b
VS
463 }
464 assert(NOTREACHED);
465 break;
466 case L_CEL: /* collating elements are easy */
467 if (c == CHR('.') && NEXT1(']')) {
468 v->now++;
469 INTOCON(L_BRACK);
470 RETV(END, '.');
471 } else
472 RETV(PLAIN, c);
473 break;
474 case L_ECL: /* ditto equivalence classes */
475 if (c == CHR('=') && NEXT1(']')) {
476 v->now++;
477 INTOCON(L_BRACK);
478 RETV(END, '=');
479 } else
480 RETV(PLAIN, c);
481 break;
482 case L_CCL: /* ditto character classes */
483 if (c == CHR(':') && NEXT1(']')) {
484 v->now++;
485 INTOCON(L_BRACK);
486 RETV(END, ':');
487 } else
488 RETV(PLAIN, c);
489 break;
490 default:
491 assert(NOTREACHED);
492 break;
38d679fd
RN
493 }
494
495 /* that got rid of everything except EREs and AREs */
496 assert(INCON(L_ERE));
497
498 /* deal with EREs and AREs, except for backslashes */
3ca4086b
VS
499 switch (c) {
500 case CHR('|'):
501 RET('|');
502 break;
503 case CHR('*'):
504 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
505 v->now++;
506 NOTE(REG_UNONPOSIX);
507 RETV('*', 0);
508 }
509 RETV('*', 1);
510 break;
511 case CHR('+'):
512 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
513 v->now++;
514 NOTE(REG_UNONPOSIX);
515 RETV('+', 0);
516 }
517 RETV('+', 1);
518 break;
519 case CHR('?'):
520 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
521 v->now++;
522 NOTE(REG_UNONPOSIX);
523 RETV('?', 0);
524 }
525 RETV('?', 1);
526 break;
527 case CHR('{'): /* bounds start or plain character */
528 if (v->cflags&REG_EXPANDED)
529 skip(v);
530 if (ATEOS() || !iscdigit(*v->now)) {
531 NOTE(REG_UBRACES);
532 NOTE(REG_UUNSPEC);
533 RETV(PLAIN, c);
534 } else {
535 NOTE(REG_UBOUNDS);
536 INTOCON(L_EBND);
537 RET('{');
538 }
539 assert(NOTREACHED);
540 break;
541 case CHR('('): /* parenthesis, or advanced extension */
542 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
543 NOTE(REG_UNONPOSIX);
544 v->now++;
545 switch (*v->now++) {
546 case CHR(':'): /* non-capturing paren */
547 RETV('(', 0);
548 break;
549 case CHR('#'): /* comment */
550 while (!ATEOS() && *v->now != CHR(')'))
551 v->now++;
552 if (!ATEOS())
553 v->now++;
554 assert(v->nexttype == v->lasttype);
555 return next(v);
556 break;
557 case CHR('='): /* positive lookahead */
558 NOTE(REG_ULOOKAHEAD);
559 RETV(LACON, 1);
560 break;
561 case CHR('!'): /* negative lookahead */
562 NOTE(REG_ULOOKAHEAD);
563 RETV(LACON, 0);
564 break;
565 default:
566 FAILW(REG_BADRPT);
567 break;
38d679fd
RN
568 }
569 assert(NOTREACHED);
3ca4086b
VS
570 }
571 if (v->cflags&REG_NOSUB)
572 RETV('(', 0); /* all parens non-capturing */
573 else
574 RETV('(', 1);
575 break;
576 case CHR(')'):
577 if (LASTTYPE('(')) {
578 NOTE(REG_UUNSPEC);
579 }
580 RETV(')', c);
581 break;
582 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
583 if (HAVE(6) && *(v->now+0) == CHR('[') &&
584 *(v->now+1) == CHR(':') &&
585 (*(v->now+2) == CHR('<') ||
586 *(v->now+2) == CHR('>')) &&
587 *(v->now+3) == CHR(':') &&
588 *(v->now+4) == CHR(']') &&
589 *(v->now+5) == CHR(']')) {
590 c = *(v->now+2);
591 v->now += 6;
592 NOTE(REG_UNONPOSIX);
593 RET((c == CHR('<')) ? '<' : '>');
594 }
595 INTOCON(L_BRACK);
596 if (NEXT1('^')) {
597 v->now++;
598 RETV('[', 0);
599 }
600 RETV('[', 1);
601 break;
602 case CHR('.'):
603 RET('.');
604 break;
605 case CHR('^'):
606 RET('^');
607 break;
608 case CHR('$'):
609 RET('$');
610 break;
611 case CHR('\\'): /* mostly punt backslashes to code below */
612 if (ATEOS())
613 FAILW(REG_EESCAPE);
614 break;
615 default: /* ordinary character */
616 RETV(PLAIN, c);
617 break;
38d679fd
RN
618 }
619
620 /* ERE/ARE backslash handling; backslash already eaten */
621 assert(!ATEOS());
3ca4086b
VS
622 if (!(v->cflags&REG_ADVF)) { /* only AREs have non-trivial escapes */
623 if (iscalnum(*v->now)) {
38d679fd
RN
624 NOTE(REG_UBSALNUM);
625 NOTE(REG_UUNSPEC);
626 }
627 RETV(PLAIN, *v->now++);
628 }
3ca4086b 629 (DISCARD)lexescape(v);
38d679fd
RN
630 if (ISERR())
631 FAILW(REG_EESCAPE);
3ca4086b
VS
632 if (v->nexttype == CCLASS) { /* fudge at lexical level */
633 switch (v->nextvalue) {
634 case 'd': lexnest(v, backd, ENDOF(backd)); break;
635 case 'D': lexnest(v, backD, ENDOF(backD)); break;
636 case 's': lexnest(v, backs, ENDOF(backs)); break;
637 case 'S': lexnest(v, backS, ENDOF(backS)); break;
638 case 'w': lexnest(v, backw, ENDOF(backw)); break;
639 case 'W': lexnest(v, backW, ENDOF(backW)); break;
640 default:
641 assert(NOTREACHED);
642 FAILW(REG_ASSERT);
643 break;
38d679fd
RN
644 }
645 /* lexnest done, back up and try again */
646 v->nexttype = v->lasttype;
647 return next(v);
648 }
649 /* otherwise, lexescape has already done the work */
650 return !ISERR();
651}
652
653/*
3ca4086b 654 - lexescape - parse an ARE backslash escape (backslash already eaten)
38d679fd 655 * Note slightly nonstandard use of the CCLASS type code.
3ca4086b 656 ^ static int lexescape(struct vars *);
38d679fd 657 */
3ca4086b
VS
658static int /* not actually used, but convenient for RETV */
659lexescape(v)
660struct vars *v;
38d679fd 661{
3ca4086b
VS
662 chr c;
663 static chr alert[] = {
38d679fd
RN
664 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
665 };
3ca4086b 666 static chr esc[] = {
38d679fd
RN
667 CHR('E'), CHR('S'), CHR('C')
668 };
3ca4086b 669 chr *save;
38d679fd 670
3ca4086b 671 assert(v->cflags&REG_ADVF);
38d679fd
RN
672
673 assert(!ATEOS());
674 c = *v->now++;
675 if (!iscalnum(c))
676 RETV(PLAIN, c);
677
678 NOTE(REG_UNONPOSIX);
3ca4086b
VS
679 switch (c) {
680 case CHR('a'):
681 RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
682 break;
683 case CHR('A'):
684 RETV(SBEGIN, 0);
685 break;
686 case CHR('b'):
687 RETV(PLAIN, CHR('\b'));
688 break;
689 case CHR('B'):
690 RETV(PLAIN, CHR('\\'));
691 break;
692 case CHR('c'):
693 NOTE(REG_UUNPORT);
694 if (ATEOS())
695 FAILW(REG_EESCAPE);
696 RETV(PLAIN, (chr)(*v->now++ & 037));
697 break;
698 case CHR('d'):
699 NOTE(REG_ULOCALE);
700 RETV(CCLASS, 'd');
701 break;
702 case CHR('D'):
703 NOTE(REG_ULOCALE);
704 RETV(CCLASS, 'D');
705 break;
706 case CHR('e'):
707 NOTE(REG_UUNPORT);
708 RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
709 break;
710 case CHR('f'):
711 RETV(PLAIN, CHR('\f'));
712 break;
713 case CHR('m'):
714 RET('<');
715 break;
716 case CHR('M'):
717 RET('>');
718 break;
719 case CHR('n'):
720 RETV(PLAIN, CHR('\n'));
721 break;
722 case CHR('r'):
723 RETV(PLAIN, CHR('\r'));
724 break;
725 case CHR('s'):
726 NOTE(REG_ULOCALE);
727 RETV(CCLASS, 's');
728 break;
729 case CHR('S'):
730 NOTE(REG_ULOCALE);
731 RETV(CCLASS, 'S');
732 break;
733 case CHR('t'):
734 RETV(PLAIN, CHR('\t'));
735 break;
736 case CHR('u'):
737 c = lexdigits(v, 16, 4, 4);
738 if (ISERR())
739 FAILW(REG_EESCAPE);
740 RETV(PLAIN, c);
741 break;
742 case CHR('U'):
743 c = lexdigits(v, 16, 8, 8);
744 if (ISERR())
745 FAILW(REG_EESCAPE);
746 RETV(PLAIN, c);
747 break;
748 case CHR('v'):
749 RETV(PLAIN, CHR('\v'));
750 break;
751 case CHR('w'):
752 NOTE(REG_ULOCALE);
753 RETV(CCLASS, 'w');
754 break;
755 case CHR('W'):
756 NOTE(REG_ULOCALE);
757 RETV(CCLASS, 'W');
758 break;
759 case CHR('x'):
760 NOTE(REG_UUNPORT);
761 c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
762 if (ISERR())
763 FAILW(REG_EESCAPE);
764 RETV(PLAIN, c);
765 break;
766 case CHR('y'):
767 NOTE(REG_ULOCALE);
768 RETV(WBDRY, 0);
769 break;
770 case CHR('Y'):
771 NOTE(REG_ULOCALE);
772 RETV(NWBDRY, 0);
773 break;
774 case CHR('Z'):
775 RETV(SEND, 0);
776 break;
777 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
778 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
779 case CHR('9'):
780 save = v->now;
781 v->now--; /* put first digit back */
782 c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
783 if (ISERR())
784 FAILW(REG_EESCAPE);
785 /* ugly heuristic (first test is "exactly 1 digit?") */
786 if (v->now - save == 0 || (int)c <= v->nsubexp) {
787 NOTE(REG_UBACKREF);
788 RETV(BACKREF, (chr)c);
789 }
790 /* oops, doesn't look like it's a backref after all... */
791 v->now = save;
792 /* and fall through into octal number */
793 case CHR('0'):
794 NOTE(REG_UUNPORT);
795 v->now--; /* put first digit back */
796 c = lexdigits(v, 8, 1, 3);
797 if (ISERR())
798 FAILW(REG_EESCAPE);
799 RETV(PLAIN, c);
800 break;
801 default:
802 assert(iscalpha(c));
803 FAILW(REG_EESCAPE); /* unknown alphabetic escape */
804 break;
38d679fd
RN
805 }
806 assert(NOTREACHED);
807}
808
809/*
3ca4086b
VS
810 - lexdigits - slurp up digits and return chr value
811 ^ static chr lexdigits(struct vars *, int, int, int);
38d679fd 812 */
3ca4086b
VS
813static chr /* chr value; errors signalled via ERR */
814lexdigits(v, base, minlen, maxlen)
815struct vars *v;
816int base;
817int minlen;
818int maxlen;
38d679fd 819{
3ca4086b
VS
820 uchr n; /* unsigned to avoid overflow misbehavior */
821 int len;
822 chr c;
823 int d;
824 CONST uchr ub = (uchr) base;
38d679fd
RN
825
826 n = 0;
3ca4086b 827 for (len = 0; len < maxlen && !ATEOS(); len++) {
38d679fd 828 c = *v->now++;
3ca4086b
VS
829 switch (c) {
830 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
831 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
832 case CHR('8'): case CHR('9'):
833 d = DIGITVAL(c);
834 break;
835 case CHR('a'): case CHR('A'): d = 10; break;
836 case CHR('b'): case CHR('B'): d = 11; break;
837 case CHR('c'): case CHR('C'): d = 12; break;
838 case CHR('d'): case CHR('D'): d = 13; break;
839 case CHR('e'): case CHR('E'): d = 14; break;
840 case CHR('f'): case CHR('F'): d = 15; break;
841 default:
842 v->now--; /* oops, not a digit at all */
843 d = -1;
844 break;
38d679fd
RN
845 }
846
3ca4086b 847 if (d >= base) { /* not a plausible digit */
38d679fd
RN
848 v->now--;
849 d = -1;
850 }
851 if (d < 0)
3ca4086b
VS
852 break; /* NOTE BREAK OUT */
853 n = n*ub + (uchr)d;
38d679fd
RN
854 }
855 if (len < minlen)
856 ERR(REG_EESCAPE);
857
3ca4086b 858 return (chr)n;
38d679fd
RN
859}
860
861/*
3ca4086b 862 - brenext - get next BRE token
38d679fd
RN
863 * This is much like EREs except for all the stupid backslashes and the
864 * context-dependency of some things.
3ca4086b 865 ^ static int brenext(struct vars *, pchr);
38d679fd 866 */
3ca4086b
VS
867static int /* 1 normal, 0 failure */
868brenext(v, pc)
869struct vars *v;
870pchr pc;
38d679fd 871{
3ca4086b 872 chr c = (chr)pc;
38d679fd 873
3ca4086b
VS
874 switch (c) {
875 case CHR('*'):
876 if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
38d679fd 877 RETV(PLAIN, c);
3ca4086b
VS
878 RET('*');
879 break;
880 case CHR('['):
881 if (HAVE(6) && *(v->now+0) == CHR('[') &&
882 *(v->now+1) == CHR(':') &&
883 (*(v->now+2) == CHR('<') ||
884 *(v->now+2) == CHR('>')) &&
885 *(v->now+3) == CHR(':') &&
886 *(v->now+4) == CHR(']') &&
887 *(v->now+5) == CHR(']')) {
888 c = *(v->now+2);
889 v->now += 6;
890 NOTE(REG_UNONPOSIX);
891 RET((c == CHR('<')) ? '<' : '>');
892 }
893 INTOCON(L_BRACK);
894 if (NEXT1('^')) {
895 v->now++;
896 RETV('[', 0);
897 }
898 RETV('[', 1);
899 break;
900 case CHR('.'):
901 RET('.');
902 break;
903 case CHR('^'):
904 if (LASTTYPE(EMPTY))
905 RET('^');
906 if (LASTTYPE('(')) {
907 NOTE(REG_UUNSPEC);
908 RET('^');
909 }
910 RETV(PLAIN, c);
911 break;
912 case CHR('$'):
913 if (v->cflags&REG_EXPANDED)
914 skip(v);
915 if (ATEOS())
916 RET('$');
917 if (NEXT2('\\', ')')) {
918 NOTE(REG_UUNSPEC);
919 RET('$');
920 }
921 RETV(PLAIN, c);
922 break;
923 case CHR('\\'):
924 break; /* see below */
925 default:
926 RETV(PLAIN, c);
927 break;
38d679fd
RN
928 }
929
930 assert(c == CHR('\\'));
931
932 if (ATEOS())
933 FAILW(REG_EESCAPE);
934
935 c = *v->now++;
3ca4086b
VS
936 switch (c) {
937 case CHR('{'):
938 INTOCON(L_BBND);
939 NOTE(REG_UBOUNDS);
940 RET('{');
941 break;
942 case CHR('('):
943 RETV('(', 1);
944 break;
945 case CHR(')'):
946 RETV(')', c);
947 break;
948 case CHR('<'):
949 NOTE(REG_UNONPOSIX);
950 RET('<');
951 break;
952 case CHR('>'):
953 NOTE(REG_UNONPOSIX);
954 RET('>');
955 break;
956 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
957 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
958 case CHR('9'):
959 NOTE(REG_UBACKREF);
960 RETV(BACKREF, (chr)DIGITVAL(c));
961 break;
962 default:
963 if (iscalnum(c)) {
964 NOTE(REG_UBSALNUM);
965 NOTE(REG_UUNSPEC);
966 }
967 RETV(PLAIN, c);
968 break;
38d679fd
RN
969 }
970
971 assert(NOTREACHED);
972}
973
974/*
3ca4086b
VS
975 - skip - skip white space and comments in expanded form
976 ^ static VOID skip(struct vars *);
38d679fd 977 */
3ca4086b
VS
978static VOID
979skip(v)
980struct vars *v;
38d679fd 981{
3ca4086b 982 chr *start = v->now;
38d679fd 983
3ca4086b 984 assert(v->cflags&REG_EXPANDED);
38d679fd 985
3ca4086b 986 for (;;) {
38d679fd
RN
987 while (!ATEOS() && iscspace(*v->now))
988 v->now++;
989 if (ATEOS() || *v->now != CHR('#'))
990 break; /* NOTE BREAK OUT */
991 assert(NEXT1('#'));
992 while (!ATEOS() && *v->now != CHR('\n'))
993 v->now++;
994 /* leave the newline to be picked up by the iscspace loop */
995 }
996
997 if (v->now != start)
998 NOTE(REG_UNONPOSIX);
999}
1000
1001/*
3ca4086b 1002 - newline - return the chr for a newline
38d679fd 1003 * This helps confine use of CHR to this source file.
3ca4086b 1004 ^ static chr newline(NOPARMS);
38d679fd
RN
1005 */
1006static chr
3ca4086b 1007newline()
38d679fd
RN
1008{
1009 return CHR('\n');
1010}
1011
/*
 - ch - return the chr sequence for regc_locale.c's fake collating element ch
 * This helps confine use of CHR to this source file.  Beware that the caller
 * knows how long the sequence is.
 * Only compiled when REG_DEBUG is defined.  The returned pointer refers to
 * static storage holding 'c', 'h' and a terminating zero chr.
 ^ #ifdef REG_DEBUG
 ^ static chr *ch(NOPARMS);
 ^ #endif
 */
#ifdef REG_DEBUG
static chr *
ch()
{
	static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') };

	return &chstr[0];
}
#endif
1029
1030/*
1031 - chrnamed - return the chr known by a given (chr string) name
38d679fd
RN
1032 * The code is a bit clumsy, but this routine gets only such specialized
1033 * use that it hardly matters.
3ca4086b 1034 ^ static chr chrnamed(struct vars *, chr *, chr *, pchr);
38d679fd
RN
1035 */
1036static chr
3ca4086b
VS
1037chrnamed(v, startp, endp, lastresort)
1038struct vars *v;
1039chr *startp; /* start of name */
1040chr *endp; /* just past end of name */
1041pchr lastresort; /* what to return if name lookup fails */
38d679fd 1042{
3ca4086b
VS
1043 celt c;
1044 int errsave;
1045 int e;
38d679fd
RN
1046 struct cvec *cv;
1047
1048 errsave = v->err;
1049 v->err = 0;
1050 c = element(v, startp, endp);
1051 e = v->err;
1052 v->err = errsave;
1053
1054 if (e != 0)
3ca4086b 1055 return (chr)lastresort;
38d679fd
RN
1056
1057 cv = range(v, c, c, 0);
1058 if (cv->nchrs == 0)
3ca4086b 1059 return (chr)lastresort;
38d679fd
RN
1060 return cv->chrs[0];
1061}