]> git.saurik.com Git - wxWidgets.git/blame - src/regex/regc_lex.c
Unicode compilation fixes
[wxWidgets.git] / src / regex / regc_lex.c
CommitLineData
38d679fd
RN
1/*
2 * lexical analyzer
3 * This file is #included by regcomp.c.
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * $Header$
32 *
33 */
34
35/* scanning macros (know about v) */
36#define ATEOS() (v->now >= v->stop)
37#define HAVE(n) (v->stop - v->now >= (n))
38#define NEXT1(c) (!ATEOS() && *v->now == CHR(c))
39#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
40#define NEXT3(a,b,c) (HAVE(3) && *v->now == CHR(a) && \
41 *(v->now+1) == CHR(b) && \
42 *(v->now+2) == CHR(c))
43#define SET(c) (v->nexttype = (c))
44#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n))
45#define RET(c) return (SET(c), 1)
46#define RETV(c, n) return (SETV(c, n), 1)
47#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */
48#define LASTTYPE(t) (v->lasttype == (t))
49
50/* lexical contexts */
51#define L_ERE 1 /* mainline ERE/ARE */
52#define L_BRE 2 /* mainline BRE */
53#define L_Q 3 /* REG_QUOTE */
54#define L_EBND 4 /* ERE/ARE bound */
55#define L_BBND 5 /* BRE bound */
56#define L_BRACK 6 /* brackets */
57#define L_CEL 7 /* collating element */
58#define L_ECL 8 /* equivalence class */
59#define L_CCL 9 /* character class */
60#define INTOCON(c) (v->lexcon = (c))
61#define INCON(con) (v->lexcon == (con))
62
63/* construct pointer past end of chr array */
64#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))
65
66/*
67 * lexstart - set up lexical stuff, scan leading options
68 */
69static void
70lexstart(struct vars * v)
71{
72 prefixes(v); /* may turn on new type bits etc. */
73 NOERR();
74
75 if (v->cflags & REG_QUOTE)
76 {
77 assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
78 INTOCON(L_Q);
79 }
80 else if (v->cflags & REG_EXTENDED)
81 {
82 assert(!(v->cflags & REG_QUOTE));
83 INTOCON(L_ERE);
84 }
85 else
86 {
87 assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
88 INTOCON(L_BRE);
89 }
90
91 v->nexttype = EMPTY; /* remember we were at the start */
92 next(v); /* set up the first token */
93}
94
95/*
96 * prefixes - implement various special prefixes
97 */
98static void
99prefixes(struct vars * v)
100{
101 /* literal string doesn't get any of this stuff */
102 if (v->cflags & REG_QUOTE)
103 return;
104
105 /* initial "***" gets special things */
106 if (HAVE(4) && NEXT3('*', '*', '*'))
107 switch (*(v->now + 3))
108 {
109 case CHR('?'): /* "***?" error, msg shows version */
110 ERR(REG_BADPAT);
111 return; /* proceed no further */
112 break;
113 case CHR('='): /* "***=" shifts to literal string */
114 NOTE(REG_UNONPOSIX);
115 v->cflags |= REG_QUOTE;
116 v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
117 v->now += 4;
118 return; /* and there can be no more prefixes */
119 break;
120 case CHR(':'): /* "***:" shifts to AREs */
121 NOTE(REG_UNONPOSIX);
122 v->cflags |= REG_ADVANCED;
123 v->now += 4;
124 break;
125 default: /* otherwise *** is just an error */
126 ERR(REG_BADRPT);
127 return;
128 break;
129 }
130
131 /* BREs and EREs don't get embedded options */
132 if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
133 return;
134
135 /* embedded options (AREs only) */
136 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
137 {
138 NOTE(REG_UNONPOSIX);
139 v->now += 2;
140 for (; !ATEOS() && iscalpha(*v->now); v->now++)
141 switch (*v->now)
142 {
143 case CHR('b'): /* BREs (but why???) */
144 v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
145 break;
146 case CHR('c'): /* case sensitive */
147 v->cflags &= ~REG_ICASE;
148 break;
149 case CHR('e'): /* plain EREs */
150 v->cflags |= REG_EXTENDED;
151 v->cflags &= ~(REG_ADVF | REG_QUOTE);
152 break;
153 case CHR('i'): /* case insensitive */
154 v->cflags |= REG_ICASE;
155 break;
156 case CHR('m'): /* Perloid synonym for n */
157 case CHR('n'): /* \n affects ^ $ . [^ */
158 v->cflags |= REG_NEWLINE;
159 break;
160 case CHR('p'): /* ~Perl, \n affects . [^ */
161 v->cflags |= REG_NLSTOP;
162 v->cflags &= ~REG_NLANCH;
163 break;
164 case CHR('q'): /* literal string */
165 v->cflags |= REG_QUOTE;
166 v->cflags &= ~REG_ADVANCED;
167 break;
168 case CHR('s'): /* single line, \n ordinary */
169 v->cflags &= ~REG_NEWLINE;
170 break;
171 case CHR('t'): /* tight syntax */
172 v->cflags &= ~REG_EXPANDED;
173 break;
174 case CHR('w'): /* weird, \n affects ^ $ only */
175 v->cflags &= ~REG_NLSTOP;
176 v->cflags |= REG_NLANCH;
177 break;
178 case CHR('x'): /* expanded syntax */
179 v->cflags |= REG_EXPANDED;
180 break;
181 default:
182 ERR(REG_BADOPT);
183 return;
184 }
185 if (!NEXT1(')'))
186 {
187 ERR(REG_BADOPT);
188 return;
189 }
190 v->now++;
191 if (v->cflags & REG_QUOTE)
192 v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
193 }
194}
195
196/*
197 * lexnest - "call a subroutine", interpolating string at the lexical level
198 *
199 * Note, this is not a very general facility. There are a number of
200 * implicit assumptions about what sorts of strings can be subroutines.
201 */
202static void
203lexnest(struct vars * v,
204 chr *beginp, /* start of interpolation */
205 chr *endp) /* one past end of interpolation */
206{
207 assert(v->savenow == NULL); /* only one level of nesting */
208 v->savenow = v->now;
209 v->savestop = v->stop;
210 v->now = beginp;
211 v->stop = endp;
212}
213
214/*
215 * string constants to interpolate as expansions of things like \d
216 */
217static chr backd[] = { /* \d */
218 CHR('['), CHR('['), CHR(':'),
219 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
220 CHR(':'), CHR(']'), CHR(']')
221};
222static chr backD[] = { /* \D */
223 CHR('['), CHR('^'), CHR('['), CHR(':'),
224 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
225 CHR(':'), CHR(']'), CHR(']')
226};
227static chr brbackd[] = { /* \d within brackets */
228 CHR('['), CHR(':'),
229 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
230 CHR(':'), CHR(']')
231};
232static chr backs[] = { /* \s */
233 CHR('['), CHR('['), CHR(':'),
234 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
235 CHR(':'), CHR(']'), CHR(']')
236};
237static chr backS[] = { /* \S */
238 CHR('['), CHR('^'), CHR('['), CHR(':'),
239 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
240 CHR(':'), CHR(']'), CHR(']')
241};
242static chr brbacks[] = { /* \s within brackets */
243 CHR('['), CHR(':'),
244 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
245 CHR(':'), CHR(']')
246};
247static chr backw[] = { /* \w */
248 CHR('['), CHR('['), CHR(':'),
249 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
250 CHR(':'), CHR(']'), CHR('_'), CHR(']')
251};
252static chr backW[] = { /* \W */
253 CHR('['), CHR('^'), CHR('['), CHR(':'),
254 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
255 CHR(':'), CHR(']'), CHR('_'), CHR(']')
256};
257static chr brbackw[] = { /* \w within brackets */
258 CHR('['), CHR(':'),
259 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
260 CHR(':'), CHR(']'), CHR('_')
261};
262
263/*
264 * lexword - interpolate a bracket expression for word characters
265 * Possibly ought to inquire whether there is a "word" character class.
266 */
267static void
268lexword(struct vars * v)
269{
270 lexnest(v, backw, ENDOF(backw));
271}
272
273/*
274 * next - get next token
275 */
276static int /* 1 normal, 0 failure */
277next(struct vars * v)
278{
279 chr c;
280
281 /* errors yield an infinite sequence of failures */
282 if (ISERR())
283 return 0; /* the error has set nexttype to EOS */
284
285 /* remember flavor of last token */
286 v->lasttype = v->nexttype;
287
288 /* REG_BOSONLY */
289 if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
290 {
291 /* at start of a REG_BOSONLY RE */
292 RETV(SBEGIN, 0); /* same as \A */
293 }
294
295 /* if we're nested and we've hit end, return to outer level */
296 if (v->savenow != NULL && ATEOS())
297 {
298 v->now = v->savenow;
299 v->stop = v->savestop;
300 v->savenow = v->savestop = NULL;
301 }
302
303 /* skip white space etc. if appropriate (not in literal or []) */
304 if (v->cflags & REG_EXPANDED)
305 switch (v->lexcon)
306 {
307 case L_ERE:
308 case L_BRE:
309 case L_EBND:
310 case L_BBND:
311 skip(v);
312 break;
313 }
314
315 /* handle EOS, depending on context */
316 if (ATEOS())
317 {
318 switch (v->lexcon)
319 {
320 case L_ERE:
321 case L_BRE:
322 case L_Q:
323 RET(EOS);
324 break;
325 case L_EBND:
326 case L_BBND:
327 FAILW(REG_EBRACE);
328 break;
329 case L_BRACK:
330 case L_CEL:
331 case L_ECL:
332 case L_CCL:
333 FAILW(REG_EBRACK);
334 break;
335 }
336 assert(NOTREACHED);
337 }
338
339 /* okay, time to actually get a character */
340 c = *v->now++;
341
342 /* deal with the easy contexts, punt EREs to code below */
343 switch (v->lexcon)
344 {
345 case L_BRE: /* punt BREs to separate function */
346 return brenext(v, c);
347 break;
348 case L_ERE: /* see below */
349 break;
350 case L_Q: /* literal strings are easy */
351 RETV(PLAIN, c);
352 break;
353 case L_BBND: /* bounds are fairly simple */
354 case L_EBND:
355 switch (c)
356 {
357 case CHR('0'):
358 case CHR('1'):
359 case CHR('2'):
360 case CHR('3'):
361 case CHR('4'):
362 case CHR('5'):
363 case CHR('6'):
364 case CHR('7'):
365 case CHR('8'):
366 case CHR('9'):
367 RETV(DIGIT, (chr) DIGITVAL(c));
368 break;
369 case CHR(','):
370 RET(',');
371 break;
372 case CHR('}'): /* ERE bound ends with } */
373 if (INCON(L_EBND))
374 {
375 INTOCON(L_ERE);
376 if ((v->cflags & REG_ADVF) && NEXT1('?'))
377 {
378 v->now++;
379 NOTE(REG_UNONPOSIX);
380 RETV('}', 0);
381 }
382 RETV('}', 1);
383 }
384 else
385 FAILW(REG_BADBR);
386 break;
387 case CHR('\\'): /* BRE bound ends with \} */
388 if (INCON(L_BBND) && NEXT1('}'))
389 {
390 v->now++;
391 INTOCON(L_BRE);
392 RET('}');
393 }
394 else
395 FAILW(REG_BADBR);
396 break;
397 default:
398 FAILW(REG_BADBR);
399 break;
400 }
401 assert(NOTREACHED);
402 break;
403 case L_BRACK: /* brackets are not too hard */
404 switch (c)
405 {
406 case CHR(']'):
407 if (LASTTYPE('['))
408 RETV(PLAIN, c);
409 else
410 {
411 INTOCON((v->cflags & REG_EXTENDED) ?
412 L_ERE : L_BRE);
413 RET(']');
414 }
415 break;
416 case CHR('\\'):
417 NOTE(REG_UBBS);
418 if (!(v->cflags & REG_ADVF))
419 RETV(PLAIN, c);
420 NOTE(REG_UNONPOSIX);
421 if (ATEOS())
422 FAILW(REG_EESCAPE);
423 (DISCARD) lexescape(v);
424 switch (v->nexttype)
425 { /* not all escapes okay here */
426 case PLAIN:
427 return 1;
428 break;
429 case CCLASS:
430 switch (v->nextvalue)
431 {
432 case 'd':
433 lexnest(v, brbackd, ENDOF(brbackd));
434 break;
435 case 's':
436 lexnest(v, brbacks, ENDOF(brbacks));
437 break;
438 case 'w':
439 lexnest(v, brbackw, ENDOF(brbackw));
440 break;
441 default:
442 FAILW(REG_EESCAPE);
443 break;
444 }
445 /* lexnest done, back up and try again */
446 v->nexttype = v->lasttype;
447 return next(v);
448 break;
449 }
450 /* not one of the acceptable escapes */
451 FAILW(REG_EESCAPE);
452 break;
453 case CHR('-'):
454 if (LASTTYPE('[') || NEXT1(']'))
455 RETV(PLAIN, c);
456 else
457 RETV(RANGE, c);
458 break;
459 case CHR('['):
460 if (ATEOS())
461 FAILW(REG_EBRACK);
462 switch (*v->now++)
463 {
464 case CHR('.'):
465 INTOCON(L_CEL);
466 /* might or might not be locale-specific */
467 RET(COLLEL);
468 break;
469 case CHR('='):
470 INTOCON(L_ECL);
471 NOTE(REG_ULOCALE);
472 RET(ECLASS);
473 break;
474 case CHR(':'):
475 INTOCON(L_CCL);
476 NOTE(REG_ULOCALE);
477 RET(CCLASS);
478 break;
479 default: /* oops */
480 v->now--;
481 RETV(PLAIN, c);
482 break;
483 }
484 assert(NOTREACHED);
485 break;
486 default:
487 RETV(PLAIN, c);
488 break;
489 }
490 assert(NOTREACHED);
491 break;
492 case L_CEL: /* collating elements are easy */
493 if (c == CHR('.') && NEXT1(']'))
494 {
495 v->now++;
496 INTOCON(L_BRACK);
497 RETV(END, '.');
498 }
499 else
500 RETV(PLAIN, c);
501 break;
502 case L_ECL: /* ditto equivalence classes */
503 if (c == CHR('=') && NEXT1(']'))
504 {
505 v->now++;
506 INTOCON(L_BRACK);
507 RETV(END, '=');
508 }
509 else
510 RETV(PLAIN, c);
511 break;
512 case L_CCL: /* ditto character classes */
513 if (c == CHR(':') && NEXT1(']'))
514 {
515 v->now++;
516 INTOCON(L_BRACK);
517 RETV(END, ':');
518 }
519 else
520 RETV(PLAIN, c);
521 break;
522 default:
523 assert(NOTREACHED);
524 break;
525 }
526
527 /* that got rid of everything except EREs and AREs */
528 assert(INCON(L_ERE));
529
530 /* deal with EREs and AREs, except for backslashes */
531 switch (c)
532 {
533 case CHR('|'):
534 RET('|');
535 break;
536 case CHR('*'):
537 if ((v->cflags & REG_ADVF) && NEXT1('?'))
538 {
539 v->now++;
540 NOTE(REG_UNONPOSIX);
541 RETV('*', 0);
542 }
543 RETV('*', 1);
544 break;
545 case CHR('+'):
546 if ((v->cflags & REG_ADVF) && NEXT1('?'))
547 {
548 v->now++;
549 NOTE(REG_UNONPOSIX);
550 RETV('+', 0);
551 }
552 RETV('+', 1);
553 break;
554 case CHR('?'):
555 if ((v->cflags & REG_ADVF) && NEXT1('?'))
556 {
557 v->now++;
558 NOTE(REG_UNONPOSIX);
559 RETV('?', 0);
560 }
561 RETV('?', 1);
562 break;
563 case CHR('{'): /* bounds start or plain character */
564 if (v->cflags & REG_EXPANDED)
565 skip(v);
566 if (ATEOS() || !iscdigit(*v->now))
567 {
568 NOTE(REG_UBRACES);
569 NOTE(REG_UUNSPEC);
570 RETV(PLAIN, c);
571 }
572 else
573 {
574 NOTE(REG_UBOUNDS);
575 INTOCON(L_EBND);
576 RET('{');
577 }
578 assert(NOTREACHED);
579 break;
580 case CHR('('): /* parenthesis, or advanced extension */
581 if ((v->cflags & REG_ADVF) && NEXT1('?'))
582 {
583 NOTE(REG_UNONPOSIX);
584 v->now++;
585 switch (*v->now++)
586 {
587 case CHR(':'): /* non-capturing paren */
588 RETV('(', 0);
589 break;
590 case CHR('#'): /* comment */
591 while (!ATEOS() && *v->now != CHR(')'))
592 v->now++;
593 if (!ATEOS())
594 v->now++;
595 assert(v->nexttype == v->lasttype);
596 return next(v);
597 break;
598 case CHR('='): /* positive lookahead */
599 NOTE(REG_ULOOKAHEAD);
600 RETV(LACON, 1);
601 break;
602 case CHR('!'): /* negative lookahead */
603 NOTE(REG_ULOOKAHEAD);
604 RETV(LACON, 0);
605 break;
606 default:
607 FAILW(REG_BADRPT);
608 break;
609 }
610 assert(NOTREACHED);
611 }
612 if (v->cflags & REG_NOSUB)
613 RETV('(', 0); /* all parens non-capturing */
614 else
615 RETV('(', 1);
616 break;
617 case CHR(')'):
618 if (LASTTYPE('('))
619 NOTE(REG_UUNSPEC);
620 RETV(')', c);
621 break;
622 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
623 if (HAVE(6) && *(v->now + 0) == CHR('[') &&
624 *(v->now + 1) == CHR(':') &&
625 (*(v->now + 2) == CHR('<') ||
626 *(v->now + 2) == CHR('>')) &&
627 *(v->now + 3) == CHR(':') &&
628 *(v->now + 4) == CHR(']') &&
629 *(v->now + 5) == CHR(']'))
630 {
631 c = *(v->now + 2);
632 v->now += 6;
633 NOTE(REG_UNONPOSIX);
634 RET((c == CHR('<')) ? '<' : '>');
635 }
636 INTOCON(L_BRACK);
637 if (NEXT1('^'))
638 {
639 v->now++;
640 RETV('[', 0);
641 }
642 RETV('[', 1);
643 break;
644 case CHR('.'):
645 RET('.');
646 break;
647 case CHR('^'):
648 RET('^');
649 break;
650 case CHR('$'):
651 RET('$');
652 break;
653 case CHR('\\'): /* mostly punt backslashes to code below */
654 if (ATEOS())
655 FAILW(REG_EESCAPE);
656 break;
657 default: /* ordinary character */
658 RETV(PLAIN, c);
659 break;
660 }
661
662 /* ERE/ARE backslash handling; backslash already eaten */
663 assert(!ATEOS());
664 if (!(v->cflags & REG_ADVF))
665 { /* only AREs have non-trivial escapes */
666 if (iscalnum(*v->now))
667 {
668 NOTE(REG_UBSALNUM);
669 NOTE(REG_UUNSPEC);
670 }
671 RETV(PLAIN, *v->now++);
672 }
673 (DISCARD) lexescape(v);
674 if (ISERR())
675 FAILW(REG_EESCAPE);
676 if (v->nexttype == CCLASS)
677 { /* fudge at lexical level */
678 switch (v->nextvalue)
679 {
680 case 'd':
681 lexnest(v, backd, ENDOF(backd));
682 break;
683 case 'D':
684 lexnest(v, backD, ENDOF(backD));
685 break;
686 case 's':
687 lexnest(v, backs, ENDOF(backs));
688 break;
689 case 'S':
690 lexnest(v, backS, ENDOF(backS));
691 break;
692 case 'w':
693 lexnest(v, backw, ENDOF(backw));
694 break;
695 case 'W':
696 lexnest(v, backW, ENDOF(backW));
697 break;
698 default:
699 assert(NOTREACHED);
700 FAILW(REG_ASSERT);
701 break;
702 }
703 /* lexnest done, back up and try again */
704 v->nexttype = v->lasttype;
705 return next(v);
706 }
707 /* otherwise, lexescape has already done the work */
708 return !ISERR();
709}
710
711/*
712 * lexescape - parse an ARE backslash escape (backslash already eaten)
713 * Note slightly nonstandard use of the CCLASS type code.
714 */
715static int /* not actually used, but convenient for
716 * RETV */
717lexescape(struct vars * v)
718{
719 chr c;
720 static chr alert[] = {
721 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
722 };
723 static chr esc[] = {
724 CHR('E'), CHR('S'), CHR('C')
725 };
726 chr *save;
727
728 assert(v->cflags & REG_ADVF);
729
730 assert(!ATEOS());
731 c = *v->now++;
732 if (!iscalnum(c))
733 RETV(PLAIN, c);
734
735 NOTE(REG_UNONPOSIX);
736 switch (c)
737 {
738 case CHR('a'):
739 RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
740 break;
741 case CHR('A'):
742 RETV(SBEGIN, 0);
743 break;
744 case CHR('b'):
745 RETV(PLAIN, CHR('\b'));
746 break;
747 case CHR('B'):
748 RETV(PLAIN, CHR('\\'));
749 break;
750 case CHR('c'):
751 NOTE(REG_UUNPORT);
752 if (ATEOS())
753 FAILW(REG_EESCAPE);
754 RETV(PLAIN, (chr) (*v->now++ & 037));
755 break;
756 case CHR('d'):
757 NOTE(REG_ULOCALE);
758 RETV(CCLASS, 'd');
759 break;
760 case CHR('D'):
761 NOTE(REG_ULOCALE);
762 RETV(CCLASS, 'D');
763 break;
764 case CHR('e'):
765 NOTE(REG_UUNPORT);
766 RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
767 break;
768 case CHR('f'):
769 RETV(PLAIN, CHR('\f'));
770 break;
771 case CHR('m'):
772 RET('<');
773 break;
774 case CHR('M'):
775 RET('>');
776 break;
777 case CHR('n'):
778 RETV(PLAIN, CHR('\n'));
779 break;
780 case CHR('r'):
781 RETV(PLAIN, CHR('\r'));
782 break;
783 case CHR('s'):
784 NOTE(REG_ULOCALE);
785 RETV(CCLASS, 's');
786 break;
787 case CHR('S'):
788 NOTE(REG_ULOCALE);
789 RETV(CCLASS, 'S');
790 break;
791 case CHR('t'):
792 RETV(PLAIN, CHR('\t'));
793 break;
794 case CHR('u'):
795 c = lexdigits(v, 16, 4, 4);
796 if (ISERR())
797 FAILW(REG_EESCAPE);
798 RETV(PLAIN, c);
799 break;
800 case CHR('U'):
801 c = lexdigits(v, 16, 8, 8);
802 if (ISERR())
803 FAILW(REG_EESCAPE);
804 RETV(PLAIN, c);
805 break;
806 case CHR('v'):
807 RETV(PLAIN, CHR('\v'));
808 break;
809 case CHR('w'):
810 NOTE(REG_ULOCALE);
811 RETV(CCLASS, 'w');
812 break;
813 case CHR('W'):
814 NOTE(REG_ULOCALE);
815 RETV(CCLASS, 'W');
816 break;
817 case CHR('x'):
818 NOTE(REG_UUNPORT);
819 c = lexdigits(v, 16, 1, 255); /* REs >255 long outside
820 * spec */
821 if (ISERR())
822 FAILW(REG_EESCAPE);
823 RETV(PLAIN, c);
824 break;
825 case CHR('y'):
826 NOTE(REG_ULOCALE);
827 RETV(WBDRY, 0);
828 break;
829 case CHR('Y'):
830 NOTE(REG_ULOCALE);
831 RETV(NWBDRY, 0);
832 break;
833 case CHR('Z'):
834 RETV(SEND, 0);
835 break;
836 case CHR('1'):
837 case CHR('2'):
838 case CHR('3'):
839 case CHR('4'):
840 case CHR('5'):
841 case CHR('6'):
842 case CHR('7'):
843 case CHR('8'):
844 case CHR('9'):
845 save = v->now;
846 v->now--; /* put first digit back */
847 c = lexdigits(v, 10, 1, 255); /* REs >255 long outside
848 * spec */
849 if (ISERR())
850 FAILW(REG_EESCAPE);
851 /* ugly heuristic (first test is "exactly 1 digit?") */
852 if (v->now - save == 0 || (int) c <= v->nsubexp)
853 {
854 NOTE(REG_UBACKREF);
855 RETV(BACKREF, (chr) c);
856 }
857 /* oops, doesn't look like it's a backref after all... */
858 v->now = save;
859 /* and fall through into octal number */
860 case CHR('0'):
861 NOTE(REG_UUNPORT);
862 v->now--; /* put first digit back */
863 c = lexdigits(v, 8, 1, 3);
864 if (ISERR())
865 FAILW(REG_EESCAPE);
866 RETV(PLAIN, c);
867 break;
868 default:
869 assert(iscalpha(c));
870 FAILW(REG_EESCAPE); /* unknown alphabetic escape */
871 break;
872 }
873 assert(NOTREACHED);
874}
875
876/*
877 * lexdigits - slurp up digits and return chr value
878 */
879static chr /* chr value; errors signalled via ERR */
880lexdigits(struct vars * v,
881 int base,
882 int minlen,
883 int maxlen)
884{
885 uchr n; /* unsigned to avoid overflow misbehavior */
886 int len;
887 chr c;
888 int d;
889 const uchr ub = (uchr) base;
890
891 n = 0;
892 for (len = 0; len < maxlen && !ATEOS(); len++)
893 {
894 c = *v->now++;
895 switch (c)
896 {
897 case CHR('0'):
898 case CHR('1'):
899 case CHR('2'):
900 case CHR('3'):
901 case CHR('4'):
902 case CHR('5'):
903 case CHR('6'):
904 case CHR('7'):
905 case CHR('8'):
906 case CHR('9'):
907 d = DIGITVAL(c);
908 break;
909 case CHR('a'):
910 case CHR('A'):
911 d = 10;
912 break;
913 case CHR('b'):
914 case CHR('B'):
915 d = 11;
916 break;
917 case CHR('c'):
918 case CHR('C'):
919 d = 12;
920 break;
921 case CHR('d'):
922 case CHR('D'):
923 d = 13;
924 break;
925 case CHR('e'):
926 case CHR('E'):
927 d = 14;
928 break;
929 case CHR('f'):
930 case CHR('F'):
931 d = 15;
932 break;
933 default:
934 v->now--; /* oops, not a digit at all */
935 d = -1;
936 break;
937 }
938
939 if (d >= base)
940 { /* not a plausible digit */
941 v->now--;
942 d = -1;
943 }
944 if (d < 0)
945 break; /* NOTE BREAK OUT */
946 n = n * ub + (uchr) d;
947 }
948 if (len < minlen)
949 ERR(REG_EESCAPE);
950
951 return (chr) n;
952}
953
954/*
955 * brenext - get next BRE token
956 *
957 * This is much like EREs except for all the stupid backslashes and the
958 * context-dependency of some things.
959 */
960static int /* 1 normal, 0 failure */
961brenext(struct vars * v,
962 chr pc)
963{
964 chr c = (chr) pc;
965
966 switch (c)
967 {
968 case CHR('*'):
969 if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
970 RETV(PLAIN, c);
971 RET('*');
972 break;
973 case CHR('['):
974 if (HAVE(6) && *(v->now + 0) == CHR('[') &&
975 *(v->now + 1) == CHR(':') &&
976 (*(v->now + 2) == CHR('<') ||
977 *(v->now + 2) == CHR('>')) &&
978 *(v->now + 3) == CHR(':') &&
979 *(v->now + 4) == CHR(']') &&
980 *(v->now + 5) == CHR(']'))
981 {
982 c = *(v->now + 2);
983 v->now += 6;
984 NOTE(REG_UNONPOSIX);
985 RET((c == CHR('<')) ? '<' : '>');
986 }
987 INTOCON(L_BRACK);
988 if (NEXT1('^'))
989 {
990 v->now++;
991 RETV('[', 0);
992 }
993 RETV('[', 1);
994 break;
995 case CHR('.'):
996 RET('.');
997 break;
998 case CHR('^'):
999 if (LASTTYPE(EMPTY))
1000 RET('^');
1001 if (LASTTYPE('('))
1002 {
1003 NOTE(REG_UUNSPEC);
1004 RET('^');
1005 }
1006 RETV(PLAIN, c);
1007 break;
1008 case CHR('$'):
1009 if (v->cflags & REG_EXPANDED)
1010 skip(v);
1011 if (ATEOS())
1012 RET('$');
1013 if (NEXT2('\\', ')'))
1014 {
1015 NOTE(REG_UUNSPEC);
1016 RET('$');
1017 }
1018 RETV(PLAIN, c);
1019 break;
1020 case CHR('\\'):
1021 break; /* see below */
1022 default:
1023 RETV(PLAIN, c);
1024 break;
1025 }
1026
1027 assert(c == CHR('\\'));
1028
1029 if (ATEOS())
1030 FAILW(REG_EESCAPE);
1031
1032 c = *v->now++;
1033 switch (c)
1034 {
1035 case CHR('{'):
1036 INTOCON(L_BBND);
1037 NOTE(REG_UBOUNDS);
1038 RET('{');
1039 break;
1040 case CHR('('):
1041 RETV('(', 1);
1042 break;
1043 case CHR(')'):
1044 RETV(')', c);
1045 break;
1046 case CHR('<'):
1047 NOTE(REG_UNONPOSIX);
1048 RET('<');
1049 break;
1050 case CHR('>'):
1051 NOTE(REG_UNONPOSIX);
1052 RET('>');
1053 break;
1054 case CHR('1'):
1055 case CHR('2'):
1056 case CHR('3'):
1057 case CHR('4'):
1058 case CHR('5'):
1059 case CHR('6'):
1060 case CHR('7'):
1061 case CHR('8'):
1062 case CHR('9'):
1063 NOTE(REG_UBACKREF);
1064 RETV(BACKREF, (chr) DIGITVAL(c));
1065 break;
1066 default:
1067 if (iscalnum(c))
1068 {
1069 NOTE(REG_UBSALNUM);
1070 NOTE(REG_UUNSPEC);
1071 }
1072 RETV(PLAIN, c);
1073 break;
1074 }
1075
1076 assert(NOTREACHED);
1077}
1078
1079/*
1080 * skip - skip white space and comments in expanded form
1081 */
1082static void
1083skip(struct vars * v)
1084{
1085 chr *start = v->now;
1086
1087 assert(v->cflags & REG_EXPANDED);
1088
1089 for (;;)
1090 {
1091 while (!ATEOS() && iscspace(*v->now))
1092 v->now++;
1093 if (ATEOS() || *v->now != CHR('#'))
1094 break; /* NOTE BREAK OUT */
1095 assert(NEXT1('#'));
1096 while (!ATEOS() && *v->now != CHR('\n'))
1097 v->now++;
1098 /* leave the newline to be picked up by the iscspace loop */
1099 }
1100
1101 if (v->now != start)
1102 NOTE(REG_UNONPOSIX);
1103}
1104
1105/*
1106 * newline - return the chr for a newline
1107 *
1108 * This helps confine use of CHR to this source file.
1109 */
1110static chr
1111newline(void)
1112{
1113 return CHR('\n');
1114}
1115
1116/*
1117 * chrnamed - return the chr known by a given (chr string) name
1118 *
1119 * The code is a bit clumsy, but this routine gets only such specialized
1120 * use that it hardly matters.
1121 */
1122static chr
1123chrnamed(struct vars * v,
1124 chr *startp, /* start of name */
1125 chr *endp, /* just past end of name */
1126 chr lastresort) /* what to return if name lookup fails */
1127{
1128 celt c;
1129 int errsave;
1130 int e;
1131 struct cvec *cv;
1132
1133 errsave = v->err;
1134 v->err = 0;
1135 c = element(v, startp, endp);
1136 e = v->err;
1137 v->err = errsave;
1138
1139 if (e != 0)
1140 return (chr) lastresort;
1141
1142 cv = range(v, c, c, 0);
1143 if (cv->nchrs == 0)
1144 return (chr) lastresort;
1145 return cv->chrs[0];
1146}