]> git.saurik.com Git - apple/libc.git/blob - regex/engine.c
Libc-262.tar.gz
[apple/libc.git] / regex / engine.c
1 /*
2 * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * Copyright (c) 1992, 1993, 1994
24 * The Regents of the University of California. All rights reserved.
25 *
26 * This code is derived from software contributed to Berkeley by
27 * Henry Spencer.
28 *
29 * Redistribution and use in source and binary forms, with or without
30 * modification, are permitted provided that the following conditions
31 * are met:
32 * 1. Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * 2. Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in the
36 * documentation and/or other materials provided with the distribution.
37 * 3. All advertising materials mentioning features or use of this software
38 * must display the following acknowledgement:
39 * This product includes software developed by the University of
40 * California, Berkeley and its contributors.
41 * 4. Neither the name of the University nor the names of its contributors
42 * may be used to endorse or promote products derived from this software
43 * without specific prior written permission.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 */
57
58 /*
59 * The matching engine and friends. This file is #included by regexec.c
60 * after suitable #defines of a variety of macros used herein, so that
61 * different state representations can be used without duplicating masses
62 * of code.
63 */
64
65 #ifdef SNAMES
66 #define matcher smatcher
67 #define fast sfast
68 #define slow sslow
69 #define dissect sdissect
70 #define backref sbackref
71 #define step sstep
72 #define print sprint
73 #define at sat
74 #define match smat
75 #endif
76 #ifdef LNAMES
77 #define matcher lmatcher
78 #define fast lfast
79 #define slow lslow
80 #define dissect ldissect
81 #define backref lbackref
82 #define step lstep
83 #define print lprint
84 #define at lat
85 #define match lmat
86 #endif
87
88 /* another structure passed up and down to avoid zillions of parameters */
89 struct match {
90 struct re_guts *g;
91 int eflags;
92 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
93 char *offp; /* offsets work from here */
94 char *beginp; /* start of string -- virtual NUL precedes */
95 char *endp; /* end of string -- virtual NUL here */
96 char *coldp; /* can be no match starting before here */
97 char **lastpos; /* [nplus+1] */
98 STATEVARS;
99 states st; /* current states */
100 states fresh; /* states for a fresh start */
101 states tmp; /* temporary */
102 states empty; /* empty set of states */
103 };
104
105 /* ========= begin header generated by ./mkh ========= */
106 #ifdef __cplusplus
107 extern "C" {
108 #endif
109
110 /* === engine.c === */
111 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
112 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
113 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
114 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
115 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
116 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
117 #define BOL (OUT+1)
118 #define EOL (BOL+1)
119 #define BOLEOL (BOL+2)
120 #define NOTHING (BOL+3)
121 #define BOW (BOL+4)
122 #define EOW (BOL+5)
123 #define CODEMAX (BOL+5) /* highest code used */
124 #define NONCHAR(c) ((c) > CHAR_MAX)
125 #define NNONCHAR (CODEMAX-CHAR_MAX)
126 #ifdef REDEBUG
127 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
128 #endif
129 #ifdef REDEBUG
130 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
131 #endif
132 #ifdef REDEBUG
133 static char *pchar __P((int ch));
134 #endif
135
136 #ifdef __cplusplus
137 }
138 #endif
139 /* ========= end header generated by ./mkh ========= */
140
141 #ifdef REDEBUG
142 #define SP(t, s, c) print(m, t, s, c, stdout)
143 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
144 #define NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
145 #else
146 #define SP(t, s, c) /* nothing */
147 #define AT(t, p1, p2, s1, s2) /* nothing */
148 #define NOTE(s) /* nothing */
149 #endif
150
151 /*
152 - matcher - the actual matching engine
153 == static int matcher(register struct re_guts *g, char *string, \
154 == size_t nmatch, regmatch_t pmatch[], int eflags);
155 */
156 static int /* 0 success, REG_NOMATCH failure */
157 matcher(g, string, nmatch, pmatch, eflags)
158 register struct re_guts *g;
159 char *string;
160 size_t nmatch;
161 regmatch_t pmatch[];
162 int eflags;
163 {
164 register char *endp;
165 register int i;
166 struct match mv;
167 register struct match *m = &mv;
168 register char *dp;
169 const register sopno gf = g->firststate+1; /* +1 for OEND */
170 const register sopno gl = g->laststate;
171 char *start;
172 char *stop;
173
174 /* simplify the situation where possible */
175 if (g->cflags&REG_NOSUB)
176 nmatch = 0;
177 if (eflags&REG_STARTEND) {
178 start = string + pmatch[0].rm_so;
179 stop = string + pmatch[0].rm_eo;
180 } else {
181 start = string;
182 stop = start + strlen(start);
183 }
184 if (stop < start)
185 return(REG_INVARG);
186
187 /* prescreening; this does wonders for this rather slow code */
188 if (g->must != NULL) {
189 for (dp = start; dp < stop; dp++)
190 if (*dp == g->must[0] && stop - dp >= g->mlen &&
191 memcmp(dp, g->must, (size_t)g->mlen) == 0)
192 break;
193 if (dp == stop) /* we didn't find g->must */
194 return(REG_NOMATCH);
195 }
196
197 /* match struct setup */
198 m->g = g;
199 m->eflags = eflags;
200 m->pmatch = NULL;
201 m->lastpos = NULL;
202 m->offp = string;
203 m->beginp = start;
204 m->endp = stop;
205 STATESETUP(m, 4);
206 SETUP(m->st);
207 SETUP(m->fresh);
208 SETUP(m->tmp);
209 SETUP(m->empty);
210 CLEAR(m->empty);
211
212 /* this loop does only one repetition except for backrefs */
213 for (;;) {
214 endp = fast(m, start, stop, gf, gl);
215 if (endp == NULL) { /* a miss */
216 STATETEARDOWN(m);
217 return(REG_NOMATCH);
218 }
219 if (nmatch == 0 && !g->backrefs)
220 break; /* no further info needed */
221
222 /* where? */
223 assert(m->coldp != NULL);
224 for (;;) {
225 NOTE("finding start");
226 endp = slow(m, m->coldp, stop, gf, gl);
227 if (endp != NULL)
228 break;
229 assert(m->coldp < m->endp);
230 m->coldp++;
231 }
232 if (nmatch == 1 && !g->backrefs)
233 break; /* no further info needed */
234
235 /* oh my, he wants the subexpressions... */
236 if (m->pmatch == NULL)
237 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
238 sizeof(regmatch_t));
239 if (m->pmatch == NULL) {
240 STATETEARDOWN(m);
241 return(REG_ESPACE);
242 }
243 for (i = 1; i <= m->g->nsub; i++)
244 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
245 if (!g->backrefs && !(m->eflags&REG_BACKR)) {
246 NOTE("dissecting");
247 dp = dissect(m, m->coldp, endp, gf, gl);
248 } else {
249 if (g->nplus > 0 && m->lastpos == NULL)
250 m->lastpos = (char **)malloc((g->nplus+1) *
251 sizeof(char *));
252 if (g->nplus > 0 && m->lastpos == NULL) {
253 free(m->pmatch);
254 STATETEARDOWN(m);
255 return(REG_ESPACE);
256 }
257 NOTE("backref dissect");
258 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
259 }
260 if (dp != NULL)
261 break;
262
263 /* uh-oh... we couldn't find a subexpression-level match */
264 assert(g->backrefs); /* must be back references doing it */
265 assert(g->nplus == 0 || m->lastpos != NULL);
266 for (;;) {
267 if (dp != NULL || endp <= m->coldp)
268 break; /* defeat */
269 NOTE("backoff");
270 endp = slow(m, m->coldp, endp-1, gf, gl);
271 if (endp == NULL)
272 break; /* defeat */
273 /* try it on a shorter possibility */
274 #ifndef NDEBUG
275 for (i = 1; i <= m->g->nsub; i++) {
276 assert(m->pmatch[i].rm_so == -1);
277 assert(m->pmatch[i].rm_eo == -1);
278 }
279 #endif
280 NOTE("backoff dissect");
281 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
282 }
283 assert(dp == NULL || dp == endp);
284 if (dp != NULL) /* found a shorter one */
285 break;
286
287 /* despite initial appearances, there is no match here */
288 NOTE("false alarm");
289 start = m->coldp + 1; /* recycle starting later */
290 assert(start <= stop);
291 }
292
293 /* fill in the details if requested */
294 if (nmatch > 0) {
295 pmatch[0].rm_so = m->coldp - m->offp;
296 pmatch[0].rm_eo = endp - m->offp;
297 }
298 if (nmatch > 1) {
299 assert(m->pmatch != NULL);
300 for (i = 1; i < nmatch; i++)
301 if (i <= m->g->nsub)
302 pmatch[i] = m->pmatch[i];
303 else {
304 pmatch[i].rm_so = -1;
305 pmatch[i].rm_eo = -1;
306 }
307 }
308
309 if (m->pmatch != NULL)
310 free((char *)m->pmatch);
311 if (m->lastpos != NULL)
312 free((char *)m->lastpos);
313 STATETEARDOWN(m);
314 return(0);
315 }
316
317 /*
318 - dissect - figure out what matched what, no back references
319 == static char *dissect(register struct match *m, char *start, \
320 == char *stop, sopno startst, sopno stopst);
321 */
322 static char * /* == stop (success) always */
323 dissect(m, start, stop, startst, stopst)
324 register struct match *m;
325 char *start;
326 char *stop;
327 sopno startst;
328 sopno stopst;
329 {
330 register int i;
331 register sopno ss; /* start sop of current subRE */
332 register sopno es; /* end sop of current subRE */
333 register char *sp; /* start of string matched by it */
334 register char *stp; /* string matched by it cannot pass here */
335 register char *rest; /* start of rest of string */
336 register char *tail; /* string unmatched by rest of RE */
337 register sopno ssub; /* start sop of subsubRE */
338 register sopno esub; /* end sop of subsubRE */
339 register char *ssp; /* start of string matched by subsubRE */
340 register char *sep; /* end of string matched by subsubRE */
341 register char *oldssp; /* previous ssp */
342 register char *dp;
343
344 AT("diss", start, stop, startst, stopst);
345 sp = start;
346 for (ss = startst; ss < stopst; ss = es) {
347 /* identify end of subRE */
348 es = ss;
349 switch (OP(m->g->strip[es])) {
350 case OPLUS_:
351 case OQUEST_:
352 es += OPND(m->g->strip[es]);
353 break;
354 case OCH_:
355 while (OP(m->g->strip[es]) != O_CH)
356 es += OPND(m->g->strip[es]);
357 break;
358 }
359 es++;
360
361 /* figure out what it matched */
362 switch (OP(m->g->strip[ss])) {
363 case OEND:
364 assert(nope);
365 break;
366 case OCHAR:
367 sp++;
368 break;
369 case OBOL:
370 case OEOL:
371 case OBOW:
372 case OEOW:
373 break;
374 case OANY:
375 case OANYOF:
376 sp++;
377 break;
378 case OBACK_:
379 case O_BACK:
380 assert(nope);
381 break;
382 /* cases where length of match is hard to find */
383 case OQUEST_:
384 stp = stop;
385 for (;;) {
386 /* how long could this one be? */
387 rest = slow(m, sp, stp, ss, es);
388 assert(rest != NULL); /* it did match */
389 /* could the rest match the rest? */
390 tail = slow(m, rest, stop, es, stopst);
391 if (tail == stop)
392 break; /* yes! */
393 /* no -- try a shorter match for this one */
394 stp = rest - 1;
395 assert(stp >= sp); /* it did work */
396 }
397 ssub = ss + 1;
398 esub = es - 1;
399 /* did innards match? */
400 if (slow(m, sp, rest, ssub, esub) != NULL) {
401 dp = dissect(m, sp, rest, ssub, esub);
402 assert(dp == rest);
403 } else /* no */
404 assert(sp == rest);
405 sp = rest;
406 break;
407 case OPLUS_:
408 stp = stop;
409 for (;;) {
410 /* how long could this one be? */
411 rest = slow(m, sp, stp, ss, es);
412 assert(rest != NULL); /* it did match */
413 /* could the rest match the rest? */
414 tail = slow(m, rest, stop, es, stopst);
415 if (tail == stop)
416 break; /* yes! */
417 /* no -- try a shorter match for this one */
418 stp = rest - 1;
419 assert(stp >= sp); /* it did work */
420 }
421 ssub = ss + 1;
422 esub = es - 1;
423 ssp = sp;
424 oldssp = ssp;
425 for (;;) { /* find last match of innards */
426 sep = slow(m, ssp, rest, ssub, esub);
427 if (sep == NULL || sep == ssp)
428 break; /* failed or matched null */
429 oldssp = ssp; /* on to next try */
430 ssp = sep;
431 }
432 if (sep == NULL) {
433 /* last successful match */
434 sep = ssp;
435 ssp = oldssp;
436 }
437 assert(sep == rest); /* must exhaust substring */
438 assert(slow(m, ssp, sep, ssub, esub) == rest);
439 dp = dissect(m, ssp, sep, ssub, esub);
440 assert(dp == sep);
441 sp = rest;
442 break;
443 case OCH_:
444 stp = stop;
445 for (;;) {
446 /* how long could this one be? */
447 rest = slow(m, sp, stp, ss, es);
448 assert(rest != NULL); /* it did match */
449 /* could the rest match the rest? */
450 tail = slow(m, rest, stop, es, stopst);
451 if (tail == stop)
452 break; /* yes! */
453 /* no -- try a shorter match for this one */
454 stp = rest - 1;
455 assert(stp >= sp); /* it did work */
456 }
457 ssub = ss + 1;
458 esub = ss + OPND(m->g->strip[ss]) - 1;
459 assert(OP(m->g->strip[esub]) == OOR1);
460 for (;;) { /* find first matching branch */
461 if (slow(m, sp, rest, ssub, esub) == rest)
462 break; /* it matched all of it */
463 /* that one missed, try next one */
464 assert(OP(m->g->strip[esub]) == OOR1);
465 esub++;
466 assert(OP(m->g->strip[esub]) == OOR2);
467 ssub = esub + 1;
468 esub += OPND(m->g->strip[esub]);
469 if (OP(m->g->strip[esub]) == OOR2)
470 esub--;
471 else
472 assert(OP(m->g->strip[esub]) == O_CH);
473 }
474 dp = dissect(m, sp, rest, ssub, esub);
475 assert(dp == rest);
476 sp = rest;
477 break;
478 case O_PLUS:
479 case O_QUEST:
480 case OOR1:
481 case OOR2:
482 case O_CH:
483 assert(nope);
484 break;
485 case OLPAREN:
486 i = OPND(m->g->strip[ss]);
487 assert(0 < i && i <= m->g->nsub);
488 m->pmatch[i].rm_so = sp - m->offp;
489 break;
490 case ORPAREN:
491 i = OPND(m->g->strip[ss]);
492 assert(0 < i && i <= m->g->nsub);
493 m->pmatch[i].rm_eo = sp - m->offp;
494 break;
495 default: /* uh oh */
496 assert(nope);
497 break;
498 }
499 }
500
501 assert(sp == stop);
502 return(sp);
503 }
504
505 /*
506 - backref - figure out what matched what, figuring in back references
507 == static char *backref(register struct match *m, char *start, \
508 == char *stop, sopno startst, sopno stopst, sopno lev);
509 */
510 static char * /* == stop (success) or NULL (failure) */
511 backref(m, start, stop, startst, stopst, lev)
512 register struct match *m;
513 char *start;
514 char *stop;
515 sopno startst;
516 sopno stopst;
517 sopno lev; /* PLUS nesting level */
518 {
519 register int i;
520 register sopno ss; /* start sop of current subRE */
521 register char *sp; /* start of string matched by it */
522 register sopno ssub; /* start sop of subsubRE */
523 register sopno esub; /* end sop of subsubRE */
524 register char *ssp; /* start of string matched by subsubRE */
525 register char *dp;
526 register size_t len;
527 register int hard;
528 register sop s;
529 register regoff_t offsave;
530 register cset *cs;
531
532 AT("back", start, stop, startst, stopst);
533 sp = start;
534
535 /* get as far as we can with easy stuff */
536 hard = 0;
537 for (ss = startst; !hard && ss < stopst; ss++)
538 switch (OP(s = m->g->strip[ss])) {
539 case OCHAR:
540 if (sp == stop || *sp++ != (char)OPND(s))
541 return(NULL);
542 break;
543 case OANY:
544 if (sp == stop)
545 return(NULL);
546 sp++;
547 break;
548 case OANYOF:
549 cs = &m->g->sets[OPND(s)];
550 if (sp == stop || !CHIN(cs, *sp++))
551 return(NULL);
552 break;
553 case OBOL:
554 if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
555 (sp < m->endp && *(sp-1) == '\n' &&
556 (m->g->cflags&REG_NEWLINE)) )
557 { /* yes */ }
558 else
559 return(NULL);
560 break;
561 case OEOL:
562 if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
563 (sp < m->endp && *sp == '\n' &&
564 (m->g->cflags&REG_NEWLINE)) )
565 { /* yes */ }
566 else
567 return(NULL);
568 break;
569 case OBOW:
570 if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
571 (sp < m->endp && *(sp-1) == '\n' &&
572 (m->g->cflags&REG_NEWLINE)) ||
573 (sp > m->beginp &&
574 !ISWORD(*(sp-1))) ) &&
575 (sp < m->endp && ISWORD(*sp)) )
576 { /* yes */ }
577 else
578 return(NULL);
579 break;
580 case OEOW:
581 if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
582 (sp < m->endp && *sp == '\n' &&
583 (m->g->cflags&REG_NEWLINE)) ||
584 (sp < m->endp && !ISWORD(*sp)) ) &&
585 (sp > m->beginp && ISWORD(*(sp-1))) )
586 { /* yes */ }
587 else
588 return(NULL);
589 break;
590 case O_QUEST:
591 break;
592 case OOR1: /* matches null but needs to skip */
593 ss++;
594 s = m->g->strip[ss];
595 do {
596 assert(OP(s) == OOR2);
597 ss += OPND(s);
598 } while (OP(s = m->g->strip[ss]) != O_CH);
599 /* note that the ss++ gets us past the O_CH */
600 break;
601 default: /* have to make a choice */
602 hard = 1;
603 break;
604 }
605 if (!hard) { /* that was it! */
606 if (sp != stop)
607 return(NULL);
608 return(sp);
609 }
610 ss--; /* adjust for the for's final increment */
611
612 /* the hard stuff */
613 AT("hard", sp, stop, ss, stopst);
614 s = m->g->strip[ss];
615 switch (OP(s)) {
616 case OBACK_: /* the vilest depths */
617 i = OPND(s);
618 assert(0 < i && i <= m->g->nsub);
619 if (m->pmatch[i].rm_eo == -1)
620 return(NULL);
621 assert(m->pmatch[i].rm_so != -1);
622 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
623 assert(stop - m->beginp >= len);
624 if (sp > stop - len)
625 return(NULL); /* not enough left to match */
626 ssp = m->offp + m->pmatch[i].rm_so;
627 if (memcmp(sp, ssp, len) != 0)
628 return(NULL);
629 while (m->g->strip[ss] != SOP(O_BACK, i))
630 ss++;
631 return(backref(m, sp+len, stop, ss+1, stopst, lev));
632 break;
633 case OQUEST_: /* to null or not */
634 dp = backref(m, sp, stop, ss+1, stopst, lev);
635 if (dp != NULL)
636 return(dp); /* not */
637 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
638 break;
639 case OPLUS_:
640 assert(m->lastpos != NULL);
641 assert(lev+1 <= m->g->nplus);
642 m->lastpos[lev+1] = sp;
643 return(backref(m, sp, stop, ss+1, stopst, lev+1));
644 break;
645 case O_PLUS:
646 if (sp == m->lastpos[lev]) /* last pass matched null */
647 return(backref(m, sp, stop, ss+1, stopst, lev-1));
648 /* try another pass */
649 m->lastpos[lev] = sp;
650 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
651 if (dp == NULL)
652 return(backref(m, sp, stop, ss+1, stopst, lev-1));
653 else
654 return(dp);
655 break;
656 case OCH_: /* find the right one, if any */
657 ssub = ss + 1;
658 esub = ss + OPND(s) - 1;
659 assert(OP(m->g->strip[esub]) == OOR1);
660 for (;;) { /* find first matching branch */
661 dp = backref(m, sp, stop, ssub, esub, lev);
662 if (dp != NULL)
663 return(dp);
664 /* that one missed, try next one */
665 if (OP(m->g->strip[esub]) == O_CH)
666 return(NULL); /* there is none */
667 esub++;
668 assert(OP(m->g->strip[esub]) == OOR2);
669 ssub = esub + 1;
670 esub += OPND(m->g->strip[esub]);
671 if (OP(m->g->strip[esub]) == OOR2)
672 esub--;
673 else
674 assert(OP(m->g->strip[esub]) == O_CH);
675 }
676 break;
677 case OLPAREN: /* must undo assignment if rest fails */
678 i = OPND(s);
679 assert(0 < i && i <= m->g->nsub);
680 offsave = m->pmatch[i].rm_so;
681 m->pmatch[i].rm_so = sp - m->offp;
682 dp = backref(m, sp, stop, ss+1, stopst, lev);
683 if (dp != NULL)
684 return(dp);
685 m->pmatch[i].rm_so = offsave;
686 return(NULL);
687 break;
688 case ORPAREN: /* must undo assignment if rest fails */
689 i = OPND(s);
690 assert(0 < i && i <= m->g->nsub);
691 offsave = m->pmatch[i].rm_eo;
692 m->pmatch[i].rm_eo = sp - m->offp;
693 dp = backref(m, sp, stop, ss+1, stopst, lev);
694 if (dp != NULL)
695 return(dp);
696 m->pmatch[i].rm_eo = offsave;
697 return(NULL);
698 break;
699 default: /* uh oh */
700 assert(nope);
701 break;
702 }
703
704 /* "can't happen" */
705 assert(nope);
706 /* NOTREACHED */
707 }
708
709 /*
710 - fast - step through the string at top speed
711 == static char *fast(register struct match *m, char *start, \
712 == char *stop, sopno startst, sopno stopst);
713 */
714 static char * /* where tentative match ended, or NULL */
715 fast(m, start, stop, startst, stopst)
716 register struct match *m;
717 char *start;
718 char *stop;
719 sopno startst;
720 sopno stopst;
721 {
722 register states st = m->st;
723 register states fresh = m->fresh;
724 register states tmp = m->tmp;
725 register char *p = start;
726 register int c = (start == m->beginp) ? OUT : *(start-1);
727 register int lastc; /* previous c */
728 register int flagch;
729 register int i;
730 register char *coldp; /* last p after which no match was underway */
731
732 CLEAR(st);
733 SET1(st, startst);
734 st = step(m->g, startst, stopst, st, NOTHING, st);
735 ASSIGN(fresh, st);
736 SP("start", st, *p);
737 coldp = NULL;
738 for (;;) {
739 /* next character */
740 lastc = c;
741 c = (p == m->endp) ? OUT : *p;
742 if (EQ(st, fresh))
743 coldp = p;
744
745 /* is there an EOL and/or BOL between lastc and c? */
746 flagch = '\0';
747 i = 0;
748 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
749 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
750 flagch = BOL;
751 i = m->g->nbol;
752 }
753 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
754 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
755 flagch = (flagch == BOL) ? BOLEOL : EOL;
756 i += m->g->neol;
757 }
758 if (i != 0) {
759 for (; i > 0; i--)
760 st = step(m->g, startst, stopst, st, flagch, st);
761 SP("boleol", st, c);
762 }
763
764 /* how about a word boundary? */
765 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
766 (c != OUT && ISWORD(c)) ) {
767 flagch = BOW;
768 }
769 if ( (lastc != OUT && ISWORD(lastc)) &&
770 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
771 flagch = EOW;
772 }
773 if (flagch == BOW || flagch == EOW) {
774 st = step(m->g, startst, stopst, st, flagch, st);
775 SP("boweow", st, c);
776 }
777
778 /* are we done? */
779 if (ISSET(st, stopst) || p == stop)
780 break; /* NOTE BREAK OUT */
781
782 /* no, we must deal with this character */
783 ASSIGN(tmp, st);
784 ASSIGN(st, fresh);
785 assert(c != OUT);
786 st = step(m->g, startst, stopst, tmp, c, st);
787 SP("aft", st, c);
788 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
789 p++;
790 }
791
792 assert(coldp != NULL);
793 m->coldp = coldp;
794 if (ISSET(st, stopst))
795 return(p+1);
796 else
797 return(NULL);
798 }
799
800 /*
801 - slow - step through the string more deliberately
802 == static char *slow(register struct match *m, char *start, \
803 == char *stop, sopno startst, sopno stopst);
804 */
805 static char * /* where it ended */
806 slow(m, start, stop, startst, stopst)
807 register struct match *m;
808 char *start;
809 char *stop;
810 sopno startst;
811 sopno stopst;
812 {
813 register states st = m->st;
814 register states empty = m->empty;
815 register states tmp = m->tmp;
816 register char *p = start;
817 register int c = (start == m->beginp) ? OUT : *(start-1);
818 register int lastc; /* previous c */
819 register int flagch;
820 register int i;
821 register char *matchp; /* last p at which a match ended */
822
823 AT("slow", start, stop, startst, stopst);
824 CLEAR(st);
825 SET1(st, startst);
826 SP("sstart", st, *p);
827 st = step(m->g, startst, stopst, st, NOTHING, st);
828 matchp = NULL;
829 for (;;) {
830 /* next character */
831 lastc = c;
832 c = (p == m->endp) ? OUT : *p;
833
834 /* is there an EOL and/or BOL between lastc and c? */
835 flagch = '\0';
836 i = 0;
837 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
838 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
839 flagch = BOL;
840 i = m->g->nbol;
841 }
842 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
843 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
844 flagch = (flagch == BOL) ? BOLEOL : EOL;
845 i += m->g->neol;
846 }
847 if (i != 0) {
848 for (; i > 0; i--)
849 st = step(m->g, startst, stopst, st, flagch, st);
850 SP("sboleol", st, c);
851 }
852
853 /* how about a word boundary? */
854 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
855 (c != OUT && ISWORD(c)) ) {
856 flagch = BOW;
857 }
858 if ( (lastc != OUT && ISWORD(lastc)) &&
859 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
860 flagch = EOW;
861 }
862 if (flagch == BOW || flagch == EOW) {
863 st = step(m->g, startst, stopst, st, flagch, st);
864 SP("sboweow", st, c);
865 }
866
867 /* are we done? */
868 if (ISSET(st, stopst))
869 matchp = p;
870 if (EQ(st, empty) || p == stop)
871 break; /* NOTE BREAK OUT */
872
873 /* no, we must deal with this character */
874 ASSIGN(tmp, st);
875 ASSIGN(st, empty);
876 assert(c != OUT);
877 st = step(m->g, startst, stopst, tmp, c, st);
878 SP("saft", st, c);
879 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
880 p++;
881 }
882
883 return(matchp);
884 }
885
886
887 /*
888 - step - map set of states reachable before char to set reachable after
889 == static states step(register struct re_guts *g, sopno start, sopno stop, \
890 == register states bef, int ch, register states aft);
891 == #define BOL (OUT+1)
892 == #define EOL (BOL+1)
893 == #define BOLEOL (BOL+2)
894 == #define NOTHING (BOL+3)
895 == #define BOW (BOL+4)
896 == #define EOW (BOL+5)
897 == #define CODEMAX (BOL+5) // highest code used
898 == #define NONCHAR(c) ((c) > CHAR_MAX)
899 == #define NNONCHAR (CODEMAX-CHAR_MAX)
900 */
901 static states
902 step(g, start, stop, bef, ch, aft)
903 register struct re_guts *g;
904 sopno start; /* start state within strip */
905 sopno stop; /* state after stop state within strip */
906 register states bef; /* states reachable before */
907 int ch; /* character or NONCHAR code */
908 register states aft; /* states already known reachable after */
909 {
910 register cset *cs;
911 register sop s;
912 register sopno pc;
913 register onestate here; /* note, macros know this name */
914 register sopno look;
915 register int i;
916
917 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
918 s = g->strip[pc];
919 switch (OP(s)) {
920 case OEND:
921 assert(pc == stop-1);
922 break;
923 case OCHAR:
924 /* only characters can match */
925 assert(!NONCHAR(ch) || ch != (char)OPND(s));
926 if (ch == (char)OPND(s))
927 FWD(aft, bef, 1);
928 break;
929 case OBOL:
930 if (ch == BOL || ch == BOLEOL)
931 FWD(aft, bef, 1);
932 break;
933 case OEOL:
934 if (ch == EOL || ch == BOLEOL)
935 FWD(aft, bef, 1);
936 break;
937 case OBOW:
938 if (ch == BOW)
939 FWD(aft, bef, 1);
940 break;
941 case OEOW:
942 if (ch == EOW)
943 FWD(aft, bef, 1);
944 break;
945 case OANY:
946 if (!NONCHAR(ch))
947 FWD(aft, bef, 1);
948 break;
949 case OANYOF:
950 cs = &g->sets[OPND(s)];
951 if (!NONCHAR(ch) && CHIN(cs, ch))
952 FWD(aft, bef, 1);
953 break;
954 case OBACK_: /* ignored here */
955 case O_BACK:
956 FWD(aft, aft, 1);
957 break;
958 case OPLUS_: /* forward, this is just an empty */
959 FWD(aft, aft, 1);
960 break;
961 case O_PLUS: /* both forward and back */
962 FWD(aft, aft, 1);
963 i = ISSETBACK(aft, OPND(s));
964 BACK(aft, aft, OPND(s));
965 if (!i && ISSETBACK(aft, OPND(s))) {
966 /* oho, must reconsider loop body */
967 pc -= OPND(s) + 1;
968 INIT(here, pc);
969 }
970 break;
971 case OQUEST_: /* two branches, both forward */
972 FWD(aft, aft, 1);
973 FWD(aft, aft, OPND(s));
974 break;
975 case O_QUEST: /* just an empty */
976 FWD(aft, aft, 1);
977 break;
978 case OLPAREN: /* not significant here */
979 case ORPAREN:
980 FWD(aft, aft, 1);
981 break;
982 case OCH_: /* mark the first two branches */
983 FWD(aft, aft, 1);
984 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
985 FWD(aft, aft, OPND(s));
986 break;
987 case OOR1: /* done a branch, find the O_CH */
988 if (ISSTATEIN(aft, here)) {
989 for (look = 1;
990 OP(s = g->strip[pc+look]) != O_CH;
991 look += OPND(s))
992 assert(OP(s) == OOR2);
993 FWD(aft, aft, look);
994 }
995 break;
996 case OOR2: /* propagate OCH_'s marking */
997 FWD(aft, aft, 1);
998 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
999 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1000 FWD(aft, aft, OPND(s));
1001 }
1002 break;
1003 case O_CH: /* just empty */
1004 FWD(aft, aft, 1);
1005 break;
1006 default: /* ooooops... */
1007 assert(nope);
1008 break;
1009 }
1010 }
1011
1012 return(aft);
1013 }
1014
1015 #ifdef REDEBUG
1016 /*
1017 - print - print a set of states
1018 == #ifdef REDEBUG
1019 == static void print(struct match *m, char *caption, states st, \
1020 == int ch, FILE *d);
1021 == #endif
1022 */
1023 static void
1024 print(m, caption, st, ch, d)
1025 struct match *m;
1026 char *caption;
1027 states st;
1028 int ch;
1029 FILE *d;
1030 {
1031 register struct re_guts *g = m->g;
1032 register int i;
1033 register int first = 1;
1034
1035 if (!(m->eflags&REG_TRACE))
1036 return;
1037
1038 fprintf(d, "%s", caption);
1039 if (ch != '\0')
1040 fprintf(d, " %s", pchar(ch));
1041 for (i = 0; i < g->nstates; i++)
1042 if (ISSET(st, i)) {
1043 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1044 first = 0;
1045 }
1046 fprintf(d, "\n");
1047 }
1048
1049 /*
1050 - at - print current situation
1051 == #ifdef REDEBUG
1052 == static void at(struct match *m, char *title, char *start, char *stop, \
1053 == sopno startst, sopno stopst);
1054 == #endif
1055 */
1056 static void
1057 at(m, title, start, stop, startst, stopst)
1058 struct match *m;
1059 char *title;
1060 char *start;
1061 char *stop;
1062 sopno startst;
1063 sopno stopst;
1064 {
1065 if (!(m->eflags&REG_TRACE))
1066 return;
1067
1068 printf("%s %s-", title, pchar(*start));
1069 printf("%s ", pchar(*stop));
1070 printf("%ld-%ld\n", (long)startst, (long)stopst);
1071 }
1072
1073 #ifndef PCHARDONE
1074 #define PCHARDONE /* never again */
1075 /*
1076 - pchar - make a character printable
1077 == #ifdef REDEBUG
1078 == static char *pchar(int ch);
1079 == #endif
1080 *
1081 * Is this identical to regchar() over in debug.c? Well, yes. But a
1082 * duplicate here avoids having a debugging-capable regexec.o tied to
1083 * a matching debug.o, and this is convenient. It all disappears in
1084 * the non-debug compilation anyway, so it doesn't matter much.
1085 */
1086 static char * /* -> representation */
1087 pchar(ch)
1088 int ch;
1089 {
1090 static char pbuf[10];
1091
1092 if (isprint(ch) || ch == ' ')
1093 sprintf(pbuf, "%c", ch);
1094 else
1095 sprintf(pbuf, "\\%o", ch);
1096 return(pbuf);
1097 }
1098 #endif
1099 #endif
1100
1101 #undef matcher
1102 #undef fast
1103 #undef slow
1104 #undef dissect
1105 #undef backref
1106 #undef step
1107 #undef print
1108 #undef at
1109 #undef match