]> git.saurik.com Git - apple/libc.git/blob - regex/engine.c
4bcabfb7a85aa2f040058c586f8dcb12692815b6
[apple/libc.git] / regex / engine.c
1 /*
2 * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /*
26 * Copyright (c) 1992, 1993, 1994
27 * The Regents of the University of California. All rights reserved.
28 *
29 * This code is derived from software contributed to Berkeley by
30 * Henry Spencer.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 */
60
61 /*
62 * The matching engine and friends. This file is #included by regexec.c
63 * after suitable #defines of a variety of macros used herein, so that
64 * different state representations can be used without duplicating masses
65 * of code.
66 */
67
68 #ifdef SNAMES
69 #define matcher smatcher
70 #define fast sfast
71 #define slow sslow
72 #define dissect sdissect
73 #define backref sbackref
74 #define step sstep
75 #define print sprint
76 #define at sat
77 #define match smat
78 #endif
79 #ifdef LNAMES
80 #define matcher lmatcher
81 #define fast lfast
82 #define slow lslow
83 #define dissect ldissect
84 #define backref lbackref
85 #define step lstep
86 #define print lprint
87 #define at lat
88 #define match lmat
89 #endif
90
91 /* another structure passed up and down to avoid zillions of parameters */
92 struct match {
93 struct re_guts *g;
94 int eflags;
95 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
96 char *offp; /* offsets work from here */
97 char *beginp; /* start of string -- virtual NUL precedes */
98 char *endp; /* end of string -- virtual NUL here */
99 char *coldp; /* can be no match starting before here */
100 char **lastpos; /* [nplus+1] */
101 STATEVARS;
102 states st; /* current states */
103 states fresh; /* states for a fresh start */
104 states tmp; /* temporary */
105 states empty; /* empty set of states */
106 };
107
108 /* ========= begin header generated by ./mkh ========= */
109 #ifdef __cplusplus
110 extern "C" {
111 #endif
112
113 /* === engine.c === */
114 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
115 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
116 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
117 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
118 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
119 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
120 #define BOL (OUT+1)
121 #define EOL (BOL+1)
122 #define BOLEOL (BOL+2)
123 #define NOTHING (BOL+3)
124 #define BOW (BOL+4)
125 #define EOW (BOL+5)
126 #define CODEMAX (BOL+5) /* highest code used */
127 #define NONCHAR(c) ((c) > CHAR_MAX)
128 #define NNONCHAR (CODEMAX-CHAR_MAX)
129 #ifdef REDEBUG
130 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
131 #endif
132 #ifdef REDEBUG
133 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
134 #endif
135 #ifdef REDEBUG
136 static char *pchar __P((int ch));
137 #endif
138
139 #ifdef __cplusplus
140 }
141 #endif
142 /* ========= end header generated by ./mkh ========= */
143
144 #ifdef REDEBUG
145 #define SP(t, s, c) print(m, t, s, c, stdout)
146 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
147 #define NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
148 #else
149 #define SP(t, s, c) /* nothing */
150 #define AT(t, p1, p2, s1, s2) /* nothing */
151 #define NOTE(s) /* nothing */
152 #endif
153
154 /*
155 - matcher - the actual matching engine
156 == static int matcher(register struct re_guts *g, char *string, \
157 == size_t nmatch, regmatch_t pmatch[], int eflags);
158 */
159 static int /* 0 success, REG_NOMATCH failure */
160 matcher(g, string, nmatch, pmatch, eflags)
161 register struct re_guts *g;
162 char *string;
163 size_t nmatch;
164 regmatch_t pmatch[];
165 int eflags;
166 {
167 register char *endp;
168 register int i;
169 struct match mv;
170 register struct match *m = &mv;
171 register char *dp;
172 const register sopno gf = g->firststate+1; /* +1 for OEND */
173 const register sopno gl = g->laststate;
174 char *start;
175 char *stop;
176
177 /* simplify the situation where possible */
178 if (g->cflags&REG_NOSUB)
179 nmatch = 0;
180 if (eflags&REG_STARTEND) {
181 start = string + pmatch[0].rm_so;
182 stop = string + pmatch[0].rm_eo;
183 } else {
184 start = string;
185 stop = start + strlen(start);
186 }
187 if (stop < start)
188 return(REG_INVARG);
189
190 /* prescreening; this does wonders for this rather slow code */
191 if (g->must != NULL) {
192 for (dp = start; dp < stop; dp++)
193 if (*dp == g->must[0] && stop - dp >= g->mlen &&
194 memcmp(dp, g->must, (size_t)g->mlen) == 0)
195 break;
196 if (dp == stop) /* we didn't find g->must */
197 return(REG_NOMATCH);
198 }
199
200 /* match struct setup */
201 m->g = g;
202 m->eflags = eflags;
203 m->pmatch = NULL;
204 m->lastpos = NULL;
205 m->offp = string;
206 m->beginp = start;
207 m->endp = stop;
208 STATESETUP(m, 4);
209 SETUP(m->st);
210 SETUP(m->fresh);
211 SETUP(m->tmp);
212 SETUP(m->empty);
213 CLEAR(m->empty);
214
215 /* this loop does only one repetition except for backrefs */
216 for (;;) {
217 endp = fast(m, start, stop, gf, gl);
218 if (endp == NULL) { /* a miss */
219 STATETEARDOWN(m);
220 return(REG_NOMATCH);
221 }
222 if (nmatch == 0 && !g->backrefs)
223 break; /* no further info needed */
224
225 /* where? */
226 assert(m->coldp != NULL);
227 for (;;) {
228 NOTE("finding start");
229 endp = slow(m, m->coldp, stop, gf, gl);
230 if (endp != NULL)
231 break;
232 assert(m->coldp < m->endp);
233 m->coldp++;
234 }
235 if (nmatch == 1 && !g->backrefs)
236 break; /* no further info needed */
237
238 /* oh my, he wants the subexpressions... */
239 if (m->pmatch == NULL)
240 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
241 sizeof(regmatch_t));
242 if (m->pmatch == NULL) {
243 STATETEARDOWN(m);
244 return(REG_ESPACE);
245 }
246 for (i = 1; i <= m->g->nsub; i++)
247 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
248 if (!g->backrefs && !(m->eflags&REG_BACKR)) {
249 NOTE("dissecting");
250 dp = dissect(m, m->coldp, endp, gf, gl);
251 } else {
252 if (g->nplus > 0 && m->lastpos == NULL)
253 m->lastpos = (char **)malloc((g->nplus+1) *
254 sizeof(char *));
255 if (g->nplus > 0 && m->lastpos == NULL) {
256 free(m->pmatch);
257 STATETEARDOWN(m);
258 return(REG_ESPACE);
259 }
260 NOTE("backref dissect");
261 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
262 }
263 if (dp != NULL)
264 break;
265
266 /* uh-oh... we couldn't find a subexpression-level match */
267 assert(g->backrefs); /* must be back references doing it */
268 assert(g->nplus == 0 || m->lastpos != NULL);
269 for (;;) {
270 if (dp != NULL || endp <= m->coldp)
271 break; /* defeat */
272 NOTE("backoff");
273 endp = slow(m, m->coldp, endp-1, gf, gl);
274 if (endp == NULL)
275 break; /* defeat */
276 /* try it on a shorter possibility */
277 #ifndef NDEBUG
278 for (i = 1; i <= m->g->nsub; i++) {
279 assert(m->pmatch[i].rm_so == -1);
280 assert(m->pmatch[i].rm_eo == -1);
281 }
282 #endif
283 NOTE("backoff dissect");
284 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
285 }
286 assert(dp == NULL || dp == endp);
287 if (dp != NULL) /* found a shorter one */
288 break;
289
290 /* despite initial appearances, there is no match here */
291 NOTE("false alarm");
292 start = m->coldp + 1; /* recycle starting later */
293 assert(start <= stop);
294 }
295
296 /* fill in the details if requested */
297 if (nmatch > 0) {
298 pmatch[0].rm_so = m->coldp - m->offp;
299 pmatch[0].rm_eo = endp - m->offp;
300 }
301 if (nmatch > 1) {
302 assert(m->pmatch != NULL);
303 for (i = 1; i < nmatch; i++)
304 if (i <= m->g->nsub)
305 pmatch[i] = m->pmatch[i];
306 else {
307 pmatch[i].rm_so = -1;
308 pmatch[i].rm_eo = -1;
309 }
310 }
311
312 if (m->pmatch != NULL)
313 free((char *)m->pmatch);
314 if (m->lastpos != NULL)
315 free((char *)m->lastpos);
316 STATETEARDOWN(m);
317 return(0);
318 }
319
320 /*
321 - dissect - figure out what matched what, no back references
322 == static char *dissect(register struct match *m, char *start, \
323 == char *stop, sopno startst, sopno stopst);
324 */
325 static char * /* == stop (success) always */
326 dissect(m, start, stop, startst, stopst)
327 register struct match *m;
328 char *start;
329 char *stop;
330 sopno startst;
331 sopno stopst;
332 {
333 register int i;
334 register sopno ss; /* start sop of current subRE */
335 register sopno es; /* end sop of current subRE */
336 register char *sp; /* start of string matched by it */
337 register char *stp; /* string matched by it cannot pass here */
338 register char *rest; /* start of rest of string */
339 register char *tail; /* string unmatched by rest of RE */
340 register sopno ssub; /* start sop of subsubRE */
341 register sopno esub; /* end sop of subsubRE */
342 register char *ssp; /* start of string matched by subsubRE */
343 register char *sep; /* end of string matched by subsubRE */
344 register char *oldssp; /* previous ssp */
345 register char *dp;
346
347 AT("diss", start, stop, startst, stopst);
348 sp = start;
349 for (ss = startst; ss < stopst; ss = es) {
350 /* identify end of subRE */
351 es = ss;
352 switch (OP(m->g->strip[es])) {
353 case OPLUS_:
354 case OQUEST_:
355 es += OPND(m->g->strip[es]);
356 break;
357 case OCH_:
358 while (OP(m->g->strip[es]) != O_CH)
359 es += OPND(m->g->strip[es]);
360 break;
361 }
362 es++;
363
364 /* figure out what it matched */
365 switch (OP(m->g->strip[ss])) {
366 case OEND:
367 assert(nope);
368 break;
369 case OCHAR:
370 sp++;
371 break;
372 case OBOL:
373 case OEOL:
374 case OBOW:
375 case OEOW:
376 break;
377 case OANY:
378 case OANYOF:
379 sp++;
380 break;
381 case OBACK_:
382 case O_BACK:
383 assert(nope);
384 break;
385 /* cases where length of match is hard to find */
386 case OQUEST_:
387 stp = stop;
388 for (;;) {
389 /* how long could this one be? */
390 rest = slow(m, sp, stp, ss, es);
391 assert(rest != NULL); /* it did match */
392 /* could the rest match the rest? */
393 tail = slow(m, rest, stop, es, stopst);
394 if (tail == stop)
395 break; /* yes! */
396 /* no -- try a shorter match for this one */
397 stp = rest - 1;
398 assert(stp >= sp); /* it did work */
399 }
400 ssub = ss + 1;
401 esub = es - 1;
402 /* did innards match? */
403 if (slow(m, sp, rest, ssub, esub) != NULL) {
404 dp = dissect(m, sp, rest, ssub, esub);
405 assert(dp == rest);
406 } else /* no */
407 assert(sp == rest);
408 sp = rest;
409 break;
410 case OPLUS_:
411 stp = stop;
412 for (;;) {
413 /* how long could this one be? */
414 rest = slow(m, sp, stp, ss, es);
415 assert(rest != NULL); /* it did match */
416 /* could the rest match the rest? */
417 tail = slow(m, rest, stop, es, stopst);
418 if (tail == stop)
419 break; /* yes! */
420 /* no -- try a shorter match for this one */
421 stp = rest - 1;
422 assert(stp >= sp); /* it did work */
423 }
424 ssub = ss + 1;
425 esub = es - 1;
426 ssp = sp;
427 oldssp = ssp;
428 for (;;) { /* find last match of innards */
429 sep = slow(m, ssp, rest, ssub, esub);
430 if (sep == NULL || sep == ssp)
431 break; /* failed or matched null */
432 oldssp = ssp; /* on to next try */
433 ssp = sep;
434 }
435 if (sep == NULL) {
436 /* last successful match */
437 sep = ssp;
438 ssp = oldssp;
439 }
440 assert(sep == rest); /* must exhaust substring */
441 assert(slow(m, ssp, sep, ssub, esub) == rest);
442 dp = dissect(m, ssp, sep, ssub, esub);
443 assert(dp == sep);
444 sp = rest;
445 break;
446 case OCH_:
447 stp = stop;
448 for (;;) {
449 /* how long could this one be? */
450 rest = slow(m, sp, stp, ss, es);
451 assert(rest != NULL); /* it did match */
452 /* could the rest match the rest? */
453 tail = slow(m, rest, stop, es, stopst);
454 if (tail == stop)
455 break; /* yes! */
456 /* no -- try a shorter match for this one */
457 stp = rest - 1;
458 assert(stp >= sp); /* it did work */
459 }
460 ssub = ss + 1;
461 esub = ss + OPND(m->g->strip[ss]) - 1;
462 assert(OP(m->g->strip[esub]) == OOR1);
463 for (;;) { /* find first matching branch */
464 if (slow(m, sp, rest, ssub, esub) == rest)
465 break; /* it matched all of it */
466 /* that one missed, try next one */
467 assert(OP(m->g->strip[esub]) == OOR1);
468 esub++;
469 assert(OP(m->g->strip[esub]) == OOR2);
470 ssub = esub + 1;
471 esub += OPND(m->g->strip[esub]);
472 if (OP(m->g->strip[esub]) == OOR2)
473 esub--;
474 else
475 assert(OP(m->g->strip[esub]) == O_CH);
476 }
477 dp = dissect(m, sp, rest, ssub, esub);
478 assert(dp == rest);
479 sp = rest;
480 break;
481 case O_PLUS:
482 case O_QUEST:
483 case OOR1:
484 case OOR2:
485 case O_CH:
486 assert(nope);
487 break;
488 case OLPAREN:
489 i = OPND(m->g->strip[ss]);
490 assert(0 < i && i <= m->g->nsub);
491 m->pmatch[i].rm_so = sp - m->offp;
492 break;
493 case ORPAREN:
494 i = OPND(m->g->strip[ss]);
495 assert(0 < i && i <= m->g->nsub);
496 m->pmatch[i].rm_eo = sp - m->offp;
497 break;
498 default: /* uh oh */
499 assert(nope);
500 break;
501 }
502 }
503
504 assert(sp == stop);
505 return(sp);
506 }
507
508 /*
509 - backref - figure out what matched what, figuring in back references
510 == static char *backref(register struct match *m, char *start, \
511 == char *stop, sopno startst, sopno stopst, sopno lev);
512 */
513 static char * /* == stop (success) or NULL (failure) */
514 backref(m, start, stop, startst, stopst, lev)
515 register struct match *m;
516 char *start;
517 char *stop;
518 sopno startst;
519 sopno stopst;
520 sopno lev; /* PLUS nesting level */
521 {
522 register int i;
523 register sopno ss; /* start sop of current subRE */
524 register char *sp; /* start of string matched by it */
525 register sopno ssub; /* start sop of subsubRE */
526 register sopno esub; /* end sop of subsubRE */
527 register char *ssp; /* start of string matched by subsubRE */
528 register char *dp;
529 register size_t len;
530 register int hard;
531 register sop s;
532 register regoff_t offsave;
533 register cset *cs;
534
535 AT("back", start, stop, startst, stopst);
536 sp = start;
537
538 /* get as far as we can with easy stuff */
539 hard = 0;
540 for (ss = startst; !hard && ss < stopst; ss++)
541 switch (OP(s = m->g->strip[ss])) {
542 case OCHAR:
543 if (sp == stop || *sp++ != (char)OPND(s))
544 return(NULL);
545 break;
546 case OANY:
547 if (sp == stop)
548 return(NULL);
549 sp++;
550 break;
551 case OANYOF:
552 cs = &m->g->sets[OPND(s)];
553 if (sp == stop || !CHIN(cs, *sp++))
554 return(NULL);
555 break;
556 case OBOL:
557 if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
558 (sp < m->endp && *(sp-1) == '\n' &&
559 (m->g->cflags&REG_NEWLINE)) )
560 { /* yes */ }
561 else
562 return(NULL);
563 break;
564 case OEOL:
565 if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
566 (sp < m->endp && *sp == '\n' &&
567 (m->g->cflags&REG_NEWLINE)) )
568 { /* yes */ }
569 else
570 return(NULL);
571 break;
572 case OBOW:
573 if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
574 (sp < m->endp && *(sp-1) == '\n' &&
575 (m->g->cflags&REG_NEWLINE)) ||
576 (sp > m->beginp &&
577 !ISWORD(*(sp-1))) ) &&
578 (sp < m->endp && ISWORD(*sp)) )
579 { /* yes */ }
580 else
581 return(NULL);
582 break;
583 case OEOW:
584 if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
585 (sp < m->endp && *sp == '\n' &&
586 (m->g->cflags&REG_NEWLINE)) ||
587 (sp < m->endp && !ISWORD(*sp)) ) &&
588 (sp > m->beginp && ISWORD(*(sp-1))) )
589 { /* yes */ }
590 else
591 return(NULL);
592 break;
593 case O_QUEST:
594 break;
595 case OOR1: /* matches null but needs to skip */
596 ss++;
597 s = m->g->strip[ss];
598 do {
599 assert(OP(s) == OOR2);
600 ss += OPND(s);
601 } while (OP(s = m->g->strip[ss]) != O_CH);
602 /* note that the ss++ gets us past the O_CH */
603 break;
604 default: /* have to make a choice */
605 hard = 1;
606 break;
607 }
608 if (!hard) { /* that was it! */
609 if (sp != stop)
610 return(NULL);
611 return(sp);
612 }
613 ss--; /* adjust for the for's final increment */
614
615 /* the hard stuff */
616 AT("hard", sp, stop, ss, stopst);
617 s = m->g->strip[ss];
618 switch (OP(s)) {
619 case OBACK_: /* the vilest depths */
620 i = OPND(s);
621 assert(0 < i && i <= m->g->nsub);
622 if (m->pmatch[i].rm_eo == -1)
623 return(NULL);
624 assert(m->pmatch[i].rm_so != -1);
625 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
626 assert(stop - m->beginp >= len);
627 if (sp > stop - len)
628 return(NULL); /* not enough left to match */
629 ssp = m->offp + m->pmatch[i].rm_so;
630 if (memcmp(sp, ssp, len) != 0)
631 return(NULL);
632 while (m->g->strip[ss] != SOP(O_BACK, i))
633 ss++;
634 return(backref(m, sp+len, stop, ss+1, stopst, lev));
635 break;
636 case OQUEST_: /* to null or not */
637 dp = backref(m, sp, stop, ss+1, stopst, lev);
638 if (dp != NULL)
639 return(dp); /* not */
640 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
641 break;
642 case OPLUS_:
643 assert(m->lastpos != NULL);
644 assert(lev+1 <= m->g->nplus);
645 m->lastpos[lev+1] = sp;
646 return(backref(m, sp, stop, ss+1, stopst, lev+1));
647 break;
648 case O_PLUS:
649 if (sp == m->lastpos[lev]) /* last pass matched null */
650 return(backref(m, sp, stop, ss+1, stopst, lev-1));
651 /* try another pass */
652 m->lastpos[lev] = sp;
653 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
654 if (dp == NULL)
655 return(backref(m, sp, stop, ss+1, stopst, lev-1));
656 else
657 return(dp);
658 break;
659 case OCH_: /* find the right one, if any */
660 ssub = ss + 1;
661 esub = ss + OPND(s) - 1;
662 assert(OP(m->g->strip[esub]) == OOR1);
663 for (;;) { /* find first matching branch */
664 dp = backref(m, sp, stop, ssub, esub, lev);
665 if (dp != NULL)
666 return(dp);
667 /* that one missed, try next one */
668 if (OP(m->g->strip[esub]) == O_CH)
669 return(NULL); /* there is none */
670 esub++;
671 assert(OP(m->g->strip[esub]) == OOR2);
672 ssub = esub + 1;
673 esub += OPND(m->g->strip[esub]);
674 if (OP(m->g->strip[esub]) == OOR2)
675 esub--;
676 else
677 assert(OP(m->g->strip[esub]) == O_CH);
678 }
679 break;
680 case OLPAREN: /* must undo assignment if rest fails */
681 i = OPND(s);
682 assert(0 < i && i <= m->g->nsub);
683 offsave = m->pmatch[i].rm_so;
684 m->pmatch[i].rm_so = sp - m->offp;
685 dp = backref(m, sp, stop, ss+1, stopst, lev);
686 if (dp != NULL)
687 return(dp);
688 m->pmatch[i].rm_so = offsave;
689 return(NULL);
690 break;
691 case ORPAREN: /* must undo assignment if rest fails */
692 i = OPND(s);
693 assert(0 < i && i <= m->g->nsub);
694 offsave = m->pmatch[i].rm_eo;
695 m->pmatch[i].rm_eo = sp - m->offp;
696 dp = backref(m, sp, stop, ss+1, stopst, lev);
697 if (dp != NULL)
698 return(dp);
699 m->pmatch[i].rm_eo = offsave;
700 return(NULL);
701 break;
702 default: /* uh oh */
703 assert(nope);
704 break;
705 }
706
707 /* "can't happen" */
708 assert(nope);
709 /* NOTREACHED */
710 }
711
712 /*
713 - fast - step through the string at top speed
714 == static char *fast(register struct match *m, char *start, \
715 == char *stop, sopno startst, sopno stopst);
716 */
717 static char * /* where tentative match ended, or NULL */
718 fast(m, start, stop, startst, stopst)
719 register struct match *m;
720 char *start;
721 char *stop;
722 sopno startst;
723 sopno stopst;
724 {
725 register states st = m->st;
726 register states fresh = m->fresh;
727 register states tmp = m->tmp;
728 register char *p = start;
729 register int c = (start == m->beginp) ? OUT : *(start-1);
730 register int lastc; /* previous c */
731 register int flagch;
732 register int i;
733 register char *coldp; /* last p after which no match was underway */
734
735 CLEAR(st);
736 SET1(st, startst);
737 st = step(m->g, startst, stopst, st, NOTHING, st);
738 ASSIGN(fresh, st);
739 SP("start", st, *p);
740 coldp = NULL;
741 for (;;) {
742 /* next character */
743 lastc = c;
744 c = (p == m->endp) ? OUT : *p;
745 if (EQ(st, fresh))
746 coldp = p;
747
748 /* is there an EOL and/or BOL between lastc and c? */
749 flagch = '\0';
750 i = 0;
751 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
752 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
753 flagch = BOL;
754 i = m->g->nbol;
755 }
756 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
757 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
758 flagch = (flagch == BOL) ? BOLEOL : EOL;
759 i += m->g->neol;
760 }
761 if (i != 0) {
762 for (; i > 0; i--)
763 st = step(m->g, startst, stopst, st, flagch, st);
764 SP("boleol", st, c);
765 }
766
767 /* how about a word boundary? */
768 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
769 (c != OUT && ISWORD(c)) ) {
770 flagch = BOW;
771 }
772 if ( (lastc != OUT && ISWORD(lastc)) &&
773 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
774 flagch = EOW;
775 }
776 if (flagch == BOW || flagch == EOW) {
777 st = step(m->g, startst, stopst, st, flagch, st);
778 SP("boweow", st, c);
779 }
780
781 /* are we done? */
782 if (ISSET(st, stopst) || p == stop)
783 break; /* NOTE BREAK OUT */
784
785 /* no, we must deal with this character */
786 ASSIGN(tmp, st);
787 ASSIGN(st, fresh);
788 assert(c != OUT);
789 st = step(m->g, startst, stopst, tmp, c, st);
790 SP("aft", st, c);
791 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
792 p++;
793 }
794
795 assert(coldp != NULL);
796 m->coldp = coldp;
797 if (ISSET(st, stopst))
798 return(p+1);
799 else
800 return(NULL);
801 }
802
803 /*
804 - slow - step through the string more deliberately
805 == static char *slow(register struct match *m, char *start, \
806 == char *stop, sopno startst, sopno stopst);
807 */
808 static char * /* where it ended */
809 slow(m, start, stop, startst, stopst)
810 register struct match *m;
811 char *start;
812 char *stop;
813 sopno startst;
814 sopno stopst;
815 {
816 register states st = m->st;
817 register states empty = m->empty;
818 register states tmp = m->tmp;
819 register char *p = start;
820 register int c = (start == m->beginp) ? OUT : *(start-1);
821 register int lastc; /* previous c */
822 register int flagch;
823 register int i;
824 register char *matchp; /* last p at which a match ended */
825
826 AT("slow", start, stop, startst, stopst);
827 CLEAR(st);
828 SET1(st, startst);
829 SP("sstart", st, *p);
830 st = step(m->g, startst, stopst, st, NOTHING, st);
831 matchp = NULL;
832 for (;;) {
833 /* next character */
834 lastc = c;
835 c = (p == m->endp) ? OUT : *p;
836
837 /* is there an EOL and/or BOL between lastc and c? */
838 flagch = '\0';
839 i = 0;
840 if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
841 (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
842 flagch = BOL;
843 i = m->g->nbol;
844 }
845 if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
846 (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
847 flagch = (flagch == BOL) ? BOLEOL : EOL;
848 i += m->g->neol;
849 }
850 if (i != 0) {
851 for (; i > 0; i--)
852 st = step(m->g, startst, stopst, st, flagch, st);
853 SP("sboleol", st, c);
854 }
855
856 /* how about a word boundary? */
857 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
858 (c != OUT && ISWORD(c)) ) {
859 flagch = BOW;
860 }
861 if ( (lastc != OUT && ISWORD(lastc)) &&
862 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
863 flagch = EOW;
864 }
865 if (flagch == BOW || flagch == EOW) {
866 st = step(m->g, startst, stopst, st, flagch, st);
867 SP("sboweow", st, c);
868 }
869
870 /* are we done? */
871 if (ISSET(st, stopst))
872 matchp = p;
873 if (EQ(st, empty) || p == stop)
874 break; /* NOTE BREAK OUT */
875
876 /* no, we must deal with this character */
877 ASSIGN(tmp, st);
878 ASSIGN(st, empty);
879 assert(c != OUT);
880 st = step(m->g, startst, stopst, tmp, c, st);
881 SP("saft", st, c);
882 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
883 p++;
884 }
885
886 return(matchp);
887 }
888
889
890 /*
891 - step - map set of states reachable before char to set reachable after
892 == static states step(register struct re_guts *g, sopno start, sopno stop, \
893 == register states bef, int ch, register states aft);
894 == #define BOL (OUT+1)
895 == #define EOL (BOL+1)
896 == #define BOLEOL (BOL+2)
897 == #define NOTHING (BOL+3)
898 == #define BOW (BOL+4)
899 == #define EOW (BOL+5)
900 == #define CODEMAX (BOL+5) // highest code used
901 == #define NONCHAR(c) ((c) > CHAR_MAX)
902 == #define NNONCHAR (CODEMAX-CHAR_MAX)
903 */
904 static states
905 step(g, start, stop, bef, ch, aft)
906 register struct re_guts *g;
907 sopno start; /* start state within strip */
908 sopno stop; /* state after stop state within strip */
909 register states bef; /* states reachable before */
910 int ch; /* character or NONCHAR code */
911 register states aft; /* states already known reachable after */
912 {
913 register cset *cs;
914 register sop s;
915 register sopno pc;
916 register onestate here; /* note, macros know this name */
917 register sopno look;
918 register int i;
919
920 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
921 s = g->strip[pc];
922 switch (OP(s)) {
923 case OEND:
924 assert(pc == stop-1);
925 break;
926 case OCHAR:
927 /* only characters can match */
928 assert(!NONCHAR(ch) || ch != (char)OPND(s));
929 if (ch == (char)OPND(s))
930 FWD(aft, bef, 1);
931 break;
932 case OBOL:
933 if (ch == BOL || ch == BOLEOL)
934 FWD(aft, bef, 1);
935 break;
936 case OEOL:
937 if (ch == EOL || ch == BOLEOL)
938 FWD(aft, bef, 1);
939 break;
940 case OBOW:
941 if (ch == BOW)
942 FWD(aft, bef, 1);
943 break;
944 case OEOW:
945 if (ch == EOW)
946 FWD(aft, bef, 1);
947 break;
948 case OANY:
949 if (!NONCHAR(ch))
950 FWD(aft, bef, 1);
951 break;
952 case OANYOF:
953 cs = &g->sets[OPND(s)];
954 if (!NONCHAR(ch) && CHIN(cs, ch))
955 FWD(aft, bef, 1);
956 break;
957 case OBACK_: /* ignored here */
958 case O_BACK:
959 FWD(aft, aft, 1);
960 break;
961 case OPLUS_: /* forward, this is just an empty */
962 FWD(aft, aft, 1);
963 break;
964 case O_PLUS: /* both forward and back */
965 FWD(aft, aft, 1);
966 i = ISSETBACK(aft, OPND(s));
967 BACK(aft, aft, OPND(s));
968 if (!i && ISSETBACK(aft, OPND(s))) {
969 /* oho, must reconsider loop body */
970 pc -= OPND(s) + 1;
971 INIT(here, pc);
972 }
973 break;
974 case OQUEST_: /* two branches, both forward */
975 FWD(aft, aft, 1);
976 FWD(aft, aft, OPND(s));
977 break;
978 case O_QUEST: /* just an empty */
979 FWD(aft, aft, 1);
980 break;
981 case OLPAREN: /* not significant here */
982 case ORPAREN:
983 FWD(aft, aft, 1);
984 break;
985 case OCH_: /* mark the first two branches */
986 FWD(aft, aft, 1);
987 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
988 FWD(aft, aft, OPND(s));
989 break;
990 case OOR1: /* done a branch, find the O_CH */
991 if (ISSTATEIN(aft, here)) {
992 for (look = 1;
993 OP(s = g->strip[pc+look]) != O_CH;
994 look += OPND(s))
995 assert(OP(s) == OOR2);
996 FWD(aft, aft, look);
997 }
998 break;
999 case OOR2: /* propagate OCH_'s marking */
1000 FWD(aft, aft, 1);
1001 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1002 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1003 FWD(aft, aft, OPND(s));
1004 }
1005 break;
1006 case O_CH: /* just empty */
1007 FWD(aft, aft, 1);
1008 break;
1009 default: /* ooooops... */
1010 assert(nope);
1011 break;
1012 }
1013 }
1014
1015 return(aft);
1016 }
1017
1018 #ifdef REDEBUG
1019 /*
1020 - print - print a set of states
1021 == #ifdef REDEBUG
1022 == static void print(struct match *m, char *caption, states st, \
1023 == int ch, FILE *d);
1024 == #endif
1025 */
1026 static void
1027 print(m, caption, st, ch, d)
1028 struct match *m;
1029 char *caption;
1030 states st;
1031 int ch;
1032 FILE *d;
1033 {
1034 register struct re_guts *g = m->g;
1035 register int i;
1036 register int first = 1;
1037
1038 if (!(m->eflags&REG_TRACE))
1039 return;
1040
1041 fprintf(d, "%s", caption);
1042 if (ch != '\0')
1043 fprintf(d, " %s", pchar(ch));
1044 for (i = 0; i < g->nstates; i++)
1045 if (ISSET(st, i)) {
1046 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1047 first = 0;
1048 }
1049 fprintf(d, "\n");
1050 }
1051
1052 /*
1053 - at - print current situation
1054 == #ifdef REDEBUG
1055 == static void at(struct match *m, char *title, char *start, char *stop, \
1056 == sopno startst, sopno stopst);
1057 == #endif
1058 */
1059 static void
1060 at(m, title, start, stop, startst, stopst)
1061 struct match *m;
1062 char *title;
1063 char *start;
1064 char *stop;
1065 sopno startst;
1066 sopno stopst;
1067 {
1068 if (!(m->eflags&REG_TRACE))
1069 return;
1070
1071 printf("%s %s-", title, pchar(*start));
1072 printf("%s ", pchar(*stop));
1073 printf("%ld-%ld\n", (long)startst, (long)stopst);
1074 }
1075
1076 #ifndef PCHARDONE
1077 #define PCHARDONE /* never again */
1078 /*
1079 - pchar - make a character printable
1080 == #ifdef REDEBUG
1081 == static char *pchar(int ch);
1082 == #endif
1083 *
1084 * Is this identical to regchar() over in debug.c? Well, yes. But a
1085 * duplicate here avoids having a debugging-capable regexec.o tied to
1086 * a matching debug.o, and this is convenient. It all disappears in
1087 * the non-debug compilation anyway, so it doesn't matter much.
1088 */
1089 static char * /* -> representation */
1090 pchar(ch)
1091 int ch;
1092 {
1093 static char pbuf[10];
1094
1095 if (isprint(ch) || ch == ' ')
1096 sprintf(pbuf, "%c", ch);
1097 else
1098 sprintf(pbuf, "\\%o", ch);
1099 return(pbuf);
1100 }
1101 #endif
1102 #endif
1103
1104 #undef matcher
1105 #undef fast
1106 #undef slow
1107 #undef dissect
1108 #undef backref
1109 #undef step
1110 #undef print
1111 #undef at
1112 #undef match