3 * This file is #included by regexec.c.
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 - longest - longest-preferred matching engine
35 ^ static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
37 static chr
* /* endpoint, or NULL */
38 longest(v
, d
, start
, stop
, hitstopp
)
39 struct vars
*v
; /* used only for debug and exec flags */
41 chr
*start
; /* where the match should start */
42 chr
*stop
; /* match must end at or before here */
43 int *hitstopp
; /* record whether hit v->stop, if non-NULL */
46 chr
*realstop
= (stop
== v
->stop
) ? stop
: stop
+ 1;
52 struct colormap
*cm
= d
->cm
;
55 css
= initialize(v
, d
, start
);
61 FDEBUG(("+++ startup +++\n"));
63 co
= d
->cnfa
->bos
[(v
->eflags
®_NOTBOL
) ? 0 : 1];
64 FDEBUG(("color %ld\n", (long)co
));
66 co
= GETCOLOR(cm
, *(cp
- 1));
67 FDEBUG(("char %c, color %ld\n", (char)*(cp
-1), (long)co
));
69 css
= miss(v
, d
, css
, co
, cp
, start
);
75 if (v
->eflags
®_FTRACE
)
76 while (cp
< realstop
) {
77 FDEBUG(("+++ at c%d +++\n", css
- d
->ssets
));
78 co
= GETCOLOR(cm
, *cp
);
79 FDEBUG(("char %c, color %ld\n", (char)*cp
, (long)co
));
82 ss
= miss(v
, d
, css
, co
, cp
+1, start
);
84 break; /* NOTE BREAK OUT */
91 while (cp
< realstop
) {
92 co
= GETCOLOR(cm
, *cp
);
95 ss
= miss(v
, d
, css
, co
, cp
+1, start
);
97 break; /* NOTE BREAK OUT */
105 FDEBUG(("+++ shutdown at c%d +++\n", css
- d
->ssets
));
106 if (cp
== v
->stop
&& stop
== v
->stop
) {
107 if (hitstopp
!= NULL
)
109 co
= d
->cnfa
->eos
[(v
->eflags
®_NOTEOL
) ? 0 : 1];
110 FDEBUG(("color %ld\n", (long)co
));
111 ss
= miss(v
, d
, css
, co
, cp
, start
);
112 /* special case: match ended at eol? */
113 if (ss
!= NULL
&& (ss
->flags
&POSTSTATE
))
116 ss
->lastseen
= cp
; /* to be tidy */
119 /* find last match, if any */
121 for (ss
= d
->ssets
, i
= d
->nssused
; i
> 0; ss
++, i
--)
122 if ((ss
->flags
&POSTSTATE
) && post
!= ss
->lastseen
&&
123 (post
== NULL
|| post
< ss
->lastseen
))
125 if (post
!= NULL
) /* found one */
132 - shortest - shortest-preferred matching engine
133 ^ static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *,
136 static chr
* /* endpoint, or NULL */
137 shortest(v
, d
, start
, min
, max
, coldp
, hitstopp
)
140 chr
*start
; /* where the match should start */
141 chr
*min
; /* match must end at or after here */
142 chr
*max
; /* match must end at or before here */
143 chr
**coldp
; /* store coldstart pointer here, if nonNULL */
144 int *hitstopp
; /* record whether hit v->stop, if non-NULL */
147 chr
*realmin
= (min
== v
->stop
) ? min
: min
+ 1;
148 chr
*realmax
= (max
== v
->stop
) ? max
: max
+ 1;
152 struct colormap
*cm
= d
->cm
;
155 css
= initialize(v
, d
, start
);
157 if (hitstopp
!= NULL
)
161 FDEBUG(("--- startup ---\n"));
162 if (cp
== v
->start
) {
163 co
= d
->cnfa
->bos
[(v
->eflags
®_NOTBOL
) ? 0 : 1];
164 FDEBUG(("color %ld\n", (long)co
));
166 co
= GETCOLOR(cm
, *(cp
- 1));
167 FDEBUG(("char %c, color %ld\n", (char)*(cp
-1), (long)co
));
169 css
= miss(v
, d
, css
, co
, cp
, start
);
176 if (v
->eflags
®_FTRACE
)
177 while (cp
< realmax
) {
178 FDEBUG(("--- at c%d ---\n", css
- d
->ssets
));
179 co
= GETCOLOR(cm
, *cp
);
180 FDEBUG(("char %c, color %ld\n", (char)*cp
, (long)co
));
183 ss
= miss(v
, d
, css
, co
, cp
+1, start
);
185 break; /* NOTE BREAK OUT */
190 if ((ss
->flags
&POSTSTATE
) && cp
>= realmin
)
191 break; /* NOTE BREAK OUT */
194 while (cp
< realmax
) {
195 co
= GETCOLOR(cm
, *cp
);
198 ss
= miss(v
, d
, css
, co
, cp
+1, start
);
200 break; /* NOTE BREAK OUT */
205 if ((ss
->flags
&POSTSTATE
) && cp
>= realmin
)
206 break; /* NOTE BREAK OUT */
212 if (coldp
!= NULL
) /* report last no-progress state set, if any */
213 *coldp
= lastcold(v
, d
);
215 if ((ss
->flags
&POSTSTATE
) && cp
> min
) {
216 assert(cp
>= realmin
);
218 } else if (cp
== v
->stop
&& max
== v
->stop
) {
219 co
= d
->cnfa
->eos
[(v
->eflags
®_NOTEOL
) ? 0 : 1];
220 FDEBUG(("color %ld\n", (long)co
));
221 ss
= miss(v
, d
, css
, co
, cp
, start
);
222 /* match might have ended at eol */
223 if ((ss
== NULL
|| !(ss
->flags
&POSTSTATE
)) && hitstopp
!= NULL
)
227 if (ss
== NULL
|| !(ss
->flags
&POSTSTATE
))
234 - lastcold - determine last point at which no progress had been made
235 ^ static chr *lastcold(struct vars *, struct dfa *);
237 static chr
* /* endpoint, or NULL */
249 for (ss
= d
->ssets
, i
= d
->nssused
; i
> 0; ss
++, i
--)
250 if ((ss
->flags
&NOPROGRESS
) && nopr
< ss
->lastseen
)
256 - newdfa - set up a fresh DFA
257 ^ static struct dfa *newdfa(struct vars *, struct cnfa *,
258 ^ struct colormap *, struct smalldfa *);
261 /* FIXME Required for CW 8 on Mac since it's not in limits.h */
263 #define __CHAR_BIT__ 8
267 newdfa(v
, cnfa
, cm
, small
)
271 struct smalldfa
*small
; /* preallocated space, may be NULL */
274 size_t nss
= cnfa
->nstates
* 2;
275 int wordsper
= (cnfa
->nstates
+ UBITS
- 1) / UBITS
;
276 struct smalldfa
*smallwas
= small
;
278 assert(cnfa
!= NULL
&& cnfa
->nstates
!= 0);
280 if (nss
<= FEWSTATES
&& cnfa
->ncolors
<= FEWCOLORS
) {
281 assert(wordsper
== 1);
283 small
= (struct smalldfa
*)MALLOC(
284 sizeof(struct smalldfa
));
291 d
->ssets
= small
->ssets
;
292 d
->statesarea
= small
->statesarea
;
293 d
->work
= &d
->statesarea
[nss
];
294 d
->outsarea
= small
->outsarea
;
295 d
->incarea
= small
->incarea
;
297 d
->mallocarea
= (smallwas
== NULL
) ? (char *)small
: NULL
;
299 d
= (struct dfa
*)MALLOC(sizeof(struct dfa
));
304 d
->ssets
= (struct sset
*)MALLOC(nss
* sizeof(struct sset
));
305 d
->statesarea
= (unsigned *)MALLOC((nss
+WORK
) * wordsper
*
307 d
->work
= &d
->statesarea
[nss
* wordsper
];
308 d
->outsarea
= (struct sset
**)MALLOC(nss
* cnfa
->ncolors
*
309 sizeof(struct sset
*));
310 d
->incarea
= (struct arcp
*)MALLOC(nss
* cnfa
->ncolors
*
311 sizeof(struct arcp
));
313 d
->mallocarea
= (char *)d
;
314 if (d
->ssets
== NULL
|| d
->statesarea
== NULL
||
315 d
->outsarea
== NULL
|| d
->incarea
== NULL
) {
322 d
->nssets
= (v
->eflags
®_SMALL
) ? 7 : nss
;
324 d
->nstates
= cnfa
->nstates
;
325 d
->ncolors
= cnfa
->ncolors
;
326 d
->wordsper
= wordsper
;
331 d
->search
= d
->ssets
;
333 /* initialization of sset fields is done as needed */
339 - freedfa - free a DFA
340 ^ static VOID freedfa(struct dfa *);
346 if (d
->cptsmalloced
) {
347 if (d
->ssets
!= NULL
)
349 if (d
->statesarea
!= NULL
)
351 if (d
->outsarea
!= NULL
)
353 if (d
->incarea
!= NULL
)
357 if (d
->mallocarea
!= NULL
)
362 - hash - construct a hash code for a bitvector
363 * There are probably better ways, but they're more expensive.
364 ^ static unsigned hash(unsigned *, int);
375 for (i
= 0; i
< n
; i
++)
381 - initialize - hand-craft a cache entry for startup, otherwise get ready
382 ^ static struct sset *initialize(struct vars *, struct dfa *, chr *);
385 initialize(v
, d
, start
)
386 struct vars
*v
; /* used only for debug flags */
393 /* is previous one still there? */
394 if (d
->nssused
> 0 && (d
->ssets
[0].flags
&STARTER
))
396 else { /* no, must (re)build it */
397 ss
= getvacant(v
, d
, start
, start
);
398 for (i
= 0; i
< d
->wordsper
; i
++)
400 BSET(ss
->states
, d
->cnfa
->pre
);
401 ss
->hash
= HASH(ss
->states
, d
->wordsper
);
402 assert(d
->cnfa
->pre
!= d
->cnfa
->post
);
403 ss
->flags
= STARTER
|LOCKED
|NOPROGRESS
;
404 /* lastseen dealt with below */
407 for (i
= 0; i
< d
->nssused
; i
++)
408 d
->ssets
[i
].lastseen
= NULL
;
409 ss
->lastseen
= start
; /* maybe untrue, but harmless */
416 - miss - handle a cache miss
417 ^ static struct sset *miss(struct vars *, struct dfa *, struct sset *,
418 ^ pcolor, chr *, chr *);
420 static struct sset
* /* NULL if goes to empty set */
421 miss(v
, d
, css
, co
, cp
, start
)
422 struct vars
*v
; /* used only for debug flags */
426 chr
*cp
; /* next chr */
427 chr
*start
; /* where the attempt got started */
429 struct cnfa
*cnfa
= d
->cnfa
;
440 /* for convenience, we can be called even if it might not be a miss */
441 if (css
->outs
[co
] != NULL
) {
443 return css
->outs
[co
];
447 /* first, what set of states would we end up in? */
448 for (i
= 0; i
< d
->wordsper
; i
++)
453 for (i
= 0; i
< d
->nstates
; i
++)
454 if (ISBSET(css
->states
, i
))
455 for (ca
= cnfa
->states
[i
]+1; ca
->co
!= COLORLESS
; ca
++)
457 BSET(d
->work
, ca
->to
);
459 if (ca
->to
== cnfa
->post
)
461 if (!cnfa
->states
[ca
->to
]->co
)
463 FDEBUG(("%d -> %d\n", i
, ca
->to
));
465 dolacons
= (gotstate
) ? (cnfa
->flags
&HASLACONS
) : 0;
467 while (dolacons
) { /* transitive closure */
469 for (i
= 0; i
< d
->nstates
; i
++)
470 if (ISBSET(d
->work
, i
))
471 for (ca
= cnfa
->states
[i
]+1; ca
->co
!= COLORLESS
;
473 if (ca
->co
<= cnfa
->ncolors
)
474 continue; /* NOTE CONTINUE */
476 if (ISBSET(d
->work
, ca
->to
))
477 continue; /* NOTE CONTINUE */
478 if (!lacon(v
, cnfa
, cp
, ca
->co
))
479 continue; /* NOTE CONTINUE */
480 BSET(d
->work
, ca
->to
);
482 if (ca
->to
== cnfa
->post
)
484 if (!cnfa
->states
[ca
->to
]->co
)
486 FDEBUG(("%d :> %d\n", i
, ca
->to
));
491 h
= HASH(d
->work
, d
->wordsper
);
493 /* next, is that in the cache? */
494 for (p
= d
->ssets
, i
= d
->nssused
; i
> 0; p
++, i
--)
495 if (HIT(h
, d
->work
, p
, d
->wordsper
)) {
496 FDEBUG(("cached c%d\n", p
- d
->ssets
));
497 break; /* NOTE BREAK OUT */
499 if (i
== 0) { /* nope, need a new cache entry */
500 p
= getvacant(v
, d
, cp
, start
);
502 for (i
= 0; i
< d
->wordsper
; i
++)
503 p
->states
[i
] = d
->work
[i
];
505 p
->flags
= (ispost
) ? POSTSTATE
: 0;
507 p
->flags
|= NOPROGRESS
;
508 /* lastseen to be dealt with by caller */
511 if (!sawlacons
) { /* lookahead conds. always cache miss */
512 FDEBUG(("c%d[%d]->c%d\n", css
- d
->ssets
, co
, p
- d
->ssets
));
514 css
->inchain
[co
] = p
->ins
;
516 p
->ins
.co
= (color
)co
;
522 - lacon - lookahead-constraint checker for miss()
523 ^ static int lacon(struct vars *, struct cnfa *, chr *, pcolor);
525 static int /* predicate: constraint satisfied? */
526 lacon(v
, pcnfa
, cp
, co
)
528 struct cnfa
*pcnfa
; /* parent cnfa */
530 pcolor co
; /* "color" of the lookahead constraint */
538 n
= co
- pcnfa
->ncolors
;
539 assert(n
< v
->g
->nlacons
&& v
->g
->lacons
!= NULL
);
540 FDEBUG(("=== testing lacon %d\n", n
));
541 sub
= &v
->g
->lacons
[n
];
542 d
= newdfa(v
, &sub
->cnfa
, &v
->g
->cmap
, &sd
);
547 end
= longest(v
, d
, cp
, v
->stop
, (int *)NULL
);
549 FDEBUG(("=== lacon %d match %d\n", n
, (end
!= NULL
)));
550 return (sub
->subno
) ? (end
!= NULL
) : (end
== NULL
);
554 - getvacant - get a vacant state set
555 * This routine clears out the inarcs and outarcs, but does not otherwise
556 * clear the innards of the state set -- that's up to the caller.
557 ^ static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
560 getvacant(v
, d
, cp
, start
)
561 struct vars
*v
; /* used only for debug flags */
573 ss
= pickss(v
, d
, cp
, start
);
574 assert(!(ss
->flags
&LOCKED
));
576 /* clear out its inarcs, including self-referential ones */
578 while ((p
= ap
.ss
) != NULL
) {
580 FDEBUG(("zapping c%d's %ld outarc\n", p
- d
->ssets
, (long)co
));
583 p
->inchain
[co
].ss
= NULL
; /* paranoia */
587 /* take it off the inarc chains of the ssets reached by its outarcs */
588 for (i
= 0; i
< d
->ncolors
; i
++) {
590 assert(p
!= ss
); /* not self-referential */
592 continue; /* NOTE CONTINUE */
593 FDEBUG(("del outarc %d from c%d's in chn\n", i
, p
- d
->ssets
));
594 if (p
->ins
.ss
== ss
&& p
->ins
.co
== i
)
595 p
->ins
= ss
->inchain
[i
];
597 assert(p
->ins
.ss
!= NULL
);
598 for (ap
= p
->ins
; ap
.ss
!= NULL
&&
599 !(ap
.ss
== ss
&& ap
.co
== i
);
600 ap
= ap
.ss
->inchain
[ap
.co
])
602 assert(ap
.ss
!= NULL
);
603 lastap
.ss
->inchain
[lastap
.co
] = ss
->inchain
[i
];
606 ss
->inchain
[i
].ss
= NULL
;
609 /* if ss was a success state, may need to remember location */
610 if ((ss
->flags
&POSTSTATE
) && ss
->lastseen
!= d
->lastpost
&&
611 (d
->lastpost
== NULL
|| d
->lastpost
< ss
->lastseen
))
612 d
->lastpost
= ss
->lastseen
;
614 /* likewise for a no-progress state */
615 if ((ss
->flags
&NOPROGRESS
) && ss
->lastseen
!= d
->lastnopr
&&
616 (d
->lastnopr
== NULL
|| d
->lastnopr
< ss
->lastseen
))
617 d
->lastnopr
= ss
->lastseen
;
623 - pickss - pick the next stateset to be used
624 ^ static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
627 pickss(v
, d
, cp
, start
)
628 struct vars
*v
; /* used only for debug flags */
638 /* shortcut for cases where cache isn't full */
639 if (d
->nssused
< d
->nssets
) {
643 FDEBUG(("new c%d\n", i
));
645 ss
->states
= &d
->statesarea
[i
* d
->wordsper
];
648 ss
->ins
.co
= WHITE
; /* give it some value */
649 ss
->outs
= &d
->outsarea
[i
* d
->ncolors
];
650 ss
->inchain
= &d
->incarea
[i
* d
->ncolors
];
651 for (i
= 0; i
< d
->ncolors
; i
++) {
653 ss
->inchain
[i
].ss
= NULL
;
658 /* look for oldest, or old enough anyway */
659 if (cp
- start
> d
->nssets
*2/3) /* oldest 33% are expendable */
660 ancient
= cp
- d
->nssets
*2/3;
663 for (ss
= d
->search
, end
= &d
->ssets
[d
->nssets
]; ss
< end
; ss
++)
664 if ((ss
->lastseen
== NULL
|| ss
->lastseen
< ancient
) &&
665 !(ss
->flags
&LOCKED
)) {
667 FDEBUG(("replacing c%d\n", ss
- d
->ssets
));
670 for (ss
= d
->ssets
, end
= d
->search
; ss
< end
; ss
++)
671 if ((ss
->lastseen
== NULL
|| ss
->lastseen
< ancient
) &&
672 !(ss
->flags
&LOCKED
)) {
674 FDEBUG(("replacing c%d\n", ss
- d
->ssets
));
678 /* nobody's old enough?!? -- something's really wrong */
679 FDEBUG(("can't find victim to replace!\n"));