fixed Cyrillic encodings
[wxWidgets.git] / src / regex / rege_dfa.c
CommitLineData
48bf5def
RN
1/*
2 * DFA routines
3 * This file is #included by regexec.c.
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * $Header$
32 *
33 */
34
35/*
36 * longest - longest-preferred matching engine
37 */
38static chr * /* endpoint, or NULL */
39longest(struct vars * v, /* used only for debug and exec flags */
40 struct dfa * d,
41 chr *start, /* where the match should start */
42 chr *stop, /* match must end at or before here */
43 int *hitstopp) /* record whether hit v->stop, if non-NULL */
44{
45 chr *cp;
46 chr *realstop = (stop == v->stop) ? stop : stop + 1;
47 color co;
48 struct sset *css;
49 struct sset *ss;
50 chr *post;
51 int i;
52 struct colormap *cm = d->cm;
53
54 /* initialize */
55 css = initialize(v, d, start);
56 cp = start;
57 if (hitstopp != NULL)
58 *hitstopp = 0;
59
60 /* startup */
61 FDEBUG(("+++ startup +++\n"));
62 if (cp == v->start)
63 {
64 co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
65 FDEBUG(("color %ld\n", (long) co));
66 }
67 else
68 {
69 co = GETCOLOR(cm, *(cp - 1));
70 FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
71 }
72 css = miss(v, d, css, co, cp, start);
73 if (css == NULL)
74 return NULL;
75 css->lastseen = cp;
76
77 /* main loop */
78 if (v->eflags & REG_FTRACE)
79 while (cp < realstop)
80 {
81 FDEBUG(("+++ at c%d +++\n", css - d->ssets));
82 co = GETCOLOR(cm, *cp);
83 FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
84 ss = css->outs[co];
85 if (ss == NULL)
86 {
87 ss = miss(v, d, css, co, cp + 1, start);
88 if (ss == NULL)
89 break; /* NOTE BREAK OUT */
90 }
91 cp++;
92 ss->lastseen = cp;
93 css = ss;
94 }
95 else
96 while (cp < realstop)
97 {
98 co = GETCOLOR(cm, *cp);
99 ss = css->outs[co];
100 if (ss == NULL)
101 {
102 ss = miss(v, d, css, co, cp + 1, start);
103 if (ss == NULL)
104 break; /* NOTE BREAK OUT */
105 }
106 cp++;
107 ss->lastseen = cp;
108 css = ss;
109 }
110
111 /* shutdown */
112 FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets));
113 if (cp == v->stop && stop == v->stop)
114 {
115 if (hitstopp != NULL)
116 *hitstopp = 1;
117 co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
118 FDEBUG(("color %ld\n", (long) co));
119 ss = miss(v, d, css, co, cp, start);
120 /* special case: match ended at eol? */
121 if (ss != NULL && (ss->flags & POSTSTATE))
122 return cp;
123 else if (ss != NULL)
124 ss->lastseen = cp; /* to be tidy */
125 }
126
127 /* find last match, if any */
128 post = d->lastpost;
129 for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
130 if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
131 (post == NULL || post < ss->lastseen))
132 post = ss->lastseen;
133 if (post != NULL) /* found one */
134 return post - 1;
135
136 return NULL;
137}
138
139/*
140 * shortest - shortest-preferred matching engine
141 */
142static chr * /* endpoint, or NULL */
143shortest(struct vars * v,
144 struct dfa * d,
145 chr *start, /* where the match should start */
146 chr *min, /* match must end at or after here */
147 chr *max, /* match must end at or before here */
148 chr **coldp, /* store coldstart pointer here, if
149 * nonNULL */
150 int *hitstopp) /* record whether hit v->stop, if non-NULL */
151{
152 chr *cp;
153 chr *realmin = (min == v->stop) ? min : min + 1;
154 chr *realmax = (max == v->stop) ? max : max + 1;
155 color co;
156 struct sset *css;
157 struct sset *ss;
158 struct colormap *cm = d->cm;
159
160 /* initialize */
161 css = initialize(v, d, start);
162 cp = start;
163 if (hitstopp != NULL)
164 *hitstopp = 0;
165
166 /* startup */
167 FDEBUG(("--- startup ---\n"));
168 if (cp == v->start)
169 {
170 co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
171 FDEBUG(("color %ld\n", (long) co));
172 }
173 else
174 {
175 co = GETCOLOR(cm, *(cp - 1));
176 FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
177 }
178 css = miss(v, d, css, co, cp, start);
179 if (css == NULL)
180 return NULL;
181 css->lastseen = cp;
182 ss = css;
183
184 /* main loop */
185 if (v->eflags & REG_FTRACE)
186 while (cp < realmax)
187 {
188 FDEBUG(("--- at c%d ---\n", css - d->ssets));
189 co = GETCOLOR(cm, *cp);
190 FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
191 ss = css->outs[co];
192 if (ss == NULL)
193 {
194 ss = miss(v, d, css, co, cp + 1, start);
195 if (ss == NULL)
196 break; /* NOTE BREAK OUT */
197 }
198 cp++;
199 ss->lastseen = cp;
200 css = ss;
201 if ((ss->flags & POSTSTATE) && cp >= realmin)
202 break; /* NOTE BREAK OUT */
203 }
204 else
205 while (cp < realmax)
206 {
207 co = GETCOLOR(cm, *cp);
208 ss = css->outs[co];
209 if (ss == NULL)
210 {
211 ss = miss(v, d, css, co, cp + 1, start);
212 if (ss == NULL)
213 break; /* NOTE BREAK OUT */
214 }
215 cp++;
216 ss->lastseen = cp;
217 css = ss;
218 if ((ss->flags & POSTSTATE) && cp >= realmin)
219 break; /* NOTE BREAK OUT */
220 }
221
222 if (ss == NULL)
223 return NULL;
224
225 if (coldp != NULL) /* report last no-progress state set, if
226 * any */
227 *coldp = lastcold(v, d);
228
229 if ((ss->flags & POSTSTATE) && cp > min)
230 {
231 assert(cp >= realmin);
232 cp--;
233 }
234 else if (cp == v->stop && max == v->stop)
235 {
236 co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
237 FDEBUG(("color %ld\n", (long) co));
238 ss = miss(v, d, css, co, cp, start);
239 /* match might have ended at eol */
240 if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
241 *hitstopp = 1;
242 }
243
244 if (ss == NULL || !(ss->flags & POSTSTATE))
245 return NULL;
246
247 return cp;
248}
249
250/*
251 * lastcold - determine last point at which no progress had been made
252 */
253static chr * /* endpoint, or NULL */
254lastcold(struct vars * v,
255 struct dfa * d)
256{
257 struct sset *ss;
258 chr *nopr;
259 int i;
260
261 nopr = d->lastnopr;
262 if (nopr == NULL)
263 nopr = v->start;
264 for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
265 if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
266 nopr = ss->lastseen;
267 return nopr;
268}
269
270/*
271 * newdfa - set up a fresh DFA
272 */
273static struct dfa *
274newdfa(struct vars * v,
275 struct cnfa * cnfa,
276 struct colormap * cm,
277 struct smalldfa * small) /* preallocated space, may be NULL */
278{
279 struct dfa *d;
280 size_t nss = cnfa->nstates * 2;
281 int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
282 struct smalldfa *smallwas = small;
283
284 assert(cnfa != NULL && cnfa->nstates != 0);
285
286 if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
287 {
288 assert(wordsper == 1);
289 if (small == NULL)
290 {
291 small = (struct smalldfa *) MALLOC(
292 sizeof(struct smalldfa));
293 if (small == NULL)
294 {
295 ERR(REG_ESPACE);
296 return NULL;
297 }
298 }
299 d = &small->dfa;
300 d->ssets = small->ssets;
301 d->statesarea = small->statesarea;
302 d->work = &d->statesarea[nss];
303 d->outsarea = small->outsarea;
304 d->incarea = small->incarea;
305 d->cptsmalloced = 0;
306 d->mallocarea = (smallwas == NULL) ? (char *) small : NULL;
307 }
308 else
309 {
310 d = (struct dfa *) MALLOC(sizeof(struct dfa));
311 if (d == NULL)
312 {
313 ERR(REG_ESPACE);
314 return NULL;
315 }
316 d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
317 d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
318 sizeof(unsigned));
319 d->work = &d->statesarea[nss * wordsper];
320 d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
321 sizeof(struct sset *));
322 d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
323 sizeof(struct arcp));
324 d->cptsmalloced = 1;
325 d->mallocarea = (char *) d;
326 if (d->ssets == NULL || d->statesarea == NULL ||
327 d->outsarea == NULL || d->incarea == NULL)
328 {
329 freedfa(d);
330 ERR(REG_ESPACE);
331 return NULL;
332 }
333 }
334
335 d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
336 d->nssused = 0;
337 d->nstates = cnfa->nstates;
338 d->ncolors = cnfa->ncolors;
339 d->wordsper = wordsper;
340 d->cnfa = cnfa;
341 d->cm = cm;
342 d->lastpost = NULL;
343 d->lastnopr = NULL;
344 d->search = d->ssets;
345
346 /* initialization of sset fields is done as needed */
347
348 return d;
349}
350
351/*
352 * freedfa - free a DFA
353 */
354static void
355freedfa(struct dfa * d)
356{
357 if (d->cptsmalloced)
358 {
359 if (d->ssets != NULL)
360 FREE(d->ssets);
361 if (d->statesarea != NULL)
362 FREE(d->statesarea);
363 if (d->outsarea != NULL)
364 FREE(d->outsarea);
365 if (d->incarea != NULL)
366 FREE(d->incarea);
367 }
368
369 if (d->mallocarea != NULL)
370 FREE(d->mallocarea);
371}
372
373/*
374 * hash - construct a hash code for a bitvector
375 *
376 * There are probably better ways, but they're more expensive.
377 */
378static unsigned
379hash(unsigned *uv,
380 int n)
381{
382 int i;
383 unsigned h;
384
385 h = 0;
386 for (i = 0; i < n; i++)
387 h ^= uv[i];
388 return h;
389}
390
391/*
392 * initialize - hand-craft a cache entry for startup, otherwise get ready
393 */
394static struct sset *
395initialize(struct vars * v, /* used only for debug flags */
396 struct dfa * d,
397 chr *start)
398{
399 struct sset *ss;
400 int i;
401
402 /* is previous one still there? */
403 if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
404 ss = &d->ssets[0];
405 else
406 { /* no, must (re)build it */
407 ss = getvacant(v, d, start, start);
408 for (i = 0; i < d->wordsper; i++)
409 ss->states[i] = 0;
410 BSET(ss->states, d->cnfa->pre);
411 ss->hash = HASH(ss->states, d->wordsper);
412 assert(d->cnfa->pre != d->cnfa->post);
413 ss->flags = STARTER | LOCKED | NOPROGRESS;
414 /* lastseen dealt with below */
415 }
416
417 for (i = 0; i < d->nssused; i++)
418 d->ssets[i].lastseen = NULL;
419 ss->lastseen = start; /* maybe untrue, but harmless */
420 d->lastpost = NULL;
421 d->lastnopr = NULL;
422 return ss;
423}
424
425/*
426 * miss - handle a cache miss
427 */
428static struct sset * /* NULL if goes to empty set */
429miss(struct vars * v, /* used only for debug flags */
430 struct dfa * d,
431 struct sset * css,
432 pcolor co,
433 chr *cp, /* next chr */
434 chr *start) /* where the attempt got started */
435{
436 struct cnfa *cnfa = d->cnfa;
437 int i;
438 unsigned h;
439 struct carc *ca;
440 struct sset *p;
441 int ispost;
442 int noprogress;
443 int gotstate;
444 int dolacons;
445 int sawlacons;
446
447 /* for convenience, we can be called even if it might not be a miss */
448 if (css->outs[co] != NULL)
449 {
450 FDEBUG(("hit\n"));
451 return css->outs[co];
452 }
453 FDEBUG(("miss\n"));
454
455 /* first, what set of states would we end up in? */
456 for (i = 0; i < d->wordsper; i++)
457 d->work[i] = 0;
458 ispost = 0;
459 noprogress = 1;
460 gotstate = 0;
461 for (i = 0; i < d->nstates; i++)
462 if (ISBSET(css->states, i))
463 for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
464 if (ca->co == co)
465 {
466 BSET(d->work, ca->to);
467 gotstate = 1;
468 if (ca->to == cnfa->post)
469 ispost = 1;
470 if (!cnfa->states[ca->to]->co)
471 noprogress = 0;
472 FDEBUG(("%d -> %d\n", i, ca->to));
473 }
474 dolacons = (gotstate) ? (cnfa->flags & HASLACONS) : 0;
475 sawlacons = 0;
476 while (dolacons)
477 { /* transitive closure */
478 dolacons = 0;
479 for (i = 0; i < d->nstates; i++)
480 if (ISBSET(d->work, i))
481 for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
482 ca++)
483 {
484 if (ca->co <= cnfa->ncolors)
485 continue; /* NOTE CONTINUE */
486 sawlacons = 1;
487 if (ISBSET(d->work, ca->to))
488 continue; /* NOTE CONTINUE */
489 if (!lacon(v, cnfa, cp, ca->co))
490 continue; /* NOTE CONTINUE */
491 BSET(d->work, ca->to);
492 dolacons = 1;
493 if (ca->to == cnfa->post)
494 ispost = 1;
495 if (!cnfa->states[ca->to]->co)
496 noprogress = 0;
497 FDEBUG(("%d :> %d\n", i, ca->to));
498 }
499 }
500 if (!gotstate)
501 return NULL;
502 h = HASH(d->work, d->wordsper);
503
504 /* next, is that in the cache? */
505 for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
506 if (HIT(h, d->work, p, d->wordsper))
507 {
508 FDEBUG(("cached c%d\n", p - d->ssets));
509 break; /* NOTE BREAK OUT */
510 }
511 if (i == 0)
512 { /* nope, need a new cache entry */
513 p = getvacant(v, d, cp, start);
514 assert(p != css);
515 for (i = 0; i < d->wordsper; i++)
516 p->states[i] = d->work[i];
517 p->hash = h;
518 p->flags = (ispost) ? POSTSTATE : 0;
519 if (noprogress)
520 p->flags |= NOPROGRESS;
521 /* lastseen to be dealt with by caller */
522 }
523
524 if (!sawlacons)
525 { /* lookahead conds. always cache miss */
526 FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets));
527 css->outs[co] = p;
528 css->inchain[co] = p->ins;
529 p->ins.ss = css;
530 p->ins.co = (color) co;
531 }
532 return p;
533}
534
535/*
536 * lacon - lookahead-constraint checker for miss()
537 */
538static int /* predicate: constraint satisfied? */
539lacon(struct vars * v,
540 struct cnfa * pcnfa, /* parent cnfa */
541 chr *cp,
542 pcolor co) /* "color" of the lookahead constraint */
543{
544 int n;
545 struct subre *sub;
546 struct dfa *d;
547 struct smalldfa sd;
548 chr *end;
549
550 n = co - pcnfa->ncolors;
551 assert(n < v->g->nlacons && v->g->lacons != NULL);
552 FDEBUG(("=== testing lacon %d\n", n));
553 sub = &v->g->lacons[n];
554 d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd);
555 if (d == NULL)
556 {
557 ERR(REG_ESPACE);
558 return 0;
559 }
560 end = longest(v, d, cp, v->stop, (int *) NULL);
561 freedfa(d);
562 FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
563 return (sub->subno) ? (end != NULL) : (end == NULL);
564}
565
566/*
567 * getvacant - get a vacant state set
568 * This routine clears out the inarcs and outarcs, but does not otherwise
569 * clear the innards of the state set -- that's up to the caller.
570 */
571static struct sset *
572getvacant(struct vars * v, /* used only for debug flags */
573 struct dfa * d,
574 chr *cp,
575 chr *start)
576{
577 int i;
578 struct sset *ss;
579 struct sset *p;
580 struct arcp ap;
581 struct arcp lastap;
582 color co;
583
584 ss = pickss(v, d, cp, start);
585 assert(!(ss->flags & LOCKED));
586
587 /* clear out its inarcs, including self-referential ones */
588 ap = ss->ins;
589 while ((p = ap.ss) != NULL)
590 {
591 co = ap.co;
592 FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long) co));
593 p->outs[co] = NULL;
594 ap = p->inchain[co];
595 p->inchain[co].ss = NULL; /* paranoia */
596 }
597 ss->ins.ss = NULL;
598
599 /* take it off the inarc chains of the ssets reached by its outarcs */
600 for (i = 0; i < d->ncolors; i++)
601 {
602 p = ss->outs[i];
603 assert(p != ss); /* not self-referential */
604 if (p == NULL)
605 continue; /* NOTE CONTINUE */
606 FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets));
607 if (p->ins.ss == ss && p->ins.co == i)
608 p->ins = ss->inchain[i];
609 else
610 {
611 assert(p->ins.ss != NULL);
612 for (ap = p->ins; ap.ss != NULL &&
613 !(ap.ss == ss && ap.co == i);
614 ap = ap.ss->inchain[ap.co])
615 lastap = ap;
616 assert(ap.ss != NULL);
617 lastap.ss->inchain[lastap.co] = ss->inchain[i];
618 }
619 ss->outs[i] = NULL;
620 ss->inchain[i].ss = NULL;
621 }
622
623 /* if ss was a success state, may need to remember location */
624 if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
625 (d->lastpost == NULL || d->lastpost < ss->lastseen))
626 d->lastpost = ss->lastseen;
627
628 /* likewise for a no-progress state */
629 if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
630 (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
631 d->lastnopr = ss->lastseen;
632
633 return ss;
634}
635
636/*
637 * pickss - pick the next stateset to be used
638 */
639static struct sset *
640pickss(struct vars * v, /* used only for debug flags */
641 struct dfa * d,
642 chr *cp,
643 chr *start)
644{
645 int i;
646 struct sset *ss;
647 struct sset *end;
648 chr *ancient;
649
650 /* shortcut for cases where cache isn't full */
651 if (d->nssused < d->nssets)
652 {
653 i = d->nssused;
654 d->nssused++;
655 ss = &d->ssets[i];
656 FDEBUG(("new c%d\n", i));
657 /* set up innards */
658 ss->states = &d->statesarea[i * d->wordsper];
659 ss->flags = 0;
660 ss->ins.ss = NULL;
661 ss->ins.co = WHITE; /* give it some value */
662 ss->outs = &d->outsarea[i * d->ncolors];
663 ss->inchain = &d->incarea[i * d->ncolors];
664 for (i = 0; i < d->ncolors; i++)
665 {
666 ss->outs[i] = NULL;
667 ss->inchain[i].ss = NULL;
668 }
669 return ss;
670 }
671
672 /* look for oldest, or old enough anyway */
673 if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
674 ancient = cp - d->nssets * 2 / 3;
675 else
676 ancient = start;
677 for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
678 if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
679 !(ss->flags & LOCKED))
680 {
681 d->search = ss + 1;
682 FDEBUG(("replacing c%d\n", ss - d->ssets));
683 return ss;
684 }
685 for (ss = d->ssets, end = d->search; ss < end; ss++)
686 if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
687 !(ss->flags & LOCKED))
688 {
689 d->search = ss + 1;
690 FDEBUG(("replacing c%d\n", ss - d->ssets));
691 return ss;
692 }
693
694 /* nobody's old enough?!? -- something's really wrong */
695 FDEBUG(("can't find victim to replace!\n"));
696 assert(NOTREACHED);
697 ERR(REG_ASSERT);
698 return d->ssets;
699}