1.
[wxWidgets.git] / src / regex / regexec.c
1 /*
2 * re_*exec and friends - match REs
3 *
4 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
5 *
6 * Development of this software was funded, in part, by Cray Research Inc.,
7 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 * Corporation, none of whom are responsible for the results. The author
9 * thanks all of them.
10 *
11 * Redistribution and use in source and binary forms -- with or without
12 * modification -- are permitted for any purpose, provided that
13 * redistributions in source form retain this entire copyright notice and
14 * indicate the origin and nature of any modifications.
15 *
16 * I'd appreciate being given credit for this package in the documentation
17 * of software which uses it, but that is not a requirement.
18 *
19 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * $Header: /projects/cvsroot/pgsql-server/src/backend/regex/regexec.c,v 1.23 2003/08/08 21:41:56 momjian Exp $
31 *
32 */
33
34 #include "regguts.h"
35
36
37
38 /* lazy-DFA representation */
39 struct arcp
40 { /* "pointer" to an outarc */
41 struct sset *ss;
42 color co;
43 };
44
45 struct sset
46 { /* state set */
47 unsigned *states; /* pointer to bitvector */
48 unsigned hash; /* hash of bitvector */
49 #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
50 #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
51 memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
52 int flags;
53 #define STARTER 01 /* the initial state set */
54 #define POSTSTATE 02 /* includes the goal state */
55 #define LOCKED 04 /* locked in cache */
56 #define NOPROGRESS 010 /* zero-progress state set */
57 struct arcp ins; /* chain of inarcs pointing here */
58 chr *lastseen; /* last entered on arrival here */
59 struct sset **outs; /* outarc vector indexed by color */
60 struct arcp *inchain; /* chain-pointer vector for outarcs */
61 };
62
63 struct dfa
64 {
65 int nssets; /* size of cache */
66 int nssused; /* how many entries occupied yet */
67 int nstates; /* number of states */
68 int ncolors; /* length of outarc and inchain vectors */
69 int wordsper; /* length of state-set bitvectors */
70 struct sset *ssets; /* state-set cache */
71 unsigned *statesarea; /* bitvector storage */
72 unsigned *work; /* pointer to work area within statesarea */
73 struct sset **outsarea; /* outarc-vector storage */
74 struct arcp *incarea; /* inchain storage */
75 struct cnfa *cnfa;
76 struct colormap *cm;
77 chr *lastpost; /* location of last cache-flushed success */
78 chr *lastnopr; /* location of last cache-flushed
79 * NOPROGRESS */
80 struct sset *search; /* replacement-search-pointer memory */
81 int cptsmalloced; /* were the areas individually malloced? */
82 char *mallocarea; /* self, or master malloced area, or NULL */
83 };
84
85 #define WORK 1 /* number of work bitvectors needed */
86
87 /* setup for non-malloc allocation for small cases */
88 #define FEWSTATES 20 /* must be less than UBITS */
89 #define FEWCOLORS 15
90 struct smalldfa
91 {
92 struct dfa dfa;
93 struct sset ssets[FEWSTATES * 2];
94 unsigned statesarea[FEWSTATES * 2 + WORK];
95 struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
96 struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
97 };
98
99 #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */
100
101
102
103 /* internal variables, bundled for easy passing around */
104 struct vars
105 {
106 regex_t *re;
107 struct guts *g;
108 int eflags; /* copies of arguments */
109 size_t nmatch;
110 regmatch_t *pmatch;
111 rm_detail_t *details;
112 chr *start; /* start of string */
113 chr *stop; /* just past end of string */
114 int err; /* error code if any (0 none) */
115 regoff_t *mem; /* memory vector for backtracking */
116 struct smalldfa dfa1;
117 struct smalldfa dfa2;
118 };
119
120 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
121 #define ISERR() VISERR(v)
122 #define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e)))
123 #define ERR(e) VERR(v, e) /* record an error */
124 #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return
125 * it */
126 #define OFF(p) ((p) - v->start)
127 #define LOFF(p) ((long)OFF(p))
128
129
130
131 /*
132 * forward declarations
133 */
134 /* === regexec.c === */
135 static int find(struct vars *, struct cnfa *, struct colormap *);
136 static int cfind(struct vars *, struct cnfa *, struct colormap *);
137 static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
138 static void zapsubs(regmatch_t *, size_t);
139 static void zapmem(struct vars *, struct subre *);
140 static void subset(struct vars *, struct subre *, chr *, chr *);
141 static int dissect(struct vars *, struct subre *, chr *, chr *);
142 static int condissect(struct vars *, struct subre *, chr *, chr *);
143 static int altdissect(struct vars *, struct subre *, chr *, chr *);
144 static int cdissect(struct vars *, struct subre *, chr *, chr *);
145 static int ccondissect(struct vars *, struct subre *, chr *, chr *);
146 static int crevdissect(struct vars *, struct subre *, chr *, chr *);
147 static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
148 static int caltdissect(struct vars *, struct subre *, chr *, chr *);
149
150 /* === rege_dfa.c === */
151 static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
152 static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
153 static chr *lastcold(struct vars *, struct dfa *);
154 static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
155 static void freedfa(struct dfa *);
156 static unsigned hash(unsigned *, int);
157 static struct sset *initialize(struct vars *, struct dfa *, chr *);
158 static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *);
159 static int lacon(struct vars *, struct cnfa *, chr *, pcolor);
160 static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
161 static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
162
163
164 /*
165 * regexec - match regular expression
166 */
167 int
168 regexec(regex_t *re,
169 const chr *string,
170 size_t nmatch,
171 regmatch_t pmatch[],
172 int flags)
173 {
174 rm_detail_t det;
175 size_t nLen = 0;
176 chr* s2 = (chr*) string;
177
178 if (string && *string)
179 {
180 while(*++s2);
181 }
182
183 nLen = ((s2 - string) / sizeof(chr));
184
185 return wx_regexec(re, string, nLen, &det, nmatch, pmatch, flags);
186 }
187 int
188 wx_regexec(regex_t *re,
189 const chr *string,
190 size_t len,
191 rm_detail_t *details,
192 size_t nmatch,
193 regmatch_t pmatch[],
194 int flags)
195 {
196 struct vars var;
197 register struct vars *v = &var;
198 int st;
199 size_t n;
200 int backref;
201
202 #define LOCALMAT 20
203 regmatch_t mat[LOCALMAT];
204
205 #define LOCALMEM 40
206 regoff_t mem[LOCALMEM];
207
208 /* sanity checks */
209 if (re == NULL || string == NULL || re->re_magic != REMAGIC)
210 return REG_INVARG;
211 if (re->re_csize != sizeof(chr))
212 return REG_MIXED;
213
214 /* setup */
215 v->re = re;
216 v->g = (struct guts *) re->re_guts;
217 if ((v->g->cflags & REG_EXPECT) && details == NULL)
218 return REG_INVARG;
219 if (v->g->info & REG_UIMPOSSIBLE)
220 return REG_NOMATCH;
221 backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
222 v->eflags = flags;
223 if (v->g->cflags & REG_NOSUB)
224 nmatch = 0; /* override client */
225 v->nmatch = nmatch;
226 if (backref)
227 {
228 /* need work area */
229 if (v->g->nsub + 1 <= LOCALMAT)
230 v->pmatch = mat;
231 else
232 v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) *
233 sizeof(regmatch_t));
234 if (v->pmatch == NULL)
235 return REG_ESPACE;
236 v->nmatch = v->g->nsub + 1;
237 }
238 else
239 v->pmatch = pmatch;
240 v->details = details;
241 v->start = (chr *) string;
242 v->stop = (chr *) string + len;
243 v->err = 0;
244 if (backref)
245 {
246 /* need retry memory */
247 assert(v->g->ntree >= 0);
248 n = (size_t) v->g->ntree;
249 if (n <= LOCALMEM)
250 v->mem = mem;
251 else
252 v->mem = (regoff_t *) MALLOC(n * sizeof(regoff_t));
253 if (v->mem == NULL)
254 {
255 if (v->pmatch != pmatch && v->pmatch != mat)
256 FREE(v->pmatch);
257 return REG_ESPACE;
258 }
259 }
260 else
261 v->mem = NULL;
262
263 /* do it */
264 assert(v->g->tree != NULL);
265 if (backref)
266 st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
267 else
268 st = find(v, &v->g->tree->cnfa, &v->g->cmap);
269
270 /* copy (portion of) match vector over if necessary */
271 if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
272 {
273 zapsubs(pmatch, nmatch);
274 n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
275 memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
276 }
277
278 /* clean up */
279 if (v->pmatch != pmatch && v->pmatch != mat)
280 FREE(v->pmatch);
281 if (v->mem != NULL && v->mem != mem)
282 FREE(v->mem);
283 return st;
284 }
285
286 /*
287 * find - find a match for the main NFA (no-complications case)
288 */
289 static int
290 find(struct vars * v,
291 struct cnfa * cnfa,
292 struct colormap * cm)
293 {
294 struct dfa *s;
295 struct dfa *d;
296 chr *begin;
297 chr *end = NULL;
298 chr *cold;
299 chr *open; /* open and close of range of possible
300 * starts */
301 chr *close;
302 int hitend;
303 int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
304
305 /* first, a shot with the search RE */
306 s = newdfa(v, &v->g->search, cm, &v->dfa1);
307 assert(!(ISERR() && s != NULL));
308 NOERR();
309 MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
310 cold = NULL;
311 close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *) NULL);
312 freedfa(s);
313 NOERR();
314 if (v->g->cflags & REG_EXPECT)
315 {
316 assert(v->details != NULL);
317 if (cold != NULL)
318 v->details->rm_extend.rm_so = OFF(cold);
319 else
320 v->details->rm_extend.rm_so = OFF(v->stop);
321 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
322 }
323 if (close == NULL) /* not found */
324 return REG_NOMATCH;
325 if (v->nmatch == 0) /* found, don't need exact location */
326 return REG_OKAY;
327
328 /* find starting point and match */
329 assert(cold != NULL);
330 open = cold;
331 cold = NULL;
332 MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
333 d = newdfa(v, cnfa, cm, &v->dfa1);
334 assert(!(ISERR() && d != NULL));
335 NOERR();
336 for (begin = open; begin <= close; begin++)
337 {
338 MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
339 if (shorter)
340 end = shortest(v, d, begin, begin, v->stop,
341 (chr **) NULL, &hitend);
342 else
343 end = longest(v, d, begin, v->stop, &hitend);
344 NOERR();
345 if (hitend && cold == NULL)
346 cold = begin;
347 if (end != NULL)
348 break; /* NOTE BREAK OUT */
349 }
350 assert(end != NULL); /* search RE succeeded so loop should */
351 freedfa(d);
352
353 /* and pin down details */
354 assert(v->nmatch > 0);
355 v->pmatch[0].rm_so = OFF(begin);
356 v->pmatch[0].rm_eo = OFF(end);
357 if (v->g->cflags & REG_EXPECT)
358 {
359 if (cold != NULL)
360 v->details->rm_extend.rm_so = OFF(cold);
361 else
362 v->details->rm_extend.rm_so = OFF(v->stop);
363 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
364 }
365 if (v->nmatch == 1) /* no need for submatches */
366 return REG_OKAY;
367
368 /* submatches */
369 zapsubs(v->pmatch, v->nmatch);
370 return dissect(v, v->g->tree, begin, end);
371 }
372
373 /*
374 * cfind - find a match for the main NFA (with complications)
375 */
376 static int
377 cfind(struct vars * v,
378 struct cnfa * cnfa,
379 struct colormap * cm)
380 {
381 struct dfa *s;
382 struct dfa *d;
383 chr *cold;
384 int ret;
385
386 s = newdfa(v, &v->g->search, cm, &v->dfa1);
387 NOERR();
388 d = newdfa(v, cnfa, cm, &v->dfa2);
389 if (ISERR())
390 {
391 assert(d == NULL);
392 freedfa(s);
393 return v->err;
394 }
395
396 ret = cfindloop(v, cnfa, cm, d, s, &cold);
397
398 freedfa(d);
399 freedfa(s);
400 NOERR();
401 if (v->g->cflags & REG_EXPECT)
402 {
403 assert(v->details != NULL);
404 if (cold != NULL)
405 v->details->rm_extend.rm_so = OFF(cold);
406 else
407 v->details->rm_extend.rm_so = OFF(v->stop);
408 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
409 }
410 return ret;
411 }
412
413 /*
414 * cfindloop - the heart of cfind
415 */
416 static int
417 cfindloop(struct vars * v,
418 struct cnfa * cnfa,
419 struct colormap * cm,
420 struct dfa * d,
421 struct dfa * s,
422 chr **coldp) /* where to put coldstart pointer */
423 {
424 chr *begin;
425 chr *end;
426 chr *cold;
427 chr *open; /* open and close of range of possible
428 * starts */
429 chr *close;
430 chr *estart;
431 chr *estop;
432 int er;
433 int shorter = v->g->tree->flags & SHORTER;
434 int hitend;
435
436 assert(d != NULL && s != NULL);
437 cold = NULL;
438 close = v->start;
439 do
440 {
441 MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
442 close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
443 if (close == NULL)
444 break; /* NOTE BREAK */
445 assert(cold != NULL);
446 open = cold;
447 cold = NULL;
448 MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
449 for (begin = open; begin <= close; begin++)
450 {
451 MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
452 estart = begin;
453 estop = v->stop;
454 for (;;)
455 {
456 if (shorter)
457 end = shortest(v, d, begin, estart,
458 estop, (chr **) NULL, &hitend);
459 else
460 end = longest(v, d, begin, estop,
461 &hitend);
462 if (hitend && cold == NULL)
463 cold = begin;
464 if (end == NULL)
465 break; /* NOTE BREAK OUT */
466 MDEBUG(("tentative end %ld\n", LOFF(end)));
467 zapsubs(v->pmatch, v->nmatch);
468 zapmem(v, v->g->tree);
469 er = cdissect(v, v->g->tree, begin, end);
470 if (er == REG_OKAY)
471 {
472 if (v->nmatch > 0)
473 {
474 v->pmatch[0].rm_so = OFF(begin);
475 v->pmatch[0].rm_eo = OFF(end);
476 }
477 *coldp = cold;
478 return REG_OKAY;
479 }
480 if (er != REG_NOMATCH)
481 {
482 ERR(er);
483 return er;
484 }
485 if ((shorter) ? end == estop : end == begin)
486 {
487 /* no point in trying again */
488 *coldp = cold;
489 return REG_NOMATCH;
490 }
491 /* go around and try again */
492 if (shorter)
493 estart = end + 1;
494 else
495 estop = end - 1;
496 }
497 }
498 } while (close < v->stop);
499
500 *coldp = cold;
501 return REG_NOMATCH;
502 }
503
504 /*
505 * zapsubs - initialize the subexpression matches to "no match"
506 */
507 static void
508 zapsubs(regmatch_t *p,
509 size_t n)
510 {
511 size_t i;
512
513 for (i = n - 1; i > 0; i--)
514 {
515 p[i].rm_so = -1;
516 p[i].rm_eo = -1;
517 }
518 }
519
520 /*
521 * zapmem - initialize the retry memory of a subtree to zeros
522 */
523 static void
524 zapmem(struct vars * v,
525 struct subre * t)
526 {
527 if (t == NULL)
528 return;
529
530 assert(v->mem != NULL);
531 v->mem[t->retry] = 0;
532 if (t->op == '(')
533 {
534 assert(t->subno > 0);
535 v->pmatch[t->subno].rm_so = -1;
536 v->pmatch[t->subno].rm_eo = -1;
537 }
538
539 if (t->left != NULL)
540 zapmem(v, t->left);
541 if (t->right != NULL)
542 zapmem(v, t->right);
543 }
544
545 /*
546 * subset - set any subexpression relevant to a successful subre
547 */
548 static void
549 subset(struct vars * v,
550 struct subre * sub,
551 chr *begin,
552 chr *end)
553 {
554 int n = sub->subno;
555
556 assert(n > 0);
557 if ((size_t) n >= v->nmatch)
558 return;
559
560 MDEBUG(("setting %d\n", n));
561 v->pmatch[n].rm_so = OFF(begin);
562 v->pmatch[n].rm_eo = OFF(end);
563 }
564
565 /*
566 * dissect - determine subexpression matches (uncomplicated case)
567 */
568 static int /* regexec return code */
569 dissect(struct vars * v,
570 struct subre * t,
571 chr *begin, /* beginning of relevant substring */
572 chr *end) /* end of same */
573 {
574 assert(t != NULL);
575 MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));
576
577 switch (t->op)
578 {
579 case '=': /* terminal node */
580 assert(t->left == NULL && t->right == NULL);
581 return REG_OKAY; /* no action, parent did the work */
582 break;
583 case '|': /* alternation */
584 assert(t->left != NULL);
585 return altdissect(v, t, begin, end);
586 break;
587 case 'b': /* back ref -- shouldn't be calling us! */
588 return REG_ASSERT;
589 break;
590 case '.': /* concatenation */
591 assert(t->left != NULL && t->right != NULL);
592 return condissect(v, t, begin, end);
593 break;
594 case '(': /* capturing */
595 assert(t->left != NULL && t->right == NULL);
596 assert(t->subno > 0);
597 subset(v, t, begin, end);
598 return dissect(v, t->left, begin, end);
599 break;
600 default:
601 return REG_ASSERT;
602 break;
603 }
604 }
605
606 /*
607 * condissect - determine concatenation subexpression matches (uncomplicated)
608 */
609 static int /* regexec return code */
610 condissect(struct vars * v,
611 struct subre * t,
612 chr *begin, /* beginning of relevant substring */
613 chr *end) /* end of same */
614 {
615 struct dfa *d;
616 struct dfa *d2;
617 chr *mid;
618 int i;
619 int shorter = (t->left->flags & SHORTER) ? 1 : 0;
620 chr *stop = (shorter) ? end : begin;
621
622 assert(t->op == '.');
623 assert(t->left != NULL && t->left->cnfa.nstates > 0);
624 assert(t->right != NULL && t->right->cnfa.nstates > 0);
625
626 d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
627 NOERR();
628 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
629 if (ISERR())
630 {
631 assert(d2 == NULL);
632 freedfa(d);
633 return v->err;
634 }
635
636 /* pick a tentative midpoint */
637 if (shorter)
638 mid = shortest(v, d, begin, begin, end, (chr **) NULL,
639 (int *) NULL);
640 else
641 mid = longest(v, d, begin, end, (int *) NULL);
642 if (mid == NULL)
643 {
644 freedfa(d);
645 freedfa(d2);
646 return REG_ASSERT;
647 }
648 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
649
650 /* iterate until satisfaction or failure */
651 while (longest(v, d2, mid, end, (int *) NULL) != end)
652 {
653 /* that midpoint didn't work, find a new one */
654 if (mid == stop)
655 {
656 /* all possibilities exhausted! */
657 MDEBUG(("no midpoint!\n"));
658 freedfa(d);
659 freedfa(d2);
660 return REG_ASSERT;
661 }
662 if (shorter)
663 mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL,
664 (int *) NULL);
665 else
666 mid = longest(v, d, begin, mid - 1, (int *) NULL);
667 if (mid == NULL)
668 {
669 /* failed to find a new one! */
670 MDEBUG(("failed midpoint!\n"));
671 freedfa(d);
672 freedfa(d2);
673 return REG_ASSERT;
674 }
675 MDEBUG(("new midpoint %ld\n", LOFF(mid)));
676 }
677
678 /* satisfaction */
679 MDEBUG(("successful\n"));
680 freedfa(d);
681 freedfa(d2);
682 i = dissect(v, t->left, begin, mid);
683 if (i != REG_OKAY)
684 return i;
685 return dissect(v, t->right, mid, end);
686 }
687
688 /*
689 * altdissect - determine alternative subexpression matches (uncomplicated)
690 */
691 static int /* regexec return code */
692 altdissect(struct vars * v,
693 struct subre * t,
694 chr *begin, /* beginning of relevant substring */
695 chr *end) /* end of same */
696 {
697 struct dfa *d;
698 int i;
699
700 assert(t != NULL);
701 assert(t->op == '|');
702
703 for (i = 0; t != NULL; t = t->right, i++)
704 {
705 MDEBUG(("trying %dth\n", i));
706 assert(t->left != NULL && t->left->cnfa.nstates > 0);
707 d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
708 if (ISERR())
709 return v->err;
710 if (longest(v, d, begin, end, (int *) NULL) == end)
711 {
712 MDEBUG(("success\n"));
713 freedfa(d);
714 return dissect(v, t->left, begin, end);
715 }
716 freedfa(d);
717 }
718 return REG_ASSERT; /* none of them matched?!? */
719 }
720
721 /*
722 * cdissect - determine subexpression matches (with complications)
723 * The retry memory stores the offset of the trial midpoint from begin,
724 * plus 1 so that 0 uniquely means "clean slate".
725 */
726 static int /* regexec return code */
727 cdissect(struct vars * v,
728 struct subre * t,
729 chr *begin, /* beginning of relevant substring */
730 chr *end) /* end of same */
731 {
732 int er;
733
734 assert(t != NULL);
735 MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
736
737 switch (t->op)
738 {
739 case '=': /* terminal node */
740 assert(t->left == NULL && t->right == NULL);
741 return REG_OKAY; /* no action, parent did the work */
742 break;
743 case '|': /* alternation */
744 assert(t->left != NULL);
745 return caltdissect(v, t, begin, end);
746 break;
747 case 'b': /* back ref -- shouldn't be calling us! */
748 assert(t->left == NULL && t->right == NULL);
749 return cbrdissect(v, t, begin, end);
750 break;
751 case '.': /* concatenation */
752 assert(t->left != NULL && t->right != NULL);
753 return ccondissect(v, t, begin, end);
754 break;
755 case '(': /* capturing */
756 assert(t->left != NULL && t->right == NULL);
757 assert(t->subno > 0);
758 er = cdissect(v, t->left, begin, end);
759 if (er == REG_OKAY)
760 subset(v, t, begin, end);
761 return er;
762 break;
763 default:
764 return REG_ASSERT;
765 break;
766 }
767 }
768
769 /*
770 * ccondissect - concatenation subexpression matches (with complications)
771 * The retry memory stores the offset of the trial midpoint from begin,
772 * plus 1 so that 0 uniquely means "clean slate".
773 */
774 static int /* regexec return code */
775 ccondissect(struct vars * v,
776 struct subre * t,
777 chr *begin, /* beginning of relevant substring */
778 chr *end) /* end of same */
779 {
780 struct dfa *d;
781 struct dfa *d2;
782 chr *mid;
783 int er;
784
785 assert(t->op == '.');
786 assert(t->left != NULL && t->left->cnfa.nstates > 0);
787 assert(t->right != NULL && t->right->cnfa.nstates > 0);
788
789 if (t->left->flags & SHORTER) /* reverse scan */
790 return crevdissect(v, t, begin, end);
791
792 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
793 if (ISERR())
794 return v->err;
795 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
796 if (ISERR())
797 {
798 freedfa(d);
799 return v->err;
800 }
801 MDEBUG(("cconcat %d\n", t->retry));
802
803 /* pick a tentative midpoint */
804 if (v->mem[t->retry] == 0)
805 {
806 mid = longest(v, d, begin, end, (int *) NULL);
807 if (mid == NULL)
808 {
809 freedfa(d);
810 freedfa(d2);
811 return REG_NOMATCH;
812 }
813 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
814 v->mem[t->retry] = (mid - begin) + 1;
815 }
816 else
817 {
818 mid = begin + (v->mem[t->retry] - 1);
819 MDEBUG(("working midpoint %ld\n", LOFF(mid)));
820 }
821
822 /* iterate until satisfaction or failure */
823 for (;;)
824 {
825 /* try this midpoint on for size */
826 er = cdissect(v, t->left, begin, mid);
827 if (er == REG_OKAY &&
828 longest(v, d2, mid, end, (int *) NULL) == end &&
829 (er = cdissect(v, t->right, mid, end)) ==
830 REG_OKAY)
831 break; /* NOTE BREAK OUT */
832 if (er != REG_OKAY && er != REG_NOMATCH)
833 {
834 freedfa(d);
835 freedfa(d2);
836 return er;
837 }
838
839 /* that midpoint didn't work, find a new one */
840 if (mid == begin)
841 {
842 /* all possibilities exhausted */
843 MDEBUG(("%d no midpoint\n", t->retry));
844 freedfa(d);
845 freedfa(d2);
846 return REG_NOMATCH;
847 }
848 mid = longest(v, d, begin, mid - 1, (int *) NULL);
849 if (mid == NULL)
850 {
851 /* failed to find a new one */
852 MDEBUG(("%d failed midpoint\n", t->retry));
853 freedfa(d);
854 freedfa(d2);
855 return REG_NOMATCH;
856 }
857 MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
858 v->mem[t->retry] = (mid - begin) + 1;
859 zapmem(v, t->left);
860 zapmem(v, t->right);
861 }
862
863 /* satisfaction */
864 MDEBUG(("successful\n"));
865 freedfa(d);
866 freedfa(d2);
867 return REG_OKAY;
868 }
869
870 /*
871 * crevdissect - determine backref shortest-first subexpression matches
872 * The retry memory stores the offset of the trial midpoint from begin,
873 * plus 1 so that 0 uniquely means "clean slate".
874 */
875 static int /* regexec return code */
876 crevdissect(struct vars * v,
877 struct subre * t,
878 chr *begin, /* beginning of relevant substring */
879 chr *end) /* end of same */
880 {
881 struct dfa *d;
882 struct dfa *d2;
883 chr *mid;
884 int er;
885
886 assert(t->op == '.');
887 assert(t->left != NULL && t->left->cnfa.nstates > 0);
888 assert(t->right != NULL && t->right->cnfa.nstates > 0);
889 assert(t->left->flags & SHORTER);
890
891 /* concatenation -- need to split the substring between parts */
892 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
893 if (ISERR())
894 return v->err;
895 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
896 if (ISERR())
897 {
898 freedfa(d);
899 return v->err;
900 }
901 MDEBUG(("crev %d\n", t->retry));
902
903 /* pick a tentative midpoint */
904 if (v->mem[t->retry] == 0)
905 {
906 mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
907 if (mid == NULL)
908 {
909 freedfa(d);
910 freedfa(d2);
911 return REG_NOMATCH;
912 }
913 MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
914 v->mem[t->retry] = (mid - begin) + 1;
915 }
916 else
917 {
918 mid = begin + (v->mem[t->retry] - 1);
919 MDEBUG(("working midpoint %ld\n", LOFF(mid)));
920 }
921
922 /* iterate until satisfaction or failure */
923 for (;;)
924 {
925 /* try this midpoint on for size */
926 er = cdissect(v, t->left, begin, mid);
927 if (er == REG_OKAY &&
928 longest(v, d2, mid, end, (int *) NULL) == end &&
929 (er = cdissect(v, t->right, mid, end)) ==
930 REG_OKAY)
931 break; /* NOTE BREAK OUT */
932 if (er != REG_OKAY && er != REG_NOMATCH)
933 {
934 freedfa(d);
935 freedfa(d2);
936 return er;
937 }
938
939 /* that midpoint didn't work, find a new one */
940 if (mid == end)
941 {
942 /* all possibilities exhausted */
943 MDEBUG(("%d no midpoint\n", t->retry));
944 freedfa(d);
945 freedfa(d2);
946 return REG_NOMATCH;
947 }
948 mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
949 if (mid == NULL)
950 {
951 /* failed to find a new one */
952 MDEBUG(("%d failed midpoint\n", t->retry));
953 freedfa(d);
954 freedfa(d2);
955 return REG_NOMATCH;
956 }
957 MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
958 v->mem[t->retry] = (mid - begin) + 1;
959 zapmem(v, t->left);
960 zapmem(v, t->right);
961 }
962
963 /* satisfaction */
964 MDEBUG(("successful\n"));
965 freedfa(d);
966 freedfa(d2);
967 return REG_OKAY;
968 }
969
970 /*
971 * cbrdissect - determine backref subexpression matches
972 */
973 static int /* regexec return code */
974 cbrdissect(struct vars * v,
975 struct subre * t,
976 chr *begin, /* beginning of relevant substring */
977 chr *end) /* end of same */
978 {
979 int i;
980 int n = t->subno;
981 size_t len;
982 chr *paren;
983 chr *p;
984 chr *stop;
985 int min = t->min;
986 int max = t->max;
987
988 assert(t != NULL);
989 assert(t->op == 'b');
990 assert(n >= 0);
991 assert((size_t) n < v->nmatch);
992
993 MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
994
995 if (v->pmatch[n].rm_so == -1)
996 return REG_NOMATCH;
997 paren = v->start + v->pmatch[n].rm_so;
998 len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
999
1000 /* no room to maneuver -- retries are pointless */
1001 if (v->mem[t->retry])
1002 return REG_NOMATCH;
1003 v->mem[t->retry] = 1;
1004
1005 /* special-case zero-length string */
1006 if (len == 0)
1007 {
1008 if (begin == end)
1009 return REG_OKAY;
1010 return REG_NOMATCH;
1011 }
1012
1013 /* and too-short string */
1014 assert(end >= begin);
1015 if ((size_t) (end - begin) < len)
1016 return REG_NOMATCH;
1017 stop = end - len;
1018
1019 /* count occurrences */
1020 i = 0;
1021 for (p = begin; p <= stop && (i < max || max == INFINITY); p += len)
1022 {
1023 if ((*v->g->compare) (paren, p, len) != 0)
1024 break;
1025 i++;
1026 }
1027 MDEBUG(("cbackref found %d\n", i));
1028
1029 /* and sort it out */
1030 if (p != end) /* didn't consume all of it */
1031 return REG_NOMATCH;
1032 if (min <= i && (i <= max || max == INFINITY))
1033 return REG_OKAY;
1034 return REG_NOMATCH; /* out of range */
1035 }
1036
1037 /*
1038 * caltdissect - determine alternative subexpression matches (w. complications)
1039 */
1040 static int /* regexec return code */
1041 caltdissect(struct vars * v,
1042 struct subre * t,
1043 chr *begin, /* beginning of relevant substring */
1044 chr *end) /* end of same */
1045 {
1046 struct dfa *d;
1047 int er;
1048
1049 #define UNTRIED 0 /* not yet tried at all */
1050 #define TRYING 1 /* top matched, trying submatches */
1051 #define TRIED 2 /* top didn't match or submatches
1052 * exhausted */
1053
1054 if (t == NULL)
1055 return REG_NOMATCH;
1056 assert(t->op == '|');
1057 if (v->mem[t->retry] == TRIED)
1058 return caltdissect(v, t->right, begin, end);
1059
1060 MDEBUG(("calt n%d\n", t->retry));
1061 assert(t->left != NULL);
1062
1063 if (v->mem[t->retry] == UNTRIED)
1064 {
1065 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
1066 if (ISERR())
1067 return v->err;
1068 if (longest(v, d, begin, end, (int *) NULL) != end)
1069 {
1070 freedfa(d);
1071 v->mem[t->retry] = TRIED;
1072 return caltdissect(v, t->right, begin, end);
1073 }
1074 freedfa(d);
1075 MDEBUG(("calt matched\n"));
1076 v->mem[t->retry] = TRYING;
1077 }
1078
1079 er = cdissect(v, t->left, begin, end);
1080 if (er != REG_NOMATCH)
1081 return er;
1082
1083 v->mem[t->retry] = TRIED;
1084 return caltdissect(v, t->right, begin, end);
1085 }
1086
1087
1088
1089 #include "rege_dfa.c"