]>
Commit | Line | Data |
---|---|---|
224c7076 A |
1 | /*- |
2 | * Copyright (c) 1992, 1993, 1994 Henry Spencer. | |
3 | * Copyright (c) 1992, 1993, 1994 | |
4 | * The Regents of the University of California. All rights reserved. | |
5 | * | |
6 | * This code is derived from software contributed to Berkeley by | |
7 | * Henry Spencer. | |
8 | * | |
9 | * Redistribution and use in source and binary forms, with or without | |
10 | * modification, are permitted provided that the following conditions | |
11 | * are met: | |
12 | * 1. Redistributions of source code must retain the above copyright | |
13 | * notice, this list of conditions and the following disclaimer. | |
14 | * 2. Redistributions in binary form must reproduce the above copyright | |
15 | * notice, this list of conditions and the following disclaimer in the | |
16 | * documentation and/or other materials provided with the distribution. | |
17 | * 3. All advertising materials mentioning features or use of this software | |
18 | * must display the following acknowledgement: | |
19 | * This product includes software developed by the University of | |
20 | * California, Berkeley and its contributors. | |
21 | * 4. Neither the name of the University nor the names of its contributors | |
22 | * may be used to endorse or promote products derived from this software | |
23 | * without specific prior written permission. | |
24 | * | |
25 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
26 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
27 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
29 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
30 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
31 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
32 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
33 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
34 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
35 | * SUCH DAMAGE. | |
36 | * | |
37 | * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 | |
38 | */ | |
39 | ||
40 | #if defined(LIBC_SCCS) && !defined(lint) | |
41 | static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; | |
42 | #endif /* LIBC_SCCS and not lint */ | |
43 | #include <sys/cdefs.h> | |
44 | __FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.34 2004/10/03 15:42:59 stefanf Exp $"); | |
45 | ||
46 | #include "xlocale_private.h" | |
47 | ||
48 | #include <sys/types.h> | |
49 | #include <stdio.h> | |
50 | #include <string.h> | |
51 | #include <ctype.h> | |
52 | #include <limits.h> | |
53 | #include <stdlib.h> | |
54 | #include <regex.h> | |
55 | #include <runetype.h> | |
56 | #include <wchar.h> | |
57 | #include <wctype.h> | |
58 | ||
59 | #include "collate.h" | |
60 | ||
61 | #include "utils.h" | |
62 | #include "regex2.h" | |
63 | ||
64 | #include "cname.h" | |
65 | ||
66 | /* | |
67 | * parse structure, passed up and down to avoid global variables and | |
68 | * other clumsinesses | |
69 | */ | |
70 | struct parse { | |
71 | char *next; /* next character in RE */ | |
72 | char *end; /* end of string (-> NUL normally) */ | |
73 | int error; /* has an error been seen? */ | |
74 | sop *strip; /* malloced strip */ | |
75 | sopno ssize; /* malloced strip size (allocated) */ | |
76 | sopno slen; /* malloced strip length (used) */ | |
77 | int ncsalloc; /* number of csets allocated */ | |
78 | #if __DARWIN_UNIX03 | |
79 | int zerorepeats; | |
80 | #endif /* __DARWIN_UNIX03 */ | |
81 | struct re_guts *g; | |
82 | # define NPAREN 10 /* we need to remember () 1-9 for back refs */ | |
83 | sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ | |
84 | sopno pend[NPAREN]; /* -> ) ([0] unused) */ | |
85 | }; | |
86 | ||
87 | /* ========= begin header generated by ./mkh ========= */ | |
88 | #ifdef __cplusplus | |
89 | extern "C" { | |
90 | #endif | |
91 | ||
92 | /* === regcomp.c === */ | |
93 | static void p_ere(struct parse *p, wint_t stop); | |
94 | static void p_ere_exp(struct parse *p); | |
95 | static void p_str(struct parse *p); | |
96 | static void p_bre(struct parse *p, wint_t end1, wint_t end2); | |
97 | static int p_simp_re(struct parse *p, int starordinary); | |
98 | static int p_count(struct parse *p); | |
99 | static void p_bracket(struct parse *p); | |
100 | static void p_b_term(struct parse *p, cset *cs); | |
101 | static void p_b_cclass(struct parse *p, cset *cs); | |
102 | static void p_b_eclass(struct parse *p, cset *cs); | |
103 | static wint_t p_b_symbol(struct parse *p); | |
104 | static wint_t p_b_coll_elem(struct parse *p, wint_t endc); | |
105 | static wint_t othercase(wint_t ch, locale_t loc); | |
106 | static void bothcases(struct parse *p, wint_t ch); | |
107 | static void ordinary(struct parse *p, wint_t ch); | |
108 | static void nonnewline(struct parse *p); | |
109 | static void repeat(struct parse *p, sopno start, int from, int to); | |
110 | static int seterr(struct parse *p, int e); | |
111 | static cset *allocset(struct parse *p); | |
112 | static void freeset(struct parse *p, cset *cs); | |
113 | static void CHadd(struct parse *p, cset *cs, wint_t ch); | |
114 | static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); | |
115 | static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); | |
116 | static wint_t singleton(cset *cs, locale_t loc); | |
117 | static sopno dupl(struct parse *p, sopno start, sopno finish); | |
118 | static void doemit(struct parse *p, sop op, size_t opnd); | |
119 | static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); | |
120 | static void dofwd(struct parse *p, sopno pos, sop value); | |
121 | static void enlarge(struct parse *p, sopno size); | |
122 | static void stripsnug(struct parse *p, struct re_guts *g); | |
123 | static void findmust(struct parse *p, struct re_guts *g); | |
124 | static int altoffset(sop *scan, int offset); | |
125 | static void computejumps(struct parse *p, struct re_guts *g); | |
126 | static void computematchjumps(struct parse *p, struct re_guts *g); | |
127 | static sopno pluscount(struct parse *p, struct re_guts *g); | |
128 | static wint_t wgetnext(struct parse *p); | |
129 | ||
130 | #ifdef __cplusplus | |
131 | } | |
132 | #endif | |
133 | /* ========= end header generated by ./mkh ========= */ | |
134 | ||
135 | static char nuls[10]; /* place to point scanner in event of error */ | |
136 | ||
137 | /* | |
138 | * macros for use with parse structure | |
139 | * BEWARE: these know that the parse structure is named `p' !!! | |
140 | */ | |
141 | #define PEEK() (*p->next) | |
142 | #define PEEK2() (*(p->next+1)) | |
143 | #define MORE() (p->next < p->end) | |
144 | #define MORE2() (p->next+1 < p->end) | |
145 | #define SEE(c) (MORE() && PEEK() == (c)) | |
146 | #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) | |
147 | #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) | |
148 | #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) | |
149 | #define NEXT() (p->next++) | |
150 | #define NEXT2() (p->next += 2) | |
151 | #define NEXTn(n) (p->next += (n)) | |
152 | #define GETNEXT() (*p->next++) | |
153 | #define WGETNEXT() wgetnext(p) | |
154 | #define SETERROR(e) seterr(p, (e)) | |
155 | #define REQUIRE(co, e) ((co) || SETERROR(e)) | |
156 | #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) | |
157 | #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) | |
158 | #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) | |
159 | #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) | |
160 | #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) | |
161 | #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) | |
162 | #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) | |
163 | #define HERE() (p->slen) | |
164 | #define THERE() (p->slen - 1) | |
165 | #define THERETHERE() (p->slen - 2) | |
166 | #define DROP(n) (p->slen -= (n)) | |
167 | ||
168 | #ifndef NDEBUG | |
169 | static int never = 0; /* for use in asserts; shuts lint up */ | |
170 | #else | |
171 | #define never 0 /* some <assert.h>s have bugs too */ | |
172 | #endif | |
173 | ||
174 | /* Macro used by computejump()/computematchjump() */ | |
175 | #define MIN(a,b) ((a)<(b)?(a):(b)) | |
176 | ||
177 | /* | |
178 | - regcomp - interface for parser and compilation | |
179 | = extern int regcomp(regex_t *, const char *, int); | |
180 | = #define REG_BASIC 0000 | |
181 | = #define REG_EXTENDED 0001 | |
182 | = #define REG_ICASE 0002 | |
183 | = #define REG_NOSUB 0004 | |
184 | = #define REG_NEWLINE 0010 | |
185 | = #define REG_NOSPEC 0020 | |
186 | = #define REG_PEND 0040 | |
187 | = #define REG_DUMP 0200 | |
188 | */ | |
189 | int /* 0 success, otherwise REG_something */ | |
190 | regcomp(preg, pattern, cflags) | |
191 | regex_t * __restrict preg; | |
192 | const char * __restrict pattern; | |
193 | int cflags; | |
194 | { | |
195 | struct parse pa; | |
196 | struct re_guts *g; | |
197 | struct parse *p = &pa; | |
198 | int i; | |
199 | size_t len; | |
200 | #ifdef REDEBUG | |
201 | # define GOODFLAGS(f) (f) | |
202 | #else | |
203 | # define GOODFLAGS(f) ((f)&~REG_DUMP) | |
204 | #endif | |
205 | ||
206 | cflags = GOODFLAGS(cflags); | |
207 | if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) | |
208 | return(REG_INVARG); | |
209 | ||
210 | if (cflags®_PEND) { | |
211 | if (preg->re_endp < pattern) | |
212 | return(REG_INVARG); | |
213 | len = preg->re_endp - pattern; | |
214 | } else | |
215 | len = strlen((char *)pattern); | |
216 | ||
217 | /* do the mallocs early so failure handling is easy */ | |
218 | g = (struct re_guts *)malloc(sizeof(struct re_guts)); | |
219 | if (g == NULL) | |
220 | return(REG_ESPACE); | |
221 | p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ | |
222 | p->strip = (sop *)malloc(p->ssize * sizeof(sop)); | |
223 | p->slen = 0; | |
224 | if (p->strip == NULL) { | |
225 | free((char *)g); | |
226 | return(REG_ESPACE); | |
227 | } | |
228 | ||
229 | /* set things up */ | |
230 | p->g = g; | |
231 | p->next = (char *)pattern; /* convenience; we do not modify it */ | |
232 | p->end = p->next + len; | |
233 | p->error = 0; | |
234 | p->ncsalloc = 0; | |
235 | #if __DARWIN_UNIX03 | |
236 | p->zerorepeats = 0; | |
237 | #endif /* __DARWIN_UNIX03 */ | |
238 | for (i = 0; i < NPAREN; i++) { | |
239 | p->pbegin[i] = 0; | |
240 | p->pend[i] = 0; | |
241 | } | |
242 | g->loc = __current_locale(); | |
243 | g->sets = NULL; | |
244 | g->ncsets = 0; | |
245 | g->cflags = cflags; | |
246 | g->iflags = 0; | |
247 | g->nbol = 0; | |
248 | g->neol = 0; | |
249 | g->must = NULL; | |
250 | g->moffset = -1; | |
251 | g->charjump = NULL; | |
252 | g->matchjump = NULL; | |
253 | g->mlen = 0; | |
254 | g->nsub = 0; | |
255 | g->backrefs = 0; | |
256 | ||
257 | /* do it */ | |
258 | EMIT(OEND, 0); | |
259 | g->firststate = THERE(); | |
260 | if (cflags®_EXTENDED) | |
261 | p_ere(p, OUT); | |
262 | else if (cflags®_NOSPEC) | |
263 | p_str(p); | |
264 | else | |
265 | p_bre(p, OUT, OUT); | |
266 | EMIT(OEND, 0); | |
267 | g->laststate = THERE(); | |
268 | ||
269 | /* tidy up loose ends and fill things in */ | |
270 | stripsnug(p, g); | |
271 | findmust(p, g); | |
272 | /* only use Boyer-Moore algorithm if the pattern is bigger | |
273 | * than three characters | |
274 | */ | |
275 | if(g->mlen > 3) { | |
276 | computejumps(p, g); | |
277 | computematchjumps(p, g); | |
278 | if(g->matchjump == NULL && g->charjump != NULL) { | |
279 | free(g->charjump); | |
280 | g->charjump = NULL; | |
281 | } | |
282 | } | |
283 | g->nplus = pluscount(p, g); | |
284 | g->magic = MAGIC2; | |
285 | preg->re_nsub = g->nsub; | |
286 | preg->re_g = g; | |
287 | preg->re_magic = MAGIC1; | |
288 | #ifndef REDEBUG | |
289 | /* not debugging, so can't rely on the assert() in regexec() */ | |
290 | if (g->iflags&BAD) | |
291 | SETERROR(REG_ASSERT); | |
292 | #endif | |
293 | ||
294 | /* win or lose, we're done */ | |
295 | if (p->error != 0) /* lose */ | |
296 | regfree(preg); | |
297 | return(p->error); | |
298 | } | |
299 | ||
300 | /* | |
301 | - p_ere - ERE parser top level, concatenation and alternation | |
302 | == static void p_ere(struct parse *p, int stop); | |
303 | */ | |
304 | static void | |
305 | p_ere(p, stop) | |
306 | struct parse *p; | |
307 | int stop; /* character this ERE should end at */ | |
308 | { | |
309 | char c; | |
310 | sopno prevback; | |
311 | sopno prevfwd; | |
312 | sopno conc; | |
313 | int first = 1; /* is this the first alternative? */ | |
314 | ||
315 | for (;;) { | |
316 | /* do a bunch of concatenated expressions */ | |
317 | conc = HERE(); | |
318 | while (MORE() && (c = PEEK()) != '|' && c != stop) | |
319 | p_ere_exp(p); | |
320 | #if __DARWIN_UNIX03 | |
321 | if (!p->zerorepeats) REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ | |
322 | else p->zerorepeats--; | |
323 | #else | |
324 | (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ | |
325 | #endif | |
326 | if (!EAT('|')) | |
327 | break; /* NOTE BREAK OUT */ | |
328 | ||
329 | if (first) { | |
330 | INSERT(OCH_, conc); /* offset is wrong */ | |
331 | prevfwd = conc; | |
332 | prevback = conc; | |
333 | first = 0; | |
334 | } | |
335 | ASTERN(OOR1, prevback); | |
336 | prevback = THERE(); | |
337 | AHEAD(prevfwd); /* fix previous offset */ | |
338 | prevfwd = HERE(); | |
339 | EMIT(OOR2, 0); /* offset is very wrong */ | |
340 | } | |
341 | ||
342 | if (!first) { /* tail-end fixups */ | |
343 | AHEAD(prevfwd); | |
344 | ASTERN(O_CH, prevback); | |
345 | } | |
346 | ||
347 | assert(!MORE() || SEE(stop)); | |
348 | } | |
349 | ||
350 | /* | |
351 | - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op | |
352 | == static void p_ere_exp(struct parse *p); | |
353 | */ | |
354 | static void | |
355 | p_ere_exp(p) | |
356 | struct parse *p; | |
357 | { | |
358 | char c; | |
359 | wint_t wc; | |
360 | sopno pos; | |
361 | int count; | |
362 | int count2; | |
363 | sopno subno; | |
364 | int wascaret = 0; | |
365 | ||
366 | assert(MORE()); /* caller should have ensured this */ | |
367 | c = GETNEXT(); | |
368 | ||
369 | pos = HERE(); | |
370 | switch (c) { | |
371 | case '(': | |
372 | (void)REQUIRE(MORE(), REG_EPAREN); | |
373 | p->g->nsub++; | |
374 | subno = p->g->nsub; | |
375 | if (subno < NPAREN) | |
376 | p->pbegin[subno] = HERE(); | |
377 | EMIT(OLPAREN, subno); | |
378 | if (!SEE(')')) | |
379 | p_ere(p, ')'); | |
380 | if (subno < NPAREN) { | |
381 | p->pend[subno] = HERE(); | |
382 | assert(p->pend[subno] != 0); | |
383 | } | |
384 | EMIT(ORPAREN, subno); | |
385 | (void)MUSTEAT(')', REG_EPAREN); | |
386 | break; | |
387 | #ifndef POSIX_MISTAKE | |
388 | case ')': /* happens only if no current unmatched ( */ | |
389 | /* | |
390 | * You may ask, why the ifndef? Because I didn't notice | |
391 | * this until slightly too late for 1003.2, and none of the | |
392 | * other 1003.2 regular-expression reviewers noticed it at | |
393 | * all. So an unmatched ) is legal POSIX, at least until | |
394 | * we can get it fixed. | |
395 | */ | |
396 | SETERROR(REG_EPAREN); | |
397 | break; | |
398 | #endif | |
399 | case '^': | |
400 | EMIT(OBOL, 0); | |
401 | p->g->iflags |= USEBOL; | |
402 | p->g->nbol++; | |
403 | wascaret = 1; | |
404 | break; | |
405 | case '$': | |
406 | EMIT(OEOL, 0); | |
407 | p->g->iflags |= USEEOL; | |
408 | p->g->neol++; | |
409 | break; | |
410 | case '|': | |
411 | SETERROR(REG_EMPTY); | |
412 | break; | |
413 | case '*': | |
414 | case '+': | |
415 | case '?': | |
416 | SETERROR(REG_BADRPT); | |
417 | break; | |
418 | case '.': | |
419 | if (p->g->cflags®_NEWLINE) | |
420 | nonnewline(p); | |
421 | else | |
422 | EMIT(OANY, 0); | |
423 | break; | |
424 | case '[': | |
425 | p_bracket(p); | |
426 | break; | |
427 | case '\\': | |
428 | (void)REQUIRE(MORE(), REG_EESCAPE); | |
429 | wc = WGETNEXT(); | |
430 | ordinary(p, wc); | |
431 | break; | |
432 | case '{': /* okay as ordinary except if digit follows */ | |
433 | (void)REQUIRE(!MORE() || !isdigit_l((uch)PEEK(), p->g->loc), REG_BADRPT); | |
434 | /* FALLTHROUGH */ | |
435 | default: | |
436 | p->next--; | |
437 | wc = WGETNEXT(); | |
438 | ordinary(p, wc); | |
439 | break; | |
440 | } | |
441 | ||
442 | if (!MORE()) | |
443 | return; | |
444 | c = PEEK(); | |
445 | /* we call { a repetition if followed by a digit */ | |
446 | if (!( c == '*' || c == '+' || c == '?' || | |
447 | (c == '{' && MORE2() && isdigit_l((uch)PEEK2(), p->g->loc)) )) | |
448 | return; /* no repetition, we're done */ | |
449 | NEXT(); | |
450 | ||
451 | (void)REQUIRE(!wascaret, REG_BADRPT); | |
452 | switch (c) { | |
453 | case '*': /* implemented as +? */ | |
454 | /* this case does not require the (y|) trick, noKLUDGE */ | |
455 | INSERT(OPLUS_, pos); | |
456 | ASTERN(O_PLUS, pos); | |
457 | INSERT(OQUEST_, pos); | |
458 | ASTERN(O_QUEST, pos); | |
459 | break; | |
460 | case '+': | |
461 | INSERT(OPLUS_, pos); | |
462 | ASTERN(O_PLUS, pos); | |
463 | break; | |
464 | case '?': | |
465 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | |
466 | INSERT(OCH_, pos); /* offset slightly wrong */ | |
467 | ASTERN(OOR1, pos); /* this one's right */ | |
468 | AHEAD(pos); /* fix the OCH_ */ | |
469 | EMIT(OOR2, 0); /* offset very wrong... */ | |
470 | AHEAD(THERE()); /* ...so fix it */ | |
471 | ASTERN(O_CH, THERETHERE()); | |
472 | break; | |
473 | case '{': | |
474 | count = p_count(p); | |
475 | if (EAT(',')) { | |
476 | if (isdigit_l((uch)PEEK(), p->g->loc)) { | |
477 | count2 = p_count(p); | |
478 | (void)REQUIRE(count <= count2, REG_BADBR); | |
479 | } else /* single number with comma */ | |
480 | count2 = INFINITY; | |
481 | } else /* just a single number */ | |
482 | count2 = count; | |
483 | repeat(p, pos, count, count2); | |
484 | if (!EAT('}')) { /* error heuristics */ | |
485 | while (MORE() && PEEK() != '}') | |
486 | NEXT(); | |
487 | (void)REQUIRE(MORE(), REG_EBRACE); | |
488 | SETERROR(REG_BADBR); | |
489 | } | |
490 | break; | |
491 | } | |
492 | ||
493 | if (!MORE()) | |
494 | return; | |
495 | c = PEEK(); | |
496 | if (!( c == '*' || c == '+' || c == '?' || | |
497 | (c == '{' && MORE2() && isdigit_l((uch)PEEK2(), p->g->loc)) ) ) | |
498 | return; | |
499 | SETERROR(REG_BADRPT); | |
500 | } | |
501 | ||
502 | /* | |
503 | - p_str - string (no metacharacters) "parser" | |
504 | == static void p_str(struct parse *p); | |
505 | */ | |
506 | static void | |
507 | p_str(p) | |
508 | struct parse *p; | |
509 | { | |
510 | #if __DARWIN_UNIX03 | |
511 | if (!p->zerorepeats) REQUIRE(MORE(), REG_EMPTY); | |
512 | else p->zerorepeats--; | |
513 | #else /* !__DARWIN_UNIX03 */ | |
514 | (void)REQUIRE(MORE(), REG_EMPTY); | |
515 | #endif /* __DARWIN_UNIX03 */ | |
516 | while (MORE()) | |
517 | ordinary(p, WGETNEXT()); | |
518 | } | |
519 | ||
520 | /* | |
521 | - p_bre - BRE parser top level, anchoring and concatenation | |
522 | == static void p_bre(struct parse *p, int end1, \ | |
523 | == int end2); | |
524 | * Giving end1 as OUT essentially eliminates the end1/end2 check. | |
525 | * | |
526 | * This implementation is a bit of a kludge, in that a trailing $ is first | |
527 | * taken as an ordinary character and then revised to be an anchor. | |
528 | * The amount of lookahead needed to avoid this kludge is excessive. | |
529 | */ | |
530 | static void | |
531 | p_bre(p, end1, end2) | |
532 | struct parse *p; | |
533 | int end1; /* first terminating character */ | |
534 | int end2; /* second terminating character */ | |
535 | { | |
536 | sopno start = HERE(); | |
537 | int first = 1; /* first subexpression? */ | |
538 | int wasdollar = 0; | |
539 | ||
540 | if (EAT('^')) { | |
541 | EMIT(OBOL, 0); | |
542 | p->g->iflags |= USEBOL; | |
543 | p->g->nbol++; | |
544 | } | |
545 | while (MORE() && !SEETWO(end1, end2)) { | |
546 | wasdollar = p_simp_re(p, first); | |
547 | first = 0; | |
548 | } | |
549 | if (wasdollar) { /* oops, that was a trailing anchor */ | |
550 | DROP(1); | |
551 | EMIT(OEOL, 0); | |
552 | p->g->iflags |= USEEOL; | |
553 | p->g->neol++; | |
554 | } | |
555 | #if __DARWIN_UNIX03 | |
556 | if (!p->zerorepeats) REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ | |
557 | else p->zerorepeats--; | |
558 | #else /* !__DARWIN_UNIX03 */ | |
559 | (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ | |
560 | #endif /* __DARWIN_UNIX03 */ | |
561 | } | |
562 | ||
563 | /* | |
564 | - p_simp_re - parse a simple RE, an atom possibly followed by a repetition | |
565 | == static int p_simp_re(struct parse *p, int starordinary); | |
566 | */ | |
567 | static int /* was the simple RE an unbackslashed $? */ | |
568 | p_simp_re(p, starordinary) | |
569 | struct parse *p; | |
570 | int starordinary; /* is a leading * an ordinary character? */ | |
571 | { | |
572 | int c; | |
573 | int count; | |
574 | int count2; | |
575 | sopno pos; | |
576 | int i; | |
577 | wint_t wc; | |
578 | sopno subno; | |
579 | # define BACKSL (1<<CHAR_BIT) | |
580 | ||
581 | pos = HERE(); /* repetion op, if any, covers from here */ | |
582 | ||
583 | assert(MORE()); /* caller should have ensured this */ | |
584 | c = GETNEXT(); | |
585 | if (c == '\\') { | |
586 | (void)REQUIRE(MORE(), REG_EESCAPE); | |
587 | c = BACKSL | GETNEXT(); | |
588 | } | |
589 | switch (c) { | |
590 | case '.': | |
591 | if (p->g->cflags®_NEWLINE) | |
592 | nonnewline(p); | |
593 | else | |
594 | EMIT(OANY, 0); | |
595 | break; | |
596 | case '[': | |
597 | p_bracket(p); | |
598 | break; | |
599 | case BACKSL|'{': | |
600 | SETERROR(REG_BADRPT); | |
601 | break; | |
602 | case BACKSL|'(': | |
603 | p->g->nsub++; | |
604 | subno = p->g->nsub; | |
605 | if (subno < NPAREN) | |
606 | p->pbegin[subno] = HERE(); | |
607 | EMIT(OLPAREN, subno); | |
608 | /* the MORE here is an error heuristic */ | |
609 | if (MORE() && !SEETWO('\\', ')')) | |
610 | p_bre(p, '\\', ')'); | |
611 | if (subno < NPAREN) { | |
612 | p->pend[subno] = HERE(); | |
613 | assert(p->pend[subno] != 0); | |
614 | } | |
615 | EMIT(ORPAREN, subno); | |
616 | (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); | |
617 | break; | |
618 | case BACKSL|')': /* should not get here -- must be user */ | |
619 | case BACKSL|'}': | |
620 | SETERROR(REG_EPAREN); | |
621 | break; | |
622 | case BACKSL|'1': | |
623 | case BACKSL|'2': | |
624 | case BACKSL|'3': | |
625 | case BACKSL|'4': | |
626 | case BACKSL|'5': | |
627 | case BACKSL|'6': | |
628 | case BACKSL|'7': | |
629 | case BACKSL|'8': | |
630 | case BACKSL|'9': | |
631 | i = (c&~BACKSL) - '0'; | |
632 | assert(i < NPAREN); | |
633 | if (p->pend[i] != 0) { | |
634 | #if __DARWIN_UNIX03 | |
635 | int skip = 1; | |
636 | #endif /* __DARWIN_UNIX03 */ | |
637 | assert(i <= p->g->nsub); | |
638 | EMIT(OBACK_, i); | |
639 | assert(p->pbegin[i] != 0); | |
640 | assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); | |
641 | assert(OP(p->strip[p->pend[i]]) == ORPAREN); | |
642 | #if __DARWIN_UNIX03 | |
643 | if (OP(p->strip[p->pbegin[i]+skip]) == OBOL) { | |
644 | skip++; /* don't dup anchor in subexp */ | |
645 | } | |
646 | (void) dupl(p, p->pbegin[i]+skip, p->pend[i]); | |
647 | #else /* !__DARWIN_UNIX03 */ | |
648 | (void) dupl(p, p->pbegin[i]+1, p->pend[i]); | |
649 | #endif /* __DARWIN_UNIX03 */ | |
650 | EMIT(O_BACK, i); | |
651 | } else | |
652 | SETERROR(REG_ESUBREG); | |
653 | p->g->backrefs = 1; | |
654 | break; | |
655 | case '*': | |
656 | (void)REQUIRE(starordinary, REG_BADRPT); | |
657 | /* FALLTHROUGH */ | |
658 | default: | |
659 | p->next--; | |
660 | wc = WGETNEXT(); | |
661 | ordinary(p, wc); | |
662 | break; | |
663 | } | |
664 | ||
665 | if (EAT('*')) { /* implemented as +? */ | |
666 | /* this case does not require the (y|) trick, noKLUDGE */ | |
667 | INSERT(OPLUS_, pos); | |
668 | ASTERN(O_PLUS, pos); | |
669 | INSERT(OQUEST_, pos); | |
670 | ASTERN(O_QUEST, pos); | |
671 | } else if (EATTWO('\\', '{')) { | |
672 | (void)REQUIRE(MORE(), REG_EBRACE); | |
673 | count = p_count(p); | |
674 | if (EAT(',')) { | |
675 | if (MORE() && isdigit_l((uch)PEEK(), p->g->loc)) { | |
676 | count2 = p_count(p); | |
677 | (void)REQUIRE(count <= count2, REG_BADBR); | |
678 | } else /* single number with comma */ | |
679 | count2 = INFINITY; | |
680 | } else /* just a single number */ | |
681 | count2 = count; | |
682 | repeat(p, pos, count, count2); | |
683 | if (!EATTWO('\\', '}')) { /* error heuristics */ | |
684 | while (MORE() && !SEETWO('\\', '}')) | |
685 | NEXT(); | |
686 | (void)REQUIRE(MORE(), REG_EBRACE); | |
687 | SETERROR(REG_BADBR); | |
688 | } | |
689 | } else if (c == '$') /* $ (but not \$) ends it */ | |
690 | return(1); | |
691 | ||
692 | return(0); | |
693 | } | |
694 | ||
695 | /* | |
696 | - p_count - parse a repetition count | |
697 | == static int p_count(struct parse *p); | |
698 | */ | |
699 | static int /* the value */ | |
700 | p_count(p) | |
701 | struct parse *p; | |
702 | { | |
703 | int count = 0; | |
704 | int ndigits = 0; | |
705 | ||
706 | while (MORE() && isdigit_l((uch)PEEK(), p->g->loc) && count <= DUPMAX) { | |
707 | count = count*10 + (GETNEXT() - '0'); | |
708 | ndigits++; | |
709 | } | |
710 | ||
711 | (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); | |
712 | return(count); | |
713 | } | |
714 | ||
715 | /* | |
716 | - p_bracket - parse a bracketed character list | |
717 | == static void p_bracket(struct parse *p); | |
718 | */ | |
719 | static void | |
720 | p_bracket(p) | |
721 | struct parse *p; | |
722 | { | |
723 | cset *cs; | |
724 | wint_t ch; | |
725 | ||
726 | /* Dept of Truly Sickening Special-Case Kludges */ | |
727 | if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { | |
728 | EMIT(OBOW, 0); | |
729 | NEXTn(6); | |
730 | return; | |
731 | } | |
732 | if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { | |
733 | EMIT(OEOW, 0); | |
734 | NEXTn(6); | |
735 | return; | |
736 | } | |
737 | ||
738 | if ((cs = allocset(p)) == NULL) | |
739 | return; | |
740 | ||
741 | if (p->g->cflags®_ICASE) | |
742 | cs->icase = 1; | |
743 | if (EAT('^')) | |
744 | cs->invert = 1; | |
745 | #if __DARWIN_UNIX03 | |
746 | if (PEEK2() != '-' && PEEK2() != ']') { /* Don't eat '-' or ']' if they're part of ranges | |
747 | * but do process [^-] */ | |
748 | if (EAT(']')) | |
749 | CHadd(p, cs, ']'); | |
750 | else if (EAT('-')) | |
751 | CHadd(p, cs, '-'); | |
752 | } | |
753 | if (MORE() && !SEETWO('-',']')) /* Parse RE []-'] */ | |
754 | p_b_term(p, cs); | |
755 | #else /* !__DARWIN_UNIX03 */ | |
756 | if (EAT(']')) | |
757 | CHadd(p, cs, ']'); | |
758 | else if (EAT('-')) | |
759 | CHadd(p, cs, '-'); | |
760 | #endif /* __DARWIN_UNIX03 */ | |
761 | while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) | |
762 | p_b_term(p, cs); | |
763 | if (EAT('-')) | |
764 | CHadd(p, cs, '-'); | |
765 | (void)MUSTEAT(']', REG_EBRACK); | |
766 | ||
767 | if (p->error != 0) /* don't mess things up further */ | |
768 | return; | |
769 | ||
770 | if (cs->invert && p->g->cflags®_NEWLINE) | |
771 | cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); | |
772 | ||
773 | if ((ch = singleton(cs, p->g->loc)) != OUT) { /* optimize singleton sets */ | |
774 | ordinary(p, ch); | |
775 | freeset(p, cs); | |
776 | } else | |
777 | EMIT(OANYOF, (int)(cs - p->g->sets)); | |
778 | } | |
779 | ||
780 | /* | |
781 | - p_b_term - parse one term of a bracketed character list | |
782 | == static void p_b_term(struct parse *p, cset *cs); | |
783 | */ | |
784 | static void | |
785 | p_b_term(p, cs) | |
786 | struct parse *p; | |
787 | cset *cs; | |
788 | { | |
789 | char c; | |
790 | wint_t start, finish; | |
791 | wint_t i; | |
792 | ||
793 | /* classify what we've got */ | |
794 | switch ((MORE()) ? PEEK() : '\0') { | |
795 | case '[': | |
796 | c = (MORE2()) ? PEEK2() : '\0'; | |
797 | break; | |
798 | case '-': | |
799 | #if __DARWIN_UNIX03 | |
800 | if (PEEK2() != '-') { /* Allow [---] */ | |
801 | SETERROR(REG_ERANGE); | |
802 | return; /* NOTE RETURN */ | |
803 | } else | |
804 | c = '-'; | |
805 | #else /* !__DARWIN_UNIX03 */ | |
806 | SETERROR(REG_ERANGE); | |
807 | return; /* NOTE RETURN */ | |
808 | #endif /* __DARWIN_UNIX03 */ | |
809 | break; | |
810 | default: | |
811 | c = '\0'; | |
812 | break; | |
813 | } | |
814 | ||
815 | switch (c) { | |
816 | case ':': /* character class */ | |
817 | NEXT2(); | |
818 | (void)REQUIRE(MORE(), REG_EBRACK); | |
819 | c = PEEK(); | |
820 | (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); | |
821 | p_b_cclass(p, cs); | |
822 | (void)REQUIRE(MORE(), REG_EBRACK); | |
823 | (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); | |
824 | break; | |
825 | case '=': /* equivalence class */ | |
826 | NEXT2(); | |
827 | (void)REQUIRE(MORE(), REG_EBRACK); | |
828 | c = PEEK(); | |
829 | #if __DARWIN_UNIX03 | |
830 | REQUIRE(c != '-', REG_ECOLLATE); /* allow [=]=] */ | |
831 | #else /* !__DARWIN_UNIX03 */ | |
832 | (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); | |
833 | #endif /* __DARWIN_UNIX03 */ | |
834 | p_b_eclass(p, cs); | |
835 | (void)REQUIRE(MORE(), REG_EBRACK); | |
836 | (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); | |
837 | break; | |
838 | default: /* symbol, ordinary character, or range */ | |
839 | start = p_b_symbol(p); | |
840 | if (SEE('-') && MORE2() && PEEK2() != ']') { | |
841 | /* range */ | |
842 | NEXT(); | |
843 | if (EAT('-')) | |
844 | finish = '-'; | |
845 | else | |
846 | finish = p_b_symbol(p); | |
847 | } else | |
848 | finish = start; | |
849 | if (start == finish) | |
850 | CHadd(p, cs, start); | |
851 | else { | |
852 | if (p->g->loc->__collate_load_error) { | |
853 | (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); | |
854 | CHaddrange(p, cs, start, finish); | |
855 | } else { | |
856 | (void)REQUIRE(__collate_range_cmp(start, finish, p->g->loc) <= 0, REG_ERANGE); | |
857 | for (i = 0; i <= UCHAR_MAX; i++) { | |
858 | if ( __collate_range_cmp(start, i, p->g->loc) <= 0 | |
859 | && __collate_range_cmp(i, finish, p->g->loc) <= 0 | |
860 | ) | |
861 | CHadd(p, cs, i); | |
862 | } | |
863 | } | |
864 | } | |
865 | break; | |
866 | } | |
867 | } | |
868 | ||
869 | /* | |
870 | - p_b_cclass - parse a character-class name and deal with it | |
871 | == static void p_b_cclass(struct parse *p, cset *cs); | |
872 | */ | |
873 | static void | |
874 | p_b_cclass(p, cs) | |
875 | struct parse *p; | |
876 | cset *cs; | |
877 | { | |
878 | char *sp = p->next; | |
879 | size_t len; | |
880 | wctype_t wct; | |
881 | char clname[16]; | |
882 | ||
883 | while (MORE() && isalpha_l((uch)PEEK(), p->g->loc)) | |
884 | NEXT(); | |
885 | len = p->next - sp; | |
886 | if (len >= sizeof(clname) - 1) { | |
887 | SETERROR(REG_ECTYPE); | |
888 | return; | |
889 | } | |
890 | memcpy(clname, sp, len); | |
891 | clname[len] = '\0'; | |
892 | if ((wct = wctype_l(clname, p->g->loc)) == 0) { | |
893 | SETERROR(REG_ECTYPE); | |
894 | return; | |
895 | } | |
896 | CHaddtype(p, cs, wct); | |
897 | } | |
898 | ||
899 | /* | |
900 | - p_b_eclass - parse an equivalence-class name and deal with it | |
901 | == static void p_b_eclass(struct parse *p, cset *cs); | |
902 | */ | |
903 | static void | |
904 | p_b_eclass(p, cs) | |
905 | struct parse *p; | |
906 | cset *cs; | |
907 | { | |
908 | char *sp = p->next; | |
909 | int len, ec; | |
910 | mbstate_t mbs; | |
911 | int *newequiv_classes; | |
912 | wint_t c; | |
913 | ||
914 | while (MORE() && !SEETWO('=', ']')) | |
915 | NEXT(); | |
916 | if (!MORE()) { | |
917 | SETERROR(REG_EBRACK); | |
918 | return; | |
919 | } | |
920 | len = p->next - sp; | |
921 | memset(&mbs, 0, sizeof(mbs)); | |
922 | ec = __collate_equiv_class(sp, len, &mbs, p->g->loc); | |
923 | if (ec > 0) { | |
924 | newequiv_classes = realloc(cs->equiv_classes, | |
925 | (cs->nequiv_classes + 1) * sizeof(*cs->equiv_classes)); | |
926 | if (newequiv_classes == NULL) { | |
927 | SETERROR(REG_ESPACE); | |
928 | return; | |
929 | } | |
930 | cs->equiv_classes = newequiv_classes; | |
931 | cs->equiv_classes[cs->nequiv_classes++] = ec; | |
932 | return; | |
933 | } | |
934 | /* not an equivalence class, so fallback to a collating element */ | |
935 | p->next = sp; | |
936 | c = p_b_coll_elem(p, '='); | |
937 | CHadd(p, cs, c); | |
938 | } | |
939 | ||
940 | /* | |
941 | - p_b_symbol - parse a character or [..]ed multicharacter collating symbol | |
942 | == static char p_b_symbol(struct parse *p); | |
943 | */ | |
944 | static wint_t /* value of symbol */ | |
945 | p_b_symbol(p) | |
946 | struct parse *p; | |
947 | { | |
948 | wint_t value; | |
949 | ||
950 | (void)REQUIRE(MORE(), REG_EBRACK); | |
951 | if (!EATTWO('[', '.')) | |
952 | return(WGETNEXT()); | |
953 | ||
954 | /* collating symbol */ | |
955 | value = p_b_coll_elem(p, '.'); | |
956 | (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); | |
957 | return(value); | |
958 | } | |
959 | ||
960 | /* | |
961 | - p_b_coll_elem - parse a collating-element name and look it up | |
962 | == static char p_b_coll_elem(struct parse *p, int endc); | |
963 | */ | |
964 | static wint_t /* value of collating element */ | |
965 | p_b_coll_elem(p, endc) | |
966 | struct parse *p; | |
967 | wint_t endc; /* name ended by endc,']' */ | |
968 | { | |
969 | char *sp = p->next; | |
970 | struct cname *cp; | |
971 | int len; | |
972 | mbstate_t mbs; | |
973 | wchar_t wbuf[16]; | |
974 | size_t clen; | |
975 | ||
976 | while (MORE() && !SEETWO(endc, ']')) | |
977 | NEXT(); | |
978 | if (!MORE()) { | |
979 | SETERROR(REG_EBRACK); | |
980 | return(0); | |
981 | } | |
982 | len = p->next - sp; | |
983 | for (cp = cnames; cp->name != NULL; cp++) | |
984 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') | |
985 | return(cp->code); /* known name */ | |
986 | memset(&mbs, 0, sizeof(mbs)); | |
987 | clen = __collate_collating_symbol(wbuf, 16, sp, len, &mbs, p->g->loc); | |
988 | if (clen == 1) | |
989 | return (*wbuf); /* single character */ | |
990 | else if (clen == (size_t)-1) | |
991 | SETERROR(REG_ILLSEQ); | |
992 | else | |
993 | SETERROR(REG_ECOLLATE); /* neither */ | |
994 | return(0); | |
995 | } | |
996 | ||
997 | /* | |
998 | - othercase - return the case counterpart of an alphabetic | |
999 | == static char othercase(int ch, locale_t loc); | |
1000 | */ | |
1001 | static wint_t /* if no counterpart, return ch */ | |
1002 | othercase(ch, loc) | |
1003 | wint_t ch; | |
1004 | locale_t loc; | |
1005 | { | |
1006 | assert(iswalpha_l(ch, loc)); | |
1007 | if (iswupper_l(ch, loc)) | |
1008 | return(towlower_l(ch, loc)); | |
1009 | else if (iswlower_l(ch, loc)) | |
1010 | return(towupper_l(ch, loc)); | |
1011 | else /* peculiar, but could happen */ | |
1012 | return(ch); | |
1013 | } | |
1014 | ||
1015 | /* | |
1016 | - bothcases - emit a dualcase version of a two-case character | |
1017 | == static void bothcases(struct parse *p, int ch); | |
1018 | * | |
1019 | * Boy, is this implementation ever a kludge... | |
1020 | */ | |
1021 | static void | |
1022 | bothcases(p, ch) | |
1023 | struct parse *p; | |
1024 | wint_t ch; | |
1025 | { | |
1026 | char *oldnext = p->next; | |
1027 | char *oldend = p->end; | |
1028 | char bracket[3 + MB_LEN_MAX]; | |
1029 | size_t n; | |
1030 | mbstate_t mbs; | |
1031 | ||
1032 | assert(othercase(ch, p->g->loc) != ch); /* p_bracket() would recurse */ | |
1033 | p->next = bracket; | |
1034 | memset(&mbs, 0, sizeof(mbs)); | |
1035 | n = wcrtomb_l(bracket, ch, &mbs, p->g->loc); | |
1036 | assert(n != (size_t)-1); | |
1037 | bracket[n] = ']'; | |
1038 | bracket[n + 1] = '\0'; | |
1039 | p->end = bracket+n+1; | |
1040 | p_bracket(p); | |
1041 | assert(p->next == p->end); | |
1042 | p->next = oldnext; | |
1043 | p->end = oldend; | |
1044 | } | |
1045 | ||
1046 | /* | |
1047 | - ordinary - emit an ordinary character | |
1048 | == static void ordinary(struct parse *p, int ch); | |
1049 | */ | |
1050 | static void | |
1051 | ordinary(p, ch) | |
1052 | struct parse *p; | |
1053 | wint_t ch; | |
1054 | { | |
1055 | cset *cs; | |
1056 | ||
1057 | if ((p->g->cflags®_ICASE) && iswalpha_l(ch, p->g->loc) && othercase(ch, p->g->loc) != ch) | |
1058 | bothcases(p, ch); | |
1059 | else if ((ch & OPDMASK) == ch) | |
1060 | EMIT(OCHAR, ch); | |
1061 | else { | |
1062 | /* | |
1063 | * Kludge: character is too big to fit into an OCHAR operand. | |
1064 | * Emit a singleton set. | |
1065 | */ | |
1066 | if ((cs = allocset(p)) == NULL) | |
1067 | return; | |
1068 | CHadd(p, cs, ch); | |
1069 | EMIT(OANYOF, (int)(cs - p->g->sets)); | |
1070 | } | |
1071 | } | |
1072 | ||
1073 | /* | |
1074 | - nonnewline - emit REG_NEWLINE version of OANY | |
1075 | == static void nonnewline(struct parse *p); | |
1076 | * | |
1077 | * Boy, is this implementation ever a kludge... | |
1078 | */ | |
1079 | static void | |
1080 | nonnewline(p) | |
1081 | struct parse *p; | |
1082 | { | |
1083 | char *oldnext = p->next; | |
1084 | char *oldend = p->end; | |
1085 | char bracket[4]; | |
1086 | ||
1087 | p->next = bracket; | |
1088 | p->end = bracket+3; | |
1089 | bracket[0] = '^'; | |
1090 | bracket[1] = '\n'; | |
1091 | bracket[2] = ']'; | |
1092 | bracket[3] = '\0'; | |
1093 | p_bracket(p); | |
1094 | assert(p->next == bracket+3); | |
1095 | p->next = oldnext; | |
1096 | p->end = oldend; | |
1097 | } | |
1098 | ||
1099 | /* | |
1100 | - repeat - generate code for a bounded repetition, recursively if needed | |
1101 | == static void repeat(struct parse *p, sopno start, int from, int to); | |
1102 | */ | |
1103 | static void | |
1104 | repeat(p, start, from, to) | |
1105 | struct parse *p; | |
1106 | sopno start; /* operand from here to end of strip */ | |
1107 | int from; /* repeated from this number */ | |
1108 | int to; /* to this number of times (maybe INFINITY) */ | |
1109 | { | |
1110 | sopno finish = HERE(); | |
1111 | # define N 2 | |
1112 | # define INF 3 | |
1113 | # define REP(f, t) ((f)*8 + (t)) | |
1114 | # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) | |
1115 | sopno copy; | |
1116 | ||
1117 | if (p->error != 0) /* head off possible runaway recursion */ | |
1118 | return; | |
1119 | ||
1120 | assert(from <= to); | |
1121 | ||
1122 | switch (REP(MAP(from), MAP(to))) { | |
1123 | case REP(0, 0): /* must be user doing this */ | |
1124 | DROP(finish-start); /* drop the operand */ | |
1125 | #if __DARWIN_UNIX03 | |
1126 | p->zerorepeats++; | |
1127 | #endif /* __DARWIN_UNIX03 */ | |
1128 | break; | |
1129 | case REP(0, INF): /* as x{1,}? */ | |
1130 | #if __DARWIN_UNIX03 | |
1131 | /* this case does not require the (y|) trick, noKLUDGE */ | |
1132 | /* Just like * =+? */ | |
1133 | INSERT(OPLUS_, start); | |
1134 | ASTERN(O_PLUS, start); | |
1135 | INSERT(OQUEST_, start); | |
1136 | ASTERN(O_QUEST, start); | |
1137 | break; | |
1138 | #endif /* __DARWIN_UNIX03 */ | |
1139 | case REP(0, 1): /* as x{1,1}? */ | |
1140 | case REP(0, N): /* as x{1,n}? */ | |
1141 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | |
1142 | INSERT(OCH_, start); /* offset is wrong... */ | |
1143 | repeat(p, start+1, 1, to); | |
1144 | ASTERN(OOR1, start); | |
1145 | AHEAD(start); /* ... fix it */ | |
1146 | EMIT(OOR2, 0); | |
1147 | AHEAD(THERE()); | |
1148 | ASTERN(O_CH, THERETHERE()); | |
1149 | break; | |
1150 | case REP(1, 1): /* trivial case */ | |
1151 | /* done */ | |
1152 | break; | |
1153 | case REP(1, N): /* as x?x{1,n-1} */ | |
1154 | #if __DARWIN_UNIX03 | |
1155 | INSERT(OQUEST_, start); | |
1156 | ASTERN(O_QUEST, start); | |
1157 | #else /* !__DARWIN_UNIX03 */ | |
1158 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | |
1159 | INSERT(OCH_, start); | |
1160 | ASTERN(OOR1, start); | |
1161 | AHEAD(start); | |
1162 | EMIT(OOR2, 0); /* offset very wrong... */ | |
1163 | AHEAD(THERE()); /* ...so fix it */ | |
1164 | ASTERN(O_CH, THERETHERE()); | |
1165 | #endif /* __DARWIN_UNIX03 */ | |
1166 | copy = dupl(p, start+1, finish+1); | |
1167 | assert(copy == finish+4); | |
1168 | repeat(p, copy, 1, to-1); | |
1169 | break; | |
1170 | case REP(1, INF): /* as x+ */ | |
1171 | INSERT(OPLUS_, start); | |
1172 | ASTERN(O_PLUS, start); | |
1173 | break; | |
1174 | case REP(N, N): /* as xx{m-1,n-1} */ | |
1175 | copy = dupl(p, start, finish); | |
1176 | repeat(p, copy, from-1, to-1); | |
1177 | break; | |
1178 | case REP(N, INF): /* as xx{n-1,INF} */ | |
1179 | copy = dupl(p, start, finish); | |
1180 | repeat(p, copy, from-1, to); | |
1181 | break; | |
1182 | default: /* "can't happen" */ | |
1183 | SETERROR(REG_ASSERT); /* just in case */ | |
1184 | break; | |
1185 | } | |
1186 | } | |
1187 | ||
1188 | /* | |
1189 | - wgetnext - helper function for WGETNEXT() macro. Gets the next wide | |
1190 | - character from the parse struct, signals a REG_ILLSEQ error if the | |
1191 | - character can't be converted. Returns the number of bytes consumed. | |
1192 | */ | |
1193 | static wint_t | |
1194 | wgetnext(p) | |
1195 | struct parse *p; | |
1196 | { | |
1197 | mbstate_t mbs; | |
1198 | wchar_t wc; | |
1199 | size_t n; | |
1200 | ||
1201 | memset(&mbs, 0, sizeof(mbs)); | |
1202 | n = mbrtowc_l(&wc, p->next, p->end - p->next, &mbs, p->g->loc); | |
1203 | if (n == (size_t)-1 || n == (size_t)-2) { | |
1204 | SETERROR(REG_ILLSEQ); | |
1205 | return (0); | |
1206 | } | |
1207 | if (n == 0) | |
1208 | n = 1; | |
1209 | p->next += n; | |
1210 | return (wc); | |
1211 | } | |
1212 | ||
1213 | /* | |
1214 | - seterr - set an error condition | |
1215 | == static int seterr(struct parse *p, int e); | |
1216 | */ | |
1217 | static int /* useless but makes type checking happy */ | |
1218 | seterr(p, e) | |
1219 | struct parse *p; | |
1220 | int e; | |
1221 | { | |
1222 | if (p->error == 0) /* keep earliest error condition */ | |
1223 | p->error = e; | |
1224 | p->next = nuls; /* try to bring things to a halt */ | |
1225 | p->end = nuls; | |
1226 | return(0); /* make the return value well-defined */ | |
1227 | } | |
1228 | ||
1229 | /* | |
1230 | - allocset - allocate a set of characters for [] | |
1231 | == static cset *allocset(struct parse *p); | |
1232 | */ | |
1233 | static cset * | |
1234 | allocset(p) | |
1235 | struct parse *p; | |
1236 | { | |
1237 | cset *cs, *ncs; | |
1238 | ||
1239 | ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); | |
1240 | if (ncs == NULL) { | |
1241 | SETERROR(REG_ESPACE); | |
1242 | return (NULL); | |
1243 | } | |
1244 | p->g->sets = ncs; | |
1245 | cs = &p->g->sets[p->g->ncsets++]; | |
1246 | memset(cs, 0, sizeof(*cs)); | |
1247 | ||
1248 | return(cs); | |
1249 | } | |
1250 | ||
1251 | /* | |
1252 | - freeset - free a now-unused set | |
1253 | == static void freeset(struct parse *p, cset *cs); | |
1254 | */ | |
1255 | static void | |
1256 | freeset(p, cs) | |
1257 | struct parse *p; | |
1258 | cset *cs; | |
1259 | { | |
1260 | cset *top = &p->g->sets[p->g->ncsets]; | |
1261 | ||
1262 | free(cs->wides); | |
1263 | free(cs->ranges); | |
1264 | free(cs->types); | |
1265 | memset(cs, 0, sizeof(*cs)); | |
1266 | if (cs == top-1) /* recover only the easy case */ | |
1267 | p->g->ncsets--; | |
1268 | } | |
1269 | ||
1270 | /* | |
1271 | - singleton - Determine whether a set contains only one character, | |
1272 | - returning it if so, otherwise returning OUT. | |
1273 | */ | |
1274 | static wint_t | |
1275 | singleton(cs, loc) | |
1276 | cset *cs; | |
1277 | locale_t loc; | |
1278 | { | |
1279 | wint_t i, s, n; | |
1280 | ||
1281 | for (i = n = 0; i < NC; i++) | |
1282 | if (CHIN(cs, i, loc)) { | |
1283 | n++; | |
1284 | s = i; | |
1285 | } | |
1286 | if (n == 1) | |
1287 | return (s); | |
1288 | if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && | |
1289 | cs->icase == 0) | |
1290 | return (cs->wides[0]); | |
1291 | /* Don't bother handling the other cases. */ | |
1292 | return (OUT); | |
1293 | } | |
1294 | ||
1295 | /* | |
1296 | - CHadd - add character to character set. | |
1297 | */ | |
1298 | static void | |
1299 | CHadd(p, cs, ch) | |
1300 | struct parse *p; | |
1301 | cset *cs; | |
1302 | wint_t ch; | |
1303 | { | |
1304 | wint_t nch, *newwides; | |
1305 | assert(ch >= 0); | |
1306 | if (ch < NC) | |
1307 | cs->bmp[ch >> 3] |= 1 << (ch & 7); | |
1308 | else { | |
1309 | newwides = realloc(cs->wides, (cs->nwides + 1) * | |
1310 | sizeof(*cs->wides)); | |
1311 | if (newwides == NULL) { | |
1312 | SETERROR(REG_ESPACE); | |
1313 | return; | |
1314 | } | |
1315 | cs->wides = newwides; | |
1316 | cs->wides[cs->nwides++] = ch; | |
1317 | } | |
1318 | if (cs->icase) { | |
1319 | if ((nch = towlower_l(ch, p->g->loc)) < NC) | |
1320 | cs->bmp[nch >> 3] |= 1 << (nch & 7); | |
1321 | if ((nch = towupper_l(ch, p->g->loc)) < NC) | |
1322 | cs->bmp[nch >> 3] |= 1 << (nch & 7); | |
1323 | } | |
1324 | } | |
1325 | ||
1326 | /* | |
1327 | - CHaddrange - add all characters in the range [min,max] to a character set. | |
1328 | */ | |
1329 | static void | |
1330 | CHaddrange(p, cs, min, max) | |
1331 | struct parse *p; | |
1332 | cset *cs; | |
1333 | wint_t min, max; | |
1334 | { | |
1335 | crange *newranges; | |
1336 | ||
1337 | for (; min < NC && min <= max; min++) | |
1338 | CHadd(p, cs, min); | |
1339 | if (min >= max) | |
1340 | return; | |
1341 | newranges = realloc(cs->ranges, (cs->nranges + 1) * | |
1342 | sizeof(*cs->ranges)); | |
1343 | if (newranges == NULL) { | |
1344 | SETERROR(REG_ESPACE); | |
1345 | return; | |
1346 | } | |
1347 | cs->ranges = newranges; | |
1348 | cs->ranges[cs->nranges].min = min; | |
1349 | cs->ranges[cs->nranges].min = max; | |
1350 | cs->nranges++; | |
1351 | } | |
1352 | ||
1353 | /* | |
1354 | - CHaddtype - add all characters of a certain type to a character set. | |
1355 | */ | |
1356 | static void | |
1357 | CHaddtype(p, cs, wct) | |
1358 | struct parse *p; | |
1359 | cset *cs; | |
1360 | wctype_t wct; | |
1361 | { | |
1362 | wint_t i; | |
1363 | wctype_t *newtypes; | |
1364 | ||
1365 | for (i = 0; i < NC; i++) | |
1366 | if (iswctype_l(i, wct, p->g->loc)) | |
1367 | CHadd(p, cs, i); | |
1368 | newtypes = realloc(cs->types, (cs->ntypes + 1) * | |
1369 | sizeof(*cs->types)); | |
1370 | if (newtypes == NULL) { | |
1371 | SETERROR(REG_ESPACE); | |
1372 | return; | |
1373 | } | |
1374 | cs->types = newtypes; | |
1375 | cs->types[cs->ntypes++] = wct; | |
1376 | } | |
1377 | ||
1378 | /* | |
1379 | - dupl - emit a duplicate of a bunch of sops | |
1380 | == static sopno dupl(struct parse *p, sopno start, sopno finish); | |
1381 | */ | |
1382 | static sopno /* start of duplicate */ | |
1383 | dupl(p, start, finish) | |
1384 | struct parse *p; | |
1385 | sopno start; /* from here */ | |
1386 | sopno finish; /* to this less one */ | |
1387 | { | |
1388 | sopno ret = HERE(); | |
1389 | sopno len = finish - start; | |
1390 | ||
1391 | assert(finish >= start); | |
1392 | if (len == 0) | |
1393 | return(ret); | |
1394 | enlarge(p, p->ssize + len); /* this many unexpected additions */ | |
1395 | assert(p->ssize >= p->slen + len); | |
1396 | (void) memcpy((char *)(p->strip + p->slen), | |
1397 | (char *)(p->strip + start), (size_t)len*sizeof(sop)); | |
1398 | p->slen += len; | |
1399 | return(ret); | |
1400 | } | |
1401 | ||
1402 | /* | |
1403 | - doemit - emit a strip operator | |
1404 | == static void doemit(struct parse *p, sop op, size_t opnd); | |
1405 | * | |
1406 | * It might seem better to implement this as a macro with a function as | |
1407 | * hard-case backup, but it's just too big and messy unless there are | |
1408 | * some changes to the data structures. Maybe later. | |
1409 | */ | |
1410 | static void | |
1411 | doemit(p, op, opnd) | |
1412 | struct parse *p; | |
1413 | sop op; | |
1414 | size_t opnd; | |
1415 | { | |
1416 | /* avoid making error situations worse */ | |
1417 | if (p->error != 0) | |
1418 | return; | |
1419 | ||
1420 | /* deal with oversize operands ("can't happen", more or less) */ | |
1421 | assert(opnd < 1<<OPSHIFT); | |
1422 | ||
1423 | /* deal with undersized strip */ | |
1424 | if (p->slen >= p->ssize) | |
1425 | enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ | |
1426 | assert(p->slen < p->ssize); | |
1427 | ||
1428 | /* finally, it's all reduced to the easy case */ | |
1429 | p->strip[p->slen++] = SOP(op, opnd); | |
1430 | } | |
1431 | ||
1432 | /* | |
1433 | - doinsert - insert a sop into the strip | |
1434 | == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); | |
1435 | */ | |
1436 | static void | |
1437 | doinsert(p, op, opnd, pos) | |
1438 | struct parse *p; | |
1439 | sop op; | |
1440 | size_t opnd; | |
1441 | sopno pos; | |
1442 | { | |
1443 | sopno sn; | |
1444 | sop s; | |
1445 | int i; | |
1446 | ||
1447 | /* avoid making error situations worse */ | |
1448 | if (p->error != 0) | |
1449 | return; | |
1450 | ||
1451 | sn = HERE(); | |
1452 | EMIT(op, opnd); /* do checks, ensure space */ | |
1453 | assert(HERE() == sn+1); | |
1454 | s = p->strip[sn]; | |
1455 | ||
1456 | /* adjust paren pointers */ | |
1457 | assert(pos > 0); | |
1458 | for (i = 1; i < NPAREN; i++) { | |
1459 | if (p->pbegin[i] >= pos) { | |
1460 | p->pbegin[i]++; | |
1461 | } | |
1462 | if (p->pend[i] >= pos) { | |
1463 | p->pend[i]++; | |
1464 | } | |
1465 | } | |
1466 | ||
1467 | memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], | |
1468 | (HERE()-pos-1)*sizeof(sop)); | |
1469 | p->strip[pos] = s; | |
1470 | } | |
1471 | ||
1472 | /* | |
1473 | - dofwd - complete a forward reference | |
1474 | == static void dofwd(struct parse *p, sopno pos, sop value); | |
1475 | */ | |
1476 | static void | |
1477 | dofwd(p, pos, value) | |
1478 | struct parse *p; | |
1479 | sopno pos; | |
1480 | sop value; | |
1481 | { | |
1482 | /* avoid making error situations worse */ | |
1483 | if (p->error != 0) | |
1484 | return; | |
1485 | ||
1486 | assert(value < 1<<OPSHIFT); | |
1487 | p->strip[pos] = OP(p->strip[pos]) | value; | |
1488 | } | |
1489 | ||
1490 | /* | |
1491 | - enlarge - enlarge the strip | |
1492 | == static void enlarge(struct parse *p, sopno size); | |
1493 | */ | |
1494 | static void | |
1495 | enlarge(p, size) | |
1496 | struct parse *p; | |
1497 | sopno size; | |
1498 | { | |
1499 | sop *sp; | |
1500 | ||
1501 | if (p->ssize >= size) | |
1502 | return; | |
1503 | ||
1504 | sp = (sop *)realloc(p->strip, size*sizeof(sop)); | |
1505 | if (sp == NULL) { | |
1506 | SETERROR(REG_ESPACE); | |
1507 | return; | |
1508 | } | |
1509 | p->strip = sp; | |
1510 | p->ssize = size; | |
1511 | } | |
1512 | ||
1513 | /* | |
1514 | - stripsnug - compact the strip | |
1515 | == static void stripsnug(struct parse *p, struct re_guts *g); | |
1516 | */ | |
1517 | static void | |
1518 | stripsnug(p, g) | |
1519 | struct parse *p; | |
1520 | struct re_guts *g; | |
1521 | { | |
1522 | g->nstates = p->slen; | |
1523 | g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); | |
1524 | if (g->strip == NULL) { | |
1525 | SETERROR(REG_ESPACE); | |
1526 | g->strip = p->strip; | |
1527 | } | |
1528 | } | |
1529 | ||
1530 | /* | |
1531 | - findmust - fill in must and mlen with longest mandatory literal string | |
1532 | == static void findmust(struct parse *p, struct re_guts *g); | |
1533 | * | |
1534 | * This algorithm could do fancy things like analyzing the operands of | | |
1535 | * for common subsequences. Someday. This code is simple and finds most | |
1536 | * of the interesting cases. | |
1537 | * | |
1538 | * Note that must and mlen got initialized during setup. | |
1539 | */ | |
1540 | static void | |
1541 | findmust(p, g) | |
1542 | struct parse *p; | |
1543 | struct re_guts *g; | |
1544 | { | |
1545 | sop *scan; | |
1546 | sop *start; | |
1547 | sop *newstart; | |
1548 | sopno newlen; | |
1549 | sop s; | |
1550 | char *cp; | |
1551 | int offset; | |
1552 | char buf[MB_LEN_MAX]; | |
1553 | size_t clen; | |
1554 | mbstate_t mbs; | |
1555 | struct __xlocale_st_runelocale *rl = p->g->loc->__lc_ctype; | |
1556 | ||
1557 | /* avoid making error situations worse */ | |
1558 | if (p->error != 0) | |
1559 | return; | |
1560 | ||
1561 | /* | |
1562 | * It's not generally safe to do a ``char'' substring search on | |
1563 | * multibyte character strings, but it's safe for at least | |
1564 | * UTF-8 (see RFC 3629). | |
1565 | */ | |
1566 | if (rl->__mb_cur_max > 1 && | |
1567 | strcmp(rl->_CurrentRuneLocale.__encoding, "UTF-8") != 0) | |
1568 | return; | |
1569 | ||
1570 | /* find the longest OCHAR sequence in strip */ | |
1571 | newlen = 0; | |
1572 | offset = 0; | |
1573 | g->moffset = 0; | |
1574 | scan = g->strip + 1; | |
1575 | do { | |
1576 | s = *scan++; | |
1577 | switch (OP(s)) { | |
1578 | case OCHAR: /* sequence member */ | |
1579 | if (newlen == 0) { /* new sequence */ | |
1580 | memset(&mbs, 0, sizeof(mbs)); | |
1581 | newstart = scan - 1; | |
1582 | } | |
1583 | clen = wcrtomb_l(buf, OPND(s), &mbs, p->g->loc); | |
1584 | if (clen == (size_t)-1) | |
1585 | goto toohard; | |
1586 | newlen += clen; | |
1587 | break; | |
1588 | case OPLUS_: /* things that don't break one */ | |
1589 | case OLPAREN: | |
1590 | case ORPAREN: | |
1591 | break; | |
1592 | case OQUEST_: /* things that must be skipped */ | |
1593 | case OCH_: | |
1594 | offset = altoffset(scan, offset); | |
1595 | scan--; | |
1596 | do { | |
1597 | scan += OPND(s); | |
1598 | s = *scan; | |
1599 | /* assert() interferes w debug printouts */ | |
1600 | if (OP(s) != O_QUEST && OP(s) != O_CH && | |
1601 | OP(s) != OOR2) { | |
1602 | g->iflags |= BAD; | |
1603 | return; | |
1604 | } | |
1605 | } while (OP(s) != O_QUEST && OP(s) != O_CH); | |
1606 | /* FALLTHROUGH */ | |
1607 | case OBOW: /* things that break a sequence */ | |
1608 | case OEOW: | |
1609 | case OBOL: | |
1610 | case OEOL: | |
1611 | case O_QUEST: | |
1612 | case O_CH: | |
1613 | case OEND: | |
1614 | if (newlen > g->mlen) { /* ends one */ | |
1615 | start = newstart; | |
1616 | g->mlen = newlen; | |
1617 | if (offset > -1) { | |
1618 | g->moffset += offset; | |
1619 | offset = newlen; | |
1620 | } else | |
1621 | g->moffset = offset; | |
1622 | } else { | |
1623 | if (offset > -1) | |
1624 | offset += newlen; | |
1625 | } | |
1626 | newlen = 0; | |
1627 | break; | |
1628 | case OANY: | |
1629 | if (newlen > g->mlen) { /* ends one */ | |
1630 | start = newstart; | |
1631 | g->mlen = newlen; | |
1632 | if (offset > -1) { | |
1633 | g->moffset += offset; | |
1634 | offset = newlen; | |
1635 | } else | |
1636 | g->moffset = offset; | |
1637 | } else { | |
1638 | if (offset > -1) | |
1639 | offset += newlen; | |
1640 | } | |
1641 | if (offset > -1) | |
1642 | offset++; | |
1643 | newlen = 0; | |
1644 | break; | |
1645 | case OANYOF: /* may or may not invalidate offset */ | |
1646 | /* First, everything as OANY */ | |
1647 | if (newlen > g->mlen) { /* ends one */ | |
1648 | start = newstart; | |
1649 | g->mlen = newlen; | |
1650 | if (offset > -1) { | |
1651 | g->moffset += offset; | |
1652 | offset = newlen; | |
1653 | } else | |
1654 | g->moffset = offset; | |
1655 | } else { | |
1656 | if (offset > -1) | |
1657 | offset += newlen; | |
1658 | } | |
1659 | if (offset > -1) | |
1660 | offset++; | |
1661 | newlen = 0; | |
1662 | break; | |
1663 | toohard: | |
1664 | default: | |
1665 | /* Anything here makes it impossible or too hard | |
1666 | * to calculate the offset -- so we give up; | |
1667 | * save the last known good offset, in case the | |
1668 | * must sequence doesn't occur later. | |
1669 | */ | |
1670 | if (newlen > g->mlen) { /* ends one */ | |
1671 | start = newstart; | |
1672 | g->mlen = newlen; | |
1673 | if (offset > -1) | |
1674 | g->moffset += offset; | |
1675 | else | |
1676 | g->moffset = offset; | |
1677 | } | |
1678 | offset = -1; | |
1679 | newlen = 0; | |
1680 | break; | |
1681 | } | |
1682 | } while (OP(s) != OEND); | |
1683 | ||
1684 | if (g->mlen == 0) { /* there isn't one */ | |
1685 | g->moffset = -1; | |
1686 | return; | |
1687 | } | |
1688 | ||
1689 | /* turn it into a character string */ | |
1690 | g->must = malloc((size_t)g->mlen + 1); | |
1691 | if (g->must == NULL) { /* argh; just forget it */ | |
1692 | g->mlen = 0; | |
1693 | g->moffset = -1; | |
1694 | return; | |
1695 | } | |
1696 | cp = g->must; | |
1697 | scan = start; | |
1698 | memset(&mbs, 0, sizeof(mbs)); | |
1699 | while (cp < g->must + g->mlen) { | |
1700 | while (OP(s = *scan++) != OCHAR) | |
1701 | continue; | |
1702 | clen = wcrtomb_l(cp, OPND(s), &mbs, p->g->loc); | |
1703 | assert(clen != (size_t)-1); | |
1704 | cp += clen; | |
1705 | } | |
1706 | assert(cp == g->must + g->mlen); | |
1707 | *cp++ = '\0'; /* just on general principles */ | |
1708 | } | |
1709 | ||
1710 | /* | |
1711 | - altoffset - choose biggest offset among multiple choices | |
1712 | == static int altoffset(sop *scan, int offset); | |
1713 | * | |
1714 | * Compute, recursively if necessary, the largest offset among multiple | |
1715 | * re paths. | |
1716 | */ | |
1717 | static int | |
1718 | altoffset(scan, offset) | |
1719 | sop *scan; | |
1720 | int offset; | |
1721 | { | |
1722 | int largest; | |
1723 | int try; | |
1724 | sop s; | |
1725 | ||
1726 | /* If we gave up already on offsets, return */ | |
1727 | if (offset == -1) | |
1728 | return -1; | |
1729 | ||
1730 | largest = 0; | |
1731 | try = 0; | |
1732 | s = *scan++; | |
1733 | while (OP(s) != O_QUEST && OP(s) != O_CH) { | |
1734 | switch (OP(s)) { | |
1735 | case OOR1: | |
1736 | if (try > largest) | |
1737 | largest = try; | |
1738 | try = 0; | |
1739 | break; | |
1740 | case OQUEST_: | |
1741 | case OCH_: | |
1742 | try = altoffset(scan, try); | |
1743 | if (try == -1) | |
1744 | return -1; | |
1745 | scan--; | |
1746 | do { | |
1747 | scan += OPND(s); | |
1748 | s = *scan; | |
1749 | if (OP(s) != O_QUEST && OP(s) != O_CH && | |
1750 | OP(s) != OOR2) | |
1751 | return -1; | |
1752 | } while (OP(s) != O_QUEST && OP(s) != O_CH); | |
1753 | /* We must skip to the next position, or we'll | |
1754 | * leave altoffset() too early. | |
1755 | */ | |
1756 | scan++; | |
1757 | break; | |
1758 | case OANYOF: | |
1759 | case OCHAR: | |
1760 | case OANY: | |
1761 | try++; | |
1762 | case OBOW: | |
1763 | case OEOW: | |
1764 | case OLPAREN: | |
1765 | case ORPAREN: | |
1766 | case OOR2: | |
1767 | break; | |
1768 | default: | |
1769 | try = -1; | |
1770 | break; | |
1771 | } | |
1772 | if (try == -1) | |
1773 | return -1; | |
1774 | s = *scan++; | |
1775 | } | |
1776 | ||
1777 | if (try > largest) | |
1778 | largest = try; | |
1779 | ||
1780 | return largest+offset; | |
1781 | } | |
1782 | ||
1783 | /* | |
1784 | - computejumps - compute char jumps for BM scan | |
1785 | == static void computejumps(struct parse *p, struct re_guts *g); | |
1786 | * | |
1787 | * This algorithm assumes g->must exists and is has size greater than | |
1788 | * zero. It's based on the algorithm found on Computer Algorithms by | |
1789 | * Sara Baase. | |
1790 | * | |
1791 | * A char jump is the number of characters one needs to jump based on | |
1792 | * the value of the character from the text that was mismatched. | |
1793 | */ | |
1794 | static void | |
1795 | computejumps(p, g) | |
1796 | struct parse *p; | |
1797 | struct re_guts *g; | |
1798 | { | |
1799 | int ch; | |
1800 | int mindex; | |
1801 | ||
1802 | /* Avoid making errors worse */ | |
1803 | if (p->error != 0) | |
1804 | return; | |
1805 | ||
1806 | g->charjump = (int*) malloc((NC + 1) * sizeof(int)); | |
1807 | if (g->charjump == NULL) /* Not a fatal error */ | |
1808 | return; | |
1809 | /* Adjust for signed chars, if necessary */ | |
1810 | g->charjump = &g->charjump[-(CHAR_MIN)]; | |
1811 | ||
1812 | /* If the character does not exist in the pattern, the jump | |
1813 | * is equal to the number of characters in the pattern. | |
1814 | */ | |
1815 | for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) | |
1816 | g->charjump[ch] = g->mlen; | |
1817 | ||
1818 | /* If the character does exist, compute the jump that would | |
1819 | * take us to the last character in the pattern equal to it | |
1820 | * (notice that we match right to left, so that last character | |
1821 | * is the first one that would be matched). | |
1822 | */ | |
1823 | for (mindex = 0; mindex < g->mlen; mindex++) | |
1824 | g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; | |
1825 | } | |
1826 | ||
1827 | /* | |
1828 | - computematchjumps - compute match jumps for BM scan | |
1829 | == static void computematchjumps(struct parse *p, struct re_guts *g); | |
1830 | * | |
1831 | * This algorithm assumes g->must exists and is has size greater than | |
1832 | * zero. It's based on the algorithm found on Computer Algorithms by | |
1833 | * Sara Baase. | |
1834 | * | |
1835 | * A match jump is the number of characters one needs to advance based | |
1836 | * on the already-matched suffix. | |
1837 | * Notice that all values here are minus (g->mlen-1), because of the way | |
1838 | * the search algorithm works. | |
1839 | */ | |
1840 | static void | |
1841 | computematchjumps(p, g) | |
1842 | struct parse *p; | |
1843 | struct re_guts *g; | |
1844 | { | |
1845 | int mindex; /* General "must" iterator */ | |
1846 | int suffix; /* Keeps track of matching suffix */ | |
1847 | int ssuffix; /* Keeps track of suffixes' suffix */ | |
1848 | int* pmatches; /* pmatches[k] points to the next i | |
1849 | * such that i+1...mlen is a substring | |
1850 | * of k+1...k+mlen-i-1 | |
1851 | */ | |
1852 | ||
1853 | /* Avoid making errors worse */ | |
1854 | if (p->error != 0) | |
1855 | return; | |
1856 | ||
1857 | pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); | |
1858 | if (pmatches == NULL) { | |
1859 | g->matchjump = NULL; | |
1860 | return; | |
1861 | } | |
1862 | ||
1863 | g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); | |
1864 | if (g->matchjump == NULL) /* Not a fatal error */ | |
1865 | return; | |
1866 | ||
1867 | /* Set maximum possible jump for each character in the pattern */ | |
1868 | for (mindex = 0; mindex < g->mlen; mindex++) | |
1869 | g->matchjump[mindex] = 2*g->mlen - mindex - 1; | |
1870 | ||
1871 | /* Compute pmatches[] */ | |
1872 | for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; | |
1873 | mindex--, suffix--) { | |
1874 | pmatches[mindex] = suffix; | |
1875 | ||
1876 | /* If a mismatch is found, interrupting the substring, | |
1877 | * compute the matchjump for that position. If no | |
1878 | * mismatch is found, then a text substring mismatched | |
1879 | * against the suffix will also mismatch against the | |
1880 | * substring. | |
1881 | */ | |
1882 | while (suffix < g->mlen | |
1883 | && g->must[mindex] != g->must[suffix]) { | |
1884 | g->matchjump[suffix] = MIN(g->matchjump[suffix], | |
1885 | g->mlen - mindex - 1); | |
1886 | suffix = pmatches[suffix]; | |
1887 | } | |
1888 | } | |
1889 | ||
1890 | /* Compute the matchjump up to the last substring found to jump | |
1891 | * to the beginning of the largest must pattern prefix matching | |
1892 | * it's own suffix. | |
1893 | */ | |
1894 | for (mindex = 0; mindex <= suffix; mindex++) | |
1895 | g->matchjump[mindex] = MIN(g->matchjump[mindex], | |
1896 | g->mlen + suffix - mindex); | |
1897 | ||
1898 | ssuffix = pmatches[suffix]; | |
1899 | while (suffix < g->mlen) { | |
1900 | while (suffix <= ssuffix && suffix < g->mlen) { | |
1901 | g->matchjump[suffix] = MIN(g->matchjump[suffix], | |
1902 | g->mlen + ssuffix - suffix); | |
1903 | suffix++; | |
1904 | } | |
1905 | if (suffix < g->mlen) | |
1906 | ssuffix = pmatches[ssuffix]; | |
1907 | } | |
1908 | ||
1909 | free(pmatches); | |
1910 | } | |
1911 | ||
1912 | /* | |
1913 | - pluscount - count + nesting | |
1914 | == static sopno pluscount(struct parse *p, struct re_guts *g); | |
1915 | */ | |
1916 | static sopno /* nesting depth */ | |
1917 | pluscount(p, g) | |
1918 | struct parse *p; | |
1919 | struct re_guts *g; | |
1920 | { | |
1921 | sop *scan; | |
1922 | sop s; | |
1923 | sopno plusnest = 0; | |
1924 | sopno maxnest = 0; | |
1925 | ||
1926 | if (p->error != 0) | |
1927 | return(0); /* there may not be an OEND */ | |
1928 | ||
1929 | scan = g->strip + 1; | |
1930 | do { | |
1931 | s = *scan++; | |
1932 | switch (OP(s)) { | |
1933 | case OPLUS_: | |
1934 | plusnest++; | |
1935 | break; | |
1936 | case O_PLUS: | |
1937 | if (plusnest > maxnest) | |
1938 | maxnest = plusnest; | |
1939 | plusnest--; | |
1940 | break; | |
1941 | } | |
1942 | } while (OP(s) != OEND); | |
1943 | if (plusnest != 0) | |
1944 | g->iflags |= BAD; | |
1945 | return(maxnest); | |
1946 | } |