]>
Commit | Line | Data |
---|---|---|
9385eb3d A |
1 | /*- |
2 | * Copyright (c) 1992, 1993, 1994 Henry Spencer. | |
e9ce8d39 A |
3 | * Copyright (c) 1992, 1993, 1994 |
4 | * The Regents of the University of California. All rights reserved. | |
5 | * | |
6 | * This code is derived from software contributed to Berkeley by | |
7 | * Henry Spencer. | |
8 | * | |
9 | * Redistribution and use in source and binary forms, with or without | |
10 | * modification, are permitted provided that the following conditions | |
11 | * are met: | |
12 | * 1. Redistributions of source code must retain the above copyright | |
13 | * notice, this list of conditions and the following disclaimer. | |
14 | * 2. Redistributions in binary form must reproduce the above copyright | |
15 | * notice, this list of conditions and the following disclaimer in the | |
16 | * documentation and/or other materials provided with the distribution. | |
17 | * 3. All advertising materials mentioning features or use of this software | |
18 | * must display the following acknowledgement: | |
19 | * This product includes software developed by the University of | |
20 | * California, Berkeley and its contributors. | |
21 | * 4. Neither the name of the University nor the names of its contributors | |
22 | * may be used to endorse or promote products derived from this software | |
23 | * without specific prior written permission. | |
24 | * | |
25 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
26 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
27 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
29 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
30 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
31 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
32 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
33 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
34 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
35 | * SUCH DAMAGE. | |
9385eb3d A |
36 | * |
37 | * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 | |
e9ce8d39 A |
38 | */ |
39 | ||
9385eb3d A |
40 | #if defined(LIBC_SCCS) && !defined(lint) |
41 | static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; | |
42 | #endif /* LIBC_SCCS and not lint */ | |
43 | #include <sys/cdefs.h> | |
44 | __FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.30 2003/02/16 17:29:10 nectar Exp $"); | |
45 | ||
e9ce8d39 A |
46 | #include <sys/types.h> |
47 | #include <stdio.h> | |
48 | #include <string.h> | |
49 | #include <ctype.h> | |
50 | #include <limits.h> | |
51 | #include <stdlib.h> | |
52 | #include <regex.h> | |
53 | ||
9385eb3d A |
54 | #include "collate.h" |
55 | ||
e9ce8d39 A |
56 | #include "utils.h" |
57 | #include "regex2.h" | |
58 | ||
59 | #include "cclass.h" | |
60 | #include "cname.h" | |
61 | ||
62 | /* | |
63 | * parse structure, passed up and down to avoid global variables and | |
64 | * other clumsinesses | |
65 | */ | |
66 | struct parse { | |
67 | char *next; /* next character in RE */ | |
68 | char *end; /* end of string (-> NUL normally) */ | |
69 | int error; /* has an error been seen? */ | |
70 | sop *strip; /* malloced strip */ | |
71 | sopno ssize; /* malloced strip size (allocated) */ | |
72 | sopno slen; /* malloced strip length (used) */ | |
73 | int ncsalloc; /* number of csets allocated */ | |
74 | struct re_guts *g; | |
75 | # define NPAREN 10 /* we need to remember () 1-9 for back refs */ | |
76 | sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ | |
77 | sopno pend[NPAREN]; /* -> ) ([0] unused) */ | |
78 | }; | |
79 | ||
80 | /* ========= begin header generated by ./mkh ========= */ | |
81 | #ifdef __cplusplus | |
82 | extern "C" { | |
83 | #endif | |
84 | ||
85 | /* === regcomp.c === */ | |
9385eb3d A |
86 | static void p_ere(struct parse *p, int stop); |
87 | static void p_ere_exp(struct parse *p); | |
88 | static void p_str(struct parse *p); | |
89 | static void p_bre(struct parse *p, int end1, int end2); | |
90 | static int p_simp_re(struct parse *p, int starordinary); | |
91 | static int p_count(struct parse *p); | |
92 | static void p_bracket(struct parse *p); | |
93 | static void p_b_term(struct parse *p, cset *cs); | |
94 | static void p_b_cclass(struct parse *p, cset *cs); | |
95 | static void p_b_eclass(struct parse *p, cset *cs); | |
96 | static char p_b_symbol(struct parse *p); | |
97 | static char p_b_coll_elem(struct parse *p, int endc); | |
98 | static char othercase(int ch); | |
99 | static void bothcases(struct parse *p, int ch); | |
100 | static void ordinary(struct parse *p, int ch); | |
101 | static void nonnewline(struct parse *p); | |
102 | static void repeat(struct parse *p, sopno start, int from, int to); | |
103 | static int seterr(struct parse *p, int e); | |
104 | static cset *allocset(struct parse *p); | |
105 | static void freeset(struct parse *p, cset *cs); | |
106 | static int freezeset(struct parse *p, cset *cs); | |
107 | static int firstch(struct parse *p, cset *cs); | |
108 | static int nch(struct parse *p, cset *cs); | |
109 | static void mcadd(struct parse *p, cset *cs, char *cp) __unused; | |
110 | #if used | |
111 | static void mcsub(cset *cs, char *cp); | |
112 | static int mcin(cset *cs, char *cp); | |
113 | static char *mcfind(cset *cs, char *cp); | |
114 | #endif | |
115 | static void mcinvert(struct parse *p, cset *cs); | |
116 | static void mccase(struct parse *p, cset *cs); | |
117 | static int isinsets(struct re_guts *g, int c); | |
118 | static int samesets(struct re_guts *g, int c1, int c2); | |
119 | static void categorize(struct parse *p, struct re_guts *g); | |
120 | static sopno dupl(struct parse *p, sopno start, sopno finish); | |
121 | static void doemit(struct parse *p, sop op, size_t opnd); | |
122 | static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); | |
123 | static void dofwd(struct parse *p, sopno pos, sop value); | |
124 | static void enlarge(struct parse *p, sopno size); | |
125 | static void stripsnug(struct parse *p, struct re_guts *g); | |
126 | static void findmust(struct parse *p, struct re_guts *g); | |
127 | static int altoffset(sop *scan, int offset, int mccs); | |
128 | static void computejumps(struct parse *p, struct re_guts *g); | |
129 | static void computematchjumps(struct parse *p, struct re_guts *g); | |
130 | static sopno pluscount(struct parse *p, struct re_guts *g); | |
e9ce8d39 A |
131 | |
132 | #ifdef __cplusplus | |
133 | } | |
134 | #endif | |
135 | /* ========= end header generated by ./mkh ========= */ | |
136 | ||
137 | static char nuls[10]; /* place to point scanner in event of error */ | |
138 | ||
139 | /* | |
140 | * macros for use with parse structure | |
141 | * BEWARE: these know that the parse structure is named `p' !!! | |
142 | */ | |
143 | #define PEEK() (*p->next) | |
144 | #define PEEK2() (*(p->next+1)) | |
145 | #define MORE() (p->next < p->end) | |
146 | #define MORE2() (p->next+1 < p->end) | |
147 | #define SEE(c) (MORE() && PEEK() == (c)) | |
148 | #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) | |
149 | #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) | |
150 | #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) | |
151 | #define NEXT() (p->next++) | |
152 | #define NEXT2() (p->next += 2) | |
153 | #define NEXTn(n) (p->next += (n)) | |
154 | #define GETNEXT() (*p->next++) | |
155 | #define SETERROR(e) seterr(p, (e)) | |
156 | #define REQUIRE(co, e) ((co) || SETERROR(e)) | |
157 | #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) | |
158 | #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) | |
159 | #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) | |
160 | #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) | |
161 | #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) | |
162 | #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) | |
163 | #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) | |
164 | #define HERE() (p->slen) | |
165 | #define THERE() (p->slen - 1) | |
166 | #define THERETHERE() (p->slen - 2) | |
167 | #define DROP(n) (p->slen -= (n)) | |
168 | ||
169 | #ifndef NDEBUG | |
170 | static int never = 0; /* for use in asserts; shuts lint up */ | |
171 | #else | |
172 | #define never 0 /* some <assert.h>s have bugs too */ | |
173 | #endif | |
174 | ||
9385eb3d A |
175 | /* Macro used by computejump()/computematchjump() */ |
176 | #define MIN(a,b) ((a)<(b)?(a):(b)) | |
177 | ||
e9ce8d39 A |
178 | /* |
179 | - regcomp - interface for parser and compilation | |
180 | = extern int regcomp(regex_t *, const char *, int); | |
181 | = #define REG_BASIC 0000 | |
182 | = #define REG_EXTENDED 0001 | |
183 | = #define REG_ICASE 0002 | |
184 | = #define REG_NOSUB 0004 | |
185 | = #define REG_NEWLINE 0010 | |
186 | = #define REG_NOSPEC 0020 | |
187 | = #define REG_PEND 0040 | |
188 | = #define REG_DUMP 0200 | |
189 | */ | |
190 | int /* 0 success, otherwise REG_something */ | |
191 | regcomp(preg, pattern, cflags) | |
9385eb3d A |
192 | regex_t * __restrict preg; |
193 | const char * __restrict pattern; | |
e9ce8d39 A |
194 | int cflags; |
195 | { | |
196 | struct parse pa; | |
9385eb3d A |
197 | struct re_guts *g; |
198 | struct parse *p = &pa; | |
199 | int i; | |
200 | size_t len; | |
e9ce8d39 A |
201 | #ifdef REDEBUG |
202 | # define GOODFLAGS(f) (f) | |
203 | #else | |
204 | # define GOODFLAGS(f) ((f)&~REG_DUMP) | |
205 | #endif | |
206 | ||
207 | cflags = GOODFLAGS(cflags); | |
208 | if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) | |
209 | return(REG_INVARG); | |
210 | ||
211 | if (cflags®_PEND) { | |
212 | if (preg->re_endp < pattern) | |
213 | return(REG_INVARG); | |
214 | len = preg->re_endp - pattern; | |
215 | } else | |
216 | len = strlen((char *)pattern); | |
217 | ||
218 | /* do the mallocs early so failure handling is easy */ | |
219 | g = (struct re_guts *)malloc(sizeof(struct re_guts) + | |
220 | (NC-1)*sizeof(cat_t)); | |
221 | if (g == NULL) | |
222 | return(REG_ESPACE); | |
223 | p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ | |
224 | p->strip = (sop *)malloc(p->ssize * sizeof(sop)); | |
225 | p->slen = 0; | |
226 | if (p->strip == NULL) { | |
227 | free((char *)g); | |
228 | return(REG_ESPACE); | |
229 | } | |
230 | ||
231 | /* set things up */ | |
232 | p->g = g; | |
233 | p->next = (char *)pattern; /* convenience; we do not modify it */ | |
234 | p->end = p->next + len; | |
235 | p->error = 0; | |
236 | p->ncsalloc = 0; | |
237 | for (i = 0; i < NPAREN; i++) { | |
238 | p->pbegin[i] = 0; | |
239 | p->pend[i] = 0; | |
240 | } | |
241 | g->csetsize = NC; | |
242 | g->sets = NULL; | |
243 | g->setbits = NULL; | |
244 | g->ncsets = 0; | |
245 | g->cflags = cflags; | |
246 | g->iflags = 0; | |
247 | g->nbol = 0; | |
248 | g->neol = 0; | |
249 | g->must = NULL; | |
9385eb3d A |
250 | g->moffset = -1; |
251 | g->charjump = NULL; | |
252 | g->matchjump = NULL; | |
e9ce8d39 A |
253 | g->mlen = 0; |
254 | g->nsub = 0; | |
255 | g->ncategories = 1; /* category 0 is "everything else" */ | |
256 | g->categories = &g->catspace[-(CHAR_MIN)]; | |
257 | (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); | |
258 | g->backrefs = 0; | |
259 | ||
260 | /* do it */ | |
261 | EMIT(OEND, 0); | |
262 | g->firststate = THERE(); | |
263 | if (cflags®_EXTENDED) | |
264 | p_ere(p, OUT); | |
265 | else if (cflags®_NOSPEC) | |
266 | p_str(p); | |
267 | else | |
268 | p_bre(p, OUT, OUT); | |
269 | EMIT(OEND, 0); | |
270 | g->laststate = THERE(); | |
271 | ||
272 | /* tidy up loose ends and fill things in */ | |
273 | categorize(p, g); | |
274 | stripsnug(p, g); | |
275 | findmust(p, g); | |
9385eb3d A |
276 | /* only use Boyer-Moore algorithm if the pattern is bigger |
277 | * than three characters | |
278 | */ | |
279 | if(g->mlen > 3) { | |
280 | computejumps(p, g); | |
281 | computematchjumps(p, g); | |
282 | if(g->matchjump == NULL && g->charjump != NULL) { | |
283 | free(g->charjump); | |
284 | g->charjump = NULL; | |
285 | } | |
286 | } | |
e9ce8d39 A |
287 | g->nplus = pluscount(p, g); |
288 | g->magic = MAGIC2; | |
289 | preg->re_nsub = g->nsub; | |
290 | preg->re_g = g; | |
291 | preg->re_magic = MAGIC1; | |
292 | #ifndef REDEBUG | |
293 | /* not debugging, so can't rely on the assert() in regexec() */ | |
294 | if (g->iflags&BAD) | |
295 | SETERROR(REG_ASSERT); | |
296 | #endif | |
297 | ||
298 | /* win or lose, we're done */ | |
299 | if (p->error != 0) /* lose */ | |
300 | regfree(preg); | |
301 | return(p->error); | |
302 | } | |
303 | ||
304 | /* | |
305 | - p_ere - ERE parser top level, concatenation and alternation | |
9385eb3d | 306 | == static void p_ere(struct parse *p, int stop); |
e9ce8d39 A |
307 | */ |
308 | static void | |
309 | p_ere(p, stop) | |
9385eb3d | 310 | struct parse *p; |
e9ce8d39 A |
311 | int stop; /* character this ERE should end at */ |
312 | { | |
9385eb3d A |
313 | char c; |
314 | sopno prevback; | |
315 | sopno prevfwd; | |
316 | sopno conc; | |
317 | int first = 1; /* is this the first alternative? */ | |
e9ce8d39 A |
318 | |
319 | for (;;) { | |
320 | /* do a bunch of concatenated expressions */ | |
321 | conc = HERE(); | |
322 | while (MORE() && (c = PEEK()) != '|' && c != stop) | |
323 | p_ere_exp(p); | |
9385eb3d | 324 | (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ |
e9ce8d39 A |
325 | |
326 | if (!EAT('|')) | |
327 | break; /* NOTE BREAK OUT */ | |
328 | ||
329 | if (first) { | |
330 | INSERT(OCH_, conc); /* offset is wrong */ | |
331 | prevfwd = conc; | |
332 | prevback = conc; | |
333 | first = 0; | |
334 | } | |
335 | ASTERN(OOR1, prevback); | |
336 | prevback = THERE(); | |
337 | AHEAD(prevfwd); /* fix previous offset */ | |
338 | prevfwd = HERE(); | |
339 | EMIT(OOR2, 0); /* offset is very wrong */ | |
340 | } | |
341 | ||
342 | if (!first) { /* tail-end fixups */ | |
343 | AHEAD(prevfwd); | |
344 | ASTERN(O_CH, prevback); | |
345 | } | |
346 | ||
347 | assert(!MORE() || SEE(stop)); | |
348 | } | |
349 | ||
350 | /* | |
351 | - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op | |
9385eb3d | 352 | == static void p_ere_exp(struct parse *p); |
e9ce8d39 A |
353 | */ |
354 | static void | |
355 | p_ere_exp(p) | |
9385eb3d | 356 | struct parse *p; |
e9ce8d39 | 357 | { |
9385eb3d A |
358 | char c; |
359 | sopno pos; | |
360 | int count; | |
361 | int count2; | |
362 | sopno subno; | |
e9ce8d39 A |
363 | int wascaret = 0; |
364 | ||
365 | assert(MORE()); /* caller should have ensured this */ | |
366 | c = GETNEXT(); | |
367 | ||
368 | pos = HERE(); | |
369 | switch (c) { | |
370 | case '(': | |
9385eb3d | 371 | (void)REQUIRE(MORE(), REG_EPAREN); |
e9ce8d39 A |
372 | p->g->nsub++; |
373 | subno = p->g->nsub; | |
374 | if (subno < NPAREN) | |
375 | p->pbegin[subno] = HERE(); | |
376 | EMIT(OLPAREN, subno); | |
377 | if (!SEE(')')) | |
378 | p_ere(p, ')'); | |
379 | if (subno < NPAREN) { | |
380 | p->pend[subno] = HERE(); | |
381 | assert(p->pend[subno] != 0); | |
382 | } | |
383 | EMIT(ORPAREN, subno); | |
9385eb3d | 384 | (void)MUSTEAT(')', REG_EPAREN); |
e9ce8d39 A |
385 | break; |
386 | #ifndef POSIX_MISTAKE | |
387 | case ')': /* happens only if no current unmatched ( */ | |
388 | /* | |
389 | * You may ask, why the ifndef? Because I didn't notice | |
390 | * this until slightly too late for 1003.2, and none of the | |
391 | * other 1003.2 regular-expression reviewers noticed it at | |
392 | * all. So an unmatched ) is legal POSIX, at least until | |
393 | * we can get it fixed. | |
394 | */ | |
395 | SETERROR(REG_EPAREN); | |
396 | break; | |
397 | #endif | |
398 | case '^': | |
399 | EMIT(OBOL, 0); | |
400 | p->g->iflags |= USEBOL; | |
401 | p->g->nbol++; | |
402 | wascaret = 1; | |
403 | break; | |
404 | case '$': | |
405 | EMIT(OEOL, 0); | |
406 | p->g->iflags |= USEEOL; | |
407 | p->g->neol++; | |
408 | break; | |
409 | case '|': | |
410 | SETERROR(REG_EMPTY); | |
411 | break; | |
412 | case '*': | |
413 | case '+': | |
414 | case '?': | |
415 | SETERROR(REG_BADRPT); | |
416 | break; | |
417 | case '.': | |
418 | if (p->g->cflags®_NEWLINE) | |
419 | nonnewline(p); | |
420 | else | |
421 | EMIT(OANY, 0); | |
422 | break; | |
423 | case '[': | |
424 | p_bracket(p); | |
425 | break; | |
426 | case '\\': | |
9385eb3d | 427 | (void)REQUIRE(MORE(), REG_EESCAPE); |
e9ce8d39 A |
428 | c = GETNEXT(); |
429 | ordinary(p, c); | |
430 | break; | |
431 | case '{': /* okay as ordinary except if digit follows */ | |
9385eb3d | 432 | (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); |
e9ce8d39 A |
433 | /* FALLTHROUGH */ |
434 | default: | |
435 | ordinary(p, c); | |
436 | break; | |
437 | } | |
438 | ||
439 | if (!MORE()) | |
440 | return; | |
441 | c = PEEK(); | |
442 | /* we call { a repetition if followed by a digit */ | |
443 | if (!( c == '*' || c == '+' || c == '?' || | |
9385eb3d | 444 | (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) |
e9ce8d39 A |
445 | return; /* no repetition, we're done */ |
446 | NEXT(); | |
447 | ||
9385eb3d | 448 | (void)REQUIRE(!wascaret, REG_BADRPT); |
e9ce8d39 A |
449 | switch (c) { |
450 | case '*': /* implemented as +? */ | |
451 | /* this case does not require the (y|) trick, noKLUDGE */ | |
452 | INSERT(OPLUS_, pos); | |
453 | ASTERN(O_PLUS, pos); | |
454 | INSERT(OQUEST_, pos); | |
455 | ASTERN(O_QUEST, pos); | |
456 | break; | |
457 | case '+': | |
458 | INSERT(OPLUS_, pos); | |
459 | ASTERN(O_PLUS, pos); | |
460 | break; | |
461 | case '?': | |
462 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | |
463 | INSERT(OCH_, pos); /* offset slightly wrong */ | |
464 | ASTERN(OOR1, pos); /* this one's right */ | |
465 | AHEAD(pos); /* fix the OCH_ */ | |
466 | EMIT(OOR2, 0); /* offset very wrong... */ | |
467 | AHEAD(THERE()); /* ...so fix it */ | |
468 | ASTERN(O_CH, THERETHERE()); | |
469 | break; | |
470 | case '{': | |
471 | count = p_count(p); | |
472 | if (EAT(',')) { | |
9385eb3d | 473 | if (isdigit((uch)PEEK())) { |
e9ce8d39 | 474 | count2 = p_count(p); |
9385eb3d | 475 | (void)REQUIRE(count <= count2, REG_BADBR); |
e9ce8d39 A |
476 | } else /* single number with comma */ |
477 | count2 = INFINITY; | |
478 | } else /* just a single number */ | |
479 | count2 = count; | |
480 | repeat(p, pos, count, count2); | |
481 | if (!EAT('}')) { /* error heuristics */ | |
482 | while (MORE() && PEEK() != '}') | |
483 | NEXT(); | |
9385eb3d | 484 | (void)REQUIRE(MORE(), REG_EBRACE); |
e9ce8d39 A |
485 | SETERROR(REG_BADBR); |
486 | } | |
487 | break; | |
488 | } | |
489 | ||
490 | if (!MORE()) | |
491 | return; | |
492 | c = PEEK(); | |
493 | if (!( c == '*' || c == '+' || c == '?' || | |
9385eb3d | 494 | (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) |
e9ce8d39 A |
495 | return; |
496 | SETERROR(REG_BADRPT); | |
497 | } | |
498 | ||
499 | /* | |
500 | - p_str - string (no metacharacters) "parser" | |
9385eb3d | 501 | == static void p_str(struct parse *p); |
e9ce8d39 A |
502 | */ |
503 | static void | |
504 | p_str(p) | |
9385eb3d | 505 | struct parse *p; |
e9ce8d39 | 506 | { |
9385eb3d | 507 | (void)REQUIRE(MORE(), REG_EMPTY); |
e9ce8d39 A |
508 | while (MORE()) |
509 | ordinary(p, GETNEXT()); | |
510 | } | |
511 | ||
512 | /* | |
513 | - p_bre - BRE parser top level, anchoring and concatenation | |
9385eb3d A |
514 | == static void p_bre(struct parse *p, int end1, \ |
515 | == int end2); | |
e9ce8d39 A |
516 | * Giving end1 as OUT essentially eliminates the end1/end2 check. |
517 | * | |
518 | * This implementation is a bit of a kludge, in that a trailing $ is first | |
519 | * taken as an ordinary character and then revised to be an anchor. The | |
520 | * only undesirable side effect is that '$' gets included as a character | |
521 | * category in such cases. This is fairly harmless; not worth fixing. | |
522 | * The amount of lookahead needed to avoid this kludge is excessive. | |
523 | */ | |
524 | static void | |
525 | p_bre(p, end1, end2) | |
9385eb3d A |
526 | struct parse *p; |
527 | int end1; /* first terminating character */ | |
528 | int end2; /* second terminating character */ | |
e9ce8d39 | 529 | { |
9385eb3d A |
530 | sopno start = HERE(); |
531 | int first = 1; /* first subexpression? */ | |
532 | int wasdollar = 0; | |
e9ce8d39 A |
533 | |
534 | if (EAT('^')) { | |
535 | EMIT(OBOL, 0); | |
536 | p->g->iflags |= USEBOL; | |
537 | p->g->nbol++; | |
538 | } | |
539 | while (MORE() && !SEETWO(end1, end2)) { | |
540 | wasdollar = p_simp_re(p, first); | |
541 | first = 0; | |
542 | } | |
543 | if (wasdollar) { /* oops, that was a trailing anchor */ | |
544 | DROP(1); | |
545 | EMIT(OEOL, 0); | |
546 | p->g->iflags |= USEEOL; | |
547 | p->g->neol++; | |
548 | } | |
549 | ||
9385eb3d | 550 | (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ |
e9ce8d39 A |
551 | } |
552 | ||
553 | /* | |
554 | - p_simp_re - parse a simple RE, an atom possibly followed by a repetition | |
9385eb3d | 555 | == static int p_simp_re(struct parse *p, int starordinary); |
e9ce8d39 A |
556 | */ |
557 | static int /* was the simple RE an unbackslashed $? */ | |
558 | p_simp_re(p, starordinary) | |
9385eb3d | 559 | struct parse *p; |
e9ce8d39 A |
560 | int starordinary; /* is a leading * an ordinary character? */ |
561 | { | |
9385eb3d A |
562 | int c; |
563 | int count; | |
564 | int count2; | |
565 | sopno pos; | |
566 | int i; | |
567 | sopno subno; | |
e9ce8d39 A |
568 | # define BACKSL (1<<CHAR_BIT) |
569 | ||
570 | pos = HERE(); /* repetion op, if any, covers from here */ | |
571 | ||
572 | assert(MORE()); /* caller should have ensured this */ | |
573 | c = GETNEXT(); | |
574 | if (c == '\\') { | |
9385eb3d A |
575 | (void)REQUIRE(MORE(), REG_EESCAPE); |
576 | c = BACKSL | GETNEXT(); | |
e9ce8d39 A |
577 | } |
578 | switch (c) { | |
579 | case '.': | |
580 | if (p->g->cflags®_NEWLINE) | |
581 | nonnewline(p); | |
582 | else | |
583 | EMIT(OANY, 0); | |
584 | break; | |
585 | case '[': | |
586 | p_bracket(p); | |
587 | break; | |
588 | case BACKSL|'{': | |
589 | SETERROR(REG_BADRPT); | |
590 | break; | |
591 | case BACKSL|'(': | |
592 | p->g->nsub++; | |
593 | subno = p->g->nsub; | |
594 | if (subno < NPAREN) | |
595 | p->pbegin[subno] = HERE(); | |
596 | EMIT(OLPAREN, subno); | |
597 | /* the MORE here is an error heuristic */ | |
598 | if (MORE() && !SEETWO('\\', ')')) | |
599 | p_bre(p, '\\', ')'); | |
600 | if (subno < NPAREN) { | |
601 | p->pend[subno] = HERE(); | |
602 | assert(p->pend[subno] != 0); | |
603 | } | |
604 | EMIT(ORPAREN, subno); | |
9385eb3d | 605 | (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); |
e9ce8d39 A |
606 | break; |
607 | case BACKSL|')': /* should not get here -- must be user */ | |
608 | case BACKSL|'}': | |
609 | SETERROR(REG_EPAREN); | |
610 | break; | |
611 | case BACKSL|'1': | |
612 | case BACKSL|'2': | |
613 | case BACKSL|'3': | |
614 | case BACKSL|'4': | |
615 | case BACKSL|'5': | |
616 | case BACKSL|'6': | |
617 | case BACKSL|'7': | |
618 | case BACKSL|'8': | |
619 | case BACKSL|'9': | |
620 | i = (c&~BACKSL) - '0'; | |
621 | assert(i < NPAREN); | |
622 | if (p->pend[i] != 0) { | |
623 | assert(i <= p->g->nsub); | |
624 | EMIT(OBACK_, i); | |
625 | assert(p->pbegin[i] != 0); | |
626 | assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); | |
627 | assert(OP(p->strip[p->pend[i]]) == ORPAREN); | |
628 | (void) dupl(p, p->pbegin[i]+1, p->pend[i]); | |
629 | EMIT(O_BACK, i); | |
630 | } else | |
631 | SETERROR(REG_ESUBREG); | |
632 | p->g->backrefs = 1; | |
633 | break; | |
634 | case '*': | |
9385eb3d | 635 | (void)REQUIRE(starordinary, REG_BADRPT); |
e9ce8d39 A |
636 | /* FALLTHROUGH */ |
637 | default: | |
9385eb3d | 638 | ordinary(p, (char)c); |
e9ce8d39 A |
639 | break; |
640 | } | |
641 | ||
642 | if (EAT('*')) { /* implemented as +? */ | |
643 | /* this case does not require the (y|) trick, noKLUDGE */ | |
644 | INSERT(OPLUS_, pos); | |
645 | ASTERN(O_PLUS, pos); | |
646 | INSERT(OQUEST_, pos); | |
647 | ASTERN(O_QUEST, pos); | |
648 | } else if (EATTWO('\\', '{')) { | |
649 | count = p_count(p); | |
650 | if (EAT(',')) { | |
9385eb3d | 651 | if (MORE() && isdigit((uch)PEEK())) { |
e9ce8d39 | 652 | count2 = p_count(p); |
9385eb3d | 653 | (void)REQUIRE(count <= count2, REG_BADBR); |
e9ce8d39 A |
654 | } else /* single number with comma */ |
655 | count2 = INFINITY; | |
656 | } else /* just a single number */ | |
657 | count2 = count; | |
658 | repeat(p, pos, count, count2); | |
659 | if (!EATTWO('\\', '}')) { /* error heuristics */ | |
660 | while (MORE() && !SEETWO('\\', '}')) | |
661 | NEXT(); | |
9385eb3d | 662 | (void)REQUIRE(MORE(), REG_EBRACE); |
e9ce8d39 A |
663 | SETERROR(REG_BADBR); |
664 | } | |
9385eb3d | 665 | } else if (c == '$') /* $ (but not \$) ends it */ |
e9ce8d39 A |
666 | return(1); |
667 | ||
668 | return(0); | |
669 | } | |
670 | ||
671 | /* | |
672 | - p_count - parse a repetition count | |
9385eb3d | 673 | == static int p_count(struct parse *p); |
e9ce8d39 A |
674 | */ |
675 | static int /* the value */ | |
676 | p_count(p) | |
9385eb3d | 677 | struct parse *p; |
e9ce8d39 | 678 | { |
9385eb3d A |
679 | int count = 0; |
680 | int ndigits = 0; | |
e9ce8d39 | 681 | |
9385eb3d | 682 | while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { |
e9ce8d39 A |
683 | count = count*10 + (GETNEXT() - '0'); |
684 | ndigits++; | |
685 | } | |
686 | ||
9385eb3d | 687 | (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); |
e9ce8d39 A |
688 | return(count); |
689 | } | |
690 | ||
691 | /* | |
692 | - p_bracket - parse a bracketed character list | |
9385eb3d | 693 | == static void p_bracket(struct parse *p); |
e9ce8d39 A |
694 | * |
695 | * Note a significant property of this code: if the allocset() did SETERROR, | |
696 | * no set operations are done. | |
697 | */ | |
698 | static void | |
699 | p_bracket(p) | |
9385eb3d | 700 | struct parse *p; |
e9ce8d39 | 701 | { |
9385eb3d A |
702 | cset *cs = allocset(p); |
703 | int invert = 0; | |
e9ce8d39 A |
704 | |
705 | /* Dept of Truly Sickening Special-Case Kludges */ | |
706 | if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { | |
707 | EMIT(OBOW, 0); | |
708 | NEXTn(6); | |
709 | return; | |
710 | } | |
711 | if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { | |
712 | EMIT(OEOW, 0); | |
713 | NEXTn(6); | |
714 | return; | |
715 | } | |
716 | ||
717 | if (EAT('^')) | |
718 | invert++; /* make note to invert set at end */ | |
719 | if (EAT(']')) | |
720 | CHadd(cs, ']'); | |
721 | else if (EAT('-')) | |
722 | CHadd(cs, '-'); | |
723 | while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) | |
724 | p_b_term(p, cs); | |
725 | if (EAT('-')) | |
726 | CHadd(cs, '-'); | |
9385eb3d | 727 | (void)MUSTEAT(']', REG_EBRACK); |
e9ce8d39 A |
728 | |
729 | if (p->error != 0) /* don't mess things up further */ | |
730 | return; | |
731 | ||
732 | if (p->g->cflags®_ICASE) { | |
9385eb3d A |
733 | int i; |
734 | int ci; | |
e9ce8d39 A |
735 | |
736 | for (i = p->g->csetsize - 1; i >= 0; i--) | |
737 | if (CHIN(cs, i) && isalpha(i)) { | |
738 | ci = othercase(i); | |
739 | if (ci != i) | |
740 | CHadd(cs, ci); | |
741 | } | |
742 | if (cs->multis != NULL) | |
743 | mccase(p, cs); | |
744 | } | |
745 | if (invert) { | |
9385eb3d | 746 | int i; |
e9ce8d39 A |
747 | |
748 | for (i = p->g->csetsize - 1; i >= 0; i--) | |
749 | if (CHIN(cs, i)) | |
750 | CHsub(cs, i); | |
751 | else | |
752 | CHadd(cs, i); | |
753 | if (p->g->cflags®_NEWLINE) | |
754 | CHsub(cs, '\n'); | |
755 | if (cs->multis != NULL) | |
756 | mcinvert(p, cs); | |
757 | } | |
758 | ||
759 | assert(cs->multis == NULL); /* xxx */ | |
760 | ||
761 | if (nch(p, cs) == 1) { /* optimize singleton sets */ | |
762 | ordinary(p, firstch(p, cs)); | |
763 | freeset(p, cs); | |
764 | } else | |
765 | EMIT(OANYOF, freezeset(p, cs)); | |
766 | } | |
767 | ||
768 | /* | |
769 | - p_b_term - parse one term of a bracketed character list | |
9385eb3d | 770 | == static void p_b_term(struct parse *p, cset *cs); |
e9ce8d39 A |
771 | */ |
772 | static void | |
773 | p_b_term(p, cs) | |
9385eb3d A |
774 | struct parse *p; |
775 | cset *cs; | |
e9ce8d39 | 776 | { |
9385eb3d A |
777 | char c; |
778 | char start, finish; | |
779 | int i; | |
e9ce8d39 A |
780 | |
781 | /* classify what we've got */ | |
782 | switch ((MORE()) ? PEEK() : '\0') { | |
783 | case '[': | |
784 | c = (MORE2()) ? PEEK2() : '\0'; | |
785 | break; | |
786 | case '-': | |
787 | SETERROR(REG_ERANGE); | |
788 | return; /* NOTE RETURN */ | |
789 | break; | |
790 | default: | |
791 | c = '\0'; | |
792 | break; | |
793 | } | |
794 | ||
795 | switch (c) { | |
796 | case ':': /* character class */ | |
797 | NEXT2(); | |
9385eb3d | 798 | (void)REQUIRE(MORE(), REG_EBRACK); |
e9ce8d39 | 799 | c = PEEK(); |
9385eb3d | 800 | (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); |
e9ce8d39 | 801 | p_b_cclass(p, cs); |
9385eb3d A |
802 | (void)REQUIRE(MORE(), REG_EBRACK); |
803 | (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); | |
e9ce8d39 A |
804 | break; |
805 | case '=': /* equivalence class */ | |
806 | NEXT2(); | |
9385eb3d | 807 | (void)REQUIRE(MORE(), REG_EBRACK); |
e9ce8d39 | 808 | c = PEEK(); |
9385eb3d | 809 | (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); |
e9ce8d39 | 810 | p_b_eclass(p, cs); |
9385eb3d A |
811 | (void)REQUIRE(MORE(), REG_EBRACK); |
812 | (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); | |
e9ce8d39 A |
813 | break; |
814 | default: /* symbol, ordinary character, or range */ | |
815 | /* xxx revision needed for multichar stuff */ | |
816 | start = p_b_symbol(p); | |
817 | if (SEE('-') && MORE2() && PEEK2() != ']') { | |
818 | /* range */ | |
819 | NEXT(); | |
820 | if (EAT('-')) | |
821 | finish = '-'; | |
822 | else | |
823 | finish = p_b_symbol(p); | |
824 | } else | |
825 | finish = start; | |
9385eb3d A |
826 | if (start == finish) |
827 | CHadd(cs, start); | |
828 | else { | |
829 | if (__collate_load_error) { | |
830 | (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); | |
831 | for (i = (uch)start; i <= (uch)finish; i++) | |
832 | CHadd(cs, i); | |
833 | } else { | |
834 | (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); | |
835 | for (i = CHAR_MIN; i <= CHAR_MAX; i++) { | |
836 | if ( __collate_range_cmp(start, i) <= 0 | |
837 | && __collate_range_cmp(i, finish) <= 0 | |
838 | ) | |
839 | CHadd(cs, i); | |
840 | } | |
841 | } | |
842 | } | |
e9ce8d39 A |
843 | break; |
844 | } | |
845 | } | |
846 | ||
847 | /* | |
848 | - p_b_cclass - parse a character-class name and deal with it | |
9385eb3d | 849 | == static void p_b_cclass(struct parse *p, cset *cs); |
e9ce8d39 A |
850 | */ |
851 | static void | |
852 | p_b_cclass(p, cs) | |
9385eb3d A |
853 | struct parse *p; |
854 | cset *cs; | |
e9ce8d39 | 855 | { |
9385eb3d A |
856 | int c; |
857 | char *sp = p->next; | |
858 | struct cclass *cp; | |
859 | size_t len; | |
e9ce8d39 | 860 | |
9385eb3d | 861 | while (MORE() && isalpha((uch)PEEK())) |
e9ce8d39 A |
862 | NEXT(); |
863 | len = p->next - sp; | |
864 | for (cp = cclasses; cp->name != NULL; cp++) | |
865 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') | |
866 | break; | |
867 | if (cp->name == NULL) { | |
868 | /* oops, didn't find it */ | |
869 | SETERROR(REG_ECTYPE); | |
870 | return; | |
871 | } | |
872 | ||
9385eb3d A |
873 | switch (cp->fidx) { |
874 | case CALNUM: | |
875 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
876 | if (isalnum((uch)c)) | |
877 | CHadd(cs, c); | |
878 | break; | |
879 | case CALPHA: | |
880 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
881 | if (isalpha((uch)c)) | |
882 | CHadd(cs, c); | |
883 | break; | |
884 | case CBLANK: | |
885 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
886 | if (isblank((uch)c)) | |
887 | CHadd(cs, c); | |
888 | break; | |
889 | case CCNTRL: | |
890 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
891 | if (iscntrl((uch)c)) | |
892 | CHadd(cs, c); | |
893 | break; | |
894 | case CDIGIT: | |
895 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
896 | if (isdigit((uch)c)) | |
897 | CHadd(cs, c); | |
898 | break; | |
899 | case CGRAPH: | |
900 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
901 | if (isgraph((uch)c)) | |
902 | CHadd(cs, c); | |
903 | break; | |
904 | case CLOWER: | |
905 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
906 | if (islower((uch)c)) | |
907 | CHadd(cs, c); | |
908 | break; | |
909 | case CPRINT: | |
910 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
911 | if (isprint((uch)c)) | |
912 | CHadd(cs, c); | |
913 | break; | |
914 | case CPUNCT: | |
915 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
916 | if (ispunct((uch)c)) | |
917 | CHadd(cs, c); | |
918 | break; | |
919 | case CSPACE: | |
920 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
921 | if (isspace((uch)c)) | |
922 | CHadd(cs, c); | |
923 | break; | |
924 | case CUPPER: | |
925 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
926 | if (isupper((uch)c)) | |
927 | CHadd(cs, c); | |
928 | break; | |
929 | case CXDIGIT: | |
930 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
931 | if (isxdigit((uch)c)) | |
932 | CHadd(cs, c); | |
933 | break; | |
934 | } | |
935 | #if 0 | |
e9ce8d39 A |
936 | for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) |
937 | MCadd(p, cs, u); | |
9385eb3d | 938 | #endif |
e9ce8d39 A |
939 | } |
940 | ||
941 | /* | |
942 | - p_b_eclass - parse an equivalence-class name and deal with it | |
9385eb3d | 943 | == static void p_b_eclass(struct parse *p, cset *cs); |
e9ce8d39 A |
944 | * |
945 | * This implementation is incomplete. xxx | |
946 | */ | |
947 | static void | |
948 | p_b_eclass(p, cs) | |
9385eb3d A |
949 | struct parse *p; |
950 | cset *cs; | |
e9ce8d39 | 951 | { |
9385eb3d | 952 | char c; |
e9ce8d39 A |
953 | |
954 | c = p_b_coll_elem(p, '='); | |
955 | CHadd(cs, c); | |
956 | } | |
957 | ||
958 | /* | |
959 | - p_b_symbol - parse a character or [..]ed multicharacter collating symbol | |
9385eb3d | 960 | == static char p_b_symbol(struct parse *p); |
e9ce8d39 A |
961 | */ |
962 | static char /* value of symbol */ | |
963 | p_b_symbol(p) | |
9385eb3d | 964 | struct parse *p; |
e9ce8d39 | 965 | { |
9385eb3d | 966 | char value; |
e9ce8d39 | 967 | |
9385eb3d | 968 | (void)REQUIRE(MORE(), REG_EBRACK); |
e9ce8d39 A |
969 | if (!EATTWO('[', '.')) |
970 | return(GETNEXT()); | |
971 | ||
972 | /* collating symbol */ | |
973 | value = p_b_coll_elem(p, '.'); | |
9385eb3d | 974 | (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); |
e9ce8d39 A |
975 | return(value); |
976 | } | |
977 | ||
978 | /* | |
979 | - p_b_coll_elem - parse a collating-element name and look it up | |
9385eb3d | 980 | == static char p_b_coll_elem(struct parse *p, int endc); |
e9ce8d39 A |
981 | */ |
982 | static char /* value of collating element */ | |
983 | p_b_coll_elem(p, endc) | |
9385eb3d | 984 | struct parse *p; |
e9ce8d39 A |
985 | int endc; /* name ended by endc,']' */ |
986 | { | |
9385eb3d A |
987 | char *sp = p->next; |
988 | struct cname *cp; | |
989 | int len; | |
e9ce8d39 A |
990 | |
991 | while (MORE() && !SEETWO(endc, ']')) | |
992 | NEXT(); | |
993 | if (!MORE()) { | |
994 | SETERROR(REG_EBRACK); | |
995 | return(0); | |
996 | } | |
997 | len = p->next - sp; | |
998 | for (cp = cnames; cp->name != NULL; cp++) | |
999 | if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') | |
1000 | return(cp->code); /* known name */ | |
1001 | if (len == 1) | |
1002 | return(*sp); /* single character */ | |
1003 | SETERROR(REG_ECOLLATE); /* neither */ | |
1004 | return(0); | |
1005 | } | |
1006 | ||
1007 | /* | |
1008 | - othercase - return the case counterpart of an alphabetic | |
1009 | == static char othercase(int ch); | |
1010 | */ | |
1011 | static char /* if no counterpart, return ch */ | |
1012 | othercase(ch) | |
1013 | int ch; | |
1014 | { | |
9385eb3d | 1015 | ch = (uch)ch; |
e9ce8d39 A |
1016 | assert(isalpha(ch)); |
1017 | if (isupper(ch)) | |
1018 | return(tolower(ch)); | |
1019 | else if (islower(ch)) | |
1020 | return(toupper(ch)); | |
1021 | else /* peculiar, but could happen */ | |
1022 | return(ch); | |
1023 | } | |
1024 | ||
1025 | /* | |
1026 | - bothcases - emit a dualcase version of a two-case character | |
9385eb3d | 1027 | == static void bothcases(struct parse *p, int ch); |
e9ce8d39 A |
1028 | * |
1029 | * Boy, is this implementation ever a kludge... | |
1030 | */ | |
1031 | static void | |
1032 | bothcases(p, ch) | |
9385eb3d | 1033 | struct parse *p; |
e9ce8d39 A |
1034 | int ch; |
1035 | { | |
9385eb3d A |
1036 | char *oldnext = p->next; |
1037 | char *oldend = p->end; | |
e9ce8d39 A |
1038 | char bracket[3]; |
1039 | ||
9385eb3d | 1040 | ch = (uch)ch; |
e9ce8d39 A |
1041 | assert(othercase(ch) != ch); /* p_bracket() would recurse */ |
1042 | p->next = bracket; | |
1043 | p->end = bracket+2; | |
1044 | bracket[0] = ch; | |
1045 | bracket[1] = ']'; | |
1046 | bracket[2] = '\0'; | |
1047 | p_bracket(p); | |
1048 | assert(p->next == bracket+2); | |
1049 | p->next = oldnext; | |
1050 | p->end = oldend; | |
1051 | } | |
1052 | ||
1053 | /* | |
1054 | - ordinary - emit an ordinary character | |
9385eb3d | 1055 | == static void ordinary(struct parse *p, int ch); |
e9ce8d39 A |
1056 | */ |
1057 | static void | |
1058 | ordinary(p, ch) | |
9385eb3d A |
1059 | struct parse *p; |
1060 | int ch; | |
e9ce8d39 | 1061 | { |
9385eb3d | 1062 | cat_t *cap = p->g->categories; |
e9ce8d39 | 1063 | |
9385eb3d | 1064 | if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) |
e9ce8d39 A |
1065 | bothcases(p, ch); |
1066 | else { | |
9385eb3d | 1067 | EMIT(OCHAR, (uch)ch); |
e9ce8d39 A |
1068 | if (cap[ch] == 0) |
1069 | cap[ch] = p->g->ncategories++; | |
1070 | } | |
1071 | } | |
1072 | ||
1073 | /* | |
1074 | - nonnewline - emit REG_NEWLINE version of OANY | |
9385eb3d | 1075 | == static void nonnewline(struct parse *p); |
e9ce8d39 A |
1076 | * |
1077 | * Boy, is this implementation ever a kludge... | |
1078 | */ | |
1079 | static void | |
1080 | nonnewline(p) | |
9385eb3d | 1081 | struct parse *p; |
e9ce8d39 | 1082 | { |
9385eb3d A |
1083 | char *oldnext = p->next; |
1084 | char *oldend = p->end; | |
e9ce8d39 A |
1085 | char bracket[4]; |
1086 | ||
1087 | p->next = bracket; | |
1088 | p->end = bracket+3; | |
1089 | bracket[0] = '^'; | |
1090 | bracket[1] = '\n'; | |
1091 | bracket[2] = ']'; | |
1092 | bracket[3] = '\0'; | |
1093 | p_bracket(p); | |
1094 | assert(p->next == bracket+3); | |
1095 | p->next = oldnext; | |
1096 | p->end = oldend; | |
1097 | } | |
1098 | ||
1099 | /* | |
1100 | - repeat - generate code for a bounded repetition, recursively if needed | |
9385eb3d | 1101 | == static void repeat(struct parse *p, sopno start, int from, int to); |
e9ce8d39 A |
1102 | */ |
1103 | static void | |
1104 | repeat(p, start, from, to) | |
9385eb3d | 1105 | struct parse *p; |
e9ce8d39 A |
1106 | sopno start; /* operand from here to end of strip */ |
1107 | int from; /* repeated from this number */ | |
1108 | int to; /* to this number of times (maybe INFINITY) */ | |
1109 | { | |
9385eb3d | 1110 | sopno finish = HERE(); |
e9ce8d39 A |
1111 | # define N 2 |
1112 | # define INF 3 | |
1113 | # define REP(f, t) ((f)*8 + (t)) | |
1114 | # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) | |
9385eb3d | 1115 | sopno copy; |
e9ce8d39 A |
1116 | |
1117 | if (p->error != 0) /* head off possible runaway recursion */ | |
1118 | return; | |
1119 | ||
1120 | assert(from <= to); | |
1121 | ||
1122 | switch (REP(MAP(from), MAP(to))) { | |
1123 | case REP(0, 0): /* must be user doing this */ | |
1124 | DROP(finish-start); /* drop the operand */ | |
1125 | break; | |
1126 | case REP(0, 1): /* as x{1,1}? */ | |
1127 | case REP(0, N): /* as x{1,n}? */ | |
1128 | case REP(0, INF): /* as x{1,}? */ | |
1129 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | |
1130 | INSERT(OCH_, start); /* offset is wrong... */ | |
1131 | repeat(p, start+1, 1, to); | |
1132 | ASTERN(OOR1, start); | |
1133 | AHEAD(start); /* ... fix it */ | |
1134 | EMIT(OOR2, 0); | |
1135 | AHEAD(THERE()); | |
1136 | ASTERN(O_CH, THERETHERE()); | |
1137 | break; | |
1138 | case REP(1, 1): /* trivial case */ | |
1139 | /* done */ | |
1140 | break; | |
1141 | case REP(1, N): /* as x?x{1,n-1} */ | |
1142 | /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ | |
1143 | INSERT(OCH_, start); | |
1144 | ASTERN(OOR1, start); | |
1145 | AHEAD(start); | |
1146 | EMIT(OOR2, 0); /* offset very wrong... */ | |
1147 | AHEAD(THERE()); /* ...so fix it */ | |
1148 | ASTERN(O_CH, THERETHERE()); | |
1149 | copy = dupl(p, start+1, finish+1); | |
1150 | assert(copy == finish+4); | |
1151 | repeat(p, copy, 1, to-1); | |
1152 | break; | |
1153 | case REP(1, INF): /* as x+ */ | |
1154 | INSERT(OPLUS_, start); | |
1155 | ASTERN(O_PLUS, start); | |
1156 | break; | |
1157 | case REP(N, N): /* as xx{m-1,n-1} */ | |
1158 | copy = dupl(p, start, finish); | |
1159 | repeat(p, copy, from-1, to-1); | |
1160 | break; | |
1161 | case REP(N, INF): /* as xx{n-1,INF} */ | |
1162 | copy = dupl(p, start, finish); | |
1163 | repeat(p, copy, from-1, to); | |
1164 | break; | |
1165 | default: /* "can't happen" */ | |
1166 | SETERROR(REG_ASSERT); /* just in case */ | |
1167 | break; | |
1168 | } | |
1169 | } | |
1170 | ||
1171 | /* | |
1172 | - seterr - set an error condition | |
9385eb3d | 1173 | == static int seterr(struct parse *p, int e); |
e9ce8d39 A |
1174 | */ |
1175 | static int /* useless but makes type checking happy */ | |
1176 | seterr(p, e) | |
9385eb3d | 1177 | struct parse *p; |
e9ce8d39 A |
1178 | int e; |
1179 | { | |
1180 | if (p->error == 0) /* keep earliest error condition */ | |
1181 | p->error = e; | |
1182 | p->next = nuls; /* try to bring things to a halt */ | |
1183 | p->end = nuls; | |
1184 | return(0); /* make the return value well-defined */ | |
1185 | } | |
1186 | ||
1187 | /* | |
1188 | - allocset - allocate a set of characters for [] | |
9385eb3d | 1189 | == static cset *allocset(struct parse *p); |
e9ce8d39 A |
1190 | */ |
1191 | static cset * | |
1192 | allocset(p) | |
9385eb3d | 1193 | struct parse *p; |
e9ce8d39 | 1194 | { |
9385eb3d A |
1195 | int no = p->g->ncsets++; |
1196 | size_t nc; | |
1197 | size_t nbytes; | |
1198 | cset *cs; | |
1199 | size_t css = (size_t)p->g->csetsize; | |
1200 | int i; | |
e9ce8d39 A |
1201 | |
1202 | if (no >= p->ncsalloc) { /* need another column of space */ | |
1203 | p->ncsalloc += CHAR_BIT; | |
1204 | nc = p->ncsalloc; | |
1205 | assert(nc % CHAR_BIT == 0); | |
1206 | nbytes = nc / CHAR_BIT * css; | |
1207 | if (p->g->sets == NULL) | |
1208 | p->g->sets = (cset *)malloc(nc * sizeof(cset)); | |
1209 | else | |
9385eb3d | 1210 | p->g->sets = (cset *)reallocf((char *)p->g->sets, |
e9ce8d39 A |
1211 | nc * sizeof(cset)); |
1212 | if (p->g->setbits == NULL) | |
1213 | p->g->setbits = (uch *)malloc(nbytes); | |
1214 | else { | |
9385eb3d | 1215 | p->g->setbits = (uch *)reallocf((char *)p->g->setbits, |
e9ce8d39 A |
1216 | nbytes); |
1217 | /* xxx this isn't right if setbits is now NULL */ | |
1218 | for (i = 0; i < no; i++) | |
1219 | p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); | |
1220 | } | |
1221 | if (p->g->sets != NULL && p->g->setbits != NULL) | |
1222 | (void) memset((char *)p->g->setbits + (nbytes - css), | |
1223 | 0, css); | |
1224 | else { | |
1225 | no = 0; | |
1226 | SETERROR(REG_ESPACE); | |
1227 | /* caller's responsibility not to do set ops */ | |
1228 | } | |
1229 | } | |
1230 | ||
1231 | assert(p->g->sets != NULL); /* xxx */ | |
1232 | cs = &p->g->sets[no]; | |
1233 | cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); | |
1234 | cs->mask = 1 << ((no) % CHAR_BIT); | |
1235 | cs->hash = 0; | |
1236 | cs->smultis = 0; | |
1237 | cs->multis = NULL; | |
1238 | ||
1239 | return(cs); | |
1240 | } | |
1241 | ||
1242 | /* | |
1243 | - freeset - free a now-unused set | |
9385eb3d | 1244 | == static void freeset(struct parse *p, cset *cs); |
e9ce8d39 A |
1245 | */ |
1246 | static void | |
1247 | freeset(p, cs) | |
9385eb3d A |
1248 | struct parse *p; |
1249 | cset *cs; | |
e9ce8d39 | 1250 | { |
9385eb3d A |
1251 | int i; |
1252 | cset *top = &p->g->sets[p->g->ncsets]; | |
1253 | size_t css = (size_t)p->g->csetsize; | |
e9ce8d39 A |
1254 | |
1255 | for (i = 0; i < css; i++) | |
1256 | CHsub(cs, i); | |
1257 | if (cs == top-1) /* recover only the easy case */ | |
1258 | p->g->ncsets--; | |
1259 | } | |
1260 | ||
1261 | /* | |
1262 | - freezeset - final processing on a set of characters | |
9385eb3d | 1263 | == static int freezeset(struct parse *p, cset *cs); |
e9ce8d39 A |
1264 | * |
1265 | * The main task here is merging identical sets. This is usually a waste | |
1266 | * of time (although the hash code minimizes the overhead), but can win | |
1267 | * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash | |
1268 | * is done using addition rather than xor -- all ASCII [aA] sets xor to | |
1269 | * the same value! | |
1270 | */ | |
1271 | static int /* set number */ | |
1272 | freezeset(p, cs) | |
9385eb3d A |
1273 | struct parse *p; |
1274 | cset *cs; | |
e9ce8d39 | 1275 | { |
9385eb3d A |
1276 | short h = cs->hash; |
1277 | int i; | |
1278 | cset *top = &p->g->sets[p->g->ncsets]; | |
1279 | cset *cs2; | |
1280 | size_t css = (size_t)p->g->csetsize; | |
e9ce8d39 A |
1281 | |
1282 | /* look for an earlier one which is the same */ | |
1283 | for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) | |
1284 | if (cs2->hash == h && cs2 != cs) { | |
1285 | /* maybe */ | |
1286 | for (i = 0; i < css; i++) | |
1287 | if (!!CHIN(cs2, i) != !!CHIN(cs, i)) | |
1288 | break; /* no */ | |
1289 | if (i == css) | |
1290 | break; /* yes */ | |
1291 | } | |
1292 | ||
1293 | if (cs2 < top) { /* found one */ | |
1294 | freeset(p, cs); | |
1295 | cs = cs2; | |
1296 | } | |
1297 | ||
1298 | return((int)(cs - p->g->sets)); | |
1299 | } | |
1300 | ||
1301 | /* | |
1302 | - firstch - return first character in a set (which must have at least one) | |
9385eb3d | 1303 | == static int firstch(struct parse *p, cset *cs); |
e9ce8d39 A |
1304 | */ |
1305 | static int /* character; there is no "none" value */ | |
1306 | firstch(p, cs) | |
9385eb3d A |
1307 | struct parse *p; |
1308 | cset *cs; | |
e9ce8d39 | 1309 | { |
9385eb3d A |
1310 | int i; |
1311 | size_t css = (size_t)p->g->csetsize; | |
e9ce8d39 A |
1312 | |
1313 | for (i = 0; i < css; i++) | |
1314 | if (CHIN(cs, i)) | |
1315 | return((char)i); | |
1316 | assert(never); | |
1317 | return(0); /* arbitrary */ | |
1318 | } | |
1319 | ||
1320 | /* | |
1321 | - nch - number of characters in a set | |
9385eb3d | 1322 | == static int nch(struct parse *p, cset *cs); |
e9ce8d39 A |
1323 | */ |
1324 | static int | |
1325 | nch(p, cs) | |
9385eb3d A |
1326 | struct parse *p; |
1327 | cset *cs; | |
e9ce8d39 | 1328 | { |
9385eb3d A |
1329 | int i; |
1330 | size_t css = (size_t)p->g->csetsize; | |
1331 | int n = 0; | |
e9ce8d39 A |
1332 | |
1333 | for (i = 0; i < css; i++) | |
1334 | if (CHIN(cs, i)) | |
1335 | n++; | |
1336 | return(n); | |
1337 | } | |
1338 | ||
1339 | /* | |
1340 | - mcadd - add a collating element to a cset | |
9385eb3d A |
1341 | == static void mcadd(struct parse *p, cset *cs, \ |
1342 | == char *cp); | |
e9ce8d39 A |
1343 | */ |
1344 | static void | |
1345 | mcadd(p, cs, cp) | |
9385eb3d A |
1346 | struct parse *p; |
1347 | cset *cs; | |
1348 | char *cp; | |
e9ce8d39 | 1349 | { |
9385eb3d | 1350 | size_t oldend = cs->smultis; |
e9ce8d39 A |
1351 | |
1352 | cs->smultis += strlen(cp) + 1; | |
1353 | if (cs->multis == NULL) | |
1354 | cs->multis = malloc(cs->smultis); | |
1355 | else | |
9385eb3d | 1356 | cs->multis = reallocf(cs->multis, cs->smultis); |
e9ce8d39 A |
1357 | if (cs->multis == NULL) { |
1358 | SETERROR(REG_ESPACE); | |
1359 | return; | |
1360 | } | |
1361 | ||
1362 | (void) strcpy(cs->multis + oldend - 1, cp); | |
1363 | cs->multis[cs->smultis - 1] = '\0'; | |
1364 | } | |
1365 | ||
9385eb3d | 1366 | #if used |
e9ce8d39 A |
1367 | /* |
1368 | - mcsub - subtract a collating element from a cset | |
9385eb3d | 1369 | == static void mcsub(cset *cs, char *cp); |
e9ce8d39 A |
1370 | */ |
1371 | static void | |
1372 | mcsub(cs, cp) | |
9385eb3d A |
1373 | cset *cs; |
1374 | char *cp; | |
e9ce8d39 | 1375 | { |
9385eb3d A |
1376 | char *fp = mcfind(cs, cp); |
1377 | size_t len = strlen(fp); | |
e9ce8d39 A |
1378 | |
1379 | assert(fp != NULL); | |
1380 | (void) memmove(fp, fp + len + 1, | |
1381 | cs->smultis - (fp + len + 1 - cs->multis)); | |
1382 | cs->smultis -= len; | |
1383 | ||
1384 | if (cs->smultis == 0) { | |
1385 | free(cs->multis); | |
1386 | cs->multis = NULL; | |
1387 | return; | |
1388 | } | |
1389 | ||
9385eb3d | 1390 | cs->multis = reallocf(cs->multis, cs->smultis); |
e9ce8d39 A |
1391 | assert(cs->multis != NULL); |
1392 | } | |
1393 | ||
1394 | /* | |
1395 | - mcin - is a collating element in a cset? | |
9385eb3d | 1396 | == static int mcin(cset *cs, char *cp); |
e9ce8d39 A |
1397 | */ |
1398 | static int | |
1399 | mcin(cs, cp) | |
9385eb3d A |
1400 | cset *cs; |
1401 | char *cp; | |
e9ce8d39 A |
1402 | { |
1403 | return(mcfind(cs, cp) != NULL); | |
1404 | } | |
1405 | ||
1406 | /* | |
1407 | - mcfind - find a collating element in a cset | |
9385eb3d | 1408 | == static char *mcfind(cset *cs, char *cp); |
e9ce8d39 A |
1409 | */ |
1410 | static char * | |
1411 | mcfind(cs, cp) | |
9385eb3d A |
1412 | cset *cs; |
1413 | char *cp; | |
e9ce8d39 | 1414 | { |
9385eb3d | 1415 | char *p; |
e9ce8d39 A |
1416 | |
1417 | if (cs->multis == NULL) | |
1418 | return(NULL); | |
1419 | for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) | |
1420 | if (strcmp(cp, p) == 0) | |
1421 | return(p); | |
1422 | return(NULL); | |
1423 | } | |
9385eb3d | 1424 | #endif |
e9ce8d39 A |
1425 | |
1426 | /* | |
1427 | - mcinvert - invert the list of collating elements in a cset | |
9385eb3d | 1428 | == static void mcinvert(struct parse *p, cset *cs); |
e9ce8d39 A |
1429 | * |
1430 | * This would have to know the set of possibilities. Implementation | |
1431 | * is deferred. | |
1432 | */ | |
1433 | static void | |
1434 | mcinvert(p, cs) | |
9385eb3d A |
1435 | struct parse *p; |
1436 | cset *cs; | |
e9ce8d39 A |
1437 | { |
1438 | assert(cs->multis == NULL); /* xxx */ | |
1439 | } | |
1440 | ||
1441 | /* | |
1442 | - mccase - add case counterparts of the list of collating elements in a cset | |
9385eb3d | 1443 | == static void mccase(struct parse *p, cset *cs); |
e9ce8d39 A |
1444 | * |
1445 | * This would have to know the set of possibilities. Implementation | |
1446 | * is deferred. | |
1447 | */ | |
1448 | static void | |
1449 | mccase(p, cs) | |
9385eb3d A |
1450 | struct parse *p; |
1451 | cset *cs; | |
e9ce8d39 A |
1452 | { |
1453 | assert(cs->multis == NULL); /* xxx */ | |
1454 | } | |
1455 | ||
1456 | /* | |
1457 | - isinsets - is this character in any sets? | |
9385eb3d | 1458 | == static int isinsets(struct re_guts *g, int c); |
e9ce8d39 A |
1459 | */ |
1460 | static int /* predicate */ | |
1461 | isinsets(g, c) | |
9385eb3d | 1462 | struct re_guts *g; |
e9ce8d39 A |
1463 | int c; |
1464 | { | |
9385eb3d A |
1465 | uch *col; |
1466 | int i; | |
1467 | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; | |
1468 | unsigned uc = (uch)c; | |
e9ce8d39 A |
1469 | |
1470 | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) | |
1471 | if (col[uc] != 0) | |
1472 | return(1); | |
1473 | return(0); | |
1474 | } | |
1475 | ||
1476 | /* | |
1477 | - samesets - are these two characters in exactly the same sets? | |
9385eb3d | 1478 | == static int samesets(struct re_guts *g, int c1, int c2); |
e9ce8d39 A |
1479 | */ |
1480 | static int /* predicate */ | |
1481 | samesets(g, c1, c2) | |
9385eb3d | 1482 | struct re_guts *g; |
e9ce8d39 A |
1483 | int c1; |
1484 | int c2; | |
1485 | { | |
9385eb3d A |
1486 | uch *col; |
1487 | int i; | |
1488 | int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; | |
1489 | unsigned uc1 = (uch)c1; | |
1490 | unsigned uc2 = (uch)c2; | |
e9ce8d39 A |
1491 | |
1492 | for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) | |
1493 | if (col[uc1] != col[uc2]) | |
1494 | return(0); | |
1495 | return(1); | |
1496 | } | |
1497 | ||
1498 | /* | |
1499 | - categorize - sort out character categories | |
9385eb3d | 1500 | == static void categorize(struct parse *p, struct re_guts *g); |
e9ce8d39 A |
1501 | */ |
1502 | static void | |
1503 | categorize(p, g) | |
1504 | struct parse *p; | |
9385eb3d | 1505 | struct re_guts *g; |
e9ce8d39 | 1506 | { |
9385eb3d A |
1507 | cat_t *cats = g->categories; |
1508 | int c; | |
1509 | int c2; | |
1510 | cat_t cat; | |
e9ce8d39 A |
1511 | |
1512 | /* avoid making error situations worse */ | |
1513 | if (p->error != 0) | |
1514 | return; | |
1515 | ||
1516 | for (c = CHAR_MIN; c <= CHAR_MAX; c++) | |
1517 | if (cats[c] == 0 && isinsets(g, c)) { | |
1518 | cat = g->ncategories++; | |
1519 | cats[c] = cat; | |
1520 | for (c2 = c+1; c2 <= CHAR_MAX; c2++) | |
1521 | if (cats[c2] == 0 && samesets(g, c, c2)) | |
1522 | cats[c2] = cat; | |
1523 | } | |
1524 | } | |
1525 | ||
1526 | /* | |
1527 | - dupl - emit a duplicate of a bunch of sops | |
9385eb3d | 1528 | == static sopno dupl(struct parse *p, sopno start, sopno finish); |
e9ce8d39 A |
1529 | */ |
1530 | static sopno /* start of duplicate */ | |
1531 | dupl(p, start, finish) | |
9385eb3d | 1532 | struct parse *p; |
e9ce8d39 A |
1533 | sopno start; /* from here */ |
1534 | sopno finish; /* to this less one */ | |
1535 | { | |
9385eb3d A |
1536 | sopno ret = HERE(); |
1537 | sopno len = finish - start; | |
e9ce8d39 A |
1538 | |
1539 | assert(finish >= start); | |
1540 | if (len == 0) | |
1541 | return(ret); | |
1542 | enlarge(p, p->ssize + len); /* this many unexpected additions */ | |
1543 | assert(p->ssize >= p->slen + len); | |
1544 | (void) memcpy((char *)(p->strip + p->slen), | |
1545 | (char *)(p->strip + start), (size_t)len*sizeof(sop)); | |
1546 | p->slen += len; | |
1547 | return(ret); | |
1548 | } | |
1549 | ||
1550 | /* | |
1551 | - doemit - emit a strip operator | |
9385eb3d | 1552 | == static void doemit(struct parse *p, sop op, size_t opnd); |
e9ce8d39 A |
1553 | * |
1554 | * It might seem better to implement this as a macro with a function as | |
1555 | * hard-case backup, but it's just too big and messy unless there are | |
1556 | * some changes to the data structures. Maybe later. | |
1557 | */ | |
1558 | static void | |
1559 | doemit(p, op, opnd) | |
9385eb3d | 1560 | struct parse *p; |
e9ce8d39 A |
1561 | sop op; |
1562 | size_t opnd; | |
1563 | { | |
1564 | /* avoid making error situations worse */ | |
1565 | if (p->error != 0) | |
1566 | return; | |
1567 | ||
1568 | /* deal with oversize operands ("can't happen", more or less) */ | |
1569 | assert(opnd < 1<<OPSHIFT); | |
1570 | ||
1571 | /* deal with undersized strip */ | |
1572 | if (p->slen >= p->ssize) | |
1573 | enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ | |
1574 | assert(p->slen < p->ssize); | |
1575 | ||
1576 | /* finally, it's all reduced to the easy case */ | |
1577 | p->strip[p->slen++] = SOP(op, opnd); | |
1578 | } | |
1579 | ||
1580 | /* | |
1581 | - doinsert - insert a sop into the strip | |
9385eb3d | 1582 | == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); |
e9ce8d39 A |
1583 | */ |
1584 | static void | |
1585 | doinsert(p, op, opnd, pos) | |
9385eb3d | 1586 | struct parse *p; |
e9ce8d39 A |
1587 | sop op; |
1588 | size_t opnd; | |
1589 | sopno pos; | |
1590 | { | |
9385eb3d A |
1591 | sopno sn; |
1592 | sop s; | |
1593 | int i; | |
e9ce8d39 A |
1594 | |
1595 | /* avoid making error situations worse */ | |
1596 | if (p->error != 0) | |
1597 | return; | |
1598 | ||
1599 | sn = HERE(); | |
1600 | EMIT(op, opnd); /* do checks, ensure space */ | |
1601 | assert(HERE() == sn+1); | |
1602 | s = p->strip[sn]; | |
1603 | ||
1604 | /* adjust paren pointers */ | |
1605 | assert(pos > 0); | |
1606 | for (i = 1; i < NPAREN; i++) { | |
1607 | if (p->pbegin[i] >= pos) { | |
1608 | p->pbegin[i]++; | |
1609 | } | |
1610 | if (p->pend[i] >= pos) { | |
1611 | p->pend[i]++; | |
1612 | } | |
1613 | } | |
1614 | ||
1615 | memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], | |
1616 | (HERE()-pos-1)*sizeof(sop)); | |
1617 | p->strip[pos] = s; | |
1618 | } | |
1619 | ||
1620 | /* | |
1621 | - dofwd - complete a forward reference | |
9385eb3d | 1622 | == static void dofwd(struct parse *p, sopno pos, sop value); |
e9ce8d39 A |
1623 | */ |
1624 | static void | |
1625 | dofwd(p, pos, value) | |
9385eb3d A |
1626 | struct parse *p; |
1627 | sopno pos; | |
e9ce8d39 A |
1628 | sop value; |
1629 | { | |
1630 | /* avoid making error situations worse */ | |
1631 | if (p->error != 0) | |
1632 | return; | |
1633 | ||
1634 | assert(value < 1<<OPSHIFT); | |
1635 | p->strip[pos] = OP(p->strip[pos]) | value; | |
1636 | } | |
1637 | ||
1638 | /* | |
1639 | - enlarge - enlarge the strip | |
9385eb3d | 1640 | == static void enlarge(struct parse *p, sopno size); |
e9ce8d39 A |
1641 | */ |
1642 | static void | |
1643 | enlarge(p, size) | |
9385eb3d A |
1644 | struct parse *p; |
1645 | sopno size; | |
e9ce8d39 | 1646 | { |
9385eb3d | 1647 | sop *sp; |
e9ce8d39 A |
1648 | |
1649 | if (p->ssize >= size) | |
1650 | return; | |
1651 | ||
1652 | sp = (sop *)realloc(p->strip, size*sizeof(sop)); | |
1653 | if (sp == NULL) { | |
1654 | SETERROR(REG_ESPACE); | |
1655 | return; | |
1656 | } | |
1657 | p->strip = sp; | |
1658 | p->ssize = size; | |
1659 | } | |
1660 | ||
1661 | /* | |
1662 | - stripsnug - compact the strip | |
9385eb3d | 1663 | == static void stripsnug(struct parse *p, struct re_guts *g); |
e9ce8d39 A |
1664 | */ |
1665 | static void | |
1666 | stripsnug(p, g) | |
9385eb3d A |
1667 | struct parse *p; |
1668 | struct re_guts *g; | |
e9ce8d39 A |
1669 | { |
1670 | g->nstates = p->slen; | |
1671 | g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); | |
1672 | if (g->strip == NULL) { | |
1673 | SETERROR(REG_ESPACE); | |
1674 | g->strip = p->strip; | |
1675 | } | |
1676 | } | |
1677 | ||
1678 | /* | |
1679 | - findmust - fill in must and mlen with longest mandatory literal string | |
9385eb3d | 1680 | == static void findmust(struct parse *p, struct re_guts *g); |
e9ce8d39 A |
1681 | * |
1682 | * This algorithm could do fancy things like analyzing the operands of | | |
1683 | * for common subsequences. Someday. This code is simple and finds most | |
1684 | * of the interesting cases. | |
1685 | * | |
1686 | * Note that must and mlen got initialized during setup. | |
1687 | */ | |
1688 | static void | |
1689 | findmust(p, g) | |
1690 | struct parse *p; | |
9385eb3d | 1691 | struct re_guts *g; |
e9ce8d39 | 1692 | { |
9385eb3d | 1693 | sop *scan; |
e9ce8d39 | 1694 | sop *start; |
9385eb3d A |
1695 | sop *newstart; |
1696 | sopno newlen; | |
1697 | sop s; | |
1698 | char *cp; | |
1699 | sopno i; | |
1700 | int offset; | |
1701 | int cs, mccs; | |
e9ce8d39 A |
1702 | |
1703 | /* avoid making error situations worse */ | |
1704 | if (p->error != 0) | |
1705 | return; | |
1706 | ||
9385eb3d A |
1707 | /* Find out if we can handle OANYOF or not */ |
1708 | mccs = 0; | |
1709 | for (cs = 0; cs < g->ncsets; cs++) | |
1710 | if (g->sets[cs].multis != NULL) | |
1711 | mccs = 1; | |
1712 | ||
e9ce8d39 A |
1713 | /* find the longest OCHAR sequence in strip */ |
1714 | newlen = 0; | |
9385eb3d A |
1715 | offset = 0; |
1716 | g->moffset = 0; | |
e9ce8d39 A |
1717 | scan = g->strip + 1; |
1718 | do { | |
1719 | s = *scan++; | |
1720 | switch (OP(s)) { | |
1721 | case OCHAR: /* sequence member */ | |
1722 | if (newlen == 0) /* new sequence */ | |
1723 | newstart = scan - 1; | |
1724 | newlen++; | |
1725 | break; | |
1726 | case OPLUS_: /* things that don't break one */ | |
1727 | case OLPAREN: | |
1728 | case ORPAREN: | |
1729 | break; | |
1730 | case OQUEST_: /* things that must be skipped */ | |
1731 | case OCH_: | |
9385eb3d | 1732 | offset = altoffset(scan, offset, mccs); |
e9ce8d39 A |
1733 | scan--; |
1734 | do { | |
1735 | scan += OPND(s); | |
1736 | s = *scan; | |
1737 | /* assert() interferes w debug printouts */ | |
1738 | if (OP(s) != O_QUEST && OP(s) != O_CH && | |
1739 | OP(s) != OOR2) { | |
1740 | g->iflags |= BAD; | |
1741 | return; | |
1742 | } | |
1743 | } while (OP(s) != O_QUEST && OP(s) != O_CH); | |
9385eb3d A |
1744 | /* FALLTHROUGH */ |
1745 | case OBOW: /* things that break a sequence */ | |
1746 | case OEOW: | |
1747 | case OBOL: | |
1748 | case OEOL: | |
1749 | case O_QUEST: | |
1750 | case O_CH: | |
1751 | case OEND: | |
e9ce8d39 A |
1752 | if (newlen > g->mlen) { /* ends one */ |
1753 | start = newstart; | |
1754 | g->mlen = newlen; | |
9385eb3d A |
1755 | if (offset > -1) { |
1756 | g->moffset += offset; | |
1757 | offset = newlen; | |
1758 | } else | |
1759 | g->moffset = offset; | |
1760 | } else { | |
1761 | if (offset > -1) | |
1762 | offset += newlen; | |
e9ce8d39 A |
1763 | } |
1764 | newlen = 0; | |
1765 | break; | |
9385eb3d A |
1766 | case OANY: |
1767 | if (newlen > g->mlen) { /* ends one */ | |
1768 | start = newstart; | |
1769 | g->mlen = newlen; | |
1770 | if (offset > -1) { | |
1771 | g->moffset += offset; | |
1772 | offset = newlen; | |
1773 | } else | |
1774 | g->moffset = offset; | |
1775 | } else { | |
1776 | if (offset > -1) | |
1777 | offset += newlen; | |
1778 | } | |
1779 | if (offset > -1) | |
1780 | offset++; | |
1781 | newlen = 0; | |
1782 | break; | |
1783 | case OANYOF: /* may or may not invalidate offset */ | |
1784 | /* First, everything as OANY */ | |
1785 | if (newlen > g->mlen) { /* ends one */ | |
1786 | start = newstart; | |
1787 | g->mlen = newlen; | |
1788 | if (offset > -1) { | |
1789 | g->moffset += offset; | |
1790 | offset = newlen; | |
1791 | } else | |
1792 | g->moffset = offset; | |
1793 | } else { | |
1794 | if (offset > -1) | |
1795 | offset += newlen; | |
1796 | } | |
1797 | if (offset > -1) | |
1798 | offset++; | |
1799 | newlen = 0; | |
1800 | /* And, now, if we found out we can't deal with | |
1801 | * it, make offset = -1. | |
1802 | */ | |
1803 | if (mccs) | |
1804 | offset = -1; | |
1805 | break; | |
1806 | default: | |
1807 | /* Anything here makes it impossible or too hard | |
1808 | * to calculate the offset -- so we give up; | |
1809 | * save the last known good offset, in case the | |
1810 | * must sequence doesn't occur later. | |
1811 | */ | |
1812 | if (newlen > g->mlen) { /* ends one */ | |
1813 | start = newstart; | |
1814 | g->mlen = newlen; | |
1815 | if (offset > -1) | |
1816 | g->moffset += offset; | |
1817 | else | |
1818 | g->moffset = offset; | |
1819 | } | |
1820 | offset = -1; | |
1821 | newlen = 0; | |
1822 | break; | |
e9ce8d39 A |
1823 | } |
1824 | } while (OP(s) != OEND); | |
1825 | ||
9385eb3d A |
1826 | if (g->mlen == 0) { /* there isn't one */ |
1827 | g->moffset = -1; | |
e9ce8d39 | 1828 | return; |
9385eb3d | 1829 | } |
e9ce8d39 A |
1830 | |
1831 | /* turn it into a character string */ | |
1832 | g->must = malloc((size_t)g->mlen + 1); | |
1833 | if (g->must == NULL) { /* argh; just forget it */ | |
1834 | g->mlen = 0; | |
9385eb3d | 1835 | g->moffset = -1; |
e9ce8d39 A |
1836 | return; |
1837 | } | |
1838 | cp = g->must; | |
1839 | scan = start; | |
1840 | for (i = g->mlen; i > 0; i--) { | |
1841 | while (OP(s = *scan++) != OCHAR) | |
1842 | continue; | |
1843 | assert(cp < g->must + g->mlen); | |
1844 | *cp++ = (char)OPND(s); | |
1845 | } | |
1846 | assert(cp == g->must + g->mlen); | |
1847 | *cp++ = '\0'; /* just on general principles */ | |
1848 | } | |
1849 | ||
9385eb3d A |
1850 | /* |
1851 | - altoffset - choose biggest offset among multiple choices | |
1852 | == static int altoffset(sop *scan, int offset, int mccs); | |
1853 | * | |
1854 | * Compute, recursively if necessary, the largest offset among multiple | |
1855 | * re paths. | |
1856 | */ | |
1857 | static int | |
1858 | altoffset(scan, offset, mccs) | |
1859 | sop *scan; | |
1860 | int offset; | |
1861 | int mccs; | |
1862 | { | |
1863 | int largest; | |
1864 | int try; | |
1865 | sop s; | |
1866 | ||
1867 | /* If we gave up already on offsets, return */ | |
1868 | if (offset == -1) | |
1869 | return -1; | |
1870 | ||
1871 | largest = 0; | |
1872 | try = 0; | |
1873 | s = *scan++; | |
1874 | while (OP(s) != O_QUEST && OP(s) != O_CH) { | |
1875 | switch (OP(s)) { | |
1876 | case OOR1: | |
1877 | if (try > largest) | |
1878 | largest = try; | |
1879 | try = 0; | |
1880 | break; | |
1881 | case OQUEST_: | |
1882 | case OCH_: | |
1883 | try = altoffset(scan, try, mccs); | |
1884 | if (try == -1) | |
1885 | return -1; | |
1886 | scan--; | |
1887 | do { | |
1888 | scan += OPND(s); | |
1889 | s = *scan; | |
1890 | if (OP(s) != O_QUEST && OP(s) != O_CH && | |
1891 | OP(s) != OOR2) | |
1892 | return -1; | |
1893 | } while (OP(s) != O_QUEST && OP(s) != O_CH); | |
1894 | /* We must skip to the next position, or we'll | |
1895 | * leave altoffset() too early. | |
1896 | */ | |
1897 | scan++; | |
1898 | break; | |
1899 | case OANYOF: | |
1900 | if (mccs) | |
1901 | return -1; | |
1902 | case OCHAR: | |
1903 | case OANY: | |
1904 | try++; | |
1905 | case OBOW: | |
1906 | case OEOW: | |
1907 | case OLPAREN: | |
1908 | case ORPAREN: | |
1909 | case OOR2: | |
1910 | break; | |
1911 | default: | |
1912 | try = -1; | |
1913 | break; | |
1914 | } | |
1915 | if (try == -1) | |
1916 | return -1; | |
1917 | s = *scan++; | |
1918 | } | |
1919 | ||
1920 | if (try > largest) | |
1921 | largest = try; | |
1922 | ||
1923 | return largest+offset; | |
1924 | } | |
1925 | ||
1926 | /* | |
1927 | - computejumps - compute char jumps for BM scan | |
1928 | == static void computejumps(struct parse *p, struct re_guts *g); | |
1929 | * | |
1930 | * This algorithm assumes g->must exists and is has size greater than | |
1931 | * zero. It's based on the algorithm found on Computer Algorithms by | |
1932 | * Sara Baase. | |
1933 | * | |
1934 | * A char jump is the number of characters one needs to jump based on | |
1935 | * the value of the character from the text that was mismatched. | |
1936 | */ | |
1937 | static void | |
1938 | computejumps(p, g) | |
1939 | struct parse *p; | |
1940 | struct re_guts *g; | |
1941 | { | |
1942 | int ch; | |
1943 | int mindex; | |
1944 | ||
1945 | /* Avoid making errors worse */ | |
1946 | if (p->error != 0) | |
1947 | return; | |
1948 | ||
1949 | g->charjump = (int*) malloc((NC + 1) * sizeof(int)); | |
1950 | if (g->charjump == NULL) /* Not a fatal error */ | |
1951 | return; | |
1952 | /* Adjust for signed chars, if necessary */ | |
1953 | g->charjump = &g->charjump[-(CHAR_MIN)]; | |
1954 | ||
1955 | /* If the character does not exist in the pattern, the jump | |
1956 | * is equal to the number of characters in the pattern. | |
1957 | */ | |
1958 | for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) | |
1959 | g->charjump[ch] = g->mlen; | |
1960 | ||
1961 | /* If the character does exist, compute the jump that would | |
1962 | * take us to the last character in the pattern equal to it | |
1963 | * (notice that we match right to left, so that last character | |
1964 | * is the first one that would be matched). | |
1965 | */ | |
1966 | for (mindex = 0; mindex < g->mlen; mindex++) | |
1967 | g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; | |
1968 | } | |
1969 | ||
1970 | /* | |
1971 | - computematchjumps - compute match jumps for BM scan | |
1972 | == static void computematchjumps(struct parse *p, struct re_guts *g); | |
1973 | * | |
1974 | * This algorithm assumes g->must exists and is has size greater than | |
1975 | * zero. It's based on the algorithm found on Computer Algorithms by | |
1976 | * Sara Baase. | |
1977 | * | |
1978 | * A match jump is the number of characters one needs to advance based | |
1979 | * on the already-matched suffix. | |
1980 | * Notice that all values here are minus (g->mlen-1), because of the way | |
1981 | * the search algorithm works. | |
1982 | */ | |
1983 | static void | |
1984 | computematchjumps(p, g) | |
1985 | struct parse *p; | |
1986 | struct re_guts *g; | |
1987 | { | |
1988 | int mindex; /* General "must" iterator */ | |
1989 | int suffix; /* Keeps track of matching suffix */ | |
1990 | int ssuffix; /* Keeps track of suffixes' suffix */ | |
1991 | int* pmatches; /* pmatches[k] points to the next i | |
1992 | * such that i+1...mlen is a substring | |
1993 | * of k+1...k+mlen-i-1 | |
1994 | */ | |
1995 | ||
1996 | /* Avoid making errors worse */ | |
1997 | if (p->error != 0) | |
1998 | return; | |
1999 | ||
2000 | pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); | |
2001 | if (pmatches == NULL) { | |
2002 | g->matchjump = NULL; | |
2003 | return; | |
2004 | } | |
2005 | ||
2006 | g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); | |
2007 | if (g->matchjump == NULL) /* Not a fatal error */ | |
2008 | return; | |
2009 | ||
2010 | /* Set maximum possible jump for each character in the pattern */ | |
2011 | for (mindex = 0; mindex < g->mlen; mindex++) | |
2012 | g->matchjump[mindex] = 2*g->mlen - mindex - 1; | |
2013 | ||
2014 | /* Compute pmatches[] */ | |
2015 | for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; | |
2016 | mindex--, suffix--) { | |
2017 | pmatches[mindex] = suffix; | |
2018 | ||
2019 | /* If a mismatch is found, interrupting the substring, | |
2020 | * compute the matchjump for that position. If no | |
2021 | * mismatch is found, then a text substring mismatched | |
2022 | * against the suffix will also mismatch against the | |
2023 | * substring. | |
2024 | */ | |
2025 | while (suffix < g->mlen | |
2026 | && g->must[mindex] != g->must[suffix]) { | |
2027 | g->matchjump[suffix] = MIN(g->matchjump[suffix], | |
2028 | g->mlen - mindex - 1); | |
2029 | suffix = pmatches[suffix]; | |
2030 | } | |
2031 | } | |
2032 | ||
2033 | /* Compute the matchjump up to the last substring found to jump | |
2034 | * to the beginning of the largest must pattern prefix matching | |
2035 | * it's own suffix. | |
2036 | */ | |
2037 | for (mindex = 0; mindex <= suffix; mindex++) | |
2038 | g->matchjump[mindex] = MIN(g->matchjump[mindex], | |
2039 | g->mlen + suffix - mindex); | |
2040 | ||
2041 | ssuffix = pmatches[suffix]; | |
2042 | while (suffix < g->mlen) { | |
2043 | while (suffix <= ssuffix && suffix < g->mlen) { | |
2044 | g->matchjump[suffix] = MIN(g->matchjump[suffix], | |
2045 | g->mlen + ssuffix - suffix); | |
2046 | suffix++; | |
2047 | } | |
2048 | if (suffix < g->mlen) | |
2049 | ssuffix = pmatches[ssuffix]; | |
2050 | } | |
2051 | ||
2052 | free(pmatches); | |
2053 | } | |
2054 | ||
e9ce8d39 A |
2055 | /* |
2056 | - pluscount - count + nesting | |
9385eb3d | 2057 | == static sopno pluscount(struct parse *p, struct re_guts *g); |
e9ce8d39 A |
2058 | */ |
2059 | static sopno /* nesting depth */ | |
2060 | pluscount(p, g) | |
2061 | struct parse *p; | |
9385eb3d | 2062 | struct re_guts *g; |
e9ce8d39 | 2063 | { |
9385eb3d A |
2064 | sop *scan; |
2065 | sop s; | |
2066 | sopno plusnest = 0; | |
2067 | sopno maxnest = 0; | |
e9ce8d39 A |
2068 | |
2069 | if (p->error != 0) | |
2070 | return(0); /* there may not be an OEND */ | |
2071 | ||
2072 | scan = g->strip + 1; | |
2073 | do { | |
2074 | s = *scan++; | |
2075 | switch (OP(s)) { | |
2076 | case OPLUS_: | |
2077 | plusnest++; | |
2078 | break; | |
2079 | case O_PLUS: | |
2080 | if (plusnest > maxnest) | |
2081 | maxnest = plusnest; | |
2082 | plusnest--; | |
2083 | break; | |
2084 | } | |
2085 | } while (OP(s) != OEND); | |
2086 | if (plusnest != 0) | |
2087 | g->iflags |= BAD; | |
2088 | return(maxnest); | |
2089 | } |