]>
Commit | Line | Data |
---|---|---|
ad3c9f2a A |
1 | /* |
2 | tre-parse.c - Regexp parser | |
3 | ||
4 | This software is released under a BSD-style license. | |
5 | See the file LICENSE for details and copyright. | |
6 | ||
7 | */ | |
8 | ||
9 | /* | |
10 | This parser is just a simple recursive descent parser for POSIX.2 | |
11 | regexps. The parser supports both the obsolete default syntax and | |
12 | the "extended" syntax, and some nonstandard extensions. | |
13 | */ | |
14 | ||
15 | ||
16 | #ifdef HAVE_CONFIG_H | |
17 | #include <config.h> | |
18 | #endif /* HAVE_CONFIG_H */ | |
19 | #include <string.h> | |
20 | #include <assert.h> | |
21 | #include <limits.h> | |
22 | #include <stddef.h> | |
23 | ||
24 | #include "xmalloc.h" | |
25 | #include "tre-mem.h" | |
26 | #include "tre-ast.h" | |
27 | #include "tre-stack.h" | |
28 | #include "tre-parse.h" | |
29 | ||
30 | /* BSD compatibility: | |
31 | Before looking up a collating symbol, check if the name matches in | |
32 | the character names (cnames) array; if so, use the corresponding | |
33 | character. | |
34 | ||
35 | Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve | |
36 | the implementation choice that for ERE, a non-numeric character following | |
37 | a left brace that would normally be a bound, causes the left brace to be | |
38 | literal. */ | |
39 | #define BSD_COMPATIBILITY | |
40 | #ifdef BSD_COMPATIBILITY | |
41 | #include "cname.h" | |
42 | #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND | |
43 | #endif /* BSD_COMPATIBILITY */ | |
44 | ||
45 | /* Characters with special meanings in regexp syntax. */ | |
46 | #define CHAR_PIPE L'|' | |
47 | #define CHAR_LPAREN L'(' | |
48 | #define CHAR_RPAREN L')' | |
49 | #define CHAR_LBRACE L'{' | |
50 | #define CHAR_RBRACE L'}' | |
51 | #define CHAR_LBRACKET L'[' | |
52 | #define CHAR_RBRACKET L']' | |
53 | #define CHAR_MINUS L'-' | |
54 | #define CHAR_STAR L'*' | |
55 | #define CHAR_QUESTIONMARK L'?' | |
56 | #define CHAR_PLUS L'+' | |
57 | #define CHAR_PERIOD L'.' | |
58 | #define CHAR_COLON L':' | |
59 | #define CHAR_EQUAL L'=' | |
60 | #define CHAR_COMMA L',' | |
61 | #define CHAR_CARET L'^' | |
62 | #define CHAR_DOLLAR L'$' | |
63 | #define CHAR_BACKSLASH L'\\' | |
64 | #define CHAR_HASH L'#' | |
65 | #define CHAR_TILDE L'~' | |
66 | ||
67 | ||
68 | /* Some macros for expanding \w, \s, etc. */ | |
69 | static const struct tre_macro_struct { | |
70 | const char c; | |
71 | const char *expansion; | |
72 | } tre_macros[] = | |
73 | { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, | |
74 | {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, | |
75 | {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, | |
76 | {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, | |
77 | { 0, NULL } | |
78 | }; | |
79 | ||
80 | ||
81 | /* Expands a macro delimited by `regex' and `regex_end' to `buf', which | |
82 | must have at least `len' items. Sets buf[0] to zero if the there | |
83 | is no match in `tre_macros'. */ | |
84 | static void | |
85 | tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, | |
86 | tre_char_t *buf, size_t buf_len) | |
87 | { | |
88 | int i; | |
89 | ||
90 | buf[0] = 0; | |
91 | if (regex >= regex_end) | |
92 | return; | |
93 | ||
94 | for (i = 0; tre_macros[i].expansion; i++) | |
95 | { | |
96 | if (tre_macros[i].c == *regex) | |
97 | { | |
98 | unsigned int j; | |
99 | DPRINT(("Expanding macro '%c' => '%s'\n", | |
100 | tre_macros[i].c, tre_macros[i].expansion)); | |
101 | for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++) | |
102 | buf[j] = tre_macros[i].expansion[j]; | |
103 | buf[j] = 0; | |
104 | break; | |
105 | } | |
106 | } | |
107 | } | |
108 | ||
109 | static reg_errcode_t | |
110 | tre_new_item(tre_mem_t mem, int type, int val, int *max_i, | |
111 | tre_bracket_match_list_t **items) | |
112 | { | |
113 | reg_errcode_t status = REG_OK; | |
114 | tre_bracket_match_list_t *array = *items; | |
115 | int i = array->num_bracket_matches; | |
116 | /* Allocate more space if necessary. */ | |
117 | if (i >= *max_i) | |
118 | { | |
119 | tre_bracket_match_list_t *new_items; | |
120 | DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i)); | |
121 | /* If the array is already 1024 items large, give up -- there's | |
122 | probably an error in the regexp (e.g. not a '\0' terminated | |
123 | string and missing ']') */ | |
124 | if (*max_i >= 1024) | |
125 | return REG_ESPACE; | |
126 | *max_i *= 2; | |
127 | new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i)); | |
128 | if (new_items == NULL) | |
129 | return REG_ESPACE; | |
130 | *items = array = new_items; | |
131 | } | |
132 | array->bracket_matches[i].type = type; | |
133 | array->bracket_matches[i].value = val; | |
134 | array->num_bracket_matches++; | |
135 | return status; | |
136 | } | |
137 | ||
138 | #ifndef TRE_USE_SYSTEM_WCTYPE | |
139 | ||
140 | /* isalnum() and the rest may be macros, so wrap them to functions. */ | |
141 | int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); } | |
142 | int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); } | |
143 | ||
144 | #ifdef tre_isascii | |
145 | int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); } | |
146 | #else /* !tre_isascii */ | |
147 | int tre_isascii_func(tre_cint_t c) { return !(c >> 7); } | |
148 | #endif /* !tre_isascii */ | |
149 | ||
150 | #ifdef tre_isblank | |
151 | int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); } | |
152 | #else /* !tre_isblank */ | |
153 | int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); } | |
154 | #endif /* !tre_isblank */ | |
155 | ||
156 | int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); } | |
157 | int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); } | |
158 | int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); } | |
159 | int tre_islower_func(tre_cint_t c) { return tre_islower(c); } | |
160 | int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); } | |
161 | int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); } | |
162 | int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); } | |
163 | int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); } | |
164 | int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); } | |
165 | ||
166 | struct { | |
167 | char *name; | |
168 | int (*func)(tre_cint_t); | |
169 | } tre_ctype_map[] = { | |
170 | { "alnum", &tre_isalnum_func }, | |
171 | { "alpha", &tre_isalpha_func }, | |
172 | #ifdef tre_isascii | |
173 | { "ascii", &tre_isascii_func }, | |
174 | #endif /* tre_isascii */ | |
175 | #ifdef tre_isblank | |
176 | { "blank", &tre_isblank_func }, | |
177 | #endif /* tre_isblank */ | |
178 | { "cntrl", &tre_iscntrl_func }, | |
179 | { "digit", &tre_isdigit_func }, | |
180 | { "graph", &tre_isgraph_func }, | |
181 | { "lower", &tre_islower_func }, | |
182 | { "print", &tre_isprint_func }, | |
183 | { "punct", &tre_ispunct_func }, | |
184 | { "space", &tre_isspace_func }, | |
185 | { "upper", &tre_isupper_func }, | |
186 | { "xdigit", &tre_isxdigit_func }, | |
187 | { NULL, NULL} | |
188 | }; | |
189 | ||
190 | tre_ctype_t tre_ctype(const char *name) | |
191 | { | |
192 | int i; | |
193 | for (i = 0; tre_ctype_map[i].name != NULL; i++) | |
194 | { | |
195 | if (strcmp(name, tre_ctype_map[i].name) == 0) | |
196 | return tre_ctype_map[i].func; | |
197 | } | |
198 | return (tre_ctype_t)0; | |
199 | } | |
200 | #endif /* !TRE_USE_SYSTEM_WCTYPE */ | |
201 | ||
202 | #define REST(re) (int)(ctx->re_end - (re)), (re) | |
203 | ||
204 | #define START_COLLATING_SYMBOLS 16 | |
205 | #define MAX_COLLATING_SYMBOL_LEN 4 | |
206 | ||
207 | typedef struct { | |
208 | const tre_char_t *start; | |
209 | int len; | |
210 | } tre_collating_symbol; | |
211 | ||
212 | #include <xlocale.h> | |
213 | ||
214 | int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len); | |
215 | ||
216 | #ifdef BSD_COMPATIBILITY | |
217 | static wchar_t | |
218 | tre_search_cnames(const wchar_t *name, size_t len) | |
219 | { | |
220 | size_t low = 0; | |
221 | size_t high = NCNAMES - 1; | |
222 | size_t cur; | |
223 | int cmp; | |
224 | ||
225 | while(low <= high) | |
226 | { | |
227 | cur = (low + high) / 2; | |
228 | cmp = wcsncmp(name, cnames[cur].name, len); | |
229 | if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code; | |
230 | if (cmp > 0) low = cur + 1; | |
231 | else high = cur - 1; | |
232 | } | |
233 | return (wchar_t)-1; | |
234 | } | |
235 | #endif /* BSD_COMPATIBILITY */ | |
236 | ||
237 | /* Scan the contents of a bracket expression, and create a | |
238 | * tre_bracket_match_list_t encoding the bracket expression. If during | |
239 | * the scan, multi-character collating symbols are detected, switch | |
240 | * into a mode to collect those MCCSs into a tre_collating_symbol | |
241 | * list and pass them back. tre_parse_bracket will use that to | |
242 | * create a new string composed of a union of the bracket expression | |
243 | * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and | |
244 | * call tre_parse (recursive) to parse that new string (which will | |
245 | * call tre_parse_bracket and tre_parse_bracket_items again. */ | |
246 | static reg_errcode_t | |
247 | tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items, | |
248 | int *items_size, tre_collating_symbol **result) | |
249 | { | |
250 | const tre_char_t *re = ctx->re; | |
251 | const tre_char_t *re_end = ctx->re_end; | |
252 | tre_collating_symbol *col_syms = NULL; | |
253 | tre_collating_symbol *cp = NULL; | |
254 | int n_col_syms = 0; | |
255 | reg_errcode_t status; | |
256 | int max_i = *items_size; | |
257 | int other = 0; /* contains content other than multi-character collating | |
258 | * symbols */ | |
259 | int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */ | |
260 | tre_cint_t min, c; | |
261 | int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE); | |
262 | int collect_MCCS = 0; | |
263 | const tre_char_t *start; | |
264 | ||
265 | for ( ;re < re_end; re++) | |
266 | { | |
267 | switch (*re) | |
268 | { | |
269 | case CHAR_MINUS: | |
270 | /* A first hyphen */ | |
271 | if (re == ctx->re) | |
272 | { | |
273 | DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); | |
274 | min = CHAR_MINUS; | |
275 | other++; | |
276 | range = 0; | |
277 | break; | |
278 | } | |
279 | /* The hyphen is the end range */ | |
280 | if (range > 0) | |
281 | { | |
282 | DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); | |
283 | c = CHAR_MINUS; | |
284 | goto process_end_range; | |
285 | } | |
286 | if (re + 1 >= re_end) | |
287 | { | |
288 | status = REG_EBRACK; | |
289 | goto error; | |
290 | } | |
291 | /* The hyphen is at the end */ | |
292 | if (re[1] == CHAR_RBRACKET) | |
293 | { | |
294 | DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); | |
295 | c = CHAR_MINUS; | |
296 | goto process_begin_range; | |
297 | } | |
298 | /* Two ranges are not allowed to share an endpoint, or begin | |
299 | * range is illegal. */ | |
300 | if (range < 0) | |
301 | { | |
302 | status = REG_ERANGE; | |
303 | goto error; | |
304 | } | |
305 | range = 1; /* Expect end range */ | |
306 | DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re))); | |
307 | break; | |
308 | ||
309 | case CHAR_LBRACKET: | |
310 | if (re + 1 >= re_end) | |
311 | { | |
312 | status = REG_EBRACK; | |
313 | goto error; | |
314 | } | |
315 | switch (re[1]) | |
316 | { | |
317 | case CHAR_PERIOD: | |
318 | { | |
319 | re += 2; | |
320 | start = re; | |
321 | for (;; re++) | |
322 | { | |
323 | if (re >= re_end) | |
324 | { | |
325 | status = REG_ECOLLATE; | |
326 | goto error; | |
327 | } | |
328 | if (*re == CHAR_PERIOD) | |
329 | { | |
330 | if (re + 1 >= re_end) | |
331 | { | |
332 | status = REG_ECOLLATE; | |
333 | goto error; | |
334 | } | |
335 | /* Found end */ | |
336 | if (re[1] == CHAR_RBRACKET) | |
337 | { | |
338 | DPRINT(("tre_parse_bracket: collating " | |
339 | "symbol: '%.*" STRF "'\n", | |
340 | REST(start - 2))); | |
341 | /* Empty name */ | |
342 | if (re == start) | |
343 | { | |
344 | status = REG_ECOLLATE; | |
345 | goto error; | |
346 | } | |
347 | #ifdef BSD_COMPATIBILITY | |
348 | /* Check if the name is in cnames; if so, use | |
349 | the corresponding code */ | |
350 | c = tre_search_cnames(start, re - start); | |
351 | if (c != (wchar_t)-1) | |
352 | { | |
353 | re++; | |
354 | goto process_single_character; | |
355 | } | |
356 | #endif /* BSD_COMPATIBILITY */ | |
357 | /* Verify this is a known sequence */ | |
358 | if (__collate_equiv_value(ctx->loc, start, | |
359 | re - start) <= 0) | |
360 | { | |
361 | status = REG_ECOLLATE; | |
362 | goto error; | |
363 | } | |
364 | /* Process single character collating symbols */ | |
365 | if (re - start == 1) | |
366 | { | |
367 | c = *start; | |
368 | re++; | |
369 | goto process_single_character; | |
370 | } | |
371 | /* Inverted MCCSs are undefined */ | |
372 | if (invert) | |
373 | { | |
374 | status = REG_ECOLLATE; | |
375 | goto error; | |
376 | } | |
377 | /* Can't have MCCSs as an endpoint to a range */ | |
378 | if (range > 0) | |
379 | { | |
380 | status = REG_ERANGE; | |
381 | goto error; | |
382 | } | |
383 | range = -1; | |
384 | /* Switch into MCCS collection mode (if not | |
385 | * already there */ | |
386 | #if TRE_DEBUG | |
387 | if (!collect_MCCS) | |
388 | { | |
389 | collect_MCCS = 1; | |
390 | DPRINT(("tre_parse_bracket: Detected MCCS\n")); | |
391 | } | |
392 | #else /* !TRE_DEBUG */ | |
393 | collect_MCCS = 1; | |
394 | #endif /* !TRE_DEBUG */ | |
395 | /* Allocate a memory block the first time */ | |
396 | if (!cp) | |
397 | { | |
398 | if ((col_syms = xmalloc(sizeof(*col_syms) * | |
399 | (START_COLLATING_SYMBOLS + 2))) | |
400 | == NULL) | |
401 | return REG_ESPACE; | |
402 | cp = col_syms + 1; | |
403 | n_col_syms = START_COLLATING_SYMBOLS; | |
404 | } | |
405 | /* Enlarge the memory block is more is needed */ | |
406 | if ((cp - col_syms) - 1 >= n_col_syms) | |
407 | { | |
408 | int i = n_col_syms; | |
409 | tre_collating_symbol *tmp = | |
410 | xrealloc(col_syms, sizeof(*col_syms) * | |
411 | ((n_col_syms *= 2) + 2)); | |
412 | if (tmp == NULL) | |
413 | { | |
414 | xfree(col_syms); | |
415 | return REG_ESPACE; | |
416 | } | |
417 | DPRINT(("tre_list_collating_symbols: " | |
418 | "Enlarging col_syms to %d\n", | |
419 | n_col_syms)); | |
420 | col_syms = tmp; | |
421 | cp = col_syms + i + 1; | |
422 | } | |
423 | cp->start = start; | |
424 | cp->len = re - start; | |
425 | cp++; | |
426 | re++; | |
427 | break; | |
428 | } | |
429 | } | |
430 | } | |
431 | break; | |
432 | } | |
433 | ||
434 | case CHAR_EQUAL: | |
435 | case CHAR_COLON: | |
436 | { | |
437 | /* Process equivalence and character classes */ | |
438 | tre_char_t kind = re[1]; | |
439 | ||
440 | /* Can't have a class as an endpoint to a range */ | |
441 | if (range > 0) | |
442 | { | |
443 | status = REG_ERANGE; | |
444 | goto error; | |
445 | } | |
446 | if (!collect_MCCS && range == 0) | |
447 | { | |
448 | status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, | |
449 | min, &max_i, items); | |
450 | if (status != REG_OK) | |
451 | goto error; | |
452 | } | |
453 | range = -1; | |
454 | re += 2; | |
455 | start = re; | |
456 | for (;; re++) | |
457 | { | |
458 | if (re >= re_end) | |
459 | { | |
460 | status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE; | |
461 | goto error; | |
462 | } | |
463 | if (*re == kind) | |
464 | { | |
465 | if (re + 1 >= re_end) | |
466 | { | |
467 | status = kind == CHAR_EQUAL ? REG_ECOLLATE : | |
468 | REG_ECTYPE; | |
469 | goto error; | |
470 | } | |
471 | /* Found end */ | |
472 | if (re[1] == CHAR_RBRACKET) | |
473 | { | |
474 | if (re == start) | |
475 | { | |
476 | /* Empty class name */ | |
477 | status = kind == CHAR_EQUAL ? REG_ECOLLATE : | |
478 | REG_ECTYPE; | |
479 | goto error; | |
480 | } | |
481 | /* Process equivalence class */ | |
482 | if (kind == CHAR_EQUAL) | |
483 | { | |
484 | int equiv; | |
485 | ||
486 | DPRINT(("tre_parse_bracket: equivalence: '%.*" | |
487 | STRF "'\n", REST(start - 2))); | |
488 | ||
489 | /* While we find the collation value even for | |
490 | multi-character collating elements , we | |
491 | don't (yet) match any collation values | |
492 | against multi-character sequences. We'd have | |
493 | to enumerate those multi-character sequences | |
494 | and like multi-character collating symbols, | |
495 | create a union of those sequences with the | |
496 | rest of the bracket expression. While | |
497 | doable, a bracket expression matching | |
498 | multiple characters, that doesn't explicitly | |
499 | contain multi-character sequences, might | |
500 | be unexpected, so we punt for now. */ | |
501 | if ((equiv = __collate_equiv_value(ctx->loc, | |
502 | start, re - start)) <= 0) | |
503 | { | |
504 | /* The standard says that if no collating | |
505 | element if found, we use the collating | |
506 | symbol itself. But __collate_equiv_value | |
507 | doesn't make a distinction between | |
508 | an element that is in a equvalence | |
509 | class with others, or is the only member, | |
510 | so we already know there is no collating | |
511 | symbol. (Note that in the case of a | |
512 | collating element whose collation value | |
513 | is unique, matching against the | |
514 | collating element itself, or against | |
515 | its collation value, is equivalent.) */ | |
516 | #ifdef BSD_COMPATIBILITY | |
517 | /* Check if the name is in cnames; if so, | |
518 | use the corresponding code */ | |
519 | c = tre_search_cnames(start, re - start); | |
520 | if (c != (wchar_t)-1) | |
521 | { | |
522 | re++; | |
523 | goto process_single_character; | |
524 | } | |
525 | #endif /* BSD_COMPATIBILITY */ | |
526 | status = REG_ECOLLATE; | |
527 | goto error; | |
528 | } | |
529 | if (!collect_MCCS) | |
530 | { | |
531 | status = tre_new_item(ctx->mem, | |
532 | TRE_BRACKET_MATCH_TYPE_EQUIVALENCE, | |
533 | equiv, &max_i, items); | |
534 | if (status != REG_OK) | |
535 | goto error; | |
536 | } | |
537 | } | |
538 | else | |
539 | { | |
540 | /* Process character class */ | |
541 | DPRINT(("tre_parse_bracket: class: '%.*" STRF | |
542 | "'\n", REST(start - 2))); | |
543 | if (!collect_MCCS) | |
544 | { | |
545 | char tmp_str[64]; | |
546 | tre_ctype_t class; | |
547 | int len = MIN(re - start, 63); | |
548 | #ifdef TRE_WCHAR | |
549 | { | |
550 | tre_char_t tmp_wcs[64]; | |
551 | wcsncpy(tmp_wcs, start, (size_t)len); | |
552 | tmp_wcs[len] = L'\0'; | |
553 | #if defined HAVE_WCSRTOMBS | |
554 | { | |
555 | mbstate_t state; | |
556 | const tre_char_t *src = tmp_wcs; | |
557 | memset(&state, '\0', sizeof(state)); | |
558 | len = wcsrtombs_l(tmp_str, &src, | |
559 | sizeof(tmp_str), &state, | |
560 | ctx->loc); | |
561 | } | |
562 | #elif defined HAVE_WCSTOMBS | |
563 | len = wcstombs(tmp_str, tmp_wcs, 63); | |
564 | #endif /* defined HAVE_WCSTOMBS */ | |
565 | } | |
566 | #else /* !TRE_WCHAR */ | |
567 | strncpy(tmp_str, (const char*)start, len); | |
568 | #endif /* !TRE_WCHAR */ | |
569 | tmp_str[len] = '\0'; | |
570 | DPRINT((" class name: %s\n", tmp_str)); | |
571 | class = tre_ctype_l(tmp_str, ctx->loc); | |
572 | if (!class) | |
573 | { | |
574 | status = REG_ECTYPE; | |
575 | goto error; | |
576 | } | |
577 | status = tre_new_item(ctx->mem, | |
578 | TRE_BRACKET_MATCH_TYPE_CLASS, | |
579 | class, &max_i, items); | |
580 | if (status != REG_OK) | |
581 | goto error; | |
582 | } | |
583 | } | |
584 | re++; | |
585 | break; | |
586 | } | |
587 | } | |
588 | } | |
589 | other++; | |
590 | break; | |
591 | } | |
592 | ||
593 | default: | |
594 | DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); | |
595 | c = CHAR_LBRACKET; | |
596 | goto process_single_character; | |
597 | break; | |
598 | } | |
599 | break; | |
600 | ||
601 | case CHAR_RBRACKET: | |
602 | /* A first right bracket */ | |
603 | if (re == ctx->re) | |
604 | { | |
605 | DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); | |
606 | min = CHAR_RBRACKET; | |
607 | range = 0; | |
608 | other++; | |
609 | break; | |
610 | } | |
611 | /* Done */ | |
612 | if (collect_MCCS) | |
613 | { | |
614 | DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", | |
615 | REST(re))); | |
616 | if (col_syms) | |
617 | { | |
618 | /* Mark the character following the right bracket. Set len | |
619 | * to whether there are other things besides the | |
620 | * multi-character collating symbols */ | |
621 | col_syms->start = re + 1; | |
622 | col_syms->len = other; | |
623 | /* Mark the end of the list */ | |
624 | cp->start = NULL; | |
625 | } | |
626 | *result = col_syms; | |
627 | return REG_OK; | |
628 | } | |
629 | /* range > 0 is not possible, since we did a lookahead after the | |
630 | * hyphen */ | |
631 | if (range == 0) | |
632 | { | |
633 | status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR, | |
634 | min, &max_i, items); | |
635 | if (status != REG_OK) | |
636 | goto error; | |
637 | } | |
638 | DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re))); | |
639 | *items_size = max_i; | |
640 | ctx->re = re + 1; | |
641 | return REG_OK; | |
642 | ||
643 | default: | |
644 | DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); | |
645 | c = *re; | |
646 | process_single_character: | |
647 | /* Process single character */ | |
648 | if (range > 0) | |
649 | { | |
650 | int mine, maxe; | |
651 | ||
652 | process_end_range: | |
653 | /* Get collation equivalence values */ | |
654 | mine = __collate_equiv_value(ctx->loc, &min, 1); | |
655 | maxe = __collate_equiv_value(ctx->loc, &c, 1); | |
656 | if (maxe < mine) | |
657 | { | |
658 | status = REG_ERANGE; | |
659 | goto error; | |
660 | } | |
661 | if (!collect_MCCS) | |
662 | { | |
663 | status = tre_new_item(ctx->mem, | |
664 | TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN, | |
665 | mine, &max_i, items); | |
666 | if (status != REG_OK) | |
667 | goto error; | |
668 | status = tre_new_item(ctx->mem, | |
669 | TRE_BRACKET_MATCH_TYPE_RANGE_END, | |
670 | maxe, &max_i, items); | |
671 | if (status != REG_OK) | |
672 | goto error; | |
673 | } | |
674 | range = -1; | |
675 | } | |
676 | else | |
677 | { | |
678 | process_begin_range: | |
679 | if (!collect_MCCS) | |
680 | { | |
681 | if (range == 0) | |
682 | { | |
683 | status = tre_new_item(ctx->mem, | |
684 | TRE_BRACKET_MATCH_TYPE_CHAR, | |
685 | min, &max_i, items); | |
686 | if (status != REG_OK) | |
687 | goto error; | |
688 | } | |
689 | min = c; | |
690 | } | |
691 | range = 0; | |
692 | } | |
693 | other++; | |
694 | break; | |
695 | } | |
696 | } | |
697 | status = REG_EBRACK; | |
698 | error: | |
699 | DPRINT(("tre_parse_bracket: error: '%.*" STRF "', status=%d\n", | |
700 | REST(re), status)); | |
701 | if (col_syms) | |
702 | xfree(col_syms); | |
703 | return status; | |
704 | } | |
705 | ||
706 | #ifdef TRE_DEBUG | |
707 | static const char *bracket_match_type_str[] = { | |
708 | "unused", | |
709 | "char", | |
710 | "range begin", | |
711 | "range end", | |
712 | "class", | |
713 | "equivalence value", | |
714 | }; | |
715 | #endif /* TRE_DEBUG */ | |
716 | ||
717 | static reg_errcode_t | |
718 | tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) | |
719 | { | |
720 | tre_ast_node_t *node; | |
721 | reg_errcode_t status = REG_OK; | |
722 | tre_bracket_match_list_t *items; | |
723 | int max_i = 32; | |
724 | tre_collating_symbol *col_syms = NULL; | |
725 | ||
726 | /* Handle special cases [[:<:]] and [[:>:]] */ | |
727 | if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET | |
728 | && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>') | |
729 | && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET | |
730 | && ctx->re[5] == CHAR_RBRACKET) | |
731 | { | |
732 | *result = tre_ast_new_literal(ctx->mem, ASSERTION, | |
733 | (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW, | |
734 | -1); | |
735 | DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ? | |
736 | "[[:<:]]" : "[[:>:]]")); | |
737 | ctx->re += 6; | |
738 | return *result ? REG_OK : REG_ESPACE; | |
739 | } | |
740 | ||
741 | /* Start off with an array of `max_i' elements. */ | |
742 | items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i)); | |
743 | if (items == NULL) | |
744 | return REG_ESPACE; | |
745 | ||
746 | if (*ctx->re == CHAR_CARET) | |
747 | { | |
748 | DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re))); | |
749 | items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE; | |
750 | ctx->re++; | |
751 | } | |
752 | ||
753 | status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms); | |
754 | ||
755 | if (status != REG_OK) | |
756 | goto parse_bracket_done; | |
757 | ||
758 | /* If there are collating symbols, split off the multi-character ones | |
759 | * into a union of the bracket expression (without the collating symbols) | |
760 | * and the multiple-character sequences. We create an equivalent input | |
761 | * string and run tre_parse() recursively */ | |
762 | if (col_syms) | |
763 | { | |
764 | tre_char_t *str, *sp; | |
765 | tre_collating_symbol *cp; | |
766 | tre_parse_ctx_t subctx; | |
767 | ||
768 | /* Allocate a new string. We start with the size of the original | |
769 | * bracket expression (minus 1) and add 2 (for a leading "[" and | |
770 | * a trailing nil; don't need a "^", since it is illegal to have | |
771 | * inverted MCCSs). Since a multi-character collating symbols | |
772 | * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't | |
773 | * need to worry about the new string getting too long. */ | |
774 | xfree(items); | |
775 | str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2)); | |
776 | if (str == NULL) | |
777 | { | |
778 | xfree(col_syms); | |
779 | return REG_ESPACE; | |
780 | } | |
781 | sp = str; | |
782 | if (col_syms->len > 0) | |
783 | { | |
784 | /* There are other items in the bracket expression besides the | |
785 | * multi-character collating symbols, so create a new bracket | |
786 | * expression with only those other itmes. */ | |
787 | const tre_char_t *re; | |
788 | ptrdiff_t i; | |
789 | ||
790 | *sp++ = '['; | |
791 | re = ctx->re; | |
792 | for (cp = col_syms + 1; cp->start; cp++) | |
793 | { | |
794 | /* The "- 2" is to account for the "[." */ | |
795 | if ((i = ((cp->start - re) - 2)) > 0) | |
796 | { | |
797 | memcpy(sp, re, sizeof(*sp) * i); | |
798 | sp += i; | |
799 | } | |
800 | /* The "+ 2" is to account for the ".]" */ | |
801 | re = cp->start + cp->len + 2; | |
802 | } | |
803 | i = col_syms->start - re; /* Includes the trailing right bracket */ | |
804 | memcpy(sp, re, sizeof(*sp) * i); | |
805 | sp += i; | |
806 | *sp++ = '|'; | |
807 | } | |
808 | for (cp = col_syms + 1; cp->start; cp++) | |
809 | { | |
810 | memcpy(sp, cp->start, sizeof(*sp) * cp->len); | |
811 | sp += cp->len; | |
812 | if (cp[1].start) | |
813 | *sp++ = '|'; | |
814 | } | |
815 | *sp = 0; | |
816 | DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n", | |
817 | str)); | |
818 | ||
819 | memcpy(&subctx, ctx, sizeof(subctx)); | |
820 | subctx.re = str; | |
821 | subctx.len = sp - str; | |
822 | subctx.nofirstsub = 1; | |
823 | subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */ | |
824 | status = tre_parse(&subctx); | |
825 | xfree(str); | |
826 | if (status != REG_OK) | |
827 | { | |
828 | xfree(col_syms); | |
829 | return status; | |
830 | } | |
831 | ctx->re = col_syms->start; | |
832 | ctx->position = subctx.position; | |
833 | xfree(col_syms); | |
834 | *result = subctx.result; | |
835 | DPRINT(("tre_parse_bracket: Returning to original string\n")); | |
836 | return REG_OK; | |
837 | } | |
838 | ||
839 | DPRINT(("tre_parse_bracket: creating bracket expression literal\n")); | |
840 | node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position); | |
841 | if (node == NULL) | |
842 | { | |
843 | status = REG_ESPACE; | |
844 | goto parse_bracket_done; | |
845 | } | |
846 | else | |
847 | { | |
848 | tre_literal_t *l = node->obj; | |
849 | l->u.bracket_match_list = tre_mem_alloc(ctx->mem, | |
850 | SIZEOF_BRACKET_MATCH_LIST(items)); | |
851 | if (l->u.bracket_match_list == NULL) | |
852 | { | |
853 | status = REG_ESPACE; | |
854 | goto parse_bracket_done; | |
855 | } | |
856 | memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items)); | |
857 | } | |
858 | ||
859 | #ifdef TRE_DEBUG | |
860 | { | |
861 | int i; | |
862 | tre_bracket_match_t *b; | |
863 | DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n", | |
864 | items->num_bracket_matches, items->flags)); | |
865 | for (i = 0, b = items->bracket_matches; | |
866 | i < items->num_bracket_matches; i++, b++) | |
867 | { | |
868 | DPRINT((" %d: %s %d\n", i, bracket_match_type_str[b->type], | |
869 | b->value)); | |
870 | } | |
871 | } | |
872 | #endif /* TRE_DEBUG */ | |
873 | ||
874 | parse_bracket_done: | |
875 | xfree(items); | |
876 | ctx->position++; | |
877 | *result = node; | |
878 | return status; | |
879 | } | |
880 | ||
881 | ||
882 | /* Parses a positive decimal integer. Returns -1 if the string does not | |
883 | contain a valid number. */ | |
884 | static int | |
885 | tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) | |
886 | { | |
887 | int num = -1; | |
888 | const tre_char_t *r = *regex; | |
889 | while (r < regex_end && *r >= L'0' && *r <= L'9') | |
890 | { | |
891 | if (num < 0) | |
892 | num = 0; | |
893 | num = num * 10 + *r - L'0'; | |
894 | r++; | |
895 | } | |
896 | *regex = r; | |
897 | return num; | |
898 | } | |
899 | ||
900 | ||
901 | static reg_errcode_t | |
902 | tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) | |
903 | { | |
904 | int min, max; | |
905 | #ifdef TRE_APPROX | |
906 | int i; | |
907 | int cost_ins, cost_del, cost_subst, cost_max; | |
908 | int limit_ins, limit_del, limit_subst, limit_err; | |
909 | const tre_char_t *start; | |
910 | #endif /* TRE_APPROX */ | |
911 | const tre_char_t *r = ctx->re; | |
912 | int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; | |
913 | #ifdef TRE_APPROX | |
914 | int approx = 0; | |
915 | int costs_set = 0; | |
916 | int counts_set = 0; | |
917 | ||
918 | cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET; | |
919 | limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET; | |
920 | #endif /* TRE_APPROX */ | |
921 | ||
922 | /* Parse number (minimum repetition count). */ | |
923 | min = -1; | |
924 | if (r >= ctx->re_end) | |
925 | #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND | |
926 | return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE; | |
927 | #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ | |
928 | return REG_EBRACE; | |
929 | #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ | |
930 | if (*r >= L'0' && *r <= L'9') { | |
931 | DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r))); | |
932 | min = tre_parse_int(&r, ctx->re_end); | |
933 | } | |
934 | #ifndef TRE_APPROX | |
935 | else | |
936 | #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND | |
937 | /* For ERE, return REG_NOMATCH to signal that the lbrace should | |
938 | be treated as a literal */ | |
939 | return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR; | |
940 | #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ | |
941 | return REG_BADBR; | |
942 | #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ | |
943 | #endif /* !TRE_APPROX */ | |
944 | ||
945 | /* Parse comma and second number (maximum repetition count). */ | |
946 | max = min; | |
947 | if (r < ctx->re_end && *r == CHAR_COMMA) | |
948 | { | |
949 | r++; | |
950 | DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r))); | |
951 | max = tre_parse_int(&r, ctx->re_end); | |
952 | } | |
953 | ||
954 | /* Check that the repeat counts are sane. */ | |
955 | if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX) | |
956 | return REG_BADBR; | |
957 | ||
958 | ||
959 | #ifdef TRE_APPROX | |
960 | /* | |
961 | '{' | |
962 | optionally followed immediately by a number == minimum repcount | |
963 | optionally followed by , then a number == maximum repcount | |
964 | + then a number == maximum insertion count | |
965 | - then a number == maximum deletion count | |
966 | # then a number == maximum substitution count | |
967 | ~ then a number == maximum number of errors | |
968 | Any of +, -, # or ~ without followed by a number means that | |
969 | the maximum count/number of errors is infinite. | |
970 | ||
971 | An equation of the form | |
972 | Xi + Yd + Zs < C | |
973 | can be specified to set costs and the cost limit to a value | |
974 | different from the default value: | |
975 | - X is the cost of an insertion | |
976 | - Y is the cost of a deletion | |
977 | - Z is the cost of a substitution | |
978 | - C is the maximum cost | |
979 | ||
980 | If no count limit or cost is set for an operation, the operation | |
981 | is not allowed at all. | |
982 | */ | |
983 | ||
984 | ||
985 | do { | |
986 | int done; | |
987 | start = r; | |
988 | ||
989 | /* Parse count limit settings */ | |
990 | done = 0; | |
991 | if (!counts_set) | |
992 | while (r + 1 < ctx->re_end && !done) | |
993 | { | |
994 | switch (*r) | |
995 | { | |
996 | case CHAR_PLUS: /* Insert limit */ | |
997 | DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r))); | |
998 | r++; | |
999 | limit_ins = tre_parse_int(&r, ctx->re_end); | |
1000 | if (limit_ins < 0) | |
1001 | limit_ins = INT_MAX; | |
1002 | counts_set = 1; | |
1003 | break; | |
1004 | case CHAR_MINUS: /* Delete limit */ | |
1005 | DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r))); | |
1006 | r++; | |
1007 | limit_del = tre_parse_int(&r, ctx->re_end); | |
1008 | if (limit_del < 0) | |
1009 | limit_del = INT_MAX; | |
1010 | counts_set = 1; | |
1011 | break; | |
1012 | case CHAR_HASH: /* Substitute limit */ | |
1013 | DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r))); | |
1014 | r++; | |
1015 | limit_subst = tre_parse_int(&r, ctx->re_end); | |
1016 | if (limit_subst < 0) | |
1017 | limit_subst = INT_MAX; | |
1018 | counts_set = 1; | |
1019 | break; | |
1020 | case CHAR_TILDE: /* Maximum number of changes */ | |
1021 | DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r))); | |
1022 | r++; | |
1023 | limit_err = tre_parse_int(&r, ctx->re_end); | |
1024 | if (limit_err < 0) | |
1025 | limit_err = INT_MAX; | |
1026 | approx = 1; | |
1027 | break; | |
1028 | case CHAR_COMMA: | |
1029 | r++; | |
1030 | break; | |
1031 | case L' ': | |
1032 | r++; | |
1033 | break; | |
1034 | case L'}': | |
1035 | done = 1; | |
1036 | break; | |
1037 | default: | |
1038 | done = 1; | |
1039 | break; | |
1040 | } | |
1041 | } | |
1042 | ||
1043 | /* Parse cost restriction equation. */ | |
1044 | done = 0; | |
1045 | if (!costs_set) | |
1046 | while (r + 1 < ctx->re_end && !done) | |
1047 | { | |
1048 | switch (*r) | |
1049 | { | |
1050 | case CHAR_PLUS: | |
1051 | case L' ': | |
1052 | r++; | |
1053 | break; | |
1054 | case L'<': | |
1055 | DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r))); | |
1056 | r++; | |
1057 | while (*r == L' ') | |
1058 | r++; | |
1059 | cost_max = tre_parse_int(&r, ctx->re_end); | |
1060 | if (cost_max < 0) | |
1061 | cost_max = INT_MAX; | |
1062 | else | |
1063 | cost_max--; | |
1064 | approx = 1; | |
1065 | break; | |
1066 | case CHAR_COMMA: | |
1067 | r++; | |
1068 | done = 1; | |
1069 | break; | |
1070 | default: | |
1071 | if (*r >= L'0' && *r <= L'9') | |
1072 | { | |
1073 | #ifdef TRE_DEBUG | |
1074 | const tre_char_t *sr = r; | |
1075 | #endif /* TRE_DEBUG */ | |
1076 | int cost = tre_parse_int(&r, ctx->re_end); | |
1077 | /* XXX - make sure r is not past end. */ | |
1078 | switch (*r) | |
1079 | { | |
1080 | case L'i': /* Insert cost */ | |
1081 | DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n", | |
1082 | REST(sr))); | |
1083 | r++; | |
1084 | cost_ins = cost; | |
1085 | costs_set = 1; | |
1086 | break; | |
1087 | case L'd': /* Delete cost */ | |
1088 | DPRINT(("tre_parse: del cost: '%.*" STRF "'\n", | |
1089 | REST(sr))); | |
1090 | r++; | |
1091 | cost_del = cost; | |
1092 | costs_set = 1; | |
1093 | break; | |
1094 | case L's': /* Substitute cost */ | |
1095 | DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n", | |
1096 | REST(sr))); | |
1097 | r++; | |
1098 | cost_subst = cost; | |
1099 | costs_set = 1; | |
1100 | break; | |
1101 | default: | |
1102 | return REG_BADBR; | |
1103 | } | |
1104 | } | |
1105 | else | |
1106 | { | |
1107 | done = 1; | |
1108 | break; | |
1109 | } | |
1110 | } | |
1111 | } | |
1112 | } while (start != r); | |
1113 | #endif /* TRE_APPROX */ | |
1114 | ||
1115 | /*{*//* Missing }. */ | |
1116 | if (r >= ctx->re_end) | |
1117 | return REG_EBRACE; | |
1118 | ||
1119 | /* Empty contents of {}. */ | |
1120 | if (r == ctx->re) | |
1121 | return REG_BADBR; | |
1122 | ||
1123 | /* Parse the ending '}' or '\}'.*/ | |
1124 | if (ctx->cflags & REG_EXTENDED) | |
1125 | { | |
1126 | if (r >= ctx->re_end || *r != CHAR_RBRACE) | |
1127 | return REG_BADBR; | |
1128 | r++; | |
1129 | /* Parse trailing '?' marking minimal repetition. */ | |
1130 | if (r < ctx->re_end) | |
1131 | { | |
1132 | if (*r == CHAR_QUESTIONMARK) | |
1133 | { | |
1134 | /* Process the question mark only in enhanced mode. | |
1135 | Otherwise, the question mark is an error in ERE | |
1136 | or a literal in BRE */ | |
1137 | if (ctx->cflags & REG_ENHANCED) | |
1138 | { | |
1139 | minimal = !(ctx->cflags & REG_UNGREEDY); | |
1140 | r++; | |
1141 | } | |
1142 | else return REG_BADRPT; | |
1143 | } | |
1144 | else if (*r == CHAR_STAR || *r == CHAR_PLUS) | |
1145 | { | |
1146 | /* These are reserved for future extensions. */ | |
1147 | return REG_BADRPT; | |
1148 | } | |
1149 | } | |
1150 | } | |
1151 | else | |
1152 | { | |
1153 | if (r + 1 >= ctx->re_end | |
1154 | || *r != CHAR_BACKSLASH | |
1155 | || *(r + 1) != CHAR_RBRACE) | |
1156 | return REG_BADBR; | |
1157 | r += 2; | |
1158 | if (r < ctx->re_end && *r == CHAR_STAR) | |
1159 | { | |
1160 | /* This is reserved for future extensions. */ | |
1161 | return REG_BADRPT; | |
1162 | } | |
1163 | } | |
1164 | ||
1165 | if (minimal) | |
1166 | ctx->num_reorder_tags++; | |
1167 | ||
1168 | if (!result) goto parse_bound_exit; | |
1169 | /* Create the AST node(s). */ | |
1170 | /* Originally, if min == 0 && max == 0, we immediately replace the whole | |
1171 | iteration with EMPTY. This unfortunately drops any submatches, and | |
1172 | messes up setting the pmatch values (we can get tags of -1, and | |
1173 | tag values in the billions). So we leave it and process this case as | |
1174 | usual, and wait until tre_expand_ast() to replace with EMPTY */ | |
1175 | #ifdef TRE_APPROX | |
1176 | if (min < 0 && max < 0) | |
1177 | /* Only approximate parameters set, no repetitions. */ | |
1178 | min = max = 1; | |
1179 | #endif /* TRE_APPROX */ | |
1180 | ||
1181 | *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); | |
1182 | if (!*result) | |
1183 | return REG_ESPACE; | |
1184 | ||
1185 | #ifdef TRE_APPROX | |
1186 | /* If approximate matching parameters are set, add them to the | |
1187 | iteration node. */ | |
1188 | if (approx || costs_set || counts_set) | |
1189 | { | |
1190 | int *params; | |
1191 | tre_iteration_t *iter = (*result)->obj; | |
1192 | ||
1193 | if (costs_set || counts_set) | |
1194 | { | |
1195 | if (limit_ins == TRE_PARAM_UNSET) | |
1196 | { | |
1197 | if (cost_ins == TRE_PARAM_UNSET) | |
1198 | limit_ins = 0; | |
1199 | else | |
1200 | limit_ins = INT_MAX; | |
1201 | } | |
1202 | ||
1203 | if (limit_del == TRE_PARAM_UNSET) | |
1204 | { | |
1205 | if (cost_del == TRE_PARAM_UNSET) | |
1206 | limit_del = 0; | |
1207 | else | |
1208 | limit_del = INT_MAX; | |
1209 | } | |
1210 | ||
1211 | if (limit_subst == TRE_PARAM_UNSET) | |
1212 | { | |
1213 | if (cost_subst == TRE_PARAM_UNSET) | |
1214 | limit_subst = 0; | |
1215 | else | |
1216 | limit_subst = INT_MAX; | |
1217 | } | |
1218 | } | |
1219 | ||
1220 | if (cost_max == TRE_PARAM_UNSET) | |
1221 | cost_max = INT_MAX; | |
1222 | if (limit_err == TRE_PARAM_UNSET) | |
1223 | limit_err = INT_MAX; | |
1224 | ||
1225 | ctx->have_approx = 1; | |
1226 | params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST); | |
1227 | if (!params) | |
1228 | return REG_ESPACE; | |
1229 | for (i = 0; i < TRE_PARAM_LAST; i++) | |
1230 | params[i] = TRE_PARAM_UNSET; | |
1231 | params[TRE_PARAM_COST_INS] = cost_ins; | |
1232 | params[TRE_PARAM_COST_DEL] = cost_del; | |
1233 | params[TRE_PARAM_COST_SUBST] = cost_subst; | |
1234 | params[TRE_PARAM_COST_MAX] = cost_max; | |
1235 | params[TRE_PARAM_MAX_INS] = limit_ins; | |
1236 | params[TRE_PARAM_MAX_DEL] = limit_del; | |
1237 | params[TRE_PARAM_MAX_SUBST] = limit_subst; | |
1238 | params[TRE_PARAM_MAX_ERR] = limit_err; | |
1239 | iter->params = params; | |
1240 | } | |
1241 | #endif /* TRE_APPROX */ | |
1242 | ||
1243 | parse_bound_exit: | |
1244 | #ifdef TRE_APPROX | |
1245 | DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], " | |
1246 | "limits [%d,%d,%d, total %d]\n", | |
1247 | min, max, cost_ins, cost_del, cost_subst, cost_max, | |
1248 | limit_ins, limit_del, limit_subst, limit_err)); | |
1249 | #else /* !TRE_APPROX */ | |
1250 | DPRINT(("tre_parse_bound: min %d, max %d\n", min, max)); | |
1251 | #endif /* !TRE_APPROX */ | |
1252 | ||
1253 | ||
1254 | ctx->re = r; | |
1255 | return REG_OK; | |
1256 | } | |
1257 | ||
1258 | /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for | |
1259 | non-self-contained options, like (?i), this causes ((?i)fu)bar to be | |
1260 | treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect. | |
1261 | Because we now set up tags for even non-capturing parenthesized | |
1262 | subexpressions, we always call PARSE_MARK_FOR_SUBMATCH. So if we | |
1263 | pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and | |
1264 | have it restore cflags after the subexpression, we don't need to have | |
1265 | a separate PARSE_RESTORE_CFLAGS, and then after processing the | |
1266 | non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE. | |
1267 | This has the side-benefit of now matching the perl behavior: the RE | |
1268 | foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior | |
1269 | of foo AND (?i) (bar OR zap). */ | |
1270 | typedef enum { | |
1271 | PARSE_RE = 0, | |
1272 | PARSE_ATOM, | |
1273 | PARSE_MARK_FOR_SUBMATCH, | |
1274 | PARSE_BRANCH, | |
1275 | PARSE_PIECE, | |
1276 | PARSE_CATENATION, | |
1277 | PARSE_POST_CATENATION, | |
1278 | PARSE_UNION, | |
1279 | PARSE_POST_UNION, | |
1280 | PARSE_POSTFIX, | |
1281 | } tre_parse_re_stack_symbol_t; | |
1282 | ||
1283 | ||
1284 | reg_errcode_t | |
1285 | tre_parse(tre_parse_ctx_t *ctx) | |
1286 | { | |
1287 | tre_ast_node_t *result = NULL; | |
1288 | tre_parse_re_stack_symbol_t symbol; | |
1289 | reg_errcode_t status = REG_OK; | |
1290 | tre_stack_t *stack = ctx->stack; | |
1291 | int bottom = tre_stack_num_objects(stack); | |
1292 | int depth = 0; | |
1293 | int temporary_cflags = 0; | |
1294 | int bre_branch_begin; | |
1295 | #ifdef TRE_DEBUG | |
1296 | const tre_char_t *tmp_re; | |
1297 | #endif | |
1298 | ||
1299 | DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n", | |
1300 | ctx->len, ctx->re, ctx->len, ctx->cflags)); | |
1301 | ||
1302 | if (ctx->len <= 0) return REG_EMPTY; | |
1303 | if (!ctx->nofirstsub) | |
1304 | { | |
1305 | STACK_PUSH(stack, int, ctx->cflags); | |
1306 | STACK_PUSH(stack, int, ctx->submatch_id); | |
1307 | STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); | |
1308 | ctx->submatch_id++; | |
1309 | } | |
1310 | STACK_PUSH(stack, int, 0); // bre_branch_begin | |
1311 | STACK_PUSH(stack, int, PARSE_RE); | |
1312 | ctx->re_start = ctx->re; | |
1313 | ctx->re_end = ctx->re + ctx->len; | |
1314 | ||
1315 | ||
1316 | /* The following is basically just a recursive descent parser. I use | |
1317 | an explicit stack instead of recursive functions mostly because of | |
1318 | two reasons: compatibility with systems which have an overflowable | |
1319 | call stack, and efficiency (both in lines of code and speed). */ | |
1320 | while (tre_stack_num_objects(stack) > bottom) | |
1321 | { | |
1322 | symbol = tre_stack_pop_int(stack); | |
1323 | switch (symbol) | |
1324 | { | |
1325 | case PARSE_RE: | |
1326 | /* Parse a full regexp. A regexp is one or more branches, | |
1327 | separated by the union operator `|'. */ | |
1328 | bre_branch_begin = tre_stack_pop_int(stack); | |
1329 | if ( | |
1330 | #ifdef REG_LITERAL | |
1331 | !(ctx->cflags & REG_LITERAL) && | |
1332 | #endif /* REG_LITERAL */ | |
1333 | ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) | |
1334 | STACK_PUSHX(stack, int, PARSE_UNION); | |
1335 | STACK_PUSHX(stack, int, bre_branch_begin); | |
1336 | STACK_PUSHX(stack, int, PARSE_BRANCH); | |
1337 | break; | |
1338 | ||
1339 | case PARSE_BRANCH: | |
1340 | /* Parse a branch. A branch is one or more pieces, concatenated. | |
1341 | A piece is an atom possibly followed by a postfix operator. */ | |
1342 | bre_branch_begin = tre_stack_pop_int(stack); | |
1343 | STACK_PUSHX(stack, int, PARSE_CATENATION); | |
1344 | STACK_PUSHX(stack, int, bre_branch_begin); | |
1345 | STACK_PUSHX(stack, int, PARSE_PIECE); | |
1346 | break; | |
1347 | ||
1348 | case PARSE_PIECE: | |
1349 | /* Parse a piece. A piece is an atom possibly followed by one | |
1350 | or more postfix operators. */ | |
1351 | bre_branch_begin = tre_stack_pop_int(stack); | |
1352 | STACK_PUSHX(stack, int, PARSE_POSTFIX); | |
1353 | STACK_PUSHX(stack, int, bre_branch_begin); | |
1354 | STACK_PUSHX(stack, int, PARSE_ATOM); | |
1355 | break; | |
1356 | ||
1357 | case PARSE_CATENATION: | |
1358 | /* If the expression has not ended, parse another piece. */ | |
1359 | { | |
1360 | tre_char_t c; | |
1361 | if (ctx->re >= ctx->re_end) | |
1362 | break; | |
1363 | c = *ctx->re; | |
1364 | #ifdef REG_LITERAL | |
1365 | if (!(ctx->cflags & REG_LITERAL)) | |
1366 | { | |
1367 | #endif /* REG_LITERAL */ | |
1368 | if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) || | |
1369 | ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED | |
1370 | && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH && | |
1371 | *(ctx->re + 1) == CHAR_PIPE)) | |
1372 | break; | |
1373 | if ((ctx->cflags & REG_EXTENDED | |
1374 | && c == CHAR_RPAREN && depth > 0) | |
1375 | || (!(ctx->cflags & REG_EXTENDED) | |
1376 | && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH | |
1377 | && *(ctx->re + 1) == CHAR_RPAREN)) | |
1378 | { | |
1379 | if (!(ctx->cflags & REG_EXTENDED) && depth == 0) | |
1380 | return REG_EPAREN; | |
1381 | DPRINT(("tre_parse: group end: '%.*" STRF "'\n", | |
1382 | REST(ctx->re))); | |
1383 | depth--; | |
1384 | if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED))) | |
1385 | ctx->re += 2; | |
1386 | break; | |
1387 | } | |
1388 | #ifdef REG_LITERAL | |
1389 | } | |
1390 | #endif /* REG_LITERAL */ | |
1391 | ||
1392 | #ifdef REG_LEFT_ASSOC | |
1393 | if (ctx->cflags & REG_LEFT_ASSOC) | |
1394 | { | |
1395 | /* Left associative concatenation. */ | |
1396 | STACK_PUSHX(stack, int, PARSE_CATENATION); | |
1397 | STACK_PUSHX(stack, voidptr, result); | |
1398 | STACK_PUSHX(stack, int, PARSE_POST_CATENATION); | |
1399 | STACK_PUSHX(stack, int, 0); // bre_branch_begin | |
1400 | STACK_PUSHX(stack, int, PARSE_PIECE); | |
1401 | } | |
1402 | else | |
1403 | #endif /* REG_LEFT_ASSOC */ | |
1404 | { | |
1405 | /* Default case, right associative concatenation. */ | |
1406 | STACK_PUSHX(stack, voidptr, result); | |
1407 | STACK_PUSHX(stack, int, PARSE_POST_CATENATION); | |
1408 | STACK_PUSHX(stack, int, PARSE_CATENATION); | |
1409 | STACK_PUSHX(stack, int, 0); // bre_branch_begin | |
1410 | STACK_PUSHX(stack, int, PARSE_PIECE); | |
1411 | } | |
1412 | break; | |
1413 | } | |
1414 | ||
1415 | case PARSE_POST_CATENATION: | |
1416 | { | |
1417 | tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); | |
1418 | tre_ast_node_t *tmp_node; | |
1419 | tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); | |
1420 | if (!tmp_node) | |
1421 | return REG_ESPACE; | |
1422 | result = tmp_node; | |
1423 | break; | |
1424 | } | |
1425 | ||
1426 | case PARSE_UNION: | |
1427 | if (ctx->re >= ctx->re_end) | |
1428 | break; | |
1429 | #ifdef REG_LITERAL | |
1430 | if (ctx->cflags & REG_LITERAL) | |
1431 | break; | |
1432 | #endif /* REG_LITERAL */ | |
1433 | if (!(ctx->cflags & REG_EXTENDED)) | |
1434 | { | |
1435 | if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end) | |
1436 | break; | |
1437 | ctx->re++; | |
1438 | } | |
1439 | switch (*ctx->re) | |
1440 | { | |
1441 | case CHAR_PIPE: | |
1442 | DPRINT(("tre_parse: union: '%.*" STRF "'\n", | |
1443 | REST(ctx->re))); | |
1444 | STACK_PUSHX(stack, int, PARSE_UNION); | |
1445 | STACK_PUSHX(stack, voidptr, (void *)ctx->re); | |
1446 | STACK_PUSHX(stack, voidptr, result); | |
1447 | STACK_PUSHX(stack, int, PARSE_POST_UNION); | |
1448 | /* We need to pass a boolean (eventually) to PARSE_ATOM to | |
1449 | indicate if this is the beginning of a BRE extended branch. */ | |
1450 | STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin | |
1451 | STACK_PUSHX(stack, int, PARSE_BRANCH); | |
1452 | ctx->re++; | |
1453 | break; | |
1454 | ||
1455 | case CHAR_RPAREN: | |
1456 | ctx->re++; | |
1457 | break; | |
1458 | ||
1459 | default: | |
1460 | if (!(ctx->cflags & REG_EXTENDED)) | |
1461 | ctx->re--; | |
1462 | break; | |
1463 | } | |
1464 | break; | |
1465 | ||
1466 | case PARSE_POST_UNION: | |
1467 | { | |
1468 | tre_ast_node_t *tmp_node; | |
1469 | tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); | |
1470 | const tre_char_t *pipechar = tre_stack_pop_voidptr(stack); | |
1471 | /* error on empty expression at end of union */ | |
1472 | if (pipechar == ctx->re - 1) | |
1473 | { | |
1474 | return REG_EMPTY; | |
1475 | } | |
1476 | tmp_node = tre_ast_new_union(ctx->mem, tree, result); | |
1477 | if (!tmp_node) | |
1478 | return REG_ESPACE; | |
1479 | result = tmp_node; | |
1480 | break; | |
1481 | } | |
1482 | ||
1483 | case PARSE_POSTFIX: | |
1484 | /* Parse postfix operators. */ | |
1485 | if (ctx->re >= ctx->re_end) | |
1486 | break; | |
1487 | #ifdef REG_LITERAL | |
1488 | if (ctx->cflags & REG_LITERAL) | |
1489 | break; | |
1490 | #endif /* REG_LITERAL */ | |
1491 | int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0; | |
1492 | int rep_min = 0; | |
1493 | int rep_max = -1; | |
1494 | #ifdef TRE_DEBUG | |
1495 | int lbrace_off; | |
1496 | #endif | |
1497 | switch (*ctx->re) | |
1498 | { | |
1499 | case CHAR_PLUS: | |
1500 | case CHAR_QUESTIONMARK: | |
1501 | if (!(ctx->cflags & REG_EXTENDED)) | |
1502 | break; | |
1503 | /*FALLTHROUGH*/ | |
1504 | case CHAR_STAR: | |
1505 | { | |
1506 | tre_ast_node_t *tmp_node; | |
1507 | #ifdef TRE_DEBUG | |
1508 | const char *tstr = "star"; | |
1509 | tmp_re = ctx->re; | |
1510 | #endif | |
1511 | ||
1512 | handle_plus_or_question: | |
1513 | /* error on iteration of raw assertion (not in subexpression) */ | |
1514 | if (result->type == LITERAL && result->submatch_id < 0 && | |
1515 | IS_ASSERTION((tre_literal_t *)result->obj)) | |
1516 | { | |
1517 | if (!(ctx->cflags & REG_EXTENDED)) break; | |
1518 | return REG_BADRPT; | |
1519 | } | |
1520 | if (*ctx->re == CHAR_PLUS) | |
1521 | { | |
1522 | rep_min = 1; | |
1523 | #ifdef TRE_DEBUG | |
1524 | tstr = "plus"; | |
1525 | #endif | |
1526 | } | |
1527 | if (*ctx->re == CHAR_QUESTIONMARK) | |
1528 | { | |
1529 | rep_max = 1; | |
1530 | #ifdef TRE_DEBUG | |
1531 | tstr = "questionmark"; | |
1532 | #endif | |
1533 | } | |
1534 | ||
1535 | if (ctx->cflags & REG_EXTENDED) | |
1536 | { | |
1537 | if (ctx->re + 1 < ctx->re_end) | |
1538 | { | |
1539 | if (*(ctx->re + 1) == CHAR_QUESTIONMARK) | |
1540 | { | |
1541 | /* Process the question mark only in enhanced mode. | |
1542 | Otherwise, the question mark is an error in ERE */ | |
1543 | if (ctx->cflags & REG_ENHANCED) | |
1544 | { | |
1545 | minimal = !(ctx->cflags & REG_UNGREEDY); | |
1546 | ctx->re++; | |
1547 | } | |
1548 | else return REG_BADRPT; | |
1549 | } | |
1550 | else if (*(ctx->re + 1) == CHAR_STAR | |
1551 | || *(ctx->re + 1) == CHAR_PLUS) | |
1552 | { | |
1553 | /* These are reserved for future extensions. */ | |
1554 | return REG_BADRPT; | |
1555 | } | |
1556 | } | |
1557 | } | |
1558 | else | |
1559 | { | |
1560 | if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR) | |
1561 | { | |
1562 | /* This is reserved for future extensions. */ | |
1563 | return REG_BADRPT; | |
1564 | } | |
1565 | if (ctx->re + 2 < ctx->re_end) | |
1566 | { | |
1567 | if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 1) == CHAR_QUESTIONMARK) | |
1568 | { | |
1569 | /* Process the question mark only in enhanced mode. | |
1570 | Otherwise, the question mark is a literal in BRE */ | |
1571 | if (ctx->cflags & REG_ENHANCED) | |
1572 | { | |
1573 | minimal = !(ctx->cflags & REG_UNGREEDY); | |
1574 | ctx->re += 2; | |
1575 | } | |
1576 | } | |
1577 | else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS) | |
1578 | { | |
1579 | /* This is reserved for future extensions. */ | |
1580 | return REG_BADRPT; | |
1581 | } | |
1582 | } | |
1583 | } | |
1584 | ||
1585 | if (minimal) | |
1586 | ctx->num_reorder_tags++; | |
1587 | ||
1588 | DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n", | |
1589 | minimal ? " minimal" : "greedy", tstr, REST(tmp_re))); | |
1590 | if (result == NULL) | |
1591 | { | |
1592 | if (ctx->cflags & REG_EXTENDED) return REG_BADRPT; | |
1593 | else goto parse_literal; | |
1594 | } | |
1595 | ctx->re++; | |
1596 | tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, | |
1597 | minimal); | |
1598 | if (tmp_node == NULL) | |
1599 | return REG_ESPACE; | |
1600 | result = tmp_node; | |
1601 | ||
1602 | /* Set the iterator with a submatch id in the invisible range | |
1603 | * (which will be overridden if a real submatch is needed) */ | |
1604 | result->submatch_id = ctx->submatch_id_invisible++; | |
1605 | ||
1606 | #if 0 | |
1607 | /* We don't allow multiple postfixes, but this might be needed | |
1608 | to support approximate matching */ | |
1609 | STACK_PUSHX(stack, int, PARSE_POSTFIX); | |
1610 | #endif | |
1611 | } | |
1612 | break; | |
1613 | ||
1614 | case CHAR_BACKSLASH: | |
1615 | /* "\{" is special without REG_EXTENDED */ | |
1616 | /* "\+" and "\?" are special with REG_ENHANCED for BRE */ | |
1617 | if (!(ctx->cflags & REG_EXTENDED) | |
1618 | && ctx->re + 1 < ctx->re_end) | |
1619 | { | |
1620 | switch (*(ctx->re + 1)) | |
1621 | { | |
1622 | case CHAR_LBRACE: | |
1623 | ctx->re++; | |
1624 | #ifdef TRE_DEBUG | |
1625 | lbrace_off = 2; | |
1626 | #endif | |
1627 | goto parse_brace; | |
1628 | case CHAR_PLUS: | |
1629 | case CHAR_QUESTIONMARK: | |
1630 | if (ctx->cflags & REG_ENHANCED) | |
1631 | { | |
1632 | #ifdef TRE_DEBUG | |
1633 | tmp_re = ctx->re; | |
1634 | #endif | |
1635 | ctx->re++; | |
1636 | goto handle_plus_or_question; | |
1637 | } | |
1638 | break; | |
1639 | } | |
1640 | break; | |
1641 | } | |
1642 | else | |
1643 | break; | |
1644 | ||
1645 | case CHAR_LBRACE: | |
1646 | { | |
1647 | int raw_assertion; | |
1648 | ||
1649 | /* "{" is literal without REG_EXTENDED */ | |
1650 | if (!(ctx->cflags & REG_EXTENDED)) | |
1651 | break; | |
1652 | #ifdef TRE_DEBUG | |
1653 | lbrace_off = 1; | |
1654 | #endif | |
1655 | ||
1656 | parse_brace: | |
1657 | /* error on iteration of raw assertion (not in subexpression), | |
1658 | but wait until after parsing bounds */ | |
1659 | raw_assertion = (result->type == LITERAL | |
1660 | && result->submatch_id < 0 | |
1661 | && IS_ASSERTION((tre_literal_t *)result->obj)); | |
1662 | ctx->re++; | |
1663 | ||
1664 | status = tre_parse_bound(ctx, &result); | |
1665 | #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND | |
1666 | /* For ERE, if status is REG_NOMATCH, this mean the lbrace | |
1667 | is to be treated as a literal. */ | |
1668 | if (status == REG_NOMATCH) | |
1669 | { | |
1670 | ctx->re--; | |
1671 | break; | |
1672 | } | |
1673 | #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ | |
1674 | DPRINT(("tre_parse: bound: '%.*" STRF "'\n", | |
1675 | REST(ctx->re - lbrace_off))); | |
1676 | if (status != REG_OK) | |
1677 | return status; | |
1678 | if (raw_assertion) return REG_BADRPT; | |
1679 | ||
1680 | /* Set the iterator with a submatch id in the invisible range | |
1681 | * (which will be overridden if a real submatch is needed) */ | |
1682 | if (result->type == ITERATION) | |
1683 | result->submatch_id = ctx->submatch_id_invisible++; | |
1684 | ||
1685 | #if 0 | |
1686 | /* We don't allow multiple postfixes, but this might be needed | |
1687 | to support approximate matching */ | |
1688 | STACK_PUSHX(stack, int, PARSE_POSTFIX); | |
1689 | #endif | |
1690 | break; | |
1691 | } | |
1692 | } | |
1693 | break; | |
1694 | ||
1695 | case PARSE_ATOM: | |
1696 | { | |
1697 | /* Parse an atom. An atom is a regular expression enclosed in `()', | |
1698 | an empty set of `()', a bracket expression, `.', `^', `$', | |
1699 | a `\' followed by a character, or a single character. */ | |
1700 | ||
1701 | /* The stack contains a boolean value, whether PARSE_ATOM is | |
1702 | being called just after the start of a group (left paren) | |
1703 | in a BRE */ | |
1704 | bre_branch_begin = tre_stack_pop_int(stack); | |
1705 | ||
1706 | /* End of regexp? (empty string). */ | |
1707 | if (ctx->re >= ctx->re_end) | |
1708 | goto parse_literal; | |
1709 | ||
1710 | #ifdef REG_LITERAL | |
1711 | if (ctx->cflags & REG_LITERAL) | |
1712 | goto parse_literal; | |
1713 | #endif /* REG_LITERAL */ | |
1714 | ||
1715 | switch (*ctx->re) | |
1716 | { | |
1717 | case CHAR_LPAREN: /* parenthesized subexpression */ | |
1718 | ||
1719 | /* Handle "(?...)" extensions. They work in a way similar | |
1720 | to Perls corresponding extensions. */ | |
1721 | if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) == | |
1722 | (REG_EXTENDED|REG_ENHANCED) | |
1723 | && *(ctx->re + 1) == CHAR_QUESTIONMARK) | |
1724 | { | |
1725 | int new_cflags = ctx->cflags; | |
1726 | int bit = 1; | |
1727 | int invisible_submatch = 0; | |
1728 | DPRINT(("tre_parse: extension: '%.*" STRF "'\n", | |
1729 | REST(ctx->re))); | |
1730 | ctx->re += 2; | |
1731 | while (/*CONSTCOND*/1) | |
1732 | { | |
1733 | if (*ctx->re == L'i') | |
1734 | { | |
1735 | DPRINT(("tre_parse: icase: '%.*" STRF "'\n", | |
1736 | REST(ctx->re))); | |
1737 | if (bit) | |
1738 | new_cflags |= REG_ICASE; | |
1739 | else | |
1740 | new_cflags &= ~REG_ICASE; | |
1741 | ctx->re++; | |
1742 | } | |
1743 | else if (*ctx->re == L'n') | |
1744 | { | |
1745 | DPRINT(("tre_parse: newline: '%.*" STRF "'\n", | |
1746 | REST(ctx->re))); | |
1747 | if (bit) | |
1748 | new_cflags |= REG_NEWLINE; | |
1749 | else | |
1750 | new_cflags &= ~REG_NEWLINE; | |
1751 | ctx->re++; | |
1752 | } | |
1753 | #ifdef REG_LEFT_ASSOC | |
1754 | else if (*ctx->re == L'l') | |
1755 | { | |
1756 | DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n", | |
1757 | REST(ctx->re))); | |
1758 | if (bit) | |
1759 | new_cflags |= REG_LEFT_ASSOC; | |
1760 | else | |
1761 | new_cflags &= ~REG_LEFT_ASSOC; | |
1762 | ctx->re++; | |
1763 | } | |
1764 | #endif /* REG_LEFT_ASSOC */ | |
1765 | #ifdef REG_UNGREEDY | |
1766 | else if (*ctx->re == L'U') | |
1767 | { | |
1768 | DPRINT(("tre_parse: ungreedy: '%.*" STRF "'\n", | |
1769 | REST(ctx->re))); | |
1770 | if (bit) | |
1771 | new_cflags |= REG_UNGREEDY; | |
1772 | else | |
1773 | new_cflags &= ~REG_UNGREEDY; | |
1774 | ctx->re++; | |
1775 | } | |
1776 | #endif /* REG_UNGREEDY */ | |
1777 | else if (*ctx->re == CHAR_MINUS) | |
1778 | { | |
1779 | DPRINT(("tre_parse: turn off: '%.*" STRF "'\n", | |
1780 | REST(ctx->re))); | |
1781 | ctx->re++; | |
1782 | bit = 0; | |
1783 | } | |
1784 | else if (*ctx->re == CHAR_COLON) | |
1785 | { | |
1786 | DPRINT(("tre_parse: no group: '%.*" STRF | |
1787 | "', (invisible submatch %d)\n", | |
1788 | REST(ctx->re), ctx->submatch_id_invisible)); | |
1789 | ctx->re++; | |
1790 | depth++; | |
1791 | invisible_submatch = 1; | |
1792 | break; | |
1793 | } | |
1794 | else if (*ctx->re == CHAR_HASH) | |
1795 | { | |
1796 | DPRINT(("tre_parse: comment: '%.*" STRF "'\n", | |
1797 | REST(ctx->re))); | |
1798 | /* A comment can contain any character except a | |
1799 | right parenthesis */ | |
1800 | while (*ctx->re != CHAR_RPAREN | |
1801 | && ctx->re < ctx->re_end) | |
1802 | ctx->re++; | |
1803 | if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end) | |
1804 | { | |
1805 | ctx->re++; | |
1806 | break; | |
1807 | } | |
1808 | else | |
1809 | return REG_BADPAT; | |
1810 | } | |
1811 | else if (*ctx->re == CHAR_RPAREN) | |
1812 | { | |
1813 | ctx->re++; | |
1814 | break; | |
1815 | } | |
1816 | else | |
1817 | return REG_BADRPT; | |
1818 | } | |
1819 | ||
1820 | /* Turn on the cflags changes for the rest of the | |
1821 | enclosing group. */ | |
1822 | if (invisible_submatch) | |
1823 | { | |
1824 | STACK_PUSHX(stack, int, ctx->cflags); | |
1825 | STACK_PUSHX(stack, int, ctx->submatch_id_invisible); | |
1826 | STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); | |
1827 | ctx->submatch_id_invisible++; | |
1828 | STACK_PUSHX(stack, int, 0); // bre_branch_begin | |
1829 | STACK_PUSHX(stack, int, PARSE_RE); | |
1830 | } | |
1831 | else { | |
1832 | STACK_PUSHX(stack, int, 0); // bre_branch_begin | |
1833 | STACK_PUSHX(stack, int, PARSE_ATOM); | |
1834 | } | |
1835 | ctx->cflags = new_cflags; | |
1836 | break; | |
1837 | } | |
1838 | ||
1839 | if (ctx->cflags & REG_EXTENDED) | |
1840 | { | |
1841 | parse_bre_lparen: | |
1842 | DPRINT(("tre_parse: group begin: '%.*" STRF | |
1843 | "', submatch %d\n", REST(ctx->re), | |
1844 | ctx->submatch_id)); | |
1845 | ctx->re++; | |
1846 | /* First parse a whole RE, then mark the resulting tree | |
1847 | for submatching. */ | |
1848 | STACK_PUSHX(stack, int, ctx->cflags); | |
1849 | STACK_PUSHX(stack, int, ctx->submatch_id); | |
1850 | STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); | |
1851 | /* We need to pass a boolean (eventually) to PARSE_ATOM to | |
1852 | indicate if this is the beginning of a BRE group. */ | |
1853 | STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED)); | |
1854 | STACK_PUSHX(stack, int, PARSE_RE); | |
1855 | ctx->submatch_id++; | |
1856 | depth++; | |
1857 | } | |
1858 | else | |
1859 | goto parse_literal; | |
1860 | break; | |
1861 | ||
1862 | case CHAR_RPAREN: /* end of current subexpression */ | |
1863 | if (ctx->cflags & REG_EXTENDED && depth > 0) | |
1864 | { | |
1865 | parse_bre_rparen_empty: | |
1866 | if (!(ctx->cflags & REG_EXTENDED) && depth == 0) | |
1867 | return REG_EPAREN; | |
1868 | DPRINT(("tre_parse: empty: '%.*" STRF "'\n", | |
1869 | REST(ctx->re))); | |
1870 | /* We were expecting an atom, but instead the current | |
1871 | subexpression was closed. POSIX leaves the meaning of | |
1872 | this to be implementation-defined. We interpret this as | |
1873 | an empty expression (which matches an empty string). */ | |
1874 | result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); | |
1875 | if (result == NULL) | |
1876 | return REG_ESPACE; | |
1877 | if (!(ctx->cflags & REG_EXTENDED)) | |
1878 | ctx->re--; | |
1879 | } | |
1880 | else | |
1881 | goto parse_literal; | |
1882 | break; | |
1883 | ||
1884 | case CHAR_LBRACKET: /* bracket expression */ | |
1885 | DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", | |
1886 | REST(ctx->re))); | |
1887 | ctx->re++; | |
1888 | status = tre_parse_bracket(ctx, &result); | |
1889 | if (status != REG_OK) | |
1890 | return status; | |
1891 | break; | |
1892 | ||
1893 | case CHAR_BACKSLASH: | |
1894 | /* Deal with "\(", "\)" or "\{" for BREs */ | |
1895 | if (!(ctx->cflags & REG_EXTENDED) | |
1896 | && ctx->re + 1 < ctx->re_end) | |
1897 | { | |
1898 | if (*(ctx->re + 1) == CHAR_LPAREN) | |
1899 | { | |
1900 | ctx->re++; | |
1901 | goto parse_bre_lparen; | |
1902 | } | |
1903 | else if (*(ctx->re + 1) == CHAR_RPAREN) | |
1904 | { | |
1905 | ctx->re++; | |
1906 | goto parse_bre_rparen_empty; | |
1907 | } | |
1908 | if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal; | |
1909 | } | |
1910 | ||
1911 | if (ctx->re + 1 >= ctx->re_end) | |
1912 | /* Trailing backslash. */ | |
1913 | return REG_EESCAPE; | |
1914 | ||
1915 | if (!(ctx->cflags & REG_ENHANCED)) | |
1916 | { | |
1917 | DPRINT(("tre_parse: unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re))); | |
1918 | ctx->re++; | |
1919 | goto unenhanced_backslash; | |
1920 | } | |
1921 | ||
1922 | /* If a macro is used, parse the expanded macro recursively. */ | |
1923 | { | |
1924 | tre_char_t buf[64]; | |
1925 | tre_expand_macro(ctx->re + 1, ctx->re_end, | |
1926 | buf, elementsof(buf)); | |
1927 | if (buf[0] != 0) | |
1928 | { | |
1929 | tre_parse_ctx_t subctx; | |
1930 | memcpy(&subctx, ctx, sizeof(subctx)); | |
1931 | subctx.re = buf; | |
1932 | subctx.len = tre_strlen(buf); | |
1933 | subctx.nofirstsub = 1; | |
1934 | status = tre_parse(&subctx); | |
1935 | if (status != REG_OK) | |
1936 | return status; | |
1937 | ctx->re += 2; | |
1938 | ctx->position = subctx.position; | |
1939 | result = subctx.result; | |
1940 | break; | |
1941 | } | |
1942 | } | |
1943 | ||
1944 | #ifdef REG_LITERAL | |
1945 | if (*(ctx->re + 1) == L'Q') | |
1946 | { | |
1947 | DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n", | |
1948 | REST(ctx->re))); | |
1949 | ctx->cflags |= REG_LITERAL; | |
1950 | temporary_cflags |= REG_LITERAL; | |
1951 | ctx->re += 2; | |
1952 | STACK_PUSHX(stack, int, 0); | |
1953 | STACK_PUSHX(stack, int, PARSE_ATOM); | |
1954 | break; | |
1955 | } | |
1956 | #endif /* REG_LITERAL */ | |
1957 | ||
1958 | DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re))); | |
1959 | ctx->re++; | |
1960 | switch (*ctx->re) | |
1961 | { | |
1962 | case L'b': | |
1963 | result = tre_ast_new_literal(ctx->mem, ASSERTION, | |
1964 | ASSERT_AT_WB, -1); | |
1965 | ctx->re++; | |
1966 | break; | |
1967 | case L'B': | |
1968 | result = tre_ast_new_literal(ctx->mem, ASSERTION, | |
1969 | ASSERT_AT_WB_NEG, -1); | |
1970 | ctx->re++; | |
1971 | break; | |
1972 | case L'<': | |
1973 | result = tre_ast_new_literal(ctx->mem, ASSERTION, | |
1974 | ASSERT_AT_BOW, -1); | |
1975 | ctx->re++; | |
1976 | break; | |
1977 | case L'>': | |
1978 | result = tre_ast_new_literal(ctx->mem, ASSERTION, | |
1979 | ASSERT_AT_EOW, -1); | |
1980 | ctx->re++; | |
1981 | break; | |
1982 | case L'x': | |
1983 | ctx->re++; | |
1984 | if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end) | |
1985 | { | |
1986 | /* 8 bit hex char. */ | |
1987 | char tmp[3] = {0, 0, 0}; | |
1988 | long val; | |
1989 | DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n", | |
1990 | REST(ctx->re - 2))); | |
1991 | ||
1992 | if (tre_isxdigit_l(ctx->re[0], ctx->loc) && | |
1993 | ctx->re < ctx->re_end) | |
1994 | { | |
1995 | tmp[0] = (char)ctx->re[0]; | |
1996 | ctx->re++; | |
1997 | } | |
1998 | if (tre_isxdigit_l(ctx->re[0], ctx->loc) && | |
1999 | ctx->re < ctx->re_end) | |
2000 | { | |
2001 | tmp[1] = (char)ctx->re[0]; | |
2002 | ctx->re++; | |
2003 | } | |
2004 | val = strtol(tmp, NULL, 16); | |
2005 | result = tre_ast_new_literal(ctx->mem, (int)val, | |
2006 | (int)val, ctx->position); | |
2007 | ctx->position++; | |
2008 | break; | |
2009 | } | |
2010 | else if (ctx->re < ctx->re_end) | |
2011 | { | |
2012 | /* Wide char. */ | |
2013 | char tmp[32]; | |
2014 | long val; | |
2015 | int i = 0; | |
2016 | ctx->re++; | |
2017 | while (ctx->re_end - ctx->re >= 0) | |
2018 | { | |
2019 | if (ctx->re[0] == CHAR_RBRACE) | |
2020 | break; | |
2021 | if (tre_isxdigit_l(ctx->re[0], ctx->loc)) | |
2022 | { | |
2023 | tmp[i] = (char)ctx->re[0]; | |
2024 | i++; | |
2025 | ctx->re++; | |
2026 | continue; | |
2027 | } | |
2028 | return REG_EBRACE; | |
2029 | } | |
2030 | ctx->re++; | |
2031 | tmp[i] = 0; | |
2032 | val = strtol(tmp, NULL, 16); | |
2033 | result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, | |
2034 | ctx->position); | |
2035 | ctx->position++; | |
2036 | break; | |
2037 | } | |
2038 | /*FALLTHROUGH*/ | |
2039 | ||
2040 | default: | |
2041 | unenhanced_backslash: | |
2042 | if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) != | |
2043 | REG_EXTENDED && | |
2044 | tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0') | |
2045 | { | |
2046 | /* Back reference (only in BRE or enhanced). */ | |
2047 | int val = *ctx->re - L'0'; | |
2048 | DPRINT(("tre_parse: backref: '%.*" STRF "'\n", | |
2049 | REST(ctx->re - 1))); | |
2050 | result = tre_ast_new_literal(ctx->mem, BACKREF, val, | |
2051 | ctx->position); | |
2052 | if (result == NULL) | |
2053 | return REG_ESPACE; | |
2054 | ||
2055 | /* Set the backref with a submatch id in the invisible | |
2056 | * range (which will be overridden if a real submatch | |
2057 | * is needed) */ | |
2058 | result->submatch_id = ctx->submatch_id_invisible++; | |
2059 | ||
2060 | ctx->position++; | |
2061 | ctx->num_reorder_tags++; | |
2062 | ctx->max_backref = MAX(val, ctx->max_backref); | |
2063 | ctx->re++; | |
2064 | } | |
2065 | else | |
2066 | { | |
2067 | /* Escaped character. */ | |
2068 | DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", | |
2069 | REST(ctx->re - 1))); | |
2070 | result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, | |
2071 | ctx->position); | |
2072 | ctx->position++; | |
2073 | ctx->re++; | |
2074 | } | |
2075 | break; | |
2076 | } | |
2077 | if (result == NULL) | |
2078 | return REG_ESPACE; | |
2079 | break; | |
2080 | ||
2081 | case CHAR_PERIOD: /* the any-symbol */ | |
2082 | DPRINT(("tre_parse: any: '%.*" STRF "'\n", | |
2083 | REST(ctx->re))); | |
2084 | if (ctx->cflags & REG_NEWLINE) | |
2085 | { | |
2086 | tre_ast_node_t *tmp1; | |
2087 | tre_ast_node_t *tmp2; | |
2088 | tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, | |
2089 | ctx->position); | |
2090 | if (!tmp1) | |
2091 | return REG_ESPACE; | |
2092 | tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, | |
2093 | ctx->position + 1); | |
2094 | if (!tmp2) | |
2095 | return REG_ESPACE; | |
2096 | result = tre_ast_new_union(ctx->mem, tmp1, tmp2); | |
2097 | if (!result) | |
2098 | return REG_ESPACE; | |
2099 | ctx->position += 2; | |
2100 | } | |
2101 | else | |
2102 | { | |
2103 | result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, | |
2104 | ctx->position); | |
2105 | if (!result) | |
2106 | return REG_ESPACE; | |
2107 | ctx->position++; | |
2108 | } | |
2109 | ctx->re++; | |
2110 | break; | |
2111 | ||
2112 | case CHAR_CARET: /* beginning of line assertion */ | |
2113 | /* '^' has a special meaning everywhere in EREs, at the | |
2114 | beginning of the RE and after \( is BREs. It is also | |
2115 | special in enhanced BREs at the beginning of each branches | |
2116 | of a union */ | |
2117 | if (ctx->cflags & REG_EXTENDED | |
2118 | || bre_branch_begin | |
2119 | || ctx->re == ctx->re_start) | |
2120 | { | |
2121 | DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", | |
2122 | REST(ctx->re))); | |
2123 | result = tre_ast_new_literal(ctx->mem, ASSERTION, | |
2124 | ASSERT_AT_BOL, -1); | |
2125 | if (result == NULL) | |
2126 | return REG_ESPACE; | |
2127 | ctx->re++; | |
2128 | } | |
2129 | else | |
2130 | goto parse_literal; | |
2131 | break; | |
2132 | ||
2133 | case CHAR_DOLLAR: /* end of line assertion. */ | |
2134 | /* '$' is special everywhere in EREs, and in the end of the | |
2135 | string and before \) is BREs. */ | |
2136 | if (ctx->cflags & REG_EXTENDED | |
2137 | || (ctx->re + 2 < ctx->re_end | |
2138 | && *(ctx->re + 1) == CHAR_BACKSLASH | |
2139 | && *(ctx->re + 2) == CHAR_RPAREN) | |
2140 | || ctx->re + 1 == ctx->re_end) | |
2141 | { | |
2142 | DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", | |
2143 | REST(ctx->re))); | |
2144 | result = tre_ast_new_literal(ctx->mem, ASSERTION, | |
2145 | ASSERT_AT_EOL, -1); | |
2146 | if (result == NULL) | |
2147 | return REG_ESPACE; | |
2148 | ctx->re++; | |
2149 | } | |
2150 | else | |
2151 | goto parse_literal; | |
2152 | break; | |
2153 | ||
2154 | default: | |
2155 | parse_literal: | |
2156 | ||
2157 | if (temporary_cflags && ctx->re + 1 < ctx->re_end | |
2158 | && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E') | |
2159 | { | |
2160 | DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n", | |
2161 | REST(ctx->re))); | |
2162 | ctx->cflags &= ~temporary_cflags; | |
2163 | temporary_cflags = 0; | |
2164 | ctx->re += 2; | |
2165 | if (ctx->re < ctx->re_end) | |
2166 | { | |
2167 | STACK_PUSHX(stack, int, 0); | |
2168 | STACK_PUSHX(stack, int, PARSE_ATOM); | |
2169 | } | |
2170 | else | |
2171 | { | |
2172 | result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); | |
2173 | if (!result) return REG_ESPACE; | |
2174 | } | |
2175 | break; | |
2176 | } | |
2177 | ||
2178 | ||
2179 | /* We are expecting an atom. If the subexpression (or the whole | |
2180 | regexp ends here, we interpret it as an empty expression | |
2181 | (which matches an empty string), which is an error. | |
2182 | Iterations of an empty expression is also an error. */ | |
2183 | #ifdef REG_LITERAL | |
2184 | if (!(ctx->cflags & REG_LITERAL)) | |
2185 | { | |
2186 | #endif /* REG_LITERAL */ | |
2187 | /* error on end of string */ | |
2188 | if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN | |
2189 | : REG_EMPTY; | |
2190 | /* error on unions and iterations of empty expressions */ | |
2191 | if (ctx->cflags & REG_EXTENDED) | |
2192 | { | |
2193 | if (ctx->re < ctx->re_end) | |
2194 | { | |
2195 | if (*ctx->re == CHAR_PIPE) return REG_EMPTY; | |
2196 | if (*ctx->re == CHAR_LBRACE) | |
2197 | { | |
2198 | ctx->re++; | |
2199 | empty_parse_bound: | |
2200 | /* We need to parse the bound first and return | |
2201 | any error, before returning REG_BADRPT */ | |
2202 | status = tre_parse_bound(ctx, NULL); | |
2203 | #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND | |
2204 | /* For ERE, if REG_NOMATCH is returned, we | |
2205 | treat the lbrace as a literal. */ | |
2206 | if (status == REG_NOMATCH) | |
2207 | { | |
2208 | ctx->re--; | |
2209 | /* Drop down to literal-handling code */ | |
2210 | } | |
2211 | else | |
2212 | { | |
2213 | #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ | |
2214 | if (status != REG_OK) | |
2215 | return status; | |
2216 | return REG_BADRPT; | |
2217 | #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND | |
2218 | } | |
2219 | #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ | |
2220 | } | |
2221 | #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND | |
2222 | else | |
2223 | #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */ | |
2224 | if (*ctx->re == CHAR_STAR | |
2225 | || *ctx->re == CHAR_PLUS | |
2226 | || *ctx->re == CHAR_QUESTIONMARK) | |
2227 | { | |
2228 | return REG_BADRPT; | |
2229 | } | |
2230 | } | |
2231 | } | |
2232 | else if (ctx->re + 1 < ctx->re_end | |
2233 | && *ctx->re == CHAR_BACKSLASH | |
2234 | && *(ctx->re + 1) == CHAR_LBRACE) | |
2235 | { | |
2236 | ctx->re += 2; | |
2237 | goto empty_parse_bound; | |
2238 | } | |
2239 | #ifdef REG_LITERAL | |
2240 | } | |
2241 | #endif /* REG_LITERAL */ | |
2242 | ||
2243 | DPRINT(("tre_parse: literal: '%.*" STRF "'\n", | |
2244 | REST(ctx->re))); | |
2245 | /* Note that we can't use an tre_isalpha() test here, since there | |
2246 | may be characters which are alphabetic but neither upper or | |
2247 | lower case. */ | |
2248 | if (ctx->cflags & REG_ICASE | |
2249 | && (tre_isupper_l(*ctx->re, ctx->loc) || | |
2250 | tre_islower_l(*ctx->re, ctx->loc))) | |
2251 | { | |
2252 | tre_ast_node_t *tmp1; | |
2253 | tre_ast_node_t *tmp2; | |
2254 | ||
2255 | /* XXX - Can there be more than one opposite-case | |
2256 | counterpoints for some character in some locale? Or | |
2257 | more than two characters which all should be regarded | |
2258 | the same character if case is ignored? If yes, there | |
2259 | does not seem to be a portable way to detect it. I guess | |
2260 | that at least for multi-character collating elements there | |
2261 | could be several opposite-case counterpoints, but they | |
2262 | cannot be supported portably anyway. */ | |
2263 | tmp1 = tre_ast_new_literal(ctx->mem, | |
2264 | tre_toupper_l(*ctx->re, ctx->loc), | |
2265 | tre_toupper_l(*ctx->re, ctx->loc), | |
2266 | ctx->position); | |
2267 | if (!tmp1) | |
2268 | return REG_ESPACE; | |
2269 | tmp2 = tre_ast_new_literal(ctx->mem, | |
2270 | tre_tolower_l(*ctx->re, ctx->loc), | |
2271 | tre_tolower_l(*ctx->re, ctx->loc), | |
2272 | ctx->position); | |
2273 | if (!tmp2) | |
2274 | return REG_ESPACE; | |
2275 | result = tre_ast_new_union(ctx->mem, tmp1, tmp2); | |
2276 | if (!result) | |
2277 | return REG_ESPACE; | |
2278 | } | |
2279 | else | |
2280 | { | |
2281 | result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, | |
2282 | ctx->position); | |
2283 | if (!result) | |
2284 | return REG_ESPACE; | |
2285 | } | |
2286 | ctx->position++; | |
2287 | ctx->re++; | |
2288 | break; | |
2289 | } | |
2290 | break; | |
2291 | } | |
2292 | ||
2293 | case PARSE_MARK_FOR_SUBMATCH: | |
2294 | { | |
2295 | int submatch_id = tre_stack_pop_int(stack); | |
2296 | ||
2297 | ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */ | |
2298 | if (result->submatch_id >= 0 && | |
2299 | result->submatch_id < SUBMATCH_ID_INVISIBLE_START) | |
2300 | { | |
2301 | tre_ast_node_t *n, *tmp_node; | |
2302 | if (submatch_id >= SUBMATCH_ID_INVISIBLE_START) | |
2303 | break; | |
2304 | n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); | |
2305 | if (n == NULL) | |
2306 | return REG_ESPACE; | |
2307 | tmp_node = tre_ast_new_catenation(ctx->mem, n, result); | |
2308 | if (tmp_node == NULL) | |
2309 | return REG_ESPACE; | |
2310 | tmp_node->num_submatches = result->num_submatches; | |
2311 | result = tmp_node; | |
2312 | } | |
2313 | result->submatch_id = submatch_id; | |
2314 | if (submatch_id < SUBMATCH_ID_INVISIBLE_START) | |
2315 | result->num_submatches++; | |
2316 | break; | |
2317 | } | |
2318 | ||
2319 | default: | |
2320 | assert(0); | |
2321 | break; | |
2322 | } | |
2323 | } | |
2324 | ||
2325 | /* Check for missing closing parentheses. */ | |
2326 | if (depth > 0) | |
2327 | return REG_EPAREN; | |
2328 | ||
2329 | ctx->result = result; | |
2330 | ||
2331 | return REG_OK; | |
2332 | } | |
2333 | ||
2334 | /* EOF */ |