]> git.saurik.com Git - apple/libc.git/blob - regex/TRE/lib/regexec.c
6823f72e22ee9900513e1793d7dc3464a55adec3
[apple/libc.git] / regex / TRE / lib / regexec.c
1 /*
2 tre_regexec.c - TRE POSIX compatible matching functions (and more).
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif /* HAVE_CONFIG_H */
12
13 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
14 info while running */
15 #undef TRE_USE_ALLOCA
16
17 #ifdef TRE_USE_ALLOCA
18 /* AIX requires this to be the first thing in the file. */
19 #ifndef __GNUC__
20 # if HAVE_ALLOCA_H
21 # include <alloca.h>
22 # else
23 # ifdef _AIX
24 #pragma alloca
25 # else
26 # ifndef alloca /* predefined by HP cc +Olibcalls */
27 char *alloca ();
28 # endif
29 # endif
30 # endif
31 #endif
32 #endif /* TRE_USE_ALLOCA */
33
34 #include <assert.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #ifdef HAVE_WCHAR_H
38 #include <wchar.h>
39 #endif /* HAVE_WCHAR_H */
40 #ifdef HAVE_WCTYPE_H
41 #include <wctype.h>
42 #endif /* HAVE_WCTYPE_H */
43 #ifndef TRE_WCHAR
44 #include <ctype.h>
45 #endif /* !TRE_WCHAR */
46 #ifdef HAVE_MALLOC_H
47 #include <malloc.h>
48 #endif /* HAVE_MALLOC_H */
49 #include <limits.h>
50
51 #include "tre-internal.h"
52 #include "tre-match-utils.h"
53 #include "tre.h"
54 #include "xmalloc.h"
55
56
57 /* For each tre_last_matched_t in the lm array, find the last matched branch by
58 comparing the touch value of the cmp_tag's. For all other branches, reset
59 the corresponding tags. If reset_all is non-zero, reset all tags in all
60 branches. Recurse into the nested last matched structures, clearing tags as
61 apprpriate. */
62 static void
63 tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm,
64 int n, int start_tag, int reset_all)
65 {
66 int max, i, reset;
67 tre_last_matched_branch_t *b;
68
69 DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n",
70 n, start_tag, reset_all));
71 for (; n-- > 0; lm++)
72 {
73 if (lm->n_branches == 1)
74 {
75 b = lm->branches;
76 if (start_tag > 0)
77 {
78 DPRINT((" b->cmp_tag=%d %d <? %d\n", b->cmp_tag,
79 tre_tag_touch_get(tags, b->cmp_tag),
80 tre_tag_touch_get(tags, start_tag)));
81 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) <
82 tre_tag_touch_get(tags, start_tag));
83 }
84 else
85 reset = 0;
86
87 if (reset)
88 {
89 int *t;
90
91 for (i = b->n_tags, t = b->tags; i > 0; i--, t++)
92 {
93 DPRINT((" Resetting t%d\n", *t));
94 tre_tag_reset(tags, *t);
95 }
96 }
97 if (b->n_last_matched > 0)
98 tre_reset_last_matched_branches(tags, b->last_matched,
99 b->n_last_matched,
100 lm->start_tag, reset);
101 }
102 else
103 {
104 if (!reset_all)
105 {
106 #ifdef TRE_DEBUG
107 int last;
108 #endif /* TRE_DEBUG */
109 max = 0;
110 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
111 {
112 int t = b->cmp_tag;
113 int touch = tre_tag_touch_get(tags, t);
114 if (touch > max)
115 {
116 max = touch;
117 #ifdef TRE_DEBUG
118 last = t;
119 #endif /* TRE_DEBUG */
120 }
121 }
122 DPRINT((" Last touched end tag t%d=%d\n", last, max));
123 }
124
125 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
126 {
127 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max);
128 if (reset)
129 {
130 int j;
131 int *t;
132
133 for (j = b->n_tags, t = b->tags; j > 0; j--, t++)
134 {
135 DPRINT((" Resetting t%d\n", *t));
136 tre_tag_reset(tags, *t);
137 }
138 }
139 if (b->n_last_matched > 0)
140 tre_reset_last_matched_branches(tags, b->last_matched,
141 b->n_last_matched,
142 lm->start_tag, reset);
143 }
144 }
145 }
146 }
147
148
149 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
150 endpoint values. */
151 reg_errcode_t
152 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
153 const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo)
154 {
155 unsigned int i;
156
157 if (cflags & REG_NOSUB) return REG_OK;
158
159 i = 0;
160 if (match_eo >= 0 && intags)
161 {
162 const tre_tag_t *tags = intags;
163 tre_submatch_data_t *submatch_data;
164
165 if (tnfa->last_matched_branch &&
166 tnfa->last_matched_branch->n_last_matched > 0)
167 {
168 tre_tag_t *t;
169 #ifdef TRE_USE_ALLOCA
170 t = alloca(sizeof(*t) * tnfa->num_tags);
171 #else /* !TRE_USE_ALLOCA */
172 t = xmalloc(sizeof(*t) * tnfa->num_tags);
173 #endif /* !TRE_USE_ALLOCA */
174 if (!t) return REG_ESPACE;
175 memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t));
176 tre_reset_last_matched_branches(t,
177 tnfa->last_matched_branch->last_matched,
178 tnfa->last_matched_branch->n_last_matched,
179 0, 0);
180 tags = t;
181 }
182 /* Construct submatch offsets from the tags. */
183 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
184 submatch_data = tnfa->submatch_data;
185 while (i < tnfa->num_submatches && i < nmatch)
186 {
187 if (submatch_data[i].so_tag == tnfa->end_tag)
188 pmatch[i].rm_so = match_eo;
189 else
190 pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag);
191
192 if (submatch_data[i].eo_tag == tnfa->end_tag)
193 pmatch[i].rm_eo = match_eo;
194 else
195 pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag);
196
197 /* If either of the endpoints were not used, this submatch
198 was not part of the match. */
199 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
200 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
201
202 DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i,
203 submatch_data[i].so_tag, pmatch[i].rm_so,
204 submatch_data[i].eo_tag, pmatch[i].rm_eo));
205 i++;
206 }
207 #ifndef TRE_USE_ALLOCA
208 if (tags != intags) xfree(tags);
209 #endif /* !TRE_USE_ALLOCA */
210 }
211
212 while (i < nmatch)
213 {
214 pmatch[i].rm_so = -1;
215 pmatch[i].rm_eo = -1;
216 i++;
217 }
218
219 return REG_OK;
220 }
221
222
223 /*
224 Wrapper functions for POSIX compatible regexp matching.
225 */
226
227 #ifndef __LIBC__
228 int
229 tre_have_backrefs(const regex_t *preg)
230 {
231 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
232 return tnfa->have_backrefs;
233 }
234
235 #ifdef TRE_APPROX
236 int
237 tre_have_approx(const regex_t *preg)
238 {
239 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
240 return tnfa->have_approx;
241 }
242 #endif /* TRE_APPROX */
243 #endif /* !__LIBC__ */
244
245 static int
246 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
247 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
248 int eflags)
249 {
250 reg_errcode_t status;
251 tre_tag_t *tags = NULL;
252 int eo;
253 size_t offset = 0, count = 0;
254 if (tnfa->num_tags > 0 && nmatch > 0)
255 {
256 #ifdef TRE_USE_ALLOCA
257 tags = alloca(sizeof(*tags) * tnfa->num_tags);
258 #else /* !TRE_USE_ALLOCA */
259 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
260 #endif /* !TRE_USE_ALLOCA */
261 if (tags == NULL)
262 return REG_ESPACE;
263 }
264
265 if (
266 #ifdef TRE_STR_USER
267 type != STR_USER &&
268 #endif /* TRE_STR_USER */
269 (eflags & REG_STARTEND) && pmatch)
270 {
271 if (pmatch->rm_so < 0)
272 return REG_INVARG;
273 if (len == (size_t)-1)
274 {
275 if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo)
276 return REG_INVARG;
277 len = pmatch->rm_eo - pmatch->rm_so;
278 }
279 count = offset = pmatch->rm_so;
280 if (type == STR_WIDE) offset *= sizeof(wchar_t);
281 }
282
283 /* Dispatch to the appropriate matcher. */
284 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
285 {
286 /* The regex has back references, use the backtracking matcher. */
287 #ifdef TRE_STR_USER
288 if (type == STR_USER)
289 {
290 const tre_str_source *source = string;
291 if (source->rewind == NULL || source->compare == NULL)
292 /* The backtracking matcher requires rewind and compare
293 capabilities from the input stream. */
294 return REG_BADPAT;
295 }
296 #endif /* TRE_STR_USER */
297 status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type,
298 tags, eflags, &eo);
299 }
300 #ifdef TRE_APPROX
301 else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
302 {
303 /* The regex uses approximate matching, use the approximate matcher. */
304 regamatch_t match;
305 regaparams_t params;
306 tre_regaparams_default(&params);
307 params.max_err = 0;
308 params.max_cost = 0;
309 status = tre_tnfa_run_approx(tnfa, string + offset, (int)len, type, tags,
310 &match, params, eflags, &eo);
311 }
312 #endif /* TRE_APPROX */
313 else
314 {
315 /* Exact matching, no back references, use the parallel matcher. */
316 status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type,
317 tags, eflags, &eo);
318 }
319
320 if (status == REG_OK)
321 {
322 /* A match was found, so fill the submatch registers. */
323 status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
324 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
325 this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
326 tre_fill_pmatch itself). */
327 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
328 #ifdef TRE_STR_USER
329 type != STR_USER &&
330 #endif /* TRE_STR_USER */
331 (eflags & REG_STARTEND) && pmatch && nmatch > 0)
332 {
333 size_t i;
334 regmatch_t *p;
335 for (i = nmatch, p = pmatch; i > 0; p++, i--)
336 {
337 if (p->rm_so >= 0) p->rm_so += count;
338 if (p->rm_eo >= 0) p->rm_eo += count;
339 }
340 }
341 }
342 #ifndef TRE_USE_ALLOCA
343 if (tags)
344 xfree(tags);
345 #endif /* !TRE_USE_ALLOCA */
346 return status;
347 }
348
349 int
350 tre_regnexec(const regex_t *preg, const char *str, size_t len,
351 size_t nmatch, regmatch_t pmatch[], int eflags)
352 {
353 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
354 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
355
356 #ifdef TRE_USE_SYSTEM_REGEX_H
357 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
358 #endif /* TRE_USE_SYSTEM_REGEX_H */
359
360 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
361 }
362
363 int
364 tre_regexec(const regex_t *preg, const char *str,
365 size_t nmatch, regmatch_t pmatch[], int eflags)
366 {
367 return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
368 }
369
370
371 #ifdef TRE_WCHAR
372
373 int
374 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
375 size_t nmatch, regmatch_t pmatch[], int eflags)
376 {
377 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
378
379 #ifdef TRE_USE_SYSTEM_REGEX_H
380 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
381 #endif /* TRE_USE_SYSTEM_REGEX_H */
382
383 return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
384 }
385
386 int
387 tre_regwexec(const regex_t *preg, const wchar_t *str,
388 size_t nmatch, regmatch_t pmatch[], int eflags)
389 {
390 return tre_regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
391 }
392
393 #endif /* TRE_WCHAR */
394
395 #ifdef TRE_STR_USER
396 int
397 tre_reguexec(const regex_t *preg, const tre_str_source *str,
398 size_t nmatch, regmatch_t pmatch[], int eflags)
399 {
400 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
401
402 #ifdef TRE_USE_SYSTEM_REGEX_H
403 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
404 #endif /* TRE_USE_SYSTEM_REGEX_H */
405
406 return tre_match(tnfa, str, (size_t)-1, STR_USER, nmatch, pmatch, eflags);
407 }
408 #endif /* TRE_STR_USER */
409
410
411 #ifdef TRE_APPROX
412
413 /*
414 Wrapper functions for approximate regexp matching.
415 */
416
417 static int
418 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
419 tre_str_type_t type, regamatch_t *match, regaparams_t params,
420 int eflags)
421 {
422 reg_errcode_t status;
423 tre_tag_t *tags = NULL;
424 int eo;
425 size_t offset = 0, count = 0;
426
427 /* If the regexp does not use approximate matching features, the
428 maximum cost is zero, and the approximate matcher isn't forced,
429 use the exact matcher instead. */
430 if (params.max_cost == 0 && !tnfa->have_approx
431 && !(eflags & REG_APPROX_MATCHER))
432 return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
433 eflags);
434
435 /* Back references are not supported by the approximate matcher. */
436 if (tnfa->have_backrefs)
437 return REG_BADPAT;
438
439 if (tnfa->num_tags > 0 && match->nmatch > 0)
440 {
441 #if TRE_USE_ALLOCA
442 tags = alloca(sizeof(*tags) * tnfa->num_tags);
443 #else /* !TRE_USE_ALLOCA */
444 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
445 #endif /* !TRE_USE_ALLOCA */
446 if (tags == NULL)
447 return REG_ESPACE;
448 }
449
450 if (
451 #ifdef TRE_STR_USER
452 type != STR_USER &&
453 #endif /* TRE_STR_USER */
454 (eflags & REG_STARTEND) && match->pmatch)
455 {
456 if (match->pmatch->rm_so < 0)
457 return REG_INVARG;
458 if (len == (size_t)-1)
459 {
460 if (match->pmatch->rm_eo < 0 || match->pmatch->rm_so >
461 match->pmatch->rm_eo)
462 return REG_INVARG;
463 len = match->pmatch->rm_eo - match->pmatch->rm_so;
464 }
465 count = offset = match->pmatch->rm_so;
466 if (type == STR_WIDE) offset *= sizeof(wchar_t);
467 }
468
469 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
470 match, params, eflags, &eo);
471 if (status == REG_OK)
472 {
473 status = tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags,
474 tnfa, tags, eo);
475 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
476 this into tre_fill_pmatch, because tre_tnfa_run_backtrack call
477 tre_fill_pmatch itself). */
478 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
479 #ifdef TRE_STR_USER
480 type != STR_USER &&
481 #endif /* TRE_STR_USER */
482 (eflags & REG_STARTEND) && match->pmatch && match->nmatch > 0)
483 {
484 size_t i;
485 regmatch_t *p;
486 for (i = match->nmatch, p = match->pmatch; i > 0; p++, i--)
487 {
488 if (p->rm_so >= 0) p->rm_so += count;
489 if (p->rm_eo >= 0) p->rm_eo += count;
490 }
491 }
492 }
493 #ifndef TRE_USE_ALLOCA
494 if (tags)
495 xfree(tags);
496 #endif /* !TRE_USE_ALLOCA */
497 return status;
498 }
499
500 int
501 tre_reganexec(const regex_t *preg, const char *str, size_t len,
502 regamatch_t *match, regaparams_t params, int eflags)
503 {
504 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
505 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
506
507 #ifdef TRE_USE_SYSTEM_REGEX_H
508 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
509 #endif /* TRE_USE_SYSTEM_REGEX_H */
510
511 return tre_match_approx(tnfa, str, len, type, match, params, eflags);
512 }
513
514 int
515 tre_regaexec(const regex_t *preg, const char *str,
516 regamatch_t *match, regaparams_t params, int eflags)
517 {
518 return tre_reganexec(preg, str, (size_t)-1, match, params, eflags);
519 }
520
521 #ifdef TRE_WCHAR
522
523 int
524 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
525 regamatch_t *match, regaparams_t params, int eflags)
526 {
527 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
528
529 #ifdef TRE_USE_SYSTEM_REGEX_H
530 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
531 #endif /* TRE_USE_SYSTEM_REGEX_H */
532
533 return tre_match_approx(tnfa, str, len, STR_WIDE,
534 match, params, eflags);
535 }
536
537 int
538 tre_regawexec(const regex_t *preg, const wchar_t *str,
539 regamatch_t *match, regaparams_t params, int eflags)
540 {
541 return tre_regawnexec(preg, str, (size_t)-1, match, params, eflags);
542 }
543
544 #endif /* TRE_WCHAR */
545
546 void
547 tre_regaparams_default(regaparams_t *params)
548 {
549 memset(params, 0, sizeof(*params));
550 params->cost_ins = 1;
551 params->cost_del = 1;
552 params->cost_subst = 1;
553 params->max_cost = INT_MAX;
554 params->max_ins = INT_MAX;
555 params->max_del = INT_MAX;
556 params->max_subst = INT_MAX;
557 params->max_err = INT_MAX;
558 }
559
560 #endif /* TRE_APPROX */
561
562 /* EOF */