]> git.saurik.com Git - apple/libc.git/blame - regex/TRE/lib/regexec.c
Libc-1272.200.26.tar.gz
[apple/libc.git] / regex / TRE / lib / regexec.c
CommitLineData
ad3c9f2a
A
1/*
2 tre_regexec.c - TRE POSIX compatible matching functions (and more).
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7*/
8
9#ifdef HAVE_CONFIG_H
10#include <config.h>
11#endif /* HAVE_CONFIG_H */
12
2acb8998
A
13/* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
14 info while running */
15#undef TRE_USE_ALLOCA
16
ad3c9f2a
A
17#ifdef TRE_USE_ALLOCA
18/* AIX requires this to be the first thing in the file. */
19#ifndef __GNUC__
20# if HAVE_ALLOCA_H
21# include <alloca.h>
22# else
23# ifdef _AIX
24 #pragma alloca
25# else
26# ifndef alloca /* predefined by HP cc +Olibcalls */
27char *alloca ();
28# endif
29# endif
30# endif
31#endif
32#endif /* TRE_USE_ALLOCA */
33
34#include <assert.h>
35#include <stdlib.h>
36#include <string.h>
37#ifdef HAVE_WCHAR_H
38#include <wchar.h>
39#endif /* HAVE_WCHAR_H */
40#ifdef HAVE_WCTYPE_H
41#include <wctype.h>
42#endif /* HAVE_WCTYPE_H */
43#ifndef TRE_WCHAR
44#include <ctype.h>
45#endif /* !TRE_WCHAR */
46#ifdef HAVE_MALLOC_H
47#include <malloc.h>
48#endif /* HAVE_MALLOC_H */
49#include <limits.h>
50
51#include "tre-internal.h"
52#include "tre-match-utils.h"
53#include "tre.h"
54#include "xmalloc.h"
55
56
57/* For each tre_last_matched_t in the lm array, find the last matched branch by
58 comparing the touch value of the cmp_tag's. For all other branches, reset
59 the corresponding tags. If reset_all is non-zero, reset all tags in all
60 branches. Recurse into the nested last matched structures, clearing tags as
61 apprpriate. */
62static void
63tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm,
64 int n, int start_tag, int reset_all)
65{
66 int max, i, reset;
67 tre_last_matched_branch_t *b;
68
69 DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n",
70 n, start_tag, reset_all));
71 for (; n-- > 0; lm++)
72 {
73 if (lm->n_branches == 1)
74 {
75 b = lm->branches;
76 if (start_tag > 0)
77 {
78 DPRINT((" b->cmp_tag=%d %d <? %d\n", b->cmp_tag,
79 tre_tag_touch_get(tags, b->cmp_tag),
80 tre_tag_touch_get(tags, start_tag)));
81 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) <
82 tre_tag_touch_get(tags, start_tag));
83 }
84 else
85 reset = 0;
86
87 if (reset)
88 {
89 int *t;
90
91 for (i = b->n_tags, t = b->tags; i > 0; i--, t++)
92 {
93 DPRINT((" Resetting t%d\n", *t));
94 tre_tag_reset(tags, *t);
95 }
96 }
97 if (b->n_last_matched > 0)
98 tre_reset_last_matched_branches(tags, b->last_matched,
99 b->n_last_matched,
100 lm->start_tag, reset);
101 }
102 else
103 {
104 if (!reset_all)
105 {
106#ifdef TRE_DEBUG
107 int last;
108#endif /* TRE_DEBUG */
109 max = 0;
110 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
111 {
112 int t = b->cmp_tag;
113 int touch = tre_tag_touch_get(tags, t);
114 if (touch > max)
115 {
116 max = touch;
117#ifdef TRE_DEBUG
118 last = t;
119#endif /* TRE_DEBUG */
120 }
121 }
122 DPRINT((" Last touched end tag t%d=%d\n", last, max));
123 }
124
125 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
126 {
127 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max);
128 if (reset)
129 {
130 int j;
131 int *t;
132
133 for (j = b->n_tags, t = b->tags; j > 0; j--, t++)
134 {
135 DPRINT((" Resetting t%d\n", *t));
136 tre_tag_reset(tags, *t);
137 }
138 }
139 if (b->n_last_matched > 0)
140 tre_reset_last_matched_branches(tags, b->last_matched,
141 b->n_last_matched,
142 lm->start_tag, reset);
143 }
144 }
145 }
146}
147
148
149/* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
150 endpoint values. */
151reg_errcode_t
152tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
153 const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo)
154{
155 unsigned int i;
156
157 if (cflags & REG_NOSUB) return REG_OK;
158
159 i = 0;
160 if (match_eo >= 0 && intags)
161 {
162 const tre_tag_t *tags = intags;
163 tre_submatch_data_t *submatch_data;
164
165 if (tnfa->last_matched_branch &&
166 tnfa->last_matched_branch->n_last_matched > 0)
167 {
168 tre_tag_t *t;
169#ifdef TRE_USE_ALLOCA
170 t = alloca(sizeof(*t) * tnfa->num_tags);
171#else /* !TRE_USE_ALLOCA */
172 t = xmalloc(sizeof(*t) * tnfa->num_tags);
173#endif /* !TRE_USE_ALLOCA */
174 if (!t) return REG_ESPACE;
175 memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t));
176 tre_reset_last_matched_branches(t,
177 tnfa->last_matched_branch->last_matched,
178 tnfa->last_matched_branch->n_last_matched,
179 0, 0);
180 tags = t;
181 }
182 /* Construct submatch offsets from the tags. */
183 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
184 submatch_data = tnfa->submatch_data;
185 while (i < tnfa->num_submatches && i < nmatch)
186 {
187 if (submatch_data[i].so_tag == tnfa->end_tag)
188 pmatch[i].rm_so = match_eo;
189 else
190 pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag);
191
192 if (submatch_data[i].eo_tag == tnfa->end_tag)
193 pmatch[i].rm_eo = match_eo;
194 else
195 pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag);
196
197 /* If either of the endpoints were not used, this submatch
198 was not part of the match. */
199 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
200 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
201
202 DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i,
203 submatch_data[i].so_tag, pmatch[i].rm_so,
204 submatch_data[i].eo_tag, pmatch[i].rm_eo));
205 i++;
206 }
207#ifndef TRE_USE_ALLOCA
5f125488 208 if (tags != intags) xfree((void*)tags);
ad3c9f2a
A
209#endif /* !TRE_USE_ALLOCA */
210 }
211
212 while (i < nmatch)
213 {
214 pmatch[i].rm_so = -1;
215 pmatch[i].rm_eo = -1;
216 i++;
217 }
218
219 return REG_OK;
220}
221
222
223/*
224 Wrapper functions for POSIX compatible regexp matching.
225*/
226
227#ifndef __LIBC__
228int
229tre_have_backrefs(const regex_t *preg)
230{
231 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
232 return tnfa->have_backrefs;
233}
234
235#ifdef TRE_APPROX
236int
237tre_have_approx(const regex_t *preg)
238{
239 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
240 return tnfa->have_approx;
241}
242#endif /* TRE_APPROX */
243#endif /* !__LIBC__ */
244
245static int
246tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
247 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
248 int eflags)
249{
250 reg_errcode_t status;
251 tre_tag_t *tags = NULL;
252 int eo;
253 size_t offset = 0, count = 0;
ad3c9f2a
A
254
255 if (
256#ifdef TRE_STR_USER
257 type != STR_USER &&
258#endif /* TRE_STR_USER */
259 (eflags & REG_STARTEND) && pmatch)
260 {
261 if (pmatch->rm_so < 0)
262 return REG_INVARG;
263 if (len == (size_t)-1)
264 {
265 if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo)
266 return REG_INVARG;
267 len = pmatch->rm_eo - pmatch->rm_so;
268 }
269 count = offset = pmatch->rm_so;
270 if (type == STR_WIDE) offset *= sizeof(wchar_t);
271 }
272
5f125488
A
273 if (tnfa->num_tags > 0 && nmatch > 0)
274 {
275#ifdef TRE_USE_ALLOCA
276 tags = alloca(sizeof(*tags) * tnfa->num_tags);
277#else /* !TRE_USE_ALLOCA */
278 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
279#endif /* !TRE_USE_ALLOCA */
280 if (tags == NULL)
281 return REG_ESPACE;
282 }
283
ad3c9f2a
A
284 /* Dispatch to the appropriate matcher. */
285 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
286 {
287 /* The regex has back references, use the backtracking matcher. */
288#ifdef TRE_STR_USER
289 if (type == STR_USER)
290 {
291 const tre_str_source *source = string;
292 if (source->rewind == NULL || source->compare == NULL)
293 /* The backtracking matcher requires rewind and compare
294 capabilities from the input stream. */
5f125488
A
295 status = REG_BADPAT;
296 } else
ad3c9f2a
A
297#endif /* TRE_STR_USER */
298 status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type,
299 tags, eflags, &eo);
300 }
301#ifdef TRE_APPROX
302 else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
303 {
304 /* The regex uses approximate matching, use the approximate matcher. */
305 regamatch_t match;
306 regaparams_t params;
307 tre_regaparams_default(&params);
308 params.max_err = 0;
309 params.max_cost = 0;
310 status = tre_tnfa_run_approx(tnfa, string + offset, (int)len, type, tags,
311 &match, params, eflags, &eo);
312 }
313#endif /* TRE_APPROX */
314 else
315 {
316 /* Exact matching, no back references, use the parallel matcher. */
317 status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type,
318 tags, eflags, &eo);
319 }
320
321 if (status == REG_OK)
322 {
323 /* A match was found, so fill the submatch registers. */
324 status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
325 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
326 this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
327 tre_fill_pmatch itself). */
328 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
329#ifdef TRE_STR_USER
330 type != STR_USER &&
331#endif /* TRE_STR_USER */
332 (eflags & REG_STARTEND) && pmatch && nmatch > 0)
333 {
334 size_t i;
335 regmatch_t *p;
336 for (i = nmatch, p = pmatch; i > 0; p++, i--)
337 {
338 if (p->rm_so >= 0) p->rm_so += count;
339 if (p->rm_eo >= 0) p->rm_eo += count;
340 }
341 }
342 }
343#ifndef TRE_USE_ALLOCA
344 if (tags)
345 xfree(tags);
346#endif /* !TRE_USE_ALLOCA */
347 return status;
348}
349
350int
351tre_regnexec(const regex_t *preg, const char *str, size_t len,
352 size_t nmatch, regmatch_t pmatch[], int eflags)
353{
354 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
355 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
356
357#ifdef TRE_USE_SYSTEM_REGEX_H
358 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
359#endif /* TRE_USE_SYSTEM_REGEX_H */
360
361 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
362}
363
364int
365tre_regexec(const regex_t *preg, const char *str,
366 size_t nmatch, regmatch_t pmatch[], int eflags)
367{
368 return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
369}
370
371
372#ifdef TRE_WCHAR
373
374int
375tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
376 size_t nmatch, regmatch_t pmatch[], int eflags)
377{
378 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
379
380#ifdef TRE_USE_SYSTEM_REGEX_H
381 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
382#endif /* TRE_USE_SYSTEM_REGEX_H */
383
384 return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
385}
386
387int
388tre_regwexec(const regex_t *preg, const wchar_t *str,
389 size_t nmatch, regmatch_t pmatch[], int eflags)
390{
391 return tre_regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
392}
393
394#endif /* TRE_WCHAR */
395
396#ifdef TRE_STR_USER
397int
398tre_reguexec(const regex_t *preg, const tre_str_source *str,
399 size_t nmatch, regmatch_t pmatch[], int eflags)
400{
401 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
402
403#ifdef TRE_USE_SYSTEM_REGEX_H
404 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
405#endif /* TRE_USE_SYSTEM_REGEX_H */
406
407 return tre_match(tnfa, str, (size_t)-1, STR_USER, nmatch, pmatch, eflags);
408}
409#endif /* TRE_STR_USER */
410
411
412#ifdef TRE_APPROX
413
414/*
415 Wrapper functions for approximate regexp matching.
416*/
417
418static int
419tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
420 tre_str_type_t type, regamatch_t *match, regaparams_t params,
421 int eflags)
422{
423 reg_errcode_t status;
424 tre_tag_t *tags = NULL;
425 int eo;
426 size_t offset = 0, count = 0;
427
428 /* If the regexp does not use approximate matching features, the
429 maximum cost is zero, and the approximate matcher isn't forced,
430 use the exact matcher instead. */
431 if (params.max_cost == 0 && !tnfa->have_approx
432 && !(eflags & REG_APPROX_MATCHER))
433 return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
434 eflags);
435
436 /* Back references are not supported by the approximate matcher. */
437 if (tnfa->have_backrefs)
438 return REG_BADPAT;
439
440 if (tnfa->num_tags > 0 && match->nmatch > 0)
441 {
442#if TRE_USE_ALLOCA
443 tags = alloca(sizeof(*tags) * tnfa->num_tags);
444#else /* !TRE_USE_ALLOCA */
445 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
446#endif /* !TRE_USE_ALLOCA */
447 if (tags == NULL)
448 return REG_ESPACE;
449 }
450
451 if (
452#ifdef TRE_STR_USER
453 type != STR_USER &&
454#endif /* TRE_STR_USER */
455 (eflags & REG_STARTEND) && match->pmatch)
456 {
457 if (match->pmatch->rm_so < 0)
458 return REG_INVARG;
459 if (len == (size_t)-1)
460 {
461 if (match->pmatch->rm_eo < 0 || match->pmatch->rm_so >
462 match->pmatch->rm_eo)
463 return REG_INVARG;
464 len = match->pmatch->rm_eo - match->pmatch->rm_so;
465 }
466 count = offset = match->pmatch->rm_so;
467 if (type == STR_WIDE) offset *= sizeof(wchar_t);
468 }
469
470 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
471 match, params, eflags, &eo);
472 if (status == REG_OK)
473 {
474 status = tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags,
475 tnfa, tags, eo);
476 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
477 this into tre_fill_pmatch, because tre_tnfa_run_backtrack call
478 tre_fill_pmatch itself). */
479 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
480#ifdef TRE_STR_USER
481 type != STR_USER &&
482#endif /* TRE_STR_USER */
483 (eflags & REG_STARTEND) && match->pmatch && match->nmatch > 0)
484 {
485 size_t i;
486 regmatch_t *p;
487 for (i = match->nmatch, p = match->pmatch; i > 0; p++, i--)
488 {
489 if (p->rm_so >= 0) p->rm_so += count;
490 if (p->rm_eo >= 0) p->rm_eo += count;
491 }
492 }
493 }
494#ifndef TRE_USE_ALLOCA
495 if (tags)
496 xfree(tags);
497#endif /* !TRE_USE_ALLOCA */
498 return status;
499}
500
501int
502tre_reganexec(const regex_t *preg, const char *str, size_t len,
503 regamatch_t *match, regaparams_t params, int eflags)
504{
505 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
506 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
507
508#ifdef TRE_USE_SYSTEM_REGEX_H
509 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
510#endif /* TRE_USE_SYSTEM_REGEX_H */
511
512 return tre_match_approx(tnfa, str, len, type, match, params, eflags);
513}
514
515int
516tre_regaexec(const regex_t *preg, const char *str,
517 regamatch_t *match, regaparams_t params, int eflags)
518{
519 return tre_reganexec(preg, str, (size_t)-1, match, params, eflags);
520}
521
522#ifdef TRE_WCHAR
523
524int
525tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
526 regamatch_t *match, regaparams_t params, int eflags)
527{
528 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
529
530#ifdef TRE_USE_SYSTEM_REGEX_H
531 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
532#endif /* TRE_USE_SYSTEM_REGEX_H */
533
534 return tre_match_approx(tnfa, str, len, STR_WIDE,
535 match, params, eflags);
536}
537
538int
539tre_regawexec(const regex_t *preg, const wchar_t *str,
540 regamatch_t *match, regaparams_t params, int eflags)
541{
542 return tre_regawnexec(preg, str, (size_t)-1, match, params, eflags);
543}
544
545#endif /* TRE_WCHAR */
546
547void
548tre_regaparams_default(regaparams_t *params)
549{
550 memset(params, 0, sizeof(*params));
551 params->cost_ins = 1;
552 params->cost_del = 1;
553 params->cost_subst = 1;
554 params->max_cost = INT_MAX;
555 params->max_ins = INT_MAX;
556 params->max_del = INT_MAX;
557 params->max_subst = INT_MAX;
558 params->max_err = INT_MAX;
559}
560
561#endif /* TRE_APPROX */
562
563/* EOF */