]> git.saurik.com Git - apple/libc.git/blob - regex/TRE/lib/regexec.c
Libc-1081.1.3.tar.gz
[apple/libc.git] / regex / TRE / lib / regexec.c
1 /*
2 tre_regexec.c - TRE POSIX compatible matching functions (and more).
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif /* HAVE_CONFIG_H */
12
13 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
14 info while running */
15 #undef TRE_USE_ALLOCA
16
17 #ifdef TRE_USE_ALLOCA
18 /* AIX requires this to be the first thing in the file. */
19 #ifndef __GNUC__
20 # if HAVE_ALLOCA_H
21 # include <alloca.h>
22 # else
23 # ifdef _AIX
24 #pragma alloca
25 # else
26 # ifndef alloca /* predefined by HP cc +Olibcalls */
27 char *alloca ();
28 # endif
29 # endif
30 # endif
31 #endif
32 #endif /* TRE_USE_ALLOCA */
33
34 #include <assert.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #ifdef HAVE_WCHAR_H
38 #include <wchar.h>
39 #endif /* HAVE_WCHAR_H */
40 #ifdef HAVE_WCTYPE_H
41 #include <wctype.h>
42 #endif /* HAVE_WCTYPE_H */
43 #ifndef TRE_WCHAR
44 #include <ctype.h>
45 #endif /* !TRE_WCHAR */
46 #ifdef HAVE_MALLOC_H
47 #include <malloc.h>
48 #endif /* HAVE_MALLOC_H */
49 #include <limits.h>
50
51 #include "tre-internal.h"
52 #include "tre-match-utils.h"
53 #include "tre.h"
54 #include "xmalloc.h"
55
56
57 /* For each tre_last_matched_t in the lm array, find the last matched branch by
58 comparing the touch value of the cmp_tag's. For all other branches, reset
59 the corresponding tags. If reset_all is non-zero, reset all tags in all
60 branches. Recurse into the nested last matched structures, clearing tags as
61 apprpriate. */
62 static void
63 tre_reset_last_matched_branches(tre_tag_t *tags, const tre_last_matched_t *lm,
64 int n, int start_tag, int reset_all)
65 {
66 int max, i, reset;
67 tre_last_matched_branch_t *b;
68
69 DPRINT(("tre_reset_last_matched_branches: n=%d start_tag=%d reset_all=%d\n",
70 n, start_tag, reset_all));
71 for (; n-- > 0; lm++)
72 {
73 if (lm->n_branches == 1)
74 {
75 b = lm->branches;
76 if (start_tag > 0)
77 {
78 DPRINT((" b->cmp_tag=%d %d <? %d\n", b->cmp_tag,
79 tre_tag_touch_get(tags, b->cmp_tag),
80 tre_tag_touch_get(tags, start_tag)));
81 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) <
82 tre_tag_touch_get(tags, start_tag));
83 }
84 else
85 reset = 0;
86
87 if (reset)
88 {
89 int *t;
90
91 for (i = b->n_tags, t = b->tags; i > 0; i--, t++)
92 {
93 DPRINT((" Resetting t%d\n", *t));
94 tre_tag_reset(tags, *t);
95 }
96 }
97 if (b->n_last_matched > 0)
98 tre_reset_last_matched_branches(tags, b->last_matched,
99 b->n_last_matched,
100 lm->start_tag, reset);
101 }
102 else
103 {
104 if (!reset_all)
105 {
106 #ifdef TRE_DEBUG
107 int last;
108 #endif /* TRE_DEBUG */
109 max = 0;
110 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
111 {
112 int t = b->cmp_tag;
113 int touch = tre_tag_touch_get(tags, t);
114 if (touch > max)
115 {
116 max = touch;
117 #ifdef TRE_DEBUG
118 last = t;
119 #endif /* TRE_DEBUG */
120 }
121 }
122 DPRINT((" Last touched end tag t%d=%d\n", last, max));
123 }
124
125 for (i = lm->n_branches, b = lm->branches; i > 0; i--, b++)
126 {
127 reset = (reset_all || tre_tag_touch_get(tags, b->cmp_tag) < max);
128 if (reset)
129 {
130 int j;
131 int *t;
132
133 for (j = b->n_tags, t = b->tags; j > 0; j--, t++)
134 {
135 DPRINT((" Resetting t%d\n", *t));
136 tre_tag_reset(tags, *t);
137 }
138 }
139 if (b->n_last_matched > 0)
140 tre_reset_last_matched_branches(tags, b->last_matched,
141 b->n_last_matched,
142 lm->start_tag, reset);
143 }
144 }
145 }
146 }
147
148
149 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
150 endpoint values. */
151 reg_errcode_t
152 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
153 const tre_tnfa_t *tnfa, const tre_tag_t *intags, int match_eo)
154 {
155 unsigned int i;
156
157 if (cflags & REG_NOSUB) return REG_OK;
158
159 i = 0;
160 if (match_eo >= 0 && intags)
161 {
162 const tre_tag_t *tags = intags;
163 tre_submatch_data_t *submatch_data;
164
165 if (tnfa->last_matched_branch &&
166 tnfa->last_matched_branch->n_last_matched > 0)
167 {
168 tre_tag_t *t;
169 #ifdef TRE_USE_ALLOCA
170 t = alloca(sizeof(*t) * tnfa->num_tags);
171 #else /* !TRE_USE_ALLOCA */
172 t = xmalloc(sizeof(*t) * tnfa->num_tags);
173 #endif /* !TRE_USE_ALLOCA */
174 if (!t) return REG_ESPACE;
175 memcpy(t, intags, tnfa->num_tags * sizeof(tre_tag_t));
176 tre_reset_last_matched_branches(t,
177 tnfa->last_matched_branch->last_matched,
178 tnfa->last_matched_branch->n_last_matched,
179 0, 0);
180 tags = t;
181 }
182 /* Construct submatch offsets from the tags. */
183 DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
184 submatch_data = tnfa->submatch_data;
185 while (i < tnfa->num_submatches && i < nmatch)
186 {
187 if (submatch_data[i].so_tag == tnfa->end_tag)
188 pmatch[i].rm_so = match_eo;
189 else
190 pmatch[i].rm_so = tre_tag_get(tags, submatch_data[i].so_tag);
191
192 if (submatch_data[i].eo_tag == tnfa->end_tag)
193 pmatch[i].rm_eo = match_eo;
194 else
195 pmatch[i].rm_eo = tre_tag_get(tags, submatch_data[i].eo_tag);
196
197 /* If either of the endpoints were not used, this submatch
198 was not part of the match. */
199 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
200 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
201
202 DPRINT(("pmatch[%d] = {t%d = %qd, t%d = %qd}\n", i,
203 submatch_data[i].so_tag, pmatch[i].rm_so,
204 submatch_data[i].eo_tag, pmatch[i].rm_eo));
205 i++;
206 }
207 #ifndef TRE_USE_ALLOCA
208 if (tags != intags) xfree((void*)tags);
209 #endif /* !TRE_USE_ALLOCA */
210 }
211
212 while (i < nmatch)
213 {
214 pmatch[i].rm_so = -1;
215 pmatch[i].rm_eo = -1;
216 i++;
217 }
218
219 return REG_OK;
220 }
221
222
223 /*
224 Wrapper functions for POSIX compatible regexp matching.
225 */
226
227 #ifndef __LIBC__
228 int
229 tre_have_backrefs(const regex_t *preg)
230 {
231 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
232 return tnfa->have_backrefs;
233 }
234
235 #ifdef TRE_APPROX
236 int
237 tre_have_approx(const regex_t *preg)
238 {
239 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
240 return tnfa->have_approx;
241 }
242 #endif /* TRE_APPROX */
243 #endif /* !__LIBC__ */
244
245 static int
246 tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
247 tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
248 int eflags)
249 {
250 reg_errcode_t status;
251 tre_tag_t *tags = NULL;
252 int eo;
253 size_t offset = 0, count = 0;
254
255 if (
256 #ifdef TRE_STR_USER
257 type != STR_USER &&
258 #endif /* TRE_STR_USER */
259 (eflags & REG_STARTEND) && pmatch)
260 {
261 if (pmatch->rm_so < 0)
262 return REG_INVARG;
263 if (len == (size_t)-1)
264 {
265 if (pmatch->rm_eo < 0 || pmatch->rm_so > pmatch->rm_eo)
266 return REG_INVARG;
267 len = pmatch->rm_eo - pmatch->rm_so;
268 }
269 count = offset = pmatch->rm_so;
270 if (type == STR_WIDE) offset *= sizeof(wchar_t);
271 }
272
273 if (tnfa->num_tags > 0 && nmatch > 0)
274 {
275 #ifdef TRE_USE_ALLOCA
276 tags = alloca(sizeof(*tags) * tnfa->num_tags);
277 #else /* !TRE_USE_ALLOCA */
278 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
279 #endif /* !TRE_USE_ALLOCA */
280 if (tags == NULL)
281 return REG_ESPACE;
282 }
283
284 /* Dispatch to the appropriate matcher. */
285 if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
286 {
287 /* The regex has back references, use the backtracking matcher. */
288 #ifdef TRE_STR_USER
289 if (type == STR_USER)
290 {
291 const tre_str_source *source = string;
292 if (source->rewind == NULL || source->compare == NULL)
293 /* The backtracking matcher requires rewind and compare
294 capabilities from the input stream. */
295 status = REG_BADPAT;
296 } else
297 #endif /* TRE_STR_USER */
298 status = tre_tnfa_run_backtrack(tnfa, string + offset, (int)len, type,
299 tags, eflags, &eo);
300 }
301 #ifdef TRE_APPROX
302 else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
303 {
304 /* The regex uses approximate matching, use the approximate matcher. */
305 regamatch_t match;
306 regaparams_t params;
307 tre_regaparams_default(&params);
308 params.max_err = 0;
309 params.max_cost = 0;
310 status = tre_tnfa_run_approx(tnfa, string + offset, (int)len, type, tags,
311 &match, params, eflags, &eo);
312 }
313 #endif /* TRE_APPROX */
314 else
315 {
316 /* Exact matching, no back references, use the parallel matcher. */
317 status = tre_tnfa_run_parallel(tnfa, string + offset, (int)len, type,
318 tags, eflags, &eo);
319 }
320
321 if (status == REG_OK)
322 {
323 /* A match was found, so fill the submatch registers. */
324 status = tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
325 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
326 this into tre_fill_pmatch, because tre_tnfa_run_backtrack calls
327 tre_fill_pmatch itself). */
328 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
329 #ifdef TRE_STR_USER
330 type != STR_USER &&
331 #endif /* TRE_STR_USER */
332 (eflags & REG_STARTEND) && pmatch && nmatch > 0)
333 {
334 size_t i;
335 regmatch_t *p;
336 for (i = nmatch, p = pmatch; i > 0; p++, i--)
337 {
338 if (p->rm_so >= 0) p->rm_so += count;
339 if (p->rm_eo >= 0) p->rm_eo += count;
340 }
341 }
342 }
343 #ifndef TRE_USE_ALLOCA
344 if (tags)
345 xfree(tags);
346 #endif /* !TRE_USE_ALLOCA */
347 return status;
348 }
349
350 int
351 tre_regnexec(const regex_t *preg, const char *str, size_t len,
352 size_t nmatch, regmatch_t pmatch[], int eflags)
353 {
354 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
355 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
356
357 #ifdef TRE_USE_SYSTEM_REGEX_H
358 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
359 #endif /* TRE_USE_SYSTEM_REGEX_H */
360
361 return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags);
362 }
363
364 int
365 tre_regexec(const regex_t *preg, const char *str,
366 size_t nmatch, regmatch_t pmatch[], int eflags)
367 {
368 return tre_regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
369 }
370
371
372 #ifdef TRE_WCHAR
373
374 int
375 tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
376 size_t nmatch, regmatch_t pmatch[], int eflags)
377 {
378 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
379
380 #ifdef TRE_USE_SYSTEM_REGEX_H
381 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
382 #endif /* TRE_USE_SYSTEM_REGEX_H */
383
384 return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
385 }
386
387 int
388 tre_regwexec(const regex_t *preg, const wchar_t *str,
389 size_t nmatch, regmatch_t pmatch[], int eflags)
390 {
391 return tre_regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
392 }
393
394 #endif /* TRE_WCHAR */
395
396 #ifdef TRE_STR_USER
397 int
398 tre_reguexec(const regex_t *preg, const tre_str_source *str,
399 size_t nmatch, regmatch_t pmatch[], int eflags)
400 {
401 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
402
403 #ifdef TRE_USE_SYSTEM_REGEX_H
404 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
405 #endif /* TRE_USE_SYSTEM_REGEX_H */
406
407 return tre_match(tnfa, str, (size_t)-1, STR_USER, nmatch, pmatch, eflags);
408 }
409 #endif /* TRE_STR_USER */
410
411
412 #ifdef TRE_APPROX
413
414 /*
415 Wrapper functions for approximate regexp matching.
416 */
417
418 static int
419 tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
420 tre_str_type_t type, regamatch_t *match, regaparams_t params,
421 int eflags)
422 {
423 reg_errcode_t status;
424 tre_tag_t *tags = NULL;
425 int eo;
426 size_t offset = 0, count = 0;
427
428 /* If the regexp does not use approximate matching features, the
429 maximum cost is zero, and the approximate matcher isn't forced,
430 use the exact matcher instead. */
431 if (params.max_cost == 0 && !tnfa->have_approx
432 && !(eflags & REG_APPROX_MATCHER))
433 return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
434 eflags);
435
436 /* Back references are not supported by the approximate matcher. */
437 if (tnfa->have_backrefs)
438 return REG_BADPAT;
439
440 if (tnfa->num_tags > 0 && match->nmatch > 0)
441 {
442 #if TRE_USE_ALLOCA
443 tags = alloca(sizeof(*tags) * tnfa->num_tags);
444 #else /* !TRE_USE_ALLOCA */
445 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
446 #endif /* !TRE_USE_ALLOCA */
447 if (tags == NULL)
448 return REG_ESPACE;
449 }
450
451 if (
452 #ifdef TRE_STR_USER
453 type != STR_USER &&
454 #endif /* TRE_STR_USER */
455 (eflags & REG_STARTEND) && match->pmatch)
456 {
457 if (match->pmatch->rm_so < 0)
458 return REG_INVARG;
459 if (len == (size_t)-1)
460 {
461 if (match->pmatch->rm_eo < 0 || match->pmatch->rm_so >
462 match->pmatch->rm_eo)
463 return REG_INVARG;
464 len = match->pmatch->rm_eo - match->pmatch->rm_so;
465 }
466 count = offset = match->pmatch->rm_so;
467 if (type == STR_WIDE) offset *= sizeof(wchar_t);
468 }
469
470 status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags,
471 match, params, eflags, &eo);
472 if (status == REG_OK)
473 {
474 status = tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags,
475 tnfa, tags, eo);
476 /* If doing REG_STARTEND, adjust the pmatch array (we can't build
477 this into tre_fill_pmatch, because tre_tnfa_run_backtrack call
478 tre_fill_pmatch itself). */
479 if (status == REG_OK && !(tnfa->cflags & REG_NOSUB) &&
480 #ifdef TRE_STR_USER
481 type != STR_USER &&
482 #endif /* TRE_STR_USER */
483 (eflags & REG_STARTEND) && match->pmatch && match->nmatch > 0)
484 {
485 size_t i;
486 regmatch_t *p;
487 for (i = match->nmatch, p = match->pmatch; i > 0; p++, i--)
488 {
489 if (p->rm_so >= 0) p->rm_so += count;
490 if (p->rm_eo >= 0) p->rm_eo += count;
491 }
492 }
493 }
494 #ifndef TRE_USE_ALLOCA
495 if (tags)
496 xfree(tags);
497 #endif /* !TRE_USE_ALLOCA */
498 return status;
499 }
500
501 int
502 tre_reganexec(const regex_t *preg, const char *str, size_t len,
503 regamatch_t *match, regaparams_t params, int eflags)
504 {
505 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
506 tre_str_type_t type = (TRE_MB_CUR_MAX_L(tnfa->loc) == 1) ? STR_BYTE : STR_MBS;
507
508 #ifdef TRE_USE_SYSTEM_REGEX_H
509 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
510 #endif /* TRE_USE_SYSTEM_REGEX_H */
511
512 return tre_match_approx(tnfa, str, len, type, match, params, eflags);
513 }
514
515 int
516 tre_regaexec(const regex_t *preg, const char *str,
517 regamatch_t *match, regaparams_t params, int eflags)
518 {
519 return tre_reganexec(preg, str, (size_t)-1, match, params, eflags);
520 }
521
522 #ifdef TRE_WCHAR
523
524 int
525 tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
526 regamatch_t *match, regaparams_t params, int eflags)
527 {
528 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
529
530 #ifdef TRE_USE_SYSTEM_REGEX_H
531 if (preg->re_magic != RE_MAGIC) return REG_BADPAT;
532 #endif /* TRE_USE_SYSTEM_REGEX_H */
533
534 return tre_match_approx(tnfa, str, len, STR_WIDE,
535 match, params, eflags);
536 }
537
538 int
539 tre_regawexec(const regex_t *preg, const wchar_t *str,
540 regamatch_t *match, regaparams_t params, int eflags)
541 {
542 return tre_regawnexec(preg, str, (size_t)-1, match, params, eflags);
543 }
544
545 #endif /* TRE_WCHAR */
546
547 void
548 tre_regaparams_default(regaparams_t *params)
549 {
550 memset(params, 0, sizeof(*params));
551 params->cost_ins = 1;
552 params->cost_del = 1;
553 params->cost_subst = 1;
554 params->max_cost = INT_MAX;
555 params->max_ins = INT_MAX;
556 params->max_del = INT_MAX;
557 params->max_subst = INT_MAX;
558 params->max_err = INT_MAX;
559 }
560
561 #endif /* TRE_APPROX */
562
563 /* EOF */