]> git.saurik.com Git - apple/libc.git/blob - regex/TRE/lib/tre-match-parallel.c
Libc-825.40.1.tar.gz
[apple/libc.git] / regex / TRE / lib / tre-match-parallel.c
1 /*
2 tre-match-parallel.c - TRE parallel regex matching engine
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10 This algorithm searches for matches basically by reading characters
11 in the searched string one by one, starting at the beginning. All
12 matching paths in the TNFA are traversed in parallel. When two or
13 more paths reach the same state, exactly one is chosen according to
14 tag ordering rules; if returning submatches is not required it does
15 not matter which path is chosen.
16
17 The worst case time required for finding the leftmost and longest
18 match, or determining that there is no match, is always linearly
19 dependent on the length of the text being searched.
20
21 This algorithm cannot handle TNFAs with back referencing nodes.
22 See `tre-match-backtrack.c'.
23 */
24
25
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif /* HAVE_CONFIG_H */
29
30 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
31 info while running */
32 #undef TRE_USE_ALLOCA
33
34 #ifdef TRE_USE_ALLOCA
35 /* AIX requires this to be the first thing in the file. */
36 #ifndef __GNUC__
37 # if HAVE_ALLOCA_H
38 # include <alloca.h>
39 # else
40 # ifdef _AIX
41 #pragma alloca
42 # else
43 # ifndef alloca /* predefined by HP cc +Olibcalls */
44 char *alloca ();
45 # endif
46 # endif
47 # endif
48 #endif
49 #endif /* TRE_USE_ALLOCA */
50
51 #include <assert.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #ifdef HAVE_WCHAR_H
55 #include <wchar.h>
56 #endif /* HAVE_WCHAR_H */
57 #ifdef HAVE_WCTYPE_H
58 #include <wctype.h>
59 #endif /* HAVE_WCTYPE_H */
60 #ifndef TRE_WCHAR
61 #include <ctype.h>
62 #endif /* !TRE_WCHAR */
63 #ifdef HAVE_MALLOC_H
64 #include <malloc.h>
65 #endif /* HAVE_MALLOC_H */
66
67 #include "tre-internal.h"
68 #include "tre-match-utils.h"
69 #include "tre.h"
70 #include "xmalloc.h"
71
72
73
74 typedef struct {
75 tre_tnfa_transition_t *state;
76 tre_tag_t *tags;
77 } tre_tnfa_reach_t;
78
79 typedef struct {
80 int pos;
81 tre_tag_t **tags;
82 } tre_reach_pos_t;
83
84
85 #ifdef TRE_DEBUG
86 static void
87 tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags)
88 {
89 DPRINT((" %p", (void *)state));
90 if (num_tags > 0)
91 {
92 DPRINT(("/"));
93 tre_print_tags(tags, num_tags);
94 }
95 }
96
97 static void
98 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
99 {
100 while (reach->state != NULL)
101 {
102 tre_print_reach1(reach->state, reach->tags, num_tags);
103 reach++;
104 }
105 DPRINT(("\n"));
106
107 }
108 #endif /* TRE_DEBUG */
109
110 reg_errcode_t
111 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
112 tre_str_type_t type, tre_tag_t *match_tags, int eflags,
113 int *match_end_ofs)
114 {
115 /* State variables required by GET_NEXT_WCHAR. */
116 tre_char_t prev_c = 0, next_c = 0;
117 const char *str_byte = string;
118 int pos = -1;
119 unsigned int pos_add_next = 1;
120 #ifdef TRE_WCHAR
121 const wchar_t *str_wide = string;
122 #ifdef TRE_MBSTATE
123 mbstate_t mbstate;
124 #endif /* TRE_MBSTATE */
125 #endif /* TRE_WCHAR */
126 int reg_notbol = eflags & REG_NOTBOL;
127 int reg_noteol = eflags & REG_NOTEOL;
128 int reg_newline = tnfa->cflags & REG_NEWLINE;
129 #ifdef TRE_STR_USER
130 int str_user_end = 0;
131 #endif /* TRE_STR_USER */
132
133 char *buf;
134 tre_tnfa_transition_t *trans_i;
135 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
136 tre_reach_pos_t *reach_pos;
137 int *tag_i;
138 int num_tags, i;
139
140 int match_eo = -1; /* end offset of match (-1 if no match found yet) */
141 #ifdef TRE_DEBUG
142 int once;
143 #endif /* TRE_DEBUG */
144 tre_tag_t *tmp_tags = NULL;
145 tre_tag_t *tmp_iptr;
146 int tbytes;
147 int touch = 1;
148
149 #ifdef TRE_MBSTATE
150 memset(&mbstate, '\0', sizeof(mbstate));
151 #endif /* TRE_MBSTATE */
152
153 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
154
155 if (!match_tags)
156 num_tags = 0;
157 else
158 num_tags = tnfa->num_tags;
159
160 /* Allocate memory for temporary data required for matching. This needs to
161 be done for every matching operation to be thread safe. This allocates
162 everything in a single large block from the stack frame using alloca()
163 or with malloc() if alloca is unavailable. */
164 {
165 int rbytes, pbytes, total_bytes;
166 char *tmp_buf;
167 /* Compute the length of the block we need. */
168 tbytes = sizeof(*tmp_tags) * num_tags;
169 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
170 pbytes = sizeof(*reach_pos) * tnfa->num_states;
171 total_bytes =
172 (sizeof(long) - 1) * 4 /* for alignment paddings */
173 + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes;
174
175 DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes));
176 /* Allocate the memory. */
177 #ifdef TRE_USE_ALLOCA
178 buf = alloca(total_bytes);
179 #else /* !TRE_USE_ALLOCA */
180 buf = xmalloc((unsigned)total_bytes);
181 #endif /* !TRE_USE_ALLOCA */
182 if (buf == NULL)
183 return REG_ESPACE;
184 memset(buf, 0, (size_t)total_bytes);
185
186 /* Get the various pointers within tmp_buf (properly aligned). */
187 tmp_tags = (void *)buf;
188 tmp_buf = buf + tbytes;
189 tmp_buf += ALIGN(tmp_buf, long);
190 reach_next = (void *)tmp_buf;
191 tmp_buf += rbytes;
192 tmp_buf += ALIGN(tmp_buf, long);
193 reach = (void *)tmp_buf;
194 tmp_buf += rbytes;
195 tmp_buf += ALIGN(tmp_buf, long);
196 reach_pos = (void *)tmp_buf;
197 tmp_buf += pbytes;
198 tmp_buf += ALIGN(tmp_buf, long);
199 for (i = 0; i < tnfa->num_states; i++)
200 {
201 reach[i].tags = (void *)tmp_buf;
202 tmp_buf += tbytes;
203 reach_next[i].tags = (void *)tmp_buf;
204 tmp_buf += tbytes;
205 }
206 }
207
208 for (i = 0; i < tnfa->num_states; i++)
209 reach_pos[i].pos = -1;
210
211 /* If only one character can start a match, find it first. */
212 if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
213 {
214 const char *orig_str = str_byte;
215 int first = tnfa->first_char;
216
217 if (len >= 0)
218 str_byte = memchr(orig_str, first, (size_t)len);
219 else
220 str_byte = strchr(orig_str, first);
221 if (str_byte == NULL)
222 {
223 #ifndef TRE_USE_ALLOCA
224 if (buf)
225 xfree(buf);
226 #endif /* !TRE_USE_ALLOCA */
227 return REG_NOMATCH;
228 }
229 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
230 if (str_byte >= orig_str + 1)
231 prev_c = (unsigned char)*(str_byte - 1);
232 next_c = (unsigned char)*str_byte;
233 pos = str_byte - orig_str;
234 if (len < 0 || pos < len)
235 str_byte++;
236 }
237 else
238 {
239 GET_NEXT_WCHAR();
240 pos = 0;
241 }
242
243 #if 0
244 /* Skip over characters that cannot possibly be the first character
245 of a match. */
246 if (tnfa->firstpos_chars != NULL)
247 {
248 char *chars = tnfa->firstpos_chars;
249
250 if (len < 0)
251 {
252 const char *orig_str = str_byte;
253 /* XXX - use strpbrk() and wcspbrk() because they might be
254 optimized for the target architecture. Try also strcspn()
255 and wcscspn() and compare the speeds. */
256 while (next_c != L'\0' && !chars[next_c])
257 {
258 next_c = *str_byte++;
259 }
260 prev_c = *(str_byte - 2);
261 pos += str_byte - orig_str;
262 DPRINT(("skipped %d chars\n", str_byte - orig_str));
263 }
264 else
265 {
266 while (pos <= len && !chars[next_c])
267 {
268 prev_c = next_c;
269 next_c = (unsigned char)(*str_byte++);
270 pos++;
271 }
272 }
273 }
274 #endif
275
276 DPRINT(("length: %d\n", len));
277 DPRINT(("pos:chr/code | states and tags\n"));
278 DPRINT(("-------------+------------------------------------------------\n"));
279
280 reach_next_i = reach_next;
281 while (/*CONSTCOND*/1)
282 {
283 /* If no match found yet, add the initial states to `reach_next'. */
284 if (match_eo < 0)
285 {
286 DPRINT((" init >"));
287 trans_i = tnfa->initial;
288 while (trans_i->state != NULL)
289 {
290 if (reach_pos[trans_i->state_id].pos < pos)
291 {
292 if (trans_i->assertions
293 && CHECK_ASSERTIONS(trans_i->assertions))
294 {
295 DPRINT(("assertion failed\n"));
296 trans_i++;
297 continue;
298 }
299
300 DPRINT((" %p", (void *)trans_i->state));
301 reach_next_i->state = trans_i->state;
302 memset(reach_next_i->tags, 0, tbytes);
303 tag_i = trans_i->tags;
304 if (tag_i)
305 {
306 while (*tag_i >= 0)
307 {
308 if (*tag_i < num_tags)
309 tre_tag_set(reach_next_i->tags, *tag_i, pos, touch);
310 tag_i++;
311 }
312 touch++;
313 }
314 if (reach_next_i->state == tnfa->final)
315 {
316 DPRINT((" found empty match\n"));
317 match_eo = pos;
318 memcpy(match_tags, reach_next_i->tags, tbytes);
319 }
320 reach_pos[trans_i->state_id].pos = pos;
321 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
322 reach_next_i++;
323 }
324 trans_i++;
325 }
326 DPRINT(("\n"));
327 reach_next_i->state = NULL;
328 }
329 else
330 {
331 if (num_tags == 0 || reach_next_i == reach_next)
332 /* We have found a match. */
333 break;
334 }
335
336 /* Check for end of string. */
337 if (len < 0)
338 {
339 #ifdef TRE_STR_USER
340 if (type == STR_USER)
341 {
342 if (str_user_end)
343 break;
344 }
345 else
346 #endif /* TRE_STR_USER */
347 if (next_c == L'\0')
348 break;
349 }
350 else
351 {
352 if (pos >= len)
353 break;
354 }
355
356 GET_NEXT_WCHAR();
357
358 #ifdef TRE_DEBUG
359 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
360 tre_print_reach(tnfa, reach_next, num_tags);
361 //DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
362 //tre_print_reach(tnfa, reach_next, num_tags);
363 #endif /* TRE_DEBUG */
364
365 /* Swap `reach' and `reach_next'. */
366 reach_i = reach;
367 reach = reach_next;
368 reach_next = reach_i;
369
370 #ifdef TRE_DEBUG
371 once = 0;
372 #endif /* TRE_DEBUG */
373
374 /* For each state in `reach' see if there is a transition leaving with
375 the current input symbol to a state not yet in `reach_next', and
376 add the destination states to `reach_next'. */
377 reach_next_i = reach_next;
378 for (reach_i = reach; reach_i->state; reach_i++)
379 {
380 for (trans_i = reach_i->state; trans_i->state; trans_i++)
381 {
382 /* Does this transition match the input symbol? */
383 if (trans_i->code_min <= (tre_cint_t)prev_c &&
384 trans_i->code_max >= (tre_cint_t)prev_c)
385 {
386 if (trans_i->assertions
387 && (CHECK_ASSERTIONS(trans_i->assertions)
388 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
389 {
390 DPRINT(("assertion failed\n"));
391 continue;
392 }
393
394 /* Compute the tags after this transition. */
395 memcpy(tmp_tags, reach_i->tags, tbytes);
396 tag_i = trans_i->tags;
397 if (tag_i != NULL)
398 {
399 while (*tag_i >= 0)
400 {
401 if (*tag_i < num_tags)
402 tre_tag_set(tmp_tags, *tag_i, pos, touch);
403 tag_i++;
404 }
405 touch++;
406 }
407
408 /* For each new transition, weed out those that don't
409 fulfill the minimal matching conditions. */
410 if (tnfa->num_minimals && match_eo >= 0)
411 {
412 int skip = 0;
413 #ifdef TRE_DEBUG
414 if (!once)
415 {
416 DPRINT(("Checking minimal conditions: match_eo=%d "
417 "match_tags=", match_eo));
418 tre_print_tags(match_tags, num_tags);
419 DPRINT(("\n"));
420 once++;
421 }
422 #endif /* TRE_DEBUG */
423 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
424 {
425 int end = tnfa->minimal_tags[i];
426 int start = tnfa->minimal_tags[i + 1];
427 DPRINT((" Minimal start %d, end %d\n", start, end));
428 if (tre_minimal_tag_order(start, end, match_tags,
429 tmp_tags) > 0)
430 {
431 skip = 1;
432 break;
433 }
434 }
435 if (skip)
436 {
437 #ifdef TRE_DEBUG
438 DPRINT((" Throwing out"));
439 tre_print_reach1(reach_i->state, tmp_tags,
440 num_tags);
441 DPRINT(("\n"));
442 #endif /* TRE_DEBUG */
443 continue;
444 }
445 }
446
447 if (reach_pos[trans_i->state_id].pos < pos)
448 {
449 /* Found an unvisited node. */
450 reach_next_i->state = trans_i->state;
451 tmp_iptr = reach_next_i->tags;
452 reach_next_i->tags = tmp_tags;
453 tmp_tags = tmp_iptr;
454 reach_pos[trans_i->state_id].pos = pos;
455 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
456
457 if (reach_next_i->state == tnfa->final
458 && (match_eo == -1
459 || (num_tags > 0
460 && tre_tag_get(reach_next_i->tags, 0) <=
461 tre_tag_get(match_tags, 0))))
462 {
463 #ifdef TRE_DEBUG
464 DPRINT((" found match"));
465 tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags);
466 DPRINT(("\n"));
467 #endif /* TRE_DEBUG */
468 match_eo = pos;
469 memcpy(match_tags, reach_next_i->tags, tbytes);
470 }
471 reach_next_i++;
472
473 }
474 else
475 {
476 assert(reach_pos[trans_i->state_id].pos == pos);
477 /* Another path has also reached this state. We choose
478 the winner by examining the tag values for both
479 paths. */
480 if (tre_tag_order(num_tags, tnfa->tag_directions,
481 tmp_tags,
482 *reach_pos[trans_i->state_id].tags))
483 {
484 /* The new path wins. */
485 tmp_iptr = *reach_pos[trans_i->state_id].tags;
486 *reach_pos[trans_i->state_id].tags = tmp_tags;
487 if (trans_i->state == tnfa->final)
488 {
489 #ifdef TRE_DEBUG
490 DPRINT((" found better match"));
491 tre_print_reach1(trans_i->state, tmp_tags, num_tags);
492 DPRINT(("\n"));
493 #endif /* TRE_DEBUG */
494 match_eo = pos;
495 memcpy(match_tags, tmp_tags, tbytes);
496 }
497 tmp_tags = tmp_iptr;
498 }
499 }
500 }
501 }
502 }
503 reach_next_i->state = NULL;
504 }
505
506 DPRINT(("match end offset = %d\n", match_eo));
507
508 *match_end_ofs = match_eo;
509 #ifdef TRE_DEBUG
510 if (match_tags)
511 {
512 DPRINT(("Winning tags="));
513 tre_print_tags_all(match_tags, num_tags);
514 DPRINT((" touch=%d\n", touch));
515 }
516 #endif /* TRE_DEBUG */
517
518 #ifndef TRE_USE_ALLOCA
519 if (buf)
520 xfree(buf);
521 #endif /* !TRE_USE_ALLOCA */
522
523 return match_eo >= 0 ? REG_OK : REG_NOMATCH;
524 }
525
526 /* EOF */