]>
Commit | Line | Data |
---|---|---|
ad3c9f2a A |
1 | /* |
2 | tre-match-parallel.c - TRE parallel regex matching engine | |
3 | ||
4 | This software is released under a BSD-style license. | |
5 | See the file LICENSE for details and copyright. | |
6 | ||
7 | */ | |
8 | ||
9 | /* | |
10 | This algorithm searches for matches basically by reading characters | |
11 | in the searched string one by one, starting at the beginning. All | |
12 | matching paths in the TNFA are traversed in parallel. When two or | |
13 | more paths reach the same state, exactly one is chosen according to | |
14 | tag ordering rules; if returning submatches is not required it does | |
15 | not matter which path is chosen. | |
16 | ||
17 | The worst case time required for finding the leftmost and longest | |
18 | match, or determining that there is no match, is always linearly | |
19 | dependent on the length of the text being searched. | |
20 | ||
21 | This algorithm cannot handle TNFAs with back referencing nodes. | |
22 | See `tre-match-backtrack.c'. | |
23 | */ | |
24 | ||
25 | ||
26 | #ifdef HAVE_CONFIG_H | |
27 | #include <config.h> | |
28 | #endif /* HAVE_CONFIG_H */ | |
29 | ||
30 | /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state | |
31 | info while running */ | |
32 | #undef TRE_USE_ALLOCA | |
33 | ||
34 | #ifdef TRE_USE_ALLOCA | |
35 | /* AIX requires this to be the first thing in the file. */ | |
36 | #ifndef __GNUC__ | |
37 | # if HAVE_ALLOCA_H | |
38 | # include <alloca.h> | |
39 | # else | |
40 | # ifdef _AIX | |
41 | #pragma alloca | |
42 | # else | |
43 | # ifndef alloca /* predefined by HP cc +Olibcalls */ | |
44 | char *alloca (); | |
45 | # endif | |
46 | # endif | |
47 | # endif | |
48 | #endif | |
49 | #endif /* TRE_USE_ALLOCA */ | |
50 | ||
51 | #include <assert.h> | |
52 | #include <stdlib.h> | |
53 | #include <string.h> | |
54 | #ifdef HAVE_WCHAR_H | |
55 | #include <wchar.h> | |
56 | #endif /* HAVE_WCHAR_H */ | |
57 | #ifdef HAVE_WCTYPE_H | |
58 | #include <wctype.h> | |
59 | #endif /* HAVE_WCTYPE_H */ | |
60 | #ifndef TRE_WCHAR | |
61 | #include <ctype.h> | |
62 | #endif /* !TRE_WCHAR */ | |
63 | #ifdef HAVE_MALLOC_H | |
64 | #include <malloc.h> | |
65 | #endif /* HAVE_MALLOC_H */ | |
66 | ||
67 | #include "tre-internal.h" | |
68 | #include "tre-match-utils.h" | |
69 | #include "tre.h" | |
70 | #include "xmalloc.h" | |
71 | ||
72 | ||
73 | ||
74 | typedef struct { | |
75 | tre_tnfa_transition_t *state; | |
76 | tre_tag_t *tags; | |
77 | } tre_tnfa_reach_t; | |
78 | ||
79 | typedef struct { | |
80 | int pos; | |
81 | tre_tag_t **tags; | |
82 | } tre_reach_pos_t; | |
83 | ||
84 | ||
85 | #ifdef TRE_DEBUG | |
86 | static void | |
87 | tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags) | |
88 | { | |
89 | DPRINT((" %p", (void *)state)); | |
90 | if (num_tags > 0) | |
91 | { | |
92 | DPRINT(("/")); | |
93 | tre_print_tags(tags, num_tags); | |
94 | } | |
95 | } | |
96 | ||
97 | static void | |
98 | tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) | |
99 | { | |
100 | while (reach->state != NULL) | |
101 | { | |
102 | tre_print_reach1(reach->state, reach->tags, num_tags); | |
103 | reach++; | |
104 | } | |
105 | DPRINT(("\n")); | |
106 | ||
107 | } | |
108 | #endif /* TRE_DEBUG */ | |
109 | ||
110 | reg_errcode_t | |
111 | tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, | |
112 | tre_str_type_t type, tre_tag_t *match_tags, int eflags, | |
113 | int *match_end_ofs) | |
114 | { | |
115 | /* State variables required by GET_NEXT_WCHAR. */ | |
116 | tre_char_t prev_c = 0, next_c = 0; | |
117 | const char *str_byte = string; | |
118 | int pos = -1; | |
119 | unsigned int pos_add_next = 1; | |
120 | #ifdef TRE_WCHAR | |
121 | const wchar_t *str_wide = string; | |
122 | #ifdef TRE_MBSTATE | |
123 | mbstate_t mbstate; | |
124 | #endif /* TRE_MBSTATE */ | |
125 | #endif /* TRE_WCHAR */ | |
126 | int reg_notbol = eflags & REG_NOTBOL; | |
127 | int reg_noteol = eflags & REG_NOTEOL; | |
128 | int reg_newline = tnfa->cflags & REG_NEWLINE; | |
129 | #ifdef TRE_STR_USER | |
130 | int str_user_end = 0; | |
131 | #endif /* TRE_STR_USER */ | |
132 | ||
133 | char *buf; | |
134 | tre_tnfa_transition_t *trans_i; | |
135 | tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; | |
136 | tre_reach_pos_t *reach_pos; | |
137 | int *tag_i; | |
138 | int num_tags, i; | |
139 | ||
140 | int match_eo = -1; /* end offset of match (-1 if no match found yet) */ | |
141 | #ifdef TRE_DEBUG | |
142 | int once; | |
143 | #endif /* TRE_DEBUG */ | |
144 | tre_tag_t *tmp_tags = NULL; | |
145 | tre_tag_t *tmp_iptr; | |
2acb8998 | 146 | size_t tbytes; |
ad3c9f2a A |
147 | int touch = 1; |
148 | ||
149 | #ifdef TRE_MBSTATE | |
150 | memset(&mbstate, '\0', sizeof(mbstate)); | |
151 | #endif /* TRE_MBSTATE */ | |
152 | ||
153 | DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); | |
154 | ||
155 | if (!match_tags) | |
156 | num_tags = 0; | |
157 | else | |
158 | num_tags = tnfa->num_tags; | |
159 | ||
160 | /* Allocate memory for temporary data required for matching. This needs to | |
161 | be done for every matching operation to be thread safe. This allocates | |
162 | everything in a single large block from the stack frame using alloca() | |
163 | or with malloc() if alloca is unavailable. */ | |
164 | { | |
2acb8998 | 165 | size_t rbytes, pbytes, total_bytes; |
ad3c9f2a A |
166 | char *tmp_buf; |
167 | /* Compute the length of the block we need. */ | |
168 | tbytes = sizeof(*tmp_tags) * num_tags; | |
169 | rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); | |
170 | pbytes = sizeof(*reach_pos) * tnfa->num_states; | |
171 | total_bytes = | |
172 | (sizeof(long) - 1) * 4 /* for alignment paddings */ | |
173 | + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes; | |
174 | ||
175 | DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes)); | |
176 | /* Allocate the memory. */ | |
177 | #ifdef TRE_USE_ALLOCA | |
178 | buf = alloca(total_bytes); | |
179 | #else /* !TRE_USE_ALLOCA */ | |
2acb8998 | 180 | buf = xmalloc(total_bytes); |
ad3c9f2a A |
181 | #endif /* !TRE_USE_ALLOCA */ |
182 | if (buf == NULL) | |
183 | return REG_ESPACE; | |
2acb8998 | 184 | memset(buf, 0, total_bytes); |
ad3c9f2a A |
185 | |
186 | /* Get the various pointers within tmp_buf (properly aligned). */ | |
187 | tmp_tags = (void *)buf; | |
188 | tmp_buf = buf + tbytes; | |
189 | tmp_buf += ALIGN(tmp_buf, long); | |
190 | reach_next = (void *)tmp_buf; | |
191 | tmp_buf += rbytes; | |
192 | tmp_buf += ALIGN(tmp_buf, long); | |
193 | reach = (void *)tmp_buf; | |
194 | tmp_buf += rbytes; | |
195 | tmp_buf += ALIGN(tmp_buf, long); | |
196 | reach_pos = (void *)tmp_buf; | |
197 | tmp_buf += pbytes; | |
198 | tmp_buf += ALIGN(tmp_buf, long); | |
199 | for (i = 0; i < tnfa->num_states; i++) | |
200 | { | |
201 | reach[i].tags = (void *)tmp_buf; | |
202 | tmp_buf += tbytes; | |
203 | reach_next[i].tags = (void *)tmp_buf; | |
204 | tmp_buf += tbytes; | |
205 | } | |
206 | } | |
207 | ||
208 | for (i = 0; i < tnfa->num_states; i++) | |
209 | reach_pos[i].pos = -1; | |
210 | ||
211 | /* If only one character can start a match, find it first. */ | |
6465356a | 212 | if (tnfa->first_char >= 0 && str_byte) |
ad3c9f2a A |
213 | { |
214 | const char *orig_str = str_byte; | |
215 | int first = tnfa->first_char; | |
6465356a | 216 | int found_high_bit = 0; |
ad3c9f2a | 217 | |
6465356a A |
218 | |
219 | if (type == STR_BYTE) | |
220 | { | |
221 | if (len >= 0) | |
222 | str_byte = memchr(orig_str, first, (size_t)len); | |
223 | else | |
224 | str_byte = strchr(orig_str, first); | |
225 | } | |
226 | else if (type == STR_MBS) | |
227 | { | |
228 | /* | |
229 | * If the match character is ASCII, try to match the character | |
230 | * directly, but if a high bit character is found, we stop there. | |
231 | */ | |
232 | if (first < 0x80) | |
233 | { | |
234 | if (len >= 0) | |
235 | { | |
236 | int i; | |
237 | for (i = 0; ; str_byte++, i++) | |
238 | { | |
239 | if (i >= len) | |
240 | { | |
241 | str_byte = NULL; | |
242 | break; | |
243 | } | |
244 | if (*str_byte == first) | |
245 | break; | |
246 | if (*str_byte & 0x80) | |
247 | { | |
248 | found_high_bit = 1; | |
249 | break; | |
250 | } | |
251 | } | |
252 | } | |
253 | else | |
254 | { | |
255 | for (; ; str_byte++) | |
256 | { | |
257 | if (!*str_byte) | |
258 | { | |
259 | str_byte = NULL; | |
260 | break; | |
261 | } | |
262 | if (*str_byte == first) | |
263 | break; | |
264 | if (*str_byte & 0x80) | |
265 | { | |
266 | found_high_bit = 1; | |
267 | break; | |
268 | } | |
269 | } | |
270 | } | |
271 | } | |
272 | else | |
273 | { | |
274 | if (len >= 0) | |
275 | { | |
276 | int i; | |
277 | for (i = 0; ; str_byte++, i++) | |
278 | { | |
279 | if (i >= len) | |
280 | { | |
281 | str_byte = NULL; | |
282 | break; | |
283 | } | |
284 | if (*str_byte & 0x80) | |
285 | { | |
286 | found_high_bit = 1; | |
287 | break; | |
288 | } | |
289 | } | |
290 | } | |
291 | else | |
292 | { | |
293 | for (; ; str_byte++) | |
294 | { | |
295 | if (!*str_byte) | |
296 | { | |
297 | str_byte = NULL; | |
298 | break; | |
299 | } | |
300 | if (*str_byte & 0x80) | |
301 | { | |
302 | found_high_bit = 1; | |
303 | break; | |
304 | } | |
305 | } | |
306 | } | |
307 | } | |
308 | } | |
ad3c9f2a A |
309 | if (str_byte == NULL) |
310 | { | |
311 | #ifndef TRE_USE_ALLOCA | |
312 | if (buf) | |
313 | xfree(buf); | |
314 | #endif /* !TRE_USE_ALLOCA */ | |
315 | return REG_NOMATCH; | |
316 | } | |
317 | DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); | |
6465356a A |
318 | if (!found_high_bit) |
319 | { | |
320 | if (str_byte >= orig_str + 1) | |
321 | prev_c = (unsigned char)*(str_byte - 1); | |
322 | next_c = (unsigned char)*str_byte; | |
323 | pos = str_byte - orig_str; | |
324 | if (len < 0 || pos < len) | |
325 | str_byte++; | |
326 | } | |
327 | else | |
328 | { | |
329 | if (str_byte == orig_str) | |
330 | goto no_first_optimization; | |
331 | /* | |
332 | * Back up one character, fix up the position, then call | |
333 | * GET_NEXT_WCHAR() to process the multibyte character. | |
334 | */ | |
335 | /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */ | |
336 | next_c = (unsigned char)*(str_byte - 1); | |
337 | pos = (str_byte - 1) - orig_str; | |
338 | GET_NEXT_WCHAR(); | |
339 | } | |
ad3c9f2a A |
340 | } |
341 | else | |
342 | { | |
6465356a | 343 | no_first_optimization: |
ad3c9f2a A |
344 | GET_NEXT_WCHAR(); |
345 | pos = 0; | |
346 | } | |
347 | ||
6465356a | 348 | #ifdef USE_FIRSTPOS_CHARS /* not defined */ |
ad3c9f2a A |
349 | /* Skip over characters that cannot possibly be the first character |
350 | of a match. */ | |
351 | if (tnfa->firstpos_chars != NULL) | |
352 | { | |
353 | char *chars = tnfa->firstpos_chars; | |
354 | ||
355 | if (len < 0) | |
356 | { | |
357 | const char *orig_str = str_byte; | |
358 | /* XXX - use strpbrk() and wcspbrk() because they might be | |
359 | optimized for the target architecture. Try also strcspn() | |
360 | and wcscspn() and compare the speeds. */ | |
361 | while (next_c != L'\0' && !chars[next_c]) | |
362 | { | |
363 | next_c = *str_byte++; | |
364 | } | |
365 | prev_c = *(str_byte - 2); | |
366 | pos += str_byte - orig_str; | |
367 | DPRINT(("skipped %d chars\n", str_byte - orig_str)); | |
368 | } | |
369 | else | |
370 | { | |
371 | while (pos <= len && !chars[next_c]) | |
372 | { | |
373 | prev_c = next_c; | |
374 | next_c = (unsigned char)(*str_byte++); | |
375 | pos++; | |
376 | } | |
377 | } | |
378 | } | |
6465356a | 379 | #endif /* USE_FIRSTPOS_CHARS */ |
ad3c9f2a A |
380 | |
381 | DPRINT(("length: %d\n", len)); | |
382 | DPRINT(("pos:chr/code | states and tags\n")); | |
383 | DPRINT(("-------------+------------------------------------------------\n")); | |
384 | ||
385 | reach_next_i = reach_next; | |
386 | while (/*CONSTCOND*/1) | |
387 | { | |
388 | /* If no match found yet, add the initial states to `reach_next'. */ | |
389 | if (match_eo < 0) | |
390 | { | |
391 | DPRINT((" init >")); | |
392 | trans_i = tnfa->initial; | |
393 | while (trans_i->state != NULL) | |
394 | { | |
395 | if (reach_pos[trans_i->state_id].pos < pos) | |
396 | { | |
397 | if (trans_i->assertions | |
398 | && CHECK_ASSERTIONS(trans_i->assertions)) | |
399 | { | |
400 | DPRINT(("assertion failed\n")); | |
401 | trans_i++; | |
402 | continue; | |
403 | } | |
404 | ||
405 | DPRINT((" %p", (void *)trans_i->state)); | |
406 | reach_next_i->state = trans_i->state; | |
407 | memset(reach_next_i->tags, 0, tbytes); | |
408 | tag_i = trans_i->tags; | |
409 | if (tag_i) | |
410 | { | |
411 | while (*tag_i >= 0) | |
412 | { | |
413 | if (*tag_i < num_tags) | |
414 | tre_tag_set(reach_next_i->tags, *tag_i, pos, touch); | |
415 | tag_i++; | |
416 | } | |
417 | touch++; | |
418 | } | |
419 | if (reach_next_i->state == tnfa->final) | |
420 | { | |
421 | DPRINT((" found empty match\n")); | |
422 | match_eo = pos; | |
423 | memcpy(match_tags, reach_next_i->tags, tbytes); | |
424 | } | |
425 | reach_pos[trans_i->state_id].pos = pos; | |
426 | reach_pos[trans_i->state_id].tags = &reach_next_i->tags; | |
427 | reach_next_i++; | |
428 | } | |
429 | trans_i++; | |
430 | } | |
431 | DPRINT(("\n")); | |
432 | reach_next_i->state = NULL; | |
433 | } | |
434 | else | |
435 | { | |
436 | if (num_tags == 0 || reach_next_i == reach_next) | |
437 |