]>
Commit | Line | Data |
---|---|---|
ad3c9f2a A |
1 | /* |
2 | tre-match-parallel.c - TRE parallel regex matching engine | |
3 | ||
4 | This software is released under a BSD-style license. | |
5 | See the file LICENSE for details and copyright. | |
6 | ||
7 | */ | |
8 | ||
9 | /* | |
10 | This algorithm searches for matches basically by reading characters | |
11 | in the searched string one by one, starting at the beginning. All | |
12 | matching paths in the TNFA are traversed in parallel. When two or | |
13 | more paths reach the same state, exactly one is chosen according to | |
14 | tag ordering rules; if returning submatches is not required it does | |
15 | not matter which path is chosen. | |
16 | ||
17 | The worst case time required for finding the leftmost and longest | |
18 | match, or determining that there is no match, is always linearly | |
19 | dependent on the length of the text being searched. | |
20 | ||
21 | This algorithm cannot handle TNFAs with back referencing nodes. | |
22 | See `tre-match-backtrack.c'. | |
23 | */ | |
24 | ||
25 | ||
26 | #ifdef HAVE_CONFIG_H | |
27 | #include <config.h> | |
28 | #endif /* HAVE_CONFIG_H */ | |
29 | ||
30 | /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state | |
31 | info while running */ | |
32 | #undef TRE_USE_ALLOCA | |
33 | ||
34 | #ifdef TRE_USE_ALLOCA | |
35 | /* AIX requires this to be the first thing in the file. */ | |
36 | #ifndef __GNUC__ | |
37 | # if HAVE_ALLOCA_H | |
38 | # include <alloca.h> | |
39 | # else | |
40 | # ifdef _AIX | |
41 | #pragma alloca | |
42 | # else | |
43 | # ifndef alloca /* predefined by HP cc +Olibcalls */ | |
44 | char *alloca (); | |
45 | # endif | |
46 | # endif | |
47 | # endif | |
48 | #endif | |
49 | #endif /* TRE_USE_ALLOCA */ | |
50 | ||
51 | #include <assert.h> | |
52 | #include <stdlib.h> | |
53 | #include <string.h> | |
54 | #ifdef HAVE_WCHAR_H | |
55 | #include <wchar.h> | |
56 | #endif /* HAVE_WCHAR_H */ | |
57 | #ifdef HAVE_WCTYPE_H | |
58 | #include <wctype.h> | |
59 | #endif /* HAVE_WCTYPE_H */ | |
60 | #ifndef TRE_WCHAR | |
61 | #include <ctype.h> | |
62 | #endif /* !TRE_WCHAR */ | |
63 | #ifdef HAVE_MALLOC_H | |
64 | #include <malloc.h> | |
65 | #endif /* HAVE_MALLOC_H */ | |
66 | ||
67 | #include "tre-internal.h" | |
68 | #include "tre-match-utils.h" | |
69 | #include "tre.h" | |
70 | #include "xmalloc.h" | |
71 | ||
72 | ||
73 | ||
74 | typedef struct { | |
75 | tre_tnfa_transition_t *state; | |
76 | tre_tag_t *tags; | |
77 | } tre_tnfa_reach_t; | |
78 | ||
79 | typedef struct { | |
80 | int pos; | |
81 | tre_tag_t **tags; | |
82 | } tre_reach_pos_t; | |
83 | ||
84 | ||
85 | #ifdef TRE_DEBUG | |
86 | static void | |
87 | tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags) | |
88 | { | |
89 | DPRINT((" %p", (void *)state)); | |
90 | if (num_tags > 0) | |
91 | { | |
92 | DPRINT(("/")); | |
93 | tre_print_tags(tags, num_tags); | |
94 | } | |
95 | } | |
96 | ||
97 | static void | |
98 | tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags) | |
99 | { | |
100 | while (reach->state != NULL) | |
101 | { | |
102 | tre_print_reach1(reach->state, reach->tags, num_tags); | |
103 | reach++; | |
104 | } | |
105 | DPRINT(("\n")); | |
106 | ||
107 | } | |
108 | #endif /* TRE_DEBUG */ | |
109 | ||
110 | reg_errcode_t | |
111 | tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, | |
112 | tre_str_type_t type, tre_tag_t *match_tags, int eflags, | |
113 | int *match_end_ofs) | |
114 | { | |
115 | /* State variables required by GET_NEXT_WCHAR. */ | |
116 | tre_char_t prev_c = 0, next_c = 0; | |
117 | const char *str_byte = string; | |
118 | int pos = -1; | |
119 | unsigned int pos_add_next = 1; | |
120 | #ifdef TRE_WCHAR | |
121 | const wchar_t *str_wide = string; | |
122 | #ifdef TRE_MBSTATE | |
123 | mbstate_t mbstate; | |
124 | #endif /* TRE_MBSTATE */ | |
125 | #endif /* TRE_WCHAR */ | |
126 | int reg_notbol = eflags & REG_NOTBOL; | |
127 | int reg_noteol = eflags & REG_NOTEOL; | |
128 | int reg_newline = tnfa->cflags & REG_NEWLINE; | |
129 | #ifdef TRE_STR_USER | |
130 | int str_user_end = 0; | |
131 | #endif /* TRE_STR_USER */ | |
132 | ||
133 | char *buf; | |
134 | tre_tnfa_transition_t *trans_i; | |
135 | tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i; | |
136 | tre_reach_pos_t *reach_pos; | |
137 | int *tag_i; | |
138 | int num_tags, i; | |
139 | ||
140 | int match_eo = -1; /* end offset of match (-1 if no match found yet) */ | |
141 | #ifdef TRE_DEBUG | |
142 | int once; | |
143 | #endif /* TRE_DEBUG */ | |
144 | tre_tag_t *tmp_tags = NULL; | |
145 | tre_tag_t *tmp_iptr; | |
146 | int tbytes; | |
147 | int touch = 1; | |
148 | ||
149 | #ifdef TRE_MBSTATE | |
150 | memset(&mbstate, '\0', sizeof(mbstate)); | |
151 | #endif /* TRE_MBSTATE */ | |
152 | ||
153 | DPRINT(("tre_tnfa_run_parallel, input type %d\n", type)); | |
154 | ||
155 | if (!match_tags) | |
156 | num_tags = 0; | |
157 | else | |
158 | num_tags = tnfa->num_tags; | |
159 | ||
160 | /* Allocate memory for temporary data required for matching. This needs to | |
161 | be done for every matching operation to be thread safe. This allocates | |
162 | everything in a single large block from the stack frame using alloca() | |
163 | or with malloc() if alloca is unavailable. */ | |
164 | { | |
165 | int rbytes, pbytes, total_bytes; | |
166 | char *tmp_buf; | |
167 | /* Compute the length of the block we need. */ | |
168 | tbytes = sizeof(*tmp_tags) * num_tags; | |
169 | rbytes = sizeof(*reach_next) * (tnfa->num_states + 1); | |
170 | pbytes = sizeof(*reach_pos) * tnfa->num_states; | |
171 | total_bytes = | |
172 | (sizeof(long) - 1) * 4 /* for alignment paddings */ | |
173 | + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes; | |
174 | ||
175 | DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes)); | |
176 | /* Allocate the memory. */ | |
177 | #ifdef TRE_USE_ALLOCA | |
178 | buf = alloca(total_bytes); | |
179 | #else /* !TRE_USE_ALLOCA */ | |
180 | buf = xmalloc((unsigned)total_bytes); | |
181 | #endif /* !TRE_USE_ALLOCA */ | |
182 | if (buf == NULL) | |
183 | return REG_ESPACE; | |
184 | memset(buf, 0, (size_t)total_bytes); | |
185 | ||
186 | /* Get the various pointers within tmp_buf (properly aligned). */ | |
187 | tmp_tags = (void *)buf; | |
188 | tmp_buf = buf + tbytes; | |
189 | tmp_buf += ALIGN(tmp_buf, long); | |
190 | reach_next = (void *)tmp_buf; | |
191 | tmp_buf += rbytes; | |
192 | tmp_buf += ALIGN(tmp_buf, long); | |
193 | reach = (void *)tmp_buf; | |
194 | tmp_buf += rbytes; | |
195 | tmp_buf += ALIGN(tmp_buf, long); | |
196 | reach_pos = (void *)tmp_buf; | |
197 | tmp_buf += pbytes; | |
198 | tmp_buf += ALIGN(tmp_buf, long); | |
199 | for (i = 0; i < tnfa->num_states; i++) | |
200 | { | |
201 | reach[i].tags = (void *)tmp_buf; | |
202 | tmp_buf += tbytes; | |
203 | reach_next[i].tags = (void *)tmp_buf; | |
204 | tmp_buf += tbytes; | |
205 | } | |
206 | } | |
207 | ||
208 | for (i = 0; i < tnfa->num_states; i++) | |
209 | reach_pos[i].pos = -1; | |
210 | ||
211 | /* If only one character can start a match, find it first. */ | |
212 | if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte) | |
213 | { | |
214 | const char *orig_str = str_byte; | |
215 | int first = tnfa->first_char; | |
216 | ||
217 | if (len >= 0) | |
218 | str_byte = memchr(orig_str, first, (size_t)len); | |
219 | else | |
220 | str_byte = strchr(orig_str, first); | |
221 | if (str_byte == NULL) | |
222 | { | |
223 | #ifndef TRE_USE_ALLOCA | |
224 | if (buf) | |
225 | xfree(buf); | |
226 | #endif /* !TRE_USE_ALLOCA */ | |
227 | return REG_NOMATCH; | |
228 | } | |
229 | DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str))); | |
230 | if (str_byte >= orig_str + 1) | |
231 | prev_c = (unsigned char)*(str_byte - 1); | |
232 | next_c = (unsigned char)*str_byte; | |
233 | pos = str_byte - orig_str; | |
234 | if (len < 0 || pos < len) | |
235 | str_byte++; | |
236 | } | |
237 | else | |
238 | { | |
239 | GET_NEXT_WCHAR(); | |
240 | pos = 0; | |
241 | } | |
242 | ||
243 | #if 0 | |
244 | /* Skip over characters that cannot possibly be the first character | |
245 | of a match. */ | |
246 | if (tnfa->firstpos_chars != NULL) | |
247 | { | |
248 | char *chars = tnfa->firstpos_chars; | |
249 | ||
250 | if (len < 0) | |
251 | { | |
252 | const char *orig_str = str_byte; | |
253 | /* XXX - use strpbrk() and wcspbrk() because they might be | |
254 | optimized for the target architecture. Try also strcspn() | |
255 | and wcscspn() and compare the speeds. */ | |
256 | while (next_c != L'\0' && !chars[next_c]) | |
257 | { | |
258 | next_c = *str_byte++; | |
259 | } | |
260 | prev_c = *(str_byte - 2); | |
261 | pos += str_byte - orig_str; | |
262 | DPRINT(("skipped %d chars\n", str_byte - orig_str)); | |
263 | } | |
264 | else | |
265 | { | |
266 | while (pos <= len && !chars[next_c]) | |
267 | { | |
268 | prev_c = next_c; | |
269 | next_c = (unsigned char)(*str_byte++); | |
270 | pos++; | |
271 | } | |
272 | } | |
273 | } | |
274 | #endif | |
275 | ||
276 | DPRINT(("length: %d\n", len)); | |
277 | DPRINT(("pos:chr/code | states and tags\n")); | |
278 | DPRINT(("-------------+------------------------------------------------\n")); | |
279 | ||
280 | reach_next_i = reach_next; | |
281 | while (/*CONSTCOND*/1) | |
282 | { | |
283 | /* If no match found yet, add the initial states to `reach_next'. */ | |
284 | if (match_eo < 0) | |
285 | { | |
286 | DPRINT((" init >")); | |
287 | trans_i = tnfa->initial; | |
288 | while (trans_i->state != NULL) | |
289 | { | |
290 | if (reach_pos[trans_i->state_id].pos < pos) | |
291 | { | |
292 | if (trans_i->assertions | |
293 | && CHECK_ASSERTIONS(trans_i->assertions)) | |
294 | { | |
295 | DPRINT(("assertion failed\n")); | |
296 | trans_i++; | |
297 | continue; | |
298 | } | |
299 | ||
300 | DPRINT((" %p", (void *)trans_i->state)); | |
301 | reach_next_i->state = trans_i->state; | |
302 | memset(reach_next_i->tags, 0, tbytes); | |
303 | tag_i = trans_i->tags; | |
304 | if (tag_i) | |
305 | { | |
306 | while (*tag_i >= 0) | |
307 | { | |
308 | if (*tag_i < num_tags) | |
309 | tre_tag_set(reach_next_i->tags, *tag_i, pos, touch); | |
310 | tag_i++; | |
311 | } | |
312 | touch++; | |
313 | } | |
314 | if (reach_next_i->state == tnfa->final) | |
315 | { | |
316 | DPRINT((" found empty match\n")); | |
317 | match_eo = pos; | |
318 | memcpy(match_tags, reach_next_i->tags, tbytes); | |
319 | } | |
320 | reach_pos[trans_i->state_id].pos = pos; | |
321 | reach_pos[trans_i->state_id].tags = &reach_next_i->tags; | |
322 | reach_next_i++; | |
323 | } | |
324 | trans_i++; | |
325 | } | |
326 | DPRINT(("\n")); | |
327 | reach_next_i->state = NULL; | |
328 | } | |
329 | else | |
330 | { | |
331 | if (num_tags == 0 || reach_next_i == reach_next) | |
332 |