X-Git-Url: https://git.saurik.com/apple/libc.git/blobdiff_plain/7b00c0c43f52e9d27168e67a26aac19065cdb40c..ad3c9f2af814c84582fdd1649e49ec4f68572c5a:/regex/TRE/lib/tre-compile.c diff --git a/regex/TRE/lib/tre-compile.c b/regex/TRE/lib/tre-compile.c new file mode 100644 index 0000000..f9ee7d9 --- /dev/null +++ b/regex/TRE/lib/tre-compile.c @@ -0,0 +1,3445 @@ +/* + tre-compile.c - TRE regex compiler + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +/* + TODO: + - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive + function calls. +*/ + + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ +#include +#include +#include +#include + +#include "tre-internal.h" +#include "tre-mem.h" +#include "tre-stack.h" +#include "tre-ast.h" +#include "tre-parse.h" +#include "tre-compile.h" +#include "tre.h" +#include "tre-last-matched.h" +#include "xmalloc.h" + +/* + The bit_ffs() macro in bitstring.h is flawed. Replace it with a working one. +*/ +#undef bit_ffs +#define bit_ffs(name, nbits, value) { \ + register bitstr_t *_name = name; \ + register int _byte, _nbits = nbits; \ + register int _stopbyte = _bit_byte(_nbits), _value = -1; \ + for (_byte = 0; _byte <= _stopbyte; ++_byte) \ + if (_name[_byte]) { \ + _value = _byte << 3; \ + for (_stopbyte = _name[_byte]; !(_stopbyte&0x1); \ + ++_value, _stopbyte >>= 1); \ + break; \ + } \ + *(value) = _value; \ +} + +/* + Algorithms to setup tags so that submatch addressing can be done. +*/ + + +#ifdef TRE_DEBUG +static const char *tag_dir_str[] = { + "minimize", + "maximize", + "left-maximize" +}; + +static const char _indent[] = " "; + +static void +print_indent(int indent) +{ + while (indent-- > 0) + DPRINT((_indent)); +} + +static void print_last_matched_pre(tre_last_matched_pre_t *lm, int indent, + int num_tags); +static void +print_last_match_branch_pre(tre_last_matched_branch_pre_t *branch, int indent, + int num_tags) +{ + tre_last_matched_pre_t *u = branch->last_matched; + int n_last_matched = 0; + + while (u) + { + n_last_matched++; + u = u->next; + } + + print_indent(indent); + DPRINT(("BRANCH: tot_branches=%d tot_last_matched=%d tot_tags=%d\n", + branch->tot_branches, branch->tot_last_matched, branch->tot_tags)); + print_indent(indent); + DPRINT(("..n_last_matched=%d last_matched=%d\n", branch->n_last_matched, + n_last_matched)); + if (branch->n_last_matched != n_last_matched) + DPRINT(("*** mismatch between n_last_matched and unions ***\n")); + if (branch->cmp_tag > 0) + { + int i; + const char *sep = " tags="; + print_indent(indent); + DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags)); + for (i = 0; i < num_tags; i++) + if (bit_test(branch->tags, i)) + { + DPRINT(("%s%d", sep, i)); + sep = ","; + } + DPRINT(("\n")); + } + + u = branch->last_matched; + indent++; + while (u) + { + print_last_matched_pre(u, indent, num_tags); + u = u->next; + } +} + +static void +print_last_matched_pre(tre_last_matched_pre_t *lm, int indent, int num_tags) +{ + tre_last_matched_branch_pre_t *b = lm->branches; + int n_branches = 0; + + while (b) + { + n_branches++; + b = b->next; + } + + print_indent(indent); + DPRINT(("LAST_MATCHED: tot_branches=%d tot_last_matched=%d tot_tags=%d\n", + lm->tot_branches, lm->tot_last_matched, lm->tot_tags)); + print_indent(indent); + DPRINT(("..start_tag=%d n_branches=%d branches=%d\n", lm->start_tag, + lm->n_branches, n_branches)); + if (lm->n_branches != n_branches) + DPRINT(("*** mismatch between n and branches ***\n")); + + b = lm->branches; + indent++; + while (b) + { + print_last_match_branch_pre(b, indent, num_tags); + b = b->next; + } +} + +static void print_last_matched(tre_last_matched_t *lm, int indent); +static void +print_last_match_branch(tre_last_matched_branch_t *branch, int indent) +{ + tre_last_matched_t *u; + int i; + + print_indent(indent); + DPRINT(("BRANCH: n_last_matched=%d\n", branch->n_last_matched)); + if (branch->cmp_tag > 0) + { + print_indent(indent); + DPRINT(("..cmp_tag=%d n_tags=%d", branch->cmp_tag, branch->n_tags)); + if (branch->n_tags > 0) + { + const char *sep = " tags="; + for (i = 0; i < branch->n_tags; i++) + { + DPRINT(("%s%d", sep, branch->tags[i])); + sep = ","; + } + } + DPRINT(("\n")); + } + + u = branch->last_matched; + indent++; + for (i = branch->n_last_matched; i > 0; i--, u++) + print_last_matched(u, indent); +} + +static void +print_last_matched(tre_last_matched_t *lm, int indent) +{ + int i; + tre_last_matched_branch_t *b; + + print_indent(indent); + DPRINT(("LAST_MATCHED: n_branches=%d start_tag=%d\n", lm->n_branches, + lm->start_tag)); + + b = lm->branches; + indent++; + for (i = lm->n_branches; i > 0; i--, b++) + print_last_match_branch(b, indent); +} +#endif /* TRE_DEBUG */ + + +/* Merge the tre_last_matched_branch_pre_t of src into dst, creating a new + one if needed. If tag_id > 0, add that tag as well (a negative tag_id will + create an unset tre_last_matched_branch_pre_t. */ +static reg_errcode_t +tre_merge_branches(tre_mem_t mem, tre_ast_node_t *dst, tre_ast_node_t *src, + int tag_id, int num_tags) +{ + tre_last_matched_branch_pre_t *db = dst->last_matched_branch; + tre_last_matched_branch_pre_t *sb = (src ? src->last_matched_branch : NULL); + + if (db) + { + if (sb) + { + bitstr_t *l = db->tags; + bitstr_t *r = sb->tags; + int i = bitstr_size(num_tags); + + while(i-- > 0) + *l++ |= *r++; + /* db and sb are the info from two parallel sub-trees, so the tags + must be mutually exclusive, and we can just add their numbers */ + db->n_tags += sb->n_tags; + db->tot_tags += sb->tot_tags; + if (db->last_matched) + { + if (sb->last_matched) + { + tre_last_matched_pre_t *u = db->last_matched; + + while(u->next) + u = u->next; + u->next = sb->last_matched; + db->n_last_matched += sb->n_last_matched; + db->tot_branches += sb->tot_branches; + db->tot_last_matched += sb->tot_last_matched; + } + } + else if (sb->last_matched) + { + db->last_matched = sb->last_matched; + db->n_last_matched = sb->n_last_matched; + db->tot_branches = sb->tot_branches; + db->tot_last_matched = sb->tot_last_matched; + } + } + } + else + db = sb; + + if (tag_id != 0) + { + if (!db) + { + db = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t) + + bitstr_size(num_tags)); + if (db == NULL) + return REG_ESPACE; + db->tot_branches = 1; + } + if (tag_id > 0) + { + /* tag_id is a new tag, and shouldn't exist in db's tags, + so we can always increment n_tags */ + bit_set(db->tags, tag_id); + db->n_tags++; + db->tot_tags++; + } + } + dst->last_matched_branch = db; + return REG_OK; +} + + +/* Inserts a catenation node to the root of the tree given in `node'. + As the left child a new tag with number `tag_id' to `node' is added, + and the right child is the old root. */ +static reg_errcode_t +tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id) +{ + tre_catenation_t *c; + + DPRINT(("add_tag_left: tag %d\n", tag_id)); + + c = tre_mem_alloc(mem, sizeof(*c)); + if (c == NULL) + return REG_ESPACE; + c->left = tre_ast_new_literal(mem, TAG, tag_id, -1); + if (c->left == NULL) + return REG_ESPACE; + c->right = tre_mem_calloc(mem, sizeof(tre_ast_node_t)); + if (c->right == NULL) + return REG_ESPACE; + + c->right->obj = node->obj; + c->right->type = node->type; + c->right->last_matched_branch = node->last_matched_branch; + c->right->nullable = -1; + c->right->submatch_id = -1; + node->obj = c; + node->type = CATENATION; + node->original = c->right; + return REG_OK; +} + +/* Inserts a catenation node to the root of the tree given in `node'. + As the right child a new tag with number `tag_id' to `node' is added, + and the left child is the old root. */ +static reg_errcode_t +tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id) +{ + tre_catenation_t *c; + + DPRINT(("tre_add_tag_right: tag %d\n", tag_id)); + + c = tre_mem_alloc(mem, sizeof(*c)); + if (c == NULL) + return REG_ESPACE; + c->right = tre_ast_new_literal(mem, TAG, tag_id, -1); + if (c->right == NULL) + return REG_ESPACE; + c->left = tre_mem_calloc(mem, sizeof(tre_ast_node_t)); + if (c->left == NULL) + return REG_ESPACE; + + c->left->obj = node->obj; + c->left->type = node->type; + c->left->last_matched_branch = node->last_matched_branch; + c->left->nullable = -1; + c->left->submatch_id = -1; + node->obj = c; + node->type = CATENATION; + node->original = c->left; + return REG_OK; +} + +typedef enum { + ADDTAGS_RECURSE, + ADDTAGS_RECURSE_NOT_TOP_UNION, + ADDTAGS_AFTER_ITERATION, + ADDTAGS_AFTER_UNION_LEFT, + ADDTAGS_AFTER_UNION_RIGHT, + ADDTAGS_AFTER_CAT_LEFT, + ADDTAGS_AFTER_CAT_RIGHT, + ADDTAGS_SET_SUBMATCH_END, + ADDTAGS_UNION_RECURSE, + ADDTAGS_UNION_RIGHT_RECURSE, + ADDTAGS_AFTER_UNION_TOP, +} tre_addtags_symbol_t; + +enum { + COPY_LAST_MATCHED_BRANCH, + COPY_LAST_MATCHED_BRANCH_NEXT, + COPY_LAST_MATCHED, + COPY_LAST_MATCHED_NEXT, +}; + + +#define REGSET_UNSET ((unsigned)-1) + +/* Go through `regset' and set submatch data for submatches that are + using this tag. */ +static void +tre_purge_regset(unsigned *regset, tre_tnfa_t *tnfa, int tag) +{ + int i; + + for (i = 0; regset[i] != REGSET_UNSET; i++) + { + int id = regset[i] / 2; + int start = !(regset[i] % 2); + if (id >= SUBMATCH_ID_INVISIBLE_START) + continue; + DPRINT((" Using tag %d for %s offset of " + "submatch %d\n", tag, + start ? "start" : "end", id)); + if (start) + tnfa->submatch_data[id].so_tag = tag; + else + tnfa->submatch_data[id].eo_tag = tag; + } + regset[0] = -1; +} + + +#define REGSET_HAS_STARTS 0x1 +#define REGSET_HAS_ENDS 0x2 + + +/* Adds tags to appropriate locations in the parse tree in `tree', so that + subexpressions marked for submatch addressing can be traced. */ +static reg_errcode_t +tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, + tre_tnfa_t *tnfa) +{ + reg_errcode_t status = REG_OK; + tre_addtags_symbol_t symbol; + tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */ + int bottom = tre_stack_num_objects(stack); + /* True for first pass (counting number of needed tags) */ + int first_pass = (mem == NULL || tnfa == NULL); + unsigned *regset, *orig_regset; + int regset_contains = 0; + int num_tags = 0; /* Total number of tags. */ + int num_minimals = 0; /* Number of special minimal tags. */ + int tag = 0; /* The tag that is to be added next. */ + int next_tag = 1; /* Next tag to use after this one. */ + int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */ + int *reorder_tags = NULL; /* Tag reorder array: a pair for each reorder, + * the first is the tag to reorder, the second + * is the tag after which the first is reordered */ + int *rtp; /* Pointer used to fill in reorder_tags and + * tag_order */ + int *to_reorder; /* Transform array converting sequential order to + * that specified by reorder_tags */ + int id; + + tre_tag_direction_t direction = TRE_TAG_LEFT_MAXIMIZE; + if (!first_pass) + { + DPRINT(("Initializing direction to %s\n", tag_dir_str[direction])); + tnfa->end_tag = 0; + tnfa->minimal_tags[0] = -1; + } + + regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + + tnfa->num_submatches_invisible + 1) * 2)); + if (regset == NULL) + { + status = REG_ESPACE; + goto error_regset; + } + regset[0] = REGSET_UNSET; + orig_regset = regset; + + if (!first_pass) + { + /* Allocate all memory for reorder_tags, tag_order, to_seq_order and + * to_reorder in one batch (assuming all are the same type) */ + rtp = reorder_tags = xmalloc(sizeof(*reorder_tags) * + ((2 * tnfa->num_reorder_tags + 1) + + tnfa->num_tags)); + if (reorder_tags == NULL) + { + status = REG_ESPACE; + goto error_reorder_tags; + } + to_reorder = reorder_tags + (2 * tnfa->num_reorder_tags + 1); + } + + STACK_PUSH(stack, voidptr, node); + STACK_PUSH(stack, int, ADDTAGS_RECURSE); + + while (tre_stack_num_objects(stack) > bottom) + { + if (status != REG_OK) + break; + + symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); + switch (symbol) + { + int top_union; + + case ADDTAGS_SET_SUBMATCH_END: + { + int i; + + id = tre_stack_pop_int(stack); + node = tre_stack_pop_voidptr(stack); + /* Add end of this submatch to regset. */ + for (i = 0; regset[i] != REGSET_UNSET; i++); + regset[i] = id * 2 + 1; + regset[i + 1] = -1; + regset_contains |= REGSET_HAS_ENDS; + + /* Always put a tag after a minimal iterator. */ + if (minimal_tag >= 0) + { + if (first_pass) + { + node->num_tags++; + DPRINT((" ADDTAGS_SET_SUBMATCH_END: node->num_tags = %d\n", + node->num_tags)); + } + else + { + int i; + status = tre_merge_branches(mem, node, NULL, tag, + tnfa->num_tags); + if (status != REG_OK) + break; + status = tre_add_tag_right(mem, node, tag); + if (status != REG_OK) + break; + tnfa->tag_directions[tag] = TRE_TAG_MINIMIZE; + DPRINT(("Setting t%d direction to %s\n", tag, + tag_dir_str[tnfa->tag_directions[tag]])); + DPRINT(("Minimal %d, %d\n", minimal_tag, tag)); + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + + DPRINT((" Minimal end: t%d reordered to " + "after t%d\n", tag, minimal_tag)); + /* Append to tag_order, move "tag" after + * "minimal_tag" */ + *rtp++ = tag; + *rtp++ = minimal_tag; + + num_minimals++; + tre_purge_regset(regset, tnfa, tag); + } + + minimal_tag = -1; + DPRINT((" ADDTAGS_SET_SUBMATCH_END num_tags++ tag=%d\n", tag)); + regset[0] = REGSET_UNSET; + regset_contains = 0; + tag = next_tag; + num_tags++; + next_tag++; + } + break; + } + + case ADDTAGS_RECURSE_NOT_TOP_UNION: + /* Like ADDTAGS_RECURSE, except that top_union is set to zero, + * indicating that if a union is being processed, it is not the + * top-most of a series */ + top_union = 0; + goto do_addtags_recurse; + + case ADDTAGS_RECURSE: + /* Setting top_union to 1 means that if a union is begin processed, + * it is the top-most of a series, and should recurse through the + * series to set the left_tag and right_tag values */ + top_union = 1; + +do_addtags_recurse: + node = tre_stack_pop_voidptr(stack); + + id = node->submatch_id; + if (id >= 0) + { + int i; + + + /* Add start of this submatch to regset. */ + for (i = 0; regset[i] != REGSET_UNSET; i++); + regset[i] = id * 2; + regset[i + 1] = -1; + regset_contains |= REGSET_HAS_STARTS; + + /* Add end of this submatch to regset after processing this + node. */ + STACK_PUSH(stack, voidptr, node); + STACK_PUSHX(stack, int, id); + STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END); + } + + switch (node->type) + { + case LITERAL: + { + tre_literal_t *lit = node->obj; + + if (!IS_SPECIAL(lit) || IS_BACKREF(lit) || IS_EMPTY(lit) || IS_ASSERTION(lit)) + { + DPRINT(("Literal %d-%d\n", + (int)lit->code_min, (int)lit->code_max)); + if (regset_contains) + { + /* Regset is not empty, so add a tag before the + literal or backref. */ + if (first_pass) + { + DPRINT((" ADDTAGS_RECURSE:LITERAL node->num_tags = 1\n")); + node->num_tags = 1; + } + else + { + status = tre_merge_branches(mem, node, NULL, tag, + tnfa->num_tags); + if (status != REG_OK) + break; + status = tre_add_tag_left(mem, node, tag); + if (status != REG_OK) + break; + if (regset_contains == REGSET_HAS_STARTS) + tnfa->tag_directions[tag] = TRE_TAG_LEFT_MAXIMIZE; + else + tnfa->tag_directions[tag] = direction; + DPRINT(("Setting t%d direction to %s\n", tag, + tag_dir_str[tnfa->tag_directions[tag]])); + tre_purge_regset(regset, tnfa, tag); + + if (IS_BACKREF(lit)) + { + int b = lit->code_max; + int t = tnfa->submatch_data[b].so_tag; + /* Fail if the referenced submatch hasn't been + * completed yet */ + if (tnfa->submatch_data[b].eo_tag < 0) + { + status = REG_ESUBREG; + break; + } + if (t < tag) + { + DPRINT((" Backref %d start: " + "t%d reordered to before t%d\n", + b, tag, t)); + if(t > 0) + t--; + /* Append to tag_order, move "tag" after + * "t" */ + *rtp++ = tag; + *rtp++ = t; + } +#if TRE_DEBUG + else + DPRINT((" Backref %d start: " + "(t%d already before t%d)\n", + b, tag, t)); +#endif /* TRE_DEBUG */ + } + } + + DPRINT((" ADDTAGS_RECURSE:LITERAL num_tags++ tag=%d\n", + tag)); + regset[0] = REGSET_UNSET; + regset_contains = 0; + tag = next_tag; + num_tags++; + next_tag++; + } + } + else + { + assert(!IS_TAG(lit)); + } + break; + } + case CATENATION: + { + tre_catenation_t *cat = node->obj; + tre_ast_node_t *left = cat->left; + tre_ast_node_t *right = cat->right; + int reserved_tag = -1; + DPRINT(("Catenation, next_tag = %d\n", next_tag)); + + + /* After processing right child. */ + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT); + + /* Process right child. */ + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + + /* After processing left child. */ + STACK_PUSHX(stack, int, next_tag + left->num_tags); + DPRINT((" Pushing %d for after left\n", + next_tag + left->num_tags)); + if (left->num_tags > 0 && right->num_tags > 0) + { + /* Reserve the next tag to the right child. */ + DPRINT((" ADDTAGS_RECURSE:CATENATION num_tags++ " + "Reserving next_tag %d to right child\n", + next_tag)); + reserved_tag = next_tag; + next_tag++; + } + STACK_PUSHX(stack, int, reserved_tag); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT); + + /* Process left child. */ + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + + } + break; + case ITERATION: + { + tre_iteration_t *iter = node->obj; + DPRINT(("Iteration\n")); + + if (first_pass) + STACK_PUSHX(stack, int, regset_contains != 0); + STACK_PUSHX(stack, int, tag); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION); + + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); + + /* Regset is not empty, so add a tag here (this always happens + because iterators always get submatch id, even if in the + invisible range) */ + if (regset_contains) + { + if (!first_pass) + { + status = tre_merge_branches(mem, node, NULL, tag, + tnfa->num_tags); + if (status != REG_OK) + break; + status = tre_add_tag_left(mem, node, tag); + if (status != REG_OK) + break; + if (regset_contains == REGSET_HAS_STARTS && tag != 0) + tnfa->tag_directions[tag] = iter->minimal ? + TRE_TAG_MINIMIZE : + TRE_TAG_LEFT_MAXIMIZE; + else + tnfa->tag_directions[tag] = direction; + DPRINT(("Setting t%d direction to %s\n", tag, + tag_dir_str[tnfa->tag_directions[tag]])); + tre_purge_regset(regset, tnfa, tag); + } + + DPRINT((" ADDTAGS_RECURSE:ITERATION num_tags++ tag=%d\n", + tag)); + regset[0] = REGSET_UNSET; + regset_contains = 0; + tag = next_tag; + num_tags++; + next_tag++; + } + direction = TRE_TAG_LEFT_MAXIMIZE; + DPRINT((" Setting direction to %s\n", tag_dir_str[direction])); + } + break; + case UNION: + { + tre_union_t *uni; + tre_ast_node_t *left; + tre_ast_node_t *right; + int front_tag = -1; + + DPRINT(("Union\n")); + + if (regset_contains) + { + DPRINT((" UNION num_tags++ tag=%d\n", tag)); + front_tag = tag; + tag = next_tag; + num_tags++; + next_tag++; + } + + /* For the top union, walk the tree of consecutive unions, + * setting the left_tag and right_tag values in increasing + * order (left to right priority) */ + if (top_union && + (node->num_submatches - + (node->submatch_id >= 0 && + node->submatch_id < SUBMATCH_ID_INVISIBLE_START)) > 0) + { + tre_ast_node_t *n; + int last = tre_stack_num_objects(stack); + + STACK_PUSH(stack, voidptr, node); + STACK_PUSH(stack, int, ADDTAGS_UNION_RECURSE); + + while (tre_stack_num_objects(stack) > last) + { + symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); + switch (symbol) + { + case ADDTAGS_UNION_RECURSE: + n = tre_stack_pop_voidptr(stack); + uni = n->obj; + left = uni->left; + + /* Since the top union has num_submatches > 0, + * we set all the consecutive union's + * make_branches to 1 to force the generation + * of end tags for each union branch. */ + n->make_branches = 1; + + STACK_PUSH(stack, voidptr, n); + STACK_PUSH(stack, int, + ADDTAGS_UNION_RIGHT_RECURSE); + + if (left->type == UNION) + { + STACK_PUSH(stack, voidptr, left); + STACK_PUSH(stack, int, + ADDTAGS_UNION_RECURSE); + } + else + { + DPRINT((" ADDTAGS_UNION_RECURSE " + "num_tags++ tag=%d\n", tag)); + uni->left_tag = tag; + tag = next_tag; + num_tags++; + next_tag++; + } + break; + + case ADDTAGS_UNION_RIGHT_RECURSE: + n = tre_stack_pop_voidptr(stack); + uni = n->obj; + right = uni->right; + + if (right->type == UNION) + { + STACK_PUSH(stack, voidptr, right); + STACK_PUSH(stack, int, + ADDTAGS_UNION_RECURSE); + } + else + { + DPRINT((" ADDTAGS_UNION_RIGHT_RECURSE " + "num_tags++ tag=%d\n", tag)); + uni->right_tag = tag; + tag = next_tag; + num_tags++; + next_tag++; + } + + break; + + default: + assert(0); + break; + + } /* end switch(symbol) */ + } /* end while(tre_stack_num_objects(stack) > last */ + if (!first_pass) + { + STACK_PUSHX(stack, int, front_tag); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_TOP); + } + } /* end if (top_union && ...) */ + + uni = node->obj; + left = uni->left; + right = uni->right; + + /* After processing right child. */ + STACK_PUSHX(stack, voidptr, regset); + STACK_PUSHX(stack, int, regset_contains != 0); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT); + + /* Process right child. */ + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION); + + /* After processing left child. */ + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT); + + /* Process left child. */ + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE_NOT_TOP_UNION); + + /* Regset is not empty, so add a tag here. */ + if (regset_contains) + { + if (!first_pass) + { + status = tre_merge_branches(mem, node, NULL, front_tag, + tnfa->num_tags); + if (status != REG_OK) + break; + status = tre_add_tag_left(mem, node, front_tag); + if (status != REG_OK) + break; + if (regset_contains == REGSET_HAS_STARTS) + tnfa->tag_directions[front_tag] = TRE_TAG_LEFT_MAXIMIZE; + else + tnfa->tag_directions[front_tag] = direction; + DPRINT(("Setting t%d direction to %s\n", front_tag, + tag_dir_str[tnfa->tag_directions[front_tag]])); + tre_purge_regset(regset, tnfa, front_tag); + } + + regset[0] = REGSET_UNSET; + regset_contains = 0; + } + + break; + } + } /* end switch (node->type) */ + + break; /* end case: ADDTAGS_RECURSE */ + + case ADDTAGS_AFTER_ITERATION: + { + tre_iteration_t *iter; + tre_ast_node_t *orig; + int enter_tag; + + node = tre_stack_pop_voidptr(stack); + orig = node->original ? node->original : node; + iter = (tre_iteration_t *)orig->obj; + enter_tag = tre_stack_pop_int(stack); + if (iter->minimal) + minimal_tag = enter_tag; + + DPRINT(("After iteration\n")); + if (first_pass) + { + node->num_tags = iter->arg->num_tags + tre_stack_pop_int(stack); + DPRINT((" ADDTAGS_AFTER_ITERATION: node->num_tags = %d\n", + node->num_tags)); + } + else + { + /* node->last_matched_branch will have the start tag (the tag + just *before* the iteration). iter->arg->last_matched_branch + will have the tag(s) inside the iteration, the ones that + may need to be reset if the iteration doesn't match. So + before we merge iter->arg into node, we need to set up + a new tre_last_matched_t and tre_last_matched_branch_t, + using any of the inside tags as cmp_tag (we choose the first + tag found by bit_ffs). If there are no inside tags, we + don't bother creating the extra structures. */ + tre_last_matched_branch_pre_t *b = + iter->arg->last_matched_branch; + + if (b && b->n_tags > 0) + { + tre_last_matched_pre_t *u; + + bit_ffs(b->tags, num_tags, &b->cmp_tag); + DPRINT((" ADDTAGS_AFTER_ITERATION: n_tags=%d " + "cmp_tag = %d\n", b->n_tags, b->cmp_tag)); + + u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t) + + sizeof(tre_last_matched_branch_pre_t) + + bitstr_size(num_tags)); + if (!u) + { + status = REG_ESPACE; + break; + } + u->branches = b; + u->n_branches = 1; + u->start_tag = b->cmp_tag; + u->tot_branches = b->tot_branches; + u->tot_last_matched = 1 + b->tot_last_matched; + u->tot_tags = b->tot_tags; + + b = (tre_last_matched_branch_pre_t *)(u + 1); + b->last_matched = u; + b->n_last_matched = 1; + b->tot_branches = 1 + u->tot_branches; + b->tot_last_matched = u->tot_last_matched; + b->tot_tags = u->tot_tags; + + iter->arg->last_matched_branch = b; + } + status = tre_merge_branches(mem, node, iter->arg, 0, + tnfa->num_tags); + if (status != REG_OK) + break; + + if (iter->minimal) + { + /* Add a union with a left EMPTY literal and the right + being iter->arg. This should force the tags inside + the minimal iteration to prefer being unset */ + if (iter->min == 0 && iter->max <= 1) + { + tre_ast_node_t *u, *e; + + e = tre_ast_new_literal(mem, EMPTY, -1, -1); + if (e == NULL) + { + status = REG_ESPACE; + break; + } + u = tre_ast_new_union(mem, e, iter->arg); + if (u == NULL) + { + status = REG_ESPACE; + break; + } + iter->arg = u; + } + + direction = TRE_TAG_MINIMIZE; + } + else + direction = TRE_TAG_MAXIMIZE; + DPRINT((" Setting direction to %s\n", tag_dir_str[direction])); + } + break; + } + + case ADDTAGS_AFTER_CAT_LEFT: + { + int new_tag = tre_stack_pop_int(stack); + next_tag = tre_stack_pop_int(stack); + DPRINT(("After cat left, tag = %d, next_tag = %d\n", + tag, next_tag)); + if (new_tag >= 0) + { + DPRINT((" Setting tag to %d\n", new_tag)); + tag = new_tag; + } + break; + } + + case ADDTAGS_AFTER_CAT_RIGHT: + { + tre_catenation_t *cat; + + DPRINT(("After cat right\n")); + node = tre_stack_pop_voidptr(stack); + cat = node->obj; + if (first_pass) + { + node->num_tags = cat->left->num_tags + cat->right->num_tags; + DPRINT((" ADDTAGS_AFTER_CAT_RIGHT: node->num_tags = %d\n", + node->num_tags)); + } + else + { + status = tre_merge_branches(mem, cat->left, cat->right, 0, + tnfa->num_tags); + if (status != REG_OK) + break; + status = tre_merge_branches(mem, node, cat->left, 0, + tnfa->num_tags); + } + break; + } + + case ADDTAGS_AFTER_UNION_LEFT: + DPRINT(("After union left\n")); + /* Lift the bottom of the `regset' array so that when processing + the right operand the items currently in the array are + invisible. The original bottom was saved at ADDTAGS_UNION and + will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */ + while (*regset != REGSET_UNSET) + regset++; + regset_contains = 0; + break; + + case ADDTAGS_AFTER_UNION_RIGHT: + { + int added_tags; + tre_ast_node_t *orig; + tre_union_t *uni; + /* Note: node may not be a UNION, but a CATENATION with a left + * tag. So that is why we pass the original node->obj on the + * stack, to get the union's true values. */ + + DPRINT(("After union right\n")); + node = tre_stack_pop_voidptr(stack); + orig = node->original ? node->original : node; + uni = (tre_union_t *)orig->obj; + added_tags = tre_stack_pop_int(stack); + if (first_pass) + { + node->num_tags = uni->left->num_tags + uni->right->num_tags + + added_tags; + if (uni->left_tag > 0) + node->num_tags++; + if (uni->right_tag > 0) + node->num_tags++; + DPRINT((" ADDTAGS_AFTER_UNION_RIGHT: node->num_tags = %d\n", + node->num_tags)); + } + regset = tre_stack_pop_voidptr(stack); + + /* Add tags after both children, the left child gets a smaller + tag than the right child. This guarantees that we prefer + the left child over the right child. */ + /* XXX - This is not always necessary (if the children have + tags which must be seen for every match of that child). */ + if (!first_pass && node->make_branches) + { + tre_last_matched_branch_pre_t *lb = + uni->left->last_matched_branch; + tre_last_matched_branch_pre_t *rb = + uni->right->last_matched_branch; + tre_last_matched_pre_t *lu = + uni->left->last_matched_in_progress; + tre_last_matched_pre_t *ru = + uni->right->last_matched_in_progress; + tre_last_matched_pre_t *u; + /* We don't need to call tre_merge_branches because these + * tags don't participate in submatch ranges, so don't need + * to be recorded. But we do set the cmp_tag entry of the + * tre_last_matched_branch_pre_t, so we might call + * tre_merge_branches if we need to create an empty + * tre_last_matched_branch_pre_t. */ + if (uni->left_tag > 0) + { + DPRINT(("Setting t%d direction to maximize\n", + uni->left_tag)); + status = tre_add_tag_right(mem, uni->left, uni->left_tag); + if (status != REG_OK) + break; + tnfa->tag_directions[uni->left_tag] = TRE_TAG_MAXIMIZE; + if (!lb) + { + status = tre_merge_branches(mem, uni->left, NULL, -1, + tnfa->num_tags); + if (status != REG_OK) + break; + lb = uni->left->last_matched_branch; + } + lb->cmp_tag = uni->left_tag; + } + if (uni->right_tag > 0) + { + DPRINT(("Setting t%d direction to maximize\n", + uni->right_tag)); + status = tre_add_tag_right(mem, uni->right, uni->right_tag); + if (status != REG_OK) + break; + tnfa->tag_directions[uni->right_tag] = TRE_TAG_MAXIMIZE; + if (!rb) + { + status = tre_merge_branches(mem, uni->right, NULL, -1, + tnfa->num_tags); + if (status != REG_OK) + break; + rb = uni->right->last_matched_branch; + } + rb->cmp_tag = uni->right_tag; + } + /* Now merge the tre_last_matched_branch_pre_t into a + tre_last_matched_pre_t */ + if (lu == NULL) + { + if (ru == NULL) + { + /* Create a new tre_last_matched_pre_t */ + u = tre_mem_calloc(mem, sizeof(tre_last_matched_pre_t)); + if (!u) + { + status = REG_ESPACE; + break; + } + u->tot_last_matched = 1; + + if (lb) + { + u->branches = lb; + u->n_branches = 1; + u->tot_branches += lb->tot_branches; + u->tot_last_matched += lb->tot_last_matched; + u->tot_tags += lb->tot_tags; + if (rb) + { + lb->next = rb; + u->n_branches++; + u->tot_branches += rb->tot_branches; + u->tot_last_matched += rb->tot_last_matched; + u->tot_tags += rb->tot_tags; + } + } + else if (rb) + { + u->branches = rb; + u->n_branches = 1; + u->tot_branches += rb->tot_branches; + u->tot_last_matched += rb->tot_last_matched; + u->tot_tags += rb->tot_tags; + } + } + else + { + /* Use ru, and add lb */ + u = ru; + if (lb) + { + lb->next = u->branches; + u->branches = lb; + u->n_branches++; + u->tot_branches += lb->tot_branches; + u->tot_last_matched += lb->tot_last_matched; + u->tot_tags += lb->tot_tags; + } + } + } + else if (ru == NULL) + { + /* Use lu, and add rb */ + u = lu; + if (rb) + { + rb->next = u->branches; + u->branches = rb; + u->n_branches++; + u->tot_branches += rb->tot_branches; + u->tot_last_matched += rb->tot_last_matched; + u->tot_tags += rb->tot_tags; + } + } + else + { + /* Merge lu and ru into lu */ + if (lu->branches) + { + if (ru->branches) + { + tre_last_matched_branch_pre_t *b = lu->branches; + while (b->next) b = b->next; + b->next = ru->branches; + lu->n_branches += ru->n_branches; + } + } + else if (ru->branches) + { + lu->branches = ru->branches; + lu->n_branches = ru->n_branches; + } + lu->tot_branches += ru->tot_branches; + lu->tot_last_matched += ru->tot_last_matched - 1; + lu->tot_tags += ru->tot_tags; + u = lu; + } + node->last_matched_in_progress = u; + } + direction = TRE_TAG_MAXIMIZE; + break; + } + + case ADDTAGS_AFTER_UNION_TOP: /* only called when not first_pass */ + { + tre_last_matched_branch_pre_t *b; + tre_last_matched_pre_t *u; + int start_tag; + + DPRINT(("After union top\n")); + node = tre_stack_pop_voidptr(stack); + start_tag = tre_stack_pop_int(stack); + b = tre_mem_calloc(mem, sizeof(tre_last_matched_branch_pre_t) + + bitstr_size(num_tags)); + if (!b) + { + status = REG_ESPACE; + break; + } + + u = node->last_matched_in_progress; + u->start_tag = start_tag; + b->tot_branches = 1 + u->tot_branches; + b->tot_last_matched = u->tot_last_matched; + b->tot_tags = u->tot_tags; + b->last_matched = u; + b->n_last_matched = 1; + node->last_matched_branch = b; + node->last_matched_in_progress = NULL; + break; + } + + default: + assert(0); + break; + + } /* end switch(symbol) */ + } /* end while(tre_stack_num_objects(stack) > bottom) */ + + if (status != REG_OK) + { + DPRINT(("Error during %s pass\n", first_pass ? "first" : "second")); + goto error_post_compile; + } + + if (!first_pass) + { + int i; + if (num_tags != tnfa->num_tags) + { + DPRINT(("num_tags(%d) != tnfa->num_tags(%d)\n", num_tags, + tnfa->num_tags)); + status = REG_BADPAT; + goto error_post_compile; + } + + tre_purge_regset(regset, tnfa, tag); + DPRINT(("Setting t%d to %s\n", num_tags, + tag_dir_str[direction])); + tnfa->tag_directions[num_tags] = direction; + + if (rtp > reorder_tags + 2 * tnfa->num_reorder_tags) + { + DPRINT(("Processed %d reorder tags instead of %d\n", + (int)(rtp - reorder_tags) / 2, tnfa->num_reorder_tags)); + status = REG_BADPAT; + goto error_post_compile; + } + *rtp = -1; +#if TRE_DEBUG + if (reorder_tags[0] >= 0) + { + DPRINT(("reorder_tags:\n")); + for (rtp = reorder_tags; *rtp >= 0;) + { + DPRINT(("%d after ", *rtp++)); + DPRINT(("%d\n", *rtp++)); + } + } + else + DPRINT(("No reorder_tags\n")); +#endif /* TRE_DEBUG */ + + /* Initialize to_reorder */ + for (i = 0; i < num_tags; i++) + to_reorder[i] = i; + /* Use to_seq_order to convert reorder_tags values, and use those to + * reorder to_reorder */ + for (rtp = reorder_tags; *rtp >= 0;) + { + int j, high, low; + int ti = *rtp++; + + /* Skip reordering the final tag */ + if (ti >= num_tags) + { + DPRINT(("Skipping reorder of %d\n", ti)); + rtp++; + continue; + } + /* The number of the tag to reorder */ + high = to_reorder[ti]; + /* Reorder after this tag */ + low = to_reorder[*rtp++]; + + DPRINT(("ti=%d high=%d low=%d\n", ti, high, low)); + if (low > high) + { + DPRINT(("Tag %d already before %d\n", high, low)); + continue; + } + for (j = 0; j < num_tags; j++) + if (to_reorder[j] > low && to_reorder[j] < high) + to_reorder[j]++; + to_reorder[ti] = low + 1; +#ifdef TRE_DEBUG + DPRINT(("to_reorder=(")); + for (j = 0; j < num_tags; j++) + { + DPRINT(("%d", to_reorder[j])); + if (j < num_tags - 1) + DPRINT((",")); + } + DPRINT((")\n")); +#endif /* TRE_DEBUG */ + } + /* Determine if reordering in really necessary */ + { + int need_reorder = 0; + for (i = 0; i < num_tags; i++) + if(to_reorder[i] != i) + { + need_reorder = 1; + break; + } + /* If need_reorder is not set, free reorder_tags, and set to NULL, + * indicating no reordering is needed */ + if (!need_reorder) + { + DPRINT(("Don't need to reorder\n")); + xfree(reorder_tags); + reorder_tags = NULL; + } + } + } + + if (reorder_tags) + { + int i; + tre_tag_direction_t *new_tag_directions; +#if TRE_DEBUG + DPRINT(("to_reorder:")); + for (i = 0; i < num_tags; i++) + DPRINT((" %d->%d", i, to_reorder[i])); + DPRINT(("\n")); +#endif /* TRE_DEBUG */ + + DPRINT(("Reordering submatch_data\n")); + for (i = 0; i < tnfa->num_submatches; i++) + { +#if TRE_DEBUG + int so = tnfa->submatch_data[i].so_tag; + int eo = tnfa->submatch_data[i].eo_tag; +#endif /* TRE_DEBUG */ + tnfa->submatch_data[i].so_tag = + to_reorder[tnfa->submatch_data[i].so_tag]; + tnfa->submatch_data[i].eo_tag = + tnfa->submatch_data[i].eo_tag < num_tags ? + to_reorder[tnfa->submatch_data[i].eo_tag] : + tnfa->submatch_data[i].eo_tag; + DPRINT(("pmatch[%d]: {%d, %d}->{%d, %d}\n", i, so, eo, + tnfa->submatch_data[i].so_tag, + tnfa->submatch_data[i].eo_tag)); + } + + DPRINT(("Reordering tag_directions\n")); + /* We only allocate num_tags directions and reorder them. The + * num_tags-th direction (end tag) is left unchanged. */ + new_tag_directions = xmalloc(sizeof(*new_tag_directions) * num_tags); + if (new_tag_directions == NULL) + { + status = REG_ESPACE; + goto error_post_compile; + } + for (i = 0; i < num_tags; i++) + { + new_tag_directions[to_reorder[i]] = tnfa->tag_directions[i]; + } +#if TRE_DEBUG + for (i = 0; i < num_tags; i++) + { + DPRINT(("t%d %s->%s\n", i, + tag_dir_str[tnfa->tag_directions[i]], + tag_dir_str[new_tag_directions[i]])); + } + DPRINT(("t%d %s->%s\n", num_tags, + tag_dir_str[tnfa->tag_directions[num_tags]], + tag_dir_str[tnfa->tag_directions[num_tags]])); +#endif /* TRE_DEBUG */ + memcpy(tnfa->tag_directions, new_tag_directions, sizeof(*new_tag_directions) * num_tags); + xfree(new_tag_directions); + + DPRINT(("Reordering minimal_tags\n")); + for (i = 0; tnfa->minimal_tags[i] >= 0; i++) + tnfa->minimal_tags[i] = tnfa->minimal_tags[i] < num_tags ? + to_reorder[tnfa->minimal_tags[i]] : + tnfa->minimal_tags[i]; + + DPRINT(("Reordering AST tags\n")); + STACK_PUSH(stack, voidptr, tree); + while (status == REG_OK && tre_stack_num_objects(stack) > bottom) + { + node = tre_stack_pop_voidptr(stack); + + switch (node->type) + { + case LITERAL: + { + tre_literal_t *lit = (tre_literal_t *)node->obj; + if (IS_TAG(lit)) + lit->code_max = to_reorder[lit->code_max]; + break; + } + + case UNION: + { + tre_union_t *uni = (tre_union_t *)node->obj; + STACK_PUSHX(stack, voidptr, uni->right); + STACK_PUSHX(stack, voidptr, uni->left); + break; + } + + case CATENATION: + { + tre_catenation_t *cat = (tre_catenation_t *)node->obj; + STACK_PUSHX(stack, voidptr, cat->right); + STACK_PUSHX(stack, voidptr, cat->left); + break; + } + + case ITERATION: + { + tre_iteration_t *iter = (tre_iteration_t *)node->obj; + STACK_PUSHX(stack, voidptr, iter->arg); + break; + } + + default: + assert(0); + break; + } + } + if (status != REG_OK) + { + DPRINT(("Error while reordering tags\n")); + goto error_post_compile; + } + } + + + if (!first_pass) + { + if (tree->last_matched_branch) + { + tre_last_matched_branch_t *buf, *b, *bb; + tre_last_matched_branch_pre_t *bp; + tre_last_matched_t *u, *uu; + tre_last_matched_pre_t *up; + int *t; + int i; +#ifdef TRE_DEBUG + tre_last_matched_branch_t *_b; + tre_last_matched_t *_u; + int *_t; + + DPRINT(("last_match_branch_pre:\n")); + print_last_match_branch_pre(tree->last_matched_branch, 0, num_tags); +#endif /* TRE_DEBUG */ + buf = (tre_last_matched_branch_t *)xcalloc(1, + tree->last_matched_branch->tot_branches + * sizeof(tre_last_matched_branch_t) + + tree->last_matched_branch->tot_last_matched + * sizeof(tre_last_matched_t) + + tree->last_matched_branch->tot_tags * + sizeof(int)); + if (!buf) + { + status = REG_ESPACE; + goto error_post_compile; + } + + b = buf; + u = (tre_last_matched_t *)(b + + tree->last_matched_branch->tot_branches); + t = (int *)(u + tree->last_matched_branch->tot_last_matched); +#ifdef TRE_DEBUG + _b = b; + _u = u; + _t = t; +#endif /* TRE_DEBUG */ + DPRINT(("Copying info_pre to info\n")); + STACK_PUSH(stack, voidptr, tree->last_matched_branch); + STACK_PUSH(stack, int, 1); + STACK_PUSH(stack, int, COPY_LAST_MATCHED_BRANCH); + + while (status == REG_OK && tre_stack_num_objects(stack) > bottom) + { + switch (tre_stack_pop_int(stack)) + { + case COPY_LAST_MATCHED_BRANCH: + i = tre_stack_pop_int(stack); + /* The tre_last_matched_branch_pre_t * is still on the + stack */ + STACK_PUSHX(stack, voidptr, b); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT); + b += i; + break; + + case COPY_LAST_MATCHED_BRANCH_NEXT: + bb = tre_stack_pop_voidptr(stack); + bp = tre_stack_pop_voidptr(stack); + bb->n_last_matched = bp->n_last_matched; + bb->cmp_tag = bp->cmp_tag; + if (bp->n_tags > 0) + { + int n; + n = bb->n_tags = bp->n_tags; + bb->tags = t; + for (i = 0; i < num_tags; i++) + if (bit_test(bp->tags, i)) + { + *t++ = i; + if (--n <= 0) + break; + } + } + if (bp->next) + { + STACK_PUSHX(stack, voidptr, bp->next); + STACK_PUSHX(stack, voidptr, bb + 1); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH_NEXT); + } + if (bp->n_last_matched > 0) + { + bb->last_matched = u; + STACK_PUSHX(stack, voidptr, bp->last_matched); + STACK_PUSHX(stack, int, bp->n_last_matched); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED); + } + break; + + case COPY_LAST_MATCHED: + i = tre_stack_pop_int(stack); + /* The tre_last_matched_pre_t * is still on the stack */ + STACK_PUSHX(stack, voidptr, u); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT); + u += i; + break; + + case COPY_LAST_MATCHED_NEXT: + uu = tre_stack_pop_voidptr(stack); + up = tre_stack_pop_voidptr(stack); + uu->n_branches = up->n_branches; + uu->branches = b; + uu->start_tag = up->start_tag; + if (up->next) + { + STACK_PUSHX(stack, voidptr, up->next); + STACK_PUSHX(stack, voidptr, uu + 1); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_NEXT); + } + STACK_PUSHX(stack, voidptr, up->branches); + STACK_PUSHX(stack, int, up->n_branches); + STACK_PUSHX(stack, int, COPY_LAST_MATCHED_BRANCH); + break; + } + } + if (status != REG_OK) + goto error_post_compile; +#ifdef TRE_DEBUG + DPRINT(("last_matched_branch:\n")); + print_last_match_branch(buf, 0); + if (b != _b + tree->last_matched_branch->tot_branches) + DPRINT(("b/%p != _b + tree->last_matched_branch->tot_branches/%p\n", + b, _b + tree->last_matched_branch->tot_branches)); + if (u != _u + tree->last_matched_branch->tot_last_matched) + DPRINT(("u/%p != _u + " + "tree->last_matched_branch->tot_last_matched/%p\n", + u, _u + tree->last_matched_branch->tot_last_matched)); + if (t != _t + tree->last_matched_branch->tot_tags) + DPRINT(("t/%p != _t + tree->last_matched_branch->tot_tags/%p\n", + t, _t + tree->last_matched_branch->tot_tags)); +#endif /* TRE_DEBUG */ + tnfa->last_matched_branch = buf; + } +#ifdef TRE_DEBUG + else + DPRINT(("No last_match_branch_pre\n")); +#endif /* TRE_DEBUG */ + } + + DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n", + first_pass? "First pass" : "Second pass", num_tags)); +#ifdef TRE_DEBUG + tre_ast_print(tree); +#endif /* TRE_DEBUG */ + DPRINT(("tre_add_tags: tree->num_tags=%d num_tags=%d\n", tree->num_tags, + num_tags)); + assert(tree->num_tags == num_tags); + tnfa->end_tag = num_tags; + tnfa->num_tags = num_tags; + tnfa->num_minimals = num_minimals; +error_post_compile: + xfree(reorder_tags); +error_reorder_tags: + xfree(orig_regset); +error_regset: + return status; +} + + + +/* + AST to TNFA compilation routines. +*/ + +typedef enum { + COPY_RECURSE, + COPY_SET_RESULT_PTR +} tre_copyast_symbol_t; + +/* Flags for tre_copy_ast(). */ +#define COPY_REMOVE_TAGS 1 +#define COPY_MAXIMIZE_FIRST_TAG 2 + +static reg_errcode_t +tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, + int flags, int *pos_add, tre_tag_direction_t *tag_directions, + tre_ast_node_t **copy, int *max_pos) +{ + reg_errcode_t status = REG_OK; + int bottom = tre_stack_num_objects(stack); + int num_copied = 0; + int first_tag = 1; + tre_ast_node_t **result = copy; + tre_copyast_symbol_t symbol; + + STACK_PUSH(stack, voidptr, ast); + STACK_PUSH(stack, int, COPY_RECURSE); + + while (status == REG_OK && tre_stack_num_objects(stack) > bottom) + { + tre_ast_node_t *node; + if (status != REG_OK) + break; + + symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack); + switch (symbol) + { + case COPY_SET_RESULT_PTR: + result = tre_stack_pop_voidptr(stack); + break; + case COPY_RECURSE: + node = tre_stack_pop_voidptr(stack); + switch (node->type) + { + case LITERAL: + { + tre_literal_t *lit = node->obj; + int pos = lit->position; + int min = lit->code_min; + int max = lit->code_max; + tre_bracket_match_list_t *list = !IS_SPECIAL(lit) ? + lit->u.bracket_match_list : + NULL; + if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) + { + /* XXX - e.g. [ab] has only one position but two + nodes, so we are creating holes in the state space + here. Not fatal, just wastes memory. */ + pos += *pos_add; + num_copied++; + } + else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS)) + { + /* Change this tag to empty. */ + min = EMPTY; + max = pos = -1; + } + else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG) + && first_tag) + { + /* Maximize the first tag. */ + if (tag_directions[max] == TRE_TAG_LEFT_MAXIMIZE) + tag_directions[max] = TRE_TAG_MAXIMIZE; + first_tag = 0; + } + *result = tre_ast_new_literal(mem, min, max, pos); + if (*result == NULL) + status = REG_ESPACE; + + if (pos > *max_pos) + *max_pos = pos; + + if (!IS_SPECIAL(lit)) + ((tre_literal_t *)(*result)->obj)->u.bracket_match_list + = list; + break; + } + case UNION: + { + tre_union_t *uni = node->obj; + tre_union_t *tmp; + *result = tre_ast_new_union(mem, uni->left, uni->right); + if (*result == NULL) + { + status = REG_ESPACE; + break; + } + tmp = (*result)->obj; + result = &tmp->left; + STACK_PUSHX(stack, voidptr, uni->right); + STACK_PUSHX(stack, int, COPY_RECURSE); + STACK_PUSHX(stack, voidptr, &tmp->right); + STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); + STACK_PUSHX(stack, voidptr, uni->left); + STACK_PUSHX(stack, int, COPY_RECURSE); + break; + } + case CATENATION: + { + tre_catenation_t *cat = node->obj; + tre_catenation_t *tmp; + *result = tre_ast_new_catenation(mem, cat->left, cat->right); + if (*result == NULL) + { + status = REG_ESPACE; + break; + } + tmp = (*result)->obj; + tmp->left = NULL; + tmp->right = NULL; + result = &tmp->left; + + STACK_PUSHX(stack, voidptr, cat->right); + STACK_PUSHX(stack, int, COPY_RECURSE); + STACK_PUSHX(stack, voidptr, &tmp->right); + STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); + STACK_PUSHX(stack, voidptr, cat->left); + STACK_PUSHX(stack, int, COPY_RECURSE); + break; + } + case ITERATION: + { + tre_iteration_t *iter = node->obj; + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, COPY_RECURSE); + *result = tre_ast_new_iter(mem, iter->arg, iter->min, + iter->max, iter->minimal); + if (*result == NULL) + { + status = REG_ESPACE; + break; + } + iter = (*result)->obj; + result = &iter->arg; + break; + } + default: + assert(0); + break; + } + break; + } + } + *pos_add += num_copied; + return status; +} + +typedef enum { + EXPAND_RECURSE, + EXPAND_AFTER_ITER +} tre_expand_ast_symbol_t; + +/* Expands each iteration node that has a finite nonzero minimum or maximum + iteration count to a catenated sequence of copies of the node. */ +static reg_errcode_t +tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, + int *position, tre_tag_direction_t *tag_directions, + int *max_depth) +{ + reg_errcode_t status = REG_OK; + int bottom = tre_stack_num_objects(stack); + int pos_add = 0; + int pos_add_total = 0; + int max_pos = 0; +#ifdef TRE_APPROX + /* Current approximate matching parameters. */ + int params[TRE_PARAM_LAST]; + /* Approximate parameter nesting level. */ + int params_depth = 0; +#endif /* TRE_APPROX */ + int iter_depth = 0; +#ifdef TRE_APPROX + int i; +#endif /* TRE_APPROX */ + +#ifdef TRE_APPROX + for (i = 0; i < TRE_PARAM_LAST; i++) + params[i] = TRE_PARAM_DEFAULT; +#endif /* TRE_APPROX */ + + STACK_PUSHR(stack, voidptr, ast); + STACK_PUSHR(stack, int, EXPAND_RECURSE); + while (status == REG_OK && tre_stack_num_objects(stack) > bottom) + { + tre_ast_node_t *node; + tre_expand_ast_symbol_t symbol; + + if (status != REG_OK) + break; + + DPRINT(("pos_add %d\n", pos_add)); + + symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack); + node = tre_stack_pop_voidptr(stack); + switch (symbol) + { + case EXPAND_RECURSE: + switch (node->type) + { + case LITERAL: + { + tre_literal_t *lit= node->obj; + if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) + { + lit->position += pos_add; + if (lit->position > max_pos) + max_pos = lit->position; + } + break; + } + case UNION: + { + tre_union_t *uni = node->obj; + STACK_PUSHX(stack, voidptr, uni->right); + STACK_PUSHX(stack, int, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, uni->left); + STACK_PUSHX(stack, int, EXPAND_RECURSE); + break; + } + case CATENATION: + { + tre_catenation_t *cat = node->obj; + STACK_PUSHX(stack, voidptr, cat->right); + STACK_PUSHX(stack, int, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, cat->left); + STACK_PUSHX(stack, int, EXPAND_RECURSE); + break; + } + case ITERATION: + { + tre_iteration_t *iter = node->obj; + STACK_PUSHX(stack, int, pos_add); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, EXPAND_AFTER_ITER); + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, EXPAND_RECURSE); + /* If we are going to expand this node at EXPAND_AFTER_ITER + then don't increase the `pos' fields of the nodes now, it + will get done when expanding. */ + if (iter->min > 1 || iter->max > 1) + pos_add = 0; + iter_depth++; + DPRINT(("iter\n")); + break; + } + default: + assert(0); + break; + } + break; + case EXPAND_AFTER_ITER: + { + tre_iteration_t *iter = node->obj; + int pos_add_last; + pos_add = tre_stack_pop_int(stack); + pos_add_last = pos_add; + /* Originally (in tre_parse_bound), if min == 0 && max == 0, we + immediate replace the whole iteration with EMPTY. This + unfortunately drops any submatches, and messes up setting the + pmatch values (we can get tags of -1, and tag values in the + billions). So we left it there and replace with EMPTY here. */ + if (iter->min == 0 && iter->max == 0) + { + tre_ast_node_t *empty = tre_ast_new_literal(mem, EMPTY, -1, -1); + if (empty == NULL) + return REG_ESPACE; + node->obj = empty->obj; + node->type = empty->type; + } + else if (iter->min > 1 || iter->max > 1) + { + tre_ast_node_t *seq1 = NULL, *seq2 = NULL; + int j; + int pos_add_save = pos_add; + + /* Create a catenated sequence of copies of the node. */ + for (j = 0; j < iter->min; j++) + { + tre_ast_node_t *copy; + /* Remove tags from all but the last copy. */ + int flags = ((j + 1 < iter->min) + ? COPY_REMOVE_TAGS + : COPY_MAXIMIZE_FIRST_TAG); + DPRINT((" pos_add %d\n", pos_add)); + pos_add_save = pos_add; + status = tre_copy_ast(mem, stack, iter->arg, flags, + &pos_add, tag_directions, ©, + &max_pos); + if (status != REG_OK) + return status; + if (seq1 != NULL) + seq1 = tre_ast_new_catenation(mem, seq1, copy); + else + seq1 = copy; + if (seq1 == NULL) + return REG_ESPACE; + } + + if (iter->max == -1) + { + /* No upper limit. */ + pos_add_save = pos_add; + status = tre_copy_ast(mem, stack, iter->arg, 0, + &pos_add, NULL, &seq2, &max_pos); + if (status != REG_OK) + return status; + seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0); + if (seq2 == NULL) + return REG_ESPACE; + } + else + { + for (j = iter->min; j < iter->max; j++) + { + tre_ast_node_t *tmp, *copy; + pos_add_save = pos_add; + status = tre_copy_ast(mem, stack, iter->arg, 0, + &pos_add, NULL, ©, &max_pos); + if (status != REG_OK) + return status; + if (seq2 != NULL) + seq2 = tre_ast_new_catenation(mem, copy, seq2); + else + seq2 = copy; + if (seq2 == NULL) + return REG_ESPACE; + tmp = tre_ast_new_literal(mem, EMPTY, -1, -1); + if (tmp == NULL) + return REG_ESPACE; + seq2 = tre_ast_new_union(mem, tmp, seq2); + if (seq2 == NULL) + return REG_ESPACE; + } + } + + pos_add = pos_add_save; + if (seq1 == NULL) + seq1 = seq2; + else if (seq2 != NULL) + seq1 = tre_ast_new_catenation(mem, seq1, seq2); + if (seq1 == NULL) + return REG_ESPACE; + node->obj = seq1->obj; + node->type = seq1->type; + } + + iter_depth--; + pos_add_total += pos_add - pos_add_last; + if (iter_depth == 0) + pos_add = pos_add_total; + +#ifdef TRE_APPROX + /* If approximate parameters are specified, surround the result + with two parameter setting nodes. The one on the left sets + the specified parameters, and the one on the right restores + the old parameters. */ + if (iter->params) + { + tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy; + int *old_params; + + tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1); + if (!tmp_l) + return REG_ESPACE; + ((tre_literal_t *)tmp_l->obj)->u.params = iter->params; + iter->params[TRE_PARAM_DEPTH] = params_depth + 1; + tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1); + if (!tmp_r) + return REG_ESPACE; + old_params = tre_mem_alloc(mem, sizeof(*old_params) + * TRE_PARAM_LAST); + if (!old_params) + return REG_ESPACE; + for (i = 0; i < TRE_PARAM_LAST; i++) + old_params[i] = params[i]; + ((tre_literal_t *)tmp_r->obj)->u.params = old_params; + old_params[TRE_PARAM_DEPTH] = params_depth; + /* XXX - this is the only place where ast_new_node is + needed -- should be moved inside AST module. */ + node_copy = tre_ast_new_node(mem, ITERATION, + sizeof(tre_iteration_t)); + if (!node_copy) + return REG_ESPACE; + node_copy->obj = node->obj; + tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy); + if (!tmp_node) + return REG_ESPACE; + tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r); + if (!tmp_node) + return REG_ESPACE; + /* Replace the contents of `node' with `tmp_node'. */ + memcpy(node, tmp_node, sizeof(*node)); + node->obj = tmp_node->obj; + node->type = tmp_node->type; + params_depth++; + if (params_depth > *max_depth) + *max_depth = params_depth; + } +#endif /* TRE_APPROX */ + break; + } + default: + assert(0); + break; + } + } + + *position += pos_add_total; + + /* `max_pos' should never be larger than `*position' if the above + code works, but just an extra safeguard let's make sure + `*position' is set large enough so enough memory will be + allocated for the transition table. */ + if (max_pos > *position) + *position = max_pos; + +#ifdef TRE_DEBUG + DPRINT(("Expanded AST:\n")); + tre_ast_print(ast); + DPRINT(("*position %d, max_pos %d\n", *position, max_pos)); +#endif + + return status; +} + +static tre_pos_and_tags_t * +tre_set_empty(tre_mem_t mem) +{ + tre_pos_and_tags_t *new_set; + + new_set = tre_mem_calloc(mem, sizeof(*new_set)); + if (new_set == NULL) + return NULL; + + new_set[0].position = -1; + new_set[0].code_min = -1; + new_set[0].code_max = -1; + + return new_set; +} + +static tre_pos_and_tags_t * +tre_set_one(tre_mem_t mem, int position, int code_min, int code_max, + tre_bracket_match_list_t *bracket_match_list, int backref) +{ + tre_pos_and_tags_t *new_set; + + new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2); + if (new_set == NULL) + return NULL; + + new_set[0].position = position; + new_set[0].code_min = code_min; + new_set[0].code_max = code_max; + new_set[0].bracket_match_list = bracket_match_list; + new_set[0].backref = backref; + new_set[1].position = -1; + new_set[1].code_min = -1; + new_set[1].code_max = -1; + + return new_set; +} + +static tre_pos_and_tags_t * +tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2, + int *tags, int assertions, int *params) +{ + int s1, s2, i, j; + tre_pos_and_tags_t *new_set; + int *new_tags; + int num_tags; + + for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++); + for (s1 = 0; set1[s1].position >= 0; s1++); + for (s2 = 0; set2[s2].position >= 0; s2++); + new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1)); + if (!new_set ) + return NULL; + + for (s1 = 0; set1[s1].position >= 0; s1++) + { + new_set[s1].position = set1[s1].position; + new_set[s1].code_min = set1[s1].code_min; + new_set[s1].code_max = set1[s1].code_max; + new_set[s1].assertions = set1[s1].assertions | assertions; + new_set[s1].bracket_match_list = set1[s1].bracket_match_list; + new_set[s1].backref = set1[s1].backref; + if (set1[s1].tags == NULL && tags == NULL) + new_set[s1].tags = NULL; + else + { + for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++); + new_tags = tre_mem_alloc(mem, (sizeof(*new_tags) + * (i + num_tags + 1))); + if (new_tags == NULL) + return NULL; + for (j = 0; j < i; j++) + new_tags[j] = set1[s1].tags[j]; + for (i = 0; i < num_tags; i++) + new_tags[j + i] = tags[i]; + new_tags[j + i] = -1; + new_set[s1].tags = new_tags; + } + if (set1[s1].params) + new_set[s1].params = set1[s1].params; + if (params) + { + if (!new_set[s1].params) + new_set[s1].params = params; + else + { + new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) * + TRE_PARAM_LAST); + if (!new_set[s1].params) + return NULL; + for (i = 0; i < TRE_PARAM_LAST; i++) + if (params[i] != TRE_PARAM_UNSET) + new_set[s1].params[i] = params[i]; + } + } + } + + for (s2 = 0; set2[s2].position >= 0; s2++) + { + new_set[s1 + s2].position = set2[s2].position; + new_set[s1 + s2].code_min = set2[s2].code_min; + new_set[s1 + s2].code_max = set2[s2].code_max; + /* XXX - why not | assertions here as well? */ + new_set[s1 + s2].assertions = set2[s2].assertions; + new_set[s1 + s2].bracket_match_list = set2[s2].bracket_match_list; + new_set[s1 + s2].backref = set2[s2].backref; + if (set2[s2].tags == NULL) + new_set[s1 + s2].tags = NULL; + else + { + for (i = 0; set2[s2].tags[i] >= 0; i++); + new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1)); + if (new_tags == NULL) + return NULL; + for (j = 0; j < i; j++) + new_tags[j] = set2[s2].tags[j]; + new_tags[j] = -1; + new_set[s1 + s2].tags = new_tags; + } + if (set2[s2].params) + new_set[s1 + s2].params = set2[s2].params; + if (params) + { + if (!new_set[s1 + s2].params) + new_set[s1 + s2].params = params; + else + { + new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) * + TRE_PARAM_LAST); + if (!new_set[s1 + s2].params) + return NULL; + for (i = 0; i < TRE_PARAM_LAST; i++) + if (params[i] != TRE_PARAM_UNSET) + new_set[s1 + s2].params[i] = params[i]; + } + } + } + new_set[s1 + s2].position = -1; + return new_set; +} + +/* Finds the empty path through `node' which is the one that should be + taken according to POSIX.2 rules, and adds the tags on that path to + `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is + set to the number of tags seen on the path. */ +static reg_errcode_t +tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, + int *assertions, int *params, int *num_tags_seen, + int *params_seen) +{ + tre_literal_t *lit; + tre_union_t *uni; + tre_catenation_t *cat; + tre_iteration_t *iter; + int i; + int bottom = tre_stack_num_objects(stack); + reg_errcode_t status = REG_OK; + if (num_tags_seen) + *num_tags_seen = 0; + if (params_seen) + *params_seen = 0; + + status = tre_stack_push_voidptr(stack, node); + + /* Walk through the tree recursively. */ + while (status == REG_OK && tre_stack_num_objects(stack) > bottom) + { + node = tre_stack_pop_voidptr(stack); + + switch (node->type) + { + case LITERAL: + lit = (tre_literal_t *)node->obj; + switch (lit->code_min) + { + case TAG: + if (lit->code_max >= 0) + { + if (tags != NULL) + { + /* Add the tag to `tags'. */ + for (i = 0; tags[i] >= 0; i++) + if (tags[i] == lit->code_max) + break; + if (tags[i] < 0) + { + tags[i] = lit->code_max; + tags[i + 1] = -1; + } + } + if (num_tags_seen) + (*num_tags_seen)++; + } + break; + case ASSERTION: + assert(lit->code_max >= 1 + || lit->code_max <= ASSERT_LAST); + if (assertions != NULL) + *assertions |= lit->code_max; + break; + case PARAMETER: + if (params != NULL) + for (i = 0; i < TRE_PARAM_LAST; i++) + params[i] = lit->u.params[i]; + if (params_seen != NULL) + *params_seen = 1; + break; + case EMPTY: + break; + default: + assert(0); + break; + } + break; + + case UNION: + /* Subexpressions starting earlier take priority over ones + starting later, so we prefer the left subexpression over the + right subexpression. */ + uni = (tre_union_t *)node->obj; + if (uni->left->nullable) + STACK_PUSHX(stack, voidptr, uni->left) + else if (uni->right->nullable) + STACK_PUSHX(stack, voidptr, uni->right) + else + assert(0); + break; + + case CATENATION: + /* The path must go through both children. */ + cat = (tre_catenation_t *)node->obj; + assert(cat->left->nullable); + assert(cat->right->nullable); + STACK_PUSHX(stack, voidptr, cat->left); + STACK_PUSHX(stack, voidptr, cat->right); + break; + + case ITERATION: + /* A match with an empty string is preferred over no match at + all, so we go through the argument if possible. */ + iter = (tre_iteration_t *)node->obj; + if (iter->arg->nullable) + STACK_PUSHX(stack, voidptr, iter->arg); + break; + + default: + assert(0); + break; + } + } + + return status; +} + + +typedef enum { + NFL_RECURSE, + NFL_POST_UNION, + NFL_POST_CATENATION, + NFL_POST_ITERATION +} tre_nfl_stack_symbol_t; + + +/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for + the nodes of the AST `tree'. */ +static reg_errcode_t +tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) +{ + int bottom = tre_stack_num_objects(stack); + + STACK_PUSHR(stack, voidptr, tree); + STACK_PUSHR(stack, int, NFL_RECURSE); + + while (tre_stack_num_objects(stack) > bottom) + { + tre_nfl_stack_symbol_t symbol; + tre_ast_node_t *node; + + symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack); + node = tre_stack_pop_voidptr(stack); + switch (symbol) + { + case NFL_RECURSE: + switch (node->type) + { + case LITERAL: + { + tre_literal_t *lit = (tre_literal_t *)node->obj; + if (IS_BACKREF(lit)) + { + /* Back references: nullable = false, firstpos = {i}, + lastpos = {i}. */ + node->nullable = 0; + node->firstpos = tre_set_one(mem, lit->position, 0, + TRE_CHAR_MAX, NULL, -1); + if (!node->firstpos) + return REG_ESPACE; + node->lastpos = tre_set_one(mem, lit->position, 0, + TRE_CHAR_MAX, NULL, + (int)lit->code_max); + if (!node->lastpos) + return REG_ESPACE; + } + else if (lit->code_min < 0) + { + /* Tags, empty strings, params, and zero width assertions: + nullable = true, firstpos = {}, and lastpos = {}. */ + node->nullable = 1; + node->firstpos = tre_set_empty(mem); + if (!node->firstpos) + return REG_ESPACE; + node->lastpos = tre_set_empty(mem); + if (!node->lastpos) + return REG_ESPACE; + } + else + { + /* Literal at position i: nullable = false, firstpos = {i}, + lastpos = {i}. */ + node->nullable = 0; + node->firstpos = + tre_set_one(mem, lit->position, (int)lit->code_min, + (int)lit->code_max, NULL, -1); + if (!node->firstpos) + return REG_ESPACE; + node->lastpos = tre_set_one(mem, lit->position, + (int)lit->code_min, + (int)lit->code_max, + lit->u.bracket_match_list, + -1); + if (!node->lastpos) + return REG_ESPACE; + } + break; + } + + case UNION: + /* Compute the attributes for the two subtrees, and after that + for this node. */ + STACK_PUSHR(stack, voidptr, node); + STACK_PUSHR(stack, int, NFL_POST_UNION); + STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right); + STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left); + STACK_PUSHR(stack, int, NFL_RECURSE); + break; + + case CATENATION: + /* Compute the attributes for the two subtrees, and after that + for this node. */ + STACK_PUSHR(stack, voidptr, node); + STACK_PUSHR(stack, int, NFL_POST_CATENATION); + STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right); + STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left); + STACK_PUSHR(stack, int, NFL_RECURSE); + break; + + case ITERATION: + /* Compute the attributes for the subtree, and after that for + this node. */ + STACK_PUSHR(stack, voidptr, node); + STACK_PUSHR(stack, int, NFL_POST_ITERATION); + STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg); + STACK_PUSHR(stack, int, NFL_RECURSE); + break; + } + break; /* end case: NFL_RECURSE */ + + case NFL_POST_UNION: + { + tre_union_t *uni = (tre_union_t *)node->obj; + node->nullable = uni->left->nullable || uni->right->nullable; + node->firstpos = tre_set_union(mem, uni->left->firstpos, + uni->right->firstpos, NULL, 0, NULL); + if (!node->firstpos) + return REG_ESPACE; + node->lastpos = tre_set_union(mem, uni->left->lastpos, + uni->right->lastpos, NULL, 0, NULL); + if (!node->lastpos) + return REG_ESPACE; + break; + } + + case NFL_POST_ITERATION: + { + int num_tags, *tags, assertions, params_seen; + int *params; + reg_errcode_t status; + tre_iteration_t *iter = (tre_iteration_t *)node->obj; + + /* From Ville Laurikari's original 2001 Master's thesis, the + firstpos(n) and lastpos(n) of an iteration is just the + corresponding values of the iteration's argument. Unfortunately, + this isn't sufficient for the following BRE: + + \(a*\)*b\(\1\) matched against ab + + The backreference wants to force the first subexpression to + be the empty string, but there is no transition for this. So + we need to modify the lastpos(n) of an iteration to be the + equivalent of that of catentation. Using the same notation as + in the thesis, lastpos(n) is redefined as: + + if nullable(c1) then + lastpos(c1) U + addtags(lastpos(c1), + emptymatch(c1)) + else + lastpos(c1) + + where c1 is the argument node. firstpos(n) remains the same. */ + + /* Compute lastpos. */ + if (iter->min == 0 || iter->arg->nullable) + { + node->nullable = 1; + if (iter->arg->nullable) + { + /* The arg matches the empty string. Make a first pass + with tre_match_empty() to get the number of tags and + parameters. */ + status = tre_match_empty(stack, iter->arg, + NULL, NULL, NULL, &num_tags, + ¶ms_seen); + if (status != REG_OK) + return status; + /* Allocate arrays for the tags and parameters. */ + tags = xmalloc(sizeof(int) * (num_tags + 1)); + if (!tags) + return REG_ESPACE; + tags[0] = -1; + assertions = 0; + params = NULL; + if (params_seen) + { + params = tre_mem_alloc(mem, sizeof(*params) + * TRE_PARAM_LAST); + if (!params) + { + xfree(tags); + return REG_ESPACE; + } + } + /* Second pass with tre_mach_empty() to get the list of + tags and parameters. */ + status = tre_match_empty(stack, iter->arg, tags, + &assertions, params, NULL, NULL); + if (status != REG_OK) + { + xfree(tags); + return status; + } + node->lastpos = + tre_set_union(mem, iter->arg->lastpos, iter->arg->lastpos, + tags, assertions, params); + xfree(tags); + if (!node->lastpos) + return REG_ESPACE; + } + else + node->lastpos = iter->arg->lastpos; + } + else + { + node->nullable = 0; + node->lastpos = iter->arg->lastpos; + } + node->firstpos = iter->arg->firstpos; + break; + } + + case NFL_POST_CATENATION: + { + int num_tags, *tags, assertions, params_seen; + int *params; + reg_errcode_t status; + tre_catenation_t *cat = node->obj; + node->nullable = cat->left->nullable && cat->right->nullable; + + /* Compute firstpos. */ + if (cat->left->nullable) + { + /* The left side matches the empty string. Make a first pass + with tre_match_empty() to get the number of tags and + parameters. */ + status = tre_match_empty(stack, cat->left, + NULL, NULL, NULL, &num_tags, + ¶ms_seen); + if (status != REG_OK) + return status; + /* Allocate arrays for the tags and parameters. */ + tags = xmalloc(sizeof(*tags) * (num_tags + 1)); + if (!tags) + return REG_ESPACE; + tags[0] = -1; + assertions = 0; + params = NULL; + if (params_seen) + { + params = tre_mem_alloc(mem, sizeof(*params) + * TRE_PARAM_LAST); + if (!params) + { + xfree(tags); + return REG_ESPACE; + } + } + /* Second pass with tre_mach_empty() to get the list of + tags and parameters. */ + status = tre_match_empty(stack, cat->left, tags, + &assertions, params, NULL, NULL); + if (status != REG_OK) + { + xfree(tags); + return status; + } + node->firstpos = + tre_set_union(mem, cat->right->firstpos, cat->left->firstpos, + tags, assertions, params); + xfree(tags); + if (!node->firstpos) + return REG_ESPACE; + } + else + { + node->firstpos = cat->left->firstpos; + } + + /* Compute lastpos. */ + if (cat->right->nullable) + { + /* The right side matches the empty string. Make a first pass + with tre_match_empty() to get the number of tags and + parameters. */ + status = tre_match_empty(stack, cat->right, + NULL, NULL, NULL, &num_tags, + ¶ms_seen); + if (status != REG_OK) + return status; + /* Allocate arrays for the tags and parameters. */ + tags = xmalloc(sizeof(int) * (num_tags + 1)); + if (!tags) + return REG_ESPACE; + tags[0] = -1; + assertions = 0; + params = NULL; + if (params_seen) + { + params = tre_mem_alloc(mem, sizeof(*params) + * TRE_PARAM_LAST); + if (!params) + { + xfree(tags); + return REG_ESPACE; + } + } + /* Second pass with tre_mach_empty() to get the list of + tags and parameters. */ + status = tre_match_empty(stack, cat->right, tags, + &assertions, params, NULL, NULL); + if (status != REG_OK) + { + xfree(tags); + return status; + } + node->lastpos = + tre_set_union(mem, cat->left->lastpos, cat->right->lastpos, + tags, assertions, params); + xfree(tags); + if (!node->lastpos) + return REG_ESPACE; + } + else + { + node->lastpos = cat->right->lastpos; + } + break; + } + + default: + assert(0); + break; + } + } + + return REG_OK; +} + + +/* Adds a transition from each position in `p1' to each position in `p2'. */ +static reg_errcode_t +tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, + tre_tnfa_transition_t *transitions, + int *counts, int *offs) +{ + tre_pos_and_tags_t *orig_p2 = p2; + tre_tnfa_transition_t *trans; + int i, j, k, l, dup, prev_p2_pos; + + if (transitions != NULL) + while (p1->position >= 0) + { + p2 = orig_p2; + prev_p2_pos = -1; + while (p2->position >= 0) + { + /* Optimization: if this position was already handled, skip it. */ + if (p2->position == prev_p2_pos) + { + p2++; + continue; + } + prev_p2_pos = p2->position; + /* Set `trans' to point to the next unused transition from + position `p1->position'. */ + trans = transitions + offs[p1->position]; + while (trans->state != NULL) + { +#if 0 + /* If we find a previous transition from `p1->position' to + `p2->position', it is overwritten. This can happen only + if there are nested loops in the regexp, like in "((a)*)*". + In POSIX.2 repetition using the outer loop is always + preferred over using the inner loop. Therefore the + transition for the inner loop is useless and can be thrown + away. */ + /* XXX - The same position is used for all nodes in a bracket + expression, so this optimization cannot be used (it will + break bracket expressions) unless I figure out a way to + detect it here. */ + if (trans->state_id == p2->position) + { + DPRINT(("*")); + break; + } +#endif + trans++; + } + + if (trans->state == NULL) + (trans + 1)->state = NULL; + /* Use the character ranges, assertions, etc. from `p1' for + the transition from `p1' to `p2'. */ + trans->code_min = p1->code_min; + trans->code_max = p1->code_max; + trans->state = transitions + offs[p2->position]; + trans->state_id = p2->position; + trans->assertions = p1->assertions | p2->assertions + | (p1->bracket_match_list != NULL ? ASSERT_BRACKET_MATCH : 0); + if (p1->backref >= 0) + { + assert((trans->assertions & ASSERT_BRACKET_MATCH) == 0); + assert(p2->backref < 0); + trans->u.backref = p1->backref; + trans->assertions |= ASSERT_BACKREF; + } + if (p1->bracket_match_list != NULL) + { + trans->u.bracket_match_list = + xmalloc(SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list)); + if (trans->u.bracket_match_list == NULL) + return REG_ESPACE; + memcpy(trans->u.bracket_match_list, p1->bracket_match_list, + SIZEOF_BRACKET_MATCH_LIST(p1->bracket_match_list)); + } + + /* Find out how many tags this transition has. */ + i = 0; + if (p1->tags != NULL) + while(p1->tags[i] >= 0) + i++; + j = 0; + if (p2->tags != NULL) + while(p2->tags[j] >= 0) + j++; + + /* If we are overwriting a transition, free the old tag array. */ + if (trans->tags != NULL) + xfree(trans->tags); + trans->tags = NULL; + + /* If there were any tags, allocate an array and fill it. */ + if (i + j > 0) + { + trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1)); + if (!trans->tags) + return REG_ESPACE; + i = 0; + if (p1->tags != NULL) + while(p1->tags[i] >= 0) + { + trans->tags[i] = p1->tags[i]; + i++; + } + l = i; + j = 0; + if (p2->tags != NULL) + while (p2->tags[j] >= 0) + { + /* Don't add duplicates. */ + dup = 0; + for (k = 0; k < i; k++) + if (trans->tags[k] == p2->tags[j]) + { + dup = 1; + break; + } + if (!dup) + trans->tags[l++] = p2->tags[j]; + j++; + } + trans->tags[l] = -1; + } + + /* Set the parameter array. If both `p2' and `p1' have same + parameters, the values in `p2' override those in `p1'. */ + if (p1->params || p2->params) + { + if (!trans->params) + trans->params = xmalloc(sizeof(*trans->params) + * TRE_PARAM_LAST); + if (!trans->params) + return REG_ESPACE; + for (i = 0; i < TRE_PARAM_LAST; i++) + { + trans->params[i] = TRE_PARAM_UNSET; + if (p1->params && p1->params[i] != TRE_PARAM_UNSET) + trans->params[i] = p1->params[i]; + if (p2->params && p2->params[i] != TRE_PARAM_UNSET) + trans->params[i] = p2->params[i]; + } + } + else + { + if (trans->params) + xfree(trans->params); + trans->params = NULL; + } + + +#ifdef TRE_DEBUG + { + int *tags; + + DPRINT((" %2d -> %2d on %3d", p1->position, p2->position, + p1->code_min)); + if (p1->code_max != p1->code_min) + DPRINT(("-%3d", p1->code_max)); + tags = trans->tags; + if (tags) + { + DPRINT((", tags [")); + while (*tags >= 0) + { + DPRINT(("%d", *tags)); + tags++; + if (*tags >= 0) + DPRINT((",")); + } + DPRINT(("]")); + } + if (trans->assertions) + DPRINT((", assert %d", trans->assertions)); + if (trans->assertions & ASSERT_BACKREF) + DPRINT((", backref %d", trans->u.backref)); + else if (trans->assertions & ASSERT_BRACKET_MATCH) + DPRINT((", bracket_match_list %p", + trans->u.bracket_match_list)); + if (trans->params) + { + DPRINT((", ")); + tre_print_params(trans->params); + } + DPRINT(("\n")); + } +#endif /* TRE_DEBUG */ + p2++; + } + p1++; + } + else + /* Compute a maximum limit for the number of transitions leaving + from each state. */ + while (p1->position >= 0) + { + p2 = orig_p2; + while (p2->position >= 0) + { + counts[p1->position]++; + p2++; + } + p1++; + } + return REG_OK; +} + +/* Converts the syntax tree to a TNFA. All the transitions in the TNFA are + labelled with one character range (there are no transitions on empty + strings). The TNFA takes O(n^2) space in the worst case, `n' is size of + the regexp. */ +static reg_errcode_t +tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions, + int *counts, int *offs) +{ + tre_union_t *uni; + tre_catenation_t *cat; + tre_iteration_t *iter; + reg_errcode_t errcode = REG_OK; + + /* XXX - recurse using a stack!. */ + switch (node->type) + { + case LITERAL: + break; + case UNION: + uni = (tre_union_t *)node->obj; + errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs); + if (errcode != REG_OK) + return errcode; + errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs); + break; + + case CATENATION: + cat = (tre_catenation_t *)node->obj; + /* Add a transition from each position in cat->left->lastpos + to each position in cat->right->firstpos. */ + errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos, + transitions, counts, offs); + if (errcode != REG_OK) + return errcode; + errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs); + if (errcode != REG_OK) + return errcode; + errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs); + break; + + case ITERATION: + iter = (tre_iteration_t *)node->obj; + assert(iter->max == -1 || iter->max == 1); + + if (iter->max == -1) + { + assert(iter->min == 0 || iter->min == 1); + /* Add a transition from each last position in the iterated + expression to each first position. */ + errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos, + transitions, counts, offs); + if (errcode != REG_OK) + return errcode; + } + errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs); + break; + } + return errcode; +} + + +#define ERROR_EXIT(err) \ + do \ + { \ + errcode = err; \ + if (/*CONSTCOND*/1) \ + goto error_exit; \ + } \ + while (/*CONSTCOND*/0) + + +int +tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags, + locale_t loc) +{ + tre_stack_t *stack; + tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r; + tre_pos_and_tags_t *p; + int *counts = NULL, *offs = NULL; + int i, add = 0; + tre_tnfa_transition_t *transitions, *initial; + tre_tnfa_t *tnfa = NULL; + tre_submatch_data_t *submatch_data = NULL; + tre_tag_direction_t *tag_directions = NULL; + reg_errcode_t errcode; + tre_mem_t mem; + + /* Parse context. */ + tre_parse_ctx_t parse_ctx; + + /* Allocate a stack used throughout the compilation process for various + purposes. */ + stack = tre_stack_new(512, 10240, 128); + if (!stack) + return REG_ESPACE; + /* Allocate a fast memory allocator. */ + mem = tre_mem_new(); + if (!mem) + { + tre_stack_destroy(stack); + return REG_ESPACE; + } + + /* Parse the regexp. */ + memset(&parse_ctx, 0, sizeof(parse_ctx)); + parse_ctx.mem = mem; + parse_ctx.stack = stack; + parse_ctx.re = regex; + parse_ctx.len = n; + /* Only allow REG_UNGREEDY to be set if both REG_ENHANCED and REG_EXTENDED + are also set */ + if ((cflags & (REG_ENHANCED | REG_EXTENDED)) != (REG_ENHANCED | REG_EXTENDED)) + cflags &= ~REG_UNGREEDY; + parse_ctx.cflags = cflags; + parse_ctx.max_backref = -1; + parse_ctx.loc = loc; + parse_ctx.submatch_id_invisible = SUBMATCH_ID_INVISIBLE_START; + + DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex)); + errcode = tre_parse(&parse_ctx); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + preg->re_nsub = parse_ctx.submatch_id - 1; + tree = parse_ctx.result; + + /* Back references and approximate matching cannot currently be used + in the same regexp. */ + if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx) + ERROR_EXIT(REG_BADPAT); + +#ifdef TRE_DEBUG + tre_ast_print(tree); +#endif /* TRE_DEBUG */ + + /* Referring to nonexistent subexpressions is illegal. */ + if (parse_ctx.max_backref > (int)preg->re_nsub) + ERROR_EXIT(REG_ESUBREG); + + /* Allocate the TNFA struct. */ + tnfa = xcalloc(1, sizeof(tre_tnfa_t)); + if (tnfa == NULL) + ERROR_EXIT(REG_ESPACE); + tnfa->have_backrefs = parse_ctx.max_backref >= 0; + tnfa->have_approx = parse_ctx.have_approx; + tnfa->num_submatches = parse_ctx.submatch_id; + tnfa->num_submatches_invisible = parse_ctx.submatch_id_invisible + - SUBMATCH_ID_INVISIBLE_START; + tnfa->num_reorder_tags = parse_ctx.num_reorder_tags; + tnfa->loc = parse_ctx.loc; + + /* Set up tags for submatch addressing. If REG_NOSUB is set and the + regexp does not have back references, this can be skipped. */ + if (tnfa->num_reorder_tags > 0 || !(cflags & REG_NOSUB)) + { + DPRINT(("tre_compile: setting up tags\n")); + + /* Figure out how many tags we will need. */ + errcode = tre_add_tags(NULL, stack, tree, tnfa); + if (errcode != REG_OK) + ERROR_EXIT(errcode); +#ifdef TRE_DEBUG + tre_ast_print(tree); +#endif /* TRE_DEBUG */ + + if (tnfa->num_tags > 0) + { + tag_directions = xmalloc(sizeof(*tag_directions) + * (tnfa->num_tags + 1)); + if (tag_directions == NULL) + ERROR_EXIT(REG_ESPACE); + tnfa->tag_directions = tag_directions; + memset(tag_directions, -1, + sizeof(*tag_directions) * (tnfa->num_tags + 1)); + } + tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 3, + sizeof(tnfa->minimal_tags)); + if (tnfa->minimal_tags == NULL) + ERROR_EXIT(REG_ESPACE); + + submatch_data = xcalloc((unsigned)parse_ctx.submatch_id, + sizeof(*submatch_data)); + if (submatch_data == NULL) + ERROR_EXIT(REG_ESPACE); + /* Set the eo_tag value to -1 to indicate that that corresponding + * submatch has not be completed yet */ + for (i = 0; i < parse_ctx.submatch_id; i++) + { + submatch_data[i].eo_tag = -1; + } + tnfa->submatch_data = submatch_data; + + errcode = tre_add_tags(mem, stack, tree, tnfa); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + +#ifdef TRE_DEBUG + for (i = 0; i < parse_ctx.submatch_id; i++) + DPRINT(("pmatch[%d] = {t%d, t%d}\n", + i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); + for (i = 0; i <= tnfa->num_tags; i++) + DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]])); +#endif /* TRE_DEBUG */ + } + + /* Expand iteration nodes. */ + errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, + tag_directions, &tnfa->params_depth); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + + /* Add a dummy node for the final state. + XXX - For certain patterns this dummy node can be optimized away, + for example "a*" or "ab*". Figure out a simple way to detect + this possibility. */ + tmp_ast_l = tree; + tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++); + if (tmp_ast_r == NULL) + ERROR_EXIT(REG_ESPACE); + + tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r); + if (tree == NULL) + ERROR_EXIT(REG_ESPACE); + +#ifdef TRE_DEBUG + tre_ast_print(tree); + DPRINT(("Number of states: %d\n", parse_ctx.position)); + if (submatch_data) + for (i = 0; i < parse_ctx.submatch_id; i++) + DPRINT(("pmatch[%d] = {t%d, t%d}\n", + i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); + if (tag_directions) + for (i = 0; i <= tnfa->num_tags; i++) + DPRINT(("t%d is %s\n", i, tag_dir_str[tag_directions[i]])); +#endif /* TRE_DEBUG */ + + errcode = tre_compute_nfl(mem, stack, tree); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + + counts = xmalloc(sizeof(int) * parse_ctx.position); + if (counts == NULL) + ERROR_EXIT(REG_ESPACE); + + offs = xmalloc(sizeof(int) * parse_ctx.position); + if (offs == NULL) + ERROR_EXIT(REG_ESPACE); + + for (i = 0; i < parse_ctx.position; i++) + counts[i] = 0; + tre_ast_to_tnfa(tree, NULL, counts, NULL); + + add = 0; + for (i = 0; i < parse_ctx.position; i++) + { + offs[i] = add; + add += counts[i] + 1; + counts[i] = 0; + } + transitions = xcalloc((unsigned)add + 1, sizeof(*transitions)); + if (transitions == NULL) + ERROR_EXIT(REG_ESPACE); + tnfa->transitions = transitions; + tnfa->num_transitions = add; + + DPRINT(("Converting to TNFA:\n")); + errcode = tre_ast_to_tnfa(tree, transitions, counts, offs); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + + /* If in eight bit mode, compute a table of characters that can be the + first character of a match. */ + tnfa->first_char = -1; + if (TRE_MB_CUR_MAX_L(tnfa->loc) == 1 && !tmp_ast_l->nullable) + { + int count = 0; + tre_cint_t k; + DPRINT(("Characters that can start a match:")); + tnfa->firstpos_chars = xcalloc(256, sizeof(char)); + if (tnfa->firstpos_chars == NULL) + ERROR_EXIT(REG_ESPACE); + for (p = tree->firstpos; p->position >= 0; p++) + { + tre_tnfa_transition_t *j = transitions + offs[p->position]; + while (j->state != NULL) + { + for (k = j->code_min; k <= j->code_max && k < 256; k++) + { + DPRINT((" %d", k)); + tnfa->firstpos_chars[k] = 1; + count++; + } + j++; + } + } + DPRINT(("\n")); +#define TRE_OPTIMIZE_FIRST_CHAR 1 +#if TRE_OPTIMIZE_FIRST_CHAR + if (count == 1) + { + for (k = 0; k < 256; k++) + if (tnfa->firstpos_chars[k]) + { + DPRINT(("first char must be %d\n", k)); + tnfa->first_char = k; + xfree(tnfa->firstpos_chars); + tnfa->firstpos_chars = NULL; + break; + } + } +#endif + + } + else + tnfa->firstpos_chars = NULL; + + + p = tree->firstpos; + i = 0; + while (p->position >= 0) + { + i++; + +#ifdef TRE_DEBUG + { + int *tags; + DPRINT(("initial: %d", p->position)); + tags = p->tags; + if (tags != NULL) + { + if (*tags >= 0) + DPRINT(("/")); + while (*tags >= 0) + { + DPRINT(("%d", *tags)); + tags++; + if (*tags >= 0) + DPRINT((",")); + } + } + DPRINT((", assert %d", p->assertions)); + if (p->params) + { + DPRINT((", ")); + tre_print_params(p->params); + } + DPRINT(("\n")); + } +#endif /* TRE_DEBUG */ + + p++; + } + + initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t)); + if (initial == NULL) + ERROR_EXIT(REG_ESPACE); + tnfa->initial = initial; + + i = 0; + for (p = tree->firstpos; p->position >= 0; p++) + { + initial[i].state = transitions + offs[p->position]; + initial[i].state_id = p->position; + initial[i].tags = NULL; + /* Copy the arrays p->tags, and p->params, they are allocated + from a tre_mem object. */ + if (p->tags) + { + int j; + for (j = 0; p->tags[j] >= 0; j++); + initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1)); + if (!initial[i].tags) + ERROR_EXIT(REG_ESPACE); + memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1)); + } + initial[i].params = NULL; + if (p->params) + { + initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST); + if (!initial[i].params) + ERROR_EXIT(REG_ESPACE); + memcpy(initial[i].params, p->params, + sizeof(*p->params) * TRE_PARAM_LAST); + } + initial[i].assertions = p->assertions; + i++; + } + initial[i].state = NULL; + + tnfa->num_transitions = add; + tnfa->final = transitions + offs[tree->lastpos[0].position]; + tnfa->num_states = parse_ctx.position; + tnfa->cflags = cflags; + + DPRINT(("final state %d (%p)\n", tree->lastpos[0].position, + (void *)tnfa->final)); + + tre_mem_destroy(mem); + tre_stack_destroy(stack); + xfree(counts); + xfree(offs); + +#ifdef TRE_USE_SYSTEM_REGEX_H + preg->re_magic = RE_MAGIC; +#endif /* TRE_USE_SYSTEM_REGEX_H */ + preg->TRE_REGEX_T_FIELD = (void *)tnfa; +#ifdef __LIBC__ + /* In Libc, we need to retain the locale. Outside Libc, we already called + duplocale() which does the retaining. */ + XL_RETAIN(tnfa->loc); +#endif /* __LIBC__ */ + return REG_OK; + + error_exit: + /* Free everything that was allocated and return the error code. */ + tre_mem_destroy(mem); + if (stack != NULL) + tre_stack_destroy(stack); + if (counts != NULL) + xfree(counts); + if (offs != NULL) + xfree(offs); + + /* Set tnfa into preg, so that calling tre_free() will free the contents + of tnfa. But in Libc, NULL out the loc field since we never retained + the locale. Outside Libc, we let tre_free() call freelocale(). */ + preg->TRE_REGEX_T_FIELD = (void *)tnfa; +#ifdef __LIBC__ + if(tnfa) tnfa->loc = NULL; +#endif /* __LIBC__ */ + + tre_free(preg); + return errcode; +} + + + + +void +tre_free(regex_t *preg) +{ + tre_tnfa_t *tnfa; + unsigned int i; + tre_tnfa_transition_t *trans; + +#ifdef TRE_USE_SYSTEM_REGEX_H + preg->re_magic = 0; +#endif /* TRE_USE_SYSTEM_REGEX_H */ + tnfa = (void *)preg->TRE_REGEX_T_FIELD; + if (!tnfa) + return; + preg->TRE_REGEX_T_FIELD = NULL; + + for (i = 0; i < tnfa->num_transitions; i++) + if (tnfa->transitions[i].state) + { + if (tnfa->transitions[i].tags) + xfree(tnfa->transitions[i].tags); + if (tnfa->transitions[i].assertions & ASSERT_BRACKET_MATCH) + xfree(tnfa->transitions[i].u.bracket_match_list); + if (tnfa->transitions[i].params) + xfree(tnfa->transitions[i].params); + } + if (tnfa->transitions) + xfree(tnfa->transitions); + + if (tnfa->initial) + { + for (trans = tnfa->initial; trans->state; trans++) + { + if (trans->tags) + xfree(trans->tags); + if (trans->params) + xfree(trans->params); + } + xfree(tnfa->initial); + } + + if (tnfa->submatch_data) + { + xfree(tnfa->submatch_data); + } + + if (tnfa->tag_directions) + xfree(tnfa->tag_directions); + if (tnfa->firstpos_chars) + xfree(tnfa->firstpos_chars); + if (tnfa->minimal_tags) + xfree(tnfa->minimal_tags); + + if (tnfa->loc) +#ifdef __LIBC__ + XL_RELEASE(tnfa->loc); +#else /* !__LIBC__ */ + freelocale(tnfa->loc); +#endif /* !__LIBC__ */ + + if (tnfa->last_matched_branch) + xfree(tnfa->last_matched_branch); + + xfree(tnfa); +} + +#ifndef __LIBC__ +char * +tre_version(void) +{ + static char str[256]; + char *version; + + if (str[0] == 0) + { + (void) tre_config(TRE_CONFIG_VERSION, &version); + (void) snprintf(str, sizeof(str), "TRE %s (BSD)", version); + } + return str; +} + +int +tre_config(int query, void *result) +{ + int *int_result = result; + const char **string_result = result; + + switch (query) + { + case TRE_CONFIG_APPROX: +#ifdef TRE_APPROX + *int_result = 1; +#else /* !TRE_APPROX */ + *int_result = 0; +#endif /* !TRE_APPROX */ + return REG_OK; + + case TRE_CONFIG_WCHAR: +#ifdef TRE_WCHAR + *int_result = 1; +#else /* !TRE_WCHAR */ + *int_result = 0; +#endif /* !TRE_WCHAR */ + return REG_OK; + + case TRE_CONFIG_MULTIBYTE: +#ifdef TRE_MULTIBYTE + *int_result = 1; +#else /* !TRE_MULTIBYTE */ + *int_result = 0; +#endif /* !TRE_MULTIBYTE */ + return REG_OK; + + case TRE_CONFIG_SYSTEM_ABI: +#ifdef TRE_CONFIG_SYSTEM_ABI + *int_result = 1; +#else /* !TRE_CONFIG_SYSTEM_ABI */ + *int_result = 0; +#endif /* !TRE_CONFIG_SYSTEM_ABI */ + return REG_OK; + + case TRE_CONFIG_VERSION: + *string_result = TRE_VERSION; + return REG_OK; + } + + return REG_NOMATCH; +} +#endif /* !__LIBC__ */ + + +/* EOF */