2003-12-14 Paolo Bonzini * posix/regcomp.c (parse_dup_op): Process OP_DUP_PLUS, OP_DUP_ASTERISK, and OP_DUP_QUESTION like OP_OPEN_DUP_NUM, in order to lower OP_DUP_PLUS and mark subexpressions as OPT_SUBEXP. (optimize_utf8, calc_first, calc_next, calc_epsdest): Don't consider the OP_DUP_PLUS case. * posix/regexec.c (update_regs): OPT_SUBEXP subexpression may now happen even when PMATCH[REG_NUM].RM_SO == -1. (NUMBER_OF_STATE): Unused, remove it. * posix/regex_internal.h (re_token_type_t): Move OP_DUP_PLUS among the tokens rather than among the epsilon-transiting nodes. --- regexec.c.sav 2003-12-14 12:02:56.000000000 +0100 +++ regexec.c 2003-12-14 12:10:46.000000000 +0100 @@ -1351,19 +1351,21 @@ update_regs (dfa, pmatch, cur_node, cur_ int cur_node, cur_idx, nmatch; { int type = dfa->nodes[cur_node].type; - int reg_num; - if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP) - return; - reg_num = dfa->nodes[cur_node].opr.idx + 1; - if (reg_num >= nmatch) - return; if (type == OP_OPEN_SUBEXP) { + int reg_num = dfa->nodes[cur_node].opr.idx + 1; + if (reg_num >= nmatch) + return; + if (dfa->nodes[cur_node].opt_subexp) /* We are at the first node of a repeated subexpression. - For now, we leave it as is because we know that this - subexpression starts where the previous copy ends. */ - ; + Store the index into rm_eo, we will slide it into rm_so + when we find the OP_CLOSE_SUBEXP and if the match is not + empty. This is actually useless in most cases, because + the subexpression will start where the previous copy + ends, but is needed for the first copy of a (RE)* + subexpression. */ + pmatch[reg_num].rm_eo = cur_idx; else { /* We are at the first node of this sub expression. */ @@ -1373,12 +1375,18 @@ update_regs (dfa, pmatch, cur_node, cur_ } else if (type == OP_CLOSE_SUBEXP) { + int reg_num = dfa->nodes[cur_node].opr.idx + 1; + if (reg_num >= nmatch) + return; + if (dfa->nodes[cur_node].opt_subexp) { /* We are at the last node of a repeated subexpression.
- If it is not an empty match, we can set it, otherwise - we leave the previous, non-empty match. */ - if (pmatch[reg_num].rm_eo < cur_idx) + If it is not an empty match, or if it is the first match, + we can set it; otherwise, we leave the previous, non-empty + match. */ + if (pmatch[reg_num].rm_eo < cur_idx + || pmatch[reg_num].rm_so == -1) { pmatch[reg_num].rm_so = pmatch[reg_num].rm_eo; pmatch[reg_num].rm_eo = cur_idx; @@ -1390,8 +1398,6 @@ update_regs (dfa, pmatch, cur_node, cur_ } } -#define NUMBER_OF_STATE 1 - /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 and sift the nodes in each states according to the following rules. Updated state_log will be wrote to STATE_LOG. --- regex_internal.h.sav 2003-12-14 10:45:41.000000000 +0100 +++ regex_internal.h 2003-12-14 10:46:04.000000000 +0100 @@ -186,9 +186,8 @@ typedef enum OP_CLOSE_SUBEXP = EPSILON_BIT | 1, OP_ALT = EPSILON_BIT | 2, OP_DUP_ASTERISK = EPSILON_BIT | 3, - OP_DUP_PLUS = EPSILON_BIT | 4, - OP_DUP_QUESTION = EPSILON_BIT | 5, - ANCHOR = EPSILON_BIT | 6, + OP_DUP_QUESTION = EPSILON_BIT | 4, + ANCHOR = EPSILON_BIT | 5, /* Tree type, these are used only by tree. 
*/ CONCAT = 16, @@ -199,6 +198,7 @@ typedef enum OP_CHARSET_RANGE, OP_OPEN_DUP_NUM, OP_CLOSE_DUP_NUM, + OP_DUP_PLUS, OP_NON_MATCH_LIST, OP_OPEN_COLL_ELEM, OP_CLOSE_COLL_ELEM, --- regcomp.c.sav 2003-12-14 10:36:36.000000000 +0100 +++ regcomp.c 2003-12-14 12:21:56.000000000 +0100 @@ -1031,7 +1031,6 @@ optimize_utf8 (dfa) case END_OF_RE: case OP_DUP_ASTERISK: case OP_DUP_QUESTION: - case OP_DUP_PLUS: case OP_OPEN_SUBEXP: case OP_CLOSE_SUBEXP: break; @@ -1176,14 +1175,6 @@ calc_first (dfa, node) case OP_CLOSE_SUBEXP: node->first = idx; break; - case OP_DUP_PLUS: -#ifdef DEBUG - assert (node->left != NULL); -#endif - if (node->left->first == -1) - calc_first (dfa, node->left); - node->first = node->left->first; - break; case OP_ALT: node->first = idx; break; @@ -1223,7 +1214,6 @@ calc_next (dfa, node) switch (type) { case OP_DUP_ASTERISK: - case OP_DUP_PLUS: node->next = idx; break; case CONCAT: @@ -1258,7 +1248,6 @@ calc_epsdest (dfa, node) if (node->type == 0) { if (dfa->nodes[idx].type == OP_DUP_ASTERISK - || dfa->nodes[idx].type == OP_DUP_PLUS || dfa->nodes[idx].type == OP_DUP_QUESTION) { if (node->left->first == -1) @@ -2409,15 +2398,15 @@ parse_dup_op (dup_elem, regexp, dfa, tok reg_errcode_t *err; { re_token_t dup_token; - bin_tree_t *tree = dup_elem, *work_tree; + bin_tree_t *tree = dup_elem, *work_tree, *elem; int start_idx = re_string_cur_idx (regexp); re_token_t start_token = *token; + int start, end, i; + if (token->type == OP_OPEN_DUP_NUM) { - int i; - int end = 0; - int start = fetch_number (regexp, token, syntax); - bin_tree_t *elem; + end = 0; + start = fetch_number (regexp, token, syntax); if (start == -1) { if (token->type == CHARACTER && token->opr.c == ',') @@ -2443,42 +2432,52 @@ parse_dup_op (dup_elem, regexp, dfa, tok else goto parse_dup_op_ebrace; } + if (BE (start == 0 && end == 0, 0)) { /* We treat "{0}" and "{0,0}" as null string. */ fetch_token (token, regexp, syntax); return NULL; } + } + else + { + start = (token->type == OP_DUP_PLUS) ? 
1 : 0; + end = (token->type == OP_DUP_QUESTION) ? 1 : -1; + } - /* Extract "{n,m}" to "...{0,}". */ - elem = tree; - for (i = 0; i < start; ++i) - if (i != 0) - { - work_tree = duplicate_tree (elem, dfa, 0); - tree = create_tree (dfa, tree, work_tree, CONCAT, 0); - if (BE (work_tree == NULL || tree == NULL, 0)) - goto parse_dup_op_espace; - } + /* Extract "{n,m}" to "...{0,}". */ + elem = tree; + for (i = 1; i < start; ++i) + { + work_tree = duplicate_tree (elem, dfa, 0); + tree = create_tree (dfa, tree, work_tree, CONCAT, 0); + if (BE (work_tree == NULL || tree == NULL, 0)) + goto parse_dup_op_espace; + } + + if (start != end) + { + if (start > 0 || elem->type == CONCAT) + { + /* Mark this node as a repeated subexpression. */ + elem = duplicate_tree (elem, dfa, 1); + if (BE (elem == NULL, 0)) + goto parse_dup_op_espace; + } if (end == -1) { /* We treat "{0,}" as "*". */ dup_token.type = OP_DUP_ASTERISK; + work_tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); if (start > 0) - { - elem = duplicate_tree (elem, dfa, 1); - work_tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); - tree = create_tree (dfa, tree, work_tree, CONCAT, 0); - if (BE (elem == NULL || work_tree == NULL || tree == NULL, 0)) - goto parse_dup_op_espace; - } + tree = create_tree (dfa, tree, work_tree, CONCAT, 0); else - { - tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); - if (BE (tree == NULL, 0)) - goto parse_dup_op_espace; - } + tree = work_tree; + + if (BE (work_tree == NULL || tree == NULL, 0)) + goto parse_dup_op_espace; } else if (BE (start > end, 0)) { @@ -2490,20 +2489,15 @@ parse_dup_op (dup_elem, regexp, dfa, tok { /* Then extract "{0,m}" to "??...?". 
*/ dup_token.type = OP_DUP_QUESTION; + elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); if (start > 0) - { - elem = duplicate_tree (elem, dfa, 1); - elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); - tree = create_tree (dfa, tree, elem, CONCAT, 0); - if (BE (elem == NULL || tree == NULL, 0)) - goto parse_dup_op_espace; - } + tree = create_tree (dfa, tree, elem, CONCAT, 0); else - { - tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token); - if (BE (tree == NULL, 0)) - goto parse_dup_op_espace; - } + tree = elem; + + if (BE (elem == NULL || tree == NULL, 0)) + goto parse_dup_op_espace; + for (i = 1; i < end - start; ++i) { work_tree = duplicate_tree (elem, dfa, 1); @@ -2516,15 +2510,7 @@ parse_dup_op (dup_elem, regexp, dfa, tok } } } - else - { - tree = re_dfa_add_tree_node (dfa, tree, NULL, token); - if (BE (tree == NULL, 0)) - { - *err = REG_ESPACE; - return NULL; - } - } + fetch_token (token, regexp, syntax); return tree;