This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.
Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
Hi! The following patch speeds up REG_NOSUB searching, adds RE_NO_SUB for the re_compile_pattern interface to get similar behaviour and also speeds up cases when subexps are immediately nested. It removes all OP_OPEN_SUBEXP/OP_CLOSE_SUBEXP nodes that are not needed from the str_tree after parsing, but before analyzing. With REG_NOSUB, all these nodes are unneeded with the exception of those used for backreferences. Without REG_NOSUB, unneeded are just those that are immediately surrounded by another () pair (and aren't used by backreferences). Measurements, tst-regex2 with -DWHOLE_FILE_TIMING, is below. With REG_NOSUB or RE_NO_SUB (test1 and test3), all 3 patterns are searched with the same speed. Without that, 3rd pattern searching is as fast as 2nd pattern search. Vanilla CVS: test 0 pattern 0: 0.840423424s test 0 pattern 1: 1.212836508s test 0 pattern 2: 4.888927188s test 1 pattern 0: 0.812340634s test 1 pattern 1: 1.242682267s test 1 pattern 2: 4.872612232s test 2 pattern 0: 0.763208542s test 2 pattern 1: 1.142578726s test 2 pattern 2: 4.639178880s test 3 pattern 0: 0.764673397s test 3 pattern 1: 1.139890453s test 3 pattern 2: 4.635312775s With patch: test 0 pattern 0: 0.820845810s test 0 pattern 1: 1.261617889s test 0 pattern 2: 1.263218611s test 1 pattern 0: 0.833536480s test 1 pattern 1: 0.821412026s test 1 pattern 2: 0.822183978s test 2 pattern 0: 0.778098488s test 2 pattern 1: 1.191543473s test 2 pattern 2: 1.194518191s test 3 pattern 0: 0.777669430s test 3 pattern 1: 0.790473658s test 3 pattern 2: 0.780549884s 2004-11-18 Jakub Jelinek <jakub@redhat.com> [BZ #544] * posix/regex.h (RE_NO_SUB): New define. * posix/regex_internal.h (OP_DELETED_SUBEXP): New. (re_dfa_t): Add subexp_map. * posix/regcomp.c (struct subexp_optimize): New type. (optimize_subexps): New routine. (re_compile_internal): Call it. (re_compile_pattern): Set preg->no_sub to 1 if RE_NO_SUB. (free_dfa_content): Free subexp_map. (calc_inveclosure, calc_eclosure): Skip OP_DELETED_SUBEXP nodes. * posix/regexec.c (re_search_internal): If subexp_map is not NULL, duplicate registers as needed. * posix/Makefile: Add rules to build and run tst-regex2. * posix/tst-regex2.c: New test. * posix/rxspencer/tests: Fix last two tests (\0 -> \1). Add some new tests for nested subexpressions. --- libc/posix/Makefile.jj 2004-11-15 13:32:43.000000000 +0100 +++ libc/posix/Makefile 2004-11-18 18:01:02.077432232 +0100 @@ -80,7 +80,7 @@ tests := tstgetopt testfnm runtests run bug-regex13 bug-regex14 bug-regex15 bug-regex16 \ bug-regex17 bug-regex18 bug-regex19 bug-regex20 \ bug-regex21 bug-regex22 bug-regex23 bug-regex24 \ - tst-nice tst-nanosleep \ + tst-nice tst-nanosleep tst-regex2 \ transbug tst-rxspencer tst-pcre tst-boost \ bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \ tst-getaddrinfo2 bug-glob1 bug-glob2 @@ -160,6 +160,7 @@ tst-fnmatch-ENV = LOCPATH=$(common-objpf tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata bug-regex1-ENV = LOCPATH=$(common-objpfx)localedata tst-regex-ENV = LOCPATH=$(common-objpfx)localedata +tst-regex2-ENV = LOCPATH=$(common-objpfx)localedata bug-regex5-ENV = LOCPATH=$(common-objpfx)localedata bug-regex6-ENV = LOCPATH=$(common-objpfx)localedata bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata @@ -244,8 +245,10 @@ $(objpfx)tst-getconf.out: tst-getconf.sh ifeq (yes,$(build-shared)) $(objpfx)tst-regex: $(common-objpfx)rt/librt.so +$(objpfx)tst-regex2: $(common-objpfx)rt/librt.so else $(objpfx)tst-regex: $(common-objpfx)rt/librt.a +$(objpfx)tst-regex2: $(common-objpfx)rt/librt.a endif $(objpfx)bug-ga2-mem: $(objpfx)bug-ga2.out --- libc/posix/regex_internal.h.jj 2004-11-12 13:57:49.000000000 +0100 +++ libc/posix/regex_internal.h 2004-11-18 13:18:13.000000000 +0100 @@ -189,6 +189,7 @@ typedef enum OP_DUP_PLUS = EPSILON_BIT | 4, OP_DUP_QUESTION = EPSILON_BIT | 5, ANCHOR = EPSILON_BIT | 6, + OP_DELETED_SUBEXP = EPSILON_BIT | 7, /* Tree type, these are used only by tree. */ CONCAT = 16, @@ -644,6 +645,7 @@ struct re_dfa_t int mb_cur_max; bitset word_char; reg_syntax_t syntax; + int *subexp_map; #ifdef DEBUG char* re_str; #endif --- libc/posix/rxspencer/tests.jj 2004-11-10 10:36:48.000000000 +0100 +++ libc/posix/rxspencer/tests 2004-11-18 16:45:27.902282924 +0100 @@ -508,5 +508,21 @@ a*a*a*a*a*a*a* & aaaaaa aaaaaa (\b){0} - x @x - \(\b\)\{0,0\} b abc @abc - a(\b){0}c - ac ac - -a(.*)b(\0){0}c - abc abc @bc,- -a(.*)b(\0){0}c - axbc axbc x,- +a(.*)b(\1){0}c - abc abc @bc,- +a(.*)b(\1){0}c - axbc axbc x,- + +a\(\(b*\)\)c\1d b abbcbbd abbcbbd bb,bb +a\(\([bc]\)\)\2d b abcdabbd abbd b,b +a\(\(\(\([bc]\)\)\3\)\)*d b abbccd abbccd cc,cc,c,c +a(b)(c)d - abcd abcd b,c +a(((b)))c - abc abc b,b,b +a(((b|(((c))))))d - abd abd b,b,b,-,-,- +a(((b*|c|e)))d - abbd abbd bb,bb,bb +a((b|c)){0,0}d - ad ad -,- +a((b|c)){0,1}d - abd abd b,b +a((b|c)){0,2}d - abcd abcd c,c +a((b+|((c)*)))+d - abd abd b,b,-,- +a((b+|((c)*)))+d - abcd abcd c,c,c,c +(((\b))){0} - x @x -,-,- +a(((.*)))b((\2)){0}c - abc abc @bc,@bc,@bc,-,- +a(((.*)))b((\1)){0}c - axbc axbc x,x,x,-,- --- libc/posix/regcomp.c.jj 2004-11-15 13:39:35.000000000 +0100 +++ libc/posix/regcomp.c 2004-11-18 16:20:19.500142900 +0100 @@ -33,6 +33,14 @@ static reg_errcode_t create_initial_stat #ifdef RE_ENABLE_I18N static void optimize_utf8 (re_dfa_t *dfa); #endif +struct subexp_optimize +{ + re_dfa_t *dfa; + re_token_t *nodes; + int no_sub, re_nsub; +}; +static bin_tree_t *optimize_subexps (struct subexp_optimize *so, + bin_tree_t *node, int sidx, int depth); static reg_errcode_t analyze (re_dfa_t *dfa); static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node); static void calc_first (re_dfa_t *dfa, bin_tree_t *node); @@ -238,8 +246,8 @@ re_compile_pattern (pattern, length, buf /* And GNU code determines whether or not to get register information by passing null for the REGS argument to re_match, etc., not by - setting no_sub. */ - bufp->no_sub = 0; + setting no_sub, unless RE_NO_SUB is set. */ + bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); /* Match anchors at newline. */ bufp->newline_anchor = 1; @@ -633,6 +641,7 @@ free_dfa_content (re_dfa_t *dfa) if (dfa->sb_char != utf8_sb_map) re_free (dfa->sb_char); #endif + re_free (dfa->subexp_map); #ifdef DEBUG re_free (dfa->re_str); #endif @@ -810,6 +819,17 @@ re_compile_internal (preg, pattern, leng optimize_utf8 (dfa); #endif + if (preg->re_nsub > 0) + { + struct subexp_optimize so; + + so.dfa = dfa; + so.nodes = dfa->nodes; + so.no_sub = preg->no_sub; + so.re_nsub = preg->re_nsub; + dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0); + } + /* Analyze the tree and collect information which is necessary to create the dfa. */ err = analyze (dfa); @@ -1121,6 +1141,82 @@ optimize_utf8 (dfa) } #endif +static bin_tree_t * +optimize_subexps (so, node, sidx, depth) + struct subexp_optimize *so; + bin_tree_t *node; + int sidx, depth; +{ + int idx, new_depth, new_sidx; + bin_tree_t *ret; + if (node == NULL) + return NULL; + + new_depth = 0; + new_sidx = sidx; + if ((depth & 1) && node->type == CONCAT + && node->right && node->right->type == 0 + && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP) + { + new_depth = depth + 1; + if (new_depth == 2 + || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map) + && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))) + new_sidx = so->nodes[idx].opr.idx; + } + node->left = optimize_subexps (so, node->left, new_sidx, new_depth); + new_depth = (depth & 1) == 0 && node->type == CONCAT + && node->left && node->left->type == 0 + && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP + ? depth + 1 : 0; + node->right = optimize_subexps (so, node->right, sidx, new_depth); + + if (node->type != CONCAT) + return node; + if ((depth & 1) == 0 + && node->left + && node->left->type == 0 + && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP) + ret = node->right; + else if ((depth & 1) + && node->right + && node->right->type == 0 + && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP) + ret = node->left; + else + return node; + + if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map) + && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)) + return node; + + if (!so->no_sub) + { + int i; + + if (depth < 2) + return node; + + if (so->dfa->subexp_map == NULL) + { + so->dfa->subexp_map = re_malloc (int, so->re_nsub); + if (so->dfa->subexp_map == NULL) + return node; + + for (i = 0; i < so->re_nsub; i++) + so->dfa->subexp_map[i] = i; + } + + i = so->nodes[idx].opr.idx; + assert (sidx < i); + so->dfa->subexp_map[i] = sidx; + } + + so->nodes[idx].type = OP_DELETED_SUBEXP; + ret->parent = node->parent; + return ret; +} + /* Analyze the structure tree, and calculate "first", "next", "edest", "eclosure", and "inveclosure". */ @@ -1525,6 +1621,8 @@ calc_inveclosure (dfa) int src, idx, dest; for (src = 0; src < dfa->nodes_len; ++src) { + if (dfa->nodes[src].type == OP_DELETED_SUBEXP) + continue; for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) { dest = dfa->eclosures[src].elems[idx]; @@ -1560,6 +1658,9 @@ calc_eclosure (dfa) #ifdef DEBUG assert (dfa->eclosures[node_idx].nelem != -1); #endif + if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP) + continue; + /* If we have already calculated, skip it. */ if (dfa->eclosures[node_idx].nelem != 0) continue; --- libc/posix/regexec.c.jj 2004-11-12 13:57:49.000000000 +0100 +++ libc/posix/regexec.c 2004-11-18 15:48:40.891375714 +0100 @@ -882,6 +882,18 @@ re_search_internal (preg, string, length pmatch[reg_idx].rm_so += match_first; pmatch[reg_idx].rm_eo += match_first; } + + if (dfa->subexp_map) + for (reg_idx = 0; + reg_idx + 1 < nmatch && reg_idx < preg->re_nsub; + reg_idx++) + if (dfa->subexp_map[reg_idx] != reg_idx) + { + pmatch[reg_idx + 1].rm_so + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; + pmatch[reg_idx + 1].rm_eo + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; + } } free_return: --- libc/posix/tst-regex2.c.jj 2004-11-18 17:57:44.697602014 +0100 +++ libc/posix/tst-regex2.c 2004-11-18 18:00:13.697052790 +0100 @@ -0,0 +1,244 @@ +#include <fcntl.h> +#include <locale.h> +#include <regex.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <time.h> +#include <unistd.h> + +#ifdef _POSIX_CPUTIME +static clockid_t cl; +static int use_clock; +#endif + +static int +do_test (void) +{ +#ifdef _POSIX_CPUTIME + /* See whether we can use the CPU clock. */ + use_clock = clock_getcpuclockid (0, &cl) == 0; +#endif + + static const char *pat[] = { + ".?.?.?.?.?.?.?Log\\.13", + "(.?)(.?)(.?)(.?)(.?)(.?)(.?)Log\\.13", + "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))" + "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))" + "((((((((((.?))))))))))Log\\.13" }; + + int fd = open ("../ChangeLog.14", O_RDONLY); + if (fd < 0) + { + printf ("Couldn't open ChangeLog.14: %m\n"); + return 1; + } + + struct stat64 st; + if (fstat64 (fd, &st) < 0) + { + printf ("Couldn't fstat ChangeLog.14: %m\n"); + return 1; + } + + char *buf = malloc (st.st_size + 1); + if (buf == NULL) + { + printf ("Couldn't allocate buffer: %m\n"); + return 1; + } + + if (read (fd, buf, st.st_size) != (ssize_t) st.st_size) + { + puts ("Couldn't read ChangeLog.14"); + return 1; + } + + close (fd); + buf[st.st_size] = '\0'; + + setlocale (LC_ALL, "de_DE.UTF-8"); + + char *string = buf; + size_t len = st.st_size; + +#ifndef WHOLE_FILE_TIMING + /* Don't search the whole file normally, it takes too long. */ + if (len > 500000 + 64) + { + string += 500000; + len -= 500000; + } +#endif + + for (int testno = 0; testno < 4; ++testno) + for (int i = 0; i < sizeof (pat) / sizeof (pat[0]); ++i) + { + printf ("test %d pattern %d", testno, i); + + regex_t rbuf; + struct re_pattern_buffer rpbuf; + int err; + if (testno < 2) + { + err = regcomp (&rbuf, pat[i], + REG_EXTENDED | (testno ? REG_NOSUB : 0)); + if (err != 0) + { + putchar ('\n'); + char errstr[300]; + regerror (err, &rbuf, errstr, sizeof (errstr)); + puts (errstr); + return err; + } + } + else + { + re_set_syntax (RE_SYNTAX_POSIX_EGREP + | (testno == 3 ? RE_NO_SUB : 0)); + + memset (&rpbuf, 0, sizeof (rpbuf)); + const char *s = re_compile_pattern (pat[i], strlen (pat[i]), + &rpbuf); + if (s != NULL) + { + printf ("\n%s\n", s); + return 1; + } + + /* Just so that this can be tested with earlier glibc as well. */ + if (testno == 3) + rpbuf.no_sub = 1; + } + +#ifdef _POSIX_CPUTIME + struct timespec start, stop; + if (use_clock) + use_clock = clock_gettime (cl, &start) == 0; +#endif + + if (testno < 2) + { + regmatch_t pmatch[71]; + err = regexec (&rbuf, string, 71, pmatch, 0); + if (err == REG_NOMATCH) + { + puts ("\nregexec failed"); + return 1; + } + + if (testno == 0) + { + if (pmatch[0].rm_eo != pmatch[0].rm_so + 13 + || pmatch[0].rm_eo > len + || pmatch[0].rm_so < len - 100 + || strncmp (string + pmatch[0].rm_so, + " ChangeLog.13 for earlier changes", + sizeof " ChangeLog.13 for earlier changes" - 1) + != 0) + { + puts ("\nregexec without REG_NOSUB did not find the correct match"); + return 1; + } + + if (i > 0) + for (int j = 0, l = 1; j < 7; ++j) + for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l) + if (pmatch[l].rm_so != pmatch[0].rm_so + j + || pmatch[l].rm_eo != pmatch[l].rm_so + 1) + { + printf ("\npmatch[%d] incorrect\n", l); + return 1; + } + } + } + else + { + struct re_registers regs; + + memset (®s, 0, sizeof (regs)); + int match = re_search (&rpbuf, string, len, 0, len, + ®s); + if (match < 0) + { + puts ("\nre_search failed"); + return 1; + } + + if (match + 13 > len + || match < len - 100 + || strncmp (string + match, + " ChangeLog.13 for earlier changes", + sizeof " ChangeLog.13 for earlier changes" - 1) + != 0) + { + puts ("\nre_search did not find the correct match"); + return 1; + } + + if (testno == 2) + { + if (regs.num_regs != 2 + (i == 0 ? 0 : i == 1 ? 7 : 70)) + { + printf ("\nincorrect num_regs %d\n", regs.num_regs); + return 1; + } + + if (regs.start[0] != match || regs.end[0] != match + 13) + { + printf ("\nincorrect regs.{start,end}[0] = { %d, %d}\n", + regs.start[0], regs.end[0]); + return 1; + } + + if (regs.start[regs.num_regs - 1] != -1 + || regs.end[regs.num_regs - 1] != -1) + { + puts ("\nincorrect regs.{start,end}[num_regs - 1]"); + return 1; + } + + if (i > 0) + for (int j = 0, l = 1; j < 7; ++j) + for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l) + if (regs.start[l] != match + j + || regs.end[l] != regs.start[l] + 1) + { + printf ("\nregs.{start,end}[%d] incorrect\n", l); + return 1; + } + } + } + +#ifdef _POSIX_CPUTIME + if (use_clock) + use_clock = clock_gettime (cl, &stop) == 0; + if (use_clock) + { + stop.tv_sec -= start.tv_sec; + if (stop.tv_nsec < start.tv_nsec) + { + stop.tv_sec--; + stop.tv_nsec += 1000000000 - start.tv_nsec; + } + else + stop.tv_nsec -= start.tv_nsec; + printf (": %ld.%09lds\n", (long) stop.tv_sec, (long) stop.tv_nsec); + } + else +#endif + putchar ('\n'); + + if (testno < 2) + regfree (&rbuf); + else + regfree (&rpbuf); + } + + return 0; +} + +#define TIMEOUT 20 +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" --- libc/posix/regex.h.jj 2004-03-05 09:37:25.000000000 +0100 +++ libc/posix/regex.h 2004-11-18 16:14:57.477498572 +0100 @@ -179,6 +179,10 @@ typedef unsigned long int reg_syntax_t; immediately after an alternation or begin-group operator. */ #define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1) +/* If this bit is set, then no_sub will be set to 1 during + re_compile_pattern. */ +#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1) + /* This global variable defines the particular regexp syntax to use (for some interfaces). When a regexp is compiled, the syntax used is stored in the pattern buffer, so changing this does not affect Jakub
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |