This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
gawk fixes for regex library
- From: Aharon Robbins <arnold at skeeve dot com>
- To: carlo dot bonzini at libero dot it, libc-alpha at sources dot redhat dot com
- Date: Wed, 7 Jan 2004 16:34:30 +0200
- Subject: gawk fixes for regex library
Greetings.
In a fit of <whatever>, I decided to sync the gawk regex routines with
those in the glibc CVS. I managed to find the ones on which my current
code was based, went through them, extracted out the important changes,
and put them into the current code.
I had at one point also done changes to make the code compile under K&R C,
but I decided that this just isn't worth it anymore, so I didn't bother
forward porting that code. (Yay!)
A suggestion. In my humble opinion, it'd be better to change
RE_TRANSLATE_TYPE to `unsigned char *' at the definition, and then
remove the extra `unsigned' casts and declarations throughout the code.
But I'll leave that up to you folks.
Paolo says the current code is faster than the old code. This seems to
be true. But it's still nowhere near as fast as the dfa code in GNU grep.
Thus, for the eventual 3.1.4 release, I've reinstated the old code that
uses dfa.c for ``does it match?'' cases, with the full regex only for
cases where I need the ``where does it match?'' information.
Anyway, here are my diffs. I hope this is useful.
Thanks,
Arnold Robbins
The gawk guy
-----------------------
2004-January-7 Arnold D. Robbins <arnold@skeeve.com>
* regcomp.c (re_compile_fastmap_iter): Fix `icase' assignment if not
RE_ENABLE_I18N.
(parse_expression): Similar for dfa->has_mb_node.
(build_range_exp): Fix `cmp_buf' contents to be correct if we don't have
valid multibyte characters.
* regex.c: Include config.h if HAVE_CONFIG_H. For MSC, include <stdio.h>.
Include <limits.h> here, before include of regex.h, so that regex.h
can fix the value of `RE_DUP_MAX' correctly.
* regex.h: If __APPLE_CC__ is defined, give empty definition for `__restrict'.
* regex_internal.c (re_string_allocate): Only test `dfa->mb_cur_max' if
`RE_ENABLE_I18N' is defined.
* regex_internal.h: Don't include <limits.h> here, added comment about it.
For other GNU distributions, make sure that `ENABLE_NLS' is defined before
including <libintl.h>.
(BE): Add definition if needed.
* regexec.c (re_search_internal): Set `sb' correctly if `RE_ENABLE_I18N' isn't
defined.
--- /usr/local/src/Gnu/libc/posix/regcomp.c 2004-01-06 23:59:24.000000000 +0200
+++ regcomp.c 2004-01-07 15:43:20.000000000 +0200
@@ -324,7 +324,11 @@
{
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
int node_cnt;
+#ifdef RE_ENABLE_I18N
int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
+#else
+ int icase = (bufp->syntax & RE_ICASE);
+#endif
for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
{
int node = init_state->nodes.elems[node_cnt];
@@ -2233,8 +2237,10 @@
*err = REG_ESPACE;
return NULL;
}
+#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1)
dfa->has_mb_node = 1;
+#endif
break;
case OP_WORD:
tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
@@ -2558,8 +2564,8 @@
? __btowc (start_ch) : start_elem->opr.wch);
end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
? __btowc (end_ch) : end_elem->opr.wch);
- cmp_buf[0] = start_wc;
- cmp_buf[4] = end_wc;
+ cmp_buf[0] = start_wc != WEOF ? start_wc : start_ch;
+ cmp_buf[4] = end_wc != WEOF ? end_wc : end_ch;
if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
return REG_ERANGE;
--- /usr/local/src/Gnu/libc/posix/regex.c 2003-11-16 09:03:23.000000000 +0200
+++ regex.c 2004-01-07 15:48:01.000000000 +0200
@@ -43,9 +43,19 @@
# include "../locale/localeinfo.h"
#endif
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
/* POSIX says that <sys/types.h> must be included (by the caller) before
<regex.h>. */
#include <sys/types.h>
+
+#if defined (_MSC_VER)
+#include <stdio.h> /* for size_t */
+#endif
+
+#include <limits.h>
#include <regex.h>
#include "regex_internal.h"
--- /usr/local/src/Gnu/libc/posix/regex.h 2003-11-13 07:58:44.000000000 +0200
+++ regex.h 2004-01-07 15:45:34.000000000 +0200
@@ -543,6 +543,8 @@
# else
# define __restrict
# endif
+# elif defined __APPLE_CC__
+# define __restrict
# endif
#endif
/* gcc 3.1 and up support the [restrict] syntax. */
--- /usr/local/src/Gnu/libc/posix/regex_internal.c 2004-01-07 00:00:16.000000000 +0200
+++ regex_internal.c 2004-01-07 15:47:42.000000000 +0200
@@ -70,7 +70,11 @@
pstr->word_char = dfa->word_char;
pstr->word_ops_used = dfa->word_ops_used;
pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
- pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
+ pstr->valid_len = (pstr->mbs_allocated
+#ifdef RE_ENABLE_I18N
+ || dfa->mb_cur_max > 1
+#endif
+ ) ? 0 : len;
pstr->valid_raw_len = pstr->valid_len;
return REG_NOERROR;
}
--- /usr/local/src/Gnu/libc/posix/regex_internal.h 2004-01-06 23:59:50.000000000 +0200
+++ regex_internal.h 2004-01-07 15:58:52.000000000 +0200
@@ -27,7 +27,14 @@
#include <assert.h>
#include <ctype.h>
+#if 0
+/* Don't include this here. On some systems it sets RE_DUP_MAX to a
+ * lower value than GNU regex allows. Instead, include it in
+ * regex.c, before include of <regex.h>, which correctly
+ * #undefs RE_DUP_MAX and sets it to the right value.
+ */
#include <limits.h>
+#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -57,7 +64,7 @@
#endif
/* This is for other GNU distributions with internationalized messages. */
-#if HAVE_LIBINTL_H || defined _LIBC
+#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
# include <libintl.h>
# ifdef _LIBC
# undef gettext
@@ -112,6 +119,13 @@
# define __attribute(arg)
#endif
+#if _LIBC || __GNUC__ >= 3
+# define BE(expr, val) __builtin_expect (expr, val)
+#else
+# define BE(expr, val) (expr)
+# define inline
+#endif
+
extern const char __re_error_msgid[] attribute_hidden;
extern const size_t __re_error_msgid_idx[] attribute_hidden;
--- /usr/local/src/Gnu/libc/posix/regexec.c 2004-01-03 06:06:39.000000000 +0200
+++ regexec.c 2004-01-07 15:52:20.000000000 +0200
@@ -646,7 +646,11 @@
incr = (range < 0) ? -1 : 1;
left_lim = (range < 0) ? start + range : start;
right_lim = (range < 0) ? start : start + range;
+#ifdef RE_ENABLE_I18N
sb = dfa->mb_cur_max == 1;
+#else
+ sb = 1;
+#endif
fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
for (;;)